差异基因分析之后进行,绘图展示结果。
二、绘图
data_plot = as.data.frame(t(exprSet))
data_plot = data.frame(pairinfo=c(rep(1:2,1),rep(1:9,1),rep(1:7,1)),
group=group,
data_plot,stringsAsFactors = F)
# head(data_plot)
# View(data_plot)
ggplot2作图
未加配对信息
library(ggplot2)
ggplot(data_plot, aes(group,XIST,fill=group)) +
geom_jitter(aes(colour=group), size=2, alpha=0.5)+
xlab("") +
ylab(paste("Expression of ","XIST"))+
theme_classic()+
theme(legend.position = "none")
配对信息
#配对信息
library(ggplot2)
ggplot(data_plot, aes(group,XIST,fill=group)) +
geom_boxplot() +
geom_point(size=2, alpha=0.5) +
geom_line(aes(group=pairinfo), colour="black", linetype="11") +
xlab("") +
ylab(paste("Expression of ","XIST"))+
theme_classic()+
theme(legend.position = "none")
批量作图
可以自己一张张画,也可以批量地作图,批量画出差异最大的8个基因:
library(dplyr)
library(tibble)
allDiff_arrange <- allDiff1 %>%
rownames_to_column(var="genesymbol") %>%
arrange(desc(abs(logFC)))
genes <- allDiff_arrange$genesymbol[1:8]
plotlist <- lapply(genes, function(x){
data =data.frame(data_plot[,c("pairinfo","group")],gene=data_plot[,x])
ggplot(data, aes(group,gene,fill=group)) +
geom_boxplot() +
geom_point(size=2, alpha=0.5) +
geom_line(aes(group=pairinfo), colour="black", linetype="11") +
xlab("") +
ylab(paste("Expression of ",x))+
theme_classic()+
theme(legend.position = "none")
})
library(cowplot)
plot_grid(plotlist=plotlist, ncol=4,labels = LETTERS[1:8])
heatmap热图
library(pheatmap)
## 设定差异基因阈值,减少差异基因用于提取表达矩阵
#allDiff_pair=topTable(fit,adjust='BH',coef="group after",number=Inf,p.value=0.05,lfc =0.5)
allDiff1_2=topTable(fit2,adjust='fdr',coef=1,number=Inf)
allDiff_2 <- allDiff1[allDiff1$P.Value<0.05,]
head(allDiff_2)
## logFC AveExpr t P.Value adj.P.Val B
## SCLY -0.9758699 6.127361 -6.944808 1.758082e-06 0.03548337 4.348307
## PPP3CA 0.8379245 7.538998 6.278825 6.494863e-06 0.05384166 3.371575
## C7 -0.8382160 8.176424 -6.175140 8.003022e-06 0.05384166 3.212340
## FMO4 -0.6061683 7.890618 -5.706477 2.092795e-05 0.10390990 2.468876
## ZFP36L2 0.9218854 10.106224 5.607268 2.574194e-05 0.10390990 2.306620
## PRXL2A 0.8153523 9.321368 5.504897 3.191277e-05 0.10734925 2.137458
write.csv(allDiff1_2, file = "allDiff1_2.csv" );
##提前部分数据用作热图绘制
heatdata <- exprSet[rownames(allDiff_2),]
head(heatdata)
## GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881 GSM1539882
## SCLY 6.159339 6.152581 5.517800 5.493032 5.491612 6.152265
## PPP3CA 7.868422 7.935910 8.209925 7.706072 7.990117 7.695428
## C7 8.055321 7.432209 7.884244 7.803996 8.152725 7.941745
## FMO4 7.245147 7.505883 7.930238 7.747309 7.585273 7.523062
## ZFP36L2 10.257085 10.801582 10.427174 10.596288 10.707020 10.611701
## PRXL2A 9.716232 9.124566 10.406678 9.351618 9.566426 9.551284
## GSM1539883 GSM1539884 GSM1539885 GSM1539886 GSM1539887 GSM1539888
## SCLY 5.932930 5.826712 5.220746 5.817913 5.818574 6.971664
## PPP3CA 8.132288 7.812878 7.795484 7.362780 8.036777 6.932992
## C7 7.537318 8.063722 7.614597 7.873801 7.901937 9.041997
## FMO4 7.444872 8.052785 7.627332 7.782334 7.515057 8.160536
## ZFP36L2 10.392284 9.665255 10.603515 10.714418 10.392284 9.251844
## PRXL2A 9.885099 9.640123 9.609857 9.748785 9.231477 8.650878
## GSM1539889 GSM1539890 GSM1539891 GSM1539892 GSM1539893 GSM1539894
## SCLY 6.483459 6.193475 6.414772 6.721595 6.806524 7.117499
## PPP3CA 6.436324 7.082618 7.418599 7.105683 7.283640 6.896033
## C7 8.277995 8.889543 9.077364 8.860462 8.400906 8.365755
## FMO4 8.090298 8.380246 8.160536 8.672445 8.524225 8.083539
## ZFP36L2 9.334554 9.793198 10.061164 9.611500 9.724169 8.966992
## PRXL2A 8.798169 8.717065 9.111668 9.155664 8.999288 8.519738
##制作一个分组信息用于注释
annotation_col <- data.frame(group)
rownames(annotation_col) <- colnames(heatdata)
#如果注释出界,可以通过调整格子比例和字体修正
pheatmap(heatdata, #热图的数据
cluster_rows = TRUE,#行聚类
cluster_cols = TRUE,#列聚类,可以看出样本之间的区分度
annotation_col =annotation_col, #标注样本分类
annotation_legend=TRUE, # 显示注释
show_rownames = F,# 显示行名
show_colnames = F,# 显示列名
scale = "row", #以行来标准化,这个功能很不错
color =colorRampPalette(c("blue", "white","red"))(100))
# 画热图的意义在哪?
# 第一看样本质量:本来Non_alcoholic_steatohepatiti,Healthy, Steatosis 组应该完全分开的,
# 但是热图里面Healthy有些样本跟Non_alcoholic_steatohepatiti分不开,要考虑是不是测量失误,
# 还是本身样本就特殊
#
# 第二看差异基因:差异基因提取出来的热图,就应当呈现横竖两条线,
# 把表格分成四个象限,也就是差异基因有高有低,这才符合常识。
火山图
library(ggplot2)
library(ggrepel)
library(dplyr)
#data <- topTable(fit,adjust='BH',coef="group_listafter",number=Inf)
data <- allDiff1
data$significant <- as.factor(data$adj.P.Val<0.05 & abs(data$logFC) > 0.5)
data$gene <- rownames(data)
ggplot(data=data, aes(x=logFC, y =-log10(adj.P.Val),color=significant)) +
geom_point(alpha=0.8, size=1.2,col="black")+
geom_point(data=subset(data, logFC > 0.5),alpha=0.8, size=1.2,col="red")+
geom_point(data=subset(data, logFC < -0.5),alpha=0.6, size=1.2,col="blue")+
labs(x="log2 (fold change)",y="-log10 (adj.P.Val)")+
theme(plot.title = element_text(hjust = 0.4))+
geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
geom_vline(xintercept = c(0.5,-0.5),lty=4,lwd=0.6,alpha=0.8)+
theme_bw()+
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")) +
geom_point(data=subset(data, abs(logFC) > 1),alpha=0.8, size=3,col="green")+
geom_text_repel(data=subset(data, abs(logFC) > 1),
aes(label=gene),col="black",alpha = 0.8)
火山图画起来简单,难的是如何定制化展示。比如在图上有不同的颜色,不同的点来表示数据。有什么意义呢?我其实理解的也不是很透彻,但是考虑到他的横坐标是变化倍数,纵坐标是p值的负数,那么p值越小,倍数越大的基因就会出现在左右上方。