GEO数据分析之DEG分析后绘图

差异基因分析之后进行，绘图展示结果。

二、绘图

data_plot = as.data.frame(t(exprSet))
data_plot = data.frame(pairinfo=c(rep(1:2,1),rep(1:9,1),rep(1:7,1)),
                       group=group,
                       data_plot,stringsAsFactors = F)
# head(data_plot)
# View(data_plot)

ggplot2作图

未加配对信息

library(ggplot2)

ggplot(data_plot, aes(group,XIST,fill=group)) +
  geom_jitter(aes(colour=group), size=2, alpha=0.5)+
  xlab("") +
  ylab(paste("Expression of ","XIST"))+
  theme_classic()+
  theme(legend.position = "none")

image.png

配对信息

#配对信息
library(ggplot2)
ggplot(data_plot, aes(group,XIST,fill=group)) +
  geom_boxplot() +
  geom_point(size=2, alpha=0.5) +
  geom_line(aes(group=pairinfo), colour="black", linetype="11") +
  xlab("") +
  ylab(paste("Expression of ","XIST"))+
  theme_classic()+
  theme(legend.position = "none")

image.png

批量作图

可以自己一张张画，也可以批量地作图，批量画出差异最大的8个基因：

library(dplyr)
library(tibble)
allDiff_arrange <- allDiff1 %>% 
  rownames_to_column(var="genesymbol") %>% 
  arrange(desc(abs(logFC)))
genes <- allDiff_arrange$genesymbol[1:8]

plotlist <- lapply(genes, function(x){
  data =data.frame(data_plot[,c("pairinfo","group")],gene=data_plot[,x])
  ggplot(data, aes(group,gene,fill=group)) +
    geom_boxplot() +
    geom_point(size=2, alpha=0.5) +
    geom_line(aes(group=pairinfo), colour="black", linetype="11") +
    xlab("") +
    ylab(paste("Expression of ",x))+
    theme_classic()+
    theme(legend.position = "none")
})

library(cowplot)
plot_grid(plotlist=plotlist, ncol=4,labels = LETTERS[1:8])

image.png

heatmap热图

library(pheatmap)
## 设定差异基因阈值，减少差异基因用于提取表达矩阵
#allDiff_pair=topTable(fit,adjust='BH',coef="group after",number=Inf,p.value=0.05,lfc =0.5) 
allDiff1_2=topTable(fit2,adjust='fdr',coef=1,number=Inf)
allDiff_2 <- allDiff1[allDiff1$P.Value<0.05,]
head(allDiff_2)

##              logFC   AveExpr         t      P.Value  adj.P.Val        B
## SCLY    -0.9758699  6.127361 -6.944808 1.758082e-06 0.03548337 4.348307
## PPP3CA   0.8379245  7.538998  6.278825 6.494863e-06 0.05384166 3.371575
## C7      -0.8382160  8.176424 -6.175140 8.003022e-06 0.05384166 3.212340
## FMO4    -0.6061683  7.890618 -5.706477 2.092795e-05 0.10390990 2.468876
## ZFP36L2  0.9218854 10.106224  5.607268 2.574194e-05 0.10390990 2.306620
## PRXL2A   0.8153523  9.321368  5.504897 3.191277e-05 0.10734925 2.137458

write.csv(allDiff1_2, file = "allDiff1_2.csv" );

##提前部分数据用作热图绘制
heatdata <- exprSet[rownames(allDiff_2),]
head(heatdata)

##         GSM1539877 GSM1539878 GSM1539879 GSM1539880 GSM1539881 GSM1539882
## SCLY      6.159339   6.152581   5.517800   5.493032   5.491612   6.152265
## PPP3CA    7.868422   7.935910   8.209925   7.706072   7.990117   7.695428
## C7        8.055321   7.432209   7.884244   7.803996   8.152725   7.941745
## FMO4      7.245147   7.505883   7.930238   7.747309   7.585273   7.523062
## ZFP36L2  10.257085  10.801582  10.427174  10.596288  10.707020  10.611701
## PRXL2A    9.716232   9.124566  10.406678   9.351618   9.566426   9.551284
##         GSM1539883 GSM1539884 GSM1539885 GSM1539886 GSM1539887 GSM1539888
## SCLY      5.932930   5.826712   5.220746   5.817913   5.818574   6.971664
## PPP3CA    8.132288   7.812878   7.795484   7.362780   8.036777   6.932992
## C7        7.537318   8.063722   7.614597   7.873801   7.901937   9.041997
## FMO4      7.444872   8.052785   7.627332   7.782334   7.515057   8.160536
## ZFP36L2  10.392284   9.665255  10.603515  10.714418  10.392284   9.251844
## PRXL2A    9.885099   9.640123   9.609857   9.748785   9.231477   8.650878
##         GSM1539889 GSM1539890 GSM1539891 GSM1539892 GSM1539893 GSM1539894
## SCLY      6.483459   6.193475   6.414772   6.721595   6.806524   7.117499
## PPP3CA    6.436324   7.082618   7.418599   7.105683   7.283640   6.896033
## C7        8.277995   8.889543   9.077364   8.860462   8.400906   8.365755
## FMO4      8.090298   8.380246   8.160536   8.672445   8.524225   8.083539
## ZFP36L2   9.334554   9.793198  10.061164   9.611500   9.724169   8.966992
## PRXL2A    8.798169   8.717065   9.111668   9.155664   8.999288   8.519738

##制作一个分组信息用于注释
annotation_col <- data.frame(group)
rownames(annotation_col) <- colnames(heatdata)

#如果注释出界，可以通过调整格子比例和字体修正
pheatmap(heatdata, #热图的数据
         cluster_rows = TRUE,#行聚类
         cluster_cols = TRUE,#列聚类，可以看出样本之间的区分度
         annotation_col =annotation_col, #标注样本分类
         annotation_legend=TRUE, # 显示注释
         show_rownames = F,# 显示行名
         show_colnames = F,# 显示列名
         scale = "row", #以行来标准化，这个功能很不错
         color =colorRampPalette(c("blue", "white","red"))(100))

image.png

# 画热图的意义在哪？
# 第一看样本质量：本来Non_alcoholic_steatohepatiti,Healthy, Steatosis 组应该完全分开的，
# 但是热图里面Healthy有些样本跟Non_alcoholic_steatohepatiti分不开，要考虑是不是测量失误，
# 还是本身样本就特殊
# 
# 第二看差异基因：差异基因提取出来的热图，就应当呈现横竖两条线，
# 把表格分成四个象限，也就是差异基因有高有低，这才符合常识。

火山图

library(ggplot2)
library(ggrepel)
library(dplyr)

#data <- topTable(fit,adjust='BH',coef="group_listafter",number=Inf) 
data <- allDiff1
data$significant <- as.factor(data$adj.P.Val<0.05 & abs(data$logFC) > 0.5)
data$gene <- rownames(data)
ggplot(data=data, aes(x=logFC, y =-log10(adj.P.Val),color=significant)) +
  geom_point(alpha=0.8, size=1.2,col="black")+
  geom_point(data=subset(data, logFC > 0.5),alpha=0.8, size=1.2,col="red")+
  geom_point(data=subset(data, logFC < -0.5),alpha=0.6, size=1.2,col="blue")+
  labs(x="log2 (fold change)",y="-log10 (adj.P.Val)")+
  theme(plot.title = element_text(hjust = 0.4))+
  geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
  geom_vline(xintercept = c(0.5,-0.5),lty=4,lwd=0.6,alpha=0.8)+
  theme_bw()+
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),   
        axis.line = element_line(colour = "black")) +
  geom_point(data=subset(data, abs(logFC) > 1),alpha=0.8, size=3,col="green")+
  geom_text_repel(data=subset(data, abs(logFC) > 1), 
                  aes(label=gene),col="black",alpha = 0.8)

image.png

火山图画起来简单，难的是如何定制化展示。比如在图上有不同的颜色，不同的点来表示数据。有什么意义呢？我其实理解的也不是很透彻，但是考虑到他的横坐标是变化倍数，纵坐标是p值的负数，那么p值越小，倍数越大的基因就会出现在左右上方。

GEO数据分析之DEG分析后绘图

二、绘图

ggplot2作图

未加配对信息

配对信息

批量作图

可以自己一张张画，也可以批量地作图，批量画出差异最大的8个基因：

heatmap热图

火山图

推荐阅读更多精彩内容