0.输入文件和R包
文件有两个,一个是差异分析结果;一个是msigdb数据库下载的gmt文件。
library(GSEABase)
library(clusterProfiler)
library(DOSE)
library(org.Hs.eg.db)
library(ggplot2)
library(stringr)
#准备差异基因列表
rm(list = ls())
load("step4output.Rdata")
options(stringsAsFactors = F)
data(geneList)
geneList = deg$logFC
names(geneList) = deg$ENTREZID
geneList = sort(geneList,decreasing = T)
#准备gmt文件
geneset <- read.gmt("h.all.v7.1.entrez.gmt")
geneset$ont = str_remove(geneset$ont,"HALLMARK_")
gsea要求的输入数据格式是排序后的logFC值,数据类型为向量,向量名字是entrezid。
2.运行GSEA并可视化
# gsea
egmt <- GSEA(geneList, TERM2GENE=geneset,verbose=F)
#> Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize = minGSSize, : There are ties in the preranked stats (0.05% of the list).
#> The order of those tied genes will be arbitrary, which may produce unexpected results.
egmt2<- setReadable(egmt,OrgDb=org.Hs.eg.db, keyType = "ENTREZID")
#气泡图,展示geneset被激活还是抑制
dotplot(egmt2,split=".sign")+facet_grid(~.sign)
#dotplot(egmt2,split=".sign")+facet_grid(~.sign)+
# scale_y_discrete(labels=function(x) str_wrap(str_replace_all(x,"_"," "), width=30))
# scale_y_discrete(labels=function(x) str_remove(x,"HALLMARK_"))
#山峦图,展示每个geneset的基因logFC分布
ridgeplot(egmt)
#选择单个gene set作图
egmtd <- data.frame(egmt)
library(enrichplot)
gseaplot2(egmt, geneSetID = 1, title = egmt$Description[1])
#多个gnenset合并展示
gseaplot2(egmt, geneSetID = 1:3,pvalue_table = T)
3.参考资料
#https://mp.weixin.qq.com/s/o658GH6SpR_QnXDiJ_BWYg
#https://hbctraining.github.io/DGE_workshop_salmon/lessons/functional_analysis_2019.html
#https://yulab-smu.github.io/clusterProfiler-book/index.html
#小鼠的GSEA collection:http://bioinf.wehi.edu.au/software/MSigDB/
#https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp