GSEA-analysis
1.加载数据
载入前一步分析得到的表达矩阵
library(ggstatsplot);
library(cowplot);
library(clusterProfiler);
library(stringr);
library(dplyr);
library(tidyr);
library(ggplot2);
library(ggstatsplot);
load(file = 'GSE63067_GSEA.Rdata')#导入上一步分析的数据
exprSet <- data_plot
exprSet[1:3,1:3]
## PAX8 CYP2A6 SCARB1
## GSM1539877 6.506860 11.94711 9.129116
## GSM1539878 6.313513 11.82544 9.402811
## GSM1539879 6.273058 11.42314 8.120977
2.批量相关性分析
将第一行目的基因跟其他行的编码基因批量做相关性分析,得到相关性系数以及p值。
y <- as.numeric(exprSet[,"CCL20"])
colnames <- colnames(exprSet)
cor_data_df <- data.frame(colnames)
for (i in 1:length(colnames)){
test <- cor.test(as.numeric(exprSet[,i]),y,type="spearman")
cor_data_df[i,2] <- test$estimate
cor_data_df[i,3] <- test$p.value
}
names(cor_data_df) <- c("symbol","correlation","pvalue")
# 查看这个数据结构
head(cor_data_df)
## symbol correlation pvalue
## 1 PAX8 -0.23354999 0.350963277
## 2 CYP2A6 -0.60172099 0.008244347
## 3 SCARB1 -0.19907443 0.428394688
## 4 TTLL12 -0.57277340 0.012974684
## 5 CYTOR 0.35144428 0.152686677
## 6 ADAM32 -0.01286106 0.959604984
3.筛选最相关的基因
筛选p值小于0.05,按照相关性系数绝对值选前500个的基因, 数量可以自己定。
cor_data_sig <- cor_data_df %>%
filter(pvalue < 0.05) %>%
arrange(desc(abs(correlation)))%>%
dplyr::slice(1:500)
4.随机选取正的和负的分别作图验证
正相关的选取IL2RG;负相关选取MARK1
#正相关的选取IL2RG
ggscatterstats(data = exprSet,
y = CCL20,
x = IL2RG,
centrality.para = "mean",
margins = "both",
xfill = "#CC79A7",
yfill = "#009E73",
marginal.type = "histogram",
title = "Relationship between CCL20 and IL2RG")
## Warning: This plot can't be further modified with `ggplot2` functions.
## In case you want a `ggplot` object, set `marginal = FALSE`.
#负相关的选取MARK1
ggscatterstats(data = exprSet,
y = CCL20,
x = MARK1,
centrality.para = "mean",
margins = "both",
xfill = "#CC79A7",
yfill = "#009E73",
marginal.type = "histogram",
title = "Relationship between CCL20 and IL2RG")
## Warning: This plot can't be further modified with `ggplot2` functions.
## In case you want a `ggplot` object, set `marginal = FALSE`.
#还可以用cowplot拼图
p1 <- ggscatterstats(data = exprSet,
y = CCL20,
x = IL2RG,
centrality.para = "mean",
margins = "both",
xfill = "#CC79A7",
yfill = "#009E73",
marginal.type = "histogram",
title = "Relationship between CCL20 and IL2RG")
## Warning: This plot can't be further modified with `ggplot2` functions.
## In case you want a `ggplot` object, set `marginal = FALSE`.
p2 <- ggscatterstats(data = exprSet,
y = CCL20,
x = MARK1,
centrality.para = "mean",
margins = "both",
xfill = "#CC79A7",
yfill = "#009E73",
marginal.type = "histogram",
title = "Relationship between CCL20 and IL2RG")
## Warning: This plot can't be further modified with `ggplot2` functions.
## In case you want a `ggplot` object, set `marginal = FALSE`.
plot_grid(p1,p2,nrow = 1,labels = LETTERS[1:2])
5.聚类分析
既然确定了相关性是正确的,那么用筛选的基因进行富集分析就可以反推这个基因的功能。
#获得基因列表
gene <- str_trim(cor_data_sig$symbol,'both')
#基因名称转换,返回的是数据框
gene = bitr(gene, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
go <- enrichGO(gene = gene$ENTREZID, OrgDb = "org.Hs.eg.db", ont="all")
# 这里因为是计算的所有GO分析的三个分类,所以可以合并作图
# 这是条形图
barplot(go, split="ONTOLOGY")+
facet_grid(ONTOLOGY~., scale="free")
# 这是气泡图
dotplot(go, split="ONTOLOGY")+
facet_grid(ONTOLOGY~., scale="free")
#
# 这时候,我们能推断CCL20这个基因主要参与免疫调控和T细胞激活,细胞因子受体活性调剂等功能,大致跟她本身的功能是一致的。
参考