1 介绍
针对基因大量样本量表达数据的分析方法(RNA-seq 、微阵列),建议样本数量大于15,可以挖掘具有相似表达模式的模块并计算与形状的相关性。
2 构建WGCNA的步骤
✓ 构建共表达网络
✓ 基因模块划分
✓ 模块与性状的关联分析
✓ 模块之间的关联分析
✓ 模块中核心基因的鉴定
3 代码展示
3.1 首先读入数据并对数据进行预处理
library(WGCNA)
options(stringsAsFactors = FALSE);
femData = read.csv("LiverFemale3600.csv")
#Take a quick look at what is in the data set:
dim(femData)
[1] 3600 143
names(femData)
head(femData)
#筛选方差前5000的基因
#femData<- as.data.frame(femData[order(apply(femData,1,mad),decreasing = T)[1:5000],])
# femData0 <- femData[rowSums(femData) > 0]
#2.删除不必要的表型信息
datExpr0 = as.data.frame(t(femData[, -c(1:8)]));
names(datExpr0) = femData$substanceBXH;
rownames(datExpr0) = names(femData)[-c(1:8)];
#检查基因个样本并过滤掉有缺失值的基因和样本
#检查并标记缺失值
gsg = goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK
#删除具有缺失值的基因和样本
#这里如果存在缺失值,gsg$goodGenes显示为F,之后使用paste函数将‘Removing #genes:’与具有缺失值的基因名和模块名粘连在一起并print出来
if (!gsg$allOK)
{
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
# Remove the offending genes and samples from the data:
datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
#层次聚类观察离群值
#首先构树
sampleTree = hclust(dist(datExpr0), method = "average");
#将图片输出到本地
sizeGrWindow(12,9)
pdf("Plots_sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
abline(h = 15, col = "red");
clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
#别忘了dev.off()
dev.off()
table(clust)
clust
0 1
1 134
#设定阈值之后我们发现有一个离群值,随后我们要将其去除
#删除离群值,保留第一聚类簇
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#载入表型数据
traitData = read.csv("ClinicalTraits.csv");
dim(traitData)
[1] 361 38
names(traitData)
# 删除不必要的表型数据
allTraits = traitData[, -c(31, 16)];
allTraits = allTraits[, c(2, 11:36) ];
dim(allTraits)
names(allTraits)
# match表型信息与基因表达数据,并删除没有对应的样本
femaleSamples = rownames(datExpr);
traitRows = match(femaleSamples, allTraits$Mice);
datTraits = allTraits[traitRows, -1];
rownames(datTraits) = allTraits[traitRows, 1];
collectGarbage();
#删除离群值后重新进行聚类
pdf("Sample dendrogram and trait heatmap.pdf",height=10,width=20)
sampleTree2 = hclust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits, signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datTraits),
main = "Sample dendrogram and trait heatmap")
dev.off()
3.2 软阈值的筛选
#设定画图显示时的阈值,1-20
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
sizeGrWindow(9, 5)# 设定窗口大小
par(mfrow = c(1,2));
cex1 = 0.9;
# 计算软阈值,并画图展示,设定abline寻找最适合的阈值
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# 根据0.9的R^2值筛选阈值
abline(h=0.90,col="red")
# 同时观察不同阈值的连接度
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
从这两个图可以看出最佳阈值是6
3.3 构建网络
#选择6作为软阈值
softPower = 6;
#构建邻接矩阵
adjacency = adjacency(datExpr, power = softPower);
#构建TOM矩阵
TOM = TOMsimilarity(adjacency);
dissTOM = 1-TOM
#使用TOM矩阵进行聚类
geneTree = hclust(as.dist(dissTOM), method = "average");
# 作图展示
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04);
# 设定模块基因数量的下限-30
minModuleSize = 30;
# 鉴定模块
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
88 614 316 311 257 235 225 212 158 153 121 106 102 100 94 91 78 76 65 58 58 48 34
3.4 为模块划分颜色
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# 层次聚类下添加颜色注释
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
3.5 合并相似模块
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# 计算特征值相异性
MEDiss = 1-cor(MEs);
# 聚类模块基因
METree = hclust(as.dist(MEDiss), method = "average");
# 结果可视化
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
MEDissThres = 0.25
# 在0.25处添加abline,并认为abline下方的模块相似度较高,后续将合并成一个模块
abline(h=MEDissThres, col = "red")
# 调用合并函数,并对abline下的模块进行合并
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# 别忘了更改合并后模块的颜色注释
mergedColors = merge$colors;
mergedMEs = merge$newMEs;
#作图展示合并前后模块
sizeGrWindow(12, 9)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
按照每个模块基因数量对模块进行排序
colorOrder = c("grey", standardColors(50));
#match排序前后的模块顺序
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;
3.6 计算模块与表型的相关性及其p值
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# 按照颜色重新计算模块
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
#利用皮尔森相关系数计算相关性
moduleTraitCor = cor(MEs, datTraits, use = "p");
#计算p值
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
3.7针对表型和模块的相关性做热图展示结果
sizeGrWindow(10,6)
# 匹配相关性及其p值
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# 热图展示
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
3.8 提取模块
通过上一步我们发现weight表型与magenta模块具有最高的相关性,所以我们提取相关模块进行进一步分析,首先针对与weight表型相关的基因进行热图展示
#提取weight表型信息
weight = as.data.frame(datTraits$weight_g);
names(weight) = "weight"
# 提取模块信息
modNames = substring(names(MEs), 3)
#皮尔森系数计算模块与基因的相关性
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
#皮尔森系数计算基因与weight表型相关性
geneTraitSignificance = as.data.frame(cor(datExpr, weight, use = "p"));
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
names(geneTraitSignificance) = paste("GS.", names(weight), sep="");
names(GSPvalue) = paste("p.GS.", names(weight), sep="");
write.csv(GSPvalue, file = "GSPvalue")
3.9 计算关键模块中基因与表型的相关性
#确定模块magenta
module = "magenta"
column = match(module, modNames);
moduleGenes = moduleColors==module;
#作图展示相关性
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for body weight",
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
names(datExpr)
names(datExpr)[moduleColors=="magenta"]
3.10 对TOM矩阵的网络相关性计算
#随机筛选400个基因
nSelect = 400
# 数据的可重复性,使用set.seed()
set.seed(10);
select = sample(nGenes, size = nSelect);
selectTOM = dissTOM[select, select];
# 对于随机选择的基因我们无法对以前的聚类树进行剪枝,所以我们进行重新聚类
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select];
# 作图展示
sizeGrWindow(9,9)
#说实话这个TOM矩阵的热图我也看不出什么意义,取7次方的意义是为了热图更好看吗
plotDiss = selectTOM^7;
diag(plotDiss) = NA;
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
3.11 对模块与表型进行聚类
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
# 提取表型信息
weight = as.data.frame(datTraits$weight_g);
names(weight) = "weight"
# 在模块中添加weight表型信息
MET = orderMEs(cbind(MEs, weight))
# 随后对模块和感兴趣表型进行聚类
sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)
这里取消了层次聚类效果
sizeGrWindow(6,6);
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
plotHeatmaps = FALSE)
# 作图展示
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
plotDendrograms = FALSE, xLabelsAngle = 90)
3.12 导出模块中文件
#重新计算TOM矩阵
TOM = TOMsimilarityFromExpr(datExpr, power = 6);
# 选择模块,可以选1至多个
modules = c('magenta');
# 选择模块中基因
probes = names(datExpr)
inModule = is.finite(match(moduleColors, modules));
modProbes = probes[inModule];
#modGenes = annot$gene_symbol[match(modProbes, annot$substanceBXH)];
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
# 以cytoscape可读的方式导出基因列表以及相关性
cyt = exportNetworkToCytoscape(modTOM,
edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = modProbes,
#altNodeNames = modGenes,
nodeAttr = moduleColors[inModule]);