http://pages.stat.wisc.edu/~yandell/statgen/ucla/WGCNA/wgcna.html
学习WGCNA总结 | Public Library of Bioinformatics
rm(list = ls())#清空环境变量
getwd()
setwd("D:/AAA-台式机-LY-WorkSpace/data-wrangling-with-R/")
library(WGCNA)
library(reshape2)
library(stringr)
options(stringsAsFactors = F)
#打开多路线程
allowWGCNAThreads()#enableWGCNAThreads()
mycount <- read.csv("D:/AAA-台式机-LY-WorkSpace/data-wrangling-with-R/htseq-count-readcount - 无注释.csv", row.names=1, header=T, check.names=F)
dim(mycount)
data <- log2(mycount+1)
##筛选方差前25%的基因,上一步是为了减少运算量,因为一个测序数据可能会有好几万个探针,
#而可能其中很多基因在各个样本中的表达情况并没有什么太大变化,为了减少运算量,这里我们筛选方差前25%的基因。
m.vars=apply(data,1,var)
data.upper <- data[which(m.vars > quantile(m.vars, probs = seq(0,1,0.25))[4]),]
datExpr <- as.data.frame(t(data.upper))#行和列转换位置
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
##样本聚类检查离群值##
gsg <- goodSamplesGenes(datExpr, verbose = 3)
gsg$allOK
sampleTree <- hclust(dist(datExpr), method = "average")
plot(sampleTree, main = "sample clustering to detect outliers", sub = "", xlab = "")
save(datExpr, file = "readcount-01-dataInput.RData")
##软阈值筛选##
# 横轴是Soft threshold (power),纵轴是无标度网络的评估参数,数值越高,
# 网络越符合无标度特征 (non-scale)
powers <- c(seq(1,10,by=1), seq(12,20,by=2))
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
par(mfrow=c(1,2))
cex1 = 0.9
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab = "Soft Threshold (power)", ylab = "Scale Free Topology Model Fit,signed R^2", type = "n",
main = paste("scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels = powers, cex = cex1, col = "red")
abline(h=0.90, col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab = "Soft Threshold (power)", ylab = "Mean Connectivity", type = "n",
main = paste("Mean Connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, cex = cex1, col = "red")
# 运行下列代码,让程序推荐你一个power, 数据质量太差啦,程序给了我"NA",自己设定power=14
sft$powerEstimate
#推荐的power是4
##一步法网络构建:One-step network construction and module detection##
net <- blockwiseModules(datExpr,power = 4,maxBlockSize = nGenes,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = T, pamRespectsDendro = F,
saveTOM = T,
saveTOMFileBase = "AS-green-readcount-TOM",
verbose = 3)
table(net$colors)
#层级聚类树展示各个模块
## 灰色的为**未分类**到模块的基因。
# Convert labels to colors for plotting
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
# Plot the dendrogram and the module colors underneath
# 如果对结果不满意,还可以recutBlockwiseTrees,节省计算时间
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = F, hang = 0.03,
addGuide = T, guideHang = 0.05)
table(moduleColors)
MEs = net$MEs
geneTree = net$dendrograms[[1]]
save(MEs, moduleLabels, geneTree,
file = "AS-green-readcount-02-networkConstruction-auto.RData")##结果保存###
##表型与模块相关性##(没有,下一步)
moduleLabelsAutomatic = net$colors
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
moduleColorsWW = moduleColorsAutomatic
MEs0 = moduleEigengenes(datExpr, moduleColorsWW)$eigengenes
MEsWW = orderMEs(MEs0)
samples <- read.csv()
modTraitor = cor(MEsWW, )
#绘制模块之间相关性图
# module eigengene, 可以绘制线图,作为每个模块的基因表达趋势的展示
MEs = net$MEs
### 不需要重新计算,改下列名字就好
### 官方教程是重新计算的,其实可以不用这么麻烦
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME", ""))))
MEs_col = orderMEs(MEs_col)
# 根据基因间表达量进行聚类所得到的各模块间的相关性图
# marDendro/marHeatmap 设置下、左、上、右的边距
plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",
marDendro = c(3,3,2,4),
marHeatmap = c(3,4,2,2), plotDendrograms = T,
xLabelsAngle = 90)
#可视化基因网络 (TOM plot)
# 如果采用分步计算,或设置的blocksize>=总基因数,直接load计算好的TOM结果
# 否则需要再计算一遍,比较耗费时间
# TOM = TOMsimilarityFromExpr(dataExpr, power=power, corType=corType, networkType=type)
load(net$TOMFiles[1], verbose = T)
TOM <- as.matrix(TOM)
dissTOM = 1-TOM
# Transform dissTOM with a power to make moderately strong
# connections more visible in the heatmap
plotTOM = dissTOM^7
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA
#call the plot function
# 这一部分特别耗时,行列同时做层级聚类
#TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
#all genes 耗时太长了,随便选取1000个基因来可视化
nSelect = 1000
# For reproducibility, we set the random seed
set.seed(10)
select = sample(nGenes, size = nSelect)
selectTOM = dissTOM[select, select]
# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
# Open a graphical window
sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^7;
diag(plotDiss) = NA;
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")