1-1从GEO数据库下载数据
rm(list = ls())
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE4107"
eSet <- getGEO(gse,
destdir = '.',
getGPL = F)
1-2提取表达矩阵exp
exp <- exprs(eSet[[1]])
head(exp)
exp = log2(exp+1)
dim(exp) #22个样本,54975个探针
boxplot(exp) #对样本的总体表达水平范围有一个了解
1-3提取各样本的分组信息
pd <- pData(eSet[[1]])
1-4调整exp的行名顺序与pd列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp <- exp[,match(rownames(pd),colnames(exp))]
1-5提取芯片平台编号,保存
gpl <- eSet[[1]]@annotation
2-1提取分组信息
rm(list = ls())
load(file = "step1output.Rdata")
library(stringr)
group_list=ifelse(str_detect(pd$title,"control"),"control","treat")
table(group_list)
group_list = factor(group_list,
levels = c("control","treat"))
2-2获取芯片的探针注释
gpl
#http://www.bio-info-trainee.com/1399.html
if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL) #toTable转换为矩阵
head(ids)
save(exp,group_list,ids,file = "step2output.Rdata")
3借助PCA和热图,进行数据检查
3-1PCA
rm(list = ls())
load(file = "step1output.Rdata")
load(file = "step2output.Rdata")
dat=as.data.frame(t(exp)) #转换为行是样本,列是基因
library(FactoMineR)#画主成分分析图需要加载这两个包
library(factoextra)
# pca
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
#palette = c("#00AFBB", "#E7B800"), #修改颜色
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)
pca_plot
ggsave(plot = pca_plot,filename = paste0(gse,"PCA.png"))
save(pca_plot,file = "pca_plot.Rdata")
3-2热图
cg=names(tail(sort(apply(exp,1,sd)),1000)) #对exp按行计算每个探针的标准差,从小到大排序,取标准差最大的前1000个探针,并提取对应的探针名
n=exp[cg,]
#绘制热图
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n)
library(pheatmap)
pheatmap(n,
show_colnames =F,
show_rownames = F,
annotation_col=annotation_col,
scale = "row")
dev.off()
4差异分析
4-1差异基因分析
rm(list = ls())
load(file = "step2output.Rdata")
library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)
4-2加probe_id列,把行名变成一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg));head(deg)
4-3加symbol列,去重复
deg <- inner_join(deg,ids,by="probe_id");head(deg)
deg <- deg[!duplicated(deg$symbol),]
4-4标记上下调基因
logFC_t=1 #变化超过2倍的视为差异基因
P.Value_t = 0.01 #P值小于等于0.01视为显著
k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg <- mutate(deg,change);head(deg)
4-5加ENTREZID列,用于富集分析
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
s2e <- bitr(deg$symbol,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)#人类
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL"));head(deg)
save(group_list,deg,logFC_t,P.Value_t,file = "step4output.Rdata")
5画火山图
rm(list = ls())
load(file = "step1output.Rdata")
load(file = "step4output.Rdata")
library(dplyr)
library(ggplot2)
dat = deg
for_label <- dat%>%
filter(symbol %in% c("TRPM3","SFRP1"))
p <- ggplot(data = dat,
aes(x = logFC,
y = -log10(P.Value))) +
geom_point(alpha=0.4, size=3.5,
aes(color=change)) +
ylab("-log10(Pvalue)")+
scale_color_manual(values=c("blue", "grey","red"))+
geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +
geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +
theme_bw()
p
volcano_plot <- p +
geom_point(size = 3, shape = 1, data = for_label) +
ggrepel::geom_label_repel(
aes(label = symbol),
data = for_label,
color="black"
)
volcano_plot
ggsave(plot = volcano_plot,filename = paste0(gse,"volcano.png"))
6画热图
load(file = 'step2output.Rdata')
x=deg$logFC
names(x)=deg$probe_id
cg = deg$probe_id[deg$change !="stable"]
n=exp[cg,]
dim(n)
library(pheatmap)
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n)
library(ggplotify)
heatmap_plot <- as.ggplot(pheatmap(n,show_colnames =F,
show_rownames = F,
scale = "row",
#cluster_cols = F,
annotation_col=annotation_col))
heatmap_plot
ggsave(heatmap_plot,filename = paste0(gse,"heatmap.png"))