原始CEL,GSEXXX_RAW文件处理,RMA标准化MAS5标准化处理代码。
##if (!require("BiocManager", quietly = TRUE))
## install.packages("BiocManager")
##BiocManager::install(c("annotate","annaffy", "GEOquery", "hgu133plus2.db", "org.Hs.eg.db", "affy", "makecdfenv", "AnnotationForge", "data.table"))
##make.cdf.package("GPL15048_HuRSTA_2a520709.CDF.gz", "hursta2a520709cdf", species = "Homo sapiens", compress = TRUE, unlink=TRUE)
##install.packages("hursta2a520709cdf", repos = NULL, type="source" )
##devtools::install_github("steveneschrich/hursta2a520709.db")
{
library(annaffy)
library(affy)
library(annotate)
library(GEOquery)
library(hgu133plus2.db)
library(org.Hs.eg.db)
library(makecdfenv)
library(data.table)
library(hursta2a520709cdf)
rm(list = ls())
}
# gse给出所有的待处理GEO文件名称
gse <- c("GSE72094", "GSE29013", "GSE37745", "GSE50081")
for (n in gse){
# 设置RAW文件所在文件夹的路径
path <- "C:/Users/Administrator/Documents/HypoxiaGeneSignature"
setwd(path)
# dir.create函数创建与GSE文件同名的文件夹,以便压缩包的解压和文件提取
dir.create(n)
# 设置默认路径
setwd(paste(path, n, sep = "/"))
# 对压缩文件解压
untar(paste(paste(path, n, sep = "/"), "RAW.tar", sep = "_"))
# 提取所有的命名中带有gz的文件
files <- dir(pattern="gz$") ##加载文件
# 将files中的文件进行合并
sapply(files, gunzip) ##合并文件
# 将所有名字带有CEL的文件存放到filelist变量中,每个CEL文件就是一个样本的芯片数据
filelist <- dir(pattern="CEL$")
# 使用ReadAffy函数读取每个CEL文件,并对芯片数据进行初步处理
data <- ReadAffy(filenames=filelist)
# 使用rma函数对芯片数据进行标化
eset <- rma(data)
# expres函数用于提取eset对象中的表达数据
eset.e <- data.frame(exprs(eset))
# 获取平台名称
affydb <- annPkgName(data@annotation, type='db')
require(affydb, character.only = TRUE)
# 根据注释转换probe id为Gene symbol
symbols<-as.character(aafSymbol(as.character(rownames(eset)),affydb))
eset.e$genes <- symbols
# 至此实现了对GEO原始数据的RMA标准化处理,处理后的表达谱数据为eset.e
write.csv(eset.e, "rma_exp.csv", row.names = F)
# 有些研究发现可以将RMA标化方法与MAS5标化方法进行结合,具体的实现方式如下
# 使用mas5calls函数对data进行处理,得到mas5标化后的标化数据
calls <- mas5calls(data) # get PMA calls
# exprs函数用于提取表达谱
calls <- exprs(calls)
# 对数据calls进行筛选,最终得到数据即为在RMA标化的基础上,进一步考虑了
absent <- rowSums(calls == 'A') # how may samples are each gene 'absent' in all
absent <- which (absent == ncol(calls)) # which genes are 'absent' in all samples
rmaFiltered <- eset[-absent,] # filters out the genes 'absent' in all samples
filtered <- data.frame(exprs(rmaFiltered))
symbols<-as.character(aafSymbol(as.character(rownames(rmaFiltered)),affydb))
filtered$genes <- symbols
write.csv(filtered, "rma_mas5_exp.csv", row.names = F)
}
rm(list = ls())
{
# get mappings from GPL15048
tmp <- getGEO("GPL15048")
#
mp <- tmp@dataTable@table
#
std <- suppressMessages(data.table(AnnotationDbi::select(org.Hs.eg.db,
keys = AnnotationDbi::keys(org.Hs.eg.db,
keytype = "SYMBOL"),
columns = c("ENTREZID", "SYMBOL"),
keytype = "SYMBOL")))
#
gs <- std$SYMBOL[ match(mp$EntrezGeneID, std$ENTREZID)]
#
affyids <- rownames(eset.e)
#
res <- data.frame(Probe_ID = affyids,
Gene_Symbol = gs,
stringsAsFactors = F)
#
res$median = apply(eset.e, 1, median)
#
res <- res[ !is.na(res$Gene_Symbol),]
#
res = res[order(res$Gene_Symbol, res$median, decreasing = T),]
#
res = res[!duplicated(res$Gene_Symbol),]
#
eset.e <- eset.e[res$Probe_ID,]
#
rownames(eset.e) <- res$Gene_Symbol
#
uni_matrix <- eset.e
#
colnames(uni_matrix) <- gsub('__\\S*', '', colnames(uni_matrix))
#
setwd(path)
#
pheno <- read.csv(file = 'GSE72094_series_matrix.txt')
#
pheno <- data.frame(num1 = strsplit(as.character(pheno[28,]),split = '\t')[[1]][-1],
num2 = gsub('patient_id: ','', strsplit(as.character(pheno[39,]),split = '\t')[[1]][-1]))
# 至此实现了对GEO原始数据的RMA标准化处理,处理后的表达谱数据为uni_matrix
save(uni_matrix,pheno,file = 'uni_matrix.Rdata')
}