上回说到,我废了九牛二虎之力,终于拿到了百十来个GB的GO EVIDENCE 数据,你猜怎么着,接着往下进行的时候发现,R语言根本读不了。。。。心疼自己一秒钟
来吧,今天接着搞起:
library(GOstats)
ago <- read.delim('/vol3/agis/zhoushaoqun_group/wangyantao/GO/swiss_goev_sme',row.names = NULL)
#报错
> colnames(ago) <- c('gene_id','go_id','evi')
Error in names(x) <- value :
'names' attribute [3] must be the same length as the vector [1]
之前我用write.csv生成的EVIDENCE注释文件,在读取时,列与列之间多了',',因此在读入时加了sep=',',而这次列与列之间是空格,同理,要加入sep=' '。
ago <- read.delim('/vol3/agis/zhoushaoqun_group/wangyantao/GO/swiss_goev_sme',row.names = NULL,sep=' ')
成功读入,进入下一步
goframeData <- na.omit(data.frame(ago$go_id,ago$evi,ago$gene_id))
goFrame <- GOFrame(goframeData)
goAllFrame <- GOAllFrame(goFrame)
library('GSEABase')
gsc <- GeneSetCollection(goAllFrame,setType = GOCollection())
universe <- as.vector(ago$gene_id)
DEGfile <- read.csv('xxxxxx.csv',header = T,row.names = NULL)
#读入差异表达基因#
genes = as.vector(DEGfile$gene_id)
params <- GSEAGOHyperGParams(name = 'custom',
geneSetCollection = gsc,
geneIds = genes,
universeGeneIds = universe,
ontology = 'CC',
pvalueCutoff = 0.05,
conditional = FALSE,
testDirection = 'over')
Over <- hyperGTest(params)
write.csv(summary(Over),xxxxxx_GO_CC_hyper.csv')
问题很多:
1、EVIDENCE注释文件的gene_id与DESeq2差异表达的文件中的gene_id要保持一致
sed "s/.1 / /g" swiss_goev_sme > swiss_goev_sme.1
终于成功生成GO注释文件,以下为全部脚本
> library(GOstats)
> ago <- read.delim('/vol3/agis/zhoushaoqun_group/wangyantao/GO/swiss_goev_sly.1',row.names = NULL,sep=' ')
> colnames(ago) <- c('gene_id','go_id','evi')
> goframeData <- na.omit(data.frame(ago$go_id,ago$evi,ago$gene_id))
> goFrame <- GOFrame(goframeData)
> goAllFrame <- GOAllFrame(goFrame)
There were 21 warnings (use warnings() to see them)
> library('GSEABase')
Loading required package: annotate
Loading required package: XML
Attaching package: ‘XML’
The following object is masked from ‘package:graph’:
addNode
Warning messages:
1: package ‘GSEABase’ was built under R version 4.0.3
2: package ‘annotate’ was built under R version 4.0.3
> gsc <- GeneSetCollection(goAllFrame,setType = GOCollection())
> universe <- as.vector(ago$gene_id)
> DEGfile <- read.csv('/vol3/agis/zhoushaoqun_group/wangyantao/GO/diffgene_Sly48.48.0_deseq2.csv',header = T,row.names = NULL)
> genes = as.vector(DEGfile[,1])
> params <- GSEAGOHyperGParams(name = 'custom',
+ geneSetCollection = gsc,
+ geneIds = genes,
+ universeGeneIds = universe,
+ ontology = 'CC',
+ pvalueCutoff = 0.05,
+ conditional = FALSE,
+ testDirection = 'over')
geneSetCollection = gsc,
geneIds = genes,
universeGeneIds = universe,
ontology = 'MF',
pvalueCutoff = 0.05,
conditional = FALSE,
testDirection = 'over')
Warning messages:
1: In makeValidParams(.Object) : removing duplicate IDs in universeGeneIds
2: In makeValidParams(.Object) : removing geneIds not in universeGeneIds
> params1 <- GSEAGOHyperGParams(name = 'custom',
+ geneSetCollection = gsc,
+ geneIds = genes,
+ universeGeneIds = universe,
+ ontology = 'BP',
+ pvalueCutoff = 0.05,
+ conditional = FALSE,
+ testDirection = 'over')
Warning messages:
1: In makeValidParams(.Object) : removing duplicate IDs in universeGeneIds
2: In makeValidParams(.Object) : removing geneIds not in universeGeneIds
> params2 <- GSEAGOHyperGParams(name = 'custom',
+ geneSetCollection = gsc,
+ geneIds = genes,
+ universeGeneIds = universe,
+ ontology = 'MF',
+ pvalueCutoff = 0.05,
+ conditional = FALSE,
+ testDirection = 'over')
Warning messages:
1: In makeValidParams(.Object) : removing duplicate IDs in universeGeneIds
2: In makeValidParams(.Object) : removing geneIds not in universeGeneIds
>
> Over <- hyperGTest(params)
> Over1 <- hyperGTest(params1)
> Over2 <- hyperGTest(params2)
> write.csv(summary(Over),'Sly48_GO_CC_hyper.csv')
> write.csv(summary(Over1),'Sly48_GO_BP_hyper.csv')
> write.csv(summary(Over2),'Sly48_GO_MF_hyper.csv')
> library(ggplot2)
> library(RColorBrewer)
Warning message:
package ‘RColorBrewer’ was built under R version 4.0.3
> display.brewer.all()
> col3 <- brewer.pal(4,'Set3')[c(1,3,4)]
> cc = read.csv('Sly48_GO_CC_hyper.csv',header = T,row.names = NULL)
> bp = read.csv('Sly48_GO_BP_hyper.csv',header = T,row.names = NULL)
> mf = read.csv('Sly48_GO_MF_hyper.csv',header = T,row.names = NULL)
> cc2 <- cc[cc$Pvalue<1e-5,]
> bp2 <- bp[bp$Pvalue<1e-5,]
> mf2 <- mf[mf$Pvalue<1e-5,]
> colnames(cc2)[2] <- c('GOID')
> colnames(bp2)[2] <- c('GOID')
> colnames(mf2)[2] <- c('GOID')
> data <- rbind(mf2,cc2)
> data <- rbind(data,bp2)
> data2 <- data.frame(data,type = c(rep('MF',dim(mf2)[1]),rep('CC',dim(cc2)[1]),rep('BP',dim(bp2)[1])))
> p <- ggplot(data2,aes(x = Term,y = log2(Count),fill = type,order = type))+
+ geom_bar(stat = 'identity')+coord_flip()+
+ scale_fill_manual(values = col3,limits = c('BP','CC','MF'),labels = c('BP','CC','MF'))+
+ theme_bw()+theme(legend.title=element_blank(),
+ axis.text = element_text(size = 10),
+ axis.title.x = element_text(size = rel(1.5)),
+ axis.title.y = element_text(size = rel(1.5),angle = 90),
+ legend.text = element_text(size = rel(1.5)))+
+ ylab('Number of Genes (log2)') + xlab('Terms')
> ggsave('Sly48.go_hyper.tiff',p,device = 'tiff',scale = 1,width = 20,height = 30,dpi = 300)
>
> display.brewer.all()
> col3 <- brewer.pal(4,'Set3')[c(1,3,4)]
> cc = read.csv('Sly48_GO_CC_hyper.csv',header = T,row.names = NULL)
> bp = read.csv('Sly48_GO_BP_hyper.csv',header = T,row.names = NULL)
> mf = read.csv('Sly48_GO_MF_hyper.csv',header = T,row.names = NULL)
> cc2 <- cc[cc$Pvalue<1e-2,]
> bp2 <- bp[bp$Pvalue<1e-2,]
> mf2 <- mf[mf$Pvalue<1e-2,]
> colnames(cc2)[2] <- c('GOID')
> colnames(bp2)[2] <- c('GOID')
> colnames(mf2)[2] <- c('GOID')
> data <- rbind(mf2,cc2)
> data <- rbind(data,bp2)
> data2 <- data.frame(data,type = c(rep('MF',dim(mf2)[1]),rep('CC',dim(cc2)[1]),rep('BP',dim(bp2)[1])))
> p <- ggplot(data2,aes(x = Term,y = log2(Count),fill = type,order = type))+
+ geom_bar(stat = 'identity')+coord_flip()+
+ scale_fill_manual(values = col3,limits = c('BP','CC','MF'),labels = c('BP','CC','MF'))+
+ theme_bw()+theme(legend.title=element_blank(),
+ axis.text = element_text(size = 10),
+ axis.title.x = element_text(size = rel(1.5)),
+ axis.title.y = element_text(size = rel(1.5),angle = 90),
+ legend.text = element_text(size = rel(1.5)))+
+ ylab('Number of Genes (log2)') + xlab('Terms')
> ggsave('Sly48.go_hyper.tiff',p,device = 'tiff',scale = 1,width = 20,height = 30,dpi = 300)
一开始的时候,pvalue设置为10的-5次方,结果并没有生产图片,后面我把pvalue改成10的-2次方,好歹有个结果。
经过测试,以上脚本可用,开始批量处理相关物种差异表达的GO注释