复盘一下,找基因所在的通路下
新的任务,找到这 276 genes encompassed nine major DDR pathways:
base excision repair (BER),
nucleotide excision repair (NER),
mismatch repair (MMR),
the Fanconi anemia (FA) pathway,
homology-dependent recombination (HR),
non-homologous DNA end joining (NHEJ),
direct damage reversal repair (DR),
translesion DNA synthesis (TLS),
nucleotide pool maintenance (NP)
然后搞清楚每个基因出现在多少个通路,跟上次的任务比较像,来自于Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlashttps://www.cell.com/cell-reports/pdf/S2211-1247(18)30437-6.pdf
文章在线网址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5961503/
根据Jimmy老师提示KEGGREST包含KEGG通路里的所有基因,那么反过来理论上也是可行的。所以查找KEGGREST包的说明,了解这个包的使用。
了解一下数据
读入数据
rm (list=ls())
Sys.setenv(LANGUAGE = "en") #显示英文报错信息
options(stringsAsFactors = FALSE) #禁止chr转成factor
{ #install.packages
options(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
if(!require("xlsx")) install.packages("xlsx",update = F,ask = F)
if(!require("ggplot2")) install.packages("ggplot2",update = F,ask = F)
if(!require("tidyverse")) install.packages("tidyverse",update = F,ask = F)
if(!require("BiocManager")) install.packages("BiocManager",update = F,ask = F)
if(!require("KEGGREST")) BiocManager::install("KEGGREST",update = F,ask = F)
if(!require("clusterProfiler")) BiocManager::install("clusterProfiler",update = F,ask = F)
if(!require("UpSetR")) BiocManager::install("UpSetR",update = F,ask = F)
}
# 载入数据,补充材料表1
library("xlsx")
res<- read.xlsx("1-s2.0-S2211124718304376-mmc2.xlsx", 1, header=F, colClasses=NA)[-(1:2),1:2]
head(res)
# colnames(res) <- res[1,]
# res <- res[-1,]
colnames(res) <- c("Entrez_Gene_ID","Gene_Symbol")
head(res)
这里可以查看差异基因在哪些KEGG通路
library(KEGGREST)
listDatabases()
[1] "pathway" "brite" "module" "ko" "genome" "vg" "ag" "compound"
[9] "glycan" "reaction" "rclass" "enzyme" "disease" "drug" "dgroup" "environ"
[17] "genes" "ligand" "kegg"
#
res1 <- paste("hsa:",res[-1, 1]) #根据包说明书对变量重命名
res2 <- gsub(" ", "", res1) #正则表达式去掉空格
res3 <- keggLink("pathway", res2)#查找通路
res3
分析流程:
res1 <- paste("hsa:",res[-1, 1]) #根据包说明书对变量重命名
res2 <- gsub(" ", "", res1) #正则表达式去掉空格
res3 <- keggLink("pathway", res2)#查找通路
res3
dim(as.data.frame(res3))
#开始进行分析
DDR.list <- list("hsa03410", "hsa03420", "hsa03430", "hsa03440", "hsa03450", "hsa03460")
# 批量提取几个pathway的基因
DDRgene <- lapply(DDR.list, function(i){
#i="hsa03410"
d1 <- KEGGREST::keggGet(i)[[1]]$GENE
gene <- sapply(seq(2, length(d1), 2), function(x){
d2 <- unlist(strsplit(d1[x], ";"))[1]
})
})
#有三个通路在KEGG数据库中找不到,在其他数据库
#https://reactome.org/PathwayBrowser/找到的,但是与原文有些不一致
#该数据库的使用方法参考陈同老师的科学网博客科学网—没钱买KEGG怎么办?REACTOME开源通路更强大 - 陈同的博文 http://blog.sciencenet.cn/blog-118204-1142737.html
#direct damage reversal/repair (DR)
#PS:下边虽然找到了该通路,由于该数据库通路名字的原因可能并没有找全
#molecule选项里面的
'
Proteins P16455 UniProt:P16455 MGMT
Proteins Q6NS38 UniProt:Q6NS38 ALKBH2
Proteins Q6P6C2 UniProt:Q6P6C2 ALKBH5
Proteins Q9C0B1 UniProt:Q9C0B1 FTO
Proteins Q9H1I8 UniProt:Q9H1I8 ASCC2
Proteins Q8N3C0 UniProt:Q8N3C0 ASCC3
Proteins Q8N9N2 UniProt:Q8N9N2 ASCC1
Proteins Q96Q83 UniProt:Q96Q83 ALKBH3'
# genenames <- 'Gene.Name'
# ddr <- c("MGMT","ALKBH2","ALKBH5","FTO","ASCC2","ASCC3","ASCC1","ALKBH3")
Gene.Name <- c("MGMT","ALKBH2","ALKBH5","FTO","ASCC2","ASCC3","ASCC1","ALKBH3")
DDR.list2 <- list(Gene.Name=Gene.Name)
#同样的方法分别构建translesion DNA synthesis (TLS) 和 and nucleotide pool maintenance (NP)的通路基因
#translesion DNA synthesis (TLS)
#由于TLS这个通路里面有32个基因,所以可以从文本里面导入。
#与参考博文不一致;参考博文是85个,他应该是把该条通路下面的所有基因汇总了。
#我觉得没有必要,上边的都没有汇总
TLS <- read.xlsx("TLS.xlsx", 1, header=F, colClasses=NA)[,-(1:2)]
TLS <- data.frame(TLS)
colnames(TLS) <- c("Gene.Name")
library(stringr)
TLS <- str_split(as.character(TLS$Gene.Name)," ",simplify = T)[,2]
TLS <- list(Gene.Name =TLS)
DDR.list3 <- TLS
#nucleotide pool maintenance (NP)
#这个没有找到非常匹配的
#按照提示,选取包括基因最多的通路DNA Repair
NP <- read.xlsx("NP.xlsx", 1, header=F, colClasses=NA)[,-(1:2)]
NP <- data.frame(NP)
colnames(NP) <- c("Gene.Name")
NP <- str_split(as.character(NP$Gene.Name)," ",simplify = T)[,2]
NP <- list(Gene.Name =NP)
DDR.list4 <- NP
class(NP)
merged.list <- c( DDRgene , DDR.list2, DDR.list3, DDR.list4)
#names(merged.list) <- c(1:9)
# 文章中276个基因和5条通路的交集
# overlap <- lapply(1:5, function(x){
# res <- intersect(gene$Gene_Symbol, DDRgene[[x]])
# })
intersect(res[-1, 2], DDRgene[[1]])
overlap <- lapply(1:9, function(x){
res <- intersect(res[-1, 2], merged.list[[x]])
})
Reduce(intersect,overlap) # 也没有overlap
library(ggplot2)
library(tidyverse)
count <- unlist(overlap) %>% table()
count <- as.data.frame(count) %>% base::subset(Freq > 1)
names(count)[1] <- "gene"
ggplot(count, aes(gene, Freq),size =1)+
geom_bar(stat="identity")+
coord_flip()+
theme_bw()
ggplot(count, aes(gene, Freq, colour=Freq, size =Freq))+
geom_point(stat="identity")+
coord_flip()+
theme_bw()
韦恩图
#没有生成图片
if(F){
library(UpSetR)
listinput <- list(dfgene = res[-1, 2],
BER = merged.list[[1]],
NER = merged.list[[2]],
MMR = merged.list[[3]],
HR = merged.list[[4]],
NHEJ = merged.list[[5]],
FA = merged.list[[6]],
NP = merged.list[[7]],
TLS = merged.list[[8]],
DR = merged.list[[9]])
pdf(file='upset.pdf',height = 8,width = 8)
p <- upset(fromList(listinput),nsets = 9, order.by = "freq")
dev.off()
}
单个基因/单个通路
keggLink("pathway", "hsa:7157" )#单个基因,比如TP53
png <- keggGet("path:hsa01522", "image")# 看下通路
t <- tempfile()
library(png)
writePNG(png, t)
if (interactive()) browseURL(t)
注意:这个任务中最关键的是如何确定九条信号通路中有哪些基因,但是各个数据库并不一致,这是造成与作者原图有出路的主要原因,作者在补充材料表格3中有276个基因的分类,可以参考。
参考文献
Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlas
KEGG学习笔记
简介Bioconductor中的几个注释信息数据库