新的任务:找到这 276 个基因所涵盖的九条主要 DDR 通路(276 genes encompassing nine major DDR pathways):
base excision repair (BER),
nucleotide excision repair (NER),
mismatch repair (MMR),
the Fanconi anemia (FA) pathway,
homology-dependent recombination (HR),
non-homologous DNA end joining (NHEJ),
direct damage reversal repair (DR),
translesion DNA synthesis (TLS),
nucleotide pool maintenance (NP)
然后搞清楚每个基因出现在多少个通路,跟上次的任务比较像。基因列表来自文章 Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlas: https://www.cell.com/cell-reports/pdf/S2211-1247(18)30437-6.pdf
文章在线网址: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5961503/
276个基因在补充材料表格1,截图如下:
根据Jimmy老师提示,KEGGREST包含KEGG通路里的所有基因,那么反过来(由基因查找它所在的通路)理论上也是可行的。所以先查找KEGGREST包的说明,了解这个包的使用。
# Session setup: start from an empty workspace, switch messages to English and
# make sure every package used further down is installed.
# NOTE(review): rm(list = ls()) wipes the caller's workspace; it is kept only
# because the original script starts this way.
rm(list = ls())
Sys.setenv(LANGUAGE = "en") # show error messages in English
options(stringsAsFactors = FALSE) # never convert character columns to factors
{ # install.packages
  # install.packages() takes its mirror from getOption("repos"); a top-level
  # "CRAN" option is never consulted, so the mirror is registered via `repos`.
  options(repos = c(CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
  options(BioC_mirror = "https://mirrors.ustc.edu.cn/bioc/")

  # requireNamespace() only tests for availability; each section below attaches
  # what it needs with library(). "png" is included because the pathway-image
  # section calls library(png).
  cran_pkgs <- c("xlsx", "ggplot2", "tidyverse", "png", "BiocManager")
  for (pkg in cran_pkgs) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
  }

  # `update` and `ask` are BiocManager::install() arguments: do not touch
  # already-installed packages and do not prompt.
  bioc_pkgs <- c("KEGGREST", "clusterProfiler", "UpSetR")
  for (pkg in bioc_pkgs) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      BiocManager::install(pkg, update = FALSE, ask = FALSE)
    }
  }
}
# Load the data: supplementary Table 1 of the paper.
library("xlsx")
# Read sheet 1 without a header row, drop the first two spreadsheet rows and
# keep the first two columns. Later sections use column 1 as KEGG gene ids
# (prefixed with "hsa:") and column 2 as gene symbols.
# NOTE(review): every later use indexes `res[-1, ]`, i.e. skips one more row;
# confirm against the spreadsheet that this row is not a real gene.
res <- read.xlsx(
  "1-s2.0-S2211124718304376-mmc2.xlsx", 1,
  header = FALSE, colClasses = NA
)[-(1:2), 1:2]
这里可以查看差异基因在哪些KEGG通路
library(KEGGREST)
listDatabases()
# Build KEGG gene identifiers of the form "hsa:<id>" from column 1 of `res`.
# Spaces are stripped from the ids first, then the prefix is attached with
# paste0() so no separator has to be removed afterwards.
res1 <- gsub(" ", "", res[-1, 1], fixed = TRUE)
res2 <- paste0("hsa:", res1)
# Look up the pathways linked to each gene identifier.
res3 <- keggLink("pathway", res2)
res3
开始进行分析
# KEGG pathway ids whose member genes are fetched; the UpSet section further
# down labels them, in this order, BER, NER, MMR, HR, NHEJ and FA.
DDR.list <- list("hsa03410", "hsa03420", "hsa03430", "hsa03440", "hsa03450", "hsa03460")
# Extract the genes of each pathway in one pass.
# Result: a list with one character vector per entry of DDR.list.
DDRgene <- lapply(DDR.list, function(i) {
  # i = "hsa03410"
  d1 <- KEGGREST::keggGet(i)[[1]]$GENE
  # Every second element of GENE is read as a "SYMBOL; description" string.
  # Building the index from the length keeps this safe when GENE is missing or
  # empty (seq(2, 0, 2) would raise an error).
  symbol_idx <- seq_len(length(d1) %/% 2L) * 2L
  # Keep only the text before the first ";".
  vapply(
    d1[symbol_idx],
    function(entry) strsplit(entry, ";")[[1]][1],
    character(1),
    USE.NAMES = FALSE
  )
})
# Three pathways could not be found in KEGG. Their gene lists were collected
# from other databases (Reactome / WikiPathways / SMPDB / BioCarta Pathway /
# Pathway Commons), so they differ somewhat from the paper.
# Each CSV is read into a list holding one element per column; the console
# output recorded below shows a single "Gene.Name" column per file.
DDR.list2 <- as.list(read.csv("NP.csv"))
DDR.list2
# $Gene.Name
# [1] "NUDT1" "NUDT15" "NUDT18" "RRM1" "RRM2"
DDR.list3 <- as.list(read.csv("TLS.csv"))
DDR.list3
# $Gene.Name
# [1] "REV3L" "SPRTN" "DDX11" "TACC3" "NUP160" "SPDL1" "PARP3"
# [8] "TPR" "POLQ" "EIF2AK2" "NDC1" "ZW10" "AURKA" "TTC28"
# [15] "POLE2" "NIN" "CEP128" "PABPN1" "PABPC1L" "RAE1" "CDC25B"
# [22] "FAM83D" "POLI" "STAG2" "EMD" "BEX4" "PARP4" "RPA3"
# [29] "POLE4" "MAD2L2" "ADPRHL2" "CDC20" "NEK2" "CENPF" "RPA2"
# [36] "POLK" "DLGAP5" "PARP2" "RPA1" "VPS4A" "PCNA" "REV1"
# [43] "FLNB" "CKAP2" "NUPL2" "ODF2" "CDK5RAP2" "NUMA1" "MAPKBP1"
# [50] "TUBGCP4" "RMDN3" "PPM1B" "ACTR2" "KIF11" "KIF20B" "PARP9"
# [57] "EIF4A3" "PARP1" "POLE3" "TUT1" "DSN1" "LATS2" "UBC"
# [64] "TIPARP" "RCHY1" "TOPBP1" "POC1A" "MAD2L1" "CEP44" "PAPD4"
# [71] "VCP" "UBB" "NUDCD2" "POLH" "USP32" "WDR73" "POLE"
# [78] "CETN1" "UBE2N" "CALML3" "CALML5" "PARP10" "PARPBP" "BRCC3"
# [85] "PARG"
DDR.list4 <- as.list(read.csv("DR.csv"))
DDR.list4
# $Gene.Name
# [1] "ALKBH5" "ASCC2" "ASCC3" "ASCC1" "FTO" "ALKBH3" "MGMT" "ALKBH2"
# Concatenate the KEGG-derived gene sets with the three CSV-derived ones, in
# the order KEGG pathways, NP, TLS, DR.
merged.list <- c(DDRgene, DDR.list2, DDR.list3, DDR.list4)
#names(merged.list) <- c(1:9)
# Gene symbols shared by the paper's list and the first KEGG pathway.
intersect(res[-1, 2], DDRgene[[1]])
# For each pathway, the symbols of the paper's list that belong to it.
# Iterating with seq_along() follows the real length of merged.list instead of
# assuming exactly nine pathways, and the result is returned directly rather
# than assigned to a local that shadows the global `res`.
overlap <- lapply(seq_along(merged.list), function(x) {
  intersect(res[-1, 2], merged.list[[x]])
})
# Genes present in every pathway (the author found none here either).
Reduce(intersect, overlap)
library(ggplot2)
library(tidyverse)
# Tally how many pathway overlaps each gene appears in and keep only the genes
# that occur in more than one.
count <- as.data.frame(table(unlist(overlap)))
count <- count[count$Freq > 1, ]
names(count)[1] <- "gene"
# Dot plot with genes on the vertical axis; point colour and size both encode
# the number of pathways.
ggplot(count, aes(x = gene, y = Freq, colour = Freq, size = Freq)) +
  geom_point(stat = "identity") +
  coord_flip()
做个 UpSet 图(集合较多时用来代替韦恩图)
library(UpSetR)
# Named gene sets for the UpSet plot: the paper's gene symbols plus the nine
# pathway gene sets, labelled by their position in merged.list.
listinput <- list(
  dfgene = res[-1, 2],
  BER = merged.list[[1]],
  NER = merged.list[[2]],
  MMR = merged.list[[3]],
  HR = merged.list[[4]],
  NHEJ = merged.list[[5]],
  FA = merged.list[[6]],
  NP = merged.list[[7]],
  TLS = merged.list[[8]],
  DR = merged.list[[9]]
)
pdf(file = "upset.pdf", height = 8, width = 8)
# nsets follows the number of input sets so none of them is left out of the
# figure (a fixed 9 cannot show all 10 sets).
p <- upset(fromList(listinput), nsets = length(listinput), order.by = "freq")
# An assigned plot object is not drawn automatically; print it explicitly so
# the PDF device actually receives the figure.
print(p)
dev.off()
单个基因/单个通路
# Pathways linked to a single gene, e.g. TP53.
keggLink("pathway", "hsa:7157")
# Download the picture of one pathway.
png <- keggGet("path:hsa01522", "image")
# Give the temporary file a .png extension so the viewer opened by browseURL()
# can recognise it as an image.
t <- tempfile(fileext = ".png")
library(png)
writePNG(png, t)
# Only try to open a viewer in an interactive session.
if (interactive()) browseURL(t)
做个富集分析
library(clusterProfiler)
# KEGG over-representation analysis on the ids in column 1 of `res`, using
# human ("hsa") annotation and Benjamini-Hochberg adjusted p-values.
kegg.result <- enrichKEGG(
  gene = res[-1, 1],
  organism = "hsa",
  keyType = "kegg",
  pvalueCutoff = 0.05,
  pAdjustMethod = "BH",
  qvalueCutoff = 0.1
)
# Show the enrichment result as a bar plot and as a dot plot.
barplot(kegg.result)
dotplot(kegg.result)
# Save the enrichment table.
write.csv(kegg.result, "kegg.csv")
注意:这个任务中最关键的是如何确定九条信号通路中分别有哪些基因,但是各个数据库并不一致,这是造成结果与作者原图有出入的主要原因。作者在补充材料表格3中给出了276个基因的分类,可以参考。
特别感谢内蒙古生信菜鸟团的帖子《寻找DDR通路中关键基因》,部分代码参考了他写的帖子!
参考文献
Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlas
KEGG学习笔记
简介Bioconductor中的几个注释信息数据库
生信技能树公益视频合辑:学习顺序是linux,r,软件安装,geo,小技巧,ngs组学!
B站链接:https://m.bilibili.com/space/338686099
YouTube链接:https://m.youtube.com/channel/UC67sImqK7V8tSWHMG8azIVA/playlists
生信工程师入门最佳指南:https://mp.weixin.qq.com/s/vaX4ttaLIa19MefD86WfUA
学徒培养:https://mp.weixin.qq.com/s/3jw3_PgZXYd7FomxEMxFmw
生信技能树 - 简书 https://www.jianshu.com/u/d645f768d2d5
https://www.wikipathways.org/index.php/Pathway:WP1928
https://www.cnblogs.com/muchen/p/5412278.html