一、需要的数据
(1)eggnog对基因的注释(名字例如叫:egg.tsv)
TSV格式
(2)ko00001.json文件
下载地址:
https://www.genome.jp/kegg-bin/get_htext?ko00001
(3)目的基因集
二、需要的R包
rio、stringr、tidyverse、clusterprofiler
三、构建过程
1.导入注释文件到R
options(stringsAsFactors = F)
egg<-rio::import("egg.tsv")
2.把注释文件里的空值改为NA
egg[egg==""]<-NA
3.从注释文件里把基因与KEGG提取出来:
gene2ko <- egg %>%
dplyr::select(GID = query_name, KO = KEGG_ko) %>%
na.omit()
4.将KO行中有多个值的拆分为多行
all_ko_list=str_split(gene2ko$KO,",")
gene2ko <- data.frame(GID=rep(gene2ko$GID,times=sapply(all_ko_list,length)),KO=unlist(all_ko_list))
5.将gene2ko中,KO列的"ko:"去掉
gene2ko$KO=str_replace(gene2ko$KO,"ko:","")
6.对json文件操作
if(!file.exists('kegg_info.RData')){
library(jsonlite)
library(purrr)
library(RCurl)
update_kegg <- function(json = "ko00001.json",file=NULL) {
pathway2name <- tibble(Pathway = character(), Name = character())
ko2pathway <- tibble(Ko = character(), Pathway = character())
kegg <- fromJSON(json)
for (a in seq_along(kegg[["children"]][["children"]])) {
A <- kegg[["children"]][["name"]][[a]]
for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
B <- kegg[["children"]][["children"]][[a]][["name"]][[b]]
for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
kos <- str_match(kos_info, "K[0-9]*")[,1]
ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))
}
}
}
save(pathway2name, ko2pathway, file = file)
}
update_kegg(json = "ko00001.json",file="kegg_info.RData")
}
产生一个叫kegg_info.RData的文件
7.加载上一步创建的文件
load("kegg_info.RData")
出现这两个变量
分别是这样:
ko2pathway
pathway2name
8.将ko2pathway的列名,由Ko,Pathway,改为KO,Pathway
colnames(ko2pathway)=c("KO",'Pathway')
9.创建 Term2gene
Term2gene <- gene2ko %>%left_join(ko2pathway, by = "KO") %>%dplyr::select(Pathway,GID) %>%na.omit()
四.enrichr分析
library(clusterProfiler)
keggS7 <- enricher(gene=X557VS550All_resultsfiler$X1,pvalueCutoff = 0.05,pAdjustMethod = "BH",TERM2GENE = Term2gene,TERM2NAME = pathway2name)
gene目的基因集、Term2gene第9步、pathway2name由第7步创建
务必 目的基因集的基因ID和注释文件的基因ID一致
五、简单画图
barplot(keggS7)
dotplot(keggS7)
参考:
详细回顾非模式物种注释构建过程