昨天上完课没有整理完,今天继续
9.之前用的参考基因组不是文章中所用的需要进一步优化
进行基因组的坐标转换,这一步真的有点难,转换的方式比较多可以参考生信菜鸟团的博文https://cloud.tencent.com/developer/article/1054520
暂时先用曾老师代码,尝试理解,后面再深入学习
library(rtracklayer)
path='dm3ToDm6.over.chain'
ch = import.chain(path)
ch
require(ChIPseeker)
require(TxDb.Dmelanogaster.UCSC.dm3.ensGene )
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
require(clusterProfiler)
library(ChIPpeakAnno)
peak_dm3 <- readPeakFile('Pho_WT.narrowPeak.bed')
seqlevelsStyle(peak_dm3) = "UCSC" # necessary
peak_dm6 = liftOver(peak_dm3, ch)
class(peak_dm6)
peak_dm3
peak_dm6
他在演示的时候这个R脚本运行较慢,最后用shell脚本完成相关内容,还需要在消化一下
10. 再次重画韦恩图根据原文查看分别在H3K27me3 domain
内外的peaks
rm(list=ls())
require(ChIPseeker)
require(TxDb.Dmelanogaster.UCSC.dm3.ensGene )
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
require(clusterProfiler)
(bedFiles=list.files(pattern = '*dm6'))
bedFiles=bedFiles[-2]
library(ChIPpeakAnno)
fs=lapply(bedFiles,function(x){
peak <- readPeakFile( x )
keepChr= !grepl('Het',seqlevels(peak))
seqlevels(peak, pruning.mode="coarse") <- seqlevels(peak)[keepChr]
peak
})
library(stringr)
str_split(bedFiles,'_',simplify = T)[,1]
H3K27 <- readPeakFile('H3K27_peaks.narrowPeak')
keepChr= seqlevels(H3K27) %in% c('2L','2R','3L','3R','4','X','Y','M')
seqlevels(H3K27, pruning.mode="coarse") <- seqlevels(H3K27)[keepChr]
seqlevels(H3K27, pruning.mode="coarse") <- paste0('chr',seqlevels(H3K27))
seqlevels(H3K27)
tmp1=findOverlapsOfPeaks(fs[[1]], H3K27)
tmp2=findOverlapsOfPeaks(fs[[2]], H3K27)
tmp3=findOverlapsOfPeaks(fs[[3]], H3K27)
ol <- findOverlapsOfPeaks(tmp1$peaklist$`fs..1..///H3K27`,
tmp2$peaklist$`fs..2..///H3K27`,
tmp3$peaklist$`fs..3..///H3K27`)
png('3_factors_overlapVenn_in_domain.png')
makeVennDiagram(ol,
NameOfPeaks=str_split(bedFiles,'_',simplify = T)[,1],
TxDb=txdb)
dev.off()
lapply(1:7,function(i){
gr=ol$peaklist[[i]]
dat=as.data.frame(gr)[,1:3]
dat[,1]=gsub('chr','',dat[,1])
write.table(dat,sep = '\t',
col.names = F,file = paste0('G',i,'_in.bed')
,quote = F,row.names = F)
})
ol <- findOverlapsOfPeaks(tmp1$peaklist$fs..1..,
tmp2$peaklist$fs..2..,
tmp3$peaklist$fs..3..)
png('3_factors_overlapVenn_not_domain.png')
makeVennDiagram(ol,
NameOfPeaks=str_split(bedFiles,'_',simplify = T)[,1],
TxDb=txdb)
dev.off()
lapply(1:7,function(i){
gr=ol$peaklist[[i]]
dat=as.data.frame(gr)[,1:3]
dat[,1]=gsub('chr','',dat[,1])
write.table(dat,sep = '\t',
col.names = F,file = paste0('G',i,'_out.bed')
,quote = F,row.names = F)
})
生成2个新的韦恩图,并且得到overlap后的peaks数据集的bed文件
这一步的完成需要上一步得到的转换参考基因组版本后的bed文件,这个内容需要再练习一下
12. 对之前的差异分析提取感兴趣的基因进行注释
load(file = 'deg_output.Rdata')
library(ggpubr)
colnames(DEG_PhoKO)
DEG_PhoKO$log=log(DEG_PhoKO$baseMean+1)
DEG_PhoKO$change=ifelse(DEG_PhoKO$padj>0.05,'stable',
ifelse(DEG_PhoKO$log2FoldChange > 0,'up','down'))
table(DEG_PhoKO$change)
ggscatter(DEG_PhoKO,x="log" ,y="log2FoldChange",color = 'change')
DEG_SppsKO$log=log(DEG_SppsKO$baseMean+1)
DEG_SppsKO$change=ifelse(DEG_SppsKO$padj>0.05,'stable',
ifelse(DEG_SppsKO$log2FoldChange > 0,'up','down'))
table(DEG_SppsKO$change)
ggscatter(DEG_SppsKO,x="log" ,y="log2FoldChange",color = 'change')
library(UpSetR)
SppsKO_up=rownames(DEG_SppsKO[DEG_SppsKO$change=='up',])
SppsKO_down=rownames(DEG_SppsKO[DEG_SppsKO$change=='down',])
PhoKO_up=rownames(DEG_PhoKO[DEG_PhoKO$change=='up',])
PhoKO_down=rownames(DEG_PhoKO[DEG_PhoKO$change=='down',])
allG=unique(c(SppsKO_up,SppsKO_down,PhoKO_up,PhoKO_down))
df=data.frame(allG=allG,
SppsKO_up=as.numeric(allG %in% SppsKO_up),
SppsKO_down=as.numeric(allG %in% SppsKO_down),
PhoKO_up=as.numeric(allG %in% PhoKO_up),
PhoKO_down=as.numeric(allG %in% PhoKO_down))
upset(df)
colnames(df)
# "SppsKO_up" "SppsKO_down" "PhoKO_up" "PhoKO_down"
SppsKO_up_uniq=df[df[,2]==1 & df[,3]==0 & df[,4]==0 & df[,5]==0 ,1]
SppsKO_down_uniq=df[df[,2]==0 & df[,3]==1 & df[,4]==0 & df[,5]==0 ,1]
PhoKO_up_uniq=df[df[,2]==0 & df[,3]==0 & df[,4]==1 & df[,5]==0 ,1]
PhoKO_down_uniq=df[df[,2]==0 & df[,3]==0 & df[,4]==0 & df[,5]==1 ,1]
SppsKO_up_PhoKO_up=df[df[,2]==1 & df[,3]==0 & df[,4]==1 & df[,5]==0 ,1]
SppsKO_down_PhoKO_down=df[df[,2]==0 & df[,3]==1 & df[,4]==0 & df[,5]==1 ,1]
library(org.Dm.eg.db)
t1=toTable(org.Dm.egCHRLOC)
t2=toTable(org.Dm.egCHRLOCEND)
t3=toTable(org.Dm.egSYMBOL)
# 作业
require(TxDb.Dmelanogaster.UCSC.dm6.ensGene )
txdb <- TxDb.Dmelanogaster.UCSC.dm6.ensGene
gs=as.data.frame(genes(txdb))
library(org.Dm.eg.db)
t100=toTable(org.Dm.egFLYBASE)
t101=toTable(org.Dm.egSYMBOL)
t200=merge(t100,t101,by='gene_id')
colnames(gs)[6]="flybase_id"
t300=merge(t200,gs,by="flybase_id")
SppsKO_up_uniq
SppsKO_up_bed=t300[match(SppsKO_up_uniq[SppsKO_up_uniq %in% t300$symbol ],
t300$symbol),4:6]
SppsKO_down_uniq
SppsKO_down_bed=t300[match(SppsKO_down_uniq[SppsKO_down_uniq %in% t300$symbol ],
t300$symbol),4:6]
这个作业是参考上面的代码完成peaks查找对应基因进行的注释并做下有分析
13. 在上一步得到的基因集加载进来进行kegg
注释
rm(list = ls())
load(file = '6-genesets.Rdata')
library(ggplot2)
library(clusterProfiler)
library(org.Dm.eg.db)
kegg_anno <- function(gs,pro){
#pro='SppsKO_up_uniq'
#gs=SppsKO_up_uniq
gene_up= bitr(gs, fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Dm.eg.db)[,2]
kk.up <- enrichKEGG(gene = gene_up,
organism = 'dme',
keyType = 'ncbi-geneid',
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.up)[,1:6]
kk=kk.up
dotplot(kk)
ggsave(paste0(pro,'_kk.png'))
kk=DOSE::setReadable(kk, OrgDb='org.Dm.eg.db',keyType='ENTREZID')
write.csv(kk@result,paste0(pro,'_kk.up.csv'))
}
l=list(SppsKO_up_uniq,SppsKO_down_uniq,
PhoKO_up_uniq, PhoKO_down_uniq,
SppsKO_up_PhoKO_up,SppsKO_down_PhoKO_down)
names(l)=c('SppsKO_up_uniq','SppsKO_down_uniq',
'PhoKO_up_uniq', 'PhoKO_down_uniq',
'SppsKO_up_PhoKO_up','SppsKO_down_PhoKO_down')
lapply(1:length(l), function(i){
kegg_anno(l[[i]],names(l)[i])
})
library(ggplot2)
library(clusterProfiler)
library(org.Dm.eg.db)
gcSample=lapply(l, function(x){
return( bitr(x, fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Dm.eg.db)[,2] )
})
xx <- compareCluster(gcSample, fun="enrichKEGG",
keyType = 'ncbi-geneid',
organism="dme", pvalueCutoff=0.05)
dotplot(xx)
注:这篇文章可以挖掘的地方进行peaks的通路的注释,也可以根据peaks对应的promoter区对应的调控基因进行注释和下游分析。
本次课程内容很丰富,关于相应分析软件的使用不熟练,需要进一步练习。今天下载了文章原文,后面就开始读文献,好好理解一下周末课程的内容争取消化掉。