本分析从HISAT2比对的counts开始,先整合所有的counts数据,再获取再半数以上样本中CPM>1的基因,再进行差异分析,pathway分析,并并制作GSEA的input文件
suppressMessages(library(DESeq2))
suppressMessages(library(humanid))#参考Jimmy教程,安装这个包
suppressMessages(library(limma))
suppressMessages(library(ggplot2))
suppressMessages(library(pathview))
suppressMessages(library(topGO))
suppressMessages(library(clusterProfiler))
suppressMessages(library(enrichplot))
suppressMessages(library(DOSE))
suppressMessages(library(org.Hs.eg.db))
setwd("~/Fcounts")
##########Combind data############
prefix<-"Your file names"
mytsvfile = list.files(pattern="*.tsv")
list2env(
lapply(setNames(mytsvfile, make.names(gsub("*.tsv$", "", mytsvfile))),
read.table,header=T,row.names=1,check.names=FALSE,skip = 1), envir = .GlobalEnv)
files<-unlist(lapply(mytsvfile, FUN = function(x) {return(strsplit(x, split = ".tsv")[[1]][1])}))
data<-as.matrix(get(files[1])[,6])
rownames(data)<-row.names(get(files[1]))
colnames(data)<-files[1]
info<-get(files[1])[,1:5]
##########get gene whose CPM>1 in more than half samples##########
for (i in files[-1]){
tmp<-as.matrix(get(i)[,6])
rownames(tmp)<-row.names(get(i))
colnames(tmp)<-i
data<-cbind(data,tmp)
}
data<-cbind(info,data)
meta_data<-data[,1:5]
counts<-data[,6:ncol(data)]
colnames(counts)<-files2
data<-cbind(meta_data,counts)
write.csv(data,"BC_featureCounts_counts.csv")
rt <- as.matrix(t(t(counts)/colSums(counts) * 1000000)) #this is CPM values
write.csv(rt,"BC_CPM.csv")
MyFunction<-function(x){
ifelse(x>1,1,0)
}
#设置一个计数函数
keep_genes<-vector()#设置一个空的变量储存基因
row_names<-row.names(rt)
for (i in row_names){
cpm<-rt[i,]
count<-apply(as.matrix(cpm),2,MyFunction)
a<-sum(count)
if (a>=ncol(rt)/2)
keep_genes<-c(keep_genes,i)
}
new_counts<-counts[keep_genes,]
new_cpm<-rt[keep_genes,]
#-----TPM Calculation------
avg_cpm <- data.frame(avg_cpm=rowMeans(new_cpm))
kb <- meta_data$Length / 1000
rpk <- new_counts / kb
tpm <- t(t(rpk)/colSums(rpk) * 1000000)
avg_tpm <- data.frame(avg_tpm=rowMeans(tpm))
write.csv(new_counts,paste0(prefix,"_new_counts_CPM1.csv"))
write.csv(new_cpm,paste0(prefix,"_cpm_CPM1.csv"))
write.csv(avg_tpm,paste0(prefix,"_avg_tpm_CPM1.csv"))
write.csv(avg_cpm,paste0(prefix,"_avg_cpm_CPM1.csv"))
write.csv(tpm,paste0(prefix,"_tpm_CPM1.csv"))
write.csv(cpm,paste0(prefix,"_cpm_CPM1.csv"))
###########DEG analysis############
group_list<-c(rep("NC",3),rep("OE",3))
colData <- data.frame(row.names=colnames(new_counts), group_list=group_list)
dds1 <- DESeqDataSetFromMatrix(countData = new_counts,
colData = colData,
design = ~ group_list)
suppressMessages(dds <- DESeq(dds1))
res <-results(dds, contrast=c("group_list","OE","NC"))
resOrdered <- res[order(res$padj),]
resOrdered=as.data.frame(resOrdered)
sigoutTab1<-resOrdered[abs(resOrdered$log2FoldChange)>1 & resOrdered$padj<0.05,]
sigoutTab2<-resOrdered[abs(resOrdered$log2FoldChange)>0.5849625 & resOrdered$padj<0.05,]
write.csv(resOrdered,paste0(prefix,"_DESeq2_results.csv"))
write.csv(sigoutTab1,paste0(prefix,"_FoldChange_2_DESeq2_results.csv"))
write.csv(sigoutTab2,paste0(prefix,"_FoldChange_1.5_DESeq2_results.csv"))
##########Pathway#########
genes1<-unlist(lapply(row.names(sigoutTab1), FUN = function(x) {return(strsplit(x, split = ".",fixed = T)[[1]][1])}))
gene_list1<-select(org.Hs.eg.db, keys=as.character(genes1), columns=c("SYMBOL","ENTREZID"), keytype="ENSEMBL") #keytype是你输入基因编号类型,columns是你输出对基因编号类型,基因怎么导入不再赘述。
genes2<-unlist(lapply(row.names(sigoutTab2), FUN = function(x) {return(strsplit(x, split = ".",fixed = T)[[1]][1])}))
gene_list2<-select(org.Hs.eg.db, keys=as.character(genes2), columns=c("SYMBOL","ENTREZID"), keytype="ENSEMBL") #keytype是你输入基因编号类型,columns是你输出对基因编号类型,基因怎么导入不再赘述。
entrezIDs1 <- as.character(gene_list1[,3])
entrezIDs2 <- as.character(gene_list2[,3])
go1 <- enrichGO(entrezIDs1, OrgDb = "org.Hs.eg.db", ont="all")
write.csv(as.data.frame(go1@result), file=paste0(prefix,"_go_all_all_FoldChange_2.csv"))
kk1 <- enrichKEGG(gene = entrezIDs1,
organism="human",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
minGSSize = 1,
#readable = TRUE ,
use_internal_data =FALSE)
write.csv(as.data.frame(kk1@result), file=paste0(prefix,"_kegg_all_FoldChange_2.csv"))
tiffFile<-paste0(prefix,"_FoldChange_2_go_all_dotplot.tiff")
p1<-dotplot(go1, split="ONTOLOGY") + facet_grid(ONTOLOGY~., scale="free")
tiff(file=tiffFile,width = 1000,height = 600)
plot(p1)
dev.off()
tiff(file=paste0(prefix,"_FoldChange_2_KEGG_barplot.tiff"),width = 20,height = 20,units ="cm",compression="lzw",bg="white",res=300)
g1<-barplot(kk1, drop = TRUE, showCategory = 12)
plot(g1)
dev.off()
#??ͼ
tiff(file=paste0(prefix,"_FoldChange_2_KEGG_dotplot.tiff"),width = 20,height = 20,units ="cm",compression="lzw",bg="white",res=300)
gg1<-dotplot(kk1)
plot(gg1)
dev.off()
kk2 <- enrichKEGG(gene = entrezIDs2,
organism="human",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
minGSSize = 1,
#readable = TRUE ,
use_internal_data =FALSE)
write.csv(as.data.frame(kk2@result), paste0(prefix,"_kegg_all_FoldChange_1.5.csv"))
tiffFile<-paste0(prefix,"_FoldChange_1.5_go_all_dotplot.tiff")
p2<-dotplot(go2, split="ONTOLOGY") + facet_grid(ONTOLOGY~., scale="free")
tiff(file=tiffFile,width = 1000,height = 600)
plot(p2)
dev.off()
tiff(file=paste0(prefix,"_FoldChange_1.5_KEGG_barplot.tiff"),width = 20,height = 20,units ="cm",compression="lzw",bg="white",res=300)
g2<-barplot(kk2, drop = TRUE, showCategory = 12)
plot(g2)
dev.off()
#??ͼ
tiff(file=paste0(prefix,"_FoldChange_1.5_KEGG_dotplot.tiff"),width = 20,height = 20,units ="cm",compression="lzw",bg="white",res=300)
gg2<-dotplot(kk2)
plot(gg2)
dev.off()
############get the annotation form gtf files#######
#creat .csv from gtf files very easy得到human_v0.0.3.csv
#参考这个https://www.jianshu.com/p/e1f3e5abb44f
ann<-read.csv("~/human_v0.0.3.csv",header = T,row.names = 1,check.names = F)
ann_tmp<-ann[,c("gene_name","gene_type")]
tpm_ann<-cbind(ann_tmp[row.names(tpm),],tpm)
tpm_ann<-as.matrix(tpm_ann[!duplicated(tpm_ann$gene_name),])
row.names(tpm_ann)<-as.character(tpm_ann[,1])
new_tpm<-tpm_ann[,3:ncol(tpm_ann)]
write.csv(new_tpm,paste0(prefix,"_tpm_CPM1_with_annotation.csv"))
createGSEAinput(prefix = prefix, exprSet=new_tpm,group_list=group_list)
##########