单细胞测序-EWCE包

本文主要目的是确定筛选的差异基因主要富集在哪些细胞中，可查看官方protocol https://github.com/NathanSkene/EWCE

install.packages("devtools")
library(devtools)
install_github("nathanskene/ewce")
#若安装ewce时报错，请看[https://www.jianshu.com/p/40676b44a1b2](https://www.jianshu.com/p/40676b44a1b2)
#，亲测有效
library(EWCE)
library(ggplot2)
library(cowplot)
library(limma)
library(readxl)

首先，确定ctd数据集

#.Loading datasets
data("cortex_mrna")  # a list that contain an expression matrix and an annotation data frame
View(cortex_mrna)
cortex_mrna[[1]][1:10,1:5]
cortex_mrna[[2]][1:10,1:10]

image.png

由此，我们可以根据上面的数据格式导入我们自己的数据，即exp就是一个数据表达矩阵，而annot就是一个注释矩阵，这个矩阵不需要像文中example那么多，但须要纳入"cell_id", "level1class" and "level2class"

其次，对于公开可用的转录组数据集来说，使用不正确的基因符号是非常常见的(通常一些基因名称在Excel中打开时会出现错误)。因此这个包提供了一个函数来纠正这类错误。第一次运行它时，需要从MGI下载MRK_List2文件，该文件列出了MGI基因符号的已知同义词。建议在所有输入数据集上运行此操作。

if(!file.exists("MRK_List2.rpt")){
    download.file("http://www.informatics.jax.org/downloads/reports/MRK_List2.rpt", destfile="MRK_List2.rpt")
}
cortex_mrna$exp = fix.bad.mgi.symbols(cortex_mrna$exp,mrk_file_path="MRK_List2.rpt")

##若为人，则下载这个：
##若为人，
if(!file.exists("HGNC_homologene.rpt")){
  download.file("http://www.informatics.jax.org/downloads/reports/HGNC_homologene.rpt", destfile="HGNC_homologene.rpt")
}
cortex_mrna$exp = fix.bad.mgi.symbols(cortex_mrna$exp,mrk_file_path="HGNC_homologene.rpt")

第三、Calculate specificity matrices

# Generate celltype data for just the cortex/hippocampus data
exp_CortexOnly_DROPPED = drop.uninformative.genes(exp=cortex_mrna$exp,level2annot = cortex_mrna$annot$level2class)
annotLevels = list(level1class=cortex_mrna$annot$level1class,level2class=cortex_mrna$annot$level2class)
ctd = generate.celltype.data(exp=exp_CortexOnly_DROPPED,annotLevels=annotLevels,groupName="kiCortexOnly")
print(ctd)

##最后一步是必要的，如果你计划比较人类数据，即由人类遗传学产生的基因集；这里这么说我推测可能是因为提供的example数据集是动物的
#ctd = filter.genes.without.1to1.homolog(fNames_CortexOnly)
#print(ctd)

load(fNames_CortexOnly[1])

image.png

第四步. Application to genetic data

data("example_genelist")
print(example_genelist)

image.png

由此，这里就可以替换成我们自己的差异基因了

###若为小鼠
data("mouse_to_human_homologs")
m2h = unique(mouse_to_human_homologs[,c("HGNC.symbol","MGI.symbol")])
mouse.hits = unique(m2h[m2h$HGNC.symbol %in% example_genelist,"MGI.symbol"])
#mouse.bg  = unique(setdiff(m2h$MGI.symbol,mouse.hits))
mouse.bg  = unique(m2h$MGI.symbol)

image.png

##若为人，则human.hit and human.bg的建立方法
data("mouse_to_human_homologs")
#mouse.bg  = unique(setdiff(m2h$MGI.symbol,mouse.hits))
human.hits<- example_genelist
print(human.hits)
human.bg = unique(c(human.hits,m2h$HGNC.symbol))
length(human.bg)
head(human.bg)

image.png

reps=1000 # <- Use 1000 bootstrap lists so it runs quickly, for publishable analysis use >10000
level=1 # <- Use level 1 annotations (i.e. Interneurons)

# Bootstrap significance testing, without controlling for transcript length and GC content
full_results = bootstrap.enrichment.test(sct_data=ctd,hits=mouse.hits,bg=mouse.bg,
                                reps=reps,annotLevel=level)
print(full_results$results[order(full_results$results$p),3:5][1:6,])
print(ewce.plot(full_results$results,mtc_method="BH"))

image.png

如果您想查看列表中每个基因的富集特性，那么generate.bootstrap。应该使用plot函数。这将这些绘图保存到BootstrapPlots文件夹中。

generate.bootstrap.plots(sct_data=ctd,hits=mouse.hits,bg=mouse.bg,reps=100,annotLevel=1,full_results=full_results,listFileName="VignetteGraphs")

image.png

第五、若数据为转录组数据，那么

data(tt_alzh)
tt_results = ewce_expression_data(sct_data=ctd,tt=tt_alzh,annotLevel=1,ttSpecies="human",sctSpecies="mouse")
ewce.plot(tt_results$joint_results)

image.png

这主要是阅读官方protocol后自己的一点总结，若有不同意见，欢迎大家提出来一起探讨。