day9-1 单细胞数据的二次分群
1.背景
Seurat里的FindClusters函数设置的resolution数值越大,分群的数量就越多,但是当单细胞数量太多的时候,会遇到resolution再变大,分群的数量也不再增加的情况。一次分群分不开,但又想要细分时,就会需要二次分群。
2.加载数据
rm(list = ls())
library(Seurat)
library(dplyr)
load("sce.Rdata")
seu.obj = sce
head(seu.obj@meta.data)
p1 = DimPlot(seu.obj, reduction = "umap",label=T)+NoLegend()
p1
3.二次分群
这里以成纤维细胞为例进行二次分群。想要切换别的细胞类型直接修改下面的my_sub即可。
核心就是提取感兴趣的亚群的细胞组成的Seurat对象,后面就是标准流程和可视化了,没有任何区别。
my_sub = "Fibroblasts"
sub.cells <- subset(seu.obj, idents = my_sub)
f = "obj.Rdata"
if(!file.exists(f)){
sub.cells = sub.cells %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData(features = rownames(.)) %>%
RunPCA(features = VariableFeatures(.)) %>%
FindNeighbors(dims = 1:15) %>%
FindClusters(resolution = 0.5) %>%
RunUMAP(dims = 1:15)
save(sub.cells,file = f)
}
load(f)
DimPlot(sub.cells, reduction = 'umap',label = T)+NoLegend()
sub.cells@meta.data$celltype = paste0("M",sub.cells$seurat_clusters) #把子对象的亚群注释结果添加到表格上面去。
save(sub.cells,file = "sub.cells.Rdata")
4.marker基因及其可视化
#找亚群间的差异基因
sub.cells.markers <- FindAllMarkers(sub.cells, only.pos = TRUE,
min.pct = 0.25, logfc.threshold = 0.25)
top10 <- sub.cells.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) %>%
pull(gene);top10
# [1] "PLN" "C2orf40" "NDUFA4L2" "TRIO" "CBLB" "ADIRF"
#[7] "TAGLN" "TINAGL1" "PPP1R14A" "MYH11" "TNFAIP6" "MEIS1"
#[13] "ADGRD1" "EMILIN2" "ROR1" "ADAMTS15" "ITGBL1" "BDKRB1"
#[19] "RNF217" "SLPI"
导出的基因结果不大对。。。
#marker基因的5种可视化
VlnPlot(sub.cells, features = top10)
RidgePlot(sub.cells, features = top10)
FeaturePlot(sub.cells, features = top10)
DotPlot(sub.cells,features = top10)+ RotatedAxis()
DoHeatmap(sub.cells, features = top10) + NoLegend()
5.放回原来的Seurat对象里面
5.1 R语言基础知识补充☞match
match函数时匹配,用于找出一个向量中的元素在另一个向量中的对应位置,
x <- c("f","b")
y <- c("b","c","f")
# 使用match函数找到x在y中的位置
match(x, y)
## [1] 3 1
##x里的两个元素f和b分别在y的第3和第1个位置
5.2 放回原来的Seurat对象里
seu.obj$celltype = as.character(Idents(seu.obj))
seu.obj$celltype = ifelse(seu.obj$celltype==my_sub,
sub.cells$celltype[match(colnames(seu.obj),colnames(sub.cells))],
seu.obj$celltype)
使用ifelse
函数实现了分情况讨论:判断seu.obj的每个细胞是否是my_sub(本例是Fibroblasts),如果是的话,就替换成sub.cells里面匹配的每个细胞对应的celltype,也就是M0和M1;否则就不替换,还是原来的celltype。
seu.obj$celltype = as.character(Idents(seu.obj))
seu.obj$celltype = ifelse(seu.obj$celltype==my_sub,
sub.cells$celltype[match(colnames(seu.obj),colnames(sub.cells))],
seu.obj$celltype)
Idents(seu.obj) = seu.obj$celltype
p2 = DimPlot(seu.obj,label = T)+NoLegend()
p1+p2
day9-2 单样本细胞周期
根据在各个周期高表达的基因来计算细胞周期评分,根据评分的高低来推断细胞属于什么周期。
1.读取数据并做好前期的质控
1.1 前面使用的数据
rm(list = ls())
library(tidyverse)
library(Seurat)
ct = Read10X("input/")
seu.obj <- CreateSeuratObject(counts = ct,
min.cells = 3,
min.features = 200)
set.seed(1234)
seu.obj = subset(seu.obj,downsample = 3000)
seu.obj[["percent.mt"]] <- PercentageFeatureSet(seu.obj, pattern = "^MT-")
VlnPlot(seu.obj,
features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt"),
ncol = 3,pt.size = 0.5)
seu.obj = subset(seu.obj,
nFeature_RNA < 6000 &
nCount_RNA < 30000 &
percent.mt < 18)
1.2 marrow
这个数据来自Seurat 的细胞周期Vignette,是一个严重受到细胞周期影响的数据。
https://satijalab.org/seurat/articles/cell_cycle_vignette
加载
exp.mat <- read.delim("nestorawa_forcellcycle_expressionMatrix.txt",row.names = 1)
marrow <- CreateSeuratObject(counts = exp.mat,
project = "b",
min.cells = 3,
min.features = 200)
marrow[["percent.mt"]] <- PercentageFeatureSet(marrow, pattern = "^MT-")
head(marrow@meta.data, 3)
VlnPlot(marrow,
features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt"),
ncol = 3,pt.size = 0.5)
2.计算细胞周期评分
CellCycleScoring
就是计算细胞周期评分的函数。
check_cc = function(ob){
s.genes <- intersect(cc.genes$s.genes,rownames(ob))
g2m.genes <- intersect(cc.genes$g2m.genes,rownames(ob))
ob = ob %>%
NormalizeData() %>%
FindVariableFeatures() %>%
CellCycleScoring(s.features = s.genes,
g2m.features = g2m.genes) %>%
ScaleData(features = rownames(.)) %>%
RunPCA(features = c(s.genes,g2m.genes))
return(ob)
}
ch1 = check_cc(seu.obj)
head(ch1@meta.data)
# orig.ident nCount_RNA nFeature_RNA percent.mt S.Score G2M.Score Phase
#AAACCCACAGTCGGTC-1 SeuratProject 4243 1256 6.292717 -0.02545080 0.05694222 G2M
#AAACGAAAGAATCGCG-1 SeuratProject 7307 2577 2.572875 0.03719043 -0.12520510 S
#AAAGAACAGCTTACGT-1 SeuratProject 8154 1881 4.083885 0.04363704 -0.02283322 S
#AAAGAACAGGTTCATC-1 SeuratProject 8223 2182 4.377964 -0.05930417 -0.03018975 G1
#AAAGAACAGTCTGTAC-1 SeuratProject 3884 1377 4.763131 -0.02251541 0.02168140 G2M
#AAAGAACTCCACCTCA-1 SeuratProject 3997 1307 3.402552 0.01273964 -0.13158839 S
3.比较两个数据的细胞周期评分和PCA
这个函数处理完后,meta.data里面多了4列,分别是s和g2m的评分以及推断的细胞周期。
table(ch1$Phase)
##
## G1 G2M S
## 1555 482 863
ch2 = check_cc(marrow)
table(ch2$Phase)
##
## G1 G2M S
## 279 183 312
PCAPlot(ch1,group.by = "Phase")+ PCAPlot(ch2,group.by = "Phase")
把坐标调到相同范围。xlim和ylim是设置横纵坐标范围,要按需调整到把两个图的点都能放进去。例如图1是横坐标-60到5,纵坐标-6到5,图二横坐标范围-10到15,纵坐标范围-10到15,就设置为横坐标-60到15,纵坐标-10到15。
library(patchwork)
PCAPlot(ch1,group.by = "Phase")+
PCAPlot(ch2,group.by = "Phase")&
xlim(-60,15)&
ylim(-10,15)
再比较一下S.Score和G2M.Score
p1 = VlnPlot(ch1,"S.Score",group.by = "Phase")
p2 = VlnPlot(ch2,"S.Score",group.by = "Phase")
wrap_plots(p1,p2,nrow = 1) & ylim(-0.6,0.6)
p1 = VlnPlot(ch1,"G2M.Score",group.by = "Phase")
p2 = VlnPlot(ch2,"G2M.Score",group.by = "Phase")
wrap_plots(p1,p2,nrow = 1) & ylim(-0.5,1)
比较每一个细胞的S.Score和G2M.Score,就可以理解到每个细胞属于什么细胞周期是怎么得出来的了:每个细胞都有S.score和G2M.score,如果S.score和G2M.score都是负的就判断为G1期,否则,S.score高就是S期,G2M.score高就是G2M期。可以验证来理解:
table(ch1$Phase)
##
## G1 G2M S
## 1555 482 863
sum(ch1$S.Score<0 & ch1$G2M.Score <0) #G1
## [1] 1555
sum(ch1$S.Score>0 & ch1$S.Score > ch1$G2M.Score) #S
## [1] 863
sum(ch1$G2M.Score>0 & ch1$S.Score < ch1$G2M.Score) #G2M
## [1] 482
如果是大多数点都集中在0点附近的,就可以不用去除细胞周期的影响!,分布范围较广或者是有较多的离群值那就需要要去除。
让我们来比较一下去除和不去除细胞周期影响的下游注释看有没有区别吧。
4.比较去除和不去处细胞周期影响的下游注释
4.1 不考虑细胞周期的降维聚类分群
f = "ob1.Rdata"
if(!file.exists(f)){
ob1 = seu.obj %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData(features = rownames(.)) %>%
RunPCA(features = VariableFeatures(.)) %>%
FindNeighbors(dims = 1:15) %>%
FindClusters(resolution = 0.5) %>%
RunUMAP(dims = 1:15) %>%
RunTSNE(dims = 1:15)
save(ob1,file = f)
}
load(f)
4.2 考虑细胞周期的降维聚类分群
其实是根据s期和g2m期各自有代表性的基因来打分,然后在ScaleData函数中加上vars.to.regress参数来消除影响。
这里的cc.genes是Seurat包里自带的数据,无需任何赋值或加载的操作,可以直接使用。
s.genes <- intersect(cc.genes$s.genes,rownames(seu.obj))
g2m.genes <- intersect(cc.genes$g2m.genes,rownames(seu.obj))
f = "ob2.Rdata"
if(!file.exists(f)){
ob2 = seu.obj %>%
NormalizeData() %>%
FindVariableFeatures() %>%
CellCycleScoring(s.features = s.genes, g2m.features = g2m.genes) %>%
ScaleData(vars.to.regress = c("S.Score", "G2M.Score"),features = rownames(.)) %>% #运行极其慢
RunPCA(features = VariableFeatures(.)) %>%
FindNeighbors(dims = 1:15) %>%
FindClusters(resolution = 0.5) %>%
RunUMAP(dims = 1:15) %>%
RunTSNE(dims = 1:15)
save(ob2,file = f)
}
load(f)
p1 <- DimPlot(ob1, reduction = "umap",label = T)+NoLegend()
p2 <- DimPlot(ob2, reduction = "umap",label = T)+NoLegend()
p1+p2
用singleR来注释
library(celldex)
library(SingleR)
ls("package:celldex")
f = "ref_BlueprintEncode.RData"
if(!file.exists(f)){
ref <- celldex::HumanPrimaryCellAtlasData()
save(ref,file = f)
}
ref <- get(load(f))
library(BiocParallel)
scRNA = ob1
test = scRNA@assays$RNA$data
pred.scRNA <- SingleR(test = test,
ref = ref,
labels = ref$label.main,
clusters = scRNA@active.ident)
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
scRNA <- RenameIdents(scRNA,new.cluster.ids)
p3 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) + NoLegend()
m = scRNA
scRNA = ob2
test = scRNA@assays$RNA$data
pred.scRNA <- SingleR(test = test,
ref = ref,
labels = ref$label.main,
clusters = scRNA@active.ident)
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
scRNA <- RenameIdents(scRNA,new.cluster.ids)
p4 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) + NoLegend()
p3+p4
table(as.character(Idents(m))==as.character(Idents(scRNA)))
## FALSE TRUE
##125 2749
去不去除当然还是有区别的,如果差太多那就采用去除后的结果,就是比较费计算资源。
结果还是不大一样