引言
英文教程及数据:https://satijalab.org/seurat/pbmc3k_tutorial.html
参考教程:健明的单细胞课程(Seurat V2版本)+翻译的中文版Seurat教程https://www.jianshu.com/p/03b94b2034d5 +http://qiubio.com/new/book/chapter-07/#%E5%AF%B9%E5%88%86%E9%9B%86%E7%BB%86%E8%83%9E%E8%BF%9B%E8%A1%8C%E6%A0%87%E8%AE%B0assigning-cell-type-identity-to-clusters
seurat总结
counts
矩阵进来后被包装为对象,方便操作。
然后一定要经过 NormalizeData
和 ScaleData
的操作
函数 FindVariableGenes
可以挑选适合进行下游分析的基因集。
函数 RunPCA
和 RunTSNE
进行降维
函数 FindClusters
直接就分群了,非常方便
函数 FindAllMarkers
可以对分群后各个亚群找标志基因。
函数 FeaturePlot
可以展示不同基因在所有细胞的表达量
函数 VlnPlot
可以展示不同基因在不同分群的表达量差异情况
函数 DoHeatmap
可以选定基因集后绘制热图
载入R包
if (!requireNamespace("BiocManager"))
install.packages("BiocManager")
if (!requireNamespace("Seurat"))
BiocManager::install("Seurat")
rm(list = ls()) # clear the environment
library(Seurat)
packageVersion("Seurat")
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))
下载数据,并创建Seurat对象
健明大佬使用的是scRNA的内置数据集,且Seurat是V2版本,内力不够的我,转换过程比较费劲,觉得官网的数据更方便理解,下载的文件夹里有三个文件。Seurat V3可以直接用Read10X函数读取cellrangerV2 和V3的数据。
pbmc.data <- Read10X(data.dir = "~/Desktop/190722/pbmc/filtered_gene_bc_matrices/hg19")
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k",min.cells = 3, min.features = 20)
View(pbmc)
事先检查
#线粒体细胞占比
#方法1
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
#方法2
mito.genes <- grep(pattern = "^MT-", x = rownames(x = sce@data), value = TRUE)
percent.mito <- Matrix::colSums(sce@raw.data[mito.genes, ]) / Matrix::colSums(sce@raw.data)
#红细胞占比
HB.genes_total <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # 人类血液常见红细胞基因
HB_m <- match(HB.genes_total,rownames(pbmc@assays$RNA))
HB.genes <- rownames(pbmc@assays$RNA)[HB_m]
HB.genes <- HB.genes[!is.na(HB.genes)]
pbmc[["percent.HB"]]<-PercentageFeatureSet(pbmc,features=HB.genes)
#Feature、count、线粒体基因、红细胞基因占比可视化。
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.HB"), ncol = 4)
#v2的nUMI和nGene分别改为了nCount_RNA、nFeature_RNA
均一化与标准化
unique feature counts > 2,500 or < 200,过滤掉
cell >5% mitochondrial counts 过滤掉
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
#标准化
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
特征提取,PCA降维
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
#前10个variable基因
top10 <- head(VariableFeatures(pbmc), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
CombinePlots(plots = list(plot1, plot2),legend="bottom")
# PCA降维
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
head(pbmc@reductions$pca@cell.embeddings)#每个细胞在PC轴上的坐标
head(pbmc@reductions$pca@feature.loadings)#每个基因对每个PC轴的贡献度(loading值)
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
确定维度
pbmc <- JackStraw(pbmc, num.replicate = 100)#注意: 这可能是一个非常费时的过程。
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
plot1<-JackStrawPlot(pbmc, dims = 1:15)#看pc显著差一部分
plot2<-ElbowPlot(pbmc)#看拐点
CombinePlots(plots = list(plot1, plot2),legend="bottom")#PC为10的时候,每个轴是有区分意义的。
聚类
#通过shared nearest neighbor (SNN)算法 识别细胞类群,其中resolution,, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.5)
table(pbmc@active.ident) # 查看每一类有多少个细胞
# 提取某一类细胞。
head(subset(as.data.frame(pbmc@active.ident),pbmc@active.ident=="2"))
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc), 5)
#提取某一cluster
subpbmc<-subset(x = pbmc,idents="2")
#WhichCells, 找到某种特征的细胞,Returns a list of cells
#提取部分细胞(30个细胞)
subbset<-subset(x=pbmc,cells=colnames(pbmc@assays$RNA@counts)[1:30])
可视化降维UMAP/tSNE
pbmc <- RunUMAP(pbmc, dims = 1:10)
head(pbmc@reductions$umap@cell.embeddings) # 提取UMAP坐标值。
pbmc <- RunTSNE(pbmc, dims = 1:10)
head(pbmc@reductions$tsne@cell.embeddings)
plot1<-DimPlot(pbmc, reduction = "umap",label = TRUE)+scale_color_npg()
plot2<-DimPlot(pbmc, reduction = "tsne",label = TRUE)+scale_color_npg()
CombinePlots(plots = list(plot1, plot2),legend="bottom")
其中使用UMAP功能时报错,通过library(reticulate) py_install("umap-learn")
后解决
差异分析
# find all markers of cluster 1
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
#这是一种one-others的差异分析方法,就是cluster1与其余的cluster来做比较,当然这个是可以指定的,参数就是ident.2
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
#输出一个总表,找某个cluster与剩余所有细胞的差异marker,only.pos 参数,可以指定返回positive markers 基因。test.use可以指定检验方法,可选择的有:wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2。
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
#cluster内部保守的conserved marker基因
head(FindConservedMarkers(pbmc, ident.1 = 0, ident.2 = 1, grouping.var = "groups"))
小提琴图
用整理后的数据绘制基因小提琴图,与用原始的counts绘制小提琴图
plot1<-VlnPlot(pbmc, features = c("MS4A1", "CD79A"),ncol=1)+scale_color_npg()
plot2<- VlnPlot(pbmc, features = c("MS4A1", "CD79A"),ncol=1, same.y.lims=T,slot = "counts", log = TRUE)+scale_color_npg()
CombinePlots(plots = list(plot1, plot2))
featurePlot和热图
plot1<-FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"),min.cutoff = 0, max.cutoff = 4)
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
plot2 <- DoHeatmap(pbmc, features = top10$gene) + NoLegend()+scale_color_npg()
#CombinePlots(plots = list(plot1, plot2))
library(gridExtra)
grid.arrange(plot1,plot2,ncol = 2, nrow = 1)
细胞周期分析
pbmc <- CellCycleScoring(
object = pbmc,
g2m.features = cc.genes$g2m.genes,
s.features = cc.genes$s.genes
)
pbmc <- CellCycleScoring(
object = pbmc,
g2m.features = cc.genes$g2m.genes,
s.features = cc.genes$s.genes
)
#在UMAP中绘制细胞周期信息
umapem<-pbmc@reductions$umap@cell.embeddings
metada= pbmc@meta.data
dim(umapem);dim(metada)
metada$bar<-rownames(metada)
umapem$bar<-rownames(umapem)
ccdata<-merge(umapem,metada,by="bar")
head(ccdata)
library(ggplot2)
plot<-ggplot(ccdata, aes(UMAP_1, UMAP_2,label=Phase))+geom_point(aes(colour = factor(Phase)))+
#plot<-plot+scale_colour_manual(values=c("#CC33FF","Peru","#660000","#660099","#990033","black","red", "#666600", "green","#6699CC","#339900","#0000FF","#FFFF00","#808080"))+
labs("@yunlai",x = "", y="")
plot=plot+scale_color_aaas() +
theme_bw()+theme(panel.grid=element_blank(),legend.title=element_blank(),legend.text = element_text(color="black", size = 10, face = "bold"))
plot<-plot+guides(colour = guide_legend(override.aes = list(size=5))) +theme(plot.title = element_text(hjust = 0.5))
plot
用SingleR定义每一个细胞的类型
new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono",
"NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
plot1<-DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
plot2<-DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 0.5) + NoLegend()
grid.arrange(plot1,plot2,ncol = 2, nrow = 1)
亚群分析
如果我们对参数进行一些调整,比如resolution=0.8,就可以发现CD4 T细胞会被分成两个亚群。我们可以对这一亚群使用上面类似的办法进行分析。
# 生成一个快照, 其实就是把pbmc@ident中的数据拷贝到pbmc@meta.data中。
# 当我们想把pbmc@meta.data中的值赋值给pbmc@ident的话,可以使用SetAllIdent函数。
# 比如pbmc <- SetAllIdent(object=pbmc, id = 'orig.ident')
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")
# 现在我们使用不同的resolution来重新分群。当我们提高resolution的时候,
# 我们可以得到更多的群。
# 之前我们在FindClusters中设置了save.snn=TRUE,所以可以不用再计算SNN了。
# 下面就直接提高一下区分度,设置resolution = 0.8
pbmc <- FindClusters(object = pbmc, reduction.type = "pca",
dims.use = 1:10, resolution = 0.8,
print.output = FALSE)
# 并排画两个tSNE,右边的是用ClusterNames_0.6 (resolution=0.6) 来分组颜色的。
# 我们可以看到当resolution提高之后,CD4 T细胞被分成了两个亚群。
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE,
no.legend = TRUE, do.label = TRUE)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE,
group.by = "ClusterNames_0.6",
no.legend = TRUE, do.label = TRUE)
plot_grid(plot1, plot2)