【Sc-RNAseq】190723-Seurat包学习

引言

英文教程及数据：https://satijalab.org/seurat/pbmc3k_tutorial.html
参考教程：健明的单细胞课程(Seurat V2版本)+翻译的中文版Seurat教程https://www.jianshu.com/p/03b94b2034d5 +http://qiubio.com/new/book/chapter-07/#%E5%AF%B9%E5%88%86%E9%9B%86%E7%BB%86%E8%83%9E%E8%BF%9B%E8%A1%8C%E6%A0%87%E8%AE%B0assigning-cell-type-identity-to-clusters

seurat总结

counts矩阵进来后被包装为对象，方便操作。
然后一定要经过 NormalizeData 和 ScaleData 的操作
函数 FindVariableGenes 可以挑选适合进行下游分析的基因集。
函数 RunPCA 和 RunTSNE 进行降维
函数 FindClusters 直接就分群了，非常方便
函数 FindAllMarkers 可以对分群后各个亚群找标志基因。
函数 FeaturePlot 可以展示不同基因在所有细胞的表达量
函数 VlnPlot 可以展示不同基因在不同分群的表达量差异情况
函数 DoHeatmap 可以选定基因集后绘制热图

载入R包

if (!requireNamespace("BiocManager"))
    install.packages("BiocManager")
if (!requireNamespace("Seurat"))
    BiocManager::install("Seurat")
rm(list = ls()) # clear the environment
library(Seurat)
packageVersion("Seurat")
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))

下载数据，并创建Seurat对象

健明大佬使用的是scRNA的内置数据集，且Seurat是V2版本，内力不够的我，转换过程比较费劲，觉得官网的数据更方便理解，下载的文件夹里有三个文件。Seurat V3可以直接用Read10X函数读取cellrangerV2 和V3的数据。

pbmc.data <- Read10X(data.dir = "~/Desktop/190722/pbmc/filtered_gene_bc_matrices/hg19")
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k",min.cells = 3, min.features = 20)
View(pbmc)

事先检查

#线粒体细胞占比 
#方法1
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
#方法2
mito.genes <- grep(pattern = "^MT-", x = rownames(x = sce@data), value = TRUE)
percent.mito <- Matrix::colSums(sce@raw.data[mito.genes, ]) / Matrix::colSums(sce@raw.data)
#红细胞占比 
HB.genes_total <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # 人类血液常见红细胞基因
HB_m <- match(HB.genes_total,rownames(pbmc@assays$RNA))
HB.genes <- rownames(pbmc@assays$RNA)[HB_m]
HB.genes <- HB.genes[!is.na(HB.genes)]
pbmc[["percent.HB"]]<-PercentageFeatureSet(pbmc,features=HB.genes)
#Feature、count、线粒体基因、红细胞基因占比可视化。
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.HB"), ncol = 4) 
#v2的nUMI和nGene分别改为了nCount_RNA、nFeature_RNA

均一化与标准化

unique feature counts ＞ 2,500 or ＜ 200，过滤掉
cell >5% mitochondrial counts 过滤掉

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)  
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
#标准化
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)

特征提取，PCA降维

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
#前10个variable基因
top10 <- head(VariableFeatures(pbmc), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
CombinePlots(plots = list(plot1, plot2),legend="bottom")
#  PCA降维
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
head(pbmc@reductions$pca@cell.embeddings)#每个细胞在PC轴上的坐标
head(pbmc@reductions$pca@feature.loadings)#每个基因对每个PC轴的贡献度（loading值）
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

确定维度

pbmc <- JackStraw(pbmc, num.replicate = 100)#注意: 这可能是一个非常费时的过程。
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
plot1<-JackStrawPlot(pbmc, dims = 1:15)#看pc显著差一部分
plot2<-ElbowPlot(pbmc)#看拐点 
CombinePlots(plots = list(plot1, plot2),legend="bottom")#PC为10的时候，每个轴是有区分意义的。

聚类

#通过shared nearest neighbor (SNN)算法 识别细胞类群,其中resolution，, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.5)
table(pbmc@active.ident) # 查看每一类有多少个细胞
# 提取某一类细胞。
head(subset(as.data.frame(pbmc@active.ident),pbmc@active.ident=="2"))
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc), 5)
#提取某一cluster
subpbmc<-subset(x = pbmc,idents="2")
#WhichCells, 找到某种特征的细胞，Returns a list of cells
#提取部分细胞(30个细胞)
 subbset<-subset(x=pbmc,cells=colnames(pbmc@assays$RNA@counts)[1:30])

可视化降维UMAP/tSNE

pbmc <- RunUMAP(pbmc, dims = 1:10)
head(pbmc@reductions$umap@cell.embeddings) # 提取UMAP坐标值。
pbmc <- RunTSNE(pbmc, dims = 1:10)
 head(pbmc@reductions$tsne@cell.embeddings)
plot1<-DimPlot(pbmc, reduction = "umap",label = TRUE)+scale_color_npg()
plot2<-DimPlot(pbmc, reduction = "tsne",label = TRUE)+scale_color_npg()
CombinePlots(plots = list(plot1, plot2),legend="bottom")

其中使用UMAP功能时报错，通过library(reticulate) py_install("umap-learn")后解决

差异分析

# find all markers of cluster 1
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
#这是一种one-others的差异分析方法，就是cluster1与其余的cluster来做比较，当然这个是可以指定的,参数就是ident.2
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
#输出一个总表，找某个cluster与剩余所有细胞的差异marker，only.pos 参数，可以指定返回positive markers 基因。test.use可以指定检验方法，可选择的有：wilcox，bimod，roc，t，negbinom，poisson，LR，MAST，DESeq2。
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
#cluster内部保守的conserved marker基因
head(FindConservedMarkers(pbmc, ident.1 = 0, ident.2 = 1, grouping.var = "groups"))

小提琴图

用整理后的数据绘制基因小提琴图，与用原始的counts绘制小提琴图

plot1<-VlnPlot(pbmc, features = c("MS4A1", "CD79A")，ncol=1)+scale_color_npg()

plot2<- VlnPlot(pbmc, features = c("MS4A1", "CD79A"),ncol=1, same.y.lims=T,slot = "counts", log = TRUE)+scale_color_npg()
CombinePlots(plots = list(plot1, plot2))

featurePlot和热图

plot1<-FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", 
                               "CD8A"),min.cutoff = 0, max.cutoff = 4)
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
plot2 <- DoHeatmap(pbmc, features = top10$gene) + NoLegend()+scale_color_npg()
#CombinePlots(plots = list(plot1, plot2))
library(gridExtra) 
grid.arrange(plot1,plot2,ncol = 2, nrow = 1)

细胞周期分析

pbmc <- CellCycleScoring(
  object = pbmc,
  g2m.features = cc.genes$g2m.genes,
  s.features = cc.genes$s.genes
)
pbmc <- CellCycleScoring(
  object = pbmc,
  g2m.features = cc.genes$g2m.genes,
  s.features = cc.genes$s.genes
)
#在UMAP中绘制细胞周期信息
umapem<-pbmc@reductions$umap@cell.embeddings
metada= pbmc@meta.data
dim(umapem);dim(metada)

metada$bar<-rownames(metada)
umapem$bar<-rownames(umapem)
ccdata<-merge(umapem,metada,by="bar")
head(ccdata)
library(ggplot2)
plot<-ggplot(ccdata, aes(UMAP_1, UMAP_2,label=Phase))+geom_point(aes(colour = factor(Phase)))+
  #plot<-plot+scale_colour_manual(values=c("#CC33FF","Peru","#660000","#660099","#990033","black","red", "#666600", "green","#6699CC","#339900","#0000FF","#FFFF00","#808080"))+
  labs("@yunlai",x = "", y="") 
plot=plot+scale_color_aaas()  +
  theme_bw()+theme(panel.grid=element_blank(),legend.title=element_blank(),legend.text = element_text(color="black", size = 10, face = "bold"))
plot<-plot+guides(colour = guide_legend(override.aes = list(size=5))) +theme(plot.title = element_text(hjust = 0.5))
plot

用SingleR定义每一个细胞的类型

new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono", 
                     "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
plot1<-DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
plot2<-DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 0.5) + NoLegend()
grid.arrange(plot1,plot2,ncol = 2, nrow = 1)

亚群分析

如果我们对参数进行一些调整，比如resolution=0.8，就可以发现CD4 T细胞会被分成两个亚群。我们可以对这一亚群使用上面类似的办法进行分析。

# 生成一个快照, 其实就是把pbmc@ident中的数据拷贝到pbmc@meta.data中。
# 当我们想把pbmc@meta.data中的值赋值给pbmc@ident的话，可以使用SetAllIdent函数。
# 比如pbmc <- SetAllIdent(object=pbmc, id = 'orig.ident')
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")

# 现在我们使用不同的resolution来重新分群。当我们提高resolution的时候，
# 我们可以得到更多的群。
# 之前我们在FindClusters中设置了save.snn=TRUE，所以可以不用再计算SNN了。
# 下面就直接提高一下区分度，设置resolution = 0.8
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", 
                     dims.use = 1:10, resolution = 0.8, 
                     print.output = FALSE)

# 并排画两个tSNE,右边的是用ClusterNames_0.6 (resolution=0.6) 来分组颜色的。
# 我们可以看到当resolution提高之后，CD4 T细胞被分成了两个亚群。
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, 
                  no.legend = TRUE, do.label = TRUE)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, 
                  group.by = "ClusterNames_0.6",
                  no.legend = TRUE, do.label = TRUE)
plot_grid(plot1, plot2)