1.R程序并行运算和调整R内部允许的对象大小的限制
默认是500 * 1024 ^ 2 = 500 Mb
suppressPackageStartupMessages(library(future))
suppressPackageStartupMessages(library(future.apply))
plan("multicore", workers = 10)
options(future.globals.maxSize = 40000 * 1024^2)
future在seurat的具体应用详见:https://satijalab.org/seurat/archive/v3.0/future_vignette.html
2.检查基因名称是否存在基因表达矩阵中
# 查询单个基因IGKV4
rownames(seurat_obj)[grep('IGKV4', rownames(seurat_obj))]
# 查询genelist
# 方式1:区分基因大小写
marker.genes.filte <- gene_list[gene_list %in% rownames(seurat_obj)]
gene.no.find <- gene_list[!gene_list %in% rownames(seurat_obj)]
# 方式2:不区分基因大小写
data("pbmc_small")
cd_genes <- c('Cd79b', 'Cd19', 'Cd200')
CaseMatch(search = cd_genes, match = rownames(x = pbmc_small))
# 方式3:不区分基因大小写,自定义函数
CheckSpecies <- function(subject){
if(class(subject)=="Seurat"){
subject = rownames(subject)[1]
}
# Human gene
if(subject == toupper(subject))
return("Human")
# Mouse gene
if(subject == Hmisc::capitalize(tolower(subject)))
return("Mouse")
}
FilterGenes <- function(object, marker.genes, unique= TRUE, verbose = T){
if(missing(object))
stop("A seurat object must be provided first")
if(class(object) != "Seurat")
stop("A seurat object must be provided first")
if(missing(marker.genes))
stop("A list of marker genes must be provided")
marker.genes <- as.character(marker.genes)
marker.genes <- unlist(strsplit(marker.genes,","))
marker.genes <- gsub(" ","",marker.genes)
species = CheckSpecies(object)
if(species == "Human") marker.genes <- toupper(marker.genes)
if(species == "Mouse") marker.genes <- Hmisc::capitalize(tolower(marker.genes))
if(verbose) print(paste("Before filtration:",length(marker.genes)))
marker.genes.filter <- CaseMatch(search = marker.genes, match = rownames(object))
if(unique) marker.genes.filter <- unique(marker.genes.filter)
if(verbose){
print(paste("After filtration:",length(marker.genes.filter)))
gene.no.find <- setdiff(marker.genes, marker.genes.filter)
if (length(gene.no.find)>0) print(paste("Not found genes:",paste(gene.no.find, collapse=" ")))
}
return(as.character(marker.genes.filter))
}
#-------------------------------
DefaultAssay(seurat_obj) <- "SCT"
gene_list <- c("Sca1", "Cd34", "Thy1", "Endoglin","CD73","Lepr","VCAM1","Cxcl12","Grem1")
gene_features <- FilterGenes(seurat_obj, gene_list)
# [1] "Before filtration: 9"
# [1] "After filtration: 6"
# [1] "Not found genes: Sca1 Endoglin Cd73"
CaseMatch()为seurat包内部函数,https://github.com/satijalab/seurat/blob/master/R/utilities.R
3.Dimplot绘图调整
# 去除坐标和Legend
umap_theme <- theme(
axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank()
)
png('figures/basic_umap_clusters.png', width=7, height=7, res=200, units='in')
DimPlot(seurat_obj, reduction = "umap", group.by='seurat_clusters', label=TRUE) +
umap_theme + NoLegend() + ggtitle('UMAP colored by seurat clusters')
dev.off()
png('figures/basic_tsne_clusters.png', width=7, height=7, res=200, units='in')
DimPlot(seurat_obj, reduction = "tsne", group.by='seurat_clusters', label=TRUE) +
umap_theme + NoLegend() + ggtitle('t-SNE colored by seurat clusters')
dev.off()
如果细胞太密,有重叠,可以设置透明度,我一般alpha.use设置0.8或者0.9
Idents(seurat_obj) <- "celltype"
alpha.use <- 0.9
p1 <- DimPlot(object = seurat_obj, reduction = "tsne", label = TRUE, label.size = 3, pt.size=0.3, raster=FALSE)+labs(x="TSNE1", y="TSNE2")
p1$layers[[1]]$mapping$alpha <- alpha.use
p1 <- p1 + scale_alpha_continuous(range = alpha.use, guide = F)
save_plot("celltype_TSNE_clustering.png", p1, base_height = 8, base_aspect_ratio = 1.2, base_width = NULL, dpi=600)
save_plot("celltype_TSNE_clustering.pdf", p1, base_height = 8, base_aspect_ratio = 1.2, base_width = NULL)
4.FeaturePlot绘图配色
一般我常用的配色是:c("lightgrey", "red")或者c("lightgrey", "blue")
plot1 <- FeaturePlot(seurat_obj, features = c("CD3E","CXCR5","PDCD1"), pt.size=0.5, reduction="tsne", cols = c("lightgrey", "red"),label = T)
plot1 <- FeaturePlot(seurat_obj, features = c("CD3E","CXCR5","PDCD1"), pt.size=0.5, reduction="tsne", cols = c("lightgrey", "blue"),label = T)
为了对比细胞间的基因表达差异,常使用不同的配色;
使用viridis调色板
# set up list of canonical cell type markers
canonical_markers <- list(
'Astrocyte' = c('GFAP', 'AQP4', 'SLC1A2'),
'Pan-neuronal' = c('SNAP25', 'SYT1'),
'Excitatory Neuron' = c('SLC17A7', 'SATB2'),
'Inhibitory Neuron' = c('GAD1', 'GAD2'),
'Microglia' = c('CSF1R', 'CD74', 'P2RY12'),
'Oligodendrocyte' = c('MOBP', 'MBP', 'MOG'),
'Olig. Progenitor' = c('PDGFRA', 'CSPG4')
)
library(viridis)
# create feature plots, cutoff expression values for the 98th and 99th percentile
plot_list <- FeaturePlot(
seurat_obj,
features=unlist(canonical_markers),
combine=FALSE, cols=viridis(256),
max.cutoff='q98'
)
# apply theme to each feature plot
for(i in 1:length(plot_list)){
plot_list[[i]] <- plot_list[[i]] + umap_theme + NoLegend()
}
png('figures/basic_canonical_marker_featurePlot.png', width=10, height=10, units='in', res=200)
CombinePlots(plot_list)
dev.off()
可参考的配色有
library(ggplot2)
library(RColorBrewer)
FeaturePlot(pbmc_small,"LYZ") + scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "RdBu")))
FeaturePlot(pbmc_small,"LYZ") + scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))
5.FeaturePlot基因表达均值
我们常常用一些小的特征基因(3-5 个基因)使用 FeaturePlot() 来辅助celltype的鉴定,文献中也有此操作(https://www.nature.com/articles/s42003-020-0922-4/figures/1)
##### Plot signature of genes in Feature plot #######
Plot_sign <- function(Seraut.object, signature, operator = sum) {
x <- Seraut.object
DefaultAssay(x) <- "RNA"
x[["Sign_exp"]] <- apply(FetchData(object = x,
vars = signature),
1,
operator)
FP <- FeaturePlot(x, reduction = "umap",
features = 'Sign_exp',
label = T,
order=T,
cols = c("lightgrey", "red")) +
#cols = as.vector(coolwarm(5))) +
theme(plot.title = element_text(color="black", size=18, face="bold.italic"),
plot.subtitle = element_text(color="black", size=10, face="italic"),
axis.text.x = element_text(angle = 90, face = "bold", color = 'black', size=16, hjust =1),
axis.title.x = element_text(face = "bold", color = "black", size = 18),
axis.text.y = element_text(angle = 0, face = "bold", color = 'black', size=16),
axis.title.y = element_text(face = "bold", color = "black", size = 18),
legend.text = element_text(face = "bold", color = "black", size = 12),
panel.background = element_rect(fill = "white",colour = "black", size = 1, linetype = "solid")) +
labs(title = "Signature plot", subtitle = paste('MarkerGenes: ',toString(signature), sep=''),
x = "UMAP 1", y = "UMAP 2")
return(FP)
}
# 查看CFD,DLK1,LUM基因的平均表达量
DefaultAssay(seurat_obj) <- 'RNA'
pL <- Plot_sign(seurat_obj,
signature= c('CFD','DLK1','LUM'),
operator = mean, title = 'LEY')
#------------------------------------------
# 也可不用自定义函数
EpithelialCells <- c("EPCAM","KRT8","KRT18")
StromalCells <- c("COL1A1","COL1A2","COL6A1","COL6A2","VWF","PLVAP","CDH5","S100B")
ImmuneCells <- c("CD52","CD2","CD3D","CD3G","CD3E","CD79A","CD79B","CD14","FCGR3A","CD68","CD83","CSF1R","FCER1G")
MyeloidCells <- c("CD68", "XCR1", "CLEC9A", "CLEC10A", "CD1C", "S100A8", "S100A9", "TPSAB1", "OSM")
TCells <- c("NKG7", "KLRC1", "CCR7", "FOXP3", "CTLA4", "CD8B", "CXCR6", "CD3D")
BCells<- c("MZB1", "IGHA1", "SELL", "CD19", "AICDA")
# 计算基因集的平均表达量
seurat_obj@meta.data$EpithelialCells <- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% EpithelialCells,],2,mean)
seurat_obj@meta.data$StromalCells<- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% StromalCells,],2,mean)
seurat_obj@meta.data$ImmuneCells <- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% ImmuneCells,],2,mean)
seurat_obj@meta.data$MyeloidCells<- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% MyeloidCells,],2,mean)
seurat_obj@meta.data$TCells <- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% TCells,],2,mean)
seurat_obj@meta.data$BCells <- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% BCells,],2,mean)
p1 <- FeaturePlot(seurat_obj,reduction = "umap",features = Types, cols = c("gray","orange","orange","red","red"),combine = FALSE)
for(i in 1:length(p1)) {
p1[[i]] <- p1[[i]] + NoLegend() + NoAxes()
}
pdf("Cluster_Score_UMAP.pdf",width = 8,height =8)
cowplot::plot_grid(plotlist = p1)
dev.off()
6.AddModuleScore()使用
AddModuleScore()的算法跟上面的计算基因集均值的方法不同,后面细究下源码。
AddModuleScore()为Seurat包中自带函数,需要先计算基因集中所有基因的平均值,再根据平均值把表达矩阵切割成若干份,然后从切割后的每一份中随机抽取对照基因(基因集外的基因)作为背景值。因此,在整合不同样本的情况下,即使使用相同基因集为相同细胞打分,也会产生不同的富集评分;
# 官方示例
cell_typeA_marker_gene_list <- list(c("Gene1", "Gene2", "Gene3", "Gene4"))
object <- AddModuleScore(object = object, features = cell_typeA_marker_gene_list, name = "cell_typeA_score")
FeaturePlot(object = object, features = "cell_typeA_score1")
# 给出一段GitHub文献中的代码,学习下大佬的使用方法
b_cell <- c("MS4A1") # ref5
macrophage <- c("CD68", "IDO1") # ref5
plasmacytoid_dendritic_cell <- c("CLEC4C","NRP1") # ref5
erythrocyte <- c("HBB", "HBA1", "HBA2") # ref6
cytotoxic_t_cell <- c("GZMA", "GZMK", "IFNG") # ref5
regulatory_t_cell <- c("FOXP3", "IL2RA", "IKZF2") # ref5
t_helper <- c("CXCL13","PDCD1","FABP5") # ref5
naive_t_cell <- c("CCR7", "IL7R", "LEF1") # ref5
progenitor <- c("CD34") # ref5
mast_cell <- c("TPSAB1","TPSB2", "KIT", "GATA1", "GATA2") # ref7
# marker基因打分函数
score_marker_genes <- function(seurat_object, nbins=24){
AddModuleScore(seurat_object,
features = list(b_cell, macrophage, plasmacytoid_dendritic_cell, erythrocyte, cytotoxic_t_cell, regulatory_t_cell, t_helper, naive_t_cell, progenitor, mast_cell),
name=c("b_cell", "macrophage", "plasmacytoid_dendritic_cell", "erythrocyte", "cytotoxic_t_cell", "regulatory_t_cell", "t_helper", "naive_t_cell", "progenitor", "mast_cell"),
nbin=nbins)
}
hl <- score_marker_genes(hl)
# marker基因可视化函数
plot_marker_genes <- function(seurat_object, image_name){
ggsave(file = str_glue('../figures/{image_name}.pdf'),
plot = FeaturePlot(seurat_object,
features = c("b_cell1", "macrophage2", "plasmacytoid_dendritic_cell3", "erythrocyte4", "cytotoxic_t_cell5", "regulatory_t_cell6", "t_helper7", "naive_t_cell8", "progenitor9", "mast_cell10"),
min.cutoff = "q10", max.cutoff = "q90",
ncol=4, label=TRUE, order = TRUE),
device = "pdf", width = 50, height = 40, units = "cm")
}
plot_marker_genes(hl, "hl_markers")
7.DotPlot按基因集分隔显示
DotPlot的features为list时,list组间会有间隔,便于比较不同celltype的marker基因差异,可视化效果更好;
cgs = list(
Epi = c("EPCAM","PAX8","KRT18","CD24","KRT19","SCGB2A2","KRT5","KRT15"),
Meyloid = c("CD68","LYZ","MARCO","AIF1","TYROBR","MS4A6A","CD1E","IL3RA","LAMP3"),
T_cell = c("CD3D",'CD3E','TRAG','CD3G','CD2'),
B_cell = c("CD79A","CD79B","IGKC","CD19","MZB1","MS4A1"),
Endo = c("CLDN5","PECAM1","VWF","FLT1","RAMP2"),
Fibro = c("COL1A1","COL1A2","COL3A1","BGN","DCN","POSTN","C1R")
)
p_umap <- DimPlot(seurat_obj, reduction = "umap", group.by = "AllTypes", label = T)
p <- DotPlot(seurat_obj, features = cgs, assay = "SCT") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + p_umap + plot_layout(widths = c(2, 1))
save_plot("Dotplot_Celltype_UMAP_clustering.png", p, base_height = 8, base_aspect_ratio = 2.4, base_width = NULL, dpi=600)
save_plot("Dotplot_Celltype_UMAP_clustering.pdf", p, base_height = 8, base_aspect_ratio = 2.4, base_width = NULL)
# 横坐标和纵坐标互换,水平翻转
DotPlot(pbmc_small, features = unique(top5$gene), cols = "RdYlBu", col.min = 0, dot.scale = 5) + coord_flip()
# x轴标签倾斜
DotPlot(pbmc_small, features = unique(top5$gene), cols ="Spectral", col.min = 0, dot.scale = 5) + RotatedAxis()
DotPlot(pbmc_small, features = unique(top5$gene), cols ="Spectral", col.min = 0, dot.scale = 5) + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
8.DoHeatmap自定义颜色
library(viridis)
DoHeatmap(seurat_obj, features=top_DEGs, group.by='seurat_clusters', label=FALSE) + scale_fill_gradientn(colors=viridis(256)) + NoLegend()
DoHeatmap(pbmc_small) + scale_fill_viridis()
DoHeatmap(pbmc_small) + scale_fill_gradientn(colors = c("blue", "white", "red"))
mapal <- colorRampPalette(RColorBrewer::brewer.pal(11,"RdBu"))(256)
DoHeatmap(pbmc_small, angle = 90, size = 3) + scale_fill_gradientn(colours = rev(mapal))
#------------------------------------------
seurat_obj <- subset(initial_object, idents = 1:16)
levels(Idents(seurat_obj))
my_cols <- c('3'='#F68282','15'='#31C53F','5'='#1FA195','1'='#B95FBB','13'='#D4D915',
'14'='#28CECA','9'='#ff9a36','8'='#2FF18B','11'='#aeadb3','6'='#faf4cf',
'2'='#CCB1F1','12'='#25aff5','7'='#A4DFF2','4'='#4B4BF7','16'='#AC8F14',
'10'='#E6C122')
my_cols2 <- my_cols[order(as.integer(names(my_cols)))]
scales::show_col(my_cols2)
# 如修改配色,DimPlot和DoHeatmap同步修改
DimPlot(seurat_obj,
cols = my_cols2, label=TRUE , repel=TRUE)
DoHeatmap(seurat_obj, features = c("gene1", "gene2"), group.colors = my_cols2)
9.小鼠基因转人基因
# Function to convert human gene symbols into mouse gene symbols
#https://gist.github.com/FloWuenne/f8fc922477df04c1642e9d8945c48d47
#https://rjbioinformatics.com/2016/10/14/converting-mouse-to-human-gene-names-with-biomart-package/
convertHumanGeneList <- function(x){
require("biomaRt")
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
humanx <- unique(genesV2[, 2])
no_mouse_genes <- length(x)
no_human_genes <- length(humanx)
if(no_human_genes != no_mouse_genes){
print("Some genes could not be translated!")
genes_not_trans <- setdiff(x,genesV2$HGNC.symbol)
print("These genes could not be translated:")
print(genes_not_trans)
print(paste("A total number of ",length(genes_not_trans),"genes could not be translated!"),sep=" ")
}else{
print("All genes were translated successfully!")
}
# Print all gene names that could not be translated and the number of genes that were not translated
return(humanx)
}
# 示例
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
s.genes <- convertHumanGeneList(s.genes)
g2m.genes <- convertHumanGeneList(g2m.genes)
10.统计给定基因在cluster的数目
计算表达基因的细胞比例的函数,在https://github.com/satijalab/seurat/issues/371中有网友给出了一个自定义函数,如下:
# Function that can calculate proportion of cells expressing a gene
# calculates total cells expressing a gene (raw counts > 0) by metadata groups
# can be grouped by different samples types or cluster_# based on metadata
# 'ncells' counts to total number of cells, can be passed to have percentages in calc_helper
# you can adjust the threshold for RNA count to select for cells with more 'higher' expression
TotalCellExpringGene <- function(object, genes, group.by = "all"){
if(group.by == "all"){
prct = unlist(lapply(genes,calc_helper, object=object))
result = data.frame(Markers = genes, Cell_proportion = prct)
return(result)
}
else{
list = SplitObject(object, group.by)
factors = names(list)
results = lapply(list, PrctCellExpringGene, genes=genes)
for(i in 1:length(factors)){
results[[i]]$Feature = factors[i]
}
combined = do.call("rbind", results)
return(combined)
}
}
# for total cells:
calc_helper <- function(object,genes){
counts = object[['RNA']]@counts
ncells = ncol(counts)
if(genes %in% row.names(counts)){
sum(counts[genes,]>0)
}else{return(NA)}
}
# 示例
PrctCellExpringGene(seurat_object ,genes =c("geneA","geneB"), group.by = "sample.ID")
#Markers Cell_proportion
#MS1.1 geneA 0.022727273
#MS1.2 geneB 0.045337995
#MS2.1 geneA 0.000000000
#MS2.2 geneB 0.030033951
#MS3.1 geneA 0.001821125
#MS3.2 geneB 0.035410765
同样,也可以计算在cluster的占比
PrctCellExpringGene <- function(object, genes, group.by = "all"){
if(group.by == "all"){
prct = unlist(lapply(genes,calc_helper.2, object=object))
result = data.frame(Markers = genes, Cell_proportion = prct)
return(result)
}
else{
list = SplitObject(object, group.by)
factors = names(list)
results = lapply(list, PrctCellExpringGene, genes=genes)
for(i in 1:length(factors)){
results[[i]]$Feature = factors[i]
}
combined = do.call("rbind", results)
return(combined)
}
}
# for percentage of cells use this function:
calc_helper.2 <- function(object,genes){
counts = object[['RNA']]@counts
ncells = ncol(counts)
if(genes %in% row.names(counts)){
sum(counts[genes,]>0)/ncells
}else{return(NA)}
}
#Example for finding all cells expressing a gene:
#performs the counts based on the cluster assignment
NumberCellperGene <- PrctCellExpringGene(seurat_object,genes = c("Vim","Pdgfra","Pdgfrb","Acta2","Cnn1"), group.by = "seurat_clusters")