Seurat Weekly NO.3|| 直接用Seurat画fig2

很多CNS文章的fig1（寓指tsne/umap图谱），打眼一看就是Seurat出的。Seurat 确实用ggplot2 包装了大量的可视化函数，一如我们介绍的：单细胞转录组数据分析||Seurat新版教程:New data visualization methods in v3.0。

但是在文章的后面，特别是用到统计信息的时候，Seurat的默认参数就不能直接用了。这就到了考验基本功的时候了，有的需要朝里看源码，有的需要朝外包装Seurat。

本期的Seurat Weekly，你们家运来小哥哥就带大家看看扩展Seurat可视化函数的思路。每个图都反映了一种特定的思路，我们来一一解说。

载入R包和我们的老朋友pbmc3k.final。

library(Seurat)
library(SeuratData)
library(ggforce)
library(ggpubr)

pbmc3k.final
An object of class Seurat 
13714 features across 2638 samples within 1 assay 
Active assay: RNA (13714 features)
 2 dimensional reductions calculated: pca, umap

第零种，教程找的勤

找教程确实可以解决大部分问题。如两样本的差异基因可视化：

library(ggplot2)
library(cowplot)
theme_set(theme_cowplot())

pbmc3k.final$stim <- sample(c("rep1", "rep2"), size = ncol(pbmc3k.final), replace = TRUE)

Idents(pbmc3k.final)
t.cells <- subset(pbmc3k.final, idents = "Naive CD4 T")
Idents(t.cells) <- "stim"
avg.t.cells <- log1p(AverageExpression(t.cells, verbose = FALSE)$RNA)
avg.t.cells$gene <- rownames(avg.t.cells)

cd14.mono <- subset(pbmc3k.final, idents = "CD14+ Mono")
Idents(cd14.mono) <- "stim"
avg.cd14.mono <- log1p(AverageExpression(cd14.mono, verbose = FALSE)$RNA)
avg.cd14.mono$gene <- rownames(avg.cd14.mono)

genes.to.label = c("ISG15", "LY6E", "IFI6", "ISG20", "MX1", "IFIT2", "IFIT1", "CXCL10", "CCL8")
p1 <- ggplot(avg.t.cells, aes(rep1, rep2)) + geom_point() + ggtitle("CD4 Naive T Cells")
p1 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE)+ theme_bw()
p2 <- ggplot(avg.cd14.mono, aes(rep1, rep2)) + geom_point() + ggtitle("CD14 Monocytes")
p2 <- LabelPoints(plot = p2, points = genes.to.label, repel = TRUE)   + 
  geom_smooth() +  
  geom_abline(intercept=1,slope=1 ,colour="#990000", linetype="dashed" )+ 
  geom_abline(intercept=-1,slope=1 ,colour="#990000", linetype="dashed" )+ theme_bw()
 
plot_grid(p1, p2)

第一种：改源码

以DoHeatmap为例，我们知道Seurat的热图可以加一条idents的bar分组信息，但是没有给参数来加更多的信息。如我们的细胞既有临床分组信息，又有细胞类型，又有细胞周期，我们该怎么办？

为了多一些metadata信息，我们计算一个细胞周期吧。其实这里更多的应该是你的临床样本信息，这样才有意思。

pbmc3k.final <- CellCycleScoring(
  object = pbmc3k.final,
  g2m.features = cc.genes$g2m.genes,
  s.features = cc.genes$s.genes
)
head(pbmc3k.final@meta.data) ## 有了，有了

               orig.ident nCount_RNA nFeature_RNA seurat_annotations percent.mt
AAACATACAACCAC     pbmc3k       2419          779       Memory CD4 T  3.0177759
AAACATTGAGCTAC     pbmc3k       4903         1352                  B  3.7935958
AAACATTGATCAGC     pbmc3k       3147         1129       Memory CD4 T  0.8897363
AAACCGTGCTTCCG     pbmc3k       2639          960         CD14+ Mono  1.7430845
AAACCGTGTATGCG     pbmc3k        980          521                 NK  1.2244898
AAACGCACTGGTAC     pbmc3k       2163          781       Memory CD4 T  1.6643551
               RNA_snn_res.0.5 seurat_clusters     S.Score   G2M.Score Phase
AAACATACAACCAC               1               1  0.10502807 -0.04507596     S
AAACATTGAGCTAC               3               3 -0.02680010 -0.04986215    G1
AAACATTGATCAGC               1               1 -0.01387173  0.07792135   G2M
AAACCGTGCTTCCG               2               2  0.04595348  0.01140204     S
AAACCGTGTATGCG               6               6 -0.02602771  0.04413069   G2M
AAACGCACTGGTAC               1               1 -0.03692587 -0.08341568    G1

为了热图的基因有顺序，我们还是计算一个差异基因吧，使热图有框。

mhgen  <- FindAllMarkers(pbmc3k.final,only.pos = T)

head(mhgen)

              p_val avg_logFC pct.1 pct.2     p_val_adj     cluster  gene
RPS12 2.008629e-140 0.5029988 1.000 0.991 2.754633e-136 Naive CD4 T RPS12
RPS27 2.624075e-140 0.5020359 0.999 0.992 3.598656e-136 Naive CD4 T RPS27
RPS6  1.280169e-138 0.4673635 1.000 0.995 1.755623e-134 Naive CD4 T  RPS6
RPL32 4.358823e-135 0.4242773 0.999 0.995 5.977689e-131 Naive CD4 T RPL32
RPS14 3.618793e-128 0.4283480 1.000 0.994 4.962812e-124 Naive CD4 T RPS14
RPS25 5.612007e-124 0.5203411 0.997 0.975 7.696306e-120 Naive CD4 T RPS25

好了我们可以绘制热图上面的bar了：

# Show we can color anything
cols.use <- list(Phase=c('red', 'blue', 'pink', 'brown'))  # 指定颜色
pbmc3k.final <-ScaleData(pbmc3k.final,features= mk) # 为了我的差异基因都被Scale过
#png('mult.png')
DoMultiBarHeatmap(pbmc3k.final, features=mk, group.by='Phase', additional.group.by = c('seurat_clusters', 'seurat_annotations'), additional.group.sort.by = c('seurat_annotations'), cols.use=cols.use)
# dev.off()

大概这样子啦，想要吗？

第二种，封装它

其实这个方法我们之前见过，Seurat绘制堆积小提琴图用的就是这个。只堆积个小提琴图满足不了审稿人日益增长的需求啊，于是我们想在绘制表达量小提琴图的时候，加上统计信息。这不免让我们想起ggpubr。指定一个基因集和需要作比较的分组信息，我这里是细胞类型，你可以是你的实验设计信息啊。

gene_sig <- c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")
comparisons <- list(c("Naive CD4 T", "CD8 T"))

Vlnpubr(pbmc3k.final,gene_signature = gene_sig, test_sign = comparisons,group_my = 'seurat_annotations')

第三种，等更新

既不会看源码，也不熟悉ggplot的世界，不会封装。这时候可以选择等Seurat团队把我们的想法实现之后再作图。这个代价有点大，单细胞数据贬值的速度可是正比于其火热的程度啊。

按照细胞类型分组绘制的DotPlot，就是由于需求太过强烈，作者在V3.2中实现了。

packageVersion("Seurat") # 快看看你用的是哪个版本吧。
[1] ‘3.2.2’


Idents(pbmc3k.final) <- as.vector(pbmc3k.final[['seurat_annotations',drop=T]])

# 直接指定某一列作为 Idents 
Idents(pbmc3k, which(is.na(Idents(pbmc3k)))) <- "Unknown"

Bcellmarks  <- c("CD19","CD79A","CD79B","MS4A1")
HLAmarks <- c("HLA-DQA1", "HLA-DQB1", "HLA-DRB3")
Monomarks <- c("CD14","VCAN","FCN1")
Tcellmarks <- c("CD3D","CD3E","CD3G","IL7R","TRAC","TRGC2","TRDC", "CD8A", "CD8B", "CD4")
DCmarks <- c(HLAmarks,"CLEC10A","CLEC9A")
platelet <- c("GP9","PF4")

features <- list("B cell" = Bcellmarks, "T cell" = Tcellmarks, "DC" = DCmarks, "Monocyte" = Monomarks, "Platelet" = platelet)
DotPlot(object = pbmc3k.final, features=features, cluster.idents=T) + theme(axis.text.x = element_text(angle = 90))

DoMultiBarHeatmap

suppressPackageStartupMessages({
  library(rlang)
})

DoMultiBarHeatmap <- function (object, 
                               features = NULL, 
                               cells = NULL, 
                               group.by = "ident", 
                               additional.group.by = NULL, 
                               additional.group.sort.by = NULL, 
                               cols.use = NULL,
                               group.bar = TRUE, 
                               disp.min = -2.5, 
                               disp.max = NULL, 
                               slot = "scale.data", 
                               assay = NULL, 
                               label = TRUE, 
                               size = 5.5, 
                               hjust = 0, 
                               angle = 45, 
                               raster = TRUE, 
                               draw.lines = TRUE, 
                               lines.width = NULL, 
                               group.bar.height = 0.02, 
                               combine = TRUE) 
{
  cells <- cells %||% colnames(x = object)
  if (is.numeric(x = cells)) {
    cells <- colnames(x = object)[cells]
  }
  assay <- assay %||% DefaultAssay(object = object)
  DefaultAssay(object = object) <- assay
  features <- features %||% VariableFeatures(object = object)
  ## Why reverse???
  features <- rev(x = unique(x = features))
  disp.max <- disp.max %||% ifelse(test = slot == "scale.data", 
                                   yes = 2.5, no = 6)
  possible.features <- rownames(x = GetAssayData(object = object, 
                                                 slot = slot))
  if (any(!features %in% possible.features)) {
    bad.features <- features[!features %in% possible.features]
    features <- features[features %in% possible.features]
    if (length(x = features) == 0) {
      stop("No requested features found in the ", slot, 
           " slot for the ", assay, " assay.")
    }
    warning("The following features were omitted as they were not found in the ", 
            slot, " slot for the ", assay, " assay: ", paste(bad.features, 
                                                             collapse = ", "))
  }
  
  if (!is.null(additional.group.sort.by)) {
    if (any(!additional.group.sort.by %in% additional.group.by)) {
      bad.sorts <- additional.group.sort.by[!additional.group.sort.by %in% additional.group.by]
      additional.group.sort.by <- additional.group.sort.by[additional.group.sort.by %in% additional.group.by]
      if (length(x = bad.sorts) > 0) {
        warning("The following additional sorts were omitted as they were not a subset of additional.group.by : ", 
                paste(bad.sorts, collapse = ", "))
      }
    }
  }
  
  data <- as.data.frame(x = as.matrix(x = t(x = GetAssayData(object = object, 
                                                             slot = slot)[features, cells, drop = FALSE])))
  
  object <- suppressMessages(expr = StashIdent(object = object, 
                                               save.name = "ident"))
  group.by <- group.by %||% "ident"
  groups.use <- object[[c(group.by, additional.group.by[!additional.group.by %in% group.by])]][cells, , drop = FALSE]
  plots <- list()
  for (i in group.by) {
    data.group <- data
    if (!is_null(additional.group.by)) {
      additional.group.use <- additional.group.by[additional.group.by!=i]  
      if (!is_null(additional.group.sort.by)){
        additional.sort.use = additional.group.sort.by[additional.group.sort.by != i]  
      } else {
        additional.sort.use = NULL
      }
    } else {
      additional.group.use = NULL
      additional.sort.use = NULL
    }
    
    group.use <- groups.use[, c(i, additional.group.use), drop = FALSE]
    
    for(colname in colnames(group.use)){
      if (!is.factor(x = group.use[[colname]])) {
        group.use[[colname]] <- factor(x = group.use[[colname]])
      }  
    }
    
    if (draw.lines) {
      lines.width <- lines.width %||% ceiling(x = nrow(x = data.group) * 
                                                0.0025)
      placeholder.cells <- sapply(X = 1:(length(x = levels(x = group.use[[i]])) * 
                                           lines.width), FUN = function(x) {
                                             return(Seurat:::RandomName(length = 20))
                                           })
      placeholder.groups <- data.frame(rep(x = levels(x = group.use[[i]]), times = lines.width))
      group.levels <- list()
      group.levels[[i]] = levels(x = group.use[[i]])
      for (j in additional.group.use) {
        group.levels[[j]] <- levels(x = group.use[[j]])
        placeholder.groups[[j]] = NA
      }
      
      colnames(placeholder.groups) <- colnames(group.use)
      rownames(placeholder.groups) <- placeholder.cells
      
      group.use <- sapply(group.use, as.vector)
      rownames(x = group.use) <- cells
      
      group.use <- rbind(group.use, placeholder.groups)
      
      for (j in names(group.levels)) {
        group.use[[j]] <- factor(x = group.use[[j]], levels = group.levels[[j]])
      }
      
      na.data.group <- matrix(data = NA, nrow = length(x = placeholder.cells), 
                              ncol = ncol(x = data.group), dimnames = list(placeholder.cells, 
                                                                           colnames(x = data.group)))
      data.group <- rbind(data.group, na.data.group)
    }
    
    order_expr <- paste0('order(', paste(c(i, additional.sort.use), collapse=','), ')')
    group.use = with(group.use, group.use[eval(parse(text=order_expr)), , drop=F])
    
    plot <- Seurat:::SingleRasterMap(data = data.group, raster = raster, 
                                     disp.min = disp.min, disp.max = disp.max, feature.order = features, 
                                     cell.order = rownames(x = group.use), group.by = group.use[[i]])
    
    if (group.bar) {
      pbuild <- ggplot_build(plot = plot)
      group.use2 <- group.use
      cols <- list()
      na.group <- Seurat:::RandomName(length = 20)
      for (colname in rev(x = colnames(group.use2))) {
        if (colname == i) {
          colid = paste0('Identity (', colname, ')')
        } else {
          colid = colname
        }
        
        # Default
        cols[[colname]] <- c(scales::hue_pal()(length(x = levels(x = group.use[[colname]]))))  
        
        #Overwrite if better value is provided
        if (!is_null(cols.use[[colname]])) {
          req_length = length(x = levels(group.use))
          if (length(cols.use[[colname]]) < req_length){
            warning("Cannot use provided colors for ", colname, " since there aren't enough colors.")
          } else {
            if (!is_null(names(cols.use[[colname]]))) {
              if (all(levels(group.use[[colname]]) %in% names(cols.use[[colname]]))) {
                cols[[colname]] <- as.vector(cols.use[[colname]][levels(group.use[[colname]])])
              } else {
                warning("Cannot use provided colors for ", colname, " since all levels (", paste(levels(group.use[[colname]]), collapse=","), ") are not represented.")
              }
            } else {
              cols[[colname]] <- as.vector(cols.use[[colname]])[c(1:length(x = levels(x = group.use[[colname]])))]
            }
          }
        }
        
        # Add white if there's lines
        if (draw.lines) {
          levels(x = group.use2[[colname]]) <- c(levels(x = group.use2[[colname]]), na.group)  
          group.use2[placeholder.cells, colname] <- na.group
          cols[[colname]] <- c(cols[[colname]], "#FFFFFF")
        }
        names(x = cols[[colname]]) <- levels(x = group.use2[[colname]])
        
        y.range <- diff(x = pbuild$layout$panel_params[[1]]$y.range)
        y.pos <- max(pbuild$layout$panel_params[[1]]$y.range) + y.range * 0.015
        y.max <- y.pos + group.bar.height * y.range
        pbuild$layout$panel_params[[1]]$y.range <- c(pbuild$layout$panel_params[[1]]$y.range[1], y.max)
        
        plot <- suppressMessages(plot + 
                                   annotation_raster(raster = t(x = cols[[colname]][group.use2[[colname]]]),  xmin = -Inf, xmax = Inf, ymin = y.pos, ymax = y.max) + 
                                   annotation_custom(grob = grid::textGrob(label = colid, hjust = 0, gp = gpar(cex = 0.75)), ymin = mean(c(y.pos, y.max)), ymax = mean(c(y.pos, y.max)), xmin = Inf, xmax = Inf) +
                                   coord_cartesian(ylim = c(0, y.max), clip = "off")) 

        #  same time run  or not   ggplot version    ?  
        if ((colname == i ) && label) {
          x.max <- max(pbuild$layout$panel_params[[1]]$x.range)
          x.divs <- pbuild$layout$panel_params[[1]]$x.major
          group.use$x <- x.divs
          label.x.pos <- tapply(X = group.use$x, INDEX = group.use[[colname]],
                                FUN = median) * x.max
          label.x.pos <- data.frame(group = names(x = label.x.pos), 
                                    label.x.pos)
          plot <- plot + geom_text(stat = "identity", 
                                   data = label.x.pos, aes_string(label = "group", 
                                                                  x = "label.x.pos"), y = y.max + y.max * 
                                     0.03 * 0.5, angle = angle, hjust = hjust, 
                                   size = size)
          plot <- suppressMessages(plot + coord_cartesian(ylim = c(0, 
                                                                   y.max + y.max * 0.002 * max(nchar(x = levels(x = group.use[[colname]]))) * 
                                                                     size), clip = "off"))
        }
      
        
        
        
        }
    }
    plot <- plot + theme(line = element_blank())
    plots[[i]] <- plot
  }
  if (combine) {
    plots <- CombinePlots(plots = plots)
  }
  return(plots)
}

Vlnpubr

Vlnpubr <- function(seo, gene_signature, file_name, test_sign,group_my,label  = "p.signif"){
  plot_case1 <- function(signature, y_max = NULL){
    VlnPlot(seo , features = signature,
            pt.size = 0.1, 
            group.by =  group_my, 
            y.max = y_max # add the y-axis maximum value - otherwise p-value hidden
    ) + stat_compare_means(comparisons = test_sign, label = label ) + NoLegend()
  }
  plot_list <- list()
  y_max_list <- list()
  for (gene in gene_signature) {
    plot_list[[gene]] <- plot_case1(gene)
    y_max_list[[gene]] <- max(plot_list[[gene]]$data[[gene]]) # get the max no. for each gene
    plot_list[[gene]] <- plot_case1(gene, y_max = (y_max_list[[gene]] + 1) )
  }
  cowplot::plot_grid(plotlist = plot_list)
  #file_name <- paste0(file_name, "_r.png")
  #ggsave(file_name, width = 14, height = 8)
}

https://github.com/satijalab/seurat/issues/2201

https://www.biostars.org/p/458261/