在seurat中,如果运行了RunUMAP
或者RunTSNE
后自动分群后,FindAllMarkers和FindMarkers基本就是一样的;如果没有进行RunUMAP
或者RunTSNE
分群,那么需要先运行BuildClusterTree(object)
函数,利用树聚类先分群
FindAllMarkers and FindMarkers
示例:
首先加载数据
pbmc.data <- read.csv("GSE117988_raw.expMatrix_PBMC.csv",header=TRUE,row.names = 1)
pbmc.data <- log2(1 + sweep(pbmc.data, 2, median(colSums(pbmc.data))/colSums(pbmc.data), '*'))
pbmc<- CreateSeuratObject(counts = pbmc.data, project = "10X pbmc", min.cells = 1, min.features = 0)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
sce=pbmc
sce <- NormalizeData(sce, normalization.method = "LogNormalize", scale.factor = 10000)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce, selection.method = "vst", nfeatures = 2000)
sce <- ScaleData(sce)
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce))
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.2)
table(sce@meta.data$RNA_snn_res.0.2)
set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
## 运行 FindAllMarkers
all.markers <- FindAllMarkers(object = sce)
先看FindAllMarkers:
object = sce
assay = NULL
features = NULL
logfc.threshold = 0.25
test.use = 'wilcox'
slot = 'data'
min.pct = 0.1
min.diff.pct = -Inf
node = NULL
verbose = TRUE
only.pos = FALSE
max.cells.per.ident = Inf
random.seed = 1
latent.vars = NULL
min.cells.feature = 3
min.cells.group = 3
mean.fxn = NULL
fc.name = NULL
base = 2
return.thresh = 1e-2
densify = FALSE
if (is.null(x = node)) {
## 将分群的结果进行排序
idents.all <- sort(x = unique(x = Idents(object = object)))
## 一共 7 个类群
##[1] 0 1 2 3 4 5 6
##Levels: 0 1 2 3 4 5 6
}
genes.de <- list()
messages <- list()
## 分别遍历每一个细胞类群( 0 1 2 3 4 5 6 )
for (i in 1:length(x = idents.all)) {
if (verbose) {
message("Calculating cluster ", idents.all[i])
}
genes.de[[i]] <- tryCatch(
expr = {
FindMarkers(
object = object,
assay = assay,
ident.1 = if (is.null(x = node)) {
idents.all[i]
} else {
tree
},
ident.2 = if (is.null(x = node)) {
NULL
} else {
idents.all[i]
},
features = features,
logfc.threshold = logfc.threshold,
test.use = test.use,
slot = slot,
min.pct = min.pct,
min.diff.pct = min.diff.pct,
verbose = verbose,
only.pos = only.pos,
max.cells.per.ident = max.cells.per.ident,
random.seed = random.seed,
latent.vars = latent.vars,
min.cells.feature = min.cells.feature,
min.cells.group = min.cells.group,
mean.fxn = mean.fxn,
fc.name = fc.name,
base = base,
densify = densify
)
},
error = function(cond) {
return(cond$message)
}
)
}
gde.all <- data.frame()
# 对每一类群找到的 marker 基因进行阈值过滤
for (i in 1:length(x = idents.all)) {
gde <- genes.de[[i]]
gde <- gde[order(gde$p_val, -gde[, 2]), ]
gde <- subset(x = gde, subset = p_val < return.thresh)
gde$cluster <- idents.all[i]
gde$gene <- rownames(x = gde)
gde.all <- rbind(gde.all, gde)
# 重命名
rownames(x = gde.all) <- make.unique(names = as.character(x = gde.all$gene))
我们可以知道,当运行完RunUMAP
或者RunTSNE
后自动分群后,FindAllMarkers 内部其实就是调用了 FindMarkers,并且:
# FindMarkers 的 ident.1 为RunUMAP或者RunTSNE后自动分群后每一个类群的细胞
ident.1 = if (is.null(x = node))
{
idents.all[i]
}
# FindMarkers 的 ident.2 默认为空
ident.2 = if (is.null(x = node))
{
NULL
}
而 FindMarkers 在计算差异基因的时候需要先运行 IdentsToCells()
函数
IdentsToCells <- function(
object,
ident.1,
ident.2,
cellnames.use
) {
#
if (is.null(x = ident.1)) {
stop("Please provide ident.1")
} else if ((length(x = ident.1) == 1 && ident.1[1] == 'clustertree') || is(object = ident.1, class2 = 'phylo')) {
if (is.null(x = ident.2)) {
stop("Please pass a node to 'ident.2' to run FindMarkers on a tree")
}
tree <- if (is(object = ident.1, class2 = 'phylo')) {
ident.1
} else {
Tool(object = object, slot = 'BuildClusterTree')
}
if (is.null(x = tree)) {
stop("Please run 'BuildClusterTree' or pass an object of class 'phylo' as 'ident.1'")
}
ident.1 <- tree$tip.label[GetLeftDescendants(tree = tree, node = ident.2)]
ident.2 <- tree$tip.label[GetRightDescendants(tree = tree, node = ident.2)]
}
if (length(x = as.vector(x = ident.1)) > 1 &&
any(as.character(x = ident.1) %in% cellnames.use)) {
bad.cells <- cellnames.use[which(x = !as.character(x = ident.1) %in% cellnames.use)]
if (length(x = bad.cells) > 0) {
stop(paste0("The following cell names provided to ident.1 are not present in the object: ", paste(bad.cells, collapse = ", ")))
}
} else {
ident.1 <- WhichCells(object = object, idents = ident.1)
}
# if NULL for ident.2, use all other cells
if (length(x = as.vector(x = ident.2)) > 1 &&
any(as.character(x = ident.2) %in% cellnames.use)) {
bad.cells <- cellnames.use[which(!as.character(x = ident.2) %in% cellnames.use)]
if (length(x = bad.cells) > 0) {
stop(paste0("The following cell names provided to ident.2 are not present in the object: ", paste(bad.cells, collapse = ", ")))
}
} else {
if (is.null(x = ident.2)) {
ident.2 <- setdiff(x = cellnames.use, y = ident.1)
} else {
ident.2 <- WhichCells(object = object, idents = ident.2)
}
}
return(list(cells.1 = ident.1, cells.2 = ident.2))
}
对于 FindMarkers 若 ident.2 = NULL
,那么 IdentsToCells()
函数将会选择除 ident.1 类群以外的剩下所有细胞
# select which data to use
assay <- assay %||% DefaultAssay(object = sce)
data.use <- object[[assay]]
# cellnames.use 代表所有的细胞
cellnames.use <- colnames(x = data.use)
# if NULL for ident.2, use all other cells
if (is.null(x = ident.2)) {
## 选择除 ident.1 类群以外的其他所有细胞,setdiff 为取差集
ident.2 <- setdiff(x = cellnames.use, y = ident.1)
} else {
ident.2 <- WhichCells(object = object, idents = ident.2)
}
}
因此,对于 FindMarkers 若 ident.2 = NULL
,计算的是 ident.1 细胞类群与除 ident.1 类群以外的剩下所有细胞相比的差异基因;如果 ident.2 = 某一细胞类群
那么计算的是 ident.1 细胞类群与ident.2 类群相比的差异基因
FindConservedMarkers
# 安装必要的包
BiocManager::install('multtest')
install.packages('metap')
install.packages('sn')
library(Seurat)
# 加载示例数据
data("pbmc_small")
# 假设所有细胞一共分为 'g1' 和 'g2' 两种细胞类群
pbmc_small[['groups']] <- sample(x = c('g1', 'g2'), size = ncol(x = pbmc_small), replace = TRUE)
FindConservedMarkers(pbmc_small, ident.1 = 0, ident.2 = 1, grouping.var = "groups")
pbmc_small[['groups']]
> pbmc_small[['groups']]
groups
ATGCCAGAACGACT g1
CATGGCCTGTGCAT g2
GAACCTGATGAACC g1
TGACTGGATTCTCA g1
AGTCAGACTGCACA g2
TCTGATACACGTGT g1
TGGTATCTAAACAG g1
GCAGCTCTGTTTCT g1
GATATAACACGCAT g2
AATGTTGACAGTCA g2
### FindConservedMarkers 源代码分析
object = pbmc_small
ident.1 = 0
ident.2 = 1
grouping.var = "groups"
assay = 'RNA'
slot = 'data'
min.cells.group = 3
meta.method = metap::minimump
verbose = TRUE
# 假设有两个分组, 分别为 'g1' 和 'g2'
# ident.1 和 ident.2 为每一组('g1' 和 'g2') 的细胞分群, ident.1 = 0, ident.2 = 1
# 提取每个细胞对应分组的信息,即每个细胞分为 'g1' 还是 'g2'
object.var <- FetchData(object = object, vars = grouping.var)
# 提取 'g1' 和 'g2' 的细胞
object <- SetIdent(
object = object,
cells = colnames(x = object),
value = paste(Idents(object = object), object.var[, 1], sep = "_")
)
levels.split <- names(x = sort(x = table(object.var[, 1])))
num.groups <- length(levels.split)
cells <- list()
# 分别将 'g1' 和 'g2' 的细胞单独取出来, cells[[1]] 为 'g1' 的细胞, cells[[2]] 为 'g2' 的细胞
for (i in 1:num.groups) {
cells[[i]] <- rownames(
x = object.var[object.var[, 1] == levels.split[i], , drop = FALSE]
)
}
marker.test <- list()
# do marker tests
ident.2.save <- ident.2
# 分别遍历 'g1' 和 'g2'
for (i in 1:num.groups) {
level.use <- levels.split[i]
ident.use.1 <- paste(ident.1, level.use, sep = "_")
ident.use.1.exists <- ident.use.1 %in% Idents(object = object)
ident.2 <- ident.2.save
cells.1 <- WhichCells(object = object, idents = ident.use.1)
ident.use.2 <- paste(ident.2, level.use, sep = "_")
ident.use.2.exists <- ident.use.2 %in% Idents(object = object)
## ident.use.1 = "0_g1" or "0_g2"; ident.use.2 = "1_g1" or "1_g2"
## 分别对 'g1' 计算类群 0 与类群 1 的差异基因; 对 'g2' 计算类群 0 与类群 1 的差异基因
marker.test[[i]] <- FindMarkers(
object = object,
assay = assay,
slot = slot,
ident.1 = ident.use.1,
ident.2 = ident.use.2,
verbose = verbose
)
names(x = marker.test)[i] <- levels.split[i]
}
## 过滤 NULL 值
marker.test <- Filter(f = Negate(f = is.null), x = marker.test)
## 对 'g1' 中类群 0 与类群 1 差异基因item和 'g2' 中类群 0 与类群 1 差异基因item取交集
genes.conserved <- Reduce(
f = intersect,
x = lapply(
X = marker.test,
FUN = function(x) {
return(rownames(x = x))
}
)
)
## 将剩下的 logFC, p_val 整理进去
markers.conserved <- list()
for (i in 1:length(x = marker.test)) {
markers.conserved[[i]] <- marker.test[[i]][genes.conserved, ]
colnames(x = markers.conserved[[i]]) <- paste(
names(x = marker.test)[i],
colnames(x = markers.conserved[[i]]),
sep = "_"
)
}
## 合并对 'g1' 中类群 0 与类群 1 差异基因item和 'g2' 中类群 0 与类群 1 差异基因item取交集后的表格
## 包括 logFC, p_val
markers.combined <- Reduce(cbind, markers.conserved)
pval.codes <- colnames(x = markers.combined)[grepl(pattern = "*_p_val$", x = colnames(x = markers.combined))]
if (length(x = pval.codes) > 1) {
markers.combined$max_pval <- apply(
X = markers.combined[, pval.codes, drop = FALSE],
MARGIN = 1,
FUN = max
)
combined.pval <- data.frame(cp = apply(
X = markers.combined[, pval.codes, drop = FALSE],
MARGIN = 1,
FUN = function(x) {
return(meta.method(x)$p)
}
))
meta.method.name <- as.character(x = formals()$meta.method)
colnames(x = combined.pval) <- paste0(meta.method.name, "_p_val")
markers.combined <- cbind(markers.combined, combined.pval)
markers.combined <- markers.combined[order(markers.combined[, paste0(meta.method.name, "_p_val")]), ]
}
所谓的 FindConservedMarkers 其实是在单细胞分析中,有两个处理组 'g1' 和 'g2' ,而 'g1' 和 'g2' 组中又恰好有相同的细胞亚群 0 和 1,求在 'g1' 中细胞亚群 0 和细胞亚群 1 的差异基因,并且与在 'g2' 中细胞亚群 0 和细胞亚群 1 的差异基因取交集;这部分基因定义为 在 'g1' 和 'g2' 组中,细胞亚群 0 和细胞亚群 1 差异保守的基因