前言
用R语言对单独的变量数据进行t test或者anova test大家肯定耳熟能详。就分两步走
- 用
ggplot
或者基础函数画出boxplot进行可视化 - 用
t.test
oneway.test
等函数进行统计分析 - 重复1和2
这种方法应付少量的变量还可以,当变量是几十个甚至几百个的时候就有点力不从心了。特别是转录组分析,几十个几百个差异基因那可是家常便饭。和这次的主题无关,多变量的时候别忘了Bonferroni矫正(a=0.05/m)去除伪阳。
一次性批量t test
dat<-iris
## 因为是t test,所以要去掉一组数据
dat<-subset(dat,Species !="setosa")
dat$Species<-factor(dat$Species)
## 简单的for循环就可以解决批量鉴定
for(i in 1:4){
boxplot(dat[,i]~dat$Species,
ylab=names(dat[I]),
xlab="Species"
)
print(t.test(dat[,i]~dat$Species))
}
Welch Two Sample t-test
data: dat[, i] by dat$Species
t = -5.6292, df = 94.025, p-value = 1.866e-07
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.8819731 -0.4220269
sample estimates:
mean in group versicolor mean in group virginica
5.936 6.588
Welch Two Sample t-test
data: dat[, i] by dat$Species
t = -3.2058, df = 97.927, p-value = 0.001819
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.33028364 -0.07771636
sample estimates:
mean in group versicolor mean in group virginica
2.770 2.974
Welch Two Sample t-test
data: dat[, i] by dat$Species
t = -12.604, df = 95.57, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.49549 -1.08851
sample estimates:
mean in group versicolor mean in group virginica
4.260 5.552
Welch Two Sample t-test
data: dat[, i] by dat$Species
t = -14.625, df = 89.043, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.7951002 -0.6048998
sample estimates:
mean in group versicolor mean in group virginica
1.326 2.026
使用ggpubr画出更直观的图
还是用刚才的两组数据。
library(ggpubr)
x <- which(names(dat) == "Species") # 组名
y <- which(names(dat) == "Sepal.Length" # 需要测试的变量名
| names(dat) == "Sepal.Width"
| names(dat) == "Petal.Length"
| names(dat) == "Petal.Width")
method <- "t.test" # 选择test种类
paired <- FALSE
# 根据数据是否一一对应写一个ifelse循环
for (i in y) {
for (j in x) {
ifelse(paired == TRUE,
p <- ggpaired(dat,
x = colnames(dat[j]), y = colnames(dat[I]),
color = colnames(dat[j]), line.color = "gray", line.size = 0.4,
palette = "npg",
legend = "none",
xlab = colnames(dat[j]),
ylab = colnames(dat[I]),
add = "jitter"
),
p <- ggboxplot(dat,
x = colnames(dat[j]), y = colnames(dat[I]),
color = colnames(dat[j]),
palette = "npg",
legend = "none",
add = "jitter"
)
)
# 添加P值
print(p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format..)),
method = method,
paired = paired,
# group.by = NULL,
ref.group = NULL
))
}
}
批量P值调整
多组比较的时候需要进行bonferroni等调整。同样可以写一段代码来实现批量处理。
raw_pvalue <- numeric(length = length(1:4))
for (i in (1:4)) {
raw_pvalue[i] <- t.test(dat[, i] ~ dat$Species,
paired = FALSE,
alternative = "two.sided"
)$p.value
}
df <- data.frame(
Variable = names(dat[, 1:4]),
raw_pvalue = round(raw_pvalue, 3)
)
df$Bonferroni <-
p.adjust(df$raw_pvalue,
method = "bonferroni"
)
df$BH <-
p.adjust(df$raw_pvalue,
method = "BH"
)
df$Holm <-
p.adjust(df$raw_pvalue,
method = "holm"
)
df$Hochberg <-
p.adjust(df$raw_pvalue,
method = "hochberg"
)
df$Hommel <-
p.adjust(df$raw_pvalue,
method = "hommel"
)
df$BY <-
round(p.adjust(df$raw_pvalue,
method = "BY"
), 3)
df
Variable raw_pvalue Bonferroni BH Holm Hochberg Hommel BY
1 Sepal.Length 0.000 0.000 0.000 0.000 0.000 0.000 0.000
2 Sepal.Width 0.002 0.008 0.002 0.002 0.002 0.002 0.004
3 Petal.Length 0.000 0.000 0.000 0.000 0.000 0.000 0.000
4 Petal.Width 0.000 0.000 0.000 0.000 0.000 0.000 0.000
也可以自己写一个function,完了以后直接套数据就好了。
t_table <- function(data, dvs, iv,
var_equal = TRUE,
p_adj = "none",
alpha = 0.05,
paired = FALSE,
wilcoxon = FALSE) {
if (!inherits(data, "data.frame")) {
stop("data must be a data.frame")
} if (!all(c(dvs, iv) %in% names(data))) {
stop("at least one column given in dvs and iv are not in the data")
} if (!all(sapply(data[, dvs], is.numeric))) {
stop("all dvs must be numeric")
} if (length(unique(na.omit(data[[iv]]))) != 2) {
stop("independent variable must only have two unique values")
}
out <- lapply(dvs, function(x) {
if (paired == FALSE & wilcoxon == FALSE) {
tres <- t.test(data[[x]] ~ data[[iv]], var.equal = var_equal)
}
else if (paired == FALSE & wilcoxon == TRUE) {
tres <- wilcox.test(data[[x]] ~ data[[iv]])
}
else if (paired == TRUE & wilcoxon == FALSE) {
tres <- t.test(data[[x]] ~ data[[iv]],
var.equal = var_equal,
paired = TRUE
)
} else {
tres <- wilcox.test(data[[x]] ~ data[[iv]],
paired = TRUE
)
}
c(
p_value = tres$p.value
)
})
out <- as.data.frame(do.call(rbind, out))
out <- cbind(variable = dvs, out)
names(out) <- gsub("[^0-9A-Za-z_]", "", names(out))
out$p_value <- ifelse(out$p_value < 0.001,
"<0.001",
round(p.adjust(out$p_value, p_adj), 3)
)
out$conclusion <- ifelse(out$p_value < alpha,
paste0("Reject H0 at ", alpha * 100, "%"),
paste0("Do not reject H0 at ", alpha * 100, "%")
)
return(out)
}
然后就出来了这个结果
result <- t_table(
data = dat,
c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
"Species"
)result
## variable p_value conclusion
## 1 Sepal.Length <0.001 Reject H0 at 5%
## 2 Sepal.Width 0.002 Reject H0 at 5%
## 3 Petal.Length <0.001 Reject H0 at 5%
## 4 Petal.Width <0.001 Reject H0 at 5%
ANOVA方差分析
把方差分析和1对1的t.test整合到一起
dat <- iris
# Edit from here
x <- which(names(dat) == "Species") # name of grouping variable
y <- which(names(dat) == "Sepal.Length" # names of variables to test
| names(dat) == "Sepal.Width"
| names(dat) == "Petal.Length"
| names(dat) == "Petal.Width")
method1 <- "anova" # one of "anova" or "kruskal.test"
method2 <- "t.test" # one of "wilcox.test" or "t.test"
my_comparisons <- list(c("setosa", "versicolor"), c("setosa", "virginica"), c("versicolor", "virginica")) # comparisons for post-hoc tests
# Edit until here
# Edit at your own risk
for (i in y) {
for (j in x) {
p <- ggboxplot(dat,
x = colnames(dat[j]), y = colnames(dat[I]),
color = colnames(dat[j]),
legend = "none",
palette = "npg",
add = "jitter"
)
print(
p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format..)),
method = method1, label.y = max(dat[, i], na.rm = TRUE)
)
+ stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format") # remove if p-value of ANOVA or Kruskal-Wallis test >= alpha
)
}
}