DEseq预热
主要就是这几个步骤了。
#准备count tables
cnts <- matrix(rnbinom(n=1000, mu=100, size=1/0.5),ncol = 10) #构建满足随机负二项分布的一组数
cond <- factor(rep(1:2,each=5))
#创建DEseqDataSet
dds2 <- DESeqDataSetFromMatrix(cnts,DataFrame(cond), ~cond)
#标准分析
dds2 <- DESeq(dds2)
res <- results(dds2)
##另一种检验,LRT,似然率检验
#ddsLRT <- DESeq(dds2,test="LRT",reduced=~1)
#resLRT <- results(ddsLRT)
#moderate lfc
resultsNames(dds2)
resLFC <- lfcShrink(dds2,coef = 2,type = "apeglm")
#提取结果
summary()、subset()
1. counts: 查看每个样本在每个基因上的counts数
library(DESeq2)
> dds <- makeExampleDESeqDataSet(m=4)
> head(counts(dds))
sample1 sample2 sample3 sample4
gene1 0 0 2 1
gene2 14 56 50 47
gene3 5 3 3 8
gene4 70 105 116 125
gene5 5 2 2 12
gene6 153 70 70 53
> dds1 <- estimateSizeFactors(dds)
> head(counts(dds1,normalized=TRUE)) #通过估计size对counts数进行标准化
sample1 sample2 sample3 sample4
gene1 0.000000 0.000000 1.883002 0.9839948
gene2 14.041418 53.289007 47.075041 46.2477573
gene3 5.014792 2.854768 2.824502 7.8719587
gene4 70.207091 99.916889 109.214095 122.9993545
gene5 5.014792 1.903179 1.883002 11.8079380
gene6 153.452641 66.611259 65.905057 52.1517263
2. 构建DESeqDataSet
> #from matrix:从数值矩阵得到,比如用featurecount统计得到的数值矩阵
> a <- as.matrix(read.table("6sample_count_matrix.txt",sep="\t",header = T,row.names=1))
> head(a)
WT1 WT2 WT3 acc1.51 acc1.52 acc1.53
AT1G01010 226 183 247 514 353 383
AT1G01020 386 364 453 369 483 443
AT1G01030 77 65 87 52 62 74
AT1G01040 1322 1258 1621 1333 1526 1641
AT1G01050 1531 1518 2029 1573 1888 1987
AT1G01060 124 151 823 146 197 297
> b <- read.table("coldata.txt",header = T,row.names = 1)
> b
condition
WT1 WT
WT2 WT
WT3 WT
acc1.51 acc1.5
acc1.52 acc1.5
acc1.53 acc1.5
> all(rownames(b) == colnames(a)) #make sure that it's true
[1] TRUE
> dds3 <- DESeqDataSetFromMatrix(countData = a,
+ DataFrame(b),
+ design = ~ condition)
> #from htseq-count:由单个htseq-count结果文件合并得到
> c <- read.table("sampletable_for_htseq.txt.txt",sep = "\t", header = F)
> c
V1 V2 V3
1 WT1 SRR3286802.count2 WT
2 WT2 SRR3286803.count2 WT
3 WT3 SRR3286804.count2 WT
4 acc1.51 SRR3286805.count2 acc1.5
5 acc1.52 SRR3286806.count2 acc1.5
6 acc1.53 SRR3286807.count2 acc1.5
> colnames(c) <- c("samplename","filename","condition")
> dds4 <- DESeqDataSetFromHTSeqCount(c, directory = ".", design = ~condition)
3. run DESeq() & results()——主要分析步骤
#是否需要初步过滤一下
#keep <- rowSums(counts(dds4)) >= 20 # 对counts数进行初步的过滤
#dds4 <- dds4[keep,]
> dds4 <- DESeq(dds4) #三步走:文库大小估计;离散程度估计;统计检验
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
> res4 <- results(dds4) #结果呈现
> res4
log2 fold change (MLE): condition WT vs acc1.5
Wald test p-value: condition WT vs acc1.5
DataFrame with 37884 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
AT1G01010 308.89691311743 -0.873161860576389 0.235593150198139 -3.70622770586514 0.000210369178701959 0.00230248659971188
AT1G01020 392.486827906655 -0.014533822556904 0.140844859602549 -0.103190294611512 0.91781194256941 0.95926533475846
AT1G01030 63.2288140763626 0.399739741186851 0.255254339106486 1.56604484212152 0.117338119589006 0.275094993904629
......
4. LFC校正
resultsNames(dds4) #显示谁和谁比较
res.shr <- lfcShrink(dds4,coef = 2) #2表示第2列,即LFC这一列
res.shr
res.shr2 <- lfcShrink(dds4,contrast = c("condition","acc1.5","WT")) #contrast和coef不能同时存在
res.shr2
#更换校正模型
res.ape <- lfcShrink(dds4,coef = 2, type = "apeglm")
res.ash <- lfcShrink(dds4, coef = 2, type = "ashr")
res.ash2 <- lfcShrink(dds4, contrast = c("condition","acc1.5","WT"), type = "ashr")
#这一步只会对LFC的值产生影响,p值是没有改变的
5. 简单作图
plotCounts: 用以检验单个基因在组间的reads数有多少
plotCounts(dds4,"AT1G01010")
plotMA: 所有样本normalized counts的平均值和LFC的关系
从整体上来看gene的上调下调情况,注意用resultsNames()看一下是谁和谁比较
res.shr2 <- lfcShrink(dds4,coef = 2,type = "apeglm") #校正LFC
plotMA(res.shr2,alpha=0.01,main="apeglm") #alpha=0.01,定义显著水平
6. results()——从DESeq分析中提取结果
> res4 <- results(dds4,contrast = c("condition","acc1.5","WT"),alpha = 0.001,tidy = F)
> # dds4经过了DESeq()分析
> # contrast: 定义谁和谁比较
> # lfcThreshold: 定义了lfc阈值,从MA图中可以看到红点减少,此外results()结果中不满足lfc阈值的gene的p值都是1
> # alpha: 定义显著水平
> # tidy: 是否将结果输出为data.frame格式
> summary(res4,alpha=0.001)
out of 27432 with nonzero total read count
adjusted p-value < 0.001
LFC > 0 (up) : 1198, 4.4%
LFC < 0 (down) : 554, 2%
outliers [1] : 63, 0.23%
low counts [2] : 6254, 23%
(mean count < 7)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
> # res4经过了results()操作
> # alpha: 校正p值的阈值,如果没有设置,会沿用results()中的alpha参数
#按照自定义的阈值提取差异基因并导出
#diff_gene_deseq2 <-subset(res, padj < 0.001 & abs(log2FoldChange) > 1)
#write.csv(diff_gene_deseq2,file= "DEG_acc1.5_vs_WT.csv")