1. 获得peak集
这里的逻辑是:把四个样品合并,call peaks,然后获得peaks文件与前面idr 处理后的peaks进行overlap,都overlap的peaks,作为最终的peaks。
gsize=199000000
## callpeak and idr analysis of sample A
s1_r1=SRR6322534
s1_r2=SRR6322535
s1=SRR6322534_SRR6322535
input1=SRR6322538
# call peaks for replicte 1
macs2 callpeak -t ./${s1_r1}.ff.bam -c ./$input1.ff.bam -f BAM -n ${s1_r1} -g $gsize --keep-dup all -p 0.01
# call peaks for replicte 2
macs2 callpeak -t ./${s1_r2}.ff.bam -c ./$input1.ff.bam -f BAM -n ${s1_r2} -g $gsize --keep-dup all -p 0.01
# call peaks for combined dataset
macs2 callpeak -t ./${s1_r1}.ff.bam ./${s1_r2}.ff.bam -c ./$input1.ff.bam -f BAM -n ${s1} -g $gsize --keep-dup all -p 0.01
# run idr
idr --samples ${s1_r1}_peaks.narrowPeak ${s1_r2}_peaks.narrowPeak --peak-list ${s1}_peaks.narrowPeak --input-file-type narrowPeak --output-file ./${s1}.idr --rank p.value --soft-idr-threshold 0.05 --plot
# get idr produced narrowPeak file
cut -f 1-10 ${s1}.idr | sort -k1,1 -k2,2n -k3,3n >${s1}.idr.narrowPeak
## callpeak and idr analysis of sample B
s2_r1=SRR8423051
s2_r2=SRR8423052
s2=SRR8423051_SRR8423052
input2=SRR8423055
# call peaks for replicte 1
macs2 callpeak -t ./${s2_r1}.ff.bam -c ./$input2.ff.bam -f BAM -n ${s2_r1} -g $gsize --keep-dup all -p 0.01
# call peaks for replicte 2
macs2 callpeak -t ./${s2_r2}.ff.bam -c ./$input2.ff.bam -f BAM -n ${s2_r2} -g $gsize --keep-dup all -p 0.01
# call peaks for combined dataset
macs2 callpeak -t ./${s2_r1}.ff.bam ./${s2_r2}.ff.bam -c ./$input2.ff.bam -f BAM -n ${s2} -g $gsize --keep-dup all -p 0.01
# run idr
idr --samples ${s2_r1}_peaks.narrowPeak ${s2_r2}_peaks.narrowPeak --peak-list ${s2}_peaks.narrowPeak --input-file-type narrowPeak --output-file ./${s2}.idr --rank p.value --soft-idr-threshold 0.05 --plot
# get idr produced narrowPeak file
cut -f 1-10 ${s2}.idr | sort -k1,1 -k2,2n -k3,3n >${s2}.idr.narrowPeak
## Peaks in combined sample bams
macs2 callpeak -t ${s1_r1}.ff.bam ${s1_r2}.ff.bam ${s2_r1}.ff.bam ${$s2_r2}.ff.bam -c $input1.ff.bam $input2.ff.bam -f BAM -n ${s1}_${s2} -g $gsize --keep-dup all -p 0.01
获得最终的peaks文件
Cat ${s1_r1}_${s1_r2}.idr.narrowPeak ${s2_r1}_${s2_r2}.idr.narrowPeak | sort -k1,1 -k2,2n -k3,3n | bedtools intersect -a ${s1}_${s2}_peaks.narrowPeak -b - -F 0.5 -u >${s1}_${s2}_peaks.f.narrowPeak
## filter peaks against blacklist
bedtools intersect -a ${s1}_${s2}_peaks.f.narrowPeak -b ../ref/blacklist.bed -f 0.25 -v >temp && mv temp ${s1}_${s2}_peaks.f.narrowPeak
2. 统计peaks区域的counts,主要利用deeptools的 multiBamSummary
s1=SRR6322534
s2=SRR6322535
s3=SRR8423051
s4=SRR8423052
peak=./peak.narrowPeak
## for comparison between samples without replicate
cut -f 1-3 $peak >peak.1.bed #把peaks转换成 bed 文件
multiBamSummary BED-file --bamfiles ../callpeak/downsample/$s2.ff.bam ../callpeak/downsample/$s3.ff.bam --BED peak.1.bed --smartLabels -p 4 --outRawCounts peak_counts.1.txt --extendReads 134
## for comparison between samples with replicates
cut -f 1-3 $peak >peak.2.bed
multiBamSummary BED-file --bamfiles ../callpeak/downsample/$s1.ff.bam ../callpeak/downsample/$s2.ff.bam ../callpeak/downsample/$s3.ff.bam ../callpeak/downsample/$s4.ff.bam --BED peak.2.bed --smartLabels -p 4 --outRawCounts peak_counts.2.txt --extendReads 134
The multiBamSummary command is part of the deepTools package and is used to generate summary statistics for multiple BAM files. Here's an explanation of the options in the example command you provided:
BED-file: The name of the BED file containing the genomic regions of interest.
--bamfiles: A list of BAM files to analyze.
--smartLabels: Automatically generate labels for the BAM files based on their file names.
-p: The number of threads to use for parallel processing.
--outRawCounts: The name of the output file containing the raw counts for each genomic region.
--extendReads: The number of base pairs to extend the reads in each direction.
3. R里面进行差异分析
1. read in sample information
library(DESeq2)
library(tidyverse)
library(pheatmap)
cat meta.2.txt
SRR6322534 rabbit, anti-IR_beta, sc-711 HepG2_IRb 19272489
SRR6322535 rabbit, anti-IR_beta, sc-711 HepG2_IRb 37475005
SRR8423051 rabbit, anti-IR_beta, sc-711 HepG2_IRb_starve 28559621
SRR8423052 rabbit, anti-IR_beta, sc-711 HepG2_IRb_starve 45616747
meta <- read_tsv("meta.2.txt", col_names = F)
meta <- meta[,c(1,3)]
colnames(meta) <- c("id", "group")
meta <- meta %>% column_to_rownames(var = "id")
meta$group <- factor(meta$group, levels = c("HepG2_IRb", "HepG2_IRb_starve"))
head(meta)
group
SRR6322534 HepG2_IRb
SRR6322535 HepG2_IRb
SRR8423051 HepG2_IRb_starve
SRR8423052 HepG2_IRb_starve
2. read in counts
counts <- read_tsv("peak_counts.2.txt", col_names = F, comment = "#")
head(counts)
# A tibble: 6 × 7
X1 X2 X3 X4 X5 X6 X7
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 chr1 8878506 8879058 55 38 49 30
2 chr1 8880265 8880537 25 16 14 26
3 chr19 9768598 9768832 10 15 10 16
4 chr1 7942428 7942608 19 4 14 12
5 chr1 7961280 7961786 44 29 30 29
6 chr17 78187231 78187636 44 23 28 20
colnames(counts) <- c("chr", "start", "end", rownames(meta))
counts.dds <- counts[,-(1:3)] %>% as.data.frame()
rownames(counts.dds) <- paste(counts$chr, counts$start, counts$end, sep="-")
head(counts.dds)
SRR6322534 SRR6322535 SRR8423051 SRR8423052
chr1-8878506-8879058 55 38 49 30
chr1-8880265-8880537 25 16 14 26
chr19-9768598-9768832 10 15 10 16
chr1-7942428-7942608 19 4 14 12
chr1-7961280-7961786 44 29 30 29
chr17-78187231-78187636 44 23 28 20
3. calculate size factor
library(csaw)
dpath <- paste("../../chip/callpeak/downsample/", rownames(meta), ".ff.bam", sep="")
bins <- windowCounts(dpath, bin=T, width=10000, BPPARAM=MulticoreParam(nrow(meta)))
nf <-normFactors(bins, se.out = F)
4. create dds
dds <- DESeqDataSetFromMatrix(countData=counts.dds, colData=meta, design=~group)
#dds <- estimateSizeFactors(dds) #这个是DESE2自带的size factor计算工具,由于我们提供给DeSEq2的是peaks里面的reads counts,因此不适合利用其自带的size factore 函数计算,当然我们如果提供了全基因组以bin'单位的raw reads counts是可以的,但是后续分析还得转换成peaks对应的counts进行分析,这样比较麻烦一点。
sizeFactors(dds) <- nf #前面的size factor赋值
5. 计算差异 peaks
dds <- DESeq(dds)
res <- results(dds)
存取数据
res.out <- res %>% as.data.frame() %>% rownames_to_column(var = "name") %>%
mutate(
change = case_when(
padj >= 0.05 | is.na(padj) ~ "stable",
padj < 0.05 & log2FoldChange < 0 ~ "down",
padj < 0.05 & log2FoldChange > 0 ~ "up")
)
res.out <- data.frame(counts[,1:3], res.out)
write.table(res.out, file = "diff.2.txt", sep = "\t", quote = F, row.names = F, col.names = T)
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
如果数据计算重复性欠佳,可以考虑用下面的方法对数据进一步标准化,进行相关性计算,
## signal transformation
rld <- rlog(dds, blind=F)
rldMat <- assay(rld)
The rlog function in the DESeq2 package performs a variance-stabilizing transformation on the count data in a DESeqDataSet object. The resulting transformed data can be used for downstream analysis such as clustering, visualization, and differential expression analysis. The blind argument in the rlog function controls whether the transformation is performed in a "blind" manner, meaning that the sample information is not used to estimate the dispersion parameters. By default, blind is set to TRUE, which means that the dispersion parameters are estimated from the data without taking into account the sample information. Setting blind to FALSE allows the dispersion parameters to be estimated using the sample information, which can improve the accuracy of the transformation.
The assay function in the SummarizedExperiment package extracts the assay data from a SummarizedExperiment object. In the case of a DESeqDataSet object that has been transformed using the rlog function, the assay data is the transformed count data. The assay function in the SummarizedExperiment package extracts the assay data from a SummarizedExperiment object. In the case of a DESeqDataSet object that has been transformed using the rlog function, the assay data is the transformed count data.
# distance between samples
png(file="dist.png", width = 400, height = 400)
pheatmap(as.matrix(dist(t(rldMat))), cluster_rows = T, cluster_cols = T)
dev.off()
pca analysis
plotPCA(rld, intgroup = "group")
correlation between samples using transformed data
pheatmap(cor(rldMat), cluster_rows = T, cluster_cols = T, display_numbers = T)
# scatter plot
ggplot(as.data.frame(rldMat), aes(x = SRR8423051, y = SRR8423052)) +
geom_point(size = 0.5) + xlim(0,8) + ylim(0,8) + theme_cowplot()
correlation between samples using normalized counts (DeSeq2 按照size factor normalized,很多文章用的都是这个数据作图的,感觉效果不如transformed data好
)
counts.norm <- counts(dds, normalized = T)
counts.norm.cor <- cor(log2(counts.norm), method = "pearson")
png(file="cor.png", width = 400, height = 400)
pheatmap(counts.norm.cor, cluster_rows = T, cluster_cols = T, display_numbers = T)
dev.off()
ggplot(as.data.frame(log2(counts.norm)), aes(x = SRR8423051, y = SRR8423052)) +
geom_point(size = 0.5) + xlim(0,8) + ylim(0,8) + theme_cowplot()
save.image("diff.2.RData")
参考:基因课------表观基因组学之 ChIP-seq 数据分析