0定义变量
source activate wes
#GATK=~/biosoft/gatk/gatk-4.1.2.0/gatk
ref=/mnt/f/kelly/bioTree/server/wesproject/hg38/Homo_sapiens_assembly38.fasta
snp=/mnt/f/kelly/bioTree/server/wesproject/hg38/dbsnp_146.hg38.vcf.gz
indel=/mnt/f/kelly/bioTree/server/wesproject/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
1 标记PCR重复reads
sample=SRR7696207
echo $sample
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" MarkDuplicates \
-I $sample.bam -O ${sample}_marked.bam \
-M $sample.metrics \
1>log.mark 2>&1
运行结束后的文件如下
├── [ 17K] log.mark
├── [3.8G] SRR7696207.bam
├── [5.0G] SRR7696207_marked.bam
├── [3.3K] SRR7696207.metrics
2 FixMateInformation
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" FixMateInformation \
-I ${sample}_marked.bam \
-O ${sample}_marked_fixed.bam \
-SO coordinate \
1>${sample}_log.fix 2>&1
这样就得到marked_fixed.bam文件。
接着进行index
samtools index ${sample}_marked_fixed.bam
3 BaseRecalibrator 碱基矫正
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" BaseRecalibrator \
-R $ref \
-I ${sample}_marked_fixed.bam \
--known-sites $snp \
--known-sites $indel \
-O ${sample}_recal.table \
1>${sample}_log.recal 2>&1
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR \
-R $ref \
-I ${sample}_marked_fixed.bam \
-bqsr ${sample}_recal.table \
-O ${sample}_bqsr.bam \
1>${sample}_log.ApplyBQSR 2>&1
此时文件结构如下
├── [7.2M] SRR7696207_bqsr.bai
├── [8.1G] SRR7696207_bqsr.bam
├── [ 13K] SRR7696207_log.ApplyBQSR
├── [ 24] SRR7696207_log.fix
├── [ 30K] SRR7696207_log.HC
├── [ 39K] SRR7696207_log.recal
├── [5.0G] SRR7696207_marked.bam
├── [5.0G] SRR7696207_marked_fixed.bam
├── [3.3M] SRR7696207_marked_fixed.bam.bai
├── [3.3K] SRR7696207.metrics
├── [246K] SRR7696207_recal.table
这里同样包含了两个步骤:
第一步,BaseRecalibrator,这里计算出了所有需要进行重校正的read和特征值,然后把这些信息输出为一份校准表文件(sample_name.recal_data.table)
第二步,PrintReads,这一步利用第一步得到的校准表文件(sample_name.recal_data.table)重新调整原来BAM文件中的碱基质量值,并使用这个新的质量值重新输出一份新的BAM文件。
注意,因为BQSR实际上是为了(尽可能)校正测序过程中的系统性错误,因此,在执行的时候是按照不同的测序lane或者测序文库来进行的,这个时候@RG信息(BWA比对时所设置的)就显得很重要了,算法就是通过@RG中的ID来识别各个独立的测序过程,这也是我开始强调其重要性的原因。
4 HaplotypeCaller命令
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" HaplotypeCaller \
-R $ref \
-I ${sample}_bqsr.bam \
--dbsnp $snp \
-O ${sample}_raw.vcf \
1>${sample}_log.HC 2>&1
......
13:37:48.966 INFO ProgressMeter - Starting traversal
13:37:48.966 INFO ProgressMeter - Current Locus Elapsed Minutes Regions Processed Regions/Minute
13:37:58.966 INFO ProgressMeter - chr1:1221125 0.2 5110 30660.0
13:38:09.200 INFO ProgressMeter - chr1:1705254 0.3 7670 22743.9
13:38:19.204 INFO ProgressMeter - chr1:2899974 0.5 13300 26390.6
13:38:29.224 INFO ProgressMeter - chr1:3950942 0.7 18600 27721.2
13:38:39.236 INFO ProgressMeter - chr1:5289141 0.8 25380 30292.4
13:38:49.236 INFO ProgressMeter - chr1:6707838 1.0 32080 31936.3
13:38:59.241 INFO ProgressMeter - chr1:8292545 1.2 39780 33963.7
13:39:09.243 INFO ProgressMeter - chr1:9939444 1.3 47690 35644.1
13:39:19.261 INFO ProgressMeter - chr1:11517156 1.5 55250 36713.0
13:39:29.284 INFO ProgressMeter - chr1:12845200 1.7 61470 36765.1
13:39:39.418 INFO ProgressMeter - chr1:13371271 1.8 63630 34565.2
13:39:49.440 INFO ProgressMeter - chr1:15196274 2.0 72310 36013.0
13:39:59.443 INFO ProgressMeter - chr1:16167558 2.2 77060 35436.1
......
这样就得到vcf文件
以上可以批量进行
#设置环境和变量
source activate wes
#如果把gatk加到了环境变量 就直接按下面走,否则
#gatk=~/biosoft/gatk/gatk-4.1.2.0/gatk
#下面gatk改为$gatk
#下面都设置为你自己的路径
ref=/mnt/f/kelly/bioTree/server/wesproject/hg38/Homo_sapiens_assembly38.fasta
snp=/mnt/f/kelly/bioTree/server/wesproject/hg38/dbsnp_146.hg38.vcf.gz
indel=/mnt/f/kelly/bioTree/server/wesproject/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz ```
for sample in {file1.sam,file2.sam,file3.sam...}
do
echo $sample
#mark dupulicates
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" MarkDuplicates \
-I $sample.bam -O ${sample}_marked.bam \
-M $sample.metrics \
1>log.mark 2>&1
#fixmateinformation
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" FixMateInformation \
-I ${sample}_marked.bam \
-O ${sample}_marked_fixed.bam \
-SO coordinate \
1>log.fix 2>&1
#index
samtools index ${sample}_marked_fixed.bam
#baserecalibrator
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" BaseRecalibrator \
-R $ref \
-I ${sample}_marked_fixed.bam \
--known-sites $snp \
--known-sites $indel \
-O ${sample}_recal.table \
1>${sample}_log.recal 2>&1
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR \
-R $ref \
-I ${sample}_marked_fixed.bam \
-bqsr ${sample}_recal.table \
-O ${sample}_bqsr.bam \
1>${sample}_log.ApplyBQSR 2>&1
## 使用GATK的HaplotypeCaller命令
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" HaplotypeCaller \
-R $ref \
-I ${sample}_bqsr.bam \
--dbsnp $snp \
-O ${sample}_raw.vcf \
1>${sample}_log.HC 2>&1
done
可以把上面内容写入脚本,比如
cat gatk4.sh
然后运行就可以了