首先输入文件格式如下下图。
代码如下,这个流程是根据sentieon提供的流程稍加改动而来,这里不再对每一步进行解释,请见谅,另外注意参考基因组要使用染色体分开的那个,直接整条染色体>500Mb,软件不支持:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'wheatomics'
import subprocess
with open('input.txt', 'r') as f:
for line in f:
line = line.strip().split()
if len(line) > 4:
sra1, day1, rg1, sra2, day2, rg2 = line
print sra1, sra2
proc = subprocess.Popen(['fastq-dump', '--split-3', '--helicos', sra1 + '.sra'], shell=False)
proc.wait()
proc = subprocess.Popen(['fastq-dump', '--split-3', '--helicos', sra2 + '.sra'], shell=False)
proc.wait()
proc = subprocess.Popen('cat ' + sra2 + '_1.fastq' + ' >> ' + sra1 + '_1.fastq', shell=True)
proc.wait()
proc = subprocess.Popen('cat ' + sra2 + '_2.fastq' + ' >> ' + sra1 + '_2.fastq', shell=True)
proc.wait()
proc = subprocess.Popen(['shred', '-u', '-z', sra2 + '_1.fastq'], shell=False)
proc.wait()
proc = subprocess.Popen(['shred', '-u', '-z', sra2 + '_2.fastq'], shell=False)
proc.wait()
else:
sra1, day1, rg1 = line
proc = subprocess.Popen(['fastq-dump', '--split-3', '--helicos', sra1 + '.sra'], shell=False)
proc.wait()
proc = subprocess.Popen(
['STAR', '--twopassMode', 'Basic', '--genomeDir', '/data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/',
'--runThreadN', '10', '--limitSjdbInsertNsj', '5000000', '--outSAMtype', 'BAM', 'SortedByCoordinate', '--twopass1readsN', '-1',
'--sjdbOverhang', '100', '--readFilesIn', sra1 + '_1.fastq', sra1 + '_2.fastq',
'--outSAMattrRGline', 'ID:' + rg1, 'SM:' + rg1, 'PL:ILLUMINA'], shell=False)
proc.wait()
proc = subprocess.Popen(['mv', 'Aligned.sortedByCoord.out.bam', 'sorted.bam'], shell=False)
proc.wait()
proc = subprocess.Popen(['shred', '-u', '-z', sra1 + '_1.fastq'], shell=False)
proc.wait()
proc = subprocess.Popen(['shred', '-u', '-z', sra1 + '_2.fastq'], shell=False)
proc.wait()
proc = subprocess.Popen(['sentieon', 'util', 'index', 'sorted.bam'], shell=False)
proc.wait()
proc = subprocess.Popen(['sentieon', 'driver', '-r', '/data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/IWGSC_v1.0_part.fasta',
'-t', '10', '-i', 'sorted.bam', '--algo', 'MeanQualityByCycle', 'mq_metrics.txt',
'--algo', 'QualDistribution', 'qd_metrics.txt', '--algo', 'GCBias', '--summary', 'gc_summary.txt',
'gc_metrics.txt', '--algo', 'AlignmentStat', '--adapter_seq', "''", 'aln_metrics.txt',
'--algo', 'InsertSizeMetricAlgo', 'is_metrics.txt'], shell=False)
proc.wait()
proc = subprocess.Popen(['sentieon', 'plot', 'metrics', '-o', rg1 + '-metrics-report.pdf', 'gc=gc_metrics.txt',
'qd=qd_metrics.txt', 'mq=mq_metrics.txt', 'isize=is_metrics.txt'], shell=False)
proc.wait()
proc = subprocess.Popen(['sentieon', 'driver', '-t', '10', '-i', 'sorted.bam', '--algo', 'LocusCollector',
'--fun', 'score_info', 'score.txt'], shell=False)
proc.wait()
proc = subprocess.Popen(['sentieon', 'driver', '-t', '10', '-i', 'sorted.bam', '--algo', 'Dedup', '--rmdup',
'--score_info', 'score.txt', '--metrics', 'dedup_metrics.txt', 'deduped.bam'], shell=False)
proc.wait()
proc = subprocess.Popen(['sentieon', 'driver', '-r', '/data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/IWGSC_v1.0_part.fasta',
'-t', '10', '-i', 'deduped.bam', '--algo', 'RNASplitReadsAtJunction', '--reassign_mapq',
'255:60', 'splitted.bam'], shell=False)
proc.wait()
proc = subprocess.Popen(['sentieon', 'driver', '-r', '/data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/IWGSC_v1.0_part.fasta',
'-t', '10', '-i', 'splitted.bam', '--algo', 'Haplotyper',
'--trim_soft_clip', '--emit_mode', 'gvcf', '--emit_conf', '20', '--call_conf', '20',
rg1 + 'g.vcf.gz'], shell=False)
proc.wait()
最后合并gvcf生成vcf
sentieon driver -r /data2/Fshare/FastaAndIndex/IWGSC_v1.0_STAR/IWGSC_v1.0_part.fasta -t 10 --algo GVCFtyper MS1_rna_seq.vcf *g.vcf.gz