很多时候，完成变异检测（variant calling）后，需要修改 VCF 文件中的样本名。小编在这里分享一个修改样本 ID 的 Python 脚本（如果 VCF 文件比较大，建议只提取表头并对其进行修改，然后用 cat 将新表头与原来的数据行拼接在一起）：
#!/usr/bin/env python
import sys
import gzip
# Rename the sample IDs in the "#CHROM" header line of a VCF file.
#
# Usage: python chang_vcf_id.py corr.list raw.vcf.gz out.vcf.gz
#
# All file content is handled as bytes: every line other than the "#CHROM"
# header is copied through untouched, and the same code runs on Python 2.7
# and Python 3 without any text decoding.


def parse_id_map(rows):
    """Build an ``{old_id: new_id}`` dict from tab-separated byte lines.

    Args:
        rows: iterable of byte strings, one ``old_id<TAB>new_id`` pair each.
            Surrounding whitespace is stripped; blank lines are skipped.

    Returns:
        dict mapping each old sample ID (bytes) to its new ID (bytes).
        If an old ID occurs more than once, the last occurrence wins.

    Raises:
        ValueError: a non-blank line does not hold exactly two fields.
    """
    id_map = {}
    for lineno, row in enumerate(rows, 1):
        stripped = row.strip()
        if not stripped:
            # Tolerate empty lines (e.g. a trailing newline at end of file).
            continue
        fields = stripped.split(b'\t')
        if len(fields) != 2:
            raise ValueError(
                "mapping line %d: expected 2 tab-separated fields, got %d"
                % (lineno, len(fields))
            )
        old_id, new_id = fields
        id_map[old_id] = new_id
    return id_map


def rename_header(header_line, id_map):
    """Return the ``#CHROM`` header line with its sample IDs renamed.

    The first nine columns are kept as they are; every later column is
    treated as a sample ID and replaced by ``id_map[sample]`` when present,
    otherwise left unchanged.

    Args:
        header_line: the ``#CHROM ...`` line as bytes.
        id_map: dict of ``{old_id: new_id}`` (bytes keys and values).

    Returns:
        The rewritten header as bytes, terminated by a single ``\\n``.
    """
    columns = header_line.strip().split(b'\t')
    fixed, samples = columns[:9], columns[9:]
    renamed = [id_map.get(sample, sample) for sample in samples]
    # Join all columns in one pass so a header without sample columns does
    # not end up with a dangling tab before the newline.
    return b'\t'.join(fixed + renamed) + b'\n'


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list shaped like ``sys.argv`` (program name followed
            by mapping file, input VCF and output VCF). Defaults to
            ``sys.argv``.

    The input VCF is read through gzip when its name ends in ``.gz`` and as
    a plain file otherwise; the output is always gzip-compressed. When the
    argument count is wrong, the usage string is written to stderr and the
    process exits with status 1.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 4:
        sys.exit("python chang_vcf_id.py corr.list raw.vcf.gz out.vcf.gz")
    map_path, in_path, out_path = args[1:]

    # Read the mapping before touching the output path so that a bad
    # mapping file does not leave an empty output file behind.
    with open(map_path, 'rb') as mapping:
        id_map = parse_id_map(mapping)

    open_input = gzip.open if in_path.endswith('.gz') else open
    with open_input(in_path, 'rb') as src, gzip.open(out_path, 'wb') as dst:
        for line in src:
            if line.startswith(b'#CHROM'):
                line = rename_header(line, id_map)
            dst.write(line)


if __name__ == '__main__':
    main()