tabix
tabix -p vcf myvcf.vcf.gz
#或者用
bcftools index -t --threads 10 myvcf.vcf.gz
tabix -h myvcf.vcf.gz chr1 > chr1.vcf #-h会加上vcf的header
#还可以用文件,列出所有要包含的染色体
tabix -h -R regions.list input.vcf.gz > output.vcf
BCFTOOLS
export BCFTOOLS_PLUGINS=/path/to/bcftools/plugins
bcftools +counts input.vcf.gz
#或者直接用
bcftools index -n input.vcf.gz
过滤
bcftools view -i "F_MISSING <= 0.2" 相当于vcftools的max-missing 0.8
或者用bcftools filter -e "F_MISSING > 0.2" 相当于vcftools的max-missing 0.8
bcftools view input/input.recode.vcf.gz -S 170.ID -i "F_MISSING <= 0.5"
bcftools filter -e "MAF < 0.05" input.vcf
vcftools --vcf input.vcf --maf 0.05 --recode --out OUTPUT
bcftools filter -e "F_MISSING > 0.3 || MAF < 0.05" input.vcf.gz -Oz -o output.vcf.gz
vcftools --gzvcf input.vcf.gz --maf 0.05 --miss 0.7 --recode --out output2
bcftools的subset和F_MISSING过滤,一定要分两步做,不然会出错:
bcftools view -S kee.ID input.vcf | bcftools filter -e "F_MISSING > 0.2" > output.vcf #分两步过滤
#相当于
vcftools --vcf input.vcf --max-missing 0.8 --keep keep.ID --recode --out output
bcftools view -i "MAC>=0"
bcftools view -c 1 -c 1:nonmajor
bcftools filter -e "MAC == 0"
提取、合并、删除
bcftools view --region chr1,chr5,chr8 input.vcf.gz
bcftools query -f "%CHROM\t%POS\n"
zcat input.vcf.zg |awk '/^[^#]/{print $1"\t"$2}' > file.pos
bcftools view -R file.pos -Oz -o output.vcf.gz input.vcf.gz
bcftools view input/input.SNP.revhet.recode.vcf.gz -R input_Chr01.pos -S input_samples.list -Oz -o output_fixed.vcf.gz
bcftools view -t Chr01:888 input.recode.vcf.gz |bcftools query -f "%CHROM\t%POS\t[\n%GT]"|sort -u #查看一个sites的genotype
bcftools query -l input.vcf > samples.txt
bcftools reheader --samples <your file> -o <output> <your input file>
#file里面是
oldname newname
oldname2 newname2
#删除之后需要再过滤
bcftools view -s "^sample101" input.vcf.gz |bcftools view -i "MAC >0"-Oz -o output.vcf.gz
bcftools concat -f concat_vcf.list --threads 10 -Oz -o output/output.SNP.revhet.recode.vcf.gz
#不用file.list,可以这样:
bcftools concat -Oz -o output.vcf.gz *.vcf
python
import gzip
#open file according to suffix
def openfile(filename, mode='r'):
if filename.endswith('.gz'):
return gzip.open(filename, mode)
else:
return open(filename, mode)
#decode bytes lines if your read lines from gzip file
def decode_line(line):
if type(line) == bytes:
line = line.decode('utf-8')
return line
with openfile("input.vcf.gz") as fh:
for line in fh:
line = decode_line(line)
#do something
ChangeLog: