目的:分别计算某一人区间的reads。
方法实现:R、perl、Python等等
方法一
#samtools view NT001.bam | awk '{print $3,$4}' > file1.txt
dat <- read.table("file1.txt",header=FALSE,colClasses= c("character","integer"))
colnames(dat)=c("refChr","begin")
chrs <- paste("chr", c(paste(1:22, sep = ""), c("X","Y")), sep = "")
dat <- dat[dat$refChr %in% chrs,]
binindex = ifelse((dat$begin%%50000)==0,floor(dat$begin/50000)-1,floor(dat$begin/50000))
fastbincount = table(paste(dat$refChr,binindex, sep="_"))
file <- data.frame(fastbincount )
colnames(file)=c("binName","counts")
chr_all <- read.table("all_chr.txt")
colnames(chr_all) = "binName"
chr_all$binorder=c(1:61927)
bininfo = merge(chr_all,file, by="binName",all.x=T)
bininfo=bininfo[order(bininfo$binorder),]
bininfo[is.na(bininfo)] <- 0
write.table(bininfo,"bininfo.txt",quote=F,row.names=F,col.names=F)
方法二:pysam
import numpy as np
import pysam
def convert_bam(infile):
bam_file = pysam.AlignmentFile(infile, 'rb')
for index, chr in enumerate(bam_file.references):
#print(index,chr
if index > 23 :
continue
#print(chr,bam_file.lengths[index])
counts = np.zeros(int(bam_file.lengths[index] / float(50000) + 1), dtype=np.int32)
bam_chr = bam_file.fetch(chr)
#print(bam_chr)
for read in bam_chr:
#print(read)
location = read.pos / 50000
counts[int(location)] += 1
for i in counts:
print(i)
if __name__=="__main__":
convert_bam("NT001.bam")
方法三:
bedtools coverage -a segment.bed -b nend.bam > segment_reads.txt
结论:方法一 == 等于方法二 >方法三,就是说方法三没有一、二精确。