rosalind1-10

1.counting DNA nucleotides

rosalind1 题目

#style1
seq=''
ntcount=[]
with open('rosalind1.txt') as f:
    for line in f:
         line = line.rstrip() #去末尾空格
         seq += line.upper()
for nt in ['A','C','G','T']:
    ntcount.append(seq.count(nt))
print('\t'.join(map(str,ntcount)))

#style2
with open('rosalind1.txt') as f:
    for a in f:
        b = list(str(a)) #b是一个个核苷酸的列表
       c={}
       for d in b:
           c[d] = b.count(d)
       print(c)

#style3
from collections import Counter
with open('rosalind_dna.txt') as f:
    list=f.read().rstrip()
    print(Counter(list))

2.将DNA转录成RNA

Rosalind 2 题目

#style1 使用replace函数
with open('rosalind_rna.txt','r') as f:
    list=f.read()
    print(list.replace('T','U'))

#style2 使用re.sub正则表达式
import re
with open('rosalind_rna.txt', 'r') as f:
    list = f.read()
    print(re.sub('T','U',list))

3.获取反向互补链

Rosalind 3 题目

#rosalind 3
#style1
def reverse_complement(seq):
    ntComplement={'A':'T','C':'G','G':'C','T':'A'}
    revSeqList = list(reversed(seq))
    revComSeqList = [ntComplement[k] for k in revSeqList]
    revComSeq = ''.join(revComSeqList)
    return revComSeq

seq=''
with open('rosalind_revc.txt') as f:
    for line in f:
        line = line.rstrip()
        seq += line.rstrip()
        print(reverse_complement(seq))

#style2 使用翻译表
seq=''
with open('rosalind_revc.txt') as f:
    for line in f:
        line = line.rstrip()
        seq += line.rstrip()
transtable = str.maketrans('ATCG','TAGC')
print(seq.translate(transtable)[::-1])

#style3 使用biopython
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

seq=''
with open('rosalind_revc.txt') as f:
    for line in f:
        line = line.rstrip()
        seq += line.rstrip()

my_seq = Seq(seq,IUPAC.unambiguous_dna)
print(my_seq)
print(my_seq.complement()) #互补链
print(my_seq.reverse_complement()) #反向互补链

4.兔子和递归关系

rosalind4 题目

def fibonacciRabbits(n, k):
    F = [1, 1]
    generation = 2
    while generation <= n:
        F.append(F[generation - 1] + F[generation - 2] * k)
        generation += 1
    # 函数返回列表最后一项
    return (F[n - 1])

#调用函数并打印
print (fibonacciRabbits(5, 3))

5.计算GC含量

rosalind 5题目

#style1
from Bio import SeqIO
Target_Seq = SeqIO.parse('rosalind_gc.txt','fasta')
GC_content = 0
ID = ''
for target in Target_Seq:
    if GC_content < (float(target.seq.count('C')+target.seq.count('G'))/len(target.seq))*100:
        GC_content = (float(target.seq.count('C')+target.seq.count('G'))/len(target.seq))*100
        ID = target.description
print(ID,'\n',GC_content)

#style2
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
my_seq = Seq("GGGATCGTAC",IUPAC.unambiguous_dna)
for index,letter in enumerate(my_seq):
    print(index,letter)
print(len(my_seq))
print(my_seq.count('A'))
print(GC(my_seq))

6.计算突变个数

Rosalind6题目

d = open('rosalind_hamm.txt','r')
s = d.readlines()
a = s[0].strip()
b = s[1].strip()
hum = 0
for i in range(len(a)):
    if a[i] != b[I]:
        hum+=1
print(hum)

7.孟德尔第一定律

Rosalind7题目

from scipy.misc import comb
def fun(k,m,n):
    y = k+m+n
    nn = comb(n,2)/comb(y,2)
    kk = comb(m,2)/comb(y,2)
    mm = comb(n,1)/comb(y,2)
    p = round(1-(nn+kk*1/4+mm*1/2),5)
    return p

#读取数字
d = open('rosalind_iprb.txt','r')
s = d.read()
f = s.split()
k,m,n = int(f[0]),int(f[1]),int(f[2])
print(fun(k,m,n))

8.将RNA翻译成蛋白质

rosalind8 题目

# -*- coding: utf-8 -*-
'''rosaline 将RNA翻译成蛋白质'''
#style1 使用biopython
from Bio import Seq
from Bio.Alphabet import generic_dna,generic_rna
from Bio import SeqIO
from Bio.Data import CodonTable
#载入mRNA序列
RNA_string=open('rosalind_prot.txt','r').read().strip()
my_seq=Seq.Seq(RNA_string,generic_rna)
#载入标准遗传密码表
standard_table=CodonTable.unambiguous_rna_by_name['Standard']
#将mRNA翻译成蛋白质
protein=my_seq.translate(table='Standard')
print(protein)

#style2 自己写出密码表
import re
def mRNA_protein(RNA_string):
    #将mRNA翻译成蛋白质
    start_code = 'AUG'
    end_code = ['UAA', 'UAG', 'UGA']
    protein_table = {'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V', \
                     'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V', \
                     'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V', \
                     'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V', \
                     'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A', \
                     'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A', \
                     'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A', \
                     'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A', \
                     'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D', \
                     'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D', \
                     'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E', \
                     'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E', \
                     'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G', \
                     'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G', \
                     'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G', \
                     'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'
                     }
    # 找到起始密码子的位置
    start_sit = re.search(start_code, RNA_string)
    protein = ''
    # 按阅读框匹配蛋白质
    for sit in range(start_sit.end(), len(RNA_string), 3):
        protein = protein + protein_table[RNA_string[sit:sit + 3]]
    print(protein)

if __name__ == '__main__':
    RNA_string = open('rosalind_prot.txt','r').read().strip()
    mRNA_protein(RNA_string)

#style3 使用biopython
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
d=open('rosalind_prot.txt','r').read().strip()
messenger_rna=Seq(d,IUPAC.unambiguous_rna)
print(messenger_rna.translate().strip('*'))

9.找到子串位置

rosalind9 题目

# -*- coding: utf-8 -*-
#rosalind9:finding a motif in dna
#style1
d=open('rosalind_subs.txt','r')
s=d.readlines()   #返回列表
a=s[0].strip()
b=s[1].strip()
print(s)
num1,num2=len(a),len(b)
n=num1-num2
I=0
site=[]
for i in range(n):
    c=a[i:i+num2]
    if b==c:
        site.append(i+1)#python的第1位是0，第2位是1，所以要加1

print(" ".join(str(j) for j in site))#输出成sampleoutput里的格式,使用join需要改为字符串格式


#style2
import re
# 如果overlapped   为False, 匹配到位置为2，10
matches = re.finditer('ATAT', 'GATATATGCATATACTT')
# 返回所有匹配项，
for match in matches:  #迭代器可循环
    print(match.start())

re.match,re.search,re.findall,re.finditer区别
re.match从首字母开始匹配，没有，就返回none
re.search返回对象
re.findall返回数组
re.finditer返回迭代器

#style3
seq = 'GATATATGCATATACTT'
pattern = 'ATAT'

def find_motif(seq, pattern):
    position = []
    for i in range(len(seq) - len(pattern)):
        if seq[i:i + len(pattern)] == pattern:
            position.append(str(i + 1))

    print('\t'.join(position))

find_motif(seq, pattern)

10.寻找最可能的共同祖先

Rosalind10 题目

rosalind 10 题目

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#rosalind10: consensus and profile
#style1
from collections import Counter
from collections import OrderedDict  #有序字典
# 写出到文件
fh = open('cons.txt', 'wt')

rosalind = OrderedDict()
seqLength = 0
with open('rosalind_cons.txt') as f:
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            rosalindName = line
            rosalind[rosalindName] = ''
            # 跳出这个循环，进入下一个循环
            continue
        # 如果不是'>'开头，执行这一句
        rosalind[rosalindName] += line
    # len(ATCCAGCT)
    seqLength = len(rosalind[rosalindName])


A, C, G, T = [], [], [], []
consensusSeq = ''
for i in range(seqLength):
    seq = ''
    # rosalind.keys = Rosalind_1...Rosalind_7
    for k in rosalind.keys():
        # 把 Rosalind_1...Rosalind_7 相同顺序的碱基加起来
        seq += rosalind[k][i]
    A.append(seq.count('A'))
    C.append(seq.count('C'))
    G.append(seq.count('G'))
    T.append(seq.count('T'))
    # 为seq计数
    counts = Counter(seq)

    # 从多到少返回,是个list啊，只返回第一个
    consensusSeq += counts.most_common()[0][0]
    print(counts.most_common())

fh.write(consensusSeq + '\n')
fh.write('\n'.join(['A:\t' + '\t'.join(map(str, A)), 'C:\t' + '\t'.join(map(str, C)),
                    'G:\t' + '\t'.join(map(str, G)), 'T:\t' + '\t'.join(map(str, T))]))
# .join(map(str,A))  把 A=[5, 1, 0, 0, 5, 5, 0, 0] 格式转化成 A：5 1 0 0 5 5 0 0
fh.close()


#style2(未成功)
import numpy as np
import os
from collections import Counter
fhand = open('rosalind_cons.txt')
t = []
for line in fhand:
    if line.startswith('>'):
        continue
    else:
        line = line.rstrip()
        line_list = list(line)  # 变成一个list
    t.append(line_list)  # 再把list 放入list
a = np.array(t)  # 创建一个二维数组
print(a)
L1, L2, L3, L4 = [], [], [], []
comsquence = ''

for i in range(a.shape[1]):  # shape[0],是7 行数，shape[1]是8 项目数
    l = [x[i] for x in a]  # 调出二维数组的每一列
    L1.append(l.count('A'))
    L2.append(l.count('C'))
    L3.append(l.count('T'))
    L4.append(l.count('G'))
    comsquence = comsquence + Counter(l).most_common()[0][0]
print(comsquence)
print('A:', L1, '\n', 'C:', L2, '\n', 'T:', L3, '\n', 'G:', L4)


#style3
def update_cnt(list, char, str):
    res = [a + b for a,b in zip(list, [1 if x == char else 0 for x in str])]
    return(res)

def print_cnt(list, label):
    print(label,end = ": ")
    for x in list:
        print(x, end=" ")
    print()

f = open("rosalind_cons.txt", 'r')
fas = f.readlines()

seq = ''
j = 0
for i in range(len(fas)):
    if fas[i][0] == '>' and j == 0:
        j += 1
        next
    elif fas[i][0] == '>':
        if j == 1:
            A = [0 for x in seq]
            C = A
            G = A
            T = A
        A = update_cnt(A, 'A', seq)
        C = update_cnt(C, 'C', seq)
        G = update_cnt(G, 'G', seq)
        T = update_cnt(T, 'T', seq)
        j += 1
        seq = ''
    else:
        seq += fas[i].strip()

## last seq
A = update_cnt(A, 'A', seq)
C = update_cnt(C, 'C', seq)
G = update_cnt(G, 'G', seq)
T = update_cnt(T, 'T', seq)

consensus = ''
for i in range(len(A)):
    m = max(A[i], C[i], G[i], T[I])
    if A[i] == m:
        consensus += 'A'
    elif C[i] == m:
        consensus += 'C'
    elif G[i] == m:
        consensus += 'G'
    elif T[i] == m:
        consensus += 'T'

print(consensus)

print_cnt(A, 'A')
print_cnt(C, 'C')
print_cnt(G, 'G')
print_cnt(T, 'T')


#style4
import numpy as np #引入numpy
d=open('rosalind_cons.txt','r')
from collections import Counter #问题1里求A/T/C/G出现过多少次时用到的函数
list2=[]
a=1 #起标记作用，看是不是第一次遇到'>'
dline=''
for line in d:
    if line.startswith('>'):
        if a==0: #如果不是第一次遇到'>'
            list2.append(list(dline)) #list是为了创建二维数组
            dline=''
        continue
    else:
        a=0 #如果第一次遇到'>'
        dline+=line.rstrip() #去掉每一行右边的换行
#最后一行
list2.append(list(dline))
bi=np.array(list2)#创建二维数组
LA,LT,LC,LG=[],[],[],[]
consensus =''#共同序列

for i in range(bi.shape[1]):#数组第1行有多少个
    l=bi[:,i] #调出数组每一列
    listl=list(l)
    LA.append(listl.count('A'))#数一个加一个，LA是数值列表，每数一列就加一个在末尾
    LC.append(listl.count('C'))
    LT.append(listl.count('T'))
    LG.append(listl.count('G'))
    most=Counter(listl).most_common() #算出最多的那个
    print(most)
    consensus=consensus+most[0][0]#第一列的（排序）第一的相加

print (consensus)
print("A:"," ".join(str(j) for j in LA))
print("C:"," ".join(str(j) for j in LC))
print("G:"," ".join(str(j) for j in LG))
print("T:"," ".join(str(j) for j in LT))

矩阵运算： numpy spicy pandas
counters包

最后编辑于：2018.08.16 15:24:24

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 206,126评论 6赞 481
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 88,254评论 2赞 382
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 152,445评论 0赞 341
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 55,185评论 1赞 278
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 64,178评论 5赞 371
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 48,970评论 1赞 284
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 38,276评论 3赞 399
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 36,927评论 0赞 259
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 43,400评论 1赞 300
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 35,883评论 2赞 323
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 37,997评论 1赞 333
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,646评论 4赞 322
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 39,213评论 3赞 307
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 30,204评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 31,423评论 1赞 260
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 45,423评论 2赞 352
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 42,722评论 2赞 345

rosalind1-10

1.counting DNA nucleotides

2.将DNA转录成RNA

3.获取反向互补链 Rosalind 3 题目

4.兔子和递归关系

5.计算GC含量

6.计算突变个数

7.孟德尔第一定律 Rosalind7题目

8.将RNA翻译成蛋白质

9.找到子串位置

10.寻找最可能的共同祖先

推荐阅读更多精彩内容

3.获取反向互补链

Rosalind 3 题目

7.孟德尔第一定律

Rosalind7题目