python - 寻找基序(motif) - 输入:一个包含 10 个序列和 10 个基序的 .txt 文件

python - finding a motif - input: a .txt file with 10 seqs and 10 motifs

当我只用一组输入运行我的 BruteForce 函数时,它可以正常工作,并且结果是正确的。

def BruteForce(s, t):
    """Print and return every index at which pattern ``t`` occurs in ``s``.

    Each alignment of ``t`` against ``s`` is tried from left to right, so
    overlapping occurrences are reported as well. The comparison is exact:
    any trailing whitespace or newline in ``t`` must also be present in
    ``s`` for a match to be found.

    Args:
        s: The text to search in.
        t: The pattern to look for.

    Returns:
        The list of 0-based start offsets in ascending order. The same list
        is also printed, as the function has always done.
    """
    width = len(t)
    # One candidate offset per alignment where t still fits inside s; the
    # range is empty when t is longer than s, so no match is reported.
    occurrences = [
        start
        for start in range(len(s) - width + 1)
        if s[start:start + width] == t
    ]

    print(occurrences)
    return occurrences

# Pattern (motif) to search for.
t = 'CAACTTCCA'

# Sequence to search in.
s = 'GACAACTTCCAACTTCCAACTTCCCGTCCCAACTTCACAACTTCGGCCCAACTTCCATGCAACTTCACCATCAACTTCGCTCGAAGCTGCCTTCCACTCCAACTTCACAACTTCCTCAACTTCCTCACCAACTTCAGCAACTTCTCTAGGGCCAACTTCCAACTTCTCAACTTCTCAACTTCCAACTTCCGACAACTTCTCCTGGCAACTTCCAACTTCCAACTTCAATACAACTTCGCAGACAACTTCCGCAACTTCGAACAACTTCCAACTTCCCCAACTTCCAACTTCCAACTTCGCCAACTTCCAACTTCCAACTTCCCAACTTCAGATAGCAACTTCGATCTTACACAACTTCACGCAACTTCTCCAACTTCCAACTTCTGTGCAACTTCTCTGAACAACTTCCTCAACTTCCAACTTCGCAACTTCCCCAACTTCCTCAACTTCATGCAACTTCGAGGCAACTTCCCAACTTCGCAACTTCCTATTCCCAACTTCTGTGGCAACTTCTCAACTTCTGGACAACTTCTATGCCCAACTTCACAACTTCCCCAACTTCTTTACAACTTCGACAACTTCATCAACTTCTAGTCAACTTCTGGTCCAACTTCCAACTTCCCCAACTTCCAAAGTGCCGCAACTTCGTAACAACTTCACGCGCTCAACTTCAACCAACTTCTTTTCCCGCAACTTCGCAACTTCACAACTTCTAATCAACTTCCAACTTCGGATCAACTTCCAACTTCGCCAACTTCCAACTTCCAACTTCTCCAGGGACAACTTCAAGTACAACTTCCAACTTCGCAACTTCACAACTTCCCAACTTCGCAACTTCTACACGCAACTTCCAACTTCTGGTCCCAACTTCATCAACTTCAGTCAACTTC'

# Search s for every occurrence of t.
BruteForce(s, t)

[2, 9, 48, 152, 175, 205, 212, 261, 277, 284, 300, 307, 370, 410, 607, 623, 717, 735, 751, 758, 792, 844]

但是当我在下面的循环中使用它时,它什么都没有返回!我已经用 print() 检查过它是否从 input_2.txt 文件中读取了序列和基序,它确实读取了,但是它返回的出现位置列表是空列表 []。

# Read nt (sequence, motif) line pairs from input_2.txt and search each pair.
with open('input_2.txt', 'r') as input_2:
    nt = input_2.readline() # first line: the number of sequence/motif pairs
    nt = int(nt) # in this case it is 10

    for i in range(nt):
        # NOTE(review): readline() keeps the trailing '\n', so both s and t
        # still end with a newline here. BruteForce compares characters
        # exactly, so it searches for the motif *plus* '\n', which can only
        # match at the very end of s. This is the likely reason an empty
        # list is printed; strip the line ending (e.g. .rstrip('\n')) from
        # s and t before calling BruteForce.
        s = input_2.readline()
        print(s)
        t = input_2.readline()
        print(t)
        BruteForce(s, t)

能否请您告诉我这里需要进行哪些更改?非常感谢,比塔

input_2.txt

10

GACAACTTCCAACTTCCAACTTCCCGTCCCAACTTCACAACTTCGGCCCAACTTCCATGCAACTTCACCATCAACTTCGCTCGAAGCTGCCTTCCACTCCAACTTCACAACTTCCTCAACTTCCTCACCAACTTCAGCAACTTCTCTAGGGCCAACTTCCAACTTCTCAACTTCTCAACTTCCAACTTCCGACAACTTCTCCTGGCAACTTCCAACTTCCAACTTCAATACAACTTCGCAGACAACTTCCGCAACTTCGAACAACTTCCAACTTCCCCAACTTCCAACTTCCAACTTCGCCAACTTCCAACTTCCAACTTCCCAACTTCAGATAGCAACTTCGATCTTACACAACTTCACGCAACTTCTCCAACTTCCAACTTCTGTGCAACTTCTCTGAACAACTTCCTCAACTTCCAACTTCGCAACTTCCCCAACTTCCTCAACTTCATGCAACTTCGAGGCAACTTCCCAACTTCGCAACTTCCTATTCCCAACTTCTGTGGCAACTTCTCAACTTCTGGACAACTTCTATGCCCAACTTCACAACTTCCCCAACTTCTTTACAACTTCGACAACTTCATCAACTTCTAGTCAACTTCTGGTCCAACTTCCAACTTCCCCAACTTCCAAAGTGCCGCAACTTCGTAACAACTTCACGCGCTCAACTTCAACCAACTTCTTTTCCCGCAACTTCGCAACTTCACAACTTCTAATCAACTTCCAACTTCGGATCAACTTCCAACTTCGCCAACTTCCAACTTCCAACTTCTCCAGGGACAACTTCAAGTACAACTTCCAACTTCGCAACTTCACAACTTCCCAACTTCGCAACTTCTACACGCAACTTCCAACTTCTGGTCCCAACTTCATCAACTTCAGTCAACTTC

CAACTTCCA

ATTGTGTATCGCTATGTATCGTGTATCGGATTTTGTATCGTGTATCGGTGTATCGTGTATCGTGTATCGTATCACTGTATCGTTGTATCGTAGCGTTGTATCGTAATGTATCGCTCTGTATCGTGTATCGGGTTTGTATCGATGTGTATCGCTGTATCGGTGTATCGGTGTATCGCTTGTATCGACAGCGCTTGTATCGTGTATCGACCTGTATCGGTGTATCGTGTATCGAATGTATCGTTGTATCGAATTGTGTATCGTGTATCGTGTATCGTGTATCGTATGTATCGACTGTATCGCTGTATCGTAGCCTGTATCGTGTATCGGTGTATCGGTGTATCGGCGTGTATCGAATGTATCGTTGTATCGCTGTATCGTGTGCTGTGTATCGGTGTATCGACCCGTTGTATCGTGTATCGTGGGTAAGTGTATCGTTGTATCGTAGTGTATCGTGTATCGTGTATCGTCATGGTATTGTATCGTTATAGCTGTATCGCTCGGCTGTATCGTATGTATCGTGTATCGTGTATCGCATGTATCGGTGTATCGCTGTATCGACCATGTATCGTGTATCGGTGTATCGATGTATCGCTTCCATTAGAAATGTATCGTATGTATCGTGTATCGCTGTATCGTTTGTATCGCATGTATCGATTGTGTATCGTGTATCGTTGTATCGTGTATCGTGTATCGTTGTATCGTAATAACGATGTATCGAATGATGTATCGTTGTGATGTATCGTGTATCGACTGTATCGATGTATCGGGTTGTATCGTGTATCGTGTATCGTAGATGTATCGAGAGCCATTGTATCGCCTTGTATCGCGTGTATCGGTGTGTATCGTTTGTATCGTGTTTGAATGTATCGCTGTATCG

TGTATCGTG

TTTCAGCGTGTTTGGCTTTCAGCACTACTTTCAGCGGTTTCAGCTTTTTCAGCTTTTCAGCTATTTTCAGCTTTCAGCCATCTTTCAGCCCAGGTTTCAGCTGTTTCAGCTTTCAGCGAGTTTCAGCTTTCAGCGCGGGGATTTCAGCGTATTTCAGCGACATTTCAGCAGTTTCAGCTGGAGTGAAGCCGTTTCAGCGCTTTTCAGCACCATTTCAGCTAGTAGGTTTCAGCTTTTCAGCCTTTTCAGCTTTCAGCTATTTCAGCTCTAGAAGTCGTTTCAGCTTTCAGCTTCCTTTTCAGCTTTCAGCTTTCAGCCTTTCAGCTTTCAGCAAACTTTCAGCGATTTCAGCTTTCAGCTTTCAGCTTTCAGCTTTCAGCATTTCAGCTTTCAGCCGTTTTCAGCGACTCTTTCAGCGTTTCAGCTTTCAGCTGATCGTTTTCAGCTTTTCAGCCGGTTTCAGCGAAAGTTGGTCTTTTCAGCCAAATTTTCAGCTTTCAGCTTTCAGCGTTTCAGCTTTTCAGCCATTTCAGCCTATTTTCAGCAATCTTTCAGCAATTTTCAGCGCAATTTCAGCCAAAATTTCAGCATTTCAGCTTTCAGCGTAGTTTCAGCTTTCAGCTGAGCCTTTCAGCTTTTCAGCTCTCTTTTCAGCTTTCAGCGATTTCAGCCTTTTCAGCGCTTTCAGCACCACGCCTTTCAGCTTTCAGCTTTTCAGCCTTTCAGCCTATTTTCAGCGGTTTCAGCGTTTCAGCTTTCAGCACTTTCAGCTTTCAGCTTTTCAGCGGAAGGTTTTCAGCA

TTTCAGCTT

AGCATCCGACTGCATCCGTGCATCCGACGGCATCCGGCATCCGCAGCATCCGCACAGCCGCATCCGAGCCCGCATCCGAAGCATCCGGCATCCGGGCATCCGGCCGCATCCGGCATCCGGGCATCCGTGCATCCGGCATCCGTGTGCATCCGGCATCCGGGGTGCTTGCATCCGCGCATCCGTGCATCCGCGCATCCGGCATCCGCAGTCCGCATCCGCGCATCCGGCATCCGAGCATCCGTATTCGCATCCGTCGAGCATCCGTACGTCGCATCCGTGCATCCGGGAGAGGCATCCGGGCATCCGCGCATCCGGTGGACTATAACGCGCATCCGGCGAGCATCCGCCGCGGCATCCGTGCATCCGGTACGGCATCCGGGCTGAGGCATCCGCTCCCGGGGCATCCGCGGCGCATCCGCACGCGCATCCGGCATCCGAGCATCCGTTGCATCCGGAGCATCCGTAGCATCCGATAGCATCCGCAAGCTTGCATCCGGCATCCGCTACAGCATCCGGTGGCATCCGAAGCATCCGCGCATCCGCGGATTGCATCCGCACACGGCATCCGTAGCATCCGTGCATCCGGAGTGCATCCGCCGGGCGCATCCGCTGCATCCGCAGCGCATCCGAGCATCCGATCTTGCATCCGGCATCCGGGAATGCATCCGGGACTGCATCCGGTCTTAAGGGTGCATCCGAATGCATCCGCTGTAGCATCCGGCATCCGAGTAAGCATCCGAGTTCTGCATCCGGAAGCAGCATCCGGCATCCGGACACCAGCATCCGCGCATCCGGGGCGAGCATCCGAGCATCCGGCATCCGGGGCAAGTGGCATCCGGCATCCGTCGTGCATCCGGGCATCCGAA

GCATCCGGC

GACGAGCTGACGAGCGTGACGAGCAGGCGACGAGCGACGAGCATGCGGTGACGAGCGGACGAGCGACGACGAGCGCGGACGAGCGACGAGCGGACGAGCTCCGACGAGCGACGAGCACGACGAGCGCCCGACGAGCGACGAGCATGATGACGAGCGACGAGCTTAGACGAGCATGCTGACGAGCTGACGAGCTGACGAGCTCGTACATCGACGAGCGGACGAGCAGCCCGATAAGCCTTCGACGAGCTGACGAGCCACGACGAGCGACGAGCGACGAGCGGACGAGCGGGACGAGCGTCGGACGAGCGGACGAGCAGGACGAGCACCTCAATCGACGAGCTGACGAGCGGACGAGCCGACGAGCTGAACTGAAGGACGAGCCTACGTTCTAACGTGCCGTCACTGACGAGCGACGAGCGGACGAGCGATGGACGAGCTGACGAGCGCAGGACGAGCGAGAAGGCTGACGAGCAGACGAGCTGACGAGCTCGACGAGCAGACGAGCGGACGAGCTTTGATAAGACGAGCACGTCGACGAGCCTCGTGACGAGCTGACGAGCGACGAGCTAGACGAGCGACGAGCTGACGAGCGACGAGCACTAAACGCGACGAGCTCGACGACGAGCATGGACGAGCGACGAGCTAGGACGAGCGGCGACGAGCAGACGAGCGACGAGCTGACGAGCTTATAGACGAGCCTGACGACGAGCCAAGACGAGCGACGAGCGGACGAGCGACGAGCACGACGAGCCGACGAGCCGCGGACGAGCCGTAGACGAGCCAATCATTAAGACGAGCCGACGAGCCATTTGGGGACGAGCCTCCTCGACGAGCCATAAGACGAGCTGACGAGCCATGACGAGCATGCCCGACGAGC

GACGAGCGA

GAGGACCCCGGACCCCTGGACCCCAGGACCCCGACGGTGGGGCGGACCCCGAGAAGGACCCCTGGACCCCCGCAGGACCCCTTTATCGGACCCCGGACCCCGGACCCCCGGACCCCGGCTGGACCCCGTCGTAAGGACCCCGAATCGGACCCCTAGGACCCCAGGGACCCCTCCCCGGTTGGACCCCCTGGACCCCCTTGAGAGGGACCCCGGACCCCACGCCGTGCTTAAGGACCCCACTATGGACCCCATACGGACCCCGGACCCCGATCAGAGCGACCAGGACCCCCGGCCTGGACCCCTCGGACCCCAGAAGGACCCCTCGGACCCCTGGACCCCGGACCCCACAGGGACCCCGGACCCCTAGCCGCGGTGGACCCCCGGACCCCCGCAGATGGGGACCCCTCATGCGGACCCCCTGACGGACCCCTGGACCCCCGGGACCCCCAGGGACCCCATTTCGAGGACCCCATGGGGACCCCAAGCTGGACCCCGGACCCCTGGGACCCCCGGACCCCGGACCCCGGACCCCATAGGACCCCCGTTTGTTGCCATGGACCCCTTGGGACCCCGAGTCGGACCCCGGACCCCCAGGACCCCACAGGACCCCGGGACCCCGGACCCCATCGATCGGACCCCGGACCCCGGACCCCTACGGACCCCGCTAAGGGACCCCGGACCCCGTGGACCCCCACGGACCCCTAGGACCCCGGGACCCCGGACCCCGGACCCCAGTTGGACCCCCCTTAGGACCCCAGGACCCCACATGAGGACCCCGGGACCCCGGGACCCCTGGACCCCTGGACCCC

GGACCCCGG

AAACCTTCGGACCTTCGTACCTTCGACCTTCGTGAAACCTTCGGACCTTCGTGACCTTCGGGTGACCTTCGGGTACTCACCTTCGGCACCTTCGTATACCTTCGAGACCTTCGCTGGACTGTAAACCTTCGATCGTTATTTTCTACCTTCGCACACCTTCGACCTTCGAGCCGAACCTTCGGACTGGCCTCACCTTCGATCACCTTCGACCTTCGAACCTTCGACCTTCGACCATACCTTCGACCTTCGGACCTTCGGACCTTCGCACCTTCGAACCTTCGCCACCTTCGACCTTCGAACCTTCGGTGGGCCACCTTCGTTACCTTCGAACCTTCGGGCACCTTCGACCTTCGGCTTACACCTTCGGACCTTCGATGACTGACCTTCGAAACCTTCGACCTTCGACCTTCGATTCCACACCTTCGACCTTCGACACCTTCGACCTTCGCGTTCACCTTCGACCTTCGACCTTCGGGCCAATACCTTCGACCTTCGGCCTAATACCAACCTTCGAACCTTCGAACCTTCGGGTGACCTTCGGACCTTCGCGCACACCTTCGCACTTACCTTCGACCTTCGGTGACCTTCGACCTTCGTGCACCTTCGCGTCTCACCTTCGCACCTTCGTTACCTTCGAACCTTCGACCTTCGACAACCTTCGTACCTTCGCTACCTTCGGGAACCTTCGATGTCCTACCTTCGACCTTCGCGACCTTCGACCTTCGTAACCTTCGAACCTTCGACCACCTTCGAACCTTCGAAGGAAGGACCTTCGAATCATTACCTTCGTTACCTTCGTAGTACCTTCGCCGACCTTCGGACCTTAACCTTCGTGA

ACCTTCGAC

AAGTGGTATTCCCAAAAAGATATTCCCGTTATTCCCGCTATAAATATTCCCAACTGTTTATTCCCACTTATTCCCGTATTCCCGTATTCCCCGGGCCTTTTATTCCCCATTCTATTCCCGTATTCCCATATTCCCACTTATTCCCCTATTCCCATATTCCCTATTCCCTATTCCCGTAAGTATTCCCCCCCTTATTCCCCCATTTGTCCATATTCCCTCAATATTCCCTATTCCCCTATTCCCGCTTCCCCAGTCTATCCCAAATTGTATTCCCAATATTCCCCTTTATTCCCTTTATTCCCGTATTCCCTATTCCCGTGGCTATTCCCCCGACGTTATTCCCTCTATATTCCCGTCTCAAAGTATTCCCTTATTCCCAGTATTCCCATTCGCCTGGTATTCCCGTATTCCCGGTATTCCCGTTTATTCCCATCTATTCCCATTATTCCCTGGTATTCCCGCTATTCCCGCAACGGTATTCCCTATTCCCTATTCCCGTATTCCCAATATTCCCTTCCTATTCCCGTATTCCCTAGCTATTCCCATATTCCCGATATTCCCTATTCCCGTATTCCCTATTCCCTTATTCCCTATTCCCTAGTTATTCCCAAACAGGCTATTCCCTATTCCCTATTCCCATATTCCCTTATTCCCTACGTATTCCCTCTTAAAAATATTATTCCCCTATTCCCTATTCCCGTATTCCCCGAAAGTATTCCCCCATATTCCCAATTATATTCCCGGTATTCCCCCGTATTCCCTATTCCCTTATTCCCATATTCCCATATTCCCAAAGTTTATTCCCGTATTCCCCACTATTCCCATAGATTATTCCCTATTCCCTATTCCCTTATTCCCCTATTCCCTATTCCCTTATTCCCGTATTCCCAGTTTATTCCC

TATTCCCTA

GAAAGGATAAAGGATGTCAAAGGATGCAGATATAAAGGATAAAGGATACAAAGGATTAAGTATGTCCGAAAAAGGATAAAGGATTAAAAGGATCGCTGAACCTTACACAAAGGATAGTGAACAAAGGATTCAATAAAAGGATAAAGGATCAAAGGATCAAAAGGATACAAAGGATGCCGATGAAAGGATAAAGGATCTAAAGGATAAAGGATGAGAAAGGATTGGAAAGGATTATGAGATCAAAGGATAAAGGATGGTGTAAAAGCTAAAGGATTCGGCAAAGGATAAAGGATTAAAGGATAAAAAGGATAAAGGATGAAAGGATCCGCAGGGACCAGCAAAAGGATAAAGGATCGAATGGGTAAGAAAGGATCAAAGGATGAAAAAGGATTACTAAAGGATAAAGGATCCTGAAAGGATTACAAAGGATCTTAAAAGGATTCGGAAAGGATCCATAGGAAAAGGATAAAGGATCGCGAAAGGATAAAGGATTAAAGGATAAAGGATAAAAGGATTCAAAGGATAAAGGATAGACAAGGGAGAAAGGATGCAAAGGATGAAAGGATAAAGGATAAAGGATAAAGGATTAGGTTAAAGGATCGAAAGGATCCAAAGAGAGCGAAAAGGATGAGGACGAAAGGATCAAAGGATCCAAAGGATCAAAGGATAAAGGATTGCGATGGAAAGGATGTCAAAGGATCACCAAAGGATAAAGGATAATAAAGGATAAAGGATAAAGGATCAAAGGATTTGAAAAGGATAAAGGATCACGAAAGGATAAAAGGATTGGCAAAGGATGAAAAGGATGAAAGGATAAGCCCTCCAAAGGAT

AAAGGATAA

ATTCAAATACTTCAAATAGTCAAATATCAAATATGCTTCAAATATCAAATATCGGTCAAATAAGTCAAATAAGTCAAATATCAAATATCAAATACTCAAATATCAAATAGTGTCAAATACGAATTGGGTCAAATATCAAATATGTGTTCAAATATTCTTCAAATACTGGACACTCAAATAGGAGTCAAATATCAAATATCAAATACGTGAAGTGCTTGTCAAATATTTCAAATACTTTCAAATATTCAAATACTCAAATATGTTCAAATATCAAATATCAAATATTTCAAATATCAAATATCAAATAGTCCTCAAATAGCAAACCAGTCAAATATCAAATATGTCAAATATGCTCACGGCAACCTCAAATATCAAATATCAAATAGATGTCAAATAAGTTTCAAATATCAAATATCAAATATCAAATATCAAATATTCAAATATCAAATAGCGGGCTCAAATAAGCTCAAATACGTCAAATAGGGGGTCAAATAGATCAAATAGTCAAATATTCAAATACATCAAATAGTCAAATACAAGAACCACCGAGATCTCAAATAATCAAATATCAAATATGGTCAAATAAATATTCAAATAGGTCAAATAAGATCAAATATCAAATAAGTCGTCCATCAAATAGTCAAATAGCTCAAATAACCTCAAATATCAAATATTCAAATACCGGTCATCAAATAGGACAAATCAAATAGTCAAATAAGATCCTCTCAAATAATTCAAATAGCTGTTCAAATACTCAAATATCAAATAGTCAAATATCAAATATTCAAATATCAAATACTTCAAATATGTTCAAATAATCAAATACTCAAATATCAAATAATCAAATAATACTTCAAATATACCAAACGCTCAAATATTAGTTGGATCAAATATCTTCAAATATCAAATAA

TCAAATATC

这段简单的代码可以解决你的问题:程序读取文件并把每一行放进一个列表;我对列表做了过滤,因为需要去掉 '\n';之后我运行循环并检查每一项的索引。如果索引是偶数,则 s 取当前项的值,t 取下一项的值。

# Read the input file, then run BruteForce on each (sequence, motif) pair.
# The context manager guarantees the file is closed even if reading fails.
with open('test.txt', 'r') as f:
    # Strip line endings and drop blank lines, so a stray '\n' (or '\r\n')
    # can never become part of a sequence or motif.
    lines = [line.strip() for line in f]
    lines = [line for line in lines if line]

# The input format may start with a line holding only the number of pairs
# (e.g. "10"). It is not a sequence, so skip it to keep the pairs aligned.
if lines and lines[0].isdigit():
    lines = lines[1:]

for idx, val in enumerate(lines):
    # Even indexes hold a sequence and the following line holds its motif.
    # A trailing sequence without a motif is ignored instead of raising
    # IndexError.
    if idx % 2 == 0 and idx + 1 < len(lines):
        s = val
        t = lines[idx + 1]
        BruteForce(s, t)