如何使用来自多个词典的子字符串 select 序列
How to select sequences using substrings from several dictionaries
我有四个包含子字符串的字典:
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
和一个 fasta 文件:
>MN00153:75:000H37WNG:1:11102:8051:1085
NNNNNNNNCCCFFFNNNNNGGGTTTNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
>MN00153:75:000H37WNG:1:11102:16389:1090
NNNHHHFFFNNNNNNNOOOPPPNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
如果两个子字符串来自特定词典,我想 select 序列。子字符串的顺序并不重要。
换句话说,如果一个子字符串来自 fw1
而另一个子字符串来自 fw2_rc
字典,我希望我的代码 select 读取 OR
一个子字符串来自fw1_rc
另一个来自 fw2
词典。
这是我的代码;它 select 正确读取但多次重复输出:
from Bio import SeqIO
count=0
with open('file.fasta','r') as f:
for record in SeqIO.parse(f,'fasta'):
for k1,Fw1 in fw1.items():
for k2,Fw1_rc in fw1_rc.items():
for k3,Fw2 in fw2.items():
for k4,Fw2_rc in fw2_rc.items():
if Fw1 in record.seq and Fw2_rc in record.seq:
pos1 = record.seq.find(Fw1) + 1
pos2 = record.seq.find(Fw2_rc) + 1
if pos1 < pos2:
distance = pos2 - pos1
if pos1 > pos2:
distance = pos1 - pos2
print("sample_2")
print(record.id)
print(record.seq)
print(str(k1) + " : " + str(Fw1) + " - The position is " + str(pos1))
print(str(k4) + " : " + str(Fw2_rc) + " - The position is " + str(pos2))
print('\n')
if Fw1_rc in record.seq and Fw2 in record.seq:
pos1 = record.seq.find(Fw1_rc) + 1
pos2 = record.seq.find(Fw2) + 1
if pos1 < pos2:
distance = pos2 - pos1
if pos1 > pos2:
distance = pos1 - pos2
print(record.id)
print(record.seq)
print(str(k2) + " : " + str(Fw1_rc) + " - The position is " + str(pos1))
print(str(k3) + " : " + str(Fw2) + " - The position is " + str(pos2))
print('\n')
count+=1
print("The total number of reads that have both 21nt protein-specific sequences is " + str(count))
所需的输出应该是:
sample_2
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF - The position is 7
PLAU_fw2_rc : rrreee - The position is 18
sample_2
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff - The position is 1
MIF_fw2 : KKKZZZ - The position is 14
The total number of reads that have both 21nt protein-specific sequences is 2
我无法获得计数,我反转了词典中的 key/value 项以允许查找(否则无法获得您想要的结果)
此外,我无法使用 Bio
,只能从文本文件中读取,但对我的代码所做的更改可以轻松更改为 Bio
seq
和 id
.
import re
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw1_rc= dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
fw2_rc= dict(zip(fw2_rc.values(), fw2_rc.keys()))
one_upcase = '(' + '|'.join(fw1.keys()) + ')'
one_locase = '(' + '|'.join(fw1_rc.keys()) + ')'
two_upcase = '(' + '|'.join(fw2.keys()) + ')'
two_locase = '(' + '|'.join(fw2_rc.keys()) + ')'
with open('f1.txt', 'r') as f:
_id = ''
count = 0
for line in f:
line = line.rstrip()
if line.startswith('>'):
_id = line[1:]
else:
if match := re.search(f'(?=.*{one_upcase})(?=.*{two_locase})', line):
print(_id)
print(line)
for item in match.groups():
idx = 1 + line.index(item)
if item.isupper():
print(fw1[item], ': ', end='')
else:
print(fw2_rc[item], ': ', end='')
print(item, 'The position is', idx)
print()
elif match := re.search(f'(?=.*{one_locase})(?=.*{two_upcase})', line):
print(_id)
print(line)
for item in match.groups():
idx = 1 + line.index(item)
if item.isupper():
print(fw2[item], ': ', end='')
else:
print(fw1_rc[item], ': ', end='')
print(item, 'The position is', idx)
print()
输出与您的输出匹配:
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF The position is 7
PLAU_fw2_rc : rrreee The position is 18
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff The position is 1
MIF_fw2 : KKKZZZ The position is 14
更新
这是不使用正则表达式的解决方案
(此外,我安装了 Bio
以将其用于此解决方案)
from Bio import SeqIO
# make values, (fasta seq), keys in dict and original keys now become values
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw1_rc= dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
fw2_rc= dict(zip(fw2_rc.values(), fw2_rc.keys()))
# store fasta substrings in lists
one_upcase = list(fw1.keys())
one_locase = list(fw1_rc.keys())
two_upcase = list(fw2.keys())
two_locase = list(fw2_rc.keys())
with open('fasta.txt', 'r') as f:
count = 0
for record in SeqIO.parse(f,'fasta'):
_id = record.id
seq = record.seq
last = False
for token_fw1 in one_upcase:
if last == True:
break
for token_fw2_rc in two_locase:
if token_fw1 in seq and token_fw2_rc in seq:
print(_id)
print(seq)
print(fw1[token_fw1], ':', token_fw1,
'in position', str(1+seq.index(token_fw1)))
print(fw2_rc[token_fw2_rc], ':', token_fw2_rc,
'in position', str(1+seq.index(token_fw2_rc)))
print()
last = True
break
for token_fw1_rc in one_locase:
if last == True:
break
for token_fw2 in two_upcase:
if token_fw1_rc in seq and token_fw2 in seq:
print(_id)
print(seq)
print(fw1_rc[token_fw1_rc], ':', token_fw1_rc,
'in position', str(1+seq.index(token_fw1_rc)))
print(fw2[token_fw2], ':', token_fw2,
'in position', str(1+seq.index(token_fw2)))
print()
last = True
break
我这里没有数,因为我不知道你想数什么。
我颠倒了字典(我想你创建的字典是错误的),其中值(fasta 子字符串)成为键,键成为值。这允许在我的解决方案中进行查找:
print(fw1_rc[token_fw1_rc] etc
和 print(fw2_rc[token_fw2_rc] etc
(完成此查找的 4 次中的两次)。
另外,注意,我这里说的token
是指fasta子串。
我有四个包含子字符串的字典:
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
和一个 fasta 文件:
>MN00153:75:000H37WNG:1:11102:8051:1085
NNNNNNNNCCCFFFNNNNNGGGTTTNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
>MN00153:75:000H37WNG:1:11102:16389:1090
NNNHHHFFFNNNNNNNOOOPPPNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
如果两个子字符串来自特定词典,我想 select 序列。子字符串的顺序并不重要。
换句话说,如果一个子字符串来自 fw1
而另一个子字符串来自 fw2_rc
字典,我希望我的代码 select 读取 OR
一个子字符串来自fw1_rc
另一个来自 fw2
词典。
这是我的代码;它 select 正确读取但多次重复输出:
from Bio import SeqIO
count=0
with open('file.fasta','r') as f:
for record in SeqIO.parse(f,'fasta'):
for k1,Fw1 in fw1.items():
for k2,Fw1_rc in fw1_rc.items():
for k3,Fw2 in fw2.items():
for k4,Fw2_rc in fw2_rc.items():
if Fw1 in record.seq and Fw2_rc in record.seq:
pos1 = record.seq.find(Fw1) + 1
pos2 = record.seq.find(Fw2_rc) + 1
if pos1 < pos2:
distance = pos2 - pos1
if pos1 > pos2:
distance = pos1 - pos2
print("sample_2")
print(record.id)
print(record.seq)
print(str(k1) + " : " + str(Fw1) + " - The position is " + str(pos1))
print(str(k4) + " : " + str(Fw2_rc) + " - The position is " + str(pos2))
print('\n')
if Fw1_rc in record.seq and Fw2 in record.seq:
pos1 = record.seq.find(Fw1_rc) + 1
pos2 = record.seq.find(Fw2) + 1
if pos1 < pos2:
distance = pos2 - pos1
if pos1 > pos2:
distance = pos1 - pos2
print(record.id)
print(record.seq)
print(str(k2) + " : " + str(Fw1_rc) + " - The position is " + str(pos1))
print(str(k3) + " : " + str(Fw2) + " - The position is " + str(pos2))
print('\n')
count+=1
print("The total number of reads that have both 21nt protein-specific sequences is " + str(count))
所需的输出应该是:
sample_2
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF - The position is 7
PLAU_fw2_rc : rrreee - The position is 18
sample_2
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff - The position is 1
MIF_fw2 : KKKZZZ - The position is 14
The total number of reads that have both 21nt protein-specific sequences is 2
我无法获得计数,我反转了词典中的 key/value 项以允许查找(否则无法获得您想要的结果)
此外,我无法使用 Bio
,只能从文本文件中读取,但对我的代码所做的更改可以轻松更改为 Bio
seq
和 id
.
import re
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw1_rc= dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
fw2_rc= dict(zip(fw2_rc.values(), fw2_rc.keys()))
one_upcase = '(' + '|'.join(fw1.keys()) + ')'
one_locase = '(' + '|'.join(fw1_rc.keys()) + ')'
two_upcase = '(' + '|'.join(fw2.keys()) + ')'
two_locase = '(' + '|'.join(fw2_rc.keys()) + ')'
with open('f1.txt', 'r') as f:
_id = ''
count = 0
for line in f:
line = line.rstrip()
if line.startswith('>'):
_id = line[1:]
else:
if match := re.search(f'(?=.*{one_upcase})(?=.*{two_locase})', line):
print(_id)
print(line)
for item in match.groups():
idx = 1 + line.index(item)
if item.isupper():
print(fw1[item], ': ', end='')
else:
print(fw2_rc[item], ': ', end='')
print(item, 'The position is', idx)
print()
elif match := re.search(f'(?=.*{one_locase})(?=.*{two_upcase})', line):
print(_id)
print(line)
for item in match.groups():
idx = 1 + line.index(item)
if item.isupper():
print(fw2[item], ': ', end='')
else:
print(fw1_rc[item], ': ', end='')
print(item, 'The position is', idx)
print()
输出与您的输出匹配:
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF The position is 7
PLAU_fw2_rc : rrreee The position is 18
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff The position is 1
MIF_fw2 : KKKZZZ The position is 14
更新
这是不使用正则表达式的解决方案
(此外,我安装了 Bio
以将其用于此解决方案)
from Bio import SeqIO
# make values, (fasta seq), keys in dict and original keys now become values
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw1_rc= dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
fw2_rc= dict(zip(fw2_rc.values(), fw2_rc.keys()))
# store fasta substrings in lists
one_upcase = list(fw1.keys())
one_locase = list(fw1_rc.keys())
two_upcase = list(fw2.keys())
two_locase = list(fw2_rc.keys())
with open('fasta.txt', 'r') as f:
count = 0
for record in SeqIO.parse(f,'fasta'):
_id = record.id
seq = record.seq
last = False
for token_fw1 in one_upcase:
if last == True:
break
for token_fw2_rc in two_locase:
if token_fw1 in seq and token_fw2_rc in seq:
print(_id)
print(seq)
print(fw1[token_fw1], ':', token_fw1,
'in position', str(1+seq.index(token_fw1)))
print(fw2_rc[token_fw2_rc], ':', token_fw2_rc,
'in position', str(1+seq.index(token_fw2_rc)))
print()
last = True
break
for token_fw1_rc in one_locase:
if last == True:
break
for token_fw2 in two_upcase:
if token_fw1_rc in seq and token_fw2 in seq:
print(_id)
print(seq)
print(fw1_rc[token_fw1_rc], ':', token_fw1_rc,
'in position', str(1+seq.index(token_fw1_rc)))
print(fw2[token_fw2], ':', token_fw2,
'in position', str(1+seq.index(token_fw2)))
print()
last = True
break
我这里没有数,因为我不知道你想数什么。
我颠倒了字典(我想你创建的字典是错误的),其中值(fasta 子字符串)成为键,键成为值。这允许在我的解决方案中进行查找:
print(fw1_rc[token_fw1_rc] etc
和 print(fw2_rc[token_fw2_rc] etc
(完成此查找的 4 次中的两次)。
另外,注意,我这里说的token
是指fasta子串。