Python: 在多对多映射中查找子集
Python: Find subsets across many-to-many mapping
我正在尝试使用多对多映射,找到一组映射到另一组特定子集的子集。
我有很多基因。每个基因都是一个或多个 COG 的成员(反之亦然),例如
- gene1 是 COG1 的成员
- gene1 是 COG1003 的成员
- gene2 是 COG2 的成员
- gene3 是 COG273 的成员
- gene4 是 COG1 的成员
- gene5 是 COG273 的成员
- gene5 是 COG71 的成员
- gene6 是 COG1 的成员
- gene6 是 COG273 的成员
我有一组代表酶的短 COG,例如。 COG1,COG273.
我想找到所有基因组,它们之间具有酶中每个 COG 的成员资格,但没有不必要的重叠(在这种情况下,例如,'gene1 and gene6' 将是虚假的,因为 gene6 已经是一个两个 COG 的成员)。
在这个例子中,答案是:
- 基因 1 和基因 3
- 基因 1 和基因 5
- 基因 3 和基因 4
- 基因 4 和基因 5
- 基因6
虽然我可以获得每个 COG 的所有成员并创建一个 'product',但这将包含虚假结果(如上所述),其中集合中的基因多于必要。
我的映射当前包含在字典中,其中键是基因 ID,值是该基因所属的 COG ID 列表。但是我承认这可能不是存储映射的最佳方式。
一次基本攻击:
Keep your representation as it is for now.
Initialize a dictionary with the COGs as keys; each value is an initial count of 0.
Now start building your list of enzyme coverage sets (ecs_list), one ecs at a time. Do this by starting at the front of the gene list and working your way to the end, considering all combinations.
Write a recursive routine to solve the remaining COGs in the enzyme. Something like this:
def pick_a_gene(gene_list, cog_list, solution_set, cog_count_dict):
pick the first gene in the list that is in at least one cog in the list.
let the rest of the list be remaining_gene_list.
add the gene to the solution set.
for each of the gene's cogs:
increment the cog's count in cog_count_dict
remove the cog from cog_list (if it's still there).
add the gene to the solution set.
is there anything left in the cog_list?
yes:
pick_a_gene(remaining_gene_list, cog_list, solution_set, cog_count_dict)
no: # we have a solution: check it for minimality
from every non-zero entry in cog_count_dict, subtract 1. This gives us a list of excess coverage.
while the excess list is not empty:
pick the next gene in the solution set, starting from the *end* (if none, break the loop)
if the gene's cogs are all covered by the excess:
remove the gene from the solution set.
decrement the excess count of each of its cogs.
The remaining set of genes is an ECS; add it to ecs_list
这对你有用吗?考虑到您的举止良好的例子,我相信它可以正确涵盖最小集合。请注意,当我们检查最小值时,从高端开始防止出现这种情况:
gene1: cog1, cog5
gene2: cog2, cog5
gene3: cog3
gene4: cog1, cog2, cog4
enzyme: cog1 - cog5
我们可以看到我们需要 gene3、gene4 和 gene1 或 gene2。如果我们从低端开始消除,我们将抛弃 gene1 并且永远找不到该解决方案。如果我们从高端开始,我们将消除 gene2,但会在主循环的稍后传递中找到该解决方案。
可以构建一个存在此类 3 向冲突的案例。在那种情况下,我们必须在最小性检查中编写一个额外的循环才能找到它们。不过,我了解到您的数据对我们来说并没有那么糟糕。
这对你有用吗?请注意,既然你说你有一组较短的 COG,我就继续嵌套 for 循环;可能有优化方法...
为了将来参考,请 post 您随问题一起获得的任何代码。
import itertools
d = {'gene1':['COG1','COG1003'], 'gene2':['COG2'], 'gene3':['COG273'], 'gene4':['COG1'], 'gene5':['COG273','COG71'], 'gene6':['COG1','COG273']}
COGs = [set(['COG1','COG273'])] # example list of COGs containing only one enzyme; NOTE: your data should be a list of multiple sets
# create all pair-wise combinations of our data
gene_pairs = [l for l in itertools.combinations(d.keys(),2)]
found = set()
for pair in gene_pairs:
join = set(d[pair[0]] + d[pair[1]]) # set of COGs for gene pairs
for COG in COGs:
# check if gene already part of enzyme
if sorted(d[pair[0]]) == sorted(list(COG)):
found.add(pair[0])
elif sorted(d[pair[1]]) == sorted(list(COG)):
found.add(pair[1])
# check if gene combinations are part of enzyme
if COG <= join and pair[0] not in found and pair[1] not in found:
found.add(pair)
for l in found:
if isinstance(l, tuple): # if tuple
print l[0], l[1]
else:
print l
def findGenes(seq1, seq2, llist):
from collections import OrderedDict
from collections import Counter
from itertools import product
od = OrderedDict()
for b,a in llist:
od.setdefault(a,[]).append(b)
llv = []
for k,v in od.items():
if seq1 == k or seq2 == k:
llv.append(v)
# flat list needed for counting genes frequencies
flatL = [ x for sublist in llv for x in sublist]
cFlatl = Counter(flatL)
# this will gather genes that like gene6 have both sequencies
l_lonely = []
for k in cFlatl:
if cFlatl[k] > 1:
l_lonely.append(k)
newL = []
temp = []
for sublist in llv:
for el in sublist:
if el not in l_lonely:
newL.append(el)
temp.append(newL)
newL = []
# temp contains only genes that do not belong to both sequences
# product will connect genes from different sequence groups
p = product(*temp)
for el in list(p):
print(el)
print(l_lonely)
输出:
lt = [('gene1', 'COG1'), ('gene1', 'COG1003'),('gene2', 'COG2'), ('gene3', 'COG273'), ('gene4', 'COG1'),
('gene5', 'COG273'),('gene5', 'COG71'), ('gene6' ,'COG1'),('gene6', 'COG273')]
findGenes('COG1', 'COG273', lt )
('gene1', 'gene3')
('gene1', 'gene5')
('gene4', 'gene3')
('gene4', 'gene5')
['gene6']
感谢您的建议,他们启发了我使用递归将一些东西组合在一起。我想处理任意基因齿轮关系,所以它需要一个通用的解决方案。这应该产生所有基因组(酶),它们之间是所有必需 COG 的成员,没有重复的酶,也没有冗余基因:
def get_enzyme_cogs(enzyme, gene_cog_dict):
"""Get all COGs of which there is at least one member gene in the enzyme."""
cog_list = []
for gene in enzyme:
cog_list.extend(gene_cog_dict[gene])
return set(cog_list)
def get_gene_by_gene_cogs(enzyme, gene_cog_dict):
"""Get COG memberships for each gene in enzyme."""
cogs_list = []
for gene in enzyme:
cogs_list.append(set(gene_cog_dict[gene]))
return cogs_list
def add_gene(target_enzyme_cogs, gene_cog_dict, cog_gene_dict, proposed_enzyme = None, fulfilled_cogs = None):
"""Generator for all enzymes with membership of all target_enzyme_cogs, without duplicate enzymes or redundant genes."""
base_enzyme_genes = proposed_enzyme or []
fulfilled_cogs = get_enzyme_cogs(base_enzyme_genes, target_enzyme_cogs, gene_cog_dict)
## Which COG will we try to find a member of?
next_cog_to_fill = sorted(list(target_enzyme_cogs-fulfilled_cogs))[0]
gene_members_of_cog = cog_gene_dict[next_cog_to_fill]
for gene in gene_members_of_cog:
## Check whether any already-present gene's COG set is a subset of the proposed gene's COG set, if so skip addition
subset_found = False
proposed_gene_cogs = set(gene_cog_dict[gene]) & target_enzyme_cogs
for gene_cogs_set in get_gene_by_gene_cogs(base_enzyme_genes, target_enzyme_cogs, gene_cog_dict):
if gene_cogs_set.issubset(proposed_gene_cogs):
subset_found = True
break
if subset_found:
continue
## Add gene to proposed enzyme
proposed_enzyme = deepcopy(base_enzyme_genes)
proposed_enzyme.append(gene)
## Determine which COG memberships are fulfilled by the genes in the proposed enzyme
fulfilled_cogs = get_enzyme_cogs(proposed_enzyme, target_enzyme_cogs, gene_cog_dict)
if (fulfilled_cogs & target_enzyme_cogs) == target_enzyme_cogs:
## Proposed enzyme has members of every required COG, so yield
enzyme = deepcopy(proposed_enzyme)
proposed_enzyme.remove(gene)
yield enzyme
else:
## Proposed enzyme is still missing some COG members
for enzyme in add_gene(target_enzyme_cogs, gene_cog_dict, cog_gene_dict, proposed_enzyme, fulfilled_cogs):
yield enzyme
输入:
gene_cog_dict = {'gene1':['COG1','COG1003'], 'gene2':['COG2'], 'gene3':['COG273'], 'gene4':['COG1'], 'gene5':['COG273','COG71'], 'gene6':['COG1','COG273']}
cog_gene_dict = {'COG2': ['gene2'], 'COG1': ['gene1', 'gene4', 'gene6'], 'COG71': ['gene5'], 'COG273': ['gene3', 'gene5', 'gene6'], 'COG1003': ['gene1']}
target_enzyme_cogs = ['COG1','COG273']
用法:
for enzyme in add_gene(target_enzyme_cogs, gene_cog_dict, cog_gene_dict):
print enzyme
输出:
['gene1', 'gene3']
['gene1', 'gene5']
['gene4', 'gene3']
['gene4', 'gene5']
['gene6']
虽然我不知道它的性能。
我正在尝试使用多对多映射,找到一组映射到另一组特定子集的子集。
我有很多基因。每个基因都是一个或多个 COG 的成员(反之亦然),例如
- gene1 是 COG1 的成员
- gene1 是 COG1003 的成员
- gene2 是 COG2 的成员
- gene3 是 COG273 的成员
- gene4 是 COG1 的成员
- gene5 是 COG273 的成员
- gene5 是 COG71 的成员
- gene6 是 COG1 的成员
- gene6 是 COG273 的成员
我有一组代表酶的短 COG,例如。 COG1,COG273.
我想找到所有基因组,它们之间具有酶中每个 COG 的成员资格,但没有不必要的重叠(在这种情况下,例如,'gene1 and gene6' 将是虚假的,因为 gene6 已经是一个两个 COG 的成员)。
在这个例子中,答案是:
- 基因 1 和基因 3
- 基因 1 和基因 5
- 基因 3 和基因 4
- 基因 4 和基因 5
- 基因6
虽然我可以获得每个 COG 的所有成员并创建一个 'product',但这将包含虚假结果(如上所述),其中集合中的基因多于必要。
我的映射当前包含在字典中,其中键是基因 ID,值是该基因所属的 COG ID 列表。但是我承认这可能不是存储映射的最佳方式。
一次基本攻击:
Keep your representation as it is for now.
Initialize a dictionary with the COGs as keys; each value is an initial count of 0.
Now start building your list of enzyme coverage sets (ecs_list), one ecs at a time. Do this by starting at the front of the gene list and working your way to the end, considering all combinations.
Write a recursive routine to solve the remaining COGs in the enzyme. Something like this:
def pick_a_gene(gene_list, cog_list, solution_set, cog_count_dict):
pick the first gene in the list that is in at least one cog in the list.
let the rest of the list be remaining_gene_list.
add the gene to the solution set.
for each of the gene's cogs:
increment the cog's count in cog_count_dict
remove the cog from cog_list (if it's still there).
add the gene to the solution set.
is there anything left in the cog_list?
yes:
pick_a_gene(remaining_gene_list, cog_list, solution_set, cog_count_dict)
no: # we have a solution: check it for minimality
from every non-zero entry in cog_count_dict, subtract 1. This gives us a list of excess coverage.
while the excess list is not empty:
pick the next gene in the solution set, starting from the *end* (if none, break the loop)
if the gene's cogs are all covered by the excess:
remove the gene from the solution set.
decrement the excess count of each of its cogs.
The remaining set of genes is an ECS; add it to ecs_list
这对你有用吗?考虑到您的举止良好的例子,我相信它可以正确涵盖最小集合。请注意,当我们检查最小值时,从高端开始防止出现这种情况:
gene1: cog1, cog5
gene2: cog2, cog5
gene3: cog3
gene4: cog1, cog2, cog4
enzyme: cog1 - cog5
我们可以看到我们需要 gene3、gene4 和 gene1 或 gene2。如果我们从低端开始消除,我们将抛弃 gene1 并且永远找不到该解决方案。如果我们从高端开始,我们将消除 gene2,但会在主循环的稍后传递中找到该解决方案。
可以构建一个存在此类 3 向冲突的案例。在那种情况下,我们必须在最小性检查中编写一个额外的循环才能找到它们。不过,我了解到您的数据对我们来说并没有那么糟糕。
这对你有用吗?请注意,既然你说你有一组较短的 COG,我就继续嵌套 for 循环;可能有优化方法...
为了将来参考,请 post 您随问题一起获得的任何代码。
import itertools
d = {'gene1':['COG1','COG1003'], 'gene2':['COG2'], 'gene3':['COG273'], 'gene4':['COG1'], 'gene5':['COG273','COG71'], 'gene6':['COG1','COG273']}
COGs = [set(['COG1','COG273'])] # example list of COGs containing only one enzyme; NOTE: your data should be a list of multiple sets
# create all pair-wise combinations of our data
gene_pairs = [l for l in itertools.combinations(d.keys(),2)]
found = set()
for pair in gene_pairs:
join = set(d[pair[0]] + d[pair[1]]) # set of COGs for gene pairs
for COG in COGs:
# check if gene already part of enzyme
if sorted(d[pair[0]]) == sorted(list(COG)):
found.add(pair[0])
elif sorted(d[pair[1]]) == sorted(list(COG)):
found.add(pair[1])
# check if gene combinations are part of enzyme
if COG <= join and pair[0] not in found and pair[1] not in found:
found.add(pair)
for l in found:
if isinstance(l, tuple): # if tuple
print l[0], l[1]
else:
print l
def findGenes(seq1, seq2, llist):
from collections import OrderedDict
from collections import Counter
from itertools import product
od = OrderedDict()
for b,a in llist:
od.setdefault(a,[]).append(b)
llv = []
for k,v in od.items():
if seq1 == k or seq2 == k:
llv.append(v)
# flat list needed for counting genes frequencies
flatL = [ x for sublist in llv for x in sublist]
cFlatl = Counter(flatL)
# this will gather genes that like gene6 have both sequencies
l_lonely = []
for k in cFlatl:
if cFlatl[k] > 1:
l_lonely.append(k)
newL = []
temp = []
for sublist in llv:
for el in sublist:
if el not in l_lonely:
newL.append(el)
temp.append(newL)
newL = []
# temp contains only genes that do not belong to both sequences
# product will connect genes from different sequence groups
p = product(*temp)
for el in list(p):
print(el)
print(l_lonely)
输出:
lt = [('gene1', 'COG1'), ('gene1', 'COG1003'),('gene2', 'COG2'), ('gene3', 'COG273'), ('gene4', 'COG1'), ('gene5', 'COG273'),('gene5', 'COG71'), ('gene6' ,'COG1'),('gene6', 'COG273')]
findGenes('COG1', 'COG273', lt )
('gene1', 'gene3')
('gene1', 'gene5')
('gene4', 'gene3')
('gene4', 'gene5')
['gene6']
感谢您的建议,他们启发了我使用递归将一些东西组合在一起。我想处理任意基因齿轮关系,所以它需要一个通用的解决方案。这应该产生所有基因组(酶),它们之间是所有必需 COG 的成员,没有重复的酶,也没有冗余基因:
def get_enzyme_cogs(enzyme, gene_cog_dict):
"""Get all COGs of which there is at least one member gene in the enzyme."""
cog_list = []
for gene in enzyme:
cog_list.extend(gene_cog_dict[gene])
return set(cog_list)
def get_gene_by_gene_cogs(enzyme, gene_cog_dict):
"""Get COG memberships for each gene in enzyme."""
cogs_list = []
for gene in enzyme:
cogs_list.append(set(gene_cog_dict[gene]))
return cogs_list
def add_gene(target_enzyme_cogs, gene_cog_dict, cog_gene_dict, proposed_enzyme = None, fulfilled_cogs = None):
"""Generator for all enzymes with membership of all target_enzyme_cogs, without duplicate enzymes or redundant genes."""
base_enzyme_genes = proposed_enzyme or []
fulfilled_cogs = get_enzyme_cogs(base_enzyme_genes, target_enzyme_cogs, gene_cog_dict)
## Which COG will we try to find a member of?
next_cog_to_fill = sorted(list(target_enzyme_cogs-fulfilled_cogs))[0]
gene_members_of_cog = cog_gene_dict[next_cog_to_fill]
for gene in gene_members_of_cog:
## Check whether any already-present gene's COG set is a subset of the proposed gene's COG set, if so skip addition
subset_found = False
proposed_gene_cogs = set(gene_cog_dict[gene]) & target_enzyme_cogs
for gene_cogs_set in get_gene_by_gene_cogs(base_enzyme_genes, target_enzyme_cogs, gene_cog_dict):
if gene_cogs_set.issubset(proposed_gene_cogs):
subset_found = True
break
if subset_found:
continue
## Add gene to proposed enzyme
proposed_enzyme = deepcopy(base_enzyme_genes)
proposed_enzyme.append(gene)
## Determine which COG memberships are fulfilled by the genes in the proposed enzyme
fulfilled_cogs = get_enzyme_cogs(proposed_enzyme, target_enzyme_cogs, gene_cog_dict)
if (fulfilled_cogs & target_enzyme_cogs) == target_enzyme_cogs:
## Proposed enzyme has members of every required COG, so yield
enzyme = deepcopy(proposed_enzyme)
proposed_enzyme.remove(gene)
yield enzyme
else:
## Proposed enzyme is still missing some COG members
for enzyme in add_gene(target_enzyme_cogs, gene_cog_dict, cog_gene_dict, proposed_enzyme, fulfilled_cogs):
yield enzyme
输入:
gene_cog_dict = {'gene1':['COG1','COG1003'], 'gene2':['COG2'], 'gene3':['COG273'], 'gene4':['COG1'], 'gene5':['COG273','COG71'], 'gene6':['COG1','COG273']}
cog_gene_dict = {'COG2': ['gene2'], 'COG1': ['gene1', 'gene4', 'gene6'], 'COG71': ['gene5'], 'COG273': ['gene3', 'gene5', 'gene6'], 'COG1003': ['gene1']}
target_enzyme_cogs = ['COG1','COG273']
用法:
for enzyme in add_gene(target_enzyme_cogs, gene_cog_dict, cog_gene_dict):
print enzyme
输出:
['gene1', 'gene3']
['gene1', 'gene5']
['gene4', 'gene3']
['gene4', 'gene5']
['gene6']
虽然我不知道它的性能。