字典列表:按列表键分组,使各组内无交集
List of dicts, group without intersection by list keys
我需要帮助来优化我的代码。
我有一个数据:
data = [
{"ids": [1]},
{"ids": [3, 4]},
{"ids": [1, 2]},
{"ids": [2]},
]
我需要对其进行分组,使同一组内各元素的 ids 互不相交,因此预期结果应为:
expected = [
[{"ids": [1]}, {"ids": [2]}],
[{"ids": [3, 4]}, {"ids": [1, 2]}],
] # only 2 sublist here
我要拆分的代码(未优化):
import itertools as it
def _split(
list_of_dicts,
):
splitted_list_of_dicts = []
sub_list = []
while list_of_dicts:
for dct in list_of_dicts:
ids_in_sub_list = set(
it.chain(*[sub_list_el["ids"] for sub_list_el in sub_list]),
)
if not set(dct["ids"]).intersection(ids_in_sub_list):
sub_list.append(dct)
list_of_dicts.remove(dct)
splitted_list_of_dicts.append(sub_list)
sub_list = []
return splitted_list_of_dicts
我的代码的结果是:
result = [
[{'ids': [1]}, {'ids': [2]}],
[{'ids': [3, 4]}],
[{'ids': [1, 2]}]
] # 3 sublist
结果多出了一个子列表,我正在尝试对其进行优化。
如果您对如何帮助我有任何想法,我会很高兴,感谢您的宝贵时间。
更多示例:
data = [
{"ids": [1]},
{"ids": [3, 4]},
{"ids": [1, 2]},
{"ids": [4]},
{"ids": [3]},
{"ids": [2]},
]
可以分组为 2 个子列表:
expected = [
[{'ids': [1]}, {'ids': [4]}, {'ids': [2]}, {'ids': [3]}],
[{'ids': [3, 4]}, {'ids': [1, 2]}],
]
但现在我得到了 4 个子列表:
result = [
[{'ids': [1]}, {'ids': [4]}, {'ids': [2]}],
[{'ids': [3, 4]}],
[{'ids': [1, 2]}],
[{'ids': [3]}]
]
如果任何不包含重复 ID 的分组方式都可以接受,您可以简单地遍历 data
列表,并将当前元素追加到结果中第一个尚未包含其任何 ID 的子列表中。
def split(list_of_dicts):
    """Group dicts so that no group contains the same id twice (first fit).

    Each dict is appended to the first existing group that holds none of its
    ``"ids"``; if every group already holds at least one of them, a new group
    is started. The input list is not modified.

    Args:
        list_of_dicts: Iterable of dicts, each holding an ``"ids"`` iterable
            of hashable values.

    Returns:
        A list of groups (lists of the input dicts) in creation order. An
        empty input yields an empty list.
    """
    seen_ids = []  # One set of ids per group, for fast membership checks
    groups = []  # This is what we return
    for d in list_of_dicts:
        ids = d["ids"]
        for seen, group in zip(seen_ids, groups):
            if seen.isdisjoint(ids):
                seen.update(ids)
                group.append(d)
                break
        else:
            # The for loop ended without hitting ``break``: no existing
            # group could take this dict, so start a new group with it.
            groups.append([d])
            seen_ids.append(set(ids))
    return groups
有了你的原始数据,
data = [
{"ids": [1]},
{"ids": [3, 4]},
{"ids": [1, 2]},
{"ids": [2]},
]
split(data)
我们得到输出:
[
[{'ids': [1]}, {'ids': [3, 4]}, {'ids': [2]}],
[{'ids': [1, 2]}]
]
这似乎是一个可以接受的解决方案,因为没有任何一个子列表包含重复的 ID。
第二个例子:
data = [
{"ids": [1]},
{"ids": [3, 4]},
{"ids": [1, 2]},
{"ids": [4]},
{"ids": [3]},
{"ids": [2]},
]
split(data)
这给出了输出:
[
[{'ids': [1]}, {'ids': [3, 4]}, {'ids': [2]}],
[{'ids': [1, 2]}, {'ids': [4]}, {'ids': [3]}]
]
在这种情况下也没有重复。
据我从你的问题中看来,你实际上是在按每个元素的 ids 列表的基数(长度)进行排序和分组。
from itertools import groupby
def transform(data):
    """Group dicts by the length of their ``"ids"`` value, shortest first.

    Note that grouping is by length only: the ids themselves are never
    compared, so two dicts that share an id (for example two ``{"ids": [1]}``
    entries) end up in the same group.

    Args:
        data: Iterable of dicts, each holding a sized ``"ids"`` value.

    Returns:
        A list of groups, one per distinct length in ascending order. Within
        a group the dicts keep their relative input order.
    """

    def cardinality(item):
        return len(item["ids"])

    # groupby only merges adjacent items, so sort by the same key first.
    sorted_data = sorted(data, key=cardinality)
    return [list(group) for _, group in groupby(sorted_data, key=cardinality)]
输出为:
[
[
{'ids': [1]},
{'ids': [4]},
{'ids': [3]},
{'ids': [2]}
],
[
{'ids': [3, 4]},
{'ids': [1, 2]}
]
]
我需要帮助来优化我的代码。
我有一个数据:
data = [
{"ids": [1]},
{"ids": [3, 4]},
{"ids": [1, 2]},
{"ids": [2]},
]
我需要对其进行分组,使同一组内各元素的 ids 互不相交,因此预期结果应为:
expected = [
[{"ids": [1]}, {"ids": [2]}],
[{"ids": [3, 4]}, {"ids": [1, 2]}],
] # only 2 sublist here
我要拆分的代码(未优化):
import itertools as it
def _split(
list_of_dicts,
):
splitted_list_of_dicts = []
sub_list = []
while list_of_dicts:
for dct in list_of_dicts:
ids_in_sub_list = set(
it.chain(*[sub_list_el["ids"] for sub_list_el in sub_list]),
)
if not set(dct["ids"]).intersection(ids_in_sub_list):
sub_list.append(dct)
list_of_dicts.remove(dct)
splitted_list_of_dicts.append(sub_list)
sub_list = []
return splitted_list_of_dicts
我的代码的结果是:
result = [
[{'ids': [1]}, {'ids': [2]}],
[{'ids': [3, 4]}],
[{'ids': [1, 2]}]
] # 3 sublist
结果多出了一个子列表,我正在尝试对其进行优化。 如果您对如何帮助我有任何想法,我会很高兴,感谢您的宝贵时间。
更多示例:
data = [
{"ids": [1]},
{"ids": [3, 4]},
{"ids": [1, 2]},
{"ids": [4]},
{"ids": [3]},
{"ids": [2]},
]
可以分组为 2 个子列表:
expected = [
[{'ids': [1]}, {'ids': [4]}, {'ids': [2]}, {'ids': [3]}],
[{'ids': [3, 4]}, {'ids': [1, 2]}],
]
但现在我得到了 4 个子列表:
result = [
[{'ids': [1]}, {'ids': [4]}, {'ids': [2]}],
[{'ids': [3, 4]}],
[{'ids': [1, 2]}],
[{'ids': [3]}]
]
如果任何不包含重复 ID 的分组方式都可以接受,您可以简单地遍历 data
列表,并将当前元素追加到结果中第一个尚未包含其任何 ID 的子列表中。
def split(list_of_dicts):
    """Group dicts so that no group contains the same id twice (first fit).

    Each dict is appended to the first existing group that holds none of its
    ``"ids"``; if every group already holds at least one of them, a new group
    is started. The input list is not modified.

    Args:
        list_of_dicts: Iterable of dicts, each holding an ``"ids"`` iterable
            of hashable values.

    Returns:
        A list of groups (lists of the input dicts) in creation order. An
        empty input yields an empty list.
    """
    seen_ids = []  # One set of ids per group, for fast membership checks
    groups = []  # This is what we return
    for d in list_of_dicts:
        ids = d["ids"]
        for seen, group in zip(seen_ids, groups):
            if seen.isdisjoint(ids):
                seen.update(ids)
                group.append(d)
                break
        else:
            # The for loop ended without hitting ``break``: no existing
            # group could take this dict, so start a new group with it.
            groups.append([d])
            seen_ids.append(set(ids))
    return groups
有了你的原始数据,
data = [
{"ids": [1]},
{"ids": [3, 4]},
{"ids": [1, 2]},
{"ids": [2]},
]
split(data)
我们得到输出:
[
[{'ids': [1]}, {'ids': [3, 4]}, {'ids': [2]}],
[{'ids': [1, 2]}]
]
这似乎是一个可以接受的解决方案,因为没有任何一个子列表包含重复的 ID。
第二个例子:
data = [
{"ids": [1]},
{"ids": [3, 4]},
{"ids": [1, 2]},
{"ids": [4]},
{"ids": [3]},
{"ids": [2]},
]
split(data)
这给出了输出:
[
[{'ids': [1]}, {'ids': [3, 4]}, {'ids': [2]}],
[{'ids': [1, 2]}, {'ids': [4]}, {'ids': [3]}]
]
在这种情况下也没有重复。
据我从你的问题中看来,你实际上是在按每个元素的 ids 列表的基数(长度)进行排序和分组。
from itertools import groupby
def transform(data):
    """Group dicts by the length of their ``"ids"`` value, shortest first.

    Note that grouping is by length only: the ids themselves are never
    compared, so two dicts that share an id (for example two ``{"ids": [1]}``
    entries) end up in the same group.

    Args:
        data: Iterable of dicts, each holding a sized ``"ids"`` value.

    Returns:
        A list of groups, one per distinct length in ascending order. Within
        a group the dicts keep their relative input order.
    """

    def cardinality(item):
        return len(item["ids"])

    # groupby only merges adjacent items, so sort by the same key first.
    sorted_data = sorted(data, key=cardinality)
    return [list(group) for _, group in groupby(sorted_data, key=cardinality)]
输出为:
[
[
{'ids': [1]},
{'ids': [4]},
{'ids': [3]},
{'ids': [2]}
],
[
{'ids': [3, 4]},
{'ids': [1, 2]}
]
]