在字典列表中查找重复值
Find duplicate values in list of dictionaries
我需要在一个列表中找到具有相同键值的字典,并创建一个新列表,其中只保留第一个字典。
示例列表:
lst_in = [{'First': 1, 'Second': 4}, {'First': 2, 'Second': 5}, {'First': 3, 'Second': 4}]
用于判断重复的键应为 'Second'。因此在这个例子中,第一个和第三个字典被视为重复(它们的 'Second' 值相同)。
我试过查看 Find duplicates in python list of dictionaries and python list of dictionaries find duplicates based on value,但找不到确切的答案。我只看一个键值。字典将始终具有相同的键。
预期输出:
lst_out = [{'First': 1, 'Second': 4}, {'First': 2, 'Second': 5}]
集合(set)非常适合处理“我是否已经见过这个值?”这类问题。
# Example input: the first and third dicts share the same 'Second' value.
lst_in = [{'First': 1, 'Second': 4}, {'First': 2, 'Second': 5}, {'First': 3, 'Second': 4}]
found = set()  # 'Second' values that have already been emitted
lst_out = []   # first dict seen for each distinct 'Second' value, in input order
for dct in lst_in:
    if dct['Second'] not in found:
        lst_out.append(dct)
        found.add(dct['Second'])
一些解决方案和基准。
解决方案
用字典(dict)的有趣做法:正向遍历以确定顺序,反向遍历以保留每个值对应的第一个字典。
# Two passes over lst_in feed one dict keyed by 'Second': the forward pass
# (step 1) fixes the key order, the backward pass (step -1) overwrites the
# values so the last write for each key is the earliest dict in lst_in.
lst_out = list(
    {
        entry['Second']: entry
        for step in [1, -1]
        for entry in lst_in[::step]
    }.values()
)
或者使用 setdefault
来跟踪每个值的第一个字典:
# Map each 'Second' value to the first dict that carries it: setdefault only
# stores a value when the key is missing, so later duplicates are ignored.
tmp = {}
for d in lst_in:
    tmp.setdefault(d['Second'], d)
# Dicts keep insertion order, so the values come out in first-seen order.
lst_out = list(tmp.values())
有趣且可能更快的版本:
# Bind the setdefault method of a fresh dict once, so the loop does not
# repeat the attribute lookup on every iteration.
add = {}.setdefault
for d in lst_in:
    add(d['Second'], d)
# add.__self__ is the dict that the bound method belongs to.
lst_out = list(add.__self__.values())
基准
具有 100 个不同 Second
值的 1000 个字典列表的时间(使用 Python 3.10.0):
361 μs 362 μs 364 μs dict_forward_backward
295 μs 297 μs 297 μs dict_setdefault
231 μs 231 μs 232 μs dict_setdefault_optimized
196 μs 196 μs 197 μs set_in_list_comprehension
190 μs 190 μs 190 μs set_in_list_comprehension_optimized
191 μs 191 μs 191 μs set_in_list_comprehension_optimized_2
201 μs 201 μs 201 μs set_with_loop
1747 μs 1751 μs 1774 μs with_lists
基准代码:
from timeit import repeat, default_timer as timer
from random import choices
# Benchmark input: 1000 dicts whose 'First' is the running index and whose
# 'Second' is drawn at random from range(100).
lst_in = [
    {'First': idx, 'Second': val}
    for idx, val in enumerate(choices(range(100), k=1000))
]
def dict_forward_backward(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    A single dict keyed by 'Second' is filled in two passes: the forward
    pass fixes the key order, the backward pass overwrites the values so
    that the last write for each key is the earliest dict in ``lst_in``.
    """
    return list({d['Second']: d
                 for s in [1, -1]
                 for d in lst_in[::s]}.values())
def dict_setdefault(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    ``setdefault`` only stores a value when the key is missing, so the dict
    ends up holding the first dict seen for every 'Second' value.
    """
    tmp = {}
    for d in lst_in:
        tmp.setdefault(d['Second'], d)
    return list(tmp.values())
def dict_setdefault_optimized(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    Same approach as ``setdefault`` on a temporary dict, but the bound method
    is looked up once before the loop instead of on every iteration.
    """
    add = {}.setdefault
    for d in lst_in:
        add(d['Second'], d)
    # add.__self__ is the dict that the bound method belongs to.
    return list(add.__self__.values())
def set_in_list_comprehension(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    Everything happens inside one list comprehension: ``for s in [set()]``
    and ``for v in [d['Second']]`` act as local assignments, and
    ``s.add(v) or d`` records the value (``add`` returns None) and then
    yields the dict itself.
    """
    return [s.add(v) or d
            for s in [set()]
            for d in lst_in
            for v in [d['Second']]
            if v not in s]
def set_in_list_comprehension_optimized(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    Single list comprehension that tracks seen values in a set; the set's
    ``add`` method is bound once (``for add in [s.add]``) so the per-item
    expression avoids the attribute lookup. ``add(v)`` returns None, so
    ``add(v) or d`` yields the dict.
    """
    return [add(v) or d
            for s in [set()]
            for add in [s.add]
            for d in lst_in
            for v in [d['Second']]
            if v not in s]
def set_in_list_comprehension_optimized_2(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    The seen-set and its bound ``add`` method are created before the
    comprehension; ``add(v)`` returns None, so ``add(v) or d`` records the
    value and yields the dict.
    """
    s = set()
    add = s.add
    return [add(v) or d
            for d in lst_in
            for v in [d['Second']]  # single-item loop used as a local assignment
            if v not in s]
def set_with_loop(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    A set remembers which 'Second' values were already emitted; a dict is
    appended to the result only the first time its value is encountered.
    """
    found = set()
    lst_out = []
    for dct in lst_in:
        if dct['Second'] not in found:
            lst_out.append(dct)
            found.add(dct['Second'])
    return lst_out
def with_lists(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    Seen values are kept in a plain list, so every membership test scans
    that list instead of doing a hash lookup as the set-based variants do.
    """
    out = {'keep': [], 'counter': []}
    for dct in lst_in:
        if dct['Second'] not in out['counter']:
            out['keep'].append(dct)
            out['counter'].append(dct['Second'])
    return out['keep']
# Candidate implementations, in the order they are checked and timed.
funcs = [
    dict_forward_backward,
    dict_setdefault,
    dict_setdefault_optimized,
    set_in_list_comprehension,
    set_in_list_comprehension_optimized,
    set_in_list_comprehension_optimized_2,
    set_with_loop,
    with_lists,
]

# Correctness: compare every implementation against the first one.
expect = funcs[0](lst_in)
for func in funcs[1:]:
    result = func(lst_in)
    print(result == expect, func.__name__)
print()

# Speed: three rounds; for each function print its three fastest timings.
for _ in range(3):
    for func in funcs:
        # The setup string re-enables garbage collection and collects before
        # each repetition. Every repetition times 1000 calls, so total
        # seconds * 1e3 equals microseconds per call.
        ts = sorted(repeat(lambda: func(lst_in), 'gc.enable(); gc.collect()', number=1000))[:3]
        print(*('%4d μs ' % (t * 1e3) for t in ts), func.__name__)
    print()
我需要在一个列表中找到具有相同键值的字典,并创建一个新列表,其中只保留第一个字典。
示例列表:
lst_in = [{'First': 1, 'Second': 4}, {'First': 2, 'Second': 5}, {'First': 3, 'Second': 4}]
用于判断重复的键应为 'Second'。因此在这个例子中,第一个和第三个字典被视为重复(它们的 'Second' 值相同)。 我试过查看 Find duplicates in python list of dictionaries 和 python list of dictionaries find duplicates based on value,但找不到确切的答案。我只看一个键值。字典将始终具有相同的键。
预期输出:
lst_out = [{'First': 1, 'Second': 4}, {'First': 2, 'Second': 5}]
集合(set)非常适合处理“我是否已经见过这个值?”这类问题。
# Example input: the first and third dicts share the same 'Second' value.
lst_in = [{'First': 1, 'Second': 4}, {'First': 2, 'Second': 5}, {'First': 3, 'Second': 4}]
found = set()  # 'Second' values that have already been emitted
lst_out = []   # first dict seen for each distinct 'Second' value, in input order
for dct in lst_in:
    if dct['Second'] not in found:
        lst_out.append(dct)
        found.add(dct['Second'])
一些解决方案和基准。
解决方案
用字典(dict)的有趣做法:正向遍历以确定顺序,反向遍历以保留每个值对应的第一个字典。
# Two passes over lst_in feed one dict keyed by 'Second': the forward pass
# (step 1) fixes the key order, the backward pass (step -1) overwrites the
# values so the last write for each key is the earliest dict in lst_in.
lst_out = list(
    {
        entry['Second']: entry
        for step in [1, -1]
        for entry in lst_in[::step]
    }.values()
)
或者使用 setdefault
来跟踪每个值的第一个字典:
# Map each 'Second' value to the first dict that carries it: setdefault only
# stores a value when the key is missing, so later duplicates are ignored.
tmp = {}
for d in lst_in:
    tmp.setdefault(d['Second'], d)
# Dicts keep insertion order, so the values come out in first-seen order.
lst_out = list(tmp.values())
有趣且可能更快的版本:
# Bind the setdefault method of a fresh dict once, so the loop does not
# repeat the attribute lookup on every iteration.
add = {}.setdefault
for d in lst_in:
    add(d['Second'], d)
# add.__self__ is the dict that the bound method belongs to.
lst_out = list(add.__self__.values())
基准
具有 100 个不同 Second
值的 1000 个字典列表的时间(使用 Python 3.10.0):
361 μs 362 μs 364 μs dict_forward_backward
295 μs 297 μs 297 μs dict_setdefault
231 μs 231 μs 232 μs dict_setdefault_optimized
196 μs 196 μs 197 μs set_in_list_comprehension
190 μs 190 μs 190 μs set_in_list_comprehension_optimized
191 μs 191 μs 191 μs set_in_list_comprehension_optimized_2
201 μs 201 μs 201 μs set_with_loop
1747 μs 1751 μs 1774 μs with_lists
基准代码:
from timeit import repeat, default_timer as timer
from random import choices
# Benchmark input: 1000 dicts whose 'First' is the running index and whose
# 'Second' is drawn at random from range(100).
lst_in = [
    {'First': idx, 'Second': val}
    for idx, val in enumerate(choices(range(100), k=1000))
]
def dict_forward_backward(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    A single dict keyed by 'Second' is filled in two passes: the forward
    pass fixes the key order, the backward pass overwrites the values so
    that the last write for each key is the earliest dict in ``lst_in``.
    """
    return list({d['Second']: d
                 for s in [1, -1]
                 for d in lst_in[::s]}.values())
def dict_setdefault(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    ``setdefault`` only stores a value when the key is missing, so the dict
    ends up holding the first dict seen for every 'Second' value.
    """
    tmp = {}
    for d in lst_in:
        tmp.setdefault(d['Second'], d)
    return list(tmp.values())
def dict_setdefault_optimized(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    Same approach as ``setdefault`` on a temporary dict, but the bound method
    is looked up once before the loop instead of on every iteration.
    """
    add = {}.setdefault
    for d in lst_in:
        add(d['Second'], d)
    # add.__self__ is the dict that the bound method belongs to.
    return list(add.__self__.values())
def set_in_list_comprehension(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    Everything happens inside one list comprehension: ``for s in [set()]``
    and ``for v in [d['Second']]`` act as local assignments, and
    ``s.add(v) or d`` records the value (``add`` returns None) and then
    yields the dict itself.
    """
    return [s.add(v) or d
            for s in [set()]
            for d in lst_in
            for v in [d['Second']]
            if v not in s]
def set_in_list_comprehension_optimized(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    Single list comprehension that tracks seen values in a set; the set's
    ``add`` method is bound once (``for add in [s.add]``) so the per-item
    expression avoids the attribute lookup. ``add(v)`` returns None, so
    ``add(v) or d`` yields the dict.
    """
    return [add(v) or d
            for s in [set()]
            for add in [s.add]
            for d in lst_in
            for v in [d['Second']]
            if v not in s]
def set_in_list_comprehension_optimized_2(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    The seen-set and its bound ``add`` method are created before the
    comprehension; ``add(v)`` returns None, so ``add(v) or d`` records the
    value and yields the dict.
    """
    s = set()
    add = s.add
    return [add(v) or d
            for d in lst_in
            for v in [d['Second']]  # single-item loop used as a local assignment
            if v not in s]
def set_with_loop(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    A set remembers which 'Second' values were already emitted; a dict is
    appended to the result only the first time its value is encountered.
    """
    found = set()
    lst_out = []
    for dct in lst_in:
        if dct['Second'] not in found:
            lst_out.append(dct)
            found.add(dct['Second'])
    return lst_out
def with_lists(lst_in):
    """Return the first dict for each distinct 'Second' value, in input order.

    Seen values are kept in a plain list, so every membership test scans
    that list instead of doing a hash lookup as the set-based variants do.
    """
    out = {'keep': [], 'counter': []}
    for dct in lst_in:
        if dct['Second'] not in out['counter']:
            out['keep'].append(dct)
            out['counter'].append(dct['Second'])
    return out['keep']
# Candidate implementations, in the order they are checked and timed.
funcs = [
    dict_forward_backward,
    dict_setdefault,
    dict_setdefault_optimized,
    set_in_list_comprehension,
    set_in_list_comprehension_optimized,
    set_in_list_comprehension_optimized_2,
    set_with_loop,
    with_lists,
]

# Correctness: compare every implementation against the first one.
expect = funcs[0](lst_in)
for func in funcs[1:]:
    result = func(lst_in)
    print(result == expect, func.__name__)
print()

# Speed: three rounds; for each function print its three fastest timings.
for _ in range(3):
    for func in funcs:
        # The setup string re-enables garbage collection and collects before
        # each repetition. Every repetition times 1000 calls, so total
        # seconds * 1e3 equals microseconds per call.
        ts = sorted(repeat(lambda: func(lst_in), 'gc.enable(); gc.collect()', number=1000))[:3]
        print(*('%4d μs ' % (t * 1e3) for t in ts), func.__name__)
    print()