使用嵌套的 defaultdict 重建数组
rebuilding arrays with nested defaultdict
这个问题是上一个问题的延伸:
- 但两者的差异足够大,值得单独提出一个新问题:
我一直在努力解决这个问题。我的数据是来自 sql 查询的字典数组。数组中的每个元素代表一个货件,根据key有共同的值。
data = [
{"CustName":"customer1", "PartNum":"part1", "delKey":"0001", "qty":"10", "memo":"blah1"},
{"CustName":"customer1", "PartNum":"part1", "delKey":"0002", "qty":"10", "memo":"blah2"},
{"CustName":"customer1", "PartNum":"part1", "delKey":"0003", "qty":"10", "memo":"blah3"},
{"CustName":"customer2", "PartNum":"part3", "delKey":"0004", "qty":"20", "memo":"blah4"},
{"CustName":"customer2", "PartNum":"part3", "delKey":"0005", "qty":"20", "memo":"blah5"},
{"CustName":"customer3", "PartNum":"partXYZ", "delKey":"0006", "qty":"50", "memo":"blah6"},
{"CustName":"customer3", "PartNum":"partABC", "delKey":"0007", "qty":"100", "memo":"blah7"}]
我想要的输出是根据特定键分组的
dataOut = [
{"CustName":"customer1", "Parts":[
{"PartNum":"part1", "deliveries":[
{"delKey":"0001", "qty":"10", "memo":"blah1"},
{"delKey":"0002", "qty":"10", "memo":"blah2"},
{"delKey":"0003", "qty":"10", "memo":"blah3"}]}]},
{"CustName":"customer2", "Parts":[
{"PartNum":"part3", "deliveries":[
{"delKey":"0004", "qty":"20", "memo":"blah4"},
{"delKey":"0005", "qty":"20", "memo":"blah5"}]}]},
{"CustName":"customer3", "Parts":[
{"PartNum":"partXYZ", "deliveries":[
{"delKey":"0006", "qty":"50", "memo":"blah6"}]},
{"PartNum":"partABC", "deliveries":[
{"delKey":"0007", "qty":"100", "memo":"blah7"}]}]}]
我可以使用上一个问题中给出的 defaultdict 和列表推导式,稍作修改后得到单层分组:
# Single-level grouping: bucket every row under its customer name.
d = defaultdict(list)
for item in data:
    d[item['CustName']].append(item)
# Each bucket still holds the full original rows (CustName/PartNum included).
print([{'CustName': key, 'parts': value} for key, value in d.items()])
但我似乎无法得到输出数组中的第二级——按 PartNum 键进行的分组。经过一些研究,我认为需要把 defaultdict 用作外层 `defaultdict` 的类型,如下所示:
d = defaultdict(defaultdict(list))
这会抛出错误,因为 defaultdict 需要的参数是一个函数(可调用对象),所以我需要使用 lambda(是吗?)
# Asker's attempt, kept as posted: the outer factory builds an inner
# defaultdict(list) for every new customer name.
d = defaultdict(lambda: defaultdict(list))
for item in data:
    # NOTE: d[...] is itself a defaultdict here, which has no append();
    # this is the line the question is asking about.
    d[item['CustName']].append(item)  # <----this?
我的问题是:如何在循环中 "访问" 第二级数组,并告诉 "内层" defaultdict 按什么(PartNum)分组?数据来自数据库程序员,而且项目还在不断发展,会加入越来越多的数据(键),所以我希望这个解决方案尽可能通用,以便应对将来收到的更多数据。我希望能根据需要的层数把多个 defaultdict "串联" 起来。我是边做边学,所以还在努力理解 lambda 和 defaultdict 类型的基础知识,以及接下来该怎么做。
这是我能做到的最漂亮的方式。它使用相同的 defaultdict
想法来实现适当的分组,因为 python 的内置 groupby
函数仅适用于有序数据。
请注意,此版本会就地修改输入数据集中的项:结果中的叶子项与输入是同一批字典实例,只是其中的 "CustName" 和 "PartNum" 条目已被删除。
from collections import defaultdict
def groupby_mutate(seq, key):
    """Group the dicts in ``seq`` by their ``key`` entry, stripping that entry.

    The items are modified in place: each one loses ``key`` and the very
    same dict instance is stored in the result, so the caller's data is
    changed.

    Args:
        seq: Iterable of mutable mappings that all contain ``key``.
        key: Name of the entry to group on (and remove).

    Returns:
        A ``defaultdict(list)`` mapping each distinct ``item[key]`` value to
        the list of items that carried it, in input order.

    Raises:
        KeyError: If an item does not contain ``key``.
    """
    groups = defaultdict(list)
    for item in seq:
        # pop() reads the grouping value and removes it from the item in one step.
        groups[item.pop(key)].append(item)
    return groups
def your_operation(data):
    """Regroup flat delivery rows into customer -> parts -> deliveries.

    Uses ``groupby_mutate``, so the dicts in ``data`` are modified in place:
    their 'CustName' and 'PartNum' entries are removed and the same dict
    instances become the leaf 'deliveries' items of the result.

    Args:
        data: Iterable of dicts, each containing 'CustName' and 'PartNum'.

    Returns:
        A list with one ``{'CustName': ..., 'Parts': [...]}`` dict per
        customer; each part is ``{'PartNum': ..., 'deliveries': [...]}``.
    """
    return [
        {
            'CustName': cust_name,
            'Parts': [
                {'PartNum': part_num, 'deliveries': deliveries}
                for part_num, deliveries
                in groupby_mutate(cust_items, 'PartNum').items()
            ],
        }
        for cust_name, cust_items in groupby_mutate(data, 'CustName').items()
    ]
# try it
# Import only the name that is used instead of a wildcard import.
from pprint import pprint

data = [
    {"CustName":"customer1", "PartNum":"part1", "delKey":"0001", "qty":"10", "memo":"blah1"},
    {"CustName":"customer1", "PartNum":"part1", "delKey":"0002", "qty":"10", "memo":"blah2"},
    {"CustName":"customer1", "PartNum":"part1", "delKey":"0003", "qty":"10", "memo":"blah3"},
    {"CustName":"customer2", "PartNum":"part3", "delKey":"0004", "qty":"20", "memo":"blah4"},
    {"CustName":"customer2", "PartNum":"part3", "delKey":"0005", "qty":"20", "memo":"blah5"},
    {"CustName":"customer3", "PartNum":"partXYZ", "delKey":"0006", "qty":"50", "memo":"blah6"},
    {"CustName":"customer3", "PartNum":"partABC", "delKey":"0007", "qty":"100", "memo":"blah7"}
]

pprint(your_operation(data))
编辑:
以防万一以后有人需要,这里有一个不改变原始数据的版本:
from collections import defaultdict
def groupby_getitem(seq, key):
    """Group the items of ``seq`` by ``item[key]`` without modifying them.

    Args:
        seq: Iterable of subscriptable items that all support ``item[key]``.
        key: Subscript used to read each item's grouping value.

    Returns:
        A ``defaultdict(list)`` mapping each distinct ``item[key]`` value to
        the list of items that carried it, in input order. The stored items
        are the original objects, not copies.

    Raises:
        KeyError: If a dict item does not contain ``key``.
    """
    groups = defaultdict(list)
    for item in seq:
        groups[item[key]].append(item)
    return groups
def your_operation(data):
    """Regroup flat delivery rows into customer -> parts -> deliveries.

    Non-mutating variant: the input dicts are left untouched and every leaf
    'deliveries' item is a new dict holding all entries of the source row
    except 'CustName' and 'PartNum'.

    Args:
        data: Iterable of dicts, each containing 'CustName' and 'PartNum'.

    Returns:
        A list with one ``{'CustName': ..., 'Parts': [...]}`` dict per
        customer; each part is ``{'PartNum': ..., 'deliveries': [...]}``.
    """
    # Keys consumed by the grouping levels; they are dropped from the leaves.
    grouping_keys = ('CustName', 'PartNum')
    return [
        {
            'CustName': cust_name,
            'Parts': [
                {
                    'PartNum': part_num,
                    'deliveries': [
                        {k: v for k, v in delivery.items()
                         if k not in grouping_keys}
                        for delivery in deliveries
                    ],
                }
                for part_num, deliveries
                in groupby_getitem(cust_items, 'PartNum').items()
            ],
        }
        for cust_name, cust_items in groupby_getitem(data, 'CustName').items()
    ]
按照@Pynchia 的建议使用groupby
,按照@hege_hegedus 的建议对无序数据使用sorted
:
from itertools import groupby
dataOut = []
# groupby() only merges adjacent rows, so sort by both grouping keys first.
dataSorted = sorted(data, key=lambda x: (x["CustName"], x["PartNum"]))
for cust_name, cust_group in groupby(dataSorted, lambda x: x["CustName"]):
    parts = []
    # Second level: regroup this customer's rows by part number.
    for part_num, part_group in groupby(cust_group, lambda x: x["PartNum"]):
        parts.append({
            "PartNum": part_num,
            "deliveries": [{
                "delKey": delivery["delKey"],
                "memo": delivery["memo"],
                "qty": delivery["qty"],
            } for delivery in part_group]
        })
    dataOut.append({
        "CustName": cust_name,
        "Parts": parts,
    })
如果您查看第二个 for
循环,这将有望回答您关于在循环中访问第二级数组的问题。
您可以使用基于 OrderedDefaultdict
而不是 defaultdict(list)
的 tree-like 数据结构。 (定义来自我的一个不相关的 answer。)
from collections import OrderedDict
class OrderedDefaultdict(OrderedDict):
    """An ``OrderedDict`` that creates missing values via ``default_factory``.

    The first positional argument, if given, is the factory (a callable or
    ``None``); all remaining positional and keyword arguments are passed on
    to ``OrderedDict``.
    """

    def __init__(self, *args, **kwargs):
        """Store the optional factory and initialise the underlying dict.

        Raises:
            TypeError: If the first positional argument is neither ``None``
                nor callable.
        """
        if not args:
            self.default_factory = None
        else:
            if not (args[0] is None or callable(args[0])):
                raise TypeError('first argument must be callable or None')
            self.default_factory = args[0]
            # The factory is not dict content; pass only the rest upward.
            args = args[1:]
        super(OrderedDefaultdict, self).__init__(*args, **kwargs)

    def __missing__(self, key):
        """Create, store and return the default value for a missing ``key``.

        Raises:
            KeyError: If no ``default_factory`` is set.
        """
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = default = self.default_factory()
        return default
def Tree():
    """Return an OrderedDefaultdict whose missing keys become nested Trees."""
    return OrderedDefaultdict(Tree)


d = Tree()
for rec in data:
    custName, partNum, delKey = rec['CustName'], rec['PartNum'], rec['delKey']
    details = {"qty": rec["qty"], "memo": rec["memo"]}
    # Intermediate levels are created on first access by the Tree factory.
    d[custName]['Parts'][partNum]['deliveries'][delKey] = details
因此,对于您问题中显示的 data
,d
最终将包含:
d = {
"customer1": {
"Parts": {
"part1": {
"deliveries": {"0001": {"memo": "blah1", "qty": "10"},
"0002": {"memo": "blah2", "qty": "10"},
"0003": {"memo": "blah3", "qty": "10"}}}}},
"customer2": {
"Parts": {
"part3": {
"deliveries": {"0004": {"memo": "blah4", "qty": "20"},
"0005": {"memo": "blah5", "qty": "20"}}}}},
"customer3": {
"Parts": {
"partXYZ": {
"deliveries": {"0006": {"memo": "blah6", "qty": "50"}}},
"partABC": {
"deliveries": {"0007": {"memo": "blah7", "qty": "100"}}}}}
}
可以简单地打印出来,因为它现在按照您想要的方式分组。
按 "CustName", "PartNum", "delKey"
排序。遍历每个零件、每个客户的交付项目并累积以匹配您的输出规格。
我喜欢使用 operator.itemgetter
- 对我来说它让事情变得更清晰。
import collections, itertools, operator
# One itemgetter per field keeps the grouping and sorting keys readable.
cust_name = operator.itemgetter('CustName')
part_num = operator.itemgetter('PartNum')
group_sort = operator.itemgetter('CustName', 'PartNum', 'delKey')
del_key = operator.itemgetter('delKey')
qty = operator.itemgetter('qty')
memo = operator.itemgetter('memo')

# sort on the relevant keys (in place); groupby() only merges adjacent items
data.sort(key=group_sort)

result = []
# iterate over customers
for custname, group1 in itertools.groupby(data, cust_name):
    cust_dict = {'CustName': custname, 'Parts': []}
    # iterate over parts for this customer
    for partnum, group2 in itertools.groupby(group1, part_num):
        part_dict = {'PartNum': partnum, 'deliveries': []}
        # iterate over delivery items for this part
        for thing in group2:
            part_dict['deliveries'].append({
                'delKey': del_key(thing),
                'qty': qty(thing),
                'memo': memo(thing),
            })
        cust_dict['Parts'].append(part_dict)
    result.append(cust_dict)
这显然会多次迭代原始数据中的项目,可能会影响性能——但我看不出有什么办法能不经过多次迭代就满足您的需求。
这个问题是上一个问题的延伸:
我一直在努力解决这个问题。我的数据是来自 sql 查询的字典数组。数组中的每个元素代表一个货件,根据key有共同的值。
data = [
{"CustName":"customer1", "PartNum":"part1", "delKey":"0001", "qty":"10", "memo":"blah1"},
{"CustName":"customer1", "PartNum":"part1", "delKey":"0002", "qty":"10", "memo":"blah2"},
{"CustName":"customer1", "PartNum":"part1", "delKey":"0003", "qty":"10", "memo":"blah3"},
{"CustName":"customer2", "PartNum":"part3", "delKey":"0004", "qty":"20", "memo":"blah4"},
{"CustName":"customer2", "PartNum":"part3", "delKey":"0005", "qty":"20", "memo":"blah5"},
{"CustName":"customer3", "PartNum":"partXYZ", "delKey":"0006", "qty":"50", "memo":"blah6"},
{"CustName":"customer3", "PartNum":"partABC", "delKey":"0007", "qty":"100", "memo":"blah7"}]
我想要的输出是根据特定键分组的
dataOut = [
{"CustName":"customer1", "Parts":[
{"PartNum":"part1", "deliveries":[
{"delKey":"0001", "qty":"10", "memo":"blah1"},
{"delKey":"0002", "qty":"10", "memo":"blah2"},
{"delKey":"0003", "qty":"10", "memo":"blah3"}]}]},
{"CustName":"customer2", "Parts":[
{"PartNum":"part3", "deliveries":[
{"delKey":"0004", "qty":"20", "memo":"blah4"},
{"delKey":"0005", "qty":"20", "memo":"blah5"}]}]},
{"CustName":"customer3", "Parts":[
{"PartNum":"partXYZ", "deliveries":[
{"delKey":"0006", "qty":"50", "memo":"blah6"}]},
{"PartNum":"partABC", "deliveries":[
{"delKey":"0007", "qty":"100", "memo":"blah7"}]}]}]
我可以使用上一个问题中给出的 defaultdict 和列表推导式,稍作修改后得到单层分组:
# Single-level grouping: bucket every row under its customer name.
d = defaultdict(list)
for item in data:
    d[item['CustName']].append(item)
# Each bucket still holds the full original rows (CustName/PartNum included).
print([{'CustName': key, 'parts': value} for key, value in d.items()])
但我似乎无法得到输出数组中的第二级——按 PartNum 键进行的分组。经过一些研究,我认为需要把 defaultdict 用作外层 `defaultdict` 的类型,如下所示:
d = defaultdict(defaultdict(list))
这会抛出错误,因为 defaultdict 需要的参数是一个函数(可调用对象),所以我需要使用 lambda(是吗?)
# Asker's attempt, kept as posted: the outer factory builds an inner
# defaultdict(list) for every new customer name.
d = defaultdict(lambda: defaultdict(list))
for item in data:
    # NOTE: d[...] is itself a defaultdict here, which has no append();
    # this is the line the question is asking about.
    d[item['CustName']].append(item)  # <----this?
我的问题是:如何在循环中 "访问" 第二级数组,并告诉 "内层" defaultdict 按什么(PartNum)分组?数据来自数据库程序员,而且项目还在不断发展,会加入越来越多的数据(键),所以我希望这个解决方案尽可能通用,以便应对将来收到的更多数据。我希望能根据需要的层数把多个 defaultdict "串联" 起来。我是边做边学,所以还在努力理解 lambda 和 defaultdict 类型的基础知识,以及接下来该怎么做。
这是我能做到的最漂亮的方式。它使用相同的 defaultdict
想法来实现适当的分组,因为 python 的内置 groupby
函数仅适用于有序数据。
请注意,此版本会就地修改输入数据集中的项:结果中的叶子项与输入是同一批字典实例,只是其中的 "CustName" 和 "PartNum" 条目已被删除。
from collections import defaultdict
def groupby_mutate(seq, key):
    """Group the dicts in ``seq`` by their ``key`` entry, stripping that entry.

    The items are modified in place: each one loses ``key`` and the very
    same dict instance is stored in the result, so the caller's data is
    changed.

    Args:
        seq: Iterable of mutable mappings that all contain ``key``.
        key: Name of the entry to group on (and remove).

    Returns:
        A ``defaultdict(list)`` mapping each distinct ``item[key]`` value to
        the list of items that carried it, in input order.

    Raises:
        KeyError: If an item does not contain ``key``.
    """
    groups = defaultdict(list)
    for item in seq:
        # pop() reads the grouping value and removes it from the item in one step.
        groups[item.pop(key)].append(item)
    return groups
def your_operation(data):
    """Regroup flat delivery rows into customer -> parts -> deliveries.

    Uses ``groupby_mutate``, so the dicts in ``data`` are modified in place:
    their 'CustName' and 'PartNum' entries are removed and the same dict
    instances become the leaf 'deliveries' items of the result.

    Args:
        data: Iterable of dicts, each containing 'CustName' and 'PartNum'.

    Returns:
        A list with one ``{'CustName': ..., 'Parts': [...]}`` dict per
        customer; each part is ``{'PartNum': ..., 'deliveries': [...]}``.
    """
    return [
        {
            'CustName': cust_name,
            'Parts': [
                {'PartNum': part_num, 'deliveries': deliveries}
                for part_num, deliveries
                in groupby_mutate(cust_items, 'PartNum').items()
            ],
        }
        for cust_name, cust_items in groupby_mutate(data, 'CustName').items()
    ]
# try it
# Import only the name that is used instead of a wildcard import.
from pprint import pprint

data = [
    {"CustName":"customer1", "PartNum":"part1", "delKey":"0001", "qty":"10", "memo":"blah1"},
    {"CustName":"customer1", "PartNum":"part1", "delKey":"0002", "qty":"10", "memo":"blah2"},
    {"CustName":"customer1", "PartNum":"part1", "delKey":"0003", "qty":"10", "memo":"blah3"},
    {"CustName":"customer2", "PartNum":"part3", "delKey":"0004", "qty":"20", "memo":"blah4"},
    {"CustName":"customer2", "PartNum":"part3", "delKey":"0005", "qty":"20", "memo":"blah5"},
    {"CustName":"customer3", "PartNum":"partXYZ", "delKey":"0006", "qty":"50", "memo":"blah6"},
    {"CustName":"customer3", "PartNum":"partABC", "delKey":"0007", "qty":"100", "memo":"blah7"}
]

pprint(your_operation(data))
编辑:
以防万一以后有人需要,这里有一个不改变原始数据的版本:
from collections import defaultdict
def groupby_getitem(seq, key):
    """Group the items of ``seq`` by ``item[key]`` without modifying them.

    Args:
        seq: Iterable of subscriptable items that all support ``item[key]``.
        key: Subscript used to read each item's grouping value.

    Returns:
        A ``defaultdict(list)`` mapping each distinct ``item[key]`` value to
        the list of items that carried it, in input order. The stored items
        are the original objects, not copies.

    Raises:
        KeyError: If a dict item does not contain ``key``.
    """
    groups = defaultdict(list)
    for item in seq:
        groups[item[key]].append(item)
    return groups
def your_operation(data):
    """Regroup flat delivery rows into customer -> parts -> deliveries.

    Non-mutating variant: the input dicts are left untouched and every leaf
    'deliveries' item is a new dict holding all entries of the source row
    except 'CustName' and 'PartNum'.

    Args:
        data: Iterable of dicts, each containing 'CustName' and 'PartNum'.

    Returns:
        A list with one ``{'CustName': ..., 'Parts': [...]}`` dict per
        customer; each part is ``{'PartNum': ..., 'deliveries': [...]}``.
    """
    # Keys consumed by the grouping levels; they are dropped from the leaves.
    grouping_keys = ('CustName', 'PartNum')
    return [
        {
            'CustName': cust_name,
            'Parts': [
                {
                    'PartNum': part_num,
                    'deliveries': [
                        {k: v for k, v in delivery.items()
                         if k not in grouping_keys}
                        for delivery in deliveries
                    ],
                }
                for part_num, deliveries
                in groupby_getitem(cust_items, 'PartNum').items()
            ],
        }
        for cust_name, cust_items in groupby_getitem(data, 'CustName').items()
    ]
按照@Pynchia 的建议使用groupby
,按照@hege_hegedus 的建议对无序数据使用sorted
:
from itertools import groupby
dataOut = []
# groupby() only merges adjacent rows, so sort by both grouping keys first.
dataSorted = sorted(data, key=lambda x: (x["CustName"], x["PartNum"]))
for cust_name, cust_group in groupby(dataSorted, lambda x: x["CustName"]):
    parts = []
    # Second level: regroup this customer's rows by part number.
    for part_num, part_group in groupby(cust_group, lambda x: x["PartNum"]):
        parts.append({
            "PartNum": part_num,
            "deliveries": [{
                "delKey": delivery["delKey"],
                "memo": delivery["memo"],
                "qty": delivery["qty"],
            } for delivery in part_group]
        })
    dataOut.append({
        "CustName": cust_name,
        "Parts": parts,
    })
如果您查看第二个 for
循环,这将有望回答您关于在循环中访问第二级数组的问题。
您可以使用基于 OrderedDefaultdict
而不是 defaultdict(list)
的 tree-like 数据结构。 (定义来自我的一个不相关的 answer。)
from collections import OrderedDict
class OrderedDefaultdict(OrderedDict):
    """An ``OrderedDict`` that creates missing values via ``default_factory``.

    The first positional argument, if given, is the factory (a callable or
    ``None``); all remaining positional and keyword arguments are passed on
    to ``OrderedDict``.
    """

    def __init__(self, *args, **kwargs):
        """Store the optional factory and initialise the underlying dict.

        Raises:
            TypeError: If the first positional argument is neither ``None``
                nor callable.
        """
        if not args:
            self.default_factory = None
        else:
            if not (args[0] is None or callable(args[0])):
                raise TypeError('first argument must be callable or None')
            self.default_factory = args[0]
            # The factory is not dict content; pass only the rest upward.
            args = args[1:]
        super(OrderedDefaultdict, self).__init__(*args, **kwargs)

    def __missing__(self, key):
        """Create, store and return the default value for a missing ``key``.

        Raises:
            KeyError: If no ``default_factory`` is set.
        """
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = default = self.default_factory()
        return default
def Tree():
    """Return an OrderedDefaultdict whose missing keys become nested Trees."""
    return OrderedDefaultdict(Tree)


d = Tree()
for rec in data:
    custName, partNum, delKey = rec['CustName'], rec['PartNum'], rec['delKey']
    details = {"qty": rec["qty"], "memo": rec["memo"]}
    # Intermediate levels are created on first access by the Tree factory.
    d[custName]['Parts'][partNum]['deliveries'][delKey] = details
因此,对于您问题中显示的 data
,d
最终将包含:
d = {
"customer1": {
"Parts": {
"part1": {
"deliveries": {"0001": {"memo": "blah1", "qty": "10"},
"0002": {"memo": "blah2", "qty": "10"},
"0003": {"memo": "blah3", "qty": "10"}}}}},
"customer2": {
"Parts": {
"part3": {
"deliveries": {"0004": {"memo": "blah4", "qty": "20"},
"0005": {"memo": "blah5", "qty": "20"}}}}},
"customer3": {
"Parts": {
"partXYZ": {
"deliveries": {"0006": {"memo": "blah6", "qty": "50"}}},
"partABC": {
"deliveries": {"0007": {"memo": "blah7", "qty": "100"}}}}}
}
可以简单地打印出来,因为它现在按照您想要的方式分组。
按 "CustName", "PartNum", "delKey"
排序。遍历每个零件、每个客户的交付项目并累积以匹配您的输出规格。
我喜欢使用 operator.itemgetter
- 对我来说它让事情变得更清晰。
import collections, itertools, operator
# One itemgetter per field keeps the grouping and sorting keys readable.
cust_name = operator.itemgetter('CustName')
part_num = operator.itemgetter('PartNum')
group_sort = operator.itemgetter('CustName', 'PartNum', 'delKey')
del_key = operator.itemgetter('delKey')
qty = operator.itemgetter('qty')
memo = operator.itemgetter('memo')

# sort on the relevant keys (in place); groupby() only merges adjacent items
data.sort(key=group_sort)

result = []
# iterate over customers
for custname, group1 in itertools.groupby(data, cust_name):
    cust_dict = {'CustName': custname, 'Parts': []}
    # iterate over parts for this customer
    for partnum, group2 in itertools.groupby(group1, part_num):
        part_dict = {'PartNum': partnum, 'deliveries': []}
        # iterate over delivery items for this part
        for thing in group2:
            part_dict['deliveries'].append({
                'delKey': del_key(thing),
                'qty': qty(thing),
                'memo': memo(thing),
            })
        cust_dict['Parts'].append(part_dict)
    result.append(cust_dict)
这显然会多次迭代原始数据中的项目,可能会影响性能——但我看不出有什么办法能不经过多次迭代就满足您的需求。