Python: Explode/Normalize 不同数据类型的字典

Python: Explode/Normalize Dict With Different Data Types

目前我无法使用 Pandas 来完成此操作,因此我必须创建一个手动解决方案。

我需要能够获取以下数据:

# A scalar next to a list of dicts, where each dict carries its own nested list.
example_1 = {"a": 0, "b": [{"c": 1, "d": [10, 100]}, {"c": 2, "d": [20, 200]}]}

# Every value is a list of scalars.
example_2 = {'a': [0, 1], 'b': [2, 3]}

# A scalar next to a list of flat dicts.
example_3 = {'a': 0, 'b': [{'c': 1}, {'c': 2}]}

并按如下格式返回:

# Expected rows for example_1: nested keys are joined with "." and each
# element of the nested "d" list becomes its own row.
return_example_1 = [
    {"a": 0, "b.c": 1, "b.d": 10},
    {"a": 0, "b.c": 2, "b.d": 20},
    {"a": 0, "b.c": 1, "b.d": 100},
    {"a": 0, "b.c": 2, "b.d": 200},
]
# Expected rows for example_2: every pairing of an "a" value with a "b" value.
return_example_2 = [
        {"a": 0, "b": 2},
        {"a": 0, "b": 3},
        {"a": 1, "b": 2},
        {"a": 1, "b": 3},
    ] 
# Expected rows for example_3: one row per dict in the "b" list.
return_example_3 = [
        {'a': 0, 'b.c': 1},
        {'a': 0, 'b.c': 2}
    ]

数据中包含嵌套值,需要将其“分解”(explode)成多行;输出中的键由各嵌套层级的键 k 用“.”依次拼接而成,值为对应的 v。

class DataNormalization:
    """Flatten a nested dict into a list of flat "row" dicts.

    Keys of nested dicts are joined with ``.`` and every list value is
    exploded into separate rows, so the result is the cartesian product of
    all list values found in the record.  Throughout the class an empty list
    of rows means "no rows collected yet", never "discard everything".
    """

    def merge(self, main_results, sub_results):
        """Return the cartesian product of two lists of rows.

        Each output row is a deep copy of a row from ``main_results`` updated
        with one row from ``sub_results``; the order is "for each main row,
        for each sub row".  If ``sub_results`` is empty, ``main_results`` is
        returned unchanged; if ``main_results`` is empty, copies of
        ``sub_results`` are returned.  This way an empty side never wipes out
        rows that already exist.
        """
        # Imported locally because the surrounding snippet has no import block.
        from copy import deepcopy

        if not sub_results:
            return main_results
        if not main_results:
            return [deepcopy(sub_result) for sub_result in sub_results]

        res = []
        for main_result in main_results:
            for sub_result in sub_results:
                # Copy so that rows derived from the same parent stay independent.
                _mr = deepcopy(main_result)
                _mr.update(sub_result)
                res.append(_mr)
        return res

    def update_array(self, main_results, key, array):
        """Explode a list of plain values stored under ``key``.

        Every existing row is duplicated once per element of ``array`` with
        ``key`` set to that element.  With no existing rows, one row per
        element is produced.
        """
        return self.merge(main_results, [{key: val} for val in array])

    def update_array_of_dicts(self, main_results, key_without_suffix, array_of_dicts):
        """Explode a list of dicts stored under ``key_without_suffix``.

        Each dict is flattened recursively through :meth:`main`, using
        ``key_without_suffix`` as the key prefix, so lists and dicts nested
        inside the records are exploded as well.  The resulting rows are then
        combined with every existing row.
        """
        sub_results = [
            row
            for record in array_of_dicts
            for row in self.main(record, key_prefix=key_without_suffix)
        ]
        return self.merge(main_results, sub_results)

    def check_value_exist(self, test_dict, value):
        """Return ``[{key: val}, ...]`` for entries whose exact type is ``value``."""
        return [{key: val} for key, val in test_dict.items() if type(val) is value]

    def main(self, record, key_prefix=""):
        """Flatten ``record`` into a list of row dicts.

        Args:
            record: The dict to flatten.
            key_prefix: Dotted path of the enclosing keys; empty at top level.

        Returns:
            A list of flat dicts.  An empty ``record`` gives an empty list,
            and empty list values are skipped.
        """
        results = []
        for key, value in record.items():
            _key = key if key_prefix == "" else f"{key_prefix}.{key}"
            if isinstance(value, dict):
                # Pass the full dotted path down, not just the current key,
                # so prefixes survive more than one level of nesting.
                _results = self.main(value, key_prefix=_key)
                results = self.merge(results, _results)
            elif isinstance(value, list):
                if not value:
                    continue
                # The first element decides how the whole list is handled.
                if isinstance(value[0], dict):
                    results = self.update_array_of_dicts(results, _key, value)
                else:
                    results = self.update_array(results, _key, value)
            else:
                if results:
                    for result in results:
                        result[_key] = value
                else:
                    results = [{_key: value}]
        return results

dm = DataNormalization()
# main() requires the record to flatten; calling it with no argument raises
# TypeError, so pass one of the sample inputs explicitly.
result = dm.main(example_3)
print(result)

当前代码对示例 3 有效,但对示例 1 和 2 无效。希望了解如何修改才能涵盖所有情况。

另外,我手头还有一个 Pandas 的示例:

# Sample record mixing scalars, a nested dict, a list of strings and a list
# of dicts under the same "info" key.
my_dict = {
    "state": "Florida",
    "shortname": "FL",
    "info": {
        "governor": "Rick Scott",
        "universities": ["University of Central Florida", "Florida State University"],
        "counties": [
            {"name": "Dade", "population": 12345},
            {"name": "Broward", "population": 40000},
            {"name": "Palm Beach", "population": 60000},
        ],
    },
}

输出应该是:

[{'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'University of Central Florida', 'info.counties.name': 'Dade', 'info.counties.population': 12345}, {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'University of Central Florida', 'info.counties.name': 'Broward', 'info.counties.population': 40000}, {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'University of Central Florida', 'info.counties.name': 'Palm Beach', 'info.counties.population': 60000}, {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'Florida State University', 'info.counties.name': 'Dade', 'info.counties.population': 12345}, {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'Florida State University', 'info.counties.name': 'Broward', 'info.counties.population': 40000}, {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'Florida State University', 'info.counties.name': 'Palm Beach', 'info.counties.population': 60000}]

您可以使用递归生成器来实现:

import itertools as it, functools as ft
def all_combos(vals, c=None):
    """Yield flat row dicts for every combination of list values in ``vals``.

    Args:
        vals: A nested dict to flatten, or (during recursion) a leaf value.
        c: Keys leading to ``vals``; joined with ``.`` to form the flat key.
           Defaults to an empty path.

    Yields:
        One flat dict per element of the cartesian product of all list
        values.  Keys whose value is an empty list are left out of the rows
        instead of eliminating every row.
    """
    # A fresh list per call; a shared mutable default is avoided.
    path = [] if c is None else list(c)
    if not isinstance(vals, dict):
        yield {'.'.join(path): vals}
        return

    per_key_rows = []
    for key, value in vals.items():
        # Treat a non-list value as a one-element list so both share one path.
        items = value if isinstance(value, list) else [value]
        rows = [row for item in items for row in all_combos(item, path + [key])]
        # An empty factor would make the product below yield nothing at all.
        if rows:
            per_key_rows.append(rows)

    for combo in it.product(*per_key_rows):
        merged = {}
        for part in combo:
            merged.update(part)
        yield merged
       
# all_combos is a generator, so wrap it in list() to print every flattened row.
print(list(all_combos({"a": 0, "b": [{"c": 1, "d": [10, 100]}, {"c": 2, "d": [20, 200]}]})))

输出:

[{'a': 0, 'b.c': 1, 'b.d': 10}, {'a': 0, 'b.c': 1, 'b.d': 100}, {'a': 0, 'b.c': 2, 'b.d': 20}, {'a': 0, 'b.c': 2, 'b.d': 200}]

完整结果:

# Run the flattener over every sample input and print each list of rows.
all_vals = [example_1, example_2, example_3, my_dict]
for i in all_vals:
    print(list(all_combos(i)))

输出:

[{'a': 0, 'b.c': 1, 'b.d': 10}, 
 {'a': 0, 'b.c': 1, 'b.d': 100}, 
 {'a': 0, 'b.c': 2, 'b.d': 20}, 
 {'a': 0, 'b.c': 2, 'b.d': 200}]
[{'a': 0, 'b': 2}, 
 {'a': 0, 'b': 3}, 
 {'a': 1, 'b': 2}, 
 {'a': 1, 'b': 3}]
[{'a': 0, 'b.c': 1}, 
 {'a': 0, 'b.c': 2}]
[{'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'University of Central Florida', 'info.counties.name': 'Dade', 'info.counties.population': 12345}, 
 {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'University of Central Florida', 'info.counties.name': 'Broward', 'info.counties.population': 40000}, 
 {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'University of Central Florida', 'info.counties.name': 'Palm Beach', 'info.counties.population': 60000}, 
 {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'Florida State University', 'info.counties.name': 'Dade', 'info.counties.population': 12345}, 
 {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'Florida State University', 'info.counties.name': 'Broward', 'info.counties.population': 40000}, 
 {'state': 'Florida', 'shortname': 'FL', 'info.governor': 'Rick Scott', 'info.universities': 'Florida State University', 'info.counties.name': 'Palm Beach', 'info.counties.population': 60000}]