如何将嵌套的 JSON 转换为 CSV

How to convert a nested JSON to CSV

我想将嵌套的 JSON 转换为 CSV 格式,并把其中的 list/dict 展开为分组的子行。

这是我的json

# Sample document: top-level scalars, one nested dict ("item") and a list of
# packages, each of which carries its own list of products.
data =\
{
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }
    ]
}
import pandas as pd

# Called without record_path, json_normalize flattens the nested "item" dict
# into dotted columns but leaves "packages" as a single list-valued cell, so
# the result is one row rather than one row per product.
df = pd.json_normalize(data)

# display(df)
  description id       name                                                                                                                                                                                                packages item.description item.id
0         HLD  1  HIGHLEVEL  [{'id': '1', 'label': 'Package 1', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}, {'id': '2', 'label': 'Package 3', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]      description      11

下面是 JSON to CSV Converter 的输出(也就是我期望的结果):
"id","name","description","item__id","item__description","packages__id","packages__label","packages__products__id","packages__products__price"
"1","HIGHLEVEL","HLD","11","description","1","Package 1","1","5"
"","","","","","","","2","3"
"","","","","","2","Package 3","1","5"
"","","","","","","","2","3"

我尝试了 pandas 的 json_normalize,但结果与我想要的不一样:JSON 数组没有被转换为 CSV 中的子行。另外,我想在 CSV 中保留空字符串。

我想做同样的事情,但使用 Python 脚本。

感谢@Trenton McKinney

import pandas as pd
import json

# Sample document: top-level scalars, one nested dict ("item") and a list of
# packages, each of which carries its own list of products.
data =\
{'description': 'HLD',
 'id': '1',
 'item': {'description': 'description', 'id': '11'},
 'name': 'HIGHLEVEL',
 'packages': [{'id': '1',
               'label': 'Package 1',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]},
              {'id': '2',
               'label': 'Package 3',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]}


# One row per package; the top-level fields are carried along as "top_"-prefixed
# meta columns so they cannot clash with the package's own "id".
df = pd.json_normalize(data, record_path=['packages'], meta=['id', 'name', 'description', ['item', 'id'], ['item', 'description']], meta_prefix='top_', sep='_')

# One row per product. explode() repeats the parent index (0, 0, 1, 1), so the
# index is reset here: the join below aligns on index labels, and without a
# fresh 0..n-1 index every product row of a package would receive the same
# product instead of its own.
df = df.explode('products').reset_index(drop=True)

df.rename({'id': 'packages_id', 'label': 'packages_label'}, axis=1, inplace=True)

# Turn the per-row product dicts into real columns ("id", "price").
df = df.join(pd.DataFrame(df.pop('products').values.tolist()))

df.rename({'id': 'packages_products_id', 'price': 'packages_products_price'}, axis=1, inplace=True)

# Drop the temporary meta prefix now that every column name is unique.
df.columns = df.columns.str.replace('top_', '')

df = df[['id', 'name', 'description', 'item_id', 'item_description', 'packages_id', 'packages_label', 'packages_products_id', 'packages_products_price']]

columns_to_group = ["name", "description", "item_id", "item_description", "packages_id", "packages_label"]

# Blank out values that already appeared higher up in the same column so that
# child rows show only what changed.
for c in columns_to_group:
  df[c] = df[c].mask(
    df[c].duplicated(), ""
  )

print(df)

df.to_csv('data.csv', index=False)

现在我必须让它更通用,以便它可以用于任何结构化 json。

这应该适合你:

from copy import deepcopy
import pandas


def cross_join(left, right):
    """Combine every row of ``left`` with every row of ``right``.

    Both arguments are lists of dicts. Each result row is a copy of a left
    row updated with the key/value pairs of a right row. When ``right`` is
    empty, ``left`` itself is returned unchanged.
    """
    # An empty right side contributes nothing, so the left rows pass through.
    combined = left if not right else []
    for base_row in left:
        for extra_row in right:
            merged = deepcopy(base_row)
            merged.update(extra_row.items())
            # Copy again so the stored row shares no objects with ``right``.
            combined.append(deepcopy(merged))
    return combined


def flatten_list(data):
    """Lazily yield the non-list items of ``data``, descending into nested lists."""
    for item in data:
        if not isinstance(item, list):
            yield item
            continue
        # Nested list: emit its flattened contents in place.
        for nested_item in flatten_list(item):
            yield nested_item


def json_to_dataframe(data_in):
    """Flatten a nested JSON-like structure into a pandas DataFrame.

    Dicts contribute columns, named by joining the key path with "_".
    Lists contribute rows: each element of a list becomes its own row (or
    group of rows), and sibling keys are cross-joined with those rows.
    An empty list produces a single row whose cell is the empty string.

    The input structure is not modified.

    Args:
        data_in: A dict, list or scalar, typically the result of json.load.

    Returns:
        pandas.DataFrame with one row per leaf combination.
    """
    def flatten_json(data, prev_heading=''):
        # Returns a list of row dicts for the subtree rooted at ``data``.
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, f"{prev_heading}_{key}"))
        elif isinstance(data, list):
            # An empty list still needs one row with an empty cell. Use a
            # local stand-in rather than appending "" to the caller's list.
            elements = data if data else [""]
            rows = []
            for element in elements:
                rows.extend(flatten_list(flatten_json(element, prev_heading)))
        else:
            # Leaf value: drop the leading "_" that the path join introduced.
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))

def remove_duplicates(df, columns=None):
    """Blank out repeated values so child rows show only what changed.

    For each selected column, every value that already occurred in an
    earlier row of that column is replaced with the empty string. The
    frame is modified in place and also returned.

    Args:
        df: DataFrame to clean up.
        columns: Column labels to process. When omitted, the first seven
            columns are used, which is what this function always did
            before the parameter existed.

    Returns:
        The same DataFrame object that was passed in.
    """
    if columns is None:
        columns = list(df)[:7]
    for column in columns:
        df[column] = df[column].mask(df[column].duplicated(), "")

    return df


if __name__ == '__main__':
    # Flatten the sample document, blank the repeated parent values, then
    # show the table and save it as CSV without the index column.
    df = remove_duplicates(json_to_dataframe(data))

    print(df)
    df.to_csv('data.csv', index=False)

输入 01:

# Sample input 01: two packages, each with three products.
data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }
    ]
}

输出 01:

输入 02:

# Sample input 02: two packages whose "products" lists are both empty.
data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": []
        }, {
            "id": "2",
            "label": "Package 3",
            "products": []
        }
    ]
}

输出 02:

希望它能解决您的问题。如果您需要任何解释,请告诉我。

谢谢

import json  # For JSON loading
import csv  # For CSV dict writer


def get_leaves(item, key=None, key_prefix=""):
    """
    Flatten a nested dict/list structure into a single-level dict.

    Each leaf is stored under its key path joined with "_" (for example
    ``{"a": {"b": 1}}`` becomes ``{"a_b": 1}``). Scalar elements of a list
    are collected into one list stored under that list's key path. Dicts
    and lists nested inside a list are flattened under the same path, so
    when several elements produce the same path the last one wins.

    Args:
        item: The dict, list or scalar to flatten.
        key: The immediate key ``item`` was found under. It is only passed
            along to recursive calls and kept for backward compatibility.
        key_prefix: The "_"-joined key path leading to ``item``.

    Returns:
        A flat dict mapping key paths to leaf values.
    """
    if isinstance(item, dict):
        leaves = {}
        for item_key, value in item.items():
            # The same key can occur under different parents, so leaves are
            # named by their full path rather than by their own key alone.
            child_prefix = (
                item_key if (key_prefix == "") else (key_prefix + "_" + str(item_key))
            )
            leaves.update(get_leaves(value, item_key, child_prefix))
        return leaves
    if isinstance(item, list):
        leaves = {}
        scalars = []
        for element in item:
            if isinstance(element, (dict, list)):
                leaves.update(get_leaves(element, key, key_prefix))
            else:
                scalars.append(element)
        if scalars:
            # Store under the full path, as every other leaf is. Using the
            # bare key made same-named lists under different parents
            # overwrite each other.
            leaves[key_prefix] = scalars
        return leaves
    return {key_prefix: item}


with open("./campaign-summary.json") as f_input, open("./finalised_output.csv", "w", newline="") as f_output:
    json_data = json.load(f_input, strict=False)
    # Two passes over the loaded entries. The first collects every column
    # name. The second flattens each entry again and writes it straight to
    # the CSV, so the flattened rows are never all held in memory at once.
    fieldnames = {name for entry in json_data for name in get_leaves(entry)}
    csv_output = csv.DictWriter(f_output, delimiter=";", fieldnames=sorted(fieldnames))
    csv_output.writeheader()
    for entry in json_data:
        csv_output.writerow(get_leaves(entry))