如何将嵌套的 JSON 转换为 CSV
How to convert a nested JSON to CSV
我想将嵌套 json 转换为 csv 格式,包括分组 list/dict 的子行。
这是我的json
data =\
{
"id": "1",
"name": "HIGHLEVEL",
"description": "HLD",
"item": {
"id": "11",
"description": "description"
},
"packages": [{
"id": "1",
"label": "Package 1",
"products": [{
"id": "1",
"price": 5
}, {
"id": "2",
"price": 3
}
]
}, {
"id": "2",
"label": "Package 3",
"products": [{
"id": "1",
"price": 5
}, {
"id": "2",
"price": 3
}
]
}
]
}
import pandas as pd
# Called without record_path, json_normalize flattens only the nested "item"
# dict; the "packages" list is kept as a single cell (see the printed output).
df = pd.json_normalize(data)
# display(df)
description id name packages item.description item.id
0 HLD 1 HIGHLEVEL [{'id': '1', 'label': 'Package 1', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}, {'id': '2', 'label': 'Package 3', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}] description 11
期望的输出:
"id","name","description","item__id","item__description","packages__id","packages__label","packages__products__id","packages__products__price"
"1","HIGHLEVEL","HLD","11","description","1","Package 1","1","5"
"","","","","","","","2","3"
"","","","","","2","Package 3","1","5"
"","","","","","","","2","3"
我尝试了 pandas 规范化,但结果与想要的不一样。
JSON 数组未转换为 csv 中的子行。
我想在 csv 中保留空字符串。
我想做同样的事情,但使用 Python 脚本。
感谢@Trenton McKinney
import pandas as pd
import json
data =\
{'description': 'HLD',
 'id': '1',
 'item': {'description': 'description', 'id': '11'},
 'name': 'HIGHLEVEL',
 'packages': [{'id': '1',
               'label': 'Package 1',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]},
              {'id': '2',
               'label': 'Package 3',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]}

# One row per package; top-level fields are carried along with a temporary
# "top_" prefix so they do not clash with the package-level "id".
df = pd.json_normalize(data, record_path=['packages'], meta=['id', 'name', 'description', ['item', 'id'], ['item', 'description']], meta_prefix='top_', sep='_')

# explode() repeats the index label for every product of a package. Reset it
# so the index-based join below pairs each row with its own product instead
# of matching several rows to the same product.
df = df.explode('products').reset_index(drop=True)
df.rename({'id': 'packages_id', 'label': 'packages_label'}, axis=1, inplace=True)

# Expand each product dict into its own columns and attach them row by row.
df = df.join(pd.DataFrame(df.pop('products').values.tolist()))
df.rename({'id': 'packages_products_id', 'price': 'packages_products_price'}, axis=1, inplace=True)

# Strip the temporary prefix; regex=False makes the literal match explicit.
df.columns = df.columns.str.replace('top_', '', regex=False)
df = df[['id', 'name', 'description', 'item_id', 'item_description', 'packages_id', 'packages_label', 'packages_products_id', 'packages_products_price']]

# Blank out repeated values so each group value is shown only on its first row.
columns_to_group = ["name", "description", "item_id", "item_description", "packages_id", "packages_label"]
for c in columns_to_group:
    df[c] = df[c].mask(df[c].duplicated(), "")

print(df)
df.to_csv('data.csv', index=False)
现在我必须让它更通用,以便它可以用于任何结构化 json。
这应该适合你:
from copy import deepcopy
import pandas
def cross_join(left, right):
    """Return the Cartesian product of two lists of row dicts.

    Every output row is an independent deep copy containing the keys of one
    ``left`` row overlaid with the keys of one ``right`` row (``right`` wins
    when both define the same key). Rows are ordered left-major.

    Args:
        left: List of dicts (the rows built so far).
        right: List of dicts to combine with every row of ``left``.

    Returns:
        A new list of merged rows. When ``right`` is empty there is nothing
        to combine with, so ``left`` itself is returned unchanged.
    """
    if not right:
        return left
    # A single deepcopy of the merged dict is sufficient; copying the left
    # row and then copying the merged row again duplicated the work.
    return [
        deepcopy({**left_row, **right_row})
        for left_row in left
        for right_row in right
    ]
def flatten_list(data):
    """Lazily yield every non-list item of ``data``, descending into nested lists in order."""
    for item in data:
        if not isinstance(item, list):
            yield item
            continue
        # Nested list: recurse and forward its leaves in their original order.
        for leaf in flatten_list(item):
            yield leaf
def json_to_dataframe(data_in):
    """Flatten arbitrarily nested JSON-like data into a pandas DataFrame.

    Column names are the keys along the path to each leaf joined with ``_``.
    Each element of a list produces its own row(s), and the rows produced by
    the different keys of a dict are combined with ``cross_join``.

    Args:
        data_in: Parsed JSON (dicts, lists and scalars). It is not modified.

    Returns:
        A ``pandas.DataFrame`` with one row per combination of list elements.
    """
    def flatten_json(data, prev_heading=''):
        """Return the flat list of row dicts that ``data`` expands to."""
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, prev_heading + '_' + key))
        elif isinstance(data, list):
            # An empty list still has to yield one row with an empty-string
            # cell. Use a local stand-in rather than appending to the caller's
            # list, which would silently change the input data.
            items = data if data else [""]
            rows = []
            for item in items:
                # flatten_json always returns a flat list of dicts, so the
                # rows can be appended directly.
                rows.extend(flatten_json(item, prev_heading))
        else:
            # Scalar leaf: drop the leading '_' added while building the path.
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))
def remove_duplicates(df, columns=None):
    """Blank out repeated values so only their first occurrence is shown.

    Args:
        df: DataFrame to process; its columns are updated in place.
        columns: Labels of the columns to process. Defaults to the first
            seven columns of ``df`` (the previously hard-coded behaviour).

    Returns:
        The same ``df`` object, to allow chaining.

    Note:
        A value is blanked whenever it already appeared anywhere earlier in
        the same column (``Series.duplicated``), not only directly above it.
    """
    if columns is None:
        columns = list(df)[:7]
    for c in columns:
        df[c] = df[c].mask(df[c].duplicated(), "")
    return df
if __name__ == '__main__':
    # Flatten the sample JSON, blank the repeated group values, then show
    # the table and export it as CSV.
    df = remove_duplicates(json_to_dataframe(data))
    print(df)
    df.to_csv('data.csv', index=False)
输入 01:
data = {
"id": "1",
"name": "HIGHLEVEL",
"description": "HLD",
"item": {
"id": "11",
"description": "description"
},
"packages": [{
"id": "1",
"label": "Package 1",
"products": [{
"id": "1",
"price": 5
}, {
"id": "2",
"price": 3
}, {
"id": "3",
"price": 9
}
]
}, {
"id": "2",
"label": "Package 3",
"products": [{
"id": "1",
"price": 5
}, {
"id": "2",
"price": 3
}, {
"id": "3",
"price": 9
}
]
}
]
}
输出 01:
输入 02:
data = {
"id": "1",
"name": "HIGHLEVEL",
"description": "HLD",
"item": {
"id": "11",
"description": "description"
},
"packages": [{
"id": "1",
"label": "Package 1",
"products": []
}, {
"id": "2",
"label": "Package 3",
"products": []
}
]
}
输出 02:
希望它能解决您的问题。如果您需要任何解释,请告诉我。
谢谢
import json # For JSON loading
import csv # For CSV dict writer
def get_leaves(item, key=None, key_prefix=""):
    """Flatten a nested dict/list structure into a single-level dict.

    Keys of nested dicts are joined with ``_`` so that leaves sharing a name
    under different parents stay distinct.

    Args:
        item: The value to flatten (dict, list or scalar).
        key: Key under which ``item`` was found in its parent.
        key_prefix: Flat key accumulated so far for ``item``.

    Returns:
        A dict mapping flat keys to leaf values. Scalars found directly in a
        list are collected into one list stored under that list's flat key.

    Note:
        Dicts inside a list are all flattened under the same prefix, so a
        later element overwrites same-named keys of an earlier element.
    """
    if isinstance(item, dict):
        leaves = {}
        for item_key, value in item.items():
            # Extend the prefix with this key so the leaf's flat name records
            # its whole path.
            child_prefix = (
                item_key if key_prefix == "" else key_prefix + "_" + str(item_key)
            )
            leaves.update(get_leaves(value, item_key, child_prefix))
        return leaves
    if isinstance(item, list):
        leaves = {}
        elements = []
        for element in item:
            if isinstance(element, (dict, list)):
                # Containers are flattened recursively under the same prefix.
                leaves.update(get_leaves(element, key, key_prefix))
            else:
                elements.append(element)
        if elements:
            # Store scalar lists under the full prefixed key, like every
            # other leaf; using the bare key made nested lists with the same
            # name overwrite each other. Fall back to ``key`` only when no
            # prefix exists (direct top-level call).
            leaf_key = key_prefix if key_prefix != "" else key
            leaves[leaf_key] = elements
        return leaves
    return {key_prefix: item}
# JSON is UTF-8; open both files with an explicit encoding so the result does
# not depend on the platform's default locale encoding.
with open("./campaign-summary.json", encoding="utf-8") as f_input, open("./finalised_output.csv", "w", newline="", encoding="utf-8") as f_output:
    json_data = json.load(f_input, strict=False)

    # Two passes on purpose: the parsed JSON is already in memory, so the
    # flattened rows are not kept as well. First collect only the set of
    # column names, then flatten each entry again while writing it out.
    fieldnames = set()
    for entry in json_data:
        fieldnames.update(get_leaves(entry).keys())

    csv_output = csv.DictWriter(f_output, delimiter=";", fieldnames=sorted(fieldnames))
    csv_output.writeheader()
    csv_output.writerows(get_leaves(entry) for entry in json_data)
我想将嵌套 json 转换为 csv 格式,包括分组 list/dict 的子行。
这是我的json
data =\
{
"id": "1",
"name": "HIGHLEVEL",
"description": "HLD",
"item": {
"id": "11",
"description": "description"
},
"packages": [{
"id": "1",
"label": "Package 1",
"products": [{
"id": "1",
"price": 5
}, {
"id": "2",
"price": 3
}
]
}, {
"id": "2",
"label": "Package 3",
"products": [{
"id": "1",
"price": 5
}, {
"id": "2",
"price": 3
}
]
}
]
}
import pandas as pd
# Called without record_path, json_normalize flattens only the nested "item"
# dict; the "packages" list is kept as a single cell (see the printed output).
df = pd.json_normalize(data)
# display(df)
description id name packages item.description item.id
0 HLD 1 HIGHLEVEL [{'id': '1', 'label': 'Package 1', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}, {'id': '2', 'label': 'Package 3', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}] description 11
期望的输出:
"id","name","description","item__id","item__description","packages__id","packages__label","packages__products__id","packages__products__price"
"1","HIGHLEVEL","HLD","11","description","1","Package 1","1","5"
"","","","","","","","2","3"
"","","","","","2","Package 3","1","5"
"","","","","","","","2","3"
我尝试了 pandas 规范化,但结果与想要的不一样。 JSON 数组未转换为 csv 中的子行。 我想在 csv 中保留空字符串。
我想做同样的事情,但使用 Python 脚本。
感谢@Trenton McKinney
import pandas as pd
import json
data =\
{'description': 'HLD',
 'id': '1',
 'item': {'description': 'description', 'id': '11'},
 'name': 'HIGHLEVEL',
 'packages': [{'id': '1',
               'label': 'Package 1',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]},
              {'id': '2',
               'label': 'Package 3',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]}

# One row per package; top-level fields are carried along with a temporary
# "top_" prefix so they do not clash with the package-level "id".
df = pd.json_normalize(data, record_path=['packages'], meta=['id', 'name', 'description', ['item', 'id'], ['item', 'description']], meta_prefix='top_', sep='_')

# explode() repeats the index label for every product of a package. Reset it
# so the index-based join below pairs each row with its own product instead
# of matching several rows to the same product.
df = df.explode('products').reset_index(drop=True)
df.rename({'id': 'packages_id', 'label': 'packages_label'}, axis=1, inplace=True)

# Expand each product dict into its own columns and attach them row by row.
df = df.join(pd.DataFrame(df.pop('products').values.tolist()))
df.rename({'id': 'packages_products_id', 'price': 'packages_products_price'}, axis=1, inplace=True)

# Strip the temporary prefix; regex=False makes the literal match explicit.
df.columns = df.columns.str.replace('top_', '', regex=False)
df = df[['id', 'name', 'description', 'item_id', 'item_description', 'packages_id', 'packages_label', 'packages_products_id', 'packages_products_price']]

# Blank out repeated values so each group value is shown only on its first row.
columns_to_group = ["name", "description", "item_id", "item_description", "packages_id", "packages_label"]
for c in columns_to_group:
    df[c] = df[c].mask(df[c].duplicated(), "")

print(df)
df.to_csv('data.csv', index=False)
现在我必须让它更通用,以便它可以用于任何结构化 json。
这应该适合你:
from copy import deepcopy
import pandas
def cross_join(left, right):
    """Return the Cartesian product of two lists of row dicts.

    Every output row is an independent deep copy containing the keys of one
    ``left`` row overlaid with the keys of one ``right`` row (``right`` wins
    when both define the same key). Rows are ordered left-major.

    Args:
        left: List of dicts (the rows built so far).
        right: List of dicts to combine with every row of ``left``.

    Returns:
        A new list of merged rows. When ``right`` is empty there is nothing
        to combine with, so ``left`` itself is returned unchanged.
    """
    if not right:
        return left
    # A single deepcopy of the merged dict is sufficient; copying the left
    # row and then copying the merged row again duplicated the work.
    return [
        deepcopy({**left_row, **right_row})
        for left_row in left
        for right_row in right
    ]
def flatten_list(data):
    """Lazily yield every non-list item of ``data``, descending into nested lists in order."""
    for item in data:
        if not isinstance(item, list):
            yield item
            continue
        # Nested list: recurse and forward its leaves in their original order.
        for leaf in flatten_list(item):
            yield leaf
def json_to_dataframe(data_in):
    """Flatten arbitrarily nested JSON-like data into a pandas DataFrame.

    Column names are the keys along the path to each leaf joined with ``_``.
    Each element of a list produces its own row(s), and the rows produced by
    the different keys of a dict are combined with ``cross_join``.

    Args:
        data_in: Parsed JSON (dicts, lists and scalars). It is not modified.

    Returns:
        A ``pandas.DataFrame`` with one row per combination of list elements.
    """
    def flatten_json(data, prev_heading=''):
        """Return the flat list of row dicts that ``data`` expands to."""
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, prev_heading + '_' + key))
        elif isinstance(data, list):
            # An empty list still has to yield one row with an empty-string
            # cell. Use a local stand-in rather than appending to the caller's
            # list, which would silently change the input data.
            items = data if data else [""]
            rows = []
            for item in items:
                # flatten_json always returns a flat list of dicts, so the
                # rows can be appended directly.
                rows.extend(flatten_json(item, prev_heading))
        else:
            # Scalar leaf: drop the leading '_' added while building the path.
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))
def remove_duplicates(df, columns=None):
    """Blank out repeated values so only their first occurrence is shown.

    Args:
        df: DataFrame to process; its columns are updated in place.
        columns: Labels of the columns to process. Defaults to the first
            seven columns of ``df`` (the previously hard-coded behaviour).

    Returns:
        The same ``df`` object, to allow chaining.

    Note:
        A value is blanked whenever it already appeared anywhere earlier in
        the same column (``Series.duplicated``), not only directly above it.
    """
    if columns is None:
        columns = list(df)[:7]
    for c in columns:
        df[c] = df[c].mask(df[c].duplicated(), "")
    return df
if __name__ == '__main__':
    # Flatten the sample JSON, blank the repeated group values, then show
    # the table and export it as CSV.
    df = remove_duplicates(json_to_dataframe(data))
    print(df)
    df.to_csv('data.csv', index=False)
输入 01:
data = {
"id": "1",
"name": "HIGHLEVEL",
"description": "HLD",
"item": {
"id": "11",
"description": "description"
},
"packages": [{
"id": "1",
"label": "Package 1",
"products": [{
"id": "1",
"price": 5
}, {
"id": "2",
"price": 3
}, {
"id": "3",
"price": 9
}
]
}, {
"id": "2",
"label": "Package 3",
"products": [{
"id": "1",
"price": 5
}, {
"id": "2",
"price": 3
}, {
"id": "3",
"price": 9
}
]
}
]
}
输出 01:
输入 02:
data = {
"id": "1",
"name": "HIGHLEVEL",
"description": "HLD",
"item": {
"id": "11",
"description": "description"
},
"packages": [{
"id": "1",
"label": "Package 1",
"products": []
}, {
"id": "2",
"label": "Package 3",
"products": []
}
]
}
输出 02:
希望它能解决您的问题。如果您需要任何解释,请告诉我。
谢谢
import json # For JSON loading
import csv # For CSV dict writer
def get_leaves(item, key=None, key_prefix=""):
    """Flatten a nested dict/list structure into a single-level dict.

    Keys of nested dicts are joined with ``_`` so that leaves sharing a name
    under different parents stay distinct.

    Args:
        item: The value to flatten (dict, list or scalar).
        key: Key under which ``item`` was found in its parent.
        key_prefix: Flat key accumulated so far for ``item``.

    Returns:
        A dict mapping flat keys to leaf values. Scalars found directly in a
        list are collected into one list stored under that list's flat key.

    Note:
        Dicts inside a list are all flattened under the same prefix, so a
        later element overwrites same-named keys of an earlier element.
    """
    if isinstance(item, dict):
        leaves = {}
        for item_key, value in item.items():
            # Extend the prefix with this key so the leaf's flat name records
            # its whole path.
            child_prefix = (
                item_key if key_prefix == "" else key_prefix + "_" + str(item_key)
            )
            leaves.update(get_leaves(value, item_key, child_prefix))
        return leaves
    if isinstance(item, list):
        leaves = {}
        elements = []
        for element in item:
            if isinstance(element, (dict, list)):
                # Containers are flattened recursively under the same prefix.
                leaves.update(get_leaves(element, key, key_prefix))
            else:
                elements.append(element)
        if elements:
            # Store scalar lists under the full prefixed key, like every
            # other leaf; using the bare key made nested lists with the same
            # name overwrite each other. Fall back to ``key`` only when no
            # prefix exists (direct top-level call).
            leaf_key = key_prefix if key_prefix != "" else key
            leaves[leaf_key] = elements
        return leaves
    return {key_prefix: item}
# JSON is UTF-8; open both files with an explicit encoding so the result does
# not depend on the platform's default locale encoding.
with open("./campaign-summary.json", encoding="utf-8") as f_input, open("./finalised_output.csv", "w", newline="", encoding="utf-8") as f_output:
    json_data = json.load(f_input, strict=False)

    # Two passes on purpose: the parsed JSON is already in memory, so the
    # flattened rows are not kept as well. First collect only the set of
    # column names, then flatten each entry again while writing it out.
    fieldnames = set()
    for entry in json_data:
        fieldnames.update(get_leaves(entry).keys())

    csv_output = csv.DictWriter(f_output, delimiter=";", fieldnames=sorted(fieldnames))
    csv_output.writeheader()
    csv_output.writerows(get_leaves(entry) for entry in json_data)