将嵌套 json 转换为 python 中 csv 中的嵌套数组
Convert nested JSON with nested arrays to CSV in Python
如果我在 json 文件中有这样一个 dictionary/json 的列表,我如何使用 python 脚本或除手动之外的任何其他方式将其转换为 csv。
我这里的headers将把它展平,每个键都有一个值作为一列。这里的数组Response
,我想让这里的每个元素都是一个单独的行,上面的所有数据都和单独的列一样。因此,例如,如果下面的 Response
数组有 3 个项目,那么列表中应该有 3 行项目,如 adRefId
、addrRefId
等,上面和下面的字段相同数组即 creation_date
、expiration_date
、modification_date
、revision
、adRefId
、addrRefId
、doc_type
等。
[
{
"aggregate_result": [],
"explain": "",
"key_with_document": [
{
"document": {
"creation_date": 1643342434,
"expiration_date": 2053342527,
"modification_date": 1643342527,
"revision": 4,
"struct": {
"MatchResponse": [
{
"adRefId": "e6040-c8dcdb165993",
"addrRefId": "city_list:0",
"MatchCode": "REGI_ADDR_BLOCK",
"maxScore": 0.9968223809704663
},
{
"adRefId": "800-3c7a04dc8d3f",
"addrRefId": "address_list:0",
"MatchCode": "_ADDR_BLOCK",
"maxScore": 0
},
{
"adRefId": "ab39f31d-6b21-4377-9c91-85fdd345c22a",
"addrRefId": "name_block_list:0",
"MatchCode": "ADDR_BLOCK",
"maxScore": 0
}
],
"MatchStatus": 200,
"dataRefs": [
{
"addressRef": {
"addrRefId": "0",
"addrType": "REGISTRATION_ADDRESS",
"addressLine1": "123 Test Street",
"addressLine2": "",
"city": "",
"country": "Federation",
"postalCode": "12345",
"province": ""
},
"dataId": "0"
}
],
"docType": "_SCREEN",
"extRefId1": "b326c63721536765412099",
"extRefId1Type": "",
"extRefId2": "",
"extRefId2Type": "_SETTINGS",
"ules": [
"1213395"
],
"Status": [
"20"
]
}
},
"key": {
"id": [
{
"collection": "__ROOT__",
"string": "3721536765412099_E"
}
],
"is_partial": false
}
}
]
}
]
我尝试了以下代码,但没能找到在 meta
参数中包含所需列(如 creation_date、expiration_date 等)的正确语法。
def main():
    """Flatten Extract1.json and write one CSV row per MatchResponse element."""
    data = read_json(filename='Extract1.json')
    # meta=['key_with_document'] keeps the whole parent dict in a single
    # column.  To surface keys like creation_date / expiration_date as
    # their own columns, each meta entry must be a full path-list, e.g.
    # ['key_with_document', 'document', 'creation_date'].
    df2 = pd.json_normalize(
        data,
        record_path=['key_with_document', ['document', 'struct', 'MatchResponse']],
        meta=['key_with_document'],
    )
    print(df2)
    df2.to_csv('out2.csv')


if __name__ == '__main__':
    main()
我的输出看起来像这样,其中 keys_with_document
部分全部在 1 列中,但我希望键位于单独的列中
这似乎符合您的要求。请注意,我忽略了 dataRefs
,因为那似乎是另一个列表。您可以扩展它以吸收其中的元素 [0]。
# Flatten the nested sample JSON into my.csv: one output row per
# MatchResponse element, with the parent document/struct fields repeated
# on every row.  (dataRefs is ignored here because it is another list.)
data="""[
{
"aggregate_result": [],
"explain": "",
"key_with_document": [
{
"document": {
"creation_date": 1643342434,
"expiration_date": 2053342527,
"modification_date": 1643342527,
"revision": 4,
"struct": {
"MatchResponse": [
{
"adRefId": "e6040-c8dcdb165993",
"addrRefId": "city_list:0",
"MatchCode": "REGI_ADDR_BLOCK",
"maxScore": 0.9968223809704663
},
{
"adRefId": "800-3c7a04dc8d3f",
"addrRefId": "address_list:0",
"MatchCode": "_ADDR_BLOCK",
"maxScore": 0
},
{
"adRefId": "ab39f31d-6b21-4377-9c91-85fdd345c22a",
"addrRefId": "name_block_list:0",
"MatchCode": "ADDR_BLOCK",
"maxScore": 0
}
],
"MatchStatus": 200,
"dataRefs": [
{
"addressRef": {
"addrRefId": "0",
"addrType": "REGISTRATION_ADDRESS",
"addressLine1": "123 Test Street",
"addressLine2": "",
"city": "",
"country": "Federation",
"postalCode": "12345",
"province": ""
},
"dataId": "0"
}
],
"docType": "_SCREEN",
"extRefId1": "b326c63721536765412099",
"extRefId1Type": "",
"extRefId2": "",
"extRefId2Type": "_SETTINGS",
"ules": [
"1213395"
],
"Status": [
"20"
]
}
},
"key": {
"id": [
{
"collection": "__ROOT__",
"string": "3721536765412099_E"
}
],
"is_partial": false
}
}
]
}
]"""

import json
import csv

data = json.loads(data)

# Column groups, keyed by where each value lives in the JSON tree.
fixed = [            # scalar fields on each document
    "creation_date",
    "expiration_date",
    "modification_date",
    "revision",
]
fromstruct = [       # fields on document['struct']
    "docType",
    "extRefId1",
    "extRefId1Type",
    "extRefId2",
    "extRefId2Type",
    "ules",
    "Status",
]
fromresponse = [     # fields on each MatchResponse element
    "adRefId",
    "addrRefId",
    "MatchCode",
    "maxScore",
]
allfields = fixed + fromstruct + fromresponse

# The original leaked the handle returned by open(); `with` guarantees the
# CSV is flushed and closed even if writing a row raises.
with open('my.csv', 'w', newline='') as handle:
    fout = csv.DictWriter(handle, fieldnames=allfields)
    fout.writeheader()
    for obj in data:
        for entry in obj['key_with_document']:
            doc = entry['document']
            struct = doc['struct']
            row = {name: doc[name] for name in fixed}
            for name in fromstruct:
                value = struct[name]
                # single-element lists (ules, Status) collapse to item [0]
                row[name] = value[0] if isinstance(value, list) else value
            # one output row per MatchResponse element; parent fields repeat
            for resp in struct['MatchResponse']:
                for name in fromresponse:
                    row[name] = resp[name]
                fout.writerow(row)
输出 CSV 文件:
creation_date,expiration_date,modification_date,revision,docType,extRefId1,extRefId1Type,extRefId2,extRefId2Type,ules,Status,adRefId,addrRefId,MatchCode,maxScore
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,e6040-c8dcdb165993,city_list:0,REGI_ADDR_BLOCK,0.9968223809704663
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,800-3c7a04dc8d3f,address_list:0,_ADDR_BLOCK,0
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,ab39f31d-6b21-4377-9c91-85fdd345c22a,name_block_list:0,ADDR_BLOCK,0
我设法使用 pandas
找出答案。这是我的替代方案:
def read_json(filename: str) -> dict:
    """Load *filename* and return its parsed JSON content.

    Raises:
        Exception: if the file cannot be read or is not valid JSON; the
            underlying error is chained as ``__cause__``.
    """
    try:
        with open(filename) as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as err:
        # The original bare `except:` swallowed everything (including
        # KeyboardInterrupt) and hid the real cause; catch only the two
        # expected failure modes, chain them, and report the filename.
        raise Exception(f"Reading {filename!r} encountered an error") from err
    return data
def main():
    """Flatten ExtractFile1.json into out3.csv: one row per MatchResponse item."""
    data = read_json(filename='ExtractFile1.json')
    # Every meta entry is a full path from the record root down to one
    # field; build the paths programmatically instead of the original
    # single unreadable inline literal.
    doc_path = ['key_with_document', 'document']
    struct_path = doc_path + ['struct']
    meta = [doc_path + [name] for name in (
        'creation_date', 'expiration_date', 'modification_date', 'revision',
    )] + [struct_path + [name] for name in (
        # 'ules' was misspelled 'Rul' in the original, which is not a key
        # in the data; dataRefs is itself a list and lands in one column.
        'MatchStatus', 'docType', 'extRefId1', 'extRefId1Type',
        'extRefId2', 'extRefId2Type', 'ules', 'Status', 'dataRefs',
    )]
    df3 = pd.json_normalize(
        data,
        record_path=['key_with_document', ['document', 'struct', 'MatchResponse']],
        meta=meta,
    )
    df3.to_csv('out3.csv')


if __name__ == '__main__':
    main()
如果我在 json 文件中有这样一个 dictionary/json 的列表,我如何使用 python 脚本或除手动之外的任何其他方式将其转换为 csv。
我这里的headers将把它展平,每个键都有一个值作为一列。这里的数组Response
,我想让这里的每个元素都是一个单独的行,上面的所有数据都和单独的列一样。因此,例如,如果下面的 Response
数组有 3 个项目,那么列表中应该有 3 行项目,如 adRefId
、addrRefId
等,上面和下面的字段相同数组即 creation_date
、expiration_date
、modification_date
、revision
、adRefId
、addrRefId
、doc_type
等。
[
{
"aggregate_result": [],
"explain": "",
"key_with_document": [
{
"document": {
"creation_date": 1643342434,
"expiration_date": 2053342527,
"modification_date": 1643342527,
"revision": 4,
"struct": {
"MatchResponse": [
{
"adRefId": "e6040-c8dcdb165993",
"addrRefId": "city_list:0",
"MatchCode": "REGI_ADDR_BLOCK",
"maxScore": 0.9968223809704663
},
{
"adRefId": "800-3c7a04dc8d3f",
"addrRefId": "address_list:0",
"MatchCode": "_ADDR_BLOCK",
"maxScore": 0
},
{
"adRefId": "ab39f31d-6b21-4377-9c91-85fdd345c22a",
"addrRefId": "name_block_list:0",
"MatchCode": "ADDR_BLOCK",
"maxScore": 0
}
],
"MatchStatus": 200,
"dataRefs": [
{
"addressRef": {
"addrRefId": "0",
"addrType": "REGISTRATION_ADDRESS",
"addressLine1": "123 Test Street",
"addressLine2": "",
"city": "",
"country": "Federation",
"postalCode": "12345",
"province": ""
},
"dataId": "0"
}
],
"docType": "_SCREEN",
"extRefId1": "b326c63721536765412099",
"extRefId1Type": "",
"extRefId2": "",
"extRefId2Type": "_SETTINGS",
"ules": [
"1213395"
],
"Status": [
"20"
]
}
},
"key": {
"id": [
{
"collection": "__ROOT__",
"string": "3721536765412099_E"
}
],
"is_partial": false
}
}
]
}
]
我尝试了以下代码,但没能找到在 meta
参数中包含所需列(如 creation_date、expiration_date 等)的正确语法。
def main():
    """Flatten Extract1.json and write one CSV row per MatchResponse element."""
    data = read_json(filename='Extract1.json')
    # meta=['key_with_document'] keeps the whole parent dict in a single
    # column.  To surface keys like creation_date / expiration_date as
    # their own columns, each meta entry must be a full path-list, e.g.
    # ['key_with_document', 'document', 'creation_date'].
    df2 = pd.json_normalize(
        data,
        record_path=['key_with_document', ['document', 'struct', 'MatchResponse']],
        meta=['key_with_document'],
    )
    print(df2)
    df2.to_csv('out2.csv')


if __name__ == '__main__':
    main()
我的输出看起来像这样,其中 keys_with_document
部分全部在 1 列中,但我希望键位于单独的列中
这似乎符合您的要求。请注意,我忽略了 dataRefs
,因为那似乎是另一个列表。您可以扩展它以吸收其中的元素 [0]。
# Flatten the nested sample JSON into my.csv: one output row per
# MatchResponse element, with the parent document/struct fields repeated
# on every row.  (dataRefs is ignored here because it is another list.)
data="""[
{
"aggregate_result": [],
"explain": "",
"key_with_document": [
{
"document": {
"creation_date": 1643342434,
"expiration_date": 2053342527,
"modification_date": 1643342527,
"revision": 4,
"struct": {
"MatchResponse": [
{
"adRefId": "e6040-c8dcdb165993",
"addrRefId": "city_list:0",
"MatchCode": "REGI_ADDR_BLOCK",
"maxScore": 0.9968223809704663
},
{
"adRefId": "800-3c7a04dc8d3f",
"addrRefId": "address_list:0",
"MatchCode": "_ADDR_BLOCK",
"maxScore": 0
},
{
"adRefId": "ab39f31d-6b21-4377-9c91-85fdd345c22a",
"addrRefId": "name_block_list:0",
"MatchCode": "ADDR_BLOCK",
"maxScore": 0
}
],
"MatchStatus": 200,
"dataRefs": [
{
"addressRef": {
"addrRefId": "0",
"addrType": "REGISTRATION_ADDRESS",
"addressLine1": "123 Test Street",
"addressLine2": "",
"city": "",
"country": "Federation",
"postalCode": "12345",
"province": ""
},
"dataId": "0"
}
],
"docType": "_SCREEN",
"extRefId1": "b326c63721536765412099",
"extRefId1Type": "",
"extRefId2": "",
"extRefId2Type": "_SETTINGS",
"ules": [
"1213395"
],
"Status": [
"20"
]
}
},
"key": {
"id": [
{
"collection": "__ROOT__",
"string": "3721536765412099_E"
}
],
"is_partial": false
}
}
]
}
]"""

import json
import csv

data = json.loads(data)

# Column groups, keyed by where each value lives in the JSON tree.
fixed = [            # scalar fields on each document
    "creation_date",
    "expiration_date",
    "modification_date",
    "revision",
]
fromstruct = [       # fields on document['struct']
    "docType",
    "extRefId1",
    "extRefId1Type",
    "extRefId2",
    "extRefId2Type",
    "ules",
    "Status",
]
fromresponse = [     # fields on each MatchResponse element
    "adRefId",
    "addrRefId",
    "MatchCode",
    "maxScore",
]
allfields = fixed + fromstruct + fromresponse

# The original leaked the handle returned by open(); `with` guarantees the
# CSV is flushed and closed even if writing a row raises.
with open('my.csv', 'w', newline='') as handle:
    fout = csv.DictWriter(handle, fieldnames=allfields)
    fout.writeheader()
    for obj in data:
        for entry in obj['key_with_document']:
            doc = entry['document']
            struct = doc['struct']
            row = {name: doc[name] for name in fixed}
            for name in fromstruct:
                value = struct[name]
                # single-element lists (ules, Status) collapse to item [0]
                row[name] = value[0] if isinstance(value, list) else value
            # one output row per MatchResponse element; parent fields repeat
            for resp in struct['MatchResponse']:
                for name in fromresponse:
                    row[name] = resp[name]
                fout.writerow(row)
输出 CSV 文件:
creation_date,expiration_date,modification_date,revision,docType,extRefId1,extRefId1Type,extRefId2,extRefId2Type,ules,Status,adRefId,addrRefId,MatchCode,maxScore
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,e6040-c8dcdb165993,city_list:0,REGI_ADDR_BLOCK,0.9968223809704663
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,800-3c7a04dc8d3f,address_list:0,_ADDR_BLOCK,0
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,ab39f31d-6b21-4377-9c91-85fdd345c22a,name_block_list:0,ADDR_BLOCK,0
我设法使用 pandas
找出答案。这是我的替代方案:
def read_json(filename: str) -> dict:
    """Load *filename* and return its parsed JSON content.

    Raises:
        Exception: if the file cannot be read or is not valid JSON; the
            underlying error is chained as ``__cause__``.
    """
    try:
        with open(filename) as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as err:
        # The original bare `except:` swallowed everything (including
        # KeyboardInterrupt) and hid the real cause; catch only the two
        # expected failure modes, chain them, and report the filename.
        raise Exception(f"Reading {filename!r} encountered an error") from err
    return data
def main():
    """Flatten ExtractFile1.json into out3.csv: one row per MatchResponse item."""
    data = read_json(filename='ExtractFile1.json')
    # Every meta entry is a full path from the record root down to one
    # field; build the paths programmatically instead of the original
    # single unreadable inline literal.
    doc_path = ['key_with_document', 'document']
    struct_path = doc_path + ['struct']
    meta = [doc_path + [name] for name in (
        'creation_date', 'expiration_date', 'modification_date', 'revision',
    )] + [struct_path + [name] for name in (
        # 'ules' was misspelled 'Rul' in the original, which is not a key
        # in the data; dataRefs is itself a list and lands in one column.
        'MatchStatus', 'docType', 'extRefId1', 'extRefId1Type',
        'extRefId2', 'extRefId2Type', 'ules', 'Status', 'dataRefs',
    )]
    df3 = pd.json_normalize(
        data,
        record_path=['key_with_document', ['document', 'struct', 'MatchResponse']],
        meta=meta,
    )
    df3.to_csv('out3.csv')


if __name__ == '__main__':
    main()