将嵌套 json 转换为 python 中 csv 中的嵌套数组
Convert nested JSON with nested arrays to CSV in Python
如果我在 json 文件中有这样一个 dictionary/json 的列表,我如何使用 python 脚本或除手动之外的任何其他方式将其转换为 csv。
我这里的headers将把它展平,每个键都有一个值作为一列。这里的数组Response
,我想让这里的每个元素都是一个单独的行,上面的所有数据都和单独的列一样。因此,例如,如果下面的 Response
数组有 3 个项目,那么列表中应该有 3 行项目,如 adRefId
、addrRefId
等,上面和下面的字段相同数组即 creation_date
、expiration_date
、modification_date
、revision
、adRefId
、addrRefId
、doc_type
等。
[
{
"aggregate_result": [],
"explain": "",
"key_with_document": [
{
"document": {
"creation_date": 1643342434,
"expiration_date": 2053342527,
"modification_date": 1643342527,
"revision": 4,
"struct": {
"MatchResponse": [
{
"adRefId": "e6040-c8dcdb165993",
"addrRefId": "city_list:0",
"MatchCode": "REGI_ADDR_BLOCK",
"maxScore": 0.9968223809704663
},
{
"adRefId": "800-3c7a04dc8d3f",
"addrRefId": "address_list:0",
"MatchCode": "_ADDR_BLOCK",
"maxScore": 0
},
{
"adRefId": "ab39f31d-6b21-4377-9c91-85fdd345c22a",
"addrRefId": "name_block_list:0",
"MatchCode": "ADDR_BLOCK",
"maxScore": 0
}
],
"MatchStatus": 200,
"dataRefs": [
{
"addressRef": {
"addrRefId": "0",
"addrType": "REGISTRATION_ADDRESS",
"addressLine1": "123 Test Street",
"addressLine2": "",
"city": "",
"country": "Federation",
"postalCode": "12345",
"province": ""
},
"dataId": "0"
}
],
"docType": "_SCREEN",
"extRefId1": "b326c63721536765412099",
"extRefId1Type": "",
"extRefId2": "",
"extRefId2Type": "_SETTINGS",
"ules": [
"1213395"
],
"Status": [
"20"
]
}
},
"key": {
"id": [
{
"collection": "__ROOT__",
"string": "3721536765412099_E"
}
],
"is_partial": false
}
}
]
}
]
我尝试了以下代码,但没能找到在 meta
参数中包含所需列(如 creation_date、expiration_date 等)的正确语法。
def main():
    """Flatten Extract1.json and write one CSV row per MatchResponse element."""
    data = read_json(filename='Extract1.json')
    # meta=['key_with_document'] keeps the whole parent dict in a single
    # column.  To surface keys like creation_date / expiration_date as
    # their own columns, each meta entry must be a full path-list, e.g.
    # ['key_with_document', 'document', 'creation_date'].
    df2 = pd.json_normalize(
        data,
        record_path=['key_with_document', ['document', 'struct', 'MatchResponse']],
        meta=['key_with_document'],
    )
    print(df2)
    df2.to_csv('out2.csv')


if __name__ == '__main__':
    main()
我的输出看起来像这样,其中 keys_with_document
部分全部在 1 列中,但我希望键位于单独的列中
这似乎符合您的要求。请注意,我忽略了 dataRefs
,因为那似乎是另一个列表。您可以扩展它以吸收其中的元素 [0]。
# Flatten the nested sample JSON into my.csv: one output row per
# MatchResponse element, with the parent document/struct fields repeated
# on every row.  (dataRefs is ignored here because it is another list.)
data="""[
{
"aggregate_result": [],
"explain": "",
"key_with_document": [
{
"document": {
"creation_date": 1643342434,
"expiration_date": 2053342527,
"modification_date": 1643342527,
"revision": 4,
"struct": {
"MatchResponse": [
{
"adRefId": "e6040-c8dcdb165993",
"addrRefId": "city_list:0",
"MatchCode": "REGI_ADDR_BLOCK",
"maxScore": 0.9968223809704663
},
{
"adRefId": "800-3c7a04dc8d3f",
"addrRefId": "address_list:0",
"MatchCode": "_ADDR_BLOCK",
"maxScore": 0
},
{
"adRefId": "ab39f31d-6b21-4377-9c91-85fdd345c22a",
"addrRefId": "name_block_list:0",
"MatchCode": "ADDR_BLOCK",
"maxScore": 0
}
],
"MatchStatus": 200,
"dataRefs": [
{
"addressRef": {
"addrRefId": "0",
"addrType": "REGISTRATION_ADDRESS",
"addressLine1": "123 Test Street",
"addressLine2": "",
"city": "",
"country": "Federation",
"postalCode": "12345",
"province": ""
},
"dataId": "0"
}
],
"docType": "_SCREEN",
"extRefId1": "b326c63721536765412099",
"extRefId1Type": "",
"extRefId2": "",
"extRefId2Type": "_SETTINGS",
"ules": [
"1213395"
],
"Status": [
"20"
]
}
},
"key": {
"id": [
{
"collection": "__ROOT__",
"string": "3721536765412099_E"
}
],
"is_partial": false
}
}
]
}
]"""

import json
import csv

data = json.loads(data)

# Column groups, keyed by where each value lives in the JSON tree.
fixed = [            # scalar fields on each document
    "creation_date",
    "expiration_date",
    "modification_date",
    "revision",
]
fromstruct = [       # fields on document['struct']
    "docType",
    "extRefId1",
    "extRefId1Type",
    "extRefId2",
    "extRefId2Type",
    "ules",
    "Status",
]
fromresponse = [     # fields on each MatchResponse element
    "adRefId",
    "addrRefId",
    "MatchCode",
    "maxScore",
]
allfields = fixed + fromstruct + fromresponse

# The original leaked the handle returned by open(); `with` guarantees the
# CSV is flushed and closed even if writing a row raises.
with open('my.csv', 'w', newline='') as handle:
    fout = csv.DictWriter(handle, fieldnames=allfields)
    fout.writeheader()
    for obj in data:
        for entry in obj['key_with_document']:
            doc = entry['document']
            struct = doc['struct']
            row = {name: doc[name] for name in fixed}
            for name in fromstruct:
                value = struct[name]
                # single-element lists (ules, Status) collapse to item [0]
                row[name] = value[0] if isinstance(value, list) else value
            # one output row per MatchResponse element; parent fields repeat
            for resp in struct['MatchResponse']:
                for name in fromresponse:
                    row[name] = resp[name]
                fout.writerow(row)
输出 CSV 文件:
creation_date,expiration_date,modification_date,revision,docType,extRefId1,extRefId1Type,extRefId2,extRefId2Type,ules,Status,adRefId,addrRefId,MatchCode,maxScore
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,e6040-c8dcdb165993,city_list:0,REGI_ADDR_BLOCK,0.9968223809704663
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,800-3c7a04dc8d3f,address_list:0,_ADDR_BLOCK,0
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,ab39f31d-6b21-4377-9c91-85fdd345c22a,name_block_list:0,ADDR_BLOCK,0
我设法使用 pandas
找出答案。这是我的替代方案:
def read_json(filename: str) -> dict:
    """Load *filename* and return its parsed JSON content.

    Raises:
        Exception: if the file cannot be read or is not valid JSON; the
            underlying error is chained as ``__cause__``.
    """
    try:
        with open(filename) as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as err:
        # The original bare `except:` swallowed everything (including
        # KeyboardInterrupt) and hid the real cause; catch only the two
        # expected failure modes, chain them, and report the filename.
        raise Exception(f"Reading {filename!r} encountered an error") from err
    return data
def main():
    """Flatten ExtractFile1.json into out3.csv: one row per MatchResponse item."""
    data = read_json(filename='ExtractFile1.json')
    # Every meta entry is a full path from the record root down to one
    # field; build the paths programmatically instead of the original
    # single unreadable inline literal.
    doc_path = ['key_with_document', 'document']
    struct_path = doc_path + ['struct']
    meta = [doc_path + [name] for name in (
        'creation_date', 'expiration_date', 'modification_date', 'revision',
    )] + [struct_path + [name] for name in (
        # 'ules' was misspelled 'Rul' in the original, which is not a key
        # in the data; dataRefs is itself a list and lands in one column.
        'MatchStatus', 'docType', 'extRefId1', 'extRefId1Type',
        'extRefId2', 'extRefId2Type', 'ules', 'Status', 'dataRefs',
    )]
    df3 = pd.json_normalize(
        data,
        record_path=['key_with_document', ['document', 'struct', 'MatchResponse']],
        meta=meta,
    )
    df3.to_csv('out3.csv')


if __name__ == '__main__':
    main()
如果我在 json 文件中有这样一个 dictionary/json 的列表,我如何使用 python 脚本或除手动之外的任何其他方式将其转换为 csv。
我这里的headers将把它展平,每个键都有一个值作为一列。这里的数组Response
,我想让这里的每个元素都是一个单独的行,上面的所有数据都和单独的列一样。因此,例如,如果下面的 Response
数组有 3 个项目,那么列表中应该有 3 行项目,如 adRefId
、addrRefId
等,上面和下面的字段相同数组即 creation_date
、expiration_date
、modification_date
、revision
、adRefId
、addrRefId
、doc_type
等。
[
{
"aggregate_result": [],
"explain": "",
"key_with_document": [
{
"document": {
"creation_date": 1643342434,
"expiration_date": 2053342527,
"modification_date": 1643342527,
"revision": 4,
"struct": {
"MatchResponse": [
{
"adRefId": "e6040-c8dcdb165993",
"addrRefId": "city_list:0",
"MatchCode": "REGI_ADDR_BLOCK",
"maxScore": 0.9968223809704663
},
{
"adRefId": "800-3c7a04dc8d3f",
"addrRefId": "address_list:0",
"MatchCode": "_ADDR_BLOCK",
"maxScore": 0
},
{
"adRefId": "ab39f31d-6b21-4377-9c91-85fdd345c22a",
"addrRefId": "name_block_list:0",
"MatchCode": "ADDR_BLOCK",
"maxScore": 0
}
],
"MatchStatus": 200,
"dataRefs": [
{
"addressRef": {
"addrRefId": "0",
"addrType": "REGISTRATION_ADDRESS",
"addressLine1": "123 Test Street",
"addressLine2": "",
"city": "",
"country": "Federation",
"postalCode": "12345",
"province": ""
},
"dataId": "0"
}
],
"docType": "_SCREEN",
"extRefId1": "b326c63721536765412099",
"extRefId1Type": "",
"extRefId2": "",
"extRefId2Type": "_SETTINGS",
"ules": [
"1213395"
],
"Status": [
"20"
]
}
},
"key": {
"id": [
{
"collection": "__ROOT__",
"string": "3721536765412099_E"
}
],
"is_partial": false
}
}
]
}
]
我尝试了以下代码,但没能找到在 meta
参数中包含所需列(如 creation_date、expiration_date 等)的正确语法。
def main():
    """Flatten Extract1.json and write one CSV row per MatchResponse element."""
    data = read_json(filename='Extract1.json')
    # meta=['key_with_document'] keeps the whole parent dict in a single
    # column.  To surface keys like creation_date / expiration_date as
    # their own columns, each meta entry must be a full path-list, e.g.
    # ['key_with_document', 'document', 'creation_date'].
    df2 = pd.json_normalize(
        data,
        record_path=['key_with_document', ['document', 'struct', 'MatchResponse']],
        meta=['key_with_document'],
    )
    print(df2)
    df2.to_csv('out2.csv')


if __name__ == '__main__':
    main()
我的输出看起来像这样,其中 keys_with_document
部分全部在 1 列中,但我希望键位于单独的列中
这似乎符合您的要求。请注意,我忽略了 dataRefs
,因为那似乎是另一个列表。您可以扩展它以吸收其中的元素 [0]。
# Flatten the nested sample JSON into my.csv: one output row per
# MatchResponse element, with the parent document/struct fields repeated
# on every row.  (dataRefs is ignored here because it is another list.)
data="""[
{
"aggregate_result": [],
"explain": "",
"key_with_document": [
{
"document": {
"creation_date": 1643342434,
"expiration_date": 2053342527,
"modification_date": 1643342527,
"revision": 4,
"struct": {
"MatchResponse": [
{
"adRefId": "e6040-c8dcdb165993",
"addrRefId": "city_list:0",
"MatchCode": "REGI_ADDR_BLOCK",
"maxScore": 0.9968223809704663
},
{
"adRefId": "800-3c7a04dc8d3f",
"addrRefId": "address_list:0",
"MatchCode": "_ADDR_BLOCK",
"maxScore": 0
},
{
"adRefId": "ab39f31d-6b21-4377-9c91-85fdd345c22a",
"addrRefId": "name_block_list:0",
"MatchCode": "ADDR_BLOCK",
"maxScore": 0
}
],
"MatchStatus": 200,
"dataRefs": [
{
"addressRef": {
"addrRefId": "0",
"addrType": "REGISTRATION_ADDRESS",
"addressLine1": "123 Test Street",
"addressLine2": "",
"city": "",
"country": "Federation",
"postalCode": "12345",
"province": ""
},
"dataId": "0"
}
],
"docType": "_SCREEN",
"extRefId1": "b326c63721536765412099",
"extRefId1Type": "",
"extRefId2": "",
"extRefId2Type": "_SETTINGS",
"ules": [
"1213395"
],
"Status": [
"20"
]
}
},
"key": {
"id": [
{
"collection": "__ROOT__",
"string": "3721536765412099_E"
}
],
"is_partial": false
}
}
]
}
]"""

import json
import csv

data = json.loads(data)

# Column groups, keyed by where each value lives in the JSON tree.
fixed = [            # scalar fields on each document
    "creation_date",
    "expiration_date",
    "modification_date",
    "revision",
]
fromstruct = [       # fields on document['struct']
    "docType",
    "extRefId1",
    "extRefId1Type",
    "extRefId2",
    "extRefId2Type",
    "ules",
    "Status",
]
fromresponse = [     # fields on each MatchResponse element
    "adRefId",
    "addrRefId",
    "MatchCode",
    "maxScore",
]
allfields = fixed + fromstruct + fromresponse

# The original leaked the handle returned by open(); `with` guarantees the
# CSV is flushed and closed even if writing a row raises.
with open('my.csv', 'w', newline='') as handle:
    fout = csv.DictWriter(handle, fieldnames=allfields)
    fout.writeheader()
    for obj in data:
        for entry in obj['key_with_document']:
            doc = entry['document']
            struct = doc['struct']
            row = {name: doc[name] for name in fixed}
            for name in fromstruct:
                value = struct[name]
                # single-element lists (ules, Status) collapse to item [0]
                row[name] = value[0] if isinstance(value, list) else value
            # one output row per MatchResponse element; parent fields repeat
            for resp in struct['MatchResponse']:
                for name in fromresponse:
                    row[name] = resp[name]
                fout.writerow(row)
输出 CSV 文件:
creation_date,expiration_date,modification_date,revision,docType,extRefId1,extRefId1Type,extRefId2,extRefId2Type,ules,Status,adRefId,addrRefId,MatchCode,maxScore
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,e6040-c8dcdb165993,city_list:0,REGI_ADDR_BLOCK,0.9968223809704663
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,800-3c7a04dc8d3f,address_list:0,_ADDR_BLOCK,0
1643342434,2053342527,1643342527,4,_SCREEN,b326c63721536765412099,,,_SETTINGS,1213395,20,ab39f31d-6b21-4377-9c91-85fdd345c22a,name_block_list:0,ADDR_BLOCK,0
我设法使用 pandas
找出答案。这是我的替代方案:
def read_json(filename: str) -> dict:
    """Load *filename* and return its parsed JSON content.

    Raises:
        Exception: if the file cannot be read or is not valid JSON; the
            underlying error is chained as ``__cause__``.
    """
    try:
        with open(filename) as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as err:
        # The original bare `except:` swallowed everything (including
        # KeyboardInterrupt) and hid the real cause; catch only the two
        # expected failure modes, chain them, and report the filename.
        raise Exception(f"Reading {filename!r} encountered an error") from err
    return data
def main():
    """Flatten ExtractFile1.json into out3.csv: one row per MatchResponse item."""
    data = read_json(filename='ExtractFile1.json')
    # Every meta entry is a full path from the record root down to one
    # field; build the paths programmatically instead of the original
    # single unreadable inline literal.
    doc_path = ['key_with_document', 'document']
    struct_path = doc_path + ['struct']
    meta = [doc_path + [name] for name in (
        'creation_date', 'expiration_date', 'modification_date', 'revision',
    )] + [struct_path + [name] for name in (
        # 'ules' was misspelled 'Rul' in the original, which is not a key
        # in the data; dataRefs is itself a list and lands in one column.
        'MatchStatus', 'docType', 'extRefId1', 'extRefId1Type',
        'extRefId2', 'extRefId2Type', 'ules', 'Status', 'dataRefs',
    )]
    df3 = pd.json_normalize(
        data,
        record_path=['key_with_document', ['document', 'struct', 'MatchResponse']],
        meta=meta,
    )
    df3.to_csv('out3.csv')


if __name__ == '__main__':
    main()