如何将这样的嵌套 JSON 转换为 Data-frame?我尝试使用 pandas json_normalize 但仍然没有得到正确的 Data-frame

How to convert nested JSON like this to a Data-frame? I tried using pandas json_normalize but still doesn't get a proper Data-frame

我正在尝试从这个 JSON 中创建一个 DataFrame,它包含三个键,即 Header、列和行。 问题是 Rows 包含很多嵌套,甚至熊猫的 json_normalize 也无法由此创建有意义的 DataFrame。

这里是 Json:

{'Header': {'Time': '2021-10-08T05:08:48-07:00',
      'ReportName': 'ProfitAndLoss',
      'DateMacro': 'this calendar year-to-date',
      'ReportBasis': 'Accrual',
      'StartPeriod': '2021-01-01',
      'EndPeriod': '2021-10-08',
      'SummarizeColumnsBy': 'Total',
      'Currency': 'USD',
      'Option': [{'Name': 'AccountingStandard', 'Value': 'GAAP'},
       {'Name': 'NoReportData', 'Value': 'false'}]},
     'Columns': {'Column': [{'ColTitle': '',
        'ColType': 'Account',
        'MetaData': [{'Name': 'ColKey', 'Value': 'account'}]},
       {'ColTitle': 'Total',
        'ColType': 'Money',
        'MetaData': [{'Name': 'ColKey', 'Value': 'total'}]}]},
     'Rows': {'Row': [{'Header': {'ColData': [{'value': 'Income'}, {'value': ''}]},
        'Rows': {'Row': [{'ColData': [{'value': 'Design income', 'id': '82'},
            {'value': '2250.00'}],
           'type': 'Data'},
          {'ColData': [{'value': 'Discounts given', 'id': '86'},
            {'value': '-89.50'}],
           'type': 'Data'},
          {'Header': {'ColData': [{'value': 'Landscaping Services', 'id': '45'},
             {'value': '1477.50'}]},
           'Rows': {'Row': [{'Header': {'ColData': [{'value': 'Job Materials',
                 'id': '46'},
                {'value': ''}]},
              'Rows': {'Row': [{'ColData': [{'value': 'Fountains and Garden Lighting',
                   'id': '48'},
                  {'value': '2246.50'}],
                 'type': 'Data'},
                {'ColData': [{'value': 'Plants and Soil', 'id': '49'},
                  {'value': '2351.97'}],
                 'type': 'Data'},
                {'ColData': [{'value': 'Sprinklers and Drip Systems', 'id': '50'},
                  {'value': '138.00'}],
                 'type': 'Data'}]},
              'Summary': {'ColData': [{'value': 'Total Job Materials'},
                {'value': '4736.47'}]},
              'type': 'Section'},
             {'Header': {'ColData': [{'value': 'Labor', 'id': '51'},
                {'value': ''}]},
              'Rows': {'Row': [{'ColData': [{'value': 'Installation', 'id': '52'},
                  {'value': '250.00'}],
                 'type': 'Data'},
                {'ColData': [{'value': 'Maintenance and Repair', 'id': '53'},
                  {'value': '50.00'}],
                 'type': 'Data'}]},
              'Summary': {'ColData': [{'value': 'Total Labor'},
                {'value': '300.00'}]},
              'type': 'Section'}]},
           'Summary': {'ColData': [{'value': 'Total Landscaping Services'},
             {'value': '6513.97'}]},
           'type': 'Section'},
          {'ColData': [{'value': 'Pest Control Services', 'id': '54'},
            {'value': '110.00'}],
           'type': 'Data'},
          {'ColData': [{'value': 'Sales of Product Income', 'id': '79'},
            {'value': '912.75'}],
           'type': 'Data'},
          {'ColData': [{'value': 'Services', 'id': '1'}, {'value': '503.55'}],
           'type': 'Data'}]},
        'Summary': {'ColData': [{'value': 'Total Income'}, {'value': '10200.77'}]},
        'type': 'Section',
        'group': 'Income'},
       {'Header': {'ColData': [{'value': 'Cost of Goods Sold'}, {'value': ''}]},
        'Rows': {'Row': [{'ColData': [{'value': 'Cost of Goods Sold', 'id': '80'},
            {'value': '405.00'}],
           'type': 'Data'}]},
        'Summary': {'ColData': [{'value': 'Total Cost of Goods Sold'},
          {'value': '405.00'}]},
        'type': 'Section',
        'group': 'COGS'},
       {'Summary': {'ColData': [{'value': 'Gross Profit'}, {'value': '9795.77'}]},
        'type': 'Section',
        'group': 'GrossProfit'},
       {'Header': {'ColData': [{'value': 'Expenses'}, {'value': ''}]},
        'Rows': {'Row': [{'ColData': [{'value': 'Advertising', 'id': '7'},
            {'value': '74.86'}],
           'type': 'Data'},
          {'Header': {'ColData': [{'value': 'Automobile', 'id': '55'},
             {'value': '113.96'}]},
           'Rows': {'Row': [{'ColData': [{'value': 'Fuel', 'id': '56'},
               {'value': '349.41'}],
              'type': 'Data'}]},
           'Summary': {'ColData': [{'value': 'Total Automobile'},
             {'value': '463.37'}]},
           'type': 'Section'},
          {'ColData': [{'value': 'Equipment Rental', 'id': '29'},
            {'value': '112.00'}],
           'type': 'Data'},
          {'ColData': [{'value': 'Insurance', 'id': '11'}, {'value': '241.23'}],
           'type': 'Data'},
          {'Header': {'ColData': [{'value': 'Job Expenses', 'id': '58'},
             {'value': '155.07'}]},
           'Rows': {'Row': [{'Header': {'ColData': [{'value': 'Job Materials',
                 'id': '63'},
                {'value': ''}]},
              'Rows': {'Row': [{'ColData': [{'value': 'Decks and Patios',
                   'id': '64'},
                  {'value': '234.04'}],
                 'type': 'Data'},
                {'ColData': [{'value': 'Plants and Soil', 'id': '66'},
                  {'value': '353.12'}],
                 'type': 'Data'},
                {'ColData': [{'value': 'Sprinklers and Drip Systems', 'id': '67'},
                  {'value': '215.66'}],
                 'type': 'Data'}]},
              'Summary': {'ColData': [{'value': 'Total Job Materials'},
                {'value': '802.82'}]},
              'type': 'Section'}]},
           'Summary': {'ColData': [{'value': 'Total Job Expenses'},
             {'value': '957.89'}]},
           'type': 'Section'},
          {'Header': {'ColData': [{'value': 'Legal & Professional Fees',
              'id': '12'},
             {'value': '75.00'}]},
           'Rows': {'Row': [{'ColData': [{'value': 'Accounting', 'id': '69'},
               {'value': '640.00'}],
              'type': 'Data'},
             {'ColData': [{'value': 'Bookkeeper', 'id': '70'}, {'value': '55.00'}],
              'type': 'Data'},
             {'ColData': [{'value': 'Lawyer', 'id': '71'}, {'value': '400.00'}],
              'type': 'Data'}]},
           'Summary': {'ColData': [{'value': 'Total Legal & Professional Fees'},
             {'value': '1170.00'}]},
           'type': 'Section'},
          {'Header': {'ColData': [{'value': 'Maintenance and Repair', 'id': '72'},
             {'value': '185.00'}]},
           'Rows': {'Row': [{'ColData': [{'value': 'Equipment Repairs',
                'id': '75'},
               {'value': '755.00'}],
              'type': 'Data'}]},
           'Summary': {'ColData': [{'value': 'Total Maintenance and Repair'},
             {'value': '940.00'}]},
           'type': 'Section'},
          {'ColData': [{'value': 'Meals and Entertainment', 'id': '13'},
            {'value': '28.49'}],
           'type': 'Data'},
          {'ColData': [{'value': 'Office Expenses', 'id': '15'},
            {'value': '18.08'}],
           'type': 'Data'},
          {'ColData': [{'value': 'Rent or Lease', 'id': '17'},
            {'value': '900.00'}],
           'type': 'Data'},
          {'Header': {'ColData': [{'value': 'Utilities', 'id': '24'},
             {'value': ''}]},
           'Rows': {'Row': [{'ColData': [{'value': 'Gas and Electric', 'id': '76'},
               {'value': '200.53'}],
              'type': 'Data'},
             {'ColData': [{'value': 'Telephone', 'id': '77'}, {'value': '130.86'}],
              'type': 'Data'}]},
           'Summary': {'ColData': [{'value': 'Total Utilities'},
             {'value': '331.39'}]},
           'type': 'Section'}]},
        'Summary': {'ColData': [{'value': 'Total Expenses'},
          {'value': '5237.31'}]},
        'type': 'Section',
        'group': 'Expenses'},
       {'Summary': {'ColData': [{'value': 'Net Operating Income'},
          {'value': '4558.46'}]},
        'type': 'Section',
        'group': 'NetOperatingIncome'},
       {'Header': {'ColData': [{'value': 'Other Expenses'}, {'value': ''}]},
        'Rows': {'Row': [{'ColData': [{'value': 'Miscellaneous', 'id': '14'},
            {'value': '2916.00'}],
           'type': 'Data'}]},
        'Summary': {'ColData': [{'value': 'Total Other Expenses'},
          {'value': '2916.00'}]},
        'type': 'Section',
        'group': 'OtherExpenses'},
       {'Summary': {'ColData': [{'value': 'Net Other Income'},
          {'value': '-2916.00'}]},
        'type': 'Section',
        'group': 'NetOtherIncome'},
       {'Summary': {'ColData': [{'value': 'Net Income'}, {'value': '1642.46'}]},
        'type': 'Section',
        'group': 'NetIncome'}]}}

我从 Quickbook 'profit and loss' API 中获取了这些数据。 'Rows' 包含一个键 'Row',它进一步包含 DataFrame 行的所有数据。 每个分支都包含一个 Header,它还包含一个表示新列标题的值。 非常感谢任何帮助。

尝试flatten_json。它适用于嵌套 json。但是,您的 json 非常嵌套,并不真正适合数据框。设置您的 json = 数据和 运行 下面的代码。 .T 转置数据帧。也许您可以通过这种方式理解数据。否则,您将首先处理 json 对象,然后创建数据框。

from flatten_json import flatten
dic_flattened = (flatten(d, '.') for d in data['Rows']['Row'])
df = pd.DataFrame(dic_flattened)
df.fillna('')  ###or ??? df.fillna('').T

  Header.ColData.0.value Header.ColData.1.value Rows.Row.0.ColData.0.value Rows.Row.0.ColData.0.id  ... Rows.Row.10.Rows.Row.1.type Rows.Row.10.Summary.ColData.0.value Rows.Row.10.Summary.ColData.1.value Rows.Row.10.type
0                 Income                                     Design income                      82  ...                         NaN                                 NaN                                 NaN              NaN
1     Cost of Goods Sold                                Cost of Goods Sold                      80  ...                         NaN                                 NaN                                 NaN              NaN
2                    NaN                    NaN                        NaN                     NaN  ...                         NaN                                 NaN                                 NaN              NaN
3               Expenses                                       Advertising                       7  ...                        Data                     Total Utilities                              331.39          Section
4                    NaN                    NaN                        NaN                     NaN  ...                         NaN                                 NaN                                 NaN              NaN
5         Other Expenses                                     Miscellaneous                      14  ...                         NaN                                 NaN                                 NaN              NaN
6                    NaN                    NaN                        NaN                     NaN  ...                         NaN                                 NaN                                 NaN              NaN
7                    NaN                    NaN                        NaN                     NaN  ...                         NaN                                 NaN                                 NaN              NaN

[8 rows x 152 columns]

并转置

                                                0                   1 2             

    3 4                5 6  7
Header.ColData.0.value                         Income  Cost of Goods Sold            Expenses     Other Expenses
Header.ColData.1.value
Rows.Row.0.ColData.0.value              Design income  Cost of Goods Sold         Advertising      Miscellaneous
Rows.Row.0.ColData.0.id                            82                  80                   7                 14
Rows.Row.0.ColData.1.value                     2250.0               405.0               74.86             2916.0
...                                               ...                 ... ..              ... ..             ... .. ..
Rows.Row.10.Rows.Row.1.ColData.1.value                                                 130.86
Rows.Row.10.Rows.Row.1.type                                                              Data
Rows.Row.10.Summary.ColData.0.value                                           Total Utilities
Rows.Row.10.Summary.ColData.1.value                                                    331.39
Rows.Row.10.type                                                                      Section

[152 rows x 8 columns]

我在回答我自己的问题,所以这个 JSON 是高度嵌套的,无法使用 flatten_json 或 json_normalize 将其展平成有意义的数据框,所以我编写了专门为 Quickbook 报告 APIs 创建的脚本。 这将把这个嵌套的 JSON 作为参数并从中创建一个数据框。任何高度嵌套的 Quickbook 报告 API 都适用于此。

def master(data):
    
    """
    Creates Dataframe using Json received from API
    
    Args:
    data(dictionary)  :  Json response from API
    
    Return:
    Dataframe of data inserted
    
    Example:
    Dataframe = master(data_dict_or_Json)
    
    """
    
    ############### Initial values ###############
    level = 0

    headingdict = {}

    maxlvl, crlist, valuelist, colHeaders, headingdict['Headers'] = [],[],[],[],[]

    current = data['Header']['ReportName']

    for i in data["Columns"]["Column"]:
        colHeaders.append(i["ColTitle"])
    try:
        raw_data = data['Rows']['Row']
    except:
        print('No data Found in {} API'.format(current))
        return
    ############### Initial values ###############

    def header_store(l, r=data['Header']['ReportName']):
        
        "Creates a list of headers for each final value found in Json"
        
        for i in l:
            r += '$' + i
        headingdict['Headers'].append(r)

    def value_extract(Json, valuelist, current, crlist, maxlvl, level):
        
        "Extracts all final values found in nested Json and passes its headers to header_store function"
        
        for i in range(len(Json)):
            if 'Header' in Json[i]:
                current = Json[i]['Header']['ColData'][0]['value']
                new = Json[i]['Rows']['Row']
                crlist.append(current)

                value_extract(new, valuelist, current, crlist, maxlvl, level+1)
                current = Json[i]['Header']['ColData'][0]['value']

            if 'ColData' in Json[i]:
                header_store(crlist)
                valuelist.append(Json[i]['ColData'])
                maxlvl.append(level)

            if i == len(Json)-1:
                try:
                    crlist.pop()
                except:
                    pass

        return valuelist,level-1

    value_extract(raw_data, valuelist, current, crlist,maxlvl,level)


    def get_values_df(valuelist,colHeaders,df={}):
        
        "Creates a data frame from all the values found using value_extract function"
        
        for i in range(len(valuelist)):
            for j in range(len(valuelist[i])):
                if colHeaders[j] not in df:
                    df[colHeaders[j]]=[]
                df[colHeaders[j]].append(valuelist[i][j]['value'])
        responseDf = pd.DataFrame(df)
        return responseDf

    def seperator(indict,lvl,outdict = []):
        
        "Creates a list of lists by splitting each list using $, and adding spaces for hierarchical representation"
        
        for i in indict:
            i = i.split('$')
            if len(i) <= lvl:
                for j in range((lvl)-len(i)):
                    i.append(' ')
                outdict.append(i)
        return outdict

    def heading_lvls(maxlvl,columns=['Form',]):
        
        "Creates Headers column"
        
        for i in range(maxlvl):
            head = 'Header'+'-'+'{}'.format(i+1)
            columns.append(head)
        return columns

    newdf = pd.DataFrame(seperator(headingdict['Headers'],max(maxlvl)+1),columns = heading_lvls(max(maxlvl)))

    result = pd.concat([newdf, get_values_df(valuelist,colHeaders)], axis=1)
    
    return result