解析深度嵌套 JSON 文件

Parse deeply nested JSON file

我正在努力通过 json_normalize 获取我需要的信息。我查看了文档以及大约 10 个深度嵌套 JSON 文件的示例,但仍然无法很好地理解该函数的用法,因而提取不到正确的信息。我想构建一个数据框,其中包含时间戳以及每个传感器对应的值(value 键)。1534023900 是以秒为单位的 UTC 时间戳。

enter image description here

JSON 的简短示例如下。

有什么想法吗?

{
    "created": "2020-05-12T15:10:37Z",
    "device": {
        "device_info": {
            "device_fw": 204,
            "device_sn": "06-02133",
            "device_trait": 2,
            "device_type": 190
        },
        "timeseries": [
            {
                "configuration": {
                    "sensors": [
                        {
                            "measurements": [
                                "BATTERY",
                                "BATTERY_MV"
                            ],
                            "port": 7,
                            "sensor_bonus_value": "Unavailable",
                            "sensor_firmware_ver": "Unavailable",
                            "sensor_number": 133,
                            "sensor_sn": "Unavailable"
                        },
                        {
                            "measurements": [
                                "REFERENCE_KPA",
                                "TEMPC_LOGGER"
                            ],
                            "port": 8,
                            "sensor_bonus_value": "Unavailable",
                            "sensor_firmware_ver": "Unavailable",
                            "sensor_number": 134,
                            "sensor_sn": "Unavailable"
                        }
                    ],
                    "valid_since": "2018-08-11T21:45:00Z",
                    "values": [
                        [
                            1534023900,
                            0,
                            19,
                            [
                                {
                                    "description": "Battery Percent",
                                    "error": false,
                                    "units": "%",
                                    "value": 100
                                },
                                {
                                    "description": "Battery Voltage",
                                    "error": false,
                                    "units": " mV",
                                    "value": 7864
                                }
                            ],
                            [
                                {
                                    "description": "Reference Pressure",
                                    "error": false,
                                    "units": " kPa",
                                    "value": 100.62
                                },
                                {
                                    "description": "Logger Temperature",
                                    "error": false,
                                    "units": " \u00b0C",
                                    "value": 28.34
                                }
                            ]
                        ]
                    ]
                }
            }
        ]
    }
}

已修订 JSON

{
    "created": "2020-05-12T15:10:37Z",
    "device": {
        "device_info": {
            "device_fw": 204,
            "device_sn": "06-02133",
            "device_trait": 2,
            "device_type": 190
        },
        "timeseries": [
            {
                "configuration": {
                    "sensors": [
                        {
                            "measurements": [
                                "BATTERY",
                                "BATTERY_MV"
                            ],
                            "port": 7,
                            "sensor_bonus_value": "Unavailable",
                            "sensor_firmware_ver": "Unavailable",
                            "sensor_number": 133,
                            "sensor_sn": "Unavailable"
                        },
                        {
                            "measurements": [
                                "REFERENCE_KPA",
                                "TEMPC_LOGGER"
                            ],
                            "port": 8,
                            "sensor_bonus_value": "Unavailable",
                            "sensor_firmware_ver": "Unavailable",
                            "sensor_number": 134,
                            "sensor_sn": "Unavailable"
                        }
                    ],
                    "valid_since": "2018-08-11T21:45:00Z",
                    "values": [
                        [
                            1534023900,
                            0,
                            19,
                            [
                                {
                                    "description": "Battery Percent",
                                    "error": false,
                                    "units": "%",
                                    "value": 100
                                },
                                {
                                    "description": "Battery Voltage",
                                    "error": false,
                                    "units": " mV",
                                    "value": 7864
                                }
                            ],
                            [
                                {
                                    "description": "Reference Pressure",
                                    "error": false,
                                    "units": " kPa",
                                    "value": 100.62
                                },
                                {
                                    "description": "Logger Temperature",
                                    "error": false,
                                    "units": " \u00b0C",
                                    "value": 28.34
                                }
                            ]
                        ]
                            ]
                    }
                }]
            }
}       

jmespath 可以帮助处理嵌套数据:它的文档非常详尽,但访问数据的基本规则是:如果要访问的是一个键(key),并且它不是数据中的第一个条目,就用 . 来访问;如果是数组/列表(array/list),就使用 []。

您的数据位置摘要:device -> timeseries(dict)->[](array)->configuration(dict)->values(key)->[](array)->[0](array and get the first value)

实际代码:

# Extract the first entry of every row under device -> timeseries[] -> configuration -> values[].
import jmespath
# Compile the path expression once so it can be reused for several searches.
expression = jmespath.compile('device.timeseries[].configuration.values[][0]')
# NOTE(review): `data` is not defined in this snippet; presumably it is the sample JSON
# already parsed into a Python dict - confirm before running.
expression.search(data)
# Result shown for the sample JSON (the timestamp of its single row):
[1534023900]

我一直在处理类似的问题,但 json_normalize 对我帮助不大。不过,我编写了以下代码,它适用于大多数嵌套 JSON 的使用场景。这段代码仍在开发中,我正在用各种 JSON 文件对它进行测试。欢迎大家随时发表评论,或提出改进这部分代码的建议。

我的主要目标是保留 JSON 的结构并在数据框中提供它。

import json
import pandas as pd
import numpy as np

def flatten_outer(data):
    """Flatten a list of nested JSON-like records into a list of flat dicts.

    Each record (a dict) becomes one flat dict whose keys are the dotted
    paths of the original keys (``"device.device_info.device_fw"``).

    Rules applied while walking a record:

    * A scalar that is not inside any list is stored as-is.
    * A scalar found inside a list of dicts is stored in a list that has one
      slot per element of that list; elements lacking the key keep ``None``.
    * A list whose first non-dict element is reached is stored whole, wrapped
      in an outer list (one entry per time the key is visited); scanning of
      that list stops there.
    * An empty list produces no key at all.

    Parameters
    ----------
    data : iterable of dict
        The records to flatten.

    Returns
    -------
    list of dict
        One flat dict per input record, in input order.
    """
    full_list = []

    def flatten_inner(sub_data, first_level_key='', index=0, tot_len=0):
        """Recursively copy ``sub_data`` into the current ``value_list``.

        ``index`` / ``tot_len`` give the position and length of the closest
        enclosing list of dicts (both 0 when there is none).
        """
        for k, v in sub_data.items():
            full_key = first_level_key + '.' + k if first_level_key != '' else k
            if isinstance(v, dict):
                # Carry the enclosing list position down into nested dicts.
                # Dropping it stored the first element's value as a scalar
                # and made the second element fail on item assignment.
                flatten_inner(v, full_key, index=index, tot_len=tot_len)

            elif isinstance(v, list):
                for i, item in enumerate(v):
                    if isinstance(item, dict):
                        flatten_inner(item, full_key, index=i, tot_len=len(v))
                    else:
                        # Non-dict content: keep the whole list as one value
                        # and stop scanning this list.
                        value_list.setdefault(full_key, []).append(v)
                        break

            elif full_key in value_list:
                # Key already seen for an earlier list element: fill our slot.
                value_list[full_key][index] = v

            elif index == 0 and tot_len == 0:
                # Plain scalar outside of any list.
                value_list[full_key] = v

            else:
                # First sighting of this key inside a list of dicts: build a
                # None-padded list so values stay aligned with their element.
                value_list[full_key] = (
                    [None] * index + [v] + [None] * (tot_len - index - 1)
                )

        return value_list

    for row in data:
        # Fresh accumulator of key/value pairs (column values) for each
        # record; flatten_inner reaches it through the closure.
        value_list = dict()
        full_list.append(flatten_inner(row))

    return full_list

def df_create_clean(full_list):
    """Build a DataFrame from ``full_list`` and tidy its list-valued cells.

    After construction, null cells are passed through ``DataFrame.where`` with
    ``None`` as the replacement. Then, column by column, every empty list is
    turned into ``None`` and every one-element list is replaced by its single
    element. Other cells are left unchanged.

    Returns the cleaned DataFrame.
    """
    def _empty_list_to_none(cell):
        # An empty list carries no information; represent it as None.
        if isinstance(cell, list) and not cell:
            return None
        return cell

    def _unwrap_singleton(cell):
        # A one-element list is collapsed to the element itself.
        if isinstance(cell, list) and len(cell) == 1:
            return cell[0]
        return cell

    frame = pd.DataFrame(full_list)
    frame = frame.where(pd.notnull(frame), None)
    for name in frame.columns:
        frame[name] = frame[name].apply(_empty_list_to_none)
        frame[name] = frame[name].apply(_unwrap_singleton)
    return frame


def flatten_json(data):
    """Flatten nested JSON records into a cleaned pandas DataFrame.

    ``data`` is handed to ``flatten_outer``; its result is loaded into a
    DataFrame, which is then passed through ``df_create_clean`` and returned.
    """
    flat_records = flatten_outer(data)
    return df_create_clean(pd.DataFrame(flat_records))

将上述代码保存到文件 flatten_json.py 中,然后运行下面的代码。

import flatten_json as fj

# Sample payload, wrapped in a list because flatten_json is given a list of records.
data = [
    {
        "created": "2020-05-12T15:10:37Z",
        "device": {
            "device_info": {
                "device_fw": 204,
                "device_sn": "06-02133",
                "device_trait": 2,
                "device_type": 190
            },
            "timeseries": [
                {
                    "configuration": {
                        "sensors": [
                            {
                                "measurements": [
                                    "BATTERY",
                                    "BATTERY_MV"
                                ],
                                "port": 7,
                                "sensor_bonus_value": "Unavailable",
                                "sensor_firmware_ver": "Unavailable",
                                "sensor_number": 133,
                                "sensor_sn": "Unavailable"
                            },
                            {
                                "measurements": [
                                    "REFERENCE_KPA",
                                    "TEMPC_LOGGER"
                                ],
                                "port": 8,
                                "sensor_bonus_value": "Unavailable",
                                "sensor_firmware_ver": "Unavailable",
                                "sensor_number": 134,
                                "sensor_sn": "Unavailable"
                            }
                        ],
                        "valid_since": "2018-08-11T21:45:00Z",
                        "values": [
                            [
                                1534023900,
                                0,
                                19,
                                [
                                    {
                                        "description": "Battery Percent",
                                        "error": False,
                                        "units": "%",
                                        "value": 100
                                    },
                                    {
                                        "description": "Battery Voltage",
                                        "error": False,
                                        "units": " mV",
                                        "value": 7864
                                    }
                                ],
                                [
                                    {
                                        "description": "Reference Pressure",
                                        "error": False,
                                        "units": " kPa",
                                        "value": 100.62
                                    },
                                    {
                                        "description": "Logger Temperature",
                                        "error": False,
                                        "units": " \u00b0C",
                                        "value": 28.34
                                    }
                                ]
                            ]
                        ]
                    }
                }
            ]
        }
    }
]

# Flatten the records into a DataFrame with dotted column names.
df = fj.flatten_json(data)

# Show the first (and only) flattened record.
print(df.loc[0])

输出

created                                                                                     2020-05-12T15:10:37Z
device.device_info.device_fw                                                                                 204
device.device_info.device_sn                                                                            06-02133
device.device_info.device_trait                                                                                2
device.device_info.device_type                                                                               190
device.timeseries.configuration.sensors.measurements           [[BATTERY, BATTERY_MV], [REFERENCE_KPA, TEMPC_LOG...
device.timeseries.configuration.sensors.port                                                              [7, 8]
device.timeseries.configuration.sensors.sensor_bonus_value                            [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_firmware_ver                           [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_number                                                 [133, 134]
device.timeseries.configuration.sensors.sensor_sn                                     [Unavailable, Unavailable]
device.timeseries.configuration.valid_since                                                 2018-08-11T21:45:00Z
device.timeseries.configuration.values                         [1534023900, 0, 19, [{'description': 'Battery ...

现在从这个 df 您可以使用 device.timeseries.configuration.values 列对每个传感器数据进行进一步分析。