解析深度嵌套 JSON 文件
Parse deeply nested JSON file
我正在尝试通过 json_normalize 获取我需要的信息。我查看了文档以及大约 10 个深度嵌套 JSON 文件的示例,但仍然无法很好地理解该函数的用法,因而提取不出正确的信息。我想构建一个数据框,其中包含每个传感器带时间戳的值(values 键)。1534023900 是以秒为单位的 UTC 时间戳。
enter image description here
JSON 的简短示例如下。
有什么想法吗?
{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": false,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": false,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": false,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": false,
"units": " \u00b0C",
"value": 28.34
}
]
]
}
}
}
}
}
}
已修订 JSON
{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": false,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": false,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": false,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": false,
"units": " \u00b0C",
"value": 28.34
}
]
]
]
}
}]
}
}
jmespath 可以帮助处理嵌套数据:它的文档非常详尽,但访问数据的基本规则是:如果是一个键,且不是数据中的第一个条目,就用 `.` 来访问;
如果是 array/list,就用 `[]`。
您的数据位置摘要:device -> timeseries(dict)->[](array)->configuration(dict)->values(key)->[](array)->[0](array and get the first value)
实际代码:
import jmespath
# Compile the path once so it can be reused: follow device -> timeseries,
# project over the timeseries entries, then take element 0 of each entry
# under configuration.values (the timestamp in the sample document).
expression = jmespath.compile('device.timeseries[].configuration.values[][0]')
# NOTE(review): `data` is not defined in this snippet; presumably it is the
# parsed JSON document shown above.
expression.search(data)
[1534023900]
我一直在处理类似的问题,但 json_normalize 对我帮助不大。不过,我编写了以下代码,它适用于大多数嵌套 JSON 用例。这段代码仍处于开发阶段,我还在用各种 JSON 文件对它进行测试。
任何人,请随时发表评论或提供任何改进这部分的建议。
我的主要目标是保留 JSON 的结构并在数据框中提供它。
import json
import pandas as pd
import numpy as np
def flatten_outer(data):
    """Flatten nested JSON records into single-level dicts with dotted keys.

    Args:
        data: An iterable of JSON objects (dicts), one per record. A single
            dict is also accepted and treated as a one-record batch.

    Returns:
        A list with one flat dict per record. Keys are the dotted paths of
        the nested keys (list positions are not part of the path). Values:

        * a scalar reached only through dicts is stored as-is;
        * a scalar found in the dicts of a list is stored in a list as long
          as that list, at the position of the dict it came from; positions
          whose dict lacks the key hold ``None``;
        * a list whose first non-dict element is hit during iteration is
          stored whole, appended to a list under its key.

    Notes:
        An empty list produces no key at all, because the element loop never
        runs. A dict nested inside the elements of a list is flattened with
        the list position reset, so if the same dotted key is produced by
        more than one element the index assignment below fails on the scalar
        stored first. Both behaviours are kept unchanged from the original.
    """
    # Iterating a dict would walk its keys (strings) and fail on ``.items()``;
    # wrap a lone record so the common ``json.load`` result works directly.
    if isinstance(data, dict):
        data = [data]

    def flatten_inner(sub_data, value_list, first_level_key='', index=0, tot_len=0):
        # ``index``/``tot_len`` describe the position of ``sub_data`` inside
        # the list it was taken from; ``tot_len == 0`` means "not in a list".
        for k, v in sub_data.items():
            full_key = first_level_key + '.' + k if first_level_key != '' else k
            if isinstance(v, dict):
                # Nested object: recurse with the list position reset.
                flatten_inner(v, value_list, full_key)
            elif isinstance(v, list):
                for i, item in enumerate(v):
                    if isinstance(item, dict):
                        flatten_inner(item, value_list, full_key,
                                      index=i, tot_len=len(v))
                    else:
                        # Not a list of objects: keep the whole list as one
                        # value and stop scanning it.
                        value_list.setdefault(full_key, []).append(v)
                        break
            elif full_key in value_list:
                # Key already seen for an earlier element of the same list:
                # fill this element's slot.
                value_list[full_key][index] = v
            elif tot_len == 0:
                value_list[full_key] = v
            else:
                # First sighting inside a list: reserve one slot per element
                # so that later elements can be written by position.
                slots = [None] * tot_len
                slots[index] = v
                value_list[full_key] = slots
        return value_list

    # Each record gets its own fresh accumulator dict.
    return [flatten_inner(row, {}) for row in data]
def df_create_clean(full_list):
    """Build a DataFrame from flattened records and tidy list-valued cells.

    Args:
        full_list: Flattened records (a list of flat dicts) or any other
            input accepted by ``pd.DataFrame``.

    Returns:
        A ``pd.DataFrame`` in which null cells have been replaced through
        ``DataFrame.where(..., None)``, empty lists have become ``None`` and
        one-element lists have been replaced by their only element. Longer
        lists and non-list values are left untouched.
    """
    def _tidy(cell):
        # Only lists are rewritten; everything else passes through unchanged.
        if isinstance(cell, list):
            if len(cell) == 0:
                return None
            if len(cell) == 1:
                return cell[0]
        return cell

    df = pd.DataFrame(full_list)
    df = df.where(pd.notnull(df), None)
    for col in df.columns:
        df[col] = df[col].apply(_tidy)
    return df
def flatten_json(data):
    """Flatten nested JSON records and return them as a cleaned DataFrame.

    Args:
        data: The records to flatten; passed unchanged to ``flatten_outer``.

    Returns:
        The ``pd.DataFrame`` produced by ``df_create_clean`` from the
        flattened records.
    """
    # df_create_clean builds the DataFrame itself, so the flat records are
    # handed over directly instead of being wrapped in a DataFrame twice.
    return df_create_clean(flatten_outer(data))
将上述代码保存到文件 flatten_json.py 中,然后运行下面的代码。
import flatten_json as fj
data = [{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": False,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": False,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": False,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": False,
"units": " \u00b0C",
"value": 28.34
}
]
]
]
}
}]
}
} ]
# Flatten the sample records defined above into a DataFrame.
df = fj.flatten_json(data)
# Print the flattened columns of the first record (row label 0).
print(df.loc[0])
输出
created 2020-05-12T15:10:37Z
device.device_info.device_fw 204
device.device_info.device_sn 06-02133
device.device_info.device_trait 2
device.device_info.device_type 190
device.timeseries.configuration.sensors.measurements [[BATTERY, BATTERY_MV], [REFERENCE_KPA, TEMPC_LOG...
device.timeseries.configuration.sensors.port [7, 8]
device.timeseries.configuration.sensors.sensor_bonus_value [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_firmware_ver [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_number [133, 134]
device.timeseries.configuration.sensors.sensor_sn [Unavailable, Unavailable]
device.timeseries.configuration.valid_since 2018-08-11T21:45:00Z
device.timeseries.configuration.values [1534023900, 0, 19, [{'description': 'Battery ...
现在从这个 df 您可以使用 device.timeseries.configuration.values 列对每个传感器数据进行进一步分析。
我正在尝试通过 json_normalize 获取我需要的信息。我查看了文档以及大约 10 个深度嵌套 JSON 文件的示例,但仍然无法很好地理解该函数的用法,因而提取不出正确的信息。我想构建一个数据框,其中包含每个传感器带时间戳的值(values 键)。1534023900 是以秒为单位的 UTC 时间戳。
enter image description here
JSON 的简短示例如下。
有什么想法吗?
{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": false,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": false,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": false,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": false,
"units": " \u00b0C",
"value": 28.34
}
]
]
}
}
}
}
}
}
已修订 JSON
{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": false,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": false,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": false,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": false,
"units": " \u00b0C",
"value": 28.34
}
]
]
]
}
}]
}
}
jmespath 可以帮助处理嵌套数据:它的文档非常详尽,但访问数据的基本规则是:如果是一个键,且不是数据中的第一个条目,就用 `.` 来访问;
如果是 array/list,就用 `[]`。
您的数据位置摘要:device -> timeseries(dict)->[](array)->configuration(dict)->values(key)->[](array)->[0](array and get the first value)
实际代码:
import jmespath
# Compile the path once so it can be reused: follow device -> timeseries,
# project over the timeseries entries, then take element 0 of each entry
# under configuration.values (the timestamp in the sample document).
expression = jmespath.compile('device.timeseries[].configuration.values[][0]')
# NOTE(review): `data` is not defined in this snippet; presumably it is the
# parsed JSON document shown above.
expression.search(data)
[1534023900]
我一直在处理类似的问题,但 json_normalize 对我帮助不大。不过,我编写了以下代码,它适用于大多数嵌套 JSON 用例。这段代码仍处于开发阶段,我还在用各种 JSON 文件对它进行测试。欢迎任何人发表评论,或就这部分代码提出改进建议。
我的主要目标是保留 JSON 的结构并在数据框中提供它。
import json
import pandas as pd
import numpy as np
def flatten_outer(data):
    """Flatten nested JSON records into single-level dicts with dotted keys.

    Args:
        data: An iterable of JSON objects (dicts), one per record. A single
            dict is also accepted and treated as a one-record batch.

    Returns:
        A list with one flat dict per record. Keys are the dotted paths of
        the nested keys (list positions are not part of the path). Values:

        * a scalar reached only through dicts is stored as-is;
        * a scalar found in the dicts of a list is stored in a list as long
          as that list, at the position of the dict it came from; positions
          whose dict lacks the key hold ``None``;
        * a list whose first non-dict element is hit during iteration is
          stored whole, appended to a list under its key.

    Notes:
        An empty list produces no key at all, because the element loop never
        runs. A dict nested inside the elements of a list is flattened with
        the list position reset, so if the same dotted key is produced by
        more than one element the index assignment below fails on the scalar
        stored first. Both behaviours are kept unchanged from the original.
    """
    # Iterating a dict would walk its keys (strings) and fail on ``.items()``;
    # wrap a lone record so the common ``json.load`` result works directly.
    if isinstance(data, dict):
        data = [data]

    def flatten_inner(sub_data, value_list, first_level_key='', index=0, tot_len=0):
        # ``index``/``tot_len`` describe the position of ``sub_data`` inside
        # the list it was taken from; ``tot_len == 0`` means "not in a list".
        for k, v in sub_data.items():
            full_key = first_level_key + '.' + k if first_level_key != '' else k
            if isinstance(v, dict):
                # Nested object: recurse with the list position reset.
                flatten_inner(v, value_list, full_key)
            elif isinstance(v, list):
                for i, item in enumerate(v):
                    if isinstance(item, dict):
                        flatten_inner(item, value_list, full_key,
                                      index=i, tot_len=len(v))
                    else:
                        # Not a list of objects: keep the whole list as one
                        # value and stop scanning it.
                        value_list.setdefault(full_key, []).append(v)
                        break
            elif full_key in value_list:
                # Key already seen for an earlier element of the same list:
                # fill this element's slot.
                value_list[full_key][index] = v
            elif tot_len == 0:
                value_list[full_key] = v
            else:
                # First sighting inside a list: reserve one slot per element
                # so that later elements can be written by position.
                slots = [None] * tot_len
                slots[index] = v
                value_list[full_key] = slots
        return value_list

    # Each record gets its own fresh accumulator dict.
    return [flatten_inner(row, {}) for row in data]
def df_create_clean(full_list):
    """Build a DataFrame from flattened records and tidy list-valued cells.

    Args:
        full_list: Flattened records (a list of flat dicts) or any other
            input accepted by ``pd.DataFrame``.

    Returns:
        A ``pd.DataFrame`` in which null cells have been replaced through
        ``DataFrame.where(..., None)``, empty lists have become ``None`` and
        one-element lists have been replaced by their only element. Longer
        lists and non-list values are left untouched.
    """
    def _tidy(cell):
        # Only lists are rewritten; everything else passes through unchanged.
        if isinstance(cell, list):
            if len(cell) == 0:
                return None
            if len(cell) == 1:
                return cell[0]
        return cell

    df = pd.DataFrame(full_list)
    df = df.where(pd.notnull(df), None)
    for col in df.columns:
        df[col] = df[col].apply(_tidy)
    return df
def flatten_json(data):
    """Flatten nested JSON records and return them as a cleaned DataFrame.

    Args:
        data: The records to flatten; passed unchanged to ``flatten_outer``.

    Returns:
        The ``pd.DataFrame`` produced by ``df_create_clean`` from the
        flattened records.
    """
    # df_create_clean builds the DataFrame itself, so the flat records are
    # handed over directly instead of being wrapped in a DataFrame twice.
    return df_create_clean(flatten_outer(data))
将上述代码保存到文件 flatten_json.py 中,然后运行下面的代码。
import flatten_json as fj
data = [{
"created": "2020-05-12T15:10:37Z",
"device": {
"device_info": {
"device_fw": 204,
"device_sn": "06-02133",
"device_trait": 2,
"device_type": 190
},
"timeseries": [
{
"configuration": {
"sensors": [
{
"measurements": [
"BATTERY",
"BATTERY_MV"
],
"port": 7,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 133,
"sensor_sn": "Unavailable"
},
{
"measurements": [
"REFERENCE_KPA",
"TEMPC_LOGGER"
],
"port": 8,
"sensor_bonus_value": "Unavailable",
"sensor_firmware_ver": "Unavailable",
"sensor_number": 134,
"sensor_sn": "Unavailable"
}
],
"valid_since": "2018-08-11T21:45:00Z",
"values": [
[
1534023900,
0,
19,
[
{
"description": "Battery Percent",
"error": False,
"units": "%",
"value": 100
},
{
"description": "Battery Voltage",
"error": False,
"units": " mV",
"value": 7864
}
],
[
{
"description": "Reference Pressure",
"error": False,
"units": " kPa",
"value": 100.62
},
{
"description": "Logger Temperature",
"error": False,
"units": " \u00b0C",
"value": 28.34
}
]
]
]
}
}]
}
} ]
# Flatten the sample records defined above into a DataFrame.
df = fj.flatten_json(data)
# Print the flattened columns of the first record (row label 0).
print(df.loc[0])
输出
created 2020-05-12T15:10:37Z
device.device_info.device_fw 204
device.device_info.device_sn 06-02133
device.device_info.device_trait 2
device.device_info.device_type 190
device.timeseries.configuration.sensors.measurements [[BATTERY, BATTERY_MV], [REFERENCE_KPA, TEMPC_LOG...
device.timeseries.configuration.sensors.port [7, 8]
device.timeseries.configuration.sensors.sensor_bonus_value [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_firmware_ver [Unavailable, Unavailable]
device.timeseries.configuration.sensors.sensor_number [133, 134]
device.timeseries.configuration.sensors.sensor_sn [Unavailable, Unavailable]
device.timeseries.configuration.valid_since 2018-08-11T21:45:00Z
device.timeseries.configuration.values [1534023900, 0, 19, [{'description': 'Battery ...
现在从这个 df 您可以使用 device.timeseries.configuration.values 列对每个传感器数据进行进一步分析。