在 Python 中将 JSON 文件扁平化为 Pandas 数据框
Flattening a JSON file into a Pandas DataFrame in Python
我有如下格式的 JSON:
{
"fields": {
"tcidte": {
"mode": "required",
"type": "date",
"format": "%Y%m%d"
},
"tcmcid": {
"mode": "required",
"type": "string"
},
"tcacbr": {
"mode": "required",
"type": "string"
}
}
}
我希望把它转换成 DataFrame,其中三个字段名各占一行。如果某一列(例如 "format")只在部分行中有值,那么其余行在该列应为 NULL。
我尝试过使用在这里找到的 flatten_json 函数,但它没有按预期工作;仍将其附在下面:
def flatten_json(nested_json, exclude=('',)):
    """Flatten a nested JSON-like object into a single-level dict.

    Nested dict keys and list indices are joined with ``_`` to form each
    flattened key, e.g. ``{"a": {"b": [1]}}`` becomes ``{"a_b_0": 1}``.

    Args:
        nested_json: A nested structure of dicts, lists and scalar values.
        exclude: Dict keys to skip, together with everything beneath them,
            at any nesting depth.

    Returns:
        A flat dict mapping each joined key path to its leaf value. Empty
        dicts and lists contribute no entries.
    """
    out = {}

    def flatten(node, prefix=''):
        # isinstance (not an exact type check) so dict/list subclasses such
        # as OrderedDict are descended into instead of stored as leaves.
        if isinstance(node, dict):
            for key, value in node.items():
                if key not in exclude:
                    flatten(value, f'{prefix}{key}_')
        elif isinstance(node, list):
            for index, item in enumerate(node):
                flatten(item, f'{prefix}{index}_')
        else:
            # Leaf value: strip the trailing '_' appended by the caller.
            out[prefix[:-1]] = node

    flatten(nested_json)
    return out
# Attempted usage from the question, reported as not working as expected.
# NOTE(review): flatten_json returns one flat dict of scalar values, so this
# presumably cannot produce one row per field — confirm against pandas.
flatten_json_file = pd.DataFrame(flatten_json(nested_json))
pprint.pprint(flatten_json_file)
更复杂的 JSON 示例:
{
"fields": {
"action": {
"type": {
"field_type": "string"
},
"mode": "required"
},
"upi": {
"type": {
"field_type": "string"
},
"regex": "^[0-9]{9}$",
"mode": "required"
},
"firstname": {
"type": {
"field_type": "string"
},
"mode": "required"
}
}
}
一个选项是 jmespath 库,它在以下场景中很有用:
# pip install jmespath
import jmespath
import pandas as pd
# Read the expression as a path through the document:
#   fields                -> the top-level key
#   *                     -> every sub-key, whatever its name
#   [mode, type, format]  -> the three values picked from each sub-dict
# A key that is missing from a sub-dict appears as None in the output below.
expression = jmespath.compile('fields.*[mode, type, format]')
pd.DataFrame(expression.search(data), columns = ['mode', 'type', 'format'])
mode type format
0 required date %Y%m%d
1 required string None
2 required string None
jmespath 提供了很多工具;不过,上面的写法应该已经足够,并且能处理子字典中缺少某些键(mode、type、format)的情况。
# Load the JSON file into a frame that has a single "fields" column.
df = pd.read_json('test.json')
# Expand the per-field dicts in that column into real columns, reusing the
# existing index as row labels.
df_fields = pd.DataFrame(df['fields'].to_list(), index=df.index)
print(df_fields)
输出:
mode type format
tcacbr required string NaN
tcidte required date %Y%m%d
tcmcid required string NaN
假设有如下数据:
data = {
"fields": {
"tcidte": {
"mode": "required",
"type": "date",
"format": "%Y%m%d"
},
"tcmcid": {
"mode": "required",
"type": "string"
},
"tcacbr": {
"mode": "required",
"type": "string"
}
}
}
下面这行代码:
# Build one row per field from the inner dicts; the field names themselves
# are dropped, so the rows get a default integer index.
df = pd.DataFrame(data["fields"].values())
得到的结果是:
mode type format
0 required date %Y%m%d
1 required string NaN
2 required string NaN
这是你的目标吗?
如果你想把 data["fields"] 的键作为索引:
# The field names start out as columns; .T transposes them into the row index.
df = pd.DataFrame(data["fields"]).T
或
# orient="index" uses each outer key (the field name) as a row label directly.
df = pd.DataFrame.from_dict(data["fields"], orient="index")
两者得到的结果都是:
mode type format
tcidte required date %Y%m%d
tcmcid required string NaN
tcacbr required string NaN
假设有如下数据:
data = {
"fields": {
"action": {
"type": {
"field_type": "string"
},
"mode": "required"
},
"upi": {
"type": {
"field_type": "string"
},
"regex": "^[0-9]{9}$",
"mode": "required"
},
"firstname": {
"type": {
"field_type": "string"
},
"mode": "required"
}
}
}
你可以这样做:
# Merge each field's nested "type" dict into the field's own dict so that its
# keys (e.g. "field_type") become top-level columns. The result is bound to a
# new name so that `data` keeps its "fields" key and stays usable afterwards.
flat_fields = {key: {**d, **d["type"]} for key, d in data["fields"].items()}
# The original nested "type" entry is still present in each row; drop it.
df = pd.DataFrame.from_dict(flat_fields, orient="index").drop(columns="type")
或
# One row per field name; the "type" column still holds nested dicts here.
df = pd.DataFrame.from_dict(data["fields"], orient="index")
# Expand the nested dicts into their own columns, aligned on the same index,
# append them side by side, then discard the now-redundant "type" column.
df = pd.concat(
    objs=[df, pd.DataFrame(df["type"].tolist(), index=df.index)],
    axis="columns",
).drop(columns=["type"])
结果类似(列位置可能不同)
mode field_type regex
action required string NaN
upi required string ^[0-9]{9}$
firstname required string NaN
我有如下格式的 JSON:
{
"fields": {
"tcidte": {
"mode": "required",
"type": "date",
"format": "%Y%m%d"
},
"tcmcid": {
"mode": "required",
"type": "string"
},
"tcacbr": {
"mode": "required",
"type": "string"
}
}
}
我希望把它转换成 DataFrame,其中三个字段名各占一行。如果某一列(例如 "format")只在部分行中有值,那么其余行在该列应为 NULL。
我尝试过使用在这里找到的 flatten_json 函数,但它没有按预期工作;仍将其附在下面:
def flatten_json(nested_json, exclude=('',)):
    """Flatten a nested JSON-like object into a single-level dict.

    Nested dict keys and list indices are joined with ``_`` to form each
    flattened key, e.g. ``{"a": {"b": [1]}}`` becomes ``{"a_b_0": 1}``.

    Args:
        nested_json: A nested structure of dicts, lists and scalar values.
        exclude: Dict keys to skip, together with everything beneath them,
            at any nesting depth.

    Returns:
        A flat dict mapping each joined key path to its leaf value. Empty
        dicts and lists contribute no entries.
    """
    out = {}

    def flatten(node, prefix=''):
        # isinstance (not an exact type check) so dict/list subclasses such
        # as OrderedDict are descended into instead of stored as leaves.
        if isinstance(node, dict):
            for key, value in node.items():
                if key not in exclude:
                    flatten(value, f'{prefix}{key}_')
        elif isinstance(node, list):
            for index, item in enumerate(node):
                flatten(item, f'{prefix}{index}_')
        else:
            # Leaf value: strip the trailing '_' appended by the caller.
            out[prefix[:-1]] = node

    flatten(nested_json)
    return out
# Attempted usage from the question, reported as not working as expected.
# NOTE(review): flatten_json returns one flat dict of scalar values, so this
# presumably cannot produce one row per field — confirm against pandas.
flatten_json_file = pd.DataFrame(flatten_json(nested_json))
pprint.pprint(flatten_json_file)
更复杂的 JSON 示例:
{
"fields": {
"action": {
"type": {
"field_type": "string"
},
"mode": "required"
},
"upi": {
"type": {
"field_type": "string"
},
"regex": "^[0-9]{9}$",
"mode": "required"
},
"firstname": {
"type": {
"field_type": "string"
},
"mode": "required"
}
}
}
一个选项是 jmespath 库,它在以下场景中很有用:
# pip install jmespath
import jmespath
import pandas as pd
# Read the expression as a path through the document:
#   fields                -> the top-level key
#   *                     -> every sub-key, whatever its name
#   [mode, type, format]  -> the three values picked from each sub-dict
# A key that is missing from a sub-dict appears as None in the output below.
expression = jmespath.compile('fields.*[mode, type, format]')
pd.DataFrame(expression.search(data), columns = ['mode', 'type', 'format'])
mode type format
0 required date %Y%m%d
1 required string None
2 required string None
jmespath 提供了很多工具;不过,上面的写法应该已经足够,并且能处理子字典中缺少某些键(mode、type、format)的情况。
# Load the JSON file into a frame that has a single "fields" column.
df = pd.read_json('test.json')
# Expand the per-field dicts in that column into real columns, reusing the
# existing index as row labels.
df_fields = pd.DataFrame(df['fields'].to_list(), index=df.index)
print(df_fields)
输出:
mode type format
tcacbr required string NaN
tcidte required date %Y%m%d
tcmcid required string NaN
假设有如下数据:
data = {
"fields": {
"tcidte": {
"mode": "required",
"type": "date",
"format": "%Y%m%d"
},
"tcmcid": {
"mode": "required",
"type": "string"
},
"tcacbr": {
"mode": "required",
"type": "string"
}
}
}
下面这行代码:
# Build one row per field from the inner dicts; the field names themselves
# are dropped, so the rows get a default integer index.
df = pd.DataFrame(data["fields"].values())
得到的结果是:
mode type format
0 required date %Y%m%d
1 required string NaN
2 required string NaN
这是你的目标吗?
如果你想把 data["fields"] 的键作为索引:
# The field names start out as columns; .T transposes them into the row index.
df = pd.DataFrame(data["fields"]).T
或
# orient="index" uses each outer key (the field name) as a row label directly.
df = pd.DataFrame.from_dict(data["fields"], orient="index")
两者得到的结果都是:
mode type format
tcidte required date %Y%m%d
tcmcid required string NaN
tcacbr required string NaN
假设有如下数据:
data = {
"fields": {
"action": {
"type": {
"field_type": "string"
},
"mode": "required"
},
"upi": {
"type": {
"field_type": "string"
},
"regex": "^[0-9]{9}$",
"mode": "required"
},
"firstname": {
"type": {
"field_type": "string"
},
"mode": "required"
}
}
}
你可以这样做:
# Merge each field's nested "type" dict into the field's own dict so that its
# keys (e.g. "field_type") become top-level columns. The result is bound to a
# new name so that `data` keeps its "fields" key and stays usable afterwards.
flat_fields = {key: {**d, **d["type"]} for key, d in data["fields"].items()}
# The original nested "type" entry is still present in each row; drop it.
df = pd.DataFrame.from_dict(flat_fields, orient="index").drop(columns="type")
或
# One row per field name; the "type" column still holds nested dicts here.
df = pd.DataFrame.from_dict(data["fields"], orient="index")
# Expand the nested dicts into their own columns, aligned on the same index,
# append them side by side, then discard the now-redundant "type" column.
df = pd.concat(
    objs=[df, pd.DataFrame(df["type"].tolist(), index=df.index)],
    axis="columns",
).drop(columns=["type"])
结果类似(列位置可能不同)
mode field_type regex
action required string NaN
upi required string ^[0-9]{9}$
firstname required string NaN