使用 pandas 将 S3 中的嵌套 JSON 转换为数据帧
nested json from s3 to dataframe with pandas
我正在尝试把从 S3 中读取的这个嵌套 JSON 展平,并且只将其中的一部分存入数据帧。
这是结构
import boto3
import json
s3 = boto3.resource('s3')
dat = []
content_object = s3.Object(FROM_BUCKET, key['Key'])
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)
json_content
output:
{'twts': {'101861193645447': {'aiScrs': [{'lfeEvtId': 5,
'orgScr': 0.779,
'adjScr': 0.3865,
'lstScrUtc': '2021-02-24T22:14:17.8420665Z',
'lstScrYmd': '2021-02-24'}]},
'100300192097235': {'aiScrs': [{'lfeEvtId': 5,
'orgScr': 0.765,
'adjScr': 0.365,
'lstScrUtc': '2021-02-24T22:14:17.8420665Z',
'lstScrYmd': '2021-02-24'}]},
'100179311336977': {'aiScrs': [{'lfeEvtId': 5,
'orgScr': 0.732,
'adjScr': 0.332,
'lstScrUtc': '2021-02-24T22:14:17.8420665Z',
'lstScrYmd': '2021-02-24'}]}}}
这是我的尝试
df_dat=[]
dat =[]
response = s3_c.get_object(Bucket=FROM_BUCKET, Key=key['Key'])
df_dat = pd.read_json(response['Body'],convert_axes=False)
df_dat
dat = pd.json_normalize(data=df_dat)
dat
output:
twts
100179311336977 {'aiScrs': [{'lfeEvtId': 5, 'orgScr': 0.732, 'adjScr': 0.332, 'lstScrUtc': '2021-02-24T22:14:17.8420665Z', 'lstScrYmd': '2021-02-24'}]}
100300192097235 {'aiScrs': [{'lfeEvtId': 5, 'orgScr': 0.765, 'adjScr': 0.365, 'lstScrUtc': '2021-02-24T22:14:17.8420665Z', 'lstScrYmd': '2021-02-24'}]}
101861193645447 {'aiScrs': [{'lfeEvtId': 5, 'orgScr': 0.779, 'adjScr': 0.3865, 'lstScrUtc': '2021-02-24T22:14:17.8420665Z', 'lstScrYmd': '2021-02-24'}]}
最后这一步(调用 pd.json_normalize)会报错:
--------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-83-0d22f901897d> in <module>
4 df_dat = pd.read_json(response['Body'],convert_axes=False)
5 df_dat
----> 6 dat = pd.json_normalize(data=df_dat)
7 # dat = pd.json_normalize(data=df_dat, record_path=['aiScrs'])
8 dat
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/pandas/io/json/_normalize.py in _json_normalize(data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level)
268
269 if record_path is None:
--> 270 if any([isinstance(x, dict) for x in y.values()] for y in data):
271 # naive normalization, this is idempotent for flat records
272 # and potentially will inflate the data considerably for
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/pandas/io/json/_normalize.py in <genexpr>(.0)
268
269 if record_path is None:
--> 270 if any([isinstance(x, dict) for x in y.values()] for y in data):
271 # naive normalization, this is idempotent for flat records
272 # and potentially will inflate the data considerably for
AttributeError: 'str' object has no attribute 'values'
当我尝试以任何方式操作它时,它都会出错,包括
dat = pd.json_normalize(data=df_dat, record_path=['aiScrs'])
我正在尝试取出 3 行,包括以下所有列
ID lfeEvtId orgScr adjScr lstScrUtc lstScrYmd
X
X
X...
我似乎无法找到一种方法来做到这一点(最好使用 json_normalize)
首先,用列表推导式(list comprehension)把 json_content 重塑成更便于处理的结构:
将每个 ID 及其对应的内容合并为一个字典,组成一个列表。
这样之后使用 pd.json_normalize 就很简单了:
tweet_json_list = [{'id': k, **v} for k, v in json_content['twts'].items()]
df = pd.json_normalize(tweet_json_list, record_path='aiScrs', meta=['id'])
输出:
>>> df
lfeEvtId orgScr adjScr lstScrUtc lstScrYmd id
0 5 0.779 0.3865 2021-02-24T22:14:17.8420665Z 2021-02-24 101861193645447
1 5 0.765 0.3650 2021-02-24T22:14:17.8420665Z 2021-02-24 100300192097235
2 5 0.732 0.3320 2021-02-24T22:14:17.8420665Z 2021-02-24 100179311336977
我正在尝试把从 S3 中读取的这个嵌套 JSON 展平,并且只将其中的一部分存入数据帧。
这是结构
import boto3
import json
s3 = boto3.resource('s3')
dat = []
content_object = s3.Object(FROM_BUCKET, key['Key'])
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)
json_content
output:
{'twts': {'101861193645447': {'aiScrs': [{'lfeEvtId': 5,
'orgScr': 0.779,
'adjScr': 0.3865,
'lstScrUtc': '2021-02-24T22:14:17.8420665Z',
'lstScrYmd': '2021-02-24'}]},
'100300192097235': {'aiScrs': [{'lfeEvtId': 5,
'orgScr': 0.765,
'adjScr': 0.365,
'lstScrUtc': '2021-02-24T22:14:17.8420665Z',
'lstScrYmd': '2021-02-24'}]},
'100179311336977': {'aiScrs': [{'lfeEvtId': 5,
'orgScr': 0.732,
'adjScr': 0.332,
'lstScrUtc': '2021-02-24T22:14:17.8420665Z',
'lstScrYmd': '2021-02-24'}]}}}
这是我的尝试
df_dat=[]
dat =[]
response = s3_c.get_object(Bucket=FROM_BUCKET, Key=key['Key'])
df_dat = pd.read_json(response['Body'],convert_axes=False)
df_dat
dat = pd.json_normalize(data=df_dat)
dat
output:
twts
100179311336977 {'aiScrs': [{'lfeEvtId': 5, 'orgScr': 0.732, 'adjScr': 0.332, 'lstScrUtc': '2021-02-24T22:14:17.8420665Z', 'lstScrYmd': '2021-02-24'}]}
100300192097235 {'aiScrs': [{'lfeEvtId': 5, 'orgScr': 0.765, 'adjScr': 0.365, 'lstScrUtc': '2021-02-24T22:14:17.8420665Z', 'lstScrYmd': '2021-02-24'}]}
101861193645447 {'aiScrs': [{'lfeEvtId': 5, 'orgScr': 0.779, 'adjScr': 0.3865, 'lstScrUtc': '2021-02-24T22:14:17.8420665Z', 'lstScrYmd': '2021-02-24'}]}
最后这一步(调用 pd.json_normalize)会报错:
--------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-83-0d22f901897d> in <module>
4 df_dat = pd.read_json(response['Body'],convert_axes=False)
5 df_dat
----> 6 dat = pd.json_normalize(data=df_dat)
7 # dat = pd.json_normalize(data=df_dat, record_path=['aiScrs'])
8 dat
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/pandas/io/json/_normalize.py in _json_normalize(data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level)
268
269 if record_path is None:
--> 270 if any([isinstance(x, dict) for x in y.values()] for y in data):
271 # naive normalization, this is idempotent for flat records
272 # and potentially will inflate the data considerably for
~/anaconda3/envs/amazonei_tensorflow2_p36/lib/python3.6/site-packages/pandas/io/json/_normalize.py in <genexpr>(.0)
268
269 if record_path is None:
--> 270 if any([isinstance(x, dict) for x in y.values()] for y in data):
271 # naive normalization, this is idempotent for flat records
272 # and potentially will inflate the data considerably for
AttributeError: 'str' object has no attribute 'values'
当我尝试以任何方式操作它时,它都会出错,包括
dat = pd.json_normalize(data=df_dat, record_path=['aiScrs'])
我正在尝试取出 3 行,包括以下所有列
ID lfeEvtId orgScr adjScr lstScrUtc lstScrYmd
X
X
X...
我似乎无法找到一种方法来做到这一点(最好使用 json_normalize)
首先,用列表推导式(list comprehension)把 json_content 重塑成更便于处理的结构:
将每个 ID 及其对应的内容合并为一个字典,组成一个列表。
这样之后使用 pd.json_normalize 就很简单了:
tweet_json_list = [{'id': k, **v} for k, v in json_content['twts'].items()]
df = pd.json_normalize(tweet_json_list, record_path='aiScrs', meta=['id'])
输出:
>>> df
lfeEvtId orgScr adjScr lstScrUtc lstScrYmd id
0 5 0.779 0.3865 2021-02-24T22:14:17.8420665Z 2021-02-24 101861193645447
1 5 0.765 0.3650 2021-02-24T22:14:17.8420665Z 2021-02-24 100300192097235
2 5 0.732 0.3320 2021-02-24T22:14:17.8420665Z 2021-02-24 100179311336977