将 Json 展平为 Pandas
Flatten Json into Pandas
我尝试使用 Pandas 将 JSON 文件转换为 DataFrame。我将 pd.json_normalize 与 meta 一起使用,但我的 DataFrame 中又嵌套了一个字典。也许我不能使用 pd.json_normalize,你有什么想法吗?
也许我必须在使用 Pandas.
之前将数据扁平化
request_json_decoding =\
{'_typ': 'Index',
'assetClass': 'EQUITY',
'basket': {'basketDate': '2020-12-02T00:00:00.000+0100',
'constituents': [{'asset': {'_typ': 'Asset',
'allAnalytics': [{'marketCap': 2889054400}],
'assetClass': 'EQUITY',
'ccy': 'EUR',
'exposureCtry': 'DEU',
'id': {'code': '2603021'},
'moreIds': [{'codScheme': 'ISIN',
'code': 'DE000A2GS5D8'},
{'codScheme': 'SEDOL',
'code': 'BFYTTC2'},
{'codScheme': 'INST_NUM',
'code': '5679589'}],
'name': 'DERMAPHARM HOLDING SE',
'sectors': [{'code': '551',
'type': 'HSBC'},
{'code': '4577',
'type': 'ICB'}]},
'exchRate': {'rate': 1.2077999235945769},
'factor': 1,
'price': 53.66,
'weight': 0.0008423553691237516},
{'asset': {'_typ': 'Asset',
'allAnalytics': [{'marketCap': 396594792}],
'assetClass': 'EQUITY',
'ccy': 'EUR',
'exposureCtry': 'DEU',
'id': {'code': '1506422'},
'moreIds': [{'codScheme': 'ISIN',
'code': 'DE0007193500'},
{'codScheme': 'SEDOL',
'code': '5558203'},
{'codScheme': 'INST_NUM',
'code': '1050870'}],
'name': 'KOENIG & BAUER AG',
'sectors': [{'code': '331',
'type': 'HSBC'},
{'code': '2757',
'type': 'ICB'}]},
'exchRate': {'rate': 1.2077999235945769},
'factor': 1,
'price': 24,
'weight': 0.000280826220184372},
{'asset': {'_typ': 'Asset',
'allAnalytics': [{'marketCap': 963572493.252}],
'assetClass': 'EQUITY',
'ccy': 'GBP',
'exposureCtry': 'GBR',
'id': {'code': '634295'},
'moreIds': [{'codScheme': 'ISIN',
'code': 'GB00B0744359'},
{'codScheme': 'SEDOL',
'code': 'B074435'},
{'codScheme': 'INST_NUM',
'code': '1388802'}],
'name': 'ESSENTRA PLC',
'sectors': [{'code': '211',
'type': 'HSBC'},
{'code': '2797',
'type': 'ICB'}]},
'exchRate': {'rate': 1.3325000766187542},
'factor': 1,
'price': 3.194,
'weight': 0.0008855804882485151}]},
'ccy': 'USD',
'id': {'code': '33', 'source': 'INDICE'},
'indexFreq': 'NONE',
'indexType': 'COMPOSITE',
'name': 'MSCI EUROPE (16) SMALL CAP',
'pricingSourceType': 'NF',
'sourceId': {'code': '106233', 'source': 'CODESF'}}
df = pd.json_normalize(request_json_decoding, ['basket', ['constituents']])
输出:
factor price weight asset._typ asset.allAnalytics asset.assetClass asset.ccy asset.exposureCtry asset.id.code asset.moreIds asset.name asset.sectors exchRate.rate
0 1 53.660 0.000842 Asset [{'marketCap': 2889054400}] EQUITY EUR DEU 2603021 [{'codScheme': 'ISIN', 'code': 'DE000A2GS5D8'}, {'codScheme': 'SEDOL', 'code': 'BFYTTC2'}, {'codScheme': 'INST_NUM', 'code': '5679589'}] DERMAPHARM HOLDING SE [{'code': '551', 'type': 'HSBC'}, {'code': '4577', 'type': 'ICB'}] 1.2078
1 1 24.000 0.000281 Asset [{'marketCap': 396594792}] EQUITY EUR DEU 1506422 [{'codScheme': 'ISIN', 'code': 'DE0007193500'}, {'codScheme': 'SEDOL', 'code': '5558203'}, {'codScheme': 'INST_NUM', 'code': '1050870'}] KOENIG & BAUER AG [{'code': '331', 'type': 'HSBC'}, {'code': '2757', 'type': 'ICB'}] 1.2078
2 1 3.194 0.000886 Asset [{'marketCap': 963572493.252}] EQUITY GBP GBR 634295 [{'codScheme': 'ISIN', 'code': 'GB00B0744359'}, {'codScheme': 'SEDOL', 'code': 'B074435'}, {'codScheme': 'INST_NUM', 'code': '1388802'}] ESSENTRA PLC [{'code': '211', 'type': 'HSBC'}, {'code': '2797', 'type': 'ICB'}] 1.3325
方法
- 初始
json_normalize()
- 重复
explode()
列表
reset_index(drop=True)
确保有可用的连接索引
join()
apply(pd.Series)
扩展嵌入的 dict 分解列表
df = (pd.json_normalize(request_json_decoding)
.explode("basket.constituents")
.reset_index(drop=True)
)
df = df.join(df["basket.constituents"].apply(pd.Series)).drop(columns="basket.constituents")
df = (df.join(df["exchRate"].apply(pd.Series))
.join(df["asset"].apply(pd.Series), rsuffix="_asset").drop(columns="asset")
.explode("moreIds")
.reset_index(drop=True)
)
df.join(df["moreIds"].apply(pd.Series)).drop(columns="moreIds")
采样 2 行
_typ
assetClass
ccy
indexFreq
indexType
name
pricingSourceType
basket.basketDate
id.code
id.source
sourceId.code
sourceId.source
exchRate
factor
price
weight
rate
_typ_asset
allAnalytics
assetClass_asset
ccy_asset
exposureCtry
id
name_asset
sectors
codScheme
code
0
Index
EQUITY
USD
NONE
COMPOSITE
MSCI EUROPE (16) SMALL CAP
NF
2020-12-02T00:00:00.000+0100
33
INDICE
106233
CODESF
{'rate': 1.2077999235945769}
1
53.66
0.000842355
1.2078
Asset
[{'marketCap': 2889054400}]
EQUITY
EUR
DEU
{'code': '2603021'}
DERMAPHARM HOLDING SE
[{'code': '551', 'type': 'HSBC'}, {'code': '4577', 'type': 'ICB'}]
ISIN
DE000A2GS5D8
1
Index
EQUITY
USD
NONE
COMPOSITE
MSCI EUROPE (16) SMALL CAP
NF
2020-12-02T00:00:00.000+0100
33
INDICE
106233
CODESF
{'rate': 1.2077999235945769}
1
53.66
0.000842355
1.2078
Asset
[{'marketCap': 2889054400}]
EQUITY
EUR
DEU
{'code': '2603021'}
DERMAPHARM HOLDING SE
[{'code': '551', 'type': 'HSBC'}, {'code': '4577', 'type': 'ICB'}]
SEDOL
BFYTTC2
我的解决方案,但速度很慢,因为我处理了 100 000 行。
%%time
df = pd.json_normalize(request_json_decoding, ['basket', ['constituents']])
df_infoindex = pd.json_normalize(request_json_decoding).drop(['basket.constituents'], axis=1)
df_allanalytics = df.explode('asset.allAnalytics')
df_allanalytics = pd.json_normalize(df_allanalytics['asset.allAnalytics'])
name_allanalytics = df_allanalytics.columns
df['asset.allAnalytics'] = df_allanalytics
df.rename(columns = {'asset.allAnalytics' : name_allanalytics[0]}, inplace = True)
#Find Codification
codification = df['asset.moreIds'].to_dict()
dict_codif = {}
for i in codification:
if isinstance(codification[i],list):
for ii in codification[i]:
if i not in dict_codif:
dict_codif[i] = ({ii['codScheme']:ii['code']})
else:
dict_codif[i].update({ii['codScheme']:ii['code']})
else:
dict_codif[i] = np.nan
df_codification = pd.DataFrame(dict_codif).T
#Find Sector
sector = df['asset.sectors'].to_dict()
dict_codif = {}
for i in sector:
if isinstance(sector[i],list):
for ii in sector[i]:
if i not in dict_codif:
dict_codif[i] = ({ii['type']:ii['code']})
else:
dict_codif[i].update({ii['type']:ii['code']})
else:
dict_codif[i] = np.nan
df_sector = pd.DataFrame(dict_codif).T
df.drop(['asset.moreIds', 'asset.sectors'], axis=1, inplace = True)
df_return = pd.concat([df, df_codification, df_sector,
pd.concat([df_infoindex]*df.shape[0], ignore_index=True)], axis = 1)
我尝试使用 Pandas 将 JSON 文件转换为 DataFrame。我将 pd.json_normalize 与 meta 一起使用,但我的 DataFrame 中又嵌套了一个字典。也许我不能使用 pd.json_normalize,你有什么想法吗? 也许我必须在使用 Pandas.
之前将数据扁平化request_json_decoding =\
{'_typ': 'Index',
'assetClass': 'EQUITY',
'basket': {'basketDate': '2020-12-02T00:00:00.000+0100',
'constituents': [{'asset': {'_typ': 'Asset',
'allAnalytics': [{'marketCap': 2889054400}],
'assetClass': 'EQUITY',
'ccy': 'EUR',
'exposureCtry': 'DEU',
'id': {'code': '2603021'},
'moreIds': [{'codScheme': 'ISIN',
'code': 'DE000A2GS5D8'},
{'codScheme': 'SEDOL',
'code': 'BFYTTC2'},
{'codScheme': 'INST_NUM',
'code': '5679589'}],
'name': 'DERMAPHARM HOLDING SE',
'sectors': [{'code': '551',
'type': 'HSBC'},
{'code': '4577',
'type': 'ICB'}]},
'exchRate': {'rate': 1.2077999235945769},
'factor': 1,
'price': 53.66,
'weight': 0.0008423553691237516},
{'asset': {'_typ': 'Asset',
'allAnalytics': [{'marketCap': 396594792}],
'assetClass': 'EQUITY',
'ccy': 'EUR',
'exposureCtry': 'DEU',
'id': {'code': '1506422'},
'moreIds': [{'codScheme': 'ISIN',
'code': 'DE0007193500'},
{'codScheme': 'SEDOL',
'code': '5558203'},
{'codScheme': 'INST_NUM',
'code': '1050870'}],
'name': 'KOENIG & BAUER AG',
'sectors': [{'code': '331',
'type': 'HSBC'},
{'code': '2757',
'type': 'ICB'}]},
'exchRate': {'rate': 1.2077999235945769},
'factor': 1,
'price': 24,
'weight': 0.000280826220184372},
{'asset': {'_typ': 'Asset',
'allAnalytics': [{'marketCap': 963572493.252}],
'assetClass': 'EQUITY',
'ccy': 'GBP',
'exposureCtry': 'GBR',
'id': {'code': '634295'},
'moreIds': [{'codScheme': 'ISIN',
'code': 'GB00B0744359'},
{'codScheme': 'SEDOL',
'code': 'B074435'},
{'codScheme': 'INST_NUM',
'code': '1388802'}],
'name': 'ESSENTRA PLC',
'sectors': [{'code': '211',
'type': 'HSBC'},
{'code': '2797',
'type': 'ICB'}]},
'exchRate': {'rate': 1.3325000766187542},
'factor': 1,
'price': 3.194,
'weight': 0.0008855804882485151}]},
'ccy': 'USD',
'id': {'code': '33', 'source': 'INDICE'},
'indexFreq': 'NONE',
'indexType': 'COMPOSITE',
'name': 'MSCI EUROPE (16) SMALL CAP',
'pricingSourceType': 'NF',
'sourceId': {'code': '106233', 'source': 'CODESF'}}
df = pd.json_normalize(request_json_decoding, ['basket', ['constituents']])
输出:
factor price weight asset._typ asset.allAnalytics asset.assetClass asset.ccy asset.exposureCtry asset.id.code asset.moreIds asset.name asset.sectors exchRate.rate
0 1 53.660 0.000842 Asset [{'marketCap': 2889054400}] EQUITY EUR DEU 2603021 [{'codScheme': 'ISIN', 'code': 'DE000A2GS5D8'}, {'codScheme': 'SEDOL', 'code': 'BFYTTC2'}, {'codScheme': 'INST_NUM', 'code': '5679589'}] DERMAPHARM HOLDING SE [{'code': '551', 'type': 'HSBC'}, {'code': '4577', 'type': 'ICB'}] 1.2078
1 1 24.000 0.000281 Asset [{'marketCap': 396594792}] EQUITY EUR DEU 1506422 [{'codScheme': 'ISIN', 'code': 'DE0007193500'}, {'codScheme': 'SEDOL', 'code': '5558203'}, {'codScheme': 'INST_NUM', 'code': '1050870'}] KOENIG & BAUER AG [{'code': '331', 'type': 'HSBC'}, {'code': '2757', 'type': 'ICB'}] 1.2078
2 1 3.194 0.000886 Asset [{'marketCap': 963572493.252}] EQUITY GBP GBR 634295 [{'codScheme': 'ISIN', 'code': 'GB00B0744359'}, {'codScheme': 'SEDOL', 'code': 'B074435'}, {'codScheme': 'INST_NUM', 'code': '1388802'}] ESSENTRA PLC [{'code': '211', 'type': 'HSBC'}, {'code': '2797', 'type': 'ICB'}] 1.3325
方法
- 初始
json_normalize()
- 重复
explode()
列表reset_index(drop=True)
确保有可用的连接索引join()
apply(pd.Series)
扩展嵌入的 dict 分解列表
df = (pd.json_normalize(request_json_decoding)
.explode("basket.constituents")
.reset_index(drop=True)
)
df = df.join(df["basket.constituents"].apply(pd.Series)).drop(columns="basket.constituents")
df = (df.join(df["exchRate"].apply(pd.Series))
.join(df["asset"].apply(pd.Series), rsuffix="_asset").drop(columns="asset")
.explode("moreIds")
.reset_index(drop=True)
)
df.join(df["moreIds"].apply(pd.Series)).drop(columns="moreIds")
采样 2 行
_typ | assetClass | ccy | indexFreq | indexType | name | pricingSourceType | basket.basketDate | id.code | id.source | sourceId.code | sourceId.source | exchRate | factor | price | weight | rate | _typ_asset | allAnalytics | assetClass_asset | ccy_asset | exposureCtry | id | name_asset | sectors | codScheme | code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Index | EQUITY | USD | NONE | COMPOSITE | MSCI EUROPE (16) SMALL CAP | NF | 2020-12-02T00:00:00.000+0100 | 33 | INDICE | 106233 | CODESF | {'rate': 1.2077999235945769} | 1 | 53.66 | 0.000842355 | 1.2078 | Asset | [{'marketCap': 2889054400}] | EQUITY | EUR | DEU | {'code': '2603021'} | DERMAPHARM HOLDING SE | [{'code': '551', 'type': 'HSBC'}, {'code': '4577', 'type': 'ICB'}] | ISIN | DE000A2GS5D8 |
1 | Index | EQUITY | USD | NONE | COMPOSITE | MSCI EUROPE (16) SMALL CAP | NF | 2020-12-02T00:00:00.000+0100 | 33 | INDICE | 106233 | CODESF | {'rate': 1.2077999235945769} | 1 | 53.66 | 0.000842355 | 1.2078 | Asset | [{'marketCap': 2889054400}] | EQUITY | EUR | DEU | {'code': '2603021'} | DERMAPHARM HOLDING SE | [{'code': '551', 'type': 'HSBC'}, {'code': '4577', 'type': 'ICB'}] | SEDOL | BFYTTC2 |
我的解决方案,但速度很慢,因为我处理了 100 000 行。
%%time
df = pd.json_normalize(request_json_decoding, ['basket', ['constituents']])
df_infoindex = pd.json_normalize(request_json_decoding).drop(['basket.constituents'], axis=1)
df_allanalytics = df.explode('asset.allAnalytics')
df_allanalytics = pd.json_normalize(df_allanalytics['asset.allAnalytics'])
name_allanalytics = df_allanalytics.columns
df['asset.allAnalytics'] = df_allanalytics
df.rename(columns = {'asset.allAnalytics' : name_allanalytics[0]}, inplace = True)
#Find Codification
codification = df['asset.moreIds'].to_dict()
dict_codif = {}
for i in codification:
if isinstance(codification[i],list):
for ii in codification[i]:
if i not in dict_codif:
dict_codif[i] = ({ii['codScheme']:ii['code']})
else:
dict_codif[i].update({ii['codScheme']:ii['code']})
else:
dict_codif[i] = np.nan
df_codification = pd.DataFrame(dict_codif).T
#Find Sector
sector = df['asset.sectors'].to_dict()
dict_codif = {}
for i in sector:
if isinstance(sector[i],list):
for ii in sector[i]:
if i not in dict_codif:
dict_codif[i] = ({ii['type']:ii['code']})
else:
dict_codif[i].update({ii['type']:ii['code']})
else:
dict_codif[i] = np.nan
df_sector = pd.DataFrame(dict_codif).T
df.drop(['asset.moreIds', 'asset.sectors'], axis=1, inplace = True)
df_return = pd.concat([df, df_codification, df_sector,
pd.concat([df_infoindex]*df.shape[0], ignore_index=True)], axis = 1)