绕过内存错误以读取 Python 中的大 JSON 文件
Bypass memory error to read large JSON file in Python
我正在使用下面的代码将几个 JSON 文件转换为 CSV,一切正常。但是,当我尝试转换更大的 JSON 文件(大小约为 2-4 GB 甚至更大)时,它会抛出 MemoryError。
当前代码
import json, pandas
from flatten_json import flatten
# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'

# Open and load the JSON file. The context manager closes the handle as soon
# as parsing finishes. json.load still builds the whole document in memory,
# which is where very large inputs run out of memory.
with open(file_path + '.json', 'r', encoding='utf-8', errors='ignore') as json_file:
    json_list = json.load(json_file)

# Extract data from the defined key names
key_list = ['created', 'emails', 'identities']
json_list = [{k: d[k] for k in key_list} for d in json_list]

# Flatten each record into a single-level dict ('.'-separated keys) and
# convert to a data frame
json_list_flattened = (flatten(d, '.') for d in json_list)
df = pandas.DataFrame(json_list_flattened)

# Drop unwanted columns
df.drop(df.filter(regex='identities.0.favorites|identities.0.likes').columns, axis=1, inplace=True)

# Export to CSV in the same directory with the original file name.
# to_csv returns None when given a path, so the result is not bound.
df.to_csv(file_path + r'.csv', sep=',', encoding='utf-8', index=False, header=True)
在网上查找类似问题后,我发现似乎可以利用 ijson 来逐行解析大型 JSON 文件,而不是一次性读入整个文件。您还会在我的代码中看到,我只提取某些 JSON 键(created、emails、identities)以转换为 CSV。
我不确定实现它的最佳方式,但我认为它在我的代码开头会是这样的:
# Tentative starting point from the question: read the file through ijson
# instead of json.load. The `...` lines stand for the rest of the script.
import ijson
...
# NOTE(review): ijson.parse presumably yields low-level parse events rather
# than whole records, so json_list would not be a list of dicts here —
# confirm against the ijson documentation.
json_list = ijson.parse(open(file_path + '.json', 'r', encoding='utf-8', errors='ignore'))
...
我无法共享我正在使用的完整 JSON 文件,因为它包含敏感信息。但是您可以使用下面的示例数据进行测试。这只是一条记录,但我想向您展示我正在使用的数据结构。
JSON样本
[
{
"callId": "abc123",
"errorCode": 0,
"apiVersion": 2,
"statusCode": 200,
"statusReason": "OK",
"time": "2020-12-14T12:00:32.744Z",
"registeredTimestamp": 1417731582000,
"UID": "_guid_abc123==",
"created": "2014-12-04T22:19:42.894Z",
"createdTimestamp": 1417731582000,
"data": {},
"preferences": {},
"emails": {
"verified": [],
"unverified": []
},
"identities": [
{
"provider": "facebook",
"providerUID": "123",
"allowsLogin": true,
"isLoginIdentity": true,
"isExpiredSession": true,
"lastUpdated": "2014-12-04T22:26:37.002Z",
"lastUpdatedTimestamp": 1417731997002,
"oldestDataUpdated": "2014-12-04T22:26:37.002Z",
"oldestDataUpdatedTimestamp": 1417731997002,
"firstName": "John",
"lastName": "Doe",
"nickname": "John Doe",
"profileURL": "https://www.facebook.com/John.Doe",
"age": 50,
"birthDay": 31,
"birthMonth": 12,
"birthYear": 1969,
"city": "City, State",
"education": [
{
"school": "High School Name",
"schoolType": "High School",
"degree": null,
"startYear": 0,
"fieldOfStudy": null,
"endYear": 0
}
],
"educationLevel": "High School",
"favorites": {
"music": [
{
"name": "Music 1",
"id": "123",
"category": "Musician/band"
},
{
"name": "Music 2",
"id": "123",
"category": "Musician/band"
}
],
"movies": [
{
"name": "Movie 1",
"id": "123",
"category": "Movie"
},
{
"name": "Movie 2",
"id": "123",
"category": "Movie"
}
],
"television": [
{
"name": "TV 1",
"id": "123",
"category": "Tv show"
}
]
},
"followersCount": 0,
"gender": "m",
"hometown": "City, State",
"languages": "English",
"likes": [
{
"name": "Like 1",
"id": "123",
"time": "2014-10-31T23:52:53.0000000Z",
"category": "TV",
"timestamp": "1414799573"
},
{
"name": "Like 2",
"id": "123",
"time": "2014-09-16T08:11:35.0000000Z",
"category": "Music",
"timestamp": "1410855095"
}
],
"locale": "en_US",
"name": "John Doe",
"photoURL": "https://graph.facebook.com/123/picture?type=large",
"timezone": "-8",
"thumbnailURL": "https://graph.facebook.com/123/picture?type=square",
"username": "john.doe",
"verified": "true",
"work": [
{
"companyID": null,
"isCurrent": null,
"endDate": null,
"company": "Company Name",
"industry": null,
"title": "Company Title",
"companySize": null,
"startDate": "2010-12-31T00:00:00"
}
]
}
],
"isActive": true,
"isLockedOut": false,
"isRegistered": true,
"isVerified": false,
"lastLogin": "2014-12-04T22:26:33.002Z",
"lastLoginTimestamp": 1417731993000,
"lastUpdated": "2014-12-04T22:19:42.769Z",
"lastUpdatedTimestamp": 1417731582769,
"loginProvider": "facebook",
"loginIDs": {
"emails": [],
"unverifiedEmails": []
},
"rbaPolicy": {
"riskPolicyLocked": false
},
"oldestDataUpdated": "2014-12-04T22:19:42.894Z",
"oldestDataUpdatedTimestamp": 1417731582894,
"registered": "2014-12-04T22:19:42.956Z",
"regSource": "",
"socialProviders": "facebook"
}
]
预期结果
因此,我希望我现有的可用代码在更大的 JSON 文件上运行时不会崩溃并抛出 MemoryError。
一般来说,如果您想使用 ijson 来减少内存开销,则需要注意不要让其余代码也引入开销。最好的情况是:您把 JSON 文档中的每个条目转换成生成的 CSV 文件中的一行,并以迭代方式逐条处理。这意味着既不再使用列表推导式(它一次作用于所有数据),也不再使用 DataFrame(它同样会一次保存所有内容)。
关于 ijson 的使用:一个简便的解决方案是使用 ijson.items 遍历 JSON 文档中的每个对象。在我上面描述的最佳情况下,您将删除不必要的字段,并将该对象转换为 CSV 行。类似于:
# Stream the document object by object instead of loading it all at once.
with open(path, 'rb') as fin:
    # 'item' selects each element of the top-level JSON array.
    for obj in ijson.items(fin, 'item'):
        # Placeholder: drop the unneeded fields and emit one CSV row.
        filter_object_and_turn_it_into_a_cvs_line(obj)
如果出于某种原因您仍然确实需要继续使用 DataFrame,您至少可以尝试在将数据传递给 DataFrame 之前始终以生成器表达式的形式进行数据清理,以避免额外的数据副本(但请记住,最终您无论如何都会把大部分数据加载到内存中):
# Feed a DataFrame from a chain of generators so no intermediate list of
# records is built before pandas consumes them.
with open(path, 'rb') as fin:
    # 'item' selects each element of the top-level JSON array.
    json_list = ijson.items(fin, 'item')
    key_list = ['created', 'emails', 'identities']
    json_list = ({k: d[k] for k in key_list} for d in json_list)  # this was a list comprehension in the original code
    flattened = (flatten(d, '.') for d in json_list)
    # Build the frame while the file is still open: the generators above are
    # lazy and only read from `fin` when pandas iterates them.
    df = pandas.DataFrame(flattened)
我正在使用下面的代码将几个 JSON 文件转换为 CSV,一切正常。但是,当我尝试转换更大的 JSON 文件(大小约为 2-4 GB 甚至更大)时,它会抛出 MemoryError。
当前代码
import json, pandas
from flatten_json import flatten
# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'

# Open and load the JSON file. The context manager closes the handle as soon
# as parsing finishes. json.load still builds the whole document in memory,
# which is where very large inputs run out of memory.
with open(file_path + '.json', 'r', encoding='utf-8', errors='ignore') as json_file:
    json_list = json.load(json_file)

# Extract data from the defined key names
key_list = ['created', 'emails', 'identities']
json_list = [{k: d[k] for k in key_list} for d in json_list]

# Flatten each record into a single-level dict ('.'-separated keys) and
# convert to a data frame
json_list_flattened = (flatten(d, '.') for d in json_list)
df = pandas.DataFrame(json_list_flattened)

# Drop unwanted columns
df.drop(df.filter(regex='identities.0.favorites|identities.0.likes').columns, axis=1, inplace=True)

# Export to CSV in the same directory with the original file name.
# to_csv returns None when given a path, so the result is not bound.
df.to_csv(file_path + r'.csv', sep=',', encoding='utf-8', index=False, header=True)
在网上查找类似问题后,我发现似乎可以利用 ijson 来逐行解析大型 JSON 文件,而不是一次性读入整个文件。您还会在我的代码中看到,我只提取某些 JSON 键(created、emails、identities)以转换为 CSV。
我不确定实现它的最佳方式,但我认为它在我的代码开头会是这样的:
# Tentative starting point from the question: read the file through ijson
# instead of json.load. The `...` lines stand for the rest of the script.
import ijson
...
# NOTE(review): ijson.parse presumably yields low-level parse events rather
# than whole records, so json_list would not be a list of dicts here —
# confirm against the ijson documentation.
json_list = ijson.parse(open(file_path + '.json', 'r', encoding='utf-8', errors='ignore'))
...
我无法共享我正在使用的完整 JSON 文件,因为它包含敏感信息。但是您可以使用下面的示例数据进行测试。这只是一条记录,但我想向您展示我正在使用的数据结构。
JSON样本
[
{
"callId": "abc123",
"errorCode": 0,
"apiVersion": 2,
"statusCode": 200,
"statusReason": "OK",
"time": "2020-12-14T12:00:32.744Z",
"registeredTimestamp": 1417731582000,
"UID": "_guid_abc123==",
"created": "2014-12-04T22:19:42.894Z",
"createdTimestamp": 1417731582000,
"data": {},
"preferences": {},
"emails": {
"verified": [],
"unverified": []
},
"identities": [
{
"provider": "facebook",
"providerUID": "123",
"allowsLogin": true,
"isLoginIdentity": true,
"isExpiredSession": true,
"lastUpdated": "2014-12-04T22:26:37.002Z",
"lastUpdatedTimestamp": 1417731997002,
"oldestDataUpdated": "2014-12-04T22:26:37.002Z",
"oldestDataUpdatedTimestamp": 1417731997002,
"firstName": "John",
"lastName": "Doe",
"nickname": "John Doe",
"profileURL": "https://www.facebook.com/John.Doe",
"age": 50,
"birthDay": 31,
"birthMonth": 12,
"birthYear": 1969,
"city": "City, State",
"education": [
{
"school": "High School Name",
"schoolType": "High School",
"degree": null,
"startYear": 0,
"fieldOfStudy": null,
"endYear": 0
}
],
"educationLevel": "High School",
"favorites": {
"music": [
{
"name": "Music 1",
"id": "123",
"category": "Musician/band"
},
{
"name": "Music 2",
"id": "123",
"category": "Musician/band"
}
],
"movies": [
{
"name": "Movie 1",
"id": "123",
"category": "Movie"
},
{
"name": "Movie 2",
"id": "123",
"category": "Movie"
}
],
"television": [
{
"name": "TV 1",
"id": "123",
"category": "Tv show"
}
]
},
"followersCount": 0,
"gender": "m",
"hometown": "City, State",
"languages": "English",
"likes": [
{
"name": "Like 1",
"id": "123",
"time": "2014-10-31T23:52:53.0000000Z",
"category": "TV",
"timestamp": "1414799573"
},
{
"name": "Like 2",
"id": "123",
"time": "2014-09-16T08:11:35.0000000Z",
"category": "Music",
"timestamp": "1410855095"
}
],
"locale": "en_US",
"name": "John Doe",
"photoURL": "https://graph.facebook.com/123/picture?type=large",
"timezone": "-8",
"thumbnailURL": "https://graph.facebook.com/123/picture?type=square",
"username": "john.doe",
"verified": "true",
"work": [
{
"companyID": null,
"isCurrent": null,
"endDate": null,
"company": "Company Name",
"industry": null,
"title": "Company Title",
"companySize": null,
"startDate": "2010-12-31T00:00:00"
}
]
}
],
"isActive": true,
"isLockedOut": false,
"isRegistered": true,
"isVerified": false,
"lastLogin": "2014-12-04T22:26:33.002Z",
"lastLoginTimestamp": 1417731993000,
"lastUpdated": "2014-12-04T22:19:42.769Z",
"lastUpdatedTimestamp": 1417731582769,
"loginProvider": "facebook",
"loginIDs": {
"emails": [],
"unverifiedEmails": []
},
"rbaPolicy": {
"riskPolicyLocked": false
},
"oldestDataUpdated": "2014-12-04T22:19:42.894Z",
"oldestDataUpdatedTimestamp": 1417731582894,
"registered": "2014-12-04T22:19:42.956Z",
"regSource": "",
"socialProviders": "facebook"
}
]
预期结果
因此,我希望我现有的可用代码在更大的 JSON 文件上运行时不会崩溃并抛出 MemoryError。
一般来说,如果您想使用 ijson 来减少内存开销,则需要注意不要让其余代码也引入开销。最好的情况是:您把 JSON 文档中的每个条目转换成生成的 CSV 文件中的一行,并以迭代方式逐条处理。这意味着既不再使用列表推导式(它一次作用于所有数据),也不再使用 DataFrame(它同样会一次保存所有内容)。
关于 ijson 的使用:一个简便的解决方案是使用 ijson.items 遍历 JSON 文档中的每个对象。在我上面描述的最佳情况下,您将删除不必要的字段,并将该对象转换为 CSV 行。类似于:
# Stream the document object by object instead of loading it all at once.
with open(path, 'rb') as fin:
    # 'item' selects each element of the top-level JSON array.
    for obj in ijson.items(fin, 'item'):
        # Placeholder: drop the unneeded fields and emit one CSV row.
        filter_object_and_turn_it_into_a_cvs_line(obj)
如果出于某种原因您仍然确实需要继续使用 DataFrame,您至少可以尝试在将数据传递给 DataFrame 之前始终以生成器表达式的形式进行数据清理,以避免额外的数据副本(但请记住,最终您无论如何都会把大部分数据加载到内存中):
# Feed a DataFrame from a chain of generators so no intermediate list of
# records is built before pandas consumes them.
with open(path, 'rb') as fin:
    # 'item' selects each element of the top-level JSON array.
    json_list = ijson.items(fin, 'item')
    key_list = ['created', 'emails', 'identities']
    json_list = ({k: d[k] for k in key_list} for d in json_list)  # this was a list comprehension in the original code
    flattened = (flatten(d, '.') for d in json_list)
    # Build the frame while the file is still open: the generators above are
    # lazy and only read from `fin` when pandas iterates them.
    df = pandas.DataFrame(flattened)