How can I make this code consume less RAM?
I have these two functions, and when I run them my kernel dies very quickly. What can I do to prevent that? It happens after about 10 files have been appended to the dataframe. Unfortunately the JSON files are quite large (about 150 MB each, and there are several dozen of them) and I don't know how to join them together.
import os
import pandas as pd
from pandas.io.json import json_normalize
import json

def filtering_nodes(df):
    id_list = df.index.tolist()
    print("Dropping rows without 4 nodes and 3 members...")
    for x in id_list:
        if len(df['Nodes'][x]) != 4 and len(df['Members'][x]) != 3:
            df = df.drop(x)
    print("Converting to csv...")
    df.to_csv("whole_df.csv", sep='\t')
    return df
def merge_JsonFiles(filename):
    result = list()
    cnt = 0
    df_all = None
    data_all = None
    for f1 in filename:
        print("Appending file: ", f1)
        with open('../../data' + f1, 'r') as infile:
            data_all = json.loads(infile.read())
        if cnt == 0:
            df_all = pd.json_normalize(data_all, record_path=['List2D'], max_level=2, sep="-")
        else:
            df_all = df_all.append(pd.json_normalize(data_all, record_path=['List2D'], max_level=2, sep="-"), ignore_index=True)
        cnt += 1
    return df_all
files = os.listdir('../../data')
df_all_test = merge_JsonFiles(files)
df_all_test_drop = filtering_nodes(df_all_test)
EDIT: Thanks to @jlandercy's answer, I did this:
def merging_to_csv():
    for path in pathlib.Path("../../data/loads_data/Dane/hilti/").glob("*.json"):
        # Open source file one by one:
        with path.open() as handler:
            df = pd.json_normalize(json.load(handler), record_path=['List2D'])
            # Identify rows to drop (boolean indexing):
            q = (df["Nodes"] != 4) & (df["Members"] != 3)
            # Inplace drop (no extra copy in RAM):
            df.drop(q, inplace=True)
            # Append data to disk instead of RAM:
            df.to_csv("output.csv", mode="a", header=False)

merging_to_csv()
and I got this kind of error:
KeyError Traceback (most recent call last)
<ipython-input-55-cf18265ca50e> in <module>
----> 1 merging_to_csv()
<ipython-input-54-698c67461b34> in merging_to_csv()
51 q = (df["Nodes"] != 4) & (df["Members"] != 3)
52 # Inplace drop (no extra copy in RAM):
---> 53 df.drop(q, inplace=True)
54 # Append data to disk instead of RAM:
55 df.to_csv("output.csv", mode="a", header=False)
/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4906 level=level,
4907 inplace=inplace,
-> 4908 errors=errors,
4909 )
4910
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4148 for axis, labels in axes.items():
4149 if labels is not None:
-> 4150 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4151
4152 if inplace:
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
4183 new_axis = axis.drop(labels, level=level, errors=errors)
4184 else:
-> 4185 new_axis = axis.drop(labels, errors=errors)
4186 result = self.reindex(**{axis_name: new_axis})
4187
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
6016 if mask.any():
6017 if errors != "ignore":
-> 6018 raise KeyError(f"{labels[mask]} not found in axis")
6019 indexer = indexer[~mask]
6020 return self.delete(indexer)
KeyError: '[ True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True] not found in axis'
What is wrong here? I have uploaded two minimal JSON files here:
https://drive.google.com/drive/folders/1xlC-kK6NLGr0isdy1Ln2tzGmel45GtPC?usp=sharing
You are facing multiple issues in your original approach:
- multiple copies of the dataframe: df = df.drop(...);
- the whole dataset held in RAM because of append;
- no for loop is needed to filter rows; use boolean indexing instead (see the short sketch right after this list).
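To make that last point concrete, here is a minimal sketch. The toy dataframe and its values are invented purely for illustration; only the column names Nodes and Members come from your data, and they hold lists just like the normalized JSON does. It also shows why the mask belongs in [] or .loc rather than in df.drop(), which expects index labels; that mismatch is most likely what produced the KeyError in your edit.

import pandas as pd

# Tiny made-up frame standing in for one normalized JSON file:
df = pd.DataFrame({
    "Nodes": [[1, 2, 3, 4], [1, 2, 3]],
    "Members": [[10, 20, 30], [10, 20]],
})

# One vectorized boolean mask replaces the whole for loop:
q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)

# A boolean mask selects rows via [] / .loc; it is not a list of index
# labels, so passing it to df.drop() raises a KeyError:
kept = df[~q]
print(kept)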
Here is a baseline snippet that solves the problem, based on the data samples you provided:
import json
import pathlib
import pandas as pd

# Iterate source files:
for path in pathlib.Path(".").glob("result*.json"):
    # Open source files one by one:
    with path.open() as handler:
        # Normalize JSON model:
        df = pd.json_normalize(json.load(handler), record_path=['List2D'], max_level=2, sep="-")
        # Apply len to the list fields to identify rows to drop (boolean indexing):
        q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)
        # Filter and append data to disk instead of RAM:
        df.loc[~q, :].to_csv("output.csv", mode="a", header=False)
It loads the files into RAM one at a time and appends the filtered rows to disk instead of keeping them in RAM. With these fixes, RAM usage is drastically reduced and should stay bounded by roughly twice the size of your largest JSON file.
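One small follow-up on the snippet above: because it always passes header=False, output.csv ends up with no column names. A possible variant (a sketch, not part of the original answer; the out variable is just an illustrative name) removes any stale output first and writes the header only for the first file:

import json
import pathlib
import pandas as pd

out = pathlib.Path("output.csv")
if out.exists():
    out.unlink()  # start from a clean file so the header logic below holds

for path in pathlib.Path(".").glob("result*.json"):
    with path.open() as handler:
        df = pd.json_normalize(json.load(handler), record_path=['List2D'], max_level=2, sep="-")
    q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)
    # Write the column names only while output.csv does not exist yet:
    df.loc[~q, :].to_csv(out, mode="a", header=not out.exists(), index=False)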