python 列表中的内存泄漏问题
Memory leakage issue in python list
身份(identities)列表中包含大量图像,约 57000 张。现在,我借助 itertools.product()
来创建负样本列表。这种做法会把整个列表都存储在内存中,开销非常大,我的系统在大约 4 分钟后就挂起了。
如何优化下面的代码,避免占用过多内存?
# First attempt from the question: pair every image of identity i with every
# image of each later identity j and collect the pairs as negative samples.
# NOTE(review): indentation was lost in the extracted source; the nesting
# below (in particular the level at which print() runs) is a reconstruction
# and should be confirmed against the original post.
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        cross_product = itertools.product(samples_list[i], samples_list[j])
        # list() materialises all pairs of this product at once.
        cross_product = list(cross_product)
        for file_x, file_y in cross_product:
            negatives.append([file_x, file_y])
        print(len(negatives))

# All collected pairs become one DataFrame, labelled "No" ...
negatives = pd.DataFrame(negatives, columns=["file_x", "file_y"])
negatives["decision"] = "No"
# ... and are then sampled down to as many rows as `positives` has.
negatives = negatives.sample(positives.shape[0])
内存9.30会越来越高,一度系统完全挂了
我也实现了下面的答案,并根据他的答案修改了代码。
# Second attempt from the question: the product is iterated lazily, but every
# pair is still appended to `negatives`, so that list keeps growing.
# NOTE(review): indentation was lost in the extracted source; the level at
# which print() runs is a reconstruction — confirm against the original post.
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        for file_x, file_y in itertools.product(samples_list[i], samples_list[j]):
            negatives.append([file_x, file_y])
        print(len(negatives))

negatives = pd.DataFrame(negatives, columns=["file_x", "file_y"])
negatives["decision"] = "No"
第三版代码
生成的 CSV 文件太大,打开它时会弹出警告,提示程序无法加载全部内容。程序运行大约十分钟后,系统又会彻底挂起。
# Third attempt from the question: each pair is appended to a CSV file and,
# in addition, still collected in the in-memory `negatives` list.
# NOTE(review): indentation was lost in the extracted source; the nesting is
# a reconstruction — confirm against the original post.
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        for file_x, file_y in itertools.product(samples_list[i], samples_list[j]):
            # The file is reopened in append mode for every single row.
            with open('/home/khawar/deepface/tests/results.csv', 'a+') as csvfile:
                csv.writer(csvfile).writerow([file_x, file_y])
            negatives.append([file_x, file_y])

negatives = pd.DataFrame(negatives, columns=["file_x", "file_y"])
negatives["decision"] = "No"
negatives = negatives.sample(positives.shape[0])
内存截图。
itertools 的 product 返回的是一个惰性迭代器(类似生成器),它本身不会把整个列表存储在内存中;问题出在下一行 cross_product = list(cross_product),
这一行把迭代器转换成了列表对象,从而把全部数据都存进了内存。
生成器的思路是:不要像调用 list(itertools.product(samples_list[i], samples_list[j])) 那样一次性完成所有计算,
而是逐个生成结果:
试试这样:
# Skeleton from the answer: walk every pair of identities (i < j) and consume
# the product of their samples one pair at a time, without calling list().
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        for cross_sample in itertools.product(samples_list[i], samples_list[j]):
            # do something ...
            pass  # placeholder: a loop body holding only a comment is a SyntaxError
我想我找到了你的问题:你先把所有样本都追加到 negatives 列表中,所以内存占用会越来越高;你需要实时写出每一行,一次只写一行。
你的数据是csv吧?所以你可以这样做:
import csv

# Stream every negative pair straight to disk instead of collecting it in a
# list. The file is opened once for the whole run: reopening it for each row
# (as the earlier version did) costs one open/close per pair.
# newline='' is what the csv module expects, so row terminators are not
# translated a second time by the text layer.
with open('results.csv', 'a+', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for i in range(len(idendities) - 1):
        for j in range(i + 1, len(idendities)):
            # writerows() pulls pairs from the lazy product one at a time,
            # so no full list of pairs is ever built.
            writer.writerows(itertools.product(samples_list[i], samples_list[j]))
想法是实时写入行
另请参阅这个链接:how to write the real time data into csv file in python
@9mat、@cybot 和这些问题 , how to write the real time data into csv file in python
您可以创建一个 class 来表示多个列表的乘积,其行为类似于列表但不存储任何组合。这只会按需“组合”项目。
class ProductList:
    """Read-only, list-like view over the Cartesian product of sequences.

    No combination is stored: each tuple is assembled on demand from the
    component sequences, in the same order ``itertools.product`` would
    yield them (the last component varies fastest).

    Each component must support ``len()``, integer indexing, ``in`` and
    ``.index()`` (for example ``list``, ``tuple`` or ``range``).
    """

    def __init__(self, *data):
        self.data = data
        # Total number of combinations; 1 when no component is given,
        # 0 as soon as any component is empty.
        self.size = 1
        for d in self.data:
            self.size *= len(d)

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        """Return the tuple at ``index``, or a list of tuples for a slice.

        Raises:
            IndexError: if an integer ``index`` is outside
                ``-len(self) <= index < len(self)``.
        """
        if isinstance(index, slice):
            # range(self.size) resolves the slice lazily and, unlike
            # len(self), also works when the size exceeds sys.maxsize.
            return [*map(self.__getitem__, range(self.size)[index])]
        if index < 0:
            index += self.size
        # Without this check an out-of-range index silently wraps around,
        # and an empty component ends in a ZeroDivisionError.
        if not 0 <= index < self.size:
            raise IndexError("ProductList index out of range")
        # Mixed-radix decoding, starting from the fastest-varying component.
        picked = []
        for d in reversed(self.data):
            index, i = divmod(index, len(d))
            picked.append(d[i])
        picked.reverse()
        return tuple(picked)

    def __iter__(self):
        for i in range(self.size):
            yield self[i]

    def __contains__(self, value):
        return len(value) == len(self.data) \
            and all(v in d for v, d in zip(value, self.data))

    def index(self, value):
        """Return the position of the combination ``value``.

        Raises:
            ValueError: if ``value`` has the wrong number of items, or
                (raised by the component's own ``.index``) if an item is
                missing from its component.
        """
        # zip() would silently truncate a tuple of the wrong length and
        # return a meaningless position.
        if len(value) != len(self.data):
            raise ValueError("%r is not in ProductList" % (value,))
        index = 0
        for v, d in zip(value, self.data):
            index = index * len(d) + d.index(v)
        return index
用法:
# Usage transcript: statements are followed by the output they print.
# The three ranges hold 1234, 4444 and 3423 items respectively.
p = ProductList(range(1234),range(1234,5678),range(5678,9101))
# A slice returns a list holding only the requested tuples (ten here).
print(*p[:10],sep="\n")
(0, 1234, 5678)
(0, 1234, 5679)
(0, 1234, 5680)
(0, 1234, 5681)
(0, 1234, 5682)
(0, 1234, 5683)
(0, 1234, 5684)
(0, 1234, 5685)
(0, 1234, 5686)
(0, 1234, 5687)
len(p) # 18771376008
p[27] # (0, 1234, 5705)
# The four tuples printed by this loop are listed right after it.
for c in p[103350956:103350960]: print(c)
(6, 4763, 5995)
(6, 4763, 5996)
(6, 4763, 5997)
(6, 4763, 5998)
# index() is the inverse of integer indexing.
p.index((6, 4763, 5995)) # 103350956
p[103350956] # (6, 4763, 5995)
# Membership is tested per component, so item order matters.
(6, 4763, 5995) in p # True
(5995, 4763, 6) in p # False
实际上,生成的样本对一直保存在内存中,这就是内存占用越来越高的原因。
您必须修改生成样本对的代码,让它们在用完之后立即从内存中释放。
前一码:
# Quoted "before" version: every cross-identity pair is materialised and
# appended to `negatives`, so all pairs stay in memory.
# NOTE(review): indentation was lost in the extracted source; the nesting
# below (in particular the level at which print() runs) is a reconstruction
# and should be confirmed against the original post.
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        cross_product = itertools.product(samples_list[i], samples_list[j])
        # list() materialises all pairs of this product at once.
        cross_product = list(cross_product)
        for file_x, file_y in cross_product:
            negatives.append([file_x, file_y])
        print(len(negatives))

negatives = pd.DataFrame(negatives, columns=["file_x", "file_y"])
negatives["decision"] = "No"
内存高效的代码(Memory Efficient Code):把生成的样本对保存到 CSV 文件中,第二次运行时无需重新生成。
import random

# Build (or reload) the combined positives/negatives table.
# The result is cached in positives_negatives.csv so the expensive pair
# generation only runs when that file does not exist yet.
# NOTE(review): indentation was lost in the extracted source; the nesting of
# the else-branch is a reconstruction — confirm against the original post.
samples_list = list(identities.values())
# Kept so both names exist on either branch; code outside this snippet may
# read them (cannot be verified from here).
negatives = pd.DataFrame()

cache_path = Path("positives_negatives.csv")
if cache_path.exists():
    df = pd.read_csv(cache_path)
else:
    # Reservoir sampling: keep a uniform random sample of exactly as many
    # negative pairs as there are positives while streaming over all pairs.
    # Memory stays proportional to the sample size instead of the number of
    # pairs. It also avoids DataFrame.append, which copies the whole frame
    # on every call and was removed in pandas 2.0.
    wanted = positives.shape[0]
    reservoir = []
    seen = 0
    for first, second in tqdm(itertools.combinations(identities.values(), 2), desc="Negatives"):
        for pair in itertools.product(first, second):
            seen += 1
            if len(reservoir) < wanted:
                reservoir.append(pair)
            else:
                # Replace a kept pair with probability wanted / seen.
                slot = random.randrange(seen)
                if slot < wanted:
                    reservoir[slot] = pair
    # DataFrame.sample() returned rows in random order; mirror that here.
    random.shuffle(reservoir)
    # Unlike DataFrame.sample(), this keeps all pairs rather than raising
    # when fewer than `wanted` negatives exist.
    negatives = pd.DataFrame(reservoir, columns=["file_x", "file_y"])
    negatives["decision"] = "No"
    df = pd.concat([positives, negatives]).reset_index(drop=True)
    df.to_csv(cache_path, index=False)
身份(identities)列表中包含大量图像,约 57000 张。现在,我借助 itertools.product()
来创建负样本列表。这种做法会把整个列表都存储在内存中,开销非常大,我的系统在大约 4 分钟后就挂起了。
如何优化下面的代码,避免占用过多内存?
# First attempt from the question: pair every image of identity i with every
# image of each later identity j and collect the pairs as negative samples.
# NOTE(review): indentation was lost in the extracted source; the nesting
# below (in particular the level at which print() runs) is a reconstruction
# and should be confirmed against the original post.
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        cross_product = itertools.product(samples_list[i], samples_list[j])
        # list() materialises all pairs of this product at once.
        cross_product = list(cross_product)
        for file_x, file_y in cross_product:
            negatives.append([file_x, file_y])
        print(len(negatives))

# All collected pairs become one DataFrame, labelled "No" ...
negatives = pd.DataFrame(negatives, columns=["file_x", "file_y"])
negatives["decision"] = "No"
# ... and are then sampled down to as many rows as `positives` has.
negatives = negatives.sample(positives.shape[0])
内存9.30会越来越高,一度系统完全挂了
我也实现了下面的答案,并根据他的答案修改了代码。
# Second attempt from the question: the product is iterated lazily, but every
# pair is still appended to `negatives`, so that list keeps growing.
# NOTE(review): indentation was lost in the extracted source; the level at
# which print() runs is a reconstruction — confirm against the original post.
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        for file_x, file_y in itertools.product(samples_list[i], samples_list[j]):
            negatives.append([file_x, file_y])
        print(len(negatives))

negatives = pd.DataFrame(negatives, columns=["file_x", "file_y"])
negatives["decision"] = "No"
第三版代码
生成的 CSV 文件太大,打开它时会弹出警告,提示程序无法加载全部内容。程序运行大约十分钟后,系统又会彻底挂起。
# Third attempt from the question: each pair is appended to a CSV file and,
# in addition, still collected in the in-memory `negatives` list.
# NOTE(review): indentation was lost in the extracted source; the nesting is
# a reconstruction — confirm against the original post.
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        for file_x, file_y in itertools.product(samples_list[i], samples_list[j]):
            # The file is reopened in append mode for every single row.
            with open('/home/khawar/deepface/tests/results.csv', 'a+') as csvfile:
                csv.writer(csvfile).writerow([file_x, file_y])
            negatives.append([file_x, file_y])

negatives = pd.DataFrame(negatives, columns=["file_x", "file_y"])
negatives["decision"] = "No"
negatives = negatives.sample(positives.shape[0])
内存截图。
itertools 的 product 返回的是一个惰性迭代器(类似生成器),它本身不会把整个列表存储在内存中;问题出在下一行 cross_product = list(cross_product),
这一行把迭代器转换成了列表对象,从而把全部数据都存进了内存。
生成器的思路是:不要像调用 list(itertools.product(samples_list[i], samples_list[j])) 那样一次性完成所有计算,
而是逐个生成结果:
试试这样:
# Skeleton from the answer: walk every pair of identities (i < j) and consume
# the product of their samples one pair at a time, without calling list().
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        for cross_sample in itertools.product(samples_list[i], samples_list[j]):
            # do something ...
            pass  # placeholder: a loop body holding only a comment is a SyntaxError
我想我找到了你的问题:你先把所有样本都追加到 negatives 列表中,所以内存占用会越来越高;你需要实时写出每一行,一次只写一行。
你的数据是csv吧?所以你可以这样做:
import csv

# Stream every negative pair straight to disk instead of collecting it in a
# list. The file is opened once for the whole run: reopening it for each row
# (as the earlier version did) costs one open/close per pair.
# newline='' is what the csv module expects, so row terminators are not
# translated a second time by the text layer.
with open('results.csv', 'a+', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for i in range(len(idendities) - 1):
        for j in range(i + 1, len(idendities)):
            # writerows() pulls pairs from the lazy product one at a time,
            # so no full list of pairs is ever built.
            writer.writerows(itertools.product(samples_list[i], samples_list[j]))
想法是实时写入行
另请参阅这个链接:how to write the real time data into csv file in python
@9mat、@cybot 和这些问题
您可以创建一个 class 来表示多个列表的乘积,其行为类似于列表但不存储任何组合。这只会按需“组合”项目。
class ProductList:
    """Read-only, list-like view over the Cartesian product of sequences.

    No combination is stored: each tuple is assembled on demand from the
    component sequences, in the same order ``itertools.product`` would
    yield them (the last component varies fastest).

    Each component must support ``len()``, integer indexing, ``in`` and
    ``.index()`` (for example ``list``, ``tuple`` or ``range``).
    """

    def __init__(self, *data):
        self.data = data
        # Total number of combinations; 1 when no component is given,
        # 0 as soon as any component is empty.
        self.size = 1
        for d in self.data:
            self.size *= len(d)

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        """Return the tuple at ``index``, or a list of tuples for a slice.

        Raises:
            IndexError: if an integer ``index`` is outside
                ``-len(self) <= index < len(self)``.
        """
        if isinstance(index, slice):
            # range(self.size) resolves the slice lazily and, unlike
            # len(self), also works when the size exceeds sys.maxsize.
            return [*map(self.__getitem__, range(self.size)[index])]
        if index < 0:
            index += self.size
        # Without this check an out-of-range index silently wraps around,
        # and an empty component ends in a ZeroDivisionError.
        if not 0 <= index < self.size:
            raise IndexError("ProductList index out of range")
        # Mixed-radix decoding, starting from the fastest-varying component.
        picked = []
        for d in reversed(self.data):
            index, i = divmod(index, len(d))
            picked.append(d[i])
        picked.reverse()
        return tuple(picked)

    def __iter__(self):
        for i in range(self.size):
            yield self[i]

    def __contains__(self, value):
        return len(value) == len(self.data) \
            and all(v in d for v, d in zip(value, self.data))

    def index(self, value):
        """Return the position of the combination ``value``.

        Raises:
            ValueError: if ``value`` has the wrong number of items, or
                (raised by the component's own ``.index``) if an item is
                missing from its component.
        """
        # zip() would silently truncate a tuple of the wrong length and
        # return a meaningless position.
        if len(value) != len(self.data):
            raise ValueError("%r is not in ProductList" % (value,))
        index = 0
        for v, d in zip(value, self.data):
            index = index * len(d) + d.index(v)
        return index
用法:
# Usage transcript: statements are followed by the output they print.
# The three ranges hold 1234, 4444 and 3423 items respectively.
p = ProductList(range(1234),range(1234,5678),range(5678,9101))
# A slice returns a list holding only the requested tuples (ten here).
print(*p[:10],sep="\n")
(0, 1234, 5678)
(0, 1234, 5679)
(0, 1234, 5680)
(0, 1234, 5681)
(0, 1234, 5682)
(0, 1234, 5683)
(0, 1234, 5684)
(0, 1234, 5685)
(0, 1234, 5686)
(0, 1234, 5687)
len(p) # 18771376008
p[27] # (0, 1234, 5705)
# The four tuples printed by this loop are listed right after it.
for c in p[103350956:103350960]: print(c)
(6, 4763, 5995)
(6, 4763, 5996)
(6, 4763, 5997)
(6, 4763, 5998)
# index() is the inverse of integer indexing.
p.index((6, 4763, 5995)) # 103350956
p[103350956] # (6, 4763, 5995)
# Membership is tested per component, so item order matters.
(6, 4763, 5995) in p # True
(5995, 4763, 6) in p # False
实际上,生成的样本对一直保存在内存中,这就是内存占用越来越高的原因。
您必须修改生成样本对的代码,让它们在用完之后立即从内存中释放。
前一码:
# Quoted "before" version: every cross-identity pair is materialised and
# appended to `negatives`, so all pairs stay in memory.
# NOTE(review): indentation was lost in the extracted source; the nesting
# below (in particular the level at which print() runs) is a reconstruction
# and should be confirmed against the original post.
for i in range(len(idendities) - 1):
    for j in range(i + 1, len(idendities)):
        cross_product = itertools.product(samples_list[i], samples_list[j])
        # list() materialises all pairs of this product at once.
        cross_product = list(cross_product)
        for file_x, file_y in cross_product:
            negatives.append([file_x, file_y])
        print(len(negatives))

negatives = pd.DataFrame(negatives, columns=["file_x", "file_y"])
negatives["decision"] = "No"
内存高效的代码(Memory Efficient Code):把生成的样本对保存到 CSV 文件中,第二次运行时无需重新生成。
import random

# Build (or reload) the combined positives/negatives table.
# The result is cached in positives_negatives.csv so the expensive pair
# generation only runs when that file does not exist yet.
# NOTE(review): indentation was lost in the extracted source; the nesting of
# the else-branch is a reconstruction — confirm against the original post.
samples_list = list(identities.values())
# Kept so both names exist on either branch; code outside this snippet may
# read them (cannot be verified from here).
negatives = pd.DataFrame()

cache_path = Path("positives_negatives.csv")
if cache_path.exists():
    df = pd.read_csv(cache_path)
else:
    # Reservoir sampling: keep a uniform random sample of exactly as many
    # negative pairs as there are positives while streaming over all pairs.
    # Memory stays proportional to the sample size instead of the number of
    # pairs. It also avoids DataFrame.append, which copies the whole frame
    # on every call and was removed in pandas 2.0.
    wanted = positives.shape[0]
    reservoir = []
    seen = 0
    for first, second in tqdm(itertools.combinations(identities.values(), 2), desc="Negatives"):
        for pair in itertools.product(first, second):
            seen += 1
            if len(reservoir) < wanted:
                reservoir.append(pair)
            else:
                # Replace a kept pair with probability wanted / seen.
                slot = random.randrange(seen)
                if slot < wanted:
                    reservoir[slot] = pair
    # DataFrame.sample() returned rows in random order; mirror that here.
    random.shuffle(reservoir)
    # Unlike DataFrame.sample(), this keeps all pairs rather than raising
    # when fewer than `wanted` negatives exist.
    negatives = pd.DataFrame(reservoir, columns=["file_x", "file_y"])
    negatives["decision"] = "No"
    df = pd.concat([positives, negatives]).reset_index(drop=True)
    df.to_csv(cache_path, index=False)