How can I prevent multiprocessing.pool from consuming all of my memory?
My multiprocessing pool (8 cores, 16 GB RAM) uses up all of my memory before it has ingested much of the data. I'm running it on a 6 GB dataset.
I've tried the various pool methods, including imap, imap_unordered, apply, map, etc. I've also tried maxtasksperchild, which seems to increase memory usage.
import string
import re
import multiprocessing as mp
from tqdm import tqdm
linkregex = re.compile(r"http\S+")
puncregex = re.compile(r"(?<=\w)[^\s\w](?![^\s\w])")
emojiregex = re.compile(r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])")
sentences = []
def process(item):
    return re.sub(emojiregex, r" ", re.sub(puncregex, "", re.sub(linkregex, "link", item))).lower().split()

if __name__ == '__main__':
    with mp.Pool(8) as pool:
        sentences = list(tqdm(pool.imap_unordered(process, open('scrape/output.txt')), total=52123146))
    print(str(len(sentences)))
    with open("final/word2vectweets.txt", "a+") as out:
        out.write(sentences)
This should produce a list of the processed lines from the file, but it eats through memory far too quickly. An earlier version without mp that processed the file in a plain loop worked fine.
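For reference, this is roughly how I applied maxtasksperchild when I tried it; the value below is just an example rather than the exact number I used, and it reuses the process function and imports from the code above:

# sketch of the maxtasksperchild attempt (illustrative value; reuses process/tqdm from above)
if __name__ == '__main__':
    with mp.Pool(8, maxtasksperchild=1000) as pool:
        sentences = list(tqdm(pool.imap_unordered(process, open('scrape/output.txt')), total=52123146))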
How does this look? Instead of collecting every result into one giant list, it streams the results from imap_unordered and writes each processed line to the output file as it arrives, so only a small buffer of results is held in memory at any time.
import re
import multiprocessing as mp
linkregex = re.compile(r"http\S+")
puncregex = re.compile(r"(?<=\w)[^\s\w](?![^\s\w])")
emojiregex = re.compile(r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])")
def process(item):
    return re.sub(emojiregex, r" ", re.sub(puncregex, "", re.sub(linkregex, "link", item))).lower().split()

# in_file_path and out_file_path are placeholders for your input and output paths
if __name__ == '__main__':
    with mp.Pool() as pool, open(in_file_path, 'r') as file_in, open(out_file_path, 'a') as file_out:
        # write each result as it arrives instead of accumulating them all in a list
        for curr_sentence in pool.imap_unordered(process, file_in, chunksize=1000):
            file_out.write(f'{curr_sentence}\n')
I tested a bunch of chunk sizes and 1000 seemed to be the sweet spot. I'll keep investigating.
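If you want to repeat the chunksize tuning yourself, a minimal timing sketch along these lines works; benchmark_chunksizes is a hypothetical helper, the candidate sizes are arbitrary, and it assumes the process function and imports from above (on the full 6 GB file you would probably time a sample instead):

import time

def benchmark_chunksizes(path, candidates=(100, 500, 1000, 5000)):
    # time one pass over the file for each candidate chunksize
    for size in candidates:
        start = time.perf_counter()
        with mp.Pool() as pool, open(path, 'r') as file_in:
            for _ in pool.imap_unordered(process, file_in, chunksize=size):
                pass  # discard results; only throughput matters here
        print(f'chunksize={size}: {time.perf_counter() - start:.1f}s')

if __name__ == '__main__':
    benchmark_chunksizes('scrape/output.txt')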