python 多处理中的笛卡尔积与 tqdm 进度条
cartesian product in python multiprocessing with tqdm progress bar
我找到了一些使用 tqdm 和 Python 多处理创建进度条的代码,它使用整数来更新进度条。我将其更改为将其用作文件循环,但 lambda 回调创建了一个带有文件路径的笛卡尔积,这让我的机器 运行 内存不足,并存有大量文件。我试图在其他问题中找到解决方案,但没有找到答案。
我该怎么做才能避免 async_result 中的笛卡尔积(以及内存不足),但仍会创建进度条?
import glob
import jpylyzer
import multiprocessing as mp
from tqdm import tqdm
cores=2
src="/path/to/jp2/files"
def f_process_file(filename):
now=time.strftime("%Y-%m-%d %H:%M:%S")
try:
result = jpylyzer.checkOneFile(filename)
status=result.findtext('isValid')
except Exception as ex:
print("oopsie")
return filename, status, now
# Find JP2 files in the source directory case insensitively
files = [f for f in glob.iglob(src + '/**/*.[jJ][pP]2', recursive=True)]
filecount=len(files)
# Start a multiprocessing pool
pool = mp.Pool(processes = cores)
# Define a progress bar
pbar = tqdm(total=filecount)
# process all files asynchronously and do callback for the progress bar
async_result = [pool.map_async(f_process_file, files, callback=lambda _: pbar.update(1)) for file in files]
# magic for the progress barr
results = [p.get() for p in async_result]
pool.close()
pool.join()
for i in range(len(results)):
if results[i][i][1] != 'True':
print(results[i][i])
我通过从 async_result 中删除 [] 找到了答案,删除了 回调=lambda 并在动态启动之前为进度条声明一个全局变量 pbar
#!/usr/bin/env python3
import glob
from tqdm import tqdm
import time, sys
def f_process_file(filename):
fnctn=sys._getframe().f_code.co_name
now=time.strftime("%Y-%m-%d %H:%M:%S")
try:
# Do some stuff here
result = 'isValid' # for testing purpose declare a value
status = 'True' # for testing purpose declare a value
except Exception as ex:
print("failure in {}".format(fnctn))
#update the progress bar
time.sleep(0.005)
pbar.update(1)
return filename, status, now
def f_doall(src):
files = [f for f in glob.iglob(src + '/**/*.[jJ][pP]2', recursive=True)]
filecount=len(files)
print(filecount)
#Declare a global variable for the progress bar
global pbar
# Initiate the progree bar
pbar = tqdm(total=filecount)
for f in files:
f_process_file(f)
def main():
src="/path/to/images"
f_doall(src)
if __name__ == "__main__":
main()
现在我可以扩展我的代码以使用多处理池
我找到了一些使用 tqdm 和 Python 多处理创建进度条的代码,它使用整数来更新进度条。我将其更改为将其用作文件循环,但 lambda 回调创建了一个带有文件路径的笛卡尔积,这让我的机器 运行 内存不足,并存有大量文件。我试图在其他问题中找到解决方案,但没有找到答案。 我该怎么做才能避免 async_result 中的笛卡尔积(以及内存不足),但仍会创建进度条?
import glob
import jpylyzer
import multiprocessing as mp
from tqdm import tqdm
cores=2
src="/path/to/jp2/files"
def f_process_file(filename):
now=time.strftime("%Y-%m-%d %H:%M:%S")
try:
result = jpylyzer.checkOneFile(filename)
status=result.findtext('isValid')
except Exception as ex:
print("oopsie")
return filename, status, now
# Find JP2 files in the source directory case insensitively
files = [f for f in glob.iglob(src + '/**/*.[jJ][pP]2', recursive=True)]
filecount=len(files)
# Start a multiprocessing pool
pool = mp.Pool(processes = cores)
# Define a progress bar
pbar = tqdm(total=filecount)
# process all files asynchronously and do callback for the progress bar
async_result = [pool.map_async(f_process_file, files, callback=lambda _: pbar.update(1)) for file in files]
# magic for the progress barr
results = [p.get() for p in async_result]
pool.close()
pool.join()
for i in range(len(results)):
if results[i][i][1] != 'True':
print(results[i][i])
我通过从 async_result 中删除 [] 找到了答案,删除了 回调=lambda 并在动态启动之前为进度条声明一个全局变量 pbar
#!/usr/bin/env python3
import glob
from tqdm import tqdm
import time, sys
def f_process_file(filename):
fnctn=sys._getframe().f_code.co_name
now=time.strftime("%Y-%m-%d %H:%M:%S")
try:
# Do some stuff here
result = 'isValid' # for testing purpose declare a value
status = 'True' # for testing purpose declare a value
except Exception as ex:
print("failure in {}".format(fnctn))
#update the progress bar
time.sleep(0.005)
pbar.update(1)
return filename, status, now
def f_doall(src):
files = [f for f in glob.iglob(src + '/**/*.[jJ][pP]2', recursive=True)]
filecount=len(files)
print(filecount)
#Declare a global variable for the progress bar
global pbar
# Initiate the progree bar
pbar = tqdm(total=filecount)
for f in files:
f_process_file(f)
def main():
src="/path/to/images"
f_doall(src)
if __name__ == "__main__":
main()
现在我可以扩展我的代码以使用多处理池