我的 Python 代码中的多处理与代码增强
Multiprocessing vs. Code Enhancement in my Python Code
背景: 我有一个脚本可以从 .xlsx
文件中读取 200 多个大纲(组)数据,并将它们写入单独创建的 .xlsx
并适当地命名它们,同时适当地保留大纲。
脚本: 下面的脚本实现了这个:
from openpyxl import load_workbook
def get_client_rows(sheet):
    """Return the 1-based row indices of all client (top-level group) rows.

    Skips the header row and keeps every row whose first cell has no
    alignment indent, i.e. rows that start a new outline group.

    :param sheet: worksheet to scan (openpyxl Worksheet).
    :return: list of row indices, in sheet order.
    """
    # A row is a "client" row when its first cell is not indented.
    # NOTE(review): the original carried an unreachable second return that
    # tested ``row_dimension.outline_level == 0`` instead; that dead code
    # has been removed and is recorded here as the alternative criterion.
    return [
        row[0].row
        for row in sheet.iter_rows(2)  # min_row=2: skip the header row
        if row[0].alignment.indent == 0.0
    ]
def delete_client_block(sheet, start, end):
    """Remove rows ``start`` through ``end`` (both inclusive) from *sheet*,
    dropping their row-dimension entries as well so no stale outline data
    survives the deletion."""
    row = start
    while row <= end:
        sheet.row_dimensions.pop(row, None)
        row += 1
    sheet.delete_rows(start, end - start + 1)
def split_workbook(input_file, output_file):
    """Split the active sheet of *input_file* into one sheet per main group.

    To avoid losing any formatting, the current sheet is copied per client
    and all rows that do not belong to the extracted group are removed.

    :param input_file: path of the workbook to read.
    :param output_file: path the split workbook is saved to.
    """
    workbook = None  # initialized so the finally-clause is safe if loading fails
    try:
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        client_rows = get_client_rows(data_sheet)
        for index, client_row in enumerate(client_rows):
            # Create a new sheet for this client. Excel caps sheet titles at
            # 31 characters, so truncate to 31 (the original [:32] overflows
            # the limit by one).
            client_sheet = workbook.copy_worksheet(data_sheet)
            client_sheet.title = data_sheet.cell(client_row, 1).value[:31]
            print("Creating New Client Workbook")
            # Delete rows after the current client, if any.
            if index < len(client_rows) - 1:
                row_after_client = client_rows[index + 1]
                delete_client_block(
                    client_sheet, row_after_client, client_sheet.max_row
                )
            # Delete rows before the current client, if any.
            if index > 0:
                first_client_row = client_rows[0]
                delete_client_block(
                    client_sheet, first_client_row, client_row - first_client_row + 1
                )
                # Move the left-over row dimensions up so the remaining client
                # block starts right below the header.
                for row_index in list(client_sheet.row_dimensions.keys()):
                    # Skip the header row dimension.
                    if row_index > first_client_row - 1:
                        row_dimension = client_sheet.row_dimensions.pop(row_index)
                        new_index = row_index - client_row + first_client_row
                        row_dimension.index = new_index
                        client_sheet.row_dimensions[new_index] = row_dimension
        # Drop the original data sheet and persist the result.
        del workbook[data_sheet.title]
        workbook.save(output_file)
    finally:
        if workbook is not None:
            workbook.close()
if __name__ == "__main__":
    # Hard-coded paths; adjust as needed before running.
    source_path = "input.xlsx"
    target_path = "output.xlsx"
    split_workbook(source_path, target_path)
我的问题:脚本在处理较小的文件(例如 < 5MB / 10k 行)时工作正常,但是尝试处理较大的文件(例如 50MB 文件 /> 100k 行)我得到以下 MemoryError
:
Traceback (most recent call last):
File "C:/scripts/BillingPull/main.py", line 80, in <module>
split_workbook(input_file, output_file)
File "C:/scripts/BillingPull/main.py", line 41, in split_workbook
client_sheet = workbook.copy_worksheet(data_sheet)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\workbook\workbook.py", line 434, in copy_worksheet
cp.copy_worksheet()
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\copier.py", line 37, in copy_worksheet
self._copy_dimensions()
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\copier.py", line 69, in _copy_dimensions
target[key] = copy(dim)
File "C:\Program Files\Python38\lib\copy.py", line 84, in copy
return copier(x)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\dimensions.py", line 60, in __copy__
cp.__init__(**attrib)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\dimensions.py", line 107, in __init__
super(RowDimension, self).__init__(index, hidden, outlineLevel,
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\dimensions.py", line 45, in __init__
self.outlineLevel = outlineLevel
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\descriptors\base.py", line 68, in __set__
super(Convertible, self).__set__(instance, value)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\descriptors\base.py", line 43, in __set__
super(Typed, self).__set__(instance, value)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\descriptors\base.py", line 24, in __set__
instance.__dict__[self.name] = value
MemoryError
我的环境:我在基于 x64 的 PC 虚拟服务器上运行这个脚本,具有以下规格 -
- 处理器 AMD EPYC 7763 64 核处理器,2445 Mhz,2 核,4 逻辑处理器
- 物理内存:16.0 GB
- 虚拟内存:31.8 GB
- Windows 10
我应该探索使用多线程来优化脚本的运行时间,还是代码优化?
解决方案: 我能够重新审视代码的运行方式并找到优化。具体来说,openpyxl
包含 openpyxl.worksheet._read_only.ReadOnlyWorksheet
。文档 here -
Sometimes, you will need to open or write extremely large XLSX files,
and the common routines in openpyxl won’t be able to handle that load.
Fortunately, there are two modes that enable you to read and write
unlimited amounts of data with (near) constant memory consumption.
修改后的代码:
from openpyxl import load_workbook
from openpyxl import LXML
import time
from copy import copy
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
import logging
import sys
logger = logging.getLogger()


def configure_logging():
    """Attach an INFO-level stdout handler with timestamps to the root logger."""
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
    logger.addHandler(stream_handler)
# Characters Excel forbids in sheet titles. NOTE: the backslash must be
# escaped ("\\") — the unescaped "\" in the original text is a syntax error
# because it escapes the closing quote.
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
# str.translate table mapping every forbidden character to the empty string.
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
def clean_sheet_title(title):
    """Normalize *title* into a valid Excel sheet name.

    Treats None/empty as "", trims surrounding whitespace, strips every
    character Excel forbids, and caps the length at Excel's 31-character limit.
    """
    cleaned = (title or "").strip().translate(INVALID_TITLE_CHAR_MAP)
    return cleaned[:31]
def is_client_row(row, row_dimension):
    """Return True when *row* starts a new client block, i.e. its first cell
    carries no alignment indent.

    ``row_dimension`` is accepted for the alternative outline-level check
    (kept below, disabled) but is unused by the current criterion.
    """
    first_cell = row[0]
    return first_cell.alignment.indent == 0.0
    # Alternative criterion: return row_dimension.outline_level == 0
def create_write_only_cell(source_cell, target_sheet):
    """Clone *source_cell* into a WriteOnlyCell bound to *target_sheet*,
    carrying over its value, data type and the style attributes we keep."""
    clone = WriteOnlyCell(target_sheet, value=source_cell.value)
    clone.data_type = source_cell.data_type
    if source_cell.has_style:
        clone.font = copy(source_cell.font)
        # TODO save one border and use it
        # clone.border = copy(source_cell.border)
        # TODO copy client row
        # clone.fill = copy(source_cell.fill)
        clone.number_format = copy(source_cell.number_format)
        # clone.protection = copy(source_cell.protection)
        clone.alignment = copy(source_cell.alignment)
    return clone
def create_write_only_row(source_row, target_sheet):
    """Clone every cell of *source_row* into write-only cells for *target_sheet*."""
    cells = []
    for source_cell in source_row:
        cells.append(create_write_only_cell(source_cell, target_sheet))
    return cells
def split_workbook(input_file, output_file):
    """Split the active sheet so each client gets its own sheet in a new
    write-only workbook (near-constant memory consumption).

    A row whose first cell is unindented starts a new client block; the
    header row, column dimensions and row dimensions are replicated onto
    each client sheet.

    :param input_file: path of the workbook to read.
    :param output_file: path the split workbook is saved to.
    """
    # Initialized up front so the finally-clause cannot raise NameError
    # when load_workbook itself fails (original referenced unbound names).
    workbook = None
    output_workbook = None
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)
        client_sheet = None
        client_row_index = 2
        processing_client = 0
        rows = data_sheet.rows
        header = next(rows)
        for index, row in enumerate(rows, start=2):
            # TODO implement skip row
            # if skip_row(row) is True:
            #     continue
            row_dimension = data_sheet.row_dimensions[index]
            # Start a new sheet whenever a new client row is found.
            if is_client_row(row, row_dimension):
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index
                # Copy column dimensions onto the new sheet.
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet
                client_sheet.append(create_write_only_row(header, client_sheet))
            # Copy the row dimension, re-indexed so the client block starts
            # right below the header (row 1 is the header on each sheet).
            target_row_index = index - client_row_index + 2
            client_sheet.row_dimensions[target_row_index] = copy(row_dimension)
            # BUGFIX: the original re-bound the worksheet on the *source*
            # index, creating a stray entry and leaving the shifted copy
            # pointing at the old sheet.
            client_sheet.row_dimensions[target_row_index].worksheet = client_sheet
            # Finally copy the row itself.
            client_sheet.append(create_write_only_row(row, client_sheet))
            if index % 10000 == 0:
                logger.info(f"{index} rows processed")
        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook is not None:
            workbook.close()
        if output_workbook is not None:
            output_workbook.close()
if __name__ == "__main__":
    # Hard-coded paths; adjust as needed before running.
    source_path = "input_file.xlsx"
    target_path = "output_file.xlsx"
    started_at = time.time()
    configure_logging()
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(source_path, target_path)
    logger.info("Time consumed: % s seconds" % (time.time() - started_at))
背景: 我有一个脚本可以从 .xlsx
文件中读取 200 多个大纲(组)数据,并将它们写入单独创建的 .xlsx
并适当地命名它们,同时适当地保留大纲。
脚本: 下面的脚本实现了这个:
from openpyxl import load_workbook
def get_client_rows(sheet):
    """Return the 1-based row indices of all client (top-level group) rows.

    Skips the header row and keeps every row whose first cell has no
    alignment indent, i.e. rows that start a new outline group.

    :param sheet: worksheet to scan (openpyxl Worksheet).
    :return: list of row indices, in sheet order.
    """
    # A row is a "client" row when its first cell is not indented.
    # NOTE(review): the original carried an unreachable second return that
    # tested ``row_dimension.outline_level == 0`` instead; that dead code
    # has been removed and is recorded here as the alternative criterion.
    return [
        row[0].row
        for row in sheet.iter_rows(2)  # min_row=2: skip the header row
        if row[0].alignment.indent == 0.0
    ]
def delete_client_block(sheet, start, end):
    """Remove rows ``start`` through ``end`` (both inclusive) from *sheet*,
    dropping their row-dimension entries as well so no stale outline data
    survives the deletion."""
    row = start
    while row <= end:
        sheet.row_dimensions.pop(row, None)
        row += 1
    sheet.delete_rows(start, end - start + 1)
def split_workbook(input_file, output_file):
    """Split the active sheet of *input_file* into one sheet per main group.

    To avoid losing any formatting, the current sheet is copied per client
    and all rows that do not belong to the extracted group are removed.

    :param input_file: path of the workbook to read.
    :param output_file: path the split workbook is saved to.
    """
    workbook = None  # initialized so the finally-clause is safe if loading fails
    try:
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        client_rows = get_client_rows(data_sheet)
        for index, client_row in enumerate(client_rows):
            # Create a new sheet for this client. Excel caps sheet titles at
            # 31 characters, so truncate to 31 (the original [:32] overflows
            # the limit by one).
            client_sheet = workbook.copy_worksheet(data_sheet)
            client_sheet.title = data_sheet.cell(client_row, 1).value[:31]
            print("Creating New Client Workbook")
            # Delete rows after the current client, if any.
            if index < len(client_rows) - 1:
                row_after_client = client_rows[index + 1]
                delete_client_block(
                    client_sheet, row_after_client, client_sheet.max_row
                )
            # Delete rows before the current client, if any.
            if index > 0:
                first_client_row = client_rows[0]
                delete_client_block(
                    client_sheet, first_client_row, client_row - first_client_row + 1
                )
                # Move the left-over row dimensions up so the remaining client
                # block starts right below the header.
                for row_index in list(client_sheet.row_dimensions.keys()):
                    # Skip the header row dimension.
                    if row_index > first_client_row - 1:
                        row_dimension = client_sheet.row_dimensions.pop(row_index)
                        new_index = row_index - client_row + first_client_row
                        row_dimension.index = new_index
                        client_sheet.row_dimensions[new_index] = row_dimension
        # Drop the original data sheet and persist the result.
        del workbook[data_sheet.title]
        workbook.save(output_file)
    finally:
        if workbook is not None:
            workbook.close()
if __name__ == "__main__":
    # Hard-coded paths; adjust as needed before running.
    source_path = "input.xlsx"
    target_path = "output.xlsx"
    split_workbook(source_path, target_path)
我的问题:脚本在处理较小的文件(例如 < 5MB / 10k 行)时工作正常,但是尝试处理较大的文件(例如 50MB 文件 /> 100k 行)我得到以下 MemoryError
:
Traceback (most recent call last):
File "C:/scripts/BillingPull/main.py", line 80, in <module>
split_workbook(input_file, output_file)
File "C:/scripts/BillingPull/main.py", line 41, in split_workbook
client_sheet = workbook.copy_worksheet(data_sheet)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\workbook\workbook.py", line 434, in copy_worksheet
cp.copy_worksheet()
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\copier.py", line 37, in copy_worksheet
self._copy_dimensions()
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\copier.py", line 69, in _copy_dimensions
target[key] = copy(dim)
File "C:\Program Files\Python38\lib\copy.py", line 84, in copy
return copier(x)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\dimensions.py", line 60, in __copy__
cp.__init__(**attrib)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\dimensions.py", line 107, in __init__
super(RowDimension, self).__init__(index, hidden, outlineLevel,
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\dimensions.py", line 45, in __init__
self.outlineLevel = outlineLevel
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\descriptors\base.py", line 68, in __set__
super(Convertible, self).__set__(instance, value)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\descriptors\base.py", line 43, in __set__
super(Typed, self).__set__(instance, value)
File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\descriptors\base.py", line 24, in __set__
instance.__dict__[self.name] = value
MemoryError
我的环境:我在基于 x64 的 PC 虚拟服务器上运行这个脚本,具有以下规格 -
- 处理器 AMD EPYC 7763 64 核处理器,2445 Mhz,2 核,4 逻辑处理器
- 物理内存:16.0 GB
- 虚拟内存:31.8 GB
- Windows 10
我应该探索使用多线程来优化脚本的运行时间,还是代码优化?
解决方案: 我能够重新审视代码的运行方式并找到优化。具体来说,openpyxl
包含 openpyxl.worksheet._read_only.ReadOnlyWorksheet
。文档 here -
Sometimes, you will need to open or write extremely large XLSX files, and the common routines in openpyxl won’t be able to handle that load. Fortunately, there are two modes that enable you to read and write unlimited amounts of data with (near) constant memory consumption.
修改后的代码:
from openpyxl import load_workbook
from openpyxl import LXML
import time
from copy import copy
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
import logging
import sys
logger = logging.getLogger()


def configure_logging():
    """Attach an INFO-level stdout handler with timestamps to the root logger."""
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
    logger.addHandler(stream_handler)
# Characters Excel forbids in sheet titles. NOTE: the backslash must be
# escaped ("\\") — the unescaped "\" in the original text is a syntax error
# because it escapes the closing quote.
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
# str.translate table mapping every forbidden character to the empty string.
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
def clean_sheet_title(title):
    """Normalize *title* into a valid Excel sheet name.

    Treats None/empty as "", trims surrounding whitespace, strips every
    character Excel forbids, and caps the length at Excel's 31-character limit.
    """
    cleaned = (title or "").strip().translate(INVALID_TITLE_CHAR_MAP)
    return cleaned[:31]
def is_client_row(row, row_dimension):
    """Return True when *row* starts a new client block, i.e. its first cell
    carries no alignment indent.

    ``row_dimension`` is accepted for the alternative outline-level check
    (kept below, disabled) but is unused by the current criterion.
    """
    first_cell = row[0]
    return first_cell.alignment.indent == 0.0
    # Alternative criterion: return row_dimension.outline_level == 0
def create_write_only_cell(source_cell, target_sheet):
    """Clone *source_cell* into a WriteOnlyCell bound to *target_sheet*,
    carrying over its value, data type and the style attributes we keep."""
    clone = WriteOnlyCell(target_sheet, value=source_cell.value)
    clone.data_type = source_cell.data_type
    if source_cell.has_style:
        clone.font = copy(source_cell.font)
        # TODO save one border and use it
        # clone.border = copy(source_cell.border)
        # TODO copy client row
        # clone.fill = copy(source_cell.fill)
        clone.number_format = copy(source_cell.number_format)
        # clone.protection = copy(source_cell.protection)
        clone.alignment = copy(source_cell.alignment)
    return clone
def create_write_only_row(source_row, target_sheet):
    """Clone every cell of *source_row* into write-only cells for *target_sheet*."""
    cells = []
    for source_cell in source_row:
        cells.append(create_write_only_cell(source_cell, target_sheet))
    return cells
def split_workbook(input_file, output_file):
    """Split the active sheet so each client gets its own sheet in a new
    write-only workbook (near-constant memory consumption).

    A row whose first cell is unindented starts a new client block; the
    header row, column dimensions and row dimensions are replicated onto
    each client sheet.

    :param input_file: path of the workbook to read.
    :param output_file: path the split workbook is saved to.
    """
    # Initialized up front so the finally-clause cannot raise NameError
    # when load_workbook itself fails (original referenced unbound names).
    workbook = None
    output_workbook = None
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)
        client_sheet = None
        client_row_index = 2
        processing_client = 0
        rows = data_sheet.rows
        header = next(rows)
        for index, row in enumerate(rows, start=2):
            # TODO implement skip row
            # if skip_row(row) is True:
            #     continue
            row_dimension = data_sheet.row_dimensions[index]
            # Start a new sheet whenever a new client row is found.
            if is_client_row(row, row_dimension):
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index
                # Copy column dimensions onto the new sheet.
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet
                client_sheet.append(create_write_only_row(header, client_sheet))
            # Copy the row dimension, re-indexed so the client block starts
            # right below the header (row 1 is the header on each sheet).
            target_row_index = index - client_row_index + 2
            client_sheet.row_dimensions[target_row_index] = copy(row_dimension)
            # BUGFIX: the original re-bound the worksheet on the *source*
            # index, creating a stray entry and leaving the shifted copy
            # pointing at the old sheet.
            client_sheet.row_dimensions[target_row_index].worksheet = client_sheet
            # Finally copy the row itself.
            client_sheet.append(create_write_only_row(row, client_sheet))
            if index % 10000 == 0:
                logger.info(f"{index} rows processed")
        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook is not None:
            workbook.close()
        if output_workbook is not None:
            output_workbook.close()
if __name__ == "__main__":
    # Hard-coded paths; adjust as needed before running.
    source_path = "input_file.xlsx"
    target_path = "output_file.xlsx"
    started_at = time.time()
    configure_logging()
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(source_path, target_path)
    logger.info("Time consumed: % s seconds" % (time.time() - started_at))