我的 Python 代码:应该使用多进程,还是优化代码?

Multiprocessing vs. Code Enhancement in my Python Code

背景:我有一个脚本,它从一个 .xlsx 文件中读取 200 多个带大纲(分组)的数据块,把每个分组写入单独创建的 .xlsx 并相应地命名,同时完整保留原有的大纲(分组)层级。

脚本: 下面的脚本实现了这个:

from openpyxl import load_workbook


def get_client_rows(sheet):
    """Return the row numbers of all client (top-level group) rows.

    The header row is skipped by starting the scan at row 2. A row counts as
    a client row when the cell in its first column has an alignment indent
    of 0.

    Args:
        sheet: Worksheet to scan; must provide ``iter_rows`` yielding rows of
            cells that expose ``row`` and ``alignment.indent``.

    Returns:
        List of row numbers (``cell.row`` of each matching first cell), in
        the order the rows are yielded.
    """
    # NOTE: an alternative selection based on
    # ``sheet.row_dimensions[...].outline_level == 0`` used to follow the
    # first ``return``; it could never execute and has been removed.
    return [
        row[0].row
        for row in sheet.iter_rows(min_row=2)
        if row[0].alignment.indent == 0.0
    ]


def delete_client_block(sheet, start, end):
    """
    Remove the rows ``start`` through ``end`` (inclusive) from ``sheet``.

    The row-dimension entries of the affected rows are dropped first, then
    the rows themselves are deleted with a single ``delete_rows`` call.
    """
    row_count = end - start + 1
    dimensions = sheet.row_dimensions
    for offset in range(row_count):
        # pop with a default: rows without a stored dimension are fine
        dimensions.pop(start + offset, None)
    sheet.delete_rows(start, row_count)


def split_workbook(input_file, output_file):
    """
    Split a workbook so that each main group (client) gets its own sheet.

    Not to lose any formatting, the active sheet is copied once per client
    and all rows which do not belong to the extracted group are removed from
    the copy. The original data sheet is removed before saving.

    Args:
        input_file: Path of the workbook to read.
        output_file: Path the resulting workbook is saved to.
    """
    # Load outside the ``try`` so that a failed load surfaces as-is instead
    # of being masked by an UnboundLocalError raised in ``finally``.
    workbook = load_workbook(input_file)
    try:
        data_sheet = workbook.active
        client_rows = get_client_rows(data_sheet)

        for index, client_row in enumerate(client_rows):
            # create new sheet for given client; the client name is cut to
            # 31 characters, the maximum length of a worksheet title
            client_sheet = workbook.copy_worksheet(data_sheet)
            client_sheet.title = data_sheet.cell(client_row, 1).value[:31]
            print("Creating New Client Workbook")

            # delete rows after current client if available
            if index < len(client_rows) - 1:
                row_after_client = client_rows[index + 1]
                delete_client_block(
                    client_sheet, row_after_client, client_sheet.max_row
                )

            # delete rows before current client if available
            if index > 0:
                first_client_row = client_rows[0]
                # ``delete_client_block`` expects an inclusive END ROW (not a
                # row count): the block to drop ends just above this client.
                delete_client_block(client_sheet, first_client_row, client_row - 1)

                # move left over dimensions to top of the sheet; ascending
                # order guarantees a dimension is never written onto a slot
                # that still holds a not-yet-moved entry
                shift = client_row - first_client_row
                for row_index in sorted(client_sheet.row_dimensions.keys()):
                    # skip header row dimension
                    if row_index >= first_client_row:
                        row_dimension = client_sheet.row_dimensions.pop(row_index)
                        new_index = row_index - shift
                        row_dimension.index = new_index
                        client_sheet.row_dimensions[new_index] = row_dimension

        del workbook[data_sheet.title]
        workbook.save(output_file)
    finally:
        workbook.close()


if __name__ == "__main__":
    # Script entry point: read input.xlsx, write the split result to output.xlsx.
    input_file, output_file = "input.xlsx", "output.xlsx"
    split_workbook(input_file, output_file)

遇到的问题:脚本在处理较小的文件(例如小于 5MB / 1 万行)时工作正常,但在处理较大的文件(例如 50MB / 超过 10 万行)时,我得到以下 MemoryError:

Traceback (most recent call last):
  File "C:/scripts/BillingPull/main.py", line 80, in <module>
    split_workbook(input_file, output_file)
  File "C:/scripts/BillingPull/main.py", line 41, in split_workbook
    client_sheet = workbook.copy_worksheet(data_sheet)
  File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\workbook\workbook.py", line 434, in copy_worksheet
    cp.copy_worksheet()
  File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\copier.py", line 37, in copy_worksheet
    self._copy_dimensions()
  File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\copier.py", line 69, in _copy_dimensions
    target[key] = copy(dim)
  File "C:\Program Files\Python38\lib\copy.py", line 84, in copy
    return copier(x)
  File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\dimensions.py", line 60, in __copy__
    cp.__init__(**attrib)
  File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\dimensions.py", line 107, in __init__
    super(RowDimension, self).__init__(index, hidden, outlineLevel,
  File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\worksheet\dimensions.py", line 45, in __init__
    self.outlineLevel = outlineLevel
  File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\descriptors\base.py", line 68, in __set__
    super(Convertible, self).__set__(instance, value)
  File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\descriptors\base.py", line 43, in __set__
    super(Typed, self).__set__(instance, value)
  File "C:\scripts\BillingPull\venv\lib\site-packages\openpyxl\descriptors\base.py", line 24, in __set__
    instance.__dict__[self.name] = value
MemoryError

我的疑问:我在一台基于 x64 的 PC 虚拟服务器上运行这个脚本,其规格如下 -

我应该探索使用多线程来优化脚本的运行时间,还是代码优化?

解决方案:我重新审视了代码的运行方式,并找到了优化方法。具体来说,openpyxl 提供了 openpyxl.worksheet._read_only.ReadOnlyWorksheet。其文档中写道:

Sometimes, you will need to open or write extremely large XLSX files, and the common routines in openpyxl won’t be able to handle that load. Fortunately, there are two modes that enable you to read and write unlimited amounts of data with (near) constant memory consumption.

修改后的代码:

from openpyxl import load_workbook
from openpyxl import LXML
import time
from copy import copy
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook

import logging
import sys

# Logger shared by all functions in this script; ``getLogger()`` without a
# name returns the root logger.
logger = logging.getLogger()


def configure_logging():
    """Route INFO-and-above records of ``logger`` to stdout.

    Each record is rendered as ``<timestamp> - <message>``.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
    stdout_handler.setLevel(logging.INFO)

    logger.addHandler(stdout_handler)
    logger.setLevel(logging.INFO)


# Characters that are removed from a client name before it is used as a
# worksheet title.
# The backslash has to be written as "\\": a lone "\" escapes the closing
# quote, which turns the whole list literal into a syntax error.
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
# Table for ``str.translate`` that maps each of those characters to "".
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}


def clean_sheet_title(title):
    """Turn a cell value into text usable as a worksheet title.

    Args:
        title: Raw cell value. May be ``None`` or a non-string value such as
            a number.

    Returns:
        The value as a string with surrounding whitespace removed, every
        character listed in ``INVALID_TITLE_CHARS`` dropped, and the result
        cut to at most 31 characters. Falsy input yields ``""``.
    """
    # Cell values are not guaranteed to be strings; coerce them so that a
    # numeric or date cell does not fail on ``.strip()``.
    text = str(title or "")
    text = text.strip().translate(INVALID_TITLE_CHAR_MAP)
    return text[:31]


def is_client_row(row, row_dimension):
    """Tell whether ``row`` starts a new client.

    Only the first cell of the row is inspected: an alignment indent of 0
    marks a client row. ``row_dimension`` is accepted but currently unused;
    the alternative check based on it is kept disabled below.
    """
    first_cell = row[0]
    indent = first_cell.alignment.indent
    # return row_dimension.outline_level == 0
    return indent == 0.0


def create_write_only_cell(source_cell, target_sheet):
    """Build a ``WriteOnlyCell`` for ``target_sheet`` mirroring ``source_cell``.

    Value and data type are always carried over. When the source cell has a
    style, its font, number format and alignment are copied as well; border,
    fill and protection are not copied (see the TODOs).
    """
    new_cell = WriteOnlyCell(target_sheet, value=source_cell.value)
    new_cell.data_type = source_cell.data_type

    if not source_cell.has_style:
        return new_cell

    # TODO save one border and use it (border is not copied yet)
    # TODO copy client row (fill is not copied yet)
    for attribute in ("font", "number_format", "alignment"):
        setattr(new_cell, attribute, copy(getattr(source_cell, attribute)))

    return new_cell


def create_write_only_row(source_row, target_sheet):
    """Convert every cell of ``source_row`` into a write-only cell.

    Returns a list with one cell per source cell, in the same order, each
    created for ``target_sheet`` through ``create_write_only_cell``.
    """
    converted = []
    for source_cell in source_row:
        converted.append(create_write_only_cell(source_cell, target_sheet))
    return converted


def split_workbook(input_file, output_file):
    """
    Split workbook each client into its own sheet.

    The active sheet of ``input_file`` is streamed row by row into a
    write-only workbook. Every client row opens a new sheet that receives
    the column dimensions and the header row; the client row and all rows
    up to the next client are appended to it together with their row
    dimensions.

    Args:
        input_file: Path of the workbook to read.
        output_file: Path the write-only workbook is saved to.

    Raises:
        ValueError: If the sheet has no header row, or a data row appears
            before the first client row.
    """
    # Pre-bind both names so the ``finally`` clause can test them even when
    # loading or creating a workbook fails.
    workbook = None
    output_workbook = None
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)
        client_sheet = None
        client_row_index = 2
        processing_client = 0

        rows = data_sheet.rows
        header = next(rows, None)
        if header is None:
            raise ValueError(f"Workbook {input_file} has no header row")

        for index, row in enumerate(rows, start=2):
            # TODO implement skip row
            # if skip_row(row) is True:
            #     continue
            row_dimension = data_sheet.row_dimensions[index]

            # create new sheet when a new client is found
            if is_client_row(row, row_dimension):
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index

                # copy column dimensions
                for key, column_dimension in data_sheet.column_dimensions.items():
                    copied_column = copy(column_dimension)
                    copied_column.worksheet = client_sheet
                    client_sheet.column_dimensions[key] = copied_column

                client_sheet.append(create_write_only_row(header, client_sheet))

            if client_sheet is None:
                raise ValueError(
                    f"Row {index} appears before the first client row"
                )

            # copy row dimensions; row 1 of the client sheet is the header,
            # so the client row lands on row 2 and later rows follow it.
            # The same shifted index must be used for every access: using
            # the source ``index`` would create a stray default dimension
            # and leave the copy bound to the source sheet.
            target_index = index - client_row_index + 2
            copied_row = copy(row_dimension)
            copied_row.index = target_index
            copied_row.worksheet = client_sheet
            client_sheet.row_dimensions[target_index] = copied_row

            # finally copy row
            client_sheet.append(create_write_only_row(row, client_sheet))

            if index % 10000 == 0:
                logger.info(f"{index} rows processed")

        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook is not None:
            workbook.close()
        if output_workbook is not None:
            output_workbook.close()


if __name__ == "__main__":
    # Script entry point: run one split and log how long it took.
    input_file, output_file = "input_file.xlsx", "output_file.xlsx"

    start = time.time()
    configure_logging()

    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(input_file, output_file)
    elapsed = time.time() - start
    logger.info("Time consumed: % s seconds" % (elapsed,))