通过分组到单独的工作簿表中拆分 Excel 数据

Question

背景：
我有一个 40MB 的大型 XLSX 文件，其中包含按多个级别分组的客户端数据，如下所示：

展开 -

未扩展（对糟糕的虚拟数据感到抱歉！）-

Objective:
我想将客户 A、B C 等及其各自的基础数据拆分成单独的工作表（命名为'Client A' 等...）在工作簿中。

问题：
我假设没有 python 库可以帮助解决这个问题（例如，xlsxwriter ) 并且在拆分和写入 xlsx 文件之前，我可能必须保存到多个 pandas df 中？

样本数据：
这里是一些随机样本数据的link。在此文件中，您只会看到 1 个客户端（总行可以忽略），但想象一下普通文件有 40 个客户端/分组和子级别。

示例代码： 此函数采用 '.xlsxand writes each grouping to an appropriately named tab (e.g., 'Client A') to a separate Worksheet in a new.xlsx`。这段代码的问题在于，因为我基本上是逐个检查和复制每个单元格，所以我不认为要更全面地考虑，但要确保 Groupings/Levels 会被保留。我认为这段代码需要完全重写，欢迎反馈

import openpyxl

from copy import copy
from openpyxl import load_workbook

columns=['A','B','C','D','E','F','G','H','I','J','K','L']
def copy_cell(ws, row,ws_row,ws1):
    for col in columns:
        ws_cell=ws1[col+str(ws_row)]
        new_cell = ws[col+str(row)]

        if ws_cell.has_style:
            new_cell.font = copy(ws_cell.font)
            new_cell.border = copy(ws_cell.border)
            new_cell.fill = copy(ws_cell.fill)
            new_cell.number_format = copy(ws_cell.number_format)
            new_cell.protection = copy(ws_cell.protection)
            new_cell.alignment = copy(ws_cell.alignment)

wb1 = openpyxl.load_workbook('annonamized_test_data_to_be_split.xlsx')

ws1=wb1.active

indexs=[]
clients=[]

index=1

while ws1['A'+str(index)]:
    if str(ws1['A'+str(index)].alignment.indent)=='0.0':
        indexs.append(index)
        clients.append(ws1['A'+str(index)].value)
    if ws1['A'+str(index)].value  is None:
        indexs.append(index)
        break
    index+=1

wb1.close()

wb = openpyxl.Workbook()

ws=wb.active

start_index=1

headers=['Ownership Structure', 'Fee Schedule', 'Management Style', 'Advisory Firm', 'Inception Date', 'Days in Time Period', 'Adjusted Average Daily Balance (No Div, USD)', 'Assets Billed On (USD)',
'Effective Billing Rate', 'Billing Fees (USD)', 'Bill To Account', 'Model Type']

for y,index in enumerate(indexs):
    try:
        client=0
        if len(clients[y])>=32:
            client=clients[y][:31]
        else:
            client=clients[y]
        wb.create_sheet(client)
        ws=wb[client]
        ws.column_dimensions['A'].width=35
        ws.append(headers)
        row_index=2
        for i in range(start_index,indexs[y+1]):
            ws.append([ws1[col+str(i)].value for col in columns])
            copy_cell(ws,row_index,i,ws1)
            row_index+=1
        start_index=indexs[y+1]
    except:
        pass

wb.save('split_data.xlsx')
wb.close()

try:
    wb1 = openpyxl.load_workbook('split_data.xlsx')
    a=wb1['Sheet']
    wb1.remove(a)
    a=wb1['Sheet1']
    wb1.remove(a)
    wb1.save('split_data.xlsx')
    wb1.close()
except:
    pass

请有人能为我指出正确的资源方向，教我如何实现这一目标吗？

Answer 1

from openpyxl import load_workbook


def get_client_rows(sheet):
    """Get client rows.

    Skip header and then look for row dimensions without outline level
    """
    return [row[0].row for row in sheet.iter_rows(2) if row[0].alignment.indent == 0.0]
    return [
        row_index
        for row_index, row_dimension in sheet.row_dimensions.items()
        if row_index > 1 and row_dimension.outline_level == 0
    ]


def delete_client_block(sheet, start, end):
    """
    Delete rows starting from up to and including end.
    """
    for row in range(start, end + 1):
        sheet.row_dimensions.pop(row, None)
    sheet.delete_rows(start, end - start + 1)


def split_workbook(input_file, output_file):
    """
    Split workbook each main group into its own sheet.

    Not too loose any formatting we copy the current sheet and remove all rows
    which do not belong to extacted group.
    """

    try:
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        client_rows = get_client_rows(data_sheet)

        for index, client_row in enumerate(client_rows):
            # create new sheet for given client, shorten client as it might be too long
            client_sheet = workbook.copy_worksheet(data_sheet)
            client_sheet.title = data_sheet.cell(client_row, 1).value[:32]

            # delete rows after current client if available
            if index < len(client_rows) - 1:
                row_after_client = client_rows[index + 1]
                delete_client_block(
                    client_sheet, row_after_client, client_sheet.max_row
                )

            # delete rows before current client if available
            if index > 0:
                first_client_row = client_rows[0]
                delete_client_block(
                    client_sheet, first_client_row, client_row - first_client_row + 1
                )

                # move left over dimensions to top of the sheet
                for row_index in list(client_sheet.row_dimensions.keys()):
                    # skip header row dimension
                    if row_index > first_client_row - 1:
                        row_dimension = client_sheet.row_dimensions.pop(row_index)
                        new_index = row_index - client_row + first_client_row
                        row_dimension.index = new_index
                        client_sheet.row_dimensions[new_index] = row_dimension

        del workbook[data_sheet.title]
        workbook.save(output_file)
    finally:
        workbook.close()


if __name__ == "__main__":
    # input_file = "annonamized_test_data_to_be_split.xlsx"
    input_file = 'partial_Q1_Client_Billing_Data.xlsx'
    # output_file = "split_data.xlsx"
    output_file = "splitting_full_data.xlsx"

    split_workbook(input_file, output_file)

通过分组到单独的工作簿表中拆分 Excel 数据

Splitting Excel Data by Groupings into Separate Workbook Sheets

python

excel

pandas

xlsxwriter