将 excel 文件分成多个 excel 组
Split excel file into multiple by excel groups
我所说的组是指可扩展的东西:
点击它们时可以展开若干行。在这种特殊情况下,我需要按第 1 级(嗯)分组,把这些行提取到不同的数据位置(行列表也可以,最好是其他 excel 文件):
因此,例如在这种情况下:
file1.xlsx 将包括从 6 到 572 的所有行
file2.xlsx 将包括从 573 到 627 的行
等等。
如何执行此操作?它可以是 VBA 脚本,但最好使用一些 python 库,如 openpyxl 或 win32com.client
# -*- coding: utf-8 -*-
import openpyxl
# Dump the per-row dimension settings (outline level, hidden, collapsed, height)
# of the active sheet; the outline level is what identifies Excel row groups.
wb = openpyxl.load_workbook(r'path_to_xlsx_file')
ws = wb.active
range_string = ws.calculate_dimension()
print(range_string)
# Worksheet row numbers are 1-based, so walk the used row range directly
# instead of a zero-based enumerate(), which would report every dimension one
# row too early. Current iter_rows() also has no ``range_string`` keyword.
for row_index in range(ws.min_row, ws.max_row + 1):
    row_dimension = ws.row_dimensions[row_index]
    print(row_dimension.index,  # just for the great LULZ
          row_dimension.outline_level,  # THAT what I was looking for!
          row_dimension.hidden,  # couple other helpful parameters
          row_dimension.collapsed,
          row_dimension.height)
我构建了类似的东西——它读取 .xlsx 文件,并将所有分组(在本例中为客户)拆分到各自单独的工作表中。它利用 openpyxl 的优化模式(openpyxl.worksheet._read_only.ReadOnlyWorksheet)进行了优化,文档说明如下:
Optimised Modes
Read-only mode
Sometimes, you will need to open or
write extremely large XLSX files, and the common routines in openpyxl
won’t be able to handle that load. Fortunately, there are two modes
that enable you to read and write unlimited amounts of data with
(near) constant memory consumption.
脚本:
from openpyxl import load_workbook
from openpyxl import LXML
import time
from copy import copy
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
import logging
import sys
# Root logger: configuring it affects every module that logs without its own handler.
logger = logging.getLogger()

# Name given to the stdout handler so repeated configuration can recognise it.
_STDOUT_HANDLER_NAME = "split_workbook_stdout"


def configure_logging():
    """Send INFO-level records of the root logger to stdout with a timestamp.

    The function is idempotent: calling it again does not attach a second
    stdout handler, which would otherwise print every record twice.
    """
    logger.setLevel(logging.INFO)
    if any(h.get_name() == _STDOUT_HANDLER_NAME for h in logger.handlers):
        return
    handler = logging.StreamHandler(sys.stdout)
    handler.set_name(_STDOUT_HANDLER_NAME)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
# Characters removed from worksheet titles by clean_sheet_title.
# The backslash must be written as "\\"; a lone "\" escapes the closing quote
# and makes the whole list a syntax error.
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
# Translation table for str.translate: each listed character maps to "".
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
def clean_sheet_title(title):
    """Turn a cell value into a string usable as a worksheet title.

    Falsy values (``None``, empty string) become ``""``. Any other value is
    converted with ``str`` so that numeric or date cell values do not fail on
    ``.strip()``. Surrounding whitespace is stripped, every character listed
    in ``INVALID_TITLE_CHARS`` is removed, and the result is cut to 31
    characters.
    """
    title = str(title) if title else ""
    title = title.strip()
    title = title.translate(INVALID_TITLE_CHAR_MAP)
    # Keep at most 31 characters (presumably Excel's sheet-name limit).
    return title[:31]
def is_client_row(row, row_dimension):
    """Return True when ``row`` starts a new client group.

    A row counts as a client row when its first cell has an alignment indent
    of zero (presumably detail rows are indented in the source sheet -- verify
    against the input file). ``row_dimension`` is currently unused; it is kept
    so the check can be switched to Excel's outline grouping.
    """
    return row[0].alignment.indent == 0.0
    # Alternative based on the row outline (grouping) level:
    # return row_dimension.outline_level == 0
def create_write_only_cell(source_cell, target_sheet):
    """Build a ``WriteOnlyCell`` for ``target_sheet`` mirroring ``source_cell``.

    The value and data type are always copied. When the source cell has a
    style, its font, number format and alignment are copied as well; border,
    fill and protection are deliberately not copied yet (see the TODO notes).
    """
    target_cell = WriteOnlyCell(target_sheet, value=source_cell.value)
    target_cell.data_type = source_cell.data_type

    if source_cell.has_style:
        # Style objects are copied so the target does not share them with the source.
        target_cell.font = copy(source_cell.font)
        # TODO save one border and use it
        # target_cell.border = copy(source_cell.border)
        # TODO copy client row
        # target_cell.fill = copy(source_cell.fill)
        target_cell.number_format = copy(source_cell.number_format)
        # target_cell.protection = copy(source_cell.protection)
        target_cell.alignment = copy(source_cell.alignment)

    return target_cell
def create_write_only_row(source_row, target_sheet):
    """Return a list of write-only cells, one per cell of ``source_row``, bound to ``target_sheet``."""
    return [create_write_only_cell(cell, target_sheet) for cell in source_row]
def split_workbook(input_file, output_file):
    """
    Split a workbook so that each client gets its own sheet.

    The first row of the active sheet of ``input_file`` is used as the header
    and repeated at the top of every output sheet. Every row for which
    ``is_client_row`` returns true starts a new sheet named after the row's
    first cell; that row and the rows following it are copied into the sheet
    until the next client row. The result is saved to ``output_file``.

    Rows located before the first client row have no sheet to go to and are
    skipped with a warning. Column dimensions are copied to every sheet and
    row dimensions are copied row by row.
    """
    # Bind both names up front so the ``finally`` clause cannot fail on an
    # unbound local (and hide the real error) when loading the input fails.
    workbook = None
    output_workbook = None
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)

        client_sheet = None
        client_row_index = 2
        processing_client = 0

        rows = data_sheet.rows
        # An empty sheet yields no rows at all; fall through to saving an
        # empty workbook instead of leaking StopIteration to the caller.
        header = next(rows, None)
        if header is None:
            logger.warning(f"Workbook {input_file} has no rows to split")

        for index, row in enumerate(rows, start=2):
            # TODO implement skip row
            # if skip_row(row) is True:
            #     continue

            row_dimension = data_sheet.row_dimensions[index]

            # a client row starts a new sheet
            if is_client_row(row, row_dimension):
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index

                # copy column dimensions
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet

                client_sheet.append(create_write_only_row(header, client_sheet))

            if client_sheet is None:
                # No client row seen yet, so there is no sheet to receive this row.
                logger.warning(f"Skipping row {index}: it precedes the first client row")
                continue

            # copy row dimensions; the header occupies row 1 of the client
            # sheet, so the client row itself lands on row 2.
            target_row = index - client_row_index + 2
            client_sheet.row_dimensions[target_row] = copy(row_dimension)
            # Re-bind the copy that was just stored (same key), not the entry
            # at the source row index.
            client_sheet.row_dimensions[target_row].worksheet = client_sheet

            # finally copy row
            client_sheet.append(create_write_only_row(row, client_sheet))

            if index % 10000 == 0:
                logger.info(f"{index} rows processed")

        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook is not None:
            workbook.close()
        if output_workbook is not None:
            output_workbook.close()
# Script entry point: split the hard-coded input file and report the elapsed time.
if __name__ == "__main__":
    input_file = "input_file.xlsx"
    output_file = "output_file.xlsx"

    start = time.time()
    configure_logging()
    # LXML tells whether openpyxl is using the lxml backend.
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(input_file, output_file)
    logger.info("Time consumed: % s seconds" % (time.time() - start))
我所说的组是指可扩展的东西:
点击它们时可以展开若干行。在这种特殊情况下,我需要按第 1 级(嗯)分组,把这些行提取到不同的数据位置(行列表也可以,最好是其他 excel 文件):
因此,例如在这种情况下:
file1.xlsx 将包括从 6 到 572 的所有行
file2.xlsx 将包括从 573 到 627 的行
等等。
如何执行此操作?它可以是 VBA 脚本,但最好使用一些 python 库,如 openpyxl 或 win32com.client
# -*- coding: utf-8 -*-
import openpyxl
# Dump the per-row dimension settings (outline level, hidden, collapsed, height)
# of the active sheet; the outline level is what identifies Excel row groups.
wb = openpyxl.load_workbook(r'path_to_xlsx_file')
ws = wb.active
range_string = ws.calculate_dimension()
print(range_string)
# Worksheet row numbers are 1-based, so walk the used row range directly
# instead of a zero-based enumerate(), which would report every dimension one
# row too early. Current iter_rows() also has no ``range_string`` keyword.
for row_index in range(ws.min_row, ws.max_row + 1):
    row_dimension = ws.row_dimensions[row_index]
    print(row_dimension.index,  # just for the great LULZ
          row_dimension.outline_level,  # THAT what I was looking for!
          row_dimension.hidden,  # couple other helpful parameters
          row_dimension.collapsed,
          row_dimension.height)
我构建了类似的东西——它读取 .xlsx 文件,并将所有分组(在本例中为客户)拆分到各自单独的工作表中。它利用 openpyxl 的优化模式(openpyxl.worksheet._read_only.ReadOnlyWorksheet)进行了优化,文档说明如下:
Optimised Modes
Read-only mode
Sometimes, you will need to open or write extremely large XLSX files, and the common routines in openpyxl won’t be able to handle that load. Fortunately, there are two modes that enable you to read and write unlimited amounts of data with (near) constant memory consumption.
脚本:
from openpyxl import load_workbook
from openpyxl import LXML
import time
from copy import copy
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
import logging
import sys
# Root logger: configuring it affects every module that logs without its own handler.
logger = logging.getLogger()

# Name given to the stdout handler so repeated configuration can recognise it.
_STDOUT_HANDLER_NAME = "split_workbook_stdout"


def configure_logging():
    """Send INFO-level records of the root logger to stdout with a timestamp.

    The function is idempotent: calling it again does not attach a second
    stdout handler, which would otherwise print every record twice.
    """
    logger.setLevel(logging.INFO)
    if any(h.get_name() == _STDOUT_HANDLER_NAME for h in logger.handlers):
        return
    handler = logging.StreamHandler(sys.stdout)
    handler.set_name(_STDOUT_HANDLER_NAME)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
# Characters removed from worksheet titles by clean_sheet_title.
# The backslash must be written as "\\"; a lone "\" escapes the closing quote
# and makes the whole list a syntax error.
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
# Translation table for str.translate: each listed character maps to "".
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
def clean_sheet_title(title):
    """Turn a cell value into a string usable as a worksheet title.

    Falsy values (``None``, empty string) become ``""``. Any other value is
    converted with ``str`` so that numeric or date cell values do not fail on
    ``.strip()``. Surrounding whitespace is stripped, every character listed
    in ``INVALID_TITLE_CHARS`` is removed, and the result is cut to 31
    characters.
    """
    title = str(title) if title else ""
    title = title.strip()
    title = title.translate(INVALID_TITLE_CHAR_MAP)
    # Keep at most 31 characters (presumably Excel's sheet-name limit).
    return title[:31]
def is_client_row(row, row_dimension):
    """Return True when ``row`` starts a new client group.

    A row counts as a client row when its first cell has an alignment indent
    of zero (presumably detail rows are indented in the source sheet -- verify
    against the input file). ``row_dimension`` is currently unused; it is kept
    so the check can be switched to Excel's outline grouping.
    """
    return row[0].alignment.indent == 0.0
    # Alternative based on the row outline (grouping) level:
    # return row_dimension.outline_level == 0
def create_write_only_cell(source_cell, target_sheet):
    """Build a ``WriteOnlyCell`` for ``target_sheet`` mirroring ``source_cell``.

    The value and data type are always copied. When the source cell has a
    style, its font, number format and alignment are copied as well; border,
    fill and protection are deliberately not copied yet (see the TODO notes).
    """
    target_cell = WriteOnlyCell(target_sheet, value=source_cell.value)
    target_cell.data_type = source_cell.data_type

    if source_cell.has_style:
        # Style objects are copied so the target does not share them with the source.
        target_cell.font = copy(source_cell.font)
        # TODO save one border and use it
        # target_cell.border = copy(source_cell.border)
        # TODO copy client row
        # target_cell.fill = copy(source_cell.fill)
        target_cell.number_format = copy(source_cell.number_format)
        # target_cell.protection = copy(source_cell.protection)
        target_cell.alignment = copy(source_cell.alignment)

    return target_cell
def create_write_only_row(source_row, target_sheet):
    """Return a list of write-only cells, one per cell of ``source_row``, bound to ``target_sheet``."""
    return [create_write_only_cell(cell, target_sheet) for cell in source_row]
def split_workbook(input_file, output_file):
    """
    Split a workbook so that each client gets its own sheet.

    The first row of the active sheet of ``input_file`` is used as the header
    and repeated at the top of every output sheet. Every row for which
    ``is_client_row`` returns true starts a new sheet named after the row's
    first cell; that row and the rows following it are copied into the sheet
    until the next client row. The result is saved to ``output_file``.

    Rows located before the first client row have no sheet to go to and are
    skipped with a warning. Column dimensions are copied to every sheet and
    row dimensions are copied row by row.
    """
    # Bind both names up front so the ``finally`` clause cannot fail on an
    # unbound local (and hide the real error) when loading the input fails.
    workbook = None
    output_workbook = None
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)

        client_sheet = None
        client_row_index = 2
        processing_client = 0

        rows = data_sheet.rows
        # An empty sheet yields no rows at all; fall through to saving an
        # empty workbook instead of leaking StopIteration to the caller.
        header = next(rows, None)
        if header is None:
            logger.warning(f"Workbook {input_file} has no rows to split")

        for index, row in enumerate(rows, start=2):
            # TODO implement skip row
            # if skip_row(row) is True:
            #     continue

            row_dimension = data_sheet.row_dimensions[index]

            # a client row starts a new sheet
            if is_client_row(row, row_dimension):
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index

                # copy column dimensions
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet

                client_sheet.append(create_write_only_row(header, client_sheet))

            if client_sheet is None:
                # No client row seen yet, so there is no sheet to receive this row.
                logger.warning(f"Skipping row {index}: it precedes the first client row")
                continue

            # copy row dimensions; the header occupies row 1 of the client
            # sheet, so the client row itself lands on row 2.
            target_row = index - client_row_index + 2
            client_sheet.row_dimensions[target_row] = copy(row_dimension)
            # Re-bind the copy that was just stored (same key), not the entry
            # at the source row index.
            client_sheet.row_dimensions[target_row].worksheet = client_sheet

            # finally copy row
            client_sheet.append(create_write_only_row(row, client_sheet))

            if index % 10000 == 0:
                logger.info(f"{index} rows processed")

        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook is not None:
            workbook.close()
        if output_workbook is not None:
            output_workbook.close()
# Script entry point: split the hard-coded input file and report the elapsed time.
if __name__ == "__main__":
    input_file = "input_file.xlsx"
    output_file = "output_file.xlsx"

    start = time.time()
    configure_logging()
    # LXML tells whether openpyxl is using the lxml backend.
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(input_file, output_file)
    logger.info("Time consumed: % s seconds" % (time.time() - start))