在 python 中,如何连接多个 excel 文件中的相应工作表

In python, how to concatenate corresponding sheets in multiple excel files

如何将多个 xlsx 文件与相同的 sheet_names 连接起来。例如, 我有 3 xlsx 个文件,Rob_schedule.xlsxMike_schdule.xlsxJerome_schedule.xlsx.

每个文件都有以下 sheet/tab 个名称:homeofficeschool

下面的代码生成 3 xlsx 文件(您可以复制 + 粘贴和 运行 生成 excel 文件)

##############################Generating the data for Rob_schedule.xlsx########################
import pandas as pd
import numpy as np
df= {
    'Date':[10232020,10242020,10252020,10262020],
    'Class':['AP_Bio','AP_Chem','Physics','History'],
    'Period':[3,1,2,4]}
school = pd.DataFrame(df,columns = ['Date','Class','Period'])
school

df2= {
    'Date':[10232020,10242020,10252020,10262020],
    'Meeting':['MQ1','MQ6','MQ2','MQ8'],
    'Lunch':[1,1,1,3],
    'code':['java','python','C','C++']}
office = pd.DataFrame(df2,columns = ['Date','Meeting','Lunch','code'])
office

df3= {
    'cooking':['C','B','D','B'],
    'Laundry':['color','white','White','color'],
    'cleaning':['balcony','garage','restroom','bathroom']}
home = pd.DataFrame(df3,columns = ['cooking','Laundry','cleaning'])
home

import pandas as pd
#initialze the excel writer
writer = pd.ExcelWriter('Rob_schedule.xlsx', engine='xlsxwriter')

#store your dataframes in a  dict, where the key is the sheet name you want
frames = {'home':home, 'office':office,
        'school':school}

#now loop thru and put each on a specific sheet
for sheet, frame in  frames.items(): 
    frame.to_excel(writer, sheet_name = sheet,index = False)

#critical last step
writer.save()

################################ generating Mike_schedule.xlsx###################################
import pandas as pd
import numpy as np
df= {
    'Date':[10232020,10242020,10252020,10262020],
    'Class':['AP_Bio','AP_Chem','Physics','History'],
    'Period':[3,1,2,4]}
school = pd.DataFrame(df,columns = ['Date','Class','Period'])
school

df2= {
    'Date':[10232020,10242020,10252020,10262020],
    'Meeting':['MQ1','MQ2','MQ4','MQ5'],
    'Lunch':[1,1,1,3],
    'code':['javascript','R','C','C++']}
office = pd.DataFrame(df2,columns = ['Date','Meeting','Lunch','code'])
office

df3= {
    
    'cooking':['A','B','D','B'],
    'Laundry':['color','white','white','color'],
    'cleaning':['patio','garage','living_room','bathroom']}
home = pd.DataFrame(df3,columns = ['cooking','Laundry','cleaning'])
home


#initialze the excel writer
writer = pd.ExcelWriter('Mike_schedule.xlsx', engine='xlsxwriter')

#store your dataframes in a  dict, where the key is the sheet name you want
frames = {'home':home, 'office':office,
        'school':school}

#now loop thru and put each on a specific sheet
for sheet, frame in  frames.items(): # .use .items for python 3.X
    frame.to_excel(writer, sheet_name = sheet,index = False)

#critical last step
writer.save()

######################### Generate Jerome schedule###########################################
df= {
    'Date':[10232020,10242020,10252020,10262020],
    'Class':['French','Math','Physics','History'],
    'Period':[3,1,2,4]}
school = pd.DataFrame(df,columns = ['Date','Class','Period'])
school

df2= {
    'Date':[10232020,10242020,10252020,10262020],
    'Meeting':['MQ1','MQ2','MQ4','MQ5'],
    'Lunch':[1,1,1,3],
    'code':['javascript','python','R','C++']}
office = pd.DataFrame(df2,columns = ['Date','Meeting','Lunch','code'])
office

df3= {
    
    'cooking':['X','B','D','C'],
    'Laundry':['color','white','white','color'],
    'cleaning':['patio','garage','living_room','bathroom']}
home = pd.DataFrame(df3,columns = ['cooking','Laundry','cleaning'])
home

import pandas as pd
#initialze the excel writer
writer = pd.ExcelWriter('Jerome_schedule.xlsx', engine='xlsxwriter')

#store your dataframes in a  dict, where the key is the sheet name you want
frames = {'home':home, 'office':office,
        'school':school}

#now loop thru and put each on a specific sheet
for sheet, frame in  frames.items(): # .use .items for python 3.X
    frame.to_excel(writer, sheet_name = sheet,index = False)

#critical last step
writer.save()

我想

  1. concatenate对应sheets/tabs:home,office,schoolRob_schedule.xlsx,Mike_schedule.xlsx & Jerome_schedule.xlsx

  2. 使用 homeofficeschool 选项卡将串联数据帧导出为 family_schedule.xlsx

我的尝试:

# This code concatenates all the tabs into one tab, but what I want is to concatenate all by their corresponding sheet/tab names

import pandas as pd

path = os.chdir(r'mypath\')
files = os.listdir(path)
files
# pull files with `.xlsx` extension
excel_files = [file for file in files if '.xlsx' in file]
excel_files

def create_df_from_excel(file_name):
    file = pd.ExcelFile(file_name)

    names = file.sheet_names

    return pd.concat([file.parse(name) for name in names])

df = pd.concat(
    [create_df_from_excel(xl) for xl in excel_files]
)

# save the data frame
writer = pd.ExcelWriter('family_reschedule.xlsx')
df.to_excel(writer, '')
writer.save()

我会遍历每个文件,然后遍历每个作品sheet,将每个 sheet 添加到基于 sheet 名称的不同列表。

那么您将拥有类似...

的结构
{
  'sheet1': [df_file1_sheet1, df_file2_sheet1, df_file3_sheet1],
  'sheet2': [df_file1_sheet2, df_file2_sheet2, df_file3_sheet2],
  'sheet3': [df_file1_sheet3, df_file2_sheet3, df_file3_sheet3],
}

然后将每个列表连接成一个数据帧,他们将三个数据帧写入一个 excel 文件。

# This part is just your own code, I've added it here because you
# couldn't figure out where `excel_files` came from
#################################################################

import os
import pandas as pd

path = os.chdir(r'mypath\')
files = os.listdir(path)
files
# pull files with `.xlsx` extension
excel_files = [file for file in files if '.xlsx' in file]
excel_files 

# This part is my actual answer
###############################

from collections import defaultdict

worksheet_lists = defaultdict(list)
for file_name in excel_files:
    workbook = pd.ExcelFile(file_name)
    for sheet_name in workbook.sheet_names:
        worksheet = workbook.parse(sheet_name)
        worksheet['source'] = file_name
        worksheet_lists[sheet_name].append(worksheet)

worksheets = {
    sheet_name: pd.concat(sheet_list)
        for (sheet_name, sheet_list)
        in worksheet_lists.items()
}

writer = pd.ExcelWriter('family_reschedule.xlsx')

for sheet_name, df in worksheets.items():
    df.to_excel(writer, sheet_name=sheet_name, index=False)

writer.save()

考虑通过 运行 跨 sheet 个名称的外部迭代和跨工作簿的内部迭代构建具有 list/dict 理解的串联数据框列表:

import pandas as pd

path = "/path/to/workbooks"
workbooks = [f for f in os.listdir(path) if f.endswith(".xlsx")]
sheets = ["home", "office", "school"]

df_dicts = {
    sh: pd.concat(
        [pd.read_excel(os.path.join(path, wb), sheet_name=sh) 
         for wb in workbooks]
    )
    for sh in sheets
}

然后,导出到单个文件:

with pd.ExcelWriter('family_reschedule.xlsx') as writer:
   for sh, df in df_dict.items(): 
       df.to_excel(writer, sheet_name=sh, index=False)

   writer.save()