遍历 excel 个文件的 sheet 并追加如果 sheet 名称在 Python 中共享公共部分
Iterate through excel files' sheets and append if sheet names share common part in Python
假设我们有许多 excel 个文件,其中包含多个 sheet,如下所示:
Sheet 1: 2021_q1_bj
a b c d
0 1 2 23 2
1 2 3 45 5
Sheet 2: 2021_q2_bj
a b c d
0 1 2 23 6
1 2 3 45 7
Sheet3:2019_q1_sh
a b c
0 1 2 23
1 2 3 45
Sheet 4: 2019_q2_sh
a b c
0 1 2 23
1 2 3 40
如果所有 excel 文件中被 sheet 名称的 _
拆分的最后一部分相同,我希望将所有 sheet 附加到一个。即,sheet 1 将附加 sheet 2,因为它们都有共同的 bj
,如果另一个 excel 文件也有名称为 [=25 的 sheet =],它也将附加到这个,与 sheet 3 和 sheet 4.
相同的逻辑
如何在 Pandas 或其他 Python 软件包中实现这一点?
当前 excel 文件的预期结果为:
bj
:
a b c d
0 1 2 23 2
1 2 3 45 5
2 1 2 23 6
3 2 3 45 7
sh
:
a b c
0 1 2 23
1 2 3 45
2 1 2 23
3 2 3 40
参考代码:
import os, glob
import pandas as pd
files = glob.glob("*.xlsx")
for each in files:
dfs = pd.read_excel(each, sheet_name=None, index_col=[0])
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'Out_{n}.xlsx', index=False)
更新:
您可以从 this link 下载测试 excel 文件和最终预期结果。
尝试:
dfs = pd.read_excel('Downloads/WS_1.xlsx', sheet_name=None, index_col=[0])
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'Out_{n}.xlsx')
更新
import os, glob
import pandas as pd
files = glob.glob("Downloads/test_data/*.xlsx")
writer = pd.ExcelWriter('Downloads/test_data/Output_file.xlsx', engine='xlsxwriter')
excel_dict = {}
for each in files:
dfs = pd.read_excel(each, sheet_name=None, index_col=[0])
excel_dict.update(dfs)
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(writer, index=False, sheet_name=f'{n}')
writer.save()
writer.close()
我已经实现了整个过程,并通过下面的代码得到了最终的预期结果。
感谢提供更简洁的替代解决方案,或者如果可能的话给我一些建议:
import os, glob
import pandas as pd
from pandas import ExcelWriter
from datetime import datetime
def save_xls(dict_df, path):
writer = ExcelWriter(path)
for key in dict_df:
dict_df[key].to_excel(writer, key, index=False)
writer.save()
root_dir = './original/'
for root, subFolders, files in os.walk(root_dir):
# print(subFolders)
for file in files:
if '.xlsx' in file:
file_path = os.path.join(root_dir, file)
print(file)
f = pd.ExcelFile(file_path)
dict_dfs = {}
for sheet_name in f.sheet_names:
df_new = f.parse(sheet_name = sheet_name)
print(sheet_name)
## get the year and quarter from the sheet name
year, quarter, city = sheet_name.split("_")
# year, quarter, city = sheet_name.split("_")
df_new["year"] = year
df_new["quarter"] = quarter
df_new["city"] = city
dict_dfs[sheet_name] = df_new
save_xls(dict_df = dict_dfs, path = './add_columns_from_sheet_name/' + "new_" + file)
root_dir = './add_columns_from_sheet_name/'
list1 = []
df = pd.DataFrame()
for root, subFolders, files in os.walk(root_dir):
# print(subFolders)
for file in files:
if '.xlsx' in file:
# print(file)
city = file.split('_')[0]
# print(file)
file_path = os.path.join(root_dir, file)
# print(file_path)
dfs = pd.read_excel(file_path, sheet_name=None)
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
print(n)
timestr = datetime.utcnow().strftime('%Y%m%d-%H%M%S%f')[:-3]
g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'./output/{n}_{timestr}.xlsx', index=False)
file_set = set()
file_dir = './output/'
file_list = os.listdir(file_dir)
for file in file_list:
data_type = file.split('_')[0]
file_set.add(data_type)
print(file_set)
file_dir = './output'
file_list = os.listdir(file_dir)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
file_set = set()
for file in file_list:
if '.xlsx' in file:
# print(file)
df_temp = pd.read_excel(os.path.join(file_dir, file))
if 'bj' in file:
df1 = df1.append(df_temp)
elif 'sh' in file:
df2 = df2.append(df_temp)
elif 'gz' in file:
df3 = df3.append(df_temp)
elif 'sz' in file:
df4 = df4.append(df_temp)
# function
def dfs_tabs(df_list, sheet_list, file_name):
writer = pd.ExcelWriter(file_name,engine='xlsxwriter')
for dataframe, sheet in zip(df_list, sheet_list):
dataframe.to_excel(writer, sheet_name=sheet, startrow=0 , startcol=0, index=False)
writer.save()
# list of dataframes and sheet names
dfs = [df1, df2, df3, df4]
sheets = ['bj', 'sh', 'gz', 'sz']
# run function
dfs_tabs(dfs, sheets, './final/final_result.xlsx')
假设我们有许多 excel 个文件,其中包含多个 sheet,如下所示:
Sheet 1: 2021_q1_bj
a b c d
0 1 2 23 2
1 2 3 45 5
Sheet 2: 2021_q2_bj
a b c d
0 1 2 23 6
1 2 3 45 7
Sheet3:2019_q1_sh
a b c
0 1 2 23
1 2 3 45
Sheet 4: 2019_q2_sh
a b c
0 1 2 23
1 2 3 40
如果所有 excel 文件中被 sheet 名称的 _
拆分的最后一部分相同,我希望将所有 sheet 附加到一个。即,sheet 1 将附加 sheet 2,因为它们都有共同的 bj
,如果另一个 excel 文件也有名称为 [=25 的 sheet =],它也将附加到这个,与 sheet 3 和 sheet 4.
如何在 Pandas 或其他 Python 软件包中实现这一点?
当前 excel 文件的预期结果为:
bj
:
a b c d
0 1 2 23 2
1 2 3 45 5
2 1 2 23 6
3 2 3 45 7
sh
:
a b c
0 1 2 23
1 2 3 45
2 1 2 23
3 2 3 40
参考代码:
import os, glob
import pandas as pd
files = glob.glob("*.xlsx")
for each in files:
dfs = pd.read_excel(each, sheet_name=None, index_col=[0])
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'Out_{n}.xlsx', index=False)
更新:
您可以从 this link 下载测试 excel 文件和最终预期结果。
尝试:
dfs = pd.read_excel('Downloads/WS_1.xlsx', sheet_name=None, index_col=[0])
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'Out_{n}.xlsx')
更新
import os, glob
import pandas as pd
files = glob.glob("Downloads/test_data/*.xlsx")
writer = pd.ExcelWriter('Downloads/test_data/Output_file.xlsx', engine='xlsxwriter')
excel_dict = {}
for each in files:
dfs = pd.read_excel(each, sheet_name=None, index_col=[0])
excel_dict.update(dfs)
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(writer, index=False, sheet_name=f'{n}')
writer.save()
writer.close()
我已经实现了整个过程,并通过下面的代码得到了最终的预期结果。
感谢提供更简洁的替代解决方案,或者如果可能的话给我一些建议:
import os, glob
import pandas as pd
from pandas import ExcelWriter
from datetime import datetime
def save_xls(dict_df, path):
writer = ExcelWriter(path)
for key in dict_df:
dict_df[key].to_excel(writer, key, index=False)
writer.save()
root_dir = './original/'
for root, subFolders, files in os.walk(root_dir):
# print(subFolders)
for file in files:
if '.xlsx' in file:
file_path = os.path.join(root_dir, file)
print(file)
f = pd.ExcelFile(file_path)
dict_dfs = {}
for sheet_name in f.sheet_names:
df_new = f.parse(sheet_name = sheet_name)
print(sheet_name)
## get the year and quarter from the sheet name
year, quarter, city = sheet_name.split("_")
# year, quarter, city = sheet_name.split("_")
df_new["year"] = year
df_new["quarter"] = quarter
df_new["city"] = city
dict_dfs[sheet_name] = df_new
save_xls(dict_df = dict_dfs, path = './add_columns_from_sheet_name/' + "new_" + file)
root_dir = './add_columns_from_sheet_name/'
list1 = []
df = pd.DataFrame()
for root, subFolders, files in os.walk(root_dir):
# print(subFolders)
for file in files:
if '.xlsx' in file:
# print(file)
city = file.split('_')[0]
# print(file)
file_path = os.path.join(root_dir, file)
# print(file_path)
dfs = pd.read_excel(file_path, sheet_name=None)
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
print(n)
timestr = datetime.utcnow().strftime('%Y%m%d-%H%M%S%f')[:-3]
g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'./output/{n}_{timestr}.xlsx', index=False)
file_set = set()
file_dir = './output/'
file_list = os.listdir(file_dir)
for file in file_list:
data_type = file.split('_')[0]
file_set.add(data_type)
print(file_set)
file_dir = './output'
file_list = os.listdir(file_dir)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
file_set = set()
for file in file_list:
if '.xlsx' in file:
# print(file)
df_temp = pd.read_excel(os.path.join(file_dir, file))
if 'bj' in file:
df1 = df1.append(df_temp)
elif 'sh' in file:
df2 = df2.append(df_temp)
elif 'gz' in file:
df3 = df3.append(df_temp)
elif 'sz' in file:
df4 = df4.append(df_temp)
# function
def dfs_tabs(df_list, sheet_list, file_name):
writer = pd.ExcelWriter(file_name,engine='xlsxwriter')
for dataframe, sheet in zip(df_list, sheet_list):
dataframe.to_excel(writer, sheet_name=sheet, startrow=0 , startcol=0, index=False)
writer.save()
# list of dataframes and sheet names
dfs = [df1, df2, df3, df4]
sheets = ['bj', 'sh', 'gz', 'sz']
# run function
dfs_tabs(dfs, sheets, './final/final_result.xlsx')