Python - 加载多个 Excel 文件(每个文件包含多个工作表),并只读取指定的列
Python - Load multiple excel files with multiple sheets in it with specific columns
我有一个问题场景,我需要使用 Python
加载 excel 文件
- 从文件夹加载多个 excel 文件 - 完成
- 每个 excel 文件有多个工作表 - 完成
- 只需加载所需的列 ('Receive Date','Process Date','Process Number','Task Name','Series','Office','Department','Unit Manager','AM'),其他列应被忽略/丢弃;如果某些工作表中缺少上述某些列,也不应报错。
- 将所有数据加载到单个数据帧中
------代码------
import pandas as pd
import os
import glob
def getfilepath():
    """Return the paths of all ``.xlsx`` workbooks in the tracking-log folder.

    Returns:
        A list of file paths (empty when the folder has no ``.xlsx`` files).
        The list was previously built in a local variable and discarded, so
        no other function could reach it.
    """
    path = 'C:/Users/Tracking Logs/'
    return glob.glob(path + "*.xlsx")


def getdatafromexcel(allfiles=None):
    """Load the wanted columns from every sheet of every workbook.

    Args:
        allfiles: Workbook paths to read. When omitted, the paths come from
            ``getfilepath()``, so the zero-argument call keeps working.

    Returns:
        One DataFrame holding the rows of all sheets. Columns that a sheet
        does not contain are left empty (NaN) for that sheet's rows instead
        of raising an error; columns outside ``cols`` are never loaded.
    """
    cols = ('Receive Date', 'Process Date', 'Process Number', 'Task Name',
            'Series', 'Office', 'Department', 'Unit Manager', 'AM/AA/PC')
    if allfiles is None:
        allfiles = getfilepath()

    frames = []
    for file in allfiles:
        # sheet_name=None yields a dict {sheet name: DataFrame}, one entry
        # per sheet. A callable usecols keeps whichever wanted columns the
        # sheet actually has; a plain list raises when one is missing.
        sheets = pd.read_excel(
            file,
            sheet_name=None,
            na_values='null',
            keep_default_na=False,
            dtype=object,
            usecols=lambda name: name in cols,
        )
        # Skip sheets that contributed no rows or none of the wanted columns.
        frames.extend(sheet for sheet in sheets.values() if not sheet.empty)

    if frames:
        # reindex restores the requested column order and adds any column
        # that was absent from every sheet.
        df = pd.concat(frames, ignore_index=True).reindex(columns=list(cols))
    else:
        df = pd.DataFrame(columns=list(cols))
    # display() is supplied by the IPython/Jupyter environment.
    display(df)
    return df
# Script entry point: look up the workbook files, then load their data.
getfilepath()
getdatafromexcel()
可以使用 pd.ExcelFile 和 pd.read_excel 来获得所需的结果。
def getdatafromexcel(files=None, your_path=None):
    """Combine every sheet of every workbook into a single DataFrame.

    Args:
        files: Iterable of workbook paths. Defaults to the module-level
            ``allfiles`` list that the original snippet relied on.
        your_path: Optional output prefix. When given, the combined data is
            written to ``your_path + '.csv'`` without the index column.

    Returns:
        The combined DataFrame; empty when there is nothing to read.
    """
    if files is None:
        files = allfiles

    frames = []
    for file in files:
        # Open each workbook once; the context manager closes the handle.
        with pd.ExcelFile(file) as xl:
            # Iterate over the real sheet names. Formatting a counter as
            # '1', '2', ... only works if the sheets happen to be named so.
            for sheet in xl.sheet_names:
                df = pd.read_excel(xl, sheet_name=sheet)
                # Do selection, preprocessing what you want here
                frames.append(df)

    # Accumulate across all files; concat of an empty list raises, so an
    # empty frame is returned explicitly in that case.
    df_1 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if your_path is not None:
        df_1.to_csv(your_path + '.csv', index=False)
    return df_1
我找到了解决方案:
import pandas as pd
import os
import glob
from IPython.display import HTML,display
from openpyxl import load_workbook
# Folder that is searched for .xlsx workbooks.
path = 'C:/Users/Tracking Logs/'
# Column headers to load from each sheet.
cols = ['Receive Date','Process Date','Task Name','Series','Office','Department','Unit Manager','AM/AA/PC']
def getfilepath(path):
    """Return the ``.xlsx`` workbooks found directly inside *path*.

    Args:
        path: Folder to search. A trailing slash is optional.

    Returns:
        Sorted list of matching file paths; empty when nothing matches.
    """
    # os.path.join tolerates a missing trailing separator, which plain
    # string concatenation (path + "*.xlsx") does not.
    pattern = os.path.join(path, "*.xlsx")
    # glob order depends on the filesystem; sort so runs are reproducible.
    return sorted(glob.glob(pattern))
def getdatafromexcel(cols, allfiles):
    """Read the requested columns from every sheet of every workbook.

    Args:
        cols: Column headers to keep. A sheet that lacks some of them is
            still loaded; the missing columns are left empty (NaN) for
            that sheet's rows.
        allfiles: Paths of the workbooks to read.

    Returns:
        A DataFrame with the rows of all sheets, columns ordered as in
        ``cols``. Rows whose "Task Name", "Series" and "Office" cells are
        all empty strings are dropped (only the columns present in the
        sheet are checked).
    """
    wanted = list(cols)
    # A row counts as blank when every one of these columns is ''.
    blank_markers = ("Task Name", "Series", "Office")
    frames = []

    for i, filename in enumerate(allfiles):
        print('\nCounter: ', i, ' \nFilenames: ', filename)
        # Open the workbook once and reuse the handle for every sheet; the
        # context manager closes it even if a sheet fails to parse.
        with pd.ExcelFile(filename) as workbook:
            for sheetname in workbook.sheet_names:
                print('Sheetname: ', sheetname)
                try:
                    df = pd.read_excel(
                        workbook,
                        sheet_name=sheetname,
                        na_values='null',
                        # A callable filter keeps whichever wanted columns
                        # the sheet has; a plain list raises when one is
                        # missing.
                        usecols=lambda name: name in wanted,
                        keep_default_na=False,
                        dtype=object,
                    )
                    present = [c for c in blank_markers if c in df.columns]
                    if present:
                        is_blank = (df[present] == '').all(axis=1)
                        df = df[~is_blank]
                    display(df)
                    # Sheets with no rows or no wanted columns add nothing.
                    if not df.empty:
                        frames.append(df)
                except Exception as e:
                    # Best effort: report the problem sheet and carry on.
                    print(e)
                finally:
                    print('this executed')

    # Build the result once with concat. DataFrame.append was removed in
    # pandas 2.0, and the accumulator was never initialised before its
    # first use.
    if frames:
        fulldf = pd.concat(frames, ignore_index=True).reindex(columns=wanted)
    else:
        fulldf = pd.DataFrame(columns=wanted)
    display(fulldf)
    return fulldf
# Collect the workbook paths under `path`, then load the selected columns
# from all of them.
allfiles = getfilepath(path)
getdatafromexcel(cols,allfiles)
我有一个问题场景,我需要使用 Python 加载 excel 文件:
- 从文件夹加载多个 excel 文件 - 完成
- 每个 excel 文件有多个工作表 - 完成
- 只需加载所需的列 ('Receive Date','Process Date','Process Number','Task Name','Series','Office','Department','Unit Manager','AM'),其他列应被忽略/丢弃;如果某些工作表中缺少上述某些列,也不应报错。
- 将所有数据加载到单个数据帧中
------代码------
import pandas as pd
import os
import glob
def getfilepath():
    """Return the paths of all ``.xlsx`` workbooks in the tracking-log folder.

    Returns:
        A list of file paths (empty when the folder has no ``.xlsx`` files).
        The list was previously built in a local variable and discarded, so
        no other function could reach it.
    """
    path = 'C:/Users/Tracking Logs/'
    return glob.glob(path + "*.xlsx")


def getdatafromexcel(allfiles=None):
    """Load the wanted columns from every sheet of every workbook.

    Args:
        allfiles: Workbook paths to read. When omitted, the paths come from
            ``getfilepath()``, so the zero-argument call keeps working.

    Returns:
        One DataFrame holding the rows of all sheets. Columns that a sheet
        does not contain are left empty (NaN) for that sheet's rows instead
        of raising an error; columns outside ``cols`` are never loaded.
    """
    cols = ('Receive Date', 'Process Date', 'Process Number', 'Task Name',
            'Series', 'Office', 'Department', 'Unit Manager', 'AM/AA/PC')
    if allfiles is None:
        allfiles = getfilepath()

    frames = []
    for file in allfiles:
        # sheet_name=None yields a dict {sheet name: DataFrame}, one entry
        # per sheet. A callable usecols keeps whichever wanted columns the
        # sheet actually has; a plain list raises when one is missing.
        sheets = pd.read_excel(
            file,
            sheet_name=None,
            na_values='null',
            keep_default_na=False,
            dtype=object,
            usecols=lambda name: name in cols,
        )
        # Skip sheets that contributed no rows or none of the wanted columns.
        frames.extend(sheet for sheet in sheets.values() if not sheet.empty)

    if frames:
        # reindex restores the requested column order and adds any column
        # that was absent from every sheet.
        df = pd.concat(frames, ignore_index=True).reindex(columns=list(cols))
    else:
        df = pd.DataFrame(columns=list(cols))
    # display() is supplied by the IPython/Jupyter environment.
    display(df)
    return df
# Script entry point: look up the workbook files, then load their data.
getfilepath()
getdatafromexcel()
可以使用 pd.ExcelFile 和 pd.read_excel 来获得所需的结果。
def getdatafromexcel(files=None, your_path=None):
    """Combine every sheet of every workbook into a single DataFrame.

    Args:
        files: Iterable of workbook paths. Defaults to the module-level
            ``allfiles`` list that the original snippet relied on.
        your_path: Optional output prefix. When given, the combined data is
            written to ``your_path + '.csv'`` without the index column.

    Returns:
        The combined DataFrame; empty when there is nothing to read.
    """
    if files is None:
        files = allfiles

    frames = []
    for file in files:
        # Open each workbook once; the context manager closes the handle.
        with pd.ExcelFile(file) as xl:
            # Iterate over the real sheet names. Formatting a counter as
            # '1', '2', ... only works if the sheets happen to be named so.
            for sheet in xl.sheet_names:
                df = pd.read_excel(xl, sheet_name=sheet)
                # Do selection, preprocessing what you want here
                frames.append(df)

    # Accumulate across all files; concat of an empty list raises, so an
    # empty frame is returned explicitly in that case.
    df_1 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if your_path is not None:
        df_1.to_csv(your_path + '.csv', index=False)
    return df_1
我找到了解决方案:
import pandas as pd
import os
import glob
from IPython.display import HTML,display
from openpyxl import load_workbook
# Folder that is searched for .xlsx workbooks.
path = 'C:/Users/Tracking Logs/'
# Column headers to load from each sheet.
cols = ['Receive Date','Process Date','Task Name','Series','Office','Department','Unit Manager','AM/AA/PC']
def getfilepath(path):
    """Return the ``.xlsx`` workbooks found directly inside *path*.

    Args:
        path: Folder to search. A trailing slash is optional.

    Returns:
        Sorted list of matching file paths; empty when nothing matches.
    """
    # os.path.join tolerates a missing trailing separator, which plain
    # string concatenation (path + "*.xlsx") does not.
    pattern = os.path.join(path, "*.xlsx")
    # glob order depends on the filesystem; sort so runs are reproducible.
    return sorted(glob.glob(pattern))
def getdatafromexcel(cols, allfiles):
    """Read the requested columns from every sheet of every workbook.

    Args:
        cols: Column headers to keep. A sheet that lacks some of them is
            still loaded; the missing columns are left empty (NaN) for
            that sheet's rows.
        allfiles: Paths of the workbooks to read.

    Returns:
        A DataFrame with the rows of all sheets, columns ordered as in
        ``cols``. Rows whose "Task Name", "Series" and "Office" cells are
        all empty strings are dropped (only the columns present in the
        sheet are checked).
    """
    wanted = list(cols)
    # A row counts as blank when every one of these columns is ''.
    blank_markers = ("Task Name", "Series", "Office")
    frames = []

    for i, filename in enumerate(allfiles):
        print('\nCounter: ', i, ' \nFilenames: ', filename)
        # Open the workbook once and reuse the handle for every sheet; the
        # context manager closes it even if a sheet fails to parse.
        with pd.ExcelFile(filename) as workbook:
            for sheetname in workbook.sheet_names:
                print('Sheetname: ', sheetname)
                try:
                    df = pd.read_excel(
                        workbook,
                        sheet_name=sheetname,
                        na_values='null',
                        # A callable filter keeps whichever wanted columns
                        # the sheet has; a plain list raises when one is
                        # missing.
                        usecols=lambda name: name in wanted,
                        keep_default_na=False,
                        dtype=object,
                    )
                    present = [c for c in blank_markers if c in df.columns]
                    if present:
                        is_blank = (df[present] == '').all(axis=1)
                        df = df[~is_blank]
                    display(df)
                    # Sheets with no rows or no wanted columns add nothing.
                    if not df.empty:
                        frames.append(df)
                except Exception as e:
                    # Best effort: report the problem sheet and carry on.
                    print(e)
                finally:
                    print('this executed')

    # Build the result once with concat. DataFrame.append was removed in
    # pandas 2.0, and the accumulator was never initialised before its
    # first use.
    if frames:
        fulldf = pd.concat(frames, ignore_index=True).reindex(columns=wanted)
    else:
        fulldf = pd.DataFrame(columns=wanted)
    display(fulldf)
    return fulldf
# Collect the workbook paths under `path`, then load the selected columns
# from all of them.
allfiles = getfilepath(path)
getdatafromexcel(cols,allfiles)