Python - 加载多个 Excel 文件（每个文件包含多个工作表），并只读取指定的列

Question

我有一个问题场景，我需要使用 Python

加载 excel 文件

从文件夹加载多个 excel 文件 - 完成
每个 excel 文件有多个工作表 - 完成
只需加载所需的列（'Receive Date','Process Date','Process Number','Task Name','Series','Office','Department','Unit Manager','AM'），其他列应被忽略/丢弃；如果某些工作表中不存在上述某些列，不应报错。
将所有数据加载到单个数据帧中

------代码------

import pandas as pd
import os
import glob

def getfilepath(path='C:/Users/Tracking Logs/'):
    """Return the paths of all ``.xlsx`` workbooks directly inside *path*.

    Args:
        path: Folder to search. A trailing separator is optional.

    Returns:
        A sorted list of matching file paths; empty when the folder contains
        no ``.xlsx`` files.

    Raises:
        FileNotFoundError: If *path* is not an existing directory.
    """
    if not os.path.isdir(path):
        raise FileNotFoundError('Folder not found: {}'.format(path))
    # os.path.join works with or without a trailing slash, unlike
    # plain string concatenation of path + "*.xlsx".
    return sorted(glob.glob(os.path.join(path, '*.xlsx')))


# Columns to keep from every worksheet; everything else is dropped.
WANTED_COLS = ('Receive Date', 'Process Date', 'Process Number', 'Task Name',
               'Series', 'Office', 'Department', 'Unit Manager', 'AM/AA/PC')


def getdatafromexcel(allfiles=None, cols=WANTED_COLS):
    """Load the wanted columns from every sheet of every workbook.

    Args:
        allfiles: Iterable of workbook paths. When ``None``, the workbooks
            returned by ``getfilepath()`` are used.
        cols: Column names to keep. A column that is absent from a sheet is
            simply skipped for that sheet; no error is raised.

    Returns:
        A single DataFrame holding the rows of all sheets, with the columns
        ordered as in *cols*. Columns never seen in any sheet are present but
        empty. An empty DataFrame is returned when nothing was loaded.
    """
    if allfiles is None:
        allfiles = getfilepath()
    wanted = set(cols)
    frames = []
    for file in allfiles:
        # sheet_name=None returns {sheet name: DataFrame} for all sheets.
        # A callable usecols keeps only matching headers and, unlike a list,
        # does not raise when some of the wanted columns are missing.
        sheets = pd.read_excel(
            file,
            sheet_name=None,
            na_values='null',
            keep_default_na=False,
            dtype=object,
            usecols=lambda name: name in wanted,
        )
        frames.extend(df for df in sheets.values() if not df.empty)
    if not frames:
        # pd.concat refuses an empty list, so build the empty result directly.
        return pd.DataFrame(columns=list(cols))
    combined = pd.concat(frames, ignore_index=True)
    return combined.reindex(columns=list(cols))


if __name__ == '__main__':
    fulldf = getdatafromexcel(getfilepath())
    print(fulldf)

Answer 1

可以使用 pd.ExcelFile 和 pd.read_excel 来获得所需的结果。

def getdatafromexcel(files=None, csv_path=None):
    """Read every sheet of every workbook and combine them into one DataFrame.

    Args:
        files: Iterable of workbook paths. When ``None``, the module-level
            ``allfiles`` list is used.
        csv_path: Optional path of a CSV file to write the combined data to.
            Nothing is written when it is ``None``.

    Returns:
        The combined DataFrame (empty when there was nothing to read).
    """
    if files is None:
        files = allfiles
    frames = []
    for file in files:
        # ExcelFile opens the workbook once; the context manager closes it.
        with pd.ExcelFile(file) as xl:
            # Iterate over the real sheet names rather than assuming the
            # sheets are called "1", "2", ...
            for sheet in xl.sheet_names:
                df = pd.read_excel(xl, sheet_name=sheet)
                # Do selection, preprocessing what you want here
                frames.append(df)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    df_1 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if csv_path is not None:
        df_1.to_csv(csv_path, index=False)
    return df_1

Answer 2

我找到了解决方案：

import pandas as pd
import os
import glob
from IPython.display import HTML,display
from openpyxl import load_workbook    

# Folder that holds the tracking-log workbooks.
path = 'C:/Users/Tracking Logs/'
# Columns to keep from each worksheet.
cols = [
    'Receive Date',
    'Process Date',
    'Task Name',
    'Series',
    'Office',
    'Department',
    'Unit Manager',
    'AM/AA/PC',
]

def getfilepath(path):
    """Return the paths of all ``.xlsx`` workbooks directly inside *path*.

    Args:
        path: Folder to search. A trailing separator is optional.

    Returns:
        A sorted list of matching file paths; empty when the folder contains
        no ``.xlsx`` files.

    Raises:
        FileNotFoundError: If *path* is not an existing directory.
    """
    if not os.path.isdir(path):
        raise FileNotFoundError('Folder not found: {}'.format(path))
    # os.path.join works with or without a trailing slash, whereas
    # path + "*.xlsx" silently matches nothing when the slash is missing.
    allfiles = sorted(glob.glob(os.path.join(path, '*.xlsx')))
    return allfiles

def getdatafromexcel(cols,allfiles):
    """Load the requested columns from every sheet of every workbook.

    Progress (file counter, file name, sheet name) is printed, each cleaned
    sheet is shown with ``display``, and the combined frame is shown at the
    end.

    Args:
        cols: Column names to keep. A column that is absent from a sheet is
            skipped for that sheet instead of raising.
        allfiles: Paths of the Excel workbooks to read.

    Returns:
        One DataFrame with the rows of all sheets, columns ordered as in
        *cols*. An empty DataFrame with those columns is returned when no
        data was loaded.
    """
    wanted = set(cols)
    # A row counts as blank when every one of these columns that exists in
    # the sheet holds an empty string (keep_default_na=False keeps '' as '').
    key_cols = ('Task Name', 'Series', 'Office')
    frames = []
    for i, filename in enumerate(allfiles):
        print('\nCounter: ',i,' \nFilenames: ',filename)
        # Open the workbook once and let the context manager close it after
        # all of its sheets have been read.
        with pd.ExcelFile(filename) as workbook:
            for sheetname in workbook.sheet_names:
                print('Sheetname: ',sheetname)
                try:
                    # A callable usecols does not raise when some wanted
                    # columns are missing from the sheet.
                    df = pd.read_excel(workbook, sheet_name=sheetname, na_values='null',
                                       usecols=lambda name: name in wanted,
                                       keep_default_na=False, dtype=object)
                except Exception as e:
                    # Best effort: report the unreadable sheet and carry on.
                    print(e)
                    continue
                present = [c for c in key_cols if c in df.columns]
                if present:
                    blank = (df[present] == '').all(axis=1)
                    df = df[~blank]
                display(df)
                frames.append(df)
    if not frames:
        # pd.concat refuses an empty list; there is nothing to display.
        print('No data found')
        return pd.DataFrame(columns=list(cols))
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    fulldf = pd.concat(frames, ignore_index=True).reindex(columns=list(cols))
    display(fulldf)
    return fulldf
    
# Collect the workbook paths, then load and show their combined contents.
allfiles = getfilepath(path)
getdatafromexcel(cols=cols, allfiles=allfiles)

Python - 加载多个 Excel 文件（每个文件包含多个工作表），并只读取指定的列

Python - Load multiple excel files with multiple sheets in it with specific columns

xlrd

python-3.x

pandas