如何使用 pandas 从文件夹中读取和合并名称相似的 .csv 文件

How to read and combine .csv files with similar names from a folder using pandas

我的文件夹 C:\Downloads 中有如下文件:

Mango001-003.csv
Mango004-006.csv
Mango007-100.csv
Applefruit.csv
Banana001-003.csv
Banana004-006.csv

如何单独导入水果文件,然后将相同的水果文件合并成一个文件?

预期的是 Mango 输出一个,Apple 输出一个,Banana 输出一个

import os
import re

import pandas as pd
# os.listdir returns the bare entry names of the folder, without the folder part.
data_files = os.listdir(r'C:\Downloads')
def load_files(filenames, directory=None, prefix='Mango'):
    """Read every CSV whose name starts with ``prefix`` and stack the rows.

    Args:
        filenames: Iterable of file names or paths, e.g. the result of
            ``os.listdir``. Only the base name is tested against ``prefix``.
        directory: Folder the names are relative to. ``os.listdir`` returns
            bare names, so pass the listed folder here unless it is the
            current working directory. ``None`` uses the names unchanged.
        prefix: Literal text the base name must start with.

    Returns:
        One DataFrame with the rows of all matching files, in the order the
        names were given, re-indexed from 0. Every column is read with
        ``dtype=object``.

    Raises:
        ValueError: If none of the names matches ``<prefix>*.csv``.
    """
    # Escape the prefix so it is literal text, and escape the dot so that
    # only a real ".csv" extension qualifies (a bare "." matches anything).
    pattern = re.compile(re.escape(prefix) + r'.*\.csv')
    selected = [
        name for name in filenames
        if pattern.fullmatch(os.path.basename(name))
    ]
    if not selected:
        raise ValueError(
            'no file names matching %r followed by ".csv" were found' % prefix
        )

    frames = []
    for name in selected:
        # read_csv needs a path string; a regex match object is not readable.
        csv_path = name if directory is None else os.path.join(directory, name)
        frames.append(
            pd.read_csv(csv_path, index_col=None, header=0, dtype=object)
        )

    # Stack the frames row-wise and renumber the index.
    return pd.concat(frames, axis=0, ignore_index=True)
    
# Combine the listed files, then report the frame's dimensions and preview
# its first two rows.
# NOTE(review): the bare `df.head(2)` expression only shows output in an
# interactive session such as a notebook; run as a script it prints nothing.
df  = load_files(data_files)
print(df.shape)
df.head(2)

我遇到了错误。另外,这件事不应该这么复杂,一定是我哪里做错了。

也许这不是最好的方法,但是对于给定的文件名...

尝试:

import pandas as pd
import glob
import re

path = r'./files'  # use your path
all_files = glob.glob(path + "/*.csv")


def fruit_name(file):
    """Return the fruit a CSV path belongs to, or ``None`` if it has none.

    The fruit is the leading run of letters in the file's base name after
    the filler word "fruit" is removed: "Mango001-003.csv" gives "Mango"
    and "Applefruit.csv" gives "Apple". The pattern may need tweaking if
    other file name shapes appear.
    """
    # Split on both separators. On Windows glob returns paths such as
    # "./files\\Mango001.csv", and a "/"-only pattern would then capture
    # the folder name instead of the file name.
    base_name = re.split(r'[\\/]', file)[-1]
    match = re.match(r'[A-Za-z]+', base_name.replace('fruit', ''))
    return match.group(0) if match else None


def combine_by_fruit(files):
    """Read each CSV and concatenate the frames that share a fruit name.

    Args:
        files: Iterable of CSV paths.

    Returns:
        Dict mapping each fruit name to a single DataFrame with a fresh
        0..n-1 index. Files are read in sorted path order so the row order
        does not depend on how the directory happens to be listed; files
        without a recognisable fruit name are skipped.
    """
    grouped = {}
    for file in sorted(files):
        fruit = fruit_name(file)
        if fruit is not None:
            grouped.setdefault(fruit, []).append(pd.read_csv(file))
    return {
        fruit: pd.concat(frames, ignore_index=True)
        for fruit, frames in grouped.items()
    }


# Fruit name of every file, in the same order as all_files.
fruits = [fruit_name(file) for file in all_files]
combined = combine_by_fruit(all_files)

# There will be one output for Mango, one for Apple & one for Banana.
# A fruit with no files gets an empty frame rather than an undefined name,
# so the summary below cannot fail with a NameError.
df_mango = combined.get('Mango', pd.DataFrame())
df_banana = combined.get('Banana', pd.DataFrame())
df_apple = combined.get('Apple', pd.DataFrame())

print(df_mango.shape, df_banana.shape, df_apple.shape)

我认为最简单的方法是使用 glob.glob 获取以特定水果名称开头的所有文件的列表(这里我使用芒果),并使用 pd.concat 将它们连接在一起。

data_files = r"path\to\folder\containing\csv"
output_file = 'mango.csv'

# Gather every CSV whose name starts with "mango".
# - The output file itself matches "mango*.csv", so it is excluded; otherwise
#   a rerun from inside the data folder would read the previous result back in.
# - The paths are sorted because glob's ordering depends on the file system,
#   and the row order of the combined frame follows the file order.
output_key = os.path.normcase(os.path.abspath(output_file))
mango_paths = sorted(
    csv_path
    for csv_path in glob.glob(os.path.join(data_files, 'mango*.csv'))
    if os.path.normcase(os.path.abspath(csv_path)) != output_key
)

# pd.concat raises ValueError when no file matched the pattern.
df_mango = pd.concat(map(pd.read_csv, mango_paths), ignore_index=True)
df_mango.to_csv(output_file)

这是我试过的例子:

mango0110.csv
   A  B  C
0  1  2  3
mango01220.csv
   A  B  C
0  4  5  6
To get:
   A  B  C
0  1  2  3
1  4  5  6

谢谢@Vidya Ganesh

data_files = r'C:\Downloads'
list_file_names = ['Mango', 'Apple', 'Banana']

# Build one combined CSV per fruit, written to the current working directory.
for name in list_file_names:
    output_file = name + ".csv"
    output_key = os.path.normcase(os.path.abspath(output_file))

    # "<name>.csv" itself matches "<name>*.csv", so skip it; otherwise a rerun
    # from inside the data folder would read the previous result back in.
    # Sort the paths because glob's ordering depends on the file system.
    source_paths = sorted(
        csv_path
        for csv_path in glob.glob(os.path.join(data_files, name + '*.csv'))
        if os.path.normcase(os.path.abspath(csv_path)) != output_key
    )

    # pd.concat raises ValueError when no file matched this fruit.
    df = pd.concat(map(pd.read_csv, source_paths), ignore_index=True)

    # .loc slices by label and includes the end label, so this keeps index
    # labels 0 through 1000 of the freshly numbered frame.
    df = df.loc[:1000, :]

    print(name)
    print(df.shape)
    df.to_csv(output_file)