如何使用 pandas 从文件夹中读取和合并名称相似的 .csv 文件
How to read and combine .csv files with similar names from a folder using pandas
我的文件夹 C:\Downloads 中有如下文件:
-
Mango001-003.csv
Mango004-006.csv
Mango007-100.csv
Applefruit.csv
Banana001-003.csv
Banana004-006.csv
如何单独导入水果文件,然后将相同的水果文件合并成一个文件?
预期的是 Mango 输出一个,Apple 输出一个,Banana 输出一个
import os
import re
def load_files(filenames, folder='.', pattern=r'Mango.*?\.csv'):
    """Read every CSV whose name matches ``pattern`` and stack them into one frame.

    Args:
        filenames: Iterable of file names (bare names such as those returned
            by ``os.listdir``) or paths. Only the base name is matched.
        folder: Directory that relative names are resolved against.
        pattern: Regular expression the base name must match in full.

    Returns:
        One DataFrame holding the rows of all matching files with a fresh
        0..n-1 index, or an empty DataFrame when no name matches.
    """
    # Local import: this snippet's import block only brings in os and re.
    import pandas as pd

    regex = re.compile(pattern)
    frames = []
    for filename in filenames:
        # fullmatch on the base name so e.g. 'Mango001.csv.bak' is skipped.
        if regex.fullmatch(os.path.basename(filename)) is None:
            continue
        # read_csv needs a path string, not the re.Match object itself.
        csv_path = os.path.join(folder, filename)
        frames.append(
            pd.read_csv(csv_path, index_col=None, header=0, dtype=object)
        )

    # pd.concat raises ValueError on an empty list, so handle "no match" here.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0, ignore_index=True)


if __name__ == "__main__":
    data_dir = r'C:\Downloads'
    # os.listdir order is arbitrary; sort so rows are stacked in name order.
    data_files = sorted(os.listdir(data_dir))
    df = load_files(data_files, folder=data_dir)
    print(df.shape)
    print(df.head(2))
我遇到了错误。另外,这件事不应该这么复杂,一定是我哪里做错了。
也许这不是最好的方法,但是对于给定的文件名...
尝试:
import pandas as pd
import glob
import re
path = r'./files'  # use your path
# glob order is arbitrary; sort so rows are stacked in file-name order.
all_files = sorted(glob.glob(path + "/*.csv"))


def fruit_name(file):
    """Return the fruit a CSV path belongs to, or None if it cannot be derived.

    The fruit is the leading run of ASCII letters of the file's base name
    after the substring 'fruit' has been removed, so 'Applefruit.csv' and
    'Apple001.csv' both map to 'Apple'.
    """
    # Split on both separators: glob returns backslash paths on Windows.
    base_name = re.split(r'[\\/]', file)[-1]
    match = re.match(r'[A-Za-z]+', base_name.replace('fruit', ''))
    return match.group(0) if match else None


def combine_by_fruit(files):
    """Read each CSV in ``files`` and concatenate the frames per fruit.

    Returns:
        A dict mapping fruit name to one DataFrame with a fresh index.
        Files whose name yields no fruit are skipped.
    """
    grouped = {}
    for file in files:
        fruit = fruit_name(file)
        if fruit is None:
            continue
        grouped.setdefault(fruit, []).append(pd.read_csv(file))
    # Every list holds at least one frame, so concat never sees an empty list.
    return {
        fruit: pd.concat(frames, ignore_index=True)
        for fruit, frames in grouped.items()
    }


# Fruit detected for every file, in the same order as all_files.
fruits = [fruit_name(file) for file in all_files]

combined = combine_by_fruit(all_files)

# One output each for Mango, Banana and Apple; a fruit with no files yields
# an empty frame so the names below are always defined.
df_mango = combined.get('Mango', pd.DataFrame())
df_banana = combined.get('Banana', pd.DataFrame())
df_apple = combined.get('Apple', pd.DataFrame())
print(df_mango.shape, df_banana.shape, df_apple.shape)
我认为最简单的方法是使用 glob.glob
获取以特定水果名称开头的所有文件的列表(这里我以芒果为例),并使用 pd.concat 将它们连接在一起。
data_files = r"path\to\folder\containing\csv"
output_path = 'mango.csv'

# Sorted so rows are stacked in file-name order. The output file is excluded
# because it matches 'mango*.csv' itself when the script runs inside
# data_files, which would feed a previous result back in on a re-run.
mango_paths = [
    csv_path
    for csv_path in sorted(glob.glob(os.path.join(data_files, 'mango*.csv')))
    if os.path.abspath(csv_path) != os.path.abspath(output_path)
]

if mango_paths:
    df_mango = pd.concat(map(pd.read_csv, mango_paths), ignore_index=True)
    df_mango.to_csv(output_path)
else:
    # pd.concat raises ValueError on an empty sequence; fall back to an
    # empty frame and write nothing.
    df_mango = pd.DataFrame()
    print('No mango*.csv files found in', data_files)
这是我试过的例子:
mango0110.csv
A B C
0 1 2 3
mango01220.csv
A B C
0 4 5 6
To get:
A B C
0 1 2 3
1 4 5 6
谢谢@Vidya Ganesh
data_files = r'C:\Downloads'
list_file_names = ['Mango', 'Apple', 'Banana']


def combine_fruit_files(folder, fruit, exclude=None):
    """Concatenate every '<fruit>*.csv' in ``folder`` into one DataFrame.

    Args:
        folder: Directory to search.
        fruit: File-name prefix, e.g. 'Mango'.
        exclude: Optional path to leave out of the inputs (typically the
            combined output file, which matches the same glob pattern).

    Returns:
        The combined DataFrame with a fresh index, or None when no input
        file matches.
    """
    # glob order is arbitrary; sort so rows are stacked in file-name order.
    csv_paths = sorted(glob.glob(os.path.join(folder, fruit + '*.csv')))
    if exclude is not None:
        skipped = os.path.abspath(exclude)
        csv_paths = [p for p in csv_paths if os.path.abspath(p) != skipped]
    # pd.concat raises ValueError on an empty sequence.
    if not csv_paths:
        return None
    return pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)


for name in list_file_names:
    output_path = name + ".csv"
    df = combine_fruit_files(data_files, name, exclude=output_path)
    if df is None:
        print(name, '- no matching CSV files, skipped')
        continue
    # Label-based slice is inclusive: keeps index labels 0..1000.
    df = df.loc[:1000, :]
    print(name)
    print(df.shape)
    df.to_csv(output_path)
我的文件夹 C:\Downloads 中有如下文件:
-
Mango001-003.csv
Mango004-006.csv
Mango007-100.csv
Applefruit.csv
Banana001-003.csv
Banana004-006.csv
如何单独导入水果文件,然后将相同的水果文件合并成一个文件?
预期的是 Mango 输出一个,Apple 输出一个,Banana 输出一个
import os
import re
def load_files(filenames, folder='.', pattern=r'Mango.*?\.csv'):
    """Read every CSV whose name matches ``pattern`` and stack them into one frame.

    Args:
        filenames: Iterable of file names (bare names such as those returned
            by ``os.listdir``) or paths. Only the base name is matched.
        folder: Directory that relative names are resolved against.
        pattern: Regular expression the base name must match in full.

    Returns:
        One DataFrame holding the rows of all matching files with a fresh
        0..n-1 index, or an empty DataFrame when no name matches.
    """
    # Local import: this snippet's import block only brings in os and re.
    import pandas as pd

    regex = re.compile(pattern)
    frames = []
    for filename in filenames:
        # fullmatch on the base name so e.g. 'Mango001.csv.bak' is skipped.
        if regex.fullmatch(os.path.basename(filename)) is None:
            continue
        # read_csv needs a path string, not the re.Match object itself.
        csv_path = os.path.join(folder, filename)
        frames.append(
            pd.read_csv(csv_path, index_col=None, header=0, dtype=object)
        )

    # pd.concat raises ValueError on an empty list, so handle "no match" here.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0, ignore_index=True)


if __name__ == "__main__":
    data_dir = r'C:\Downloads'
    # os.listdir order is arbitrary; sort so rows are stacked in name order.
    data_files = sorted(os.listdir(data_dir))
    df = load_files(data_files, folder=data_dir)
    print(df.shape)
    print(df.head(2))
我遇到了错误。另外,这件事不应该这么复杂,一定是我哪里做错了。
也许这不是最好的方法,但是对于给定的文件名...
尝试:
import pandas as pd
import glob
import re
path = r'./files'  # use your path
# glob order is arbitrary; sort so rows are stacked in file-name order.
all_files = sorted(glob.glob(path + "/*.csv"))


def fruit_name(file):
    """Return the fruit a CSV path belongs to, or None if it cannot be derived.

    The fruit is the leading run of ASCII letters of the file's base name
    after the substring 'fruit' has been removed, so 'Applefruit.csv' and
    'Apple001.csv' both map to 'Apple'.
    """
    # Split on both separators: glob returns backslash paths on Windows.
    base_name = re.split(r'[\\/]', file)[-1]
    match = re.match(r'[A-Za-z]+', base_name.replace('fruit', ''))
    return match.group(0) if match else None


def combine_by_fruit(files):
    """Read each CSV in ``files`` and concatenate the frames per fruit.

    Returns:
        A dict mapping fruit name to one DataFrame with a fresh index.
        Files whose name yields no fruit are skipped.
    """
    grouped = {}
    for file in files:
        fruit = fruit_name(file)
        if fruit is None:
            continue
        grouped.setdefault(fruit, []).append(pd.read_csv(file))
    # Every list holds at least one frame, so concat never sees an empty list.
    return {
        fruit: pd.concat(frames, ignore_index=True)
        for fruit, frames in grouped.items()
    }


# Fruit detected for every file, in the same order as all_files.
fruits = [fruit_name(file) for file in all_files]

combined = combine_by_fruit(all_files)

# One output each for Mango, Banana and Apple; a fruit with no files yields
# an empty frame so the names below are always defined.
df_mango = combined.get('Mango', pd.DataFrame())
df_banana = combined.get('Banana', pd.DataFrame())
df_apple = combined.get('Apple', pd.DataFrame())
print(df_mango.shape, df_banana.shape, df_apple.shape)
我认为最简单的方法是使用 glob.glob
获取以特定水果名称开头的所有文件的列表(这里我以芒果为例),并使用 pd.concat 将它们连接在一起。
data_files = r"path\to\folder\containing\csv"
output_path = 'mango.csv'

# Sorted so rows are stacked in file-name order. The output file is excluded
# because it matches 'mango*.csv' itself when the script runs inside
# data_files, which would feed a previous result back in on a re-run.
mango_paths = [
    csv_path
    for csv_path in sorted(glob.glob(os.path.join(data_files, 'mango*.csv')))
    if os.path.abspath(csv_path) != os.path.abspath(output_path)
]

if mango_paths:
    df_mango = pd.concat(map(pd.read_csv, mango_paths), ignore_index=True)
    df_mango.to_csv(output_path)
else:
    # pd.concat raises ValueError on an empty sequence; fall back to an
    # empty frame and write nothing.
    df_mango = pd.DataFrame()
    print('No mango*.csv files found in', data_files)
这是我试过的例子:
mango0110.csv
A B C
0 1 2 3
mango01220.csv
A B C
0 4 5 6
To get:
A B C
0 1 2 3
1 4 5 6
谢谢@Vidya Ganesh
data_files = r'C:\Downloads'
list_file_names = ['Mango', 'Apple', 'Banana']


def combine_fruit_files(folder, fruit, exclude=None):
    """Concatenate every '<fruit>*.csv' in ``folder`` into one DataFrame.

    Args:
        folder: Directory to search.
        fruit: File-name prefix, e.g. 'Mango'.
        exclude: Optional path to leave out of the inputs (typically the
            combined output file, which matches the same glob pattern).

    Returns:
        The combined DataFrame with a fresh index, or None when no input
        file matches.
    """
    # glob order is arbitrary; sort so rows are stacked in file-name order.
    csv_paths = sorted(glob.glob(os.path.join(folder, fruit + '*.csv')))
    if exclude is not None:
        skipped = os.path.abspath(exclude)
        csv_paths = [p for p in csv_paths if os.path.abspath(p) != skipped]
    # pd.concat raises ValueError on an empty sequence.
    if not csv_paths:
        return None
    return pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)


for name in list_file_names:
    output_path = name + ".csv"
    df = combine_fruit_files(data_files, name, exclude=output_path)
    if df is None:
        print(name, '- no matching CSV files, skipped')
        continue
    # Label-based slice is inclusive: keeps index labels 0..1000.
    df = df.loc[:1000, :]
    print(name)
    print(df.shape)
    df.to_csv(output_path)