合并名称相似的 CSV python
Merging CSVs with similar name python
概要
给定一个目录,其中包含以 Prefix-Year.csv 模式命名的 CSV 文件,创建一组名为 Prefix-aggregate.csv 的新 CSV 文件,其中每个聚合文件都是具有相同前缀的所有 CSV 文件的合并结果。
说明
我有一个包含 5,500 个以这种模式命名的 CSV 文件的目录:Prefix-Year.csv。示例:
18394-1999.csv
. . . //consecutive years
18394-2014.csv
18395-1999.csv //next location
我想将具有共同前缀的文件分组并组合成名为 Prefix-aggregate.csv 的文件。
这个怎么样:
import os
import pandas as pd
# Merge every CSV in data_dir whose name starts with 18394 into one file.
# next() takes only the first tuple os.walk yields, i.e. the top level of
# data_dir; subdirectories are not searched.
root, dirs, files = next(os.walk('data_dir'))
# Mode 'a' appends: rows from an earlier run stay in the output file.
with open('18394_aggregate.csv', 'a') as outfile:
    # Sort so the rows are appended in file-name (and therefore year) order
    # rather than in whatever order the filesystem lists the files.
    for infile in sorted(files):
        if infile.startswith('18394') and infile.endswith('.csv'):
            # header=None tells pandas the file has no header row, so every
            # line is data. A bool is rejected by current pandas releases,
            # which is why the original header=False had to change.
            df = pd.read_csv(os.path.join(root, infile), header=None)
            df.to_csv(outfile, index=False, header=False)
您的问题的解决方案是下面的 find_filesets() 方法。我还包含了一个基于上面那段代码的 CSV 合并方法 create_merged_csv()。
#!/usr/bin/env python
import glob
import random
import os
import pandas
def rm_minus_rf(dirname):
    """Recursively delete ``dirname`` and everything beneath it.

    Nothing outside ``dirname`` is touched. A ``dirname`` that does not
    exist is silently ignored, because ``os.walk`` yields nothing for a
    path it cannot list.

    Args:
        dirname: Path of the directory tree to remove.

    Raises:
        OSError: If a file or directory cannot be removed.
    """
    # Walk bottom-up so every directory is already empty when it is removed.
    # A top-down walk reaches the root while its subdirectories still exist,
    # and os.removedirs() would additionally prune empty *parent* directories
    # that lie outside dirname.
    for root, dirs, files in os.walk(dirname, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dirs:
            entry = os.path.join(root, name)
            # os.walk does not descend into symlinked directories, so the
            # link itself has to be unlinked here or rmdir(root) would fail.
            if os.path.islink(entry):
                os.remove(entry)
        os.rmdir(root)
def create_testfiles(path):
    """Fill a fresh directory ``path`` with empty ``<prefix>-<year>.csv`` files.

    Whatever exists at ``path`` is removed first via ``rm_minus_rf``. Ten
    random five-digit prefixes are drawn, and each one gets between 0 and 20
    files whose year lies between 1995 and 2015.

    Args:
        path: Directory to (re)create and populate.
    """
    rm_minus_rf(path)
    os.mkdir(path)
    random.seed()
    for _ in range(10):
        prefix = random.randint(10000, 99999)
        file_count = random.randint(0, 20)
        for _ in range(file_count):
            # A repeated year just reopens and truncates the same empty file.
            year = 2015 - random.randint(0, 20)
            open("{}/{}-{}.csv".format(path, prefix, year), "w").close()
def find_filesets(path="."):
    """Group ``<prefix>-<year>.csv`` files in ``path`` by prefix and merge them.

    Every file directly inside ``path`` that matches ``*-*.csv`` is assigned
    to the group named by the text before its first hyphen. Each group is
    printed and then handed to ``create_merged_csv``.

    Args:
        path: Directory to scan (not recursive). Defaults to the current
            directory.
    """
    csv_files = {}
    # glob returns names in arbitrary order; sorting keeps each group's files
    # in name (and therefore year) order from run to run.
    for name in sorted(glob.glob("{}/*-*.csv".format(path))):
        basename = os.path.basename(name)
        # Aggregates written by an earlier run also match "*-*.csv". Feeding
        # one back in would merge an output file into itself.
        if basename.endswith("-aggregate.csv"):
            continue
        # The glob pattern guarantees a hyphen, so this is the prefix.
        key = basename.partition("-")[0]
        csv_files.setdefault(key, []).append(name)
    for key in sorted(csv_files):
        filelist = csv_files[key]
        # A single pre-formatted argument prints identically whether print is
        # the Python 2 statement or the Python 3 function.
        print("{} {}".format(key, filelist))
        create_merged_csv(key, filelist)
def create_merged_csv(key, filelist):
    """Concatenate the CSV files in ``filelist`` into ``<key>-aggregate.csv``.

    The aggregate is written to the current working directory and replaces
    any existing file of that name. Inputs are read as header-less data and
    appended in list order; inputs with no content are skipped.

    Args:
        key: Prefix used to build the output file name.
        filelist: Paths of the CSV files to merge.
    """
    out_path = '{}-aggregate.csv'.format(key)
    # Start from an empty file, then let pandas append to it by path. Passing
    # a path instead of a binary-mode handle leaves the open mode and newline
    # handling to pandas, which works on Python 2 and 3 alike.
    open(out_path, 'w').close()
    for filename in filelist:
        try:
            # header=None means "no header row"; a bool is rejected by
            # current pandas releases.
            df = pandas.read_csv(filename, header=None)
        except pandas.errors.EmptyDataError:
            # An empty input contributes no rows; it must not abort the merge.
            continue
        df.to_csv(out_path, mode='a', index=False, header=False)
# Directory that the demo run below wipes and repopulates.
TEST_DIR_NAME = "testfiles"

# Generate and merge the sample data only when executed as a script, so that
# importing this module does not delete or create anything on disk.
if __name__ == "__main__":
    create_testfiles(TEST_DIR_NAME)
    find_filesets(TEST_DIR_NAME)
概要
给定一个目录,其中包含以 Prefix-Year.csv 模式命名的 CSV 文件,创建一组名为 Prefix-aggregate.csv 的新 CSV 文件,其中每个聚合文件都是具有相同前缀的所有 CSV 文件的合并结果。
说明
我有一个包含 5,500 个以这种模式命名的 CSV 文件的目录:Prefix-Year.csv。示例:
18394-1999.csv
. . . //consecutive years
18394-2014.csv
18395-1999.csv //next location
我想将具有共同前缀的文件分组并组合成名为 Prefix-aggregate.csv 的文件。
这个怎么样:
import os
import pandas as pd
# Merge every CSV in data_dir whose name starts with 18394 into one file.
# next() takes only the first tuple os.walk yields, i.e. the top level of
# data_dir; subdirectories are not searched.
root, dirs, files = next(os.walk('data_dir'))
# Mode 'a' appends: rows from an earlier run stay in the output file.
with open('18394_aggregate.csv', 'a') as outfile:
    # Sort so the rows are appended in file-name (and therefore year) order
    # rather than in whatever order the filesystem lists the files.
    for infile in sorted(files):
        if infile.startswith('18394') and infile.endswith('.csv'):
            # header=None tells pandas the file has no header row, so every
            # line is data. A bool is rejected by current pandas releases,
            # which is why the original header=False had to change.
            df = pd.read_csv(os.path.join(root, infile), header=None)
            df.to_csv(outfile, index=False, header=False)
您的问题的解决方案是下面的 find_filesets() 方法。我还包含了一个基于上面那段代码的 CSV 合并方法 create_merged_csv()。
#!/usr/bin/env python
import glob
import random
import os
import pandas
def rm_minus_rf(dirname):
    """Recursively delete ``dirname`` and everything beneath it.

    Nothing outside ``dirname`` is touched. A ``dirname`` that does not
    exist is silently ignored, because ``os.walk`` yields nothing for a
    path it cannot list.

    Args:
        dirname: Path of the directory tree to remove.

    Raises:
        OSError: If a file or directory cannot be removed.
    """
    # Walk bottom-up so every directory is already empty when it is removed.
    # A top-down walk reaches the root while its subdirectories still exist,
    # and os.removedirs() would additionally prune empty *parent* directories
    # that lie outside dirname.
    for root, dirs, files in os.walk(dirname, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dirs:
            entry = os.path.join(root, name)
            # os.walk does not descend into symlinked directories, so the
            # link itself has to be unlinked here or rmdir(root) would fail.
            if os.path.islink(entry):
                os.remove(entry)
        os.rmdir(root)
def create_testfiles(path):
    """Fill a fresh directory ``path`` with empty ``<prefix>-<year>.csv`` files.

    Whatever exists at ``path`` is removed first via ``rm_minus_rf``. Ten
    random five-digit prefixes are drawn, and each one gets between 0 and 20
    files whose year lies between 1995 and 2015.

    Args:
        path: Directory to (re)create and populate.
    """
    rm_minus_rf(path)
    os.mkdir(path)
    random.seed()
    for _ in range(10):
        prefix = random.randint(10000, 99999)
        file_count = random.randint(0, 20)
        for _ in range(file_count):
            # A repeated year just reopens and truncates the same empty file.
            year = 2015 - random.randint(0, 20)
            open("{}/{}-{}.csv".format(path, prefix, year), "w").close()
def find_filesets(path="."):
    """Group ``<prefix>-<year>.csv`` files in ``path`` by prefix and merge them.

    Every file directly inside ``path`` that matches ``*-*.csv`` is assigned
    to the group named by the text before its first hyphen. Each group is
    printed and then handed to ``create_merged_csv``.

    Args:
        path: Directory to scan (not recursive). Defaults to the current
            directory.
    """
    csv_files = {}
    # glob returns names in arbitrary order; sorting keeps each group's files
    # in name (and therefore year) order from run to run.
    for name in sorted(glob.glob("{}/*-*.csv".format(path))):
        basename = os.path.basename(name)
        # Aggregates written by an earlier run also match "*-*.csv". Feeding
        # one back in would merge an output file into itself.
        if basename.endswith("-aggregate.csv"):
            continue
        # The glob pattern guarantees a hyphen, so this is the prefix.
        key = basename.partition("-")[0]
        csv_files.setdefault(key, []).append(name)
    for key in sorted(csv_files):
        filelist = csv_files[key]
        # A single pre-formatted argument prints identically whether print is
        # the Python 2 statement or the Python 3 function.
        print("{} {}".format(key, filelist))
        create_merged_csv(key, filelist)
def create_merged_csv(key, filelist):
    """Concatenate the CSV files in ``filelist`` into ``<key>-aggregate.csv``.

    The aggregate is written to the current working directory and replaces
    any existing file of that name. Inputs are read as header-less data and
    appended in list order; inputs with no content are skipped.

    Args:
        key: Prefix used to build the output file name.
        filelist: Paths of the CSV files to merge.
    """
    out_path = '{}-aggregate.csv'.format(key)
    # Start from an empty file, then let pandas append to it by path. Passing
    # a path instead of a binary-mode handle leaves the open mode and newline
    # handling to pandas, which works on Python 2 and 3 alike.
    open(out_path, 'w').close()
    for filename in filelist:
        try:
            # header=None means "no header row"; a bool is rejected by
            # current pandas releases.
            df = pandas.read_csv(filename, header=None)
        except pandas.errors.EmptyDataError:
            # An empty input contributes no rows; it must not abort the merge.
            continue
        df.to_csv(out_path, mode='a', index=False, header=False)
# Directory that the demo run below wipes and repopulates.
TEST_DIR_NAME = "testfiles"

# Generate and merge the sample data only when executed as a script, so that
# importing this module does not delete or create anything on disk.
if __name__ == "__main__":
    create_testfiles(TEST_DIR_NAME)
    find_filesets(TEST_DIR_NAME)