Python: 如何获取多个 .csv 文件的列的第一个值 + 它的名称并用它们创建一个新文件
Python: How to take the first value of a column of multiple .csv files + its name and make a new file with them
我构建了多个时间序列 CSV 数据帧,如下所示:
例如:1.csv
,close,high,low,open,time,volumefrom,volumeto,timestamp
0,0.7,2.0,0.7,1.1,1499990400,49.17,78.14,2017-07-14 02:00:00
1,1.98,1.98,0.7,0.7,1500076800,5.69,9.93,2017-07-15 02:00:00
...
我想做的是创建一个 output.xls
文件,如下所示:
Filename, time
1, 1499990400
2, ...
...
其中 Filename
是 csv 名称(例如 1.csv
、2.csv
等),time
是每个文件第一行数据中 time 列的值。
我成功地设置了代码,但是有问题。这是我的尝试:
import glob
%cd /Users/Files/Daily/
output = open('output.csv', 'w')
output.write('filename\n; timestamp')
for filename in glob.glob('*.csv'):
if filename == 'output.csv':
continue
with open(filename, 'r') as input:
但写到这里之后,我每次继续往下写都会报错,无法再进行下去。提前谢谢你。
这是一个pandas解决方案,首先我们创建数据:
# Two sample rows laid out like the real files.  The backslash right after the
# opening quotes keeps the literal from starting with an empty line, so the
# header is the first line of each file.
data = '''\
,close,high,low,open,time,volumefrom,volumeto,timestamp
0,0.7,2.0,0.7,1.1,1499990400,49.17,78.14,2017-07-14 02:00:00
1,1.98,1.98,0.7,0.7,1500076800,5.69,9.93,2017-07-15 02:00:00'''

# Write the same sample twice so that the glob-based code has more than one
# input file to work on.  The two writes must be indented under the `with`
# statement; without that indentation the snippet is not valid Python.
with open('test1.csv', 'w') as f1, open('test2.csv', 'w') as f2:
    f1.write(data)
    f2.write(data)
现在是您可以使用的实际代码:
import glob
import os
import pandas as pd
# Pair every input CSV path with its extension-less name.  'output.csv' is
# skipped so that the result of a previous run is never read back in, and the
# list is sorted so the rows come out in a stable order (glob order is arbitrary).
files = sorted(
    (path, os.path.splitext(path)[0])
    for path in glob.glob('*.csv')
    if path != 'output.csv'
)

# Only the first data row of the 'time' column is needed, so ask the parser for
# exactly that instead of loading each whole file and slicing it afterwards.
frames = [
    pd.read_csv(path, usecols=['time'], nrows=1).assign(filename=stem)
    for path, stem in files
]

# pd.concat raises ValueError when given nothing to concatenate; fall back to
# an empty frame so a directory without input files yields a header-only output.
df = pd.concat(frames) if frames else pd.DataFrame(columns=['filename', 'time'])

# Output
df[['filename', 'time']].to_csv('output.csv', index=False)
Returns:
filename,time
test1,1499990400
test2,1499990400
嗨,我对 Pandas 的了解还不足以给你答案,但我可以使用 csv
模块给你答案。
不过,我不确定我生成的随机数据是否与你的数据匹配:
import os.path
import random
import datetime
import csv
import glob
output_directory = "/Users/Files/Daily"

# Column layout of every generated file.  Both the generator and the summary
# writer rely on it, so the 'time' value is looked up by name, not by position.
CSV_COLUMNS = ('', 'close', 'high', 'low',
               'open', 'time', 'volumefrom',
               'volumeto', 'timestamp')


def create_files_with_random_values(nb_files, nb_rows_in_output_file, directory=None):
    """Create ``nb_files`` CSV files (1.csv, 2.csv, ...) filled with random rows.

    Each file gets the ``CSV_COLUMNS`` header followed by
    ``nb_rows_in_output_file`` data rows.  Every row carries one value per
    header column: the row index, four random prices (close, high, low, open),
    a random 10-digit ``time`` string, two random volumes and a timestamp that
    advances by one day per row.

    Args:
        nb_files: Number of files to create.
        nb_rows_in_output_file: Number of data rows written to each file.
        directory: Target directory; defaults to the module-level
            ``output_directory``.  The directory must already exist.
    """
    if directory is None:
        directory = output_directory
    for file_number in range(1, nb_files + 1):
        random_content_filename = os.path.join(directory, "{}.csv".format(file_number))
        # Choose a random start date on or after July 14th 2017 (0-22 days later)
        start_date = (datetime.datetime(2017, 7, 14, 2, 0, 0)
                      + datetime.timedelta(random.randrange(23)))
        with open(random_content_filename, 'w', newline='') as random_content_file:
            random_writer = csv.writer(random_content_file)
            random_writer.writerow(CSV_COLUMNS)
            random_writer.writerows(
                (
                    x,
                    round(random.uniform(0, 2), 2),    # close
                    round(random.uniform(0, 2), 2),    # high
                    round(random.uniform(0, 2), 2),    # low
                    round(random.uniform(0, 2), 2),    # open
                    "".join(random.choices("0123456789", k=10)),  # time
                    round(random.uniform(0, 100), 2),  # volumefrom
                    round(random.uniform(0, 100), 2),  # volumeto
                    (start_date + datetime.timedelta(x)).isoformat(' '),
                )
                for x in range(nb_rows_in_output_file)
            )


def _numeric_names_first(stem):
    """Sort key: numeric file names in numeric order, all other names after them."""
    try:
        return (0, int(stem), stem)
    except ValueError:
        return (1, 0, stem)


def write_first_time_summary(output_path, pattern):
    """Write a ``Filename,time`` CSV listing the first ``time`` value of each file.

    Args:
        output_path: Path of the summary file to create or overwrite.
        pattern: Glob pattern selecting the input CSV files.  A match that is
            the summary file itself is ignored.

    Raises:
        KeyError: If an input file has a data row but no ``time`` column.
    """
    summary_path = os.path.abspath(output_path)
    with open(output_path, "w", newline='') as output_file:
        output_writer = csv.writer(output_file)
        output_writer.writerow(('Filename', 'time'))
        # Pairs of (full path, file name without directory and extension)
        files_wanted = [
            (path, os.path.splitext(os.path.basename(path))[0])
            for path in glob.iglob(pattern)
            if os.path.abspath(path) != summary_path
        ]
        files_wanted.sort(key=lambda pair: _numeric_names_first(pair[1]))
        for input_filename, first_part_filename in files_wanted:
            with open(input_filename, "r", newline='') as input_file:
                # DictReader consumes the header itself; the default of None
                # covers files that are empty or contain only a header.
                first_data_row = next(csv.DictReader(input_file), None)
            if first_data_row is None:
                continue
            output_writer.writerow((first_part_filename, first_data_row['time']))


output_filename = os.path.join(output_directory, "output.csv")
file_finder_pattern = os.path.join(output_directory, "*.csv")

# Run the demo only when executed as a script, so importing this code does not
# write into output_directory.
if __name__ == "__main__":
    create_files_with_random_values(30, 25)
    write_first_time_summary(output_filename, file_finder_pattern)
我已经过了我的就寝时间,所以如果它不是正确的答案,您将必须提供有关您的输入数据和您想要的输出的更多详细信息。
我构建了多个时间序列 CSV 数据帧,如下所示:
例如:1.csv
,close,high,low,open,time,volumefrom,volumeto,timestamp
0,0.7,2.0,0.7,1.1,1499990400,49.17,78.14,2017-07-14 02:00:00
1,1.98,1.98,0.7,0.7,1500076800,5.69,9.93,2017-07-15 02:00:00
...
我想做的是创建一个 output.xls
文件,如下所示:
Filename, time
1, 1499990400
2, ...
...
其中 Filename
是 csv 名称(例如 1.csv
、2.csv
等),time
是每个文件第一行数据中 time 列的值。
我成功地设置了代码,但是有问题。这是我的尝试:
import glob
%cd /Users/Files/Daily/
output = open('output.csv', 'w')
output.write('filename\n; timestamp')
for filename in glob.glob('*.csv'):
if filename == 'output.csv':
continue
with open(filename, 'r') as input:
但写到这里之后,我每次继续往下写都会报错,无法再进行下去。提前谢谢你。
这是一个pandas解决方案,首先我们创建数据:
# Two sample rows laid out like the real files.  The backslash right after the
# opening quotes keeps the literal from starting with an empty line, so the
# header is the first line of each file.
data = '''\
,close,high,low,open,time,volumefrom,volumeto,timestamp
0,0.7,2.0,0.7,1.1,1499990400,49.17,78.14,2017-07-14 02:00:00
1,1.98,1.98,0.7,0.7,1500076800,5.69,9.93,2017-07-15 02:00:00'''

# Write the same sample twice so that the glob-based code has more than one
# input file to work on.  The two writes must be indented under the `with`
# statement; without that indentation the snippet is not valid Python.
with open('test1.csv', 'w') as f1, open('test2.csv', 'w') as f2:
    f1.write(data)
    f2.write(data)
现在是您可以使用的实际代码:
import glob
import os
import pandas as pd
# Pair every input CSV path with its extension-less name.  'output.csv' is
# skipped so that the result of a previous run is never read back in, and the
# list is sorted so the rows come out in a stable order (glob order is arbitrary).
files = sorted(
    (path, os.path.splitext(path)[0])
    for path in glob.glob('*.csv')
    if path != 'output.csv'
)

# Only the first data row of the 'time' column is needed, so ask the parser for
# exactly that instead of loading each whole file and slicing it afterwards.
frames = [
    pd.read_csv(path, usecols=['time'], nrows=1).assign(filename=stem)
    for path, stem in files
]

# pd.concat raises ValueError when given nothing to concatenate; fall back to
# an empty frame so a directory without input files yields a header-only output.
df = pd.concat(frames) if frames else pd.DataFrame(columns=['filename', 'time'])

# Output
df[['filename', 'time']].to_csv('output.csv', index=False)
Returns:
filename,time
test1,1499990400
test2,1499990400
嗨,我对 Pandas 的了解还不足以给你答案,但我可以使用 csv
模块给你答案。
不过,我不确定我生成的随机数据是否与你的数据匹配:
import os.path
import random
import datetime
import csv
import glob
output_directory = "/Users/Files/Daily"

# Column layout of every generated file.  Both the generator and the summary
# writer rely on it, so the 'time' value is looked up by name, not by position.
CSV_COLUMNS = ('', 'close', 'high', 'low',
               'open', 'time', 'volumefrom',
               'volumeto', 'timestamp')


def create_files_with_random_values(nb_files, nb_rows_in_output_file, directory=None):
    """Create ``nb_files`` CSV files (1.csv, 2.csv, ...) filled with random rows.

    Each file gets the ``CSV_COLUMNS`` header followed by
    ``nb_rows_in_output_file`` data rows.  Every row carries one value per
    header column: the row index, four random prices (close, high, low, open),
    a random 10-digit ``time`` string, two random volumes and a timestamp that
    advances by one day per row.

    Args:
        nb_files: Number of files to create.
        nb_rows_in_output_file: Number of data rows written to each file.
        directory: Target directory; defaults to the module-level
            ``output_directory``.  The directory must already exist.
    """
    if directory is None:
        directory = output_directory
    for file_number in range(1, nb_files + 1):
        random_content_filename = os.path.join(directory, "{}.csv".format(file_number))
        # Choose a random start date on or after July 14th 2017 (0-22 days later)
        start_date = (datetime.datetime(2017, 7, 14, 2, 0, 0)
                      + datetime.timedelta(random.randrange(23)))
        with open(random_content_filename, 'w', newline='') as random_content_file:
            random_writer = csv.writer(random_content_file)
            random_writer.writerow(CSV_COLUMNS)
            random_writer.writerows(
                (
                    x,
                    round(random.uniform(0, 2), 2),    # close
                    round(random.uniform(0, 2), 2),    # high
                    round(random.uniform(0, 2), 2),    # low
                    round(random.uniform(0, 2), 2),    # open
                    "".join(random.choices("0123456789", k=10)),  # time
                    round(random.uniform(0, 100), 2),  # volumefrom
                    round(random.uniform(0, 100), 2),  # volumeto
                    (start_date + datetime.timedelta(x)).isoformat(' '),
                )
                for x in range(nb_rows_in_output_file)
            )


def _numeric_names_first(stem):
    """Sort key: numeric file names in numeric order, all other names after them."""
    try:
        return (0, int(stem), stem)
    except ValueError:
        return (1, 0, stem)


def write_first_time_summary(output_path, pattern):
    """Write a ``Filename,time`` CSV listing the first ``time`` value of each file.

    Args:
        output_path: Path of the summary file to create or overwrite.
        pattern: Glob pattern selecting the input CSV files.  A match that is
            the summary file itself is ignored.

    Raises:
        KeyError: If an input file has a data row but no ``time`` column.
    """
    summary_path = os.path.abspath(output_path)
    with open(output_path, "w", newline='') as output_file:
        output_writer = csv.writer(output_file)
        output_writer.writerow(('Filename', 'time'))
        # Pairs of (full path, file name without directory and extension)
        files_wanted = [
            (path, os.path.splitext(os.path.basename(path))[0])
            for path in glob.iglob(pattern)
            if os.path.abspath(path) != summary_path
        ]
        files_wanted.sort(key=lambda pair: _numeric_names_first(pair[1]))
        for input_filename, first_part_filename in files_wanted:
            with open(input_filename, "r", newline='') as input_file:
                # DictReader consumes the header itself; the default of None
                # covers files that are empty or contain only a header.
                first_data_row = next(csv.DictReader(input_file), None)
            if first_data_row is None:
                continue
            output_writer.writerow((first_part_filename, first_data_row['time']))


output_filename = os.path.join(output_directory, "output.csv")
file_finder_pattern = os.path.join(output_directory, "*.csv")

# Run the demo only when executed as a script, so importing this code does not
# write into output_directory.
if __name__ == "__main__":
    create_files_with_random_values(30, 25)
    write_first_time_summary(output_filename, file_finder_pattern)
我已经过了我的就寝时间,所以如果它不是正确的答案,您将必须提供有关您的输入数据和您想要的输出的更多详细信息。