Python read csv by column (header) dynamically with hash key v2
I am trying to parse several csv files coming from several zip files in Python, in order to inject the data into a Graphite time-series database.
I have already posted a topic on how to parse a single csv file manually, but now I am trying to parse several csv files from several zip files.
The CSV files are structured as follows (a minimal read example follows the sample):
Object
Serial number
From : 2021/12/13 12:04
To : 2021/12/14 11:59
sampling rate : 1
"No.","time","00:00:00","00:00:01","00:00:02","00:00:03","00:00:04","00:00:05","00:00:06","00:00:07","00:00:08","00:00:09","00:00:0A"
"1","2021/09/12 02:16",235,610,345,997,446,130,129,94,555,274,4
"2","2021/09/12 02:17",364,210,371,341,294,87,179,106,425,262,3
"3","2021/09/12 02:18",297,343,860,216,275,81,73,113,566,274,3
"4","2021/09/12 02:19",305,243,448,262,387,64,63,119,633,249,3
"5","2021/09/12 02:20",276,151,164,263,315,86,92,175,591,291,1
"6","2021/09/12 02:21",264,343,287,542,312,83,72,122,630,273,4
"7","2021/09/12 02:22",373,157,266,446,246,90,173,90,442,273,2
"8","2021/09/12 02:23",265,112,241,307,329,64,71,82,515,260,3
"9","2021/09/12 02:24",285,247,240,372,176,92,67,83,609,620,1
"10","2021/09/12 02:25",289,964,277,476,356,84,74,104,560,294,1
"11","2021/09/12 02:26",279,747,227,573,569,82,77,99,589,229,5
"12","2021/09/12 02:27",338,370,315,439,653,85,165,346,367,281,2
"13","2021/09/12 02:28",269,135,372,262,307,73,86,93,512,283,4
"14","2021/09/12 02:29",281,207,688,322,233,75,69,85,663,276,2
Here is my script:
import re, glob, sys, time, socket, platform, subprocess, pickle, struct, os, zipfile
import pandas as pd
from shutil import copyfile, copy
from pathlib import Path
from io import StringIO
CARBON_SERVER = '127.0.0.1'
CARBON_PICKLE_PORT = 2004
DELAY = 10
def get_filepaths(directory):
    file_paths = []  # List which will store all of the full filepaths.
    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.
    return file_paths  # Self-explanatory.

def execute_run(csv_file):
    delay = DELAY
    if len(sys.argv) > 1:
        arg = sys.argv[1]
        if arg.isdigit():
            delay = int(arg)
        else:
            sys.stderr.write("Ignoring non-integer argument. Using default: %ss\n" % delay)
    sock = socket.socket()
    try:
        sock.connect((CARBON_SERVER, CARBON_PICKLE_PORT))
        print("socket ok, write")
    except socket.error:
        raise SystemExit("Couldn't connect to %(server)s on port %(port)d, is carbon-cache.py running?" % {'server': CARBON_SERVER, 'port': CARBON_PICKLE_PORT})
    try:
        run(sock, delay, csv_file)
    except KeyboardInterrupt:
        sys.stderr.write("\nExiting on CTRL-c\n")
        sys.exit(0)

def run(sock, delay, zipobj):
    zf = zipfile.ZipFile(zipobj)
    df = [pd.read_csv(zf.open(f), skiprows=[0,1,2,3,4,5]) for f in zf.namelist()]
    print(df, '\n')
    date_pattern = '%Y/%m/%d %H:%M'
    df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time, date_pattern))), axis=1)  # create epoch as a column, i block here
    print(3)
    df
    tuples = []  # data will be saved in a list
    formated_str = 'hds.perf.type.serial.object.00.00.00.TOTAL_IOPS'
    for each_column in list(df.columns)[2:-1]:
        for e in zip(list(df['epoch']), list(df[each_column])):
            each_column = each_column.replace("X", '')
            print(f"perf.serial.sn.LDEV.{each_column}.TOTAL_IOPS", e)
            tuples.append((f"perf.serial.sn.LDEV.{each_column}.TOTAL_IOPS", e))
    package = pickle.dumps(tuples, 1)
    size = struct.pack('!L', len(package))
    sock.sendall(size)
    sock.sendall(package)
    time.sleep(delay)

def main():
    Liste = ['MyZip1.zip', 'MyZip2.zip', 'MyZip3.zip']
    unzip_et()
    et_rep = '/opt/import2grafana/out/'
    full_file_paths = get_filepaths(et_rep)
    for idx1, lst_metrics in enumerate(Liste):
        for idx2, Lst_f in enumerate(full_file_paths):
            if lst_metrics in Lst_f:
                zip_file = Lst_f
                try:
                    with zipfile.ZipFile(zip_file) as zipobj:
                        #print("1 zipobj: ", zipobj.namelist)
                        #print("1 zipobj: ", zipobj.filename)
                        execute_run(zipobj.filename)
                except Exception as err:
                    print("Erreur parsing zipfile")

if __name__ == "__main__":
    main()
I can list the csv files, but the line below throws an error:
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=1) # create epoch as a column, i block here.
[1435 rows x 6 columns], No. time 00:10:00X 00:10:01X 00:10:02X 00:10:03X 00:10:04X 00:10:05X 00:10:06X 00:10:07X 00:10:08X ... 00:10:75X 00:10:76X 00:10:77X 00:10:78X 00:10:79X 00:10:7AX 00:10:7BX 00:10:7CX 00:10:7DX 00:10:7EX 00:10:7FX
0 1 2021/12/13 12:05 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
1 2 2021/12/13 12:06 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
2 3 2021/12/13 12:07 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
3 4 2021/12/13 12:08 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
4 5 2021/12/13 12:09 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2866 1431 2021/12/14 11:55 2 0 3 2 0 0 1 1 546 ... 1 1 1 1 2 0 0 NaN NaN NaN NaN
2867 1432 2021/12/14 11:56 2 0 1 3 0 0 1 1 565 ... 1 1 1 1 1 0 0 NaN NaN NaN NaN
2868 1433 2021/12/14 11:57 1 0 1 5 0 0 1 1 549 ... 1 1 1 1 1 0 0 NaN NaN NaN NaN
2869 1434 2021/12/14 11:58 1 0 1 7 0 0 1 1 537 ... 1 3 1 1 1 0 0 NaN NaN NaN NaN
2870 1435 2021/12/14 11:59 1 0 1 5 0 0 1 1 532 ... 1 1 1 1 1 0 0 NaN NaN NaN NaN
Many thanks for your help.
df is a list of dataframes, not a single dataframe, so df['epoch'] naturally throws an error. Looping over zf.namelist() should do the trick:
def run(sock, delay, zipobj):
    zf = zipfile.ZipFile(zipobj)
    for f in zf.namelist():
        df = pd.read_csv(zf.open(f), skiprows=[0,1,2,3,4,5])
        print(df, '\n')
        date_pattern = '%Y/%m/%d %H:%M'
        df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time, date_pattern))), axis=1)  # create epoch as a column
        #etc..............
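For completeness, here is a sketch of how the rest of your original run() body could move inside that loop (untested; it reuses your metric naming and the carbon pickle send as-is):
def run(sock, delay, zipobj):
    zf = zipfile.ZipFile(zipobj)
    date_pattern = '%Y/%m/%d %H:%M'
    for f in zf.namelist():
        df = pd.read_csv(zf.open(f), skiprows=[0, 1, 2, 3, 4, 5])
        # epoch column per file, now that df is a single DataFrame
        df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time, date_pattern))), axis=1)
        tuples = []
        for each_column in list(df.columns)[2:-1]:   # skip No., time and the trailing epoch column
            metric = each_column.replace("X", '')
            for e in zip(list(df['epoch']), list(df[each_column])):
                # carbon's pickle protocol expects (path, (timestamp, value))
                tuples.append((f"perf.serial.sn.LDEV.{metric}.TOTAL_IOPS", e))
        package = pickle.dumps(tuples, 1)
        size = struct.pack('!L', len(package))
        sock.sendall(size)
        sock.sendall(package)
        time.sleep(delay)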