Python 使用散列键 v2 按列 (header) 动态读取 csv

Python read csv by column (header) dynamically with hash key v2

我尝试在 python 中解析来自几个 zip 文件的几个 csv 文件,以注入石墨时间序列数据库。

我已经发布了一个主题如何手动解析一个 csv 文件,但现在我正在尝试从多个 zip 文件解析多个 csv。

CSV文件组成如下:

Object
Serial number 
From : 2021/12/13 12:04
To   : 2021/12/14 11:59
sampling rate : 1
      


"No.","time","00:00:00","00:00:01","00:00:02","00:00:03","00:00:04","00:00:05","00:00:06","00:00:07","00:00:08","00:00:09","00:00:0A"
            "1","2021/09/12 02:16",235,610,345,997,446,130,129,94,555,274,4
            "2","2021/09/12 02:17",364,210,371,341,294,87,179,106,425,262,3
            "3","2021/09/12 02:18",297,343,860,216,275,81,73,113,566,274,3
            "4","2021/09/12 02:19",305,243,448,262,387,64,63,119,633,249,3
            "5","2021/09/12 02:20",276,151,164,263,315,86,92,175,591,291,1
            "6","2021/09/12 02:21",264,343,287,542,312,83,72,122,630,273,4
            "7","2021/09/12 02:22",373,157,266,446,246,90,173,90,442,273,2
            "8","2021/09/12 02:23",265,112,241,307,329,64,71,82,515,260,3
            "9","2021/09/12 02:24",285,247,240,372,176,92,67,83,609,620,1
            "10","2021/09/12 02:25",289,964,277,476,356,84,74,104,560,294,1
            "11","2021/09/12 02:26",279,747,227,573,569,82,77,99,589,229,5
            "12","2021/09/12 02:27",338,370,315,439,653,85,165,346,367,281,2
            "13","2021/09/12 02:28",269,135,372,262,307,73,86,93,512,283,4
            "14","2021/09/12 02:29",281,207,688,322,233,75,69,85,663,276,2

这是我的脚本:

import re, glob, sys, time, socket, platform, subprocess, pickle, struct, os, zipfile
import pandas as pd
from shutil import copyfile, copy
from pathlib import Path
from io import StringIO

CARBON_SERVER = '127.0.0.1'
CARBON_PICKLE_PORT = 2004
DELAY = 10

def get_filepaths(directory):
    file_paths = []  # List which will store all of the full filepaths.

    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.

    return file_paths  # Self-explanatory.


def execute_run(csv_file):
    delay = DELAY
    if len(sys.argv) > 1:
        arg = sys.argv[1]
        if arg.isdigit():
            delay = int(arg)
        else:
            sys.stderr.write("Ignoring non-integer argument. Using default: %ss\n" % delay)

    sock = socket.socket()
    try:
        sock.connect( (CARBON_SERVER, CARBON_PICKLE_PORT) )
        print("socket ok, write")
    except socket.error:
        raise SystemExit("Couldn't connect to %(server)s on port %(port)d, is carbon-cache.py running?" % { 'server':CARBON_SERVER, 'port':CARBON_PICKLE_PORT })

    try:
        run(sock, delay, csv_file)
    except KeyboardInterrupt:
        sys.stderr.write("\nExiting on CTRL-c\n")
        sys.exit(0)


def run(sock, delay, zipobj):
   zf = zipfile.ZipFile(zipobj)
   df = [pd.read_csv(zf.open(f), skiprows=[0,1,2,3,4,5]) for f in zf.namelist()]
   print(df, '\n')
   date_pattern='%Y/%m/%d %H:%M'
   df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=1) # create epoch as a column, i block here
   print(3)
   df
   tuples=[] # data will be saved in a list
   formated_str='hds.perf.type.serial.object.00.00.00.TOTAL_IOPS'
   for each_column in list(df.columns)[2:-1]:
          for e in zip(list(df['epoch']),list(df[each_column])):
              each_column=each_column.replace("X", '')
              print(f"perf.serial.sn.LDEV.{each_column}.TOTAL_IOPS",e)
              tuples.append((f"perf.serial.sn.LDEV.{each_column}.TOTAL_IOPS",e))
   package = pickle.dumps(tuples, 1)
   size = struct.pack('!L', len(package))
   sock.sendall(size)
   sock.sendall(package)
   time.sleep(delay)

def main():
   Liste = [ 'MyZip1.zip', 'MyZip2.zip', 'MyZip3.zip' ]
   unzip_et()
   et_rep='/opt/import2grafana/out/'
   full_file_paths = get_filepaths(et_rep)
   for idx1, lst_metrics in enumerate(Liste):
      for idx2, Lst_f in enumerate(full_file_paths):
         if lst_metrics in Lst_f:
            zip_file = Lst_f
            try:
               with zipfile.ZipFile(zip_file) as zipobj:
                  #print("1 zipobj: ", zipobj.namelist)
                  #print("1 zipobj: ", zipobj.filename)
                  execute_run(zipobj.filename)
            except Exception as err :
               print("Erreur parsing zipfile")


if __name__ == "__main__":
   main()

我可以列出 csv,但是下面的行有错误

df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=1) # create epoch as a column, i block here.

[1435 rows x 6 columns],        No.              time 00:10:00X 00:10:01X 00:10:02X 00:10:03X 00:10:04X 00:10:05X 00:10:06X 00:10:07X 00:10:08X  ... 00:10:75X 00:10:76X 00:10:77X 00:10:78X 00:10:79X 00:10:7AX 00:10:7BX 00:10:7CX 00:10:7DX 00:10:7EX 00:10:7FX
0        1  2021/12/13 12:05        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
1        2  2021/12/13 12:06        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
2        3  2021/12/13 12:07        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
3        4  2021/12/13 12:08        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
4        5  2021/12/13 12:09        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
...    ...               ...       ...       ...       ...       ...       ...       ...       ...       ...       ...  ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...
2866  1431  2021/12/14 11:55         2         0         3         2         0         0         1         1       546  ...         1         1         1         1         2         0         0       NaN       NaN       NaN       NaN
2867  1432  2021/12/14 11:56         2         0         1         3         0         0         1         1       565  ...         1         1         1         1         1         0         0       NaN       NaN       NaN       NaN
2868  1433  2021/12/14 11:57         1         0         1         5         0         0         1         1       549  ...         1         1         1         1         1         0         0       NaN       NaN       NaN       NaN
2869  1434  2021/12/14 11:58         1         0         1         7         0         0         1         1       537  ...         1         3         1         1         1         0         0       NaN       NaN       NaN       NaN
2870  1435  2021/12/14 11:59         1         0         1         5         0         0         1         1       532  ...         1         1         1         1         1         0         0       NaN       NaN       NaN       NaN

非常感谢您的帮助

df 是一个数据帧列表——不是单个数据帧,所以 df['epoch'] 自然会抛出错误。循环 zf.namelist() 应该可以解决问题:

def run(sock, delay, zipobj):
   zf = zipfile.ZipFile(zipobj)
   for f in zf.namelist():
       df = pd.read_csv(zf.open(f), skiprows=[0,1,2,3,4,5])
       print(df, '\n')
       date_pattern='%Y/%m/%d %H:%M'
       df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=1) # create epoch as a column, i block here
       #etc..............