Python 使用散列键 v2 按列 (header) 动态读取 csv

Question

我想用 Python 解析多个 zip 文件中的多个 csv 文件，以便把数据导入 Graphite 时间序列数据库。

我之前已经发过一个帖子，讨论如何手动解析单个 csv 文件；但现在我想从多个 zip 文件中解析多个 csv 文件。

CSV文件组成如下：

Object
Serial number 
From : 2021/12/13 12:04
To   : 2021/12/14 11:59
sampling rate : 1
      


"No.","time","00:00:00","00:00:01","00:00:02","00:00:03","00:00:04","00:00:05","00:00:06","00:00:07","00:00:08","00:00:09","00:00:0A"
            "1","2021/09/12 02:16",235,610,345,997,446,130,129,94,555,274,4
            "2","2021/09/12 02:17",364,210,371,341,294,87,179,106,425,262,3
            "3","2021/09/12 02:18",297,343,860,216,275,81,73,113,566,274,3
            "4","2021/09/12 02:19",305,243,448,262,387,64,63,119,633,249,3
            "5","2021/09/12 02:20",276,151,164,263,315,86,92,175,591,291,1
            "6","2021/09/12 02:21",264,343,287,542,312,83,72,122,630,273,4
            "7","2021/09/12 02:22",373,157,266,446,246,90,173,90,442,273,2
            "8","2021/09/12 02:23",265,112,241,307,329,64,71,82,515,260,3
            "9","2021/09/12 02:24",285,247,240,372,176,92,67,83,609,620,1
            "10","2021/09/12 02:25",289,964,277,476,356,84,74,104,560,294,1
            "11","2021/09/12 02:26",279,747,227,573,569,82,77,99,589,229,5
            "12","2021/09/12 02:27",338,370,315,439,653,85,165,346,367,281,2
            "13","2021/09/12 02:28",269,135,372,262,307,73,86,93,512,283,4
            "14","2021/09/12 02:29",281,207,688,322,233,75,69,85,663,276,2

这是我的脚本：

import re, glob, sys, time, socket, platform, subprocess, pickle, struct, os, zipfile
import pandas as pd
from shutil import copyfile, copy
from pathlib import Path
from io import StringIO

# Host that execute_run() connects to.
CARBON_SERVER = '127.0.0.1'
# TCP port used together with CARBON_SERVER; run() sends length-prefixed pickle payloads to it.
CARBON_PICKLE_PORT = 2004
# Default number of seconds run() sleeps after sending; can be overridden by the first CLI argument.
DELAY = 10

def get_filepaths(directory):
    """Collect the full path of every file found below *directory*.

    The tree is traversed with ``os.walk`` and each file name is joined to
    the directory it was found in.

    Args:
        directory: Root of the tree to scan.

    Returns:
        A list of path strings in the order ``os.walk`` yields them; empty
        when nothing is found.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]


def execute_run(csv_file):
    """Connect to the carbon server and hand *csv_file* to ``run``.

    The delay passed to ``run`` defaults to ``DELAY``; a first command-line
    argument made only of digits overrides it, any other value is ignored
    with a warning on stderr.

    Args:
        csv_file: Passed through unchanged as the third argument of ``run``.

    Raises:
        SystemExit: If the connection cannot be established, or (with exit
            status 0) when the transfer is interrupted with Ctrl-C.
    """
    delay = DELAY
    if len(sys.argv) > 1:
        arg = sys.argv[1]
        if arg.isdigit():
            delay = int(arg)
        else:
            sys.stderr.write("Ignoring non-integer argument. Using default: %ss\n" % delay)

    # The context manager closes the socket on every exit path (success,
    # connection failure, Ctrl-C or an error raised by run()); previously the
    # socket was never closed.
    with socket.socket() as sock:
        try:
            sock.connect((CARBON_SERVER, CARBON_PICKLE_PORT))
            print("socket ok, write")
        except socket.error:
            raise SystemExit("Couldn't connect to %(server)s on port %(port)d, is carbon-cache.py running?" % { 'server':CARBON_SERVER, 'port':CARBON_PICKLE_PORT })

        try:
            run(sock, delay, csv_file)
        except KeyboardInterrupt:
            sys.stderr.write("\nExiting on CTRL-c\n")
            sys.exit(0)


def run(sock, delay, zipobj):
   """Read the CSV members of a zip archive and send their values over *sock*.

   NOTE: as written this function does not get past the ``df['epoch']``
   line: ``df`` is a list of DataFrames (one per archive member), and a
   list has no ``apply`` method.

   Args:
       sock: Object whose ``sendall`` receives the length header and payload.
       delay: Seconds to sleep after sending.
       zipobj: Anything ``zipfile.ZipFile`` accepts (the caller passes a path).
   """
   zf = zipfile.ZipFile(zipobj)
   # One DataFrame per archive member, skipping the first six lines of each
   # file. The result is a *list* of DataFrames, not a single DataFrame.
   df = [pd.read_csv(zf.open(f), skiprows=[0,1,2,3,4,5]) for f in zf.namelist()]
   print(df, '\n')
   date_pattern='%Y/%m/%d %H:%M'
   # Intended to convert the 'time' strings to integer epoch seconds (local
   # time, via time.mktime). Because `df` is a list, `df.apply` raises here.
   df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=1) # create epoch as a column; this is where the script fails
   print(3)
   # Bare expression: has no effect.
   df
   tuples=[] # data will be saved in a list
   # Not used anywhere below.
   formated_str='hds.perf.type.serial.object.00.00.00.TOTAL_IOPS'
   # [2:-1] drops the first two columns and the last one -- presumably
   # "No.", "time" and the appended "epoch"; verify against the real files.
   for each_column in list(df.columns)[2:-1]:
          # The zip() argument is evaluated once, before each_column is rebound below.
          for e in zip(list(df['epoch']),list(df[each_column])):
              # Strip every "X" from the column name before using it in the metric path.
              each_column=each_column.replace("X", '')
              print(f"perf.serial.sn.LDEV.{each_column}.TOTAL_IOPS",e)
              # Each entry is (metric path, (epoch, value)).
              tuples.append((f"perf.serial.sn.LDEV.{each_column}.TOTAL_IOPS",e))
   # Protocol-1 pickle of the whole list, preceded by its length packed as a
   # 4-byte network-order unsigned integer ('!L').
   package = pickle.dumps(tuples, 1)
   size = struct.pack('!L', len(package))
   sock.sendall(size)
   sock.sendall(package)
   time.sleep(delay)

def main():
    """Send every expected zip archive found under the output directory.

    After calling ``unzip_et()``, all files below ``et_rep`` are listed and
    each path containing one of the names in ``Liste`` is opened as a zip
    archive and passed to ``execute_run``. A failure on one archive is
    reported (file name and error) and processing continues with the next.

    NOTE(review): ``unzip_et`` is not defined in the visible part of the
    script -- presumably it extracts/copies the archives into ``et_rep``;
    confirm before running.
    """
    Liste = ['MyZip1.zip', 'MyZip2.zip', 'MyZip3.zip']
    unzip_et()
    et_rep = '/opt/import2grafana/out/'
    full_file_paths = get_filepaths(et_rep)
    for lst_metrics in Liste:
        for zip_file in full_file_paths:
            if lst_metrics not in zip_file:
                continue
            try:
                # Opening the archive first rejects files that are not valid
                # zips before any connection is attempted.
                with zipfile.ZipFile(zip_file) as zipobj:
                    execute_run(zipobj.filename)
            except Exception as err:
                # Best-effort: keep going, but say which archive failed and
                # why instead of discarding the exception.
                print(f"Erreur parsing zipfile {zip_file}: {err}")

# Run main() only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
   main()

我可以列出 csv 文件，但下面这一行会报错：

df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=1) # create epoch as a column, i block here.

[1435 rows x 6 columns],        No.              time 00:10:00X 00:10:01X 00:10:02X 00:10:03X 00:10:04X 00:10:05X 00:10:06X 00:10:07X 00:10:08X  ... 00:10:75X 00:10:76X 00:10:77X 00:10:78X 00:10:79X 00:10:7AX 00:10:7BX 00:10:7CX 00:10:7DX 00:10:7EX 00:10:7FX
0        1  2021/12/13 12:05        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
1        2  2021/12/13 12:06        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
2        3  2021/12/13 12:07        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
3        4  2021/12/13 12:08        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
4        5  2021/12/13 12:09        -4        -4        -4        -4        -4        -4        -4        -4        -4  ...        -4        -4        -4        -4        -4        -4        -4      -4.0      -4.0      -4.0      -4.0
...    ...               ...       ...       ...       ...       ...       ...       ...       ...       ...       ...  ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...
2866  1431  2021/12/14 11:55         2         0         3         2         0         0         1         1       546  ...         1         1         1         1         2         0         0       NaN       NaN       NaN       NaN
2867  1432  2021/12/14 11:56         2         0         1         3         0         0         1         1       565  ...         1         1         1         1         1         0         0       NaN       NaN       NaN       NaN
2868  1433  2021/12/14 11:57         1         0         1         5         0         0         1         1       549  ...         1         1         1         1         1         0         0       NaN       NaN       NaN       NaN
2869  1434  2021/12/14 11:58         1         0         1         7         0         0         1         1       537  ...         1         3         1         1         1         0         0       NaN       NaN       NaN       NaN
2870  1435  2021/12/14 11:59         1         0         1         5         0         0         1         1       532  ...         1         1         1         1         1         0         0       NaN       NaN       NaN       NaN

非常感谢您的帮助

Answer 1

df 是一个 DataFrame 列表，而不是单个 DataFrame，所以 df['epoch'] = df.apply(...) 这一行自然会抛出错误。改为遍历 zf.namelist()、逐个文件读取并处理，应该可以解决问题：

def run(sock, delay, zipobj):
    """Send the values of every CSV in a zip archive over *sock*.

    Each archive member is read into its own DataFrame (the first six lines
    of the file are skipped), so ``df`` is always a single DataFrame. For
    every column after the first two, one ``(metric, (epoch, value))`` tuple
    is produced per row; the tuples of one file are pickled (protocol 1) and
    sent as a single message prefixed with its length as a 4-byte
    network-order unsigned integer.

    Args:
        sock: Connected socket (anything providing ``sendall``).
        delay: Seconds to sleep once all files have been sent.
        zipobj: Path or file-like object accepted by ``zipfile.ZipFile``.
    """
    date_pattern = '%Y/%m/%d %H:%M'
    with zipfile.ZipFile(zipobj) as zf:
        for f in zf.namelist():
            if f.endswith('/'):
                continue  # directory entry: nothing to parse
            with zf.open(f) as member:
                df = pd.read_csv(member, skiprows=[0, 1, 2, 3, 4, 5])
            print(df, '\n')

            # Epoch seconds (local time, via time.mktime) for each row's
            # 'time' string, computed once per file.
            epochs = [
                int(time.mktime(time.strptime(stamp, date_pattern)))
                for stamp in df['time']
            ]

            tuples = []  # (metric path, (epoch, value)) entries for this file
            for each_column in list(df.columns)[2:]:
                # Column headers may carry "X" markers; strip them from the metric name.
                metric = f"perf.serial.sn.LDEV.{each_column.replace('X', '')}.TOTAL_IOPS"
                # tolist() yields plain Python numbers, which pickle portably.
                values = df[each_column].tolist()
                tuples.extend((metric, point) for point in zip(epochs, values))

            if not tuples:
                continue  # empty file: nothing to send

            package = pickle.dumps(tuples, 1)
            size = struct.pack('!L', len(package))
            sock.sendall(size)
            sock.sendall(package)
    time.sleep(delay)

Python 使用散列键 v2 按列 (header) 动态读取 csv

Python read csv by column (header) dynamically with hash key v2

python

hash

dynamic

pandas