如何读取一个 data.txt 文本文件,对数据进行排序,然后使用 Python Pandas 将其转换为 DataFrame?

How to read a data.txt text file, sort the data and then convert it into a DataFrame using Python Pandas?

我有一个包含数据的文本文件 (.txt),它显示如下:-

Yield: 99.7598
Timestamp: 2021/February/13-01:55:04
Angle: 0.00309331
ErrorCode 10: 6
ErrorCode 12: 2 

现在我想使用 python pandas 将其转换为数据帧,如下所示:-

文件名 | Yield | Timestamp | Angle | ErrorCode 10 | ErrorCode 12

xxxxx     99.75 2021/Feb      0.003      6                2

我试图通过这样做来编写这段代码:-

import os
import pandas as pd

def sortbycode():
    """Print the summary file's ``key:value`` lines sorted by key.

    Reads the hard-coded file ``210107343_summary.txt`` from the current
    working directory, splits every line on ``":"``, sorts the rows by their
    first field and prints the first two fields of each row as
    ``key|value`` (key padded to 5 characters, value padded to 13).

    Lines that contain no ``":"`` (for example blank lines) are skipped,
    because they have no second field to print.
    """
    # The context manager closes the file even if reading fails.
    with open('210107343_summary.txt', 'r') as sam_file:
        # NOTE: every ":" acts as a separator, so a value that itself
        # contains colons (e.g. a time such as 01:55:04) is cut at its first
        # colon, since only the first two fields are printed below.
        sams = [line.strip("\n").split(":") for line in sam_file if ":" in line]
    sams.sort(key=lambda x: x[0])
    for sam in sams:
        print("{0:5}|{1:13}".format(*sam))
# Run the report; sortbycode() takes no arguments and prints its result.
sortbycode()

这是我目前得到的输出:-

输出:

Angle| 0.00309331  
ErrorCode 10| 6           
ErrorCode 12| 2           
Timestamp| 2021/February/13-01
Yield| 99.7598 

这个结果并不理想,因为我的计划是先把数据整理好,再将其转换为 DataFrame。我卡在了这一步,不知道如何把它转换为 DataFrame。另外,这个输出还有一个问题:它缺少文件名。

你能帮我改正这个错误或告诉我正确的方向吗?

更新答案:

由于 OP 提到每个文本文件只包含一条记录,下面的解决方案才是合适的方案:

import pandas as pd
import re
from os import sep, getcwd
from path import glob, Path
from collections import OrderedDict

def oneFileSingleRecordParser(textFilePath):
    """Parse one ``key: value`` text file into a flat record dictionary.

    Expected layout (the error-code lines may be present or absent):
        Yield / Timestamp / Angle / ErrorCode 10 / ErrorCode 12 /
        ErrorCode 16 / ErrorCode 20

    Args:
        textFilePath: Path of the text file, as a string.

    Returns:
        A dict whose first entry is ``"File Name"`` (the last path component
        of ``textFilePath``) followed by one entry per line that matches the
        ``key: value`` pattern. Values are kept as strings; lines that do not
        match are ignored.
    """
    # Everything after the last path separator is the file name.
    _, _, fileName = textFilePath.rpartition(sep)

    keyValuePattern = re.compile(r"(\w+\s?\d*):\s(.*)")
    record = OrderedDict([("File Name", fileName)])

    with open(textFilePath, "r") as textFile:
        for rawLine in textFile:
            found = keyValuePattern.match(rawLine.strip())
            if found is None:
                continue
            record[found.group(1)] = found.group(2)

    return dict(record)

def convertAllFilesToDataFrame(textFilePathsRoot, parser = oneFileSingleRecordParser):
    """Parse every ``*.txt`` file in a directory and collect the rows in a DataFrame.

    Args:
        textFilePathsRoot: Directory that directly contains the text files.
        parser: Callable that takes one file path (string) and returns a
            dict for that file; each dict becomes one row.

    Returns:
        A ``pandas.DataFrame`` with one row per text file, in sorted path
        order.

    Raises:
        NotADirectoryError: If ``textFilePathsRoot`` is not an existing
            directory (a subclass of ``Exception``, so existing handlers
            that catch ``Exception`` keep working).
    """
    # Imported locally because this snippet's import block only brings in
    # `sep` and `getcwd` from `os`; the bare name `os` would otherwise be
    # undefined and the check below would raise NameError.
    import os

    if not os.path.isdir(textFilePathsRoot):
        raise NotADirectoryError("Please pass in a valid path to the root of the text files")

    # Sort so that the row order does not depend on directory listing order.
    textFilePaths = sorted(str(path) for path in Path(textFilePathsRoot).glob("*.txt"))

    dataDicts = [parser(textFilePath) for textFilePath in textFilePaths]
    return pd.DataFrame(dataDicts)

convertAllFilesToDataFrame("path/to/your/text/file/directory") 仍应产生以下输出(在我的例子中,我只有两个文件具有完全相同的记录):

原答案

根据文本文件的结构,可以通过两种方式解决该问题:

  • 一个文本文件恰好包含五行(一条记录)
  • 单个文本文件可能包含 5 行的倍数(多条记录)

这是我对这两种方式的看法:

import pandas as pd
import re
from os import sep, getcwd
from path import glob, Path
from collections import OrderedDict

def oneFileSingleRecordParser(textFilePath):
    """Parse a text file that holds exactly one five-line record.

    Expected layout, one ``key: value`` pair per line:
        Yield / Timestamp / Angle / ErrorCode 10 / ErrorCode 12

    Args:
        textFilePath: Path of the text file, as a string.

    Returns:
        A dict starting with ``"File Name"`` (the last path component of
        ``textFilePath``) followed by one entry per line that matches the
        ``key: value`` pattern. Values are kept as strings.

    Raises:
        Exception: If the file does not contain exactly five lines.
    """
    fileName = textFilePath.split(sep)[-1]

    with open(textFilePath, "r") as textFile:
        lines = textFile.readlines()

    if len(lines) != 5:
        raise Exception("The file at {} doesn't have a proper single record.".format(textFilePath))

    # One match object (or None) per line; non-matching lines are dropped.
    matches = (re.match(r"(\w+\s?\d*):\s(.*)", line.strip()) for line in lines)

    dataDict = OrderedDict()
    dataDict["File Name"] = fileName
    dataDict.update(match.groups() for match in matches if match is not None)

    return dict(dataDict)

def oneFileMultiRecordParser(textFilePath):
    """Parse a text file that holds any number of five-line records.

    Expected layout: consecutive groups of five ``key: value`` lines
    (Yield, Timestamp, Angle, ErrorCode 10, ErrorCode 12), one group per
    record.

    Args:
        textFilePath: Path of the text file, as a string.

    Returns:
        A list with one ``OrderedDict`` per record. Each starts with
        ``"File Name"`` (the last path component of ``textFilePath``)
        followed by the five parsed key/value pairs as strings.

    Raises:
        Exception: If the line count is not a multiple of five, or if any
            line does not match the ``key: value`` pattern.
    """
    fileName = textFilePath.rsplit(sep, 1)[-1]

    with open(textFilePath, "r") as textFile:
        lines = textFile.readlines()

    if len(lines) % 5 != 0:
        raise Exception("The file at {} doesn't have a uniform structure.".format(textFilePath))

    records = []

    # Walk the file one five-line record at a time.
    for start in range(0, len(lines), 5):
        record = OrderedDict()
        record["File Name"] = fileName

        for offset, line in enumerate(lines[start:start + 5]):
            parsed = re.match(r"(\w+\s?\d*):\s(.*)", line.strip())

            if parsed is None:
                # Report the 1-based line number within the whole file.
                raise Exception("Line={}, content=\"{}\" has some formatting issues, regex failed".format(start + offset + 1, line))

            key, value = parsed.groups()
            record[key] = value

        records.append(record)

    return records

def convertAllFilesToDataFrame(
        parser = oneFileSingleRecordParser, 
        validParserNames = ("oneFileSingleRecordParser", "oneFileMultiRecordParser",)
    ):
    """Parse every ``*.txt`` file in the current working directory into a DataFrame.

    Args:
        parser: Parsing function, selected by its ``__name__``.
        validParserNames: Accepted parser names. A parser whose name equals
            the first entry must return one dict per file; one whose name
            equals the second entry must return a list of dicts per file.

    Returns:
        A ``pandas.DataFrame`` built from all collected record dicts.

    Raises:
        Exception: If ``parser.__name__`` is not in ``validParserNames``.
    """
    parserName = parser.__name__
    if parserName not in validParserNames:
        raise Exception("Proper parser was not used")

    textFilePaths = [str(path) for path in Path(getcwd()).glob("*.txt")]

    dataDicts = []
    for textFilePath in textFilePaths:
        if parserName == validParserNames[0]:
            # Single-record parser: one dict per file.
            dataDicts.append(parser(textFilePath))
            continue
        if parserName == validParserNames[1]:
            # Multi-record parser: a list of dicts per file.
            dataDicts.extend(parser(textFilePath))

    return pd.DataFrame(dataDicts)

convertAllFilesToDataFrame(parser = oneFileMultiRecordParser) 将产生:

convertAllFilesToDataFrame(parser = oneFileSingleRecordParser) 将产生:

这段代码并不完全符合 DRY(Don't Repeat Yourself,不要重复自己)原则,但要做到这一点可能需要花更多时间。

好的,你的意思是每个文件只有一条记录,但文件有很多。假设您有某个可调用对象 filenames() 能给出这些文件的名称,也就是说 list(filenames()) 是一个包含相关文件名的列表。

您应该首先构建一个从文件名构建字典的函数:

# Keys that are copied from a file into its record; other keys are ignored.
fieldnames = ['Yield', 'Timestamp', 'Angle', 'ErrorCode 10', 'ErrorCode 12',
              'ErrorCode 13', 'ErrorCode 20']

def getrecord(filename):
    """Build one record dictionary from a ``key: value`` text file.

    Args:
        filename: Path of the file to read.

    Returns:
        A dict with ``'FileName'`` set to ``filename`` plus one entry for
        every line whose key (the text before the first colon, stripped) is
        listed in ``fieldnames``. Values are the stripped text after the
        first colon. Lines without a colon, such as blank lines, are
        skipped.
    """
    d = {'FileName': filename}
    with open(filename) as fd:
        for line in fd:
            # partition() always yields three parts, so a line with no colon
            # cannot break the unpacking; only the first colon separates,
            # which keeps values such as 01:55:04 intact.
            k, colon, v = line.partition(':')
            if not colon:
                continue
            k = k.strip()
            if k in fieldnames:
                d[k] = v.strip()
    return d

您现在可以使用以下方法构建数据框:

# One row per file; `columns` pins the column order of the frame.
df = pd.DataFrame(list(map(getrecord, filenames())),
                  columns=['FileName', *fieldnames])