How to read a data.txt text file, sort the data, and then convert it into a DataFrame using Python Pandas?
I have a text file (.txt) containing data that looks like this:
Yield: 99.7598
Timestamp: 2021/February/13-01:55:04
Angle: 0.00309331
ErrorCode 10: 6
ErrorCode 12: 2
Now I want to convert it into a DataFrame using Python Pandas, like this:
File Name | Yield | Timestamp | Angle | ErrorCode 10 | ErrorCode 12
xxxxx     | 99.75 | 2021/Feb  | 0.003 | 6            | 2
I tried to write this code:
import os
import pandas as pd

def sortbycode():
    sam_file = open('210107343_summary.txt', 'r')
    sams = []
    for line in sam_file:
        sams.append([i for i in line.strip("\n").split(":")])
    sams.sort(key=lambda x: x[0])
    for sam in sams:
        print("{0:5}|{1:13}".format(*sam))

sortbycode()
This is the output I am getting so far:
Angle| 0.00309331
ErrorCode 10| 6
ErrorCode 12| 2
Timestamp| 2021/February/13-01
Yield| 99.7598
This is not good, because my plan is to build on this and convert it into a DataFrame. I am stuck at this point and cannot get the data into a DataFrame. Another problem with this output is that it is also missing the file name.
Can you help me fix this, or point me in the right direction?
Updated answer:
Since the OP mentioned that each text file holds only a single record, the following is the appropriate solution:
import os
import pandas as pd
import re
from os import sep
from pathlib import Path
from collections import OrderedDict

def oneFileSingleRecordParser(textFilePath):
    # keep only the file name, dropping the directory part
    fileName = textFilePath.rsplit(sep, 1)[-1]
    with open(textFilePath, "r") as textFile:
        # The structure is:
        # Yield:
        # Timestamp
        # Angle
        # ErrorCode 10
        # ErrorCode 12
        # ErrorCode 16
        # ErrorCode 20
        # The error codes can be present or absent
        lines = textFile.readlines()
        dataDict = OrderedDict()
        dataDict["File Name"] = fileName
        for line in lines:
            # regex to extract the key and the value from each "key: value" line
            matchObject = re.match(r"(\w+\s?\d*):\s(.*)", line.strip())
            if matchObject is not None:
                key, value = matchObject.groups()
                dataDict[key] = value
    return dict(dataDict)

def convertAllFilesToDataFrame(textFilePathsRoot, parser=oneFileSingleRecordParser):
    if not os.path.isdir(textFilePathsRoot):
        raise Exception("Please pass in a valid path to the root of the text files")
    textFilePaths = list(map(lambda path: str(path), Path(textFilePathsRoot).glob("*.txt")))
    dataDicts = []
    for textFilePath in textFilePaths:
        dataDicts.append(parser(textFilePath))
    dataFrame = pd.DataFrame(dataDicts)
    return dataFrame

convertAllFilesToDataFrame("path/to/your/text/file/directory")
This should still produce the expected output; in my case I only had two files, both containing exactly the same record.
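A rough sketch of that output, assuming both files hold the sample record from the question (the second file name below is invented for illustration):

               File Name    Yield                  Timestamp       Angle ErrorCode 10 ErrorCode 12
0  210107343_summary.txt  99.7598  2021/February/13-01:55:04  0.00309331            6            2
1  210107344_summary.txt  99.7598  2021/February/13-01:55:04  0.00309331            6            2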
Original answer
Depending on the structure of the text files, the problem can be solved in two ways:
- a text file contains exactly five lines (one record)
- a single text file may contain a multiple of 5 lines (multiple records)
Here is my take on both approaches:
import pandas as pd
import re
from os import sep, getcwd
from pathlib import Path
from collections import OrderedDict

def oneFileSingleRecordParser(textFilePath):
    fileName = textFilePath.rsplit(sep, 1)[-1]
    with open(textFilePath, "r") as textFile:
        # The structure is:
        # Yield:
        # Timestamp
        # Angle
        # ErrorCode 10
        # ErrorCode 12
        lines = textFile.readlines()
        if len(lines) != 5:
            raise Exception("The file at {} doesn't have a proper single record.".format(textFilePath))
        dataDict = OrderedDict()
        dataDict["File Name"] = fileName
        for line in lines:
            # regex to extract the key and value name
            matchObject = re.match(r"(\w+\s?\d*):\s(.*)", line.strip())
            if matchObject is not None:
                key, value = matchObject.groups()
                dataDict[key] = value
    return dict(dataDict)

def oneFileMultiRecordParser(textFilePath):
    fileName = textFilePath.rsplit(sep, 1)[-1]
    with open(textFilePath, "r") as textFile:
        # The structure is:
        # Yield_1:
        # Timestamp_1:
        # Angle_1:
        # ErrorCode 10_1:
        # ErrorCode 12_1:
        # Yield_2:
        # Timestamp_2:
        # Angle_2:
        # ErrorCode 10_2:
        # ErrorCode 12_2:
        # ...
        lines = textFile.readlines()
        if len(lines) % 5 != 0:
            raise Exception("The file at {} doesn't have a uniform structure.".format(textFilePath))
        records = []
        dataDict = OrderedDict()
        dataDict["File Name"] = fileName
        for index, line in enumerate(lines):
            # regex to extract the key and value name
            matchObject = re.match(r"(\w+\s?\d*):\s(.*)", line.strip())
            if matchObject is not None:
                key, value = matchObject.groups()
                dataDict[key] = value
            else:
                raise Exception("Line={}, content=\"{}\" has some formatting issues, regex failed".format(index + 1, line))
            if (index + 1) % 5 == 0:
                records.append(dataDict)
                dataDict = OrderedDict()  # reset for the next record
                dataDict["File Name"] = fileName
    return records

def convertAllFilesToDataFrame(
    parser=oneFileSingleRecordParser,
    validParserNames=("oneFileSingleRecordParser", "oneFileMultiRecordParser",)
):
    if parser.__name__ not in validParserNames:
        raise Exception("Proper parser was not used")
    pathToFiles = getcwd()
    textFilePaths = list(map(lambda path: str(path), Path(pathToFiles).glob("*.txt")))
    dataDicts = []
    for textFilePath in textFilePaths:
        if parser.__name__ == validParserNames[0]:
            dataDicts.append(parser(textFilePath))
        elif parser.__name__ == validParserNames[1]:
            dataDicts.extend(parser(textFilePath))
    dataFrame = pd.DataFrame(dataDicts)
    return dataFrame
convertAllFilesToDataFrame(parser = oneFileMultiRecordParser)
will produce a DataFrame with one row per record, while
convertAllFilesToDataFrame(parser = oneFileSingleRecordParser)
will produce a DataFrame with one row per file.
The code is not entirely DRY, but getting it there would take a bit more time.
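As a purely hypothetical sketch of one way to DRY it up (the helper name parseLine is my own and not part of the code above), the repeated regex handling could be pulled into a shared function that both parsers call:

# hypothetical helper, not part of the original code
def parseLine(line):
    # extract the key and the value from a "key: value" line; None if the line doesn't match
    matchObject = re.match(r"(\w+\s?\d*):\s(.*)", line.strip())
    return matchObject.groups() if matchObject is not None else None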
OK, so you say there is one record per file, but there are many files. Let's assume you have something that gives you the file names, so that list(filenames()) is a list of the relevant file names (a sketch of one possible filenames() helper follows the code below).
You should first build a function that builds a dictionary from a single file:
fieldnames = ['Yield', 'Timestamp', 'Angle', 'ErrorCode 10', 'ErrorCode 12',
              'ErrorCode 13', 'ErrorCode 20']

def getrecord(filename):
    with open(filename) as fd:
        d = {'FileName': filename}
        for line in fd:
            k, v = [i.strip() for i in line.split(':', 1)]
            if k in fieldnames:
                d[k] = v
    return d
You can now build the DataFrame with:
df = pd.DataFrame([getrecord(filename) for filename in filenames()],
                  columns=['FileName'] + fieldnames)
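A minimal sketch of the filenames() helper assumed above, using the standard glob module (the directory pattern is a placeholder to adjust to wherever your summary files live):

import glob

def filenames():
    # placeholder pattern; point it at the directory that holds your .txt summary files
    return glob.iglob('path/to/your/text/file/directory/*.txt')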