通过其中的关键字解析 total.txt 文件
Parsing a total.txt file by keywords in it
我在解析文件时遇到问题。我有通过单词 Total 解析文件的代码:如果它的值大于 20.0 并且 returns 数据。我需要将搜索关键字更改为 Tokens eth: 值大于 20.0 并输出分隔符之间的所有数据 ======== 并将所有排序的值另外写入 sort.txt 文件。如果能得到专业帮助,我将不胜感激)
代码:
outlist = []
flag = False
def dump(list_, flag_):
if list_ and flag_:
print('\n'.join(list_))
return [], False
with open('total.txt') as file:
for line in map(str.strip, file):
if line.startswith('='):
outlist, flag = dump(outlist, flag)
else:
tokens = line.split()
if len(tokens) == 3 and tokens[1] == 'Total:':
try:
flag = float(tokens[2][:-1]) > 20.0
except ValueError:
pass
outlist.append(line)
dump(outlist, flag)
total.txt
============
| hafuia
| 0xb34a47885262f9d8673dc77de7b583961134f09fb03620b29d282c32ee6932be
| 0xD0b2612a6eE3111114b43b25322C6F08A251D38D
| Total: 47.62874464666479$
|
|
| Tokens eth:
| 20.608732$ MANA
|
| Protocols cro:
| 17.840052$ VVS Finance
| 8.953779$ V3S Finance
============
| asdf
| 0x72e164aa187feaff7cb28a74b7ff800a0dfe916594c70f141069669e9df5a23b
| 0xC7dFe558ed09F0f3b72eBb0A04e9d4e99af0bd0D
| Total: 22.908481672796988$
|
|
| Tokens eth:
| 22.376087$ SOS
============
| asdf
| 0xbce666bca3c862a2ee44651374f95aca677de16b4922c6d5e7d922cc0ac42a3d
| 0x5870923a244f52fF2D119fbf5525421E32EC006e
| Total: 9.077030269778557$
|
|
| Tokens eth:
| 8.942218$ SOS
============
这是解析文件的方式。
def parse_output(filename):
outlist = []
with open(filename) as file:
new_block = False
to_write = False
lines_arr = []
for line in map(str.strip, file):
if line.startswith('======='):
new_block = not new_block
if new_block:
if to_write:
outlist.append(lines_arr)
lines_arr = []
new_block = False
to_write = False
else:
lines_arr.append(line)
if 'Total:' in line:
num = float(line.split()[-1][:-1])
if num > 20:
to_write = True
return outlist
def write_output(outlist, filename):
for block in outlist:
for line in block:
with open(filename, 'a') as out_file:
out_file.write(line + '\n')
with open(filename, 'a') as out_file:
out_file.write('=======' + '\n')
if __name__ == '__main__':
write_output(parse_output('total.txt'), 'output.txt')
我错过了分类钱包的事情。对于排序,在将数组附加到 outlist 时,您可以使用另一个数组进行排序,或者将数字添加到数组中,对输出进行排序,并在写入时跳过第一个元素。
写的很容易上手。地址也是如此。使用简单的 lambda 函数完成排序。
from pprint import pprint
wallet_splitter = "============"
wallet_content_start = "Tokens eth:"
wallet_line_start = "|"
with open("totals.txt") as infile:
wallets = infile.read().split(wallet_splitter)
print(wallets)
wallets_above_20 = []
for wallet in wallets:
total = 0
separate = []
contents = False
for line in wallet.splitlines():
if wallet_content_start in line:
contents = True
elif contents:
if "$" in line:
separate.append(line.replace(wallet_line_start, "").split("$")[0])
total += float(separate[-1])
else:
contents = False
for amount in separate:
if float(amount) > 20:
wallets_above_20.append({
"total": total,
"data": wallet
})
pprint(sorted(wallets_above_20, key = lambda i: i['total'],reverse=True))
这是另一种简单的可扩展方法,您可以使用它来实现您的需要。评论会解释代码。
# Create a simple representational object with data for every record.
class RateObject:
# You can change the delimiter to whatever you want.
def __init__(self, text_lines: list, delimiter="Tokens eth:"):
self.text_lines = text_lines
index = [i for i, x in enumerate(text_lines) if delimiter in x][0]
# Get the value from delimiter line
self.value = self._get_value(index)
# Override this method, to change the way you extract the value. From same line or different line etc.
def _get_value(self, delimiter_index: int):
# Case of Tokens eth:
value = self.text_lines[delimiter_index + 1]
value = value.strip()
# A bad parsing for numbers, can be improved may be!
number = "".join([x for x in value if x.isdigit() or x == "."])
if number:
return float(number)
else:
# Assume 0 for unknown values
return 0.0
def __str__(self):
# Return the lines as it is
return "".join(self.text_lines)
def __repr__(self):
return "".join(self.text_lines)
# read the source file
with open("src.txt", "r") as src:
line_texts = src.readlines()
# Split the lines into sections, using the delimiter ========
splitters = [index for index, text in enumerate(line_texts) if text == "============\n"]
# Create a list of RateObjects
raw_objects = [RateObject(lt) for lt in [line_texts[splitters[i]:splitters[i + 1]] for i in range(len(splitters) - 1)]]
# Filter the objects, to get only the ones with value > 20
selected_objects = list(filter(lambda x: x.value > 20.0, raw_objects))
# Sort the objects by value
sorted_objects = sorted(selected_objects, key=lambda x: x.value, reverse=True)
# print(selected_objects)
# print(sorted_objects)
# Write the sorted objects to a file
with open("sorted.txt", "w") as dst:
dst.write("\n".join([str(x) for x in sorted_objects]))
这是一个简单的 generator-based 方法。
def items(file):
"""
Generator to yield items from filename
whose "Tokens eth:" is above 20.0
"""
with open(file) as lines:
item = []
tokens = 0
capture = False
for line in lines:
if line == "============\n":
if tokens > 20.0:
yield tokens, item
item = []
tokens = 0
continue
if capture:
tokens = float(line.strip().split()[-2].rstrip("$"))
capture = False
if line.startswith("| Tokens eth:"):
# Set flag to capture next line when we get to it
capture = True
item.append(line)
def main():
import sys
print("============")
for tokens, item in sorted(list(items(sys.argv[1]))):
print("".join(item), end="")
print("============")
if __name__ == "__main__":
main()
为简单起见,我让生成器也执行过滤,但如果您想使其可重用,可以很容易地从调用方删除总数较低的项目。
事实上,我建议您只解析一次这种随意的文件格式,然后将其转换为 CSV 或 JSON 等标准格式,以便进一步处理(如果超过 one-off) .
使用标准库的 re
模块中的正则表达式,例如,您可以将文本分成由分隔符包围的块,然后找出每个块中的 eth 数量,排序并最终过滤他们。
# parameters
total_txt = """from question"""
sorted_file_name = 'sort.txt'
THRESHOLD = 20.
as_dicreasing_order = False
# body
separators = re.finditer('='*12, total_txt)
separators = list(separators)
blocks = map(total_txt.__getitem__, [slice(m1.start(), m2.start()) for m1, m2 in zip(separators, separators[1:])])
amount_block_pairs = [(float(re.search(r'Tokens eth:\n\| (\d*\.\d*)$', block, re.M).group(1)), block) for block in blocks]
# reverse=False for increasing order, True for the opposite
sorted_blocks = sorted(amount_block_pairs, reverse=as_dicreasing_order)
filtered_blocks = [block for amount, block in sorted_blocks if amount >= THRESHOLD]
with open(sorted_file_name, 'w') as fd:
fd.write(''.join(filtered_blocks))
另一种选择是使用 python ttp 模板来解析您的数据。在下面的代码中,它检查你的总值,找出低于 20.0 的值。然后,代码要求输入一个值,该值将替换为小于 20 的 Tokens eth:
。
from ttp import ttp
import json
with open('total.txt') as f:
data_to_parse = f.read()
ttp_template = '''
| Total: {{total}}$
| {{tokens_eth}}$ {{ignore}}
'''
parser = ttp(data=data_to_parse, template=ttp_template)
parser.parse()
# print result in JSON format
results = parser.result(format='json')[0]
#print(results)
#converting str to json.
result = json.loads(results)
# print(result)
for i in result[0]:
# print(i)
if float(i['total']) < 20:
new_tokens_eth = float(input(f"Total value is {i['total']} lower than 20. Enter a new 'Tokens eth:' value: "))
if i['tokens_eth'] in data_to_parse:
data_to_parse = data_to_parse.replace(i['tokens_eth'], str(new_tokens_eth))
print(data_to_parse)
查看解析后的数据:
看代码后的输出是运行.
我在解析文件时遇到问题。我有通过单词 Total 解析文件的代码:如果它的值大于 20.0 并且 returns 数据。我需要将搜索关键字更改为 Tokens eth: 值大于 20.0 并输出分隔符之间的所有数据 ======== 并将所有排序的值另外写入 sort.txt 文件。如果能得到专业帮助,我将不胜感激)
代码:
outlist = []
flag = False
def dump(list_, flag_):
if list_ and flag_:
print('\n'.join(list_))
return [], False
with open('total.txt') as file:
for line in map(str.strip, file):
if line.startswith('='):
outlist, flag = dump(outlist, flag)
else:
tokens = line.split()
if len(tokens) == 3 and tokens[1] == 'Total:':
try:
flag = float(tokens[2][:-1]) > 20.0
except ValueError:
pass
outlist.append(line)
dump(outlist, flag)
total.txt
============
| hafuia
| 0xb34a47885262f9d8673dc77de7b583961134f09fb03620b29d282c32ee6932be
| 0xD0b2612a6eE3111114b43b25322C6F08A251D38D
| Total: 47.62874464666479$
|
|
| Tokens eth:
| 20.608732$ MANA
|
| Protocols cro:
| 17.840052$ VVS Finance
| 8.953779$ V3S Finance
============
| asdf
| 0x72e164aa187feaff7cb28a74b7ff800a0dfe916594c70f141069669e9df5a23b
| 0xC7dFe558ed09F0f3b72eBb0A04e9d4e99af0bd0D
| Total: 22.908481672796988$
|
|
| Tokens eth:
| 22.376087$ SOS
============
| asdf
| 0xbce666bca3c862a2ee44651374f95aca677de16b4922c6d5e7d922cc0ac42a3d
| 0x5870923a244f52fF2D119fbf5525421E32EC006e
| Total: 9.077030269778557$
|
|
| Tokens eth:
| 8.942218$ SOS
============
这是解析文件的方式。
def parse_output(filename):
outlist = []
with open(filename) as file:
new_block = False
to_write = False
lines_arr = []
for line in map(str.strip, file):
if line.startswith('======='):
new_block = not new_block
if new_block:
if to_write:
outlist.append(lines_arr)
lines_arr = []
new_block = False
to_write = False
else:
lines_arr.append(line)
if 'Total:' in line:
num = float(line.split()[-1][:-1])
if num > 20:
to_write = True
return outlist
def write_output(outlist, filename):
for block in outlist:
for line in block:
with open(filename, 'a') as out_file:
out_file.write(line + '\n')
with open(filename, 'a') as out_file:
out_file.write('=======' + '\n')
if __name__ == '__main__':
write_output(parse_output('total.txt'), 'output.txt')
我错过了分类钱包的事情。对于排序,在将数组附加到 outlist 时,您可以使用另一个数组进行排序,或者将数字添加到数组中,对输出进行排序,并在写入时跳过第一个元素。
写的很容易上手。地址也是如此。使用简单的 lambda 函数完成排序。
from pprint import pprint
wallet_splitter = "============"
wallet_content_start = "Tokens eth:"
wallet_line_start = "|"
with open("totals.txt") as infile:
wallets = infile.read().split(wallet_splitter)
print(wallets)
wallets_above_20 = []
for wallet in wallets:
total = 0
separate = []
contents = False
for line in wallet.splitlines():
if wallet_content_start in line:
contents = True
elif contents:
if "$" in line:
separate.append(line.replace(wallet_line_start, "").split("$")[0])
total += float(separate[-1])
else:
contents = False
for amount in separate:
if float(amount) > 20:
wallets_above_20.append({
"total": total,
"data": wallet
})
pprint(sorted(wallets_above_20, key = lambda i: i['total'],reverse=True))
这是另一种简单的可扩展方法,您可以使用它来实现您的需要。评论会解释代码。
# Create a simple representational object with data for every record.
class RateObject:
# You can change the delimiter to whatever you want.
def __init__(self, text_lines: list, delimiter="Tokens eth:"):
self.text_lines = text_lines
index = [i for i, x in enumerate(text_lines) if delimiter in x][0]
# Get the value from delimiter line
self.value = self._get_value(index)
# Override this method, to change the way you extract the value. From same line or different line etc.
def _get_value(self, delimiter_index: int):
# Case of Tokens eth:
value = self.text_lines[delimiter_index + 1]
value = value.strip()
# A bad parsing for numbers, can be improved may be!
number = "".join([x for x in value if x.isdigit() or x == "."])
if number:
return float(number)
else:
# Assume 0 for unknown values
return 0.0
def __str__(self):
# Return the lines as it is
return "".join(self.text_lines)
def __repr__(self):
return "".join(self.text_lines)
# read the source file
with open("src.txt", "r") as src:
line_texts = src.readlines()
# Split the lines into sections, using the delimiter ========
splitters = [index for index, text in enumerate(line_texts) if text == "============\n"]
# Create a list of RateObjects
raw_objects = [RateObject(lt) for lt in [line_texts[splitters[i]:splitters[i + 1]] for i in range(len(splitters) - 1)]]
# Filter the objects, to get only the ones with value > 20
selected_objects = list(filter(lambda x: x.value > 20.0, raw_objects))
# Sort the objects by value
sorted_objects = sorted(selected_objects, key=lambda x: x.value, reverse=True)
# print(selected_objects)
# print(sorted_objects)
# Write the sorted objects to a file
with open("sorted.txt", "w") as dst:
dst.write("\n".join([str(x) for x in sorted_objects]))
这是一个简单的 generator-based 方法。
def items(file):
"""
Generator to yield items from filename
whose "Tokens eth:" is above 20.0
"""
with open(file) as lines:
item = []
tokens = 0
capture = False
for line in lines:
if line == "============\n":
if tokens > 20.0:
yield tokens, item
item = []
tokens = 0
continue
if capture:
tokens = float(line.strip().split()[-2].rstrip("$"))
capture = False
if line.startswith("| Tokens eth:"):
# Set flag to capture next line when we get to it
capture = True
item.append(line)
def main():
import sys
print("============")
for tokens, item in sorted(list(items(sys.argv[1]))):
print("".join(item), end="")
print("============")
if __name__ == "__main__":
main()
为简单起见,我让生成器也执行过滤,但如果您想使其可重用,可以很容易地从调用方删除总数较低的项目。
事实上,我建议您只解析一次这种随意的文件格式,然后将其转换为 CSV 或 JSON 等标准格式,以便进一步处理(如果超过 one-off) .
使用标准库的 re
模块中的正则表达式,例如,您可以将文本分成由分隔符包围的块,然后找出每个块中的 eth 数量,排序并最终过滤他们。
# parameters
total_txt = """from question"""
sorted_file_name = 'sort.txt'
THRESHOLD = 20.
as_dicreasing_order = False
# body
separators = re.finditer('='*12, total_txt)
separators = list(separators)
blocks = map(total_txt.__getitem__, [slice(m1.start(), m2.start()) for m1, m2 in zip(separators, separators[1:])])
amount_block_pairs = [(float(re.search(r'Tokens eth:\n\| (\d*\.\d*)$', block, re.M).group(1)), block) for block in blocks]
# reverse=False for increasing order, True for the opposite
sorted_blocks = sorted(amount_block_pairs, reverse=as_dicreasing_order)
filtered_blocks = [block for amount, block in sorted_blocks if amount >= THRESHOLD]
with open(sorted_file_name, 'w') as fd:
fd.write(''.join(filtered_blocks))
另一种选择是使用 python ttp 模板来解析您的数据。在下面的代码中,它检查你的总值,找出低于 20.0 的值。然后,代码要求输入一个值,该值将替换为小于 20 的 Tokens eth:
。
from ttp import ttp
import json
with open('total.txt') as f:
data_to_parse = f.read()
ttp_template = '''
| Total: {{total}}$
| {{tokens_eth}}$ {{ignore}}
'''
parser = ttp(data=data_to_parse, template=ttp_template)
parser.parse()
# print result in JSON format
results = parser.result(format='json')[0]
#print(results)
#converting str to json.
result = json.loads(results)
# print(result)
for i in result[0]:
# print(i)
if float(i['total']) < 20:
new_tokens_eth = float(input(f"Total value is {i['total']} lower than 20. Enter a new 'Tokens eth:' value: "))
if i['tokens_eth'] in data_to_parse:
data_to_parse = data_to_parse.replace(i['tokens_eth'], str(new_tokens_eth))
print(data_to_parse)
查看解析后的数据:
看代码后的输出是运行.