使用 parsec.py 解析字符串
Parsing string using parsec.py
我想像这样解析一个字符串:
Object A -> Object B [AB_name] Object B -> Object C [BC_name] ...
我的目标是获得三个列表:
Parents = ['Object A', 'Object B', ...]
Children = ['Object B', 'Object C', ...]
PC_names = ['AB_name','BC_name', ...]
我已经有了一个可行的解决方案,但它难以理解,因此难以维护,而且不是很健壮。基本上我的代码在字符串上循环两次,从而将子字符串添加到几个列表中。
为了解决这个问题,我阅读了 parsec.py 库,但到目前为止,我找不到适合像我这样的新手的好例子。我已经尝试在其他文章和文档的帮助下弄清楚它是如何工作的,但到目前为止收效甚微。
任何提示我都非常感谢。
test_input.txt:
Society {
A man -> a child [was once]
A man -> an automobile [has]
A woman -> a person [is]
A man -> a person [is]
A man -> a child [was once]
}
我当前的代码:
from typing import List
from parsec import *
class Type(object):
    """A labelled type; instances serve as the domain/codomain of an Aspect.

    No ``__init__`` is defined, so ``label`` does not exist on a new instance
    until the caller assigns it.
    """

    # Declared only; there is no default value.
    label: str
class Aspect(object):
    """A labelled relation pointing from a ``domain`` Type to a ``codomain`` Type.

    ``label`` is declared but not initialised; callers assign it after
    construction.
    """

    domain: Type
    codomain: Type
    label: str

    def __init__(self) -> None:
        # Every aspect starts with its own fresh, unlabelled endpoint objects.
        self.domain = Type()
        self.codomain = Type()
class Olog(object):
    """A named container for a list of Aspect objects.

    ``name`` is declared but not initialised; callers assign it after
    construction.
    """

    name: str
    aspects: List[Aspect]

    def __init__(self) -> None:
        # A new list per instance, so aspects are never shared between ologs.
        self.aspects = []
# Hand-rolled parser: reads the olog file and fills `first` (an Olog) with one
# Aspect per "A -> B [label]" record.
# NOTE(review): indentation was lost in this listing, so any nesting implied by
# the comments below is inferred from the statements - confirm against the
# original script.
# NOTE(review): the sample input is called test_input.txt in the surrounding
# text, but this opens 'testinput.txt' - confirm which name is intended.
with open ('testinput.txt', 'r') as f:
f_content = f.read()
# Everything before the first "{" is taken as the olog name (not stripped).
olog_name = f_content.split("{")[0]
first = Olog()
first.aspects = []
first.name = olog_name
# Text following the first "{" (up to the next "{", if any); for the sample
# input this still contains the closing "}".
olog_data = f_content.split("{")[1]
# Splitting on "]" yields chunks that each lack their closing bracket.
olog_data_lines = olog_data.split(']')
# Placeholder: binds the type object `str`; rebound to "->" further down.
orientation = str
# counter1 counts "->" tokens seen so far; counter2 is incremented by the last
# statement of this block. Presumably their comparison tells which side of the
# arrow the current word belongs to - verify.
counter1 = 0
counter2 = 0
# Despite the names, these later receive integer indexes into type_store_split.
domain_str = ''
codomain_str = ''
# type_comma is not referenced again in this block.
type_comma = Type()
type_comma.label = ","
# string_store is first bound to the type `str`, then immediately to ''; it
# accumulates the type names seen so far, each followed by ", ".
string_store = str
string_store = ''
type_store = Type()
# Placeholder list holding the Type class itself; replaced by split(",") below.
type_store_split = [Type]
# `lines` is a single chunk produced by the split on "]" above.
for lines in olog_data_lines:
first_type = ''
second_type = ''
aspect_label = str
# first_T is not referenced again in this block; second_T only has its label set.
first_T = Type()
second_T = Type()
# Put back the "]" that the split removed.
lines += ']'
lines_split = lines.split()
type_in_list = False
for word in lines_split:
if word == "}" and counter1 == 0:
print("Olog is empty")
if word == "}":
print(">>>Olog was saved")
break
if word == "->":
counter1 +=1
# While the counters are equal, the word is collected into first_type.
# list.index() returns the first position of `word`, so a word that repeats
# within the chunk is always reported at its first occurrence.
if counter1 == counter2 and lines_split.index(word) == 0:
first_type = word
if counter1 == counter2 and not lines_split.index(word) == 0:
first_type = first_type + (" " + word)
if word == "->":
orientation = "->"
# Record first_type in the running store and re-split the store into a list.
# NOTE(review): items are appended with ", " but split on ",", so every piece
# after the first starts with a space - check that the equality tests below
# can match as intended.
string_store = string_store + first_type + ", "
type_store.label = string_store
type_store_split = type_store.label.split(",")
# domain_str becomes the index of the first entry equal to first_type, or
# len(type_store_split) - 2 when no match was flagged.
for types in type_store_split:
if types == first_type:
domain_str = int(type_store_split.index(types))
type_in_list = True
break
if not type_in_list:
domain_str = int(len(type_store_split)-2)
# Once the counters differ: a word starting with "[" triggers extraction of the
# text between the first "[" and the following "]" of the chunk as the aspect
# label; any other word is appended to second_type after one "->" is removed.
if not counter1 == counter2:
if word[0] == "[":
aspect_label = (lines.split('[', 1)[1].split(']')[0])
else: second_type = second_type.replace('->','', 1) + " " + word
# A word ending in "]" marks the end of the record.
if (word[len(word)-1]=="]"):
second_T.label = second_type
string_store = string_store + second_type + ", "
type_store.label = string_store
type_store_split = type_store.label.split(",")
# codomain_str becomes the index of the first entry equal to second_type, or
# len(type_store_split) - 2 when the last entry is reached.
for types in type_store_split:
if types == second_type:
codomain_str = int(type_store_split.index(types))
second_T.label = codomain_str
break
elif types == type_store_split[len(type_store_split)-1]:
codomain_str = int(len(type_store_split)-2)
second_T.label = codomain_str
# Build the Aspect. Fresh Type objects are assigned for both ends and their
# labels receive the integer values computed above; second_T is not attached.
aspect_A = Aspect()
aspect_A.label = aspect_label
aspect_A.domain = Type()
aspect_A.codomain = Type()
aspect_A.domain.label = domain_str
aspect_A.codomain.label = codomain_str
first.aspects.append(aspect_A)
counter2 += 1
此解决方案使用 `re` 解析输入行,然后递归遍历解析结果,通过 `yield` 依次产出 parents、children 和 pc_names:
import re, collections
def parse_line(l):
    """Split one ``A -> B [label]`` line into token lists, one per arrow side.

    The line is cut at every ``->`` (surrounding whitespace is consumed by the
    separator). Each side is stripped and then tokenised into bracketed
    ``[...]`` chunks and runs of word/whitespace characters.

    Args:
        l: A single line of input text.

    Returns:
        A list with one token list per side, e.g.
        ``[['A man'], ['a child ', '[was once]']]``. A name that is followed
        by a bracketed label keeps its trailing space, because only the side
        as a whole is stripped.
    """
    # Raw strings keep the patterns identical while avoiding the
    # invalid-escape-sequence warning that '\[' / '\s' / '\-' trigger.
    sides = re.split(r'\s*\-\>\s*', l)
    return [re.findall(r'\[.*?\]|[\w\s]+', side.strip()) for side in sides]
# Parse every relation line of the input file. Lines containing "{" or "}"
# (the "Name {" header and the closing brace) are skipped, and so are blank
# lines: they would parse to a single empty token list and break the
# two-element unpacking done on each entry of `lines` later on.
# The context manager closes the file handle, which the bare open() never did.
with open('test_input.txt') as source:
    lines = [
        parse_line(i)
        for i in source
        if i.strip() and not re.search(r'[\{\}]', i)
    ]
def get_vals(d, s=None):
    """Walk the parsed relations and yield ``(category, value)`` pairs.

    Reads the module-level ``lines`` list, whose entries are
    ``[parent_tokens, child_tokens]`` pairs as produced by ``parse_line``.

    Args:
        d: Token list of the current node; ``d[0]`` is its name and, when the
            list has more than one element, ``d[-1]`` is a bracketed label.
        s: Names already visited on the current path. Defaults to an empty
            list (``None`` is used as the default so no list object is shared
            between calls).

    Yields:
        ``('pc_names', label)`` with the surrounding brackets removed,
        ``('parents', name)`` when ``name`` appears as a parent in ``lines``,
        and ``('children', name)`` otherwise.
    """
    visited = [] if s is None else s
    if len(d) > 1:
        # d[-1] looks like "[label]"; drop the first and last character.
        yield ('pc_names', d[-1][1:-1])
    # All child token lists whose parent name equals the current node's name.
    children = [b for a, b in lines if d[0] == a[0]]
    if children:
        yield ('parents', d[0])
        for child in children:
            yield from get_vals(child, visited + [d[0]])
        return
    # The current node never occurs as a parent: report it as a child, then
    # continue with the first parent whose name is not yet on the path.
    yield ('children', d[0])
    remaining = [a for a, _ in lines if a[0] not in visited]
    if remaining:
        yield from get_vals(remaining[0], visited + [d[0]])
# Group the (category, value) pairs by category. Sets drop duplicates, so the
# order of the printed lists is not guaranteed.
result = collections.defaultdict(set)
if lines:  # lines[0][0] would raise IndexError when no relation line was read
    for a, b in get_vals(lines[0][0]):
        result[a].add(b)
print({a: list(b) for a, b in result.items()})
输出:
{'parents': ['A woman', 'A man'],
'pc_names': ['was once', 'is', 'has'],
'children': ['a person ', 'an automobile ', 'a child ']}
第二个 test_input.txt 的内容:
Object A -> Object B [AB_name]
Object B -> Object C [BC_name]
结果:
{'parents': ['Object B', 'Object A'],
'pc_names': ['AB_name', 'BC_name'],
'children': ['Object B ', 'Object C ']}
我想像这样解析一个字符串:
Object A -> Object B [AB_name] Object B -> Object C [BC_name] ...
我的目标是获得三个列表:
Parents = ['Object A', 'Object B', ...]
Children = ['Object B', 'Object C', ...]
PC_names = ['AB_name','BC_name', ...]
我已经有了一个可行的解决方案,但它难以理解,因此难以维护,而且不是很健壮。基本上我的代码在字符串上循环两次,从而将子字符串添加到几个列表中。
为了解决这个问题,我阅读了 parsec.py 库,但到目前为止,我找不到适合像我这样的新手的好例子。我已经尝试在其他文章和文档的帮助下弄清楚它是如何工作的,但到目前为止收效甚微。
任何提示我都非常感谢。
test_input.txt:
Society {
A man -> a child [was once]
A man -> an automobile [has]
A woman -> a person [is]
A man -> a person [is]
A man -> a child [was once]
}
我当前的代码:
from typing import List
from parsec import *
class Type(object):
    """A labelled type; instances serve as the domain/codomain of an Aspect.

    No ``__init__`` is defined, so ``label`` does not exist on a new instance
    until the caller assigns it.
    """

    # Declared only; there is no default value.
    label: str
class Aspect(object):
    """A labelled relation pointing from a ``domain`` Type to a ``codomain`` Type.

    ``label`` is declared but not initialised; callers assign it after
    construction.
    """

    domain: Type
    codomain: Type
    label: str

    def __init__(self) -> None:
        # Every aspect starts with its own fresh, unlabelled endpoint objects.
        self.domain = Type()
        self.codomain = Type()
class Olog(object):
    """A named container for a list of Aspect objects.

    ``name`` is declared but not initialised; callers assign it after
    construction.
    """

    name: str
    aspects: List[Aspect]

    def __init__(self) -> None:
        # A new list per instance, so aspects are never shared between ologs.
        self.aspects = []
# Hand-rolled parser: reads the olog file and fills `first` (an Olog) with one
# Aspect per "A -> B [label]" record.
# NOTE(review): indentation was lost in this listing, so any nesting implied by
# the comments below is inferred from the statements - confirm against the
# original script.
# NOTE(review): the sample input is called test_input.txt in the surrounding
# text, but this opens 'testinput.txt' - confirm which name is intended.
with open ('testinput.txt', 'r') as f:
f_content = f.read()
# Everything before the first "{" is taken as the olog name (not stripped).
olog_name = f_content.split("{")[0]
first = Olog()
first.aspects = []
first.name = olog_name
# Text following the first "{" (up to the next "{", if any); for the sample
# input this still contains the closing "}".
olog_data = f_content.split("{")[1]
# Splitting on "]" yields chunks that each lack their closing bracket.
olog_data_lines = olog_data.split(']')
# Placeholder: binds the type object `str`; rebound to "->" further down.
orientation = str
# counter1 counts "->" tokens seen so far; counter2 is incremented by the last
# statement of this block. Presumably their comparison tells which side of the
# arrow the current word belongs to - verify.
counter1 = 0
counter2 = 0
# Despite the names, these later receive integer indexes into type_store_split.
domain_str = ''
codomain_str = ''
# type_comma is not referenced again in this block.
type_comma = Type()
type_comma.label = ","
# string_store is first bound to the type `str`, then immediately to ''; it
# accumulates the type names seen so far, each followed by ", ".
string_store = str
string_store = ''
type_store = Type()
# Placeholder list holding the Type class itself; replaced by split(",") below.
type_store_split = [Type]
# `lines` is a single chunk produced by the split on "]" above.
for lines in olog_data_lines:
first_type = ''
second_type = ''
aspect_label = str
# first_T is not referenced again in this block; second_T only has its label set.
first_T = Type()
second_T = Type()
# Put back the "]" that the split removed.
lines += ']'
lines_split = lines.split()
type_in_list = False
for word in lines_split:
if word == "}" and counter1 == 0:
print("Olog is empty")
if word == "}":
print(">>>Olog was saved")
break
if word == "->":
counter1 +=1
# While the counters are equal, the word is collected into first_type.
# list.index() returns the first position of `word`, so a word that repeats
# within the chunk is always reported at its first occurrence.
if counter1 == counter2 and lines_split.index(word) == 0:
first_type = word
if counter1 == counter2 and not lines_split.index(word) == 0:
first_type = first_type + (" " + word)
if word == "->":
orientation = "->"
# Record first_type in the running store and re-split the store into a list.
# NOTE(review): items are appended with ", " but split on ",", so every piece
# after the first starts with a space - check that the equality tests below
# can match as intended.
string_store = string_store + first_type + ", "
type_store.label = string_store
type_store_split = type_store.label.split(",")
# domain_str becomes the index of the first entry equal to first_type, or
# len(type_store_split) - 2 when no match was flagged.
for types in type_store_split:
if types == first_type:
domain_str = int(type_store_split.index(types))
type_in_list = True
break
if not type_in_list:
domain_str = int(len(type_store_split)-2)
# Once the counters differ: a word starting with "[" triggers extraction of the
# text between the first "[" and the following "]" of the chunk as the aspect
# label; any other word is appended to second_type after one "->" is removed.
if not counter1 == counter2:
if word[0] == "[":
aspect_label = (lines.split('[', 1)[1].split(']')[0])
else: second_type = second_type.replace('->','', 1) + " " + word
# A word ending in "]" marks the end of the record.
if (word[len(word)-1]=="]"):
second_T.label = second_type
string_store = string_store + second_type + ", "
type_store.label = string_store
type_store_split = type_store.label.split(",")
# codomain_str becomes the index of the first entry equal to second_type, or
# len(type_store_split) - 2 when the last entry is reached.
for types in type_store_split:
if types == second_type:
codomain_str = int(type_store_split.index(types))
second_T.label = codomain_str
break
elif types == type_store_split[len(type_store_split)-1]:
codomain_str = int(len(type_store_split)-2)
second_T.label = codomain_str
# Build the Aspect. Fresh Type objects are assigned for both ends and their
# labels receive the integer values computed above; second_T is not attached.
aspect_A = Aspect()
aspect_A.label = aspect_label
aspect_A.domain = Type()
aspect_A.codomain = Type()
aspect_A.domain.label = domain_str
aspect_A.codomain.label = codomain_str
first.aspects.append(aspect_A)
counter2 += 1
此解决方案使用 `re` 解析输入行,然后递归遍历解析结果,通过 `yield` 依次产出 parents、children 和 pc_names:
import re, collections
def parse_line(l):
    """Split one ``A -> B [label]`` line into token lists, one per arrow side.

    The line is cut at every ``->`` (surrounding whitespace is consumed by the
    separator). Each side is stripped and then tokenised into bracketed
    ``[...]`` chunks and runs of word/whitespace characters.

    Args:
        l: A single line of input text.

    Returns:
        A list with one token list per side, e.g.
        ``[['A man'], ['a child ', '[was once]']]``. A name that is followed
        by a bracketed label keeps its trailing space, because only the side
        as a whole is stripped.
    """
    # Raw strings keep the patterns identical while avoiding the
    # invalid-escape-sequence warning that '\[' / '\s' / '\-' trigger.
    sides = re.split(r'\s*\-\>\s*', l)
    return [re.findall(r'\[.*?\]|[\w\s]+', side.strip()) for side in sides]
# Parse every relation line of the input file. Lines containing "{" or "}"
# (the "Name {" header and the closing brace) are skipped, and so are blank
# lines: they would parse to a single empty token list and break the
# two-element unpacking done on each entry of `lines` later on.
# The context manager closes the file handle, which the bare open() never did.
with open('test_input.txt') as source:
    lines = [
        parse_line(i)
        for i in source
        if i.strip() and not re.search(r'[\{\}]', i)
    ]
def get_vals(d, s=None):
    """Walk the parsed relations and yield ``(category, value)`` pairs.

    Reads the module-level ``lines`` list, whose entries are
    ``[parent_tokens, child_tokens]`` pairs as produced by ``parse_line``.

    Args:
        d: Token list of the current node; ``d[0]`` is its name and, when the
            list has more than one element, ``d[-1]`` is a bracketed label.
        s: Names already visited on the current path. Defaults to an empty
            list (``None`` is used as the default so no list object is shared
            between calls).

    Yields:
        ``('pc_names', label)`` with the surrounding brackets removed,
        ``('parents', name)`` when ``name`` appears as a parent in ``lines``,
        and ``('children', name)`` otherwise.
    """
    visited = [] if s is None else s
    if len(d) > 1:
        # d[-1] looks like "[label]"; drop the first and last character.
        yield ('pc_names', d[-1][1:-1])
    # All child token lists whose parent name equals the current node's name.
    children = [b for a, b in lines if d[0] == a[0]]
    if children:
        yield ('parents', d[0])
        for child in children:
            yield from get_vals(child, visited + [d[0]])
        return
    # The current node never occurs as a parent: report it as a child, then
    # continue with the first parent whose name is not yet on the path.
    yield ('children', d[0])
    remaining = [a for a, _ in lines if a[0] not in visited]
    if remaining:
        yield from get_vals(remaining[0], visited + [d[0]])
# Group the (category, value) pairs by category. Sets drop duplicates, so the
# order of the printed lists is not guaranteed.
result = collections.defaultdict(set)
if lines:  # lines[0][0] would raise IndexError when no relation line was read
    for a, b in get_vals(lines[0][0]):
        result[a].add(b)
print({a: list(b) for a, b in result.items()})
输出:
{'parents': ['A woman', 'A man'],
'pc_names': ['was once', 'is', 'has'],
'children': ['a person ', 'an automobile ', 'a child ']}
第二个 test_input.txt 的内容:
Object A -> Object B [AB_name]
Object B -> Object C [BC_name]
结果:
{'parents': ['Object B', 'Object A'],
'pc_names': ['AB_name', 'BC_name'],
'children': ['Object B ', 'Object C ']}