将 NLTK 块转换为字典列表
Converting NLTK chunks to a list of dictionaries
我需要从下面的示例 nltk 树中获取字典列表:
(S
I/PRP
'll/MD
have/VB
(amount 1/CD)
(plate pizza/NN)
and/CC
(amount 4/CD)
(plate sandwiches/NNS))
想要的输出如下
[{amount: 1, plate: pizza}, {amount: 4, plate: sandwiches}]
我试过下面的代码,但得到的列表里只有一个字典:
[{amount: 4, plate: sandwiches}]
看起来列表并没有追加新条目,而只是在反复更新同一个字典。
import nltk
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import Tree
# Code from the question, kept verbatim as the example of the problem.
# NOTE(review): leading indentation was lost in this paste; the statements
# after the `for` line presumably form its body -- confirm.
# NOTE(review): HiddenMarkovModelTagger is not imported by name in the visible
# imports; verify that one of the star imports actually provides it.
# The tagger is trained on an empty list here.
training = []
hmm_tagger = HiddenMarkovModelTagger.train(training)
sentence = "I'll have 1 pizza and 4 sandwiches"
# Chunk grammar: tokens tagged NN or NNS are labelled "plate", and tokens
# tagged CD or DT are labelled "amount".
gram = r"""
plate: {<NN|NNS>}
amount: {<CD|DT>}
"""
cp = nltk.RegexpParser(gram)
# `sentence` is a single str, so this loop visits it one character at a time.
for sent in sentence:
tokens = nltk.word_tokenize(sent)
taggex = hmm_tagger.tag(tokens)
treee = cp.parse(taggex)
# Converts the chunk tree to CoNLL tags and back into a tree (going by the
# function names) -- presumably a round trip; confirm it is needed.
iob_ts = tree2conlltags(treee)
tree = conlltags2tree(iob_ts)
# The asker's attempt at turning the chunk tree into a list of dictionaries.
# NOTE(review): indentation was lost in this paste, so the intended nesting of
# the statements below cannot be recovered with certainty.
def conversion(tree):
dlist = []
for leaf in tree:
# Tests for tuple items, yet .label() is called further down --
# NOTE(review): confirm whether this check was meant to select subtrees.
if type(leaf) == tuple:
# Iterates the same tree again and reuses the name `leaf`.
for leaf in tree:
key = leaf.label()
value = leaf[0][0]
# Rebinds dlist to a new list instead of appending to the existing one;
# zip() pairs the elements of `key` with the elements of `value`.
dlist =[dict(zip(key, value)) for leaf in tree]
return dlist
这里的主要问题是:无论在 conversion 函数内部还是外部,
您都没有在每次循环迭代之后把结果追加(append)到列表中。
from nltk.chunk.regexp import RegexpParser
from nltk import Tree, pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
# Chunk grammar: tokens tagged NN or NNS form a "plate" chunk, and tokens
# tagged CD or DT form an "amount" chunk.
gram = r"""
plate: {<NN|NNS>}
amount: {<CD|DT>}
"""
# Chunk parser built from the grammar above.
cp = RegexpParser(gram)
# Sample input text to be split into sentences, tagged and chunked.
text = "I'll have 1 pizza and 4 sandwiches"
def conversion(tree):
    """Group the labelled chunks of a parsed sentence into dictionaries.

    Consecutive chunk subtrees are collected into a single dictionary that
    maps each chunk's label to its words joined by single spaces. Any item
    that is not a subtree (a plain tagged token) closes the current group.
    Empty groups are never emitted.

    Args:
        tree: An iterable of parse-tree children, each either a ``Tree``
            chunk or a tagged-token leaf.

    Returns:
        A list of ``{label: words}`` dictionaries in order of appearance.
    """
    groups = []
    current = {}
    for item in tree:
        if isinstance(item, Tree):
            # A repeated label within the same run overwrites the earlier value.
            current[item.label()] = ' '.join(leaf[0] for leaf in item.leaves())
        elif current:
            # A non-chunk token ends the run; keep the group only if non-empty.
            groups.append(current)
            current = {}
    # Flush the last group when the sentence ends on a chunk.
    if current:
        groups.append(current)
    return groups
# Tag and chunk every sentence of the text, producing one parse tree each.
parsed_text = [
    cp.parse(pos_tag(word_tokenize(sent)))
    for sent in sent_tokenize(text)
]
# Print the extracted dictionaries for each sentence.
for tree in parsed_text:
    print(conversion(tree))
# Expected output:
# [{'amount': '1', 'plate': 'pizza'}, {'amount': '4', 'plate': 'sandwiches'}]
我需要从下面的示例 nltk 树中获取字典列表:
(S
I/PRP
'll/MD
have/VB
(amount 1/CD)
(plate pizza/NN)
and/CC
(amount 4/CD)
(plate sandwiches/NNS))
想要的输出如下
[{amount: 1, plate: pizza}, {amount: 4, plate: sandwiches}]
我试过下面的代码,但得到的列表里只有一个字典:[{amount: 4, plate: sandwiches}]。看起来列表并没有追加新条目,而只是在反复更新同一个字典。
import nltk
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import Tree
# Code from the question, kept verbatim as the example of the problem.
# NOTE(review): leading indentation was lost in this paste; the statements
# after the `for` line presumably form its body -- confirm.
# NOTE(review): HiddenMarkovModelTagger is not imported by name in the visible
# imports; verify that one of the star imports actually provides it.
# The tagger is trained on an empty list here.
training = []
hmm_tagger = HiddenMarkovModelTagger.train(training)
sentence = "I'll have 1 pizza and 4 sandwiches"
# Chunk grammar: tokens tagged NN or NNS are labelled "plate", and tokens
# tagged CD or DT are labelled "amount".
gram = r"""
plate: {<NN|NNS>}
amount: {<CD|DT>}
"""
cp = nltk.RegexpParser(gram)
# `sentence` is a single str, so this loop visits it one character at a time.
for sent in sentence:
tokens = nltk.word_tokenize(sent)
taggex = hmm_tagger.tag(tokens)
treee = cp.parse(taggex)
# Converts the chunk tree to CoNLL tags and back into a tree (going by the
# function names) -- presumably a round trip; confirm it is needed.
iob_ts = tree2conlltags(treee)
tree = conlltags2tree(iob_ts)
# The asker's attempt at turning the chunk tree into a list of dictionaries.
# NOTE(review): indentation was lost in this paste, so the intended nesting of
# the statements below cannot be recovered with certainty.
def conversion(tree):
dlist = []
for leaf in tree:
# Tests for tuple items, yet .label() is called further down --
# NOTE(review): confirm whether this check was meant to select subtrees.
if type(leaf) == tuple:
# Iterates the same tree again and reuses the name `leaf`.
for leaf in tree:
key = leaf.label()
value = leaf[0][0]
# Rebinds dlist to a new list instead of appending to the existing one;
# zip() pairs the elements of `key` with the elements of `value`.
dlist =[dict(zip(key, value)) for leaf in tree]
return dlist
这里的主要问题是:无论在 conversion 函数内部还是外部,
您都没有在每次循环迭代之后把结果追加(append)到列表中。
from nltk.chunk.regexp import RegexpParser
from nltk import Tree, pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
# Chunk grammar: tokens tagged NN or NNS form a "plate" chunk, and tokens
# tagged CD or DT form an "amount" chunk.
gram = r"""
plate: {<NN|NNS>}
amount: {<CD|DT>}
"""
# Chunk parser built from the grammar above.
cp = RegexpParser(gram)
# Sample input text to be split into sentences, tagged and chunked.
text = "I'll have 1 pizza and 4 sandwiches"
def conversion(tree):
    """Group the labelled chunks of a parsed sentence into dictionaries.

    Consecutive chunk subtrees are collected into a single dictionary that
    maps each chunk's label to its words joined by single spaces. Any item
    that is not a subtree (a plain tagged token) closes the current group.
    Empty groups are never emitted.

    Args:
        tree: An iterable of parse-tree children, each either a ``Tree``
            chunk or a tagged-token leaf.

    Returns:
        A list of ``{label: words}`` dictionaries in order of appearance.
    """
    groups = []
    current = {}
    for item in tree:
        if isinstance(item, Tree):
            # A repeated label within the same run overwrites the earlier value.
            current[item.label()] = ' '.join(leaf[0] for leaf in item.leaves())
        elif current:
            # A non-chunk token ends the run; keep the group only if non-empty.
            groups.append(current)
            current = {}
    # Flush the last group when the sentence ends on a chunk.
    if current:
        groups.append(current)
    return groups
# Tag and chunk every sentence of the text, producing one parse tree each.
parsed_text = [
    cp.parse(pos_tag(word_tokenize(sent)))
    for sent in sent_tokenize(text)
]
# Print the extracted dictionaries for each sentence.
for tree in parsed_text:
    print(conversion(tree))
# Expected output:
# [{'amount': '1', 'plate': 'pizza'}, {'amount': '4', 'plate': 'sandwiches'}]