尝试在 spaCy 上使用匹配器添加新实体时出现 AssertionError
AssertionError on trying to add new entity using matcher on spaCy
我正在尝试在一批文档中匹配所有类似电子邮件地址的文本,并将其标注为名为 'EMAIL' 的自定义 NER 实体标签。
这是测试用例的代码。
# Minimal reproduction: label LIKE_EMAIL tokens as a custom 'EMAIL' entity
# from a Matcher callback. The import lines are omitted in this excerpt; it
# needs `import spacy` and `from spacy.matcher import Matcher`.
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
EMAIL = nlp.vocab.strings['EMAIL']


def add_email_ent(matcher, doc, i, matches):
    """Matcher callback: append the i-th match to doc.ents with the EMAIL label."""
    match_id, start, end = matches[i]
    doc.ents += ((EMAIL, start, end),)


matcher.add('EmailPII', add_email_ent, [{'LIKE_EMAIL': True}])
text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
# Calling the matcher invokes add_email_ent for each match.
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print (i+1, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
这是我运行这段代码时得到的结果:
Traceback (most recent call last):
File "C:/Python27/emailpii.py", line 26, in <module>
matches = matcher(doc)
File "matcher.pyx", line 407, in spacy.matcher.Matcher.__call__
File "C:/Python27/emailpii.py", line 19, in add_event_ent
doc.ents += ((EMAIL, start, end),)
File "doc.pyx", line 415, in spacy.tokens.doc.Doc.ents.__get__
File "span.pyx", line 61, in spacy.tokens.span.Span.__cinit__
AssertionError: 17587345535198158200
但是,当我运行下面这个类似的示例时:
import spacy
# Parenthesised single-string prints behave the same on Python 2 and also
# parse on Python 3.
print("*****************")
print(spacy.__version__)
print("*****************")
from spacy.matcher import Matcher
#from spacy import displacy
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
EVENT = nlp.vocab.strings['EVENT']


def add_event_ent(matcher, doc, i, matches):
    """Matcher callback: append the i-th match to doc.ents with the EVENT label."""
    match_id, start, end = matches[i]
    doc.ents += ((EVENT, start, end),)


# Two patterns under one key: "Google I/O" and "Google I/O <digits>".
matcher.add('GoogleIO', add_event_ent,
            [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}],
            [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}, {'IS_DIGIT': True}])
text = u"Google I/O was great this year. See you all again in Google I/O 2018"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print (i, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
#displacy.serve(doc, style = 'ent')
我得到了想要的输出:
2.0.1
(0, Google I/O)
(1, Google I/O)
(2, Google I/O 2018)
(u'Google I/O', u'EVENT')
(u'this year', u'DATE')
(u'Google I/O 2018', u'EVENT')
我是不是遗漏了什么?
我认为您的第一个代码失败是因为您没有为 'EMAIL' 添加实体标签。第二个代码有效,因为 EVENT 是预先存在的实体类型。
文档没有清楚说明 matcher.add() 方法的第一个参数的实际作用,但它会为您添加一个实体标签。下面有两个应该可行、并且能消除困惑的替代方案:
备选方案 1:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
#EMAIL = nlp.vocab.strings['EMAIL'] #Not needed


def add_email_ent(matcher, doc, i, matches):
    """Matcher callback: append the i-th match to doc.ents, using match_id as the label."""
    match_id, start, end = matches[i]
    doc.ents += ((match_id, start, end),)


# The pattern is registered under the key 'EMAIL'; the callback reuses each
# match's match_id as the entity label.
matcher.add('EMAIL', add_email_ent, [{'LIKE_EMAIL': True}])
text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print (i+1, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
备选方案 2(我不确定您为什么要这样做,因为您最终会得到两个本质上服务于相同目的的实体标签,但仅供说明之用):
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRecognizer

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# Register the 'EMAIL' label on an EntityRecognizer built on the same vocab
# before looking up its ID.
ner = EntityRecognizer(nlp.vocab)
ner.add_label('EMAIL')
EMAIL = nlp.vocab.strings['EMAIL']


def add_email_ent(matcher, doc, i, matches):
    """Matcher callback: append the i-th match to doc.ents with the EMAIL label."""
    match_id, start, end = matches[i]
    doc.ents += ((EMAIL, start, end),)


matcher.add('EmailPII', add_email_ent, [{'LIKE_EMAIL': True}])
text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print (i+1, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
我正在尝试在一批文档中匹配所有类似电子邮件地址的文本,并将其标注为名为 'EMAIL' 的自定义 NER 实体标签。这是测试用例的代码。
# Minimal reproduction: label LIKE_EMAIL tokens as a custom 'EMAIL' entity
# from a Matcher callback. The import lines are omitted in this excerpt; it
# needs `import spacy` and `from spacy.matcher import Matcher`.
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
EMAIL = nlp.vocab.strings['EMAIL']


def add_email_ent(matcher, doc, i, matches):
    """Matcher callback: append the i-th match to doc.ents with the EMAIL label."""
    match_id, start, end = matches[i]
    doc.ents += ((EMAIL, start, end),)


matcher.add('EmailPII', add_email_ent, [{'LIKE_EMAIL': True}])
text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
# Calling the matcher invokes add_email_ent for each match.
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print (i+1, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
这是我运行这段代码时得到的结果:
Traceback (most recent call last):
File "C:/Python27/emailpii.py", line 26, in <module>
matches = matcher(doc)
File "matcher.pyx", line 407, in spacy.matcher.Matcher.__call__
File "C:/Python27/emailpii.py", line 19, in add_event_ent
doc.ents += ((EMAIL, start, end),)
File "doc.pyx", line 415, in spacy.tokens.doc.Doc.ents.__get__
File "span.pyx", line 61, in spacy.tokens.span.Span.__cinit__
AssertionError: 17587345535198158200
但是,当我运行下面这个类似的示例时:
import spacy
# Parenthesised single-string prints behave the same on Python 2 and also
# parse on Python 3.
print("*****************")
print(spacy.__version__)
print("*****************")
from spacy.matcher import Matcher
#from spacy import displacy
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
EVENT = nlp.vocab.strings['EVENT']


def add_event_ent(matcher, doc, i, matches):
    """Matcher callback: append the i-th match to doc.ents with the EVENT label."""
    match_id, start, end = matches[i]
    doc.ents += ((EVENT, start, end),)


# Two patterns under one key: "Google I/O" and "Google I/O <digits>".
matcher.add('GoogleIO', add_event_ent,
            [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}],
            [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}, {'IS_DIGIT': True}])
text = u"Google I/O was great this year. See you all again in Google I/O 2018"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print (i, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
#displacy.serve(doc, style = 'ent')
我得到了想要的输出:
2.0.1
(0, Google I/O)
(1, Google I/O)
(2, Google I/O 2018)
(u'Google I/O', u'EVENT')
(u'this year', u'DATE')
(u'Google I/O 2018', u'EVENT')
我是不是遗漏了什么?
我认为您的第一个代码失败是因为您没有为 'EMAIL' 添加实体标签。第二个代码有效,因为 EVENT 是预先存在的实体类型。
文档没有清楚说明 matcher.add() 方法的第一个参数的实际作用,但它会为您添加一个实体标签。下面有两个应该可行、并且能消除困惑的替代方案:
备选方案 1:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
#EMAIL = nlp.vocab.strings['EMAIL'] #Not needed


def add_email_ent(matcher, doc, i, matches):
    """Matcher callback: append the i-th match to doc.ents, using match_id as the label."""
    match_id, start, end = matches[i]
    doc.ents += ((match_id, start, end),)


# The pattern is registered under the key 'EMAIL'; the callback reuses each
# match's match_id as the entity label.
matcher.add('EMAIL', add_email_ent, [{'LIKE_EMAIL': True}])
text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print (i+1, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)
备选方案 2(我不确定您为什么要这样做,因为您最终会得到两个本质上服务于相同目的的实体标签,但仅供说明之用):
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRecognizer

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# Register the 'EMAIL' label on an EntityRecognizer built on the same vocab
# before looking up its ID.
ner = EntityRecognizer(nlp.vocab)
ner.add_label('EMAIL')
EMAIL = nlp.vocab.strings['EMAIL']


def add_email_ent(matcher, doc, i, matches):
    """Matcher callback: append the i-th match to doc.ents with the EMAIL label."""
    match_id, start, end = matches[i]
    doc.ents += ((EMAIL, start, end),)


matcher.add('EmailPII', add_email_ent, [{'LIKE_EMAIL': True}])
text = u"Hi, this is John. My email is john@ymail.com and an alternate is john@gmail.com"
doc = nlp(text)
matches = matcher(doc)
for i, [match_id, start, end] in enumerate(matches):
    print (i+1, doc[start:end])
for ent in doc.ents:
    print (ent.text, ent.label_)