Custom sentence segmentation in spaCy
I want spaCy to use the sentence boundaries I provide instead of its own segmentation. For example:
get_sentences("Bob meets Alice. @SentBoundary@ They play together.")
# => ["Bob meets Alice.", "They play together."] # two sents
get_sentences("Bob meets Alice. They play together.")
# => ["Bob meets Alice. They play together."] # ONE sent
get_sentences("Bob meets Alice, @SentBoundary@ they play together.")
# => ["Bob meets Alice,", "they play together."] # two sents
Here is what I have so far (borrowing from the docs here):
import spacy

nlp = spacy.load('en_core_web_sm')

def mark_sentence_boundaries(doc):
    for i, token in enumerate(doc):
        if token.text == '@SentBoundary@':
            doc[i+1].sent_start = True
    return doc

nlp.add_pipe(mark_sentence_boundaries, before='parser')

def get_sentences(text):
    doc = nlp(text)
    return list(doc.sents)
But these are the results I get:
# Ex1
get_sentences("Bob meets Alice. @SentBoundary@ They play together.")
#=> ["Bob meets Alice.", "@SentBoundary@", "They play together."]
# Ex2
get_sentences("Bob meets Alice. They play together.")
#=> ["Bob meets Alice.", "They play together."]
# Ex3
get_sentences("Bob meets Alice, @SentBoundary@ they play together.")
#=> ["Bob meets Alice, @SentBoundary@", "they play together."]
The main problems I'm facing:
- How to remove the @SentBoundary@ token once a break is detected.
- How to stop spaCy from splitting when no @SentBoundary@ is present.
The following code works:
import spacy
from spacy.pipeline import SentenceSegmenter  # spaCy 2.x

nlp = spacy.load('en_core_web_sm')

def split_on_breaks(doc):
    start = 0
    seen_break = False
    for word in doc:
        if seen_break:
            # The @SentBoundary@ token sits at word.i - 1; slicing up to
            # word.i - 1 drops it from the yielded sentence span.
            yield doc[start:word.i-1]
            start = word.i
            seen_break = False
        elif word.text == '@SentBoundary@':
            seen_break = True
    if start < len(doc):
        yield doc[start:len(doc)]

sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_breaks)
nlp.add_pipe(sbd, first=True)

def get_sentences(text):
    doc = nlp(text)
    return list(doc.sents)  # convert to string if required.
# Ex1
get_sentences("Bob meets Alice. @SentBoundary@ They play together.")
# => ["Bob meets Alice.", "They play together."] # two sentences
# Ex2
get_sentences("Bob meets Alice. They play together.")
# => ["Bob meets Alice. They play together."] # ONE sentence
# Ex3
get_sentences("Bob meets Alice, @SentBoundary@ they play together.")
# => ["Bob meets Alice,", "they play together."] # two sentences
The right approach was to use a SentenceSegmenter rather than setting boundaries manually (examples here). This GitHub issue was also helpful. For completeness, here is a variant that keeps the manual sent_start marking but strips the @SentBoundary@ tokens by rebuilding the Doc:
import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
import numpy

nlp = spacy.load('en_core_web_sm')

def mark_sentence_boundaries(doc):
    indexes = []
    for i, token in enumerate(doc):
        if token.text == '@SentBoundary@':
            doc[i+1].sent_start = True
            indexes.append(token.i)
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    np_array = numpy.delete(np_array, indexes, axis=0)
    doc2 = Doc(doc.vocab, words=[t.text for i, t in enumerate(doc) if i not in indexes])
    doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
    return doc2

nlp.add_pipe(mark_sentence_boundaries, before='parser')

def get_sentences(text):
    doc = nlp(text)
    return list(doc.sents)
print(get_sentences("Bob meets Alice. @SentBoundary@ They play together."))
# => ["Bob meets Alice.", "They play together."] # two sents
print(get_sentences("Bob meets Alice. They play together."))
# => ["Bob meets Alice. They play together."] # ONE sent
print(get_sentences("Bob meets Alice, @SentBoundary@ they play together."))
# => ["Bob meets Alice,", "they play together."] # two sents