How can we use spaCy's minibatch and GoldParse to train an NER model using the BILUO tagging scheme?
My input data for the spaCy NER model is in the BILUO tagging scheme, and I want to keep using it that way as part of a requirement. When I simply train the model without minibatching (the commented-out part), it works fine. But I cannot figure out how to use minibatch and GoldParse here to improve the model's accuracy. Is my expectation even valid? I could not find a single example of this combination. Also, I have already trained the model with the start, end, label (offsets) format approach. Please help me figure out this part. My code is below:
import spacy
from spacy.gold import offsets_from_biluo_tags
from spacy.gold import biluo_tags_from_offsets
import random
from spacy.util import minibatch, compounding
from os import path
from tqdm import tqdm

def train_spacy(data, iterations, model=None):
    TRAIN_DATA = data
    print(f"downloads = {model}")
    if model is not None and path.exists(model):
        print(f"training existing model")
        nlp = spacy.load(model)
        print("Model is Loaded '%s'" % model)
    else:
        print(f"Creating new model")
        nlp = spacy.blank('en')  # create blank Language class
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')
    # Based on template, get labels and save those for further training
    LABEL = ["Name", "ORG"]
    for i in LABEL:
        # print(i)
        ner.add_label(i)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.entity.create_optimizer()
        tags = dict()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            # for text, annotations in tqdm(TRAIN_DATA):
            #     print(f"text={text}, an={annotations}")
            #     tags['entities'] = offsets_from_biluo_tags(nlp(text), annotations)
            #     print(f"a={tags}")
            #     nlp.update([text],  # batch of texts
            #                [tags],  # batch of annotations
            #                drop=0.5,  # dropout - make it harder to memorise data
            #                sgd=optimizer,  # callable to update weights
            #                losses=losses)
            #     print(losses)
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 16.0, 1.001))
            # type 2 with mini batch
            for batch in batches:
                texts, annotations = zip(*batch)
                print(texts)
                tags = {'entities': annotations}
                nlp.update(
                    texts,  # batch of texts
                    [tags],  # batch of annotations
                    drop=0.4,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer
                )
                print(losses)
    return nlp

data_biluo = [
    ('I am Shah Khan, I work in MS Co', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'B-ORG', 'L-ORG']),
    ('I am Tom Tomb, I work in Telecom Networks', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'B-ORG', 'L-ORG'])
]

model = train_spacy(data_biluo, 10)
model.to_disk('./Vectors/')
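
For reference, the start/end/label (offsets) format mentioned above and the BILUO tags are interconvertible with the spacy.gold helpers already imported in this script; a minimal sketch, assuming spaCy 2.x:

import spacy
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags

nlp = spacy.blank('en')
doc = nlp.make_doc('I am Shah Khan, I work in MS Co')
offsets = [(5, 14, 'Name'), (26, 31, 'ORG')]  # (start_char, end_char, label) format
tags = biluo_tags_from_offsets(doc, offsets)  # per-token BILUO tags
print(tags)
# ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG']
print(offsets_from_biluo_tags(doc, tags))  # and back again
# [(5, 14, 'Name'), (26, 31, 'ORG')]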
You have 2 problems with your minibatch:

- tags should be an iterable of NER annotations with offsets (the fix below builds one GoldParse per text instead);
- your data_biluo does not account for the , in the middle of the sentences (see the quick check below).
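
The second point is easy to verify by tokenizing one of the training sentences; a quick check, again assuming spaCy 2.x:

import spacy

nlp = spacy.blank('en')
print([t.text for t in nlp.make_doc('I am Shah Khan, I work in MS Co')])
# ['I', 'am', 'Shah', 'Khan', ',', 'I', 'work', 'in', 'MS', 'Co']
# 10 tokens, but the original data_biluo provides only 9 tags: the comma needs its own 'O'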
Once you correct those, you are good to go:
import spacy
from spacy.gold import offsets_from_biluo_tags, GoldParse
from spacy.util import minibatch, compounding
import random
from os import path  # needed for path.exists() below
from tqdm import tqdm

def train_spacy(data, iterations, model=None):
    TRAIN_DATA = data
    print(f"downloads = {model}")
    if model is not None and path.exists(model):
        print(f"training existing model")
        nlp = spacy.load(model)
        print("Model is Loaded '%s'" % model)
    else:
        print(f"Creating new model")
        nlp = spacy.blank('en')  # create blank Language class
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')
    # Based on template, get labels and save those for further training
    LABEL = ["Name", "ORG"]
    for i in LABEL:
        ner.add_label(i)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.entity.create_optimizer()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 16.0, 1.001))
            # type 2 with mini batch
            for batch in batches:
                texts, _ = zip(*batch)
                # build one GoldParse per text directly from its BILUO tags
                golds = [GoldParse(nlp.make_doc(t), entities=a) for t, a in batch]
                nlp.update(
                    texts,  # batch of texts
                    golds,  # batch of GoldParse annotations
                    drop=0.4,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer
                )
                print(losses)
    return nlp

data_biluo = [
    ('I am Shah Khan, I work in MS Co', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG']),
    ('I am Tom Tomb, I work in Telecom Networks', ['O', 'O', 'B-Name', 'L-Name', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG'])
]

model = train_spacy(data_biluo, 10)
Starting iteration 0
{'ner': 17.999998331069946}
Starting iteration 1
{'ner': 16.6766300201416}
Starting iteration 2
{'ner': 16.997647166252136}
Starting iteration 3
{'ner': 16.486496448516846}
Starting iteration 4
{'ner': 15.695325374603271}
Starting iteration 5
{'ner': 14.312554001808167}
Starting iteration 6
{'ner': 12.099276185035706}
Starting iteration 7
{'ner': 11.473928153514862}
Starting iteration 8
{'ner': 8.814643770456314}
Starting iteration 9
{'ner': 7.233813941478729}
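
As a quick sanity check (a sketch continuing from the script above; with only two training sentences the predictions may not be reliable), you can run the returned pipeline over a training sentence and then persist it as the question's code did:

doc = model('I am Shah Khan, I work in MS Co')
print([(ent.text, ent.label_) for ent in doc.ents])
# expected if training converged: [('Shah Khan', 'Name'), ('MS Co', 'ORG')]

model.to_disk('./Vectors/')  # save the trained pipeline for a later spacy.load('./Vectors/')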