Python 用于训练阿拉伯语 spacy NER 模型的代码未给出结果或错误
Python code for training Arabic spacy NER model not giving result or errors
这是为 NER 训练 spacy 模型的代码。我的数据集是阿拉伯语推文 JSON 文件。我通过 https://dataturks.com 机器学习工具在我的数据集中手动标记了位置,但代码无法运行。
我使用了这个 link 中的代码
https://dataturks.com/help/dataturks-ner-json-to-spacy-train.php
############################################ NOTE ########################################################
#
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
#
# Outputs the Spacy training data which can be used for Spacy training.
#
############################################################################################################
############################################################################################################
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export (JSON-lines) into spaCy training data.

    Each line of the input file is a JSON object with a ``content`` field
    (the raw text) and an ``annotation`` list (may be null).

    Returns a list of ``(text, {"entities": [(start, end, label), ...]})``
    tuples in the format expected by spaCy v2 training.

    Note: Dataturks offsets are both-inclusive ``[start, end]`` while spaCy
    expects end-exclusive ``[start, end)`` — hence the ``+ 1`` on the end.
    """
    training_data = []
    # The dataset is Arabic tweets: force UTF-8 so reading does not depend
    # on the platform's default encoding.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising readlines().
        for line in f:
            data = json.loads(line)
            text = data['content']
            entities = []
            annotations = data['annotation']
            if annotations:
                for annotation in annotations:
                    # Only a single point per annotation in this export format.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # 'label' may be a single string or a list of strings.
                    if not isinstance(labels, list):
                        labels = [labels]
                    for label in labels:
                        entities.append((point['start'], point['end'] + 1, label))
            training_data.append((text, {"entities": entities}))
    return training_data
train data
TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json")
TRAIN_DATA
output of the first three tweets
[('طقس حضرموت صور اوليه سيول وادي رخيه',
{'entities': [(26, 35, 'loc'), (4, 10, 'city')]}),
('سيول وادي العف قرية هدى بمديرية حبان بمحافظة شبوة جنوب اليمن اليوم الاحد مايو م تصوير عدنان القميشي',
{'entities': [(55, 60, 'country'),
(50, 54, 'pre'),
(45, 49, 'city'),
(32, 36, 'loc'),
(20, 23, 'loc'),
(5, 14, 'loc')]}),
('اول مرة قابلته جدة جاها سيول', {'entities': [(15, 18, 'city')]})]
Then train the spaCy NER model:
import spacy
import random
################### Train Spacy NER.###########
def train_spacy():
    """Train a blank Arabic spaCy NER model (spaCy v2 API) on Dataturks data.

    Loads the converted training data, registers every entity label, runs
    one training iteration, prints the losses, and does a quick sanity
    prediction.  Returns the trained ``Language`` object so callers can
    actually use (or save) the model — the original version discarded it.
    """
    TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json")
    nlp = spacy.blank('ar')  # create blank Arabic Language class

    # Get (or create) the built-in NER pipeline component.  The original
    # code left `ner` unbound when 'ner' was already in the pipeline,
    # which would raise NameError at add_label below.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every entity label that appears in the training data.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Disable all other pipes so only the NER component is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(1):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],           # batch of texts
                    [annotations],    # batch of annotations
                    drop=0.2,         # dropout - make it harder to memorise data
                    sgd=optimizer,    # callable to update weights
                    losses=losses)
            print(losses)

    # Quick sanity prediction on a throwaway sentence.
    doc = nlp("Samsing mobiles below 0")
    print("Entities= " + str(["" + str(ent.text) + "_" + str(ent.label_) for ent in doc.ents]))
    return nlp
# The original line was missing the call parentheses, so nothing ran.
train_spacy()
output error
Statring iteration 0
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-6b61c2d740cf> in <module>()
----> 1 train_spacy()
2 frames
/usr/local/lib/python3.6/dist-packages/spacy/language.py in _format_docs_and_golds(self, docs, golds)
470 err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
471 raise ValueError(err)
--> 472 gold = GoldParse(doc, **gold)
473 doc_objs.append(doc)
474 gold_objs.append(gold)
gold.pyx in spacy.gold.GoldParse.__init__()
gold.pyx in spacy.gold.biluo_tags_from_offsets()
ValueError: [E103] Trying to set conflicting doc.ents: '(42, 47, 'loc')' and '(34, 47, 'loc')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
My results are uploaded to Google Colab at the link below. Where is the problem?
https://drive.google.com/drive/folders/19t33kW4Dwtbv6s4vfMpa2kNwoVNzSu5I
spaCy 不允许重叠的实体,你应该删除重叠的实体。
您的代码将是:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export to spaCy training data, dropping
    overlapping entities (spaCy forbids a token belonging to two entities,
    which is what raises E103).

    When two annotations overlap, the longer span wins.  Dataturks offsets
    are both-inclusive ``[start, end]``; spaCy expects ``[start, end)``,
    hence the ``+ 1`` on the end offset.

    Returns a list of ``(text, {"entities": [(start, end, label), ...]})``.
    """
    training_data = []
    # Arabic tweets: force UTF-8 regardless of platform default encoding.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            text = data['content']

            # Collect (start, end, label, length) for every annotation.
            spans = []
            for annotation in data['annotation']:
                point = annotation['points'][0]
                label = annotation['label']
                spans.append((point['start'], point['end'], label,
                              point['end'] - point['start']))

            # Longest spans first, so they take priority over shorter
            # overlapping ones (sort is stable for equal lengths).
            spans.sort(key=lambda s: s[3], reverse=True)

            entities = []
            seen_tokens = set()
            for start, end, labels, _length in spans:
                # Skip any span overlapping one we already kept.  The
                # original checked `end - 1` and marked range(start, end),
                # missing overlaps at the final (inclusive) character.
                if start in seen_tokens or end in seen_tokens:
                    continue
                seen_tokens.update(range(start, end + 1))
                if not isinstance(labels, list):
                    labels = [labels]
                for label in labels:
                    entities.append((start, end + 1, label))
            # The original was missing this append's closing parenthesis
            # (a SyntaxError) and the return statement below.
            training_data.append((text, {"entities": entities}))
    return training_data
这是为 NER 训练 spacy 模型的代码。我的数据集是阿拉伯语推文 JSON 文件。我通过 https://dataturks.com 机器学习工具在我的数据集中手动标记了位置,但代码无法运行。
我使用了这个 link 中的代码 https://dataturks.com/help/dataturks-ner-json-to-spacy-train.php
############################################ NOTE ########################################################
#
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
#
# Outputs the Spacy training data which can be used for Spacy training.
#
############################################################################################################
############################################################################################################
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export (JSON-lines) into spaCy training data.

    Each line of the input file is a JSON object with a ``content`` field
    (the raw text) and an ``annotation`` list (may be null).

    Returns a list of ``(text, {"entities": [(start, end, label), ...]})``
    tuples in the format expected by spaCy v2 training.

    Note: Dataturks offsets are both-inclusive ``[start, end]`` while spaCy
    expects end-exclusive ``[start, end)`` — hence the ``+ 1`` on the end.
    """
    training_data = []
    # The dataset is Arabic tweets: force UTF-8 so reading does not depend
    # on the platform's default encoding.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising readlines().
        for line in f:
            data = json.loads(line)
            text = data['content']
            entities = []
            annotations = data['annotation']
            if annotations:
                for annotation in annotations:
                    # Only a single point per annotation in this export format.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # 'label' may be a single string or a list of strings.
                    if not isinstance(labels, list):
                        labels = [labels]
                    for label in labels:
                        entities.append((point['start'], point['end'] + 1, label))
            training_data.append((text, {"entities": entities}))
    return training_data
train data
TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json")
TRAIN_DATA
output of the first three tweets
[('طقس حضرموت صور اوليه سيول وادي رخيه',
{'entities': [(26, 35, 'loc'), (4, 10, 'city')]}),
('سيول وادي العف قرية هدى بمديرية حبان بمحافظة شبوة جنوب اليمن اليوم الاحد مايو م تصوير عدنان القميشي',
{'entities': [(55, 60, 'country'),
(50, 54, 'pre'),
(45, 49, 'city'),
(32, 36, 'loc'),
(20, 23, 'loc'),
(5, 14, 'loc')]}),
('اول مرة قابلته جدة جاها سيول', {'entities': [(15, 18, 'city')]})]
Then train the spaCy NER model:
import spacy
import random
################### Train Spacy NER.###########
def train_spacy():
    """Train a blank Arabic spaCy NER model (spaCy v2 API) on Dataturks data.

    Loads the converted training data, registers every entity label, runs
    one training iteration, prints the losses, and does a quick sanity
    prediction.  Returns the trained ``Language`` object so callers can
    actually use (or save) the model — the original version discarded it.
    """
    TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/Colab Notebooks/Name Entity Recognition/NERTweets.json")
    nlp = spacy.blank('ar')  # create blank Arabic Language class

    # Get (or create) the built-in NER pipeline component.  The original
    # code left `ner` unbound when 'ner' was already in the pipeline,
    # which would raise NameError at add_label below.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every entity label that appears in the training data.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Disable all other pipes so only the NER component is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(1):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],           # batch of texts
                    [annotations],    # batch of annotations
                    drop=0.2,         # dropout - make it harder to memorise data
                    sgd=optimizer,    # callable to update weights
                    losses=losses)
            print(losses)

    # Quick sanity prediction on a throwaway sentence.
    doc = nlp("Samsing mobiles below 0")
    print("Entities= " + str(["" + str(ent.text) + "_" + str(ent.label_) for ent in doc.ents]))
    return nlp
# The original line was missing the call parentheses, so nothing ran.
train_spacy()
output error
Statring iteration 0
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-6b61c2d740cf> in <module>()
----> 1 train_spacy()
2 frames
/usr/local/lib/python3.6/dist-packages/spacy/language.py in _format_docs_and_golds(self, docs, golds)
470 err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
471 raise ValueError(err)
--> 472 gold = GoldParse(doc, **gold)
473 doc_objs.append(doc)
474 gold_objs.append(gold)
gold.pyx in spacy.gold.GoldParse.__init__()
gold.pyx in spacy.gold.biluo_tags_from_offsets()
ValueError: [E103] Trying to set conflicting doc.ents: '(42, 47, 'loc')' and '(34, 47, 'loc')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
My results are uploaded to Google Colab at the link below. Where is the problem?
https://drive.google.com/drive/folders/19t33kW4Dwtbv6s4vfMpa2kNwoVNzSu5I
spaCy 不允许重叠的实体,你应该删除重叠的实体。您的代码将是:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export to spaCy training data, dropping
    overlapping entities (spaCy forbids a token belonging to two entities,
    which is what raises E103).

    When two annotations overlap, the longer span wins.  Dataturks offsets
    are both-inclusive ``[start, end]``; spaCy expects ``[start, end)``,
    hence the ``+ 1`` on the end offset.

    Returns a list of ``(text, {"entities": [(start, end, label), ...]})``.
    """
    training_data = []
    # Arabic tweets: force UTF-8 regardless of platform default encoding.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            text = data['content']

            # Collect (start, end, label, length) for every annotation.
            spans = []
            for annotation in data['annotation']:
                point = annotation['points'][0]
                label = annotation['label']
                spans.append((point['start'], point['end'], label,
                              point['end'] - point['start']))

            # Longest spans first, so they take priority over shorter
            # overlapping ones (sort is stable for equal lengths).
            spans.sort(key=lambda s: s[3], reverse=True)

            entities = []
            seen_tokens = set()
            for start, end, labels, _length in spans:
                # Skip any span overlapping one we already kept.  The
                # original checked `end - 1` and marked range(start, end),
                # missing overlaps at the final (inclusive) character.
                if start in seen_tokens or end in seen_tokens:
                    continue
                seen_tokens.update(range(start, end + 1))
                if not isinstance(labels, list):
                    labels = [labels]
                for label in labels:
                    entities.append((start, end + 1, label))
            # The original was missing this append's closing parenthesis
            # (a SyntaxError) and the return statement below.
            training_data.append((text, {"entities": entities}))
    return training_data