Strange results with huggingface transformer[marianmt] translation of larger text
I need to translate a large amount of text from a database. For the last few days I have therefore been working with transformers and models. I am by no means a data science expert and unfortunately I am not getting any further.
The problem starts with longer texts. The second problem is the usual maximum token size of the sequencer (512). Simply truncating is not really an option. Here I did find a workaround, but it does not work properly: on longer texts (> 300 tokens) the result is word salad.
Here is an example (please ignore the warnings, that is a separate issue and currently does little harm):
If I use the example sentence 2 times (55 tokens) or 5 times (163 tokens), there is no problem.
But it breaks down with, for example, 433 tokens (the 3rd green text block in the screenshot).
For more than 510 tokens I tried splitting the input into chunks, as described in the link above. But the results there are strange as well.
I am quite sure I have made more than one mistake and underestimated this topic.
But I do not see another (free/cheap) way to translate a huge amount of text.
Could you help me? What errors (in my thinking) do you see, and how would you recommend solving them? Many thanks in advance.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

if torch.cuda.is_available():
    dev = "cuda"
else:
    dev = "cpu"
device = torch.device(dev)

mname = 'Helsinki-NLP/opus-mt-de-en'
tokenizer = AutoTokenizer.from_pretrained(mname)
model = AutoModelForSeq2SeqLM.from_pretrained(mname)
model.to(device)

chunksize = 512

text_short = "Nach nur sieben Seiten appellierte man an die Wählerinnen und Wähler, sich richtig zu entscheiden, nämlich für Frieden, Freiheit, Sozialismus. "
text_long = text_short
# this loop is just for debugging/testing and simulating long text
for x in range(30):
    text_long = text_long + text_short

tokens = tokenizer.encode_plus(text_long, return_tensors="pt", add_special_tokens=True, padding=False, truncation=False).to(device)
str_len = len(tokens['input_ids'][0])

if str_len > 510:
    # split into chunks of 510 tokens; we also convert to list (default is tuple, which is immutable)
    input_id_chunks = list(tokens['input_ids'][0].split(chunksize - 2))
    mask_chunks = list(tokens['attention_mask'][0].split(chunksize - 2))
    cnt = 1
    for tensor in input_id_chunks:
        print('\033[96m' + 'chunk ' + str(cnt) + ': ' + str(len(tensor)) + '\033[93m')
        cnt += 1
    # loop through each chunk
    # https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f
    for i in range(len(input_id_chunks)):
        # add CLS and SEP tokens to the input IDs
        input_id_chunks[i] = torch.cat([
            torch.tensor([101]).to(device), input_id_chunks[i], torch.tensor([102]).to(device)
        ])
        # add attention tokens to the attention mask
        mask_chunks[i] = torch.cat([
            torch.tensor([1]).to(device), mask_chunks[i], torch.tensor([1]).to(device)
        ])
        # get the required padding length
        pad_len = chunksize - input_id_chunks[i].shape[0]
        # check if the tensor length satisfies the required chunk size
        if pad_len > 0:
            # if the padding length is more than 0, we must add padding
            input_id_chunks[i] = torch.cat([
                input_id_chunks[i], torch.Tensor([0] * pad_len).to(device)
            ])
            mask_chunks[i] = torch.cat([
                mask_chunks[i], torch.Tensor([0] * pad_len).to(device)
            ])
    input_ids = torch.stack(input_id_chunks)
    attention_mask = torch.stack(mask_chunks)
    input_dict = {'input_ids': input_ids.long(), 'attention_mask': attention_mask.int()}
    outputs = model.generate(**input_dict)
    # this doesn't work - the following error appears on the console --> "host_softmax" not implemented for 'Long'
    # probs = torch.nn.functional.softmax(outputs[0], dim=-1)
    # probs = probs.mean(dim=0)
else:
    tokens["input_ids"] = tokens["input_ids"][:, :512]  # truncating is normally not necessary
    tokens["attention_mask"] = tokens["attention_mask"][:, :512]
    outputs = model.generate(**tokens)

decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print('\033[94m' + str(str_len))
print('\033[92m' + decoded)
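A side note on the commented-out softmax: model.generate returns a LongTensor of generated token IDs (one row per chunk), not logits, which is why applying softmax to it raises "host_softmax" not implemented for 'Long'. A minimal sketch, assuming the variables from the chunked branch above, that decodes every generated row instead of only the first:

# generate() yields token IDs (dtype long), one row per input chunk;
# there is nothing to softmax here - decode each row and join the pieces
decoded_chunks = [tokenizer.decode(row, skip_special_tokens=True) for row in outputs]
decoded = " ".join(decoded_chunks)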
Note: the following libraries are required:
pip3 install torch==1.9.0+cu102 torchvision==0.10.0+cu102 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
pip install transformers
pip install sentencepiece
To translate long text with transformers, you can split the text into paragraphs, split each paragraph into sentences, and then feed the sentences to the model in batches. In any case, it is better to translate with MarianMT sentence by sentence, because it can lose parts of the text if you feed it a long passage as a single piece.
from transformers import MarianMTModel, MarianTokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import LineTokenizer
import math
import torch

if torch.cuda.is_available():
    dev = "cuda"
else:
    dev = "cpu"
device = torch.device(dev)

mname = 'Helsinki-NLP/opus-mt-de-en'
tokenizer = MarianTokenizer.from_pretrained(mname)
model = MarianMTModel.from_pretrained(mname)
model.to(device)

lt = LineTokenizer()
batch_size = 8

text_short = "Nach nur sieben Seiten appellierte man an die Wählerinnen und Wähler, sich richtig zu entscheiden, nämlich für Frieden, Freiheit, Sozialismus. "
text_long = text_short * 30

# split the text into paragraphs (lines), then each paragraph into sentences
paragraphs = lt.tokenize(text_long)
translated_paragraphs = []

for paragraph in paragraphs:
    sentences = sent_tokenize(paragraph)
    batches = math.ceil(len(sentences) / batch_size)
    translated = []
    for i in range(batches):
        # translate batch_size sentences at a time
        sent_batch = sentences[i * batch_size:(i + 1) * batch_size]
        model_inputs = tokenizer(sent_batch, return_tensors="pt", padding=True, truncation=True, max_length=500).to(device)
        with torch.no_grad():
            translated_batch = model.generate(**model_inputs)
        translated += translated_batch
    # decode the generated token IDs and reassemble the paragraph
    translated = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    translated_paragraphs += [" ".join(translated)]

translated_text = "\n".join(translated_paragraphs)
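One setup detail: sent_tokenize relies on NLTK's punkt sentence tokenizer, whose data must be downloaded once before the snippet above will run:

import nltk
nltk.download('punkt')

After that, translated_text holds the English translation with the original paragraph (line) structure preserved.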