Tensorflow "Transformer model for language understanding" 与另一个数据集?

Tensorflow "Transformer model for language understanding" with another Dataset?

I have been following the official guide here (https://www.tensorflow.org/text/tutorials/transformer) to try to recreate the vanilla Transformer in TensorFlow. I noticed the dataset used is quite specific, and at the end of the guide it suggests trying the model with a different dataset.

But this is where I have been stuck for a long time! I am trying to use the WMT14 dataset (the one used in the original paper by Vaswani et al.): https://www.tensorflow.org/datasets/catalog/wmt14_translate#wmt14_translatede-en .

I have also tried the Multi30k and IWSLT datasets with Spacy, but is there a guide on how to get a dataset into the shape the model requires? Specifically, how to tokenize it. The official TF guide uses a pretrained tokenizer that is specific to its PT-EN dataset:

model_name = "ted_hrlr_translate_pt_en_converter"

I would like to know how to use a TF (BERT) tokenizer to tokenize the Spacy datasets. I have the PyTorch code below, but unfortunately I don't know how to adapt it to TensorFlow. Any help would be greatly appreciated!

import spacy
from torchtext import data, datasets  # `torchtext.legacy` in newer torchtext versions

spacy_de = spacy.load('de')  # 'de_core_news_sm' in spaCy 3.x
spacy_en = spacy.load('en')  # 'en_core_web_sm' in spaCy 3.x

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
TGT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, 
                 eos_token = EOS_WORD, pad_token=BLANK_WORD)

MAX_LEN = 100
train, val, test = datasets.IWSLT.splits(
    exts=('.de', '.en'), fields=(SRC, TGT), 
    filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
        len(vars(x)['trg']) <= MAX_LEN)
MIN_FREQ = 2
SRC.build_vocab(train.src, min_freq=MIN_FREQ)
TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

You can build your own tokenizer by following this tutorial: https://www.tensorflow.org/text/guide/subwords_tokenizer

It is exactly how the ted_hrlr_translate_pt_en_converter tokenizer in the Transformer example was built; you just need to adapt it to your language pair.

I rewrote it for you, but I have not tested it:

import collections
import logging
import os
import pathlib
import re
import string
import sys
import time
import numpy as np
#import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab



examples, metadata = tfds.load('wmt14_translate/de-en', with_info=True,
                               as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

for de_examples, en_examples in train_examples.batch(3).take(1):
  for de in de_examples.numpy():
    print(de.decode('utf-8'))

  print()

  for en in en_examples.numpy():
    print(en.decode('utf-8'))

train_en = train_examples.map(lambda de, en: en)
train_de = train_examples.map(lambda de, en: de)

bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 8000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

de_vocab = bert_vocab.bert_vocab_from_dataset(
    train_de.batch(1000).prefetch(2),
    **bert_vocab_args
)

print(de_vocab[:10])
print(de_vocab[100:110])
print(de_vocab[1000:1010])
print(de_vocab[-10:])

def write_vocab_file(filepath, vocab):
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

write_vocab_file('de_vocab.txt', de_vocab)

en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(2),
    **bert_vocab_args
)

print(en_vocab[:10])
print(en_vocab[100:110])
print(en_vocab[1000:1010])
print(en_vocab[-10:])

write_vocab_file('en_vocab.txt', en_vocab)

de_tokenizer = text.BertTokenizer('de_vocab.txt', **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)

# Tokenize the examples -> (batch, word, word-piece)
token_batch = en_tokenizer.tokenize(en_examples)
# Merge the word and word-piece axes -> (batch, tokens)
token_batch = token_batch.merge_dims(-2,-1)

for ex in token_batch.to_list():
  print(ex)

# Lookup each token id in the vocabulary.
txt_tokens = tf.gather(en_vocab, token_batch)
# Join with spaces.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  count = ragged.bounding_shape()[0]
  starts = tf.fill([count,1], START)
  ends = tf.fill([count,1], END)
  return tf.concat([starts, ragged, ends], axis=1)

words = en_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

def cleanup_text(reserved_tokens, token_txt):
  # Drop the reserved tokens, except for "[UNK]".
  bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
  bad_token_re = "|".join(bad_tokens)

  bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
  result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

  # Join them into strings.
  result = tf.strings.reduce_join(result, separator=' ', axis=-1)

  return result

token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2,-1)
words = en_tokenizer.detokenize(token_batch)

cleanup_text(reserved_tokens, words).numpy()

class CustomTokenizer(tf.Module):
  def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    return tf.constant(self._reserved_tokens)

tokenizers = tf.Module()
tokenizers.de = CustomTokenizer(reserved_tokens, 'de_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, 'en_vocab.txt')

model_name = 'wmt14_translate_de_en_converter'  # export directory; any name works
tf.saved_model.save(tokenizers, model_name)
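
If the export succeeds, you should be able to reload it the same way the tutorial loads its pretrained ted_hrlr_translate_pt_en_converter module (again untested, just following the tutorial's pattern):

reloaded_tokenizers = tf.saved_model.load(model_name)

# Tokenize a batch of strings -> ragged batch of token ids with [START]/[END] added.
tokens = reloaded_tokenizers.en.tokenize(['Hello TensorFlow!'])
print(tokens.to_list())

# Round-trip back to text; reserved tokens are stripped by cleanup_text().
round_trip = reloaded_tokenizers.en.detokenize(tokens)
print(round_trip.numpy()[0].decode('utf-8'))

print(reloaded_tokenizers.de.get_vocab_size().numpy())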

Actually, you can use a library called transformers, which will tokenize your text into the inputs the model needs, for example:

import transformers 
import tensorflow as tf 
from tensorflow import keras

tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer(["hello world", "this is me", "hello again"], return_tensors="tf", padding=True, truncation=True)

Here is the result:

{'input_ids': <tf.Tensor: shape=(3, 5), dtype=int32, numpy=
array([[ 101, 7592, 2088,  102,    0],
       [ 101, 2023, 2003, 2033,  102],
       [ 101, 7592, 2153,  102,    0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(3, 5), dtype=int32, numpy=
array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(3, 5), dtype=int32, numpy=
array([[1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 0]], dtype=int32)>}

As you can see, it is compatible with the TensorFlow Transformer implementation.

In this example I used the BERT tokenizer (but you can use many other tokenizers if you need to).
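
For a DE-EN pair you would simply tokenize each side with a matching checkpoint, e.g. a German BERT for the source and an English BERT for the target (the checkpoint names below are only examples; any tokenizer from the Hugging Face Hub is used the same way):

import transformers

# Example checkpoints for a DE-EN pair; swap in whatever fits your data.
de_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-german-cased")
en_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

batch_de = ["Guten Morgen.", "Wie geht es dir?"]
batch_en = ["Good morning.", "How are you?"]

# Each call returns input_ids / token_type_ids / attention_mask as tf.Tensors.
inputs = de_tokenizer(batch_de, return_tensors="tf", padding=True, truncation=True)
targets = en_tokenizer(batch_en, return_tensors="tf", padding=True, truncation=True)

print(inputs["input_ids"].shape, targets["input_ids"].shape)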