Implementing The 'Learning To Read With Tensorflow' Talk From TF Summit 2020 - EncoderDecoder Seq2Seq Model In Tensorflow 2.1/2.2 - Custom Train Step
Background Information
I am creating Google Colabs for each of the talks from the TensorFlow Summit 2020 that I found interesting. Please note that I am using TensorFlow 2.1.
I am running into problems trying to implement the 'Learning To Read With Tensorflow' talk.
Everything is fine up until the EncoderDecoder class definition. When I call the fit method on my custom Model subclass, I receive an error, which is detailed below.
The final fatal error is AttributeError: 'NoneType' object has no attribute 'dtype'.
However, I believe it is caused by a problem in the code inside the GradientTape scope and/or in the definition of the Decoder Layers (including the Attention Layers).
Main Code
import tensorflow as tf
import tensorflow_addons as tfa

# Not normally defined here... but doing so for clarity
MAX_VOCAB_SIZE = 5000
WINDOW_LENGTH = 11

class EncoderDecoder(tf.keras.Model):
    def __init__(self,
                 max_features=MAX_VOCAB_SIZE,
                 output_seq_len=WINDOW_LENGTH-1,
                 embedding_dims=200,
                 rnn_units=512):
        super().__init__()
        self.max_features = max_features
        self.output_seq_len = output_seq_len
        self.embedding_dims = embedding_dims
        self.rnn_units = rnn_units

        self.vectorize_layer = \
            tf.keras.layers.experimental.preprocessing.TextVectorization(
                max_tokens=self.max_features,
                standardize='lower_and_strip_punctuation',
                split='whitespace',
                ngrams=None,
                output_mode='int',
                output_sequence_length=self.output_seq_len,
                pad_to_max_tokens=True)

        # --- <ENCODER STUFF> ---
        # Embedding
        self.encoder_embedding = \
            tf.keras.layers.Embedding(input_dim=self.max_features+1,
                                      output_dim=self.embedding_dims)
        # ENCODER
        self.lstm_layer = \
            tf.keras.layers.LSTM(units=self.rnn_units,
                                 return_state=True)
        # --- </ENCODER STUFF> ---

        # --- <DECODER STUFF> ---
        # Embedding
        self.decoder_embedding = \
            tf.keras.layers.Embedding(input_dim=self.max_features+1,
                                      output_dim=self.embedding_dims)

        # ---------------- MAYBE NOT NECESSARY ----------------
        # Sampler (for use during training)
        # This was not shown during the talk but it is pretty obvious
        sampler = tfa.seq2seq.sampler.TrainingSampler()
        # This was not shown during the talk but is required...
        # This is my best guess
        decoder_cell = tf.keras.layers.LSTMCell(units=self.rnn_units)
        # ---------------- MAYBE NOT NECESSARY ----------------

        # Output Layer For Decoder
        self.projection_layer = \
            tf.keras.layers.Dense(self.max_features)
        # DECODER
        self.decoder = \
            tfa.seq2seq.BasicDecoder(cell=decoder_cell,
                                     sampler=sampler,
                                     output_layer=self.projection_layer)
        # --- </DECODER STUFF> ---

        # --- <ATTN STUFF> ---
        # Basic dense attention layer to connect Encoder & Decoder
        self.attention = tf.keras.layers.Attention()
        # --- </ATTN STUFF> ---

    def train_step(self, data):
        """ Override the built-in train_step method

        Args:
            data (tuple): The example (ten `words`), and the label (one `word`)

        Returns:
            Metric results for all passed metrics
        """
        # Split data into example (x) and label (y)
        x, y = data[0], data[1]
        # Vectorize the example words (x)
        x = self.vectorize_layer(x)
        # Vectorize the labels
        # This will by default pad the output to 10 ... but we only need the
        # first entry (the true label, not the useless padding)
        y = self.vectorize_layer(y)[:, 0]
        # Convert our label into a one-hot encoding based on the max number of
        # features that we will be using for our model
        y_one_hot = tf.one_hot(y, self.max_features)

        # Everything within GradientTape is recorded
        # for later automatic differentiation
        with tf.GradientTape() as tape:
            # --- <ENCODER STUFF> ---
            # Transform the example utilizing the encoder embedding
            inputs = self.encoder_embedding(x)
            # Get the encoder outputs and state by
            # utilizing the encoder (lstm_layer)
            # - encoder_outputs : [max_time, batch_size, num_units]
            # - encoder_state : [state_h, state_c]
            #   * state_h --- The Hidden State
            #   * state_c --- The Cell State
            encoder_outputs, state_h, state_c = self.lstm_layer(inputs)
            # --- </ENCODER STUFF> ---

            # --- <ATTN STUFF> ---
            # Pass the encoder outputs and hidden state allowing us
            # to track the intermediate state coming out of the encoder layers
            attn_output = self.attention([encoder_outputs, state_h])
            attn_output = tf.expand_dims(attn_output, axis=1)
            # --- </ATTN STUFF> ---

            # --- <DECODER STUFF> ---
            # ??? Create an empty embedding ???
            targets = self.decoder_embedding(tf.zeros_like(y))
            # Concat the output of the attention layer to the last axis
            # of the empty targets embedding
            concat_output = tf.concat([targets, attn_output], axis=-1)
            # Predict the targets using the state from the encoder
            outputs, _, _ = \
                self.decoder(concat_output, initial_state=[state_h, state_c])
            # --- </DECODER STUFF> ---

        # Automatically differentiate utilizing the loss and trainable variables
        gradients = tape.gradient(loss, trainable_variables)
        # Collect the outputs so that they can be optimized
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))
        # Update the metric state prior to return
        self.compiled_metrics.update_state(y_one_hot, y_pred)
        return {m.name: m.result() for m in self.metrics}

model = EncoderDecoder()
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer="adam",
              metrics=["accuracy"])

model.vectorize_layer.adapt(lines.batch(256))

# ERROR OCCURS ON THIS LINE
model.fit(data.batch(256),
          epochs=45,
          callbacks=[tf.keras.callbacks.ModelCheckpoint(filepath='text_gen')])
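As an aside, the train_step above references loss, trainable_variables, and y_pred without ever defining them, and Model.fit only dispatches to a custom train_step starting with TF 2.2. Here is a rough, hypothetical sketch (my own guess, not the code from the talk) of how those pieces usually fit together in a TF 2.2+ subclassed model:

import tensorflow as tf

class TinyModel(tf.keras.Model):
    """Hypothetical toy model used only to illustrate the custom train_step pattern."""
    def __init__(self, num_classes=5000):
        super().__init__()
        self.dense = tf.keras.layers.Dense(num_classes)

    def call(self, inputs):
        # A subclassed model still needs call() so Keras can build its outputs
        return self.dense(inputs)

    def train_step(self, data):
        x, y = data
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)        # forward pass
            loss = self.compiled_loss(y, y_pred)   # the loss passed to compile()
        # Differentiate w.r.t. the model's own trainable variables
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.compiled_metrics.update_state(y, y_pred)
        return {m.name: m.result() for m in self.metrics}

The key points are that loss is computed inside the tape (here via self.compiled_loss) and that the gradients are taken with respect to self.trainable_variables.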
Detailed Error Message
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-40-779906f7f617> in <module>()
1 model.fit(data.batch(256),
2 epochs=45,
----> 3 callbacks=[tf.keras.callbacks.ModelCheckpoint(filepath='text_gen')])
8 frames
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 max_queue_size=max_queue_size,
818 workers=workers,
--> 819 use_multiprocessing=use_multiprocessing)
820
821 def evaluate(self,
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
233 max_queue_size=max_queue_size,
234 workers=workers,
--> 235 use_multiprocessing=use_multiprocessing)
236
237 total_samples = _get_total_number_of_samples(training_data_adapter)
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
591 max_queue_size=max_queue_size,
592 workers=workers,
--> 593 use_multiprocessing=use_multiprocessing)
594 val_adapter = None
595 if validation_data:
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_v2.py in _process_inputs(model, mode, x, y, batch_size, epochs, sample_weights, class_weights, shuffle, steps, distribution_strategy, max_queue_size, workers, use_multiprocessing)
704 max_queue_size=max_queue_size,
705 workers=workers,
--> 706 use_multiprocessing=use_multiprocessing)
707
708 return adapter
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, standardize_function, **kwargs)
700
701 if standardize_function is not None:
--> 702 x = standardize_function(x)
703
704 # Note that the dataset instance is immutable, its fine to reusing the user
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_v2.py in standardize_function(dataset)
658 model.sample_weight_mode = getattr(model, 'sample_weight_mode', None)
659
--> 660 standardize(dataset, extract_tensors_from_dataset=False)
661
662 # Then we map using only the tensor standardization portion.
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training.py in _standardize_user_data(self, x, y, sample_weight, class_weight, batch_size, check_steps, steps_name, steps, validation_split, shuffle, extract_tensors_from_dataset)
2358 is_compile_called = False
2359 if not self._is_compiled and self.optimizer:
-> 2360 self._compile_from_inputs(all_inputs, y_input, x, y)
2361 is_compile_called = True
2362
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training.py in _compile_from_inputs(self, all_inputs, target, orig_inputs, orig_target)
2578 if training_utils.has_tensors(target):
2579 target = training_utils.cast_if_floating_dtype_and_mismatch(
-> 2580 target, self.outputs)
2581 training_utils.validate_input_types(target, orig_target,
2582 allow_dict=False, field_name='target')
/tensorflow-2.1.0/python3.6/tensorflow_core/python/keras/engine/training_utils.py in cast_if_floating_dtype_and_mismatch(targets, outputs)
1334 if tensor_util.is_tensor(targets):
1335 # There is one target, so output[0] should be the only output.
-> 1336 return cast_single_tensor(targets, dtype=outputs[0].dtype)
1337 new_targets = []
1338 for target, out in zip(targets, outputs):
AttributeError: 'NoneType' object has no attribute 'dtype'
How To Obtain The data And lines Variables (If You Wish To Reproduce This)
Getting The Data
>>> wget http://www.thespermwhale.com/jaseweston/babi/CBTest.tgz
>>> tar zxvf CBTest.tgz
>>> rm -rf CBTest.tgz
Preprocessing The Data
# Load data from a dataset comprising lines
# from one or more text files.
lines = tf.data.TextLineDataset("<path-to>/cbt_train.txt")
# Filter Out Title Lines First
# This simple fn is not included in this StackOverflow code
lines = lines.filter(lambda x: not is_title(x))
# Then We Remove All Punctuation
# This simple fn is not included in this StackOverflow code
lines = lines.map(lambda x: remove_punc(x))
# Then We Remove All Extra Spaces Created By The Previous FN
# This simple fn is not included in this StackOverflow code
lines = lines.map(lambda x: remove_extra_spaces(x))
# Then We Turn All The Uppercase Letters into Lowercase Letters
# This simple fn is not included in this StackOverflow code
lines = lines.map(lambda x: make_lower(x))
# Get words from lines
words = lines.map(tf.strings.split)
words = words.unbatch()
# Get wordsets
wordsets = words.batch(11)
# get_example_label is a simple fn to split wordsets into examples and labels
# First ten words are the example and last word is the label
data = wordsets.map(get_example_label)
# Shuffle
data = data.shuffle(1024)
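For anyone reproducing this, the helper fns referenced above might look roughly like the following. These are hypothetical sketches based only on the comments, not the exact functions from my notebook:

import tensorflow as tf

def is_title(line):
    # Assumption: title lines in the CBT files contain the "_BOOK_TITLE_" marker
    return tf.strings.regex_full_match(line, ".*_BOOK_TITLE_.*")

def remove_punc(line):
    # Replace anything that is not a letter, digit, or space
    return tf.strings.regex_replace(line, "[^a-zA-Z0-9 ]", " ")

def remove_extra_spaces(line):
    # Collapse the runs of whitespace left behind by remove_punc
    return tf.strings.regex_replace(tf.strings.strip(line), " +", " ")

def make_lower(line):
    return tf.strings.lower(line)

def get_example_label(wordset):
    # First ten words are the example; the eleventh word is the label
    return wordset[:-1], wordset[-1:]

(One caveat: inside Dataset.filter, the negation would typically need to be tf.logical_not(is_title(x)) rather than not is_title(x), since Python's not cannot be applied to a symbolic tensor.)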
References
Thanks in advance!!
UPDATE
It appears that TensorFlow has released tutorials detailing all of the demos given at the summit.
As a result, you can inspect the actual code and work out the differences between their code and yours. I won't post the differences here, as they are more substantial than I initially thought.
Link
Additional Resources
When I reached out to TensorFlow, they also suggested that I look at the transformer tutorial, which details how to implement a sophisticated encoder-decoder plus self-attention network.
- Additionally, they pointed me to their TF2 implementation of BERT, so I am including that link as well.
Hope these resources help!