How to define a new Tensor with a dynamic shape to support batching in a custom layer
I am trying to implement a custom layer that preprocesses a tokenized sequence of words into a matrix with a predefined number of elements equal to the size of the vocabulary. Essentially, I'm trying to implement a 'bag of words' layer. This is the closest I could come up with:
def get_encoder(vocab_size=args.vocab_size):
    encoder = TextVectorization(max_tokens=vocab_size)
    encoder.adapt(train_dataset.map(lambda text, label: text))
    return encoder

class BagOfWords(tf.keras.layers.Layer):
    def __init__(self, vocab_size=args.small_vocab_size, batch_size=args.batch_size):
        super(BagOfWords, self).__init__()
        self.vocab_size = vocab_size
        self.batch_size = batch_size

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, inputs):
        if inputs.shape[-1] == None:
            return tf.constant(np.zeros([self.batch_size, self.vocab_size]))  # 32 is the batch size
        outputs = tf.zeros([self.batch_size, self.vocab_size])
        if inputs.shape[-1] != None:
            for i in range(inputs.shape[0]):
                for ii in range(inputs.shape[-1]):
                    output_idx = inputs[i][ii]
                    outputs[i][output_idx] = outputs[i][output_idx] + 1
        return outputs

model = keras.models.Sequential()
model.add(encoder)
model.add(bag_of_words)
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))
Unsurprisingly, I get an error when I call fit() on the model: "Incompatible shapes: [8,1] vs. [32,1]". It happens on the last step, when the batch size is less than 32.
My question is: performance aside, how do I define the output tensor for my bag-of-words matrix so that it has a dynamic shape for batching and my code works?
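A stopgap I'm aware of is to drop the ragged final batch when batching, which sidesteps rather than solves the dynamic-shape problem; a minimal sketch, assuming a batch size of 32:

# Every batch then has exactly 32 examples; any leftover examples are discarded.
train_dataset = train_dataset.batch(32, drop_remainder=True)

But I'd rather have the layer itself handle any batch size.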
Edit 1
Following the comments, I realized the code indeed doesn't work, because it never reaches the 'else' branch.
I edited it a bit so that it uses only tf functions:
class BagOfWords(tf.keras.layers.Layer):
    def __init__(self, vocab_size=args.small_vocab_size, batch_size=args.batch_size):
        super(BagOfWords, self).__init__()
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.outputs = tf.Variable(tf.zeros([batch_size, vocab_size]))

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, inputs):
        if tf.shape(inputs)[-1] == None:
            return tf.zeros([self.batch_size, self.vocab_size])
        self.outputs.assign(tf.zeros([self.batch_size, self.vocab_size]))
        for i in range(tf.shape(inputs)[0]):
            for ii in range(tf.shape(inputs)[-1]):
                output_idx = inputs[i][ii]
                if output_idx >= tf.constant(self.vocab_size, dtype=tf.int64):
                    output_idx = tf.constant(1, dtype=tf.int64)
                self.outputs[i][output_idx].assign(self.outputs[i][output_idx] + 1)
        return self.outputs
It didn't help though: AttributeError: 'Tensor' object has no attribute 'assign'.
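As I understand it, the AttributeError comes from the chained indexing: self.outputs[i] already evaluates to a plain (immutable) Tensor, so the following [output_idx].assign(...) has nothing to attach to. A tf.Variable does support sliced assignment, but only through a single subscript expression; a minimal sketch of the difference (the variable v is just for illustration):

import tensorflow as tf

v = tf.Variable(tf.zeros([4, 8]))
v[1, 3].assign(v[1, 3] + 1.0)  # works: a single strided slice of the Variable
# v[1][3].assign(...)          # fails: v[1] is a Tensor, which has no .assign

Even with that fixed, a Variable whose first dimension is hard-coded to batch_size would still break on the smaller final batch.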
Correct me if I'm wrong, but I think using the TextVectorization layer with output_mode="multi_hot" would be enough to do what you want. According to the docs, the multi_hot output mode:
Outputs a single int array per batch, of either vocab_size or max_tokens size, containing 1s in all elements where the token mapped to that index exists at least once in the batch item
So it can be as simple as this:
import tensorflow as tf

def get_encoder():
    encoder = tf.keras.layers.TextVectorization(output_mode="multi_hot")
    encoder.adapt(train_dataset.map(lambda text, label: text))
    return encoder

texts = [
    'All my cats in a row',
    'When my cat sits down, she looks like a Furby toy!',
    'The cat from outer space',
    'Sunshine loves to sit like this for some reason.']
labels = [[1], [0], [1], [1]]
train_dataset = tf.data.Dataset.from_tensor_slices((texts, labels))

model = tf.keras.Sequential()
model.add(get_encoder())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy())
model.fit(train_dataset.batch(2), epochs=2)
And this is how your texts would be encoded:
import tensorflow as tf

texts = ['All my cats in a row',
         'When my cat sits down, she looks like a Furby toy!',
         'The cat from outer space',
         'Sunshine loves to sit like this for some reason.']
encoder = get_encoder()
inputs = encoder(texts)
print(inputs)
tf.Tensor(
[[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
0. 0. 1. 1.]
[0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0.
0. 1. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.
0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0.
1. 0. 0. 0.]], shape=(4, 28), dtype=float32)
So, just as you attempted in your custom layer, words that are present in a sequence are marked with 1 and words that are not present are marked with 0.
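Note that multi_hot only records presence or absence. If you actually want counts (a classic bag of words), I believe TextVectorization also supports output_mode="count", which returns how often each token occurs instead of a 0/1 flag; a sketch under that assumption:

def get_count_encoder():
    # Same idea as above, but each position holds the token's frequency
    # within the batch item instead of a 0/1 presence flag.
    encoder = tf.keras.layers.TextVectorization(output_mode="count")
    encoder.adapt(train_dataset.map(lambda text, label: text))
    return encoder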
Here is an example of a bag-of-words custom keras layer, without using any additional preprocessing layers:
import tensorflow as tf

class BagOfWords(tf.keras.layers.Layer):
    def __init__(self, vocabulary_size):
        super(BagOfWords, self).__init__()
        # Note: vocabulary_size is expected to be a tensor of vocabulary words here.
        self.vocabulary_size = vocabulary_size

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        outputs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        for i in range(batch_size):
            string = inputs[i]
            # Drop the b'' padding introduced by to_tensor().
            string_length = tf.shape(tf.where(tf.math.not_equal(string, b'')))[0]
            string = string[:string_length]
            string_array = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
            for s in string:
                # One-hot row: 1.0 wherever the token equals a vocabulary entry.
                string_array = string_array.write(string_array.size(), tf.where(tf.equal(s, self.vocabulary_size), 1.0, 0.0))
            # Collapse the per-token rows into a single presence vector.
            outputs = outputs.write(i, tf.cast(tf.reduce_any(tf.cast(string_array.stack(), dtype=tf.bool), axis=0), dtype=tf.float32))
        return outputs.stack()
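What makes this work for any batch size, including the smaller final one, is that batch_size is read from tf.shape(inputs)[0] at call time and the TensorArray grows dynamically, so nothing in the layer is pinned to a fixed batch dimension.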
And here are the manual preprocessing steps and the model:
labels = [[1], [0], [1], [0]]
texts = ['All my cats in a row',
         'When my cat sits down, she looks like a Furby toy!',
         'The cat from the outer space',
         'Sunshine loves to sit like this for some reason.']

DEFAULT_STRIP_REGEX = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']'
tensor_of_strings = tf.constant(texts)
tensor_of_strings = tf.strings.lower(tensor_of_strings)
tensor_of_strings = tf.strings.regex_replace(tensor_of_strings, DEFAULT_STRIP_REGEX, "")
split_strings = tf.strings.split(tensor_of_strings).to_tensor()
flattened_split_strings = tf.reshape(split_strings, (split_strings.shape[0] * split_strings.shape[1]))
unique_words, _ = tf.unique(flattened_split_strings)
unique_words = tf.random.shuffle(unique_words)

bag_of_words = BagOfWords(vocabulary_size=unique_words)
train_dataset = tf.data.Dataset.from_tensor_slices((split_strings, labels))

model = tf.keras.Sequential()
model.add(bag_of_words)
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy())
model.fit(train_dataset.batch(2), epochs=2)
Epoch 1/2
4/4 [==============================] - 2s 7ms/step - loss: 0.7081
Epoch 2/2
4/4 [==============================] - 0s 6ms/step - loss: 0.7008
<keras.callbacks.History at 0x7f5ba844bad0>
And this is what the 4 encoded sentences look like:
print(bag_of_words(split_strings))
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
1. 1. 1. 0.]
[1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0.
0. 1. 1. 0.]
[0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0.
0. 0. 0. 0.]
[0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 0. 1.]], shape=(4, 28), dtype=float32)
@AloneTogether's answer above is very relevant. Just wanted to post the working code I came up with first, without the manual processing:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
import tensorflow_datasets as tfds

ds, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True, data_dir='/tmp/imdb')
train_dataset = ds['train']

def get_encoder(vocab_size=args.vocab_size):
    encoder = TextVectorization(max_tokens=vocab_size)
    encoder.adapt(train_dataset.map(lambda text, label: text))
    return encoder

class BagOfWords(tf.keras.layers.Layer):
    def __init__(self, vocabulary_size):
        super(BagOfWords, self).__init__()
        self.vocabulary_size = vocabulary_size

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        outputs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        for i in range(batch_size):
            int_string = inputs[i]
            array_string = tf.TensorArray(dtype=tf.float32, size=self.vocabulary_size)
            array_string = array_string.unstack(tf.zeros(self.vocabulary_size))
            for int_word in int_string:
                idx = int_word
                # Map out-of-vocabulary ids to the OOV bucket (index 1).
                idx = tf.cond(idx >= self.vocabulary_size, lambda: 1, lambda: tf.cast(idx, tf.int32))
                array_string = array_string.write(idx, array_string.read(idx) + 1.0)
            outputs = outputs.write(i, array_string.stack())
        return outputs.stack()

encoder = get_encoder(args.small_vocab_size)
bag_of_words = BagOfWords(args.small_vocab_size)

model = keras.models.Sequential()
model.add(encoder)
model.add(bag_of_words)
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Build the model on one batch so summary() can report shapes.
for d in train_dataset.batch(args.batch_size).take(1):
    model(d[0])

model.compile(optimizer=keras.optimizers.Nadam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
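For completeness, the nested loops can probably be replaced with a single vectorized op: tf.math.bincount accepts an axis argument (in TF 2.4+, as far as I know) and counts ids per row. A sketch of that variant, with the same OOV-to-index-1 convention as above (the class name is mine):

class BagOfWordsBincount(tf.keras.layers.Layer):
    def __init__(self, vocabulary_size):
        super().__init__()
        self.vocabulary_size = vocabulary_size

    def call(self, inputs):
        # Fold out-of-vocabulary ids into the OOV bucket (index 1).
        ids = tf.where(inputs >= self.vocabulary_size, tf.ones_like(inputs), inputs)
        # Count each id per row; output shape is (batch_size, vocabulary_size)
        # for any incoming batch size.
        counts = tf.math.bincount(tf.cast(ids, tf.int32),
                                  minlength=self.vocabulary_size,
                                  maxlength=self.vocabulary_size,
                                  axis=-1)
        return tf.cast(counts, tf.float32)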