将数值型 TensorFlow 数据集适配(adapt)为文本向量
Adapt a numerical TensorFlow dataset as a text vector
考虑以下代码:
import numpy as np
import tensorflow as tf
# Twelve sample rows: row k (k = 1..12) holds three copies of k followed by
# two copies of -k.
simple_data_samples = np.array(
    [[k, k, k, -k, -k] for k in range(1, 13)]
)
def timeseries_dataset_multistep_combined(
    features,
    label_slice,
    input_sequence_length,
    output_sequence_length,
    batch_size,
):
    """Build a windowed dataset of string ``(inputs, targets)`` pairs.

    Each window taken from ``features`` spans
    ``input_sequence_length + output_sequence_length`` steps. The window's
    values are converted to strings and the window is then split along the
    time axis into an input part and a target part.

    Args:
        features: Array handed to ``timeseries_dataset_from_array`` as the data.
        label_slice: Index or slice applied to the last axis of the target
            part to choose which feature columns are kept.
        input_sequence_length: Number of leading steps of each window that
            become the inputs.
        output_sequence_length: Number of trailing steps of each window that
            become the targets.
        batch_size: Batch size forwarded to ``timeseries_dataset_from_array``.

    Returns:
        The mapped dataset, yielding ``(inputs, targets)`` tuples of string
        tensors.
    """
    window_length = input_sequence_length + output_sequence_length
    # No targets are passed (None): the labels are cut out of the same window.
    feature_ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        features, None, window_length, batch_size=batch_size
    )

    def split_feature_label(x):
        # Convert to strings first so both halves come out as string tensors.
        x = tf.strings.as_string(x)
        inputs = x[:, :input_sequence_length, :]
        targets = x[:, input_sequence_length:, label_slice]
        return inputs, targets

    return feature_ds.map(split_feature_label)
# Windows of 4 input steps followed by 2 target steps, batched one window at a
# time; slice(None, None, None) keeps every feature column in the targets.
ds = timeseries_dataset_multistep_combined(
    simple_data_samples,
    slice(None, None, None),
    input_sequence_length=4,
    output_sequence_length=2,
    batch_size=1,
)
def print_dataset(ds):
    """Print the features and labels of every batch in ``ds``.

    Args:
        ds: Iterable of ``(inputs, targets)`` pairs; both elements of each
            pair must expose a ``numpy()`` method.
    """
    for inputs, targets in ds:
        print("---Batch---")
        print("Feature:", inputs.numpy())
        print("Label:", targets.numpy())
        # Blank line to separate consecutive batches in the output.
        print("")
# Print every (inputs, targets) batch of the dataset built above.
print_dataset(ds)
TensorFlow 数据集 `ds` 由输入(inputs)和目标(targets)两部分组成。我想让 TextVectorization 层分别在输入和目标上执行 adapt,从而把它们转换为文本向量。下面的假设性代码展示了我想要实现的效果:
# Hypothetical sketch of the desired usage, not runnable as written:
# `layers` is not imported in this snippet, and `ds.input` / `ds.target` are
# placeholders for "the inputs / targets of ds" (presumably the dataset has no
# such attributes -- confirm against the tf.data.Dataset API).
input_vectorization = layers.TextVectorization(
max_tokens=20,
output_mode="int",
output_sequence_length=6,
)
target_vectorization = layers.TextVectorization(
max_tokens=20,
output_mode="int",
output_sequence_length=6 + 1
)
# Goal: build each layer's vocabulary from the matching half of the dataset.
input_vectorization.adapt(ds.input)
target_vectorization.adapt(ds.target)
有人知道如何基于上面的示例实现这一点吗?
如果我没理解错的话,您可以像下面这样把现有数据集与 TextVectorization 层配合使用:
import tensorflow as tf
# Two separate layers so inputs and targets each get their own vocabulary;
# the target layer pads/truncates to one more position than the input layer.
input_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20,
    output_mode="int",
    output_sequence_length=6,
)
target_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20,
    output_mode="int",
    output_sequence_length=6 + 1,
)

# Keep only one half of each (inputs, targets) pair and flatten it to 1-D;
# a target shape of [-1] lets tf.reshape infer the length.
inputs = ds.map(lambda x, y: tf.reshape(x, [-1]))
targets = ds.map(lambda x, y: tf.reshape(y, [-1]))

# adapt() builds each layer's vocabulary from the strings it is shown.
input_vectorization.adapt(inputs)
target_vectorization.adapt(targets)

print(input_vectorization.get_vocabulary())
print(target_vectorization.get_vocabulary())
['', '[UNK]', '7', '6', '5', '4', '8', '3', '9', '2', '10', '1']
['', '[UNK]', '9', '8', '7', '6', '11', '10', '5', '12']
请注意,adapt 函数只是根据输入创建一个词汇表,词汇表中的每个单词都映射到一个唯一的整数值。另外,由于 TextVectorization 层的默认参数为 standardize='lower_and_strip_punctuation',调用 adapt 时负号会被去掉。如果需要,您可以通过设置例如 standardize='lower' 来避免这种行为。
考虑以下代码:
import numpy as np
import tensorflow as tf
# Twelve sample rows: row k (k = 1..12) holds three copies of k followed by
# two copies of -k.
simple_data_samples = np.array(
    [[k, k, k, -k, -k] for k in range(1, 13)]
)
def timeseries_dataset_multistep_combined(
    features,
    label_slice,
    input_sequence_length,
    output_sequence_length,
    batch_size,
):
    """Build a windowed dataset of string ``(inputs, targets)`` pairs.

    Each window taken from ``features`` spans
    ``input_sequence_length + output_sequence_length`` steps. The window's
    values are converted to strings and the window is then split along the
    time axis into an input part and a target part.

    Args:
        features: Array handed to ``timeseries_dataset_from_array`` as the data.
        label_slice: Index or slice applied to the last axis of the target
            part to choose which feature columns are kept.
        input_sequence_length: Number of leading steps of each window that
            become the inputs.
        output_sequence_length: Number of trailing steps of each window that
            become the targets.
        batch_size: Batch size forwarded to ``timeseries_dataset_from_array``.

    Returns:
        The mapped dataset, yielding ``(inputs, targets)`` tuples of string
        tensors.
    """
    window_length = input_sequence_length + output_sequence_length
    # No targets are passed (None): the labels are cut out of the same window.
    feature_ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        features, None, window_length, batch_size=batch_size
    )

    def split_feature_label(x):
        # Convert to strings first so both halves come out as string tensors.
        x = tf.strings.as_string(x)
        inputs = x[:, :input_sequence_length, :]
        targets = x[:, input_sequence_length:, label_slice]
        return inputs, targets

    return feature_ds.map(split_feature_label)
# Windows of 4 input steps followed by 2 target steps, batched one window at a
# time; slice(None, None, None) keeps every feature column in the targets.
ds = timeseries_dataset_multistep_combined(
    simple_data_samples,
    slice(None, None, None),
    input_sequence_length=4,
    output_sequence_length=2,
    batch_size=1,
)
def print_dataset(ds):
    """Print the features and labels of every batch in ``ds``.

    Args:
        ds: Iterable of ``(inputs, targets)`` pairs; both elements of each
            pair must expose a ``numpy()`` method.
    """
    for inputs, targets in ds:
        print("---Batch---")
        print("Feature:", inputs.numpy())
        print("Label:", targets.numpy())
        # Blank line to separate consecutive batches in the output.
        print("")
# Print every (inputs, targets) batch of the dataset built above.
print_dataset(ds)
TensorFlow 数据集 `ds` 由输入(inputs)和目标(targets)两部分组成。我想让 TextVectorization 层分别在输入和目标上执行 adapt,从而把它们转换为文本向量。下面的假设性代码展示了我想要实现的效果:
# Hypothetical sketch of the desired usage, not runnable as written:
# `layers` is not imported in this snippet, and `ds.input` / `ds.target` are
# placeholders for "the inputs / targets of ds" (presumably the dataset has no
# such attributes -- confirm against the tf.data.Dataset API).
input_vectorization = layers.TextVectorization(
max_tokens=20,
output_mode="int",
output_sequence_length=6,
)
target_vectorization = layers.TextVectorization(
max_tokens=20,
output_mode="int",
output_sequence_length=6 + 1
)
# Goal: build each layer's vocabulary from the matching half of the dataset.
input_vectorization.adapt(ds.input)
target_vectorization.adapt(ds.target)
有人知道如何基于上面的示例实现这一点吗?
如果我没理解错的话,您可以像下面这样把现有数据集与 TextVectorization 层配合使用:
import tensorflow as tf
# Two separate layers so inputs and targets each get their own vocabulary;
# the target layer pads/truncates to one more position than the input layer.
input_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20,
    output_mode="int",
    output_sequence_length=6,
)
target_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20,
    output_mode="int",
    output_sequence_length=6 + 1,
)

# Keep only one half of each (inputs, targets) pair and flatten it to 1-D;
# a target shape of [-1] lets tf.reshape infer the length.
inputs = ds.map(lambda x, y: tf.reshape(x, [-1]))
targets = ds.map(lambda x, y: tf.reshape(y, [-1]))

# adapt() builds each layer's vocabulary from the strings it is shown.
input_vectorization.adapt(inputs)
target_vectorization.adapt(targets)

print(input_vectorization.get_vocabulary())
print(target_vectorization.get_vocabulary())
['', '[UNK]', '7', '6', '5', '4', '8', '3', '9', '2', '10', '1']
['', '[UNK]', '9', '8', '7', '6', '11', '10', '5', '12']
请注意,adapt 函数只是根据输入创建一个词汇表,词汇表中的每个单词都映射到一个唯一的整数值。另外,由于 TextVectorization 层的默认参数为 standardize='lower_and_strip_punctuation',调用 adapt 时负号会被去掉。如果需要,您可以通过设置例如 standardize='lower'
.