TensorFlow 2.x Keras Embedding layer fails when fed a tf.data.Dataset
This question is a follow-up to tensorflow 2 TextVectorization process tensor and dataset error.

I want to build a word embedding for the processed text with TensorFlow 2.8 on Jupyter.
import tensorflow as tf
from tensorflow.keras import layers
import re
import string

def standardize(input_data):
    input_data = tf.strings.lower(input_data)
    input_data = tf.strings.regex_replace(input_data, f"[{re.escape(string.punctuation)}]", " ")
    return input_data

# The input data is loaded from text files by TFRecordDataset(file_paths, "GZIP").
# Each file can be 200+ MB; there are about 300 files in total.
# Each file holds data with multiple columns, and some columns are text.
# After loading, the dataset is accessed by column name,
# e.g. one column is "sports", so input_dataset["sports"]
# returns a tensor like the following example:
input_data = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]], shape=(2, 1), dtype=tf.string)

text_layer = tf.keras.layers.TextVectorization(standardize=standardize, max_tokens=10, output_mode='int', output_sequence_length=10)

dataset = tf.data.Dataset.from_tensors(input_data)
dataset = dataset.batch(2)
text_layer.adapt(dataset)
process_text = dataset.map(text_layer)

emb_layer = layers.Embedding(10, 10)
emb_layer(process_text)  # error
Error:
AttributeError: Exception encountered when calling layer "embedding_7" (type Embedding).
'MapDataset' object has no attribute 'dtype'
Call arguments received:
• inputs=<MapDataset element_spec=TensorSpec(shape=(None, 2, 10), dtype=tf.int64, name=None)>
How do I convert the tf.data.Dataset to a tf.Tensor? This didn't help me.
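For reference, a batch can be pulled out of a tf.data.Dataset as a plain tensor by iterating the dataset, or with get_single_element() when it holds exactly one element (TF >= 2.6), though, as noted, this alone may not address the underlying issue; a minimal sketch:

import tensorflow as tf

ds = tf.data.Dataset.from_tensors(tf.constant([[1, 2], [3, 4]]))
first_batch = next(iter(ds))      # each dataset element is already a tf.Tensor, shape (2, 2)
single = ds.get_single_element()  # same tensor; only for datasets with exactly one element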
The layers above will be used in a neural-network model:

loading data --> processing features (multiple text columns) --> tokens --> embedding --> average pooling --> some dense layers --> output layer

Thanks.
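A minimal sketch of that pipeline as an end-to-end Keras model, assuming a single text column and illustrative sizes (vocabulary 10, embedding dimension 10, binary output):

import tensorflow as tf

raw_text = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]])

vectorize = tf.keras.layers.TextVectorization(max_tokens=10, output_mode='int', output_sequence_length=10)
vectorize.adapt(raw_text)  # build the vocabulary before using the layer

text_in = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = vectorize(text_in)                                   # tokens
x = tf.keras.layers.Embedding(10, 10)(x)                 # embedding
x = tf.keras.layers.GlobalAveragePooling1D()(x)          # average pooling
x = tf.keras.layers.Dense(16, activation='relu')(x)      # some dense layers
out = tf.keras.layers.Dense(1, activation='sigmoid')(x)  # output layer
model = tf.keras.Model(text_in, out)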
I think the last post answers your question. What can I do to make it clear so that you can continue your work as required?

Standardized
[Sample]:
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization
#
import tensorflow as tf
import tensorflow_text as tf_text

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Functions
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def standardize(input_data):
    input_data = tf.strings.lower(input_data)
    input_data = tf.strings.regex_replace(input_data, "<[^>]+>", " ")
    return input_data

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Variables
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
input_data = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]], shape=(2, 1), dtype=tf.string)
text_layer = tf.keras.layers.TextVectorization(standardize=standardize, max_tokens=10, output_mode='int', output_sequence_length=10)

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Working
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
print("")
print("")
print("")

dataset = tf.data.Dataset.from_tensors(standardize(input_data))
dataset = dataset.batch(2)
process_text = text_layer.adapt(dataset)  # adapt() returns None; it only builds the vocabulary

print("standardize: " + str(standardize(input_data)))
print("process_text: " + str(process_text))
[Output]:
standardize: tf.Tensor(
[[b'swim 2008-07 baseball']
[b'football']], shape=(2, 1), dtype=string)
process_text: None
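Note that adapt() only builds the layer's vocabulary and returns None, which is why process_text prints as None above. To actually vectorize the text, call the adapted layer on the data; continuing the sample:

vectorized = text_layer(standardize(input_data))  # int token tensor, shape (2, 10)
print(vectorized)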
You cannot feed a tf.data.Dataset directly to an Embedding layer; you can use .map(...) instead:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import re
import string

def standardize(input_data):
    input_data = tf.strings.lower(input_data)
    input_data = tf.strings.regex_replace(input_data, f"[{re.escape(string.punctuation)}]", " ")
    return input_data

input_data = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]], shape=(2, 1), dtype=tf.string)
text_layer = tf.keras.layers.TextVectorization(standardize=standardize, max_tokens=10, output_mode='int', output_sequence_length=10)

dataset = tf.data.Dataset.from_tensors(input_data)
dataset = dataset.batch(2).map(lambda x: tf.squeeze(x, axis=0))  # drop the extra batch dimension
text_layer.adapt(dataset)

process_text = dataset.map(text_layer)      # vectorize each batch
emb_layer = layers.Embedding(10, 10)
process_text = process_text.map(emb_layer)  # embed each batch
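The result is a dataset of embedded batches; its shapes can be checked by iterating, continuing the snippet above:

for batch in process_text.take(1):
    print(batch.shape)  # (2, 10, 10): 2 rows, 10 tokens each, 10-dimensional embeddings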
Or define your model and feed your dataset through model.fit(...):
import tensorflow as tf
import re
import string

def standardize(input_data):
    input_data = tf.strings.lower(input_data)
    input_data = tf.strings.regex_replace(input_data, f"[{re.escape(string.punctuation)}]", " ")
    return input_data

input_data = tf.constant([["SWIM 2008-07 Baseball"], ["Football"]], shape=(2, 1), dtype=tf.string)
text_layer = tf.keras.layers.TextVectorization(standardize=standardize, max_tokens=10, output_mode='int', output_sequence_length=10)

dataset = tf.data.Dataset.from_tensors(input_data)
dataset = dataset.batch(2)
text_layer.adapt(dataset)

# Add a random label to each entry
process_text = dataset.map(lambda x: (text_layer(tf.squeeze(x, axis=0)), tf.random.uniform((2,), maxval=2, dtype=tf.int32)))

inputs = tf.keras.layers.Input((10,))
emb_layer = tf.keras.layers.Embedding(10, 10)
x = emb_layer(inputs)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(1, 'sigmoid')(x)
model = tf.keras.Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(process_text)
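After training, new raw text goes through the same adapted text_layer before the model; a minimal sketch, where new_text is a hypothetical example input:

new_text = tf.constant([["Basketball 2009"]], dtype=tf.string)  # hypothetical input
tokens = text_layer(new_text)  # shape (1, 10) int tokens
print(model.predict(tokens))   # shape (1, 1) sigmoid score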