使用 tensorflow2/keras 对 imdb 数据集进行训练给出了奇怪的结果
Training on the IMDB dataset with tensorflow2/keras gives strange results
我是 tensorflow2/keras 的新手,正在按照 tensorflow 网站上的这个教程(tutorial)学习。我没有把文本数据下载到目录中,而是使用 tensorflow_datasets
将 imdb 数据集直接加载为 tensors/numpy 数组。下面是我的代码。
import os
import re
import string
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Report the TensorFlow / TF-Hub versions, eager-mode status and GPU visibility.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")
# Load the "imdb_reviews" train and test splits as (text, label) pairs.
# batch_size=-1 asks tfds for each split as one full batch instead of a
# batched dataset, so the whole split can be converted to arrays below.
train_data, test_data = tfds.load(name="imdb_reviews", split=["train", "test"],
batch_size=-1, as_supervised=True)
# Convert both splits to NumPy and unpack them into review texts and labels.
X_train, y_train = tfds.as_numpy(train_data)
X_test, y_test = tfds.as_numpy(test_data)
# process text
def custom_standardization(input_data):
    """Normalize raw review text before it is tokenized.

    The input is lowercased, every literal ``<br />`` is replaced by a
    single space, and finally every character listed in
    ``string.punctuation`` is removed.
    """
    # Character class matching any single punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')
# Vocabulary cap and fixed output length passed to the vectorization layer.
max_features = 1000
sequence_length = 50
# Layer that turns raw text into integer token sequences; it applies
# custom_standardization first and emits sequences of length sequence_length.
vectorize_layer = TextVectorization(
standardize=custom_standardization,
max_tokens=max_features,
output_mode='int',
output_sequence_length=sequence_length)
# X_train holds only the review texts (labels are in y_train), so the
# vocabulary is built by calling adapt on it directly.
vectorize_layer.adapt(X_train)
def vectorize_text(text):
    """Vectorize ``text`` with ``vectorize_layer``.

    A trailing axis is appended to ``text`` before it is handed to the
    layer; the layer's output is returned unchanged.
    """
    return vectorize_layer(tf.expand_dims(text, -1))
# Sanity check: print the first review, its vectorized form, two vocabulary
# entries (indices 11 and 44) and the total vocabulary size.
first_review, first_label = X_train[0], y_train[0]
print("Review", first_review)
print("Vectorized review", vectorize_text(first_review))
print("11 ---> ",vectorize_layer.get_vocabulary()[11])
print(" 44 ---> ",vectorize_layer.get_vocabulary()[44])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))
# Vectorize both the train and test texts; X_train / X_test are rebound to
# the vectorized results and no longer hold the raw strings afterwards.
X_train = vectorize_text(X_train)
X_test = vectorize_text(X_test)
# Size of each embedding vector.
embedding_dim = 16
# Define the model: embedding -> average pooling over the sequence ->
# dropout -> 256-unit ReLU layer -> dropout -> single output unit.
model = tf.keras.Sequential([
layers.Embedding(max_features + 1, embedding_dim),
layers.GlobalAveragePooling1D(),
layers.Dropout(0.2),
layers.Dense(256, activation='relu'),
layers.Dropout(0.2),
layers.Dense(1)])  # one output unit, no activation (linear)
model.summary()
# NOTE(review): the output layer is a single linear unit, yet the loss is
# 'categorical_crossentropy'. The answer that follows this code attributes the
# constant 0.0 loss / 0.5 accuracy in the log to this mismatch and suggests
# BinaryCrossentropy(from_logits=True), or a sigmoid output with
# 'binary_crossentropy', instead.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Fit for 5 epochs with batches of 32, using the test split as validation data.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=2, validation_data=(X_test, y_test))
我得到的输出如下:
_________________________________________________________________
Epoch 1/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
Epoch 2/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
Epoch 3/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
Epoch 4/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
Epoch 5/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
准确率都是50%!出了些问题。我很困惑为什么会这样?我正在按照教程从头开始训练嵌入层。花了几个小时试图找出原因。
有谁知道为什么出错了?谢谢!
您的最后一个 Dense 层是线性的(linear),也就是说您没有为它指定任何激活函数。由于您做的是二分类(binary_classification),有以下几种解决方案。
如果你想直接使用Dense(1)
,那么你应该把损失函数改成:
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
metrics=['accuracy'])
或者你可以使用Dense(1, activation = tf.nn.sigmoid)
,那么你的损失函数应该是:
model.compile(optimizer='adam', loss='binary_crossentropy',
metrics=['accuracy'])
或者另一种解决方案:如果你的标签是独热编码(one-hot)的,可以设置 Dense(2, activation = tf.nn.softmax),
那么损失函数可以是:
model.compile(optimizer='adam', loss='categorical_crossentropy',
metrics=['accuracy'])
我原样复制了您的代码,只做了如下更改:
layers.Dense(1, activation = tf.nn.sigmoid)])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=
['accuracy'])
得到:
782/782 [==============================] - 4s 5ms/step - loss: 0.4836 - accuracy: 0.7675 - val_loss: 0.5065 - val_accuracy: 0.7447
编辑 2:sigmoid 会将输出压缩到 [0,1] 范围内。要获得预测,您需要把整个 X_test 传给模型;
只传 X_test[0] 会把预测搞乱,因为它只是第一条评论的一维向量,没有批次(batch)维度。使用以下方法获取预测:
y_hat = model.predict(X_test)
y_hat
array([[0.8105568 ],
[0.6332975 ],
[0.20526059],
...,
[0.03132877],
[0.5318063 ],
[0.8626927 ]], dtype=float32)
现在设置一个阈值,将它们转换成0和1。这可以通过以下方式完成:
y_pred = [1 * (x[0]>=0.5) for x in y_hat]
y_pred
将只包含 0 和 1。请注意,这里的阈值是 0.5:如果 sigmoid 的输出大于或等于 0.5,该样本就被归为第二个类别(标签 1)。
我是 tensorflow2/keras 的新手,正在按照 tensorflow 网站上的这个教程(tutorial)学习。我没有把文本数据下载到目录中,而是使用 tensorflow_datasets
将 imdb 数据集直接加载为 tensors/numpy 数组。下面是我的代码。
import os
import re
import string
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Report the TensorFlow / TF-Hub versions, eager-mode status and GPU visibility.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")
# Load the "imdb_reviews" train and test splits as (text, label) pairs.
# batch_size=-1 asks tfds for each split as one full batch instead of a
# batched dataset, so the whole split can be converted to arrays below.
train_data, test_data = tfds.load(name="imdb_reviews", split=["train", "test"],
batch_size=-1, as_supervised=True)
# Convert both splits to NumPy and unpack them into review texts and labels.
X_train, y_train = tfds.as_numpy(train_data)
X_test, y_test = tfds.as_numpy(test_data)
# process text
def custom_standardization(input_data):
    """Normalize raw review text before it is tokenized.

    The input is lowercased, every literal ``<br />`` is replaced by a
    single space, and finally every character listed in
    ``string.punctuation`` is removed.
    """
    # Character class matching any single punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')
# Vocabulary cap and fixed output length passed to the vectorization layer.
max_features = 1000
sequence_length = 50
# Layer that turns raw text into integer token sequences; it applies
# custom_standardization first and emits sequences of length sequence_length.
vectorize_layer = TextVectorization(
standardize=custom_standardization,
max_tokens=max_features,
output_mode='int',
output_sequence_length=sequence_length)
# X_train holds only the review texts (labels are in y_train), so the
# vocabulary is built by calling adapt on it directly.
vectorize_layer.adapt(X_train)
def vectorize_text(text):
    """Vectorize ``text`` with ``vectorize_layer``.

    A trailing axis is appended to ``text`` before it is handed to the
    layer; the layer's output is returned unchanged.
    """
    return vectorize_layer(tf.expand_dims(text, -1))
# Sanity check: print the first review, its vectorized form, two vocabulary
# entries (indices 11 and 44) and the total vocabulary size.
first_review, first_label = X_train[0], y_train[0]
print("Review", first_review)
print("Vectorized review", vectorize_text(first_review))
print("11 ---> ",vectorize_layer.get_vocabulary()[11])
print(" 44 ---> ",vectorize_layer.get_vocabulary()[44])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))
# Vectorize both the train and test texts; X_train / X_test are rebound to
# the vectorized results and no longer hold the raw strings afterwards.
X_train = vectorize_text(X_train)
X_test = vectorize_text(X_test)
# Size of each embedding vector.
embedding_dim = 16
# Define the model: embedding -> average pooling over the sequence ->
# dropout -> 256-unit ReLU layer -> dropout -> single output unit.
model = tf.keras.Sequential([
layers.Embedding(max_features + 1, embedding_dim),
layers.GlobalAveragePooling1D(),
layers.Dropout(0.2),
layers.Dense(256, activation='relu'),
layers.Dropout(0.2),
layers.Dense(1)])  # one output unit, no activation (linear)
model.summary()
# NOTE(review): the output layer is a single linear unit, yet the loss is
# 'categorical_crossentropy'. The answer that follows this code attributes the
# constant 0.0 loss / 0.5 accuracy in the log to this mismatch and suggests
# BinaryCrossentropy(from_logits=True), or a sigmoid output with
# 'binary_crossentropy', instead.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Fit for 5 epochs with batches of 32, using the test split as validation data.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=2, validation_data=(X_test, y_test))
我得到的输出如下:
_________________________________________________________________
Epoch 1/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
Epoch 2/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
Epoch 3/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
Epoch 4/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
Epoch 5/5
782/782 - 4s - loss: 0.0000e+00 - accuracy: 0.5000 - val_loss: 0.0000e+00 - val_accuracy: 0.5000
准确率都是50%!出了些问题。我很困惑为什么会这样?我正在按照教程从头开始训练嵌入层。花了几个小时试图找出原因。 有谁知道为什么出错了?谢谢!
您的最后一个 Dense 层是线性的(linear),也就是说您没有为它指定任何激活函数。由于您做的是二分类(binary_classification),有以下几种解决方案。
如果你想直接使用Dense(1)
,那么你应该把损失函数改成:
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
metrics=['accuracy'])
或者你可以使用Dense(1, activation = tf.nn.sigmoid)
,那么你的损失函数应该是:
model.compile(optimizer='adam', loss='binary_crossentropy',
metrics=['accuracy'])
或者另一种解决方案:如果你的标签是独热编码(one-hot)的,可以设置 Dense(2, activation = tf.nn.softmax),
那么损失函数可以是:
model.compile(optimizer='adam', loss='categorical_crossentropy',
metrics=['accuracy'])
我原样复制了您的代码,只做了如下更改:
layers.Dense(1, activation = tf.nn.sigmoid)])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=
['accuracy'])
得到:
782/782 [==============================] - 4s 5ms/step - loss: 0.4836 - accuracy: 0.7675 - val_loss: 0.5065 - val_accuracy: 0.7447
编辑 2:sigmoid 会将输出压缩到 [0,1] 范围内。要获得预测,您需要把整个 X_test 传给模型;
只传 X_test[0] 会把预测搞乱,因为它只是第一条评论的一维向量,没有批次(batch)维度。使用以下方法获取预测:
y_hat = model.predict(X_test)
y_hat
array([[0.8105568 ],
[0.6332975 ],
[0.20526059],
...,
[0.03132877],
[0.5318063 ],
[0.8626927 ]], dtype=float32)
现在设置一个阈值,将它们转换成0和1。这可以通过以下方式完成:
y_pred = [1 * (x[0]>=0.5) for x in y_hat]
y_pred
将只包含 0 和 1。请注意,这里的阈值是 0.5:如果 sigmoid 的输出大于或等于 0.5,该样本就被归为第二个类别(标签 1)。