Getting optimal vocab size and embedding dimensionality using GridSearchCV
I am trying to use GridSearchCV to find the optimal hyperparameters for an LSTM model, including the best vocabulary size and word-embedding dimensionality. First, I prepared my train and test data:
x = df['tweet_text']
y = df['potentially_harmful']
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_train = x_train.to_numpy().reshape(-1, 1)
y_train = y_train.to_numpy().reshape(-1, 1)
x_test = x_test.to_numpy().reshape(-1, 1)
y_test = y_test.to_numpy().reshape(-1, 1)
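After the reshape, each split is a 2-D column (one tweet string or one label per row), which matches the (None, 1) string input the model below declares. A quick sanity check (assuming df is already loaded):

print(x_train.shape, x_test.shape)  # e.g. (N, 1) and (M, 1)
print(x_train.dtype)                # object -- raw tweet strings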
Then I tried to create a model that I can use with GridSearchCV. I know that to grid-search a Keras model you need to wrap it in a KerasClassifier or KerasRegressor. I also made sure to adapt on x rather than x_train or anything else, because x is the full x_data and I believe the vectorizer needs to see all of x so that every input document gets a consistent vectorized form (see the short adapt demo after the code below).
import tensorflow as tf  # needed below for tf.keras.Input and tf.string
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from scikeras.wrappers import KerasClassifier, KerasRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
def build_model(max_tokens, max_len, dropout):
    model = Sequential()
    vectorize_layer = TextVectorization(
        max_tokens=max_tokens,
        output_mode="int",
        output_sequence_length=max_len,
    )
    vectorize_layer.adapt(x)
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)
    model.add(Embedding(max_tokens + 1, 128))
    model.add(LSTM(64, dropout=dropout, recurrent_dropout=dropout))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model
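As a side note, here is a minimal sketch (with made-up toy strings) of what adapt does: it builds the vocabulary that the layer then uses to map raw text to integer sequences, which is why adapting on all of x keeps the mapping consistent across splits.

import numpy as np
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Toy corpus (hypothetical): adapt() learns the vocabulary from it.
demo = TextVectorization(max_tokens=10, output_mode="int", output_sequence_length=4)
demo.adapt(np.array(["the cat sat", "the dog ran"]))
# Calling the layer maps strings to int ids from that shared vocab, padded to length 4.
print(demo(np.array([["the cat ran"]])))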
Here I try to instantiate the model with its parameters. The classifier complained that I should add the dropout = 0.2, max_len = 5, max_tokens=25 part.
model = KerasClassifier(build_fn=build_model, dropout=0.2, max_len=5, max_tokens=25)
params = {
    "max_tokens": [25, 50, 500, 5000],
    "max_len": [5, 50, 500, 1000],
    "dropout": [0.1, 0.2, 0.3, 0.4, 0.5],
}
grid = GridSearchCV(estimator=model, scoring='accuracy', param_grid=params, cv=3, verbose=2, error_score='raise')
grid.fit(x_train, y_train)
Then I get this error:
Fitting 3 folds for each of 80 candidates, totalling 240 fits
ValueError: could not convert string to float: 'promo looks promising pls say absence means fauxfoodies r couple eliminated next round ugh cantstandthem mkr'
This confuses me. If I instead just instantiate a model directly with something like model = build_model(...) and call model.fit(x_train, y_train), it works fine and runs into no string-to-float problem. Why doesn't it work now?
I tried to use scikeras, but I get this error because scikeras does not accept non-numerical inputs (in our case the input is in str format). So I came back to the standard keras wrapper.
The point here is that the model is built incorrectly: the TextVectorization layer must be placed inside the Sequential model, as shown in the official documentation. The build_model function therefore becomes:
def build_model(max_tokens, max_len, dropout):
    vectorize_layer = TextVectorization(
        max_tokens=max_tokens,
        output_mode="int",
        output_sequence_length=max_len,
    )
    vectorize_layer.adapt(X)
    model = Sequential()
    model.add(Input(shape=(1,), dtype=tf.string))  ## <=== enable str inputs
    model.add(vectorize_layer)  ## <==== add TextVectorization inside Sequential
    model.add(Embedding(max_tokens + 1, 128))
    model.add(LSTM(64, dropout=dropout, recurrent_dropout=dropout))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
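As a quick smoke test (a minimal sketch; it assumes X with the raw training strings is already defined, since adapt(X) runs at build time), the rebuilt model consumes raw strings end to end:

import numpy as np

# Hypothetical check: the string passes through the in-graph vectorizer,
# so no "could not convert string to float" error occurs.
m = build_model(max_tokens=25, max_len=5, dropout=0.2)
print(m.predict(np.array([["some raw text"]])))  # -> sigmoid score, shape (1, 1)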
I ran a dummy experiment with a binary version of fetch_20newsgroups, and it works.
# IMPORT USEFUL LIBRARIES
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# READ THE DATA
categories = ["rec.sport.baseball", "talk.politics.misc"]
X, y = fetch_20newsgroups(categories=categories, return_X_y=True)
X = np.asarray(X).reshape(-1, 1)
y = np.asarray(y).reshape(-1, 1)
# HYPERPARAMETER SEARCHING
param_grid = {
    "max_tokens": [25, 50],
    "max_len": [5, 50],
    "dropout": [0.1, 0.2],
}
model = GridSearchCV(KerasClassifier(build_model), param_grid, cv=3, scoring='accuracy')
model.fit(X, y, epochs=3, batch_size=512, verbose=2)
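Once the fit finishes, the best combination can be read off the fitted GridSearchCV object through the standard scikit-learn attributes:

# Inspect the search results
print(model.best_params_)  # e.g. {'dropout': 0.1, 'max_len': 50, 'max_tokens': 50}
print(model.best_score_)   # mean cross-validated accuracy of the best candidate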