超参数调整 (Hyperas) 和使用流水线预处理的交叉验证
Hyperparameter-tuning (Hyperas) and Cross-Validation with Pipeline-Preprocessing
tl;dr 我尝试使用 Hyperas 对超参数进行优化和交叉验证,但无法让预处理(缩放、过采样/欠采样)管道与 KerasClassifier 一起工作。
我使用 Hyperas(hyperopt 的包装器)来调整神经网络(使用 Keras/TensorFlow 构建)的超参数,并尝试对最佳参数进行 k 折交叉验证。但是,我还要对数据进行预处理(StandardScaler 和 MinMaxScaler),然后使用 SMOTETomek 进行过采样/欠采样。
我了解到,特征缩放和重采样不应在整个数据集上进行,而应只在用于训练的那部分数据上进行,以避免数据泄漏。要在 hyperopt 中只对交叉验证的训练折执行这些步骤有些困难,因为使用像 imblearn 这样的管道时,管道只能配合接收模型构建函数的 KerasClassifier 使用。我无法提供这样的模型函数,因为 hyperopt 中的整个验证过程都发生在同一个函数里。
你对如何实现这一点有什么建议吗?我可以在 def data() 中完成所有预处理,然后在整个数据集上优化/交叉验证参数吗?这样做是否会影响找到正确参数的过程?(我另外留有一个用于最终模型的测试数据集)
有没有办法手动实现这一点?
def data():
# Data provider handed to optim.minimize(data=...) further down.
# Loads the feature frame and the label frame from two feather files
# (relative paths) and hands both back as an (X, Y) pair.
# NOTE(review): Hyperas appears to copy this body textually into its
# generated script, so the variable names below presumably have to match
# the parameter names of hyper_model -- confirm before renaming anything.
import pandas as pd
import feather
df_hyper_X = feather.read_dataframe('df_hyper_X_train.feather')
df_hyper_Y = feather.read_dataframe('df_hyper_Y_train.feather')
return df_hyper_X, df_hyper_Y
def hyper_model(df_hyper_X,df_hyper_Y):
    """Hyperas objective: 10-fold cross-validated precision of a dense network.

    Scaling and SMOTETomek resampling are steps of the pipeline given to
    ``cross_val_score``, so they are fitted on the training part of every
    fold only. The network is supplied to ``KerasClassifier`` as a builder
    callable, which lets each fold start from freshly initialised weights.

    Args:
        df_hyper_X: feature frame; must contain the columns 'pre_grade',
            'math' and 'time'. All remaining columns are passed through
            unscaled.
        df_hyper_Y: target aligned with ``df_hyper_X`` (treated as binary:
            sigmoid output, binary cross-entropy loss).

    Returns:
        dict with 'loss' (negated mean precision over the folds), 'status'
        and 'model' (a compiled but untrained network built from the sampled
        hyper-parameters; the fitted per-fold clones are not kept).
    """
    stdscl_features = ['pre_grade', 'math']
    # Must be a list: a bare string makes ColumnTransformer pass a 1-D column,
    # which MinMaxScaler rejects.
    normscl_features = ['time']
    stdscl_transformer = Pipeline(steps=[('stdscaler', StandardScaler())])
    normscl_transformer = Pipeline(steps=[('normscaler', MinMaxScaler())])
    preprocessor = ColumnTransformer(
        transformers=[
            ('stdscl', stdscl_transformer, stdscl_features),
            ('minmaxscl', normscl_transformer, normscl_features),
        ],
        remainder='passthrough',
    )

    # The scalers and the passthrough keep the column count and SMOTETomek only
    # adds/removes rows, so the network input width equals the frame width.
    n_features = df_hyper_X.shape[1]

    def build_model():
        """Build and compile a fresh network for the sampled hyper-parameters."""
        metrics = [
            tf.keras.metrics.TruePositives(name='tp'),
            tf.keras.metrics.FalsePositives(name='fp'),
            tf.keras.metrics.TrueNegatives(name='tn'),
            tf.keras.metrics.FalseNegatives(name='fn'),
            tf.keras.metrics.BinaryAccuracy(name='accuracy'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.AUC(name='auc'),
        ]
        model = tf.keras.Sequential()
        # Author's rule of thumb: pair ReLU with He-uniform initialisation.
        # Disabled experiments (L2 kernel regulariser, LeakyReLU, a third
        # hidden layer) are intentionally left out.
        model.add(Dense({{choice([2,4,8,16,32,64])}},
                        activation={{choice(['relu', 'sigmoid', 'tanh', 'elu', 'selu'])}},
                        kernel_initializer={{choice(['lecun_uniform','glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'])}},
                        input_shape=(n_features,)))
        model.add(Dropout({{uniform(0, 1)}}))
        # Optional second hidden layer.
        if ({{choice(['one', 'two'])}}) == 'two':
            model.add(Dense({{choice([2,4,8,16,32,64])}}, activation={{choice(['relu', 'sigmoid', 'tanh', 'elu', 'selu'])}}))
            model.add(Dropout({{uniform(0, 1)}}))
        model.add(Dense(1, activation='sigmoid'))

        # Every optimizer has its own learning-rate hyper-parameter; only the
        # one picked by opti_choice is used for compilation.
        adam = tf.keras.optimizers.Adam(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        nadam = tf.keras.optimizers.Nadam(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adamax = tf.keras.optimizers.Adamax(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adagrad = tf.keras.optimizers.Adagrad(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adadelta = tf.keras.optimizers.Adadelta(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        sgd = tf.keras.optimizers.SGD(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        rmsprop = tf.keras.optimizers.RMSprop(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        opti_choice = {{choice(['adam', 'nadam', 'adamax','adagrad', 'adadelta', 'sgd','rmsprop'])}}
        optimizers = {
            'adam': adam,
            'nadam': nadam,
            'adamax': adamax,
            'adagrad': adagrad,
            'adadelta': adadelta,
            'sgd': sgd,
        }
        # Anything not listed above falls back to RMSprop, as before.
        optimizer = optimizers.get(opti_choice, rmsprop)
        model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)
        return model

    kfold = KFold(n_splits=10, shuffle=True, random_state=3)
    # The wrapper receives the builder (not a built model) so that every
    # cross-validation clone constructs its own network.
    imba_pipeline = make_pipeline(
        preprocessor,
        SMOTETomek(sampling_strategy='auto', random_state=2),
        KerasClassifier(build_model, epochs={{choice([20,30,40,50,60,70])}}, batch_size={{choice([16,32, 64, 128])}}, verbose=0),
    )
    results = cross_val_score(imba_pipeline, df_hyper_X, df_hyper_Y, cv=kfold, scoring='precision').mean()
    print('Precision', results)
    # Hyperopt minimises, hence the negated precision.
    return {'loss': -results, 'status': STATUS_OK, 'model': build_model()}
if __name__ == '__main__':
    # Run the TPE search over the templates in hyper_model (30 evaluations).
    best_run, best_model = optim.minimize(model=hyper_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=30,
                                          trials=Trials(),
                                          notebook_name='drive/My Drive/Colab Notebooks/final_NL_EU_Non-EU')
    # data() yields exactly two frames (features, labels); unpacking four
    # values here raised ValueError before any result was printed.
    df_hyper_X, df_hyper_Y = data()
    # No evaluate() call: data() loads no held-out test split, and the raw
    # frames are not scaled the way the training folds are.
    print("Best performing model chosen hyper-parameters:")
    print(best_run)
解决了。如果有人感兴趣,这是解决方案:
def data():
# Data provider handed to optim.minimize(data=...) further down.
# Loads the feature frame and the label frame from two feather files
# (relative paths) and hands both back as an (X, Y) pair.
# NOTE(review): Hyperas appears to copy this body textually into its
# generated script, so the variable names below presumably have to match
# the parameter names of hyper_model -- confirm before renaming anything.
import pandas as pd
import feather
df_hyper_X = feather.read_dataframe('df_hyper_X_train.feather')
df_hyper_Y = feather.read_dataframe('df_hyper_Y_train.feather')
return df_hyper_X, df_hyper_Y
def hyper_model(df_hyper_X,df_hyper_Y):
    """Hyperas objective: manual 10-fold cross-validated precision.

    For every fold the column scalers are fitted on the training rows only,
    the training rows are resampled with SMOTETomek, and a freshly built
    network is trained and scored on the untouched validation rows. Building
    a new network per fold keeps weights learned on one fold's training rows
    (which include later folds' validation rows) from inflating the score.

    Args:
        df_hyper_X: feature frame; must contain the columns 'pre_grade',
            'math' and 'time'. All remaining columns are passed through
            unscaled.
        df_hyper_Y: target aligned with ``df_hyper_X`` (treated as binary:
            sigmoid output, binary cross-entropy loss).

    Returns:
        dict with 'loss' (negated mean precision over the folds), 'status'
        and 'model' (the network trained on the last fold).
    """
    ct = ColumnTransformer(
        [
            ('ct_std', StandardScaler(), ['pre_grade', 'math']),
            ('ct_minmax', MinMaxScaler(), ['time']),
        ],
        remainder='passthrough',
    )
    smt = SMOTETomek(sampling_strategy='auto', random_state=2)
    kfold = KFold(n_splits=10, shuffle=True, random_state=3)

    def build_model(n_features):
        """Build and compile a fresh network expecting n_features inputs."""
        metrics = [
            tf.keras.metrics.TruePositives(name='tp'),
            tf.keras.metrics.FalsePositives(name='fp'),
            tf.keras.metrics.TrueNegatives(name='tn'),
            tf.keras.metrics.FalseNegatives(name='fn'),
            tf.keras.metrics.BinaryAccuracy(name='accuracy'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.AUC(name='auc'),
        ]
        net = tf.keras.Sequential()
        net.add(Dense({{choice([2,4,8,16,32,64])}},
                      activation={{choice(['relu', 'sigmoid', 'tanh', 'elu', 'selu'])}},
                      kernel_initializer={{choice(['lecun_uniform','glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'])}},
                      input_shape=(n_features,)))
        net.add(Dropout({{uniform(0, 0.5)}}))
        # Optional second hidden layer.
        if ({{choice(['one', 'two'])}}) == 'two':
            net.add(Dense({{choice([2,4,8,16,32,64])}}, activation={{choice(['relu', 'sigmoid', 'tanh', 'elu', 'selu'])}}))
            net.add(Dropout({{uniform(0, 0.5)}}))
        net.add(Dense(1, activation='sigmoid'))

        # Every optimizer has its own learning-rate hyper-parameter; only the
        # one picked by opti_choice is used for compilation.
        adam = tf.keras.optimizers.Adam(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        nadam = tf.keras.optimizers.Nadam(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adamax = tf.keras.optimizers.Adamax(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adagrad = tf.keras.optimizers.Adagrad(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adadelta = tf.keras.optimizers.Adadelta(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        sgd = tf.keras.optimizers.SGD(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        rmsprop = tf.keras.optimizers.RMSprop(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        opti_choice = {{choice(['adam', 'nadam', 'adamax','adagrad', 'adadelta', 'sgd','rmsprop'])}}
        optimizers = {
            'adam': adam,
            'nadam': nadam,
            'adamax': adamax,
            'adagrad': adagrad,
            'adadelta': adadelta,
            'sgd': sgd,
        }
        # Anything not listed above falls back to RMSprop, as before.
        optimizer = optimizers.get(opti_choice, rmsprop)
        net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)
        return net

    scores = []
    model = None
    for train_fold_index, val_fold_index in kfold.split(df_hyper_X, df_hyper_Y):
        X_train_fold, y_train_fold = df_hyper_X.iloc[train_fold_index], df_hyper_Y.iloc[train_fold_index]
        X_val_fold, y_val_fold = df_hyper_X.iloc[val_fold_index], df_hyper_Y.iloc[val_fold_index]
        # Fit the scalers on the training rows only, then reuse them on the
        # validation rows so no validation statistics leak into training.
        X_train_fold = ct.fit_transform(X_train_fold)
        X_val_fold = ct.transform(X_val_fold)
        # Resample the training rows only; validation keeps its real class mix.
        X_train_smtk, y_train_smtk = smt.fit_resample(X_train_fold, y_train_fold)
        # Fresh weights and optimizer state for every fold; the input width is
        # read from the transformed matrix instead of being hard-coded.
        model = build_model(X_train_smtk.shape[1])
        model.fit(X_train_smtk, y_train_smtk, epochs={{choice([20,30,40,50,60,70])}}, batch_size={{choice([16,32, 64, 128])}})
        predicts = model.predict(X_val_fold)
        # Sigmoid outputs are rounded to 0/1 labels before scoring.
        scores.append(precision_score(y_val_fold, predicts.round()))
    avg_score = np.mean(scores)
    print('Precision', avg_score)
    # Hyperopt minimises, hence the negated precision.
    return {'loss': -avg_score, 'status': STATUS_OK, 'model': model}
if __name__ == '__main__':
    # Settings for the TPE search over the templates in hyper_model; only two
    # evaluations are requested here.
    search_settings = {
        'model': hyper_model,
        'data': data,
        'algo': tpe.suggest,
        'max_evals': 2,
        'trials': Trials(),
        'notebook_name': 'drive/My Drive/Colab Notebooks/final_NL_EU_Non-EU',
    }
    best_run, best_model = optim.minimize(**search_settings)
    # Reload the two training frames at module level after the search.
    df_hyper_X, df_hyper_Y = data()
    print("Best performing model chosen hyper-parameters:")
    print(best_run)
tl;dr 我尝试使用 Hyperas 对超参数进行优化和交叉验证,但无法让预处理(缩放、过采样/欠采样)管道与 KerasClassifier 一起工作。
我使用 Hyperas(hyperopt 的包装器)来调整神经网络(使用 Keras/TensorFlow 构建)的超参数,并尝试对最佳参数进行 k 折交叉验证。但是,我还要对数据进行预处理(StandardScaler 和 MinMaxScaler),然后使用 SMOTETomek 进行过采样/欠采样。
我了解到,特征缩放和重采样不应在整个数据集上进行,而应只在用于训练的那部分数据上进行,以避免数据泄漏。要在 hyperopt 中只对交叉验证的训练折执行这些步骤有些困难,因为使用像 imblearn 这样的管道时,管道只能配合接收模型构建函数的 KerasClassifier 使用。我无法提供这样的模型函数,因为 hyperopt 中的整个验证过程都发生在同一个函数里。
你对如何实现这一点有什么建议吗?我可以在 def data() 中完成所有预处理,然后在整个数据集上优化/交叉验证参数吗?这样做是否会影响找到正确参数的过程?(我另外留有一个用于最终模型的测试数据集)
有没有办法手动实现这一点?
def data():
# Data provider handed to optim.minimize(data=...) further down.
# Loads the feature frame and the label frame from two feather files
# (relative paths) and hands both back as an (X, Y) pair.
# NOTE(review): Hyperas appears to copy this body textually into its
# generated script, so the variable names below presumably have to match
# the parameter names of hyper_model -- confirm before renaming anything.
import pandas as pd
import feather
df_hyper_X = feather.read_dataframe('df_hyper_X_train.feather')
df_hyper_Y = feather.read_dataframe('df_hyper_Y_train.feather')
return df_hyper_X, df_hyper_Y
def hyper_model(df_hyper_X,df_hyper_Y):
    """Hyperas objective: 10-fold cross-validated precision of a dense network.

    Scaling and SMOTETomek resampling are steps of the pipeline given to
    ``cross_val_score``, so they are fitted on the training part of every
    fold only. The network is supplied to ``KerasClassifier`` as a builder
    callable, which lets each fold start from freshly initialised weights.

    Args:
        df_hyper_X: feature frame; must contain the columns 'pre_grade',
            'math' and 'time'. All remaining columns are passed through
            unscaled.
        df_hyper_Y: target aligned with ``df_hyper_X`` (treated as binary:
            sigmoid output, binary cross-entropy loss).

    Returns:
        dict with 'loss' (negated mean precision over the folds), 'status'
        and 'model' (a compiled but untrained network built from the sampled
        hyper-parameters; the fitted per-fold clones are not kept).
    """
    stdscl_features = ['pre_grade', 'math']
    # Must be a list: a bare string makes ColumnTransformer pass a 1-D column,
    # which MinMaxScaler rejects.
    normscl_features = ['time']
    stdscl_transformer = Pipeline(steps=[('stdscaler', StandardScaler())])
    normscl_transformer = Pipeline(steps=[('normscaler', MinMaxScaler())])
    preprocessor = ColumnTransformer(
        transformers=[
            ('stdscl', stdscl_transformer, stdscl_features),
            ('minmaxscl', normscl_transformer, normscl_features),
        ],
        remainder='passthrough',
    )

    # The scalers and the passthrough keep the column count and SMOTETomek only
    # adds/removes rows, so the network input width equals the frame width.
    n_features = df_hyper_X.shape[1]

    def build_model():
        """Build and compile a fresh network for the sampled hyper-parameters."""
        metrics = [
            tf.keras.metrics.TruePositives(name='tp'),
            tf.keras.metrics.FalsePositives(name='fp'),
            tf.keras.metrics.TrueNegatives(name='tn'),
            tf.keras.metrics.FalseNegatives(name='fn'),
            tf.keras.metrics.BinaryAccuracy(name='accuracy'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.AUC(name='auc'),
        ]
        model = tf.keras.Sequential()
        # Author's rule of thumb: pair ReLU with He-uniform initialisation.
        # Disabled experiments (L2 kernel regulariser, LeakyReLU, a third
        # hidden layer) are intentionally left out.
        model.add(Dense({{choice([2,4,8,16,32,64])}},
                        activation={{choice(['relu', 'sigmoid', 'tanh', 'elu', 'selu'])}},
                        kernel_initializer={{choice(['lecun_uniform','glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'])}},
                        input_shape=(n_features,)))
        model.add(Dropout({{uniform(0, 1)}}))
        # Optional second hidden layer.
        if ({{choice(['one', 'two'])}}) == 'two':
            model.add(Dense({{choice([2,4,8,16,32,64])}}, activation={{choice(['relu', 'sigmoid', 'tanh', 'elu', 'selu'])}}))
            model.add(Dropout({{uniform(0, 1)}}))
        model.add(Dense(1, activation='sigmoid'))

        # Every optimizer has its own learning-rate hyper-parameter; only the
        # one picked by opti_choice is used for compilation.
        adam = tf.keras.optimizers.Adam(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        nadam = tf.keras.optimizers.Nadam(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adamax = tf.keras.optimizers.Adamax(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adagrad = tf.keras.optimizers.Adagrad(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adadelta = tf.keras.optimizers.Adadelta(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        sgd = tf.keras.optimizers.SGD(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        rmsprop = tf.keras.optimizers.RMSprop(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        opti_choice = {{choice(['adam', 'nadam', 'adamax','adagrad', 'adadelta', 'sgd','rmsprop'])}}
        optimizers = {
            'adam': adam,
            'nadam': nadam,
            'adamax': adamax,
            'adagrad': adagrad,
            'adadelta': adadelta,
            'sgd': sgd,
        }
        # Anything not listed above falls back to RMSprop, as before.
        optimizer = optimizers.get(opti_choice, rmsprop)
        model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)
        return model

    kfold = KFold(n_splits=10, shuffle=True, random_state=3)
    # The wrapper receives the builder (not a built model) so that every
    # cross-validation clone constructs its own network.
    imba_pipeline = make_pipeline(
        preprocessor,
        SMOTETomek(sampling_strategy='auto', random_state=2),
        KerasClassifier(build_model, epochs={{choice([20,30,40,50,60,70])}}, batch_size={{choice([16,32, 64, 128])}}, verbose=0),
    )
    results = cross_val_score(imba_pipeline, df_hyper_X, df_hyper_Y, cv=kfold, scoring='precision').mean()
    print('Precision', results)
    # Hyperopt minimises, hence the negated precision.
    return {'loss': -results, 'status': STATUS_OK, 'model': build_model()}
if __name__ == '__main__':
    # Run the TPE search over the templates in hyper_model (30 evaluations).
    best_run, best_model = optim.minimize(model=hyper_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=30,
                                          trials=Trials(),
                                          notebook_name='drive/My Drive/Colab Notebooks/final_NL_EU_Non-EU')
    # data() yields exactly two frames (features, labels); unpacking four
    # values here raised ValueError before any result was printed.
    df_hyper_X, df_hyper_Y = data()
    # No evaluate() call: data() loads no held-out test split, and the raw
    # frames are not scaled the way the training folds are.
    print("Best performing model chosen hyper-parameters:")
    print(best_run)
解决了。如果有人感兴趣,这是解决方案:
def data():
# Data provider handed to optim.minimize(data=...) further down.
# Loads the feature frame and the label frame from two feather files
# (relative paths) and hands both back as an (X, Y) pair.
# NOTE(review): Hyperas appears to copy this body textually into its
# generated script, so the variable names below presumably have to match
# the parameter names of hyper_model -- confirm before renaming anything.
import pandas as pd
import feather
df_hyper_X = feather.read_dataframe('df_hyper_X_train.feather')
df_hyper_Y = feather.read_dataframe('df_hyper_Y_train.feather')
return df_hyper_X, df_hyper_Y
def hyper_model(df_hyper_X,df_hyper_Y):
    """Hyperas objective: manual 10-fold cross-validated precision.

    For every fold the column scalers are fitted on the training rows only,
    the training rows are resampled with SMOTETomek, and a freshly built
    network is trained and scored on the untouched validation rows. Building
    a new network per fold keeps weights learned on one fold's training rows
    (which include later folds' validation rows) from inflating the score.

    Args:
        df_hyper_X: feature frame; must contain the columns 'pre_grade',
            'math' and 'time'. All remaining columns are passed through
            unscaled.
        df_hyper_Y: target aligned with ``df_hyper_X`` (treated as binary:
            sigmoid output, binary cross-entropy loss).

    Returns:
        dict with 'loss' (negated mean precision over the folds), 'status'
        and 'model' (the network trained on the last fold).
    """
    ct = ColumnTransformer(
        [
            ('ct_std', StandardScaler(), ['pre_grade', 'math']),
            ('ct_minmax', MinMaxScaler(), ['time']),
        ],
        remainder='passthrough',
    )
    smt = SMOTETomek(sampling_strategy='auto', random_state=2)
    kfold = KFold(n_splits=10, shuffle=True, random_state=3)

    def build_model(n_features):
        """Build and compile a fresh network expecting n_features inputs."""
        metrics = [
            tf.keras.metrics.TruePositives(name='tp'),
            tf.keras.metrics.FalsePositives(name='fp'),
            tf.keras.metrics.TrueNegatives(name='tn'),
            tf.keras.metrics.FalseNegatives(name='fn'),
            tf.keras.metrics.BinaryAccuracy(name='accuracy'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.AUC(name='auc'),
        ]
        net = tf.keras.Sequential()
        net.add(Dense({{choice([2,4,8,16,32,64])}},
                      activation={{choice(['relu', 'sigmoid', 'tanh', 'elu', 'selu'])}},
                      kernel_initializer={{choice(['lecun_uniform','glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'])}},
                      input_shape=(n_features,)))
        net.add(Dropout({{uniform(0, 0.5)}}))
        # Optional second hidden layer.
        if ({{choice(['one', 'two'])}}) == 'two':
            net.add(Dense({{choice([2,4,8,16,32,64])}}, activation={{choice(['relu', 'sigmoid', 'tanh', 'elu', 'selu'])}}))
            net.add(Dropout({{uniform(0, 0.5)}}))
        net.add(Dense(1, activation='sigmoid'))

        # Every optimizer has its own learning-rate hyper-parameter; only the
        # one picked by opti_choice is used for compilation.
        adam = tf.keras.optimizers.Adam(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        nadam = tf.keras.optimizers.Nadam(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adamax = tf.keras.optimizers.Adamax(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adagrad = tf.keras.optimizers.Adagrad(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        adadelta = tf.keras.optimizers.Adadelta(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        sgd = tf.keras.optimizers.SGD(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        rmsprop = tf.keras.optimizers.RMSprop(lr={{choice([0.0001, 0.001, 0.01, 0.1])}})
        opti_choice = {{choice(['adam', 'nadam', 'adamax','adagrad', 'adadelta', 'sgd','rmsprop'])}}
        optimizers = {
            'adam': adam,
            'nadam': nadam,
            'adamax': adamax,
            'adagrad': adagrad,
            'adadelta': adadelta,
            'sgd': sgd,
        }
        # Anything not listed above falls back to RMSprop, as before.
        optimizer = optimizers.get(opti_choice, rmsprop)
        net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)
        return net

    scores = []
    model = None
    for train_fold_index, val_fold_index in kfold.split(df_hyper_X, df_hyper_Y):
        X_train_fold, y_train_fold = df_hyper_X.iloc[train_fold_index], df_hyper_Y.iloc[train_fold_index]
        X_val_fold, y_val_fold = df_hyper_X.iloc[val_fold_index], df_hyper_Y.iloc[val_fold_index]
        # Fit the scalers on the training rows only, then reuse them on the
        # validation rows so no validation statistics leak into training.
        X_train_fold = ct.fit_transform(X_train_fold)
        X_val_fold = ct.transform(X_val_fold)
        # Resample the training rows only; validation keeps its real class mix.
        X_train_smtk, y_train_smtk = smt.fit_resample(X_train_fold, y_train_fold)
        # Fresh weights and optimizer state for every fold; the input width is
        # read from the transformed matrix instead of being hard-coded.
        model = build_model(X_train_smtk.shape[1])
        model.fit(X_train_smtk, y_train_smtk, epochs={{choice([20,30,40,50,60,70])}}, batch_size={{choice([16,32, 64, 128])}})
        predicts = model.predict(X_val_fold)
        # Sigmoid outputs are rounded to 0/1 labels before scoring.
        scores.append(precision_score(y_val_fold, predicts.round()))
    avg_score = np.mean(scores)
    print('Precision', avg_score)
    # Hyperopt minimises, hence the negated precision.
    return {'loss': -avg_score, 'status': STATUS_OK, 'model': model}
if __name__ == '__main__':
    # Settings for the TPE search over the templates in hyper_model; only two
    # evaluations are requested here.
    search_settings = {
        'model': hyper_model,
        'data': data,
        'algo': tpe.suggest,
        'max_evals': 2,
        'trials': Trials(),
        'notebook_name': 'drive/My Drive/Colab Notebooks/final_NL_EU_Non-EU',
    }
    best_run, best_model = optim.minimize(**search_settings)
    # Reload the two training frames at module level after the search.
    df_hyper_X, df_hyper_Y = data()
    print("Best performing model chosen hyper-parameters:")
    print(best_run)