网格搜索没有给出最佳参数

Question

在对逻辑回归和线性 SVM 的正则化强度倒数参数（C）以及 K 最近邻分类器的近邻数参数（n_neighbors）进行网格搜索时，我在同一训练数据集上手动训练并验证后发现，网格搜索得到的最佳参数并不是真正最好的。代码如下：

# Load the OpenML 'credit-g' dataset as a pandas DataFrame.
import pandas as pd
from sklearn.datasets import fetch_openml

df = fetch_openml('credit-g', as_frame=True).frame
# Preview the first five rows (shown as notebook cell output).
df.head(5)

# Inspect each column's dtype; the numeric/categorical split further down is based on dtypes.
df.dtypes

import matplotlib.pyplot as plt

# One figure holding histograms of three numeric features and of the target.
fig = plt.figure(figsize=(12, 12))
st = fig.suptitle("univariate distributions and target distribution", fontsize=20)

# Only these columns are needed for the plot.
nfeatures = df[['duration', 'credit_amount', 'age']]
target = df['class']

# A 4x4 GridSpec; each panel occupies one 2x2 quadrant of it.
grid = plt.GridSpec(4, 4, hspace=0.4, wspace=0.4)
upper, lower = slice(None, 2), slice(2, None)
left, right = slice(None, 2), slice(2, None)

p1 = fig.add_subplot(grid[upper, left])
p2 = fig.add_subplot(grid[upper, right])
p3 = fig.add_subplot(grid[lower, left])
p4 = fig.add_subplot(grid[lower, right])

# Pair each panel with the data it shows and its x-axis label.
panels = [
    (p1, nfeatures['duration'], 'duration'),
    (p2, nfeatures['credit_amount'], 'credit_amount'),
    (p3, nfeatures['age'], 'age'),
    (p4, target, 'class'),
]
for axis, values, _ in panels:
    axis.hist(values)
for axis, _, label in panels:
    axis.set_xlabel(label)

# Layout tweaks: position the title and leave room for it above the panels.
st.set_y(0.95)
fig.subplots_adjust(top=0.92)


from sklearn.model_selection import train_test_split

# Every column except the target 'class' is a feature.
columns = [column for column in df.columns if column != 'class']
X = df[columns]
y = df['class']

# Hold out 30% of the rows as a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)

# Basic preprocessing on the training set.
# Select every numeric dtype instead of only float64: depending on the
# scikit-learn version / ARFF parser, fetch_openml may return the numeric
# columns as integers, and a float64-only filter would then push them into
# categorical_columns and have them one-hot encoded downstream.
# An earlier hard-coded version of this list named: duration, credit_amount,
# installment_commitment, residence_since, age, existing_credits, num_dependents.
numeric_columns = df.select_dtypes(include='number').columns
categorical_columns = [column for column in columns if column not in numeric_columns]

# One-hot encode the categorical training columns and append the numeric ones unchanged.
temp = X_train[categorical_columns]
X_train_ohe = pd.concat([pd.get_dummies(temp), X_train[numeric_columns]], axis=1)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Baseline: logistic regression on the manually one-hot-encoded training data.
# max_iter is raised to 1000 -- presumably so the solver converges on unscaled features.
lr = LogisticRegression(max_iter=1000)

# Cross-validate with the library defaults (no cv or scoring argument is passed).
cr = cross_val_score(lr,X_train_ohe,y_train)

# Show the array of cross-validation scores.
print(cr)


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# First attempt: one-hot encode the categorical columns only.
# No `remainder` is given, so under ColumnTransformer's default the numeric
# columns are not passed through to the model (see the ColumnTransformer docs).
t1 = [('cat', OneHotEncoder(), categorical_columns)]
col_transform = ColumnTransformer(transformers=t1)
# The three classifiers compared throughout, keyed by a short name.
models = {'lr_model':LogisticRegression(max_iter=1000), 'lsvm_model':LinearSVC(max_iter=2500) , 'knn_model':KNeighborsClassifier()}

for name,model in models.items():
  # Chain preprocessing and the model so the encoder is re-fitted inside each CV split.
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
  # No cv or scoring argument is passed, so the library defaults apply.
  # For these classifiers the default score is a classification score
  # (accuracy per the scikit-learn docs), not MAE.
  score = cross_val_score(pipeline, X_train, y_train)
  # Report the mean score across folds for this model.
  print(name ,score.mean())

# Second attempt: one-hot encode the categorical columns AND standardise the numeric ones.
t2 = [('cat', OneHotEncoder(), categorical_columns), ('num', StandardScaler(), numeric_columns)]
# Rebinds col_transform; the pipelines built below and later in the script use this version.
col_transform = ColumnTransformer(transformers=t2)
# Re-run the same comparison with the new column transformer.
for name,model in models.items():
  # Chain preprocessing and the model so both transformers are re-fitted inside each CV split.
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
  # No cv or scoring argument is passed, so the library defaults apply.
  # For these classifiers the default score is a classification score
  # (accuracy per the scikit-learn docs), not MAE.
  score = cross_val_score(pipeline, X_train, y_train)
  # Report the mean score across folds for this model.
  print(name ,score.mean())

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

# Candidates are ranked by F1 computed with "bad" as the positive label.
f1_scorer = make_scorer(f1_score, pos_label="bad")

# Candidate values of C for the two linear models.
# (Scaler switches such as 'prep__num__with_mean' / 'prep__num__with_std'
# were considered for this grid but are not searched.)
param_grid = {'m__C': [0.1, 1.0, 0.01]}

# Candidate neighbour counts for the KNN model.
param_grid_knn = {'m__n_neighbors': [5, 10, 15]}

for name, model in models.items():
    # Preprocessing + model in one pipeline, so the 'm__' prefix addresses the model step.
    pipeline = Pipeline(steps=[('prep', col_transform), ('m', model)])

    # KNN is tuned over n_neighbors; every other model is tuned over C.
    search_space = param_grid_knn if name == 'knn_model' else param_grid

    # 5-fold cross-validated search, scored with f1_scorer.
    grid_clf = GridSearchCV(pipeline, search_space, cv=5, scoring=f1_scorer)
    grid_clf.fit(X_train, y_train)

    print(name, grid_clf.best_params_)
    # NOTE(review): .score() on the best pipeline uses the estimator's own
    # default score, not f1_scorer -- so this number is on a different scale
    # from the one the search optimised.
    print(name, grid_clf.best_estimator_.score(X_test, y_test))

# Manual check for logistic regression: one training-set F1 value per candidate C.
lr_array = []
lr_c = [0.01,0.1,1]

for c in lr_c:
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
  # Fit on the full training set ...
  pipeline.fit(X_train,y_train)
  # ... and predict on that same training set, so no held-out data is involved.
  y_hat = pipeline.predict(X_train)
  # F1 with "bad" as the positive label, computed on the training data itself.
  lr_array.append(f1_score(y_train,y_hat,pos_label="bad"))


# Manual check for the linear SVM: one training-set F1 value per candidate C.
lsvm_array = []
lsvm_c = [0.01,0.1,1]

for c in lsvm_c:
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', LinearSVC(dual=True,max_iter=2500,C=c))])
  # Fit on the full training set ...
  pipeline.fit(X_train,y_train)
  # ... and predict on that same training set, so no held-out data is involved.
  y_hat = pipeline.predict(X_train)
  # F1 with "bad" as the positive label, computed on the training data itself.
  lsvm_array.append(f1_score(y_train,y_hat,pos_label="bad"))


# Manual check for KNN: one training-set F1 value per candidate n_neighbors.
knn_array = []
knn_n = [5,10,15]

for n in knn_n:
  pipeline = Pipeline(steps=[('prep',col_transform), ('m', KNeighborsClassifier(n_neighbors=n))])
  # Fit on the full training set ...
  pipeline.fit(X_train,y_train)
  # ... and predict on that same training set, so no held-out data is involved.
  y_hat = pipeline.predict(X_train)
  # F1 with "bad" as the positive label, computed on the training data itself.
  knn_array.append(f1_score(y_train,y_hat,pos_label="bad"))

# Three stacked scatter plots: hyperparameter value (x) against training-set F1 (y).
fig = plt.figure(figsize=(12, 12))
# A 3x1 GridSpec, one row per model.
grid = plt.GridSpec(3, 1, hspace=0.4, wspace=0.4)

# Create the panels top to bottom: logistic regression, linear SVM, KNN.
p1, p2, p3 = [fig.add_subplot(grid[row, :]) for row in range(3)]

series = (
    (p1, lr_c, lr_array),
    (p2, lsvm_c, lsvm_array),
    (p3, knn_n, knn_array),
)
for axis, param_values, scores in series:
    axis.scatter(param_values, scores)

当使用不同的评分指标、并在测试集而不是训练集上进行评估时，趋势会发生变化，但网格搜索和手动验证得到的最佳参数似乎永远不会相同。这可能是什么原因？例如，运行上面的代码时，网格搜索给出的 n_neighbors 最佳值是 10，但最后的图表显示 5 的效果更好。是不是我的比较方式不正确？您可以在此链接查看带有输出的运行结果：https://github.com/binodmathews93/AppliedMachineLearningCourse/blob/master/Applied_Machine_Learning_Homework_2.ipynb

Answer 1

超参数调整是在验证（开发）集上执行的，而不是在训练集上执行的。

带交叉验证的网格搜索（GridSearchCV）使用 K 折（K-Fold）策略来构建验证集，该验证集只用于验证，不参与训练。

而您在手动验证时，是在同一个数据集上既做训练又做验证，这种做法是不正确的。

pipeline = Pipeline(steps=[('prep',col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
pipeline.fit(X_train,y_train)       # <- here is the problem: the model is fitted on X_train ...
y_hat = pipeline.predict(X_train)   # ... and then evaluated on the very same X_train
lr_array.append(f1_score(y_train,y_hat,pos_label="bad"))

这样只会选出能提高训练集性能的超参数，而这并不是你想要的（你想要的是一组能在测试集上取得良好性能的超参数——也就是泛化能力好）。

这也是为什么手动验证时较小的 K（KNN 中的近邻数）表现更好的原因——较小的 K 意味着更弱的“正则化”，因此从训练集的角度看它是最优的，但这个结论并不正确。

如果要手动验证结果，需要自己构建一个验证集（训练时不要使用它），或者手动执行 K 折交叉验证（K-fold cross-validation）流程。

网格搜索没有给出最佳参数

Grid search not giving the best parameters

python

machine-learning

scikit-learn

grid-search

google-colaboratory