网格搜索(GridSearch)中的 best_params_
Best_params in GridSearch
我使用 grid_search 来找到参数的最佳组合,我制作了一个图表来查看当参数改变时分数是如何变化的。
当我运行 gs_clf.best_params_ 时,得到的最佳参数组合如下:
{'learning_rate': 0.01, 'n_estimators': 200}
我不明白为什么验证图没有显示此参数组合的最佳分数?
下面提供了我的代码。
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, average_precision_score, recall_score, f1_score, precision_recall_curve, auc, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import numpy as np
# Base estimator: gradient boosting with fixed tree settings and a fixed seed.
clf = GradientBoostingClassifier(
    min_samples_split=300,
    max_depth=4,
    random_state=0,
)
# Shuffled, stratified 5-fold splitter used for cross-validation.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Candidate values searched for each hyper-parameter.
number_of_estimators = [20, 200]
LR = [0.01, 1]

# Grid search over every (n_estimators, learning_rate) pair, scored by
# accuracy; train scores are kept so they can be plotted afterwards.
grid = GridSearchCV(
    clf,
    param_grid={'n_estimators': number_of_estimators, 'learning_rate': LR},
    cv=kfold,
    return_train_score=True,
    scoring='accuracy',
    pre_dispatch='1*n_jobs',
    n_jobs=1,
)
# Fit the grid search on the training dataset.
gs_clf = grid.fit(X_train, Y_train.values.ravel())
# Plot the mean train and validation accuracy stored by the grid search.
#
# NOTE: this is the plotting code the question is about, so its axis handling
# is kept exactly as posted. GridSearchCV enumerates the grid with parameter
# names in sorted order ('learning_rate' before 'n_estimators') and the last
# name varying fastest, so cv_results_ is ordered
#   (lr=0.01, n=20), (lr=0.01, n=200), (lr=1, n=20), (lr=1, n=200).
# Reshaping to (len(number_of_estimators), len(LR)) therefore produces rows
# that are really indexed by learning rate, which mislabels the curves below.
# The corrected plotting code further down reshapes to
# (len(LR), len(number_of_estimators)) instead.

# Train scores.
scores = [x for x in gs_clf.cv_results_['mean_train_score']]
scores = np.array(scores).reshape(len(number_of_estimators), len(LR))
for ind, i in enumerate(number_of_estimators):
    plt.plot(LR, scores[ind], label='Number_of_estimators: ' + str(i))
plt.legend()
plt.xlabel('Learning rate')
plt.ylabel('Mean score')
plt.title('Train score')
plt.show()

# Validation scores.
scores = [x for x in gs_clf.cv_results_['mean_test_score']]
scores = np.array(scores).reshape(len(number_of_estimators), len(LR))
for ind, i in enumerate(number_of_estimators):
    plt.plot(LR, scores[ind], label='Number_of_estimators: ' + str(i))
plt.legend()
plt.xlabel('Learning rate')
plt.ylabel('Mean score')
plt.title('Validation score')
plt.show()

# Best parameter combination found by the search; the fitted attribute carries
# a trailing underscore (`best_params_`).
gs_clf.best_params_
我得到的图表如下:
Train score plot
Validation score plot
问题实际上出在我在图表上显示数字的方式上。这是绘图的正确代码:
# Plot mean train and validation accuracy against the number of estimators,
# drawing one curve per learning rate.
#
# GridSearchCV enumerates the grid with parameter names in sorted order
# ('learning_rate' before 'n_estimators') and the last name varying fastest,
# so reshaping to (len(LR), len(number_of_estimators)) gives one row per
# learning rate and one column per n_estimators value.
for result_key, title in (
    ('mean_train_score', 'Train score'),        # TRAIN DATA
    ('mean_test_score', 'Validation score'),    # VALIDATION DATA
):
    scores = gs_clf.cv_results_[result_key]
    scores = np.array(scores).reshape(len(LR), len(number_of_estimators))
    for ind, i in enumerate(LR):
        plt.plot(number_of_estimators, scores[ind], label='Learning rate: ' + str(i))
    plt.legend()
    plt.xlabel('Number_of_estimators')
    plt.ylabel('Mean score')
    plt.title(title)
    plt.show()
我使用 grid_search 来找到参数的最佳组合,我制作了一个图表来查看当参数改变时分数是如何变化的。 当我运行 gs_clf.best_params_ 时,得到的最佳参数组合如下: {'learning_rate': 0.01, 'n_estimators': 200} 我不明白为什么验证图没有显示此参数组合的最佳分数?
下面提供了我的代码。
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, average_precision_score, recall_score, f1_score, precision_recall_curve, auc, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import numpy as np
# Base estimator: gradient boosting with fixed tree settings and a fixed seed.
clf = GradientBoostingClassifier(
    min_samples_split=300,
    max_depth=4,
    random_state=0,
)
# Shuffled, stratified 5-fold splitter used for cross-validation.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Candidate values searched for each hyper-parameter.
number_of_estimators = [20, 200]
LR = [0.01, 1]

# Grid search over every (n_estimators, learning_rate) pair, scored by
# accuracy; train scores are kept so they can be plotted afterwards.
grid = GridSearchCV(
    clf,
    param_grid={'n_estimators': number_of_estimators, 'learning_rate': LR},
    cv=kfold,
    return_train_score=True,
    scoring='accuracy',
    pre_dispatch='1*n_jobs',
    n_jobs=1,
)
# Fit the grid search on the training dataset.
gs_clf = grid.fit(X_train, Y_train.values.ravel())
# Plot the mean train and validation accuracy stored by the grid search.
#
# NOTE: this is the plotting code the question is about, so its axis handling
# is kept exactly as posted. GridSearchCV enumerates the grid with parameter
# names in sorted order ('learning_rate' before 'n_estimators') and the last
# name varying fastest, so cv_results_ is ordered
#   (lr=0.01, n=20), (lr=0.01, n=200), (lr=1, n=20), (lr=1, n=200).
# Reshaping to (len(number_of_estimators), len(LR)) therefore produces rows
# that are really indexed by learning rate, which mislabels the curves below.
# The corrected plotting code further down reshapes to
# (len(LR), len(number_of_estimators)) instead.

# Train scores.
scores = [x for x in gs_clf.cv_results_['mean_train_score']]
scores = np.array(scores).reshape(len(number_of_estimators), len(LR))
for ind, i in enumerate(number_of_estimators):
    plt.plot(LR, scores[ind], label='Number_of_estimators: ' + str(i))
plt.legend()
plt.xlabel('Learning rate')
plt.ylabel('Mean score')
plt.title('Train score')
plt.show()

# Validation scores.
scores = [x for x in gs_clf.cv_results_['mean_test_score']]
scores = np.array(scores).reshape(len(number_of_estimators), len(LR))
for ind, i in enumerate(number_of_estimators):
    plt.plot(LR, scores[ind], label='Number_of_estimators: ' + str(i))
plt.legend()
plt.xlabel('Learning rate')
plt.ylabel('Mean score')
plt.title('Validation score')
plt.show()

# Best parameter combination found by the search; the fitted attribute carries
# a trailing underscore (`best_params_`).
gs_clf.best_params_
我得到的图表如下:
Train score plot
Validation score plot
问题实际上出在我在图表上显示数字的方式上。这是绘图的正确代码:
# Plot mean train and validation accuracy against the number of estimators,
# drawing one curve per learning rate.
#
# GridSearchCV enumerates the grid with parameter names in sorted order
# ('learning_rate' before 'n_estimators') and the last name varying fastest,
# so reshaping to (len(LR), len(number_of_estimators)) gives one row per
# learning rate and one column per n_estimators value.
for result_key, title in (
    ('mean_train_score', 'Train score'),        # TRAIN DATA
    ('mean_test_score', 'Validation score'),    # VALIDATION DATA
):
    scores = gs_clf.cv_results_[result_key]
    scores = np.array(scores).reshape(len(LR), len(number_of_estimators))
    for ind, i in enumerate(LR):
        plt.plot(number_of_estimators, scores[ind], label='Learning rate: ' + str(i))
    plt.legend()
    plt.xlabel('Number_of_estimators')
    plt.ylabel('Mean score')
    plt.title(title)
    plt.show()