从 GridSearchCV 中获取用于计算 Shapley 值的模型结果
Retrieve Model Results for Shapley values from GridSearch CV
我使用 GridSearchCV 调整了一个模型。现在我想计算 Shapley 值并将它们可视化。困难在于 shap
包期望接收的是一个模型,而不是 GridSearch 的结果。同样,当我把 best_estimator_ 属性传给它时也不行,它提示不支持该模型。我该如何从 GridSearchCV(或以其他方式)得到可用于计算 Shapley 值的模型?我的其中一列是分类变量,因此需要进行预处理。因为我有网格搜索得到的 best_params,所以我可以直接把模型作为 xgboost_regressor 模型来运行,但距离我上次在没有预处理的情况下这样做已经有一段时间了。
from xgboost import XGBRegressor as xgr
# Pipeline pieces: one-hot encode the categorical column at index 0, pass
# every other column through unchanged, then feed an XGBoost regressor.
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(), [0])],
    remainder="passthrough",
)
model = xgr(booster="gbtree", random_state=13)
mymodel = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Hyper-parameter grid; the "model__" prefix routes each entry to the
# pipeline step named "model".
params = {
    "model__n_estimators": [1500, 2000],
    "model__learning_rate": [0.1, 0.2, 0.3],
    "model__gamma": [0, 0.005, 0.01],
    "model__lambda": [0.1, 0.2, 0.3],
    "model__alpha": [0, 0.001, 0.05],
    "model__max_depth": [6],
    "model__min_child_weight": [1],
    "model__subsample": [0.8],
}

# 5-fold shuffled cross-validation, scored by negative mean absolute error,
# using every available core.
cv_inner = KFold(n_splits=5, shuffle=True)
optimize_hparams = GridSearchCV(
    estimator=mymodel,
    param_grid=params,
    n_jobs=-1,
    cv=cv_inner,
    scoring="neg_mean_absolute_error",
)
optimize_hparams.fit(X, y)
import shap
# Take the regressor step out of the best pipeline found by the grid search
# and hand it to SHAP's tree explainer.
# NOTE(review): X is the raw input here, not the output of the pipeline's
# "preprocessor" step that the "model" step was fitted on.
fitted_regressor = optimize_hparams.best_estimator_["model"]
explainer = shap.TreeExplainer(fitted_regressor)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X, plot_type="bar")
在计算 Shap 值之前,需要从网格搜索的最佳估计器(best_estimator_)中分别取出预处理器和模型:先用预处理器对数据进行编码,再把模型和编码后的数据一起传给 shap。模型是在编码后的特征上训练的,因此不能直接传入含有未编码分类列的原始 X。请参见下面的代码示例。
import shap
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor as xgr
# Synthetic regression problem: 100 samples with 5 numeric features.
X, y = make_regression(n_samples=100, n_features=5, random_state=100)
# Prepend a random categorical column drawn from 'a', 'b', 'c'.
# NOTE: stacking a string column with float columns makes NumPy coerce the
# whole array to a string dtype.
category_column = np.random.choice(['a', 'b', 'c'], size=(X.shape[0], 1))
X = np.hstack([category_column, X])
# Preprocessing: one-hot encode column 0 (the categorical feature) and pass
# the remaining columns through untouched.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), [0])],
    remainder='passthrough',
)
# Estimator and the two-step pipeline that the grid search will tune.
model = xgr(booster='gbtree', random_state=13)
mymodel = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)
# Candidate hyper-parameters; "model__" addresses the pipeline step "model".
# NOTE(review): `lambda`/`alpha` are the native XGBoost names, while the
# scikit-learn wrapper documents `reg_lambda`/`reg_alpha` - confirm the
# installed xgboost version accepts these aliases.
params = {
    'model__n_estimators': [1500, 2000],
    'model__learning_rate': [0.1, 0.2, 0.3],
    'model__gamma': [0, 0.005, 0.01],
    'model__lambda': [0.1, 0.2, 0.3],
    'model__alpha': [0, 0.001, 0.05],
    'model__max_depth': [6],
    'model__min_child_weight': [1],
    'model__subsample': [0.8],
}
# Shuffled 5-fold CV, scored by negative mean absolute error, on all cores.
cv_inner = KFold(n_splits=5, shuffle=True)
optimize_hparams = GridSearchCV(
    estimator=mymodel,
    param_grid=params,
    cv=cv_inner,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
# run the grid search; with the default refit=True, GridSearchCV refits the
# best pipeline on all of (X, y) and exposes it as best_estimator_
optimize_hparams.fit(X, y)
best_pipeline = optimize_hparams.best_estimator_
# encode the features with the already-fitted preprocessor; calling
# fit_transform here would only repeat the fit GridSearchCV has done
X_encoded = best_pipeline['preprocessor'].transform(X)
# reuse the already-fitted regressor instead of training it a second time
best_model = best_pipeline['model']
# calculate the Shap values on the same encoded features the model was
# trained on (the raw X still contains the un-encoded categorical column)
shap_values = shap.TreeExplainer(best_model).shap_values(X_encoded)
# plot the Shap values
shap.summary_plot(shap_values, X_encoded, plot_type='bar')
我使用 GridSearchCV 调整了一个模型。现在我想计算 Shapley 值并将它们可视化。困难在于 shap
包期望接收的是一个模型,而不是 GridSearch 的结果。同样,当我把 best_estimator_ 属性传给它时也不行,它提示不支持该模型。我该如何从 GridSearchCV(或以其他方式)得到可用于计算 Shapley 值的模型?我的其中一列是分类变量,因此需要进行预处理。因为我有网格搜索得到的 best_params,所以我可以直接把模型作为 xgboost_regressor 模型来运行,但距离我上次在没有预处理的情况下这样做已经有一段时间了。
from xgboost import XGBRegressor as xgr
# Pipeline pieces: one-hot encode the categorical column at index 0, pass
# every other column through unchanged, then feed an XGBoost regressor.
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(), [0])],
    remainder="passthrough",
)
model = xgr(booster="gbtree", random_state=13)
mymodel = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Hyper-parameter grid; the "model__" prefix routes each entry to the
# pipeline step named "model".
params = {
    "model__n_estimators": [1500, 2000],
    "model__learning_rate": [0.1, 0.2, 0.3],
    "model__gamma": [0, 0.005, 0.01],
    "model__lambda": [0.1, 0.2, 0.3],
    "model__alpha": [0, 0.001, 0.05],
    "model__max_depth": [6],
    "model__min_child_weight": [1],
    "model__subsample": [0.8],
}

# 5-fold shuffled cross-validation, scored by negative mean absolute error,
# using every available core.
cv_inner = KFold(n_splits=5, shuffle=True)
optimize_hparams = GridSearchCV(
    estimator=mymodel,
    param_grid=params,
    n_jobs=-1,
    cv=cv_inner,
    scoring="neg_mean_absolute_error",
)
optimize_hparams.fit(X, y)
import shap
# Take the regressor step out of the best pipeline found by the grid search
# and hand it to SHAP's tree explainer.
# NOTE(review): X is the raw input here, not the output of the pipeline's
# "preprocessor" step that the "model" step was fitted on.
fitted_regressor = optimize_hparams.best_estimator_["model"]
explainer = shap.TreeExplainer(fitted_regressor)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X, plot_type="bar")
在计算 Shap 值之前,需要从网格搜索的最佳估计器(best_estimator_)中分别取出预处理器和模型:先用预处理器对数据进行编码,再把模型和编码后的数据一起传给 shap。模型是在编码后的特征上训练的,因此不能直接传入含有未编码分类列的原始 X。请参见下面的代码示例。
import shap
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor as xgr
# Synthetic regression problem: 100 samples with 5 numeric features.
X, y = make_regression(n_samples=100, n_features=5, random_state=100)
# Prepend a random categorical column drawn from 'a', 'b', 'c'.
# NOTE: stacking a string column with float columns makes NumPy coerce the
# whole array to a string dtype.
category_column = np.random.choice(['a', 'b', 'c'], size=(X.shape[0], 1))
X = np.hstack([category_column, X])
# Preprocessing: one-hot encode column 0 (the categorical feature) and pass
# the remaining columns through untouched.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), [0])],
    remainder='passthrough',
)
# Estimator and the two-step pipeline that the grid search will tune.
model = xgr(booster='gbtree', random_state=13)
mymodel = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)
# Candidate hyper-parameters; "model__" addresses the pipeline step "model".
# NOTE(review): `lambda`/`alpha` are the native XGBoost names, while the
# scikit-learn wrapper documents `reg_lambda`/`reg_alpha` - confirm the
# installed xgboost version accepts these aliases.
params = {
    'model__n_estimators': [1500, 2000],
    'model__learning_rate': [0.1, 0.2, 0.3],
    'model__gamma': [0, 0.005, 0.01],
    'model__lambda': [0.1, 0.2, 0.3],
    'model__alpha': [0, 0.001, 0.05],
    'model__max_depth': [6],
    'model__min_child_weight': [1],
    'model__subsample': [0.8],
}
# Shuffled 5-fold CV, scored by negative mean absolute error, on all cores.
cv_inner = KFold(n_splits=5, shuffle=True)
optimize_hparams = GridSearchCV(
    estimator=mymodel,
    param_grid=params,
    cv=cv_inner,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
# run the grid search; with the default refit=True, GridSearchCV refits the
# best pipeline on all of (X, y) and exposes it as best_estimator_
optimize_hparams.fit(X, y)
best_pipeline = optimize_hparams.best_estimator_
# encode the features with the already-fitted preprocessor; calling
# fit_transform here would only repeat the fit GridSearchCV has done
X_encoded = best_pipeline['preprocessor'].transform(X)
# reuse the already-fitted regressor instead of training it a second time
best_model = best_pipeline['model']
# calculate the Shap values on the same encoded features the model was
# trained on (the raw X still contains the un-encoded categorical column)
shap_values = shap.TreeExplainer(best_model).shap_values(X_encoded)
# plot the Shap values
shap.summary_plot(shap_values, X_encoded, plot_type='bar')