使用 XGBoost 和 hyperopt 进行交叉验证和参数调整

Question

使用 XGB 模型进行嵌套交叉验证的一种方法是：

from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBClassifier

# Let's assume that we have some data for a binary classification
# problem : X (n_samples, n_features) and y (n_samples,)...

gs = GridSearchCV(estimator=XGBClassifier(), 
                  param_grid={'max_depth': [3, 6, 9], 
                              'learning_rate': [0.001, 0.01, 0.05]}, 
                  cv=2)
scores = cross_val_score(gs, X, y, cv=2)

但是，关于 XGB 参数的调整，几个教程（例如 this one) take advantage of the Python hyperopt 库。我希望能够使用 hyperopt 进行嵌套交叉验证（如上所述）来调整 XGB 参数。

为此，我编写了自己的 Scikit-Learn 估算器：

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.exceptions import NotFittedError
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier


def optimize_params(X, y, params_space, validation_split=0.2):
     """Estimate a set of 'best' model parameters."""
     # Split X, y into train/validation
     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=validation_split, stratify=y)

    # Estimate XGB params
    def objective(_params):
        _clf = XGBClassifier(n_estimators=10000,
                             max_depth=int(_params['max_depth']),
                             learning_rate=_params['learning_rate'],
                             min_child_weight=_params['min_child_weight'],
                             subsample=_params['subsample'],
                             colsample_bytree=_params['colsample_bytree'],
                             gamma=_params['gamma'])
        _clf.fit(X_train, y_train,
                 eval_set=[(X_train, y_train), (X_val, y_val)],
                 eval_metric='auc',
                 early_stopping_rounds=30)
        y_pred_proba = _clf.predict_proba(X_val)[:, 1]
        roc_auc = roc_auc_score(y_true=y_val, y_score=y_pred_proba)
        return {'loss': 1. - roc_auc, 'status': STATUS_OK}

    trials = Trials()
    return fmin(fn=objective,
                space=params_space,
                algo=tpe.suggest,
                max_evals=100,
                trials=trials,
                verbose=0)


class OptimizedXGB(BaseEstimator, ClassifierMixin):
    """XGB with optimized parameters.

    Parameters
    ----------
    custom_params_space : dict or None
        If not None, dictionary whose keys are the XGB parameters to be
        optimized and corresponding values are 'a priori' probability
        distributions for the given parameter value. If None, a default
        parameters space is used.
    """
    def __init__(self, custom_params_space=None):
        self.custom_params_space = custom_params_space

    def fit(self, X, y, validation_split=0.3):
        """Train a XGB model.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Data.

        y : ndarray, shape (n_samples,) or (n_samples, n_labels)
            Labels.

        validation_split : float (default: 0.3)
            Float between 0 and 1. Corresponds to the percentage of samples in X which will be used as validation data to estimate the 'best' model parameters.
        """
        # If no custom parameters space is given, use a default one.
        if self.custom_params_space is None:
            _space = {
                'learning_rate': hp.uniform('learning_rate', 0.0001, 0.05),
                'max_depth': hp.quniform('max_depth', 8, 15, 1),
                'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1),
                'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
                'gamma': hp.quniform('gamma', 0.9, 1, 0.05),
                'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 0.7, 0.05)
            }
        else:
            _space = self.custom_params_space

        # Estimate best params using X, y
        opt = optimize_params(X, y, _space, validation_split)

        # Instantiate `xgboost.XGBClassifier` with optimized parameters
        best = XGBClassifier(n_estimators=10000,
                             max_depth=int(opt['max_depth']),
                             learning_rate=opt['learning_rate'],
                             min_child_weight=opt['min_child_weight'],
                             subsample=opt['subsample'],
                             gamma=opt['gamma'],
                             colsample_bytree=opt['colsample_bytree'])
        best.fit(X, y)
        self.best_estimator_ = best
        return self

    def predict(self, X):
        """Predict labels with trained XGB model.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)

        Returns
        -------
        output : ndarray, shape (n_samples,) or (n_samples, n_labels)
        """
        if not hasattr(self, 'best_estimator_'):
            raise NotFittedError('Call `fit` before `predict`.')
        else:
            return self.best_estimator_.predict(X)

    def predict_proba(self, X):
        """Predict labels probaiblities with trained XGB model.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)

        Returns
        -------
        output : ndarray, shape (n_samples,) or (n_samples, n_labels)
        """
        if not hasattr(self, 'best_estimator_'):
            raise NotFittedError('Call `fit` before `predict_proba`.')
        else:
            return self.best_estimator_.predict_proba(X)

我的问题是：

这是一种有效的方法吗？例如，在我的 OptimizedXGB 的 fit 方法中，best.fit(X, y) 将在 X、y 上训练 XGB 模型。但是，这可能会导致过度拟合，因为没有指定 eval_set 来确保提前停止。
在玩具示例（著名的鸢尾花数据集）上，这个 OptimizedXGB 比基本的 LogisticRegression 分类器表现更差。这是为什么？是因为例子太简单了吗？请参阅下面的示例代码。

示例：

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)
X = X[:, :2]
X = X[y < 2]
y = y[y < 2]
skf = StratifiedKFold(n_splits=2, random_state=42)

# With a LogisticRegression classifier
pipe = Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())])
gs = GridSearchCV(estimator=pipe, param_grid={'lr__C': [1., 10.]})
lr_scores = cross_val_score(gs, X, y, cv=skf)

# With OptimizedXGB
xgb_scores = cross_val_score(OptimizedXGB(), X, y, cv=skf)

# Print results
print('Accuracy with LogisticRegression = %.4f (+/- %.4f)' % (np.mean(lr_scores), np.std(lr_scores)))
print('Accuracy with OptimizedXGB = %.4f (+/- %.4f)' % (np.mean(xgb_scores), np.std(xgb_scores)))

输出：

Accuracy with LogisticRegression = 0.9900 (+/- 0.0100)
Accuracy with OptimizedXGB = 0.9100 (+/- 0.0300)

虽然分数接近，但我希望 XGB 模型的分数至少与 LogisticRegression 分类器一样好。

编辑：

similar post

Answer 1

首先，检查这个 post - 可能有帮助 - 。

关于您的问题：

是的，这是正确的方法。一旦你选择了你的超参数，你应该在整个训练数据上拟合你的模型（选择的模型）。但是，由于此模型内部包含一个模型选择过程，因此您只能 "score" 它使用外部 CV 的泛化效果如何，就像您所做的那样。
由于您也在对选择过程进行评分（而不仅仅是模型，比如 XGB 与线性回归），因此选择过程可能存在一些问题。也许您的 hyper space 没有正确定义并且您选择的参数不正确？

使用 XGBoost 和 hyperopt 进行交叉验证和参数调整

Cross-validation and parameters tuning with XGBoost and hyperopt

python

machine-learning

scikit-learn

cross-validation

xgboost