Is there a way to extract the important features from XGBoost automatically and use them for prediction?


I am trying to develop a prediction model using XGBoost. My basic idea is to build an automated prediction model that uses the top 10 important features derived from the dataset (700+ rows and 90+ columns) and uses them to predict values. The input data is updated weekly, so next week's values should be predicted from the current week's data. I have already extracted the important features from my XGBoost model, but I am unable to automate that step because of an error.
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=100)
eval_set = [(X_train, y_train), (X_test, y_test)]
xg_reg = MyXGBRegressor(objective="reg:squarederror", colsample_bytree=0.3, learning_rate=0.01, max_depth=6, reg_alpha=15, n_estimators=1000, subsample=0.5)
xg_reg.fit(X_train, y_train, early_stopping_rounds=30, eval_metric=["rmse", "mae"], eval_set=eval_set, verbose=True)
predictions = xg_reg.predict(X_test)

The code above lets me run the regressor and get predicted values. The code below throws an error.

import xgboost as xgb
from xgboost import XGBRegressor

class MyXGBRegressor(XGBRegressor):
    @property
    def coef_(self):
        return None

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel

thresholds = np.sort(xg_reg.feature_importances_)

for thresh in thresholds:
    selection = SelectFromModel(xg_reg, threshold=thresh, prefit=True)
    selected_dataset = selection.transform(X_test)
    feature_idx = selection.get_support()
    feature_name = X.columns[feature_idx]
    selected_dataset = pd.DataFrame(selected_dataset)
    selected_dataset.columns = feature_name

The error is as follows:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-11-a42c3ed80da2> in <module>
      3 for thresh in thresholds:
      4     selection = SelectFromModel(xg_reg, threshold=thresh, prefit = True)
----> 5     selected_dataset = selection.transform(X_test)
      6 
      7 feature_idx = selection.get_support()

~\Anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in transform(self, X)
     86             force_all_finite=not _safe_tags(self, key="allow_nan"),
     87         )
---> 88         mask = self.get_support()
     89         if not mask.any():
     90             warn("No features were selected: either the data is"

~\Anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in get_support(self, indices)
     50             values are indices into the input feature vector.
     51         """
---> 52         mask = self._get_support_mask()
     53         return mask if not indices else np.where(mask)[0]
     54 

~\Anaconda3\lib\site-packages\sklearn\feature_selection\_from_model.py in _get_support_mask(self)
    186                              ' "prefit=True" while passing the fitted'
    187                              ' estimator to the constructor.')
--> 188         scores = _get_feature_importances(
    189             estimator=estimator, getter=self.importance_getter,
    190             transform_func='norm', norm_order=self.norm_order)

~\Anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in _get_feature_importances(estimator, getter, transform_func, norm_order)
    189         return importances
    190     elif transform_func == "norm":
--> 191         if importances.ndim == 1:
    192             importances = np.abs(importances)
    193         else:

AttributeError: 'NoneType' object has no attribute 'ndim'

The problem is that the coef_ attribute of MyXGBRegressor is set to None. If you use XGBRegressor instead of MyXGBRegressor, then SelectFromModel will use the feature_importances_ attribute of XGBRegressor and your code will work.
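If you need to keep the custom class, another option (assuming scikit-learn >= 0.24, where SelectFromModel accepts an importance_getter argument; your traceback references it, so your version should have it) is to tell the selector explicitly which attribute to read, so it never looks at the None coef_:

# read feature_importances_ directly instead of the default "auto"
# lookup, which finds the coef_ property first and gets None
selection = SelectFromModel(xg_reg, threshold=thresh, prefit=True,
                            importance_getter="feature_importances_")

The full example below uses a plain XGBRegressor: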

import numpy as np
from xgboost import XGBRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

# generate some data
X, y = make_regression(n_samples=1000, n_features=5, random_state=100)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# instantiate the model
model = XGBRegressor(objective="reg:squarederror", colsample_bytree=0.3, learning_rate=0.01, max_depth=6, reg_alpha=15, n_estimators=1000, subsample=0.5)

# fit the model
model.fit(X_train, y_train, early_stopping_rounds=30, eval_metric=["rmse", "mae"], eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

# extract the feature importances
thresholds = np.sort(model.feature_importances_)

# select the features
selection = SelectFromModel(model, threshold=thresholds[2], prefit=True)

feature_idx = selection.get_support()
print(feature_idx)
# [ True  True  True False False]

selected_dataset = selection.transform(X_test)
print(selected_dataset.shape)
# (200, 3)
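
Finally, to automate the top-10 workflow from your question, here is a minimal sketch. It assumes X and X_train are pandas DataFrames with named columns (as in your real dataset, not the NumPy arrays from make_regression above); X_current_week and top_n are hypothetical names, not part of any API:

import pandas as pd

top_n = 10

# rank the columns by importance and keep the names of the top 10
importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.nlargest(top_n).index

# refit the model on the reduced feature set
model.fit(X_train[top_features], y_train)

# predict next week's values from the current week's rows
# (X_current_week is a placeholder for your weekly input)
next_week_pred = model.predict(X_current_week[top_features])

Refitting on only the selected columns (rather than calling transform each time) keeps the weekly pipeline simple: each week you recompute the importances, keep the top 10 column names, and feed those same 10 columns of the new data to predict.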