有没有办法自动从 XGBoost 中提取重要特征并用于预测?
Is there a way to automatically extract the important features from XGBoost and use them for prediction?
我正在尝试使用 XGBoost 开发预测模型。我的基本想法是开发一个自动预测模型:从数据集(700 多行、90 多列)中选出最重要的前 10 个特征,并用它们来预测数值。输入数据每周更新一次,因此下周的预测应基于本周的数据得出。
我已经从我的 XGBoost 模型中提取了重要特征,但由于出现错误,我无法将这一过程自动化。
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; X and Y are defined outside the snippets shown here.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=100)
# Both splits are handed to fit() as evaluation sets.
eval_set = [(X_train, y_train), (X_test, y_test)]
# MyXGBRegressor is the XGBRegressor subclass defined in the next snippet.
xg_reg = MyXGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.01,max_depth = 6, reg_alpha = 15, n_estimators = 1000, subsample = 0.5)
# NOTE(review): fit() is expected to return the fitted estimator, so `predictions` presumably holds the model rather than predicted values — confirm.
predictions = xg_reg.fit(X_train,y_train, early_stopping_rounds=30, eval_metric=["rmse", "mae"], eval_set=eval_set, verbose=True)
以上代码可以让我运行回归器并预测数值,而以下代码会引发错误。
import xgboost as xgb
from xgboost import XGBRegressor
# XGBRegressor subclass used by the question; its `coef_` property always yields None.
# NOTE(review): SelectFromModel appears to look for `coef_` before `feature_importances_`,
# so this override is what hands None to scikit-learn in the traceback below.
class MyXGBRegressor(XGBRegressor):
@property
def coef_(self):
return None
# Importances sorted in ascending order; each value is tried as a selection threshold.
# NOTE(review): `np` and `pd` are not imported in the snippets shown — presumably imported elsewhere.
thresholds = np.sort(xg_reg.feature_importances_)
from sklearn.feature_selection import SelectFromModel
for thresh in thresholds:
# Wrap the already-fitted model (prefit=True) with the current threshold.
selection = SelectFromModel(xg_reg, threshold=thresh, prefit = True)
# This is the call that raises the AttributeError shown in the traceback below.
selected_dataset = selection.transform(X_test)
# Boolean mask over the input columns, used to look up the selected column names.
feature_idx = selection.get_support()
feature_name = X.columns[feature_idx]
# Rebuild a DataFrame labelled with the selected column names.
selected_dataset = pd.DataFrame(selected_dataset)
selected_dataset.columns = feature_name
错误如下:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-11-a42c3ed80da2> in <module>
3 for thresh in thresholds:
4 selection = SelectFromModel(xg_reg, threshold=thresh, prefit = True)
----> 5 selected_dataset = selection.transform(X_test)
6
7 feature_idx = selection.get_support()
~\Anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in transform(self, X)
86 force_all_finite=not _safe_tags(self, key="allow_nan"),
87 )
---> 88 mask = self.get_support()
89 if not mask.any():
90 warn("No features were selected: either the data is"
~\Anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in get_support(self, indices)
50 values are indices into the input feature vector.
51 """
---> 52 mask = self._get_support_mask()
53 return mask if not indices else np.where(mask)[0]
54
~\Anaconda3\lib\site-packages\sklearn\feature_selection\_from_model.py in _get_support_mask(self)
186 ' "prefit=True" while passing the fitted'
187 ' estimator to the constructor.')
--> 188 scores = _get_feature_importances(
189 estimator=estimator, getter=self.importance_getter,
190 transform_func='norm', norm_order=self.norm_order)
~\Anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in _get_feature_importances(estimator, getter, transform_func, norm_order)
189 return importances
190 elif transform_func == "norm":
--> 191 if importances.ndim == 1:
192 importances = np.abs(importances)
193 else:
AttributeError: 'NoneType' object has no attribute 'ndim'
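问题在于 `MyXGBRegressor` 的 `coef_` 属性被设置为 `None`。`SelectFromModel` 在默认的 `importance_getter="auto"` 下会优先读取 `coef_`,只有当估计器没有 `coef_` 时才会使用 `feature_importances_`,因此它拿到的是 `None`,从而触发上面的 `AttributeError`。如果您使用 `XGBRegressor` 而不是 `MyXGBRegressor`,那么 `SelectFromModel` 将使用 `XGBRegressor` 的 `feature_importances_` 属性,您的代码就能正常运行。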
import numpy as np
from xgboost import XGBRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
# number of top-ranked features to keep (the question asks for 10; this demo only has 5 features)
n_top_features = 3
# generate a small synthetic regression problem
X, y = make_regression(n_samples=1000, n_features=5, random_state=100)
# split the data, holding out 20% of the rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
# instantiate the plain XGBRegressor (no `coef_` override), so SelectFromModel can read `feature_importances_`
model = XGBRegressor(objective="reg:squarederror", colsample_bytree=0.3, learning_rate=0.01, max_depth=6, reg_alpha=15, n_estimators=1000, subsample=0.5)
# fit the model
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` / `eval_metric` on the
# constructor rather than on fit() — confirm against the installed version.
model.fit(X_train, y_train, early_stopping_rounds=30, eval_metric=["rmse", "mae"], eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)
# sort the feature importances in ascending order
thresholds = np.sort(model.feature_importances_)
# keep every feature at least as important as the n-th best one
# (tied importances can let more than `n_top_features` columns through)
selection = SelectFromModel(model, threshold=thresholds[-n_top_features], prefit=True)
feature_idx = selection.get_support()
print(feature_idx)
# [ True  True  True False False]
selected_dataset = selection.transform(X_test)
print(selected_dataset.shape)
# (200, 3)
我正在尝试使用 XGBoost 开发预测模型。我的基本想法是开发一个自动预测模型:从数据集(700 多行、90 多列)中选出最重要的前 10 个特征,并用它们来预测数值。输入数据每周更新一次,因此下周的预测应基于本周的数据得出。我已经从我的 XGBoost 模型中提取了重要特征,但由于出现错误,我无法将这一过程自动化。
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; X and Y are defined outside the snippets shown here.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=100)
# Both splits are handed to fit() as evaluation sets.
eval_set = [(X_train, y_train), (X_test, y_test)]
# MyXGBRegressor is the XGBRegressor subclass defined in the next snippet.
xg_reg = MyXGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.01,max_depth = 6, reg_alpha = 15, n_estimators = 1000, subsample = 0.5)
# NOTE(review): fit() is expected to return the fitted estimator, so `predictions` presumably holds the model rather than predicted values — confirm.
predictions = xg_reg.fit(X_train,y_train, early_stopping_rounds=30, eval_metric=["rmse", "mae"], eval_set=eval_set, verbose=True)
以上代码可以让我运行回归器并预测数值,而以下代码会引发错误。
import xgboost as xgb
from xgboost import XGBRegressor
# XGBRegressor subclass used by the question; its `coef_` property always yields None.
# NOTE(review): SelectFromModel appears to look for `coef_` before `feature_importances_`,
# so this override is what hands None to scikit-learn in the traceback below.
class MyXGBRegressor(XGBRegressor):
@property
def coef_(self):
return None
# Importances sorted in ascending order; each value is tried as a selection threshold.
# NOTE(review): `np` and `pd` are not imported in the snippets shown — presumably imported elsewhere.
thresholds = np.sort(xg_reg.feature_importances_)
from sklearn.feature_selection import SelectFromModel
for thresh in thresholds:
# Wrap the already-fitted model (prefit=True) with the current threshold.
selection = SelectFromModel(xg_reg, threshold=thresh, prefit = True)
# This is the call that raises the AttributeError shown in the traceback below.
selected_dataset = selection.transform(X_test)
# Boolean mask over the input columns, used to look up the selected column names.
feature_idx = selection.get_support()
feature_name = X.columns[feature_idx]
# Rebuild a DataFrame labelled with the selected column names.
selected_dataset = pd.DataFrame(selected_dataset)
selected_dataset.columns = feature_name
错误如下:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-11-a42c3ed80da2> in <module>
3 for thresh in thresholds:
4 selection = SelectFromModel(xg_reg, threshold=thresh, prefit = True)
----> 5 selected_dataset = selection.transform(X_test)
6
7 feature_idx = selection.get_support()
~\Anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in transform(self, X)
86 force_all_finite=not _safe_tags(self, key="allow_nan"),
87 )
---> 88 mask = self.get_support()
89 if not mask.any():
90 warn("No features were selected: either the data is"
~\Anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in get_support(self, indices)
50 values are indices into the input feature vector.
51 """
---> 52 mask = self._get_support_mask()
53 return mask if not indices else np.where(mask)[0]
54
~\Anaconda3\lib\site-packages\sklearn\feature_selection\_from_model.py in _get_support_mask(self)
186 ' "prefit=True" while passing the fitted'
187 ' estimator to the constructor.')
--> 188 scores = _get_feature_importances(
189 estimator=estimator, getter=self.importance_getter,
190 transform_func='norm', norm_order=self.norm_order)
~\Anaconda3\lib\site-packages\sklearn\feature_selection\_base.py in _get_feature_importances(estimator, getter, transform_func, norm_order)
189 return importances
190 elif transform_func == "norm":
--> 191 if importances.ndim == 1:
192 importances = np.abs(importances)
193 else:
AttributeError: 'NoneType' object has no attribute 'ndim'
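问题在于 `MyXGBRegressor` 的 `coef_` 属性被设置为 `None`。`SelectFromModel` 在默认的 `importance_getter="auto"` 下会优先读取 `coef_`,只有当估计器没有 `coef_` 时才会使用 `feature_importances_`,因此它拿到的是 `None`,从而触发上面的 `AttributeError`。如果您使用 `XGBRegressor` 而不是 `MyXGBRegressor`,那么 `SelectFromModel` 将使用 `XGBRegressor` 的 `feature_importances_` 属性,您的代码就能正常运行。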
import numpy as np
from xgboost import XGBRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
# number of top-ranked features to keep (the question asks for 10; this demo only has 5 features)
n_top_features = 3
# generate a small synthetic regression problem
X, y = make_regression(n_samples=1000, n_features=5, random_state=100)
# split the data, holding out 20% of the rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
# instantiate the plain XGBRegressor (no `coef_` override), so SelectFromModel can read `feature_importances_`
model = XGBRegressor(objective="reg:squarederror", colsample_bytree=0.3, learning_rate=0.01, max_depth=6, reg_alpha=15, n_estimators=1000, subsample=0.5)
# fit the model
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` / `eval_metric` on the
# constructor rather than on fit() — confirm against the installed version.
model.fit(X_train, y_train, early_stopping_rounds=30, eval_metric=["rmse", "mae"], eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)
# sort the feature importances in ascending order
thresholds = np.sort(model.feature_importances_)
# keep every feature at least as important as the n-th best one
# (tied importances can let more than `n_top_features` columns through)
selection = SelectFromModel(model, threshold=thresholds[-n_top_features], prefit=True)
feature_idx = selection.get_support()
print(feature_idx)
# [ True  True  True False False]
selected_dataset = selection.transform(X_test)
print(selected_dataset.shape)
# (200, 3)