在 XGBoost 回归管道中查找和使用前 10 个特征

Question

我想获取 XGBRegressor 最重要的前 10 个特征。通过 ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10) 我可以得到这前 10 个特征，但是怎样才能在我的管道（Pipeline）中使用它们呢？

我有这个类 FeatureSelector_Only_Top_10，怎样才能让它只使用前 10 个特征，并把这些特征打印出来呢？例如 print(grid.feature_selection_top_10.top10features)。

导入：

import time
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

XGB:

# Baseline: fit a default XGBoost regressor and time the fit plus both
# prediction passes. X_train_nor / X_test_nor are defined elsewhere
# (presumably the normalised feature matrices -- confirm against the caller).
xgb_reg_start = time.time()

xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_nor, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train_nor)
val_preds_xgb_reg = xgb_reg.predict(X_test_nor)

xgb_reg_end = time.time()

print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")

# One row per printed line: heading, metric, ground truth, predictions.
score_reports = (
    ("\nTraining MSE:", metrics.mean_squared_error, y_train, training_preds_xgb_reg),
    ("Validation MSE:", metrics.mean_squared_error, y_test, val_preds_xgb_reg),
    ("\nTraining r2:", metrics.r2_score, y_train, training_preds_xgb_reg),
    ("Validation r2:", metrics.r2_score, y_test, val_preds_xgb_reg),
)
for heading, score_fn, y_true, y_pred in score_reports:
    print(heading, round(score_fn(y_true, y_pred), 4))

# One importance weight per column of X_train, stored sorted ascending.
ft_weights_xgb_reg = pd.DataFrame(
    {'weight': xgb_reg.feature_importances_},
    index=X_train.columns,
).sort_values('weight')

# Bare expression (shown as cell output in a notebook): the 10 largest weights.
ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)

管道：

class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    """Placeholder transformer meant to keep only the top features.

    The selection logic is not written yet: ``fit`` learns nothing and
    ``transform`` hands its input back untouched.
    """

    def __init__(self, n_components=10):
        # Stored on the instance; neither fit nor transform reads it yet.
        self.n_components = n_components

    def fit(self, X, y=None):
        """Learn nothing and return this instance."""
        # Don't know
        return self

    def transform(self, X, y=None):
        """Return ``X`` exactly as received."""
        # Don't know
        return X

# GridSearchCV is used below but is missing from the import list above.
from sklearn.model_selection import GridSearchCV

# Hold out 20% of the rows for the final score.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30)

# The two candidate feature-selection steps are left commented out, so the
# pipeline currently consists of the Lasso regressor alone.
steps = [#('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         #('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
         ('lasso', Lasso(alpha=0.03))]

pipeline = Pipeline(steps)
# Empty grid: the search evaluates only the pipeline's current settings.
parameteres = { }

# 5-fold cross-validated search on the training split.
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
# Score of the refitted best pipeline on the held-out rows.
print("score = %3.2f" %(grid.score(X_test,y_test)))

Answer 1

如果您想在 Pipeline 中选出数据集中最好的 N 个特征，应该定义一个自定义转换器（transformer）。

此对象应该在 fit() 方法中训练 xgboost 模型，并选出最好的 N 个特征；然后在 transform() 方法中，此转换器据此过滤您的数据集。

我会这样做：

from sklearn.datasets import make_regression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Lasso

import pandas as pd
import xgboost as xgb

class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    """Keep the columns that an XGBoost regressor ranks as most important.

    Parameters
    ----------
    n_components : int, default=10
        Number of highest-importance columns to keep.

    Attributes
    ----------
    top_n_features : pandas.DataFrame or None
        ``None`` until ``fit`` is called. Afterwards a one-column frame
        (``'weight'``) indexed by the selected column labels, ordered from
        most to least important.
    """

    def __init__(self, n_components=10):
        self.n_components = n_components
        self.top_n_features = None

    def fit(self, X, y=None):
        """Fit an ``XGBRegressor`` on ``(X, y)`` and record the top columns.

        Returns
        -------
        self
        """
        # Wrap in a DataFrame so plain arrays also get column labels.
        X = pd.DataFrame(X)
        xgb_reg = xgb.XGBRegressor()
        xgb_reg.fit(X, y)
        importances = pd.DataFrame(
            xgb_reg.feature_importances_,
            columns=['weight'],
            index=X.columns,
        )
        # Honour n_components rather than a fixed 10.
        self.top_n_features = (
            importances
            .sort_values(by='weight', ascending=False)
            .head(self.n_components)
        )
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame restricted to the columns picked in ``fit``."""
        return pd.DataFrame(X).filter(self.top_n_features.index)




# Imported here so this snippet runs on its own.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Synthetic regression data with 50 features.
X, y = make_regression(n_features=50)

# Hold out 20% of the rows for scoring.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30)

steps = [('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         ('lasso', Lasso(alpha=0.03))]

pipeline = Pipeline(steps)

# Fit on the training split only, so the score below is measured on rows
# that neither the selector nor the regressor has seen.
pipeline.fit(X_train, y_train)
print("score = %3.2f" %(pipeline.score(X_test,y_test)))

#retrieve the top N features and their weights
pipeline['feature_selection_top_10'].top_n_features

Answer 2

您可以在管道中加入 SelectFromModel，根据重要性权重提取前 10 个特征，而无需创建自定义转换器。如官方文档所述，如果想要选择 10 个特征，需要设置 max_features=10 和 threshold=-np.inf。

import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression

# Synthetic regression problem: 1000 rows, 100 features, fixed seed.
X, y = make_regression(n_features=100, n_samples=1000, random_state=42)

# Wrap in pandas objects so the columns carry the names x0, x1, ...
feature_names = [f'x{i}' for i in range(X.shape[1])]
X = pd.DataFrame(data=X, columns=feature_names)
y = pd.Series(y, name='y')

# threshold=-np.inf together with max_features=10 makes the selector keep
# the 10 highest-importance features.
top10_selector = SelectFromModel(
    estimator=XGBRegressor(),
    max_features=10,
    threshold=-np.inf,
)
pipeline = Pipeline([
    ('selector', top10_selector),
    ('regressor', LinearRegression()),
])

pipeline.fit(X, y)

fitted_selector = pipeline.named_steps['selector']
fitted_regressor = pipeline.named_steps['regressor']

# Boolean mask over the input columns: True where the column was kept.
selected_features = fitted_selector.get_support()
print(selected_features.sum())
# 10

selected_features_names = X.columns[selected_features].tolist()
print(selected_features_names)
# ['x0', 'x14', 'x17', 'x35', 'x42', 'x43', 'x57', 'x71', 'x84', 'x95']

# Importances from the fitted XGBoost model, restricted to the kept columns.
all_importances = fitted_selector.estimator_.feature_importances_
selected_features_importances = all_importances[selected_features]
print(selected_features_importances)
# [0.09361505 0.18474296 0.14420615 0.01952794 0.10946904 0.02192107 0.03307951 0.02948984 0.02851948 0.1216883]

# Linear-regression coefficients, one per kept column.
selected_features_coefficients = fitted_regressor.coef_
print(selected_features_coefficients)
# [49.43000693 83.91437854 78.25242596 -0.76411769 56.67970515  0.16829694 28.81967319  0.50277914 24.55006237 68.17120687]

在 XGBoost 回归管道中查找和使用前 10 个特征

Find and use top 10 features in XGBoost regression pipeline

python

pipeline

feature-selection

scikit-learn

xgboost