在 Python 3 中使用 Pipeline 时获取 RFE 的 support_ 和 ranking_ 属性

Get support and ranking attributes for RFE using Pipeline in Python 3

我目前的代码如下,运行正常。但是,我想针对每个测试的特征数量打印以下 RFE 属性:"rfe.support_[i]"、"rfe.ranking_[i]",以及所选特征的名称。其中“i”指的是列索引:第一个属性返回 True 或 False(该列是否被选中),第二个属性返回该列对应的排名。

换句话说,我想打印出每个 RFE 实际选用的列,而不是让它们停留在抽象层面。

# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Get the dataset
def get_dataset(df, target):
    """Split ``df`` into a feature frame and a flat target array.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the target column and ``y``
        is the target column as a one-dimensional NumPy array.
    """
    features = df.drop(columns=target)
    # Selecting with a list yields an (n, 1) frame; flatten it to shape (n,).
    labels = df[[target]].to_numpy().flatten()
    return features, labels

# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols, n_features_options=range(2, 4)):
    """Build one preprocessing + RFE + decision-tree pipeline per feature count.

    Parameters
    ----------
    list_num_cols : list of str
        Columns routed through the numeric transformer (median imputation).
    list_cat_cols : list of str
        Columns routed through the categorical transformer (most-frequent
        imputation followed by one-hot encoding).
    n_features_options : iterable of int, optional
        Values tried for ``n_features_to_select`` in RFE. Defaults to
        ``range(2, 4)``, i.e. 2 and 3 features.

    Returns
    -------
    dict
        Maps ``'DecisionTreeRegressor_<n>'`` to an unfitted ``Pipeline`` with
        the steps ``'preprocessor'``, ``'s_dtr'`` (RFE) and ``'m_dtr'``.
    """
    models = {}
    for n_features in n_features_options:
        # Build a fresh preprocessor for every pipeline. Sharing one instance
        # would make all pipelines point at the same transformer object, so
        # fitting one pipeline directly would overwrite the fitted state seen
        # by the others.
        num_transformer = Pipeline(steps=[
            ('num_imputer', SimpleImputer(strategy='median')),
        ])
        cat_transformer = Pipeline(steps=[
            ('cat_imputer', SimpleImputer(strategy='most_frequent')),
            ('one-hot-encoder', OneHotEncoder()),
        ])
        preprocessor = ColumnTransformer(transformers=[
            ('num', num_transformer, list_num_cols),
            ('cat', cat_transformer, list_cat_cols),
        ])
        rfe_dtr = RFE(estimator=DecisionTreeRegressor(),
                      n_features_to_select=n_features)
        models['DecisionTreeRegressor_' + str(n_features)] = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('s_dtr', rfe_dtr),
            ('m_dtr', DecisionTreeRegressor()),
        ])
    return models

# Evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score ``model`` on ``(X, y)`` with repeated k-fold cross-validation.

    The splitter is a ``RepeatedKFold`` with 10 splits, 3 repeats and a fixed
    ``random_state`` of 7. Scoring is ``'neg_mean_absolute_error'``, and
    ``error_score='raise'`` is passed so that failures are not masked.

    Returns whatever ``cross_val_score`` returns for that configuration.
    """
    splitter = RepeatedKFold(n_splits=10, n_repeats=3, random_state=7)
    return cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_absolute_error',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )


# Define the dataset
X, y = get_dataset(my_df, 'my_target')   # It begins here

# Get the models to evaluate: numeric and object-typed columns are
# preprocessed separately, so collect their names first.
numeric_columns = X.select_dtypes(include='number').columns.tolist()
categorical_columns = X.select_dtypes(include='object').columns.tolist()
models = get_models(numeric_columns, categorical_columns)

# Evaluate the models and store results
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    names.append(name)
    results.append(scores)
    # Report the mean and standard deviation of the scores for this model.
    print('%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

以下语句会报错:

# Looking up the fitted RFE attributes on the pipelines stored in `models`
# fails: these lookups raise AttributeError.
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_[0] # Raises AttributeError: 'RFE' object has no attribute 'support_'
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_[0] # Raises AttributeError: 'RFE' object has no attribute 'ranking_'

关键在于您没有显式地拟合(fit)'DecisionTreeRegressor_2' 管道。

的确,尽管 cross_val_score 已经负责拟合估计器(参见原文所附链接),但它在内部拟合的是估计器的克隆副本,并且不像 .fit() 方法那样返回拟合后的估计器实例。因此您无法访问 RFE 实例的属性。

下面是一个与您的设置类似的简化示例:

from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression

# Synthetic regression data standing in for the real dataset.
X, y = make_regression()

# One unfitted pipeline (RFE selector followed by a decision tree) per
# candidate number of selected features.
models = {}
for n_features in (2, 3):
    key = 'DecisionTreeRegressor_' + str(n_features)
    models[key] = Pipeline(steps=[
        ('s_dtr', RFE(estimator=DecisionTreeRegressor(),
                      n_features_to_select=n_features)),
        ('m_dtr', DecisionTreeRegressor()),
    ])

# Nothing has been fitted yet, so the RFE step has no fitted attributes.
unfitted_selector = models['DecisionTreeRegressor_2'].named_steps['s_dtr']
unfitted_selector.support_   # this does not work

相反,您会看到,在拟合模型之后,就可以访问 support_ 和 ranking_ 属性了:

# Fit the pipeline explicitly; afterwards the RFE step exposes its fitted attributes.
models['DecisionTreeRegressor_2'].fit(X,y)
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_   # this works: whether each feature was selected
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_   # this works: ranking of each feature

我自己解决了这个问题,把答案发出来,希望能帮到别人。做法是用“cross_validate”代替“cross_val_score”,并设置选项“return_estimator = True”,这样就可以取回各折(fold)上拟合好的管道及其中的 RFE,并按索引访问它们。然后就可以使用“named_steps”了。

# Explore the number of selected features for RFE
from category_encoders import OneHotEncoder
from numpy import mean
from numpy import std
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

# Get the dataset
def get_dataset(df, target):
    """Separate the predictors from the target column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)``: ``df`` with the target column dropped, and the target
        values as a one-dimensional NumPy array.
    """
    predictors = df.drop(columns=target)
    # A list selector gives an (n, 1) frame; flatten it into a 1-D array.
    target_values = df[[target]].to_numpy().flatten()
    return predictors, target_values

# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols, n_features_options=range(2, 4)):
    """Build one preprocessing + RFE + decision-tree pipeline per feature count.

    Parameters
    ----------
    list_num_cols : list of str
        Columns routed through the numeric transformer (median imputation).
    list_cat_cols : list of str
        Columns routed through the categorical transformer (most-frequent
        imputation followed by one-hot encoding).
    n_features_options : iterable of int, optional
        Values tried for ``n_features_to_select`` in RFE. Defaults to
        ``range(2, 4)``, i.e. 2 and 3 features.

    Returns
    -------
    dict
        Maps ``'DecisionTreeRegressor_<n>'`` to an unfitted ``Pipeline`` with
        the steps ``'preprocessor'``, ``'s_dtr'`` (RFE) and ``'m_dtr'``.
    """
    models = {}
    for n_features in n_features_options:
        # Build a fresh preprocessor for every pipeline. Sharing one instance
        # would make all pipelines point at the same transformer object, so
        # fitting one pipeline directly would overwrite the fitted state seen
        # by the others.
        num_transformer = Pipeline(steps=[
            ('num_imputer', SimpleImputer(strategy='median')),
        ])
        cat_transformer = Pipeline(steps=[
            ('cat_imputer', SimpleImputer(strategy='most_frequent')),
            ('one-hot-encoder', OneHotEncoder()),
        ])
        preprocessor = ColumnTransformer(transformers=[
            ('num', num_transformer, list_num_cols),
            ('cat', cat_transformer, list_cat_cols),
        ])
        rfe_dtr = RFE(estimator=DecisionTreeRegressor(),
                      n_features_to_select=n_features)
        models['DecisionTreeRegressor_' + str(n_features)] = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('s_dtr', rfe_dtr),
            ('m_dtr', DecisionTreeRegressor()),
        ])
    return models

# Evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Cross-validate ``model`` and keep the estimator fitted on each split.

    The splitter is a ``RepeatedKFold`` with 10 splits, 3 repeats and a fixed
    ``random_state`` of 7. Scoring is ``'neg_mean_absolute_error'``, and
    ``error_score='raise'`` is passed so that failures are not masked.

    Returns
    -------
    dict
        The ``cross_validate`` result. The calling script reads its
        ``'test_score'`` entry and, because ``return_estimator=True`` is set,
        its ``'estimator'`` entry (indexable, one fitted pipeline per entry).
    """
    # NOTE: `cross_validate` must be imported from sklearn.model_selection;
    # `cross_val_score` cannot be used here because it only returns scores.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=7)
    return cross_validate(
        model,
        X,
        y,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
        error_score='raise',
        return_estimator=True,
    )


# Define the dataset
X, y = get_dataset(my_df, 'my_target')   # It begins here

# Get the models to evaluate: numeric and object-typed columns are
# preprocessed separately, so collect their names first.
numeric_columns = X.select_dtypes(include='number').columns.tolist()
categorical_columns = X.select_dtypes(include='object').columns.tolist()
models = get_models(numeric_columns, categorical_columns)

# Evaluate the models and store results
results, names = [], []
for name, model in models.items():
    output = evaluate_model(model, X, y)
    test_scores = output['test_score']
    results.append(test_scores)
    names.append(name)
    print('%s %.3f (%.3f)' % (name, mean(test_scores), std(test_scores)))
    print(output)
    # Inspect the RFE step of the first returned (fitted) pipeline.
    selector = output['estimator'][0].named_steps['s_dtr']
    print(selector.support_)
    print(selector.ranking_)
    # Same attributes for the single feature at index 2.
    print(selector.support_[2])
    print(selector.ranking_[2])