How to build a pipeline that finds the best preprocessing per column in a fine-grained fashion?
In sklearn, we can use a column transformer inside a pipeline to apply a chosen preprocessing to specific columns, like this:
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler, ...
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
# this is my x_data
x_data = pd.DataFrame(..., columns=['Variable1', 'Variable2', 'Variable3'])
pipeline = Pipeline(steps=[('preprocessing1', make_column_transformer((StandardScaler(), ['Variable1']),
                                                                      remainder='passthrough')),
                           ('preprocessing2', make_column_transformer((MaxAbsScaler(), ['Variable2']),
                                                                      remainder='passthrough')),
                           ('preprocessing3', make_column_transformer((MinMaxScaler(), ['Variable3']),
                                                                      remainder='passthrough')),
                           ('clf', MLPClassifier(...))
                           ]
                   )
We would then run GridSearchCV along these lines:
params = [{'preprocessing1': [MinMaxScaler(), MaxAbsScaler(), StandardScaler()],  # <<<<<<<<<<<<< How???
           'preprocessing2': [MinMaxScaler(), MaxAbsScaler(), StandardScaler()],  # <<<<<<<<<<<<< How???
           'preprocessing3': [MinMaxScaler(), MaxAbsScaler(), StandardScaler()],  # <<<<<<<<<<<<< How???
           'clf__hidden_layer_sizes': [(100,), (200,)],
           'clf__solver': ['adam', 'lbfgs', 'sgd'],
           ...
           }]
cv = GridSearchCV(pipeline, params, cv=10, verbose=1, n_jobs=-1, refit=True)
What I want to do is find the best preprocessing for each predictor individually, because a single preprocessing applied to all predictors often does not work well.
The naming convention in a pipeline is to use a double underscore __ to separate a step from its parameters. You can use pipeline.get_params() to see the different parameters of the pipeline and their values.
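For example, you can list just the relevant parameter names by filtering the keys returned by get_params(). This is a minimal sketch assuming the step names preprocessing1/2/3 from your pipeline above; make_column_transformer exposes each inner transformer under its lowercased class name:

# Print the tunable parameter names of the preprocessing steps;
# the output should include keys such as 'preprocessing1__standardscaler'.
for name in pipeline.get_params():
    if name.startswith('preprocessing'):
        print(name)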
In your case, the parameter preprocessing1__standardscaler refers to the scaler defined for the first step of the pipeline, and this is the parameter to set during the GridSearchCV.
The example below illustrates how to do this:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
X, y = make_classification(
    n_features=3, n_informative=3, n_redundant=0, random_state=42
)
pipeline = Pipeline(
    steps=[
        ("preprocessing1", make_column_transformer((StandardScaler(), [0]), remainder="passthrough")),
        ("preprocessing2", make_column_transformer((StandardScaler(), [1]), remainder="passthrough")),
        ("preprocessing3", make_column_transformer((StandardScaler(), [2]), remainder="passthrough")),
        ("clf", MLPClassifier()),
    ]
)
param_grid = {
    "preprocessing1__standardscaler": [StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    "preprocessing2__standardscaler": [StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    "preprocessing3__standardscaler": [StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
}
grid_search = GridSearchCV(pipeline, param_grid, cv=10, verbose=1, n_jobs=-1)
grid_search.fit(X, y)
grid_search.best_params_
This returns the following output:
{'preprocessing1__standardscaler': MinMaxScaler(),
'preprocessing2__standardscaler': StandardScaler(),
'preprocessing3__standardscaler': MaxAbsScaler()}
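If you also want to tune the classifier in the same search, as in your original params, the MLPClassifier hyperparameters are addressed through the step name clf with the same double-underscore convention. A sketch with example candidate values, not a recommendation:

param_grid = {
    "preprocessing1__standardscaler": [StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    "preprocessing2__standardscaler": [StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    "preprocessing3__standardscaler": [StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    # classifier hyperparameters, addressed via the 'clf' step name
    "clf__hidden_layer_sizes": [(100,), (200,)],
    "clf__solver": ["adam", "lbfgs", "sgd"],
}
grid_search = GridSearchCV(pipeline, param_grid, cv=10, verbose=1, n_jobs=-1, refit=True)
grid_search.fit(X, y)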