"Parallel" 使用网格搜索获得最佳模型的管道
"Parallel" pipeline to get best model using gridsearch
在 sklearn 中,可以定义串行管道,以获得管道所有连续部分的最佳超参数组合。串行管道可以实现如下:
from sklearn.svm import SVC
from sklearn import decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Load the digits dataset bundled with scikit-learn; all of it is used
# as training data here.
digits = datasets.load_digits()
X_train = digits.data
y_train = digits.target
# Use Principal Component Analysis to reduce dimensionality
# and improve generalization
pca = decomposition.PCA()
# Support vector classifier; its kernel ('linear' or 'rbf') is one of
# the hyperparameters searched in `params_grid` below.
svm = SVC()
# Combine PCA and SVC into a serial pipeline: PCA runs first, then the SVC.
pipe = Pipeline(steps=[('pca', pca), ('svm', svm)])
# Candidate numbers of PCA components to keep.
n_components = [20, 40, 64]
# Keys follow the `<step name>__<parameter>` convention, so each entry
# addresses one parameter of one named pipeline step.
params_grid = {
'svm__C': [1, 10, 100, 1000],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],
'pca__n_components': n_components,
}
但是,如果我想为管道的每个步骤尝试不同的算法,该怎么办?例如,我怎样才能对以下组合进行网格搜索:
(Principal Component Analysis OR Singular Value Decomposition) AND (Support Vector Machines OR Random Forest)
这需要某种第二级或 "meta-gridsearch",因为模型类型将是超参数之一。这在sklearn中可能吗?
Pipeline 的 steps(估算器列表)支持将某个步骤设为 None,通过它可以关闭管道的某些部分。
要不使用某个估算器,可以在传递给 GridSearchCV 的参数中,把该估算器(按它在管道 named_steps 中的名称)设置为 None。
假设您要使用 PCA 和 TruncatedSVD。
# Two alternative dimensionality-reduction estimators, plus the classifier.
pca = decomposition.PCA()
svd = decomposition.TruncatedSVD()
svm = SVC()
# Candidate component counts, shared by the PCA and TruncatedSVD searches.
n_components = [20, 40, 64]
在管道中添加svd
# The pipeline holds both reducers; each grid below disables one of them.
pipe = Pipeline(steps=[('pca', pca), ('svd', svd), ('svm', svm)])
# `params_grid` is now a list of dicts instead of a single dict.
# The first dict sets the `svd` step to None and tunes PCA;
# the second sets the `pca` step to None and tunes TruncatedSVD.
params_grid = [{
'svm__C': [1, 10, 100, 1000],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],
'pca__n_components': n_components,
'svd':[None]
},
{
'svm__C': [1, 10, 100, 1000],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],
'pca':[None],
'svd__n_components': n_components,
'svd__algorithm':['randomized']
}]
现在只需将管道对象传递给 gridsearchCV
# Build the grid search over the pipeline using the grids in `params_grid`.
grd = GridSearchCV(pipe, param_grid = params_grid)
调用 grd.fit() 时,将分别搜索 params_grid 列表中两个元素的参数,每次使用其中一个元素的所有取值。
如果参数同名则简化
如果 "OR" 关系中的两个估算器具有同名的参数(如本例中 PCA 和 TruncatedSVD 都有 n_components),或者您只想在这一个参数上进行搜索,那么可以简化为:
# The single dimensionality-reduction step is now named `preprocessor`.
pipe = Pipeline(steps=[('preprocessor', pca), ('svm', svm)])
# List both estimators as candidate values of the `preprocessor` step;
# `preprocessor__n_components` is searched together with that choice.
params_grid = {
'svm__C': [1, 10, 100, 1000],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],
'preprocessor':[pca, svd],
'preprocessor__n_components': n_components,
}
此方案的推广
我们可以创建一个函数,用适当的值自动填充要提供给 GridSearchCV 的 param_grid:
def make_param_grids(steps, param_grids):
    """Build the list of parameter grids for a pipeline with alternative estimators.

    Args:
        steps: Mapping of pipeline step name to the list of estimator names
            that may fill that step, e.g.
            ``{'preprocessor': ['pca', 'select'], 'classifier': ['svm', 'rf']}``.
        param_grids: Mapping of estimator name to its search space. The
            special key ``'object'`` holds the estimator instance; every
            other key is a parameter name mapped to the values to try.

    Returns:
        A list with one dict per combination of estimators (one estimator
        chosen for each step). In each dict the step name maps to a
        one-element list holding the estimator, and
        ``'<step>__<param>'`` maps to that parameter's candidate values.

    Raises:
        KeyError: If an estimator name listed in ``steps`` has no entry in
            ``param_grids``.
    """
    import itertools

    final_params = []

    # itertools.product enumerates every combination such that
    # (pca OR svd) AND (svm OR rf) becomes ->
    # (pca, svm), (pca, rf), (svd, svm), (svd, rf)
    for estimator_names in itertools.product(*steps.values()):
        current_grid = {}

        # steps.keys() and steps.values() iterate in the same order, so each
        # step name is paired with the estimator name chosen for that step.
        for step_name, estimator_name in zip(steps.keys(), estimator_names):
            # Index directly so an unknown estimator name fails with a
            # KeyError naming it; .items() works on Python 2 and Python 3.
            for param, value in param_grids[estimator_name].items():
                if param == 'object':
                    # Set the actual estimator for this pipeline step
                    current_grid[step_name] = [value]
                else:
                    # Set a parameter of the estimator chosen above
                    current_grid[step_name + '__' + param] = value

        # One finished grid per estimator combination
        final_params.append(current_grid)

    return final_params
并在任意数量的转换器和估算器上使用此函数
# These estimators are not imported by the earlier snippets.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest

# Put all the estimators you want to "OR" under a single key:
#   `pca` OR `select` for the preprocessor,
#   `svm` OR `rf` for the classifier.
# Different keys are evaluated as serial steps of the pipeline.
pipeline_steps = {
    'preprocessor': ['pca', 'select'],
    'classifier': ['svm', 'rf'],
}

# Search space per estimator name: 'object' holds the estimator instance,
# every other key is a parameter with the values to try.
all_param_grids = {
    'svm': {
        'object': SVC(),
        'C': [0.1, 0.2],
    },
    'rf': {
        'object': RandomForestClassifier(),
        'n_estimators': [10, 20],
    },
    'pca': {
        # Only the `decomposition` module is imported, so qualify PCA.
        'object': decomposition.PCA(),
        'n_components': [10, 20],
    },
    'select': {
        'object': SelectKBest(),
        'k': [5, 10],
    },
}

# Expand the two dicts above into the list of grids for GridSearchCV.
param_grids_list = make_param_grids(pipeline_steps, all_param_grids)
现在用上面 pipeline_steps 中使用的步骤名称初始化管道对象
# The PCA() and SVC() used here are just placeholders to initialize the
# pipeline; the actual estimators are taken from `param_grids_list`.
# PCA is qualified because only the `decomposition` module is imported.
pipe = Pipeline(steps=[('preprocessor', decomposition.PCA()), ('classifier', SVC())])
现在,终于开始设置gridSearchCV对象和拟合数据了
grd = GridSearchCV(pipe, param_grid=param_grids_list)
# Fit on the digits data loaded earlier as `X_train` / `y_train`.
grd.fit(X_train, y_train)
在 sklearn 中,可以定义串行管道,以获得管道所有连续部分的最佳超参数组合。串行管道可以实现如下:
from sklearn.svm import SVC
from sklearn import decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Load the digits dataset bundled with scikit-learn; all of it is used
# as training data here.
digits = datasets.load_digits()
X_train = digits.data
y_train = digits.target
# Use Principal Component Analysis to reduce dimensionality
# and improve generalization
pca = decomposition.PCA()
# Support vector classifier; its kernel ('linear' or 'rbf') is one of
# the hyperparameters searched in `params_grid` below.
svm = SVC()
# Combine PCA and SVC into a serial pipeline: PCA runs first, then the SVC.
pipe = Pipeline(steps=[('pca', pca), ('svm', svm)])
# Candidate numbers of PCA components to keep.
n_components = [20, 40, 64]
# Keys follow the `<step name>__<parameter>` convention, so each entry
# addresses one parameter of one named pipeline step.
params_grid = {
'svm__C': [1, 10, 100, 1000],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],
'pca__n_components': n_components,
}
但是,如果我想为管道的每个步骤尝试不同的算法,该怎么办?例如,我怎样才能对以下组合进行网格搜索:
(Principal Component Analysis OR Singular Value Decomposition) AND (Support Vector Machines OR Random Forest)
这需要某种第二级或 "meta-gridsearch",因为模型类型将是超参数之一。这在sklearn中可能吗?
Pipeline 的 steps(估算器列表)支持将某个步骤设为 None,通过它可以关闭管道的某些部分。
要不使用某个估算器,可以在传递给 GridSearchCV 的参数中,把该估算器(按它在管道 named_steps 中的名称)设置为 None。
假设您要使用 PCA 和 TruncatedSVD。
# Two alternative dimensionality-reduction estimators, plus the classifier.
pca = decomposition.PCA()
svd = decomposition.TruncatedSVD()
svm = SVC()
# Candidate component counts, shared by the PCA and TruncatedSVD searches.
n_components = [20, 40, 64]
在管道中添加svd
# The pipeline holds both reducers; each grid below disables one of them.
pipe = Pipeline(steps=[('pca', pca), ('svd', svd), ('svm', svm)])
# `params_grid` is now a list of dicts instead of a single dict.
# The first dict sets the `svd` step to None and tunes PCA;
# the second sets the `pca` step to None and tunes TruncatedSVD.
params_grid = [{
'svm__C': [1, 10, 100, 1000],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],
'pca__n_components': n_components,
'svd':[None]
},
{
'svm__C': [1, 10, 100, 1000],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],
'pca':[None],
'svd__n_components': n_components,
'svd__algorithm':['randomized']
}]
现在只需将管道对象传递给 gridsearchCV
# Build the grid search over the pipeline using the grids in `params_grid`.
grd = GridSearchCV(pipe, param_grid = params_grid)
调用 grd.fit() 时,将分别搜索 params_grid 列表中两个元素的参数,每次使用其中一个元素的所有取值。
如果参数同名则简化
如果 "OR" 关系中的两个估算器具有同名的参数(如本例中 PCA 和 TruncatedSVD 都有 n_components),或者您只想在这一个参数上进行搜索,那么可以简化为:
# The single dimensionality-reduction step is now named `preprocessor`.
pipe = Pipeline(steps=[('preprocessor', pca), ('svm', svm)])
# List both estimators as candidate values of the `preprocessor` step;
# `preprocessor__n_components` is searched together with that choice.
params_grid = {
'svm__C': [1, 10, 100, 1000],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],
'preprocessor':[pca, svd],
'preprocessor__n_components': n_components,
}
此方案的推广
我们可以创建一个函数,用适当的值自动填充要提供给 GridSearchCV 的 param_grid:
def make_param_grids(steps, param_grids):
    """Build the list of parameter grids for a pipeline with alternative estimators.

    Args:
        steps: Mapping of pipeline step name to the list of estimator names
            that may fill that step, e.g.
            ``{'preprocessor': ['pca', 'select'], 'classifier': ['svm', 'rf']}``.
        param_grids: Mapping of estimator name to its search space. The
            special key ``'object'`` holds the estimator instance; every
            other key is a parameter name mapped to the values to try.

    Returns:
        A list with one dict per combination of estimators (one estimator
        chosen for each step). In each dict the step name maps to a
        one-element list holding the estimator, and
        ``'<step>__<param>'`` maps to that parameter's candidate values.

    Raises:
        KeyError: If an estimator name listed in ``steps`` has no entry in
            ``param_grids``.
    """
    import itertools

    final_params = []

    # itertools.product enumerates every combination such that
    # (pca OR svd) AND (svm OR rf) becomes ->
    # (pca, svm), (pca, rf), (svd, svm), (svd, rf)
    for estimator_names in itertools.product(*steps.values()):
        current_grid = {}

        # steps.keys() and steps.values() iterate in the same order, so each
        # step name is paired with the estimator name chosen for that step.
        for step_name, estimator_name in zip(steps.keys(), estimator_names):
            # Index directly so an unknown estimator name fails with a
            # KeyError naming it; .items() works on Python 2 and Python 3.
            for param, value in param_grids[estimator_name].items():
                if param == 'object':
                    # Set the actual estimator for this pipeline step
                    current_grid[step_name] = [value]
                else:
                    # Set a parameter of the estimator chosen above
                    current_grid[step_name + '__' + param] = value

        # One finished grid per estimator combination
        final_params.append(current_grid)

    return final_params
并在任意数量的转换器和估算器上使用此函数
# These estimators are not imported by the earlier snippets.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest

# Put all the estimators you want to "OR" under a single key:
#   `pca` OR `select` for the preprocessor,
#   `svm` OR `rf` for the classifier.
# Different keys are evaluated as serial steps of the pipeline.
pipeline_steps = {
    'preprocessor': ['pca', 'select'],
    'classifier': ['svm', 'rf'],
}

# Search space per estimator name: 'object' holds the estimator instance,
# every other key is a parameter with the values to try.
all_param_grids = {
    'svm': {
        'object': SVC(),
        'C': [0.1, 0.2],
    },
    'rf': {
        'object': RandomForestClassifier(),
        'n_estimators': [10, 20],
    },
    'pca': {
        # Only the `decomposition` module is imported, so qualify PCA.
        'object': decomposition.PCA(),
        'n_components': [10, 20],
    },
    'select': {
        'object': SelectKBest(),
        'k': [5, 10],
    },
}

# Expand the two dicts above into the list of grids for GridSearchCV.
param_grids_list = make_param_grids(pipeline_steps, all_param_grids)
现在用上面 pipeline_steps 中使用的步骤名称初始化管道对象
# The PCA() and SVC() used here are just placeholders to initialize the
# pipeline; the actual estimators are taken from `param_grids_list`.
# PCA is qualified because only the `decomposition` module is imported.
pipe = Pipeline(steps=[('preprocessor', decomposition.PCA()), ('classifier', SVC())])
现在,终于开始设置gridSearchCV对象和拟合数据了
grd = GridSearchCV(pipe, param_grid=param_grids_list)
# Fit on the digits data loaded earlier as `X_train` / `y_train`.
grd.fit(X_train, y_train)