将管道与 GridSearchCV 一起使用
Using Pipeline with GridSearchCV
假设我有这样一个 Pipeline 对象:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC  # SVC is instantiated below, so it must be imported

# Two-step pipeline: a transformer step followed by an SVC estimator step.
# NOTE: `my_transform` is not defined in this snippet; presumably it is the
# asker's own transformer class -- confirm before running.
pipe = Pipeline([
    ('my_transform', my_transform()),
    ('estimator', SVC()),
])
要将超参数传递给我的支持向量分类器 (SVC),我可以这样做:
# Hyperparameters for the pipeline's 'estimator' step, addressed with the
# `<step name>__<parameter>` naming scheme.
# Each value is a sequence of candidates; the kernel must be the string 'rbf'
# inside a list (a bare `(rbf)` is just an undefined name in parentheses).
pipe_parameters = {
    'estimator__gamma': [0.1, 1],
    'estimator__kernel': ['rbf'],
}
然后,我可以使用 GridSearchCV:
from sklearn.model_selection import GridSearchCV

# Search the candidates in `pipe_parameters` for the pipeline `pipe`.
# `X_train` / `y_train` are not defined in this snippet.
grid = GridSearchCV(estimator=pipe, param_grid=pipe_parameters)
grid.fit(X_train, y_train)
我们知道线性核(linear kernel)不使用 gamma 作为超参数。那么,我怎样才能在这个网格搜索中同时包含 linear 核?
例如,在一个简单的 GridSearchCV(不使用管道)中,我可以这样做:
# Candidate values reused by several kernels.
c_values = [0.1, 1, 10, 100, 1000]
gamma_values = [0.0001, 0.001, 0.01, 0.1, 1]

# One sub-grid per kernel, so each kernel is combined only with the
# hyperparameters listed alongside it.
rbf_grid = {
    'C': list(c_values),
    'gamma': list(gamma_values),
    'kernel': ['rbf'],
}
linear_grid = {
    'C': list(c_values),
    'kernel': ['linear'],
}
poly_grid = {
    'C': list(c_values),
    'gamma': list(gamma_values),
    'degree': [2, 3],
    'kernel': ['poly'],
}

param_grid = [rbf_grid, linear_grid, poly_grid]
# Grid search on a bare SVC (no pipeline), so the keys in `param_grid`
# are plain parameter names without a step prefix.
grid = GridSearchCV(SVC(), param_grid)
因此,我需要下面这类代码的可运行版本:
# NOTE: this is the asker's non-working sketch, kept exactly as written.
# - `rbf` and `linear` are bare names in parentheses, not strings or sequences.
# - 'estimator__kernel' appears twice; in a dict literal the later entry
#   replaces the earlier one, so the two kernels cannot coexist in one dict.
pipe_parameters = {
'bag_of_words__max_features': (None, 1500),
'estimator__kernel': (rbf),
'estimator__gamma': (0.1, 1),
'estimator__kernel': (linear),
'estimator__C': (0.1, 1),
}
意思是我想使用以下组合作为超参数:
kernel = rbf, gamma = 0.1
kernel = rbf, gamma = 1
kernel = linear, C = 0.1
kernel = linear, C = 1
你已经很接近了。与为 SVC 模型创建多个字典的方式类似,为管道创建一个字典列表即可。
试试这个例子:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
# The class is `Pipeline`; a lowercase `pipeline` cannot be imported from
# sklearn.pipeline.
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Newsgroups to load as the target classes.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Passed as `remove=` to fetch_20newsgroups below.
remove = ('headers', 'footers', 'quotes')
data_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=remove,
)

# Text pipeline: bag-of-words features followed by an SVC.
pipe = Pipeline([
    ('bag_of_words', CountVectorizer()),
    ('estimator', SVC()),
])

# A list of dicts: each dict is its own sub-grid, so `gamma` is searched only
# together with the rbf kernel and never with the linear kernel.
pipe_parameters = [
    {
        'bag_of_words__max_features': (None, 1500),
        'estimator__C': [0.1],
        'estimator__gamma': [0.0001, 1],
        'estimator__kernel': ['rbf'],
    },
    {
        'bag_of_words__max_features': (None, 1500),
        'estimator__C': [0.1, 1],
        'estimator__kernel': ['linear'],
    },
]

grid = GridSearchCV(pipe, pipe_parameters, cv=2)
grid.fit(data_train.data, data_train.target)
grid.best_params_
# {'bag_of_words__max_features': None,
#  'estimator__C': 0.1,
#  'estimator__kernel': 'linear'}
假设我有这样一个 Pipeline 对象:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC  # SVC is instantiated below, so it must be imported

# Two-step pipeline: a transformer step followed by an SVC estimator step.
# NOTE: `my_transform` is not defined in this snippet; presumably it is the
# asker's own transformer class -- confirm before running.
pipe = Pipeline([
    ('my_transform', my_transform()),
    ('estimator', SVC()),
])
要将超参数传递给我的支持向量分类器 (SVC),我可以这样做:
# Hyperparameters for the pipeline's 'estimator' step, addressed with the
# `<step name>__<parameter>` naming scheme.
# Each value is a sequence of candidates; the kernel must be the string 'rbf'
# inside a list (a bare `(rbf)` is just an undefined name in parentheses).
pipe_parameters = {
    'estimator__gamma': [0.1, 1],
    'estimator__kernel': ['rbf'],
}
然后,我可以使用 GridSearchCV:
from sklearn.model_selection import GridSearchCV

# Search the candidates in `pipe_parameters` for the pipeline `pipe`.
# `X_train` / `y_train` are not defined in this snippet.
grid = GridSearchCV(estimator=pipe, param_grid=pipe_parameters)
grid.fit(X_train, y_train)
我们知道线性核(linear kernel)不使用 gamma 作为超参数。那么,我怎样才能在这个网格搜索中同时包含 linear 核?
例如,在一个简单的 GridSearchCV(不使用管道)中,我可以这样做:
# Candidate values reused by several kernels.
c_values = [0.1, 1, 10, 100, 1000]
gamma_values = [0.0001, 0.001, 0.01, 0.1, 1]

# One sub-grid per kernel, so each kernel is combined only with the
# hyperparameters listed alongside it.
rbf_grid = {
    'C': list(c_values),
    'gamma': list(gamma_values),
    'kernel': ['rbf'],
}
linear_grid = {
    'C': list(c_values),
    'kernel': ['linear'],
}
poly_grid = {
    'C': list(c_values),
    'gamma': list(gamma_values),
    'degree': [2, 3],
    'kernel': ['poly'],
}

param_grid = [rbf_grid, linear_grid, poly_grid]
# Grid search on a bare SVC (no pipeline), so the keys in `param_grid`
# are plain parameter names without a step prefix.
grid = GridSearchCV(SVC(), param_grid)
因此,我需要下面这类代码的可运行版本:
# NOTE: this is the asker's non-working sketch, kept exactly as written.
# - `rbf` and `linear` are bare names in parentheses, not strings or sequences.
# - 'estimator__kernel' appears twice; in a dict literal the later entry
#   replaces the earlier one, so the two kernels cannot coexist in one dict.
pipe_parameters = {
'bag_of_words__max_features': (None, 1500),
'estimator__kernel': (rbf),
'estimator__gamma': (0.1, 1),
'estimator__kernel': (linear),
'estimator__C': (0.1, 1),
}
意思是我想使用以下组合作为超参数:
kernel = rbf, gamma = 0.1
kernel = rbf, gamma = 1
kernel = linear, C = 0.1
kernel = linear, C = 1
你已经很接近了。与为 SVC 模型创建多个字典的方式类似,为管道创建一个字典列表即可。
试试这个例子:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
# The class is `Pipeline`; a lowercase `pipeline` cannot be imported from
# sklearn.pipeline.
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Newsgroups to load as the target classes.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Passed as `remove=` to fetch_20newsgroups below.
remove = ('headers', 'footers', 'quotes')
data_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=remove,
)

# Text pipeline: bag-of-words features followed by an SVC.
pipe = Pipeline([
    ('bag_of_words', CountVectorizer()),
    ('estimator', SVC()),
])

# A list of dicts: each dict is its own sub-grid, so `gamma` is searched only
# together with the rbf kernel and never with the linear kernel.
pipe_parameters = [
    {
        'bag_of_words__max_features': (None, 1500),
        'estimator__C': [0.1],
        'estimator__gamma': [0.0001, 1],
        'estimator__kernel': ['rbf'],
    },
    {
        'bag_of_words__max_features': (None, 1500),
        'estimator__C': [0.1, 1],
        'estimator__kernel': ['linear'],
    },
]

grid = GridSearchCV(pipe, pipe_parameters, cv=2)
grid.fit(data_train.data, data_train.target)
grid.best_params_
# {'bag_of_words__max_features': None,
#  'estimator__C': 0.1,
#  'estimator__kernel': 'linear'}