使用显式(预定义)验证集进行 sklearn 网格搜索
Using explicit (predefined) validation set for grid search with sklearn
我有一个数据集,它之前被分成了 3 组:训练、验证和测试。必须按给定的方式使用这些集合,以便比较不同算法的性能。
我现在想使用验证集优化 SVM 的参数。但是,我找不到如何将验证集显式输入 sklearn.grid_search.GridSearchCV()
。下面是我以前用于对训练集进行 K 折交叉验证的一些代码。但是,对于这个问题,我需要使用给定的验证集。我该怎么做?
from sklearn import svm, cross_validation
from sklearn.grid_search import GridSearchCV
# (some code left out to simplify things)
skf = cross_validation.StratifiedKFold(y_train, n_folds=5, shuffle = True)
clf = GridSearchCV(svm.SVC(tol=0.005, cache_size=6000,
class_weight=penalty_weights),
param_grid=tuned_parameters,
n_jobs=2,
pre_dispatch="n_jobs",
cv=skf,
scoring=scorer)
clf.fit(X_train, y_train)
ps = PredefinedSplit(test_fold=your_test_fold)
然后在GridSearchCV
中设置cv=ps
test_fold : “array-like, shape (n_samples,)
test_fold[i] gives the test set fold of sample i. A value of -1 indicates that the corresponding sample is not part of any test set folds, but will instead always be put into the training fold.
另见 here
when using a validation set, set the test_fold to 0 for all samples that are part of the validation set, and to -1 for all other samples.
考虑使用我是其作者的 hypopt
Python 包 (pip install hypopt
)。它是专门为使用验证集进行参数优化而创建的专业包。它适用于开箱即用的任何 scikit-learn 模型,也可以与 Tensorflow、PyTorch、Caffe2 等一起使用。
# Code from https://github.com/cgnorthcutt/hypopt
# Assuming you already have train, test, val sets and a model.
from hypopt import GridSearch
param_grid = [
{'C': [1, 10, 100], 'kernel': ['linear']},
{'C': [1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
# Grid-search all parameter combinations using a validation set.
opt = GridSearch(model = SVR(), param_grid = param_grid)
opt.fit(X_train, y_train, X_val, y_val)
print('Test Score for Optimized Parameters:', opt.score(X_test, y_test))
编辑:我(认为我)收到了此回复的 -1,因为我建议使用我编写的包。这很不幸,因为该包是专门为解决此类问题而创建的。
# Import Libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import PredefinedSplit
# Split Data to Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.8, stratify = y,random_state = 2020)
# Create a list where train data indices are -1 and validation data indices are 0
split_index = [-1 if x in X_train.index else 0 for x in X.index]
# Use the list to create PredefinedSplit
pds = PredefinedSplit(test_fold = split_index)
# Use PredefinedSplit in GridSearchCV
clf = GridSearchCV(estimator = estimator,
cv=pds,
param_grid=param_grid)
# Fit with all data
clf.fit(X, y)
添加到原始答案中,当训练有效测试拆分未使用 Scikit-learn 的 train_test_split()
函数完成时,即数据帧已经事先手动拆分 scaled/normalized 所以为了防止训练数据泄漏,可以将numpy数组连接起来。
import numpy as np
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
from sklearn.model_selection import PredefinedSplit, GridSearchCV
split_index = [-1]*len(X_train) + [0]*len(X_val)
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val), axis=0)
pds = PredefinedSplit(test_fold = split_index)
clf = GridSearchCV(estimator = estimator,
cv=pds,
param_grid=param_grid)
# Fit with all data
clf.fit(X, y)
我想提供一些可重现的代码,使用最后 20% 的观察结果创建验证拆分。
from sklearn import datasets
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
# load data
df_train = datasets.fetch_california_housing(as_frame=True).data
y = datasets.fetch_california_housing().target
param_grid = {"max_depth": [5, 6],
'learning_rate': [0.03, 0.06],
'subsample': [.5, .75]
}
model = GradientBoostingRegressor()
# Create a single validation split
val_prop = .2
n_val_rows = round(len(df_train) * val_prop)
val_starting_index = len(df_train) - n_val_rows
cv = PredefinedSplit([-1 if i < val_starting_index else 0 for i in df_train.index])
# Use PredefinedSplit in GridSearchCV
results = GridSearchCV(estimator = model,
cv=cv,
param_grid=param_grid,
verbose=True,
n_jobs=-1)
# Fit with all data
results.fit(df_train, y)
results.best_params_
我有一个数据集,它之前被分成了 3 组:训练、验证和测试。必须按给定的方式使用这些集合,以便比较不同算法的性能。
我现在想使用验证集优化 SVM 的参数。但是,我找不到如何将验证集显式输入 sklearn.grid_search.GridSearchCV()
。下面是我以前用于对训练集进行 K 折交叉验证的一些代码。但是,对于这个问题,我需要使用给定的验证集。我该怎么做?
from sklearn import svm, cross_validation
from sklearn.grid_search import GridSearchCV
# (some code left out to simplify things)
skf = cross_validation.StratifiedKFold(y_train, n_folds=5, shuffle = True)
clf = GridSearchCV(svm.SVC(tol=0.005, cache_size=6000,
class_weight=penalty_weights),
param_grid=tuned_parameters,
n_jobs=2,
pre_dispatch="n_jobs",
cv=skf,
scoring=scorer)
clf.fit(X_train, y_train)
ps = PredefinedSplit(test_fold=your_test_fold)
然后在GridSearchCV
cv=ps
test_fold : “array-like, shape (n_samples,)
test_fold[i] gives the test set fold of sample i. A value of -1 indicates that the corresponding sample is not part of any test set folds, but will instead always be put into the training fold.
另见 here
when using a validation set, set the test_fold to 0 for all samples that are part of the validation set, and to -1 for all other samples.
考虑使用我是其作者的 hypopt
Python 包 (pip install hypopt
)。它是专门为使用验证集进行参数优化而创建的专业包。它适用于开箱即用的任何 scikit-learn 模型,也可以与 Tensorflow、PyTorch、Caffe2 等一起使用。
# Code from https://github.com/cgnorthcutt/hypopt
# Assuming you already have train, test, val sets and a model.
from hypopt import GridSearch
param_grid = [
{'C': [1, 10, 100], 'kernel': ['linear']},
{'C': [1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
# Grid-search all parameter combinations using a validation set.
opt = GridSearch(model = SVR(), param_grid = param_grid)
opt.fit(X_train, y_train, X_val, y_val)
print('Test Score for Optimized Parameters:', opt.score(X_test, y_test))
编辑:我(认为我)收到了此回复的 -1,因为我建议使用我编写的包。这很不幸,因为该包是专门为解决此类问题而创建的。
# Import Libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import PredefinedSplit
# Split Data to Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.8, stratify = y,random_state = 2020)
# Create a list where train data indices are -1 and validation data indices are 0
split_index = [-1 if x in X_train.index else 0 for x in X.index]
# Use the list to create PredefinedSplit
pds = PredefinedSplit(test_fold = split_index)
# Use PredefinedSplit in GridSearchCV
clf = GridSearchCV(estimator = estimator,
cv=pds,
param_grid=param_grid)
# Fit with all data
clf.fit(X, y)
添加到原始答案中,当训练有效测试拆分未使用 Scikit-learn 的 train_test_split()
函数完成时,即数据帧已经事先手动拆分 scaled/normalized 所以为了防止训练数据泄漏,可以将numpy数组连接起来。
import numpy as np
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
from sklearn.model_selection import PredefinedSplit, GridSearchCV
split_index = [-1]*len(X_train) + [0]*len(X_val)
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val), axis=0)
pds = PredefinedSplit(test_fold = split_index)
clf = GridSearchCV(estimator = estimator,
cv=pds,
param_grid=param_grid)
# Fit with all data
clf.fit(X, y)
我想提供一些可重现的代码,使用最后 20% 的观察结果创建验证拆分。
from sklearn import datasets
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
# load data
df_train = datasets.fetch_california_housing(as_frame=True).data
y = datasets.fetch_california_housing().target
param_grid = {"max_depth": [5, 6],
'learning_rate': [0.03, 0.06],
'subsample': [.5, .75]
}
model = GradientBoostingRegressor()
# Create a single validation split
val_prop = .2
n_val_rows = round(len(df_train) * val_prop)
val_starting_index = len(df_train) - n_val_rows
cv = PredefinedSplit([-1 if i < val_starting_index else 0 for i in df_train.index])
# Use PredefinedSplit in GridSearchCV
results = GridSearchCV(estimator = model,
cv=cv,
param_grid=param_grid,
verbose=True,
n_jobs=-1)
# Fit with all data
results.fit(df_train, y)
results.best_params_