使 sklearn 中的网格搜索功能忽略空模型
Make grid search functions in sklearn to ignore empty models
使用 python 和 scikit-learn,我想进行网格搜索。但是我的一些模型最终是空的。如何让网格搜索功能忽略那些模型?
我想我可以有一个评分函数,如果模型为空,returns 0,但我不确定如何。
predictor = sklearn.svm.LinearSVC(penalty='l1', dual=False, class_weight='auto')
param_dist = {'C': pow(2.0, np.arange(-10, 11))}
learner = sklearn.grid_search.GridSearchCV(estimator=predictor,
param_grid=param_dist,
n_jobs=self.n_jobs, cv=5,
verbose=0)
learner.fit(X, y)
我的数据是这样的 learner
对象将选择对应于空模型的 C
。知道如何确保模型不为空吗?
编辑:"empty model" 我的意思是一个模型选择了 0 个要使用的特征。特别是对于 l1
正则化模型,这很容易发生。所以在这种情况下,如果SVM中的C
足够小,优化问题会找到0向量作为系数的最优解。因此 predictor.coef_
将是 0
s 的向量。
我不认为有这样的内置功能;但是,制作自定义网格搜索器很容易:
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
import itertools
from sklearn import metrics
import operator
def model_eval(X, y, model, cv):
scores = []
for train_idx, test_idx in cv:
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]
model.fit(X_train, y_train)
nonzero_coefs = len(np.nonzero(model.coef_)[0]) #check for nonzero coefs
if nonzero_coefs == 0: #if they're all zero, don't evaluate any further; move to next hyperparameter combo
return 0
predictions = model.predict(X_test)
score = metrics.accuracy_score(y_test, predictions)
scores.append(score)
return np.array(scores).mean()
X, y = make_classification(n_samples=1000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
n_classes=2,
random_state=0,
shuffle=False)
C = pow(2.0, np.arange(-20, 11))
penalty = {'l1', 'l2'}
parameter_grid = itertools.product(C, penalty)
kf = KFold(X.shape[0], n_folds=5) #use the same folds to evaluate each hyperparameter combo
hyperparameter_scores = {}
for C, penalty in parameter_grid:
model = svm.LinearSVC(dual=False, C=C, penalty=penalty)
result = model_eval(X, y, model, kf)
hyperparameter_scores[(C, penalty)] = result
sorted_scores = sorted(hyperparameter_scores.items(), key=operator.itemgetter(1))
best_parameters, best_score = sorted_scores[-1]
print best_parameters
print best_score
尝试实现自定义记分器,类似于:
import numpy as np
def scorer_(estimator, X, y):
# Your criterion here
if np.allclose(estimator.coef_, np.zeros_like(estimator.coef_)):
return 0
else:
return estimator.score(X, y)
learner = sklearn.grid_search.GridSearchCV(...
scoring=scorer_)
使用 python 和 scikit-learn,我想进行网格搜索。但是我的一些模型最终是空的。如何让网格搜索功能忽略那些模型?
我想我可以有一个评分函数,如果模型为空,returns 0,但我不确定如何。
predictor = sklearn.svm.LinearSVC(penalty='l1', dual=False, class_weight='auto')
param_dist = {'C': pow(2.0, np.arange(-10, 11))}
learner = sklearn.grid_search.GridSearchCV(estimator=predictor,
param_grid=param_dist,
n_jobs=self.n_jobs, cv=5,
verbose=0)
learner.fit(X, y)
我的数据是这样的 learner
对象将选择对应于空模型的 C
。知道如何确保模型不为空吗?
编辑:"empty model" 我的意思是一个模型选择了 0 个要使用的特征。特别是对于 l1
正则化模型,这很容易发生。所以在这种情况下,如果SVM中的C
足够小,优化问题会找到0向量作为系数的最优解。因此 predictor.coef_
将是 0
s 的向量。
我不认为有这样的内置功能;但是,制作自定义网格搜索器很容易:
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
import itertools
from sklearn import metrics
import operator
def model_eval(X, y, model, cv):
scores = []
for train_idx, test_idx in cv:
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]
model.fit(X_train, y_train)
nonzero_coefs = len(np.nonzero(model.coef_)[0]) #check for nonzero coefs
if nonzero_coefs == 0: #if they're all zero, don't evaluate any further; move to next hyperparameter combo
return 0
predictions = model.predict(X_test)
score = metrics.accuracy_score(y_test, predictions)
scores.append(score)
return np.array(scores).mean()
X, y = make_classification(n_samples=1000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
n_classes=2,
random_state=0,
shuffle=False)
C = pow(2.0, np.arange(-20, 11))
penalty = {'l1', 'l2'}
parameter_grid = itertools.product(C, penalty)
kf = KFold(X.shape[0], n_folds=5) #use the same folds to evaluate each hyperparameter combo
hyperparameter_scores = {}
for C, penalty in parameter_grid:
model = svm.LinearSVC(dual=False, C=C, penalty=penalty)
result = model_eval(X, y, model, kf)
hyperparameter_scores[(C, penalty)] = result
sorted_scores = sorted(hyperparameter_scores.items(), key=operator.itemgetter(1))
best_parameters, best_score = sorted_scores[-1]
print best_parameters
print best_score
尝试实现自定义记分器,类似于:
import numpy as np
def scorer_(estimator, X, y):
# Your criterion here
if np.allclose(estimator.coef_, np.zeros_like(estimator.coef_)):
return 0
else:
return estimator.score(X, y)
learner = sklearn.grid_search.GridSearchCV(...
scoring=scorer_)