GridSearchCV 可以与自定义分类器一起使用吗?
Can GridSearchCV be used with a custom classifier?
我手工编写了一个自定义分类器,它实现了标准的 sklearn 分类器方法(fit()、predict() 和 predict_proba())。
它可以直接与 sklearn 的 GridSearchCV() 工具一起使用,还是需要再添加其他内容?
编辑 1:
根据 cel 的建议,我尝试直接应用它
第一步是添加 get_params 和 set_params,如此处(原帖链接)所述。
这样一来,完整的交叉验证流程确实能够运行,但最后出现了以下错误:
return self._fit(X, y, ParameterGrid(self.param_grid))
best_estimator.fit(X, y, **self.fit_params)
AttributeError: 'NoneType' object has no attribute 'fit'
编辑 2:添加分类器代码(它是一个基于 theano 的逻辑回归分类器)
class LogisticRegression:
    """Minibatch logistic regression built on a single Theano softmax layer.

    Provides ``fit``/``predict``/``predict_proba`` together with
    ``get_params``/``set_params`` so the estimator can be driven by
    scikit-learn utilities such as ``GridSearchCV``.

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie
    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie
    :type batch_size: int
    :param batch_size: number of rows in each minibatch slice
    :type learning_rate: float
    :param learning_rate: factor applied to the gradients in every update
    :type iters: int
    :param iters: number of passes ``fit`` makes over the training data
    :type verbose: int
    :param verbose: when equal to 1, ``fit`` prints the cost after each pass
    """

    def __init__(self, n_in, n_out, batch_size=600, learning_rate=0.13,
                 iters=500, verbose=0):
        self.n_in = n_in
        self.n_out = n_out
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.iters = iters
        self.verbose = verbose
        self.single_layer = Layer(self.n_in, self.n_out, T.nnet.softmax)
        # Counts how many times partial_fit has been called.
        self.minibatch_count = 0

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict.

        :param deep: accepted for scikit-learn API compatibility; not used
        :return: mapping of constructor argument name to current value
        """
        return {
            "n_in": self.n_in,
            "n_out": self.n_out,
            "batch_size": self.batch_size,
            "learning_rate": self.learning_rate,
            "iters": self.iters,
            "verbose": self.verbose,
        }

    def set_params(self, **parameters):
        """Set each keyword argument as an attribute and return ``self``.

        :param parameters: attribute names mapped to their new values
        :return: this estimator
        """
        for name, value in parameters.items():
            setattr(self, name, value)
        # scikit-learn uses the return value of set_params (for example
        # ``clone(est).set_params(...)`` inside GridSearchCV). Returning None
        # here is what produces
        # "AttributeError: 'NoneType' object has no attribute 'fit'".
        return self

    def minibatch_trainer(self, data_x, data_y):
        """Run one gradient-descent pass over the data and return the mean cost.

        Rows beyond the last full batch are not visited.

        :param data_x: inputs; must offer ``get_value`` and slicing
        :param data_y: labels, sliced in step with ``data_x``
        :return: mean of the per-batch costs
        """
        # Explicit floor division: identical to "/" on Python 2 ints, and
        # still an int on Python 3.
        n_batches = data_x.get_value(borrow=True).shape[0] // self.batch_size
        tensor_x = T.matrix('x')
        tensor_y = T.ivector('y')
        index = T.lscalar('index')
        cost = self.single_layer.negative_log_likelihood(tensor_x, tensor_y)
        g_W = T.grad(cost, self.single_layer.W)
        g_b = T.grad(cost, self.single_layer.b)
        updates = [
            (self.single_layer.W,
             self.single_layer.W - g_W * self.learning_rate),
            (self.single_layer.b,
             self.single_layer.b - g_b * self.learning_rate),
        ]
        batch_start = index * self.batch_size
        batch_stop = (index + 1) * self.batch_size
        # NOTE(review): this function is rebuilt on every call, i.e. once per
        # pass in fit(); hoisting it may be worthwhile.
        train_batch = theano.function(
            [index], [cost],
            updates=updates,
            givens={tensor_x: data_x[batch_start:batch_stop],
                    tensor_y: data_y[batch_start:batch_stop]})
        return np.mean([train_batch(i) for i in xrange(n_batches)])

    def fit(self, data_x, data_y):
        """Train for ``self.iters`` passes over the data and return ``self``.

        :param data_x: training inputs, handed to ``shared_dataset``
        :param data_y: training labels, handed to ``shared_dataset``
        :return: this estimator
        """
        data_x, data_y = shared_dataset(data_x, data_y)
        start = time.clock()
        for epoch in xrange(self.iters):
            train_err = self.minibatch_trainer(data_x, data_y)
            if self.verbose == 1:
                print("Iter %d --> %f" % (epoch, train_err))
        end = time.clock()
        print("Finished Training Logistic Regression Model\n"
              "Iterations %d\n"
              "Time Taken : %d secs" % (self.iters, end - start))
        return self

    def partial_fit(self, data_x, data_y):
        """Run a single training pass over the given data and return ``self``.

        :param data_x: inputs, handed to ``shared_dataset``
        :param data_y: labels, handed to ``shared_dataset``
        :return: this estimator
        """
        data_x, data_y = shared_dataset(data_x, data_y)
        self.minibatch_count += 1
        err = self.minibatch_trainer(data_x, data_y)
        print("MiniBatch %d --> %f" % (self.minibatch_count, err))
        return self

    def predict(self, data_x):
        """Return the predicted class index for every input row.

        :param data_x: inputs, handed to ``shared_dataset``
        :return: the per-batch predictions joined with ``np.hstack``
        """
        data_x = shared_dataset(data_x)
        n_rows = data_x.get_value(borrow=True).shape[0]
        # Ceiling division: a trailing partial batch must still be predicted,
        # otherwise fewer predictions than input rows are returned.
        n_batches = (n_rows + self.batch_size - 1) // self.batch_size
        tensor_x = T.matrix('x')
        index = T.lscalar('index')
        tensor_ypred = self.prediction_tensor(tensor_x)
        batch_start = index * self.batch_size
        batch_stop = (index + 1) * self.batch_size
        predictor = theano.function(
            [index], tensor_ypred,
            givens={tensor_x: data_x[batch_start:batch_stop]})
        ypred = [predictor(i) for i in xrange(n_batches)]
        return np.hstack(ypred)

    def predict_proba(self, data_x):
        """Evaluate the layer's decision function on all rows in one call.

        :param data_x: inputs, handed to ``shared_dataset``
        :return: output of ``single_layer.decision_function_tensor``
        """
        data_x = shared_dataset(data_x)
        tensor_x = T.matrix('x')
        tensor_ypredproba = self.single_layer.decision_function_tensor(tensor_x)
        predproba_func = theano.function([], tensor_ypredproba,
                                         givens={tensor_x: data_x})
        return predproba_func()

    def prediction_tensor(self, tensor_x):
        """
        Returns the predicted y value as a tensor variable

        :param tensor_x: TensorType matrix on input data
        :return: TensorType tensor_ypred output
        """
        return T.argmax(
            self.single_layer.decision_function_tensor(tensor_x), axis=1)
编辑 3:添加 GridSearchCV 的确切用法
# Grid search over a single candidate (iters=3) with 4-fold cross-validation,
# scored by ROC AUC.
clf_cv = GridSearchCV(
    estimator=LogisticRegression(n_in=200, n_out=2),
    param_grid={"iters": [3]},
    cv=4,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1,
)
我也试过让该类继承 BaseEstimator 和 ClassifierMixin;sklearn.base.clone 没有报任何错误。
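我几分钟前也遇到了同样的问题。文档(documentation)中的示例不正确:GridSearchCV 会使用 set_params 的返回值,
而问题中的 set_params 没有 return 语句,返回的是 None,因此才会出现 'NoneType' object has no attribute 'fit'。
您必须让 set_params 以 return self 结尾: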
def set_params(self, **parameters):
    """Set every keyword argument as an attribute on ``self`` and return ``self``."""
    for name in parameters:
        setattr(self, name, parameters[name])
    return self
我手工编写了一个自定义分类器,它实现了标准的 sklearn 分类器方法(fit()、predict() 和 predict_proba())。
它可以直接与 sklearn 的 GridSearchCV() 工具一起使用,还是需要再添加其他内容?
编辑 1:根据 cel 的建议,我尝试直接应用它。
第一步是添加 get_params 和 set_params,如此处(原帖链接)所述。这样一来,完整的交叉验证流程确实能够运行,但最后出现了以下错误:
return self._fit(X, y, ParameterGrid(self.param_grid))
best_estimator.fit(X, y, **self.fit_params)
AttributeError: 'NoneType' object has no attribute 'fit'
编辑 2:添加分类器代码(它是一个基于 theano 的逻辑回归分类器)
class LogisticRegression:
    """Minibatch logistic regression built on a single Theano softmax layer.

    Provides ``fit``/``predict``/``predict_proba`` together with
    ``get_params``/``set_params`` so the estimator can be driven by
    scikit-learn utilities such as ``GridSearchCV``.

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie
    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie
    :type batch_size: int
    :param batch_size: number of rows in each minibatch slice
    :type learning_rate: float
    :param learning_rate: factor applied to the gradients in every update
    :type iters: int
    :param iters: number of passes ``fit`` makes over the training data
    :type verbose: int
    :param verbose: when equal to 1, ``fit`` prints the cost after each pass
    """

    def __init__(self, n_in, n_out, batch_size=600, learning_rate=0.13,
                 iters=500, verbose=0):
        self.n_in = n_in
        self.n_out = n_out
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.iters = iters
        self.verbose = verbose
        self.single_layer = Layer(self.n_in, self.n_out, T.nnet.softmax)
        # Counts how many times partial_fit has been called.
        self.minibatch_count = 0

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict.

        :param deep: accepted for scikit-learn API compatibility; not used
        :return: mapping of constructor argument name to current value
        """
        return {
            "n_in": self.n_in,
            "n_out": self.n_out,
            "batch_size": self.batch_size,
            "learning_rate": self.learning_rate,
            "iters": self.iters,
            "verbose": self.verbose,
        }

    def set_params(self, **parameters):
        """Set each keyword argument as an attribute and return ``self``.

        :param parameters: attribute names mapped to their new values
        :return: this estimator
        """
        for name, value in parameters.items():
            setattr(self, name, value)
        # scikit-learn uses the return value of set_params (for example
        # ``clone(est).set_params(...)`` inside GridSearchCV). Returning None
        # here is what produces
        # "AttributeError: 'NoneType' object has no attribute 'fit'".
        return self

    def minibatch_trainer(self, data_x, data_y):
        """Run one gradient-descent pass over the data and return the mean cost.

        Rows beyond the last full batch are not visited.

        :param data_x: inputs; must offer ``get_value`` and slicing
        :param data_y: labels, sliced in step with ``data_x``
        :return: mean of the per-batch costs
        """
        # Explicit floor division: identical to "/" on Python 2 ints, and
        # still an int on Python 3.
        n_batches = data_x.get_value(borrow=True).shape[0] // self.batch_size
        tensor_x = T.matrix('x')
        tensor_y = T.ivector('y')
        index = T.lscalar('index')
        cost = self.single_layer.negative_log_likelihood(tensor_x, tensor_y)
        g_W = T.grad(cost, self.single_layer.W)
        g_b = T.grad(cost, self.single_layer.b)
        updates = [
            (self.single_layer.W,
             self.single_layer.W - g_W * self.learning_rate),
            (self.single_layer.b,
             self.single_layer.b - g_b * self.learning_rate),
        ]
        batch_start = index * self.batch_size
        batch_stop = (index + 1) * self.batch_size
        # NOTE(review): this function is rebuilt on every call, i.e. once per
        # pass in fit(); hoisting it may be worthwhile.
        train_batch = theano.function(
            [index], [cost],
            updates=updates,
            givens={tensor_x: data_x[batch_start:batch_stop],
                    tensor_y: data_y[batch_start:batch_stop]})
        return np.mean([train_batch(i) for i in xrange(n_batches)])

    def fit(self, data_x, data_y):
        """Train for ``self.iters`` passes over the data and return ``self``.

        :param data_x: training inputs, handed to ``shared_dataset``
        :param data_y: training labels, handed to ``shared_dataset``
        :return: this estimator
        """
        data_x, data_y = shared_dataset(data_x, data_y)
        start = time.clock()
        for epoch in xrange(self.iters):
            train_err = self.minibatch_trainer(data_x, data_y)
            if self.verbose == 1:
                print("Iter %d --> %f" % (epoch, train_err))
        end = time.clock()
        print("Finished Training Logistic Regression Model\n"
              "Iterations %d\n"
              "Time Taken : %d secs" % (self.iters, end - start))
        return self

    def partial_fit(self, data_x, data_y):
        """Run a single training pass over the given data and return ``self``.

        :param data_x: inputs, handed to ``shared_dataset``
        :param data_y: labels, handed to ``shared_dataset``
        :return: this estimator
        """
        data_x, data_y = shared_dataset(data_x, data_y)
        self.minibatch_count += 1
        err = self.minibatch_trainer(data_x, data_y)
        print("MiniBatch %d --> %f" % (self.minibatch_count, err))
        return self

    def predict(self, data_x):
        """Return the predicted class index for every input row.

        :param data_x: inputs, handed to ``shared_dataset``
        :return: the per-batch predictions joined with ``np.hstack``
        """
        data_x = shared_dataset(data_x)
        n_rows = data_x.get_value(borrow=True).shape[0]
        # Ceiling division: a trailing partial batch must still be predicted,
        # otherwise fewer predictions than input rows are returned.
        n_batches = (n_rows + self.batch_size - 1) // self.batch_size
        tensor_x = T.matrix('x')
        index = T.lscalar('index')
        tensor_ypred = self.prediction_tensor(tensor_x)
        batch_start = index * self.batch_size
        batch_stop = (index + 1) * self.batch_size
        predictor = theano.function(
            [index], tensor_ypred,
            givens={tensor_x: data_x[batch_start:batch_stop]})
        ypred = [predictor(i) for i in xrange(n_batches)]
        return np.hstack(ypred)

    def predict_proba(self, data_x):
        """Evaluate the layer's decision function on all rows in one call.

        :param data_x: inputs, handed to ``shared_dataset``
        :return: output of ``single_layer.decision_function_tensor``
        """
        data_x = shared_dataset(data_x)
        tensor_x = T.matrix('x')
        tensor_ypredproba = self.single_layer.decision_function_tensor(tensor_x)
        predproba_func = theano.function([], tensor_ypredproba,
                                         givens={tensor_x: data_x})
        return predproba_func()

    def prediction_tensor(self, tensor_x):
        """
        Returns the predicted y value as a tensor variable

        :param tensor_x: TensorType matrix on input data
        :return: TensorType tensor_ypred output
        """
        return T.argmax(
            self.single_layer.decision_function_tensor(tensor_x), axis=1)
编辑 3:添加 GridSearchCV 的确切用法
# Grid search over a single candidate (iters=3) with 4-fold cross-validation,
# scored by ROC AUC.
clf_cv = GridSearchCV(
    estimator=LogisticRegression(n_in=200, n_out=2),
    param_grid={"iters": [3]},
    cv=4,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1,
)
我也试过让该类继承 BaseEstimator 和 ClassifierMixin;sklearn.base.clone 没有报任何错误。
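我几分钟前也遇到了同样的问题。文档(documentation)中的示例不正确:GridSearchCV 会使用 set_params 的返回值,
而问题中的 set_params 没有 return 语句,返回的是 None,因此才会出现 'NoneType' object has no attribute 'fit'。
您必须让 set_params 以 return self 结尾: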
def set_params(self, **parameters):
    """Set every keyword argument as an attribute on ``self`` and return ``self``."""
    for name in parameters:
        setattr(self, name, parameters[name])
    return self