使用自定义分类器时 GridSearchCV 出现问题
Problem with GridSearchCV when using Custom Classifier
我编写了一个自定义分类器 CustomClassifier(类名为 OwnClassifier),它先把因变量离散化(代码中用 KBinsDiscretizer 分成 4 个区间),再在离散后的标签上训练分类器。代码如下:
class OwnClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that bins a continuous target before fitting.

    The target is discretized into 4 ordinal bins with a
    ``KBinsDiscretizer`` and the wrapped estimator is trained on the
    resulting bin indices.
    """

    def __init__(self, estimator=None):
        """Store the wrapped estimator (liblinear LogisticRegression if None)."""
        self.discr = KBinsDiscretizer(n_bins=4, encode='ordinal')
        self.estimator = (
            LogisticRegression(solver='liblinear') if estimator is None else estimator
        )
        # Binned training labels; populated by fit().
        self.yt = None

    def fit(self, X, y):
        """Bin ``y``, remember the binned labels and fit the wrapped estimator."""
        self.yt = y.copy()
        target_column = self.yt.reshape(-1, 1)
        self.yt = self.discr.fit_transform(target_column).astype(int)
        self.estimator.fit(X, self.yt.ravel())
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y=None):
        """Return the accuracy of the predictions for ``X``.

        ``y`` is ignored: predictions are compared against the binned labels
        stored by the most recent ``fit`` call, so ``X`` must have as many
        rows as the training data.
        """
        predicted = self.predict(X)
        return accuracy_score(self.yt, predicted)
在其上使用 GridSearchCV 时,抛出错误:
# Search over the regularisation strength C of the wrapped estimator.
grid = [{'estimator__C': [1, 10, 100, 1000]}]
myLogi = OwnClassifier()
gridCv = GridSearchCV(estimator=myLogi, param_grid=grid)
gridCv.fit(X, y)
如何让这个分类器与 GridSearchCV 兼容?
我使用波士顿住房数据
# Load the Boston housing data and split it into features and target.
boston_data = load_boston()
X, y = boston_data['data'], boston_data['target']
错误:
ValueError: Found input variables with inconsistent numbers of samples: [404, 102]
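问题出在 score 方法中:它忽略传入的 y,始终用 fit 时保存的训练标签 self.yt 来计算准确率。GridSearchCV 在验证折(102 个样本)上调用 score 时,self.predict(X) 返回 102 个预测值,
而 self.yt 仍然是 404 个训练标签,所以回溯中报告样本数不一致 [404, 102]。以下代码修复了这个问题: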
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_boston
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import GridSearchCV
class OwnClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that bins a continuous target before fitting.

    The target is discretized into 4 ordinal bins with a
    ``KBinsDiscretizer``; the wrapped estimator is trained and scored on the
    resulting bin indices.

    Parameters
    ----------
    estimator : estimator object or None
        Classifier to wrap. When None, a
        ``LogisticRegression(solver='liblinear')`` is used.
    """

    def __init__(self, estimator=None):
        if estimator is None:
            estimator = LogisticRegression(solver='liblinear')
        self.estimator = estimator
        self.discr = KBinsDiscretizer(n_bins=4, encode='ordinal')

    def _bin_target(self, y):
        """Map ``y`` to 1-D integer bin indices with the fitted discretizer."""
        return self.discr.transform(y.reshape(-1, 1)).astype(int).ravel()

    def fit(self, X, y):
        """Fit the discretizer on ``y`` and the wrapped estimator on the bins.

        Returns
        -------
        self
            The fitted classifier, so that calls can be chained.
        """
        # fit the discretizer
        self.discr.fit(y.reshape(-1, 1))
        # fit the model on the binned target
        self.estimator.fit(X, self._bin_target(y))
        # scikit-learn estimators return self from fit
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the accuracy of the predictions for ``X`` against binned ``y``.

        ``y`` is binned with the discretizer fitted in ``fit`` so that it is
        comparable with the predicted bin indices.
        """
        return accuracy_score(self._bin_target(y), self.predict(X))
# NOTE(review): load_boston is reportedly no longer available in recent
# scikit-learn releases -- confirm against the installed version.
boston_data = load_boston()
X, y = boston_data['data'], boston_data['target']

# Search over the regularisation strength C of the wrapped estimator.
grid = [{'estimator__C': [1, 10, 100, 1000]}]
myLogi = OwnClassifier()
gridCv = GridSearchCV(estimator=myLogi, param_grid=grid)
gridCv.fit(X, y)
我编写了一个自定义分类器 CustomClassifier(类名为 OwnClassifier),它先把因变量离散化(代码中用 KBinsDiscretizer 分成 4 个区间),再在离散后的标签上训练分类器。代码如下:
class OwnClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that bins a continuous target before fitting.

    The target is discretized into 4 ordinal bins with a
    ``KBinsDiscretizer`` and the wrapped estimator is trained on the
    resulting bin indices.
    """

    def __init__(self, estimator=None):
        """Store the wrapped estimator (liblinear LogisticRegression if None)."""
        self.discr = KBinsDiscretizer(n_bins=4, encode='ordinal')
        self.estimator = (
            LogisticRegression(solver='liblinear') if estimator is None else estimator
        )
        # Binned training labels; populated by fit().
        self.yt = None

    def fit(self, X, y):
        """Bin ``y``, remember the binned labels and fit the wrapped estimator."""
        self.yt = y.copy()
        target_column = self.yt.reshape(-1, 1)
        self.yt = self.discr.fit_transform(target_column).astype(int)
        self.estimator.fit(X, self.yt.ravel())
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y=None):
        """Return the accuracy of the predictions for ``X``.

        ``y`` is ignored: predictions are compared against the binned labels
        stored by the most recent ``fit`` call, so ``X`` must have as many
        rows as the training data.
        """
        predicted = self.predict(X)
        return accuracy_score(self.yt, predicted)
在其上使用 GridSearchCV 时,抛出错误:
# Search over the regularisation strength C of the wrapped estimator.
grid = [{'estimator__C': [1, 10, 100, 1000]}]
myLogi = OwnClassifier()
gridCv = GridSearchCV(estimator=myLogi, param_grid=grid)
gridCv.fit(X, y)
如何让这个分类器与 GridSearchCV 兼容?
我使用波士顿住房数据
# Load the Boston housing data and split it into features and target.
boston_data = load_boston()
X, y = boston_data['data'], boston_data['target']
错误:
ValueError: Found input variables with inconsistent numbers of samples: [404, 102]
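问题出在 score 方法中:它忽略传入的 y,始终用 fit 时保存的训练标签 self.yt 来计算准确率。GridSearchCV 在验证折(102 个样本)上调用 score 时,self.predict(X) 返回 102 个预测值,
而 self.yt 仍然是 404 个训练标签,所以回溯中报告样本数不一致 [404, 102]。以下代码修复了这个问题: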
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_boston
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import GridSearchCV
class OwnClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that bins a continuous target before fitting.

    The target is discretized into 4 ordinal bins with a
    ``KBinsDiscretizer``; the wrapped estimator is trained and scored on the
    resulting bin indices.

    Parameters
    ----------
    estimator : estimator object or None
        Classifier to wrap. When None, a
        ``LogisticRegression(solver='liblinear')`` is used.
    """

    def __init__(self, estimator=None):
        if estimator is None:
            estimator = LogisticRegression(solver='liblinear')
        self.estimator = estimator
        self.discr = KBinsDiscretizer(n_bins=4, encode='ordinal')

    def _bin_target(self, y):
        """Map ``y`` to 1-D integer bin indices with the fitted discretizer."""
        return self.discr.transform(y.reshape(-1, 1)).astype(int).ravel()

    def fit(self, X, y):
        """Fit the discretizer on ``y`` and the wrapped estimator on the bins.

        Returns
        -------
        self
            The fitted classifier, so that calls can be chained.
        """
        # fit the discretizer
        self.discr.fit(y.reshape(-1, 1))
        # fit the model on the binned target
        self.estimator.fit(X, self._bin_target(y))
        # scikit-learn estimators return self from fit
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the accuracy of the predictions for ``X`` against binned ``y``.

        ``y`` is binned with the discretizer fitted in ``fit`` so that it is
        comparable with the predicted bin indices.
        """
        return accuracy_score(self._bin_target(y), self.predict(X))
# NOTE(review): load_boston is reportedly no longer available in recent
# scikit-learn releases -- confirm against the installed version.
boston_data = load_boston()
X, y = boston_data['data'], boston_data['target']

# Search over the regularisation strength C of the wrapped estimator.
grid = [{'estimator__C': [1, 10, 100, 1000]}]
myLogi = OwnClassifier()
gridCv = GridSearchCV(estimator=myLogi, param_grid=grid)
gridCv.fit(X, y)