如何根据构造函数参数创建带有类属性的子类,并将其用作 GridSearchCV 的估算器?

How to create a subclass with class attributes based on constructor function arguments for use in an estimator for GridSearchCV?

我想继承(subclass)sklearn.svm.LinearSVC,并将这个子类用作 sklearn.model_selection.GridSearchCV 的估算器。我之前在编写子类时遇到过一些问题,我想我已经根据我之前的提问及其被采纳的答案把它们修复了。

但是,现在我的目标是创建一个 sklearn.kernel_approximation.RBFSampler 对象,作为我的新类的属性。这只是一个例子,我真正想问的是一个更一般的问题:

问题: 考虑到最终要把我的新估算器类与 GridSearchCV 一起使用,我该如何根据传递给构造函数的参数值(或未传入的参数)来创建属性?

到目前为止,我已经尝试过类似以下的操作:

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_approximation import RBFSampler
from sklearn.datasets import load_breast_cancer

RANDOM_STATE = 123


class LinearSVCSub(LinearSVC):
    """LinearSVC subclass that is meant to optionally pass X through an RBFSampler.

    This is the failing version from the question and is kept as posted so that
    it still matches the traceback quoted further down.  Known defects are
    flagged with NOTE(review) comments instead of being fixed here.

    Extra constructor parameters on top of LinearSVC's:
        sampler_gamma: forwarded as ``gamma`` to RBFSampler by create_sampler().
        sampler_n: forwarded as ``n_components`` to RBFSampler by create_sampler().
    """
    def __init__(self, penalty='l2', loss='squared_hinge', sampler_gamma=None, sampler_n=None,
                 dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1,
                 class_weight=None, verbose=0, random_state=None, max_iter=1000):
        """Forward the LinearSVC parameters to the parent and store the sampler settings."""

        super(LinearSVCSub, self).__init__(penalty=penalty, loss=loss, dual=dual, tol=tol,
                                           C=C, multi_class=multi_class, fit_intercept=fit_intercept,
                                           intercept_scaling=intercept_scaling, class_weight=class_weight,
                                           verbose=verbose, random_state=random_state, max_iter=max_iter)

        self.sampler_gamma = sampler_gamma
        self.sampler_n = sampler_n
        
        # I have also tried a conditional statement here instead of
        #  within a separate function create_sampler()
        # NOTE(review): this is the line that raises the NameError shown in the
        # traceback -- create_sampler is a method, so the bare name is not in
        # scope here; it would have to be called as self.create_sampler().
        # NOTE(review): even with that fixed, the sampler would be derived from
        # the constructor-time values only; the answer below moves this logic
        # out of __init__ for that reason.
        self.sampler = create_sampler()
       

    def fit(self, X, y, sample_weight=None):
        """Transform X with transform_this(), fit the parent LinearSVC on it, return self."""
        X = self.transform_this(X)
        super(LinearSVCSub, self).fit(X, y, sample_weight)
        return self

    def predict(self, X):
        """Transform X with transform_this() and return the parent's predict() result."""
        X = self.transform_this(X)
        return super(LinearSVCSub, self).predict(X)

    def score(self, X, y, sample_weight=None):
        """Transform X with transform_this() and return the parent's score() result."""
        X = self.transform_this(X)
        return super(LinearSVCSub, self).score(X, y, sample_weight)

    def decision_function(self, X):
        """Transform X with transform_this() and return the parent's decision_function() result."""
        X = self.transform_this(X)
        return super(LinearSVCSub, self).decision_function(X)

    def transform_this(self, X):
        """Return X mapped through the sampler when one is set, otherwise X unchanged."""
        if self.sampler is not None:
            # NOTE(review): `sampler` is not defined as a local or module-level
            # name in this snippet, so this line would raise NameError if it
            # were reached; self.sampler was presumably intended.
            # NOTE(review): fit_transform re-fits the sampler on every call,
            # including the calls made from predict/score/decision_function.
            X = sampler.fit_transform(X)
        return X
    
    def create_sampler(self):
        """Return an RBFSampler built from sampler_gamma/sampler_n, or None if either is None."""
         # If sampler_gamma and sampler_n have been given, create a sampler
        if (self.sampler_gamma is not None) and (self.sampler_n is not None):
            sampler = RBFSampler(gamma=self.sampler_gamma, n_components=self.sampler_n)
        else:
            sampler = None
        
        return sampler


# Script entry point of the question's example: grid-search LinearSVCSub on the
# breast-cancer dataset.  Kept as posted because the traceback quoted below
# refers to these lines.
if __name__ == '__main__':
    data = load_breast_cancer()
    X, y = data.data, data.target

    # Parameter tuning with custom LinearSVC
    # The grid covers both LinearSVC's own parameters (C, dual, random_state)
    # and the two extra sampler parameters added by the subclass.
    param_grid = {'C': [0.00001, 0.0005],
                      'dual': (True, False), 'random_state': [RANDOM_STATE],
                      'sampler_gamma': [0.90, 0.60, 0.30],
                      'sampler_n': [10, 200]}

    # NOTE(review): LinearSVCSub() is constructed here with default arguments;
    # this is the call the quoted traceback points at.
    gs_model = GridSearchCV(estimator=LinearSVCSub(), verbose=1, param_grid=param_grid,
                            scoring='roc_auc', n_jobs=-1, cv=2)
    gs_model.fit(X, y)
    # Bare expression: only displays something when run as the last statement
    # of a notebook cell.
    gs_model.cv_results_

不过,据我所知,GridSearchCV 会先用默认值实例化估算器对象,并且其实现方式与 sklearn.tree.DecisionTreeClassifier 中的 feature_importances_ 属性类似。

此外,我从上面的代码中得到的错误是:

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-a11420cc931e> in <module>
     66                       'sampler_n': [10, 200]}
     67 
---> 68     gs_model = GridSearchCV(estimator=LinearSVCSub(), verbose=1, param_grid=param_grid,
     69                             scoring='roc_auc', n_jobs=-1, cv=2)
     70     gs_model.fit(X, y)

<ipython-input-6-a11420cc931e> in __init__(self, penalty, loss, sampler_gamma, sampler_n, dual, tol, C, multi_class, fit_intercept, intercept_scaling, class_weight, verbose, random_state, max_iter)
     21         self.sampler_n = sampler_n
     22 
---> 23         self.sampler = create_sampler()
     24 
     25 

NameError: name 'create_sampler' is not defined
  1. 将 __init__ 构造函数仅用作存储参数(属性)的容器。
  2. 把所有相应的逻辑都放到方法中去做。
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_approximation import RBFSampler
from sklearn.datasets import load_breast_cancer

RANDOM_STATE = 123


class LinearSVCSub(LinearSVC):
    """LinearSVC that can map its input through an RBFSampler first.

    ``__init__`` only stores its arguments, so the estimator can be cloned and
    re-parameterised by GridSearchCV.  The RBFSampler is built and fitted
    inside ``fit`` and stored as ``sampler_``; the same fitted sampler is then
    reused at prediction time so that training and prediction data go through
    one and the same random feature map.

    Parameters (in addition to LinearSVC's own):
        sampler_gamma: ``gamma`` passed to RBFSampler.
        sampler_n: ``n_components`` passed to RBFSampler.
        sampler: truthy to enable the RBFSampler step, falsy/None to disable it.

    Attributes set by ``fit``:
        sampler_: the fitted RBFSampler, or None when sampling is disabled.
    """

    def __init__(self, penalty='l2', loss='squared_hinge', sampler_gamma=None, sampler_n=None,
                 dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1,
                 class_weight=None, verbose=0, random_state=None, max_iter=1000, sampler=None):
        """Store all parameters unchanged; no derived state is created here."""
        super(LinearSVCSub, self).__init__(penalty=penalty, loss=loss, dual=dual, tol=tol,
                                           C=C, multi_class=multi_class, fit_intercept=fit_intercept,
                                           intercept_scaling=intercept_scaling, class_weight=class_weight,
                                           verbose=verbose, random_state=random_state, max_iter=max_iter)

        self.sampler_gamma = sampler_gamma
        self.sampler_n = sampler_n
        self.sampler = sampler

    def fit(self, X, y, sample_weight=None):
        """Fit the optional sampler on X, then fit LinearSVC on the mapped data.

        Returns:
            self

        Raises:
            ValueError: if ``sampler`` is enabled but ``sampler_gamma`` or
                ``sampler_n`` is None.
        """
        # Rebuilt on every fit so that parameters changed through set_params()
        # (as GridSearchCV does) take effect.
        self.sampler_ = self._build_sampler()
        if self.sampler_ is not None:
            X = self.sampler_.fit_transform(X)
        super(LinearSVCSub, self).fit(X, y, sample_weight)
        return self

    def predict(self, X):
        """Predict class labels for X.

        X is deliberately not transformed here: the inherited ``predict``
        calls ``self.decision_function``, which already applies the sampler.
        Transforming here as well would apply it twice.
        """
        return super(LinearSVCSub, self).predict(X)

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on (X, y).

        X is deliberately not transformed here: the inherited ``score`` calls
        ``self.predict``, which reaches the sampler via ``decision_function``.
        """
        return super(LinearSVCSub, self).score(X, y, sample_weight)

    def decision_function(self, X):
        """Apply the fitted sampler (if any) and return the confidence scores."""
        X = self.transform_this(X)
        return super(LinearSVCSub, self).decision_function(X)

    def transform_this(self, X):
        """Map X through the sampler fitted in ``fit``; return X as-is if there is none.

        Only ``transform`` is used here -- the sampler is never re-fitted, so
        the random features match the ones the model was trained on.
        """
        fitted_sampler = getattr(self, 'sampler_', None)
        if fitted_sampler is not None:
            X = fitted_sampler.transform(X)
        return X

    def _build_sampler(self):
        """Return a new, unfitted RBFSampler for the current parameters, or None."""
        if not self.sampler:
            return None
        if self.sampler_gamma is None or self.sampler_n is None:
            raise ValueError(
                "sampler is enabled, so both sampler_gamma and sampler_n must be set "
                "(got sampler_gamma=%r, sampler_n=%r)" % (self.sampler_gamma, self.sampler_n)
            )
        # Share the estimator's random_state so results are reproducible when
        # one is given.
        return RBFSampler(gamma=self.sampler_gamma, n_components=self.sampler_n,
                          random_state=self.random_state)



# Grid-search the custom LinearSVC on the breast-cancer dataset.
data = load_breast_cancer()
X, y = data.data, data.target

# Parameter tuning with custom LinearSVC
# Parameters that are searched regardless of whether the sampler is used.
base_grid = {'C': [0.00001, 0.0005],
             'dual': (True, False), 'random_state': [RANDOM_STATE]}

# Two separate grids: sampler_gamma / sampler_n have no effect when the
# sampler is switched off, so crossing them with sampler=0 would only repeat
# identical fits.
param_grid = [
    dict(base_grid, sampler=[0]),
    dict(base_grid, sampler=[1],
         sampler_gamma=[0.90, 0.60, 0.30],
         sampler_n=[10, 200]),
]

gs_model = GridSearchCV(estimator=LinearSVCSub(sampler=1), verbose=1, param_grid=param_grid,
                        scoring='roc_auc', n_jobs=-1, cv=2)
gs_model.fit(X, y)
# Bare expression: displays the results table when run as the last statement
# of a notebook cell; it has no visible effect in a plain script.
gs_model.cv_results_