除了拟合和预测之外的 sklearn 管道方法
sklearn pipeline methods besides fit and predict
我正在做一个需要编写自己的转换器和估算器的金融项目,所以我的转换器下有一个 scores_ 方法,我的估算器下有一个残差方法:
例如:
class my_transformer(base.BaseEstimator, base.TransformerMixin):
    """Transformer that scores ``X`` and hands the scores to ``factorSelect``.

    Parameters
    ----------
    some_arg
        Stored unchanged and forwarded to ``somefunc`` when scoring.
    """

    def __init__(self, some_arg):
        self.some_arg = some_arg

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so calls can be chained."""
        return self

    def scores_(self, X):
        """Return ``somefunc(X, y, self.some_arg)``."""
        # NOTE(review): `y` is neither a parameter nor an attribute here, so
        # it resolves to a module-level `y` or raises NameError -- confirm
        # where the target is supposed to come from.
        return somefunc(X, y, self.some_arg)

    def transform(self, X):
        """Return ``factorSelect(scores, X)`` with scores computed from ``X``."""
        scores = self.scores_(X)
        return factorSelect(scores, X)
和
class my_estimator(base.BaseEstimator, base.RegressorMixin):
    """Regressor that additionally exposes prediction residuals.

    Parameters
    ----------
    some_arg
        Stored unchanged and forwarded to ``some_other_func`` during ``fit``.
    """

    def __init__(self, some_arg):
        self.some_arg = some_arg

    def fit(self, X, y):
        """Call ``some_other_func(X, y, self.some_arg)`` and return ``self``."""
        some_other_func(X, y, self.some_arg)
        return self

    def predict(self, X):
        """Return ``some_other_other_func(X)``."""
        return some_other_other_func(X)

    def residuals(self, X, y):
        """Return ``predict(X)`` minus ``y``, computed with ``.sub(..., axis='index')``."""
        # NOTE(review): relies on predict() returning an object with a
        # pandas-style `.sub` method -- confirm against some_other_other_func.
        predicted = self.predict(X)
        return predicted.sub(y, axis='index')
所以现在我这样设置管道:
pipe = Pipeline([
    ('transformer', my_transformer(some_arg)),
    # Step names are case-sensitive: the later lookups use
    # pipe['estimator'], so the step must be registered in lower case.
    ('estimator', my_estimator(some_arg)),
])
这在进行拟合和预测时效果很好:
# Fit every step of the pipeline in order on (X, y).
pipe.fit(X,y)
# Predict with the final step; the earlier steps transform X first.
pipe.predict(X)
但是,后续步骤还需要用到 `scores` 和 `residuals`。我只能通过以下方式访问:
# The transformer's method is named `scores_` (trailing underscore);
# calling `.scores` would raise AttributeError.
pipe['transformer'].scores_(X)
# residuals() has to be given the already-transformed data (Not_X), not raw X.
pipe['estimator'].residuals(Not_X, y)
在这里,`.scores` 可以正常工作;但是对于 `.residuals`,我必须传入 `Not_X = pipe['transformer'].fit_transform(X)`,而不是 `X`。
这很麻烦,而且与使用pipeline的目的相矛盾... 那么,我应该如何使用PipeLine呢?
如果 PipeLine 不会这样做,还有其他建议吗???
谢谢!
How should I do it with Pipeline?
与您处理任何与其他人的代码交互的编程任务的方式相同——添加层直到它完成您想要的 ;)
具体来说,您想要的功能并不复杂。这个问题最难的部分是确定您希望的语法是什么样子(例如,“总是对所调用函数的第一个参数做转换”这一点并不是显而易见的);而且考虑到 sklearn 注重清晰的文档和简单的 API,它没有覆盖您这个具体用例也就不足为奇了。
下面的代码可以作为参考(唯一真正需要技巧的地方是用 `__getattr__` 拦截属性访问——这就是为什么即使 `PartialPipeline` 本身没有 `residuals` 属性,我们仍然可以写 `some_partial_pipeline.residuals`。请参阅 Python 文档中关于 `__getattr__` 的说明):
from sklearn.pipeline import Pipeline as _Pipeline
class PartialPipeline:
    """A sequence of steps without the bells and whistles of a sklearn Pipeline.

    Attribute access is forwarded to the estimator of the last step. When the
    forwarded attribute is callable, the returned wrapper first sends its
    first positional argument through ``transform`` of every earlier step, so
    the last step's method receives the data it would see inside the pipeline.

    Parameters
    ----------
    steps : sequence of (name, estimator) pairs
        The leading steps of a pipeline, in execution order.
    """

    def __init__(self, steps):
        self._steps = steps

    def __getattr__(self, attr_name):
        """Forward ``attr_name`` to the last step's estimator.

        Raises
        ------
        AttributeError
            If the instance has no steps, or the last estimator lacks the
            attribute.
        """
        # __getattr__ only runs after normal lookup has failed. copy/pickle
        # probe attributes on an instance created without __init__; reading
        # self._steps below would then re-enter __getattr__ forever.
        if attr_name == "_steps":
            raise AttributeError(attr_name)
        if not self._steps:
            raise AttributeError(
                f"{type(self).__name__!r} has no steps to forward "
                f"{attr_name!r} to"
            )
        obj = getattr(self._steps[-1][-1], attr_name)
        if not callable(obj):
            return obj

        def _f(X, *args, **kwargs):
            for _, step in self._steps[:-1]:
                # sklearn allows placeholder steps that leave X untouched.
                if step is None or step == "passthrough":
                    continue
                X = step.transform(X)
            return obj(X, *args, **kwargs)

        return _f
class Pipeline(_Pipeline):
    """sklearn Pipeline with easy access to the attributes of its steps.

    The constructor is inherited unchanged from ``sklearn.pipeline.Pipeline``.
    No private copy of the steps is kept: sklearn may replace entries of
    ``self.steps`` (fitting with ``memory`` set, ``set_params``), and a
    snapshot taken at construction time would then point at stale estimators.
    """

    def at(self, step_name):
        """Return a ``PartialPipeline`` of all steps up to and including ``step_name``.

        Parameters
        ----------
        step_name : str
            Name of a step as given in ``steps``.

        Raises
        ------
        ValueError
            If no step has that name.
        """
        names = [name for name, _ in self.steps]
        try:
            i = names.index(step_name)
        except ValueError:
            raise ValueError(
                f"{step_name!r} is not a step of this pipeline; "
                f"available steps: {names}"
            ) from None
        # Read the live step list so fitted/replaced estimators are used.
        return PartialPipeline(list(self.steps[:i + 1]))
#
# Example use
#
pipe = Pipeline([
    ('transformer', my_transformer(some_arg)),
    ('estimator', my_estimator(some_arg)),
])

# my_transformer names its method `scores_` (trailing underscore).
scores = pipe.at('transformer').scores_(X)
# X is sent through the transformer step before residuals() receives it.
residuals = pipe.at('estimator').residuals(X, y)

# The thing you were already trying
custom_residuals = pipe['estimator'].residuals(X_pretransformed, y)

# E.g., if the estimator were an MLPRegressor
loss_values = pipe.at('estimator').loss_curve_
loss_values = pipe['estimator'].loss_curve_
我正在做一个需要编写自己的转换器和估算器的金融项目,所以我的转换器下有一个 scores_ 方法,我的估算器下有一个残差方法:
例如:
class my_transformer(base.BaseEstimator, base.TransformerMixin):
    """Transformer that scores ``X`` and hands the scores to ``factorSelect``.

    Parameters
    ----------
    some_arg
        Stored unchanged and forwarded to ``somefunc`` when scoring.
    """

    def __init__(self, some_arg):
        self.some_arg = some_arg

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so calls can be chained."""
        return self

    def scores_(self, X):
        """Return ``somefunc(X, y, self.some_arg)``."""
        # NOTE(review): `y` is neither a parameter nor an attribute here, so
        # it resolves to a module-level `y` or raises NameError -- confirm
        # where the target is supposed to come from.
        return somefunc(X, y, self.some_arg)

    def transform(self, X):
        """Return ``factorSelect(scores, X)`` with scores computed from ``X``."""
        scores = self.scores_(X)
        return factorSelect(scores, X)
和
class my_estimator(base.BaseEstimator, base.RegressorMixin):
    """Regressor that additionally exposes prediction residuals.

    Parameters
    ----------
    some_arg
        Stored unchanged and forwarded to ``some_other_func`` during ``fit``.
    """

    def __init__(self, some_arg):
        self.some_arg = some_arg

    def fit(self, X, y):
        """Call ``some_other_func(X, y, self.some_arg)`` and return ``self``."""
        some_other_func(X, y, self.some_arg)
        return self

    def predict(self, X):
        """Return ``some_other_other_func(X)``."""
        return some_other_other_func(X)

    def residuals(self, X, y):
        """Return ``predict(X)`` minus ``y``, computed with ``.sub(..., axis='index')``."""
        # NOTE(review): relies on predict() returning an object with a
        # pandas-style `.sub` method -- confirm against some_other_other_func.
        predicted = self.predict(X)
        return predicted.sub(y, axis='index')
所以现在我这样设置管道:
pipe = Pipeline([
    ('transformer', my_transformer(some_arg)),
    # Step names are case-sensitive: the later lookups use
    # pipe['estimator'], so the step must be registered in lower case.
    ('estimator', my_estimator(some_arg)),
])
这在进行拟合和预测时效果很好:
# Fit every step of the pipeline in order on (X, y).
pipe.fit(X,y)
# Predict with the final step; the earlier steps transform X first.
pipe.predict(X)
但是,后续步骤还需要用到 `scores` 和 `residuals`。我只能通过以下方式访问:
# The transformer's method is named `scores_` (trailing underscore);
# calling `.scores` would raise AttributeError.
pipe['transformer'].scores_(X)
# residuals() has to be given the already-transformed data (Not_X), not raw X.
pipe['estimator'].residuals(Not_X, y)
在这里,`.scores` 可以正常工作;但是对于 `.residuals`,我必须传入 `Not_X = pipe['transformer'].fit_transform(X)`,而不是 `X`。
这很麻烦,而且与使用pipeline的目的相矛盾... 那么,我应该如何使用PipeLine呢? 如果 PipeLine 不会这样做,还有其他建议吗???
谢谢!
How should I do it with Pipeline?
与您处理任何与其他人的代码交互的编程任务的方式相同——添加层直到它完成您想要的 ;)
具体来说,您想要的功能并不复杂。这个问题最难的部分是确定您希望的语法是什么样子(例如,“总是对所调用函数的第一个参数做转换”这一点并不是显而易见的);而且考虑到 sklearn 注重清晰的文档和简单的 API,它没有覆盖您这个具体用例也就不足为奇了。
下面的代码可以作为参考(唯一真正需要技巧的地方是用 `__getattr__` 拦截属性访问——这就是为什么即使 `PartialPipeline` 本身没有 `residuals` 属性,我们仍然可以写 `some_partial_pipeline.residuals`。请参阅 Python 文档中关于 `__getattr__` 的说明):
from sklearn.pipeline import Pipeline as _Pipeline
class PartialPipeline:
    """A sequence of steps without the bells and whistles of a sklearn Pipeline.

    Attribute access is forwarded to the estimator of the last step. When the
    forwarded attribute is callable, the returned wrapper first sends its
    first positional argument through ``transform`` of every earlier step, so
    the last step's method receives the data it would see inside the pipeline.

    Parameters
    ----------
    steps : sequence of (name, estimator) pairs
        The leading steps of a pipeline, in execution order.
    """

    def __init__(self, steps):
        self._steps = steps

    def __getattr__(self, attr_name):
        """Forward ``attr_name`` to the last step's estimator.

        Raises
        ------
        AttributeError
            If the instance has no steps, or the last estimator lacks the
            attribute.
        """
        # __getattr__ only runs after normal lookup has failed. copy/pickle
        # probe attributes on an instance created without __init__; reading
        # self._steps below would then re-enter __getattr__ forever.
        if attr_name == "_steps":
            raise AttributeError(attr_name)
        if not self._steps:
            raise AttributeError(
                f"{type(self).__name__!r} has no steps to forward "
                f"{attr_name!r} to"
            )
        obj = getattr(self._steps[-1][-1], attr_name)
        if not callable(obj):
            return obj

        def _f(X, *args, **kwargs):
            for _, step in self._steps[:-1]:
                # sklearn allows placeholder steps that leave X untouched.
                if step is None or step == "passthrough":
                    continue
                X = step.transform(X)
            return obj(X, *args, **kwargs)

        return _f
class Pipeline(_Pipeline):
    """sklearn Pipeline with easy access to the attributes of its steps.

    The constructor is inherited unchanged from ``sklearn.pipeline.Pipeline``.
    No private copy of the steps is kept: sklearn may replace entries of
    ``self.steps`` (fitting with ``memory`` set, ``set_params``), and a
    snapshot taken at construction time would then point at stale estimators.
    """

    def at(self, step_name):
        """Return a ``PartialPipeline`` of all steps up to and including ``step_name``.

        Parameters
        ----------
        step_name : str
            Name of a step as given in ``steps``.

        Raises
        ------
        ValueError
            If no step has that name.
        """
        names = [name for name, _ in self.steps]
        try:
            i = names.index(step_name)
        except ValueError:
            raise ValueError(
                f"{step_name!r} is not a step of this pipeline; "
                f"available steps: {names}"
            ) from None
        # Read the live step list so fitted/replaced estimators are used.
        return PartialPipeline(list(self.steps[:i + 1]))
#
# Example use
#
pipe = Pipeline([
    ('transformer', my_transformer(some_arg)),
    ('estimator', my_estimator(some_arg)),
])

# my_transformer names its method `scores_` (trailing underscore).
scores = pipe.at('transformer').scores_(X)
# X is sent through the transformer step before residuals() receives it.
residuals = pipe.at('estimator').residuals(X, y)

# The thing you were already trying
custom_residuals = pipe['estimator'].residuals(X_pretransformed, y)

# E.g., if the estimator were an MLPRegressor
loss_values = pipe.at('estimator').loss_curve_
loss_values = pipe['estimator'].loss_curve_