如何在 sk-learn 管道中使用我自己的自定义函数?
How can I use my own custom function in an sk-learn pipeline?
我是 sk-learn 管道的新手,想使用我自己的离散分箱方法。我需要根据与原始列相关联的另一列的累计和,对某一列的值进行分箱。我有一个可以正常工作的函数:
def dynamic_bin(df, column, weight, minimum):
    """
    Bin ``column`` so that each bin accumulates at least ``minimum`` weight.

    The distinct values of ``column`` are visited in ascending order while the
    matching ``weight`` totals are accumulated. An upper bin edge is placed at
    the first value where the accumulated total reaches ``minimum``. The last
    edge is then replaced by ``+inf`` so that an under-weight tail is merged
    into the final bin. Each bin is labelled with the value at which it was
    closed.

    Parameters
    ----------
    df : dataframe
        Modified in place: the binned column is added to it.
    column : column to be binned
    weight : column that will dictate the bin
    minimum : minimum weight per bin

    Returns
    -------
    df : dataframe with new binned column
        The same object that was passed in, with ``<column>_binned`` added.
    """
    # One aggregation pass instead of filtering the frame once per distinct
    # value. groupby sorts the keys ascending and leaves out NaN keys, so a
    # missing value in `column` no longer produces an empty group.
    totals = df.groupby(column)[weight].sum()

    bins = [-np.inf]
    labels = []
    pending = 0  # weight seen since the last closed edge
    for value, total in totals.items():
        if total >= minimum or pending + total >= minimum:
            bins.append(value)
            labels.append(value)
            # Reset on every closed edge, including when `total` alone meets
            # the minimum: the pending weight now belongs to this bin and must
            # not be counted again towards the next one.
            pending = 0
        else:
            pending += total

    # Swap the last closed edge for +inf so the final bin is open-ended and
    # absorbs whatever weight was still pending.
    # NOTE(review): if the total weight never reaches `minimum`, no edge is
    # closed and `bins` ends up as [inf] with no labels - confirm that the
    # pd.cut result for that input is acceptable.
    bins.pop()
    bins.append(np.inf)

    str_column = str(column) + "_binned"
    df[str_column] = pd.cut(df[column], bins=bins, labels=labels)
    return df
这就是我试图使它成为 class 的方式。
from sklearn.base import BaseEstimator, TransformerMixin
class dynamic_bin(BaseEstimator, TransformerMixin):
# Attempt to wrap the dynamic-binning logic as a scikit-learn transformer.
# NOTE(review): this listing is reproduced without indentation, so it is not
# valid Python until the nesting is restored.
def __init__(self, weight, minimum):
# Store the constructor arguments under their own names.
self.weight = weight
self.minimum = minimum
def fit(self, X, y=None):
# Nothing is learned from X; the instance is returned unchanged.
return self
# NOTE(review): the method name is spelled `tranform`, so this class defines
# no method called `transform`.
def tranform(self, X):
"""
Build bin edges and return the ``<column>_binned`` series.

NOTE(review): the body reads the names ``df``, ``column``, ``weight`` and
``minimum``. None of them is a parameter or a local of this method: the only
parameter is ``X``, and the weight column and threshold were stored as
``self.weight`` and ``self.minimum`` in ``__init__``.

Returns
-------
The ``str(column) + "_binned"`` column produced by ``pd.cut``.
"""
bins = [-np.inf]
labels = []
hold_over = []
# Visit the distinct values of the column in ascending order.
for i in sorted(df[column].unique()):
# Single-row frame holding the summed weight for the current value.
g = df[df[column] == i].groupby(column).agg({weight:'sum'}).reset_index()
if g[weight].values[0] < minimum:
# hold_over is initialised to a list above, so this identity test against
# None is never true.
if hold_over is None:
hold_over.append(g[weight].values[0])
elif (sum(hold_over) + g[weight].values[0]) < minimum:
hold_over.append(g[weight].values[0])
elif (sum(hold_over) + g[weight].values[0]) >= minimum:
hold_over.clear()
bins.append(g[column].values[0])
labels.append(g[column].values[0])
# This branch appends an edge and a label but does not clear hold_over.
elif g[weight].values[0] >= minimum:
bins.append(g[column].values[0])
labels.append(g[column].values[0])
# Drop the most recently appended edge, then close the edge list with +inf.
bins.pop()
bins.append(np.inf)
str_column = str(column)+str("_binned")
# print(str_column)
df[str_column] = pd.cut(df[column],
bins = bins,
labels = labels)
return df[str_column]
当我尝试按以下方式实现它时,我得到了下面的错误:
# Each entry is (step name, transformer or specifier, columns it receives).
transformer_steps = [
    (
        "binned_numeric",
        dynamic_bin(weight="Exposure", minimum=1000),
        ["VehAge", "DrivAge"],
    ),
    (
        "onehot_categorical",
        OneHotEncoder(),
        ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
    ),
    ("passthrough_numeric", "passthrough", ["BonusMalus"]),
    ("log_scaled_numeric", log_scale_transformer, ["Density"]),
]
# Columns that are not listed in any step are dropped from the output.
column_trans = ColumnTransformer(transformer_steps, remainder="drop")
X = column_trans.fit_transform(df)
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'dynamic_bin(minimum=1000, weight='Exposure')' (type <class 'dynamic_bin.dynamic_bin'>) doesn't.
我阅读了以下内容,但我并没有真正理解它。
有人发现我犯的错误吗?
错误本身是由方法声明中的拼写错误造成的。您在自定义转换器 class 中实现的方法名为 tranform(注意缺少了字母 's'),这就是解释器报错说您的自定义转换器没有实现 transform 的原因。
虽然这很容易修复,但您还应该注意,您并没有把原来的自定义函数改写成适合在该 class 中使用的形式。例如:
- 变量 df 应重命名为 X
- weight 和 minimum 现在是对象属性,需要通过 self.weight 和 self.minimum 引用
- 变量 column 未声明
您还需要解决这些问题。关于这一点,请注意 ColumnTransformer 只会把分配给某个转换器的那部分列传递给该转换器。这意味着,如果您只把 VehAge 和 DrivAge 列传递给 dynamic_bin,它就无法访问 Exposure 列。
我是 sk-learn 管道的新手,想使用我自己的离散分箱方法。我需要根据与原始列相关联的另一列的累计和,对某一列的值进行分箱。我有一个可以正常工作的函数:
def dynamic_bin(df, column, weight, minimum):
    """
    Bin ``column`` so that each bin accumulates at least ``minimum`` weight.

    The distinct values of ``column`` are visited in ascending order while the
    matching ``weight`` totals are accumulated. An upper bin edge is placed at
    the first value where the accumulated total reaches ``minimum``. The last
    edge is then replaced by ``+inf`` so that an under-weight tail is merged
    into the final bin. Each bin is labelled with the value at which it was
    closed.

    Parameters
    ----------
    df : dataframe
        Modified in place: the binned column is added to it.
    column : column to be binned
    weight : column that will dictate the bin
    minimum : minimum weight per bin

    Returns
    -------
    df : dataframe with new binned column
        The same object that was passed in, with ``<column>_binned`` added.
    """
    # One aggregation pass instead of filtering the frame once per distinct
    # value. groupby sorts the keys ascending and leaves out NaN keys, so a
    # missing value in `column` no longer produces an empty group.
    totals = df.groupby(column)[weight].sum()

    bins = [-np.inf]
    labels = []
    pending = 0  # weight seen since the last closed edge
    for value, total in totals.items():
        if total >= minimum or pending + total >= minimum:
            bins.append(value)
            labels.append(value)
            # Reset on every closed edge, including when `total` alone meets
            # the minimum: the pending weight now belongs to this bin and must
            # not be counted again towards the next one.
            pending = 0
        else:
            pending += total

    # Swap the last closed edge for +inf so the final bin is open-ended and
    # absorbs whatever weight was still pending.
    # NOTE(review): if the total weight never reaches `minimum`, no edge is
    # closed and `bins` ends up as [inf] with no labels - confirm that the
    # pd.cut result for that input is acceptable.
    bins.pop()
    bins.append(np.inf)

    str_column = str(column) + "_binned"
    df[str_column] = pd.cut(df[column], bins=bins, labels=labels)
    return df
这就是我试图使它成为 class 的方式。
from sklearn.base import BaseEstimator, TransformerMixin
class dynamic_bin(BaseEstimator, TransformerMixin):
# Attempt to wrap the dynamic-binning logic as a scikit-learn transformer.
# NOTE(review): this listing is reproduced without indentation, so it is not
# valid Python until the nesting is restored.
def __init__(self, weight, minimum):
# Store the constructor arguments under their own names.
self.weight = weight
self.minimum = minimum
def fit(self, X, y=None):
# Nothing is learned from X; the instance is returned unchanged.
return self
# NOTE(review): the method name is spelled `tranform`, so this class defines
# no method called `transform`.
def tranform(self, X):
"""
Build bin edges and return the ``<column>_binned`` series.

NOTE(review): the body reads the names ``df``, ``column``, ``weight`` and
``minimum``. None of them is a parameter or a local of this method: the only
parameter is ``X``, and the weight column and threshold were stored as
``self.weight`` and ``self.minimum`` in ``__init__``.

Returns
-------
The ``str(column) + "_binned"`` column produced by ``pd.cut``.
"""
bins = [-np.inf]
labels = []
hold_over = []
# Visit the distinct values of the column in ascending order.
for i in sorted(df[column].unique()):
# Single-row frame holding the summed weight for the current value.
g = df[df[column] == i].groupby(column).agg({weight:'sum'}).reset_index()
if g[weight].values[0] < minimum:
# hold_over is initialised to a list above, so this identity test against
# None is never true.
if hold_over is None:
hold_over.append(g[weight].values[0])
elif (sum(hold_over) + g[weight].values[0]) < minimum:
hold_over.append(g[weight].values[0])
elif (sum(hold_over) + g[weight].values[0]) >= minimum:
hold_over.clear()
bins.append(g[column].values[0])
labels.append(g[column].values[0])
# This branch appends an edge and a label but does not clear hold_over.
elif g[weight].values[0] >= minimum:
bins.append(g[column].values[0])
labels.append(g[column].values[0])
# Drop the most recently appended edge, then close the edge list with +inf.
bins.pop()
bins.append(np.inf)
str_column = str(column)+str("_binned")
# print(str_column)
df[str_column] = pd.cut(df[column],
bins = bins,
labels = labels)
return df[str_column]
当我尝试按以下方式实现它时,我得到了下面的错误:
# Each entry is (step name, transformer or specifier, columns it receives).
transformer_steps = [
    (
        "binned_numeric",
        dynamic_bin(weight="Exposure", minimum=1000),
        ["VehAge", "DrivAge"],
    ),
    (
        "onehot_categorical",
        OneHotEncoder(),
        ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
    ),
    ("passthrough_numeric", "passthrough", ["BonusMalus"]),
    ("log_scaled_numeric", log_scale_transformer, ["Density"]),
]
# Columns that are not listed in any step are dropped from the output.
column_trans = ColumnTransformer(transformer_steps, remainder="drop")
X = column_trans.fit_transform(df)
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'dynamic_bin(minimum=1000, weight='Exposure')' (type <class 'dynamic_bin.dynamic_bin'>) doesn't.
我阅读了以下内容,但我并没有真正理解它。
有人发现我犯的错误吗?
错误本身是由方法声明中的拼写错误造成的。您在自定义转换器 class 中实现的方法名为 tranform(注意缺少了字母 's'),这就是解释器报错说您的自定义转换器没有实现 transform 的原因。
虽然这很容易修复,但您还应该注意,您并没有把原来的自定义函数改写成适合在该 class 中使用的形式。例如:
- 变量 df 应重命名为 X
- weight 和 minimum 现在是对象属性,需要通过 self.weight 和 self.minimum 引用
- 变量 column 未声明
您还需要解决这些问题。关于这一点,请注意 ColumnTransformer 只会把分配给某个转换器的那部分列传递给该转换器。这意味着,如果您只把 VehAge 和 DrivAge 列传递给 dynamic_bin,它就无法访问 Exposure 列。