使用模型时先转换数据与在管道中完成所有操作会导致不同的结果
transforming data first vs doing everything in pipe results in different results when using a model
我想在管道中对数据进行所有自定义转换。我认为我可以使用它作为 pipe.fit_transform(X)
在模型中使用它之前转换我的 X,但我也认为我将能够附加到管道模型本身并简单地使用它作为一个使用 pipe.steps.append(('model', self.model))
.
不幸的是,在构建完所有内容后,我注意到在转换数据并直接在模型中使用它与在一个管道中完成所有操作时,我得到了不同的结果。有没有人经历过这样的事情?
添加代码:
# Base pipeline to be used
BASE_PIPE = Pipeline([
('dim_increase_num', data_num_mix()),
('dim_increase_cat', data_cat_mix()),
('start', data_get_dummies()),
('dm_correlation', data_x_corr_()),
('scaler', DFStandardScaler()),
('column_ectraction', ColumnExtractor(columns_catboost)),
])
class base_model_class:
def fit_predict(self, X_train:pd.DataFrame=X_train, y_train:pd.Series=y_train, X_test:pd.DataFrame=X_test):
return self.fit(X_train, y_train).predict(X_test)
def evaluate(self, X:pd.DataFrame=X, y:pd.Series=y):
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
y_pred = self.fit(X_train, y_train).predict(X_test)
result= r2_score(y_test, y_pred)
return result
class model_linear_regression(base_model_class):
def __init__(self, pipe=None, inverse=False):
self.name = 'Linear_Regression'
self.model = LinearRegression()
if pipe==None:
self.pipe = Pipeline([('model', self.model)])
else:
self.pipe = deepcopy(pipe)
self.pipe.steps.append(('model', self.model))
if inverse:
self.pipe = TransformedTargetRegressor( regressor=self.pipe,
func=np.log1p,
inverse_func=np.expm1)
def fit(self, X:pd.DataFrame=X_train, y:pd.Series=y_train):
self.pipe.fit(X, y)
return self
def predict(self, X:pd.DataFrame=X_test):
y_pred = self.pipe.predict(X)
return y_pred
然后,当使用所有东西时给出不同的 R2 分数:
Xx=BASE_PIPE.fit_transform(X)
model_linear_regression(inverse=False).evaluate(Xx,y)
>>> 0.7415005607713974
model_linear_regression(BASE_PIPE, inverse=False).evaluate(X,y)
>>> -6.306970505602111e+22
编辑:
提供使用的管道中的所有步骤:
class data_num_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None):
X_ = X.copy()
self.frames = [X_]
for col in self.columns:
A = pd.DataFrame(X_[col].map(lambda x: np.sqrt(x) if x>0 else -np.sqrt(-x)))
A = A.rename(columns={col:col+'^s'})
self.frames += [A]
B = pd.DataFrame(X_[col] * X_[col])
B = B.rename(columns={col:col+'^2'})
self.frames += [B]
return pd.concat(self.frames, axis=1)
class data_cat_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=CATEGORICAL_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
for col in self.columns:
df_col_count = X_[col].value_counts().to_frame().reset_index()
df_col_count.columns = ["var_name", "var_count"]
df_col_count["var_freq"] = df_col_count["var_count"] / df_col_count["var_count"].sum()
X_['C_'+col] = X_[col].replace(df_col_count.set_index('var_name')['var_count'])
X_['F_'+col] = X_[col].replace(df_col_count.set_index('var_name')['var_freq'])
return X_
class data_get_dummies(BaseEstimator, TransformerMixin):
def __init__(self, columns:list = CATEGORICAL_FEATURES):
self.columns = columns
self.encoder = make_column_transformer((OneHotEncoder(handle_unknown="ignore", sparse=False), self.columns),remainder='passthrough')
def fit(self, X, y = None):
self.encoder.fit(X)
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
encoder_columns = self.encoder.get_feature_names_out()
fixed_columns = [x.replace('onehotencoder__','').replace('remainder__','') for x in encoder_columns ]
df_temp=pd.DataFrame(self.encoder.transform(X_), columns=fixed_columns)
return df_temp
class data_x_corr(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES_, corr_val:float=0.95):
self.columns = columns
self.corr_val = corr_val
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
# prepare numeric df
X_ = X.copy()
x = X_[self.columns]
corr_matrix = x.corr(method='spearman')
iters = range(len(corr_matrix.columns) - 1)
drop_cols = []
# Iterates through Correlation Matrix Table to find correlated columns
for i in iters:
for j in range(i):
item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
col = item.columns
row = item.index
val = item.values
if val >= self.corr_val:
drop_cols.append(i)
drops = sorted(set(drop_cols))[::-1]
# Drops the correlated columns
for i in drops:
col = x.iloc[:, (i+1):(i+2)].columns.values
X_ = X_.drop(col, axis=1)
return X_
class DFStandardScaler(TransformerMixin):
# StandardScaler but for pandas DataFrames
def __init__(self):
self.ss = None
self.mean_ = None
self.scale_ = None
def fit(self, X, y=None):
self.ss = StandardScaler()
self.ss.fit(X)
self.mean_ = pd.Series(self.ss.mean_, index=X.columns)
self.scale_ = pd.Series(self.ss.scale_, index=X.columns)
return self
def transform(self, X) -> pd.DataFrame:
# assumes X is a DataFrame
Xss = self.ss.transform(X)
Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
return Xscaled
def __str__(self):
return "DF_StandardScaler"
def __repr__(self):
return "DF_StandardScaler"
class ColumnExtractor(TransformerMixin, BaseEstimator):
def __init__(self, cols):
self.cols = cols
def fit(self, X, y=None):
# stateless transformer
return self
def transform(self, X):
# assumes X is a DataFrame
Xcols = X[self.cols]
return Xcols
对我来说最突出的一个转换器是 data_cat_mix
,特别是 count-of-level 列。当应用于训练+测试时,这些是一致的(但会泄漏测试信息);当单独应用时,train 中的值通常会高得多(只是因为它的大小是原来的三倍),所以模型并不真正理解如何在测试集中处理它们。
我想在管道中对数据进行所有自定义转换。我认为我可以使用它作为 pipe.fit_transform(X)
在模型中使用它之前转换我的 X,但我也认为我将能够附加到管道模型本身并简单地使用它作为一个使用 pipe.steps.append(('model', self.model))
.
不幸的是,在构建完所有内容后,我注意到在转换数据并直接在模型中使用它与在一个管道中完成所有操作时,我得到了不同的结果。有没有人经历过这样的事情?
添加代码:
# Base pipeline to be used
BASE_PIPE = Pipeline([
('dim_increase_num', data_num_mix()),
('dim_increase_cat', data_cat_mix()),
('start', data_get_dummies()),
('dm_correlation', data_x_corr_()),
('scaler', DFStandardScaler()),
('column_ectraction', ColumnExtractor(columns_catboost)),
])
class base_model_class:
def fit_predict(self, X_train:pd.DataFrame=X_train, y_train:pd.Series=y_train, X_test:pd.DataFrame=X_test):
return self.fit(X_train, y_train).predict(X_test)
def evaluate(self, X:pd.DataFrame=X, y:pd.Series=y):
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
y_pred = self.fit(X_train, y_train).predict(X_test)
result= r2_score(y_test, y_pred)
return result
class model_linear_regression(base_model_class):
def __init__(self, pipe=None, inverse=False):
self.name = 'Linear_Regression'
self.model = LinearRegression()
if pipe==None:
self.pipe = Pipeline([('model', self.model)])
else:
self.pipe = deepcopy(pipe)
self.pipe.steps.append(('model', self.model))
if inverse:
self.pipe = TransformedTargetRegressor( regressor=self.pipe,
func=np.log1p,
inverse_func=np.expm1)
def fit(self, X:pd.DataFrame=X_train, y:pd.Series=y_train):
self.pipe.fit(X, y)
return self
def predict(self, X:pd.DataFrame=X_test):
y_pred = self.pipe.predict(X)
return y_pred
然后,当使用所有东西时给出不同的 R2 分数:
Xx=BASE_PIPE.fit_transform(X)
model_linear_regression(inverse=False).evaluate(Xx,y)
>>> 0.7415005607713974
model_linear_regression(BASE_PIPE, inverse=False).evaluate(X,y)
>>> -6.306970505602111e+22
编辑: 提供使用的管道中的所有步骤:
class data_num_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None):
X_ = X.copy()
self.frames = [X_]
for col in self.columns:
A = pd.DataFrame(X_[col].map(lambda x: np.sqrt(x) if x>0 else -np.sqrt(-x)))
A = A.rename(columns={col:col+'^s'})
self.frames += [A]
B = pd.DataFrame(X_[col] * X_[col])
B = B.rename(columns={col:col+'^2'})
self.frames += [B]
return pd.concat(self.frames, axis=1)
class data_cat_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=CATEGORICAL_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
for col in self.columns:
df_col_count = X_[col].value_counts().to_frame().reset_index()
df_col_count.columns = ["var_name", "var_count"]
df_col_count["var_freq"] = df_col_count["var_count"] / df_col_count["var_count"].sum()
X_['C_'+col] = X_[col].replace(df_col_count.set_index('var_name')['var_count'])
X_['F_'+col] = X_[col].replace(df_col_count.set_index('var_name')['var_freq'])
return X_
class data_get_dummies(BaseEstimator, TransformerMixin):
def __init__(self, columns:list = CATEGORICAL_FEATURES):
self.columns = columns
self.encoder = make_column_transformer((OneHotEncoder(handle_unknown="ignore", sparse=False), self.columns),remainder='passthrough')
def fit(self, X, y = None):
self.encoder.fit(X)
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
encoder_columns = self.encoder.get_feature_names_out()
fixed_columns = [x.replace('onehotencoder__','').replace('remainder__','') for x in encoder_columns ]
df_temp=pd.DataFrame(self.encoder.transform(X_), columns=fixed_columns)
return df_temp
class data_x_corr(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES_, corr_val:float=0.95):
self.columns = columns
self.corr_val = corr_val
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
# prepare numeric df
X_ = X.copy()
x = X_[self.columns]
corr_matrix = x.corr(method='spearman')
iters = range(len(corr_matrix.columns) - 1)
drop_cols = []
# Iterates through Correlation Matrix Table to find correlated columns
for i in iters:
for j in range(i):
item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
col = item.columns
row = item.index
val = item.values
if val >= self.corr_val:
drop_cols.append(i)
drops = sorted(set(drop_cols))[::-1]
# Drops the correlated columns
for i in drops:
col = x.iloc[:, (i+1):(i+2)].columns.values
X_ = X_.drop(col, axis=1)
return X_
class DFStandardScaler(TransformerMixin):
# StandardScaler but for pandas DataFrames
def __init__(self):
self.ss = None
self.mean_ = None
self.scale_ = None
def fit(self, X, y=None):
self.ss = StandardScaler()
self.ss.fit(X)
self.mean_ = pd.Series(self.ss.mean_, index=X.columns)
self.scale_ = pd.Series(self.ss.scale_, index=X.columns)
return self
def transform(self, X) -> pd.DataFrame:
# assumes X is a DataFrame
Xss = self.ss.transform(X)
Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
return Xscaled
def __str__(self):
return "DF_StandardScaler"
def __repr__(self):
return "DF_StandardScaler"
class ColumnExtractor(TransformerMixin, BaseEstimator):
def __init__(self, cols):
self.cols = cols
def fit(self, X, y=None):
# stateless transformer
return self
def transform(self, X):
# assumes X is a DataFrame
Xcols = X[self.cols]
return Xcols
对我来说最突出的一个转换器是 data_cat_mix
,特别是 count-of-level 列。当应用于训练+测试时,这些是一致的(但会泄漏测试信息);当单独应用时,train 中的值通常会高得多(只是因为它的大小是原来的三倍),所以模型并不真正理解如何在测试集中处理它们。