在 imblearn 管道中使用 SMOTENC 实现 FAMD 时出现 AttributeError
AttributeError while implementing FAMD with SMOTENC in an imblearn pipeline
我正在尝试使用 FAMD、SMOTENC 和其他预处理步骤来实施管道。但是每次都会出错。如果我从管道中删除 FAMD 它工作正常。
我的代码:
#Separate the dataset in two parts
num_df= X_train_new.select_dtypes(include=[np.number]).columns
cat_df= X_train_new.select_dtypes(exclude=[np.number]).columns
#Create a mask for categorical features
categorical_feature_mask = X_train_new.dtypes == object
print(categorical_feature_mask)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
#Create a pipeline to automate the preprocessing steps and SMOTENC together
num_pipe = make_pipeline(SimpleImputer(strategy='median'))
cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder(handle_unknown='ignore'))
transformer= make_column_transformer((num_pipe, selector(dtype_include='number')),
(cat_pipe, selector(dtype_include='object')),n_jobs=2)
#Oversampling with SMOTENC
from imblearn.over_sampling import SMOTENC
smote= SMOTENC(categorical_features=categorical_feature_mask,random_state=99)
!pip install prince
from prince import FAMD
famd=FAMD(n_components=4,random_state=99)
from imblearn.pipeline import make_pipeline as imb_pipeline
#Fit the random forest learner
rf=RandomForestClassifier(n_estimators=300,random_state=99)
pipe=imb_pipeline(transformer,smote,famd,rf)
pipe.fit(X_train_new,y_train_new)
print('Training Accuracy:%s'%pipe.score(X_train_new,y_train_new))
错误:
AttributeError Traceback (most recent call last)
<ipython-input-24-2b7ea084a318> in <module>()
3 rf=RandomForestClassifier(n_estimators=300,max_features=3,criterion='entropy',random_state=99)
4 pipe=imb_pipeline(transformer,smote,famd,rf)
----> 5 pipe.fit(X_train_new,y_train_new)
6 print('Training Accuracy:%s'%pipe.score(X_train_new,y_train_new))
6 frames
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
235
236 """
--> 237 Xt, yt, fit_params = self._fit(X, y, **fit_params)
238 if self._final_estimator is not None:
239 self._final_estimator.fit(Xt, yt, **fit_params)
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in _fit(self, X, y, **fit_params)
195 Xt, fitted_transformer = fit_transform_one_cached(
196 cloned_transformer, None, Xt, yt,
--> 197 **fit_params_steps[name])
198 elif hasattr(cloned_transformer, "fit_resample"):
199 Xt, yt, fitted_transformer = fit_resample_one_cached(
/usr/local/lib/python3.7/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
564 def _fit_transform_one(transformer, weight, X, y, **fit_params):
565 if hasattr(transformer, 'fit_transform'):
--> 566 res = transformer.fit_transform(X, y, **fit_params)
567 else:
568 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.7/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
572 else:
573 # fit method of arity 2 (supervised transformation)
--> 574 return self.fit(X, y, **fit_params).transform(X)
575
576
/usr/local/lib/python3.7/dist-packages/prince/famd.py in fit(self, X, y)
27
28 # Separate numerical columns from categorical columns
---> 29 num_cols = X.select_dtypes(np.number).columns.tolist()
30 cat_cols = list(set(X.columns) - set(num_cols))
31
/usr/local/lib/python3.7/dist-packages/scipy/sparse/base.py in __getattr__(self, attr)
689 return self.getnnz()
690 else:
--> 691 raise AttributeError(attr + " not found")
692
693 def transpose(self, axes=None, copy=False):
AttributeError: select_dtypes not found
tl;dr:尝试在您的 OneHotEncoder 中添加 sparse=False(在 scikit-learn 1.2 及以上版本中,该参数名为 sparse_output)。也可以考虑向 prince 项目提交 issue,请求其支持稀疏输入。
您可以从回溯中看到,问题在于 FAMD.fit 尝试调用 X.select_dtypes 来分离分类数据和数值数据。select_dtypes 是一个 pandas 函数,所以通常我会假设 prince 是针对数据帧(DataFrame)编写的,而不是针对 sklearn 内部使用的 numpy 数组(必要时会先从数据帧转换而来)。不过,查看源代码可知,他们确实会把 numpy 数组转换为数据帧。但是,回溯的最后一条消息来自 scipy,这暗示您的 X 实际上可能是一个稀疏矩阵。事实上,OneHotEncoder(位于您管道的较前位置)倾向于输出稀疏矩阵,而 ColumnTransformer 会根据其各组成部分以及参数 sparse_threshold 来决定输出稀疏还是密集的结果。
我正在尝试使用 FAMD、SMOTENC 和其他预处理步骤来实施管道。但是每次都会出错。如果我从管道中删除 FAMD 它工作正常。
我的代码:
#Separate the dataset in two parts
num_df= X_train_new.select_dtypes(include=[np.number]).columns
cat_df= X_train_new.select_dtypes(exclude=[np.number]).columns
#Create a mask for categorical features
categorical_feature_mask = X_train_new.dtypes == object
print(categorical_feature_mask)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
#Create a pipeline to automate the preprocessing steps and SMOTENC together
num_pipe = make_pipeline(SimpleImputer(strategy='median'))
cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder(handle_unknown='ignore'))
transformer= make_column_transformer((num_pipe, selector(dtype_include='number')),
(cat_pipe, selector(dtype_include='object')),n_jobs=2)
#Oversampling with SMOTENC
from imblearn.over_sampling import SMOTENC
smote= SMOTENC(categorical_features=categorical_feature_mask,random_state=99)
!pip install prince
from prince import FAMD
famd=FAMD(n_components=4,random_state=99)
from imblearn.pipeline import make_pipeline as imb_pipeline
#Fit the random forest learner
rf=RandomForestClassifier(n_estimators=300,random_state=99)
pipe=imb_pipeline(transformer,smote,famd,rf)
pipe.fit(X_train_new,y_train_new)
print('Training Accuracy:%s'%pipe.score(X_train_new,y_train_new))
错误:
AttributeError Traceback (most recent call last)
<ipython-input-24-2b7ea084a318> in <module>()
3 rf=RandomForestClassifier(n_estimators=300,max_features=3,criterion='entropy',random_state=99)
4 pipe=imb_pipeline(transformer,smote,famd,rf)
----> 5 pipe.fit(X_train_new,y_train_new)
6 print('Training Accuracy:%s'%pipe.score(X_train_new,y_train_new))
6 frames
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
235
236 """
--> 237 Xt, yt, fit_params = self._fit(X, y, **fit_params)
238 if self._final_estimator is not None:
239 self._final_estimator.fit(Xt, yt, **fit_params)
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in _fit(self, X, y, **fit_params)
195 Xt, fitted_transformer = fit_transform_one_cached(
196 cloned_transformer, None, Xt, yt,
--> 197 **fit_params_steps[name])
198 elif hasattr(cloned_transformer, "fit_resample"):
199 Xt, yt, fitted_transformer = fit_resample_one_cached(
/usr/local/lib/python3.7/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
564 def _fit_transform_one(transformer, weight, X, y, **fit_params):
565 if hasattr(transformer, 'fit_transform'):
--> 566 res = transformer.fit_transform(X, y, **fit_params)
567 else:
568 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.7/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
572 else:
573 # fit method of arity 2 (supervised transformation)
--> 574 return self.fit(X, y, **fit_params).transform(X)
575
576
/usr/local/lib/python3.7/dist-packages/prince/famd.py in fit(self, X, y)
27
28 # Separate numerical columns from categorical columns
---> 29 num_cols = X.select_dtypes(np.number).columns.tolist()
30 cat_cols = list(set(X.columns) - set(num_cols))
31
/usr/local/lib/python3.7/dist-packages/scipy/sparse/base.py in __getattr__(self, attr)
689 return self.getnnz()
690 else:
--> 691 raise AttributeError(attr + " not found")
692
693 def transpose(self, axes=None, copy=False):
AttributeError: select_dtypes not found
tl;dr:尝试在您的 OneHotEncoder 中添加 sparse=False(在 scikit-learn 1.2 及以上版本中,该参数名为 sparse_output)。也可以考虑向 prince 项目提交 issue,请求其支持稀疏输入。
您可以从回溯中看到,问题在于 FAMD.fit 尝试调用 X.select_dtypes 来分离分类数据和数值数据。select_dtypes 是一个 pandas 函数,所以通常我会假设 prince 是针对数据帧(DataFrame)编写的,而不是针对 sklearn 内部使用的 numpy 数组(必要时会先从数据帧转换而来)。不过,查看源代码可知,他们确实会把 numpy 数组转换为数据帧。但是,回溯的最后一条消息来自 scipy,这暗示您的 X 实际上可能是一个稀疏矩阵。事实上,OneHotEncoder(位于您管道的较前位置)倾向于输出稀疏矩阵,而 ColumnTransformer 会根据其各组成部分以及参数 sparse_threshold 来决定输出稀疏还是密集的结果。