imblearn 管道框架内的数据预处理步骤
Data pre-processing steps within the imblearn pipeline framework
如何在下面的 imblearn.pipeline 管道中加入 SimpleImputer(strategy='constant', fill_value=0) 和 ("scale", MaxAbsScaler()) 这两个预处理步骤?
我没有在 imblearn.pipeline 管道框架中找到任何可以包含数据预处理步骤的选项。非常感谢您的建议!
# Baseline from the question: oversample with SMOTE, then fit a logistic
# regression. No preprocessing steps are included yet.
smote_pipe = make_imb_pipeline(SMOTE(), LogisticRegression())

# 10-fold cross-validation, scored on ROC AUC and average precision.
scores = cross_validate(
    smote_pipe,
    X_train,
    y_train,
    cv=10,
    scoring=('roc_auc', 'average_precision'),
)

# Mean of each test metric across the folds.
pd.DataFrame(scores)[['test_roc_auc', 'test_average_precision']].mean()
文档中可能没有明确提到,但预处理步骤同样可以放进 imblearn 的管道中。
下面是一个相当简单的可重现示例(示例中使用的是 MinMaxScaler,换成 MaxAbsScaler 的写法完全相同):
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
# Toy feature data containing two missing values (np.nan).
# NOTE(review): reshape(-1, 2) regroups the flattened values into consecutive
# pairs (row-major order); it does not transpose the two lists into columns.
# Confirm this is the intended layout.
X = np.array(
    [
        [-0.718,np.nan,-0.626,np.nan,-0.286,-0.262,-0.526,-0.662,-0.578,-0.418,-0.374,-0.482,-0.678,-0.678,-0.562,-0.362,-0.346,-0.562,-0.442,-0.466,-0.434,-0.314,-0.110,-0.146,-0.390,-0.614,-0.262,-0.310,-0.458,-0.350,-0.190,-0.0940,-0.0860,-0.182,-0.190,-0.170,-0.298,-0.562,-0.742,-0.750,-0.566,-0.438,0.0660,-0.0300,-0.0660,0.0300,-0.0220,-0.142,-0.122,0.0460,-0.0980,0.0300,0.138,0.150],
        [0.279,0.439,0.643,0.715,0.555,0.347,0.295,0.431,0.491,0.507,0.407,0.403,0.483,0.599,0.771,0.747,0.559,0.627,0.763,0.519,0.695,0.723,0.563,0.227,0.175,0.211,0.259,0.275,0.223,0.103,0.143,0.347,0.563,0.715,0.399,0.615,0.811,0.791,0.611,0.315,0.239,0.123,0.395,0.399,0.531,0.515,0.471,0.523,0.663,0.651,0.427,0.347,0.443,0.587],
    ]
).reshape(-1, 2)

# Binary labels; the positive class (1) is the minority.
y = np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1])

# Preprocessing steps are passed to the imblearn pipeline like any other step.
# Imputation comes first so the later steps receive no NaN values.
# SMOTE is left unseeded, so the scores vary slightly between runs.
oversample_pipe = make_imb_pipeline(
    SimpleImputer(strategy='constant', fill_value=0),
    MinMaxScaler(),
    SMOTE(),
    LogisticRegression(),
)

# 5-fold cross-validation of the whole pipeline on both metrics.
scores = cross_validate(
    oversample_pipe,
    X,
    y,
    cv=5,
    scoring=("roc_auc", "average_precision"),
)

# Report the mean of each test metric across the folds.
print(scores["test_roc_auc"].mean())
print(scores["test_average_precision"].mean())
输出(由于 SMOTE 带有随机性,每次运行的结果会略有不同):
0.8375
0.7485714285714286
如何在下面的 imblearn.pipeline 管道中加入 SimpleImputer(strategy='constant', fill_value=0) 和 ("scale", MaxAbsScaler()) 这两个预处理步骤?
我没有在 imblearn.pipeline 管道框架中找到任何可以包含数据预处理步骤的选项。非常感谢您的建议!
# Baseline from the question: oversample with SMOTE, then fit a logistic
# regression. No preprocessing steps are included yet.
smote_pipe = make_imb_pipeline(SMOTE(), LogisticRegression())

# 10-fold cross-validation, scored on ROC AUC and average precision.
scores = cross_validate(
    smote_pipe,
    X_train,
    y_train,
    cv=10,
    scoring=('roc_auc', 'average_precision'),
)

# Mean of each test metric across the folds.
pd.DataFrame(scores)[['test_roc_auc', 'test_average_precision']].mean()
文档中可能没有明确提到,但预处理步骤同样可以放进 imblearn 的管道中。
下面是一个相当简单的可重现示例(示例中使用的是 MinMaxScaler,换成 MaxAbsScaler 的写法完全相同):
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
# Toy feature data containing two missing values (np.nan).
# NOTE(review): reshape(-1, 2) regroups the flattened values into consecutive
# pairs (row-major order); it does not transpose the two lists into columns.
# Confirm this is the intended layout.
X = np.array(
    [
        [-0.718,np.nan,-0.626,np.nan,-0.286,-0.262,-0.526,-0.662,-0.578,-0.418,-0.374,-0.482,-0.678,-0.678,-0.562,-0.362,-0.346,-0.562,-0.442,-0.466,-0.434,-0.314,-0.110,-0.146,-0.390,-0.614,-0.262,-0.310,-0.458,-0.350,-0.190,-0.0940,-0.0860,-0.182,-0.190,-0.170,-0.298,-0.562,-0.742,-0.750,-0.566,-0.438,0.0660,-0.0300,-0.0660,0.0300,-0.0220,-0.142,-0.122,0.0460,-0.0980,0.0300,0.138,0.150],
        [0.279,0.439,0.643,0.715,0.555,0.347,0.295,0.431,0.491,0.507,0.407,0.403,0.483,0.599,0.771,0.747,0.559,0.627,0.763,0.519,0.695,0.723,0.563,0.227,0.175,0.211,0.259,0.275,0.223,0.103,0.143,0.347,0.563,0.715,0.399,0.615,0.811,0.791,0.611,0.315,0.239,0.123,0.395,0.399,0.531,0.515,0.471,0.523,0.663,0.651,0.427,0.347,0.443,0.587],
    ]
).reshape(-1, 2)

# Binary labels; the positive class (1) is the minority.
y = np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1])

# Preprocessing steps are passed to the imblearn pipeline like any other step.
# Imputation comes first so the later steps receive no NaN values.
# SMOTE is left unseeded, so the scores vary slightly between runs.
oversample_pipe = make_imb_pipeline(
    SimpleImputer(strategy='constant', fill_value=0),
    MinMaxScaler(),
    SMOTE(),
    LogisticRegression(),
)

# 5-fold cross-validation of the whole pipeline on both metrics.
scores = cross_validate(
    oversample_pipe,
    X,
    y,
    cv=5,
    scoring=("roc_auc", "average_precision"),
)

# Report the mean of each test metric across the folds.
print(scores["test_roc_auc"].mean())
print(scores["test_average_precision"].mean())
输出(由于 SMOTE 带有随机性,每次运行的结果会略有不同):
0.8375
0.7485714285714286