分类概率校准代码
Code for probability calibration for classification
我正在尝试创建一个 class 来校准 class 转换器。我一直在阅读关于概率校准的资源,但我对我们应该校准 classifier 的数据集有点困惑。我创建了一个 class 来拆分训练集以进一步训练和验证该集。然后,classifier 首先拟合到训练集并预测验证集上的未校准概率。
然后,我创建了 CalibrationCV class 的 cal_model 实例,然后将其拟合到验证集并再次预测验证集的校准概率。
有人可以看看下面的代码并为我更正代码吗?
class calibrate_model:
"""
A class that will split the training dataset to both train and validation set and then does
probability calibration.
model = Classification model
Xtrain = Independent feature set
ytrain = target variable set
cv = cross validation method
cal_method = 'sigmoid' or 'isotonic'.
"""
def __init__(self, model, Xtrain, ytrain, cv, cal_method):
self.model = model
self.Xtrain = Xtrain
self.ytrain = ytrain
self.cv = cv
self.cal_method = cal_method
def calibrate_probability(self):
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import calibration_curve
train_X, val_X, train_y, val_y = train_test_split(self.Xtrain,
self.ytrain,
test_size = 0.2,
random_state = seed)
#uncalibrated model
for train_index, test_index in self.cv.split(train_X, train_y):
X_train_kfold, X_val_kfold = train_X[train_index], train_X[test_index]
y_train_kfold, y_val_kfold = train_y[train_index], train_y[test_index]
self.model.fit(X_train_kfold, y_train_kfold)
uc_probs = self.model.predict_proba(val_X)[:, 1]
uc_fop, uc_mpv = calibration_curve(val_y, uc_probs, n_bins=10, normalize=True,
strategy = 'quantile')
#Calibrating Model
self.cal_model = CalibratedClassifierCV(self.model, method=self.cal_method, cv=self.cv)
self.cal_model.fit(val_X, val_y)
# predict probabilities
c_probs = self.cal_model.predict_proba(val_X)[:, 1]
# reliability diagram
c_fop, c_mpv = calibration_curve(val_y, c_probs, n_bins=10, normalize=True,
strategy = 'quantile')
# plot CATBOOST calibrated
plt.plot([0, 1], [0, 1], linestyle='--');
# plot un calibrated model reliability
plt.plot(uc_mpv, uc_fop, marker='.', label = 'Uncalibrated');
# plot calibrated reliability
plt.plot(c_mpv, c_fop, marker='.', label = 'Calibrated');
plt.title(type(self.model).__name__ + ' ' + self.cal_method)
plt.ylabel('Fraction of Positives (fop)')
plt.xlabel('Mean Predicted Value (mpv)')
plt.legend();
plt.tight_layout()
calibration_curve 代码是正确的。我正在比较逻辑回归校准与 xgboost 校准。数据帧包含 predict_proba[:,1] 值或发生的概率。见 (https://github.com/dnishimoto/python-deep-learning/blob/master/Credit%20Loan%20Risk%20.ipynb)
y_pred_prob_lr=pipeline['lr'].predict_proba(X_test)
y_preds_proba_lr_df=pd.DataFrame(y_pred_prob_lr[:,1],columns=
["pred_default_proba"])
xg_cl=
xgb.XGBClassifier(objective='binary:logistic',n_estimators=10,seed=123)
xg_cl.fit(X_train,y_train)
y_pred_xg=xg_cl.predict(X_test)
y_pred_proba_xg=xg_cl.predict_proba(X_test)
y_preds_proba_xg_df = pd.DataFrame(y_pred_proba_xg[:,1], columns =
['prob_default'])
frac_of_pos, mean_pred_val = calibration_curve(y_test,preds_proba_df , n_bins=10, normalize=True,
strategy = 'quantile')
frac_of_pos_lr, mean_pred_val_lr = calibration_curve(y_test,y_pred_prob_lr_df , n_bins=10, normalize=True,
strategy = 'quantile')
plt.plot([0, 1], [0, 1], 'k:', label="Perfectly calibrated")
plt.plot(mean_pred_val, frac_of_pos,
's-', label='%s' % 'XGBoost Regression')
plt.plot(mean_pred_val_lr, frac_of_pos_lr,
's-', label='%s' % 'Logistic Regression')
plt.xlabel('Fraction of positives')
plt.ylabel('Average Predicted Probability')
plt.legend()
plt.title('Calibration Curve')
plt.show()
我正在尝试创建一个 class 来校准 class 转换器。我一直在阅读关于概率校准的资源,但我对我们应该校准 classifier 的数据集有点困惑。我创建了一个 class 来拆分训练集以进一步训练和验证该集。然后,classifier 首先拟合到训练集并预测验证集上的未校准概率。
然后,我创建了 CalibrationCV class 的 cal_model 实例,然后将其拟合到验证集并再次预测验证集的校准概率。
有人可以看看下面的代码并为我更正代码吗?
class calibrate_model:
"""
A class that will split the training dataset to both train and validation set and then does
probability calibration.
model = Classification model
Xtrain = Independent feature set
ytrain = target variable set
cv = cross validation method
cal_method = 'sigmoid' or 'isotonic'.
"""
def __init__(self, model, Xtrain, ytrain, cv, cal_method):
self.model = model
self.Xtrain = Xtrain
self.ytrain = ytrain
self.cv = cv
self.cal_method = cal_method
def calibrate_probability(self):
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import calibration_curve
train_X, val_X, train_y, val_y = train_test_split(self.Xtrain,
self.ytrain,
test_size = 0.2,
random_state = seed)
#uncalibrated model
for train_index, test_index in self.cv.split(train_X, train_y):
X_train_kfold, X_val_kfold = train_X[train_index], train_X[test_index]
y_train_kfold, y_val_kfold = train_y[train_index], train_y[test_index]
self.model.fit(X_train_kfold, y_train_kfold)
uc_probs = self.model.predict_proba(val_X)[:, 1]
uc_fop, uc_mpv = calibration_curve(val_y, uc_probs, n_bins=10, normalize=True,
strategy = 'quantile')
#Calibrating Model
self.cal_model = CalibratedClassifierCV(self.model, method=self.cal_method, cv=self.cv)
self.cal_model.fit(val_X, val_y)
# predict probabilities
c_probs = self.cal_model.predict_proba(val_X)[:, 1]
# reliability diagram
c_fop, c_mpv = calibration_curve(val_y, c_probs, n_bins=10, normalize=True,
strategy = 'quantile')
# plot CATBOOST calibrated
plt.plot([0, 1], [0, 1], linestyle='--');
# plot un calibrated model reliability
plt.plot(uc_mpv, uc_fop, marker='.', label = 'Uncalibrated');
# plot calibrated reliability
plt.plot(c_mpv, c_fop, marker='.', label = 'Calibrated');
plt.title(type(self.model).__name__ + ' ' + self.cal_method)
plt.ylabel('Fraction of Positives (fop)')
plt.xlabel('Mean Predicted Value (mpv)')
plt.legend();
plt.tight_layout()
calibration_curve 代码是正确的。我正在比较逻辑回归校准与 xgboost 校准。数据帧包含 predict_proba[:,1] 值或发生的概率。见 (https://github.com/dnishimoto/python-deep-learning/blob/master/Credit%20Loan%20Risk%20.ipynb)
y_pred_prob_lr=pipeline['lr'].predict_proba(X_test)
y_preds_proba_lr_df=pd.DataFrame(y_pred_prob_lr[:,1],columns=
["pred_default_proba"])
xg_cl=
xgb.XGBClassifier(objective='binary:logistic',n_estimators=10,seed=123)
xg_cl.fit(X_train,y_train)
y_pred_xg=xg_cl.predict(X_test)
y_pred_proba_xg=xg_cl.predict_proba(X_test)
y_preds_proba_xg_df = pd.DataFrame(y_pred_proba_xg[:,1], columns =
['prob_default'])
frac_of_pos, mean_pred_val = calibration_curve(y_test,preds_proba_df , n_bins=10, normalize=True,
strategy = 'quantile')
frac_of_pos_lr, mean_pred_val_lr = calibration_curve(y_test,y_pred_prob_lr_df , n_bins=10, normalize=True,
strategy = 'quantile')
plt.plot([0, 1], [0, 1], 'k:', label="Perfectly calibrated")
plt.plot(mean_pred_val, frac_of_pos,
's-', label='%s' % 'XGBoost Regression')
plt.plot(mean_pred_val_lr, frac_of_pos_lr,
's-', label='%s' % 'Logistic Regression')
plt.xlabel('Fraction of positives')
plt.ylabel('Average Predicted Probability')
plt.legend()
plt.title('Calibration Curve')
plt.show()