分类概率校准代码

Code for probability calibration for classification

我正在尝试创建一个 class 来校准 class 转换器。我一直在阅读关于概率校准的资源,但我对我们应该校准 classifier 的数据集有点困惑。我创建了一个 class 来拆分训练集以进一步训练和验证该集。然后,classifier 首先拟合到训练集并预测验证集上的未校准概率。

然后,我创建了 CalibratedClassifierCV class 的 cal_model 实例,然后将其拟合到验证集并再次预测验证集的校准概率。

有人可以看看下面的代码并为我更正代码吗?

class calibrate_model:
    """
    Split the training data into a train and a validation part, fit the
    (uncalibrated) model on the train part, calibrate it on the validation
    part, and plot reliability curves for both.

    Parameters
    ----------
    model : classifier implementing ``fit`` / ``predict_proba``
    Xtrain : array-like
        Independent feature set.
    ytrain : array-like
        Target variable.
    cv : cross-validation splitter
        Kept for interface compatibility; no longer used internally
        (see note in ``calibrate_probability``).
    cal_method : {'sigmoid', 'isotonic'}
        Calibration method passed to ``CalibratedClassifierCV``.
    seed : int, optional
        random_state for the train/validation split (default 42).
        The original code read an undefined global ``seed``.
    """

    def __init__(self, model, Xtrain, ytrain, cv, cal_method, seed=42):
        self.model = model
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.cv = cv
        self.cal_method = cal_method
        self.seed = seed  # BUG FIX: `seed` was an undefined global

    def calibrate_probability(self):
        """Fit, calibrate, and plot uncalibrated vs calibrated reliability curves."""
        import numpy as np
        import matplotlib.pyplot as plt  # BUG FIX: plt was used but never imported
        from sklearn.model_selection import train_test_split
        from sklearn.calibration import CalibratedClassifierCV
        from sklearn.calibration import calibration_curve

        # Convert up front so positional indexing works for DataFrames
        # as well as for plain arrays.
        X = np.asarray(self.Xtrain)
        y = np.asarray(self.ytrain)

        train_X, val_X, train_y, val_y = train_test_split(
            X, y, test_size=0.2, random_state=self.seed)

        # --- uncalibrated model -------------------------------------------
        # NOTE(review): the original looped over self.cv.split() and called
        # model.fit() on every fold, which just refits the same estimator
        # repeatedly — only the last fold's fit survived.  A single fit on
        # the full training part is the intended behavior.
        self.model.fit(train_X, train_y)

        uc_probs = self.model.predict_proba(val_X)[:, 1]
        # `normalize=True` was dropped: predict_proba is already in [0, 1],
        # and the parameter is deprecated/removed in recent scikit-learn.
        uc_fop, uc_mpv = calibration_curve(
            val_y, uc_probs, n_bins=10, strategy='quantile')

        # --- calibrated model ---------------------------------------------
        # BUG FIX: with cv=self.cv, CalibratedClassifierCV would refit fresh
        # clones of the model on the validation set and the held-out fit
        # would be discarded.  cv='prefit' reuses the model fitted above and
        # calibrates it on data it has never seen (the validation part),
        # which matches the workflow described in the question.
        self.cal_model = CalibratedClassifierCV(
            self.model, method=self.cal_method, cv='prefit')
        self.cal_model.fit(val_X, val_y)

        # predict calibrated probabilities on the same validation part
        c_probs = self.cal_model.predict_proba(val_X)[:, 1]

        # reliability diagram
        c_fop, c_mpv = calibration_curve(
            val_y, c_probs, n_bins=10, strategy='quantile')

        # diagonal = perfectly calibrated reference
        plt.plot([0, 1], [0, 1], linestyle='--')

        # uncalibrated model reliability
        plt.plot(uc_mpv, uc_fop, marker='.', label='Uncalibrated')

        # calibrated model reliability
        plt.plot(c_mpv, c_fop, marker='.', label='Calibrated')

        plt.title(type(self.model).__name__ + ' ' + self.cal_method)
        plt.ylabel('Fraction of Positives (fop)')
        plt.xlabel('Mean Predicted Value (mpv)')
        plt.legend()
        plt.tight_layout()

calibration_curve 代码是正确的。我正在比较逻辑回归校准与 xgboost 校准。数据帧包含 predict_proba[:,1] 值或发生的概率。见 (https://github.com/dnishimoto/python-deep-learning/blob/master/Credit%20Loan%20Risk%20.ipynb)

 y_pred_prob_lr=pipeline['lr'].predict_proba(X_test)
 y_preds_proba_lr_df=pd.DataFrame(y_pred_prob_lr[:,1],columns= 
 ["pred_default_proba"])

 xg_cl= 
 xgb.XGBClassifier(objective='binary:logistic',n_estimators=10,seed=123)

 xg_cl.fit(X_train,y_train)

  y_pred_xg=xg_cl.predict(X_test)
  y_pred_proba_xg=xg_cl.predict_proba(X_test)

  y_preds_proba_xg_df = pd.DataFrame(y_pred_proba_xg[:,1], columns = 
  ['prob_default'])


  frac_of_pos, mean_pred_val = calibration_curve(y_test,preds_proba_df , n_bins=10, normalize=True,
                                    strategy = 'quantile')

  frac_of_pos_lr, mean_pred_val_lr = calibration_curve(y_test,y_pred_prob_lr_df , n_bins=10, normalize=True,
                                    strategy = 'quantile')



  plt.plot([0, 1], [0, 1], 'k:', label="Perfectly calibrated")    
  plt.plot(mean_pred_val, frac_of_pos,
     's-', label='%s' % 'XGBoost Regression')

  plt.plot(mean_pred_val_lr, frac_of_pos_lr,
     's-', label='%s' % 'Logistic Regression')

  plt.xlabel('Fraction of positives')
  plt.ylabel('Average Predicted Probability')
  plt.legend()

  plt.title('Calibration Curve')
  plt.show()