绘制多类问题的 ROC 曲线

Question

我正在尝试将 sklearn ROC extension to multiclass 的想法应用于我的数据集。我的每个 class ROC 曲线看起来都找到了一条直线，取消了显示曲线波动的 sklearn 示例。

我在下面给出一个 MWE 来说明我的意思：

# all imports
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import  make_classification
from sklearn.ensemble import RandomForestClassifier
# dummy dataset
X, y = make_classification(10000, n_classes=5, n_informative=10, weights=[.04, .4, .12, .5, .04])
train, test, ytrain, ytest = train_test_split(X, y, test_size=.3, random_state=42)

# random forest model
model = RandomForestClassifier()
model.fit(train, ytrain)
yhat = model.predict(test)

然后以下函数绘制 ROC 曲线：

def plot_roc_curve(y_test, y_pred):
  
  n_classes = len(np.unique(y_test))
  y_test = label_binarize(y_test, classes=np.arange(n_classes))
  y_pred = label_binarize(y_pred, classes=np.arange(n_classes))

  # Compute ROC curve and ROC area for each class
  fpr = dict()
  tpr = dict()
  roc_auc = dict()
  for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
  
  # Compute micro-average ROC curve and ROC area
  fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
  roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

  # First aggregate all false positive rates
  all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

  # Then interpolate all ROC curves at this points
  mean_tpr = np.zeros_like(all_fpr)
  for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

  # Finally average it and compute AUC
  mean_tpr /= n_classes

  fpr["macro"] = all_fpr
  tpr["macro"] = mean_tpr
  roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

  # Plot all ROC curves
  #plt.figure(figsize=(10,5))
  plt.figure(dpi=600)
  lw = 2
  plt.plot(fpr["micro"], tpr["micro"],
    label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
    color="deeppink", linestyle=":", linewidth=4,)

  plt.plot(fpr["macro"], tpr["macro"],
    label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
    color="navy", linestyle=":", linewidth=4,)

  colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"])
  for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
        label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),)

  plt.plot([0, 1], [0, 1], "k--", lw=lw)
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel("False Positive Rate")
  plt.ylabel("True Positive Rate")
  plt.title("Receiver Operating Characteristic (ROC) curve")
  plt.legend()

输出：

plot_roc_curve(ytest, yhat)

那种直线弯曲一次。我想看到模型在不同阈值下的表现，而不仅仅是一个，类似于 sklearn's illustration for 3-classes 如下图所示：

Answer 1

重点是您使用 predict() 而不是 predict_proba()/decision_function() 来定义您的 y_hat。这意味着 - 考虑到阈值向量是由 y_hat 中不同值的数量定义的（请参阅 here 以供参考），每个 class [=41= 的阈值很少]only 在其上计算 tpr 和 fpr（这反过来意味着你的曲线只在几个点上被评估）。
确实，请考虑 doc 表示要在 roc_curve() 中传递给 y_scores 的内容，无论是概率估计值还是决策值。在 sklearn 的示例中，决策值用于计算分数。鉴于您正在考虑 RandomForestClassifier()，考虑 y_hat 中的概率估计应该是可行的方法。

标签二值化输出的意义何在？ ROC 的标准定义是二进制 classification。要传递给 multiclass 问题，您必须使用 OneVsAll 方法将您的问题转换为二元问题，这样您将拥有 n_class 条 ROC 曲线。（事实上，由于 SVC() 默认以 OvO 方式处理 multiclass 问题，在示例中，他们不得不通过应用 OneVsRestClassifier 构造函数强制使用 OvA；使用 RandomForestClassifier 你没有这样的问题，因为它本质上是 multiclass，请参阅 here 以供参考）。用这些术语来说，一旦切换到 predict_proba()，您就会发现标签二值化预测没有多大意义。

 # all imports
 import numpy as np
 import matplotlib.pyplot as plt
 from itertools import cycle
 from sklearn import svm, datasets
 from sklearn.metrics import roc_curve, auc
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import label_binarize
 from sklearn.datasets import  make_classification
 from sklearn.ensemble import RandomForestClassifier
 # dummy dataset
 X, y = make_classification(10000, n_classes=5, n_informative=10, weights=[.04, .4, .12, .5, .04])
 train, test, ytrain, ytest = train_test_split(X, y, test_size=.3, random_state=42)

 # random forest model
 model = RandomForestClassifier()
 model.fit(train, ytrain)
 yhat = model.predict_proba(test)

 def plot_roc_curve(y_test, y_pred):
     n_classes = len(np.unique(y_test))
     y_test = label_binarize(y_test, classes=np.arange(n_classes))

     # Compute ROC curve and ROC area for each class
     fpr = dict()
     tpr = dict()
     roc_auc = dict()
     thresholds = dict()
     for i in range(n_classes):
       fpr[i], tpr[i], thresholds[i] = roc_curve(y_test[:, i], y_pred[:, i], drop_intermediate=False)
     roc_auc[i] = auc(fpr[i], tpr[i])

     # Compute micro-average ROC curve and ROC area
     fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
     roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

     # First aggregate all false positive rates
     all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

     # Then interpolate all ROC curves at this points
     mean_tpr = np.zeros_like(all_fpr)
     for i in range(n_classes):
       mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

     # Finally average it and compute AUC
     mean_tpr /= n_classes

     fpr["macro"] = all_fpr
     tpr["macro"] = mean_tpr
     roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

     # Plot all ROC curves
     #plt.figure(figsize=(10,5))
     plt.figure(dpi=600)
     lw = 2
     plt.plot(fpr["micro"], tpr["micro"],
     label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
     color="deeppink", linestyle=":", linewidth=4,)

     plt.plot(fpr["macro"], tpr["macro"],
     label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
     color="navy", linestyle=":", linewidth=4,)

     colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"])
     for i, color in zip(range(n_classes), colors):
       plt.plot(fpr[i], tpr[i], color=color, lw=lw,
       label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),)

     plt.plot([0, 1], [0, 1], "k--", lw=lw)
     plt.xlim([0.0, 1.0])
     plt.ylim([0.0, 1.05])
     plt.xlabel("False Positive Rate")
     plt.ylabel("True Positive Rate")
     plt.title("Receiver Operating Characteristic (ROC) curve")
     plt.legend()

最后，请考虑 roc_curve() 还有一个 drop_intermediate 参数，用于降低次优阈值（了解它可能会有用）。

绘制多类问题的 ROC 曲线

Plotting the ROC curve for a multiclass problem

python

machine-learning

roc

scikit-learn

auc