绘制多类问题的 ROC 曲线
Plotting the ROC curve for a multiclass problem
我正在尝试将 sklearn
ROC extension to multiclass 的想法应用于我的数据集。我的每个 class 的 ROC 曲线看起来都近似一条直线,不像显示出曲线波动的 sklearn
示例。
我在下面给出一个 MWE 来说明我的意思:
# all imports
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# Synthetic, imbalanced 5-class dataset.
# NOTE(review): the weights below sum to 1.10 rather than 1 — confirm this is intended.
X, y = make_classification(
    n_samples=10000,
    n_classes=5,
    n_informative=10,
    weights=[0.04, 0.4, 0.12, 0.5, 0.04],
)
train, test, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a random forest with default settings and take its hard class
# predictions on the held-out split.
model = RandomForestClassifier()
model.fit(train, ytrain)
yhat = model.predict(test)
然后以下函数绘制 ROC 曲线:
def plot_roc_curve(y_test, y_pred):
    """Draw per-class, micro-average and macro-average ROC curves.

    Both ``y_test`` and ``y_pred`` are treated as class labels: each is
    one-hot encoded with ``label_binarize`` over ``0 .. n_classes - 1``,
    where ``n_classes`` is the number of distinct values in ``y_test``.
    The curves are drawn on a new matplotlib figure; nothing is returned.
    """
    num_classes = len(np.unique(y_test))
    class_ids = np.arange(num_classes)
    true_onehot = label_binarize(y_test, classes=class_ids)
    pred_onehot = label_binarize(y_pred, classes=class_ids)

    fpr, tpr, roc_auc = {}, {}, {}

    # One-vs-rest curve and area for every class.
    for k in range(num_classes):
        fpr[k], tpr[k], _ = roc_curve(true_onehot[:, k], pred_onehot[:, k])
        roc_auc[k] = auc(fpr[k], tpr[k])

    # Micro-average: pool every (sample, class) decision into one binary task.
    fpr["micro"], tpr["micro"], _ = roc_curve(
        true_onehot.ravel(), pred_onehot.ravel()
    )
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on the union of all
    # false-positive rates, then take the unweighted mean.
    fpr_grid = np.unique(np.concatenate([fpr[k] for k in range(num_classes)]))
    tpr_total = np.zeros_like(fpr_grid)
    for k in range(num_classes):
        tpr_total += np.interp(fpr_grid, fpr[k], tpr[k])
    fpr["macro"] = fpr_grid
    tpr["macro"] = tpr_total / num_classes
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plotting.
    plt.figure(dpi=600)
    line_width = 2
    average_style = {"linestyle": ":", "linewidth": 4}
    plt.plot(
        fpr["micro"],
        tpr["micro"],
        label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
        color="deeppink",
        **average_style,
    )
    plt.plot(
        fpr["macro"],
        tpr["macro"],
        label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
        color="navy",
        **average_style,
    )

    # The palette repeats when there are more classes than colours.
    palette = ["aqua", "darkorange", "darkgreen", "yellow", "blue"]
    for k in range(num_classes):
        plt.plot(
            fpr[k],
            tpr[k],
            color=palette[k % len(palette)],
            lw=line_width,
            label="ROC curve of class {0} (area = {1:0.2f})".format(k, roc_auc[k]),
        )

    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], "k--", lw=line_width)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) curve")
    plt.legend()
输出:
# Plot the ROC curves for the held-out labels against the model output computed above.
plot_roc_curve(ytest, yhat)
得到的是只弯折一次的近似直线。我想看到模型在不同阈值下的表现,而不仅仅是一个阈值,就像下图所示的 sklearn's illustration for 3-classes 那样:
重点是您使用 predict()
而不是 predict_proba()
/decision_function()
来定义您的 y_hat
。这意味着 - 考虑到阈值向量是由 y_hat
中不同值的数量定义的(请参阅 here 以供参考),每个 class 只有很少的阈值可用于计算 tpr
和 fpr
(这反过来意味着你的曲线只在几个点上被评估)。
确实,请考虑 doc 表示要在 roc_curve()
中传递给 y_scores
的内容,无论是概率估计值还是决策值。在 sklearn
的示例中,决策值用于计算分数。鉴于您正在考虑 RandomForestClassifier()
,考虑 y_hat
中的概率估计应该是可行的方法。
对输出做标签二值化的意义何在? ROC 的标准定义是针对二分类(binary classification)的。要推广到 multiclass 问题,您必须使用 OneVsAll 方法将您的问题转换为若干二分类问题,这样您将拥有 n_class
条 ROC 曲线。 (事实上 ,由于 SVC()
默认以 OvO 方式处理 multiclass 问题,在示例中,他们不得不通过应用 OneVsRestClassifier
构造函数强制使用 OvA;使用 RandomForestClassifier
你没有这样的问题,因为它本质上是 multiclass,请参阅 here 以供参考)。用这些术语来说,一旦切换到 predict_proba()
,您就会发现标签二值化预测没有多大意义。
# all imports
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# Synthetic, imbalanced 5-class dataset.
# NOTE(review): the weights below sum to 1.10 rather than 1 — confirm this is intended.
X, y = make_classification(
    n_samples=10000,
    n_classes=5,
    n_informative=10,
    weights=[0.04, 0.4, 0.12, 0.5, 0.04],
)
train, test, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a random forest with default settings and keep the per-class
# probability estimates (one column per class) for the held-out split.
model = RandomForestClassifier()
model.fit(train, ytrain)
yhat = model.predict_proba(test)
def plot_roc_curve(y_test, y_pred, classes=None):
    """Plot one-vs-rest ROC curves plus their micro- and macro-averages.

    Parameters
    ----------
    y_test : array-like of shape (n_samples,)
        True class labels.
    y_pred : array-like of shape (n_samples, n_classes)
        Per-class scores, e.g. the output of ``predict_proba``. Column ``i``
        must hold the score of ``classes[i]``.
    classes : array-like of shape (n_classes,), optional
        Class labels in the column order of ``y_pred`` (for a scikit-learn
        estimator this is ``model.classes_``). Defaults to the sorted unique
        labels found in ``y_test``.

    Raises
    ------
    ValueError
        If ``y_pred`` is not 2-D, or if its number of columns differs from
        the number of classes.

    Notes
    -----
    The curves are drawn on a new matplotlib figure; nothing is returned.
    A class that never occurs in ``y_test`` has no positive samples, so its
    ROC curve (and therefore the macro-average) is not meaningful.
    """
    y_pred = np.asarray(y_pred)
    if y_pred.ndim != 2:
        raise ValueError(
            "y_pred must be a 2-D array of per-class scores "
            "(e.g. predict_proba output), not hard class labels"
        )

    # Use the real labels rather than assuming they are 0 .. n_classes - 1.
    classes = np.unique(y_test) if classes is None else np.asarray(classes)
    n_classes = len(classes)
    if y_pred.shape[1] != n_classes:
        raise ValueError(
            "y_pred has {0} columns but {1} classes were found; pass `classes` "
            "explicitly (e.g. model.classes_)".format(y_pred.shape[1], n_classes)
        )

    y_true = label_binarize(y_test, classes=classes)
    if n_classes == 2 and y_true.shape[1] == 1:
        # label_binarize emits a single column for binary targets; expand it
        # so that there is one indicator column per score column.
        y_true = np.hstack([1 - y_true, y_true])

    # Compute ROC curve and ROC area for each class
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(n_classes):
        # Keep every threshold so the plotted curve shows all operating points.
        fpr[i], tpr[i], _ = roc_curve(
            y_true[:, i], y_pred[:, i], drop_intermediate=False
        )
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: aggregate all false positive rates, interpolate every
    # class curve at those points, then average and compute the AUC.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure(dpi=600)
    lw = 2
    plt.plot(
        fpr["micro"],
        tpr["micro"],
        label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
        color="deeppink",
        linestyle=":",
        linewidth=4,
    )
    plt.plot(
        fpr["macro"],
        tpr["macro"],
        label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
        color="navy",
        linestyle=":",
        linewidth=4,
    )
    colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"])
    for i, color in zip(range(n_classes), colors):
        plt.plot(
            fpr[i],
            tpr[i],
            color=color,
            lw=lw,
            label="ROC curve of class {0} (area = {1:0.2f})".format(
                classes[i], roc_auc[i]
            ),
        )
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], "k--", lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) curve")
    plt.legend()
最后,请考虑 roc_curve()
还有一个 drop_intermediate
参数,用于丢弃次优阈值(了解它可能会有用)。
我正在尝试将 sklearn
ROC extension to multiclass 的想法应用于我的数据集。我的每个 class 的 ROC 曲线看起来都近似一条直线,不像显示出曲线波动的 sklearn
示例。
我在下面给出一个 MWE 来说明我的意思:
# all imports
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# Synthetic, imbalanced 5-class dataset.
# NOTE(review): the weights below sum to 1.10 rather than 1 — confirm this is intended.
X, y = make_classification(
    n_samples=10000,
    n_classes=5,
    n_informative=10,
    weights=[0.04, 0.4, 0.12, 0.5, 0.04],
)
train, test, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a random forest with default settings and take its hard class
# predictions on the held-out split.
model = RandomForestClassifier()
model.fit(train, ytrain)
yhat = model.predict(test)
然后以下函数绘制 ROC 曲线:
def plot_roc_curve(y_test, y_pred):
    """Draw per-class, micro-average and macro-average ROC curves.

    Both ``y_test`` and ``y_pred`` are treated as class labels: each is
    one-hot encoded with ``label_binarize`` over ``0 .. n_classes - 1``,
    where ``n_classes`` is the number of distinct values in ``y_test``.
    The curves are drawn on a new matplotlib figure; nothing is returned.
    """
    num_classes = len(np.unique(y_test))
    class_ids = np.arange(num_classes)
    true_onehot = label_binarize(y_test, classes=class_ids)
    pred_onehot = label_binarize(y_pred, classes=class_ids)

    fpr, tpr, roc_auc = {}, {}, {}

    # One-vs-rest curve and area for every class.
    for k in range(num_classes):
        fpr[k], tpr[k], _ = roc_curve(true_onehot[:, k], pred_onehot[:, k])
        roc_auc[k] = auc(fpr[k], tpr[k])

    # Micro-average: pool every (sample, class) decision into one binary task.
    fpr["micro"], tpr["micro"], _ = roc_curve(
        true_onehot.ravel(), pred_onehot.ravel()
    )
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on the union of all
    # false-positive rates, then take the unweighted mean.
    fpr_grid = np.unique(np.concatenate([fpr[k] for k in range(num_classes)]))
    tpr_total = np.zeros_like(fpr_grid)
    for k in range(num_classes):
        tpr_total += np.interp(fpr_grid, fpr[k], tpr[k])
    fpr["macro"] = fpr_grid
    tpr["macro"] = tpr_total / num_classes
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plotting.
    plt.figure(dpi=600)
    line_width = 2
    average_style = {"linestyle": ":", "linewidth": 4}
    plt.plot(
        fpr["micro"],
        tpr["micro"],
        label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
        color="deeppink",
        **average_style,
    )
    plt.plot(
        fpr["macro"],
        tpr["macro"],
        label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
        color="navy",
        **average_style,
    )

    # The palette repeats when there are more classes than colours.
    palette = ["aqua", "darkorange", "darkgreen", "yellow", "blue"]
    for k in range(num_classes):
        plt.plot(
            fpr[k],
            tpr[k],
            color=palette[k % len(palette)],
            lw=line_width,
            label="ROC curve of class {0} (area = {1:0.2f})".format(k, roc_auc[k]),
        )

    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], "k--", lw=line_width)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) curve")
    plt.legend()
输出:
# Plot the ROC curves for the held-out labels against the model output computed above.
plot_roc_curve(ytest, yhat)
得到的是只弯折一次的近似直线。我想看到模型在不同阈值下的表现,而不仅仅是一个阈值,就像下图所示的 sklearn's illustration for 3-classes 那样:
重点是您使用
predict()
而不是predict_proba()
/decision_function()
来定义您的y_hat
。这意味着 - 考虑到阈值向量是由y_hat
中不同值的数量定义的(请参阅 here 以供参考),每个 class 只有很少的阈值可用于计算 tpr
和fpr
(这反过来意味着你的曲线只在几个点上被评估)。确实,请考虑 doc 表示要在
roc_curve()
中传递给y_scores
的内容,无论是概率估计值还是决策值。在sklearn
的示例中,决策值用于计算分数。鉴于您正在考虑RandomForestClassifier()
,考虑y_hat
中的概率估计应该是可行的方法。对输出做标签二值化的意义何在? ROC 的标准定义是针对二分类(binary classification)的。要推广到 multiclass 问题,您必须使用 OneVsAll 方法将您的问题转换为若干二分类问题,这样您将拥有
n_class
条 ROC 曲线。 (事实上 ,由于SVC()
默认以 OvO 方式处理 multiclass 问题,在示例中,他们不得不通过应用OneVsRestClassifier
构造函数强制使用 OvA;使用RandomForestClassifier
你没有这样的问题,因为它本质上是 multiclass,请参阅 here 以供参考)。用这些术语来说,一旦切换到predict_proba()
,您就会发现标签二值化预测没有多大意义。# all imports import numpy as np import matplotlib.pyplot as plt from itertools import cycle from sklearn import svm, datasets from sklearn.metrics import roc_curve, auc from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.datasets import make_classification from sklearn.ensemble import RandomForestClassifier # dummy dataset X, y = make_classification(10000, n_classes=5, n_informative=10, weights=[.04, .4, .12, .5, .04]) train, test, ytrain, ytest = train_test_split(X, y, test_size=.3, random_state=42) # random forest model model = RandomForestClassifier() model.fit(train, ytrain) yhat = model.predict_proba(test) def plot_roc_curve(y_test, y_pred): n_classes = len(np.unique(y_test)) y_test = label_binarize(y_test, classes=np.arange(n_classes)) # Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() thresholds = dict() for i in range(n_classes): fpr[i], tpr[i], thresholds[i] = roc_curve(y_test[:, i], y_pred[:, i], drop_intermediate=False) roc_auc[i] = auc(fpr[i], tpr[i]) # Compute micro-average ROC curve and ROC area fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # First aggregate all false positive rates all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) # Then interpolate all ROC curves at this points mean_tpr = np.zeros_like(all_fpr) for i in range(n_classes): mean_tpr += np.interp(all_fpr, fpr[i], tpr[i]) # Finally average it and compute AUC mean_tpr /= n_classes fpr["macro"] = all_fpr tpr["macro"] = mean_tpr roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) # Plot all ROC curves #plt.figure(figsize=(10,5)) plt.figure(dpi=600) lw = 2 plt.plot(fpr["micro"], tpr["micro"], label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]), color="deeppink", linestyle=":", linewidth=4,) plt.plot(fpr["macro"], tpr["macro"], label="macro-average 
ROC curve (area = {0:0.2f})".format(roc_auc["macro"]), color="navy", linestyle=":", linewidth=4,) colors = cycle(["aqua", "darkorange", "darkgreen", "yellow", "blue"]) for i, color in zip(range(n_classes), colors): plt.plot(fpr[i], tpr[i], color=color, lw=lw, label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]),) plt.plot([0, 1], [0, 1], "k--", lw=lw) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel("False Positive Rate") plt.ylabel("True Positive Rate") plt.title("Receiver Operating Characteristic (ROC) curve") plt.legend()
最后,请考虑 roc_curve()
还有一个 drop_intermediate
参数,用于丢弃次优阈值(了解它可能会有用)。