使用 scikit-learn 计算 AUC 的正确方法是什么?
Which is the correct way to calculate AUC with scikit-learn?
我注意到下面两段代码给出的 AUC 结果不同。
#1
metrics.plot_roc_curve(classifier, X_test, y_test, ax=plt.gca())
#2
metrics.plot_roc_curve(classifier, X_test, y_test, ax=plt.gca(), label=clsname + ' (AUC = %.2f)' % roc_auc_score(y_test, y_predicted))
那么,哪种方法是正确的呢?
我添加了一个简单的可重现示例:
from sklearn.metrics import roc_auc_score
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12)
svclassifier = SVC(kernel='rbf')
svclassifier.fit(X_train, y_train)
y_predicted = svclassifier.predict(X_test)
print('AUC = %.2f' % roc_auc_score(y_test, y_predicted)) #1
metrics.plot_roc_curve(svclassifier, X_test, y_test, ax=plt.gca()) #2
plt.show()
输出(#1):
AUC = 0.86
而 #2 绘制的 ROC 曲线图例中给出的 AUC 与上面的数值不同。
这里的区别很可能在于:plot_roc_curve 在内部使用 predict_proba()(分类器不提供该方法时,则使用 decision_function())得到每个样本的连续得分,并据此计算 AUC;而 predict() 只返回离散的类别标签。
示例:当您使用 classifier.predict() 时:
import matplotlib.pyplot as plt
from sklearn import datasets, metrics, model_selection, svm
X, y = datasets.make_classification(random_state=0)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=0)
clf = svm.SVC(random_state=0,probability=False)
clf.fit(X_train, y_train)
clf.predict(X_test)
>> array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
1, 0, 0])
# calculate auc
metrics.roc_auc_score(y_test, clf.predict(X_test))
>>>0.8782051282051283 # ~0.88
而当您使用 classifier.predict_proba() 时:
X, y = datasets.make_classification(random_state=0)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=0)
# set probability=True
clf = svm.SVC(random_state=0,probability=True)
clf.fit(X_train, y_train)
clf.predict_proba(X_test)
>> array([[0.13625954, 0.86374046],
[0.90517034, 0.09482966],
[0.19754525, 0.80245475],
[0.96741274, 0.03258726],
[0.80850602, 0.19149398],
......................,
[0.31927198, 0.68072802],
[0.8454472 , 0.1545528 ],
[0.75919018, 0.24080982]])
# calculate auc
# when computing the roc auc metrics, by default, estimators.classes_[1] is
# considered as the positive class here 'clf.predict_proba(X_test)[:,1]'
metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
>> 0.9102564102564102
所以就你的问题而言:metrics.plot_roc_curve(classifier, X_test, y_test, ax=plt.gca()) 默认基于 predict_proba()(分类器不提供该方法时,则基于 decision_function())输出的连续得分来计算 AUC;
而对于 metrics.plot_roc_curve(classifier, X_test, y_test, ax=plt.gca(), label=clsname + ' (AUC = %.2f)' % roc_auc_score(y_test, y_predicted)),你是用 predict() 得到的离散标签自行计算 roc_auc_score,并把该分数作为图例标签传入。
AUC 应当基于连续得分(概率或决策函数值)来计算,因此前一种方式得到的数值才是正确的 AUC。
我注意到下面两段代码给出的 AUC 结果不同。
#1
metrics.plot_roc_curve(classifier, X_test, y_test, ax=plt.gca())
#2
metrics.plot_roc_curve(classifier, X_test, y_test, ax=plt.gca(), label=clsname + ' (AUC = %.2f)' % roc_auc_score(y_test, y_predicted))
那么,哪种方法是正确的呢?
我添加了一个简单的可重现示例:
from sklearn.metrics import roc_auc_score
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12)
svclassifier = SVC(kernel='rbf')
svclassifier.fit(X_train, y_train)
y_predicted = svclassifier.predict(X_test)
print('AUC = %.2f' % roc_auc_score(y_test, y_predicted)) #1
metrics.plot_roc_curve(svclassifier, X_test, y_test, ax=plt.gca()) #2
plt.show()
输出(#1):
AUC = 0.86
而 #2 绘制的 ROC 曲线图例中给出的 AUC 与上面的数值不同。
这里的区别很可能在于:plot_roc_curve 在内部使用 predict_proba()(分类器不提供该方法时,则使用 decision_function())得到每个样本的连续得分,并据此计算 AUC;而 predict() 只返回离散的类别标签。
示例:当您使用 classifier.predict() 时:
import matplotlib.pyplot as plt
from sklearn import datasets, metrics, model_selection, svm
X, y = datasets.make_classification(random_state=0)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=0)
clf = svm.SVC(random_state=0,probability=False)
clf.fit(X_train, y_train)
clf.predict(X_test)
>> array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
1, 0, 0])
# calculate auc
metrics.roc_auc_score(y_test, clf.predict(X_test))
>>>0.8782051282051283 # ~0.88
而当您使用 classifier.predict_proba() 时:
X, y = datasets.make_classification(random_state=0)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=0)
# set probability=True
clf = svm.SVC(random_state=0,probability=True)
clf.fit(X_train, y_train)
clf.predict_proba(X_test)
>> array([[0.13625954, 0.86374046],
[0.90517034, 0.09482966],
[0.19754525, 0.80245475],
[0.96741274, 0.03258726],
[0.80850602, 0.19149398],
......................,
[0.31927198, 0.68072802],
[0.8454472 , 0.1545528 ],
[0.75919018, 0.24080982]])
# calculate auc
# when computing the roc auc metrics, by default, estimators.classes_[1] is
# considered as the positive class here 'clf.predict_proba(X_test)[:,1]'
metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
>> 0.9102564102564102
所以就你的问题而言:metrics.plot_roc_curve(classifier, X_test, y_test, ax=plt.gca()) 默认基于 predict_proba()(分类器不提供该方法时,则基于 decision_function())输出的连续得分来计算 AUC;
而对于 metrics.plot_roc_curve(classifier, X_test, y_test, ax=plt.gca(), label=clsname + ' (AUC = %.2f)' % roc_auc_score(y_test, y_predicted)),你是用 predict() 得到的离散标签自行计算 roc_auc_score,并把该分数作为图例标签传入。
AUC 应当基于连续得分(概率或决策函数值)来计算,因此前一种方式得到的数值才是正确的 AUC。