Manual ROC curve doesn't match sklearn.metrics
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.preprocessing import binarize
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score
Data
y_pred = np.array([0.4, 0.2, 0.3, 0.6, 0.1, 0.3, 0.7, 0.2, 0.3, 0.8, 0.3, 0.9, 0.3, 0.2, 0.2,
0.4, 0.9, 0.4, 0.3, 0.6, 0.7, 0.2, 0.8, 0.2, 0.6, 0.1, 0.1])
y_test = np.array(["No","No","No","Yes","No","No","Yes","No","No","Yes","No","Yes",
"No","No","No", "No","Yes","No","No","No","No","Yes",
"No","Yes","No","No","No"])
Main program
I sweep the threshold and save the recall and the FPR in lists so I can plot them. I also save the values returned by the sklearn metrics in separate lists, to check that I am getting the right values.
def recall_fpr(confusion_matrix):
    """Given a confusion matrix, return the recall and the false positive rate."""
    cm = confusion_matrix
    Recall = round(cm[0, 0] / (cm[0, 0] + cm[0, 1]), 3)  # TP / (TP + FN)
    Precision = round(cm[0, 0] / (cm[0, 0] + cm[1, 0]), 3)  # TP / (TP + FP)
    False_Positive_rate = round((1 - Precision), 3)
    return Recall, False_Positive_rate
list_recall = []
list_fpr = []
list_recall_sk = []
list_fpr_sk = []

for i in range(1, 10):
    y_pred = y_pred.reshape(-1, 1)          # binarize expects a 2-D array
    y_pred2 = binarize(y_pred, threshold=i / 10)
    y_pred2 = np.where(y_pred2 == 1, 'Yes', 'No')
    cm = confusion_matrix(y_test, y_pred2, labels=["Yes", "No"])
    Recall, fpr = recall_fpr(cm)
    list_recall.append(Recall)
    list_fpr.append(fpr)

    # Sanity check: also record what sklearn's own metrics return
    recall_sk = round(recall_score(y_test, y_pred2, pos_label="Yes"), 3)
    list_recall_sk.append(recall_sk)
    fpr_sk = round(1 - round(precision_score(y_test, y_pred2, pos_label="Yes"), 3), 3)
    list_fpr_sk.append(fpr_sk)
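Note that binarize uses a strict greater-than comparison, so a score exactly equal to the threshold maps to 0 (roc_curve, by contrast, effectively predicts positive when the score is >= the threshold). A quick check:

import numpy as np
from sklearn.preprocessing import binarize

print(binarize(np.array([[0.3, 0.5, 0.7]]), threshold=0.5))
# [[0. 0. 1.]] -- 0.5 itself maps to 0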
Plot the values
df_threshold = pd.DataFrame({"Recall": list_recall, "False_Positives_rate": list_fpr})
df_threshold.plot(x='False_Positives_rate', y='Recall', style='o')
Compute the metrics with the sklearn function.
fpr_2, tpr_2, thresholds_2 = roc_curve(y_test, y_pred, pos_label="Yes")
plt.plot(fpr_2, tpr_2, linewidth=2)
plt.plot([0, 1], [0, 1], 'k--')
ax = plt.subplot(1, 1, 1)
ax.scatter(list_fpr, list_recall, c='red')
plt.show()
Why don't the values I compute match the sklearn metrics?
FPR is not 1 - precision. The former is FP / (FP + TN), while the latter is FP / (FP + TP).
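To make the difference concrete, here is a quick check at a single threshold (0.5) on the data from the question (a minimal sketch; the variable names are mine):

import numpy as np
from sklearn.metrics import confusion_matrix

y_score = np.array([0.4, 0.2, 0.3, 0.6, 0.1, 0.3, 0.7, 0.2, 0.3, 0.8, 0.3, 0.9, 0.3, 0.2, 0.2,
                    0.4, 0.9, 0.4, 0.3, 0.6, 0.7, 0.2, 0.8, 0.2, 0.6, 0.1, 0.1])
y_true = np.array(["No", "No", "No", "Yes", "No", "No", "Yes", "No", "No", "Yes", "No", "Yes",
                   "No", "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes",
                   "No", "Yes", "No", "No", "No"])

pred = np.where(y_score > 0.5, "Yes", "No")
cm = confusion_matrix(y_true, pred, labels=["Yes", "No"])
tp, fn = cm[0, 0], cm[0, 1]    # tp = 5, fn = 2
fp, tn = cm[1, 0], cm[1, 1]    # fp = 4, tn = 16

print(fp / (fp + tn))          # 0.2      <- the FPR that roc_curve plots
print(1 - tp / (tp + fp))      # 0.444... <- the "1 - precision" the question computes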
Correcting the recall_fpr function to

False_Positive_rate = round(cm[1, 0] / (cm[1, 0] + cm[1, 1]), 3)  # FP / (FP + TN)

gives the correct ROC curve.
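A minimal way to verify the fix, assuming the same data and threshold sweep as in the question: recompute the manual points with the corrected formula and check that each one is a vertex of sklearn's curve (drop_intermediate=False keeps every vertex):

import numpy as np
from sklearn.metrics import confusion_matrix, roc_curve

y_score = np.array([0.4, 0.2, 0.3, 0.6, 0.1, 0.3, 0.7, 0.2, 0.3, 0.8, 0.3, 0.9, 0.3, 0.2, 0.2,
                    0.4, 0.9, 0.4, 0.3, 0.6, 0.7, 0.2, 0.8, 0.2, 0.6, 0.1, 0.1])
y_true = np.array(["No", "No", "No", "Yes", "No", "No", "Yes", "No", "No", "Yes", "No", "Yes",
                   "No", "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes",
                   "No", "Yes", "No", "No", "No"])

manual = set()
for i in range(1, 10):
    pred = np.where(y_score > i / 10, "Yes", "No")
    cm = confusion_matrix(y_true, pred, labels=["Yes", "No"])
    recall = cm[0, 0] / (cm[0, 0] + cm[0, 1])   # TP / (TP + FN)
    fpr = cm[1, 0] / (cm[1, 0] + cm[1, 1])      # FP / (FP + TN), the corrected formula
    manual.add((round(fpr, 3), round(recall, 3)))

fpr_sk, tpr_sk, _ = roc_curve(y_true, y_score, pos_label="Yes",
                              drop_intermediate=False)
curve = set(zip(fpr_sk.round(3), tpr_sk.round(3)))
print(manual <= curve)   # True: every corrected manual point lies on the sklearn curve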