如何为 catboost 创建自定义评估指标?
How to create custom eval metric for catboost?
类似的问题:
- Python Catboost: Multiclass F1 score custom metric
Catboost 教程
问题
我有一个二分类问题。建模之后,我们得到了模型在测试集上的预测 y_pred,并且已经有了测试集的真实标签 y_true。
我想获得由以下等式定义的自定义评估指标:
profit = 400 * truePositive - 200 * falseNegative - 100 * falsePositive
此外,由于利润越高越好,我想最大化函数而不是最小化它。
如何在 catboost 中获得此 eval_metric?
使用 sklearn
def get_profit(y_true, y_pred):
    """Profit of binary predictions: +400 per TP, -200 per FN, -100 per FP."""
    cells = sklearn.metrics.confusion_matrix(y_true, y_pred).ravel()
    # The flattened 2x2 matrix is unpacked as (tn, fp, fn, tp); tn is unused.
    _tn, false_pos, false_neg, true_pos = cells
    return 400 * true_pos - 200 * false_neg - 100 * false_pos


# Expose the profit function as a scorer for which larger values are better.
scoring = sklearn.metrics.make_scorer(get_profit, greater_is_better=True)
使用 catboost
class ProfitMetric(object):
    # Skeleton of a custom CatBoost eval-metric object. evaluate() is still
    # unfinished: the placeholder line below is the open question.
    def get_final_error(self, error, weight):
        # Normalise the accumulated error by the accumulated weight; the tiny
        # epsilon keeps the division defined when weight is 0.
        return error / (weight + 1e-38)
    def is_max_optimal(self):
        # True means larger metric values are better.
        return True
    def evaluate(self, approxes, target, weight):
        # Exactly one prediction vector is expected, aligned with target.
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        approx = approxes[0]
        error_sum = 0.0
        weight_sum = 0.0
        ** I don't know here**
        return error_sum, weight_sum
问题
如何在 catboost 中完成自定义评估指标?
更新
到目前为止我的更新
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
def get_profit(y_true, y_pred):
    """Profit of binary predictions: +400 per TP, -200 per FN, -100 per FP."""
    cells = sklearn.metrics.confusion_matrix(y_true, y_pred).ravel()
    # The flattened 2x2 matrix is unpacked as (tn, fp, fn, tp); tn is unused.
    _tn, false_pos, false_neg, true_pos = cells
    return 400 * true_pos - 200 * false_neg - 100 * false_pos
class ProfitMetric:
    """Custom eval metric that scores predictions with ``get_profit``."""

    def is_max_optimal(self):
        # A higher profit is a better score.
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(profit, 1)`` for a single prediction vector."""
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        rounded = np.rint(approxes[0])
        labels = np.array(target).astype(int)
        # The weight argument is ignored; a constant weight of 1 is reported.
        return get_profit(labels, rounded), 1

    def get_final_error(self, error, weight):
        # The profit is reported as-is, without normalising by weight.
        return error
# Titanic data: `survived` is the target, the other four columns are features.
df = sns.load_dataset('titanic')
X = df[['survived','pclass','age','sibsp','fare']]
y = X.pop('survived')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
# Plug the custom metric object in as the evaluation metric.
model = CatBoostClassifier(metric_period=50,
                           n_estimators=200,
                           eval_metric=ProfitMetric()
                           )
# NOTE(review): this fits on the full X, y while X_train / y_train are unused,
# so the eval_set rows are also part of the training data.
model.fit(X, y, eval_set=(X_test, y_test)) # this fails
例如,我实现了一个非常简单的指标。
它统计多分类器(multiclass classifier)中 y_pred != y_true 的次数。
class CountErrors:
    """Custom multiclass eval metric: the number of wrong predictions."""

    def is_max_optimal(self):
        """Return False: fewer errors is better."""
        # The original omitted `return`, so this method yielded None.
        return False

    def evaluate(self, approxes, target, weight):
        """Count the objects whose predicted class differs from ``target``.

        Args:
            approxes: per-class score sequences, indexed ``[class][object]``.
            target: true class index for every object.
            weight: ignored by this metric.

        Returns:
            ``(error_count, 1)``; the constant 1 is a dummy weight.
        """
        # The predicted class is the one with the highest score per object.
        y_pred = np.array(approxes).argmax(0)
        y_true = np.array(target)
        # count_nonzero counts in NumPy instead of looping with builtin sum().
        return int(np.count_nonzero(y_pred != y_true)), 1

    def get_final_error(self, error, weight):
        """Return the error count unchanged, with no weight normalisation."""
        return error
如果你运行这个代码,你可以看到它被使用:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
class CountErrors:
    """Custom multiclass eval metric: the number of wrong predictions."""

    def is_max_optimal(self):
        """Return False: lower is better."""
        # The original omitted `return`, so this method yielded None.
        return False

    def evaluate(self, approxes, target, weight):
        """Count the objects whose predicted class differs from ``target``.

        Args:
            approxes: per-class score sequences, indexed ``[class][object]``.
            target: true class index for every object.
            weight: ignored by this metric.

        Returns:
            ``(error_count, 1)``; the constant 1 is a dummy weight.
        """
        # The predicted class is the one with the highest score per object.
        y_pred = np.array(approxes).argmax(0)
        y_true = np.array(target)
        # count_nonzero counts in NumPy instead of looping with builtin sum().
        return int(np.count_nonzero(y_pred != y_true)), 1

    def get_final_error(self, error, weight):
        """Return the error count unchanged, with no weight normalisation."""
        return error
# Abalone data: predict the `sex` class from the remaining columns.
df = pd.read_csv('https://raw.githubusercontent.com/mkleinbort/resource-datasets/master/abalone/abalone.csv')
y = df['sex']
X = df.drop(columns=['sex'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
model = CatBoostClassifier(metric_period=50, n_estimators=200, eval_metric=CountErrors())
# Fit on the training split only: fitting on the full X, y would put the
# eval_set rows into the training data and make the reported metric optimistic.
model.fit(X_train, y_train, eval_set=(X_test, y_test))
希望您可以根据您的用例进行调整。
与你的主要区别在于:
@staticmethod
def get_profit(y_true, y_pred):
    """Return 400*TP - 200*FN - 100*FP for raw scores against 0/1 labels.

    Args:
        y_true: array of 0/1 ground-truth labels.
        y_pred: raw log-odds scores, one per object.
    """
    # Map log-odds to probabilities, then threshold at 0.5. A bare
    # `.astype(int)` would truncate every probability below 1.0 to class 0.
    y_pred = (expit(y_pred) > 0.5).astype(int)
    y_true = y_true.astype(int)
    # Pin the label set so the matrix is always 2x2, even when only one
    # class occurs; otherwise the 4-way unpack raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return 400 * tp - 200 * fn - 100 * fp
从你链接的示例中看不出传入的预测值究竟是什么,但检查后发现 catboost 在内部把预测值当作原始对数几率(raw log-odds)处理(感谢 @Ben 的提示)。因此,要正确使用 confusion_matrix,必须确保 y_true 和 y_pred 都是整数类别标签。可以通过以下方式做到:
# expit turns log-odds into probabilities; thresholding at 0.5 then yields
# the integer 0/1 class labels (expit alone leaves floats in (0, 1)).
y_pred = (scipy.special.expit(y_pred) > 0.5).astype(int)
y_true = y_true.astype(int)
所以完整的工作代码是:
import numpy as np  # used by ProfitMetric.evaluate; missing from the original snippet
import seaborn as sns
from catboost import CatBoostClassifier
from scipy.special import expit
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Titanic data: `survived` is the target, the other four columns are features.
df = sns.load_dataset('titanic')
# Select target and features separately instead of popping a column out of
# a slice of df.
y = df['survived']
X = df[['pclass', 'age', 'sibsp', 'fare']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
class ProfitMetric:
    """Custom CatBoost eval metric: profit of a binary classifier.

    profit = 400 * TP - 200 * FN - 100 * FP; larger is better.
    """

    @staticmethod
    def get_profit(y_true, y_pred):
        """Return the profit for raw scores ``y_pred`` against ``y_true``.

        Args:
            y_true: array of 0/1 ground-truth labels.
            y_pred: raw log-odds scores, one per object.
        """
        # Map log-odds to probabilities, then threshold at 0.5. A bare
        # `.astype(int)` would truncate every probability below 1.0 to 0.
        y_pred = (expit(y_pred) > 0.5).astype(int)
        y_true = y_true.astype(int)
        # Pin the label set so the matrix is always 2x2, even when only one
        # class occurs; otherwise the 4-way unpack raises ValueError.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        return 400 * tp - 200 * fn - 100 * fp

    def is_max_optimal(self):
        """Return True: a greater profit is better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(profit, 1)`` for one vector of raw scores.

        Args:
            approxes: sequence holding exactly one vector of raw scores.
            target: true labels, same length as ``approxes[0]``.
            weight: ignored by this metric.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        y_true = np.array(target).astype(int)
        score = self.get_profit(y_true, approxes[0])
        # The constant 1 is a dummy weight; get_final_error ignores it.
        return score, 1

    def get_final_error(self, error, weight):
        """Return the profit unchanged, with no weight normalisation."""
        return error
model = CatBoostClassifier(
    metric_period=50,
    n_estimators=200,
    eval_metric=ProfitMetric(),
)
# Fit on the training split only: fitting on the full X, y would put the
# eval_set rows into the training data and make the reported profit optimistic.
model.fit(X_train, y_train, eval_set=(X_test, y_test))
类似的问题:
- Python Catboost: Multiclass F1 score custom metric
Catboost 教程
问题
我有一个二分类问题。建模之后,我们得到了模型在测试集上的预测 y_pred,并且已经有了测试集的真实标签 y_true。
我想获得由以下等式定义的自定义评估指标:
profit = 400 * truePositive - 200 * falseNegative - 100 * falsePositive
此外,由于利润越高越好,我想最大化函数而不是最小化它。
如何在 catboost 中获得此 eval_metric?
使用 sklearn
def get_profit(y_true, y_pred):
    """Profit of binary predictions: +400 per TP, -200 per FN, -100 per FP."""
    cells = sklearn.metrics.confusion_matrix(y_true, y_pred).ravel()
    # The flattened 2x2 matrix is unpacked as (tn, fp, fn, tp); tn is unused.
    _tn, false_pos, false_neg, true_pos = cells
    return 400 * true_pos - 200 * false_neg - 100 * false_pos


# Expose the profit function as a scorer for which larger values are better.
scoring = sklearn.metrics.make_scorer(get_profit, greater_is_better=True)
使用 catboost
class ProfitMetric(object):
    # Skeleton of a custom CatBoost eval-metric object. evaluate() is still
    # unfinished: the placeholder line below is the open question.
    def get_final_error(self, error, weight):
        # Normalise the accumulated error by the accumulated weight; the tiny
        # epsilon keeps the division defined when weight is 0.
        return error / (weight + 1e-38)
    def is_max_optimal(self):
        # True means larger metric values are better.
        return True
    def evaluate(self, approxes, target, weight):
        # Exactly one prediction vector is expected, aligned with target.
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        approx = approxes[0]
        error_sum = 0.0
        weight_sum = 0.0
        ** I don't know here**
        return error_sum, weight_sum
问题
如何在 catboost 中完成自定义评估指标?
更新
到目前为止我的更新
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
def get_profit(y_true, y_pred):
    """Profit of binary predictions: +400 per TP, -200 per FN, -100 per FP."""
    cells = sklearn.metrics.confusion_matrix(y_true, y_pred).ravel()
    # The flattened 2x2 matrix is unpacked as (tn, fp, fn, tp); tn is unused.
    _tn, false_pos, false_neg, true_pos = cells
    return 400 * true_pos - 200 * false_neg - 100 * false_pos
class ProfitMetric:
    """Custom eval metric that scores predictions with ``get_profit``."""

    def is_max_optimal(self):
        # A higher profit is a better score.
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(profit, 1)`` for a single prediction vector."""
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        rounded = np.rint(approxes[0])
        labels = np.array(target).astype(int)
        # The weight argument is ignored; a constant weight of 1 is reported.
        return get_profit(labels, rounded), 1

    def get_final_error(self, error, weight):
        # The profit is reported as-is, without normalising by weight.
        return error
# Titanic data: `survived` is the target, the other four columns are features.
df = sns.load_dataset('titanic')
X = df[['survived','pclass','age','sibsp','fare']]
y = X.pop('survived')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
# Plug the custom metric object in as the evaluation metric.
model = CatBoostClassifier(metric_period=50,
                           n_estimators=200,
                           eval_metric=ProfitMetric()
                           )
# NOTE(review): this fits on the full X, y while X_train / y_train are unused,
# so the eval_set rows are also part of the training data.
model.fit(X, y, eval_set=(X_test, y_test)) # this fails
例如,我实现了一个非常简单的指标。
它统计多分类器(multiclass classifier)中 y_pred != y_true 的次数。
class CountErrors:
    """Custom multiclass eval metric: the number of wrong predictions."""

    def is_max_optimal(self):
        """Return False: fewer errors is better."""
        # The original omitted `return`, so this method yielded None.
        return False

    def evaluate(self, approxes, target, weight):
        """Count the objects whose predicted class differs from ``target``.

        Args:
            approxes: per-class score sequences, indexed ``[class][object]``.
            target: true class index for every object.
            weight: ignored by this metric.

        Returns:
            ``(error_count, 1)``; the constant 1 is a dummy weight.
        """
        # The predicted class is the one with the highest score per object.
        y_pred = np.array(approxes).argmax(0)
        y_true = np.array(target)
        # count_nonzero counts in NumPy instead of looping with builtin sum().
        return int(np.count_nonzero(y_pred != y_true)), 1

    def get_final_error(self, error, weight):
        """Return the error count unchanged, with no weight normalisation."""
        return error
如果你运行这个代码,你可以看到它被使用:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
class CountErrors:
    """Custom multiclass eval metric: the number of wrong predictions."""

    def is_max_optimal(self):
        """Return False: lower is better."""
        # The original omitted `return`, so this method yielded None.
        return False

    def evaluate(self, approxes, target, weight):
        """Count the objects whose predicted class differs from ``target``.

        Args:
            approxes: per-class score sequences, indexed ``[class][object]``.
            target: true class index for every object.
            weight: ignored by this metric.

        Returns:
            ``(error_count, 1)``; the constant 1 is a dummy weight.
        """
        # The predicted class is the one with the highest score per object.
        y_pred = np.array(approxes).argmax(0)
        y_true = np.array(target)
        # count_nonzero counts in NumPy instead of looping with builtin sum().
        return int(np.count_nonzero(y_pred != y_true)), 1

    def get_final_error(self, error, weight):
        """Return the error count unchanged, with no weight normalisation."""
        return error
# Abalone data: predict the `sex` class from the remaining columns.
df = pd.read_csv('https://raw.githubusercontent.com/mkleinbort/resource-datasets/master/abalone/abalone.csv')
y = df['sex']
X = df.drop(columns=['sex'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
model = CatBoostClassifier(metric_period=50, n_estimators=200, eval_metric=CountErrors())
# Fit on the training split only: fitting on the full X, y would put the
# eval_set rows into the training data and make the reported metric optimistic.
model.fit(X_train, y_train, eval_set=(X_test, y_test))
希望您可以根据您的用例进行调整。
与你的主要区别在于:
@staticmethod
def get_profit(y_true, y_pred):
    """Return 400*TP - 200*FN - 100*FP for raw scores against 0/1 labels.

    Args:
        y_true: array of 0/1 ground-truth labels.
        y_pred: raw log-odds scores, one per object.
    """
    # Map log-odds to probabilities, then threshold at 0.5. A bare
    # `.astype(int)` would truncate every probability below 1.0 to class 0.
    y_pred = (expit(y_pred) > 0.5).astype(int)
    y_true = y_true.astype(int)
    # Pin the label set so the matrix is always 2x2, even when only one
    # class occurs; otherwise the 4-way unpack raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return 400 * tp - 200 * fn - 100 * fp
从你链接的示例中看不出传入的预测值究竟是什么,但检查后发现 catboost 在内部把预测值当作原始对数几率(raw log-odds)处理(感谢 @Ben 的提示)。因此,要正确使用 confusion_matrix,必须确保 y_true 和 y_pred 都是整数类别标签。可以通过以下方式做到:
# expit turns log-odds into probabilities; thresholding at 0.5 then yields
# the integer 0/1 class labels (expit alone leaves floats in (0, 1)).
y_pred = (scipy.special.expit(y_pred) > 0.5).astype(int)
y_true = y_true.astype(int)
所以完整的工作代码是:
import numpy as np  # used by ProfitMetric.evaluate; missing from the original snippet
import seaborn as sns
from catboost import CatBoostClassifier
from scipy.special import expit
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Titanic data: `survived` is the target, the other four columns are features.
df = sns.load_dataset('titanic')
# Select target and features separately instead of popping a column out of
# a slice of df.
y = df['survived']
X = df[['pclass', 'age', 'sibsp', 'fare']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
class ProfitMetric:
    """Custom CatBoost eval metric: profit of a binary classifier.

    profit = 400 * TP - 200 * FN - 100 * FP; larger is better.
    """

    @staticmethod
    def get_profit(y_true, y_pred):
        """Return the profit for raw scores ``y_pred`` against ``y_true``.

        Args:
            y_true: array of 0/1 ground-truth labels.
            y_pred: raw log-odds scores, one per object.
        """
        # Map log-odds to probabilities, then threshold at 0.5. A bare
        # `.astype(int)` would truncate every probability below 1.0 to 0.
        y_pred = (expit(y_pred) > 0.5).astype(int)
        y_true = y_true.astype(int)
        # Pin the label set so the matrix is always 2x2, even when only one
        # class occurs; otherwise the 4-way unpack raises ValueError.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        return 400 * tp - 200 * fn - 100 * fp

    def is_max_optimal(self):
        """Return True: a greater profit is better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(profit, 1)`` for one vector of raw scores.

        Args:
            approxes: sequence holding exactly one vector of raw scores.
            target: true labels, same length as ``approxes[0]``.
            weight: ignored by this metric.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        y_true = np.array(target).astype(int)
        score = self.get_profit(y_true, approxes[0])
        # The constant 1 is a dummy weight; get_final_error ignores it.
        return score, 1

    def get_final_error(self, error, weight):
        """Return the profit unchanged, with no weight normalisation."""
        return error
model = CatBoostClassifier(
    metric_period=50,
    n_estimators=200,
    eval_metric=ProfitMetric(),
)
# Fit on the training split only: fitting on the full X, y would put the
# eval_set rows into the training data and make the reported profit optimistic.
model.fit(X_train, y_train, eval_set=(X_test, y_test))