Reproducing LightGBM's `logloss` in the Python API
I would like to start using custom classification loss functions in LightGBM, and I figured a custom implementation of binary_logloss would be a good starting point. Following that answer, I managed to get a custom logloss whose performance is roughly the same as the built-in logloss (in the scikit-learn API).
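For context, a minimal sketch of that scikit-learn API setup (illustrative only, not the exact code from that answer; the constructor arguments here are just example choices):
import numpy as np
from lightgbm import LGBMClassifier

def sk_logloss(y_true, raw_preds):
    # sigmoid of the raw scores gives predicted probabilities
    probs = 1.0 / (1.0 + np.exp(-raw_preds))
    grad = probs - y_true           # gradient of binary logloss w.r.t. raw scores
    hess = probs * (1.0 - probs)    # hessian of binary logloss w.r.t. raw scores
    return grad, hess

clf = LGBMClassifier(objective=sk_logloss, n_estimators=100, learning_rate=0.05)
# clf.fit(x, y) would train with the custom objective in place of the built-in 'binary'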
I tried to follow the same logic in the Python API:
import lightgbm
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
x, x_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
train_data = lightgbm.Dataset(x, label=y, free_raw_data=False)
test_data = lightgbm.Dataset(x_test, label=y_test)
# DEFINE CUSTOM LOSS FUNCTION
def my_logloss(preds, data):
    y_true = data.get_label()
    preds = np.where(preds >= 0,
                     1. / (1. + np.exp(-preds)),
                     np.exp(preds) / (1. + np.exp(preds)))
    grad = -(y_true - preds)
    hess = preds * (1.0 - preds)
    return grad, hess
# DEFINE CUSTOM EVAL LOSS FUNCTION
def logloss_eval(preds, data):
    y_true = data.get_label()
    preds = np.where(preds >= 0,
                     1. / (1. + np.exp(-preds)),
                     np.exp(preds) / (1. + np.exp(preds)))
    loss = -(y_true * np.log(preds)) - ((1 - y_true) * np.log(1 - preds))
    return "binary_logloss", np.mean(loss), False
# RUN MODEL WITH BUILTIN LOSS
parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'num_leaves': 10,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 1
}
model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=[train_data, test_data],
                       valid_names=['train', 'valid'],
                       num_boost_round=100)
# RUN MODEL WITH CUSTOM LOSS
parameters = {
    'application': 'binary',
    # 'objective': 'binary',
    # 'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'num_leaves': 10,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 1
}
model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=[train_data, test_data],
                       valid_names=['train', 'valid'],
                       num_boost_round=100,
                       fobj=my_logloss,
                       feval=logloss_eval)
Unlike in the scikit-learn API case, I do not get the same training/validation loss sequences for the two models. For example, the first model ends with a training loss of 0.159872, while the second ends with 0.157686.
Is there a way to modify my code above so that it exactly matches the built-in logloss? If so, I would appreciate knowing how. If not, I would like to know what causes the difference.
The difference in results comes from two sources:
1. The different initialization LightGBM uses when a custom loss function is supplied; this GitHub issue explains how to work around it. The simplest fix is to set 'boost_from_average': False.
2. Feature subsampling due to feature_fraction < 1. This may be worth raising as an issue on GitHub, since it is unclear why the results are not reproducible even though feature_fraction_seed is fixed by default.
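On the first point, the initialization that boost_from_average applies for a binary objective is the log-odds of the positive-class frequency, so an alternative to disabling it is to supply that constant yourself as an init_score on the training Dataset. A minimal sketch (the helper name initial_log_odds is mine, not a LightGBM API):
def initial_log_odds(y):
    # constant raw score that boost_from_average would start from for a binary objective
    p = np.mean(y)
    return np.log(p / (1.0 - p))

# e.g. train_data.set_init_score(np.full(len(y), initial_log_odds(y)))
# note: predictions from a booster trained with fobj do not include this offset,
# so it would have to be added back before applying the sigmoid
The reproduction below takes the simpler route: it disables boost_from_average in both runs and comments out feature_fraction to remove the second source of divergence.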
import lightgbm
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
x, x_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
train_data = lightgbm.Dataset(data=x, label=y, free_raw_data=False)
test_data = lightgbm.Dataset(data=x_test, label=y_test)
# DEFINE CUSTOM LOSS FUNCTION
def my_logloss(preds, data):
    y_true = data.get_label()
    # numerically stable sigmoid: convert raw scores to probabilities
    preds = np.where(preds >= 0, 1. / (1. + np.exp(-preds)), np.exp(preds) / (1. + np.exp(preds)))
    grad = -(y_true - preds)        # gradient of binary logloss w.r.t. raw scores
    hess = preds * (1.0 - preds)    # hessian of binary logloss w.r.t. raw scores
    return grad, hess
# DEFINE CUSTOM EVAL LOSS FUNCTION
def logloss_eval(preds, data):
    y_true = data.get_label()
    # same numerically stable sigmoid as in the objective
    preds = np.where(preds >= 0, 1. / (1. + np.exp(-preds)), np.exp(preds) / (1. + np.exp(preds)))
    loss = -(y_true * np.log(preds)) - ((1 - y_true) * np.log(1 - preds))
    return 'custom_loss', np.mean(loss), False  # (name, value, is_higher_better)
# RUN MODEL WITH BUILTIN LOSS
parameters = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'num_leaves': 10,
    # 'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 1,
    'boost_from_average': False,
}
model = lightgbm.train(params=parameters,
                       train_set=train_data,
                       valid_sets=[train_data, test_data],
                       valid_names=['train', 'valid'],
                       num_boost_round=100)
# [100] train's binary_logloss: 0.133561 valid's binary_logloss: 0.271294
# RUN MODEL WITH CUSTOM LOSS
parameters = {
    'boosting': 'gbdt',
    'num_leaves': 10,
    # 'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 1,
    'boost_from_average': False,
}
model = lightgbm.train(params=parameters,
                       train_set=train_data,
                       valid_sets=[train_data, test_data],
                       valid_names=['train', 'valid'],
                       num_boost_round=100,
                       fobj=my_logloss,
                       feval=logloss_eval)
# [100] train's custom_loss: 0.133561 valid's custom_loss: 0.271294
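One practical follow-up: because the second booster was trained with fobj, its predict output is raw scores rather than probabilities, so the sigmoid has to be applied manually when comparing it against the first model. A small sketch, assuming the second model from above:
raw_scores = model.predict(x_test)                 # raw margins when trained with a custom objective
probabilities = 1.0 / (1.0 + np.exp(-raw_scores))  # convert to probabilities, as the built-in 'binary' model would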