如何控制xgboost的模型大小?

How to control the model size of xgboost?

遇到一种需要逐步训练模型的情况。我想得到一个小尺寸的模型，但就像下面的例子，我的中间模型的尺寸每次迭代都会增加，我不知道如何控制模型大小。

# -*- coding: utf-8 -*-

import xgboost as xgb
from sklearn.model_selection import train_test_split as ttsplit
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as mse

# Boston housing regression data (features and target).
X = load_boston()['data']
y = load_boston()['target']

# Split data into training and testing sets,
# then split the training set in half.
X_train, X_test, y_train, y_test = ttsplit(X, y, test_size=0.1, random_state=0)
X_train_1, X_train_2, y_train_1, y_train_2 = ttsplit(X_train, y_train, test_size=0.5, random_state=0)

xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
xg_test = xgb.DMatrix(X_test, label=y_test)

# NOTE: max_depth / max_leaves only bound the shape of each individual tree.
# Every continued-training round below still APPENDS 30 new trees, which is
# why each saved checkpoint is larger than the previous one.
params = {'objective': 'reg:squarederror', 'tree_method': 'hist', 'max_depth': 3, 'max_leaves': 100, 'grow_policy': 'lossguide'}
model_1 = xgb.train(params, xg_train_1, 30)
# model_1.save_model('t0.json')

# ============ train successive versions of the model (v1..v5) =============#
model_v1 = xgb.train(params, xg_train_2, 30)
model_v1.save_model('./t1.json')
print(model_v1.__dict__)

# Each round resumes from the previous checkpoint and writes its own file.
# (The original copy-pasted version mistakenly saved model_v5 over
# './t4.json'; the loop gives every round its own path, t2..t5.)
prev_path = './t1.json'
for step in range(2, 6):
    model = xgb.train(params, xg_train_2, 30, xgb_model=prev_path)
    prev_path = './t{}.json'.format(step)
    model.save_model(prev_path)
    print(model.__dict__)

顺便说一句，我已经阅读了 XGBoost 训练模型的每一个参数，并尝试了 max_depth、max_leaves 以及许多其他参数组合，但都不起作用。

JSON 模型文件的内容显示，每次继续训练都会添加更多的树。再次仔细阅读文档后，我发现 process_type 和 updater 这两个参数正是为此设计的。（max_depth 和 max_leaves 只限制单棵树的属性，并不限制树的数量。）

我们可以通过下面的代码解决这个问题

# -*- coding: utf-8 -*-

import xgboost as xgb
from sklearn.model_selection import train_test_split as ttsplit
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as mse

# Boston housing regression data: feature matrix and target vector.
boston = load_boston()
X = boston['data']
y = boston['target']

# Hold out 10% for testing, then cut the remaining training set in half.
X_train, X_test, y_train, y_test = ttsplit(X, y, test_size=0.1, random_state=0)
X_train_1, X_train_2, y_train_1, y_train_2 = ttsplit(X_train, y_train, test_size=0.5, random_state=0)

xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
xg_test = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'max_depth': 3,
    'max_leaves': 100,
}
model_1 = xgb.train(params, xg_train_1, 10)
# model_1.save_model('t0.json')

# ================= train two versions of the model =====================#
model_v1 = xgb.train(params, xg_train_2)
model_v1.save_model('./t1.json')
print(model_v1.__dict__)

# Key settings: continued training that UPDATES the existing trees in place
# (refreshing their statistics) rather than appending new trees, so the
# saved model stops growing between rounds.
refresh_params = dict(params, process_type='update', updater='refresh')

model_v2 = xgb.train(refresh_params, xg_train_2, xgb_model='t1.json')
model_v2.save_model('./t2.json')
print(model_v2.__dict__)

参考：

[1] https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.training
[2] https://github.com/dmlc/xgboost/issues/3055