尝试实现交叉验证的核外(out-of-core)学习模型时遇到问题?
Problems while trying to implement a cross validated out of core learning model?
我正在研究文本分类,在特征提取步骤之后我得到了漂亮的矩阵,因此我尝试使用增量学习如下:
import xgboost as xgb
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score as acc
def incremental_learning2(X, y):
    """Train an XGBoost model incrementally in two stages, then cross-validate it.

    The data is split 90/10 into train/test; the training part is split in
    half so that a second model can continue training from the first
    (incremental / out-of-core style learning via ``xgb_model=``).

    Parameters
    ----------
    X, y : numpy arrays of features and labels.

    Returns
    -------
    (y_pred, scores) : predictions of the incrementally-trained model on the
    held-out test set, and the list of per-fold CV RMSE scores.
    """
    # BUG FIX: StratifiedKFold and np were used below without being imported.
    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    # Split data into training and testing sets, then split the training
    # set in half for the two incremental training stages.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0)
    X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
        X_train, y_train, test_size=0.5, random_state=0)

    # xgb.train operates on DMatrix objects, not raw numpy arrays.
    xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
    xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
    xg_test = xgb.DMatrix(X_test, label=y_test)

    # params = {'objective': 'reg:linear', 'verbose': False}
    params = {}

    # Stage 1: train on the first half and persist the booster so stage 2
    # can continue from it.
    model_1 = xgb.train(params, xg_train_1, 30)
    model_1.save_model('model_1.model')

    # ================= train two versions of the model =====================#
    # v1 trains from scratch on the second half; v2 continues from model_1.
    model_2_v1 = xgb.train(params, xg_train_2, 30)
    model_2_v2 = xgb.train(params, xg_train_2, 30, xgb_model='model_1.model')

    # BUG FIX: Booster.predict expects a DMatrix, not the raw X_test array.
    y_pred = model_2_v2.predict(xg_test)

    # BUG FIX: a low-level xgboost Booster has no sklearn-style .fit()/.score()
    # methods, so the original loop crashed.  Instead, retrain per fold
    # (continuing from model_1) and score the held-out part with RMSE.
    # shuffle=True is required by modern sklearn whenever random_state is set.
    kfold = StratifiedKFold(n_splits=10, shuffle=True,
                            random_state=1).split(X_train, y_train)
    scores = []
    for k, (train, test) in enumerate(kfold):
        fold_train = xgb.DMatrix(X_train[train], label=y_train[train])
        fold_test = xgb.DMatrix(X_train[test], label=y_train[test])
        fold_model = xgb.train(params, fold_train, 30,
                               xgb_model='model_1.model')
        fold_pred = fold_model.predict(fold_test)
        score = np.sqrt(np.mean((fold_pred - y_train[test]) ** 2))
        scores.append(score)
        print('Fold: %s, Class dist.: %s, RMSE: %.3f'
              % (k + 1, np.bincount(y_train[train]), score))
    print('\nCV RMSE: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
    return y_pred, scores
关于上面的代码。我试图进行交叉验证并预测一些实例。但是,它不起作用。在非常大的数据集上拟合和更新 GBM 模型后,如何修复上述代码以获得交叉验证的指标和预测?
这是我想到的解决方案。首先我们导入必要的模块并定义一个简单的函数来计算均方根误差:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import StratifiedKFold
def rmse(a, b):
    """Return the root-mean-square error between arrays *a* and *b*."""
    diff = a - b
    return np.sqrt(np.mean(diff ** 2))
均方根误差的计算方法可能有所不同(查看 this thread 了解详情),但为了清楚起见,我选择了一个明确的公式。
这里是您的函数的快速版本。我试图使您的代码结构保持不变,但为了可读性,我进行了一些重构。
def incremental_learning2(X, y, n_splits=10, params=None):
    """Cross-validate three XGBoost models: one trained on half the fold's
    training data, one trained from scratch on the other half, and one that
    continues training from the first (incremental learning).

    Parameters
    ----------
    X, y : numpy arrays of features and labels.
    n_splits : number of stratified CV folds (default 10).
    params : optional dict of xgboost training parameters.

    Returns
    -------
    Three numpy arrays of per-fold RMSE scores: (model_1, model_2_v1, model_2_v2).
    """
    # BUG FIX: the original used a mutable default argument (params={}),
    # which is shared across calls; use a None sentinel instead.
    if params is None:
        params = {}

    # Initialize one score array per model
    sc_1, sc_2_v1, sc_2_v2 = (np.zeros(n_splits) for _ in range(3))

    # Create cross-validator.
    # BUG FIX: modern sklearn raises if random_state is set without shuffle=True.
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=0).split(X, y)

    # Iterate through folds
    for k, (train, test) in enumerate(kfold):
        # Split the fold's training data in half for the two training stages
        X_test, y_test = X[test], y[test]
        splits = tts(X[train], y[train], test_size=0.5, random_state=0)
        X_train_1, X_train_2, y_train_1, y_train_2 = splits

        # Create data matrices (xgb.train requires DMatrix)
        xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
        xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
        xg_test = xgb.DMatrix(X_test, label=y_test)

        # Fit models: model_1 is saved so model_2_v2 can continue from it
        model_1 = xgb.train(params, xg_train_1, 30)
        model_1.save_model('model_1.model')
        model_2_v1 = xgb.train(params, xg_train_2, 30)
        model_2_v2 = xgb.train(params, xg_train_2, 30, xgb_model='model_1.model')

        # Make predictions and compute scores
        preds = (m.predict(xg_test) for m in [model_1, model_2_v1, model_2_v2])
        sc_1[k], sc_2_v1[k], sc_2_v2[k] = (rmse(p, y_test) for p in preds)

    # Return scores
    return sc_1, sc_2_v1, sc_2_v2
我还改进了输出格式,以 table 的形式显示结果。此功能在单独的函数中实现:
def display_results(a, b, c):
    """Print a per-fold RMSE table for the three models, followed by a
    horizontal rule and the mean/std summary rows."""
    def hline():
        print('-' * 50)

    print('Cross-validation root mean square error\n')
    print('Fold\tmodel_v1\tmodel_2_v1\tmodel_2_v2')
    hline()
    row_fmt = '%s\t%.3f\t\t%.3f\t\t%.3f'
    fold = 1
    for ak, bk, ck in zip(a, b, c):
        print(row_fmt % (fold, ak, bk, ck))
        fold += 1
    hline()
    print('Avg' + '\t%.3f\t\t%.3f\t\t%.3f' % (np.mean(a), np.mean(b), np.mean(c)))
    print('Std' + '\t%.3f\t\t%.3f\t\t%.3f' % (np.std(a), np.std(b), np.std(c)))
演示
因为你没有分享你的数据集,我不得不生成模拟数据来测试我的代码。
# Demo: generate a synthetic classification dataset to stand in for the
# (unshared) real data — 500k samples spread over 50 blob centers — then run
# the cross-validated incremental learning and print the results table.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=500000, centers=50, random_state=0)
# NOTE(review): `scores2_v2` is inconsistently named vs `scores_2_v1`; it
# still works because the same name is reused on the next line.
scores_1, scores_2_v1, scores2_v2 = incremental_learning2(X, y)
display_results(scores_1, scores_2_v1, scores2_v2)
上面的代码 运行 没有错误,输出如下所示:
Cross-validation root mean square error
Fold model_v1 model_2_v1 model_2_v2
--------------------------------------------------
1 9.127 9.136 9.116
2 9.168 9.155 9.128
3 9.117 9.095 9.080
4 9.107 9.113 9.089
5 9.122 9.126 9.109
6 9.096 9.099 9.084
7 9.148 9.163 9.145
8 9.089 9.090 9.069
9 9.128 9.122 9.108
10 9.185 9.162 9.160
--------------------------------------------------
Avg 9.129 9.126 9.109
Std 0.029 0.026 0.028
备注
- 为了比较,我还对 model_1 做了交叉验证。
- 在示例运行中,model_1 和 model_2_v1 的准确度大致相同,而 model_2_v2 表现稍好,这符合合理预期。
- 我还调整了数据集的大小(n_samples)和类别数量(centers);有趣的是,当这两个参数的值减小时,model_2_v2 反而是三者中最不准确的。
- 希望使用不同的配置——即正确设置命名函数参数 params——能让一切按预期工作。
我正在研究文本分类,在特征提取步骤之后我得到了漂亮的矩阵,因此我尝试使用增量学习如下:
import xgboost as xgb
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score as acc
def incremental_learning2(X, y):
    """Train an XGBoost model incrementally in two stages, then cross-validate it.

    The data is split 90/10 into train/test; the training part is split in
    half so that a second model can continue training from the first
    (incremental / out-of-core style learning via ``xgb_model=``).

    Parameters
    ----------
    X, y : numpy arrays of features and labels.

    Returns
    -------
    (y_pred, scores) : predictions of the incrementally-trained model on the
    held-out test set, and the list of per-fold CV RMSE scores.
    """
    # BUG FIX: StratifiedKFold and np were used below without being imported.
    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    # Split data into training and testing sets, then split the training
    # set in half for the two incremental training stages.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0)
    X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
        X_train, y_train, test_size=0.5, random_state=0)

    # xgb.train operates on DMatrix objects, not raw numpy arrays.
    xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
    xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
    xg_test = xgb.DMatrix(X_test, label=y_test)

    # params = {'objective': 'reg:linear', 'verbose': False}
    params = {}

    # Stage 1: train on the first half and persist the booster so stage 2
    # can continue from it.
    model_1 = xgb.train(params, xg_train_1, 30)
    model_1.save_model('model_1.model')

    # ================= train two versions of the model =====================#
    # v1 trains from scratch on the second half; v2 continues from model_1.
    model_2_v1 = xgb.train(params, xg_train_2, 30)
    model_2_v2 = xgb.train(params, xg_train_2, 30, xgb_model='model_1.model')

    # BUG FIX: Booster.predict expects a DMatrix, not the raw X_test array.
    y_pred = model_2_v2.predict(xg_test)

    # BUG FIX: a low-level xgboost Booster has no sklearn-style .fit()/.score()
    # methods, so the original loop crashed.  Instead, retrain per fold
    # (continuing from model_1) and score the held-out part with RMSE.
    # shuffle=True is required by modern sklearn whenever random_state is set.
    kfold = StratifiedKFold(n_splits=10, shuffle=True,
                            random_state=1).split(X_train, y_train)
    scores = []
    for k, (train, test) in enumerate(kfold):
        fold_train = xgb.DMatrix(X_train[train], label=y_train[train])
        fold_test = xgb.DMatrix(X_train[test], label=y_train[test])
        fold_model = xgb.train(params, fold_train, 30,
                               xgb_model='model_1.model')
        fold_pred = fold_model.predict(fold_test)
        score = np.sqrt(np.mean((fold_pred - y_train[test]) ** 2))
        scores.append(score)
        print('Fold: %s, Class dist.: %s, RMSE: %.3f'
              % (k + 1, np.bincount(y_train[train]), score))
    print('\nCV RMSE: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
    return y_pred, scores
关于上面的代码。我试图进行交叉验证并预测一些实例。但是,它不起作用。在非常大的数据集上拟合和更新 GBM 模型后,如何修复上述代码以获得交叉验证的指标和预测?
这是我想到的解决方案。首先我们导入必要的模块并定义一个简单的函数来计算均方根误差:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import StratifiedKFold
def rmse(a, b):
    """Return the root-mean-square error between arrays *a* and *b*."""
    diff = a - b
    return np.sqrt(np.mean(diff ** 2))
均方根误差的计算方法可能有所不同(查看 this thread 了解详情),但为了清楚起见,我选择了一个明确的公式。
这里是您的函数的快速版本。我试图使您的代码结构保持不变,但为了可读性,我进行了一些重构。
def incremental_learning2(X, y, n_splits=10, params=None):
    """Cross-validate three XGBoost models: one trained on half the fold's
    training data, one trained from scratch on the other half, and one that
    continues training from the first (incremental learning).

    Parameters
    ----------
    X, y : numpy arrays of features and labels.
    n_splits : number of stratified CV folds (default 10).
    params : optional dict of xgboost training parameters.

    Returns
    -------
    Three numpy arrays of per-fold RMSE scores: (model_1, model_2_v1, model_2_v2).
    """
    # BUG FIX: the original used a mutable default argument (params={}),
    # which is shared across calls; use a None sentinel instead.
    if params is None:
        params = {}

    # Initialize one score array per model
    sc_1, sc_2_v1, sc_2_v2 = (np.zeros(n_splits) for _ in range(3))

    # Create cross-validator.
    # BUG FIX: modern sklearn raises if random_state is set without shuffle=True.
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=0).split(X, y)

    # Iterate through folds
    for k, (train, test) in enumerate(kfold):
        # Split the fold's training data in half for the two training stages
        X_test, y_test = X[test], y[test]
        splits = tts(X[train], y[train], test_size=0.5, random_state=0)
        X_train_1, X_train_2, y_train_1, y_train_2 = splits

        # Create data matrices (xgb.train requires DMatrix)
        xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
        xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
        xg_test = xgb.DMatrix(X_test, label=y_test)

        # Fit models: model_1 is saved so model_2_v2 can continue from it
        model_1 = xgb.train(params, xg_train_1, 30)
        model_1.save_model('model_1.model')
        model_2_v1 = xgb.train(params, xg_train_2, 30)
        model_2_v2 = xgb.train(params, xg_train_2, 30, xgb_model='model_1.model')

        # Make predictions and compute scores
        preds = (m.predict(xg_test) for m in [model_1, model_2_v1, model_2_v2])
        sc_1[k], sc_2_v1[k], sc_2_v2[k] = (rmse(p, y_test) for p in preds)

    # Return scores
    return sc_1, sc_2_v1, sc_2_v2
我还改进了输出格式,以 table 的形式显示结果。此功能在单独的函数中实现:
def display_results(a, b, c):
    """Print a per-fold RMSE table for the three models, followed by a
    horizontal rule and the mean/std summary rows."""
    def hline():
        print('-' * 50)

    print('Cross-validation root mean square error\n')
    print('Fold\tmodel_v1\tmodel_2_v1\tmodel_2_v2')
    hline()
    row_fmt = '%s\t%.3f\t\t%.3f\t\t%.3f'
    fold = 1
    for ak, bk, ck in zip(a, b, c):
        print(row_fmt % (fold, ak, bk, ck))
        fold += 1
    hline()
    print('Avg' + '\t%.3f\t\t%.3f\t\t%.3f' % (np.mean(a), np.mean(b), np.mean(c)))
    print('Std' + '\t%.3f\t\t%.3f\t\t%.3f' % (np.std(a), np.std(b), np.std(c)))
演示
因为你没有分享你的数据集,我不得不生成模拟数据来测试我的代码。
# Demo: generate a synthetic classification dataset to stand in for the
# (unshared) real data — 500k samples spread over 50 blob centers — then run
# the cross-validated incremental learning and print the results table.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=500000, centers=50, random_state=0)
# NOTE(review): `scores2_v2` is inconsistently named vs `scores_2_v1`; it
# still works because the same name is reused on the next line.
scores_1, scores_2_v1, scores2_v2 = incremental_learning2(X, y)
display_results(scores_1, scores_2_v1, scores2_v2)
上面的代码 运行 没有错误,输出如下所示:
Cross-validation root mean square error
Fold model_v1 model_2_v1 model_2_v2
--------------------------------------------------
1 9.127 9.136 9.116
2 9.168 9.155 9.128
3 9.117 9.095 9.080
4 9.107 9.113 9.089
5 9.122 9.126 9.109
6 9.096 9.099 9.084
7 9.148 9.163 9.145
8 9.089 9.090 9.069
9 9.128 9.122 9.108
10 9.185 9.162 9.160
--------------------------------------------------
Avg 9.129 9.126 9.109
Std 0.029 0.026 0.028
备注
- 为了比较,我还对 model_1 做了交叉验证。
- 在示例运行中,model_1 和 model_2_v1 的准确度大致相同,而 model_2_v2 表现稍好,这符合合理预期。
- 我还调整了数据集的大小(n_samples)和类别数量(centers);有趣的是,当这两个参数的值减小时,model_2_v2 反而是三者中最不准确的。
- 希望使用不同的配置——即正确设置命名函数参数 params——能让一切按预期工作。