XGBoostError: [10:10:03] /workspace/src/tree/updater_gpu_hist.cu:1407: Exception in gpu_hist: NCCL failure
Project
- Nvidia Developer project
- In the Google Colab environment
My code
import csv
import numpy as np
import os.path
import pandas
import time
import xgboost as xgb
import sys

if sys.version_info[0] >= 3:
    from urllib.request import urlretrieve
else:
    from urllib import urlretrieve

data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
dmatrix_train_filename = "higgs_train.dmatrix"
dmatrix_test_filename = "higgs_test.dmatrix"
csv_filename = "HIGGS.csv.gz"
train_rows = 10500000
test_rows = 500000
num_round = 1000
plot = True

# return xgboost dmatrix
def load_higgs():
    if os.path.isfile(dmatrix_train_filename) and os.path.isfile(dmatrix_test_filename):
        dtrain = xgb.DMatrix(dmatrix_train_filename)
        dtest = xgb.DMatrix(dmatrix_test_filename)
        if dtrain.num_row() == train_rows and dtest.num_row() == test_rows:
            print("Loading cached dmatrix...")
            return dtrain, dtest
    if not os.path.isfile(csv_filename):
        print("Downloading higgs file...")
        urlretrieve(data_url, csv_filename)
    df_higgs_train = pandas.read_csv(csv_filename, dtype=np.float32,
                                     nrows=train_rows, header=None)
    dtrain = xgb.DMatrix(df_higgs_train.loc[:, 1:29], df_higgs_train[0])
    dtrain.save_binary(dmatrix_train_filename)
    df_higgs_test = pandas.read_csv(csv_filename, dtype=np.float32,
                                    skiprows=train_rows, nrows=test_rows,
                                    header=None)
    dtest = xgb.DMatrix(df_higgs_test.loc[:, 1:29], df_higgs_test[0])
    dtest.save_binary(dmatrix_test_filename)
    return dtrain, dtest

dtrain, dtest = load_higgs()

param = {}
param['objective'] = 'binary:logitraw'
param['eval_metric'] = 'error'
param['tree_method'] = 'gpu_hist'
param['silent'] = 1

print("Training with GPU ...")
tmp = time.time()
gpu_res = {}
xgb.train(param, dtrain, num_round, evals=[(dtest, "test")],
          evals_result=gpu_res)
gpu_time = time.time() - tmp
print("GPU Training Time: %s seconds" % (str(gpu_time)))

print("Training with CPU ...")
param['tree_method'] = 'hist'
tmp = time.time()
cpu_res = {}
xgb.train(param, dtrain, num_round, evals=[(dtest, "test")],
          evals_result=cpu_res)
cpu_time = time.time() - tmp
print("CPU Training Time: %s seconds" % (str(cpu_time)))

if plot:
    import matplotlib.pyplot as plt
    min_error = min(min(gpu_res["test"][param['eval_metric']]),
                    min(cpu_res["test"][param['eval_metric']]))
    gpu_iteration_time = [x / (num_round * 1.0) * gpu_time for x in range(0, num_round)]
    cpu_iteration_time = [x / (num_round * 1.0) * cpu_time for x in range(0, num_round)]
    plt.plot(gpu_iteration_time, gpu_res['test'][param['eval_metric']],
             label='Tesla P100')
    plt.plot(cpu_iteration_time, cpu_res['test'][param['eval_metric']],
             label='2x Haswell E5-2698 v3 (32 cores)')
    plt.legend()
    plt.xlabel('Time (s)')
    plt.ylabel('Test error')
    plt.axhline(y=min_error, color='r', linestyle='dashed')
    plt.margins(x=0)
    plt.ylim((0.23, 0.35))
    plt.show()
Error
Training with GPU ...
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-4-edcaeafd94a7> in <module>()
58 gpu_res = {}
59 xgb.train(param, dtrain, num_round, evals=[(dtest, "test")],
---> 60 evals_result=gpu_res)
61 gpu_time = time.time() - tmp
62 print("GPU Training Time: %s seconds" % (str(gpu_time)))
3 frames
/usr/local/lib/python3.6/dist-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)
214 evals=evals,
215 obj=obj, feval=feval,
--> 216 xgb_model=xgb_model, callbacks=callbacks)
217
218
/usr/local/lib/python3.6/dist-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
72 # Skip the first update if it is a recovery step.
73 if version % 2 == 0:
---> 74 bst.update(dtrain, i, obj)
75 bst.save_rabit_checkpoint()
76 version += 1
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
1107 if fobj is None:
1108 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),
-> 1109 dtrain.handle))
1110 else:
1111 pred = self.predict(dtrain)
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in _check_call(ret)
174 """
175 if ret != 0:
--> 176 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
177
178
XGBoostError: [10:10:03] /workspace/src/tree/updater_gpu_hist.cu:1407: Exception in gpu_hist: NCCL failure :unhandled cuda error /workspace/src/tree/../common/device_helpers.cuh(896)
Stack trace:
[bt] (0) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x24) [0x7fb070340cb4]
[bt] (1) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocator<xgboost::RegTree*> > const&)+0x1270) [0x7fb07057c7f0]
[bt] (2) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> > > >*)+0xa81) [0x7fb0703c6791]
[bt] (3) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::ObjFunction*)+0xd65) [0x7fb0703c7c95]
[bt] (4) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x396) [0x7fb0703da556]
[bt] (5) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7fb07033daa5]
[bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7fb09b874dae]
[bt] (7) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7fb09b87471f]
[bt] (8) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2b4) [0x7fb09ba885c4]
Solutions I've tried
- I have the correct installation - XGBoost with GPU support on Google Colab (see the sanity-check sketch after this list)
- I have already corrected -
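A quick way to sanity-check the Colab runtime before training (a minimal sketch, not part of the original post; in a Colab cell, prefix each command with !):
# A "Driver/library version mismatch" from nvidia-smi means the runtime itself is broken, not XGBoost
nvidia-smi
# Confirm which XGBoost build is actually installed in the runtime
python -c "import xgboost; print(xgboost.__version__)"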
The problem was a library incompatibility. This Docker container solved it for me:
https://github.com/Kaggle/docker-python/commit/a6ba32e0bb017a30e079cf8bccab613cd4243a5f
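For reference, a rough sketch of running the training script inside a GPU-enabled container (not from the original answer; the image and script names are placeholders, and --gpus all assumes Docker 19.03+ with the NVIDIA container toolkit on the host):
# Mount the current directory and run the script inside a GPU-enabled image
docker run --gpus all --rm -v "$PWD":/work -w /work <gpu-enabled-xgboost-image> python xgboost_code.py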
For anyone else using a shared GPU cluster:
My problem was not going through the cluster's resource scheduler (slurm/sbatch).
Instead of running python xgboost_code.py directly, I needed to schedule an sbatch job with a .sb file and submit it with sbatch run_job.sb. Good luck.
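A minimal sketch of such a run_job.sb file (not from the original answer; the GPU count, time limit, and output path are placeholders that depend on the cluster's configuration):
#!/bin/bash
#SBATCH --job-name=xgb-gpu
#SBATCH --gres=gpu:1             # request one GPU from the scheduler
#SBATCH --time=01:00:00          # wall-clock limit for the job
#SBATCH --output=xgb-gpu.%j.out  # per-job log file

python xgboost_code.py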
I also hit this error, but on my own dedicated GPU server. On investigation I found that nvidia-smi was also returning a Driver/library version mismatch error. The problem was caused by mixing an earlier runfile installation with the (current) package-manager installation (apt-get). It can be fixed by rebooting, or by manually reloading the drivers as described for the version mismatch error:
lsmod | grep nvidia
sudo rmmod nvidia_drm
sudo rmmod nvidia_modeset
sudo rmmod nvidia_uvm
sudo rmmod nvidia
If you hit the rmmod: ERROR: Module nvidia is in use error, identify the processes still holding the devices (and stop them):
sudo lsof /dev/nvidia*
Verify that no NVIDIA driver modules are loaded:
lsmod | grep nvidia # should return nothing
Confirm the error is resolved:
nvidia-smi