XGBoostError: [10:10:03] /workspace/src/tree/updater_gpu_hist.cu:1407: Exception in gpu_hist: NCCL failure

My code

import numpy as np
import os.path
import pandas
import time
import xgboost as xgb
import sys
if sys.version_info[0] >= 3:
    from urllib.request import urlretrieve
else:
    from urllib import urlretrieve

data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
dmatrix_train_filename = "higgs_train.dmatrix"
dmatrix_test_filename = "higgs_test.dmatrix"
csv_filename = "HIGGS.csv.gz"
train_rows = 10500000
test_rows = 500000
num_round = 1000

plot = True

# Load HIGGS as train/test xgboost DMatrix objects, using cached binary files when available
def load_higgs():
    if os.path.isfile(dmatrix_train_filename) and os.path.isfile(dmatrix_test_filename):           
        dtrain = xgb.DMatrix(dmatrix_train_filename)
        dtest = xgb.DMatrix(dmatrix_test_filename)
        if dtrain.num_row() == train_rows and dtest.num_row() == test_rows:
            print("Loading cached dmatrix...")
            return dtrain, dtest

    if not os.path.isfile(csv_filename):
        print("Downloading higgs file...")
        urlretrieve(data_url, csv_filename)

    df_higgs_train = pandas.read_csv(csv_filename, dtype=np.float32, 
                                     nrows=train_rows, header=None)
    dtrain = xgb.DMatrix(df_higgs_train.loc[:, 1:29], df_higgs_train[0])
    dtrain.save_binary(dmatrix_train_filename)
    df_higgs_test = pandas.read_csv(csv_filename, dtype=np.float32, 
                                    skiprows=train_rows, nrows=test_rows, 
                                    header=None)
    dtest = xgb.DMatrix(df_higgs_test.loc[:, 1:29], df_higgs_test[0])
    dtest.save_binary(dmatrix_test_filename)

    return dtrain, dtest


dtrain, dtest = load_higgs()
param = {}
param['objective'] = 'binary:logitraw'
param['eval_metric'] = 'error'
param['tree_method'] = 'gpu_hist'
param['silent'] = 1

print("Training with GPU ...")
tmp = time.time()
gpu_res = {}
xgb.train(param, dtrain, num_round, evals=[(dtest, "test")], 
          evals_result=gpu_res)
gpu_time = time.time() - tmp
print("GPU Training Time: %s seconds" % (str(gpu_time)))

print("Training with CPU ...")
param['tree_method'] = 'hist'
tmp = time.time()
cpu_res = {}
xgb.train(param, dtrain, num_round, evals=[(dtest, "test")], 
          evals_result=cpu_res)
cpu_time = time.time() - tmp
print("CPU Training Time: %s seconds" % (str(cpu_time)))

if plot:
    import matplotlib.pyplot as plt
    min_error = min(min(gpu_res["test"][param['eval_metric']]), 
                    min(cpu_res["test"][param['eval_metric']]))
    gpu_iteration_time = [x / (num_round * 1.0) * gpu_time for x in range(0, num_round)]
    cpu_iteration_time = [x / (num_round * 1.0) * cpu_time for x in range(0, num_round)]
    plt.plot(gpu_iteration_time, gpu_res['test'][param['eval_metric']], 
             label='Tesla P100')
    plt.plot(cpu_iteration_time, cpu_res['test'][param['eval_metric']], 
             label='2x Haswell E5-2698 v3 (32 cores)')
    plt.legend()
    plt.xlabel('Time (s)')
    plt.ylabel('Test error')
    plt.axhline(y=min_error, color='r', linestyle='dashed')
    plt.margins(x=0)
    plt.ylim((0.23, 0.35))
    plt.show()

Error

Training with GPU ...
---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input-4-edcaeafd94a7> in <module>()
     58 gpu_res = {}
     59 xgb.train(param, dtrain, num_round, evals=[(dtest, "test")], 
---> 60           evals_result=gpu_res)
     61 gpu_time = time.time() - tmp
     62 print("GPU Training Time: %s seconds" % (str(gpu_time)))

3 frames
/usr/local/lib/python3.6/dist-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)
    214                            evals=evals,
    215                            obj=obj, feval=feval,
--> 216                            xgb_model=xgb_model, callbacks=callbacks)
    217 
    218 

/usr/local/lib/python3.6/dist-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     72         # Skip the first update if it is a recovery step.
     73         if version % 2 == 0:
---> 74             bst.update(dtrain, i, obj)
     75             bst.save_rabit_checkpoint()
     76             version += 1

/usr/local/lib/python3.6/dist-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
   1107         if fobj is None:
   1108             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),
-> 1109                                                     dtrain.handle))
   1110         else:
   1111             pred = self.predict(dtrain)

/usr/local/lib/python3.6/dist-packages/xgboost/core.py in _check_call(ret)
    174     """
    175     if ret != 0:
--> 176         raise XGBoostError(py_str(_LIB.XGBGetLastError()))
    177 
    178 

XGBoostError: [10:10:03] /workspace/src/tree/updater_gpu_hist.cu:1407: Exception in gpu_hist: NCCL failure :unhandled cuda error /workspace/src/tree/../common/device_helpers.cuh(896)

Stack trace:
  [bt] (0) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x24) [0x7fb070340cb4]
  [bt] (1) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocator<xgboost::RegTree*> > const&)+0x1270) [0x7fb07057c7f0]
  [bt] (2) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> > > >*)+0xa81) [0x7fb0703c6791]
  [bt] (3) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::ObjFunction*)+0xd65) [0x7fb0703c7c95]
  [bt] (4) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x396) [0x7fb0703da556]
  [bt] (5) /usr/local/lib/python3.6/dist-packages/xgboost/./lib/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7fb07033daa5]
  [bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7fb09b874dae]
  [bt] (7) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7fb09b87471f]
  [bt] (8) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2b4) [0x7fb09ba885c4]

Solutions I tried

The problem was a library incompatibility. This Docker container fixed it for me:

https://github.com/Kaggle/docker-python/commit/a6ba32e0bb017a30e079cf8bccab613cd4243a5f
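
If it helps, here is a minimal sketch of running the script inside a GPU-enabled container. The image name gcr.io/kaggle-gpu-images/python and the nvidia-docker2 --runtime=nvidia flag are my assumptions, not something stated in the linked commit:

# Hypothetical invocation: mount the current directory and run the
# training script inside Kaggle's GPU image (image name is an assumption)
docker run --runtime=nvidia --rm -it \
    -v "$PWD":/work -w /work \
    gcr.io/kaggle-gpu-images/python \
    python xgboost_code.py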

For anyone else using a shared GPU cluster:

My problem was that I was not going through the cluster's resource scheduler (Slurm/sbatch). Instead of running python xgboost_code.py directly, I needed to schedule an sbatch job with a .sb file and submit it with sbatch run_job.sb, as sketched below. Good luck.
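
A minimal sketch of such a run_job.sb file, assuming a fairly standard Slurm setup; the job name, GPU request, and time limit are placeholders to adapt to your cluster:

#!/bin/bash
#SBATCH --job-name=xgb-gpu    # placeholder job name
#SBATCH --gres=gpu:1          # request one GPU from the scheduler
#SBATCH --time=02:00:00       # placeholder wall-clock limit

python xgboost_code.py

Submit it with sbatch run_job.sb; the scheduler then allocates the GPU before the script starts.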

I also ran into this error, but on my own dedicated GPU server. On investigation, I found that nvidia-smi also returned a Driver/library version mismatch error. The problem was caused by mixing an earlier runfile driver installation with the (current) package-manager installation (apt-get). It can be fixed by rebooting, or by manually reloading the driver as described for the version-mismatch error:

lsmod | grep nvidia
sudo rmmod nvidia_drm
sudo rmmod nvidia_modeset
sudo rmmod nvidia_uvm
sudo rmmod nvidia

To resolve the rmmod: ERROR: Module nvidia is in use error, list the processes holding the device:

sudo lsof /dev/nvidia*
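
If lsof reports processes still holding the device, stop them before retrying rmmod. A sketch, where <PID> is a placeholder for each PID from the lsof output:

sudo kill <PID>   # replace <PID> with each PID reported by lsof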

Verify that no driver modules are loaded:

lsmod | grep nvidia # should return nothing

Confirm that the error is resolved:

nvidia-smi