worker 0 上的内存分配错误:std::bad_alloc:CUDA 错误
Memory allocation error on worker 0: std::bad_alloc: CUDA error
描述
- 我只是想为模型提供训练集和测试集,但出现以下错误
- 第一个数据包 -
train_data = xgboost.DMatrix(data=X_train, label=y_train)
如果我只运行这一行(以及后面的训练等任何操作),单独这一行不会给出错误消息
- 第二个数据包 -
test_data = xgboost.DMatrix(data=X_test, label=y_test)
这一行在往后隔几个单元格的位置,这两行并不是一起执行的
环境
- 遵循指南 - https://github.com/rapidsai-community/notebooks-contrib/blob/branch-0.14/intermediate_notebooks/E2E/synthetic_3D/rapids_ml_workflow_demo.ipynb
conda create -n rapids-0.16 -c rapidsai -c nvidia -c conda-forge -c defaults rapids=0.16 python=3.7 cudatoolkit=10.2
- AWS EC2:深度学习 AMI (Ubuntu 18.04) 版本 36.0 - ami-063585f0e06d22308:MXNet-1.7.0、TensorFlow-2.3.1、2.1.0 和 1.15.3、PyTorch-1.4.0 和 1.7.0、Neuron 等。支持 NVIDIA CUDA、cuDNN、NCCL、Intel MKL-DNN、Docker、NVIDIA-Docker 和 EFA。如需完全托管的体验,请查看:https://aws.amazon.com/sagemaker
- AWS EC2 实例 - g4dn.4xlarge - 16GB VRAM,64GB RAM
旁注
- 错误信息中出现的显存(VRAM)大小既不是 30GB 也不是 15GB:
- 1 539 047 424 = 1.5 GB,
- 3 091 258 960 = 3 GB,
- 3 015 442 432 = 3GB,
- 3 091 258 960 = 3 GB。
- GPU 有 16 GB VRAM,所以我认为这不能回答问题。
错误
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-25-7bd66d4fabf4> in <module>
1 #train = xgboost.DMatrix(data=X, label=y) #ORIGINAL
----> 2 test_data = xgboost.DMatrix(data=X_test, label=y_test)
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/core.py in __init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, enable_categorical)
448 feature_names=feature_names,
449 feature_types=feature_types,
--> 450 enable_categorical=enable_categorical)
451 assert handle is not None
452 self.handle = handle
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/data.py in dispatch_data_backend(data, missing, threads, feature_names, feature_types, enable_categorical)
543 if _is_cudf_df(data):
544 return _from_cudf_df(data, missing, threads, feature_names,
--> 545 feature_types)
546 if _is_cudf_ser(data):
547 return _from_cudf_df(data, missing, threads, feature_names,
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/data.py in _from_cudf_df(data, missing, nthread, feature_names, feature_types)
400 ctypes.c_float(missing),
401 ctypes.c_int(nthread),
--> 402 ctypes.byref(handle)))
403 return handle, feature_names, feature_types
404
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/core.py in _check_call(ret)
184 """
185 if ret != 0:
--> 186 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
187
188
XGBoostError: [12:32:18] /opt/conda/envs/rapids/conda-bld/xgboost_1603491651651/work/src/c_api/../data/../common/device_helpers.cuh:400: Memory allocation error on worker 0: std::bad_alloc: CUDA error at: ../include/rmm/mr/device/cuda_memory_resource.hpp:68: cudaErrorMemoryAllocation out of memory
- Free memory: 1539047424
- Requested memory: 3091258960
Stack trace:
[bt] (0) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(+0x13674f) [0x7fad04f7274f]
[bt] (1) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(dh::detail::ThrowOOMError(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x3ad) [0x7fad05190b0d]
[bt] (2) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry>::allocate(unsigned long)+0x1df) [0x7fad051ac11f]
[bt] (3) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(thrust::detail::vector_base<xgboost::Entry, dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry> >::fill_insert(thrust::detail::normal_iterator<thrust::device_ptr<xgboost::Entry> >, unsigned long, xgboost::Entry const&)+0x26d) [0x7fad051d0d0d]
[bt] (4) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::HostDeviceVector<xgboost::Entry>::Resize(unsigned long, xgboost::Entry)+0xc9) [0x7fad051d1cc9]
[bt] (5) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::CudfAdapter>(xgboost::data::CudfAdapter*, float, int)+0x3df) [0x7fad052259cf]
[bt] (6) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::CudfAdapter>(xgboost::data::CudfAdapter*, float, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x133) [0x7fad051f3aa3]
[bt] (7) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(XGDMatrixCreateFromArrayInterfaceColumns+0xc6) [0x7fad0518c286]
[bt] (8) /home/ubuntu/anaconda3/envs/rapids/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fae60078630]
代码 2 - 如果我清除并重启笔记本,然后在同一个单元格中一起执行这两行。
train_data = xgboost.DMatrix(data=X_train, label=y_train)
test_data = xgboost.DMatrix(data=X_test, label=y_test)
错误 2
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-20-f0c3710678a8> in <module>
1 #train = xgboost.DMatrix(data=X, label=y) #ORIGINAL
2 train_data = xgboost.DMatrix(data=X_train, label=y_train)
----> 3 test_data = xgboost.DMatrix(data=X_test, label=y_test)
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/core.py in __init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, enable_categorical)
448 feature_names=feature_names,
449 feature_types=feature_types,
--> 450 enable_categorical=enable_categorical)
451 assert handle is not None
452 self.handle = handle
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/data.py in dispatch_data_backend(data, missing, threads, feature_names, feature_types, enable_categorical)
543 if _is_cudf_df(data):
544 return _from_cudf_df(data, missing, threads, feature_names,
--> 545 feature_types)
546 if _is_cudf_ser(data):
547 return _from_cudf_df(data, missing, threads, feature_names,
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/data.py in _from_cudf_df(data, missing, nthread, feature_names, feature_types)
400 ctypes.c_float(missing),
401 ctypes.c_int(nthread),
--> 402 ctypes.byref(handle)))
403 return handle, feature_names, feature_types
404
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/core.py in _check_call(ret)
184 """
185 if ret != 0:
--> 186 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
187
188
XGBoostError: [15:20:36] /opt/conda/envs/rapids/conda-bld/xgboost_1603491651651/work/src/c_api/../data/../common/device_helpers.cuh:400: Memory allocation error on worker 0: std::bad_alloc: CUDA error at: ../include/rmm/mr/device/cuda_memory_resource.hpp:68: cudaErrorMemoryAllocation out of memory
- Free memory: 3015442432
- Requested memory: 3091258960
Stack trace:
[bt] (0) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(+0x13674f) [0x7f7eea73674f]
[bt] (1) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(dh::detail::ThrowOOMError(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x3ad) [0x7f7eea954b0d]
[bt] (2) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry>::allocate(unsigned long)+0x1df) [0x7f7eea97011f]
[bt] (3) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(thrust::detail::vector_base<xgboost::Entry, dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry> >::fill_insert(thrust::detail::normal_iterator<thrust::device_ptr<xgboost::Entry> >, unsigned long, xgboost::Entry const&)+0x26d) [0x7f7eea994d0d]
[bt] (4) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::HostDeviceVector<xgboost::Entry>::Resize(unsigned long, xgboost::Entry)+0xc9) [0x7f7eea995cc9]
[bt] (5) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::CudfAdapter>(xgboost::data::CudfAdapter*, float, int)+0x3df) [0x7f7eea9e99cf]
[bt] (6) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::CudfAdapter>(xgboost::data::CudfAdapter*, float, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x133) [0x7f7eea9b7aa3]
[bt] (7) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(XGDMatrixCreateFromArrayInterfaceColumns+0xc6) [0x7f7eea950286]
[bt] (8) /home/ubuntu/anaconda3/envs/rapids/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f8044f8d630]
根据你这部分的错误,
XGBoostError: [12:32:18] /opt/conda/envs/rapids/conda-bld/xgboost_1603491651651/work/src/c_api/../data/../common/device_helpers.cuh:400: Memory allocation error on worker 0: std::bad_alloc: CUDA error at: ../include/rmm/mr/device/cuda_memory_resource.hpp:68: cudaErrorMemoryAllocation out of memory
- Free memory: 1539047424
- Requested memory: 3091258960
您的 GPU 显存对于这个特定的单 GPU 笔记本来说不够大。
最简单的解决方案是使用 p3 实例来获得 32GB 显存的 GPU(如果您想尝试 40GB 显存的 A100,也可以使用 p4dn 实例)。
如果您出于某种原因需要在 g4 实例上使用 T4,或者只是想多练习一下 dask-cudf,那就需要您多花一些功夫。您可以:
- 使用多 GPU 的 g4dn.12xlarge 实例,应用 dask-cudf 并搭建 dask 集群(而不是使用单 GPU 的 cudf),再配合 xgboost.dask 进行多 GPU 提升训练。这样它就可以在您的 16GB T4 上运行了。
- 在较小的单 GPU g4 实例上尝试同样的 dask-cudf 和 xgboost.dask 方案。
多 GPU 版本将是一项了不起的社区贡献。
如果不行,就使用 p3 实例。我已经为此提交了一个 issue,我们将在未来的 notebooks-contrib PR 中添加警告。感谢您让我们意识到这一点!
描述
- 我只是想为模型提供训练集和测试集,但出现以下错误
- 第一个数据包 -
train_data = xgboost.DMatrix(data=X_train, label=y_train)
如果我只运行这一行(以及后面的训练等任何操作),单独这一行不会给出错误消息
- 第二个数据包 -
test_data = xgboost.DMatrix(data=X_test, label=y_test)
这一行在往后隔几个单元格的位置,这两行并不是一起执行的
环境
- 遵循指南 - https://github.com/rapidsai-community/notebooks-contrib/blob/branch-0.14/intermediate_notebooks/E2E/synthetic_3D/rapids_ml_workflow_demo.ipynb
conda create -n rapids-0.16 -c rapidsai -c nvidia -c conda-forge -c defaults rapids=0.16 python=3.7 cudatoolkit=10.2
- AWS EC2:深度学习 AMI (Ubuntu 18.04) 版本 36.0 - ami-063585f0e06d22308:MXNet-1.7.0、TensorFlow-2.3.1、2.1.0 和 1.15.3、PyTorch-1.4.0 和 1.7.0、Neuron 等。支持 NVIDIA CUDA、cuDNN、NCCL、Intel MKL-DNN、Docker、NVIDIA-Docker 和 EFA。如需完全托管的体验,请查看:https://aws.amazon.com/sagemaker
- AWS EC2 实例 - g4dn.4xlarge - 16GB VRAM,64GB RAM
旁注
- 错误信息中出现的显存(VRAM)大小既不是 30GB 也不是 15GB:
- 1 539 047 424 = 1.5 GB,
- 3 091 258 960 = 3 GB,
- 3 015 442 432 = 3GB,
- 3 091 258 960 = 3 GB。
- GPU 有 16 GB VRAM,所以我认为这不能回答问题。
错误
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-25-7bd66d4fabf4> in <module>
1 #train = xgboost.DMatrix(data=X, label=y) #ORIGINAL
----> 2 test_data = xgboost.DMatrix(data=X_test, label=y_test)
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/core.py in __init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, enable_categorical)
448 feature_names=feature_names,
449 feature_types=feature_types,
--> 450 enable_categorical=enable_categorical)
451 assert handle is not None
452 self.handle = handle
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/data.py in dispatch_data_backend(data, missing, threads, feature_names, feature_types, enable_categorical)
543 if _is_cudf_df(data):
544 return _from_cudf_df(data, missing, threads, feature_names,
--> 545 feature_types)
546 if _is_cudf_ser(data):
547 return _from_cudf_df(data, missing, threads, feature_names,
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/data.py in _from_cudf_df(data, missing, nthread, feature_names, feature_types)
400 ctypes.c_float(missing),
401 ctypes.c_int(nthread),
--> 402 ctypes.byref(handle)))
403 return handle, feature_names, feature_types
404
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/core.py in _check_call(ret)
184 """
185 if ret != 0:
--> 186 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
187
188
XGBoostError: [12:32:18] /opt/conda/envs/rapids/conda-bld/xgboost_1603491651651/work/src/c_api/../data/../common/device_helpers.cuh:400: Memory allocation error on worker 0: std::bad_alloc: CUDA error at: ../include/rmm/mr/device/cuda_memory_resource.hpp:68: cudaErrorMemoryAllocation out of memory
- Free memory: 1539047424
- Requested memory: 3091258960
Stack trace:
[bt] (0) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(+0x13674f) [0x7fad04f7274f]
[bt] (1) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(dh::detail::ThrowOOMError(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x3ad) [0x7fad05190b0d]
[bt] (2) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry>::allocate(unsigned long)+0x1df) [0x7fad051ac11f]
[bt] (3) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(thrust::detail::vector_base<xgboost::Entry, dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry> >::fill_insert(thrust::detail::normal_iterator<thrust::device_ptr<xgboost::Entry> >, unsigned long, xgboost::Entry const&)+0x26d) [0x7fad051d0d0d]
[bt] (4) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::HostDeviceVector<xgboost::Entry>::Resize(unsigned long, xgboost::Entry)+0xc9) [0x7fad051d1cc9]
[bt] (5) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::CudfAdapter>(xgboost::data::CudfAdapter*, float, int)+0x3df) [0x7fad052259cf]
[bt] (6) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::CudfAdapter>(xgboost::data::CudfAdapter*, float, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x133) [0x7fad051f3aa3]
[bt] (7) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(XGDMatrixCreateFromArrayInterfaceColumns+0xc6) [0x7fad0518c286]
[bt] (8) /home/ubuntu/anaconda3/envs/rapids/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fae60078630]
代码 2 - 如果我清除并重启笔记本,然后在同一个单元格中一起执行这两行。
train_data = xgboost.DMatrix(data=X_train, label=y_train)
test_data = xgboost.DMatrix(data=X_test, label=y_test)
错误 2
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-20-f0c3710678a8> in <module>
1 #train = xgboost.DMatrix(data=X, label=y) #ORIGINAL
2 train_data = xgboost.DMatrix(data=X_train, label=y_train)
----> 3 test_data = xgboost.DMatrix(data=X_test, label=y_test)
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/core.py in __init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, enable_categorical)
448 feature_names=feature_names,
449 feature_types=feature_types,
--> 450 enable_categorical=enable_categorical)
451 assert handle is not None
452 self.handle = handle
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/data.py in dispatch_data_backend(data, missing, threads, feature_names, feature_types, enable_categorical)
543 if _is_cudf_df(data):
544 return _from_cudf_df(data, missing, threads, feature_names,
--> 545 feature_types)
546 if _is_cudf_ser(data):
547 return _from_cudf_df(data, missing, threads, feature_names,
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/data.py in _from_cudf_df(data, missing, nthread, feature_names, feature_types)
400 ctypes.c_float(missing),
401 ctypes.c_int(nthread),
--> 402 ctypes.byref(handle)))
403 return handle, feature_names, feature_types
404
~/anaconda3/envs/rapids/lib/python3.7/site-packages/xgboost/core.py in _check_call(ret)
184 """
185 if ret != 0:
--> 186 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
187
188
XGBoostError: [15:20:36] /opt/conda/envs/rapids/conda-bld/xgboost_1603491651651/work/src/c_api/../data/../common/device_helpers.cuh:400: Memory allocation error on worker 0: std::bad_alloc: CUDA error at: ../include/rmm/mr/device/cuda_memory_resource.hpp:68: cudaErrorMemoryAllocation out of memory
- Free memory: 3015442432
- Requested memory: 3091258960
Stack trace:
[bt] (0) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(+0x13674f) [0x7f7eea73674f]
[bt] (1) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(dh::detail::ThrowOOMError(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x3ad) [0x7f7eea954b0d]
[bt] (2) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry>::allocate(unsigned long)+0x1df) [0x7f7eea97011f]
[bt] (3) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(thrust::detail::vector_base<xgboost::Entry, dh::detail::XGBDefaultDeviceAllocatorImpl<xgboost::Entry> >::fill_insert(thrust::detail::normal_iterator<thrust::device_ptr<xgboost::Entry> >, unsigned long, xgboost::Entry const&)+0x26d) [0x7f7eea994d0d]
[bt] (4) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::HostDeviceVector<xgboost::Entry>::Resize(unsigned long, xgboost::Entry)+0xc9) [0x7f7eea995cc9]
[bt] (5) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::CudfAdapter>(xgboost::data::CudfAdapter*, float, int)+0x3df) [0x7f7eea9e99cf]
[bt] (6) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::CudfAdapter>(xgboost::data::CudfAdapter*, float, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x133) [0x7f7eea9b7aa3]
[bt] (7) /home/ubuntu/anaconda3/envs/rapids/lib/libxgboost.so(XGDMatrixCreateFromArrayInterfaceColumns+0xc6) [0x7f7eea950286]
[bt] (8) /home/ubuntu/anaconda3/envs/rapids/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f8044f8d630]
根据你这部分的错误,
XGBoostError: [12:32:18] /opt/conda/envs/rapids/conda-bld/xgboost_1603491651651/work/src/c_api/../data/../common/device_helpers.cuh:400: Memory allocation error on worker 0: std::bad_alloc: CUDA error at: ../include/rmm/mr/device/cuda_memory_resource.hpp:68: cudaErrorMemoryAllocation out of memory
- Free memory: 1539047424
- Requested memory: 3091258960
您的 GPU 显存对于这个特定的单 GPU 笔记本来说不够大。
最简单的解决方案是使用 p3 实例来获得 32GB 显存的 GPU(如果您想尝试 40GB 显存的 A100,也可以使用 p4dn 实例)。
如果您出于某种原因需要在 g4 实例上使用 T4,或者只是想多练习一下 dask-cudf,那就需要您多花一些功夫。您可以:
- 使用多 GPU 的 g4dn.12xlarge 实例,应用 dask-cudf 并搭建 dask 集群(而不是使用单 GPU 的 cudf),再配合 xgboost.dask 进行多 GPU 提升训练。这样它就可以在您的 16GB T4 上运行了。
- 在较小的单 GPU g4 实例上尝试同样的 dask-cudf 和 xgboost.dask 方案。
多 GPU 版本将是一项了不起的社区贡献。
如果不行,就使用 p3 实例。我已经为此提交了一个 issue,我们将在未来的 notebooks-contrib PR 中添加警告。感谢您让我们意识到这一点!