CUDA thrust::sort hits a memory problem even though I still have enough memory
I am using CUDA 10.2 on Ubuntu 18.04. My GPU is a Tesla T4 with 16 GB of memory, and no other program is currently running on the GPU.
The short code is as follows:
#include <iostream>
#include <algorithm>
#include <random>
#include <vector>
#include <numeric>
#include <chrono>
#include <cuda_runtime.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>

// Sorts one segment of length `stride`, starting at data + idx * stride.
struct sort_functor {
    thrust::device_ptr<float> data;
    int stride = 1;
    __host__ __device__
    void operator()(int idx) {
        thrust::sort(thrust::device,
                     data + idx * stride,
                     data + (idx + 1) * stride);
    }
};

int main() {
    std::random_device rd;
    std::mt19937 engine;
    engine.seed(rd());
    std::uniform_real_distribution<float> u(0, 90.);
    int M = 8;              // number of independent segments
    int N = 8 * 384 * 300;  // elements per segment
    std::vector<float> v(M * N);
    std::generate(v.begin(), v.end(), [&](){ return u(engine); });
    thrust::host_vector<float> hv(v.begin(), v.end());
    thrust::device_vector<float> dv = hv;
    thrust::device_vector<float> res(dv.begin(), dv.end());
    // One index per segment; each index triggers one in-kernel sort.
    thrust::device_vector<int> index(M);
    thrust::sequence(thrust::device, index.begin(), index.end(), 0, 1);
    thrust::for_each(thrust::device, index.begin(), index.end(),
                     sort_functor{res.data(), N});
    cudaDeviceSynchronize();
    return 0;
}
The error message is:
temporary_buffer::allocate: get_temporary_buffer failed
temporary_buffer::allocate: get_temporary_buffer failed
temporary_buffer::allocate: get_temporary_buffer failed
temporary_buffer::allocate: get_temporary_buffer failed
temporary_buffer::allocate: get_temporary_buffer failed
temporary_buffer::allocate: get_temporary_buffer failed
terminate called after throwing an instance of 'thrust::system::system_error'
what(): for_each: failed to synchronize: cudaErrorLaunchFailure: unspecified launch failure
Aborted (core dumped)
How can I solve this problem?
thrust::sort requires O(N) temporary memory. When you call it from device code (in your functor), that temporary allocation, one for each call, i.e. for each of your 8 calls, is made on the device using new or malloc under the hood, and it comes out of the "device heap" space. The device heap is limited to 8MB by default, but you can change it. You are hitting this limit.
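You can confirm the current limit on your own setup before changing anything. The following is a minimal sketch using the stock cudaDeviceGetLimit call; on a freshly initialized device it will typically print the 8MB default:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    size_t heapSize = 0;
    // Query the device-heap limit that in-kernel new/malloc draw from.
    cudaError_t err = cudaDeviceGetLimit(&heapSize, cudaLimitMallocHeapSize);
    if (err != cudaSuccess) {
        std::printf("cudaDeviceGetLimit failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    std::printf("device malloc heap size: %zu bytes\n", heapSize);
    return 0;
}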
If you add the following at the top of your main routine:
cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1048576ULL*1024);
your code runs for me without any runtime errors.
I am not suggesting that I carefully calculated the 1GB value above. I simply picked a value much larger than 8MB but much smaller than 16GB, and it seemed to work. In the general case, you may need to size the heap according to the temporary allocations your calls will actually need.
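If you would rather tie the limit to the problem size than pick a round number, something along the following lines is one way to do it. The helper name and the 2x safety factor are my own assumptions, not measured requirements; the point above only establishes that each sort needs on the order of N temporary elements:

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical helper: reserve device-heap space for M concurrent in-kernel
// sorts of N floats each. The 2x factor is an assumed safety margin, not a
// measured figure for thrust's actual temporary storage.
bool reserve_device_heap(int M, int N) {
    size_t needed = 2ULL * static_cast<size_t>(M) * static_cast<size_t>(N) * sizeof(float);
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, needed);
    if (err != cudaSuccess) {
        std::printf("cudaDeviceSetLimit failed: %s\n", cudaGetErrorString(err));
        return false;
    }
    return true;
}

Call it at the top of main, before the thrust::for_each launch: cudaLimitMallocHeapSize must be set before any kernel that uses in-kernel malloc or new has run, otherwise the runtime rejects the call.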