命名空间 thrust::system::cuda::thrust 中无法解释的错误,特别是在 "system_error" 和 "cuda_category" 中
Unexplained errors in namespace thrust::system::cuda::thrust specifically in "system_error" and "cuda_category"
我正在尝试使用 thrust::raw_pointer_cast 转换原始指针,以便获取仿函数中的输出。我尝试了多种传递 float 指针的方法,但总是遇到内存冲突,以及两个智能感知错误:thrust::system::cuda::thrust 没有成员 "system_error",也没有成员 "cuda_category"。奇怪的是,错误似乎出在 throw_on_error.hpp 中,而该文件似乎属于 BULK 库,尽管我并没有专门引用 BULK。我是 C++ 新手,所以可能是我误解了指针,或者遗漏了某个 include。
以下是我一直试图让其正常工作的代码版本。任何帮助将不胜感激。
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <memory.h>
#include <cstdio>
#include <thread>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
using namespace std;
const int num_segs = 1; // number of segments to sort
const int num_vals = 5; // number of values in each segment
// Functor invoked with a segment index. For segment idx it sorts the
// num_vals-element slice of Ndata and of Ddata in place, then stores the
// thrust::reduce result for the Ddata slice through `answer`.
template <typename T>
struct sort_vector
{
// Base pointers of the two arrays; segment idx starts at offset idx*num_vals.
T *Ndata;
T *Ddata;
// Single output location: every call to operator() writes to this same address.
T *answer;
// NOTE(review): `a` is declared float* while the member it initialises is T*,
// so this constructor only matches when T is float.
sort_vector(T *_Ndata, T *_Ddata, float *a) : Ndata(_Ndata), Ddata(_Ddata), answer(a) {};
__host__ __device__ void operator()(int idx)
{
// Both sorts use thrust::seq; the reduce below uses thrust::device instead.
thrust::sort(thrust::seq, Ndata + idx*num_vals, Ndata + ((idx + 1)*num_vals));
thrust::sort(thrust::seq, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
// Dereferences `answer` wherever the functor executes, so the pointer has to
// be valid in that context.
*answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
}
};
// Driver for the failing reproduction: fills two device vectors, prints them,
// applies sort_vector to every segment index on stream s1, then prints the
// result pointer and the elapsed time.
int main() {
thrust::device_vector<float> d_Ndata(num_segs*num_vals);
d_Ndata[0] = 30;
d_Ndata[1] = 5.5;
d_Ndata[2] = 60;
d_Ndata[3] = 21;
d_Ndata[4] = 2;
thrust::device_vector<float> d_Ddata(num_segs*num_vals);
d_Ddata[0] = 50;
d_Ddata[1] = 9.5;
d_Ddata[2] = 30;
d_Ddata[3] = 8.1;
d_Ddata[4] = 1;
cout << "original norm" << endl;
int f = 0;
while (f < num_segs*num_vals){
cout << d_Ndata[f] << endl;
f++;
}
cout << "original dut" << endl;
int g = 0;
while (g < num_segs*num_vals){
cout << d_Ddata[g] << endl;
g++;
}
// One index per segment: 0, 1, ..., num_segs-1.
thrust::device_vector<int> d_idxs(num_segs);
thrust::sequence(d_idxs.begin(), d_idxs.end());
// NOTE(review): malloc returns host memory, yet this pointer is handed to a
// functor that is launched with thrust::cuda::par below and writes through it.
float *answer = (float*)malloc(sizeof(float));
cudaStream_t s1;
// NOTE(review): the return codes of the cudaStream* calls are not checked.
cudaStreamCreate(&s1);
clock_t start;
double duration;
start = clock();
thrust::for_each(thrust::cuda::par.on(s1),
d_idxs.begin(),
d_idxs.end(), sort_vector<float>(thrust::raw_pointer_cast(d_Ndata.data()), thrust::raw_pointer_cast(d_Ddata.data()), thrust::raw_pointer_cast(answer)));
cudaStreamSynchronize(s1);
cout << "sum" << endl;
// NOTE(review): `answer` is a float*, so this streams the pointer itself
// rather than the float it points to.
cout << answer << endl;
// NOTE(review): the malloc'd buffer is never released; the free call is commented out.
//free(answer);
cudaStreamDestroy(s1);
// Measured after the printing and stream teardown above, so those are part of
// the reported interval.
duration = (clock() - start) / (double)CLOCKS_PER_SEC;
cout << "time " << duration << endl;
// Blocks until a character is read from standard input before exiting.
cin.get();
return 0;
}
主要问题在这里:
float *answer = (float*)malloc(sizeof(float));
这是在创建一个主机内存分配。当您将该指针传递给仿函数时:
thrust::raw_pointer_cast(answer)
您正在将指向主机内存的指针传递给将在设备代码中运行的仿函数。如果仿函数试图访问该位置,这将是非法访问。在 CUDA 中,不允许设备代码直接访问主机指针位置,反之亦然(忽略此处未涉及的各种概念)。
因此,当您的仿函数代码执行此操作时:
*answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
尝试写入 *answer 时会触发非法访问。
一个直接的解决方案是让 answer 指向设备内存中正确分配的位置。以下代码演示了这一更改,在我这里运行没有错误:
$ cat t1190.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <memory.h>
#include <cstdio>
#include <thread>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
using namespace std;
const int num_segs = 1; // number of segments to sort
const int num_vals = 5; // number of values in each segment
// Functor applied to one segment index by thrust::for_each. For segment `idx`
// it sorts the num_vals-element slice of Ndata and of Ddata in place, then
// stores the sum of the Ddata slice through `answer`.
//
// All three pointers are dereferenced wherever operator() executes, so when
// the functor is run under a CUDA execution policy they must be device
// pointers.
template <typename T>
struct sort_vector
{
    T *Ndata;   // first array; segment idx starts at offset idx*num_vals
    T *Ddata;   // second array; sorted per segment and then summed
    T *answer;  // single output slot: every invocation writes this same address

    // `a` is typed T* to match the member it initialises. It used to be
    // float*, which only compiled for T = float.
    sort_vector(T *_Ndata, T *_Ddata, T *a) : Ndata(_Ndata), Ddata(_Ddata), answer(a) {}

    __host__ __device__ void operator()(int idx)
    {
        T *n_first = Ndata + idx * num_vals;
        T *d_first = Ddata + idx * num_vals;

        // thrust::seq runs each algorithm sequentially inside the calling
        // thread. The reduce uses it too, so the functor behaves the same
        // whether or not nested device-side launches are available.
        thrust::sort(thrust::seq, n_first, n_first + num_vals);
        thrust::sort(thrust::seq, d_first, d_first + num_vals);
        *answer = thrust::reduce(thrust::seq, d_first, d_first + num_vals);
    }
};
// Demo driver: uploads two small arrays to the device, prints their initial
// values, runs sort_vector once per segment on a dedicated CUDA stream, then
// prints the resulting sum and the time spent in the for_each call.
int main() {
    const int total = num_segs * num_vals;

    // Initial values are staged on the host so each array is uploaded with a
    // single copy instead of one transfer per element.
    const float norm_init[] = {30.0f, 5.5f, 60.0f, 21.0f, 2.0f};
    const float dut_init[]  = {50.0f, 9.5f, 30.0f, 8.1f, 1.0f};
    static_assert(static_cast<int>(sizeof(norm_init) / sizeof(norm_init[0])) == total,
                  "norm_init must supply num_segs*num_vals values");
    static_assert(static_cast<int>(sizeof(dut_init) / sizeof(dut_init[0])) == total,
                  "dut_init must supply num_segs*num_vals values");

    thrust::device_vector<float> d_Ndata(norm_init, norm_init + total);
    thrust::device_vector<float> d_Ddata(dut_init, dut_init + total);

    // Print from the host staging arrays; they hold exactly what was uploaded.
    cout << "original norm" << endl;
    for (int i = 0; i < total; ++i) {
        cout << norm_init[i] << endl;
    }
    cout << "original dut" << endl;
    for (int i = 0; i < total; ++i) {
        cout << dut_init[i] << endl;
    }

    // One index per segment: 0, 1, ..., num_segs-1.
    thrust::device_vector<int> d_idxs(num_segs);
    thrust::sequence(d_idxs.begin(), d_idxs.end());

    // The result slot lives in device memory so the functor can write to it.
    // A host pointer obtained from malloc is not accessible from device code.
    thrust::device_vector<float> dv_answer(1);

    // Reports a failed CUDA runtime call on stderr; returns true on success.
    auto cuda_ok = [](cudaError_t err, const char *what) -> bool {
        if (err == cudaSuccess) {
            return true;
        }
        cerr << what << " failed: " << cudaGetErrorString(err) << endl;
        return false;
    };

    cudaStream_t s1;
    if (!cuda_ok(cudaStreamCreate(&s1), "cudaStreamCreate")) {
        return EXIT_FAILURE;
    }

    const clock_t start = clock();
    thrust::for_each(thrust::cuda::par.on(s1),
                     d_idxs.begin(),
                     d_idxs.end(),
                     sort_vector<float>(thrust::raw_pointer_cast(d_Ndata.data()),
                                        thrust::raw_pointer_cast(d_Ddata.data()),
                                        thrust::raw_pointer_cast(dv_answer.data())));
    // Errors raised while the functor executes surface at this synchronisation.
    if (!cuda_ok(cudaStreamSynchronize(s1), "cudaStreamSynchronize")) {
        cudaStreamDestroy(s1);
        return EXIT_FAILURE;
    }
    // Stop the clock here so that printing and stream teardown are not timed.
    const double duration = (clock() - start) / (double)CLOCKS_PER_SEC;

    cout << "sum" << endl;
    cout << dv_answer[0] << endl;
    cout << "time " << duration << endl;

    if (!cuda_ok(cudaStreamDestroy(s1), "cudaStreamDestroy")) {
        return EXIT_FAILURE;
    }
    return 0;
}
$ nvcc -std=c++11 t1190.cu -o t1190
$ ./t1190
original norm
30
5.5
60
21
2
original dut
50
9.5
30
8.1
1
sum
98.6
time 0.000919
$
我不会尝试解释智能感知错误。 Intellisense 通常不能很好地与 CUDA 配合使用,正如您所见,intellisense 可能会标记出实际上可以正常编译的内容(例如这个问题中的这段代码)。如果 CUDA 代码编译正确,很有可能可以安全地忽略 Intellisense 报告的问题。
补充几点:
对于 Thrust 初学者来说,您似乎走上了一条不太寻常的路:在仿函数内部调用 Thrust 算法。您所做的在技术上没有任何问题,但这类代码通常只用于特定场合,而不是 Thrust 的一般用法。由于本例中 num_segs 为 1,所有这些工作都将由单个 CUDA 线程完成,性能肯定不高。如果您打算以后扩大规模,那没有问题。之前已经有过类似的评论,这里就不再赘述。
此仿函数只向单个位置(*answer)写入结果。如果将其扩展到多个线程,就必须提供多个写入位置(每个线程一个,也就是传递给 for_each 的向量中每个元素一个),否则各线程会相互覆盖结果。
我正在尝试使用 thrust::raw_pointer_cast 转换原始指针,以便获取仿函数中的输出。我尝试了多种传递 float 指针的方法,但总是遇到内存冲突,以及两个智能感知错误:thrust::system::cuda::thrust 没有成员 "system_error",也没有成员 "cuda_category"。奇怪的是,错误似乎出在 throw_on_error.hpp 中,而该文件似乎属于 BULK 库,尽管我并没有专门引用 BULK。我是 C++ 新手,所以可能是我误解了指针,或者遗漏了某个 include。
以下是我一直试图让其正常工作的代码版本。任何帮助将不胜感激。
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <memory.h>
#include <cstdio>
#include <thread>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
using namespace std;
const int num_segs = 1; // number of segments to sort
const int num_vals = 5; // number of values in each segment
// Functor invoked with a segment index. For segment idx it sorts the
// num_vals-element slice of Ndata and of Ddata in place, then stores the
// thrust::reduce result for the Ddata slice through `answer`.
template <typename T>
struct sort_vector
{
// Base pointers of the two arrays; segment idx starts at offset idx*num_vals.
T *Ndata;
T *Ddata;
// Single output location: every call to operator() writes to this same address.
T *answer;
// NOTE(review): `a` is declared float* while the member it initialises is T*,
// so this constructor only matches when T is float.
sort_vector(T *_Ndata, T *_Ddata, float *a) : Ndata(_Ndata), Ddata(_Ddata), answer(a) {};
__host__ __device__ void operator()(int idx)
{
// Both sorts use thrust::seq; the reduce below uses thrust::device instead.
thrust::sort(thrust::seq, Ndata + idx*num_vals, Ndata + ((idx + 1)*num_vals));
thrust::sort(thrust::seq, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
// Dereferences `answer` wherever the functor executes, so the pointer has to
// be valid in that context.
*answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
}
};
// Driver for the failing reproduction: fills two device vectors, prints them,
// applies sort_vector to every segment index on stream s1, then prints the
// result pointer and the elapsed time.
int main() {
thrust::device_vector<float> d_Ndata(num_segs*num_vals);
d_Ndata[0] = 30;
d_Ndata[1] = 5.5;
d_Ndata[2] = 60;
d_Ndata[3] = 21;
d_Ndata[4] = 2;
thrust::device_vector<float> d_Ddata(num_segs*num_vals);
d_Ddata[0] = 50;
d_Ddata[1] = 9.5;
d_Ddata[2] = 30;
d_Ddata[3] = 8.1;
d_Ddata[4] = 1;
cout << "original norm" << endl;
int f = 0;
while (f < num_segs*num_vals){
cout << d_Ndata[f] << endl;
f++;
}
cout << "original dut" << endl;
int g = 0;
while (g < num_segs*num_vals){
cout << d_Ddata[g] << endl;
g++;
}
// One index per segment: 0, 1, ..., num_segs-1.
thrust::device_vector<int> d_idxs(num_segs);
thrust::sequence(d_idxs.begin(), d_idxs.end());
// NOTE(review): malloc returns host memory, yet this pointer is handed to a
// functor that is launched with thrust::cuda::par below and writes through it.
float *answer = (float*)malloc(sizeof(float));
cudaStream_t s1;
// NOTE(review): the return codes of the cudaStream* calls are not checked.
cudaStreamCreate(&s1);
clock_t start;
double duration;
start = clock();
thrust::for_each(thrust::cuda::par.on(s1),
d_idxs.begin(),
d_idxs.end(), sort_vector<float>(thrust::raw_pointer_cast(d_Ndata.data()), thrust::raw_pointer_cast(d_Ddata.data()), thrust::raw_pointer_cast(answer)));
cudaStreamSynchronize(s1);
cout << "sum" << endl;
// NOTE(review): `answer` is a float*, so this streams the pointer itself
// rather than the float it points to.
cout << answer << endl;
// NOTE(review): the malloc'd buffer is never released; the free call is commented out.
//free(answer);
cudaStreamDestroy(s1);
// Measured after the printing and stream teardown above, so those are part of
// the reported interval.
duration = (clock() - start) / (double)CLOCKS_PER_SEC;
cout << "time " << duration << endl;
// Blocks until a character is read from standard input before exiting.
cin.get();
return 0;
}
主要问题在这里:
float *answer = (float*)malloc(sizeof(float));
这是在创建一个主机内存分配。当您将该指针传递给仿函数时:
thrust::raw_pointer_cast(answer)
您正在将指向主机内存的指针传递给将在设备代码中运行的仿函数。如果仿函数试图访问该位置,这将是非法访问。在 CUDA 中,不允许设备代码直接访问主机指针位置,反之亦然(忽略此处未涉及的各种概念)。
因此,当您的仿函数代码执行此操作时:
*answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
尝试写入 *answer 时会触发非法访问。
一个直接的解决方案是让 answer 指向设备内存中正确分配的位置。以下代码演示了这一更改,在我这里运行没有错误:
$ cat t1190.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <memory.h>
#include <cstdio>
#include <thread>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
using namespace std;
const int num_segs = 1; // number of segments to sort
const int num_vals = 5; // number of values in each segment
// Functor applied to one segment index by thrust::for_each. For segment `idx`
// it sorts the num_vals-element slice of Ndata and of Ddata in place, then
// stores the sum of the Ddata slice through `answer`.
//
// All three pointers are dereferenced wherever operator() executes, so when
// the functor is run under a CUDA execution policy they must be device
// pointers.
template <typename T>
struct sort_vector
{
    T *Ndata;   // first array; segment idx starts at offset idx*num_vals
    T *Ddata;   // second array; sorted per segment and then summed
    T *answer;  // single output slot: every invocation writes this same address

    // `a` is typed T* to match the member it initialises. It used to be
    // float*, which only compiled for T = float.
    sort_vector(T *_Ndata, T *_Ddata, T *a) : Ndata(_Ndata), Ddata(_Ddata), answer(a) {}

    __host__ __device__ void operator()(int idx)
    {
        T *n_first = Ndata + idx * num_vals;
        T *d_first = Ddata + idx * num_vals;

        // thrust::seq runs each algorithm sequentially inside the calling
        // thread. The reduce uses it too, so the functor behaves the same
        // whether or not nested device-side launches are available.
        thrust::sort(thrust::seq, n_first, n_first + num_vals);
        thrust::sort(thrust::seq, d_first, d_first + num_vals);
        *answer = thrust::reduce(thrust::seq, d_first, d_first + num_vals);
    }
};
// Demo driver: uploads two small arrays to the device, prints their initial
// values, runs sort_vector once per segment on a dedicated CUDA stream, then
// prints the resulting sum and the time spent in the for_each call.
int main() {
    const int total = num_segs * num_vals;

    // Initial values are staged on the host so each array is uploaded with a
    // single copy instead of one transfer per element.
    const float norm_init[] = {30.0f, 5.5f, 60.0f, 21.0f, 2.0f};
    const float dut_init[]  = {50.0f, 9.5f, 30.0f, 8.1f, 1.0f};
    static_assert(static_cast<int>(sizeof(norm_init) / sizeof(norm_init[0])) == total,
                  "norm_init must supply num_segs*num_vals values");
    static_assert(static_cast<int>(sizeof(dut_init) / sizeof(dut_init[0])) == total,
                  "dut_init must supply num_segs*num_vals values");

    thrust::device_vector<float> d_Ndata(norm_init, norm_init + total);
    thrust::device_vector<float> d_Ddata(dut_init, dut_init + total);

    // Print from the host staging arrays; they hold exactly what was uploaded.
    cout << "original norm" << endl;
    for (int i = 0; i < total; ++i) {
        cout << norm_init[i] << endl;
    }
    cout << "original dut" << endl;
    for (int i = 0; i < total; ++i) {
        cout << dut_init[i] << endl;
    }

    // One index per segment: 0, 1, ..., num_segs-1.
    thrust::device_vector<int> d_idxs(num_segs);
    thrust::sequence(d_idxs.begin(), d_idxs.end());

    // The result slot lives in device memory so the functor can write to it.
    // A host pointer obtained from malloc is not accessible from device code.
    thrust::device_vector<float> dv_answer(1);

    // Reports a failed CUDA runtime call on stderr; returns true on success.
    auto cuda_ok = [](cudaError_t err, const char *what) -> bool {
        if (err == cudaSuccess) {
            return true;
        }
        cerr << what << " failed: " << cudaGetErrorString(err) << endl;
        return false;
    };

    cudaStream_t s1;
    if (!cuda_ok(cudaStreamCreate(&s1), "cudaStreamCreate")) {
        return EXIT_FAILURE;
    }

    const clock_t start = clock();
    thrust::for_each(thrust::cuda::par.on(s1),
                     d_idxs.begin(),
                     d_idxs.end(),
                     sort_vector<float>(thrust::raw_pointer_cast(d_Ndata.data()),
                                        thrust::raw_pointer_cast(d_Ddata.data()),
                                        thrust::raw_pointer_cast(dv_answer.data())));
    // Errors raised while the functor executes surface at this synchronisation.
    if (!cuda_ok(cudaStreamSynchronize(s1), "cudaStreamSynchronize")) {
        cudaStreamDestroy(s1);
        return EXIT_FAILURE;
    }
    // Stop the clock here so that printing and stream teardown are not timed.
    const double duration = (clock() - start) / (double)CLOCKS_PER_SEC;

    cout << "sum" << endl;
    cout << dv_answer[0] << endl;
    cout << "time " << duration << endl;

    if (!cuda_ok(cudaStreamDestroy(s1), "cudaStreamDestroy")) {
        return EXIT_FAILURE;
    }
    return 0;
}
$ nvcc -std=c++11 t1190.cu -o t1190
$ ./t1190
original norm
30
5.5
60
21
2
original dut
50
9.5
30
8.1
1
sum
98.6
time 0.000919
$
我不会尝试解释智能感知错误。 Intellisense 通常不能很好地与 CUDA 配合使用,正如您所见,intellisense 可能会标记出实际上可以正常编译的内容(例如这个问题中的这段代码)。如果 CUDA 代码编译正确,很有可能可以安全地忽略 Intellisense 报告的问题。
补充几点:
对于 Thrust 初学者来说,您似乎走上了一条不太寻常的路:在仿函数内部调用 Thrust 算法。您所做的在技术上没有任何问题,但这类代码通常只用于特定场合,而不是 Thrust 的一般用法。由于本例中 num_segs 为 1,所有这些工作都将由单个 CUDA 线程完成,性能肯定不高。如果您打算以后扩大规模,那没有问题。之前已经有过类似的评论,这里就不再赘述。
此仿函数只向单个位置(*answer)写入结果。如果将其扩展到多个线程,就必须提供多个写入位置(每个线程一个,也就是传递给 for_each 的向量中每个元素一个),否则各线程会相互覆盖结果。