将 cuBLAS 与来自 Thrust 的复数结合使用
Using cuBLAS with complex numbers from Thrust
在我的代码中,我使用推力库中的复数数组,我想使用 cublasZgeam() 来转置数组。
使用 cuComplex.h 中的复数不是一个更好的选择,因为我在数组上做了很多算术运算,而 cuComplex 没有定义运算符,例如 * +=.
这就是我定义要转置的数组的方式
thrust::complex<float> u[xmax][xmax];
我找到了这个 https://github.com/jtravs/cuda_complex,但是这样使用它:
#include "cuComplex.hpp"
不允许我在使用 nvcc 编译时使用提到的运算符
error: no operator "+=" matches these operands
operand types are: cuComplex += cuComplex
有解决办法吗?来自 github 的代码是旧的,可能存在问题或者我使用错误
编辑:这是有效的代码,与 talonmies 代码的唯一区别是添加了简单的内核和指向相同数据的指针,但是 thrust::complex
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>
using namespace std;
__global__ void test(thrust::complex<double>* u) {
u[0] += thrust::complex<double>(3.3,3.3);
}
int main()
{
int xmax = 100;
thrust::complex<double> u[xmax][xmax];
double arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
u[49][51] += thrust::complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
cout << u[0][0] << endl;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>* >(d_v);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
test<<<1,1>>>(d_vTest);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[0][0] << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
尽管您提出相反的抗议,但 C++ 标准库 complex
(或 thrust::complex
)肯定可以与 CUBLAS 一起使用。 cuComplex
和 cuDoubleComplex
设计为与标准主机复杂类型二进制兼容,以便在传递给使用设备上复杂数据的 CUBLAS 函数时不会转换数据。
对您在评论中发布的代码进行简单修改,就可以像您想象的那样工作:
#include <algorithm>
#include <iostream>
#include <complex>
#include <cublas_v2.h>
using namespace std;
int main()
{
int xmax = 100;
complex<double> u[xmax][xmax];
size_t arrSize = sizeof(complex<double>) * xmax * xmax;
fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0));
u[49][51] += complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
complex<double> alpha(1.0, 0.0);
complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
已构建并 运行 像这样:
~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)
唯一需要的修改是将 std::complex<double>
类型显式转换为 cuDoubleComplex
。这样做,一切都会按预期进行。
使用推力,代码看起来几乎一模一样:
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>
using namespace std;
int main()
{
int xmax = 100;
thrust::complex<double> u[xmax][xmax];
size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
u[49][51] += thrust::complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
也许更接近您的用例,使用推力设备容器和内核在 CUBLAS 调用之前执行一些初始化:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/copy.h>
#include <cublas_v2.h>
__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0);
u[49 + 51*xmax] *= 2.0;
}
int main()
{
int xmax = 100;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));
setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);
cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, _d_u, xmax,
_beta, _d_u, xmax,
_d_v, xmax);
thrust::complex<double> u[xmax][xmax];
thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
std::cout << "Before:" << std::endl;
std::cout << u[49][51] << std::endl;
std::cout << u[51][49] << std::endl;
thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
std::cout << "After:" << std::endl;
std::cout << u[49][51] << std::endl;
std::cout << u[51][49] << std::endl;
return 0;
}
在我的代码中,我使用推力库中的复数数组,我想使用 cublasZgeam() 来转置数组。
使用 cuComplex.h 中的复数不是一个更好的选择,因为我在数组上做了很多算术运算,而 cuComplex 没有定义运算符,例如 * +=.
这就是我定义要转置的数组的方式
thrust::complex<float> u[xmax][xmax];
我找到了这个 https://github.com/jtravs/cuda_complex,但是这样使用它:
#include "cuComplex.hpp"
不允许我在使用 nvcc 编译时使用提到的运算符
error: no operator "+=" matches these operands
operand types are: cuComplex += cuComplex
有解决办法吗?来自 github 的代码是旧的,可能存在问题或者我使用错误
编辑:这是有效的代码,与 talonmies 代码的唯一区别是添加了简单的内核和指向相同数据的指针,但是 thrust::complex
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>
using namespace std;
__global__ void test(thrust::complex<double>* u) {
u[0] += thrust::complex<double>(3.3,3.3);
}
int main()
{
int xmax = 100;
thrust::complex<double> u[xmax][xmax];
double arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
u[49][51] += thrust::complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
cout << u[0][0] << endl;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>* >(d_v);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
test<<<1,1>>>(d_vTest);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[0][0] << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
尽管您提出相反的抗议,但 C++ 标准库 complex
(或 thrust::complex
)肯定可以与 CUBLAS 一起使用。 cuComplex
和 cuDoubleComplex
设计为与标准主机复杂类型二进制兼容,以便在传递给使用设备上复杂数据的 CUBLAS 函数时不会转换数据。
对您在评论中发布的代码进行简单修改,就可以像您想象的那样工作:
#include <algorithm>
#include <iostream>
#include <complex>
#include <cublas_v2.h>
using namespace std;
int main()
{
int xmax = 100;
complex<double> u[xmax][xmax];
size_t arrSize = sizeof(complex<double>) * xmax * xmax;
fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0));
u[49][51] += complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
complex<double> alpha(1.0, 0.0);
complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
已构建并 运行 像这样:
~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)
唯一需要的修改是将 std::complex<double>
类型显式转换为 cuDoubleComplex
。这样做,一切都会按预期进行。
使用推力,代码看起来几乎一模一样:
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>
using namespace std;
int main()
{
int xmax = 100;
thrust::complex<double> u[xmax][xmax];
size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
u[49][51] += thrust::complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
也许更接近您的用例,使用推力设备容器和内核在 CUBLAS 调用之前执行一些初始化:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/copy.h>
#include <cublas_v2.h>
__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0);
u[49 + 51*xmax] *= 2.0;
}
int main()
{
int xmax = 100;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));
setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);
cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, _d_u, xmax,
_beta, _d_u, xmax,
_d_v, xmax);
thrust::complex<double> u[xmax][xmax];
thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
std::cout << "Before:" << std::endl;
std::cout << u[49][51] << std::endl;
std::cout << u[51][49] << std::endl;
thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
std::cout << "After:" << std::endl;
std::cout << u[49][51] << std::endl;
std::cout << u[51][49] << std::endl;
return 0;
}