使用 cublas sgemv 时如何跳过 float4 中的第四个元素？

Question

我正在处理的部分代码需要尽可能快地执行矩阵向量乘法，即使用优化的第三方库，如 cublas（尽管相同的原则适用于任何 cpu blas）。

问题是向量中的元素之间有一种跨度，如下所示：

矩阵存储为 3Nx3N 一维浮点数组。

向量存储为 N 个一维 float4 数组，但只使用每个 float4 的前三个元素，应忽略第四个元素。

N是百万级。

如果向量存储为 float3 而不是 float4，我可以将指针转换为 float，就像在这个工作示例中一样：

//Compile with nvcc test.cu -O3 -lcublas -o test

/*
Multiply a 3Nx3N float matrix, M,  by a vector, X, of N float3 elements 

The result, Y, is a 3N float vector
-----------------------

What if X is a vector of N float4?

How can I tell cublas to skip the forth element?

*/

#include<iostream>
#include<thrust/device_vector.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>

using namespace std;

int main(){

  int N = 3;

  thrust::device_vector<float3> X(N);

  thrust::device_vector<float> Y(3*N);

  for(int i=0; i<N; i++) 
     X[i] = make_float3(1,1,1); //make_float4(1,1,1,0); //in the case of float4 i.e., The result should be the same 

  thrust::device_vector<float> M(3*N*3*N, 1);


  cublasHandle_t handle;

  cublasCreate(&handle);

  float beta = 0.0f;
  float alpha = 1.0f;
  cublasSgemv(handle, CUBLAS_OP_T,
          3*N, 3*N,
          &alpha,
          thrust::raw_pointer_cast(&M[0]), 3*N,
          (float*) thrust::raw_pointer_cast(&X[0]), 1,
          &beta,
          thrust::raw_pointer_cast(&Y[0]), 1);

  cout<<"Performed Y = M·X\n\tX = ";
  for(int i=0; i<N; i++){
    float3 Xi = X[i];
    cout<<Xi.x<<" "<<Xi.y<<" "<<Xi.z<<" ";
  }  
  cout<<"\n\tY = ";
  for(int i=0; i<3*N; i++){
    cout<<Y[i]<<" ";
  }
  cout<<endl;

  return 0;
}

但是，如果 X 向量存储为 float4，我该如何执行此操作？

考虑到 float4* 可以解释为具有 4 倍以上元素的 float*，问题可能更笼统（尽管我只对 float4 的情况感兴趣）；如果每 3 个 "useful" 元素之间有一个步幅。我想对 cublas 说，数组在内存中没有合并。但是像这样：开头有 3 个元素，接下来的三个是 "stride" 之后的元素，等等。类似于您可以在 OpenGL 中使用顶点数组对象执行的操作。

编辑：

答案表明，最可行的方法是将跨步数组复制到 cublas 理解的时间、转换的 float3 数组中。

目前有两个选择：

1. Use cudaMemcpy2D
2. Use a thrust transformation
3. Use a custom copy kernel

我写这段代码来测试三种情况：

//Compile with Compile with: nvcc test.cu -O3 -lcublas -o test
#include<iostream>
#include<thrust/device_vector.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>

using namespace std;


struct Timer{
  cudaEvent_t start, stop;
  float time;

  void tic(){
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
  }
  float toc(){
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return time;
  }

};



struct copy_functor{
  copy_functor(){}
  __device__ float3 operator() (const float4& X4){
    return make_float3(X4.x, X4.y, X4.z);
  }
};


__global__ void copy_kernel(const float4* __restrict__ X4, float3* __restrict__ X3, int N){
  int id = blockIdx.x*blockDim.x + threadIdx.x;
  if(id < N){
    float4 x4 = X4[id];
    X3[id] = make_float3(x4.x, x4.y, x4.z);
  }
}

int main(){

  int N = 1000000;
  int Ntest = 1000;

  Timer t;

  thrust::device_vector<float3> X3(N, make_float3(0,0,0));
  thrust::device_vector<float4> X4(N, make_float4(1,1,1,10));


  /*************************CUDAMEMCPY2D*******************/
  t.tic();

  for(int i= 0; i<Ntest; i++){
    cudaMemcpy2DAsync(thrust::raw_pointer_cast(&X3[0]),
              3*sizeof(float),
              thrust::raw_pointer_cast(&X4[0]),
              4*sizeof(float),
              3*sizeof(float),
              N,
              cudaMemcpyDeviceToDevice);
     cudaDeviceSynchronize();
   }
   printf ("Time for cudaMemcpy2DAsync: %f ms\n", t.toc()/(float)Ntest);


   /************************THRUST***********************/
   t.tic();

   for(int i= 0; i<Ntest; i++){
     transform(X4.begin(), X4.end(), X3.begin(), copy_functor());  
     cudaDeviceSynchronize();
   }

   printf ("Time for thrust transformation: %f ms\n", t.toc()/(float)Ntest);

   /*********************COPY KERNEL*****************************/

   t.tic();
   for(int i= 0; i<Ntest; i++){
     copy_kernel<<< N/128 + 1, 128 >>>(thrust::raw_pointer_cast(&X4[0]),
                       thrust::raw_pointer_cast(&X3[0]), N);
     cudaDeviceSynchronize();
   }
   printf ("Time for copy kernel: %f ms\n", t.toc()/(float)Ntest);


return 0;
}

请注意，我计算的是 1000 份的平均值。

此代码在 GTX 980 中的输出如下：

Time for cudaMemcpy2DAsync: 1.465522 ms
Time for thrust transformation: 0.178745 ms
Time for copy kernel: 0.168507 ms

cudaMemcpy2D 比其他的慢一个数量级。

thrust和copy kernel很相似也是最快的方式

这种行为似乎对任意数量的元素都存在。

编辑2：

其他答案表明 GEMM 可用于传达步幅。 无需时间数组。

解释矩阵向量 mul。作为 Matrix Matrix mul。会这样做：

 cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T,
              3*N, 1 /*m*/, 3*N,
              &alpha,
              thrust::raw_pointer_cast(&M[0]), 3*N,
              (float*) thrust::raw_pointer_cast(&X3[0]), 1 /*ldb*/,
              &beta,
              thrust::raw_pointer_cast(&Y[0]), 3*N);

但是，此时我不知道如何传递 X4 而不是 X3。解决方案似乎在 m 和 ldb 参数中。

Answer 1

您可以将一维 float4 向量视为行步幅为 4 的 Nx3 二维浮点矩阵，并使用 cudaMemcpy2DAsync 将步幅从 4 更改为 3

cudaMemcpy2DAsync(dst,
                  3*sizeof(float),
                  src,
                  4*sizeof(float),
                  3*sizeof(float),
                  N,
                  cudaMemcpyDeviceToDevice);

那么dst可以当成3N个一维浮点向量直接传给gemv()

鉴于您 N 的规模，与 gemv() 相比，复制时间并不明显。

编辑

@Apo 的基准测试结果表明，使用复制内核比 cudaMemcpy2DAsync 更好。我对 cudaMemcpy2DAsync 的期望值过高，认为它会得到很好的优化并且在所有情况下都有最佳性能。

使用 cublas sgemv 时如何跳过 float4 中的第四个元素？

How can I skip the fourth element in a float4 when using cublas sgemv?

c++

arrays

cuda

thrust

cublas