is there a way to do "saypx" in cuBLAS?

cublasSaxpy computes y' = a * x + y, where x and y are vectors and a is a scalar.

It turns out I need to compute y' = a * y + x instead, and I don't see a way to twist the cuBLAS library into doing that.

(Of course, I could compute y' = a * y followed by y' = y' + x, but then y' is read too often. And I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code. I'm just surprised there is no apparent way to perform "saypx" directly.)
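
For concreteness, the two-pass version would look something like this sketch (handle, n, a, x_d, and y_d are assumed to already exist; each call makes a full pass over y, which is exactly the extra traffic I want to avoid):

float one = 1.0f;
cublasSscal(handle, n, &a, y_d, 1);           // first pass over y:  y = a * y
cublasSaxpy(handle, n, &one, x_d, 1, y_d, 1); // second pass over y: y = x + y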

[Added] Intel's version of cblas has something like "saxpby" that would do what I need. But oddly enough, it's not in cuBLAS.
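
For reference, a rough sketch of what the MKL extension call looks like (host-side cblas, not CUDA; cblas_saxpby computes y = a*x + b*y, so swapping the roles of the two scalars gives what I need):

#include <mkl.h>

// y' = 1 * x + a * y, i.e. "saypx", in a single pass over y
void saypx_mkl(int n, float a, const float *x, float *y)
{
    cblas_saxpby(n, 1.0f, x, 1, a, y, 1);
}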

[Added #2] It looks like I may be able to use the cudnnAddTensor function with some aliasing of descriptors (I have a FilterDescriptor pointing at the tensor, which AddTensor won't accept, but I should be able to alias a TensorDescriptor to the same memory and shape).
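
A rough sketch of that idea, assuming a cuDNN v4-or-later cudnnAddTensor, which computes C = alpha*A + beta*C; the flat (1, 1, 1, n) descriptor shape and the cudnn_handle, x_d, and y_d names are placeholders:

cudnnTensorDescriptor_t desc;
cudnnCreateTensorDescriptor(&desc);
cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 1, n);

// y' = 1 * x + a * y, with the same descriptor aliased over both buffers
const float alpha = 1.0f, beta = a;
cudnnAddTensor(cudnn_handle, &alpha, desc, x_d, &beta, desc, y_d);
cudnnDestroyTensorDescriptor(desc);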

I am not aware of a way to do what you are asking in CUBLAS, or in standard BLAS for that matter. What you have found in MKL is an extension added by Intel, but I don't recall seeing anything similar in other host or accelerator BLAS implementations.

The good news is that your assertion that "I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code" is untrue, at least for an operation as trivial as saxpy. Even a naïve implementation of saxpy will get very close to CUBLAS, because there really isn't much to it beyond reading two arrays, performing an FMAD, and writing the result back. As long as you get the memory coalescing correct, it is pretty simple to write performant code. For example:

#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>

#include "cublas_v2.h"

// selects which operation the templated kernel performs
typedef enum
{ 
    AXPY = 0,
    AXPBY = 1
} saxpy_op_t;

// y' = a * y + x  -- the "saypx" operation asked about in the question
__device__ __host__ __inline__ 
float axpby_op(float y, float x, float a)
{
    return a * y + x;
}

// y' = y + a * x  -- the standard BLAS saxpy operation
__device__ __host__ __inline__ 
float axpy_op(float y, float x, float a)
{
    return y + a * x;
}

// strided accessor so the kernel can honour BLAS-style vector increments
template<typename T>
class pitched_accessor
{
    T * p;
    size_t pitch;

    public:
    __host__ __device__
    pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {};

    __host__ __device__
    T& operator[](size_t idx) { return p[pitch*idx]; };

    __host__ __device__ 
    const T& operator[](size_t idx) const { return p[pitch*idx]; };
};


// grid-stride loop kernel; the switch on the template parameter is
// resolved at compile time, so each instantiation contains only one op
template<saxpy_op_t op>
__global__ 
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x, 
                  const float a, const unsigned int N1)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int stride = gridDim.x * blockDim.x;

    #pragma unroll 8
    for(; idx < N1; idx += stride) {
        switch (op) {
            case AXPY:
                y[idx] = axpy_op(y[idx], x[idx], a);
                break;
            case AXPBY:
                y[idx] = axpby_op(y[idx], x[idx], a);
                break;
        }
    }
}

__host__ void saxby(const unsigned int N, const float a, 
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    // let the runtime choose a launch configuration that maximises occupancy
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);
    saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc), 
                                                 pitched_accessor<float>(x, xinc), a, N);
}

__host__ void saxpy(const unsigned int N, const float a, 
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);
    saxpy_kernel<AXPY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc), 
                                                pitched_accessor<float>(x, xinc), a, N);
}

// verify that every element of the result matches the expected value
void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
    auto it = yhat.begin();
    for(; it != yhat.end(); ++it) {
        float err = std::fabs(*it - result);
        assert( err < tolerance ); 
    }
}

int main()
{

    const int N = 1<<22;

    std::vector<float> x_h(N);
    std::vector<float> y_h(N);

    const float a = 2.f, y0 = 1234.f, x0 = 532.f;
    std::fill(y_h.begin(), y_h.end(), y0);
    std::fill(x_h.begin(), x_h.end(), x0);

    float *x_d, *y_d;
    size_t sz = sizeof(float) * size_t(N);
    cudaMalloc((void **)&x_d, sz);
    cudaMalloc((void **)&y_d, sz);

    cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice);

    {
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxby(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpby_op(y0, x0, a));
    }

    {
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxpy(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpy_op(y0, x0, a));
    }

    {
        cublasHandle_t handle;
        cublasCreate(&handle);
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpy_op(y0, x0, a));
        cublasDestroy(handle);
    }

    return int(cudaDeviceReset());
}

This demonstrates that a very simple axpy kernel can easily be adapted to perform both the standard operation and the version you want, and runs within 10% of the runtime of CUBLAS on the compute 5.2 device I tested it on:

$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info    : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 17 registers, 360 bytes cmem[0]
ptxas info    : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info    : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 17 registers, 360 bytes cmem[0]

$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 54.06%  11.190ms         5  2.2381ms     960ns  2.9094ms  [CUDA memcpy HtoD]
 40.89%  8.4641ms         3  2.8214ms  2.8039ms  2.8310ms  [CUDA memcpy DtoH]
  1.73%  357.59us         1  357.59us  357.59us  357.59us  void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
  1.72%  355.15us         1  355.15us  355.15us  355.15us  void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
  1.60%  332.21us         1  332.21us  332.21us  332.21us  void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)