cublas 中是否有可以将 sigmoid 函数应用于向量的函数?

Is there a function in the cublas that can apply the sigmoid function with a vector?

正如标题所说,我想对向量中的每个元素逐一(element-wise)应用某个函数。我想知道 cublas 库中是否有任何函数可以做到这一点?

我不知道可以协助完成任务的合适 CUBLAS 函数。但是,您可以轻松编写自己的代码,将 sigmoid 函数或与此相关的任何其他单参数函数逐元素应用于向量。请注意,在大多数情况下,此类代码将受内存限制而不是计算限制。请参阅下面的 CUDA 程序以获取有效示例,特别是 sigmoid_kernel()。该程序的输出应如下所示:

source[0]= 0.0000000000000000e+000  source[99999]= 9.9999000000000005e-001
result[0]= 5.0000000000000000e-001  result[99999]= 7.3105661250612963e-001

.

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

// Number of double-precision elements in the example vector.
#define DEFAULT_LEN   100000

// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates 'call' exactly once. If the result is not cudaSuccess, prints the
// source file, line number and CUDA error string to stderr and terminates the
// process with EXIT_FAILURE.
// The argument is parenthesized so any expression can be passed, and the
// local variable has a macro-specific name so it cannot shadow a variable the
// argument itself refers to (with a local named 'err', CUDA_SAFE_CALL(err)
// would expand to the self-initialization 'cudaError_t err = err;').
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t cuda_safe_call_err_ = (call);                         \
    if (cudaSuccess != cuda_safe_call_err_) {                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__,                                  \
                 cudaGetErrorString(cuda_safe_call_err_) );           \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches.
// Place it directly after a kernel launch. It performs two checks and, on
// either failure, prints the source file, line number and CUDA error string to
// stderr and terminates the process with EXIT_FAILURE:
//   1. cudaGetLastError()      - errors reported at launch time.
//   2. cudaDeviceSynchronize() - blocks the host until the device is idle, so
//                                errors raised while the kernel ran show up.
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
// Because of the blocking synchronization this macro serializes host and
// device; it is meant for examples and debugging rather than hot paths.
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t check_launch_err_ = cudaGetLastError();               \
    if (cudaSuccess != check_launch_err_) {                           \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__,                                  \
                 cudaGetErrorString(check_launch_err_) );             \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    check_launch_err_ = cudaDeviceSynchronize();                      \
    if (cudaSuccess != check_launch_err_) {                           \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__,                                  \
                 cudaGetErrorString(check_launch_err_) );             \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Logistic sigmoid of a single double-precision value: 1 / (1 + e^(-a)).
// Device-only helper; callable from kernels and other device functions.
__device__ __forceinline__ double sigmoid (double a)
{
    const double decay = exp (-a);   // e^(-a)
    const double denom = 1.0 + decay;
    return 1.0 / denom;
}

// Applies sigmoid() element-wise: dst[i] = sigmoid(src[i]) for 0 <= i < len.
//
// Launch layout: one-dimensional; only the .x components of the grid and
// block dimensions are used. The kernel is a grid-stride loop: each thread
// starts at its global thread index and advances by the total number of
// launched threads, so every element is processed whatever the grid size.
// Uses no shared memory. src and dst are declared __restrict__, so the two
// ranges must not overlap. A len <= 0 results in no work.
//
// Indices are 64-bit signed. With int, the product gridDim.x * blockDim.x can
// exceed INT_MAX for large grids, and 'i += stride' can overflow when len is
// close to INT_MAX.
__global__ void sigmoid_kernel (const double * __restrict__ src, 
                                double * __restrict__ dst, int len)
{
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    for (long long i = first; i < len; i += stride) {
        dst[i] = sigmoid (src[i]);
    }
}

// Example driver: fills a host vector with i * 1e-5, copies it to the device,
// applies sigmoid_kernel to it, copies the result back and prints the first
// and last element of both the input and the output.
// Returns EXIT_SUCCESS on success and EXIT_FAILURE if a host allocation fails.
// A failing CUDA call terminates the process inside CUDA_SAFE_CALL /
// CHECK_LAUNCH_ERROR.
int main (void)
{
    double *source = 0, *result = 0;
    double *d_a = 0, *d_b = 0;

    int len = DEFAULT_LEN;
    /* Size in bytes of each of the four buffers (two host, two device) */
    size_t bytes = (size_t)len * sizeof (double);

    /* Allocate memory on host; on failure release whatever was obtained
       (free(NULL) is a no-op) instead of leaking the first buffer */
    source = (double *)malloc (bytes);
    result = (double *)malloc (bytes);
    if (!source || !result) {
        fprintf (stderr, "Host allocation of %d elements failed.\n", len);
        free (result);
        free (source);
        return EXIT_FAILURE;
    }

    /* create source data */
    for (int i = 0; i < len; i++) source [i] = i * 1e-5;

    /* spot check of source data */
    printf ("source[0]=% 23.16e  source[%d]=% 23.16e\n", 
            source[0], len-1, source[len-1]);

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, bytes));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, bytes));

    /* Push source data to device */
    CUDA_SAFE_CALL (cudaMemcpy (d_a, source, bytes, cudaMemcpyHostToDevice));

    /* Compute execution configuration: one thread per element, rounded up to
       whole blocks of 256 threads */
    dim3 dimBlock(256);
    int threadBlocks = (len + (dimBlock.x - 1)) / dimBlock.x;
    /* Cap the grid size; the loop inside sigmoid_kernel covers any elements
       beyond gridDim.x * blockDim.x.
       NOTE(review): 65520 presumably keeps the grid under the 65535 limit of
       older GPUs - confirm before changing. */
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);

    sigmoid_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len);
    CHECK_LAUNCH_ERROR();

    /* retrieve results from device */
    CUDA_SAFE_CALL (cudaMemcpy (result, d_b, bytes, cudaMemcpyDeviceToHost));

    /* spot check of  results */
    printf ("result[0]=% 23.16e  result[%d]=% 23.16e\n", 
            result[0], len-1, result[len-1]);

    /* free memory on host and device */
    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    free (result);
    free (source);

    return EXIT_SUCCESS;
}