NVidia推力任意变换与三维网格

NVidia thrust arbitrary transform with three-dimensional grid

我想使用 NVidia 推力在 GPU 上并行化以下嵌套 for 循环。

// complex multiplication
inline __host__ __device__  float2 operator* (const float2 a, const float2 b) {
    return make_float2(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
}

int main()
{
    const int M = 100, N = 100, K = 100;
    float2 A[M*N*K], E[M*N*K];

    float vec_M[M], vec_N[N], vec_K[K];

    for (int i_M = 0; i_M < M; i_M++)
    {
        for (int i_N = 0; i_N < N; i_N++)
        {
            for (int i_K = 0; i_K < K; i_K++)
            {
                unsigned int idx = i_M * (N * K) + i_N * K + i_K; // linear index
                float2 a = A[idx];
                float b = vec_M[i_M];
                float c = vec_N[i_N];
                float d = vec_K[i_K];
                float arg1 = (b + c) / d;
                float2 val1 = make_float2(cosf(arg1), sinf(arg1));
                E[idx].x = a * val1; // calls the custom multiplication operator
            }
        }
    }

    return 0;
}

我可以将其实现为

thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D.begin(), E.begin())),
                 thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D.end(),   E.end())),
                 custom_functor());

类似于thrust/examples/arbitrary_transformation.cu中给出的示例。

但是,为此,我必须创建矩阵 BCD,每个矩阵的大小为 M x N x K,即使三个向量sizeMNK足以表示相应的值。创建这些矩阵需要 (3 * M * N * K) / (M + N + K) 比仅使用向量更多的内存。

是否有类似 MATLAB 的 meshgrid() 函数允许使用三个向量创建 3D 网格而无需实际存储这些矩阵?在我看来,伪代码应该是这样的:

thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), meshgrid(vec_M.begin(), vec_N.begin(), vec_K.begin()), E.begin())),
                 thrust::make_zip_iterator(thrust::make_tuple(A.end(),   meshgrid(vec_M.end(),   vec_N.end(),   vec_K.end()),   E.end())),
                 custom_functor());

您可以简单地将嵌套循环折叠成一个循环,然后将 for_each 与计数迭代器一起使用。在函子中,您需要从单个循环变量中计算三个索引。

#include <iostream>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

struct Op{
    int N;
    int M;
    int K;

    Op(int n, int m, int k) : N(n), M(m), K(k){}

    void operator()(int x){
        const int k = x % K;
        const int m = (x / K) % M;
        const int n = (x / (K * M));
        std::cerr << "x = " << x << " (" << n << ", " << m << ", " << k << ")\n";
    }
};

int main(){
    const int N = 2;
    const int M = 3;
    const int K = 2;

    Op op{N,M,K};

    std::cerr << "with nested loops\n";
    for(int n = 0; n < N; n++){
        for(int m = 0; m < M; m++){
            for(int k = 0; k < K; k++){
                std::cerr << "(" << n << ", " << m << ", " << k << ")\n";        
            }
        }
    }
    std::cerr << "\n";

    std::cerr << "with single loop\n";
    for(int x = 0; x < N * M * K; x++){
        op(x);
    }
    std::cerr << "\n";

    std::cerr << "with thrust\n";
    auto xIterator = thrust::make_counting_iterator(0);
    thrust::for_each(thrust::seq, xIterator, xIterator + N*M*K, op);
}

输出

with nested loops
(0, 0, 0)
(0, 0, 1)
(0, 1, 0)
(0, 1, 1)
(0, 2, 0)
(0, 2, 1)
(1, 0, 0)
(1, 0, 1)
(1, 1, 0)
(1, 1, 1)
(1, 2, 0)
(1, 2, 1)

with single loop
x = 0 (0, 0, 0)
x = 1 (0, 0, 1)
x = 2 (0, 1, 0)
x = 3 (0, 1, 1)
x = 4 (0, 2, 0)
x = 5 (0, 2, 1)
x = 6 (1, 0, 0)
x = 7 (1, 0, 1)
x = 8 (1, 1, 0)
x = 9 (1, 1, 1)
x = 10 (1, 2, 0)
x = 11 (1, 2, 1)

with thrust
x = 0 (0, 0, 0)
x = 1 (0, 0, 1)
x = 2 (0, 1, 0)
x = 3 (0, 1, 1)
x = 4 (0, 2, 0)
x = 5 (0, 2, 1)
x = 6 (1, 0, 0)
x = 7 (1, 0, 1)
x = 8 (1, 1, 0)
x = 9 (1, 1, 1)
x = 10 (1, 2, 0)
x = 11 (1, 2, 1)

附带说明: CUDA 有一个函数 sincosf,它可能比用相同的参数同时调用 sincos 更有效。 https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g9456ff9df91a3874180d89a94b36fd46