在 GPU 上使用两个以上列表生成笛卡尔积
Generate Cartesian Product using more than two lists on GPU
我想知道如何使用 CUDA 生成两个以上列表的笛卡尔积。
如何使此代码适用于三个或更多列表?
它适用于两个列表,但不适用于三个列表,我尝试了 /, % 但没有成功。
这是基本的。
#include <thrust/device_vector.h>
#include <thrust/pair.h>
#include <thrust/copy.h>
#include <iterator>
__global__ void cartesian_product(const int *a, size_t a_size,
const int *b, size_t b_size,
const int *c, size_t c_size)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < a_size * b_size * c_size)
{
unsigned int a_idx = idx / a_size;
unsigned int b_idx = idx % a_size;
// ?
unsigned int c_idx = idx % a_size;
printf("a[a_idx] and b[b_idx] and c[c_idx] are: %d %d %d\n\n",a[a_idx], b[b_idx], c[c_idx]);
//1 3 5 , 1 3 6 , 1 4 5 , 1 4 6 , 2 3 5 , 2 3 6 , 2 4 5 , 2 4 6
//0 0 0 , 0 0 1 , 0 1 0 , 0 1 1 , 1 0 0 , 1 0 1 , 1 1 0 , 1 1 1
}
}
int main()
{
// host_vector is stored in host memory while device_vector livesin GPU device memory.
// a has storage for 2 integers
thrust::device_vector<int> a(2);
// initialize individual elements
a[0] = 1;
a[1] = 2;
// b has storage for 2 integers
thrust::device_vector<int> b(2);
// initialize individual elements
b[0] = 3;
b[1] = 4;
// d has storage for 2 integers
thrust::device_vector<int> c(2);
// initialize individual elements
c[0] = 5;
c[1] = 6;
unsigned int block_size = 256;
unsigned int num_blocks = (8 + (block_size - 1)) / block_size;
// raw_pointer_cast creates a "raw" pointer from a pointer-like type, simply returning the wrapped pointer, should it exist.
cartesian_product<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(a.data()), a.size(),
thrust::raw_pointer_cast(b.data()), b.size(),
thrust::raw_pointer_cast(c.data()), c.size());
return 0;
}
如果我想要三个以上的列表,如何在内核和后续数组中获得正确的c_idx?
在我看来你想要“词法索引”:
idx == (a_idx * b_size + b_idx) * c_size + c_idx
所以你得到这样的指数:
c_idx = idx % c_size;
b_idx = (idx / c_size) % b_size;
a_idx = (idx / c_size) / b_size;
这很容易推广到更多维度。例如。在四个方面你有
idx == ((a_idx * b_size + b_idx) * c_size + c_idx) * d_size + d_idx
然后:
d_idx = idx % d_size;
c_idx = (idx / d_size) % c_size;
b_idx = ((idx / d_size) / c_size) % b_size;
a_idx = ((idx / d_size) / c_size) / b_size;
在C/C++编程中,人们喜欢用它来计算代表多维数据集的一维动态数组的索引。在 CUDA 中你通常不需要它那么多,因为 CUDA 给你最多三维 threadIdx
/blockIdx
/etc.. 所以对于三个数组的笛卡尔积,你不需要这种技术,但可以只使用固有的 CUDA 功能。即使在三个以上,最高效的解决方案也会从内核的三个维度中的两个获得两个索引,并在第三个维度上使用词法索引:
__global__ void cartesian_product_5d(const int *a, size_t a_size,
const int *b, size_t b_size,
const int *c, size_t c_size,
const int *d, size_t d_size,
const int *e, size_t e_size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int d_idx = blockIdx.y * blockDim.y + threadIdx.y;
int e_idx = blockIdx.z * blockDim.z + threadIdx.z;
/* idx == (c_idx * b_size + b_idx) * a_size + a_idx */
int a_idx = idx % a_size;
int b_idx = (idx / a_size) % b_size;
int c_idx = (idx / a_size) / b_size;
/* ... */
}
int main()
{
/* ... */
dim3 threadsPerBlock(8, 8, 8);
dim3 numBlocks((a_size + b_size + c_size + threadsPerBlock.x - 1) /
threadsPerBlock.x,
(d_size + threadsPerBlock.y - 1) / threadsPerBlock.y,
(e_size + threadsPerBlock.z - 1) / threadsPerBlock.z);
cartesian_product_5d<<<numBlocks, threadsPerBlock>>>(/* ... */);
/* ... */
}
我想知道如何使用 CUDA 生成两个以上列表的笛卡尔积。
如何使此代码适用于三个或更多列表?
它适用于两个列表,但不适用于三个列表,我尝试了 /, % 但没有成功。
这是基本的。
#include <thrust/device_vector.h>
#include <thrust/pair.h>
#include <thrust/copy.h>
#include <iterator>
__global__ void cartesian_product(const int *a, size_t a_size,
const int *b, size_t b_size,
const int *c, size_t c_size)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < a_size * b_size * c_size)
{
unsigned int a_idx = idx / a_size;
unsigned int b_idx = idx % a_size;
// ?
unsigned int c_idx = idx % a_size;
printf("a[a_idx] and b[b_idx] and c[c_idx] are: %d %d %d\n\n",a[a_idx], b[b_idx], c[c_idx]);
//1 3 5 , 1 3 6 , 1 4 5 , 1 4 6 , 2 3 5 , 2 3 6 , 2 4 5 , 2 4 6
//0 0 0 , 0 0 1 , 0 1 0 , 0 1 1 , 1 0 0 , 1 0 1 , 1 1 0 , 1 1 1
}
}
int main()
{
// host_vector is stored in host memory while device_vector livesin GPU device memory.
// a has storage for 2 integers
thrust::device_vector<int> a(2);
// initialize individual elements
a[0] = 1;
a[1] = 2;
// b has storage for 2 integers
thrust::device_vector<int> b(2);
// initialize individual elements
b[0] = 3;
b[1] = 4;
// d has storage for 2 integers
thrust::device_vector<int> c(2);
// initialize individual elements
c[0] = 5;
c[1] = 6;
unsigned int block_size = 256;
unsigned int num_blocks = (8 + (block_size - 1)) / block_size;
// raw_pointer_cast creates a "raw" pointer from a pointer-like type, simply returning the wrapped pointer, should it exist.
cartesian_product<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(a.data()), a.size(),
thrust::raw_pointer_cast(b.data()), b.size(),
thrust::raw_pointer_cast(c.data()), c.size());
return 0;
}
如果我想要三个以上的列表,如何在内核和后续数组中获得正确的c_idx?
在我看来你想要“词法索引”:
idx == (a_idx * b_size + b_idx) * c_size + c_idx
所以你得到这样的指数:
c_idx = idx % c_size;
b_idx = (idx / c_size) % b_size;
a_idx = (idx / c_size) / b_size;
这很容易推广到更多维度。例如。在四个方面你有
idx == ((a_idx * b_size + b_idx) * c_size + c_idx) * d_size + d_idx
然后:
d_idx = idx % d_size;
c_idx = (idx / d_size) % c_size;
b_idx = ((idx / d_size) / c_size) % b_size;
a_idx = ((idx / d_size) / c_size) / b_size;
在C/C++编程中,人们喜欢用它来计算代表多维数据集的一维动态数组的索引。在 CUDA 中你通常不需要它那么多,因为 CUDA 给你最多三维 threadIdx
/blockIdx
/etc.. 所以对于三个数组的笛卡尔积,你不需要这种技术,但可以只使用固有的 CUDA 功能。即使在三个以上,最高效的解决方案也会从内核的三个维度中的两个获得两个索引,并在第三个维度上使用词法索引:
__global__ void cartesian_product_5d(const int *a, size_t a_size,
const int *b, size_t b_size,
const int *c, size_t c_size,
const int *d, size_t d_size,
const int *e, size_t e_size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int d_idx = blockIdx.y * blockDim.y + threadIdx.y;
int e_idx = blockIdx.z * blockDim.z + threadIdx.z;
/* idx == (c_idx * b_size + b_idx) * a_size + a_idx */
int a_idx = idx % a_size;
int b_idx = (idx / a_size) % b_size;
int c_idx = (idx / a_size) / b_size;
/* ... */
}
int main()
{
/* ... */
dim3 threadsPerBlock(8, 8, 8);
dim3 numBlocks((a_size + b_size + c_size + threadsPerBlock.x - 1) /
threadsPerBlock.x,
(d_size + threadsPerBlock.y - 1) / threadsPerBlock.y,
(e_size + threadsPerBlock.z - 1) / threadsPerBlock.z);
cartesian_product_5d<<<numBlocks, threadsPerBlock>>>(/* ... */);
/* ... */
}