Cuda 展开循环变量保留在线程之间
Cuda unrolled loop variable preserved between threads
我一直在调试一个使用了循环展开的 CUDA 程序,发现展开后的循环变量的值似乎在线程之间被保留了下来。我不能完全确定这不是共享内存的问题,但我认为我的索引是正确的。我使用的是一维布局,块大小为 256。
test.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256
using namespace std;
// Kernel under investigation: each thread fills its own 32-byte slice of a
// per-block shared buffer with the values 0..31, then copies that slice to `arr`.
// The shared buffer holds 32 bytes for each of BLOCK_SIZE threads, so the launch
// must not use more than BLOCK_SIZE threads per block.
//
// arr: device buffer that receives the bytes.
__global__ void test_kernel(unsigned char *arr) {
// Thread index within the block; selects this thread's shared-memory slice.
int tid = threadIdx.x;
// Flat global thread index (block offset plus thread offset), despite the name.
int bid = blockIdx.x*blockDim.x + threadIdx.x;
__shared__ unsigned char sharr[32 * BLOCK_SIZE];
#pragma unroll
for (int i=0; i < 32; ++i) {
// This thread's slice is sharr[tid*32 .. tid*32+31]; it ends up holding 0..31.
sharr[tid*32+i] = i;
}
__syncthreads();
#pragma unroll
for (int j=0; j < 32; ++j) {
// NOTE(review): the destination index is bid+j, so global thread `bid` writes
// arr[bid .. bid+31] and the ranges of consecutive threads overlap by 31
// elements. The shared-memory side uses a stride of 32 (tid*32+j); the global
// side does not, so several threads store different values to the same element.
arr[bid+j] = sharr[tid*32+j];
}
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Test harness: launches test_kernel with one thread per 32-byte row of a
// size*32-byte device buffer, copies the buffer back and prints the first five
// rows as numbers. Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(int argc, char **argv) {
    const int size = 1024;
    const size_t bytes = size * 32 * sizeof(unsigned char);

    unsigned char *device_test_arr = NULL;
    if (!cuda_ok(cudaMalloc((void **) &device_test_arr, bytes), "cudaMalloc")) {
        return 1;
    }

    const dim3 block_size(256);
    const dim3 num_blocks(size / block_size.x);
    test_kernel<<<num_blocks, block_size>>>(device_test_arr);
    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    bool ok = cuda_ok(cudaGetLastError(), "test_kernel launch");

    unsigned char *host_test_arr = (unsigned char *)malloc(bytes);
    if (host_test_arr == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
        ok = false;
    }

    // The blocking copy also waits for the kernel, so execution errors raised
    // inside the kernel are reported here.
    ok = ok && cuda_ok(cudaMemcpy(host_test_arr, device_test_arr, bytes,
                                  cudaMemcpyDeviceToHost),
                       "cudaMemcpy");

    if (ok) {
        for (int i = 0; i < 5; ++i) {
            for (int j = 0; j < 32; ++j) {
                // Widen to int: streaming an unsigned char prints it as a
                // character (control codes for 0..31), not as a number.
                std::cout << static_cast<int>(host_test_arr[i*32+j]) << ", ";
            }
            std::cout << "\n";
        }
    }

    free(host_test_arr);
    cudaFree(device_test_arr);
    return ok ? 0 : 1;
}
我希望输出为:
0, 1, 2, 3, 4, ..., 30, 31
0, 1, 2, 3, 4, ..., 30, 31
相反,我得到:
0, 1, 2, 3, 4, ..., 30, 31
31, 31, 31, 31, ..., 31, 31
我的测试环境是计算能力 3.5 的设备和 CUDA 7.0。
我还做了其他一些必要的修改,但最关键的一处是下面这行:原来的 arr[bid+j] 会让相邻线程写入的 32 个元素区间相互重叠,输出索引必须和共享内存一样按 32 的步长计算:
arr[bid*32+j] = sharr[tid*32+j];
总代码:
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256
using namespace std;
// Writes the byte sequence 0..31 into `arr` once per thread: the thread with
// global index g fills arr[g*32 .. g*32+31]. The values are staged through a
// per-block shared buffer that reserves 32 bytes for each of BLOCK_SIZE threads.
//
// Preconditions visible from the indexing:
//   - blockDim.x must not exceed BLOCK_SIZE (shared buffer size);
//   - arr must provide at least 32 bytes per launched thread.
__global__ void test_kernel(unsigned char *arr) {
    __shared__ unsigned char sharr[32 * BLOCK_SIZE];

    const int local_thread = threadIdx.x;                             // index within the block
    const int global_thread = blockIdx.x * blockDim.x + threadIdx.x;  // index within the grid

    // Start of this thread's 32-byte slice in shared and in global memory.
    unsigned char *staging = &sharr[local_thread * 32];
    unsigned char *dest = &arr[global_thread * 32];

    // Each thread's slice becomes 0, 1, 2, ..., 31.
    #pragma unroll
    for (int k = 0; k < 32; ++k) {
        staging[k] = k;
    }

    __syncthreads();

    // Copy the slice out with the same stride of 32, so rows never overlap.
    #pragma unroll
    for (int k = 0; k < 32; ++k) {
        dest[k] = staging[k];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Test harness: launches test_kernel with one thread per 32-byte row of a
// size*32-byte device buffer, copies the buffer back and prints the first five
// rows as numbers. Returns 0 on success and 1 if any CUDA call fails.
int main(int argc, char **argv) {
    // `size` is a compile-time constant so the host array below is an ordinary
    // fixed-size array rather than a non-standard variable-length array.
    const int size = 1024;
    const size_t bytes = size * 32 * sizeof(unsigned char);
    unsigned char host_test_arr[size * 32];

    unsigned char *device_test_arr = NULL;
    if (!cuda_ok(cudaMalloc((void **) &device_test_arr, bytes), "cudaMalloc")) {
        return 1;
    }

    const dim3 block_size(256);
    const dim3 num_blocks(size / block_size.x);
    // Effective launch configuration: <<<4, 256>>>
    test_kernel<<<num_blocks, block_size>>>(device_test_arr);

    // cudaGetLastError() reports a bad launch configuration; the blocking copy
    // waits for the kernel and reports errors raised while it ran.
    const bool ok = cuda_ok(cudaGetLastError(), "test_kernel launch") &&
                    cuda_ok(cudaMemcpy(host_test_arr, device_test_arr, bytes,
                                       cudaMemcpyDeviceToHost),
                            "cudaMemcpy");

    cudaFree(device_test_arr);
    if (!ok) {
        return 1;
    }

    for (int i = 0; i < 5; ++i) {
        for (int j = 0; j < 32; ++j) {
            // Cast to int so the byte is printed as a number, not a character.
            std::cout << (int)host_test_arr[i*32+j] << ", ";
        }
        std::cout << std::endl;
    }
    return 0;
}
我一直在调试一个使用了循环展开的 CUDA 程序,发现展开后的循环变量的值似乎在线程之间被保留了下来。我不能完全确定这不是共享内存的问题,但我认为我的索引是正确的。我使用的是一维布局,块大小为 256。
test.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256
using namespace std;
// Kernel under investigation: each thread fills its own 32-byte slice of a
// per-block shared buffer with the values 0..31, then copies that slice to `arr`.
// The shared buffer holds 32 bytes for each of BLOCK_SIZE threads, so the launch
// must not use more than BLOCK_SIZE threads per block.
//
// arr: device buffer that receives the bytes.
__global__ void test_kernel(unsigned char *arr) {
// Thread index within the block; selects this thread's shared-memory slice.
int tid = threadIdx.x;
// Flat global thread index (block offset plus thread offset), despite the name.
int bid = blockIdx.x*blockDim.x + threadIdx.x;
__shared__ unsigned char sharr[32 * BLOCK_SIZE];
#pragma unroll
for (int i=0; i < 32; ++i) {
// This thread's slice is sharr[tid*32 .. tid*32+31]; it ends up holding 0..31.
sharr[tid*32+i] = i;
}
__syncthreads();
#pragma unroll
for (int j=0; j < 32; ++j) {
// NOTE(review): the destination index is bid+j, so global thread `bid` writes
// arr[bid .. bid+31] and the ranges of consecutive threads overlap by 31
// elements. The shared-memory side uses a stride of 32 (tid*32+j); the global
// side does not, so several threads store different values to the same element.
arr[bid+j] = sharr[tid*32+j];
}
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Test harness: launches test_kernel with one thread per 32-byte row of a
// size*32-byte device buffer, copies the buffer back and prints the first five
// rows as numbers. Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(int argc, char **argv) {
    const int size = 1024;
    const size_t bytes = size * 32 * sizeof(unsigned char);

    unsigned char *device_test_arr = NULL;
    if (!cuda_ok(cudaMalloc((void **) &device_test_arr, bytes), "cudaMalloc")) {
        return 1;
    }

    const dim3 block_size(256);
    const dim3 num_blocks(size / block_size.x);
    test_kernel<<<num_blocks, block_size>>>(device_test_arr);
    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    bool ok = cuda_ok(cudaGetLastError(), "test_kernel launch");

    unsigned char *host_test_arr = (unsigned char *)malloc(bytes);
    if (host_test_arr == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
        ok = false;
    }

    // The blocking copy also waits for the kernel, so execution errors raised
    // inside the kernel are reported here.
    ok = ok && cuda_ok(cudaMemcpy(host_test_arr, device_test_arr, bytes,
                                  cudaMemcpyDeviceToHost),
                       "cudaMemcpy");

    if (ok) {
        for (int i = 0; i < 5; ++i) {
            for (int j = 0; j < 32; ++j) {
                // Widen to int: streaming an unsigned char prints it as a
                // character (control codes for 0..31), not as a number.
                std::cout << static_cast<int>(host_test_arr[i*32+j]) << ", ";
            }
            std::cout << "\n";
        }
    }

    free(host_test_arr);
    cudaFree(device_test_arr);
    return ok ? 0 : 1;
}
我希望输出为:
0, 1, 2, 3, 4, ..., 30, 31
0, 1, 2, 3, 4, ..., 30, 31
相反,我得到:
0, 1, 2, 3, 4, ..., 30, 31
31, 31, 31, 31, ..., 31, 31
我的测试环境是计算能力 3.5 的设备和 CUDA 7.0。
我还做了其他一些必要的修改,但最关键的一处是下面这行:原来的 arr[bid+j] 会让相邻线程写入的 32 个元素区间相互重叠,输出索引必须和共享内存一样按 32 的步长计算:
arr[bid*32+j] = sharr[tid*32+j];
总代码:
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256
using namespace std;
// Writes the byte sequence 0..31 into `arr` once per thread: the thread with
// global index g fills arr[g*32 .. g*32+31]. The values are staged through a
// per-block shared buffer that reserves 32 bytes for each of BLOCK_SIZE threads.
//
// Preconditions visible from the indexing:
//   - blockDim.x must not exceed BLOCK_SIZE (shared buffer size);
//   - arr must provide at least 32 bytes per launched thread.
__global__ void test_kernel(unsigned char *arr) {
    __shared__ unsigned char sharr[32 * BLOCK_SIZE];

    const int local_thread = threadIdx.x;                             // index within the block
    const int global_thread = blockIdx.x * blockDim.x + threadIdx.x;  // index within the grid

    // Start of this thread's 32-byte slice in shared and in global memory.
    unsigned char *staging = &sharr[local_thread * 32];
    unsigned char *dest = &arr[global_thread * 32];

    // Each thread's slice becomes 0, 1, 2, ..., 31.
    #pragma unroll
    for (int k = 0; k < 32; ++k) {
        staging[k] = k;
    }

    __syncthreads();

    // Copy the slice out with the same stride of 32, so rows never overlap.
    #pragma unroll
    for (int k = 0; k < 32; ++k) {
        dest[k] = staging[k];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Test harness: launches test_kernel with one thread per 32-byte row of a
// size*32-byte device buffer, copies the buffer back and prints the first five
// rows as numbers. Returns 0 on success and 1 if any CUDA call fails.
int main(int argc, char **argv) {
    // `size` is a compile-time constant so the host array below is an ordinary
    // fixed-size array rather than a non-standard variable-length array.
    const int size = 1024;
    const size_t bytes = size * 32 * sizeof(unsigned char);
    unsigned char host_test_arr[size * 32];

    unsigned char *device_test_arr = NULL;
    if (!cuda_ok(cudaMalloc((void **) &device_test_arr, bytes), "cudaMalloc")) {
        return 1;
    }

    const dim3 block_size(256);
    const dim3 num_blocks(size / block_size.x);
    // Effective launch configuration: <<<4, 256>>>
    test_kernel<<<num_blocks, block_size>>>(device_test_arr);

    // cudaGetLastError() reports a bad launch configuration; the blocking copy
    // waits for the kernel and reports errors raised while it ran.
    const bool ok = cuda_ok(cudaGetLastError(), "test_kernel launch") &&
                    cuda_ok(cudaMemcpy(host_test_arr, device_test_arr, bytes,
                                       cudaMemcpyDeviceToHost),
                            "cudaMemcpy");

    cudaFree(device_test_arr);
    if (!ok) {
        return 1;
    }

    for (int i = 0; i < 5; ++i) {
        for (int j = 0; j < 32; ++j) {
            // Cast to int so the byte is printed as a number, not a character.
            std::cout << (int)host_test_arr[i*32+j] << ", ";
        }
        std::cout << std::endl;
    }
    return 0;
}