重用共享块内存的 Cuda 未对齐地址
Cuda misaligned address for a reused shared block memory
我的内核为数据存储分配了一个共享内存,但是如果我改变共享内存的大小就会出现错误报告,请参阅附件。
#include <stdio.h>
#include <assert.h>
#define cucheck_dev(call) \
{ \
cudaError_t cucheck_err = (call); \
if(cucheck_err != cudaSuccess) { \
const char *err_str = cudaGetErrorString(cucheck_err); \
printf("%s (%d): %s\n", __FILE__, __LINE__, err_str); \
assert(0); \
} \
}
__global__ void kernel(int datanum)
{
extern __shared__ int sh[];
// assign data for data 1
float2* data_ptr1((float2*)sh);
for (int thid = threadIdx.x; thid < datanum; thid += blockDim.x)
{
data_ptr1[thid] = make_float2(0., 0.);
}
__syncthreads();
// assign data for data 2
size_t shOffset = (sizeof(float2)/sizeof(int)*(datanum));
if(threadIdx.x == 0) printf("Offset: %d\n", (int)(shOffset));
__syncthreads();
float4 *data_ptr2((float4*)&sh[shOffset]);
for (int thid = threadIdx.x; thid < datanum; thid += blockDim.x)
{
data_ptr2[thid] = make_float4(0., 0., 0., 0.);
}
__syncthreads();
}
int main()
{
int datanum = 21; // bug reports for datanum = 21, but everthing works fine for datanum = 20
int blocknum = 1;
int threadperblock = 128;
int preallocated = 768;
size_t shmem = datanum*sizeof(float2) + preallocated*sizeof(int);
printf("Allocated Shared memory byte: %d Nums: %d\n", (int)shmem, (int)(shmem/sizeof(int)));
kernel<<<blocknum, threadperblock, shmem>>>(datanum);
cudaDeviceSynchronize();
cucheck_dev(cudaGetLastError());
}
OS: Ubuntu 18.02
库达:10.1
设备:RTX 2060
g++: 7.5.0
如图所示,共享内存包括两个区域,一个是固定数据,类型为float2。
其他区域可能会保存不同的类型,如 int 或 float4,与共享内存条目的偏移量。
当我将 datanum 设置为 20 时,代码工作正常。
但是当datanum改为21时,代码报错地址
非常感谢任何回复或建议。
谢谢!
这里贴出一些cuda-memcheck提供的资料供参考:
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (20,0,0) in block (0,0,0)
========= Address 0x000001e8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (19,0,0) in block (0,0,0)
========= Address 0x000001d8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (18,0,0) in block (0,0,0)
========= Address 0x000001c8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (17,0,0) in block (0,0,0)
========= Address 0x000001b8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
您的问题是 float4
的对齐高于 float2
。因此行
size_t shOffset = (sizeof(float2)/sizeof(int)*(datanum));
float4 *data_ptr2((float4*)&sh[shOffset]);
不保证 data_ptr2 的适当对齐,除非 datanum 是偶数。
我在这里为这个问题写了一些代码:
最简单的解决方法是交换 data_ptr1
和 data_ptr2
。对齐较大的类型使用内存的前面
我的内核为数据存储分配了一个共享内存,但是如果我改变共享内存的大小就会出现错误报告,请参阅附件。
#include <stdio.h>
#include <assert.h>
#define cucheck_dev(call) \
{ \
cudaError_t cucheck_err = (call); \
if(cucheck_err != cudaSuccess) { \
const char *err_str = cudaGetErrorString(cucheck_err); \
printf("%s (%d): %s\n", __FILE__, __LINE__, err_str); \
assert(0); \
} \
}
__global__ void kernel(int datanum)
{
extern __shared__ int sh[];
// assign data for data 1
float2* data_ptr1((float2*)sh);
for (int thid = threadIdx.x; thid < datanum; thid += blockDim.x)
{
data_ptr1[thid] = make_float2(0., 0.);
}
__syncthreads();
// assign data for data 2
size_t shOffset = (sizeof(float2)/sizeof(int)*(datanum));
if(threadIdx.x == 0) printf("Offset: %d\n", (int)(shOffset));
__syncthreads();
float4 *data_ptr2((float4*)&sh[shOffset]);
for (int thid = threadIdx.x; thid < datanum; thid += blockDim.x)
{
data_ptr2[thid] = make_float4(0., 0., 0., 0.);
}
__syncthreads();
}
int main()
{
int datanum = 21; // bug reports for datanum = 21, but everthing works fine for datanum = 20
int blocknum = 1;
int threadperblock = 128;
int preallocated = 768;
size_t shmem = datanum*sizeof(float2) + preallocated*sizeof(int);
printf("Allocated Shared memory byte: %d Nums: %d\n", (int)shmem, (int)(shmem/sizeof(int)));
kernel<<<blocknum, threadperblock, shmem>>>(datanum);
cudaDeviceSynchronize();
cucheck_dev(cudaGetLastError());
}
OS: Ubuntu 18.02 库达:10.1 设备:RTX 2060 g++: 7.5.0
如图所示,共享内存包括两个区域,一个是固定数据,类型为float2。
其他区域可能会保存不同的类型,如 int 或 float4,与共享内存条目的偏移量。
当我将 datanum 设置为 20 时,代码工作正常。
但是当datanum改为21时,代码报错地址
非常感谢任何回复或建议。
谢谢!
这里贴出一些cuda-memcheck提供的资料供参考:
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (20,0,0) in block (0,0,0)
========= Address 0x000001e8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (19,0,0) in block (0,0,0)
========= Address 0x000001d8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (18,0,0) in block (0,0,0)
========= Address 0x000001c8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (17,0,0) in block (0,0,0)
========= Address 0x000001b8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
您的问题是 float4
的对齐高于 float2
。因此行
size_t shOffset = (sizeof(float2)/sizeof(int)*(datanum));
float4 *data_ptr2((float4*)&sh[shOffset]);
不保证 data_ptr2 的适当对齐,除非 datanum 是偶数。
我在这里为这个问题写了一些代码:
最简单的解决方法是交换 data_ptr1
和 data_ptr2
。对齐较大的类型使用内存的前面