Cuda 统一内存:当使用指针或非指针对象作为 class 成员时,程序会得到不同的结果
Cuda unified memory: Program gets different result when use pointer or non-pointer object as class member
最近学习了如何使用cuda统一内存进行编码。
但是奇怪的是,当我用非指针对象替换指针对象时,内核报告不同的结果。
请参考Core.cuh和main.cu。
ClassManaged.h 是用于新建和删除重载的基础 class 以及用于构建测试用例的 CMakeList.txt。
//ClassManaged.h This file overloads the new and delete operator for children class
#ifndef __CLASS_MANAGED_H__
#define __CLASS_MANAGED_H__
#include <cuda_runtime_api.h>
class Managed
{
public:
void *operator new(size_t len)
{
printf("-->Managed call!\n");
void *ptr;
cudaMallocManaged(&ptr, len);
cudaDeviceSynchronize();
printf(" Address for Managed constructor: %p\n", ptr);
return ptr;
}
void operator delete(void *ptr)
{
cudaDeviceSynchronize();
cudaFree(ptr);
}
void* operator new[] (size_t len)
{
void *ptr;
cudaMallocManaged(&ptr, len);
cudaDeviceSynchronize();
return ptr;
}
void operator delete[] (void* ptr)
{
cudaDeviceSynchronize();
cudaFree(ptr);
}
};
#endif
//Core.cuh where the bug appears
#ifndef __CORE_CUH__
#define __CORE_CUH__
#include "ClassManaged.h"
#include "cuda_runtime.h"
#include <string>
#include "stdio.h"
class Box : public Managed{
public:
int a;
int b;
};
class Core : public Managed{
public:
__host__ __device__ Core(int cnumin)
{
c_num = cnumin;
}
__host__ __device__ ~Core()
{
cudaFree(datan);
}
void initialize()
{
cudaMallocManaged((void**)&datan, offset*c_num*sizeof(int));
//cudaMallocManaged((void**)&box, sizeof(Box)); // Test case 1 pointer object
//box = new Box(); // Test case 1 pointer object
}
public:
//Box* box; // Test Case 1: pointer object (Everything is ok!)
Box box; // Test Case 2: non-pointer object (with BUG)
int* datan;
public:
int m_id = 0;
int c_num;
int support_num = 0;
const int offset = 12;
float delta = 1.2;
};
// A minimal version for kernel
__global__ void WorkFlow_kernel_forcore(Core* core)
{
volatile int coreno = blockIdx.x;
if(threadIdx.x == 0)
{
printf("\n--->Kernel data!\n");
printf(" Core address in kernel: %p\n", core);
printf(" Box address in kernel: %p\n", &(core->box));
//printf(" Box address in kernel: %p\n", core->box);
printf("\n Addr m_id: %p\n", &(core->m_id));
printf(" Addr c_num: %p\n", &(core->c_num));
printf(" Addr support_num: %p\n", &(core->support_num));
printf(" Addr Offset: %p\n", &(core->offset));
printf(" Addr Delta: %p\n", &(core->delta));
printf("\n Val m_id: %d\n", core->m_id);
printf(" Val c_num: %d\n", core->c_num);
printf(" Val support_num: %d\n", core->support_num);
printf(" Val Offset: %d\n", core->offset);
printf(" Val Delta: %.5f\n", core->delta);
}
// The kernel outputs the wrong result for non-pointer Core::box.
}
//main.cu
#include <cuda_runtime.h>
#include "Core.cuh"
int main()
{
// 1 Only Core involved
// This is a minimal version suggested by Sebastian (only Core and kernel existed here)
Core* core = new Core(20); // Here, the Core still inherits from Managed. Because it seems more convenient to execute constructor on device with help of new and delete overload.
core->initialize();
printf(" Double check core address: %p\n", core);
printf(" Double check box address: %p\n", &(core->box));
//printf(" Double check box address: %p\n", core->box);
printf("\n Double check Addr m_id: %p\n", &(core->m_id));
printf(" Double check Addr c_num: %p\n", &(core->c_num));
printf(" Double check Addr support_num: %p\n", &(core->support_num));
printf(" Double check Addr Offset: %p\n", &(core->offset));
printf(" Double check Addr Delta: %p\n", &(core->delta));
WorkFlow_kernel_forcore<<<1,1>>>(core); // The output is the wrong result when non-pointer Core::box defined!
delete core;
// ----------------------------------Wrong result address output
// -->Managed call!
// Address for Managed constructor: 0000000A00000000
// Double check core address: 0000000A00000000
// Double check box address: 0000000000000000
// Double check Addr m_id: 0000000A00000010
// Double check Addr c_num: 0000000A00000014
// Double check Addr support_num: 0000000A00000018
// Double check Addr Offset: 0000000A0000001C
// Double check Addr Delta: 0000000A00000020
// --->Kernel data!
// Core address in kernel: 0000000A00000000
// Box address in kernel: 0000000A00000004
// Addr m_id: 0000000A00000018
// Addr c_num: 0000000A0000001C
// Addr support_num: 0000000A00000020
// Addr Offset: 0000000A00000024
// Addr Delta: 0000000A00000028
// Val m_id: 0
// Val c_num: 12
// Val support_num: 1067030938
// Val Offset: 0
// Val Delta: 0.00000
// ----------------------------------Correct result address output
// -->Managed call!
// Address for Managed constructor: 0000000A00000000
// -->Managed call!
// Address for Managed constructor: 0000000A00030000
// Double check core address: 0000000A00000000
// Double check box address: 0000000A00030000
// Double check Addr m_id: 0000000A00000010
// Double check Addr c_num: 0000000A00000014
// Double check Addr support_num: 0000000A00000018
// Double check Addr Offset: 0000000A0000001C
// Double check Addr Delta: 0000000A00000020
// --->Kernel data!
// Core address in kernel: 0000000A00000000
// Box address in kernel: 0000000A00030000
// Addr m_id: 0000000A00000010
// Addr c_num: 0000000A00000014
// Addr support_num: 0000000A00000018
// Addr Offset: 0000000A0000001C
// Addr Delta: 0000000A00000020
// Val m_id: 0
// Val c_num: 20
// Val support_num: 0
// Val Offset: 12
// Val Delta: 1.20000
// 2 This version replace the unified memory of core by cudaMalloc and cudaMemcpy.
// NOTE: Before run the test 2, please comment the (cancel the inheritance from Managed)
// class Core /*: public Managed*/ {
//Core* host_core = new Core(20);
//Core* device_core;
//cudaMalloc(&device_core, sizeof(Core));
//cudaMemcpy(device_core, host_core, sizeof(Core), cudaMemcpyHostToDevice);
//WorkFlow_kernel_forcore<<<1,1>>>(device_core);
// !!!---> This kernel output the correct information: 0, 20, 0, 12, 1.2
//delete host_core;
//cudaFree(device_core);
return 0;
}
//CMakeList.txt
project (gputask CXX CUDA)
CMAKE_MINIMUM_REQUIRED(VERSION 3.10 FATAL_ERROR)
if (MSVC)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif (MSVC)
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/gputask" CACHE PATH "This is default path" FORCE)
endif()
SET(CMAKE_SKIP_BUILD_RPATH FALSE)
SET(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
option(ENABLE_EMBED_CUDA "Enable embedding of the CUDA libraries into lib" OFF)
set(GPUTASK_NVCC_ARCHS_DEFAULT "")
list(APPEND GPUTASK_NVCC_ARCHS_DEFAULT 75)
set(GPUTASK_NVCC_ARCHS ${GPUTASK_NVCC_ARCHS_DEFAULT} CACHE STRING "The SM architectures to build code for.")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe --diag_suppress=code_is_unreachable")
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to 'Release' as none was specified.")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
"MinSizeRel" "RelWithDebInfo")
endif()
set(CMAKE_CXX_STANDARD 14)
SET(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas -Wno-deprecated-declarations -DMPM_CODE")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unknown-pragmas")
endif()
set(CUDA_ARCH_LIST 70 75 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.")
foreach(_cuda_arch ${CUDA_ARCH_LIST})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${_cuda_arch},code=sm_${_cuda_arch}")
endforeach (_cuda_arch)
if (ENABLE_EMBED_CUDA)
get_filename_component(_cuda_libdir ${CUDA_CUDART_LIBRARY} PATH)
FILE(GLOB _cuda_libs ${_cuda_libdir}/libcurand.* ${_cuda_libdir}/libcufft.* ${_cuda_libdir}/libcusolver.* ${_cuda_libdir}/libcusparse.*)
install(PROGRAMS ${_cuda_libs} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
endif ()
set(GPUTASK_COMMON_LIBS ${ADDITIONAL_LIBS})
list(APPEND GPUTASK_COMMON_LIBS ${CUDA_LIBRARIES} ${CUDA_cufft_LIBRARY} ${CUDA_curand_LIBRARY})
if (ENABLE_NVTOOLS)
list(APPEND GPUTASK_COMMON_LIBS ${CUDA_nvToolsExt_LIBRARY})
endif()
include_directories(${CUDA_INCLUDE})
exec_program("date +%x" OUTPUT_VARIABLE COMPILE_DATE)
set(CUDA_VERBOSE_BUILD on)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DNVCC -ftz=true")
set(GPUTASK_ROOT "${CMAKE_SOURCE_DIR}")
ADD_EXECUTABLE(mytask ${CMAKE_CURRENT_SOURCE_DIR}/main.cu)
INSTALL(TARGETS mytask DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
内核为class内核打印非指针或指针之间的不同信息见printf代码块。
平台信息:
OS: 赢 10
Cuda:11.1.74 附带 RTX 2060
Win SDK 10.0.18362.0
MSVC 19.28.29334.0
Visual Studio 16 2019
简而言之,main.cu 的 test1 中的错误输出似乎是由 class Core : public Managed (overload统一内存新建和删除)。
新修改的代码在cudaMallocManaged和kernel阶段打印Core所有成员的地址
很明显,在调用内核时,box的地址在BUG版本中不同(比如box地址从0跳到4)。
正确版本中没有这个东西。
可以推导出盒子地址从某处流向某处?
是否意味着内存超出范围或泄漏? (我猜但不确定)。
已解决------------------------>!!!!
感谢罗伯特,我找到了这个错误的原因。
请参考NVIDIA DOC.
The CUDA compiler follows the IA64 ABI for class layout, while the Microsoft host compiler does not. Let T denote a pointer to member type, or a class type that satisfies any of the following conditions:
T has virtual functions.
T has a virtual base class.
T has multiple inheritance with more than one direct or indirect empty base class.
All direct and indirect base classes B of T are empty and the type of the first field F of T uses B in its definition, such that B is laid out at offset 0 in the definition of F.
由于box和Core都是Managed的children,如果我们把box放在第一位,代码匹配第四种情况,所有直接和间接基classes B的T是空的...
由于 cuda 的 IA64 ABI 与 x64 ABI(Win 主机)相比,Win OS 可能会出现未定义的行为。
非常感谢您的建议!
非常感谢!
问题表明,当Core由cudaMallocManaged创建时,会出现bug。但是,对于cudaMalloc和cudaMemcpy创建的Core,内核给出了正确的答案。
此错误与 CUDA DOC 有关。
CUDA DOC 详细说明:
The CUDA compiler follows the IA64 ABI for class layout, while the Microsoft host compiler does not. Let T denote a pointer to member type, or a class type that satisfies any of the following conditions:
T has virtual functions.
T has a virtual base class.
T has multiple inheritance with more than one direct or indirect empty base class.
All direct and indirect base classes B of T are empty and the type of the first field F of T uses B in its definition, such that B is laid out at offset 0 in the definition of F.
Let C denote T or a class type that has T as a field type or as a base class type. The CUDA compiler may compute the class layout and size differently than the Microsoft host compiler for the type C.
As long as the type C is used exclusively in host or device code, the program should work correctly.
Passing an object of type C between host and device code has undefined behavior e.g., as an argument to a global function or through cudaMemcpy*() calls.
因为Box和Core都是children的Managed(空class重载new和delete运算符)
如果我们把方框(non-pointer object)放在Core的第一个字段,就遇到了第四种情况所有直接和间接基数classes B of T 为空,T 的第一个字段 F 的类型在其定义中使用 B.
由于Windows主机(x64)和CUDA设备(IA64)之间的ABI不同,结果出现内核未定义的行为。
------------> 个人分析
CUDA DOC 还表示内核的未定义行为可以与在主机上创建但在设备上创建的 class 相关联,反之亦然相反。
换句话说,使用 cudaMalloc 创建的核心可以通过一致的创建和 运行 环境(主机或设备)避免错误。
盒子和指针object一样,因为它通过避免第四种情况(children class 空基 class 定位来消除错误在第一个字段)。
最近学习了如何使用cuda统一内存进行编码。 但是奇怪的是,当我用非指针对象替换指针对象时,内核报告不同的结果。
请参考Core.cuh和main.cu。
ClassManaged.h 是用于新建和删除重载的基础 class 以及用于构建测试用例的 CMakeList.txt。
//ClassManaged.h This file overloads the new and delete operator for children class
#ifndef __CLASS_MANAGED_H__
#define __CLASS_MANAGED_H__
#include <cuda_runtime_api.h>
class Managed
{
public:
void *operator new(size_t len)
{
printf("-->Managed call!\n");
void *ptr;
cudaMallocManaged(&ptr, len);
cudaDeviceSynchronize();
printf(" Address for Managed constructor: %p\n", ptr);
return ptr;
}
void operator delete(void *ptr)
{
cudaDeviceSynchronize();
cudaFree(ptr);
}
void* operator new[] (size_t len)
{
void *ptr;
cudaMallocManaged(&ptr, len);
cudaDeviceSynchronize();
return ptr;
}
void operator delete[] (void* ptr)
{
cudaDeviceSynchronize();
cudaFree(ptr);
}
};
#endif
//Core.cuh where the bug appears
#ifndef __CORE_CUH__
#define __CORE_CUH__
#include "ClassManaged.h"
#include "cuda_runtime.h"
#include <string>
#include "stdio.h"
class Box : public Managed{
public:
int a;
int b;
};
class Core : public Managed{
public:
__host__ __device__ Core(int cnumin)
{
c_num = cnumin;
}
__host__ __device__ ~Core()
{
cudaFree(datan);
}
void initialize()
{
cudaMallocManaged((void**)&datan, offset*c_num*sizeof(int));
//cudaMallocManaged((void**)&box, sizeof(Box)); // Test case 1 pointer object
//box = new Box(); // Test case 1 pointer object
}
public:
//Box* box; // Test Case 1: pointer object (Everything is ok!)
Box box; // Test Case 2: non-pointer object (with BUG)
int* datan;
public:
int m_id = 0;
int c_num;
int support_num = 0;
const int offset = 12;
float delta = 1.2;
};
// A minimal version for kernel
__global__ void WorkFlow_kernel_forcore(Core* core)
{
volatile int coreno = blockIdx.x;
if(threadIdx.x == 0)
{
printf("\n--->Kernel data!\n");
printf(" Core address in kernel: %p\n", core);
printf(" Box address in kernel: %p\n", &(core->box));
//printf(" Box address in kernel: %p\n", core->box);
printf("\n Addr m_id: %p\n", &(core->m_id));
printf(" Addr c_num: %p\n", &(core->c_num));
printf(" Addr support_num: %p\n", &(core->support_num));
printf(" Addr Offset: %p\n", &(core->offset));
printf(" Addr Delta: %p\n", &(core->delta));
printf("\n Val m_id: %d\n", core->m_id);
printf(" Val c_num: %d\n", core->c_num);
printf(" Val support_num: %d\n", core->support_num);
printf(" Val Offset: %d\n", core->offset);
printf(" Val Delta: %.5f\n", core->delta);
}
// The kernel outputs the wrong result for non-pointer Core::box.
}
//main.cu
#include <cuda_runtime.h>
#include "Core.cuh"
int main()
{
// 1 Only Core involved
// This is a minimal version suggested by Sebastian (only Core and kernel existed here)
Core* core = new Core(20); // Here, the Core still inherits from Managed. Because it seems more convenient to execute constructor on device with help of new and delete overload.
core->initialize();
printf(" Double check core address: %p\n", core);
printf(" Double check box address: %p\n", &(core->box));
//printf(" Double check box address: %p\n", core->box);
printf("\n Double check Addr m_id: %p\n", &(core->m_id));
printf(" Double check Addr c_num: %p\n", &(core->c_num));
printf(" Double check Addr support_num: %p\n", &(core->support_num));
printf(" Double check Addr Offset: %p\n", &(core->offset));
printf(" Double check Addr Delta: %p\n", &(core->delta));
WorkFlow_kernel_forcore<<<1,1>>>(core); // The output is the wrong result when non-pointer Core::box defined!
delete core;
// ----------------------------------Wrong result address output
// -->Managed call!
// Address for Managed constructor: 0000000A00000000
// Double check core address: 0000000A00000000
// Double check box address: 0000000000000000
// Double check Addr m_id: 0000000A00000010
// Double check Addr c_num: 0000000A00000014
// Double check Addr support_num: 0000000A00000018
// Double check Addr Offset: 0000000A0000001C
// Double check Addr Delta: 0000000A00000020
// --->Kernel data!
// Core address in kernel: 0000000A00000000
// Box address in kernel: 0000000A00000004
// Addr m_id: 0000000A00000018
// Addr c_num: 0000000A0000001C
// Addr support_num: 0000000A00000020
// Addr Offset: 0000000A00000024
// Addr Delta: 0000000A00000028
// Val m_id: 0
// Val c_num: 12
// Val support_num: 1067030938
// Val Offset: 0
// Val Delta: 0.00000
// ----------------------------------Correct result address output
// -->Managed call!
// Address for Managed constructor: 0000000A00000000
// -->Managed call!
// Address for Managed constructor: 0000000A00030000
// Double check core address: 0000000A00000000
// Double check box address: 0000000A00030000
// Double check Addr m_id: 0000000A00000010
// Double check Addr c_num: 0000000A00000014
// Double check Addr support_num: 0000000A00000018
// Double check Addr Offset: 0000000A0000001C
// Double check Addr Delta: 0000000A00000020
// --->Kernel data!
// Core address in kernel: 0000000A00000000
// Box address in kernel: 0000000A00030000
// Addr m_id: 0000000A00000010
// Addr c_num: 0000000A00000014
// Addr support_num: 0000000A00000018
// Addr Offset: 0000000A0000001C
// Addr Delta: 0000000A00000020
// Val m_id: 0
// Val c_num: 20
// Val support_num: 0
// Val Offset: 12
// Val Delta: 1.20000
// 2 This version replace the unified memory of core by cudaMalloc and cudaMemcpy.
// NOTE: Before run the test 2, please comment the (cancel the inheritance from Managed)
// class Core /*: public Managed*/ {
//Core* host_core = new Core(20);
//Core* device_core;
//cudaMalloc(&device_core, sizeof(Core));
//cudaMemcpy(device_core, host_core, sizeof(Core), cudaMemcpyHostToDevice);
//WorkFlow_kernel_forcore<<<1,1>>>(device_core);
// !!!---> This kernel output the correct information: 0, 20, 0, 12, 1.2
//delete host_core;
//cudaFree(device_core);
return 0;
}
//CMakeList.txt
project (gputask CXX CUDA)
CMAKE_MINIMUM_REQUIRED(VERSION 3.10 FATAL_ERROR)
if (MSVC)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif (MSVC)
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/gputask" CACHE PATH "This is default path" FORCE)
endif()
SET(CMAKE_SKIP_BUILD_RPATH FALSE)
SET(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
option(ENABLE_EMBED_CUDA "Enable embedding of the CUDA libraries into lib" OFF)
set(GPUTASK_NVCC_ARCHS_DEFAULT "")
list(APPEND GPUTASK_NVCC_ARCHS_DEFAULT 75)
set(GPUTASK_NVCC_ARCHS ${GPUTASK_NVCC_ARCHS_DEFAULT} CACHE STRING "The SM architectures to build code for.")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe --diag_suppress=code_is_unreachable")
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to 'Release' as none was specified.")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
"MinSizeRel" "RelWithDebInfo")
endif()
set(CMAKE_CXX_STANDARD 14)
SET(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas -Wno-deprecated-declarations -DMPM_CODE")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unknown-pragmas")
endif()
set(CUDA_ARCH_LIST 70 75 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.")
foreach(_cuda_arch ${CUDA_ARCH_LIST})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${_cuda_arch},code=sm_${_cuda_arch}")
endforeach (_cuda_arch)
if (ENABLE_EMBED_CUDA)
get_filename_component(_cuda_libdir ${CUDA_CUDART_LIBRARY} PATH)
FILE(GLOB _cuda_libs ${_cuda_libdir}/libcurand.* ${_cuda_libdir}/libcufft.* ${_cuda_libdir}/libcusolver.* ${_cuda_libdir}/libcusparse.*)
install(PROGRAMS ${_cuda_libs} DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
endif ()
set(GPUTASK_COMMON_LIBS ${ADDITIONAL_LIBS})
list(APPEND GPUTASK_COMMON_LIBS ${CUDA_LIBRARIES} ${CUDA_cufft_LIBRARY} ${CUDA_curand_LIBRARY})
if (ENABLE_NVTOOLS)
list(APPEND GPUTASK_COMMON_LIBS ${CUDA_nvToolsExt_LIBRARY})
endif()
include_directories(${CUDA_INCLUDE})
exec_program("date +%x" OUTPUT_VARIABLE COMPILE_DATE)
set(CUDA_VERBOSE_BUILD on)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DNVCC -ftz=true")
set(GPUTASK_ROOT "${CMAKE_SOURCE_DIR}")
ADD_EXECUTABLE(mytask ${CMAKE_CURRENT_SOURCE_DIR}/main.cu)
INSTALL(TARGETS mytask DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
内核为class内核打印非指针或指针之间的不同信息见printf代码块。
平台信息:
OS: 赢 10
Cuda:11.1.74 附带 RTX 2060
Win SDK 10.0.18362.0
MSVC 19.28.29334.0
Visual Studio 16 2019
简而言之,main.cu 的 test1 中的错误输出似乎是由 class Core : public Managed (overload统一内存新建和删除)。
新修改的代码在cudaMallocManaged和kernel阶段打印Core所有成员的地址
很明显,在调用内核时,box的地址在BUG版本中不同(比如box地址从0跳到4)。
正确版本中没有这个东西。 可以推导出盒子地址从某处流向某处?
是否意味着内存超出范围或泄漏? (我猜但不确定)。
已解决------------------------>!!!!
感谢罗伯特,我找到了这个错误的原因。 请参考NVIDIA DOC.
The CUDA compiler follows the IA64 ABI for class layout, while the Microsoft host compiler does not. Let T denote a pointer to member type, or a class type that satisfies any of the following conditions:
T has virtual functions.
T has a virtual base class.
T has multiple inheritance with more than one direct or indirect empty base class.
All direct and indirect base classes B of T are empty and the type of the first field F of T uses B in its definition, such that B is laid out at offset 0 in the definition of F.
由于box和Core都是Managed的children,如果我们把box放在第一位,代码匹配第四种情况,所有直接和间接基classes B的T是空的...
由于 cuda 的 IA64 ABI 与 x64 ABI(Win 主机)相比,Win OS 可能会出现未定义的行为。
非常感谢您的建议! 非常感谢!
问题表明,当Core由cudaMallocManaged创建时,会出现bug。但是,对于cudaMalloc和cudaMemcpy创建的Core,内核给出了正确的答案。
此错误与 CUDA DOC 有关。
CUDA DOC 详细说明:
The CUDA compiler follows the IA64 ABI for class layout, while the Microsoft host compiler does not. Let T denote a pointer to member type, or a class type that satisfies any of the following conditions:
T has virtual functions.
T has a virtual base class.
T has multiple inheritance with more than one direct or indirect empty base class.
All direct and indirect base classes B of T are empty and the type of the first field F of T uses B in its definition, such that B is laid out at offset 0 in the definition of F.
Let C denote T or a class type that has T as a field type or as a base class type. The CUDA compiler may compute the class layout and size differently than the Microsoft host compiler for the type C. As long as the type C is used exclusively in host or device code, the program should work correctly.
Passing an object of type C between host and device code has undefined behavior e.g., as an argument to a global function or through cudaMemcpy*() calls.
因为Box和Core都是children的Managed(空class重载new和delete运算符)
如果我们把方框(non-pointer object)放在Core的第一个字段,就遇到了第四种情况所有直接和间接基数classes B of T 为空,T 的第一个字段 F 的类型在其定义中使用 B.
由于Windows主机(x64)和CUDA设备(IA64)之间的ABI不同,结果出现内核未定义的行为。
------------> 个人分析
CUDA DOC 还表示内核的未定义行为可以与在主机上创建但在设备上创建的 class 相关联,反之亦然相反。
换句话说,使用 cudaMalloc 创建的核心可以通过一致的创建和 运行 环境(主机或设备)避免错误。
盒子和指针object一样,因为它通过避免第四种情况(children class 空基 class 定位来消除错误在第一个字段)。