双原子操作,OpenCL
Atomic operations with double, OpenCL
我想知道是否有一种方法可以实现双精度类型的原子操作(特别是atomic_add)。
对于浮点数,此代码有效,但 atomic_xchg 不支持双精度数:
while ((value = atomic_xchg(addr, atomic_xchg(addr, 0.0f)+value))!=0.0f);
我过去一直在寻找同样的东西,我发现了这个:https://github.com/ddemidov/vexcl-experiments/blob/master/sort-by-key-atomic.cpp。
最后我想出了解决问题的不同方法,所以我没有使用它。这是代码:
"#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable\n"
"void AtomicAdd(__global double *val, double delta) {\n"
" union {\n"
" double f;\n"
" ulong i;\n"
" } old;\n"
" union {\n"
" double f;\n"
" ulong i;\n"
" } new;\n"
" do {\n"
" old.f = *val;\n"
" new.f = old.f + delta;\n"
" } while (atom_cmpxchg ( (volatile __global ulong *)val, old.i, new.i) != old.i);\n"
"}\n"
"kernel void atomic_reduce(\n"
" ulong n,\n"
" global const int * key,\n"
" global const double * val,\n"
" global double * sum\n"
")\n"
"{\n"
" for(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0))\n"
" AtomicAdd(sum + key[idx], val[idx]);\n"
"}\n",
"atomic_reduce"
初始 post 的两种方法和 doqtor work well. Basically there are two ways to implement them on doubles: using unions or using OpenCL as_type functions 的答案。答案末尾提供了 OpenCL 1.0 代码片段(对于 OpenCL 2.x,它们可以缩短,但 NVIDIA 目前还不支持它)。至于性能,我个人有 AMD OpenCL 在 Tahiti 芯片上实现的经验,所有这些变体产生或多或少相同的执行时间(as_ 和 union 变体甚至在大多数测试编译器上产生相同的优化 ISA 代码)。因此,使用一种或另一种变体是个人品味的问题。
// define REALDOUBLES for double precision, undefine for single
#if REALDOUBLES
// extensions needed
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#ifdef cl_khr_int64_base_atomics
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
// definitions
#define UINTVAR ulong
#define AS_INT as_ulong
#define AS_REAL as_double
#define ATOM_CMPXCHG atom_cmpxchg
#define ATOM_XCHG atom_xchg
#else
// extensions needed
#ifdef cl_khr_local_int32_base_atomics
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#endif
#ifdef cl_khr_global_int32_base_atomics
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
// definitions
#define UINTVAR uint
#define AS_INT as_uint
#define AS_REAL as_float
#define ATOM_CMPXCHG atomic_cmpxchg
#define ATOM_XCHG atomic_xchg
#endif
// as_ variants
// variant from GROMACS - https://streamhpc.com/blog/2016-02-09/atomic-operations-for-floats-in-opencl-improved/
inline void atomic_add_local(volatile local REAL * const source, const REAL operand) {
UINTVAR expected, current;
current = AS_INT(*source);
do {
expected = current;
current = ATOM_CMPXCHG((volatile local UINTVAR *)source, expected, AS_INT(AS_REAL(expected) + operand));
} while (current != expected);
}
// NVIDIA variant
inline void atomic_add_local(local REAL * const source, const REAL operand) {
UINTVAR old = AS_INT(operand);
while ((old = ATOM_XCHG((local UINTVAR *)source, AS_INT(AS_REAL(ATOM_XCHG((local UINTVAR *)source, AS_INT((REAL)0))) + AS_REAL(old)))) != AS_INT((REAL)0));
}
// union variants
typedef union {
UINTVAR intVal;
REAL floatVal;
} uni;
// NVIDIA variant
inline void atomic_add_local(local REAL * const source, const REAL operand) {
uni old, t, zero;
old.floatVal = operand;
zero.floatVal = 0;
do {
t.intVal = ATOM_XCHG((local UINTVAR *)source, zero.intVal);
t.floatVal += old.floatVal;
} while ((old.intVal = ATOM_XCHG((local UINTVAR *)source, t.intVal)) != zero.intVal);
}
// shortened variant from GROMACS - https://streamhpc.com/blog/2016-02-09/atomic-operations-for-floats-in-opencl-improved/
inline void atomic_add_local(volatile local REAL * const source, const REAL operand) {
uni expected, current;
current.floatVal = *source;
do {
expected.floatVal = current.floatVal;
current.floatVal = expected.floatVal + operand;
current.intVal = ATOM_CMPXCHG((volatile local UINTVAR *)source, expected.intVal, current.intVal);
} while (current.intVal != expected.intVal);
}
并且明显替代 local<->global
全局内存。
我想知道是否有一种方法可以实现双精度类型的原子操作(特别是atomic_add)。
对于浮点数,此代码有效,但 atomic_xchg 不支持双精度数:
while ((value = atomic_xchg(addr, atomic_xchg(addr, 0.0f)+value))!=0.0f);
我过去一直在寻找同样的东西,我发现了这个:https://github.com/ddemidov/vexcl-experiments/blob/master/sort-by-key-atomic.cpp。 最后我想出了解决问题的不同方法,所以我没有使用它。这是代码:
"#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable\n"
"void AtomicAdd(__global double *val, double delta) {\n"
" union {\n"
" double f;\n"
" ulong i;\n"
" } old;\n"
" union {\n"
" double f;\n"
" ulong i;\n"
" } new;\n"
" do {\n"
" old.f = *val;\n"
" new.f = old.f + delta;\n"
" } while (atom_cmpxchg ( (volatile __global ulong *)val, old.i, new.i) != old.i);\n"
"}\n"
"kernel void atomic_reduce(\n"
" ulong n,\n"
" global const int * key,\n"
" global const double * val,\n"
" global double * sum\n"
")\n"
"{\n"
" for(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0))\n"
" AtomicAdd(sum + key[idx], val[idx]);\n"
"}\n",
"atomic_reduce"
初始 post 的两种方法和 doqtor work well. Basically there are two ways to implement them on doubles: using unions or using OpenCL as_type functions 的答案。答案末尾提供了 OpenCL 1.0 代码片段(对于 OpenCL 2.x,它们可以缩短,但 NVIDIA 目前还不支持它)。至于性能,我个人有 AMD OpenCL 在 Tahiti 芯片上实现的经验,所有这些变体产生或多或少相同的执行时间(as_ 和 union 变体甚至在大多数测试编译器上产生相同的优化 ISA 代码)。因此,使用一种或另一种变体是个人品味的问题。
// define REALDOUBLES for double precision, undefine for single
#if REALDOUBLES
// extensions needed
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#ifdef cl_khr_int64_base_atomics
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
// definitions
#define UINTVAR ulong
#define AS_INT as_ulong
#define AS_REAL as_double
#define ATOM_CMPXCHG atom_cmpxchg
#define ATOM_XCHG atom_xchg
#else
// extensions needed
#ifdef cl_khr_local_int32_base_atomics
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#endif
#ifdef cl_khr_global_int32_base_atomics
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
// definitions
#define UINTVAR uint
#define AS_INT as_uint
#define AS_REAL as_float
#define ATOM_CMPXCHG atomic_cmpxchg
#define ATOM_XCHG atomic_xchg
#endif
// as_ variants
// variant from GROMACS - https://streamhpc.com/blog/2016-02-09/atomic-operations-for-floats-in-opencl-improved/
inline void atomic_add_local(volatile local REAL * const source, const REAL operand) {
UINTVAR expected, current;
current = AS_INT(*source);
do {
expected = current;
current = ATOM_CMPXCHG((volatile local UINTVAR *)source, expected, AS_INT(AS_REAL(expected) + operand));
} while (current != expected);
}
// NVIDIA variant
inline void atomic_add_local(local REAL * const source, const REAL operand) {
UINTVAR old = AS_INT(operand);
while ((old = ATOM_XCHG((local UINTVAR *)source, AS_INT(AS_REAL(ATOM_XCHG((local UINTVAR *)source, AS_INT((REAL)0))) + AS_REAL(old)))) != AS_INT((REAL)0));
}
// union variants
typedef union {
UINTVAR intVal;
REAL floatVal;
} uni;
// NVIDIA variant
inline void atomic_add_local(local REAL * const source, const REAL operand) {
uni old, t, zero;
old.floatVal = operand;
zero.floatVal = 0;
do {
t.intVal = ATOM_XCHG((local UINTVAR *)source, zero.intVal);
t.floatVal += old.floatVal;
} while ((old.intVal = ATOM_XCHG((local UINTVAR *)source, t.intVal)) != zero.intVal);
}
// shortened variant from GROMACS - https://streamhpc.com/blog/2016-02-09/atomic-operations-for-floats-in-opencl-improved/
inline void atomic_add_local(volatile local REAL * const source, const REAL operand) {
uni expected, current;
current.floatVal = *source;
do {
expected.floatVal = current.floatVal;
current.floatVal = expected.floatVal + operand;
current.intVal = ATOM_CMPXCHG((volatile local UINTVAR *)source, expected.intVal, current.intVal);
} while (current.intVal != expected.intVal);
}
并且明显替代 local<->global
全局内存。