Dot product with PyCUDA and pinned memory
I am currently developing a dot product with pinned memory in PyCUDA, and I have a problem with large arrays.

I am working with:
- NVIDIA GTX 1060
- CUDA 9.1
- PyCUDA 2017.1.1

The code is:
#!/usr/bin/env python
import numpy as np
import argparse
import math
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
from time import time
dot_mod = SourceModule("""
__global__ void full_dot( double* v1, double* v2, double* out, int N ) {
    __shared__ double cache[ 1024 ];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    cache[ threadIdx.x ] = 0.f;
    while( i < N ) {
        cache[ threadIdx.x ] += v1[ i ] * v2[ i ];
        i += gridDim.x * blockDim.x;
    }
    __syncthreads(); // required because later on the current thread is accessing
                     // data written by another thread
    i = 1024 / 2;
    while( i > 0 ) {
        if( threadIdx.x < i ) cache[ threadIdx.x ] += cache[ threadIdx.x + i ];
        __syncthreads();
        i /= 2; //not sure bitwise operations are actually faster
    }
#ifndef NO_SYNC // serialized access to shared data;
    if( threadIdx.x == 0 ) atomicAdd( out, cache[ 0 ] );
#else // no sync, what most likely happens is:
      // 1) all threads read 0
      // 2) all threads write concurrently 16 (local block dot product)
    if( threadIdx.x == 0 ) *out += cache[ 0 ];
#endif
}
""")

def main(args):
    dot = dot_mod.get_function("full_dot")
    N = args.number
    BLOCK_SIZE = 1024
    BLOCKS = int(math.ceil(N/BLOCK_SIZE))
    THREADS_PER_BLOCK = BLOCK_SIZE

    # Time use of pinned host memory:
    x = drv.aligned_empty((N), dtype=np.float64, order='C')
    x = drv.register_host_memory(x, flags=drv.mem_host_register_flags.DEVICEMAP)
    x_gpu_ptr = np.intp(x.base.get_device_pointer())

    # Time use of pinned host memory:
    y = drv.aligned_empty((N), dtype=np.float64, order='C')
    y = drv.register_host_memory(y, flags=drv.mem_host_register_flags.DEVICEMAP)
    y_gpu_ptr = np.intp(y.base.get_device_pointer())

    # Time use of pinned host memory:
    z = drv.aligned_empty((1), dtype=np.float64, order='C')
    z = drv.register_host_memory(z, flags=drv.mem_host_register_flags.DEVICEMAP)
    z_gpu_ptr = np.intp(z.base.get_device_pointer())

    z[:] = np.zeros(1)
    x[:] = np.zeros(N)
    y[:] = np.zeros(N)
    x[:] = np.random.rand(N)
    y[:] = x[:]
    x_orig = x.copy()
    y_orig = y.copy()

    start = time()
    dot(x_gpu_ptr, y_gpu_ptr, z_gpu_ptr, np.uint32(N), block=(THREADS_PER_BLOCK, 1, 1), grid=(BLOCKS, 1))
    times = time()-start
    print "Average kernel GPU dot product execution time with pinned memory: %3.7f" % np.mean(times)

    start = time()
    ydot = np.dot(x_orig, y_orig)
    times = time()-start
    print "Average numpy dot product execution time: %3.7f" % np.mean(times)

    print N, ydot, z[0]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-n', dest='number', type=long, help="Number of samples ", required=True)
    args = parser.parse_args()
    main(args)
This code works fine when the sample arrays are roughly smaller than 1024*12 elements, but for larger arrays, such as 1024*1024, the GPU result is wrong:
➜ ./test_dot_pinned.py -n 16384
Average kernel GPU dot product execution time with pinned memory: 0.0001669
Average numpy dot product execution time: 0.0000119
16384 5468.09590706 5468.09590706
SIZE np.dot() GPU-dot-pinned
➜ ./test_dot_pinned.py -n 1048576
Average kernel GPU dot product execution time with pinned memory: 0.0002351
Average numpy dot product execution time: 0.0010922
1048576 349324.532564 258321.148593
SIZE np.dot() GPU-dot-pinned
Thanks everyone, I hope someone can help me.
pycuda does not enforce any synchronization after a kernel launch. Normally, if you do a device->host copy of the data after a kernel launch, that operation forces a synchronization, i.e. it forces the kernel to complete.

But there is no such synchronization in your code. Since you are using pinned memory, as the kernel execution time grows (because of the larger amount of work), eventually, by the time you print z[0], you only get a partial result, because the kernel has not finished yet at that point.

A side effect of this is that your kernel time measurement is not accurate either.

You can fix both issues by forcing the kernel to complete before finishing the time measurement:
dot(x_gpu_ptr, y_gpu_ptr, z_gpu_ptr, np.uint32(N), block=(THREADS_PER_BLOCK, 1, 1), grid=(BLOCKS,1))
#add the next line of code:
drv.Context.synchronize()
times = time()-start
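For completeness, here is a minimal sketch of an alternative way to time the kernel using CUDA events (not part of the original fix). It assumes the same dot, x_gpu_ptr, y_gpu_ptr, z_gpu_ptr, N, THREADS_PER_BLOCK and BLOCKS names from the code above; synchronizing on the end event also guarantees the kernel has finished before z[0] is read:

    start_evt = drv.Event()
    end_evt = drv.Event()

    start_evt.record()                    # record on the default stream, before the launch
    dot(x_gpu_ptr, y_gpu_ptr, z_gpu_ptr, np.uint32(N),
        block=(THREADS_PER_BLOCK, 1, 1), grid=(BLOCKS, 1))
    end_evt.record()                      # record right after the launch
    end_evt.synchronize()                 # block the host until the kernel and the event are done

    elapsed_ms = start_evt.time_till(end_evt)   # GPU-side elapsed time in milliseconds
    print "Kernel GPU dot product execution time (CUDA events): %3.7f s" % (elapsed_ms / 1e3)

Unlike time()-based measurements on the host, the event pair is recorded on the GPU timeline, so it measures only the kernel itself.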