无法正确打印两个值(python3.5+numba+CUDA8.0)
Two values can't be printed correctly (python3.5+numba+CUDA8.0)
有一个数组,我会在 GPU 中用它做一些计算。
在我计算之前,我应该得到这个数组的子集。
当我打印子集时,发现两个值不对。
代码如下:
import os,sys,time
import pandas as pd
import numpy as np
from numba import cuda, float32
# Tell Numba where the CUDA 8.0 NVVM DLL and the libdevice directory live
# (machine-specific Windows paths).
os.environ['NUMBAPRO_NVVM']=r'D:\NVIDIA GPU Computing Toolkit\CUDA\v8.0\nvvm\bin\nvvm64_31_0.dll'
os.environ['NUMBAPRO_LIBDEVICE']=r'D:\NVIDIA GPU Computing Toolkit\CUDA\v8.0\nvvm\libdevice'
# Launch configuration used when calcu_TE is invoked further down:
# first the grid dimensions (3 x 1 blocks), then the block dimensions (2 x 2 threads).
bpg = (3,1)
tpb = (2,2)
@cuda.jit
def calcu_TE(D,TE):
    """Debug kernel from the question: print rows of D and slices of them.

    Each block starts at the flattened pair index ``blockIdx.x`` and advances
    by ``gridDim.x`` until it passes ``D.shape[0] ** 2``.  A pair index is
    split into ``(c_x, c_y)``; when both are equal, ``0.0`` is stored in
    ``TE[0]``, otherwise rows ``c_x`` and ``c_y`` of ``D`` are read and, for
    block 1 only, printed.  Nothing else is computed.

    NOTE: the prints index elements 0..9 of each row, so D needs at least
    ten columns.
    """
    gw = cuda.gridDim.x
    bx = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    bw = cuda.blockDim.x  # not used below
    ty = cuda.threadIdx.y  # only used in the debug prints
    bh = cuda.blockDim.y  # not used below
    c_num = D.shape[0]
    #print(c_num)
    c_index = bx
    while c_index<c_num*c_num:
        # Split the flat index into a (row, row) pair: true division
        # truncated by int(), and the remainder.
        c_x = int(c_index/c_num)
        c_y = c_index%c_num
        if c_x==c_y:
            # Same row paired with itself: only store 0.0.
            TE[0] = 0.0
        else:
            X = D[c_x,:]
            Y = D[c_y,:]
            if bx==1 :
                # Every thread of block 1 prints the first ten values of both rows.
                print('c_index,bx,tx,ty,X: ',c_index,bx,tx,ty,' ',X[0],X[1],X[2],X[3],X[4],X[5],X[6],X[7],X[8],X[9])
                print('c_index,bx,tx,ty,Y: ',c_index,bx,tx,ty,' ',Y[0],Y[1],Y[2],Y[3],Y[4],Y[5],Y[6],Y[7],Y[8],Y[9])
                #print('c_index,bx,tx,ty,Y: ',c_index,bx,tx,ty,Y[0],Y[1],Y[2],Y[3],Y[4],Y[5],Y[6],Y[7],Y[8],Y[9])
            h = tx
            if h==0:
                # Only threads with threadIdx.x == 0 take the slices.
                Xi = X[1:]    # X without its first element
                Xi1 = X[:-1]  # X without its last element
                Yi = Y[1:]    # Y without its first element
                if bx==1 :
                    # Single very long print (3 labels + 27 values).  In the
                    # output reported with this code, the last two Yi values
                    # come out wrong.
                    print('bx,tx,ty: ',bx,tx,ty,'\n Xi',Xi[0],Xi[1],Xi[2],Xi[3],Xi[4],Xi[5],Xi[6],Xi[7],Xi[8],
                          '\n Xi1',Xi1[0],Xi1[1],Xi1[2],Xi1[3],Xi1[4],Xi1[5],Xi1[6],Xi1[7],Xi1[8],
                          '\n Yi',Yi[0],Yi[1],Yi[2],Yi[3],Yi[4],Yi[5],Yi[6],Yi[7],Yi[8])
        # Advance by the number of blocks in x so blocks cover disjoint pair indices.
        c_index +=gw
# Input: two rows of ten float32 values each.
D = np.array([[ 0.42487645,0.41607881,0.42027071,0.43751907,0.43512794,0.43656972,0.43940639,0.43864551,0.43447691,0.43120232],
              [2.989578,2.834707,2.942902,3.294948,2.868170,2.975180,3.066900,2.712719,2.835360,2.607334]], dtype=np.float32)
# 1x1 output buffer; np.empty leaves its contents uninitialised.
TE = np.empty([1,1])
print('D: ',D)
stream = cuda.stream()
# Copy both arrays to the device and launch the kernel on the same stream.
# NOTE(review): auto_synchronize() is expected to wait for the stream when the
# block exits - confirm against the Numba docs.  The result in dTE is never
# copied back to the host here.
with stream.auto_synchronize():
    dD = cuda.to_device(D, stream)
    dTE = cuda.to_device(TE, stream)
    calcu_TE[bpg, tpb, stream](dD,dTE)
输出为:
D: [[ 0.42487645 0.41607881 0.42027071 0.43751907 0.43512794 0.43656972
0.43940639 0.43864551 0.43447691 0.43120232]
[ 2.98957801 2.83470702 2.94290209 3.2949481 2.86817002 2.97517991
3.06690001 2.71271896 2.83536005 2.6073339 ]]
c_index,bx,tx,ty,X: 1 1 0 0 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
c_index,bx,tx,ty,X: 1 1 1 0 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
c_index,bx,tx,ty,X: 1 1 0 1 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
c_index,bx,tx,ty,X: 1 1 1 1 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
c_index,bx,tx,ty,Y: 1 1 0 0 2.989578 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334
c_index,bx,tx,ty,Y: 1 1 1 0 2.989578 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334
c_index,bx,tx,ty,Y: 1 1 0 1 2.989578 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334
c_index,bx,tx,ty,Y: 1 1 1 1 2.989578 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334
bx,tx,ty: 1 0 0
Xi 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
Xi1 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477
Yi 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 0.000000 18949972373983835000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
bx,tx,ty: 1 0 1
Xi 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
Xi1 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477
Yi 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 0.000000 18949972373983835000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
太奇怪了。
预期输出应该是 Yi 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334。
但是打印出来了Yi 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 0.000000 18949972373983835000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
。
有两个值错误。
我不知道为什么会这样。有什么我忽略的吗?
这似乎是 Numba 编译器在您的内核中为非常长的打印语句生成代码的方式的问题,与您的内核的正确性无关。如果您像这样更改代码(即缩短打印语句):
@cuda.jit
def calcu_TE(D,TE):
    """Variant of the debug kernel with a shortened final print.

    Each block starts at the flattened pair index ``blockIdx.x`` and advances
    by ``gridDim.x`` until it passes ``D.shape[0] ** 2``.  A pair index is
    split into ``(c_x, c_y)``; when both are equal, ``0.0`` is stored in
    ``TE[0]``, otherwise rows ``c_x`` and ``c_y`` of ``D`` are read and, for
    block 1 only, printed.  The last print outputs only the ``Yi`` slice.

    NOTE: the prints index elements 0..9 of each row, so D needs at least
    ten columns.
    """
    gw = cuda.gridDim.x
    bx = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    bw = cuda.blockDim.x  # not used below
    ty = cuda.threadIdx.y  # only used in the debug prints
    bh = cuda.blockDim.y  # not used below
    c_num = D.shape[0]
    c_index = bx
    while c_index<c_num*c_num:
        # Split the flat index into a (row, row) pair: true division
        # truncated by int(), and the remainder.
        c_x = int(c_index/c_num)
        c_y = c_index%c_num
        if c_x==c_y:
            # Same row paired with itself: only store 0.0.
            TE[0] = 0.0
        else:
            X = D[c_x,:]
            Y = D[c_y,:]
            if bx==1 :
                # Every thread of block 1 prints the first ten values of both rows.
                print('c_index,bx,tx,ty,X: ',c_index,bx,tx,ty,' ',X[0],X[1],X[2],X[3],X[4],X[5],X[6],X[7],X[8],X[9])
                print('c_index,bx,tx,ty,Y: ',c_index,bx,tx,ty,' ',Y[0],Y[1],Y[2],Y[3],Y[4],Y[5],Y[6],Y[7],Y[8],Y[9])
            h = tx
            if h==0:
                # Only threads with threadIdx.x == 0 take the slices.
                Xi = X[1:]    # X without its first element (not printed in this variant)
                Xi1 = X[:-1]  # X without its last element (not printed in this variant)
                Yi = Y[1:]    # Y without its first element
                if bx==1 :
                    # Shortened print: only the nine Yi values.
                    print('bx,tx,ty,Yi:',bx,tx,ty,' ',Yi[0],Yi[1],Yi[2],Yi[3],Yi[4],Yi[5],Yi[6],Yi[7],Yi[8])
        # Advance by the number of blocks in x so blocks cover disjoint pair indices.
        c_index +=gw
您应该会发现 Yi 打印正确。一般来说,依靠打印语句来调试(插桩)CUDA 内核是一个相当糟糕的主意,这样做只会让你自己感到困惑,就像这次的情况一样。
有一个数组,我会在 GPU 中用它做一些计算。
在我计算之前,我应该得到这个数组的子集。
当我打印子集时,发现两个值不对。
代码如下:
import os,sys,time
import pandas as pd
import numpy as np
from numba import cuda, float32
# Tell Numba where the CUDA 8.0 NVVM DLL and the libdevice directory live
# (machine-specific Windows paths).
os.environ['NUMBAPRO_NVVM']=r'D:\NVIDIA GPU Computing Toolkit\CUDA\v8.0\nvvm\bin\nvvm64_31_0.dll'
os.environ['NUMBAPRO_LIBDEVICE']=r'D:\NVIDIA GPU Computing Toolkit\CUDA\v8.0\nvvm\libdevice'
# Launch configuration used when calcu_TE is invoked further down:
# first the grid dimensions (3 x 1 blocks), then the block dimensions (2 x 2 threads).
bpg = (3,1)
tpb = (2,2)
@cuda.jit
def calcu_TE(D,TE):
    """Debug kernel from the question: print rows of D and slices of them.

    Each block starts at the flattened pair index ``blockIdx.x`` and advances
    by ``gridDim.x`` until it passes ``D.shape[0] ** 2``.  A pair index is
    split into ``(c_x, c_y)``; when both are equal, ``0.0`` is stored in
    ``TE[0]``, otherwise rows ``c_x`` and ``c_y`` of ``D`` are read and, for
    block 1 only, printed.  Nothing else is computed.

    NOTE: the prints index elements 0..9 of each row, so D needs at least
    ten columns.
    """
    gw = cuda.gridDim.x
    bx = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    bw = cuda.blockDim.x  # not used below
    ty = cuda.threadIdx.y  # only used in the debug prints
    bh = cuda.blockDim.y  # not used below
    c_num = D.shape[0]
    #print(c_num)
    c_index = bx
    while c_index<c_num*c_num:
        # Split the flat index into a (row, row) pair: true division
        # truncated by int(), and the remainder.
        c_x = int(c_index/c_num)
        c_y = c_index%c_num
        if c_x==c_y:
            # Same row paired with itself: only store 0.0.
            TE[0] = 0.0
        else:
            X = D[c_x,:]
            Y = D[c_y,:]
            if bx==1 :
                # Every thread of block 1 prints the first ten values of both rows.
                print('c_index,bx,tx,ty,X: ',c_index,bx,tx,ty,' ',X[0],X[1],X[2],X[3],X[4],X[5],X[6],X[7],X[8],X[9])
                print('c_index,bx,tx,ty,Y: ',c_index,bx,tx,ty,' ',Y[0],Y[1],Y[2],Y[3],Y[4],Y[5],Y[6],Y[7],Y[8],Y[9])
                #print('c_index,bx,tx,ty,Y: ',c_index,bx,tx,ty,Y[0],Y[1],Y[2],Y[3],Y[4],Y[5],Y[6],Y[7],Y[8],Y[9])
            h = tx
            if h==0:
                # Only threads with threadIdx.x == 0 take the slices.
                Xi = X[1:]    # X without its first element
                Xi1 = X[:-1]  # X without its last element
                Yi = Y[1:]    # Y without its first element
                if bx==1 :
                    # Single very long print (3 labels + 27 values).  In the
                    # output reported with this code, the last two Yi values
                    # come out wrong.
                    print('bx,tx,ty: ',bx,tx,ty,'\n Xi',Xi[0],Xi[1],Xi[2],Xi[3],Xi[4],Xi[5],Xi[6],Xi[7],Xi[8],
                          '\n Xi1',Xi1[0],Xi1[1],Xi1[2],Xi1[3],Xi1[4],Xi1[5],Xi1[6],Xi1[7],Xi1[8],
                          '\n Yi',Yi[0],Yi[1],Yi[2],Yi[3],Yi[4],Yi[5],Yi[6],Yi[7],Yi[8])
        # Advance by the number of blocks in x so blocks cover disjoint pair indices.
        c_index +=gw
# Input: two rows of ten float32 values each.
D = np.array([[ 0.42487645,0.41607881,0.42027071,0.43751907,0.43512794,0.43656972,0.43940639,0.43864551,0.43447691,0.43120232],
              [2.989578,2.834707,2.942902,3.294948,2.868170,2.975180,3.066900,2.712719,2.835360,2.607334]], dtype=np.float32)
# 1x1 output buffer; np.empty leaves its contents uninitialised.
TE = np.empty([1,1])
print('D: ',D)
stream = cuda.stream()
# Copy both arrays to the device and launch the kernel on the same stream.
# NOTE(review): auto_synchronize() is expected to wait for the stream when the
# block exits - confirm against the Numba docs.  The result in dTE is never
# copied back to the host here.
with stream.auto_synchronize():
    dD = cuda.to_device(D, stream)
    dTE = cuda.to_device(TE, stream)
    calcu_TE[bpg, tpb, stream](dD,dTE)
输出为:
D: [[ 0.42487645 0.41607881 0.42027071 0.43751907 0.43512794 0.43656972
0.43940639 0.43864551 0.43447691 0.43120232]
[ 2.98957801 2.83470702 2.94290209 3.2949481 2.86817002 2.97517991
3.06690001 2.71271896 2.83536005 2.6073339 ]]
c_index,bx,tx,ty,X: 1 1 0 0 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
c_index,bx,tx,ty,X: 1 1 1 0 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
c_index,bx,tx,ty,X: 1 1 0 1 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
c_index,bx,tx,ty,X: 1 1 1 1 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
c_index,bx,tx,ty,Y: 1 1 0 0 2.989578 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334
c_index,bx,tx,ty,Y: 1 1 1 0 2.989578 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334
c_index,bx,tx,ty,Y: 1 1 0 1 2.989578 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334
c_index,bx,tx,ty,Y: 1 1 1 1 2.989578 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334
bx,tx,ty: 1 0 0
Xi 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
Xi1 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477
Yi 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 0.000000 18949972373983835000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
bx,tx,ty: 1 0 1
Xi 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477 0.431202
Xi1 0.424876 0.416079 0.420271 0.437519 0.435128 0.436570 0.439406 0.438646 0.434477
Yi 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 0.000000 18949972373983835000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
太奇怪了。
预期输出应该是 Yi 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 2.835360 2.607334。
但是打印出来了Yi 2.834707 2.942902 3.294948 2.868170 2.975180 3.066900 2.712719 0.000000 18949972373983835000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
。
有两个值错误。
我不知道为什么会这样。有什么我忽略的吗?
这似乎是 Numba 编译器在您的内核中为非常长的打印语句生成代码的方式的问题,与您的内核的正确性无关。如果您像这样更改代码(即缩短打印语句):
@cuda.jit
def calcu_TE(D,TE):
    """Variant of the debug kernel with a shortened final print.

    Each block starts at the flattened pair index ``blockIdx.x`` and advances
    by ``gridDim.x`` until it passes ``D.shape[0] ** 2``.  A pair index is
    split into ``(c_x, c_y)``; when both are equal, ``0.0`` is stored in
    ``TE[0]``, otherwise rows ``c_x`` and ``c_y`` of ``D`` are read and, for
    block 1 only, printed.  The last print outputs only the ``Yi`` slice.

    NOTE: the prints index elements 0..9 of each row, so D needs at least
    ten columns.
    """
    gw = cuda.gridDim.x
    bx = cuda.blockIdx.x
    tx = cuda.threadIdx.x
    bw = cuda.blockDim.x  # not used below
    ty = cuda.threadIdx.y  # only used in the debug prints
    bh = cuda.blockDim.y  # not used below
    c_num = D.shape[0]
    c_index = bx
    while c_index<c_num*c_num:
        # Split the flat index into a (row, row) pair: true division
        # truncated by int(), and the remainder.
        c_x = int(c_index/c_num)
        c_y = c_index%c_num
        if c_x==c_y:
            # Same row paired with itself: only store 0.0.
            TE[0] = 0.0
        else:
            X = D[c_x,:]
            Y = D[c_y,:]
            if bx==1 :
                # Every thread of block 1 prints the first ten values of both rows.
                print('c_index,bx,tx,ty,X: ',c_index,bx,tx,ty,' ',X[0],X[1],X[2],X[3],X[4],X[5],X[6],X[7],X[8],X[9])
                print('c_index,bx,tx,ty,Y: ',c_index,bx,tx,ty,' ',Y[0],Y[1],Y[2],Y[3],Y[4],Y[5],Y[6],Y[7],Y[8],Y[9])
            h = tx
            if h==0:
                # Only threads with threadIdx.x == 0 take the slices.
                Xi = X[1:]    # X without its first element (not printed in this variant)
                Xi1 = X[:-1]  # X without its last element (not printed in this variant)
                Yi = Y[1:]    # Y without its first element
                if bx==1 :
                    # Shortened print: only the nine Yi values.
                    print('bx,tx,ty,Yi:',bx,tx,ty,' ',Yi[0],Yi[1],Yi[2],Yi[3],Yi[4],Yi[5],Yi[6],Yi[7],Yi[8])
        # Advance by the number of blocks in x so blocks cover disjoint pair indices.
        c_index +=gw
您应该会发现 Yi 打印正确。一般来说,依靠打印语句来调试(插桩)CUDA 内核是一个相当糟糕的主意,这样做只会让你自己感到困惑,就像这次的情况一样。