一起使用 pycuda 和 theano:pycuda._driver.LogicError
Using pycuda and theano together: pycuda._driver.LogicError
我有一些复杂的公式,直接用CUDA代码实现比较容易。另一方面,我需要利用theano的特性搭建一个神经网络,单独训练。
如何安全地同时使用pycuda和theano?
以下代码适用于我的机器:
import numpy as np
import pycuda.autoinit as cuauto
import pycuda.driver as cuda
import pycuda.compiler as cudacc
import pycuda.gpuarray as gpuarray
import theano
import theano.tensor as T
def get_pycuda_func():
    """Build a GPU element-wise multiply backed by a hand-written CUDA kernel.

    The kernel is compiled once, when this factory is called; the returned
    closure can then be invoked repeatedly.

    Returns:
        A function ``f(a, b)`` that multiplies two equal-length 1-D inputs
        element-wise on the GPU in double precision and returns the result
        as a ``numpy.ndarray`` of dtype ``float64``.
    """
    threads_per_block = 256

    # The kernel receives the element count and guards on it, so the launch
    # may use several blocks and the last block may be only partially used.
    mod = cudacc.SourceModule("""
    __global__ void mul(double *dest, const double *a, const double *b, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dest[i] = a[i] * b[i];
    }
    """)
    mul = mod.get_function("mul")
    # Three device pointers followed by a C int.
    mul.prepare("PPPi")

    def f(a, b):
        """Return ``a * b`` computed on the GPU.

        Args:
            a: 1-D array-like; converted to contiguous ``float64``.
            b: 1-D array-like with the same length as ``a``.

        Raises:
            ValueError: if the inputs are not 1-D or differ in length.
        """
        # The kernel reads raw doubles, so any other dtype would be
        # misinterpreted on the device.
        a = np.ascontiguousarray(a, dtype=np.float64)
        b = np.ascontiguousarray(b, dtype=np.float64)
        if a.ndim != 1 or a.shape != b.shape:
            raise ValueError(
                "a and b must be 1-D arrays of equal length, got shapes "
                "%r and %r" % (a.shape, b.shape)
            )
        n = a.shape[0]
        if n == 0:
            # A launch with zero blocks is not valid; nothing to compute.
            return np.empty((0,), dtype=np.float64)

        gpu_a = gpuarray.to_gpu(a)
        gpu_b = gpuarray.to_gpu(b)
        c = gpuarray.empty((n,), dtype=np.float64)
        n_blocks = (n + threads_per_block - 1) // threads_per_block
        # NOTE(review): this launch assumes the CUDA context in which the
        # module was compiled is the current one when f runs - confirm when
        # another library in the same process also manages GPU contexts.
        mul.prepared_call(
            (n_blocks, 1, 1), (threads_per_block, 1, 1),
            c.gpudata,
            gpu_a.gpudata,
            gpu_b.gpudata,
            n,
        )
        return c.get()

    return f
def get_theano_func():
    """Compile a Theano function that multiplies two vectors element-wise.

    Returns:
        The compiled callable. It takes two vectors and returns their
        product; inputs may be downcast to the symbolic dtype because
        ``allow_input_downcast=True`` is set.
    """
    lhs = T.vector('a')
    rhs = T.vector('b')
    return theano.function([lhs, rhs], lhs * rhs, allow_input_downcast=True)
def get_cpu_func():
    """Return the CPU reference implementation of the element-wise product."""

    def f(a, b):
        """Multiply ``a`` by ``b`` using the operands' own ``*`` operator."""
        product = a * b
        return product

    return f
if __name__ == "__main__":
    # Fixed seed so every run multiplies the same pair of vectors.
    np.random.seed(12345)
    a = np.random.randn(400)
    b = np.random.randn(400)

    f_cuda = get_pycuda_func()
    f_cpu = get_cpu_func()
    f_theano = get_theano_func()

    # Compare both GPU paths against the CPU reference ten times over.
    for k in range(10):
        from_cuda = f_cuda(a, b)
        from_theano = f_theano(a, b)
        reference = f_cpu(a, b)
        print(k)
        print(np.allclose(from_cuda, reference))
        print(np.allclose(from_theano, reference))
输出:
$ python3 test_theano_pycuda_simpler.py
Using cuDNN version 7003 on context None
Mapped name None to device cuda: GeForce GTX TITAN Black (0000:01:00.0)
0
True
True
1
True
True
2
True
True
3
True
True
4
True
True
5
True
True
6
True
True
7
True
True
8
True
True
9
True
True
但是如果我做一个更复杂的theano计算,它就不起作用了。以下不起作用:
import numpy as np
import pycuda.autoinit as cuauto
import pycuda.driver as cuda
import pycuda.compiler as cudacc
import pycuda.gpuarray as gpuarray
import theano
import theano.tensor as T
def get_pycuda_func():
    """Build a GPU element-wise multiply backed by a hand-written CUDA kernel.

    The kernel is compiled once, when this factory is called; the returned
    closure can then be invoked repeatedly.

    Returns:
        A function ``f(a, b)`` that multiplies two equal-length 1-D inputs
        element-wise on the GPU in double precision and returns the result
        as a ``numpy.ndarray`` of dtype ``float64``.
    """
    threads_per_block = 256

    # The kernel receives the element count and guards on it, so the launch
    # may use several blocks and the last block may be only partially used.
    mod = cudacc.SourceModule("""
    __global__ void mul(double *dest, const double *a, const double *b, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dest[i] = a[i] * b[i];
    }
    """)
    mul = mod.get_function("mul")
    # Three device pointers followed by a C int.
    mul.prepare("PPPi")

    def f(a, b):
        """Return ``a * b`` computed on the GPU.

        Args:
            a: 1-D array-like; converted to contiguous ``float64``.
            b: 1-D array-like with the same length as ``a``.

        Raises:
            ValueError: if the inputs are not 1-D or differ in length.
        """
        # The kernel reads raw doubles, so any other dtype would be
        # misinterpreted on the device.
        a = np.ascontiguousarray(a, dtype=np.float64)
        b = np.ascontiguousarray(b, dtype=np.float64)
        if a.ndim != 1 or a.shape != b.shape:
            raise ValueError(
                "a and b must be 1-D arrays of equal length, got shapes "
                "%r and %r" % (a.shape, b.shape)
            )
        n = a.shape[0]
        if n == 0:
            # A launch with zero blocks is not valid; nothing to compute.
            return np.empty((0,), dtype=np.float64)

        gpu_a = gpuarray.to_gpu(a)
        gpu_b = gpuarray.to_gpu(b)
        c = gpuarray.empty((n,), dtype=np.float64)
        n_blocks = (n + threads_per_block - 1) // threads_per_block
        # NOTE(review): this launch assumes the CUDA context in which the
        # module was compiled is the current one when f runs - confirm when
        # another library in the same process also manages GPU contexts.
        mul.prepared_call(
            (n_blocks, 1, 1), (threads_per_block, 1, 1),
            c.gpudata,
            gpu_a.gpudata,
            gpu_b.gpudata,
            n,
        )
        return c.get()

    return f
# Float dtype name taken from Theano's configuration; shared by the
# initialisers below.
floatX = theano.config.floatX


def init_bias(size):
    """Create a shared bias vector with ``size`` entries.

    The initial values come from ``np.random.rand`` and are cast to
    ``floatX`` before being wrapped in a Theano shared variable.
    """
    initial = np.asarray(np.random.rand(size), dtype=floatX)
    return theano.shared(initial)
def init_weights(in_size, out_size):
    """Create a shared ``(in_size, out_size)`` weight matrix.

    Entries are drawn from a zero-mean normal distribution whose standard
    deviation is ``sqrt(2 / (in_size + out_size))``, then cast to ``floatX``.
    """
    std = np.sqrt(2. / (in_size + out_size))
    initial = np.random.normal(loc=0., scale=std, size=(in_size, out_size))
    return theano.shared(np.asarray(initial, dtype=floatX))
def adam(params, gparams, learning_rate=0.0001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Build the list of Theano update pairs for one Adam step.

    Args:
        params: shared variables to optimise.
        gparams: gradient expressions, one per entry of ``params``.
        learning_rate: base step size.
        beta1: decay factor of the first-moment running average.
        beta2: decay factor of the second-moment running average.
        epsilon: constant added to the denominator of the step.

    Returns:
        A list of ``(shared_variable, new_expression)`` pairs: for every
        parameter its two moment accumulators and the parameter itself,
        followed by a single pair advancing the shared step counter.
    """
    step_count = theano.shared(np.asarray(.0, dtype=theano.config.floatX))
    next_step = step_count + 1
    # Step size with both bias-correction factors folded in.
    scaled_lr = learning_rate * T.sqrt(1 - beta2 ** next_step) / (1 - beta1 ** next_step)

    updates = []
    for param, grad in zip(params, gparams):
        current = param.get_value(borrow=True)
        # Moment accumulators start at zero and mirror the parameter's
        # shape, dtype and broadcast pattern.
        first_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=param.broadcastable,
        )
        second_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=param.broadcastable,
        )
        new_first = beta1 * first_moment + (1 - beta1) * grad
        new_second = beta2 * second_moment + (1 - beta2) * grad ** 2
        new_param = param - scaled_lr * new_first / (T.sqrt(new_second) + epsilon)
        updates.extend([
            (first_moment, new_first),
            (second_moment, new_second),
            (param, new_param),
        ])

    # The step counter is shared by all parameters, so it advances once.
    updates.append((step_count, next_step))
    return updates
class test_network:
    """Fully connected ReLU network mapping one input feature to one output.

    Construction creates the parameters and compiles two Theano functions:
    ``train(x, y_in)`` performs one Adam step on the mean squared error and
    returns the loss, and ``f(x)`` returns the prediction.
    """

    def __init__(self, hidden=None):
        """Create parameters and compile the training and prediction functions.

        Args:
            hidden: sequence of hidden-layer widths; any number of layers is
                accepted. Defaults to two layers of 100 units.
        """
        # A list literal as the default value would be one object shared by
        # every instance, hence the None sentinel and the copy.
        self.hidden = [100, 100] if hidden is None else list(hidden)
        self._create_params()
        self._create_train_func()
        self._create_func()

    def _create_params(self):
        """Initialise ``self.params`` as all weight matrices, then all biases."""
        sizes = [1] + self.hidden + [1]
        # Weights are created before biases so the order of random draws
        # matches the layout of self.params.
        weights = [
            init_weights(n_in, n_out)
            for n_in, n_out in zip(sizes[:-1], sizes[1:])
        ]
        biases = [init_bias(n_out) for n_out in sizes[1:]]
        self.params = weights + biases

    def predict(self, x):
        """Return the symbolic network output for the symbolic input ``x``.

        Every layer except the last is followed by a ReLU; the last layer
        is purely affine.
        """
        n_layers = len(self.params) // 2
        weights = self.params[:n_layers]
        biases = self.params[n_layers:]
        h = x
        for W, b in zip(weights[:-1], biases[:-1]):
            h = T.nnet.relu(T.dot(h, W) + b)
        return T.dot(h, weights[-1]) + biases[-1]

    def _create_func(self):
        """Compile ``self.f``, the prediction function."""
        x = T.matrix('x')
        y = self.predict(x)
        self.f = theano.function([x], y, allow_input_downcast=True)

    def _create_train_func(self):
        """Compile ``self.train``: one Adam step on the MSE, returning the loss."""
        y_in = T.matrix('y_in')
        x = T.matrix('x')
        y = self.predict(x)
        loss = T.mean((y - y_in) * (y - y_in))
        grad_loss = T.grad(loss, self.params)
        updates = adam(self.params, grad_loss)
        self.train = theano.function(inputs=[x, y_in],
                                     outputs=loss,
                                     updates=updates,
                                     allow_input_downcast=True)
def get_cpu_func():
    """Return the CPU reference implementation of the element-wise product."""

    def f(a, b):
        """Multiply ``a`` by ``b`` using the operands' own ``*`` operator."""
        product = a * b
        return product

    return f
if __name__ == "__main__":
    np.random.seed(12345)
    a = np.random.randn(400)
    b = np.random.randn(400)
    f_cuda = get_pycuda_func()
    f_cpu = get_cpu_func()
    # Bound to `net`, not `T`: `T` is this module's alias for theano.tensor,
    # and test_network.predict looks it up at call time, so rebinding it
    # would break any later symbolic use of the network.
    net = test_network()

    # Check the PyCUDA kernel against the CPU reference.
    for k in range(10):
        x = f_cuda(a, b)
        z = f_cpu(a, b)
        print(k)
        print(np.allclose(x, z))

    # Train the network to fit y = x^2 on random mini-batches.
    batch_size = 256
    for k in range(1000):
        x = np.random.rand(batch_size).reshape(batch_size, 1)
        y = x * x
        loss = net.train(x, y)
        print("k=%d, loss=%g" % (k, loss))
我会得到:
$ python3 test_theano_pycuda.py
Using cuDNN version 7003 on context None
Mapped name None to device cuda: GeForce GTX TITAN Black (0000:01:00.0)
Traceback (most recent call last):
File "test_theano_pycuda.py", line 160, in <module>
x = f_cuda(a,b)
File "test_theano_pycuda.py", line 32, in f
gpu_b.gpudata
File "/usr/local/lib/python3.5/dist-packages/pycuda-2017.1.1-py3.5-linux-x86_64.egg/pycuda/driver.py", line 447, in function_prepared_call
func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle
我确信我的 test_theano_pycuda.py 本身没有问题,因为我已经通过修改 ~/.theanorc 强制 theano 使用 CPU 而不是 cuda 对它进行过测试。
由此我推测,问题应该与 pycuda 和 theano 在同一个进程中各自创建上下文有关。
# Attach PyCUDA while the existing GpuArray context is the active one.
# NOTE(review): gpuarray_cuda_context is not defined in this snippet; it
# stands for the context of an existing GpuArray variable - confirm how to
# obtain it in your setup.
with gpuarray_cuda_context:
    pycuda_context = pycuda.driver.Context.attach()
那个 gpuarray_cuda_context 是从哪里来的?有没有我可以测试的可行示例?
这里的 gpuarray_cuda_context 只是 GpuArray 变量的现有上下文。
例如,您可以在 theano/gpuarray/fft.py 中找到一个例子;我认为 skcuda.misc.init() 会调用 pycuda.driver.Context.attach() 或做类似的事情。
我有一些复杂的公式,直接用CUDA代码实现比较容易。另一方面,我需要利用theano的特性搭建一个神经网络,单独训练。
如何安全地同时使用pycuda和theano?
以下代码适用于我的机器:
import numpy as np
import pycuda.autoinit as cuauto
import pycuda.driver as cuda
import pycuda.compiler as cudacc
import pycuda.gpuarray as gpuarray
import theano
import theano.tensor as T
def get_pycuda_func():
    """Build a GPU element-wise multiply backed by a hand-written CUDA kernel.

    The kernel is compiled once, when this factory is called; the returned
    closure can then be invoked repeatedly.

    Returns:
        A function ``f(a, b)`` that multiplies two equal-length 1-D inputs
        element-wise on the GPU in double precision and returns the result
        as a ``numpy.ndarray`` of dtype ``float64``.
    """
    threads_per_block = 256

    # The kernel receives the element count and guards on it, so the launch
    # may use several blocks and the last block may be only partially used.
    mod = cudacc.SourceModule("""
    __global__ void mul(double *dest, const double *a, const double *b, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dest[i] = a[i] * b[i];
    }
    """)
    mul = mod.get_function("mul")
    # Three device pointers followed by a C int.
    mul.prepare("PPPi")

    def f(a, b):
        """Return ``a * b`` computed on the GPU.

        Args:
            a: 1-D array-like; converted to contiguous ``float64``.
            b: 1-D array-like with the same length as ``a``.

        Raises:
            ValueError: if the inputs are not 1-D or differ in length.
        """
        # The kernel reads raw doubles, so any other dtype would be
        # misinterpreted on the device.
        a = np.ascontiguousarray(a, dtype=np.float64)
        b = np.ascontiguousarray(b, dtype=np.float64)
        if a.ndim != 1 or a.shape != b.shape:
            raise ValueError(
                "a and b must be 1-D arrays of equal length, got shapes "
                "%r and %r" % (a.shape, b.shape)
            )
        n = a.shape[0]
        if n == 0:
            # A launch with zero blocks is not valid; nothing to compute.
            return np.empty((0,), dtype=np.float64)

        gpu_a = gpuarray.to_gpu(a)
        gpu_b = gpuarray.to_gpu(b)
        c = gpuarray.empty((n,), dtype=np.float64)
        n_blocks = (n + threads_per_block - 1) // threads_per_block
        # NOTE(review): this launch assumes the CUDA context in which the
        # module was compiled is the current one when f runs - confirm when
        # another library in the same process also manages GPU contexts.
        mul.prepared_call(
            (n_blocks, 1, 1), (threads_per_block, 1, 1),
            c.gpudata,
            gpu_a.gpudata,
            gpu_b.gpudata,
            n,
        )
        return c.get()

    return f
def get_theano_func():
    """Compile a Theano function that multiplies two vectors element-wise.

    Returns:
        The compiled callable. It takes two vectors and returns their
        product; inputs may be downcast to the symbolic dtype because
        ``allow_input_downcast=True`` is set.
    """
    lhs = T.vector('a')
    rhs = T.vector('b')
    return theano.function([lhs, rhs], lhs * rhs, allow_input_downcast=True)
def get_cpu_func():
    """Return the CPU reference implementation of the element-wise product."""

    def f(a, b):
        """Multiply ``a`` by ``b`` using the operands' own ``*`` operator."""
        product = a * b
        return product

    return f
if __name__ == "__main__":
    # Fixed seed so every run multiplies the same pair of vectors.
    np.random.seed(12345)
    a = np.random.randn(400)
    b = np.random.randn(400)

    f_cuda = get_pycuda_func()
    f_cpu = get_cpu_func()
    f_theano = get_theano_func()

    # Compare both GPU paths against the CPU reference ten times over.
    for k in range(10):
        from_cuda = f_cuda(a, b)
        from_theano = f_theano(a, b)
        reference = f_cpu(a, b)
        print(k)
        print(np.allclose(from_cuda, reference))
        print(np.allclose(from_theano, reference))
输出:
$ python3 test_theano_pycuda_simpler.py
Using cuDNN version 7003 on context None
Mapped name None to device cuda: GeForce GTX TITAN Black (0000:01:00.0)
0
True
True
1
True
True
2
True
True
3
True
True
4
True
True
5
True
True
6
True
True
7
True
True
8
True
True
9
True
True
但是如果我做一个更复杂的theano计算,它就不起作用了。以下不起作用:
import numpy as np
import pycuda.autoinit as cuauto
import pycuda.driver as cuda
import pycuda.compiler as cudacc
import pycuda.gpuarray as gpuarray
import theano
import theano.tensor as T
def get_pycuda_func():
    """Build a GPU element-wise multiply backed by a hand-written CUDA kernel.

    The kernel is compiled once, when this factory is called; the returned
    closure can then be invoked repeatedly.

    Returns:
        A function ``f(a, b)`` that multiplies two equal-length 1-D inputs
        element-wise on the GPU in double precision and returns the result
        as a ``numpy.ndarray`` of dtype ``float64``.
    """
    threads_per_block = 256

    # The kernel receives the element count and guards on it, so the launch
    # may use several blocks and the last block may be only partially used.
    mod = cudacc.SourceModule("""
    __global__ void mul(double *dest, const double *a, const double *b, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dest[i] = a[i] * b[i];
    }
    """)
    mul = mod.get_function("mul")
    # Three device pointers followed by a C int.
    mul.prepare("PPPi")

    def f(a, b):
        """Return ``a * b`` computed on the GPU.

        Args:
            a: 1-D array-like; converted to contiguous ``float64``.
            b: 1-D array-like with the same length as ``a``.

        Raises:
            ValueError: if the inputs are not 1-D or differ in length.
        """
        # The kernel reads raw doubles, so any other dtype would be
        # misinterpreted on the device.
        a = np.ascontiguousarray(a, dtype=np.float64)
        b = np.ascontiguousarray(b, dtype=np.float64)
        if a.ndim != 1 or a.shape != b.shape:
            raise ValueError(
                "a and b must be 1-D arrays of equal length, got shapes "
                "%r and %r" % (a.shape, b.shape)
            )
        n = a.shape[0]
        if n == 0:
            # A launch with zero blocks is not valid; nothing to compute.
            return np.empty((0,), dtype=np.float64)

        gpu_a = gpuarray.to_gpu(a)
        gpu_b = gpuarray.to_gpu(b)
        c = gpuarray.empty((n,), dtype=np.float64)
        n_blocks = (n + threads_per_block - 1) // threads_per_block
        # NOTE(review): this launch assumes the CUDA context in which the
        # module was compiled is the current one when f runs - confirm when
        # another library in the same process also manages GPU contexts.
        mul.prepared_call(
            (n_blocks, 1, 1), (threads_per_block, 1, 1),
            c.gpudata,
            gpu_a.gpudata,
            gpu_b.gpudata,
            n,
        )
        return c.get()

    return f
# Float dtype name taken from Theano's configuration; shared by the
# initialisers below.
floatX = theano.config.floatX


def init_bias(size):
    """Create a shared bias vector with ``size`` entries.

    The initial values come from ``np.random.rand`` and are cast to
    ``floatX`` before being wrapped in a Theano shared variable.
    """
    initial = np.asarray(np.random.rand(size), dtype=floatX)
    return theano.shared(initial)
def init_weights(in_size, out_size):
    """Create a shared ``(in_size, out_size)`` weight matrix.

    Entries are drawn from a zero-mean normal distribution whose standard
    deviation is ``sqrt(2 / (in_size + out_size))``, then cast to ``floatX``.
    """
    std = np.sqrt(2. / (in_size + out_size))
    initial = np.random.normal(loc=0., scale=std, size=(in_size, out_size))
    return theano.shared(np.asarray(initial, dtype=floatX))
def adam(params, gparams, learning_rate=0.0001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Build the list of Theano update pairs for one Adam step.

    Args:
        params: shared variables to optimise.
        gparams: gradient expressions, one per entry of ``params``.
        learning_rate: base step size.
        beta1: decay factor of the first-moment running average.
        beta2: decay factor of the second-moment running average.
        epsilon: constant added to the denominator of the step.

    Returns:
        A list of ``(shared_variable, new_expression)`` pairs: for every
        parameter its two moment accumulators and the parameter itself,
        followed by a single pair advancing the shared step counter.
    """
    step_count = theano.shared(np.asarray(.0, dtype=theano.config.floatX))
    next_step = step_count + 1
    # Step size with both bias-correction factors folded in.
    scaled_lr = learning_rate * T.sqrt(1 - beta2 ** next_step) / (1 - beta1 ** next_step)

    updates = []
    for param, grad in zip(params, gparams):
        current = param.get_value(borrow=True)
        # Moment accumulators start at zero and mirror the parameter's
        # shape, dtype and broadcast pattern.
        first_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=param.broadcastable,
        )
        second_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=param.broadcastable,
        )
        new_first = beta1 * first_moment + (1 - beta1) * grad
        new_second = beta2 * second_moment + (1 - beta2) * grad ** 2
        new_param = param - scaled_lr * new_first / (T.sqrt(new_second) + epsilon)
        updates.extend([
            (first_moment, new_first),
            (second_moment, new_second),
            (param, new_param),
        ])

    # The step counter is shared by all parameters, so it advances once.
    updates.append((step_count, next_step))
    return updates
class test_network:
    """Fully connected ReLU network mapping one input feature to one output.

    Construction creates the parameters and compiles two Theano functions:
    ``train(x, y_in)`` performs one Adam step on the mean squared error and
    returns the loss, and ``f(x)`` returns the prediction.
    """

    def __init__(self, hidden=None):
        """Create parameters and compile the training and prediction functions.

        Args:
            hidden: sequence of hidden-layer widths; any number of layers is
                accepted. Defaults to two layers of 100 units.
        """
        # A list literal as the default value would be one object shared by
        # every instance, hence the None sentinel and the copy.
        self.hidden = [100, 100] if hidden is None else list(hidden)
        self._create_params()
        self._create_train_func()
        self._create_func()

    def _create_params(self):
        """Initialise ``self.params`` as all weight matrices, then all biases."""
        sizes = [1] + self.hidden + [1]
        # Weights are created before biases so the order of random draws
        # matches the layout of self.params.
        weights = [
            init_weights(n_in, n_out)
            for n_in, n_out in zip(sizes[:-1], sizes[1:])
        ]
        biases = [init_bias(n_out) for n_out in sizes[1:]]
        self.params = weights + biases

    def predict(self, x):
        """Return the symbolic network output for the symbolic input ``x``.

        Every layer except the last is followed by a ReLU; the last layer
        is purely affine.
        """
        n_layers = len(self.params) // 2
        weights = self.params[:n_layers]
        biases = self.params[n_layers:]
        h = x
        for W, b in zip(weights[:-1], biases[:-1]):
            h = T.nnet.relu(T.dot(h, W) + b)
        return T.dot(h, weights[-1]) + biases[-1]

    def _create_func(self):
        """Compile ``self.f``, the prediction function."""
        x = T.matrix('x')
        y = self.predict(x)
        self.f = theano.function([x], y, allow_input_downcast=True)

    def _create_train_func(self):
        """Compile ``self.train``: one Adam step on the MSE, returning the loss."""
        y_in = T.matrix('y_in')
        x = T.matrix('x')
        y = self.predict(x)
        loss = T.mean((y - y_in) * (y - y_in))
        grad_loss = T.grad(loss, self.params)
        updates = adam(self.params, grad_loss)
        self.train = theano.function(inputs=[x, y_in],
                                     outputs=loss,
                                     updates=updates,
                                     allow_input_downcast=True)
def get_cpu_func():
    """Return the CPU reference implementation of the element-wise product."""

    def f(a, b):
        """Multiply ``a`` by ``b`` using the operands' own ``*`` operator."""
        product = a * b
        return product

    return f
if __name__ == "__main__":
    np.random.seed(12345)
    a = np.random.randn(400)
    b = np.random.randn(400)
    f_cuda = get_pycuda_func()
    f_cpu = get_cpu_func()
    # Bound to `net`, not `T`: `T` is this module's alias for theano.tensor,
    # and test_network.predict looks it up at call time, so rebinding it
    # would break any later symbolic use of the network.
    net = test_network()

    # Check the PyCUDA kernel against the CPU reference.
    for k in range(10):
        x = f_cuda(a, b)
        z = f_cpu(a, b)
        print(k)
        print(np.allclose(x, z))

    # Train the network to fit y = x^2 on random mini-batches.
    batch_size = 256
    for k in range(1000):
        x = np.random.rand(batch_size).reshape(batch_size, 1)
        y = x * x
        loss = net.train(x, y)
        print("k=%d, loss=%g" % (k, loss))
我会得到:
$ python3 test_theano_pycuda.py
Using cuDNN version 7003 on context None
Mapped name None to device cuda: GeForce GTX TITAN Black (0000:01:00.0)
Traceback (most recent call last):
File "test_theano_pycuda.py", line 160, in <module>
x = f_cuda(a,b)
File "test_theano_pycuda.py", line 32, in f
gpu_b.gpudata
File "/usr/local/lib/python3.5/dist-packages/pycuda-2017.1.1-py3.5-linux-x86_64.egg/pycuda/driver.py", line 447, in function_prepared_call
func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle
我确信我的 test_theano_pycuda.py 本身没有问题,因为我已经通过修改 ~/.theanorc 强制 theano 使用 CPU 而不是 cuda 对它进行过测试。
由此我推测,问题应该与 pycuda 和 theano 在同一个进程中各自创建上下文有关。
# Attach PyCUDA while the existing GpuArray context is the active one.
# NOTE(review): gpuarray_cuda_context is not defined in this snippet; it
# stands for the context of an existing GpuArray variable - confirm how to
# obtain it in your setup.
with gpuarray_cuda_context:
    pycuda_context = pycuda.driver.Context.attach()
那个 gpuarray_cuda_context 是从哪里来的?有没有我可以测试的可行示例?
这里的 gpuarray_cuda_context 只是 GpuArray 变量的现有上下文。
例如,您可以在 theano/gpuarray/fft.py 中找到一个例子;我认为 skcuda.misc.init() 会调用 pycuda.driver.Context.attach() 或做类似的事情。