Tensorflow:如何利用多 GPU?
Tensorflow: How to take advantage of multi GPUs?
我有一个 CNN,之前在一台配备 1 个 GPU 的计算机上运行。现在我换到了另一台有 2 个 GPU 的计算机,想同时使用两个 GPU 来训练我的网络以节省时间。我该怎么做?
我读了 https://www.tensorflow.org/tutorials/using_gpu ,但我认为其中的例子太简单了,老实说我不知道如何把它应用到我的真实网络上。
谁能以我的网络为例,给我一个简单的示例吗?(我正在做自动编码器。)
非常感谢!
graphCNN = tf.Graph()
with graphCNN.as_default():
    # Network input: a batch of images.
    x = tf.placeholder(tf.float32, shape=(None, img_w, img_h, img_ch), name="X")
    # Expected output (reconstruction target), declared with the same shape as the input.
    y_ = tf.placeholder(tf.float32, shape=(None, img_w, img_h, img_ch), name="Y")
    # Dropout placeholder; it is not consumed anywhere in this snippet.
    dropout = tf.placeholder(tf.float32)

    def model(data):
        """Build the autoencoder graph on top of ``data`` and return its output tensor.

        The encoder is a chain of eight ``ConvLayer`` calls, the decoder a chain
        of eight ``DeconvLayer`` calls, followed by one final ``ConvLayer`` that
        maps 64 channels back to 1. Every layer uses ``kernel_size=3`` and
        ``acti=True``.
        """
        # (name, depth_in, depth_out) for each encoder layer, in call order.
        encoder_spec = (
            ("c64", 1, 64),
            ("c128", 64, 128),
            ("c256", 128, 256),
            ("c512_1", 256, 512),
            ("c512_2", 512, 512),
            ("c512_3", 512, 512),
            ("c512_4", 512, 512),
            ("c512_5", 512, 512),
        )
        # (name, depth_in, depth_out) for each decoder layer, in call order.
        decoder_spec = (
            ("dc512_5", 512, 512),
            ("dc512_4", 512, 512),
            ("dc512_3", 512, 512),
            ("dc512_2", 512, 512),
            ("dc512_1", 512, 512),
            ("dc256", 512, 256),
            ("dc128", 256, 128),
            ("dc64", 128, 64),
        )

        net = data
        for layer_name, n_in, n_out in encoder_spec:
            net = ConvLayer(net, depth_in=n_in, depth_out=n_out, name=layer_name,
                            kernel_size=3, acti=True)
        for layer_name, n_in, n_out in decoder_spec:
            net = DeconvLayer(net, depth_in=n_in, depth_out=n_out, name=layer_name,
                              kernel_size=3, acti=True)
        # NOTE(review): ConvLayer ends with max_pool(result, 2), so this last
        # layer presumably shrinks the output relative to the input -- confirm
        # that y and y_ really have matching shapes.
        return ConvLayer(net, depth_in=64, depth_out=1, name="conv_out",
                         kernel_size=3, acti=True)

    # Predictions
    y = model(x)
    y_image = tf.reshape(y, [-1, img_w, img_h, 1])
    tf.summary.image('output', y_image, 6)

    # Loss: sum of squared errors over the whole batch, divided by the number
    # of values in ONE image (img_w * img_h * img_ch). The original comment
    # calls this "MSE"; note that the batch size is not part of the divisor.
    loss = tf.reduce_sum(tf.pow(y - y_, 2)) / (img_w * img_h * img_ch)
    loss_summary = tf.summary.scalar("Training_Loss", loss)

    # Optimizer
    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(learning_rate=learn_rate).minimize(loss)
如果您想查看更多细节,下面是各层的定义:
def ConvLayer(input, depth_in, depth_out, name="conv", kernel_size=3, acti=True):
    """Build a 2-D convolution layer followed by a ``max_pool(..., 2)`` call.

    Creates a weight variable ``W`` (truncated normal, stddev 0.1) and a bias
    variable ``B`` (constant 0.1) inside ``tf.name_scope(name)``, convolves
    ``input`` with stride 1 and SAME padding, adds the bias, optionally applies
    ReLU, and records histogram summaries for weights, biases and activations.

    Args:
        input: Tensor passed to ``tf.nn.conv2d``; assumed to be NHWC with
            ``depth_in`` channels -- TODO confirm against callers.
        depth_in: Number of input channels of the filter.
        depth_out: Number of output channels of the filter (and bias length).
        name: Name scope under which the layer's ops are created.
        kernel_size: Height and width of the square filter.
        acti: When truthy, apply ReLU after the bias add.

    Returns:
        Whatever ``max_pool(result, 2)`` returns; ``max_pool`` is a helper
        defined elsewhere (presumably 2x2 max pooling -- verify).
    """
    with tf.name_scope(name):
        w = tf.Variable(
            tf.truncated_normal([kernel_size, kernel_size, depth_in, depth_out],
                                stddev=0.1),
            name="W",
        )
        b = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")
        conv = tf.nn.conv2d(input, w, strides=[1, 1, 1, 1], padding="SAME")
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)

        # The bias add is common to both paths; only the ReLU is optional.
        result = conv + b
        if acti:  # truthiness test instead of the non-idiomatic `acti == True`
            result = tf.nn.relu(result)
            tf.summary.histogram("activations", result)

        return max_pool(result, 2)
.
def DeconvLayer(input, depth_in, depth_out, name="deconv", kernel_size=3, acti=True):
    """Build a transposed-convolution layer that doubles height and width.

    Creates a weight variable ``W`` of shape
    ``[kernel_size, kernel_size, depth_out, depth_in]`` (truncated normal,
    stddev 0.1) and a bias variable ``B`` (constant 0.1) inside
    ``tf.name_scope(name)``, applies ``tf.nn.conv2d_transpose`` with SAME
    padding, adds the bias, optionally applies ReLU, and records histogram
    summaries for weights, biases and activations.

    Fixes relative to the previous version:
      * The requested output is twice the input's height/width, which with
        SAME padding requires a spatial stride of 2; the stride used to be 1,
        which is inconsistent with the doubled ``output_shape``.
      * The output channel count is now ``depth_out`` (matching the filter)
        instead of ``input_shape[3] // 2``, which was wrong for every layer
        whose ``depth_out`` is not exactly half of ``depth_in``.

    Args:
        input: Tensor passed to ``tf.nn.conv2d_transpose``; assumed to be NHWC
            with ``depth_in`` channels -- TODO confirm against callers.
        depth_in: Number of channels of ``input``.
        depth_out: Number of channels of the result (and bias length).
        name: Name scope under which the layer's ops are created.
        kernel_size: Height and width of the square filter.
        acti: When truthy, apply ReLU after the bias add.

    Returns:
        The (optionally ReLU-activated) transposed-convolution output tensor.
    """
    with tf.name_scope(name):
        # conv2d_transpose expects the filter as [h, w, out_channels, in_channels].
        w = tf.Variable(
            tf.truncated_normal([kernel_size, kernel_size, depth_out, depth_in],
                                stddev=0.1),
            name="W",
        )
        b = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")

        # Batch size and spatial dims are read at run time so the layer works
        # with a dynamic batch dimension.
        input_shape = tf.shape(input)
        output_shape = tf.stack([
            input_shape[0],
            input_shape[1] * 2,
            input_shape[2] * 2,
            depth_out,
        ])
        deconv = tf.nn.conv2d_transpose(input, w, output_shape,
                                        strides=[1, 2, 2, 1], padding='SAME')
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)

        # The bias add is common to both paths; only the ReLU is optional.
        result = deconv + b
        if acti:  # truthiness test instead of the non-idiomatic `acti == True`
            result = tf.nn.relu(result)
            tf.summary.histogram("activations", result)

        return result
如何在多个 GPU 上实现 CNN(卷积神经网络)?
引自“Training a Model Using Multiple GPU Cards”(来自 Tensorflow 的教程)
- Place an individual model replica on each GPU.
- Update model parameters synchronously by waiting for all GPUs to finish processing a batch of data.
要了解主内存、CPU 与 GPU 之间的数据流以进一步提高性能,请参阅此回答:“Why should preprocessing be done on CPU rather than GPU?”
我有一个 CNN,之前在一台配备 1 个 GPU 的计算机上运行。现在我换到了另一台有 2 个 GPU 的计算机,想同时使用两个 GPU 来训练我的网络以节省时间。我该怎么做?
我读了 https://www.tensorflow.org/tutorials/using_gpu ,但我认为其中的例子太简单了,老实说我不知道如何把它应用到我的真实网络上。
谁能以我的网络为例,给我一个简单的示例吗?(我正在做自动编码器。)
非常感谢!
graphCNN = tf.Graph()
with graphCNN.as_default():
    # Network input: a batch of images.
    x = tf.placeholder(tf.float32, shape=(None, img_w, img_h, img_ch), name="X")
    # Expected output (reconstruction target), declared with the same shape as the input.
    y_ = tf.placeholder(tf.float32, shape=(None, img_w, img_h, img_ch), name="Y")
    # Dropout placeholder; it is not consumed anywhere in this snippet.
    dropout = tf.placeholder(tf.float32)

    def model(data):
        """Build the autoencoder graph on top of ``data`` and return its output tensor.

        The encoder is a chain of eight ``ConvLayer`` calls, the decoder a chain
        of eight ``DeconvLayer`` calls, followed by one final ``ConvLayer`` that
        maps 64 channels back to 1. Every layer uses ``kernel_size=3`` and
        ``acti=True``.
        """
        # (name, depth_in, depth_out) for each encoder layer, in call order.
        encoder_spec = (
            ("c64", 1, 64),
            ("c128", 64, 128),
            ("c256", 128, 256),
            ("c512_1", 256, 512),
            ("c512_2", 512, 512),
            ("c512_3", 512, 512),
            ("c512_4", 512, 512),
            ("c512_5", 512, 512),
        )
        # (name, depth_in, depth_out) for each decoder layer, in call order.
        decoder_spec = (
            ("dc512_5", 512, 512),
            ("dc512_4", 512, 512),
            ("dc512_3", 512, 512),
            ("dc512_2", 512, 512),
            ("dc512_1", 512, 512),
            ("dc256", 512, 256),
            ("dc128", 256, 128),
            ("dc64", 128, 64),
        )

        net = data
        for layer_name, n_in, n_out in encoder_spec:
            net = ConvLayer(net, depth_in=n_in, depth_out=n_out, name=layer_name,
                            kernel_size=3, acti=True)
        for layer_name, n_in, n_out in decoder_spec:
            net = DeconvLayer(net, depth_in=n_in, depth_out=n_out, name=layer_name,
                              kernel_size=3, acti=True)
        # NOTE(review): ConvLayer ends with max_pool(result, 2), so this last
        # layer presumably shrinks the output relative to the input -- confirm
        # that y and y_ really have matching shapes.
        return ConvLayer(net, depth_in=64, depth_out=1, name="conv_out",
                         kernel_size=3, acti=True)

    # Predictions
    y = model(x)
    y_image = tf.reshape(y, [-1, img_w, img_h, 1])
    tf.summary.image('output', y_image, 6)

    # Loss: sum of squared errors over the whole batch, divided by the number
    # of values in ONE image (img_w * img_h * img_ch). The original comment
    # calls this "MSE"; note that the batch size is not part of the divisor.
    loss = tf.reduce_sum(tf.pow(y - y_, 2)) / (img_w * img_h * img_ch)
    loss_summary = tf.summary.scalar("Training_Loss", loss)

    # Optimizer
    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(learning_rate=learn_rate).minimize(loss)
如果您想查看更多细节,下面是各层的定义:
def ConvLayer(input, depth_in, depth_out, name="conv", kernel_size=3, acti=True):
    """Build a 2-D convolution layer followed by a ``max_pool(..., 2)`` call.

    Creates a weight variable ``W`` (truncated normal, stddev 0.1) and a bias
    variable ``B`` (constant 0.1) inside ``tf.name_scope(name)``, convolves
    ``input`` with stride 1 and SAME padding, adds the bias, optionally applies
    ReLU, and records histogram summaries for weights, biases and activations.

    Args:
        input: Tensor passed to ``tf.nn.conv2d``; assumed to be NHWC with
            ``depth_in`` channels -- TODO confirm against callers.
        depth_in: Number of input channels of the filter.
        depth_out: Number of output channels of the filter (and bias length).
        name: Name scope under which the layer's ops are created.
        kernel_size: Height and width of the square filter.
        acti: When truthy, apply ReLU after the bias add.

    Returns:
        Whatever ``max_pool(result, 2)`` returns; ``max_pool`` is a helper
        defined elsewhere (presumably 2x2 max pooling -- verify).
    """
    with tf.name_scope(name):
        w = tf.Variable(
            tf.truncated_normal([kernel_size, kernel_size, depth_in, depth_out],
                                stddev=0.1),
            name="W",
        )
        b = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")
        conv = tf.nn.conv2d(input, w, strides=[1, 1, 1, 1], padding="SAME")
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)

        # The bias add is common to both paths; only the ReLU is optional.
        result = conv + b
        if acti:  # truthiness test instead of the non-idiomatic `acti == True`
            result = tf.nn.relu(result)
            tf.summary.histogram("activations", result)

        return max_pool(result, 2)
.
def DeconvLayer(input, depth_in, depth_out, name="deconv", kernel_size=3, acti=True):
    """Build a transposed-convolution layer that doubles height and width.

    Creates a weight variable ``W`` of shape
    ``[kernel_size, kernel_size, depth_out, depth_in]`` (truncated normal,
    stddev 0.1) and a bias variable ``B`` (constant 0.1) inside
    ``tf.name_scope(name)``, applies ``tf.nn.conv2d_transpose`` with SAME
    padding, adds the bias, optionally applies ReLU, and records histogram
    summaries for weights, biases and activations.

    Fixes relative to the previous version:
      * The requested output is twice the input's height/width, which with
        SAME padding requires a spatial stride of 2; the stride used to be 1,
        which is inconsistent with the doubled ``output_shape``.
      * The output channel count is now ``depth_out`` (matching the filter)
        instead of ``input_shape[3] // 2``, which was wrong for every layer
        whose ``depth_out`` is not exactly half of ``depth_in``.

    Args:
        input: Tensor passed to ``tf.nn.conv2d_transpose``; assumed to be NHWC
            with ``depth_in`` channels -- TODO confirm against callers.
        depth_in: Number of channels of ``input``.
        depth_out: Number of channels of the result (and bias length).
        name: Name scope under which the layer's ops are created.
        kernel_size: Height and width of the square filter.
        acti: When truthy, apply ReLU after the bias add.

    Returns:
        The (optionally ReLU-activated) transposed-convolution output tensor.
    """
    with tf.name_scope(name):
        # conv2d_transpose expects the filter as [h, w, out_channels, in_channels].
        w = tf.Variable(
            tf.truncated_normal([kernel_size, kernel_size, depth_out, depth_in],
                                stddev=0.1),
            name="W",
        )
        b = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")

        # Batch size and spatial dims are read at run time so the layer works
        # with a dynamic batch dimension.
        input_shape = tf.shape(input)
        output_shape = tf.stack([
            input_shape[0],
            input_shape[1] * 2,
            input_shape[2] * 2,
            depth_out,
        ])
        deconv = tf.nn.conv2d_transpose(input, w, output_shape,
                                        strides=[1, 2, 2, 1], padding='SAME')
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)

        # The bias add is common to both paths; only the ReLU is optional.
        result = deconv + b
        if acti:  # truthiness test instead of the non-idiomatic `acti == True`
            result = tf.nn.relu(result)
            tf.summary.histogram("activations", result)

        return result
如何在多个 GPU 上实现 CNN(卷积神经网络)?
引自“Training a Model Using Multiple GPU Cards”(来自 Tensorflow 的教程)
- Place an individual model replica on each GPU.
- Update model parameters synchronously by waiting for all GPUs to finish processing a batch of data.
要了解主内存、CPU 与 GPU 之间的数据流以进一步提高性能,请参阅此回答:“Why should preprocessing be done on CPU rather than GPU?”