Tensorflow：如何利用多 GPU？

Question

我有一个 CNN，目前在配备 1 个 GPU 的机器上运行。现在我换到了另一台有 2 个 GPU 的计算机，想同时使用这两个 GPU 来训练我的网络以节省时间。我该怎么做？

我读了https://www.tensorflow.org/tutorials/using_gpu，但我认为这个例子太简单了，老实说我不知道如何在我的真实网络上应用它。

谁能以我的网络为例，给我一个简单的示例吗？（我正在做自动编码器。）

非常感谢！

graphCNN = tf.Graph()
with graphCNN.as_default():
    # Network input: shape (None, img_w, img_h, img_ch); the leading
    # dimension is left unspecified.
    x = tf.placeholder(tf.float32, shape=(None, img_w, img_h, img_ch), name="X")
    # Expected output, same shape as the input.
    y_ = tf.placeholder(tf.float32, shape=(None, img_w, img_h, img_ch), name="Y")
    # Dropout placeholder. NOTE(review): nothing in this block consumes it.
    dropout = tf.placeholder(tf.float32)

    def model(data):
        """Build the encoder/decoder network on top of ``data``.

        The encoder is a chain of eight ``ConvLayer`` calls, the decoder a
        chain of eight ``DeconvLayer`` calls, followed by a final ``ConvLayer``
        named "conv_out". Layers are created in the order listed below, each
        with ``kernel_size=3`` and ``acti=True``.

        The first layer's input depth and the last layer's output depth are
        ``img_ch`` so that they agree with the placeholders above (the values
        were previously hard-coded to 1, which only works when img_ch == 1).

        Args:
            data: Input tensor with ``img_ch`` channels in its last dimension.

        Returns:
            The tensor produced by the "conv_out" layer.
        """
        # (depth_in, depth_out, name) for every encoder layer, in build order.
        encoder_specs = (
            (img_ch, 64, "c64"),
            (64, 128, "c128"),
            (128, 256, "c256"),
            (256, 512, "c512_1"),
            (512, 512, "c512_2"),
            (512, 512, "c512_3"),
            (512, 512, "c512_4"),
            (512, 512, "c512_5"),
        )
        # (depth_in, depth_out, name) for every decoder layer, in build order.
        decoder_specs = (
            (512, 512, "dc512_5"),
            (512, 512, "dc512_4"),
            (512, 512, "dc512_3"),
            (512, 512, "dc512_2"),
            (512, 512, "dc512_1"),
            (512, 256, "dc256"),
            (256, 128, "dc128"),
            (128, 64, "dc64"),
        )

        net = data
        ### Encoder
        for depth_in, depth_out, name in encoder_specs:
            net = ConvLayer(net, depth_in=depth_in, depth_out=depth_out,
                            name=name, kernel_size=3, acti=True)

        ### Decoder
        for depth_in, depth_out, name in decoder_specs:
            net = DeconvLayer(net, depth_in=depth_in, depth_out=depth_out,
                              name=name, kernel_size=3, acti=True)

        return ConvLayer(net, depth_in=64, depth_out=img_ch, name="conv_out",
                         kernel_size=3, acti=True)

    # Predictions
    y = model(x)
    y_image = tf.reshape(y, [-1, img_w, img_h, img_ch])
    tf.summary.image('output', y_image, 6)

    # Loss: sum of squared differences over everything fed in, divided by the
    # number of values per image (img_w * img_h * img_ch). It is not divided
    # by the batch size.
    loss = tf.reduce_sum(tf.pow(y - y_, 2)) / (img_w * img_h * img_ch)
    loss_summary = tf.summary.scalar("Training_Loss", loss)

    # Optimizer.
    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(learning_rate=learn_rate).minimize(loss)

如果您想查看更多细节，下面是各层的定义：

def ConvLayer(input, depth_in, depth_out, name="conv", kernel_size=3, acti=True):
    """Build a 2-D convolution layer followed by ``max_pool(result, 2)``.

    Inside the name scope ``name`` this creates a weight variable "W" of shape
    [kernel_size, kernel_size, depth_in, depth_out] (truncated normal,
    stddev 0.1) and a bias variable "B" of shape [depth_out] (constant 0.1),
    applies a stride-1 "SAME" convolution, adds the bias, and records
    histogram summaries for the weights, biases and (if enabled) activations.

    Args:
        input: Tensor passed to ``tf.nn.conv2d``; its channel count must
            equal ``depth_in``.
        depth_in: Number of input channels of the kernel.
        depth_out: Number of output channels of the kernel.
        name: Name scope under which the layer's ops are created.
        kernel_size: Height and width of the square kernel.
        acti: When true, apply ReLU to ``conv + bias``.

    Returns:
        The result of ``max_pool`` applied to the (optionally activated)
        convolution output. ``max_pool`` is a helper defined outside this
        block; presumably it halves height and width — TODO confirm.
    """
    with tf.name_scope(name):
        w = tf.Variable(
            tf.truncated_normal(
                [kernel_size, kernel_size, depth_in, depth_out], stddev=0.1
            ),
            name="W",
        )
        b = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")
        conv = tf.nn.conv2d(input, w, strides=[1, 1, 1, 1], padding="SAME")
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)

        if acti:
            result = tf.nn.relu(conv + b)
            tf.summary.histogram("activations", result)
        else:
            result = conv + b

        return max_pool(result, 2)

.

def DeconvLayer(input, depth_in, depth_out, name="deconv", kernel_size=3, acti=True):
    """Build a transposed-convolution layer that doubles height and width.

    Inside the name scope ``name`` this creates a weight variable "W" of shape
    [kernel_size, kernel_size, depth_out, depth_in] (truncated normal,
    stddev 0.1) and a bias variable "B" of shape [depth_out] (constant 0.1),
    applies ``tf.nn.conv2d_transpose`` with "SAME" padding, adds the bias, and
    records histogram summaries for the weights, biases and (if enabled)
    activations.

    Args:
        input: Tensor passed to ``tf.nn.conv2d_transpose``; its channel count
            must equal ``depth_in``.
        depth_in: Number of channels of ``input``.
        depth_out: Number of channels of the result.
        name: Name scope under which the layer's ops are created.
        kernel_size: Height and width of the square kernel.
        acti: When true, apply ReLU to ``deconv + bias``.

    Returns:
        A tensor whose dimensions 1 and 2 are twice those of ``input`` and
        whose last dimension is ``depth_out``.
    """
    with tf.name_scope(name):
        w = tf.Variable(
            tf.truncated_normal(
                [kernel_size, kernel_size, depth_out, depth_in], stddev=0.1
            ),
            name="W",
        )
        b = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")

        input_shape = tf.shape(input)
        # The channel count must match the filter's output depth (depth_out);
        # halving the input depth is wrong whenever depth_in == depth_out.
        output_shape = tf.stack(
            [input_shape[0], input_shape[1] * 2, input_shape[2] * 2, depth_out]
        )
        # With "SAME" padding the spatial size scales by the stride, so a
        # doubled output_shape requires a stride of 2 in height and width.
        deconv = tf.nn.conv2d_transpose(
            input, w, output_shape, strides=[1, 2, 2, 1], padding="SAME"
        )

        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)

        if acti:
            result = tf.nn.relu(deconv + b)
            tf.summary.histogram("activations", result)
        else:
            result = deconv + b
        return result

Answer 1

如何在多个 GPU 上实现 CNN（卷积神经网络）？

引自“Training a Model Using Multiple GPU Cards”（来自 Tensorflow 的教程）

Place an individual model replica on each GPU.

Update model parameters synchronously by waiting for all GPUs to finish processing a batch of data.

要了解主内存、CPU 与 GPU 之间的数据流以便提高性能，请参阅这个回答：Why should preprocessing be done on CPU rather than GPU?

Tensorflow：如何利用多 GPU？

Tensorflow: How to take advantage of multi GPUs?

multi-gpu

tensorflow

tensorflow-gpu