我的 GAN 实现无法获得完整的 GPU 利用率

Dont get full GPU-Utilization for my GAN implementation

我构建了一个 GAN 网络,从形状为 [(40,40,4),(20,20,6)] 的两个输入预测形状为 (40,40,6) 的输出。

该模型实际上正在运行并且已经给出了结果,但我的 GPU 利用率"只有" 60% 到 70%(由 nvidia-smi 显示)。

我的问题是,对于这样的模型来说,这是否是固有的,因为它必须在 train_on_batch 的调用之间做一些事情,或者是否有办法加快这个过程?

关于随机数据的极简工作示例如下所示:

import numpy as np
import os


import tensorflow as tf

from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import UpSampling3D
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda

from tensorflow.keras.optimizers import Adam


# Enable on-demand GPU memory growth so TensorFlow allocates VRAM as
# needed instead of grabbing all of it up front.
physical_gpus = tf.config.experimental.list_physical_devices('GPU')
if physical_gpus:
  try:
    # Memory growth has to be configured identically for every GPU.
    for device in physical_gpus:
      tf.config.experimental.set_memory_growth(device, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(physical_gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as err:
    # set_memory_growth must be called before any GPU is initialized.
    print(err)






# =============================================================================
# define the model    
# =============================================================================

def resBlock(X_in, num_of_features, kernel_size, scale):
    """Residual block: two same-padded convolutions with a ReLU in between,
    a constant scaling of the residual branch, and an additive skip
    connection back to the block input."""
    residual = Conv2D(num_of_features, kernel_size,
                      kernel_initializer='he_uniform', padding='same')(X_in)
    residual = Activation('relu')(residual)
    residual = Conv2D(num_of_features, kernel_size,
                      kernel_initializer='he_uniform', padding='same')(residual)
    # Scale the residual branch by a small constant before the skip add.
    residual = Lambda(lambda t: t * scale)(residual)
    return Add()([X_in, residual])

class Generator(object):
    """Builds the generator network.

    The model takes two inputs -- input A of shape (40, 40, 4)
    (32 + padding of 8) and input B of shape (20, 20, 6)
    (16 + half padding) -- upsamples B by 2x spatially, concatenates both
    along the channel axis, and refines the result through a stack of
    residual blocks with a final additive skip connection from the
    upsampled B input.
    """

    def __init__(self, noise_shape):
        # NOTE(review): noise_shape is stored but never read by generator();
        # the input shapes below are hard-coded from the padding attributes.
        self.noise_shape = noise_shape
        self.num_of_features = 128
        self.kernel_size = (3, 3)
        self.scale = 0.1
        self.padding = 8
        self.hp = self.padding // 2  # half padding

    def generator(self):
        """Assemble and return the (uncompiled) Keras generator model."""
        # Two inputs; input B is upsampled 2x to match input A spatially.
        in_a = Input((32 + self.padding, 32 + self.padding, 4), name='input_A')
        in_b = Input((16 + self.hp, 16 + self.hp, 6), name='input_B')
        in_b_upsampled = UpSampling3D(size=(2, 2, 1))(in_b)

        # Concatenate both inputs along the channel axis.
        merged = concatenate([in_a, in_b_upsampled], axis=3,)

        # Initial feature-extraction convolution.
        features = Conv2D(self.num_of_features,
                          self.kernel_size,
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_normal')(merged)

        # Stack of residual blocks.
        for _ in range(6):
            features = resBlock(features, self.num_of_features,
                                self.kernel_size, self.scale)

        # Project back down to the 6 output channels.
        features = Conv2D(6, (3, 3), kernel_initializer='he_uniform',
                          padding='same')(features)

        # Global skip connection from the upsampled B input.
        out = Add()([features, in_b_upsampled])

        return Model(inputs=[in_a, in_b], outputs=out)

def discriminator_block(model, filters, kernel_size, strides):
    """Conv2D -> BatchNorm -> LeakyReLU building block for the discriminator."""
    out = Conv2D(filters = filters, kernel_size = kernel_size,
                 strides = strides, padding = "same")(model)
    out = BatchNormalization(momentum = 0.5)(out)
    return LeakyReLU(alpha = 0.2)(out)

class Discriminator(object):
    """Builds a convolutional discriminator that maps an image of
    `image_shape` to a single sigmoid real/fake score."""

    def __init__(self, image_shape):
        self.image_shape = image_shape

    def discriminator(self):
        """Assemble and return the (uncompiled) Keras discriminator model."""
        dis_input = Input(shape = (self.image_shape))

        x = Conv2D(filters = 64, kernel_size = 3, strides = 1, padding = "same")(dis_input)
        x = LeakyReLU(alpha = 0.2)(x)

        # Alternate stride-1 refinement and stride-2 downsampling while
        # growing the filter count: 64 -> 128 -> 256 -> 512.
        for filters, strides in ((64, 2), (128, 1), (128, 2),
                                 (256, 1), (256, 2), (512, 1), (512, 2)):
            x = discriminator_block(x, filters, 3, strides)

        # Dense head producing a single probability-like score.
        x = Flatten()(x)
        x = Dense(1024)(x)
        x = LeakyReLU(alpha = 0.2)(x)
        x = Dense(1)(x)
        x = Activation('sigmoid')(x)

        return Model(inputs = dis_input, outputs = x)

def get_gan_network(discriminator, shape_list_AB, generator, optimizer, loss):
    """Chain the generator and the (frozen) discriminator into the
    combined GAN model used to train the generator.

    Returns a compiled model mapping the two generator inputs to
    [generated image, discriminator score]; the image loss is `loss`
    (weight 1.0) and the adversarial loss is binary cross-entropy
    (weight 1e-3).
    """
    # Freeze the discriminator so only the generator trains in this model.
    discriminator.trainable = False

    input_a = Input(shape=shape_list_AB[0])
    input_b = Input(shape=shape_list_AB[1])

    generated = generator([input_a, input_b])
    score = discriminator(generated)

    gan = Model(inputs=[input_a, input_b], outputs=[generated, score])
    gan.compile(loss=[loss, "binary_crossentropy"],
                loss_weights=[1., 1e-3],
                optimizer=optimizer)

    return gan


def get_optimizer():
    """Return the Adam optimizer shared by all three compiles.

    Uses the `learning_rate` keyword: `lr` is only a deprecated alias in
    tf.keras 2.x optimizers and emits warnings / is removed in newer
    releases.
    """
    return Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)





# =============================================================================
# choose some parameters and compile the model
# =============================================================================

batch_size = 128
shape_input_A = (40,40,4)   # high-resolution input A
shape_input_B = (20,20,6)   # low-resolution input B (upsampled 2x inside the generator)
shape_output = (40,40,6)    # generator output == discriminator input


# NOTE(review): Generator.generator() builds its Input layers from hard-coded
# padding attributes and never reads noise_shape — confirm this argument is
# intentional (the "todo shape" comments below suggest it is known).
generator = Generator(shape_input_B).generator() # todo shape
discriminator = Discriminator(shape_output).discriminator() # todo shape

optimizer = get_optimizer()

# Standalone compiles: generator with MSE, discriminator with binary
# cross-entropy; the same Adam instance is shared by all models.
generator.compile(loss="mse", optimizer=optimizer)
discriminator.compile(loss="binary_crossentropy", optimizer=optimizer)

# Combined model; the discriminator is frozen inside get_gan_network.
gan = get_gan_network(discriminator, [shape_input_A,shape_input_B], generator, optimizer, "mse")  




# =============================================================================
# training
# =============================================================================

def get_random_data(mod):
    """Return a random batch (uses the module-level `batch_size`).

    mod == 0 -> the two generator inputs [A, B];
    otherwise -> a batch of target outputs of shape (batch_size, 40, 40, 6).
    """
    if mod == 0:
        # The generator's two inputs.
        return [np.random.rand(batch_size, 40, 40, 4),
                np.random.rand(batch_size, 20, 20, 6)]

    # The target / discriminator-side output.
    return np.random.rand(batch_size, 40, 40, 6)


# initalize empty arrays
# Preallocated buffers, reused every batch so no reallocation happens per
# iteration.  NOTE: `np.int` was deprecated in NumPy 1.20 and removed in
# 1.24 (it raises AttributeError there); the builtin `int` is the exact
# equivalent dtype.
# NOTE(review): rand_nums, image_batch_lr and image_batch_hr are not used in
# the training loop below — confirm they are needed elsewhere.
rand_nums = np.empty(batch_size, dtype=int)
image_batch_lr = np.empty((batch_size,) + shape_input_B)
image_batch_hr = np.empty((batch_size,) + shape_output)
generated_images_sr = np.empty_like(image_batch_hr)
real_data_Y = np.empty(batch_size)  # smoothed labels for real samples
fake_data_Y = np.empty(batch_size)  # smoothed labels for generated samples

# Training loop: 9 epochs of 200 batches on random data.
for e in range(1, 10):

    print("epoch:",e)

    for batchindex in range(200):

        # Generate fake samples from random inputs; copy into the
        # preallocated buffer so memory is reused across iterations.
        generated_images_sr[:] = generator.predict(get_random_data(0))

        # Label smoothing: real targets fall in (0.8, 1.0], fake in [0, 0.2).
        real_data_Y[:] = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
        fake_data_Y[:] = np.random.random_sample(batch_size)*0.2

        discriminator.trainable = True

        # Train the discriminator on a real batch and a fake batch separately,
        # then average the two losses.
        d_loss_real = discriminator.train_on_batch(get_random_data(1), real_data_Y)
        d_loss_fake = discriminator.train_on_batch(generated_images_sr, fake_data_Y)
        discriminator_loss = 0.5 * np.add(d_loss_fake, d_loss_real)

        # Train the generator through the combined model against smoothed
        # "real" labels.
        # NOTE(review): flipping .trainable after compile() generally only
        # takes effect at the next compile in tf.keras — confirm the
        # discriminator is actually frozen here (get_gan_network set it
        # before compiling the combined model, so `gan` itself is fine).
        gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
        discriminator.trainable = False
        gan_loss = gan.train_on_batch(get_random_data(0), [get_random_data(1),gan_Y])


    print("discriminator_loss : %f" % discriminator_loss)
    print("gan_loss :", gan_loss)

我在我的 GTX2080 上的 docker 容器 tensorflow/tensorflow:2.0.0-gpu-py3 中运行此代码。

训练 GAN 意味着一些不会在 GPU 上执行的开销。在您的情况下,生成 real_data_Y 和 fake_data_Y、执行 get_random_data() 以及计算损失都会导致 GPU 空闲时间。

您可以尝试使用 python -m cProfile -o performance.prof xxx.py 分析您的程序,看看是否存在可以改进的瓶颈,但 60% 到 70% 的利用率似乎已经不错了。