我的 GAN 实现无法达到完全的 GPU 利用率
Don't get full GPU utilization for my GAN implementation
我构建了一个 GAN 网络,根据形状为 [(40,40,4),(20,20,6)] 的两个输入来预测形状为 (40,40,6) 的输出。
该模型实际上可以运行并且已经给出了结果,但我的 GPU 利用率“只有” 60% 到 70%(由 nvidia-smi 显示)。
我的问题是,对于这样的模型来说,这是否是固有的,因为它必须在 train_on_batch
的调用之间做一些事情,或者是否有办法加快这个过程?
关于随机数据的极简工作示例如下所示:
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import UpSampling3D
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda
from tensorflow.keras.optimizers import Adam
# Switch every visible GPU to memory-growth mode and report what was found.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be identical on all GPUs.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        physical_count = len(gpus)
        logical_count = len(logical_gpus)
        print(physical_count, "Physical GPUs,", logical_count, "Logical GPUs")
    except RuntimeError as err:
        # Memory growth can only be set before the GPUs have been initialized.
        print(err)
# =============================================================================
# define the model
# =============================================================================
def resBlock(X_in, num_of_features, kernel_size, scale):
    """Append a scaled residual block to ``X_in`` and return the new tensor.

    The residual branch is Conv2D -> ReLU -> Conv2D, both convolutions using
    ``num_of_features`` filters, ``kernel_size``, 'same' padding and
    'he_uniform' initialization.  The branch output is multiplied by
    ``scale`` and added element-wise to ``X_in``.

    NOTE(review): the final Add presumably requires ``X_in`` to already have
    ``num_of_features`` channels -- confirm against callers.
    """
    first_conv = Conv2D(
        num_of_features,
        kernel_size,
        kernel_initializer='he_uniform',
        padding='same',
    )
    second_conv = Conv2D(
        num_of_features,
        kernel_size,
        kernel_initializer='he_uniform',
        padding='same',
    )

    residual = second_conv(Activation('relu')(first_conv(X_in)))
    # Scale the residual branch before merging it with the identity path.
    residual = Lambda(lambda t: t * scale)(residual)
    return Add()([X_in, residual])
class Generator(object):
    """Builder for the residual generator network.

    The network has two inputs: A with shape
    ``(32 + padding, 32 + padding, 4)`` and B with shape
    ``(16 + padding // 2, 16 + padding // 2, 6)``.  B is upsampled by a factor
    of two along both spatial axes, concatenated with A on the channel axis,
    passed through one convolution, six residual blocks and a final 6-channel
    convolution, and finally added to the upsampled B.  With the default
    ``padding`` of 8 this means inputs (40, 40, 4) and (20, 20, 6) and an
    output of (40, 40, 6).
    """

    def __init__(self, noise_shape):
        # Stored only; generator() derives its input shapes from ``padding``
        # and never reads this attribute.
        self.noise_shape = noise_shape
        self.num_of_features = 128
        self.kernel_size = (3, 3)
        self.scale = 0.1  # factor applied to every residual branch
        self.padding = 8
        self.hp = self.padding // 2  # half padding

    def generator(self):
        """Assemble and return the (uncompiled) generator ``Model``."""
        # Local import so that this block does not depend on the import header.
        from tensorflow.keras.layers import UpSampling2D

        # get the inputs and do upsampling
        inputs_channels_A = Input(
            (32 + self.padding, 32 + self.padding, 4), name='input_A')
        inputs_channels_B = Input(
            (16 + self.hp, 16 + self.hp, 6), name='input_B')
        # B is a 4D (batch, height, width, channels) tensor, so the 2D
        # upsampling layer is the matching one; UpSampling3D is documented
        # for 5D input.  Nearest-neighbour 2x upsampling repeats each pixel
        # along both spatial axes, as size=(2, 2, 1) did.
        inputs_channels_B_upsampled = UpSampling2D(size=(2, 2))(inputs_channels_B)

        # concatenate both inputs along the channel axis
        concatenated_input = concatenate(
            [inputs_channels_A, inputs_channels_B_upsampled],
            axis=3,
        )

        # do the first convolution
        x = Conv2D(self.num_of_features,
                   self.kernel_size,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(concatenated_input)

        # do the resBlock iterations
        for _ in range(6):
            x = resBlock(x, self.num_of_features, self.kernel_size, self.scale)

        # last convolution: reduce to 6 channels so the result matches the
        # upsampled B input (spatial size is unchanged by 'same' padding)
        x = Conv2D(6, (3, 3), kernel_initializer='he_uniform', padding='same')(x)

        # last skip connection
        output = Add()([x, inputs_channels_B_upsampled])

        # defining model
        generator_model = Model(
            inputs=[inputs_channels_A, inputs_channels_B], outputs=output)
        return generator_model
def discriminator_block(model, filters, kernel_size, strides):
    """Append Conv2D -> BatchNormalization -> LeakyReLU to ``model``.

    ``model`` is the tensor to extend; ``filters``, ``kernel_size`` and
    ``strides`` configure the convolution, which uses 'same' padding.
    Returns the output tensor of the LeakyReLU activation.
    """
    conv_out = Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding="same",
    )(model)
    normalized = BatchNormalization(momentum=0.5)(conv_out)
    return LeakyReLU(alpha=0.2)(normalized)
class Discriminator(object):
    """Builder for the convolutional discriminator network."""

    def __init__(self, image_shape):
        # Shape (without the batch axis) of the images to classify.
        self.image_shape = image_shape

    def discriminator(self):
        """Assemble and return the (uncompiled) discriminator ``Model``.

        The stack is an initial convolution with LeakyReLU, seven
        discriminator blocks with growing filter counts, and a dense head
        that ends in a single sigmoid unit.
        """
        dis_input = Input(shape=self.image_shape)

        features = Conv2D(filters=64, kernel_size=3, strides=1, padding="same")(dis_input)
        features = LeakyReLU(alpha=0.2)(features)

        # (filters, strides) of the consecutive discriminator blocks.
        block_settings = (
            (64, 2),
            (128, 1),
            (128, 2),
            (256, 1),
            (256, 2),
            (512, 1),
            (512, 2),
        )
        for filters, strides in block_settings:
            features = discriminator_block(features, filters, 3, strides)

        # Dense classification head.
        features = Flatten()(features)
        features = Dense(1024)(features)
        features = LeakyReLU(alpha=0.2)(features)
        features = Dense(1)(features)
        prediction = Activation('sigmoid')(features)

        return Model(inputs=dis_input, outputs=prediction)
def get_gan_network(discriminator, shape_list_AB, generator, optimizer, loss):
    """Build and compile the combined generator + discriminator model.

    ``shape_list_AB`` holds the shapes of the two generator inputs (A first,
    then B).  The returned model maps both inputs to two outputs: the
    generator output, trained with ``loss`` (weight 1.0), and the
    discriminator verdict on that output, trained with binary cross-entropy
    (weight 1e-3).

    Side effect: ``discriminator.trainable`` is set to False.
    """
    # Mark the discriminator as non-trainable before the combined model is
    # compiled.
    discriminator.trainable = False

    combined_inputs = [
        Input(shape=shape_list_AB[0]),
        Input(shape=shape_list_AB[1]),
    ]
    generated = generator(combined_inputs)
    verdict = discriminator(generated)

    gan = Model(inputs=combined_inputs, outputs=[generated, verdict])
    gan.compile(
        optimizer=optimizer,
        loss=[loss, "binary_crossentropy"],
        loss_weights=[1., 1e-3],
    )
    return gan
def get_optimizer():
    """Return a new Adam optimizer with a learning rate of 1e-4."""
    # ``learning_rate`` is the documented keyword; ``lr`` is only a deprecated
    # alias of it.
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    return adam
# =============================================================================
# choose some parameters and compile the model
# =============================================================================
batch_size = 128
shape_input_A = (40, 40, 4)
shape_input_B = (20, 20, 6)
shape_output = (40, 40, 6)

generator = Generator(shape_input_B).generator()  # todo shape
discriminator = Discriminator(shape_output).discriminator()  # todo shape

# Every compiled model gets its own optimizer instance, so that optimizer
# state (e.g. the step counter) is not shared between models that update
# different sets of weights.
optimizer = get_optimizer()
generator.compile(loss="mse", optimizer=optimizer)
discriminator.compile(loss="binary_crossentropy", optimizer=get_optimizer())
gan = get_gan_network(discriminator, [shape_input_A, shape_input_B], generator, get_optimizer(), "mse")
# =============================================================================
# training
# =============================================================================
def get_random_data(mod):
    """Return one batch of uniformly random stand-in data.

    ``mod == 0`` yields the two network inputs as a list of arrays with
    shapes ``(batch_size, 40, 40, 4)`` and ``(batch_size, 20, 20, 6)``; any
    other value yields a target array of shape ``(batch_size, 40, 40, 6)``.
    The batch size is read from the module-level ``batch_size``.
    """
    # Anything but 0 requests the network's output data.
    if mod != 0:
        return np.random.rand(batch_size, 40, 40, 6)

    # mod == 0: the network's two inputs.
    input_a = np.random.rand(batch_size, 40, 40, 4)
    input_b = np.random.rand(batch_size, 20, 20, 6)
    return [input_a, input_b]
# Preallocate the arrays that are reused in every iteration.
# (``np.int`` was removed from NumPy; the builtin ``int`` is the same dtype.)
rand_nums = np.empty(batch_size, dtype=int)
image_batch_lr = np.empty((batch_size,) + shape_input_B)
image_batch_hr = np.empty((batch_size,) + shape_output)
generated_images_sr = np.empty_like(image_batch_hr)
real_data_Y = np.empty(batch_size)
fake_data_Y = np.empty(batch_size)

for e in range(1, 10):
    print("epoch:", e)
    for batchindex in range(200):
        # Exactly one batch is generated per step, so use the single-batch
        # entry point instead of the general predict() loop.
        generated_images_sr[:] = generator.predict_on_batch(get_random_data(0))

        # Noisy labels: real targets lie in (0.8, 1.0], fake ones in [0.0, 0.2).
        real_data_Y[:] = np.ones(batch_size) - np.random.random_sample(batch_size) * 0.2
        fake_data_Y[:] = np.random.random_sample(batch_size) * 0.2

        # Discriminator step on a real and on a generated batch.
        discriminator.trainable = True
        d_loss_real = discriminator.train_on_batch(get_random_data(1), real_data_Y)
        d_loss_fake = discriminator.train_on_batch(generated_images_sr, fake_data_Y)
        discriminator_loss = 0.5 * np.add(d_loss_fake, d_loss_real)

        # Generator step through the combined model.
        gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size) * 0.2
        discriminator.trainable = False
        gan_loss = gan.train_on_batch(get_random_data(0), [get_random_data(1), gan_Y])

    # Report the losses of the last batch of the epoch.
    print("discriminator_loss : %f" % discriminator_loss)
    print("gan_loss :", gan_loss)
我在 GTX2080 上的 docker 容器 tensorflow/tensorflow:2.0.0-gpu-py3 中运行此代码。
训练 GAN 意味着一些不会在 GPU 上执行的开销。在您的情况下,获取 real_data_Y
和 fake_data_Y
,执行 get_random_data()
并计算损失将导致 GPU 空闲时间。
您可以尝试使用 python -m cProfile -o performance.prof xxx.py
对您的程序进行性能分析,看看是否存在可以改进的瓶颈,不过 60% 到 70% 的利用率看起来已经不错了。
我构建了一个 GAN 网络,根据形状为 [(40,40,4),(20,20,6)] 的两个输入来预测形状为 (40,40,6) 的输出。
该模型实际上可以运行并且已经给出了结果,但我的 GPU 利用率“只有” 60% 到 70%(由 nvidia-smi 显示)。
我的问题是,对于这样的模型来说,这是否是固有的,因为它必须在 train_on_batch
的调用之间做一些事情,或者是否有办法加快这个过程?
关于随机数据的极简工作示例如下所示:
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import UpSampling3D
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda
from tensorflow.keras.optimizers import Adam
# Switch every visible GPU to memory-growth mode and report what was found.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be identical on all GPUs.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        physical_count = len(gpus)
        logical_count = len(logical_gpus)
        print(physical_count, "Physical GPUs,", logical_count, "Logical GPUs")
    except RuntimeError as err:
        # Memory growth can only be set before the GPUs have been initialized.
        print(err)
# =============================================================================
# define the model
# =============================================================================
def resBlock(X_in, num_of_features, kernel_size, scale):
    """Append a scaled residual block to ``X_in`` and return the new tensor.

    The residual branch is Conv2D -> ReLU -> Conv2D, both convolutions using
    ``num_of_features`` filters, ``kernel_size``, 'same' padding and
    'he_uniform' initialization.  The branch output is multiplied by
    ``scale`` and added element-wise to ``X_in``.

    NOTE(review): the final Add presumably requires ``X_in`` to already have
    ``num_of_features`` channels -- confirm against callers.
    """
    first_conv = Conv2D(
        num_of_features,
        kernel_size,
        kernel_initializer='he_uniform',
        padding='same',
    )
    second_conv = Conv2D(
        num_of_features,
        kernel_size,
        kernel_initializer='he_uniform',
        padding='same',
    )

    residual = second_conv(Activation('relu')(first_conv(X_in)))
    # Scale the residual branch before merging it with the identity path.
    residual = Lambda(lambda t: t * scale)(residual)
    return Add()([X_in, residual])
class Generator(object):
    """Builder for the residual generator network.

    The network has two inputs: A with shape
    ``(32 + padding, 32 + padding, 4)`` and B with shape
    ``(16 + padding // 2, 16 + padding // 2, 6)``.  B is upsampled by a factor
    of two along both spatial axes, concatenated with A on the channel axis,
    passed through one convolution, six residual blocks and a final 6-channel
    convolution, and finally added to the upsampled B.  With the default
    ``padding`` of 8 this means inputs (40, 40, 4) and (20, 20, 6) and an
    output of (40, 40, 6).
    """

    def __init__(self, noise_shape):
        # Stored only; generator() derives its input shapes from ``padding``
        # and never reads this attribute.
        self.noise_shape = noise_shape
        self.num_of_features = 128
        self.kernel_size = (3, 3)
        self.scale = 0.1  # factor applied to every residual branch
        self.padding = 8
        self.hp = self.padding // 2  # half padding

    def generator(self):
        """Assemble and return the (uncompiled) generator ``Model``."""
        # Local import so that this block does not depend on the import header.
        from tensorflow.keras.layers import UpSampling2D

        # get the inputs and do upsampling
        inputs_channels_A = Input(
            (32 + self.padding, 32 + self.padding, 4), name='input_A')
        inputs_channels_B = Input(
            (16 + self.hp, 16 + self.hp, 6), name='input_B')
        # B is a 4D (batch, height, width, channels) tensor, so the 2D
        # upsampling layer is the matching one; UpSampling3D is documented
        # for 5D input.  Nearest-neighbour 2x upsampling repeats each pixel
        # along both spatial axes, as size=(2, 2, 1) did.
        inputs_channels_B_upsampled = UpSampling2D(size=(2, 2))(inputs_channels_B)

        # concatenate both inputs along the channel axis
        concatenated_input = concatenate(
            [inputs_channels_A, inputs_channels_B_upsampled],
            axis=3,
        )

        # do the first convolution
        x = Conv2D(self.num_of_features,
                   self.kernel_size,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(concatenated_input)

        # do the resBlock iterations
        for _ in range(6):
            x = resBlock(x, self.num_of_features, self.kernel_size, self.scale)

        # last convolution: reduce to 6 channels so the result matches the
        # upsampled B input (spatial size is unchanged by 'same' padding)
        x = Conv2D(6, (3, 3), kernel_initializer='he_uniform', padding='same')(x)

        # last skip connection
        output = Add()([x, inputs_channels_B_upsampled])

        # defining model
        generator_model = Model(
            inputs=[inputs_channels_A, inputs_channels_B], outputs=output)
        return generator_model
def discriminator_block(model, filters, kernel_size, strides):
    """Append Conv2D -> BatchNormalization -> LeakyReLU to ``model``.

    ``model`` is the tensor to extend; ``filters``, ``kernel_size`` and
    ``strides`` configure the convolution, which uses 'same' padding.
    Returns the output tensor of the LeakyReLU activation.
    """
    conv_out = Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding="same",
    )(model)
    normalized = BatchNormalization(momentum=0.5)(conv_out)
    return LeakyReLU(alpha=0.2)(normalized)
class Discriminator(object):
    """Builder for the convolutional discriminator network."""

    def __init__(self, image_shape):
        # Shape (without the batch axis) of the images to classify.
        self.image_shape = image_shape

    def discriminator(self):
        """Assemble and return the (uncompiled) discriminator ``Model``.

        The stack is an initial convolution with LeakyReLU, seven
        discriminator blocks with growing filter counts, and a dense head
        that ends in a single sigmoid unit.
        """
        dis_input = Input(shape=self.image_shape)

        features = Conv2D(filters=64, kernel_size=3, strides=1, padding="same")(dis_input)
        features = LeakyReLU(alpha=0.2)(features)

        # (filters, strides) of the consecutive discriminator blocks.
        block_settings = (
            (64, 2),
            (128, 1),
            (128, 2),
            (256, 1),
            (256, 2),
            (512, 1),
            (512, 2),
        )
        for filters, strides in block_settings:
            features = discriminator_block(features, filters, 3, strides)

        # Dense classification head.
        features = Flatten()(features)
        features = Dense(1024)(features)
        features = LeakyReLU(alpha=0.2)(features)
        features = Dense(1)(features)
        prediction = Activation('sigmoid')(features)

        return Model(inputs=dis_input, outputs=prediction)
def get_gan_network(discriminator, shape_list_AB, generator, optimizer, loss):
    """Build and compile the combined generator + discriminator model.

    ``shape_list_AB`` holds the shapes of the two generator inputs (A first,
    then B).  The returned model maps both inputs to two outputs: the
    generator output, trained with ``loss`` (weight 1.0), and the
    discriminator verdict on that output, trained with binary cross-entropy
    (weight 1e-3).

    Side effect: ``discriminator.trainable`` is set to False.
    """
    # Mark the discriminator as non-trainable before the combined model is
    # compiled.
    discriminator.trainable = False

    combined_inputs = [
        Input(shape=shape_list_AB[0]),
        Input(shape=shape_list_AB[1]),
    ]
    generated = generator(combined_inputs)
    verdict = discriminator(generated)

    gan = Model(inputs=combined_inputs, outputs=[generated, verdict])
    gan.compile(
        optimizer=optimizer,
        loss=[loss, "binary_crossentropy"],
        loss_weights=[1., 1e-3],
    )
    return gan
def get_optimizer():
    """Return a new Adam optimizer with a learning rate of 1e-4."""
    # ``learning_rate`` is the documented keyword; ``lr`` is only a deprecated
    # alias of it.
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    return adam
# =============================================================================
# choose some parameters and compile the model
# =============================================================================
batch_size = 128
shape_input_A = (40, 40, 4)
shape_input_B = (20, 20, 6)
shape_output = (40, 40, 6)

generator = Generator(shape_input_B).generator()  # todo shape
discriminator = Discriminator(shape_output).discriminator()  # todo shape

# Every compiled model gets its own optimizer instance, so that optimizer
# state (e.g. the step counter) is not shared between models that update
# different sets of weights.
optimizer = get_optimizer()
generator.compile(loss="mse", optimizer=optimizer)
discriminator.compile(loss="binary_crossentropy", optimizer=get_optimizer())
gan = get_gan_network(discriminator, [shape_input_A, shape_input_B], generator, get_optimizer(), "mse")
# =============================================================================
# training
# =============================================================================
def get_random_data(mod):
    """Return one batch of uniformly random stand-in data.

    ``mod == 0`` yields the two network inputs as a list of arrays with
    shapes ``(batch_size, 40, 40, 4)`` and ``(batch_size, 20, 20, 6)``; any
    other value yields a target array of shape ``(batch_size, 40, 40, 6)``.
    The batch size is read from the module-level ``batch_size``.
    """
    # Anything but 0 requests the network's output data.
    if mod != 0:
        return np.random.rand(batch_size, 40, 40, 6)

    # mod == 0: the network's two inputs.
    input_a = np.random.rand(batch_size, 40, 40, 4)
    input_b = np.random.rand(batch_size, 20, 20, 6)
    return [input_a, input_b]
# Preallocate the arrays that are reused in every iteration.
# (``np.int`` was removed from NumPy; the builtin ``int`` is the same dtype.)
rand_nums = np.empty(batch_size, dtype=int)
image_batch_lr = np.empty((batch_size,) + shape_input_B)
image_batch_hr = np.empty((batch_size,) + shape_output)
generated_images_sr = np.empty_like(image_batch_hr)
real_data_Y = np.empty(batch_size)
fake_data_Y = np.empty(batch_size)

for e in range(1, 10):
    print("epoch:", e)
    for batchindex in range(200):
        # Exactly one batch is generated per step, so use the single-batch
        # entry point instead of the general predict() loop.
        generated_images_sr[:] = generator.predict_on_batch(get_random_data(0))

        # Noisy labels: real targets lie in (0.8, 1.0], fake ones in [0.0, 0.2).
        real_data_Y[:] = np.ones(batch_size) - np.random.random_sample(batch_size) * 0.2
        fake_data_Y[:] = np.random.random_sample(batch_size) * 0.2

        # Discriminator step on a real and on a generated batch.
        discriminator.trainable = True
        d_loss_real = discriminator.train_on_batch(get_random_data(1), real_data_Y)
        d_loss_fake = discriminator.train_on_batch(generated_images_sr, fake_data_Y)
        discriminator_loss = 0.5 * np.add(d_loss_fake, d_loss_real)

        # Generator step through the combined model.
        gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size) * 0.2
        discriminator.trainable = False
        gan_loss = gan.train_on_batch(get_random_data(0), [get_random_data(1), gan_Y])

    # Report the losses of the last batch of the epoch.
    print("discriminator_loss : %f" % discriminator_loss)
    print("gan_loss :", gan_loss)
我在 GTX2080 上的 docker 容器 tensorflow/tensorflow:2.0.0-gpu-py3 中运行此代码。
训练 GAN 意味着一些不会在 GPU 上执行的开销。在您的情况下,获取 real_data_Y
和 fake_data_Y
,执行 get_random_data()
并计算损失将导致 GPU 空闲时间。
您可以尝试使用 python -m cProfile -o performance.prof xxx.py
对您的程序进行性能分析,看看是否存在可以改进的瓶颈,不过 60% 到 70% 的利用率看起来已经不错了。