TensorFlow MirroredStrategy() 在多 GPU 训练中无法正常工作

TensorFlow MirroredStrategy() not working for multi-gpu training

我正在尝试使用 TensorFlow 的 MirroredStrategy() 在 2 块 Nvidia Titan RTX 显卡上运行 3D U-Net。该代码已验证可以在 1 块 GPU 上正常运行。我的操作系统是 Red Hat Enterprise Linux 8 (RHEL8)。错误出现在 model.fit() 处。

我已经安装了合适的 NVIDIA NCCL 驱动程序，并使用 tensorflow.org 上的示例验证了训练数据可以分发到两块 GPU 上。

代码:

def get_model(optimizer, loss_metric, metrics, lr=1e-3):
    """Build and compile a 3D U-Net-style encoder/decoder model.

    The network has four down-sampling stages (two 3x3x3 convolutions,
    2x2x2 max-pooling, dropout), a 512-filter bottleneck, and four
    up-sampling stages that concatenate the transposed-convolution output
    with the matching encoder feature map. The final layer is a 1x1x1
    convolution with a sigmoid activation producing a single channel.

    The input shape is read from the module-level names ``sample_width``,
    ``sample_height`` and ``sample_depth``, with one trailing channel.

    Args:
        optimizer: Callable invoked as ``optimizer(lr=lr)`` to create the
            optimizer instance passed to ``model.compile``.
        loss_metric: Loss passed to ``model.compile`` as ``loss``.
        metrics: Metrics passed to ``model.compile`` as ``metrics``.
        lr: Learning rate forwarded to ``optimizer``.

    Returns:
        The compiled model.
    """

    def _conv_pair(tensor, filters):
        # Two stacked 3x3x3 ReLU convolutions with 'same' padding.
        for _ in range(2):
            tensor = Conv3D(filters, (3, 3, 3), activation='relu', padding='same')(tensor)
        return tensor

    inputs = Input((sample_width, sample_height, sample_depth, 1))

    # Encoder: keep each stage's pre-pooling output for the skip connections.
    encoder_stages = ((32, 0.5), (64, 0.5), (128, 0.3), (256, 0.3))
    skip_tensors = []
    current = inputs
    for filters, drop_rate in encoder_stages:
        current = _conv_pair(current, filters)
        skip_tensors.append(current)
        current = MaxPooling3D(pool_size=(2, 2, 2))(current)
        current = Dropout(drop_rate)(current)

    # Bottleneck.
    current = _conv_pair(current, 512)

    # Decoder: upsample, join with the matching encoder output (deepest
    # first) along axis 4, the last axis of the 5-D tensor, then convolve.
    for filters, skip in zip((256, 128, 64, 32), reversed(skip_tensors)):
        upsampled = Conv3DTranspose(filters, (2, 2, 2), strides=(2, 2, 2), padding='same')(current)
        current = concatenate([upsampled, skip], axis=4)
        current = _conv_pair(current, filters)

    outputs = Conv3D(1, (1, 1, 1), activation='sigmoid')(current)

    model = Model(inputs=[inputs], outputs=[outputs])
    model.compile(optimizer=optimizer(lr=lr), loss=loss_metric, metrics=metrics)
    return model

# Create and compile the model inside the MirroredStrategy scope.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = get_model(
        optimizer=Adam,
        loss_metric=dice_coef_loss,
        metrics=[dice_coef],
        lr=1e-3,
    )

observe_var = 'dice_coef'
# NOTE(review): `strategy` is not referenced anywhere else in the visible
# code; presumably it was meant as the checkpoint `mode` -- confirm.
strategy = 'max'

# NOTE(review): period=1000 is larger than epochs=100 below, so this
# callback presumably never writes a checkpoint during this run -- confirm.
model_checkpoint = ModelCheckpoint(
    'unet_seg_cs9300_3d_{epoch:04}.model',
    monitor=observe_var,
    save_best_only=False,
    period=1000,
)
model.fit(
    train_x,
    train_y,
    batch_size=1,
    epochs=100,
    verbose=1,
    shuffle=True,
    validation_split=0.2,
    callbacks=[model_checkpoint],
)
model.save('unet_seg_final_3d_test.model')

错误:

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-3-15c1c64c47ab> in <module>
    423 model_checkpoint = ModelCheckpoint('unet_seg_cs9300_3d_{epoch:04}.model', monitor=observe_var, save_best_only=False, period = 1000)
    424 
--> 425 model.fit(train_x, train_y, batch_size= 1, epochs= 100, verbose=1, shuffle=True, validation_split=0.2, callbacks=[model_checkpoint])
    426 
    427 model.save('unet_seg_final_3d_test.model')

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
   1211         else:
   1212             fit_inputs = x + y + sample_weights
-> 1213         self._make_train_function()
   1214         fit_function = self.train_function
   1215 

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
    314                     training_updates = self.optimizer.get_updates(
    315                         params=self._collected_trainable_weights,
--> 316                         loss=self.total_loss)
    317                 updates = self.updates + training_updates
    318 

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
     73         if _SYMBOLIC_SCOPE.value:
     74             with get_graph().as_default():
---> 75                 return func(*args, **kwargs)
     76         else:
     77             return func(*args, **kwargs)

~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
    548 
    549             # Apply constraints.
--> 550             if getattr(p, 'constraint', None) is not None:
    551                 new_p = p.constraint(new_p)
    552 

~/anaconda3/envs/gputest/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py in constraint(self)
    566       Can be `None` if no constraint was passed.
    567     """
--> 568     raise NotImplementedError
    569 
    570   def assign(self, value, use_locking=False, name=None, read_value=True):

NotImplementedError: 

此答案基于对 OP 问题的评论。

在使用 tf.distribute.MirroredStrategy 进行多 GPU 训练时，应该使用 tf.keras API，而不是以 TensorFlow 为后端的独立 keras 包（例如改用 from tensorflow.keras.layers import Conv3D，而不是 from keras.layers import Conv3D）。

一般来说，tf.keras 和 keras 最好不要混用。

也可以尝试其他不依赖 NCCL 的 cross_device_ops 实现：

# Alternative 1: hierarchical-copy all-reduce instead of the default
# cross-device ops.
strategy = tf.distribute.MirroredStrategy(
    cross_device_ops=tf.distribute.HierarchicalCopyAllReduce(),
)

# Alternative 2: reduce onto a single device.
strategy = tf.distribute.MirroredStrategy(
    cross_device_ops=tf.distribute.ReductionToOneDevice(),
)