TensorFlow MirroredStrategy() not working for multi-GPU training
I am trying to use TensorFlow's MirroredStrategy() to run a 3D U-Net on 2 Nvidia Titan RTX cards. The code is verified to work on 1 GPU. My OS is Red Hat Enterprise Linux 8 (RHEL8). The error occurs at model.fit().

I have installed the appropriate Nvidia NCCL drivers and verified that I can distribute training data across both GPUs using the example from tensorflow.org.
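(The verification was along the lines of the snippet below; a minimal sketch assuming TF 2.x, with a placeholder dataset standing in for the tensorflow.org example.)

import tensorflow as tf

# Both Titan RTX cards should be listed here
print(tf.config.experimental.list_physical_devices('GPU'))

# Spread a toy dataset over the replicas, as in the tensorflow.org example
strategy = tf.distribute.MirroredStrategy()
dataset = tf.data.Dataset.from_tensor_slices(tf.zeros([8, 4])).batch(2)
for batch in strategy.experimental_distribute_dataset(dataset):
    print(batch)  # PerReplica values, one slice per GPU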
Code:
# Imports implied by the traceback: the standalone keras package
# (with the tensorflow backend), not tf.keras
import tensorflow as tf
from keras.models import Model
from keras.layers import (Input, Conv3D, MaxPooling3D, Dropout,
                          Conv3DTranspose, concatenate)
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

def get_model(optimizer, loss_metric, metrics, lr=1e-3):
    inputs = Input((sample_width, sample_height, sample_depth, 1))

    # Contracting path
    conv1 = Conv3D(32, (3, 3, 3), activation='relu', padding='same')(inputs)
    conv1 = Conv3D(32, (3, 3, 3), activation='relu', padding='same')(conv1)
    pool1 = MaxPooling3D(pool_size=(2, 2, 2))(conv1)
    drop1 = Dropout(0.5)(pool1)
    conv2 = Conv3D(64, (3, 3, 3), activation='relu', padding='same')(drop1)
    conv2 = Conv3D(64, (3, 3, 3), activation='relu', padding='same')(conv2)
    pool2 = MaxPooling3D(pool_size=(2, 2, 2))(conv2)
    drop2 = Dropout(0.5)(pool2)
    conv3 = Conv3D(128, (3, 3, 3), activation='relu', padding='same')(drop2)
    conv3 = Conv3D(128, (3, 3, 3), activation='relu', padding='same')(conv3)
    pool3 = MaxPooling3D(pool_size=(2, 2, 2))(conv3)
    drop3 = Dropout(0.3)(pool3)
    conv4 = Conv3D(256, (3, 3, 3), activation='relu', padding='same')(drop3)
    conv4 = Conv3D(256, (3, 3, 3), activation='relu', padding='same')(conv4)
    pool4 = MaxPooling3D(pool_size=(2, 2, 2))(conv4)
    drop4 = Dropout(0.3)(pool4)

    # Bottleneck
    conv5 = Conv3D(512, (3, 3, 3), activation='relu', padding='same')(drop4)
    conv5 = Conv3D(512, (3, 3, 3), activation='relu', padding='same')(conv5)

    # Expanding path with skip connections
    up6 = concatenate([Conv3DTranspose(256, (2, 2, 2), strides=(2, 2, 2), padding='same')(conv5), conv4], axis=4)
    conv6 = Conv3D(256, (3, 3, 3), activation='relu', padding='same')(up6)
    conv6 = Conv3D(256, (3, 3, 3), activation='relu', padding='same')(conv6)
    up7 = concatenate([Conv3DTranspose(128, (2, 2, 2), strides=(2, 2, 2), padding='same')(conv6), conv3], axis=4)
    conv7 = Conv3D(128, (3, 3, 3), activation='relu', padding='same')(up7)
    conv7 = Conv3D(128, (3, 3, 3), activation='relu', padding='same')(conv7)
    up8 = concatenate([Conv3DTranspose(64, (2, 2, 2), strides=(2, 2, 2), padding='same')(conv7), conv2], axis=4)
    conv8 = Conv3D(64, (3, 3, 3), activation='relu', padding='same')(up8)
    conv8 = Conv3D(64, (3, 3, 3), activation='relu', padding='same')(conv8)
    up9 = concatenate([Conv3DTranspose(32, (2, 2, 2), strides=(2, 2, 2), padding='same')(conv8), conv1], axis=4)
    conv9 = Conv3D(32, (3, 3, 3), activation='relu', padding='same')(up9)
    conv9 = Conv3D(32, (3, 3, 3), activation='relu', padding='same')(conv9)
    conv10 = Conv3D(1, (1, 1, 1), activation='sigmoid')(conv9)

    model = Model(inputs=[inputs], outputs=[conv10])
    model.compile(optimizer=optimizer(lr=lr), loss=loss_metric, metrics=metrics)
    return model

mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = get_model(optimizer=Adam, loss_metric=dice_coef_loss, metrics=[dice_coef], lr=1e-3)

observe_var = 'dice_coef'
strategy = 'max'
model_checkpoint = ModelCheckpoint('unet_seg_cs9300_3d_{epoch:04}.model', monitor=observe_var, save_best_only=False, period=1000)

model.fit(train_x, train_y, batch_size=1, epochs=100, verbose=1, shuffle=True, validation_split=0.2, callbacks=[model_checkpoint])

model.save('unet_seg_final_3d_test.model')
Error:
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-3-15c1c64c47ab> in <module>
423 model_checkpoint = ModelCheckpoint('unet_seg_cs9300_3d_{epoch:04}.model', monitor=observe_var, save_best_only=False, period = 1000)
424
--> 425 model.fit(train_x, train_y, batch_size= 1, epochs= 100, verbose=1, shuffle=True, validation_split=0.2, callbacks=[model_checkpoint])
426
427 model.save('unet_seg_final_3d_test.model')
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
1211 else:
1212 fit_inputs = x + y + sample_weights
-> 1213 self._make_train_function()
1214 fit_function = self.train_function
1215
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
314 training_updates = self.optimizer.get_updates(
315 params=self._collected_trainable_weights,
--> 316 loss=self.total_loss)
317 updates = self.updates + training_updates
318
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
73 if _SYMBOLIC_SCOPE.value:
74 with get_graph().as_default():
---> 75 return func(*args, **kwargs)
76 else:
77 return func(*args, **kwargs)
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
548
549 # Apply constraints.
--> 550 if getattr(p, 'constraint', None) is not None:
551 new_p = p.constraint(new_p)
552
~/anaconda3/envs/gputest/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py in constraint(self)
566 Can be `None` if no constraint was passed.
567 """
--> 568 raise NotImplementedError
569
570 def assign(self, value, use_locking=False, name=None, read_value=True):
NotImplementedError:
This answer is based on the comments under the OP's question.

For multi-GPU training with tf.distribute.MirroredStrategy, you should use the tf.keras API rather than the standalone keras package with its tensorflow backend; in general, tf.keras and keras should not be mixed. The traceback shows exactly this mix: the standalone keras optimizer's get_updates() touches p.constraint, and the variables created under MirroredStrategy fall through to the base tf.Variable constraint property, which simply raises NotImplementedError.
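Concretely, that means building the model, optimizer, and callbacks from tensorflow.keras. A minimal sketch of the changed imports and strategy block (get_model, dice_coef_loss, and dice_coef are the question's own definitions; note that tf.keras.optimizers.Adam prefers learning_rate over the deprecated lr argument):

import tensorflow as tf
# tf.keras replacements for the standalone keras imports in the question
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv3D, MaxPooling3D, Dropout,
                                     Conv3DTranspose, concatenate)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    # get_model() must also build its layers from tensorflow.keras;
    # both model construction and compile() belong inside the scope
    model = get_model(optimizer=Adam, loss_metric=dice_coef_loss,
                      metrics=[dice_coef], lr=1e-3)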
Also try a different cross_device_ops, one that does not resort to NCCL:
strategy = tf.distribute.MirroredStrategy(
    cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())

strategy = tf.distribute.MirroredStrategy(
    cross_device_ops=tf.distribute.ReductionToOneDevice())
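Whichever cross_device_ops you pick, it is worth confirming that the strategy actually sees both replicas before calling fit():

strategy = tf.distribute.MirroredStrategy(
    cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
print('Replicas in sync:', strategy.num_replicas_in_sync)  # expect 2 for two GPUs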