TF2.x eager模式现在不支持ParameterServerStrategy?
Does TF2.x eager mode not support ParameterServerStrategy yet?
TF 版本:最新 master 分支,提交 b083cea
下面是一个使用 TF2.0 eager 模式的简单示例:使用 MirroredStrategy 时运行成功,但使用 ParameterServerStrategy 时出错。
TF2.0 eager 模式现在支持 ParameterServerStrategy 吗?到目前为止我还没有找到成功的例子 :(
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import tensorflow_datasets as tfds
import os, json
# Load MNIST as (image, label) pairs, together with the dataset metadata.
datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
mnist_train = datasets['train']
mnist_test = datasets['test']

# Describe a localhost cluster with one worker and one parameter server, and
# mark this process as worker 0.
# NOTE(review): presumably the strategy constructor below reads TF_CONFIG, so
# keep this assignment ahead of it -- confirm.
os.environ['TF_CONFIG'] = json.dumps({
    "cluster": {
        "worker": ["localhost:12345"],
        "ps": ["localhost:12346"]
    },
    "task": {"type": "worker", "index": 0}
})

# Strategy under test. The commented-out MirroredStrategy line is the
# alternative that the report says runs successfully.
strategy = tf.distribute.experimental.ParameterServerStrategy()
#strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# Split sizes taken from the dataset metadata.
num_train_examples = info.splits['train'].num_examples
num_test_examples = info.splits['test'].num_examples

# Shuffle buffer size, and a global batch size that scales with the number of
# replicas kept in sync.
BUFFER_SIZE = 10000
BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
def scale(image, label):
    """Cast an image to float32 and divide its values by 255.

    Args:
        image: Image tensor to convert; it is cast with ``tf.cast``.
        label: Label paired with the image; returned unchanged.

    Returns:
        A ``(image, label)`` tuple holding the rescaled float32 image.
    """
    image = tf.cast(image, tf.float32)
    image /= 255
    return image, label
# Input pipelines: both splits are rescaled with `scale`; only the training
# split is shuffled (buffer of BUFFER_SIZE) before batching.
train_dataset = mnist_train.map(scale).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)

# Build the model inside the strategy scope.
# NOTE(review): the indentation of this block was lost in the paste; placing
# `model.compile` inside the scope follows the TensorFlow "Distributed
# training with Keras" tutorial this script resembles -- confirm.
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['accuracy'])

checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Function for decaying the learning rate.
# You can define any decay function you need.
def decay(epoch):
    """Return the learning rate to use for the given epoch index.

    Args:
        epoch: Epoch index as passed by the learning-rate scheduler.

    Returns:
        1e-3 for epochs below 3, 1e-4 for epochs 3 through 6, and 1e-5 from
        epoch 7 onwards.
    """
    if epoch < 3:
        return 1e-3
    # The first branch already handled epoch < 3, so only the upper bound
    # needs checking here.
    elif epoch < 7:
        return 1e-4
    else:
        return 1e-5
# Callback for printing the LR at the end of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
    """Keras callback that prints the optimizer's learning rate after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the 1-based epoch number and the current learning rate.

        Args:
            epoch: Zero-based epoch index; printed as ``epoch + 1``.
            logs: Unused.
        """
        # Reads the learning rate from the module-level `model`.
        print('\nLearning rate for epoch {} is {}'.format(epoch + 1,
                                                          model.optimizer.lr.numpy()))
# Callbacks used during training: TensorBoard logging, weight-only
# checkpoints, the `decay` learning-rate schedule, and learning-rate printing.
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        save_weights_only=True,
    ),
    tf.keras.callbacks.LearningRateScheduler(decay),
    PrintLR(),
]

# Train for 12 epochs.
model.fit(train_dataset, epochs=12, callbacks=callbacks)

# Restore the latest checkpoint found in checkpoint_dir, then evaluate on the
# test split.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
eval_loss, eval_acc = model.evaluate(eval_dataset)
print('Eval loss: {}, Eval Accuracy: {}'.format(eval_loss, eval_acc))
错误信息
    tf.keras.layers.Dense(10, activation='softmax')
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/training/tracking/base.py", line 456, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/sequential.py", line 116, in __init__
    super(Sequential, self).__init__(name=name, autocast=False)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 199, in __init__
    self._init_batch_counters()
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/training/tracking/base.py", line 456, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 206, in _init_batch_counters
    self._train_counter = variables.Variable(0, dtype='int64', aggregation=agg)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 261, in __call__
    return cls._variable_v2_call(*args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 255, in _variable_v2_call
    shape=shape)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 66, in getter
    return captured_getter(captured_previous, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1769, in creator_with_resource_vars
    return self._create_variable(next_creator, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/distribute/parameter_server_strategy.py", line 455, in _create_variable
    ops.device(self._variable_device):
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 5183, in device
    "tf.device does not support functions when eager execution "
RuntimeError: tf.device does not support functions when eager execution is enabled.
tf.distribute.experimental.ParameterServerStrategy 的文档页面中有如下说明:
Note: This strategy only works with the Estimator API. Pass an instance of this strategy to the experimental_distribute argument when you create the RunConfig. This instance of RunConfig should then be passed to the Estimator instance on which train_and_evaluate is called.
以下是如何使用 tf.distribute.experimental.ParameterServerStrategy() 的示例:
# RunConfig receives the strategy through its `train_distribute` keyword
# argument; a dotted name is not valid keyword-argument syntax.
strategy = tf.distribute.experimental.ParameterServerStrategy()
run_config = tf.estimator.RunConfig(train_distribute=strategy)
# Sketch only: the remaining Estimator / train_and_evaluate arguments are
# elided here.
estimator = tf.estimator.Estimator(config=run_config)
tf.estimator.train_and_evaluate(estimator,...)
此外,“Distributed training with TensorFlow”页面说明了目前 TF 2.0 在哪些场景下支持哪些分布式策略。
希望这能回答您的问题。祝学习愉快。
TF 版本:最新 master 分支,提交 b083cea
下面是一个使用 TF2.0 eager 模式的简单示例:使用 MirroredStrategy 时运行成功,但使用 ParameterServerStrategy 时出错。
TF2.0 eager 模式现在支持 ParameterServerStrategy 吗?到目前为止我还没有找到成功的例子 :(
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import tensorflow_datasets as tfds
import os, json
# Load MNIST as (image, label) pairs, together with the dataset metadata.
datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
mnist_train = datasets['train']
mnist_test = datasets['test']

# Describe a localhost cluster with one worker and one parameter server, and
# mark this process as worker 0.
# NOTE(review): presumably the strategy constructor below reads TF_CONFIG, so
# keep this assignment ahead of it -- confirm.
os.environ['TF_CONFIG'] = json.dumps({
    "cluster": {
        "worker": ["localhost:12345"],
        "ps": ["localhost:12346"]
    },
    "task": {"type": "worker", "index": 0}
})

# Strategy under test. The commented-out MirroredStrategy line is the
# alternative that the report says runs successfully.
strategy = tf.distribute.experimental.ParameterServerStrategy()
#strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# Split sizes taken from the dataset metadata.
num_train_examples = info.splits['train'].num_examples
num_test_examples = info.splits['test'].num_examples

# Shuffle buffer size, and a global batch size that scales with the number of
# replicas kept in sync.
BUFFER_SIZE = 10000
BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
def scale(image, label):
    """Cast an image to float32 and divide its values by 255.

    Args:
        image: Image tensor to convert; it is cast with ``tf.cast``.
        label: Label paired with the image; returned unchanged.

    Returns:
        A ``(image, label)`` tuple holding the rescaled float32 image.
    """
    image = tf.cast(image, tf.float32)
    image /= 255
    return image, label
# Input pipelines: both splits are rescaled with `scale`; only the training
# split is shuffled (buffer of BUFFER_SIZE) before batching.
train_dataset = mnist_train.map(scale).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)

# Build the model inside the strategy scope.
# NOTE(review): the indentation of this block was lost in the paste; placing
# `model.compile` inside the scope follows the TensorFlow "Distributed
# training with Keras" tutorial this script resembles -- confirm.
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['accuracy'])

checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Function for decaying the learning rate.
# You can define any decay function you need.
def decay(epoch):
    """Return the learning rate to use for the given epoch index.

    Args:
        epoch: Epoch index as passed by the learning-rate scheduler.

    Returns:
        1e-3 for epochs below 3, 1e-4 for epochs 3 through 6, and 1e-5 from
        epoch 7 onwards.
    """
    if epoch < 3:
        return 1e-3
    # The first branch already handled epoch < 3, so only the upper bound
    # needs checking here.
    elif epoch < 7:
        return 1e-4
    else:
        return 1e-5
# Callback for printing the LR at the end of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
    """Keras callback that prints the optimizer's learning rate after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the 1-based epoch number and the current learning rate.

        Args:
            epoch: Zero-based epoch index; printed as ``epoch + 1``.
            logs: Unused.
        """
        # Reads the learning rate from the module-level `model`.
        print('\nLearning rate for epoch {} is {}'.format(epoch + 1,
                                                          model.optimizer.lr.numpy()))
# Callbacks used during training: TensorBoard logging, weight-only
# checkpoints, the `decay` learning-rate schedule, and learning-rate printing.
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        save_weights_only=True,
    ),
    tf.keras.callbacks.LearningRateScheduler(decay),
    PrintLR(),
]

# Train for 12 epochs.
model.fit(train_dataset, epochs=12, callbacks=callbacks)

# Restore the latest checkpoint found in checkpoint_dir, then evaluate on the
# test split.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
eval_loss, eval_acc = model.evaluate(eval_dataset)
print('Eval loss: {}, Eval Accuracy: {}'.format(eval_loss, eval_acc))
错误信息
    tf.keras.layers.Dense(10, activation='softmax')
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/training/tracking/base.py", line 456, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/sequential.py", line 116, in __init__
    super(Sequential, self).__init__(name=name, autocast=False)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 199, in __init__
    self._init_batch_counters()
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/training/tracking/base.py", line 456, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 206, in _init_batch_counters
    self._train_counter = variables.Variable(0, dtype='int64', aggregation=agg)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 261, in __call__
    return cls._variable_v2_call(*args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 255, in _variable_v2_call
    shape=shape)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/ops/variables.py", line 66, in getter
    return captured_getter(captured_previous, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1769, in creator_with_resource_vars
    return self._create_variable(next_creator, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/distribute/parameter_server_strategy.py", line 455, in _create_variable
    ops.device(self._variable_device):
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 5183, in device
    "tf.device does not support functions when eager execution "
RuntimeError: tf.device does not support functions when eager execution is enabled.
tf.distribute.experimental.ParameterServerStrategy 的文档页面中有如下说明:
Note: This strategy only works with the Estimator API. Pass an instance of this strategy to the experimental_distribute argument when you create the RunConfig. This instance of RunConfig should then be passed to the Estimator instance on which train_and_evaluate is called.
以下是如何使用 tf.distribute.experimental.ParameterServerStrategy() 的示例:
# RunConfig receives the strategy through its `train_distribute` keyword
# argument; a dotted name is not valid keyword-argument syntax.
strategy = tf.distribute.experimental.ParameterServerStrategy()
run_config = tf.estimator.RunConfig(train_distribute=strategy)
# Sketch only: the remaining Estimator / train_and_evaluate arguments are
# elided here.
estimator = tf.estimator.Estimator(config=run_config)
tf.estimator.train_and_evaluate(estimator,...)
此外,“Distributed training with TensorFlow”页面说明了目前 TF 2.0 在哪些场景下支持哪些分布式策略。
希望这能回答您的问题。祝学习愉快。