TensorFlow 1.15, multi-GPU on one machine: how should batch_size be set?
I am pre-training BERT on a single machine with 4 GPUs.
The input function code:
def input_fn(params):
  """The actual input function."""
  batch_size = FLAGS.train_batch_size

  name_to_features = {
      "input_ids":
          tf.FixedLenFeature([max_seq_length], tf.int64),
      "input_mask":
          tf.FixedLenFeature([max_seq_length], tf.int64),
      "segment_ids":
          tf.FixedLenFeature([max_seq_length], tf.int64),
      "masked_lm_positions":
          tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
      "masked_lm_ids":
          tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
      "masked_lm_weights":
          tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
      "next_sentence_labels":
          tf.FixedLenFeature([1], tf.int64),
  }

  # For training, we want a lot of parallel reading and shuffling.
  # For eval, we want no shuffling and parallel reading doesn't matter.
  if is_training:
    d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
    d = d.repeat()
    d = d.shuffle(buffer_size=len(input_files))

    # `cycle_length` is the number of parallel files that get read.
    cycle_length = min(num_cpu_threads, len(input_files))

    # `sloppy` mode means that the interleaving is not exact. This adds
    # even more randomness to the training pipeline.
    d = d.apply(
        tf.contrib.data.parallel_interleave(
            tf.data.TFRecordDataset,
            sloppy=is_training,
            cycle_length=cycle_length))
    d = d.shuffle(buffer_size=100)
  else:
    d = tf.data.TFRecordDataset(input_files)
    # Since we evaluate for a fixed number of steps we don't want to encounter
    # out-of-range exceptions.
    d = d.repeat()

  # We must `drop_remainder` on training because the TPU requires fixed
  # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
  # and we *don't* want to drop the remainder, otherwise we wont cover
  # every sample.
  d = d.apply(
      tf.contrib.data.map_and_batch(
          lambda record: _decode_record(record, name_to_features),
          batch_size=batch_size,
          num_parallel_batches=num_cpu_threads,
          drop_remainder=True))
  d = d.prefetch(10)
  return d
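For a quick sanity check of what one draw from this pipeline looks like, here is a hedged, self-contained sketch (my own toy example, not code from the post): it writes a few dummy records, runs a stripped-down version of the batching above, and confirms that a single call to the iterator returns exactly batch_size examples. The feature names, sizes, and the temp file path are placeholders.

import tensorflow as tf

max_seq_length = 8
batch_size = 4
tmp_file = "/tmp/toy_pretrain.tfrecord"   # placeholder path

# Write 16 toy records so the pipeline has something to read.
with tf.python_io.TFRecordWriter(tmp_file) as writer:
    for i in range(16):
        feature = {"input_ids": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[i] * max_seq_length))}
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())

name_to_features = {"input_ids": tf.FixedLenFeature([max_seq_length], tf.int64)}

d = tf.data.TFRecordDataset([tmp_file]).repeat()
d = d.apply(
    tf.contrib.data.map_and_batch(
        lambda record: tf.parse_single_example(record, name_to_features),
        batch_size=batch_size,
        drop_remainder=True))
batch = d.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    # One iterator draw yields one batch of `batch_size` rows: (4, 8) here.
    print(sess.run(batch)["input_ids"].shape)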
The MirroredStrategy code:
distribution = tf.contrib.distribute.MirroredStrategy(
    devices=["device:GPU:%d" % i for i in range(FLAGS.n_gpus)],
    # num_gpus=4,
    cross_tower_ops=tf.distribute.HierarchicalCopyAllReduce())

run_config = RunConfig(
    train_distribute=distribution,
    # eval_distribute=dist_strategy,
    log_step_count_steps=log_every_n_steps,
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps)

model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=FLAGS.init_checkpoint,
    learning_rate=FLAGS.learning_rate,
    num_train_steps=FLAGS.num_train_steps,
    num_warmup_steps=FLAGS.num_warmup_steps,
    use_tpu=FLAGS.use_tpu,
    use_one_hot_embeddings=FLAGS.use_tpu)

# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
estimator = Estimator(
    model_fn=model_fn,
    params={},
    config=run_config)
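One way to see what each GPU actually receives per step is to log from inside the model_fn: under MirroredStrategy the model_fn body runs once per replica, so each replica can print its own id and the shape of its batch. Below is a hedged, trimmed sketch of such a debugging model_fn (my own illustration with a dummy loss, not the post's model_fn_builder; the name debug_model_fn is an assumption).

import tensorflow as tf

def debug_model_fn(features, labels, mode, params):
    """Minimal model_fn that reports the per-replica batch it was given."""
    replica_ctx = tf.distribute.get_replica_context()
    replica_id = replica_ctx.replica_id_in_sync_group if replica_ctx else -1

    input_ids = features["input_ids"]   # expected shape: [train_batch_size, max_seq_length]
    # Print "<replica id> <per-GPU batch shape>" each step, once per replica.
    input_ids = tf.compat.v1.Print(
        input_ids, [replica_id, tf.shape(input_ids)],
        message="replica / per-GPU batch shape: ")

    # Dummy trainable variable and loss, just so a train_op can be built.
    w = tf.compat.v1.get_variable("w", [], initializer=tf.zeros_initializer())
    loss = tf.reduce_mean((tf.cast(input_ids, tf.float32) - w) ** 2)
    train_op = tf.compat.v1.train.GradientDescentOptimizer(1e-3).minimize(
        loss, global_step=tf.compat.v1.train.get_or_create_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

If batch_size really is per GPU, each replica should report a leading dimension equal to FLAGS.train_batch_size.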
The problem is that I have 4 GPUs, and each GPU can handle a batch size of at most 8.
If I set train_batch_size = 8 instead of 32, training runs, but I don't know whether each GPU gets different data within a single training step.
If I set train_batch_size = 32, it runs out of memory (OOM).
Is my code correct as it stands? Will it distribute the data across the 4 GPUs so that each GPU receives a different batch?
According to BERT-GPU, the code is fine: batch_size here is the batch size for a single GPU.
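A minimal sketch of the arithmetic that answer implies, assuming 4 GPUs are visible and that train_batch_size is the per-GPU batch (the variable names below are illustrative, not flags from the post):

import tensorflow as tf

per_gpu_batch_size = 8   # what input_fn batches with (FLAGS.train_batch_size)

strategy = tf.contrib.distribute.MirroredStrategy(
    devices=["device:GPU:%d" % i for i in range(4)])

# Each replica (GPU) pulls its own batch of `per_gpu_batch_size` examples,
# so one training step consumes this many examples in total:
global_batch_size = per_gpu_batch_size * strategy.num_replicas_in_sync
print(global_batch_size)  # 32 with 4 GPUs

Read this way, keeping train_batch_size = 8 already gives an effective batch of 32 across the 4 GPUs, and the OOM at train_batch_size = 32 is expected, since that would ask each GPU to fit 32 examples on its own.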