Getting "TPU compilation failed" error while training an RNN on TPU
I am getting this error when training my simple RNN model on a TPU.
(0) Internal: {{function_node __inference_train_function_4697}} Compilation failure: RET_CHECK failure (third_party/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc:1343) operand != nullptr
TPU compilation failed
[[{{node tpu_compile_succeeded_assert/_15240937476258052108/_5}}]]
[[tpu_compile_succeeded_assert/_15240937476258052108/_5/_245]]
(1) Internal: {{function_node __inference_train_function_4697}} Compilation failure: RET_CHECK failure (third_party/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc:1343) operand != nullptr
TPU compilation failed
[[{{node tpu_compile_succeeded_assert/_15240937476258052108/_5}}]]
[[tpu_compile_succeeded_assert/_15240937476258052108/_5/_197]]
(2) Internal: {{function_node __inference_train_function_4697}} Compilation failure: RET_CHECK failure (third_party/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc:1343) operand != nullptr
TPU compilation failed
[[{{node tpu_compile_succeeded_assert/_15240937476258052108/_5}}]]
[[tpu_compile_succeeded_assert/_15240937476258052108/_5/_233]]
(3) Internal: {{function_node __inference_train_function_4697}} Compilation failure: RET_CHECK failure (third_party/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc:1343) operand != nullptr
TPU compilation failed
[[{{node tpu_compile_succeeded_assert/_15240937476258052108/_5}}]]
[[tpu_compile_succeeded_assert/_15240937476258052108/_5/_173]]
(4) Internal: {{function_node __inference_train_function_4697}} Compilation failure: RET_CHECK failure (third_party/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc:1343) operand != nullptr
TPU compilation failed
[[{{node tpu_compile_succeeded_assert/_15240937476258052108/_5}}]]
[[tpu_compile_succeeded_assert/_15240937476258052108/_5/_257]]
(5) Internal: {{function_node __inference_train_function_4697}} Compilation failure: RET_CHECK failure (third_party/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc:1343) operand != nullptr
TPU compilation failed
[[{{node tpu_compile_succeeded_assert/_15240937476258052108/_5}}]]
[[tpu_compile_succeeded_assert/_15240937476258052108/_5/_221]]
(6) Internal: {{function_node __inference_train_function_4697}} Compilation failure: RET_CHECK failure (third_party/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc:1343) operand != nullptr
TPU compilation failed
[[{{node tpu_compile_succeeded_assert/_15240937476258052108/_5}}]]
[[tpu_compile_succeeded_assert/_15240937476258052108/_5/_185]]
(7) Internal: {{function_node __inference_train_function_4697}} Compilation failure: RET_CHECK failure (third_party/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc:1343) operand != nullptr
TPU compilation failed
[[{{node tpu_compile_succeeded_assert/_15240937476258052108/_5}}]]
[[tpu_compile_succeeded_assert/_15240937476258052108/_5/_209]]
(8) Internal: {{function_node __inference_train_function_4697}} Compilation failure: RET_CHECK failure (third_party/tensorflow/ ... [truncated]
Here is the code that initializes the TPU:
import tensorflow as tf

try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    #tpu = None
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    print('Running on TPU ', resolver.master())
except ValueError:
    resolver = None

if resolver:
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    print("All devices: ", tf.config.list_logical_devices('TPU'))
    strategy = tf.distribute.TPUStrategy(resolver)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)
Here is the model I am training:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

with strategy.scope():
    # A SimpleRNN without any pretrained embeddings and one dense layer
    model = Sequential()
    model.add(
        Embedding(len(word_index) + 1,
                  300,
                  input_length=max_len)
    )
    model.add(SimpleRNN(100))
    model.add(Dense(no_of_categories, activation='sigmoid'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    model.summary()
Please help me solve this problem.
You can try setting the unroll argument of the SimpleRNN layer to True. With unroll=True, Keras unrolls the recurrence into a fixed number of explicit time steps instead of using a symbolic loop, which avoids the dynamic-shape handling that appears to trigger this compilation failure:
import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

strategy = tf.distribute.TPUStrategy(resolver)
with strategy.scope():
    vocab_size = 100
    max_len = 20
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size + 1, 300, input_length=max_len))
    model.add(tf.keras.layers.SimpleRNN(100, unroll=True))
    model.add(tf.keras.layers.Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

x = tf.random.uniform((50, max_len), maxval=vocab_size, dtype=tf.int32)
y = tf.random.uniform((50, 1), maxval=10, dtype=tf.int32)
y = tf.keras.utils.to_categorical(y, num_classes=10)
model.fit(x, y, batch_size=5, epochs=5)
Also, check this and this post.
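Applied to the model from the question, the only change is passing unroll=True to the SimpleRNN layer. A minimal sketch, assuming word_index, max_len, and no_of_categories are defined as in the question and strategy is the TPUStrategy created earlier:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

with strategy.scope():
    model = Sequential()
    # A fixed input_length gives the unrolled RNN a known number of time steps.
    model.add(Embedding(len(word_index) + 1, 300, input_length=max_len))
    # unroll=True replaces the symbolic loop with max_len explicit steps,
    # so the TPU/XLA compiler only sees static shapes.
    model.add(SimpleRNN(100, unroll=True))
    model.add(Dense(no_of_categories, activation='sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Keep in mind that unrolling is only practical for fairly short sequences, since the unrolled graph (and its memory use) grows with max_len.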