为什么 Google colab TPU 很慢?
Why is Google colab TPU slow?
我正在使用 Talos 对 Keras 模型进行超参数调整。在 Google Colab TPU 上运行下面这段简短的代码非常慢。我认为这与数据类型有关。我是否应该将数据转换为张量,以使其在 TPU 上运行得更快?
%tensorflow_version 2.x
import os
import tensorflow as tf
import talos as ta
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
def iris_model(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit the iris classifier on the Colab TPU.

    Args:
        x_train: Training features; cast to float32 before fitting.
        y_train: Training targets.
        x_val: Validation features; cast to float32 before fitting.
        y_val: Validation targets.
        params: Mapping for the current trial. The keys 'losses',
            'batch_size' and 'epochs' are read.

    Returns:
        A pair ``(out, model)``: the value returned by ``model.fit`` and
        the fitted Keras model.
    """
    # The TPU is resolved, connected and initialised inside this function,
    # i.e. again on every call.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu='grpc://' + os.environ['COLAB_TPU_ADDR'],
    )
    tf.config.experimental_connect_to_host(resolver.master())
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

    # Model creation and compilation both happen under the strategy scope.
    with strategy.scope():
        model = Sequential([
            Dense(32, input_shape=(4,), activation=tf.nn.relu, name="relu"),
            Dense(3, activation=tf.nn.softmax, name="softmax"),
        ])
        model.compile(
            optimizer=Adam(learning_rate=0.1),
            loss=params['losses'],
        )

    # Features are converted to float32 for the TPU.
    train_features = x_train.astype('float32')
    val_features = x_val.astype('float32')

    # NOTE(review): steps_per_epoch=0 and the list-valued validation_data
    # are kept exactly as posted.
    history = model.fit(
        train_features,
        y_train,
        batch_size=params['batch_size'],
        epochs=params['epochs'],
        validation_data=[val_features, y_val],
        verbose=0,
        steps_per_epoch=0,
    )
    return history, model
# Fetch the iris features and targets from the Talos templates
X, y = ta.templates.datasets.iris()

# Split into training and validation parts (30% validation, no shuffling)
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=False,
)

# Hyperparameter search space handed to Talos
p = {
    'losses': ['logcosh'],
    'batch_size': [128, 256, 384, 512, 1024],
    'epochs': [10, 20],
}

# Run the Talos scan over the search space with iris_model as the model function
scan_object = ta.Scan(
    x_train,
    y_train,
    params=p,
    model=iris_model,
    experiment_name='test',
    x_val=x_val,
    y_val=y_val,
    fraction_limit=0.5,
)
感谢您的提问。
很遗憾,我无法让您的代码示例在 TensorFlow 2.2 上运行,因此我不知道您最初看到的性能如何。通过以下更改,我修复了它,并使其能够在 TPU 上运行:
- 将 `tf.config.experimental_connect_to_host(resolver.master())` 替换为 `tf.config.experimental_connect_to_cluster(resolver)`。
- 将 TPU 初始化移到 `iris_model()` 之外。
- 使用 `tf.data.Dataset` 作为 TPU 输入。
这是修改后的 Colab 代码:
# Run this to install Talos before running the rest of the code.
!pip install git+https://github.com/autonomio/talos@1.0
%tensorflow_version 2.x
import os
import tensorflow as tf
import talos as ta
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
# Report the TensorFlow version in use (TF 2.2.0 in my case)
print(tf.__version__)

# Resolve, connect to and initialise the Colab TPU at module level,
# before iris_model() is defined.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='grpc://' + os.environ['COLAB_TPU_ADDR'],
)
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
def iris_model(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit the iris classifier using ``tf.data`` inputs.

    Relies on the module-level ``resolver`` for the TPU strategy.

    Args:
        x_train: Training features.
        y_train: Training targets.
        x_val: Validation features.
        y_val: Validation targets.
        params: Mapping for the current trial. The keys 'losses',
            'batch_size' and 'epochs' are read.

    Returns:
        A pair ``(out, model)``: the value returned by ``model.fit`` and
        the fitted Keras model.
    """
    def as_batches(features, targets):
        # Wrap an (inputs, targets) pair in a Dataset batched by the trial's batch size.
        pairs = tf.data.Dataset.from_tensor_slices((features, targets))
        return pairs.batch(params['batch_size'])

    # A fresh strategy is built from the shared resolver; the model is
    # created and compiled under its scope.
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
    with strategy.scope():
        model = Sequential([
            Dense(32, input_shape=(4,), activation=tf.nn.relu, name="relu"),
            Dense(3, activation=tf.nn.softmax, name="softmax"),
        ])
        model.compile(
            optimizer=Adam(learning_rate=0.1),
            loss=params['losses'],
        )

    train_dataset = as_batches(x_train, y_train)
    val_dataset = as_batches(x_val, y_val)

    # Train on the batched dataset and validate on the held-out one.
    history = model.fit(
        train_dataset,
        epochs=params['epochs'],
        validation_data=val_dataset,
    )
    return history, model
# Fetch the iris features and targets from the Talos templates
X, y = ta.templates.datasets.iris()

# Split into training and validation parts (30% validation, no shuffling)
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=False,
)

# Hyperparameter search space handed to Talos
p = {
    'losses': ['logcosh'],
    'batch_size': [128, 256, 384, 512, 1024],
    'epochs': [10, 20],
}

# Run the Talos scan over the search space with iris_model as the model function
scan_object = ta.Scan(
    x_train,
    y_train,
    params=p,
    model=iris_model,
    experiment_name='test',
    x_val=x_val,
    y_val=y_val,
    fraction_limit=0.5,
)
对我来说,最后一次调用(即 `ta.Scan`)耗时不到 2 分钟。
对于众所周知的数据集,您可以使用 TensorFlow Datasets(TFDS)库,从而跳过自行创建 tf.data.Dataset 的步骤。TFDS 库中确实包含 iris 数据集。关于在 TPU 上使用 TFDS 的端到端示例,请参阅 TensorFlow 官方的 TPU 指南。
我正在使用 Talos 对 Keras 模型进行超参数调整。在 Google Colab TPU 上运行下面这段简短的代码非常慢。我认为这与数据类型有关。我是否应该将数据转换为张量,以使其在 TPU 上运行得更快?
%tensorflow_version 2.x
import os
import tensorflow as tf
import talos as ta
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
def iris_model(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit the iris classifier on the Colab TPU.

    Args:
        x_train: Training features; cast to float32 before fitting.
        y_train: Training targets.
        x_val: Validation features; cast to float32 before fitting.
        y_val: Validation targets.
        params: Mapping for the current trial. The keys 'losses',
            'batch_size' and 'epochs' are read.

    Returns:
        A pair ``(out, model)``: the value returned by ``model.fit`` and
        the fitted Keras model.
    """
    # The TPU is resolved, connected and initialised inside this function,
    # i.e. again on every call.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu='grpc://' + os.environ['COLAB_TPU_ADDR'],
    )
    tf.config.experimental_connect_to_host(resolver.master())
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

    # Model creation and compilation both happen under the strategy scope.
    with strategy.scope():
        model = Sequential([
            Dense(32, input_shape=(4,), activation=tf.nn.relu, name="relu"),
            Dense(3, activation=tf.nn.softmax, name="softmax"),
        ])
        model.compile(
            optimizer=Adam(learning_rate=0.1),
            loss=params['losses'],
        )

    # Features are converted to float32 for the TPU.
    train_features = x_train.astype('float32')
    val_features = x_val.astype('float32')

    # NOTE(review): steps_per_epoch=0 and the list-valued validation_data
    # are kept exactly as posted.
    history = model.fit(
        train_features,
        y_train,
        batch_size=params['batch_size'],
        epochs=params['epochs'],
        validation_data=[val_features, y_val],
        verbose=0,
        steps_per_epoch=0,
    )
    return history, model
# Fetch the iris features and targets from the Talos templates
X, y = ta.templates.datasets.iris()

# Split into training and validation parts (30% validation, no shuffling)
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=False,
)

# Hyperparameter search space handed to Talos
p = {
    'losses': ['logcosh'],
    'batch_size': [128, 256, 384, 512, 1024],
    'epochs': [10, 20],
}

# Run the Talos scan over the search space with iris_model as the model function
scan_object = ta.Scan(
    x_train,
    y_train,
    params=p,
    model=iris_model,
    experiment_name='test',
    x_val=x_val,
    y_val=y_val,
    fraction_limit=0.5,
)
感谢您的提问。
很遗憾,我无法让您的代码示例在 TensorFlow 2.2 上运行,因此我不知道您最初看到的性能如何。通过以下更改,我修复了它,并使其能够在 TPU 上运行:
- 将 `tf.config.experimental_connect_to_host(resolver.master())` 替换为 `tf.config.experimental_connect_to_cluster(resolver)`。
- 将 TPU 初始化移到 `iris_model()` 之外。
- 使用 `tf.data.Dataset` 作为 TPU 输入。
这是修改后的 Colab 代码:
# Run this to install Talos before running the rest of the code.
!pip install git+https://github.com/autonomio/talos@1.0
%tensorflow_version 2.x
import os
import tensorflow as tf
import talos as ta
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
# Report the TensorFlow version in use (TF 2.2.0 in my case)
print(tf.__version__)

# Resolve, connect to and initialise the Colab TPU at module level,
# before iris_model() is defined.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='grpc://' + os.environ['COLAB_TPU_ADDR'],
)
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
def iris_model(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit the iris classifier using ``tf.data`` inputs.

    Relies on the module-level ``resolver`` for the TPU strategy.

    Args:
        x_train: Training features.
        y_train: Training targets.
        x_val: Validation features.
        y_val: Validation targets.
        params: Mapping for the current trial. The keys 'losses',
            'batch_size' and 'epochs' are read.

    Returns:
        A pair ``(out, model)``: the value returned by ``model.fit`` and
        the fitted Keras model.
    """
    def as_batches(features, targets):
        # Wrap an (inputs, targets) pair in a Dataset batched by the trial's batch size.
        pairs = tf.data.Dataset.from_tensor_slices((features, targets))
        return pairs.batch(params['batch_size'])

    # A fresh strategy is built from the shared resolver; the model is
    # created and compiled under its scope.
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
    with strategy.scope():
        model = Sequential([
            Dense(32, input_shape=(4,), activation=tf.nn.relu, name="relu"),
            Dense(3, activation=tf.nn.softmax, name="softmax"),
        ])
        model.compile(
            optimizer=Adam(learning_rate=0.1),
            loss=params['losses'],
        )

    train_dataset = as_batches(x_train, y_train)
    val_dataset = as_batches(x_val, y_val)

    # Train on the batched dataset and validate on the held-out one.
    history = model.fit(
        train_dataset,
        epochs=params['epochs'],
        validation_data=val_dataset,
    )
    return history, model
# Fetch the iris features and targets from the Talos templates
X, y = ta.templates.datasets.iris()

# Split into training and validation parts (30% validation, no shuffling)
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=False,
)

# Hyperparameter search space handed to Talos
p = {
    'losses': ['logcosh'],
    'batch_size': [128, 256, 384, 512, 1024],
    'epochs': [10, 20],
}

# Run the Talos scan over the search space with iris_model as the model function
scan_object = ta.Scan(
    x_train,
    y_train,
    params=p,
    model=iris_model,
    experiment_name='test',
    x_val=x_val,
    y_val=y_val,
    fraction_limit=0.5,
)
对我来说,最后一次调用(即 `ta.Scan`)耗时不到 2 分钟。
对于众所周知的数据集,您可以使用 TensorFlow Datasets(TFDS)库,从而跳过自行创建 tf.data.Dataset 的步骤。TFDS 库中确实包含 iris 数据集。关于在 TPU 上使用 TFDS 的端到端示例,请参阅 TensorFlow 官方的 TPU 指南。