为什么我的 Resnet56 实现的准确性低于原始论文?
Why does my Resnet56 implementation have less accuracy than in the original paper?
我尝试在 TensorFlow 中实现 ResNet56 来对 CIFAR-10 图像进行分类,但不知何故,我得到的准确率低于原论文作者报告的结果。
我完全按照论文中的描述做了所有事情:相同的架构、相同的数据增强、相同的学习率调度、相同的批量大小...
但不知何故,我的实现只产生了 91.84% 的准确率,而在原始论文中,56 层 Resnet 的准确率达到了 93.03%。
这是 ResNet 论文的链接:https://arxiv.org/pdf/1512.03385.pdf
我发现了我的问题所在(如果有兴趣,请参阅答案)。在这里您可以找到我的(现已修正的)实现,它现在可以达到与论文完全相同的准确率:
import argparse
import datetime
import os
import re
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
# Report only TF errors and warnings by default; a level the user has
# already exported is left untouched.
if "TF_CPP_MIN_LOG_LEVEL" not in os.environ:
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Command-line options of the training script.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--resnet_n",
    type=int,
    default=9,
    help="n from Resnet paper.",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="Random seed.",
)
parser.add_argument(
    "--threads",
    type=int,
    default=1,
    help="Maximum number of threads to use.",
)
class ResNet(keras.Model):
    """Residual network for 32x32x3 inputs, built with the Keras functional API.

    The graph is a 3x3 stem convolution, three stages of residual blocks with
    16, 32 and 64 filters, global average pooling and a 10-way softmax layer.
    With ``args.resnet_n = n`` (n >= 1) every stage holds ``n`` blocks, i.e.
    ``6n + 2`` weighted layers when projection shortcuts are not counted.
    """

    class ResidualBlock(tf.Module):
        """Two 3x3 conv + batch-norm layers wrapped by a shortcut connection."""

        def __init__(self, filters: int, down_sample: bool):
            """Store the block configuration.

            Args:
                filters: Number of output channels of every convolution.
                down_sample: When true, the first convolution and the shortcut
                    use stride 2 and the shortcut is a 1x1 conv + batch norm.
            """
            super().__init__()
            self.filters = filters
            self.down_sample = down_sample

        def _conv(self, kernel_size, strides):
            """Create a bias-free, "same"-padded, He-normal convolution layer."""
            return keras.layers.Conv2D(
                filters=self.filters,
                kernel_size=kernel_size,
                strides=strides,
                padding="same",
                use_bias=False,
                kernel_initializer=tf.keras.initializers.HeNormal,
            )

        def __call__(self, x):
            """Apply the block to ``x`` and return the activated sum."""
            first_strides = (2, 2) if self.down_sample else (1, 1)

            # Main branch: conv -> BN -> ReLU -> conv -> BN.
            branch = self._conv((3, 3), first_strides)(x)
            branch = keras.layers.BatchNormalization()(branch)
            branch = keras.layers.ReLU()(branch)
            branch = self._conv((3, 3), (1, 1))(branch)
            branch = keras.layers.BatchNormalization()(branch)

            # Shortcut: identity, or a strided 1x1 projection so that the
            # shapes still match after down-sampling.
            shortcut = x
            if self.down_sample:
                shortcut = self._conv((1, 1), (2, 2))(shortcut)
                shortcut = tf.keras.layers.BatchNormalization()(shortcut)

            merged = branch + shortcut
            return keras.layers.ReLU()(merged)

    def __init__(self, args):
        """Assemble the network graph.

        Args:
            args: Object whose ``resnet_n`` attribute gives the number of
                residual blocks in each of the three stages.
        """
        inputs = keras.layers.Input(shape=(32, 32, 3), dtype=tf.float32)

        # Stem: a single 3x3 convolution with 16 filters.
        stem = keras.layers.Conv2D(
            filters=16,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="same",
            use_bias=False,
            kernel_initializer=tf.keras.initializers.HeNormal,
        )
        hidden = stem(inputs)
        hidden = keras.layers.BatchNormalization()(hidden)
        hidden = keras.layers.ReLU()(hidden)

        # First stage keeps the resolution of the stem.
        for _ in range(args.resnet_n):
            hidden = self.ResidualBlock(16, False)(hidden)

        # Each later stage opens with one down-sampling block followed by
        # the remaining plain blocks of that stage.
        for width in (32, 64):
            hidden = self.ResidualBlock(width, True)(hidden)
            for _ in range(1, args.resnet_n):
                hidden = self.ResidualBlock(width, False)(hidden)

        hidden = keras.layers.GlobalAveragePooling2D()(hidden)
        predictions = keras.layers.Dense(10, activation=tf.nn.softmax)(hidden)
        super().__init__(inputs, predictions)
def main(args, tb_callback):
    """Train the ResNet on CIFAR-10 and save the trained model.

    Args:
        args: Parsed options. ``args`` is handed to ``ResNet``, ``threads`` is
            forwarded to ``Model.fit`` and ``logdir`` is the directory under
            which the trained model is saved.
        tb_callback: Keras callback passed to ``Model.fit``.
    """
    ds_train, ds_test = tfds.load("cifar10", split=["train", "test"], as_supervised=True)

    # Random horizontal flip plus a shift of up to 12.5% of each side
    # (4 pixels on a 32x32 image); uncovered pixels are filled with 0.5.
    img_augmentation = keras.Sequential(
        [
            keras.layers.RandomFlip("horizontal"),
            keras.layers.RandomTranslation(
                height_factor=0.125,
                width_factor=0.125,
                fill_mode="constant",
                fill_value=0.5,
            ),
        ]
    )

    # Scale pixel values to [0, 1].
    ds_train = ds_train.map(lambda img, label: (tf.cast(img, tf.float32) / 255.0, label))
    ds_test = ds_test.map(lambda img, label: (tf.cast(img, tf.float32) / 255.0, label))

    # Per-pixel mean of the (un-augmented) training images.
    total_count, per_pixel_sum = ds_train.reduce(
        (np.float32(0), tf.zeros((32, 32, 3))),
        lambda prev, curr: (prev[0] + 1.0, prev[1] + curr[0]),
    )
    per_pixel_mean = per_pixel_sum / total_count

    # training=True is required, otherwise the random layers act as identity.
    ds_train = ds_train.map(lambda img, label: (img_augmentation(img, training=True), tf.one_hot(label, 10)))
    ds_test = ds_test.map(lambda img, label: (img, tf.one_hot(label, 10)))

    # The training-set mean is subtracted from both splits.
    ds_train = ds_train.map(lambda img, label: (img - per_pixel_mean, label))
    ds_test = ds_test.map(lambda img, label: (img - per_pixel_mean, label))

    ds_train = ds_train.shuffle(5000).batch(128, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
    # Evaluate on every test image: shuffling is pointless for evaluation and
    # drop_remainder=True would discard the final partial batch (the TFDS
    # catalog lists 10,000 test images, which is not a multiple of 128).
    ds_test = ds_test.batch(128).prefetch(tf.data.AUTOTUNE)

    model = ResNet(args)

    # Step schedules indexed by optimizer iteration; the weight decay uses
    # the same boundaries as the learning rate.
    learning_rate = keras.optimizers.schedules.PiecewiseConstantDecay(
        [32000, 48000], [0.1, 0.01, 0.001]
    )
    weight_decay = keras.optimizers.schedules.PiecewiseConstantDecay(
        [32000, 48000], [1e-4, 1e-5, 1e-6]
    )

    model.compile(
        optimizer=tfa.optimizers.SGDW(
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            momentum=0.9,
            nesterov=False,
        ),
        loss=tf.losses.CategoricalCrossentropy(),
        metrics=[tf.metrics.CategoricalAccuracy("accuracy")],
    )

    # NOTE(review): Keras documents `workers` / `use_multiprocessing` as
    # applying to generator or Sequence inputs; presumably they have no
    # effect with a tf.data.Dataset - confirm before relying on them.
    model.fit(
        x=ds_train,
        epochs=200,
        validation_data=ds_test,
        callbacks=[tb_callback],
        use_multiprocessing=True,
        workers=args.threads,
    )
    model.save(args.logdir + '/model')
    print('OK')
def build_logdir(args):
    """Return a log directory path encoding the script, start time and options.

    The path has the form ``logs/<script>/<timestamp>-<options>`` where
    ``<options>`` lists every attribute of ``args`` as ``abbr=value``, sorted
    by attribute name. Each name is abbreviated to the first letters of its
    underscore-separated words (``resnet_n`` becomes ``rn``).

    Args:
        args: Namespace-like object; all attributes returned by ``vars(args)``
            are included in the directory name.

    Returns:
        The log directory path as a string.
    """
    script_name = os.path.basename(globals().get("__file__", "notebook"))
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    # The replacement must be the captured first letter (\1); an empty
    # replacement would erase the option names and leave only "=value".
    options = ",".join(
        "{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
        for key, value in sorted(vars(args).items())
    )
    return os.path.join("{}/{}".format("logs", script_name), "{}-{}".format(timestamp, options))


if __name__ == "__main__":
    # Without __file__ (e.g. in a notebook) ignore the process arguments.
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Fix random seeds and threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create logdir name
    args.logdir = build_logdir(args)

    tb_callback = tf.keras.callbacks.TensorBoard(args.logdir, histogram_freq=1, update_freq=100, profile_batch=0)
    main(args, tb_callback)
我找到了问题所在:
- 我没有正确应用数据增强:将 img_augmentation(img) 更改为 img_augmentation(img, training=True)
- 将卷积核初始化器(kernel initializer)更改为 HeNormal,这是论文中使用的初始化方式
- 添加了逐像素均值减法作为归一化
- 禁用 Nesterov 动量在一定程度上有所帮助(我不知道为什么)
我尝试在 TensorFlow 中实现 ResNet56 来对 CIFAR-10 图像进行分类,但不知何故,我得到的准确率低于原论文作者报告的结果。
我完全按照论文中的描述做了所有事情:相同的架构、相同的数据增强、相同的学习率调度、相同的批量大小...
但不知何故,我的实现只产生了 91.84% 的准确率,而在原始论文中,56 层 Resnet 的准确率达到了 93.03%。
这是 ResNet 论文的链接:https://arxiv.org/pdf/1512.03385.pdf
我发现了我的问题所在(如果有兴趣,请参阅答案)。在这里您可以找到我的(现已修正的)实现,它现在可以达到与论文完全相同的准确率:
import argparse
import datetime
import os
import re
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
# Report only TF errors and warnings by default; a level the user has
# already exported is left untouched.
if "TF_CPP_MIN_LOG_LEVEL" not in os.environ:
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Command-line options of the training script.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--resnet_n",
    type=int,
    default=9,
    help="n from Resnet paper.",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="Random seed.",
)
parser.add_argument(
    "--threads",
    type=int,
    default=1,
    help="Maximum number of threads to use.",
)
class ResNet(keras.Model):
    """Residual network for 32x32x3 inputs, built with the Keras functional API.

    The graph is a 3x3 stem convolution, three stages of residual blocks with
    16, 32 and 64 filters, global average pooling and a 10-way softmax layer.
    With ``args.resnet_n = n`` (n >= 1) every stage holds ``n`` blocks, i.e.
    ``6n + 2`` weighted layers when projection shortcuts are not counted.
    """

    class ResidualBlock(tf.Module):
        """Two 3x3 conv + batch-norm layers wrapped by a shortcut connection."""

        def __init__(self, filters: int, down_sample: bool):
            """Store the block configuration.

            Args:
                filters: Number of output channels of every convolution.
                down_sample: When true, the first convolution and the shortcut
                    use stride 2 and the shortcut is a 1x1 conv + batch norm.
            """
            super().__init__()
            self.filters = filters
            self.down_sample = down_sample

        def _conv(self, kernel_size, strides):
            """Create a bias-free, "same"-padded, He-normal convolution layer."""
            return keras.layers.Conv2D(
                filters=self.filters,
                kernel_size=kernel_size,
                strides=strides,
                padding="same",
                use_bias=False,
                kernel_initializer=tf.keras.initializers.HeNormal,
            )

        def __call__(self, x):
            """Apply the block to ``x`` and return the activated sum."""
            first_strides = (2, 2) if self.down_sample else (1, 1)

            # Main branch: conv -> BN -> ReLU -> conv -> BN.
            branch = self._conv((3, 3), first_strides)(x)
            branch = keras.layers.BatchNormalization()(branch)
            branch = keras.layers.ReLU()(branch)
            branch = self._conv((3, 3), (1, 1))(branch)
            branch = keras.layers.BatchNormalization()(branch)

            # Shortcut: identity, or a strided 1x1 projection so that the
            # shapes still match after down-sampling.
            shortcut = x
            if self.down_sample:
                shortcut = self._conv((1, 1), (2, 2))(shortcut)
                shortcut = tf.keras.layers.BatchNormalization()(shortcut)

            merged = branch + shortcut
            return keras.layers.ReLU()(merged)

    def __init__(self, args):
        """Assemble the network graph.

        Args:
            args: Object whose ``resnet_n`` attribute gives the number of
                residual blocks in each of the three stages.
        """
        inputs = keras.layers.Input(shape=(32, 32, 3), dtype=tf.float32)

        # Stem: a single 3x3 convolution with 16 filters.
        stem = keras.layers.Conv2D(
            filters=16,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="same",
            use_bias=False,
            kernel_initializer=tf.keras.initializers.HeNormal,
        )
        hidden = stem(inputs)
        hidden = keras.layers.BatchNormalization()(hidden)
        hidden = keras.layers.ReLU()(hidden)

        # First stage keeps the resolution of the stem.
        for _ in range(args.resnet_n):
            hidden = self.ResidualBlock(16, False)(hidden)

        # Each later stage opens with one down-sampling block followed by
        # the remaining plain blocks of that stage.
        for width in (32, 64):
            hidden = self.ResidualBlock(width, True)(hidden)
            for _ in range(1, args.resnet_n):
                hidden = self.ResidualBlock(width, False)(hidden)

        hidden = keras.layers.GlobalAveragePooling2D()(hidden)
        predictions = keras.layers.Dense(10, activation=tf.nn.softmax)(hidden)
        super().__init__(inputs, predictions)
def main(args, tb_callback):
    """Train the ResNet on CIFAR-10 and save the trained model.

    Args:
        args: Parsed options. ``args`` is handed to ``ResNet``, ``threads`` is
            forwarded to ``Model.fit`` and ``logdir`` is the directory under
            which the trained model is saved.
        tb_callback: Keras callback passed to ``Model.fit``.
    """
    ds_train, ds_test = tfds.load("cifar10", split=["train", "test"], as_supervised=True)

    # Random horizontal flip plus a shift of up to 12.5% of each side
    # (4 pixels on a 32x32 image); uncovered pixels are filled with 0.5.
    img_augmentation = keras.Sequential(
        [
            keras.layers.RandomFlip("horizontal"),
            keras.layers.RandomTranslation(
                height_factor=0.125,
                width_factor=0.125,
                fill_mode="constant",
                fill_value=0.5,
            ),
        ]
    )

    # Scale pixel values to [0, 1].
    ds_train = ds_train.map(lambda img, label: (tf.cast(img, tf.float32) / 255.0, label))
    ds_test = ds_test.map(lambda img, label: (tf.cast(img, tf.float32) / 255.0, label))

    # Per-pixel mean of the (un-augmented) training images.
    total_count, per_pixel_sum = ds_train.reduce(
        (np.float32(0), tf.zeros((32, 32, 3))),
        lambda prev, curr: (prev[0] + 1.0, prev[1] + curr[0]),
    )
    per_pixel_mean = per_pixel_sum / total_count

    # training=True is required, otherwise the random layers act as identity.
    ds_train = ds_train.map(lambda img, label: (img_augmentation(img, training=True), tf.one_hot(label, 10)))
    ds_test = ds_test.map(lambda img, label: (img, tf.one_hot(label, 10)))

    # The training-set mean is subtracted from both splits.
    ds_train = ds_train.map(lambda img, label: (img - per_pixel_mean, label))
    ds_test = ds_test.map(lambda img, label: (img - per_pixel_mean, label))

    ds_train = ds_train.shuffle(5000).batch(128, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
    # Evaluate on every test image: shuffling is pointless for evaluation and
    # drop_remainder=True would discard the final partial batch (the TFDS
    # catalog lists 10,000 test images, which is not a multiple of 128).
    ds_test = ds_test.batch(128).prefetch(tf.data.AUTOTUNE)

    model = ResNet(args)

    # Step schedules indexed by optimizer iteration; the weight decay uses
    # the same boundaries as the learning rate.
    learning_rate = keras.optimizers.schedules.PiecewiseConstantDecay(
        [32000, 48000], [0.1, 0.01, 0.001]
    )
    weight_decay = keras.optimizers.schedules.PiecewiseConstantDecay(
        [32000, 48000], [1e-4, 1e-5, 1e-6]
    )

    model.compile(
        optimizer=tfa.optimizers.SGDW(
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            momentum=0.9,
            nesterov=False,
        ),
        loss=tf.losses.CategoricalCrossentropy(),
        metrics=[tf.metrics.CategoricalAccuracy("accuracy")],
    )

    # NOTE(review): Keras documents `workers` / `use_multiprocessing` as
    # applying to generator or Sequence inputs; presumably they have no
    # effect with a tf.data.Dataset - confirm before relying on them.
    model.fit(
        x=ds_train,
        epochs=200,
        validation_data=ds_test,
        callbacks=[tb_callback],
        use_multiprocessing=True,
        workers=args.threads,
    )
    model.save(args.logdir + '/model')
    print('OK')
def build_logdir(args):
    """Return a log directory path encoding the script, start time and options.

    The path has the form ``logs/<script>/<timestamp>-<options>`` where
    ``<options>`` lists every attribute of ``args`` as ``abbr=value``, sorted
    by attribute name. Each name is abbreviated to the first letters of its
    underscore-separated words (``resnet_n`` becomes ``rn``).

    Args:
        args: Namespace-like object; all attributes returned by ``vars(args)``
            are included in the directory name.

    Returns:
        The log directory path as a string.
    """
    script_name = os.path.basename(globals().get("__file__", "notebook"))
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    # The replacement must be the captured first letter (\1); an empty
    # replacement would erase the option names and leave only "=value".
    options = ",".join(
        "{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
        for key, value in sorted(vars(args).items())
    )
    return os.path.join("{}/{}".format("logs", script_name), "{}-{}".format(timestamp, options))


if __name__ == "__main__":
    # Without __file__ (e.g. in a notebook) ignore the process arguments.
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Fix random seeds and threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create logdir name
    args.logdir = build_logdir(args)

    tb_callback = tf.keras.callbacks.TensorBoard(args.logdir, histogram_freq=1, update_freq=100, profile_batch=0)
    main(args, tb_callback)
我找到了问题所在:
- 我没有正确应用数据增强:将 img_augmentation(img) 更改为 img_augmentation(img, training=True)
- 将卷积核初始化器(kernel initializer)更改为 HeNormal,这是论文中使用的初始化方式
- 添加了逐像素均值减法作为归一化
- 禁用 Nesterov 动量在一定程度上有所帮助(我不知道为什么)