Predicting point sequence in image
My training set is a set of images (3 channels, or 1; of course I only use one type of channel). The labels are a sequence of points, in a specific order, that I want to predict from the image.
I am using a model inspired by the image captioning example on the tensorflow website. It is also the approach taken in this paper https://arxiv.org/pdf/1901.03781.pdf
import time

import tensorflow as tf
from tqdm import tqdm

class CNN_Encoder(tf.keras.Model):
    # Since you have already extracted the features and dumped them using pickle,
    # this encoder passes those features through a fully connected layer
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, output_dim):
        super(RNN_Decoder, self).__init__()
        self.units = units
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(output_dim)

    def call(self, x, features, hidden):
        # concatenate the image features with the previous coordinate
        x = tf.concat((features, x), axis=-1)
        # note: `hidden` is never passed to the GRU here, so the GRU state
        # is re-initialized to zeros on every decoding step
        output, state = self.gru(x)
        x = self.fc1(state)
        x = self.fc2(x)
        return x

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))
@tf.function
def train_step(img_tensor, target):
    loss = 0
    hidden = decoder.reset_state(batch_size=target.shape[0])
    # start token (0, 0), matching the padding scheme described below
    dec_input = tf.expand_dims([[0., 0.]] * target.shape[0], 1)
    with tf.GradientTape() as tape:
        features = encoder(img_tensor)
        for i in range(1, target.shape[1]):
            predictions = decoder(dec_input, features, hidden)
            loss += loss_function(target[:, i], predictions)
            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1)
    total_loss = loss / int(target.shape[1])
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    return loss, total_loss
EPOCHS = 20
batch_size = 8

# start_epoch and num_steps are defined elsewhere
# (e.g. restored from a checkpoint / batches per epoch)
for epoch in tqdm(range(start_epoch, EPOCHS)):
    start = time.time()
    total_loss = 0
    for (batch, (img_tensor, target)) in enumerate(data_generator(preds_t, labels_t)):
        img_tensor = img_tensor.reshape((-1, 1, 128*128))
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))
        if batch == 10000:
            break
    # storing the epoch end loss value to plot later
    #loss_plot.append(total_loss / num_steps)
    if epoch % 5 == 0:
        ckpt_manager.save()
    print('Epoch {} Loss {:.6f}'.format(epoch + 1, total_loss / num_steps))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
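(loss_function and optimizer are used above but not shown; a minimal sketch of plausible definitions, assuming a plain squared error on the (x, y) pairs — the actual definitions may differ:)

    optimizer = tf.keras.optimizers.Adam()

    def loss_function(real, pred):
        # real, pred: (batch, 2) coordinate pairs; plain squared error
        return tf.reduce_mean(tf.reduce_sum(tf.square(real - pred), axis=-1))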
For the feature vector, I am extracting the last layer of a U-Net, so each image gives a map of size 1x128x128. I reshape it to 1x1x(128*128) and pass it through a fully connected layer, after which the shape becomes 1x1x256.
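In code, that preprocessing step looks roughly like this (unet_last_layer_output is a placeholder name for the extracted activation):

    # assumed shape (1, 128, 128): last-layer U-Net activation for one image
    feat = unet_last_layer_output
    feat = tf.reshape(feat, (1, 1, 128 * 128))
    # CNN_Encoder with embedding_dim=256 -> shape (1, 1, 256)
    feat = encoder(feat)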
The labels I want to predict are image coordinates (x, y). The input to the GRU layer is the concatenation of the 1x1x256 features with the 1x1x2 coordinate from step t-1. I then pass the result through a 2-layer FC head with output dimension 2, for the two coordinates. I removed attention for now to get a simpler model. I normalize my images. I pad the coordinate sequences with (0, 0) as the start token, (-1, -1) as the end token and (-2, -2) as regular padding, to get a uniform sequence length of 350x2, as in the sketch below.
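As a sketch, the padding scheme looks like this (pad_points is just an illustrative helper):

    import numpy as np

    START, END, PAD = (0., 0.), (-1., -1.), (-2., -2.)
    MAX_LEN = 350

    def pad_points(points):
        # points: list of (x, y) pairs in normalized image coordinates
        seq = [START] + list(points) + [END]
        seq += [PAD] * (MAX_LEN - len(seq))
        return np.asarray(seq, dtype=np.float32)  # shape (350, 2)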
The network does not seem to learn much; all I get is a couple of points scattered along the image diagonal. The biggest difference I see from the image captioning models is that words can be converted to embeddings, so you end up with, say, 128 image features and 128 word features that are concatenated and fed into the LSTM. In my case the sequence information per step is only a single entry, which might be why the network learns so little.
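To illustrate, the kind of coordinate embedding I mean would be something like this (the layer and its size of 128 are just a guess, not part of my current model):

    # project each (x, y) pair into a 128-dimensional vector before
    # concatenating it with the image features, mirroring word embeddings
    coord_embedding = tf.keras.layers.Dense(128, activation='relu')

    def embed_coords(xy):
        # xy: (batch, 1, 2) previous coordinate -> (batch, 1, 128)
        return coord_embedding(xy)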
If anyone has any insight into what I should change, that would be great.
Your question requires some experience and deeper investigation. I can only give general advice for an underfitting problem; here are some things to try.
Personally, I would start by trying to overfit a single batch.
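A minimal sketch of that check, reusing the train_step and data_generator from your code (the step count is arbitrary):

    # take one fixed batch and train on it repeatedly; if the loss does not
    # go to (near) zero, look for a bug rather than tuning hyperparameters
    img_tensor, target = next(iter(data_generator(preds_t, labels_t)))
    img_tensor = img_tensor.reshape((-1, 1, 128*128))
    for step in range(1000):
        batch_loss, t_loss = train_step(img_tensor, target)
        if step % 100 == 0:
            print('step {} loss {:.6f}'.format(step, float(t_loss)))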