Keras Sequential model 的准确性低于 Functional Model，尽管它们是相同的

Question

我训练了两个具有相同可训练参数和相同结构的模型。但是与顺序模型相比，功能模型的性能更好。尝试从给定图像预测向量。图像输出来自 vgg16 模型。不包括顶层。将原始向量与预测向量进行比较时。功能模型往往与原始向量具有更大的相似性。有人可以解释为什么会这样吗？

下面的代码 -

from keras.models import Sequential
import numpy as np
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from tensorflow import keras
from numpy import random
from sklearn.metrics.pairwise import cosine_similarity

epochs=2000

x = random.random_sample((1, 4096))
y = np.array([ 0.01897711, 0.00196044, -0.0100884 , 0.08048831, 0.07945059, -0.13450155, -0.00228113, 0.30315322, -0.2170798 , 0.12462355, -0.12226178, -0.19237731, -0.14406398, 0.11556922, 0.04466464, -0.22505943, -0.07492258, -0.05925079, 0.02871693, -0.32403016, 0.16885516, -0.01677704, 0.03490563, 0.08720589, -0.03105724, -0.10850648, 0.04820024, -0.1348836 , -0.26358405, 0.08388387, 0.13177398, 0.00133367, -0.01074621, -0.01703981, 0.14912938, 0.13562258, 0.12910905, -0.02097122, -0.05823291, -0.21523051, -0.1051832 , -0.0112495 , -0.02306462, 0.30883443, 0.24211378, -0.01332151, -0.04171557, -0.07624041, 0.05742156, 0.17561561, -0.05971769, -0.22914584, -0.2354534 , -0.12413627, -0.02892042, -0.08661073, 0.14135012, -0.15514424, -0.09965582, -0.13770337, 0.09548005, 0.0925705 , -0.10030732, 0.16057852, -0.17537649, 0.23076315, -0.12471516, 0.2811343 , -0.1576465 , 0.17364068, 0.0658261 , 0.044597 , 0.27390295, -0.04520088, 0.00317772, 0.05926268, 0.06897669, -0.2579084 , -0.30417407, -0.08170868, -0.10205928, -0.14339833, -0.2291172 , 0.1584655 , -0.108877 , 0.03841971, -0.02097263, -0.00477816, -0.08784705, 0.00944081, 0.01409219, 0.1655657 , 0.09393094, 0.233216 , 0.28611556, -0.00573498, 0.1374636 , -0.19641444, 0.14472656, 0.254758 , -0.26166946, 0.30998066, 0.1026804 , -0.0578127 , -0.0882837 , -0.25514072, 0.12337176, 0.1786545 , 0.04052542, -0.17535737, -0.05401937, -0.27649277, -0.04952267, 0.08122452, 0.04374097, -0.07044917, 0.0653659 , -0.36983526, -0.02356564, -0.01144519, 0.1440273 , 0.12321867, 0.10163002, -0.13444787, -0.06148207, 0.11309719, -0.24679276, -0.04028287, -0.0930292 , -0.06392674, 0.10477038, 0.00828285, -0.11968364, -0.16145884, -0.08808196, 0.14231506, -0.02768413, -0.24046096, 0.02477906, -0.3868386 , 0.08224358, -0.30728677, -0.31634584, -0.24805053, -0.19289431, -0.04890246, -0.23479757, 0.13149938, 0.02801071, 0.12761658, 0.02897108, -0.14499697, 0.05322106, 0.06153642, -0.21517622, 0.255269 , 0.08573797, 0.09940388, -0.10590497, 0.13063994, 0.11253715, 0.15636472, -0.19782121, 0.01258014, -0.04391019, 0.16168897, -0.05669969, -0.17957021, -0.04841055, -0.00175814, -0.25425357, 0.14485207, 0.08319512, -0.20990393, 0.04344559, 0.20995931, -0.16608813, 0.28736553, 0.12240092, 0.12146739, 0.05718496, 0.01994314, 0.09686041, 0.13452487, 0.1052431 , 0.10266875, -0.01051683, 0.01536175, 0.25623122, 0.11273847, 0.06577922, -0.09992851, -0.02046986, -0.11516961, 0.12051879, 0.00518495, 0.0988002 , -0.279763 , -0.09997523, -0.04474135])
y = y.reshape(1,-1)

inputs  = Input(shape=(4096,))
decoder = Dense(256, activation="sigmoid")(inputs)
decoder = Dense(256, activation="sigmoid")(decoder)
decoder = Dense(256, activation="sigmoid")(decoder)
outputs = Dense(200, activation="sigmoid")(decoder)

functional = Model(inputs=inputs, outputs=outputs)
opt = keras.optimizers.Adam(learning_rate=0.01)
functional.compile(loss="mse", optimizer=opt)

sequen = Sequential()
sequen.add(Dense(256,input_shape=(4096,),activation="sigmoid"))
sequen.add(Dense(256,activation="sigmoid"))
sequen.add(Dense(256,activation="sigmoid"))
sequen.add(Dense(200,activation="sigmoid"))
sequen.compile(loss="mse", optimizer=opt)

functional.fit(x,y,verbose=1,validation_data=(x, y),epochs=epochs)
sequen.fit(x,y,verbose=1,validation_data=(x, y),epochs=epochs)


functional_output = cosine_similarity(functional.predict(x),y)
sequential_output = cosine_similarity(sequen.predict(x),y)
print(functional_output,sequential_output)

#Calculating cosine_similarity between both outputs. Functional api gives gives better output.
#output - array([[0.65056009]]), array([[0.19631703]])

功能模型结构

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 4096)]            0         
                                                                 
 dense (Dense)               (None, 256)               1048832   
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dense_3 (Dense)             (None, 200)               51400     
                                                                 
=================================================================
Total params: 1,231,816
Trainable params: 1,231,816
Non-trainable params: 0
_________________________________________________________________

顺序模型结构

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_4 (Dense)             (None, 256)               1048832   
                                                                 
 dense_5 (Dense)             (None, 256)               65792     
                                                                 
 dense_6 (Dense)             (None, 256)               65792     
                                                                 
 dense_7 (Dense)             (None, 200)               51400     
                                                                 
=================================================================
Total params: 1,231,816
Trainable params: 1,231,816
Non-trainable params: 0
_________________________________________________________________

Answer 1

这两个模型使用不同的权重和偏差进行了初始化。您可以通过添加参数 kernel_initializer=tf.keras.initializers.Zeros() 和 bias_initializer=tf.keras.initializers.Zeros() 将模型的权重和偏差初始化为零矩阵。如果你运行这段代码，你会看到类似的结果，但不完全相同。
正如 @AloneTogether 所指出的，在训练您的第一个模型后，优化器已经有了一个内部状态。因此，再次初始化该优化器将解决此问题。

因此，如果您运行此代码，您将得到相同的结果：

from keras.models import Sequential
import numpy as np
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from tensorflow import keras
import tensorflow as tf
from numpy import random
from sklearn.metrics.pairwise import cosine_similarity

epochs=200

x = random.random_sample((1, 4096))
y = np.array([ 0.01897711, 0.00196044, -0.0100884 , 0.08048831, 0.07945059, -0.13450155, -0.00228113, 0.30315322, -0.2170798 , 0.12462355, -0.12226178, -0.19237731, -0.14406398, 0.11556922, 0.04466464, -0.22505943, -0.07492258, -0.05925079, 0.02871693, -0.32403016, 0.16885516, -0.01677704, 0.03490563, 0.08720589, -0.03105724, -0.10850648, 0.04820024, -0.1348836 , -0.26358405, 0.08388387, 0.13177398, 0.00133367, -0.01074621, -0.01703981, 0.14912938, 0.13562258, 0.12910905, -0.02097122, -0.05823291, -0.21523051, -0.1051832 , -0.0112495 , -0.02306462, 0.30883443, 0.24211378, -0.01332151, -0.04171557, -0.07624041, 0.05742156, 0.17561561, -0.05971769, -0.22914584, -0.2354534 , -0.12413627, -0.02892042, -0.08661073, 0.14135012, -0.15514424, -0.09965582, -0.13770337, 0.09548005, 0.0925705 , -0.10030732, 0.16057852, -0.17537649, 0.23076315, -0.12471516, 0.2811343 , -0.1576465 , 0.17364068, 0.0658261 , 0.044597 , 0.27390295, -0.04520088, 0.00317772, 0.05926268, 0.06897669, -0.2579084 , -0.30417407, -0.08170868, -0.10205928, -0.14339833, -0.2291172 , 0.1584655 , -0.108877 , 0.03841971, -0.02097263, -0.00477816, -0.08784705, 0.00944081, 0.01409219, 0.1655657 , 0.09393094, 0.233216 , 0.28611556, -0.00573498, 0.1374636 , -0.19641444, 0.14472656, 0.254758 , -0.26166946, 0.30998066, 0.1026804 , -0.0578127 , -0.0882837 , -0.25514072, 0.12337176, 0.1786545 , 0.04052542, -0.17535737, -0.05401937, -0.27649277, -0.04952267, 0.08122452, 0.04374097, -0.07044917, 0.0653659 , -0.36983526, -0.02356564, -0.01144519, 0.1440273 , 0.12321867, 0.10163002, -0.13444787, -0.06148207, 0.11309719, -0.24679276, -0.04028287, -0.0930292 , -0.06392674, 0.10477038, 0.00828285, -0.11968364, -0.16145884, -0.08808196, 0.14231506, -0.02768413, -0.24046096, 0.02477906, -0.3868386 , 0.08224358, -0.30728677, -0.31634584, -0.24805053, -0.19289431, -0.04890246, -0.23479757, 0.13149938, 0.02801071, 0.12761658, 0.02897108, -0.14499697, 0.05322106, 0.06153642, -0.21517622, 0.255269 , 0.08573797, 0.09940388, -0.10590497, 0.13063994, 0.11253715, 0.15636472, -0.19782121, 0.01258014, -0.04391019, 0.16168897, -0.05669969, -0.17957021, -0.04841055, -0.00175814, -0.25425357, 0.14485207, 0.08319512, -0.20990393, 0.04344559, 0.20995931, -0.16608813, 0.28736553, 0.12240092, 0.12146739, 0.05718496, 0.01994314, 0.09686041, 0.13452487, 0.1052431 , 0.10266875, -0.01051683, 0.01536175, 0.25623122, 0.11273847, 0.06577922, -0.09992851, -0.02046986, -0.11516961, 0.12051879, 0.00518495, 0.0988002 , -0.279763 , -0.09997523, -0.04474135])
y = y.reshape(1,-1)

inputs  = Input(shape=(4096,))
decoder = Dense(256, activation="sigmoid", kernel_initializer=tf.keras.initializers.Zeros(), bias_initializer=tf.keras.initializers.Zeros())(inputs)
decoder = Dense(256, activation="sigmoid", kernel_initializer=tf.keras.initializers.Zeros(), bias_initializer=tf.keras.initializers.Zeros())(decoder)
decoder = Dense(256, activation="sigmoid", kernel_initializer=tf.keras.initializers.Zeros(), bias_initializer=tf.keras.initializers.Zeros())(decoder)
outputs = Dense(200, activation="sigmoid", kernel_initializer=tf.keras.initializers.Zeros(), bias_initializer=tf.keras.initializers.Zeros())(decoder)

functional = Model(inputs=inputs, outputs=outputs)
opt = keras.optimizers.Adam(learning_rate=0.01)
functional.compile(loss="mse", optimizer=opt)

sequen = Sequential()
sequen.add(Input(shape=(4096,)))
sequen.add(Dense(256,activation="sigmoid", kernel_initializer=tf.keras.initializers.Zeros(), bias_initializer=tf.keras.initializers.Zeros()))
sequen.add(Dense(256,activation="sigmoid", kernel_initializer=tf.keras.initializers.Zeros(), bias_initializer=tf.keras.initializers.Zeros()))
sequen.add(Dense(256,activation="sigmoid", kernel_initializer=tf.keras.initializers.Zeros(), bias_initializer=tf.keras.initializers.Zeros()))
sequen.add(Dense(200,activation="sigmoid", kernel_initializer=tf.keras.initializers.Zeros(), bias_initializer=tf.keras.initializers.Zeros()))

opt2 = keras.optimizers.Adam(learning_rate=0.01)
sequen.compile(loss="mse", optimizer=opt2)

functional.fit(x,y,verbose=1,validation_data=(x, y),epochs=epochs)
sequen.fit(x,y,verbose=1,validation_data=(x, y),epochs=epochs)


functional_output = cosine_similarity(functional.predict(x),y)
sequential_output = cosine_similarity(sequen.predict(x),y)
print(functional_output,sequential_output)

Answer 2

我认为主要问题是您使用相同的优化器来训练您的模型，并且在训练您的第一个模型后，优化器已经具有内部状态。使用两个单独的优化器似乎产生（几乎）相同的结果：

...
opt1 = keras.optimizers.Adam(learning_rate=0.01)
opt2 = keras.optimizers.Adam(learning_rate=0.01)
...

...
[[0.65034289]] [[0.65033581]]

Keras Sequential model 的准确性低于 Functional Model，尽管它们是相同的

Keras Sequential model gives less accuracy than Functional Model though being same

python

machine-learning

keras

tensorflow