神经网络梯度下降:权重导数的矩阵形状未对齐
Neural Network Gradient Descent: Matrix Shapes of Derivatives of Weights not Aligned
我正在尝试从头开始创建自己的网络(不使用 keras 或 tensorflow 等库)以更好地理解机器学习和神经网络。我有 运行 问题,当使用具有特定层配置的网络时,梯度下降无法正常运行。由于每一层的值是各自权重集的导数,并且导数相乘将更接近输入的权重链接到输出,层自身相乘不起作用。例如,一个具有 2 个输入神经元、3 个隐藏神经元和 1 个输出神经元的神经网络,与成本和权重集相关的导数 link 输入和隐藏必须包含导数的乘法(值存储在每一层)以便将权重链接到输出。
完整代码如下:
(尝试为可变网络输入 [[3,1,None],[2,None,None],[1,None,None]]
以获得可重现的错误)
import numpy as np
import random
from matplotlib import pyplot as plt
def sigmoid(x):
return 1/(1+np.exp(-x))
def sigmoid_p(x):
return sigmoid(x)*(1 -sigmoid(x))
def network_propagation(weights,biases,activations,input_data):
pre_funcs = []
outputs = []
input_layer = input_data
for i in range(len(network)):
pre_func = np.dot(input_layer,weights[i]) + biases[i]
pre_funcs.append(pre_func)
if activations[i]:
output = activations[i](pre_func)
else:
output = pre_func
outputs.append(output)
input_layer = output
return pre_funcs,outputs
def initialize_network(network):
weights = []
biases = []
activations = []
for layer in network:
layer_weights = []
layer_size = layer[0]
input_size = layer[1]
activation = layer[2]
if input_size == None:
input_size = network[network.index(layer)-1][0]
activations.append(activation)
biases.append(np.random.randn())
for i in range(layer_size*input_size):
layer_weights.append(np.random.randn())
weights.append(np.reshape(np.array(layer_weights),(input_size,layer_size)))
return weights,biases,activations
def train(data,answers,network,weights,biases,activations):
learning_rate = 0.2
loss_history = []
learning_rate_history = []
epochs = 20000
threshold_value = 100
threshold = False
lowest_c = np.inf
schedule = True
best_weights = weights
best_biases = biases
for i in range(epochs):
if threshold == False:
ri = np.random.randint(len(data))
point = data[ri]
target = answers[ri]
pre_funcs,outputs = network_propagation(weights,biases,activations,point)
pred = outputs[-1]
cost = np.square(pred - target)
if i % 100 == 0:
c = 0
for j in range(len(data)):
p = data[j]
target = answers[j]
pre_funcs,outputs = network_propagation(weights,biases,activations,p)
p_pred = outputs[-1]
c += np.square(p_pred - target)
loss_history.append(c)
dcost_dpred = 2 * (pred - target)
dpred_dz = sigmoid_p(pre_funcs[-1])
#Changes start here
dz_dweights = [[]] * len(weights)
dz_dweights[0] = point
# if activations[-1]:
# dz_dweights[0] = sigmoid_p(np.array(point))
for i in range(0,len(pre_funcs[:-1])):
if activations[i]:
dz_dweights[i+1] = sigmoid_p(pre_funcs[:-1][i])
else:
dz_dweights[i+1] = pre_funcs[:-1][i]
for j in range(len(dz_dweights)):
if np.array(dz_dweights[i-j]).tolist() and i-j > 0:
dz_dweights[i+1] *= dz_dweights[i-j]
dz_dbias = 1
dcost_dz = dcost_dpred*dpred_dz
dcost_dweights = [[]] * len(weights)
for i in range(len(dcost_dweights)):
dcost_dweights[i] = np.dot(dcost_dz,[dz_dweights[i]])
dcost_dbias = dcost_dz*dz_dbias
for i in range(len(weights)):
weights[i] -= learning_rate*dcost_dweights[i][0]
for i in range(len(biases)):
biases[i] -= learning_rate*np.array(dcost_dbias)
acc = (1-c)*100
if c < lowest_c:
lowest_c = c
best_weights = weights
best_biases = biases
if round(acc[0]) >= threshold_value:
threshold = True
return best_weights,best_biases,loss_history
def training_stats(loss_history,weights,biases,activations,data,answers):
plt.plot(loss_history)
pre_funcs,outputs = network_propagation(weights,biases,activations,data)
answers = np.reshape(answers,outputs[-1].shape)
loss = (outputs[-1] - answers) ** 2
min_loss = sum(loss)[0]
first_loss = loss_history[0]
improvement = round(((first_loss[0] - min_loss)/first_loss[0]),0)
max_acc = (1-min_loss)*100
print('Minimum Loss:',round(min_loss,2))
print('Improvement:',str(improvement*100)+'%'+' (From '+str(round(first_loss[0],2))+')')
print('Highest Accuracy:',round(max_acc,2))
print('Best Weights:',weights)
print('Best Biases:',biases)
def normalize_data(data):
data = np.array(data)
data_shape = data.shape
flatten = lambda l: [item for sublist in l for item in sublist]
data = flatten(data)
min_val = min(data)
max_val = max(data)
norm_data = []
for term in data:
term = (term-min_val)/(max_val-min_val)
norm_data.append(term)
norm_data = np.reshape(np.array(norm_data),data_shape)
return norm_data
def prediction(pred_data,weights,biases,activations):
pre_funcs,outputs = network_propagation(weights,biases,activations,pred_data)
return outputs[-1]
# Layer_size,input_size,activation
network = [[1,1,None],[1,None,None]]
data = [[1],[2],[3],[4],[5]]
answers = [2,4,6,8,10]
weights,biases,activations = initialize_network(network)
weights,biases,loss_history = train(data,answers,network,weights,biases,activations)
training_stats(loss_history,weights,biases,activations,data,answers)
pred = prediction(data,weights,biases,activations)
是我的理解有误还是我的代码有问题?
问题在于,当网络具有隐藏层时,您没有包含一个层可能以多种方式影响下一层的事实。当不同层之间有多个 link 时,请尝试使用 np.dot。
我正在尝试从头开始创建自己的网络(不使用 keras 或 tensorflow 等库)以更好地理解机器学习和神经网络。我有 运行 问题,当使用具有特定层配置的网络时,梯度下降无法正常运行。由于每一层的值是各自权重集的导数,并且导数相乘将更接近输入的权重链接到输出,层自身相乘不起作用。例如,一个具有 2 个输入神经元、3 个隐藏神经元和 1 个输出神经元的神经网络,与成本和权重集相关的导数 link 输入和隐藏必须包含导数的乘法(值存储在每一层)以便将权重链接到输出。
完整代码如下:
(尝试为可变网络输入 [[3,1,None],[2,None,None],[1,None,None]]
以获得可重现的错误)
import numpy as np
import random
from matplotlib import pyplot as plt
def sigmoid(x):
return 1/(1+np.exp(-x))
def sigmoid_p(x):
return sigmoid(x)*(1 -sigmoid(x))
def network_propagation(weights,biases,activations,input_data):
pre_funcs = []
outputs = []
input_layer = input_data
for i in range(len(network)):
pre_func = np.dot(input_layer,weights[i]) + biases[i]
pre_funcs.append(pre_func)
if activations[i]:
output = activations[i](pre_func)
else:
output = pre_func
outputs.append(output)
input_layer = output
return pre_funcs,outputs
def initialize_network(network):
weights = []
biases = []
activations = []
for layer in network:
layer_weights = []
layer_size = layer[0]
input_size = layer[1]
activation = layer[2]
if input_size == None:
input_size = network[network.index(layer)-1][0]
activations.append(activation)
biases.append(np.random.randn())
for i in range(layer_size*input_size):
layer_weights.append(np.random.randn())
weights.append(np.reshape(np.array(layer_weights),(input_size,layer_size)))
return weights,biases,activations
def train(data,answers,network,weights,biases,activations):
learning_rate = 0.2
loss_history = []
learning_rate_history = []
epochs = 20000
threshold_value = 100
threshold = False
lowest_c = np.inf
schedule = True
best_weights = weights
best_biases = biases
for i in range(epochs):
if threshold == False:
ri = np.random.randint(len(data))
point = data[ri]
target = answers[ri]
pre_funcs,outputs = network_propagation(weights,biases,activations,point)
pred = outputs[-1]
cost = np.square(pred - target)
if i % 100 == 0:
c = 0
for j in range(len(data)):
p = data[j]
target = answers[j]
pre_funcs,outputs = network_propagation(weights,biases,activations,p)
p_pred = outputs[-1]
c += np.square(p_pred - target)
loss_history.append(c)
dcost_dpred = 2 * (pred - target)
dpred_dz = sigmoid_p(pre_funcs[-1])
#Changes start here
dz_dweights = [[]] * len(weights)
dz_dweights[0] = point
# if activations[-1]:
# dz_dweights[0] = sigmoid_p(np.array(point))
for i in range(0,len(pre_funcs[:-1])):
if activations[i]:
dz_dweights[i+1] = sigmoid_p(pre_funcs[:-1][i])
else:
dz_dweights[i+1] = pre_funcs[:-1][i]
for j in range(len(dz_dweights)):
if np.array(dz_dweights[i-j]).tolist() and i-j > 0:
dz_dweights[i+1] *= dz_dweights[i-j]
dz_dbias = 1
dcost_dz = dcost_dpred*dpred_dz
dcost_dweights = [[]] * len(weights)
for i in range(len(dcost_dweights)):
dcost_dweights[i] = np.dot(dcost_dz,[dz_dweights[i]])
dcost_dbias = dcost_dz*dz_dbias
for i in range(len(weights)):
weights[i] -= learning_rate*dcost_dweights[i][0]
for i in range(len(biases)):
biases[i] -= learning_rate*np.array(dcost_dbias)
acc = (1-c)*100
if c < lowest_c:
lowest_c = c
best_weights = weights
best_biases = biases
if round(acc[0]) >= threshold_value:
threshold = True
return best_weights,best_biases,loss_history
def training_stats(loss_history,weights,biases,activations,data,answers):
plt.plot(loss_history)
pre_funcs,outputs = network_propagation(weights,biases,activations,data)
answers = np.reshape(answers,outputs[-1].shape)
loss = (outputs[-1] - answers) ** 2
min_loss = sum(loss)[0]
first_loss = loss_history[0]
improvement = round(((first_loss[0] - min_loss)/first_loss[0]),0)
max_acc = (1-min_loss)*100
print('Minimum Loss:',round(min_loss,2))
print('Improvement:',str(improvement*100)+'%'+' (From '+str(round(first_loss[0],2))+')')
print('Highest Accuracy:',round(max_acc,2))
print('Best Weights:',weights)
print('Best Biases:',biases)
def normalize_data(data):
data = np.array(data)
data_shape = data.shape
flatten = lambda l: [item for sublist in l for item in sublist]
data = flatten(data)
min_val = min(data)
max_val = max(data)
norm_data = []
for term in data:
term = (term-min_val)/(max_val-min_val)
norm_data.append(term)
norm_data = np.reshape(np.array(norm_data),data_shape)
return norm_data
def prediction(pred_data,weights,biases,activations):
pre_funcs,outputs = network_propagation(weights,biases,activations,pred_data)
return outputs[-1]
# Layer_size,input_size,activation
network = [[1,1,None],[1,None,None]]
data = [[1],[2],[3],[4],[5]]
answers = [2,4,6,8,10]
weights,biases,activations = initialize_network(network)
weights,biases,loss_history = train(data,answers,network,weights,biases,activations)
training_stats(loss_history,weights,biases,activations,data,answers)
pred = prediction(data,weights,biases,activations)
是我的理解有误还是我的代码有问题?
问题在于,当网络具有隐藏层时,您没有包含一个层可能以多种方式影响下一层的事实。当不同层之间有多个 link 时,请尝试使用 np.dot。