Output vector of the final layer of a neural net for a classification problem stuck at 0.5
The output layer is stuck at the vector [0.5, 0.5]. Can anyone help me understand whether there is a problem with the code?
The neural network I am trying to train is an XOR gate, so in this case the output vector should be close to the one-hot vector representing the correct class (0 or 1), but even after all the epochs the output vector is still stuck at [0.5, 0.5].
import numpy as np
from numpy import random as rnd  # assumed: rnd refers to numpy.random, based on the rnd.rand usage below
from scipy.special import expit

class Backpropogation:
def setupWeightsBiases(self):
for i in range(1, self.num_layers):
self.weights_dict[i] = rnd.rand(self.layer_spec[i], self.layer_spec[i - 1])
self.bias_dict[i] = rnd.rand(self.layer_spec[i], 1)
def __init__(self, hidden_layer_neurons_tuple, train_data, num_output_classes, output_layer_func='sigmoid'):
self.train_input = train_data[0]
self.input_layer_size = self.train_input[0].size
self.train_input = self.train_input.reshape(self.train_input.shape[0], self.input_layer_size).T
self.output_layer_size = num_output_classes
self.train_output = train_data[1]
print(self.train_output.shape)
num_hidden_layer = len(hidden_layer_neurons_tuple)
self.hidden_layer_neurons_tuple = hidden_layer_neurons_tuple
self.layer_spec = [self.input_layer_size] + \
list(hidden_layer_neurons_tuple) + \
[num_output_classes]
self.layer_spec = tuple(self.layer_spec)
self.num_layers = num_hidden_layer + 2
self.train_data = train_data
self.activation_layer_gradient_dict = {}
self.preactivation_layer_gradient_dict = {}
self.weights_gradient_dict = {}
self.bias_gradient_dict = {}
self.curr_input = None
self.curr_output = None
self.weights_dict = {}
self.preactivation_layer_dict = {}
self.activation_layer_dict = {}
self.bias_dict = {}
self.setupWeightsBiases()
self.output = None
self.output_diff = None
self.num_output_classes = num_output_classes
def predictClass(self):
return np.argmax(self.activation_layer_dict[self.num_layers - 1])
def forwardPropogation(self, input):
# Load h[0] as the input data
self.activation_layer_dict[0] = input
'''
load input data into h[0]
for i in (1,L):
a[k] = W[k] * h[k-1] + b[k]
and finally calculate the Lth layer output with the special activation function
'''
for i in range(1, self.num_layers):
self.preactivation_layer_dict[i] = \
np.matmul(self.weights_dict[i], self.activation_layer_dict[i - 1]) + \
self.bias_dict[i]
# print(self.preactivation_layer_dict[i])
vec = self.preactivation_layer_dict[i]
self.activation_layer_dict[i] = self.activationFunction(vec)
# This will change h[L] to y'
self.activation_layer_dict[self.num_layers - 1] = self.outputFunction()
def findGradients(self, index):
class_label = self.train_output[index]
output_one_hot_vector = np.zeros((self.num_output_classes, 1))
output_one_hot_vector[class_label] = 1
output = self.activation_layer_dict[self.num_layers - 1]
self.preactivation_layer_gradient_dict[self.num_layers - 1] = -1 * (output_one_hot_vector - output)
for layer in reversed(range(1, self.num_layers)):
self.weights_gradient_dict[layer] = np.matmul(self.preactivation_layer_gradient_dict[layer],
self.activation_layer_dict[layer - 1].T)
self.bias_gradient_dict[layer] = self.preactivation_layer_gradient_dict[layer]
self.activation_layer_gradient_dict[layer - 1] = np.matmul(self.weights_dict[layer].T,
self.preactivation_layer_gradient_dict[layer])
if layer != 1:
self.preactivation_layer_gradient_dict[layer - 1] = np.multiply(
self.activation_layer_gradient_dict[layer - 1],
self.outputFunctionDiff(layer - 1))
def activationFunction(self, vec, type='sigmoid'):
if type == 'sigmoid':
return 1 / (1 + expit(-vec))
else:
print('Please select correct output function')
exit()
def outputFunction(self, type='sigmoid'):
if type == 'sigmoid':
return 1 / (1 + expit(-self.preactivation_layer_dict[self.num_layers - 1]))
else:
print('Please select correct output function')
exit()
def outputFunctionDiff(self, layer, type='sigmoid'):
op_layer = self.num_layers - 1
if type == 'sigmoid':
vec = self.preactivation_layer_dict[layer]
return np.multiply(self.activationFunction(vec), 1 - self.activationFunction(vec))
else:
print('Please select correct output function')
exit()
def updateWeightsAndBiases(self, learning_rate):
for layer in range(1, self.num_layers):
self.weights_dict[layer] = self.weights_dict[layer] - learning_rate * self.weights_gradient_dict[layer]
self.preactivation_layer_dict[layer] = self.preactivation_layer_dict[layer] - \
learning_rate * self.preactivation_layer_gradient_dict[layer]
if not (layer == self.num_layers - 1):
self.activation_layer_dict[layer] = self.activation_layer_dict[layer] - \
learning_rate * self.activation_layer_gradient_dict[layer]
self.bias_dict[layer] = self.bias_dict[layer] - learning_rate * self.bias_gradient_dict[layer]
def getLoss(self, index):
return np.log2(self.activation_layer_dict[self.num_layers - 1][self.train_output[index], 0])
def train(self, learning_rate, num_epochs):
for curr_epoch in range(num_epochs):
print('Evaluating at ' + str(curr_epoch))
index_array = list(np.arange(0, self.train_input.shape[1]))
np.random.shuffle(index_array)
for train_data_index in index_array:
test_input = self.train_input[:, [train_data_index]]
self.forwardPropogation(test_input)
# print(self.activation_layer_dict[self.num_layers - 1])
self.findGradients(train_data_index)
self.updateWeightsAndBiases(learning_rate)
print('Loss ' + str(self.getLoss(train_data_index)))
# Assumes a 2D array of 784xN array as test input
# This will return output classes of the data
def test(self, test_data):
index_range = test_data.shape[1]
test_class_list = []
for index in range(index_range):
self.forwardPropogation(test_data[:, [index]])
test_class_list.append(self.predictClass())
return test_class_list
# train the NN with BP
train_data = (np.array([[0, 0], [0, 1], [1,0], [1, 1]]), np.array([0, 1, 1, 0]))
b = Backpropogation((2, 2), train_data, 2)
The code below (check this for the implementation and this for the theory) implements a neural network with backpropagation from scratch, using a single output unit with sigmoid activation (otherwise it looks similar to your implementation).
The XOR function can be learned with it using a suitable learning rate and number of epochs (although it sometimes gets stuck at a local minimum; you could consider adding regularizers such as drop-out). Also, you can convert it to your 2-output (softmax?) version. Can you find any issues in your implementation? For example, you could look into the following pointers:
- update the parameters in batches during backpropagation instead of stochastically (per sample)
- run enough epochs
- vary the learning rate
- use ReLU activation for the hidden layers instead of sigmoid (to cope with vanishing gradients; see the sketch after this list)
and so on.
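As a minimal sketch of the last pointer (the relu / relu_grad names are placeholders, not functions from your code), the hidden-layer activation and its derivative could look like this:
import numpy as np

def relu(vec):
    # element-wise max(0, x) on the hidden-layer pre-activations
    return np.maximum(0, vec)

def relu_grad(vec):
    # derivative of ReLU w.r.t. its pre-activation: 1 where x > 0, else 0
    return (vec > 0).astype(float)

# e.g. use relu(...) for the hidden layers in the forward pass and
# relu_grad(...) in place of the sigmoid derivative in the backward pass,
# while keeping sigmoid (or softmax) on the output layer.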
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error
class FFSNNetwork:
def __init__(self, n_inputs, hidden_sizes=[2]):
        # initialize the inputs
self.nx = n_inputs
self.ny = 1 # number of neurons in the output layer
self.nh = len(hidden_sizes)
self.sizes = [self.nx] + hidden_sizes + [self.ny]
self.W = {}
self.B = {}
for i in range(self.nh+1):
self.W[i+1] = np.random.rand(self.sizes[i], self.sizes[i+1])
self.B[i+1] = np.random.rand(1, self.sizes[i+1])
def sigmoid(self, x):
return 1.0/(1.0 + np.exp(-x))
def forward_pass(self, x):
self.A = {}
self.H = {}
self.H[0] = x.reshape(1, -1)
for i in range(self.nh+1):
self.A[i+1] = np.matmul(self.H[i], self.W[i+1]) + self.B[i+1]
self.H[i+1] = self.sigmoid(self.A[i+1])
return self.H[self.nh+1]
def grad_sigmoid(self, x):
return x*(1-x)
def grad(self, x, y):
self.forward_pass(x)
self.dW = {}
self.dB = {}
self.dH = {}
self.dA = {}
L = self.nh + 1
self.dA[L] = (self.H[L] - y)
for k in range(L, 0, -1):
self.dW[k] = np.matmul(self.H[k-1].T, self.dA[k])
self.dB[k] = self.dA[k]
self.dH[k-1] = np.matmul(self.dA[k], self.W[k].T)
self.dA[k-1] = np.multiply(self.dH[k-1], self.grad_sigmoid(self.H[k-1]))
def fit(self, X, Y, epochs=1, learning_rate=1, initialize=True):
# initialize w, b
if initialize:
for i in range(self.nh+1):
self.W[i+1] = np.random.randn(self.sizes[i], self.sizes[i+1])
self.B[i+1] = np.zeros((1, self.sizes[i+1]))
for e in range(epochs):
dW = {}
dB = {}
for i in range(self.nh+1):
dW[i+1] = np.zeros((self.sizes[i], self.sizes[i+1]))
dB[i+1] = np.zeros((1, self.sizes[i+1]))
for x, y in zip(X, Y):
self.grad(x, y)
for i in range(self.nh+1):
dW[i+1] += self.dW[i+1]
dB[i+1] += self.dB[i+1]
m = X.shape[1]
for i in range(self.nh+1):
self.W[i+1] -= learning_rate * dW[i+1] / m
self.B[i+1] -= learning_rate * dB[i+1] / m
Y_pred = self.predict(X)
print('loss at epoch {} = {}'.format(e, mean_squared_error(Y_pred, Y)))
def predict(self, X):
Y_pred = []
for x in X:
y_pred = self.forward_pass(x)
Y_pred.append(y_pred)
return np.array(Y_pred).squeeze()
Now, train the network:
#train the network with two hidden layers - 2 neurons and 2 neurons
ffsnn = FFSNNetwork(2, [2, 2])
# XOR data
X_train, y_train = np.array([[0, 0], [0, 1], [1,0], [1, 1]]), np.array([0, 1, 1, 0])
ffsnn.fit(X_train, y_train, epochs=5000, learning_rate=.15)
Next, make predictions with the network:
y_pred_prob = ffsnn.predict(X_train) # P(y = 1)
y_pred = (y_pred_prob >= 0.5).astype("int").ravel() # threshold = 0.5
X_train
# array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_train
# array([0, 1, 1, 0])
y_pred_prob
# array([0.00803102, 0.99439243, 0.99097831, 0.00664639])
y_pred
# array([0, 1, 1, 0])
accuracy_score(y_train, y_pred)
# 1.0
Note that the loss plotted here is the MSE between the true and predicted y values; you could also plot the BCE (binary cross-entropy) loss.
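For example, a minimal sketch of computing the BCE loss on the training set, assuming sklearn's log_loss and the y_pred_prob computed above:
from sklearn.metrics import log_loss

# binary cross-entropy between the true labels and the predicted P(y = 1)
print('BCE loss:', log_loss(y_train, y_pred_prob))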
Finally, the following animation shows how the loss function is minimized and how the decision boundary is learned:
Note that in the animation above, the green and red points represent the positive (label 1) and negative (label 0) training data points, respectively, and notice how they get separated by the decision boundary in the final stages of the training epochs (with the darker region corresponding to the negative and the lighter region to the positive data points of XOR).
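As a rough sketch of how such a decision-boundary plot could be produced (assuming matplotlib and the ffsnn model trained above; the grid range, resolution, and colour map are arbitrary choices, not from the original answer):
import matplotlib.pyplot as plt

# evaluate P(y = 1) on a grid covering the input square
xx, yy = np.meshgrid(np.linspace(-0.5, 1.5, 200), np.linspace(-0.5, 1.5, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
probs = ffsnn.predict(grid).reshape(xx.shape)

plt.contourf(xx, yy, probs, levels=20, cmap='RdYlGn')  # red = low P(y=1), green = high P(y=1)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='RdYlGn', edgecolors='k')
plt.title('XOR decision boundary learned by FFSNNetwork')
plt.show()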
You can implement the same with a high-level deep learning library such as keras in just a few lines of code:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
inputs = keras.Input(shape=(2,), name="in")
x = layers.Dense(4, activation="relu", name="dense_1")(inputs)
x = layers.Dense(4, activation="relu", name="dense_2")(x)
outputs = layers.Dense(1, activation="sigmoid", name="out")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
X_train, y_train = np.array([[0, 0], [0, 1], [1,0], [1, 1]]), np.array([0, 1, 1, 0])
model.compile(
optimizer=keras.optimizers.Adam(), # Optimizer
# Loss function to minimize
loss=tf.keras.losses.BinaryCrossentropy(),
# List of metrics to monitor
metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
)
print("Fit model on training data")
history = model.fit(
X_train,
y_train,
batch_size=4,
epochs=1000)
# ...
# Epoch 371/1000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.5178 - accuracy: 0.7500
# Epoch 372/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5169 - accuracy: 0.7500
# Epoch 373/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5160 - accuracy: 1.0000
# Epoch 374/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5150 - accuracy: 1.0000
# ...
print("Evaluate")
results = model.evaluate(X_train, y_train, batch_size=4)
print("loss, acc:", results)
# loss, acc: [0.1260240525007248, 1.0]
The following figure shows the loss / accuracy over the training epochs.
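A minimal sketch of such a plot, assuming matplotlib and the history object returned by model.fit above:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(history.history['loss'])        # binary cross-entropy per epoch
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax2.plot(history.history['accuracy'])    # binary accuracy per epoch
ax2.set_xlabel('epoch')
ax2.set_ylabel('accuracy')
plt.show()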
Finally, the same with keras and softmax (instead of sigmoid):
from keras.utils import to_categorical
X_train, y_train = np.array([[0, 0], [0, 1], [1,0], [1, 1]]), np.array([0, 1, 1, 0])
y_train = to_categorical(y_train, num_classes=2)
inputs = keras.Input(shape=(2,), name="in")
x = layers.Dense(4, activation="relu", name="dense_1")(inputs)
x = layers.Dense(4, activation="relu", name="dense_2")(x)
outputs = layers.Dense(2, activation="softmax", name="out")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['acc']
)
print("Fit model on training data")
history = model.fit(
X_train,
y_train,
batch_size=4,
epochs=2000)
# Epoch 663/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3893 - acc: 0.7500
# Epoch 664/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3888 - acc: 1.0000
# Epoch 665/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3878 - acc: 1.0000
print("Evaluate")
results = model.evaluate(X_train, y_train, batch_size=4)
print("loss, acc:", results)
# loss, acc: [0.014970880933105946, 1.0]
with the following loss / accuracy convergence:
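As before, a minimal sketch of this plot, assuming matplotlib and that the softmax model's history tracks the 'acc' metric configured in model.compile:
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='loss')  # categorical cross-entropy per epoch
plt.plot(history.history['acc'], label='accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()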