XOR classification using multilayer perceptron

I want to implement a multilayer perceptron.
I found some code on GitHub that classifies MNIST well (96%). However, for some reason it fails on the XOR task.
I want to understand why.
Here is the code:

perceptron.py

import random
import numpy as np


class Perceptron:

    def __init__(self, *, layer_sizes, activation_functions, cost_function_deriv):
        self.layer_sizes = layer_sizes
        if len(self.layer_sizes) - 1 != len(activation_functions):
            raise ValueError("...")
        self.activation_functions = activation_functions
        self.cost_function_deriv = cost_function_deriv
        self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]

    def train(self, training_data, test_data, epochs, mini_batch_size, lr):
        test_data_len = len(test_data)
        for epoch in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[x: x + mini_batch_size]
                            for x in range(0, len(training_data), mini_batch_size)]
            for mini_batch in mini_batches:
                mb_len = len(mini_batch)
                gradient_weights = [np.zeros(w.shape) for w in self.weights]
                gradient_biases = [np.zeros(b.shape) for b in self.biases]
                for x, y in mini_batch:
                    delta_gradient_biases, delta_gradient_weights = self.backpropagation(np.array(x), y)
                    gradient_weights = [grad + delta for grad, delta in zip(gradient_weights, delta_gradient_weights)]
                    gradient_biases = [grad + delta for grad, delta in zip(gradient_biases, delta_gradient_biases)]
                # Take one gradient-descent step with the gradients averaged over the mini-batch.
                self.weights = [w - (lr / mb_len) * grad for w, grad in zip(self.weights, gradient_weights)]
                self.biases = [b - (lr / mb_len) * grad for b, grad in zip(self.biases, gradient_biases)]
            correct_answers = self.how_many_correct_answers(test_data)
            print(f"Epoch number {epoch}: {correct_answers}/{test_data_len} correct answers")

    def backpropagation(self, x, y):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        # Forward pass: store every layer's activation for the backward pass.
        activations = [x]
        prev_activation = x
        for i, (b, w) in enumerate(zip(self.biases, self.weights)):
            current_activation = self.activation_functions[i](np.dot(w, prev_activation) + b)
            activations.append(current_activation)
            prev_activation = current_activation
        # Error (delta) and gradients at the output layer.
        delta = self.cost_function_deriv(activations[-1], y) * self.activation_functions[-1].deriv(activations[-1])
        gradient_b[-1] = delta
        gradient_w[-1] = np.dot(delta, activations[-2].T)
        # Propagate the error backwards through the hidden layers (negative indexing).
        for i in range(2, len(self.layer_sizes)):
            z = activations[-i]
            act_der = self.activation_functions[-i + 1].deriv(z)
            delta = np.dot(self.weights[-i + 1].T, delta) * act_der
            gradient_b[-i] = delta
            gradient_w[-i] = np.dot(delta, activations[-i - 1].T)
        # Equivalent variant with normal (positive) indexing; it starts at the
        # second-to-last layer because the output layer's delta is set above:
        # for i in range(len(self.layer_sizes) - 2, 0, -1):
        #     z = activations[i]
        #     act_der = self.activation_functions[i].deriv(z)
        #     delta = np.dot(self.weights[i].T, delta) * act_der
        #     gradient_b[i - 1] = delta
        #     gradient_w[i - 1] = np.dot(delta, activations[i - 1].T)
        return gradient_b, gradient_w

    def feedforward(self, a):
        for i, (b, w) in enumerate(zip(self.biases, self.weights)):
            a = self.activation_functions[i](np.dot(w, a) + b)
        return a

    def how_many_correct_answers(self, test_data):
        k = 0
        for x, y in test_data:
            y_predict = np.argmax(self.feedforward(x))
            print(y_predict, y)
            k += int(y_predict == y)
        return k

main.py

from copy import deepcopy
import numpy as np
from perceptron import Perceptron


class Sigmoid:
    out_min_max = [0, 1]

    def __call__(self, x):
        return 1. / (1. + np.exp(-x))

    def deriv(self, y):
        # y is already the sigmoid output, so sigmoid'(x) = y * (1 - y):
        # t = self(x)
        # return t * (1. - t)
        return y * (1. - y)


def cost_function_derivative(y_predict, y_true_label):
    label_vector = np.zeros(y_predict.shape)
    label_vector[y_true_label] = 1.0
    return y_predict - label_vector


def main():
    training_data = np.asarray([[[[0], [0]], 0],
                                [[[0], [1]], 1],
                                [[[1], [0]], 1],
                                [[[1], [1]], 0]],
                               dtype=object)  # ragged (input, label) pairs -> object array
    layer_sizes = [2, 8, 2]
    model = Perceptron(layer_sizes=layer_sizes,
                       activation_functions=[Sigmoid(), Sigmoid()],
                       cost_function_deriv=cost_function_derivative)
    model.train(deepcopy(training_data),
                deepcopy(training_data),
                epochs=10000,
                mini_batch_size=4,
                lr=0.01)


if __name__ == '__main__':
    main()
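
For clarity, here is a quick check of the pieces above (an illustration only, not part of the original code; the file name quick_check.py is made up):

quick_check.py

import numpy as np
from perceptron import Perceptron
from main import Sigmoid, cost_function_derivative

# Weight and bias shapes for layer_sizes = [2, 8, 2]
model = Perceptron(layer_sizes=[2, 8, 2],
                   activation_functions=[Sigmoid(), Sigmoid()],
                   cost_function_deriv=cost_function_derivative)
print([w.shape for w in model.weights])  # [(8, 2), (2, 8)]
print([b.shape for b in model.biases])   # [(8, 1), (2, 1)]

# cost_function_derivative turns the integer label into a one-hot column target
y_predict = np.array([[0.7], [0.3]])
print(cost_function_derivative(y_predict, 1))
# [[ 0.7]
#  [-0.7]]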

The final output (printed after each epoch), in the format 'y_predict y_true':
0 0
0 1
0 1
0 0

If random.shuffle(training_data) is removed, then:
1 0
0 1
1 1
0 0
But never 0 1 1 0.

I figured it out. It needs the following:

mini_batch_size=1
# random.shuffle(training_data)  # commented out
epochs=10000

And it is better to use:

lr=0.1
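
Putting these changes together, the call in main() becomes roughly the following (just a sketch of the adjusted configuration; the random.shuffle(training_data) line is commented out inside Perceptron.train):

model.train(deepcopy(training_data),
            deepcopy(training_data),
            epochs=10000,
            mini_batch_size=1,
            lr=0.1)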

In most cases the result is obtained after ~1000 epochs:
0 0
1 1
1 1
0 0