从头开始实现的 C++ 神经网络在 MNIST 上不能超过 50%
C++ neural network implemented from scratch cannot get above 50% on MNIST
所以我用 C++ 实现了一个完全连接的单隐藏层神经网络,使用 Eigen 进行矩阵乘法。它使用小批量梯度下降。
但是,我的模型在 mnist 上的准确率无法超过 50%。我尝试过 0.0001 到 10 之间的学习率。该模型在训练规模小于 100 时会过度拟合(准确率约为 90%,这仍然很糟糕),尽管速度非常慢。
是什么导致了这种低准确性和极其缓慢的学习?我主要担心的是反向传播不正确。此外,我不希望添加任何其他优化技术(学习率计划、正则化等)。
前馈和反向传播代码:
z1 = (w1 * mbX).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
b2 = b2 - err * ones;
w2 = w2 - (err * a1.transpose());
err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
b1 = b1 - err * ones;
w1 = w1 - (err * mbX.transpose());
完整程序代码:
#include <iostream>
#include <fstream>
#include <math.h>
#include <cstdlib>
#include <Eigen/Dense>
#include <vector>
#include <string>
using namespace Eigen;
// ---- Hyper-parameters ----
// Typed constants instead of preprocessor macros, so they respect scope and are type-checked.
constexpr int N = 30;            // number of hidden units (rows of w1, columns of w2)
constexpr double epsilon = 0.7;  // learning rate; the training step divides it by minibatch_size
constexpr int epoch = 1000;      // number of passes over the training set

// ---- Sizes ----
constexpr int minibatch_size = 10;    // samples per gradient step
constexpr int training_size = 10000;  // training samples read from the training files
constexpr int val_size = 10;          // validation samples read from the validation files

// Header fields of the IDX file read most recently; input() overwrites them for every file.
unsigned int num;
unsigned int magic;
unsigned int rows;
unsigned int cols;

// Raw pixel values exactly as read from the files, indexed [sample][row][column].
unsigned int image[training_size][28][28];
unsigned int val_image[val_size][28][28];

// Digit label of each sample.
unsigned int label[training_size];
unsigned int val_label[val_size];

// Training inputs: one column per sample, one row per pixel (28 * 28 = 784).
MatrixXd X(784, training_size);
// Training targets: one column per sample, with a 1 in the row of the sample's label.
MatrixXd Y = MatrixXd::Zero(10, training_size);

// Buffers holding the samples of the minibatch currently being trained on.
MatrixXd mbX(784, minibatch_size);
MatrixXd mbY = MatrixXd::Zero(10, minibatch_size);

// Validation inputs and targets, laid out like X and Y.
MatrixXd Xv(784, val_size);
MatrixXd Yv = MatrixXd::Zero(10, val_size);
// Image processing courtesy of https://stackoverflow.com/users/11146076/%e5%bc%a0%e4%ba%91%e9%93%ad
//
// Reads `size` bytes from `icin` and combines them, most significant byte first, into one
// unsigned integer.
//
// A byte that cannot be read (stream not open, end of file) counts as 0 instead of leaving an
// uninitialised value in the result; the stream's failure state stays set so callers can test it.
unsigned int in(std::ifstream& icin, unsigned int size) {
    unsigned int ans = 0;
    for (unsigned int i = 0; i < size; ++i) {
        unsigned char byte = 0;
        if (!icin.read(reinterpret_cast<char*>(&byte), 1)) {
            byte = 0;  // failed read: make the contribution well defined
        }
        ans = (ans << 8) + byte;
    }
    return ans;
}
void input(std::string ipath, std::string lpath, std::string ipath2, std::string lpath2) {
std::ifstream icin;
//training data
icin.open(ipath, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
for (int i = 0; i < training_size; i++) {
int val = 0;
for (int x = 0; x < rows; x++) {
for (int y = 0; y < cols; y++) {
image[i][x][y] = in(icin, 1);
X(val, i) = image[i][x][y]/255;
val++;
}
}
}
icin.close();
//training labels
icin.open(lpath, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4);
for (int i = 0; i < training_size; i++) {
label[i] = in(icin, 1);
Y(label[i], i) = 1;
}
icin.close();
//validation data
icin.open(ipath2, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
for (int i = 0; i < val_size; i++) {
int val = 0;
for (int x = 0; x < rows; x++) {
for (int y = 0; y < cols; y++) {
val_image[i][x][y] = in(icin, 1);
Xv(val, i) = val_image[i][x][y]/255;
val++;
}
}
}
icin.close();
//validation labels
icin.open(lpath2, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4);
for (int i = 0; i < val_size; i++) {
val_label[i] = in(icin, 1);
Yv(val_label[i], i) = 1;
}
icin.close();
}
//Neural Network calculations

// Element-wise logistic function 1 / (1 + exp(-x)); the result has the same shape as `m`.
// The argument is taken by const reference and negated inside the expression, so the input
// matrix is no longer copied on every call.
MatrixXd sigmoid(const MatrixXd& m) {
    return (1.0 / (1.0 + (-m.array()).exp())).matrix();
}
// Element-wise derivative of the logistic function, s * (1 - s) with s = sigmoid(m).
// The sigmoid is evaluated once and reused; the original evaluated it twice per call.
MatrixXd sigmoid_derivative(const MatrixXd& m) {
    const MatrixXd s = sigmoid(m);
    return (s.array() * (1.0 - s.array())).matrix();
}
// Trainable parameters and scratch matrices shared by the training loop.
// NOTE(review): Random() is presumably uniform in [-1, 1] (confirm in the Eigen docs); with 784
// inputs per hidden unit that may saturate the sigmoids early in training.

// Hidden layer: N units, each connected to the 784 input pixels.
VectorXd b1 = VectorXd::Zero(N);
MatrixXd w1 = MatrixXd::Random(N, 784);

// Output layer: 10 units, each connected to the N hidden units.
VectorXd b2 = VectorXd::Zero(10);
MatrixXd w2 = MatrixXd::Random(10, N);

// Pre-activations (z) and activations (a) of the two layers.
MatrixXd z1, z2, a1, a2;
// Counterparts with a "v" suffix; the visible code declares them but does not use them.
MatrixXd z1v, z2v, a1v, a2v;

// Column of ones: multiplying a matrix by it sums that matrix across its minibatch columns.
MatrixXd ones = MatrixXd::Ones(minibatch_size, 1);
int main() {
input("C:\Users\Aaron\Documents\Test\train-images-idx3-ubyte\train-images.idx3-ubyte", "C:\Users\Aaron\Documents\Test\train-labels-idx1-ubyte\train-labels.idx1-ubyte", "C:\Users\Aaron\Documents\Test\t10k-images-idx3-ubyte\t10k-images.idx3-ubyte", "C:\Users\Aaron\Documents\Test\t10k-labels-idx1-ubyte\t10k-labels.idx1-ubyte");
std::cout << "Finished Image Processing" << std::endl;
//std::cout << w1 << std::endl;
std::vector<double> val_ac;
std::vector<double> c;
std::vector<int> order;
for (int i = 0; i < training_size; i++) {
order.push_back(i);
}
for (int i = 0; i < epoch; i++) {
//feed forward
std::random_shuffle(order.begin(), order.end());
for (int j = 0; j < training_size/minibatch_size; j++) {
for (int k = 0; k < minibatch_size; k++) {
int index = order[j * minibatch_size + k];
mbX.col(k) = X.col(index);
mbY.col(k) = Y.col(index);
}
z1 = (w1 * mbX).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
//std::cout << err << std::endl;
b2 = b2 - err * ones;
w2 = w2 - (err * a1.transpose());
err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
//std::cout << err << std::endl;
b1 = b1 - err * ones;
w1 = w1 - (err * mbX.transpose());
}
//validation
z1 = (w1 * X).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
double cost = 1/((double) training_size) * ((a2 - Y).array() * (a2 - Y).array()).matrix().sum();
c.push_back(cost);
int correct = 0;
for (int i = 0; i < training_size; i++) {
double maxP = -1;
int na;
for (int j = 0; j < 10; j++) {
if (a2(j, i) > maxP) {
maxP = a2(j, i);
na = j;
}
}
if (na == label[i]) correct++;
}
val_ac.push_back(((double) correct) / ((double) training_size));
std::cout << "Finished Epoch " << i + 1 << std::endl;
std::cout << "Cost: " << cost << std::endl;
std::cout << "Accuracy: " << ((double) correct) / ((double) training_size) << std::endl;
}
//plot accuracy
FILE * gp = _popen("gnuplot", "w");
fprintf(gp, "set terminal wxt size 600,400 \n");
fprintf(gp, "set grid \n");
fprintf(gp, "set title '%s' \n", "NN");
fprintf(gp, "plot '-' w line, '-' w lines \n");
for (int i = 0; i < epoch; i++) {
fprintf(gp, "%f %f \n", i + 1.0, c[i]);
}
fprintf(gp, "e\n");
//validation accuracy
for (int i = 0; i < epoch; i++) {
fprintf(gp, "%f %f \n", i + 1.0, val_ac[i]);
}
fprintf(gp, "e\n");
fflush(gp);
system("pause");
_pclose(gp);
return 0;
}
UPD
这是模型在训练数据集上的准确率(绿色)和损失(紫色)曲线图:
https://i.stack.imgur.com/Ya2yR.png
这是训练数据和验证数据的损失图:
验证数据的损失在超过某个点后不断增加,这显示出过度拟合的迹象。然而,即使在训练数据上,准确性仍然很差。
unsigned int val_image[val_size][28][28];
Xv(val, i) = val_image[i][x][y]/255;
你能改成下面这样再试一次吗?Xv(val, i) = val_image[i][x][y] / 255.0;
还有:
X(val, i) = image[i][x][y]/255;
按照目前的写法,Xv 的元素几乎总是 0,只有当像素值恰好为 255 时才为 1。改用浮点除法后,您将得到 0.0 到 1.0 之间的值。
您还需要检查代码中其他可能发生整数除法的地方。
注意:在 C++ 中,整数除法 240/255 的结果为 0。
所以我用 C++ 实现了一个完全连接的单隐藏层神经网络,使用 Eigen 进行矩阵乘法。它使用小批量梯度下降。
但是,我的模型在 mnist 上的准确率无法超过 50%。我尝试过 0.0001 到 10 之间的学习率。该模型在训练规模小于 100 时会过度拟合(准确率约为 90%,这仍然很糟糕),尽管速度非常慢。
是什么导致了这种低准确性和极其缓慢的学习?我主要担心的是反向传播不正确。此外,我不希望添加任何其他优化技术(学习率计划、正则化等)。
前馈和反向传播代码:
z1 = (w1 * mbX).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
b2 = b2 - err * ones;
w2 = w2 - (err * a1.transpose());
err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
b1 = b1 - err * ones;
w1 = w1 - (err * mbX.transpose());
完整程序代码:
#include <iostream>
#include <fstream>
#include <math.h>
#include <cstdlib>
#include <Eigen/Dense>
#include <vector>
#include <string>
using namespace Eigen;
// ---- Hyper-parameters ----
// Typed constants instead of preprocessor macros, so they respect scope and are type-checked.
constexpr int N = 30;            // number of hidden units (rows of w1, columns of w2)
constexpr double epsilon = 0.7;  // learning rate; the training step divides it by minibatch_size
constexpr int epoch = 1000;      // number of passes over the training set

// ---- Sizes ----
constexpr int minibatch_size = 10;    // samples per gradient step
constexpr int training_size = 10000;  // training samples read from the training files
constexpr int val_size = 10;          // validation samples read from the validation files

// Header fields of the IDX file read most recently; input() overwrites them for every file.
unsigned int num;
unsigned int magic;
unsigned int rows;
unsigned int cols;

// Raw pixel values exactly as read from the files, indexed [sample][row][column].
unsigned int image[training_size][28][28];
unsigned int val_image[val_size][28][28];

// Digit label of each sample.
unsigned int label[training_size];
unsigned int val_label[val_size];

// Training inputs: one column per sample, one row per pixel (28 * 28 = 784).
MatrixXd X(784, training_size);
// Training targets: one column per sample, with a 1 in the row of the sample's label.
MatrixXd Y = MatrixXd::Zero(10, training_size);

// Buffers holding the samples of the minibatch currently being trained on.
MatrixXd mbX(784, minibatch_size);
MatrixXd mbY = MatrixXd::Zero(10, minibatch_size);

// Validation inputs and targets, laid out like X and Y.
MatrixXd Xv(784, val_size);
MatrixXd Yv = MatrixXd::Zero(10, val_size);
// Image processing courtesy of https://stackoverflow.com/users/11146076/%e5%bc%a0%e4%ba%91%e9%93%ad
//
// Reads `size` bytes from `icin` and combines them, most significant byte first, into one
// unsigned integer.
//
// A byte that cannot be read (stream not open, end of file) counts as 0 instead of leaving an
// uninitialised value in the result; the stream's failure state stays set so callers can test it.
unsigned int in(std::ifstream& icin, unsigned int size) {
    unsigned int ans = 0;
    for (unsigned int i = 0; i < size; ++i) {
        unsigned char byte = 0;
        if (!icin.read(reinterpret_cast<char*>(&byte), 1)) {
            byte = 0;  // failed read: make the contribution well defined
        }
        ans = (ans << 8) + byte;
    }
    return ans;
}
void input(std::string ipath, std::string lpath, std::string ipath2, std::string lpath2) {
std::ifstream icin;
//training data
icin.open(ipath, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
for (int i = 0; i < training_size; i++) {
int val = 0;
for (int x = 0; x < rows; x++) {
for (int y = 0; y < cols; y++) {
image[i][x][y] = in(icin, 1);
X(val, i) = image[i][x][y]/255;
val++;
}
}
}
icin.close();
//training labels
icin.open(lpath, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4);
for (int i = 0; i < training_size; i++) {
label[i] = in(icin, 1);
Y(label[i], i) = 1;
}
icin.close();
//validation data
icin.open(ipath2, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
for (int i = 0; i < val_size; i++) {
int val = 0;
for (int x = 0; x < rows; x++) {
for (int y = 0; y < cols; y++) {
val_image[i][x][y] = in(icin, 1);
Xv(val, i) = val_image[i][x][y]/255;
val++;
}
}
}
icin.close();
//validation labels
icin.open(lpath2, std::ios::binary);
magic = in(icin, 4), num = in(icin, 4);
for (int i = 0; i < val_size; i++) {
val_label[i] = in(icin, 1);
Yv(val_label[i], i) = 1;
}
icin.close();
}
//Neural Network calculations

// Element-wise logistic function 1 / (1 + exp(-x)); the result has the same shape as `m`.
// The argument is taken by const reference and negated inside the expression, so the input
// matrix is no longer copied on every call.
MatrixXd sigmoid(const MatrixXd& m) {
    return (1.0 / (1.0 + (-m.array()).exp())).matrix();
}
// Element-wise derivative of the logistic function, s * (1 - s) with s = sigmoid(m).
// The sigmoid is evaluated once and reused; the original evaluated it twice per call.
MatrixXd sigmoid_derivative(const MatrixXd& m) {
    const MatrixXd s = sigmoid(m);
    return (s.array() * (1.0 - s.array())).matrix();
}
// Trainable parameters and scratch matrices shared by the training loop.
// NOTE(review): Random() is presumably uniform in [-1, 1] (confirm in the Eigen docs); with 784
// inputs per hidden unit that may saturate the sigmoids early in training.

// Hidden layer: N units, each connected to the 784 input pixels.
VectorXd b1 = VectorXd::Zero(N);
MatrixXd w1 = MatrixXd::Random(N, 784);

// Output layer: 10 units, each connected to the N hidden units.
VectorXd b2 = VectorXd::Zero(10);
MatrixXd w2 = MatrixXd::Random(10, N);

// Pre-activations (z) and activations (a) of the two layers.
MatrixXd z1, z2, a1, a2;
// Counterparts with a "v" suffix; the visible code declares them but does not use them.
MatrixXd z1v, z2v, a1v, a2v;

// Column of ones: multiplying a matrix by it sums that matrix across its minibatch columns.
MatrixXd ones = MatrixXd::Ones(minibatch_size, 1);
int main() {
input("C:\Users\Aaron\Documents\Test\train-images-idx3-ubyte\train-images.idx3-ubyte", "C:\Users\Aaron\Documents\Test\train-labels-idx1-ubyte\train-labels.idx1-ubyte", "C:\Users\Aaron\Documents\Test\t10k-images-idx3-ubyte\t10k-images.idx3-ubyte", "C:\Users\Aaron\Documents\Test\t10k-labels-idx1-ubyte\t10k-labels.idx1-ubyte");
std::cout << "Finished Image Processing" << std::endl;
//std::cout << w1 << std::endl;
std::vector<double> val_ac;
std::vector<double> c;
std::vector<int> order;
for (int i = 0; i < training_size; i++) {
order.push_back(i);
}
for (int i = 0; i < epoch; i++) {
//feed forward
std::random_shuffle(order.begin(), order.end());
for (int j = 0; j < training_size/minibatch_size; j++) {
for (int k = 0; k < minibatch_size; k++) {
int index = order[j * minibatch_size + k];
mbX.col(k) = X.col(index);
mbY.col(k) = Y.col(index);
}
z1 = (w1 * mbX).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
//std::cout << err << std::endl;
b2 = b2 - err * ones;
w2 = w2 - (err * a1.transpose());
err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
//std::cout << err << std::endl;
b1 = b1 - err * ones;
w1 = w1 - (err * mbX.transpose());
}
//validation
z1 = (w1 * X).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
double cost = 1/((double) training_size) * ((a2 - Y).array() * (a2 - Y).array()).matrix().sum();
c.push_back(cost);
int correct = 0;
for (int i = 0; i < training_size; i++) {
double maxP = -1;
int na;
for (int j = 0; j < 10; j++) {
if (a2(j, i) > maxP) {
maxP = a2(j, i);
na = j;
}
}
if (na == label[i]) correct++;
}
val_ac.push_back(((double) correct) / ((double) training_size));
std::cout << "Finished Epoch " << i + 1 << std::endl;
std::cout << "Cost: " << cost << std::endl;
std::cout << "Accuracy: " << ((double) correct) / ((double) training_size) << std::endl;
}
//plot accuracy
FILE * gp = _popen("gnuplot", "w");
fprintf(gp, "set terminal wxt size 600,400 \n");
fprintf(gp, "set grid \n");
fprintf(gp, "set title '%s' \n", "NN");
fprintf(gp, "plot '-' w line, '-' w lines \n");
for (int i = 0; i < epoch; i++) {
fprintf(gp, "%f %f \n", i + 1.0, c[i]);
}
fprintf(gp, "e\n");
//validation accuracy
for (int i = 0; i < epoch; i++) {
fprintf(gp, "%f %f \n", i + 1.0, val_ac[i]);
}
fprintf(gp, "e\n");
fflush(gp);
system("pause");
_pclose(gp);
return 0;
}
UPD
这是模型在训练数据集上的准确率(绿色)和损失(紫色)曲线图:
https://i.stack.imgur.com/Ya2yR.png
这是训练数据和验证数据的损失图:
验证数据的损失在超过某个点后不断增加,这显示出过度拟合的迹象。然而,即使在训练数据上,准确性仍然很差。
unsigned int val_image[val_size][28][28];
Xv(val, i) = val_image[i][x][y]/255;
你能改成下面这样再试一次吗?Xv(val, i) = val_image[i][x][y] / 255.0;
还有:
X(val, i) = image[i][x][y]/255;
按照目前的写法,Xv 的元素几乎总是 0,只有当像素值恰好为 255 时才为 1。改用浮点除法后,您将得到 0.0 到 1.0 之间的值。
您还需要检查代码中其他可能发生整数除法的地方。
注意:在 C++ 中,整数除法 240/255 的结果为 0。