Last week someone at my company remarked that pretty much every C/C++ engineer here could, within a day or two, understand backpropagation and implement it without relying on any existing data-analysis library. So I tried writing one by hand and roughly tallied the time: about 3-4 hours to understand the mathematical details (not counting the proofs), about 3 hours to implement, and about 30 minutes to debug. The end result: after training on various inputs, the error on test_data converges.
Highly condensed, the algorithm boils down to one set of equations and one procedure (ref):
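Presumably these are the four standard backprop equations (Nielsen's notation; ⊙ denotes the element-wise product):

$$
\begin{aligned}
\delta^L &= \nabla_a C \odot \sigma'(z^L) \\
\delta^l &= \left((w^{l+1})^T \delta^{l+1}\right) \odot \sigma'(z^l) \\
\frac{\partial C}{\partial b^l_j} &= \delta^l_j \\
\frac{\partial C}{\partial w^l_{jk}} &= a^{l-1}_k\,\delta^l_j
\end{aligned}
$$

And the per-sample procedure: (1) set the input activation, (2) feed forward and record every z^l and a^l, (3) compute the output error δ^L, (4) back-propagate δ^l layer by layer, (5) take a gradient step averaged over the mini-batch.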
Very little mathematics is actually involved (proofs aside) - the computation of ∇aC in step 3 (if C is taken to be the MSE (mean squared error), this step becomes trivial) and the computation of σ′; a linear algebra textbook at hand is all you need. The traditional, unoptimized matrix multiplication is also straightforward.
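Concretely, taking C = ½‖a^L − y‖² (MSE up to a constant factor) and σ the logistic sigmoid, the two quantities become:

$$
\nabla_a C = a^L - y, \qquad
\sigma(z) = \frac{1}{1 + e^{-z}} \;\Rightarrow\; \sigma'(z) = \sigma(z)\,(1 - \sigma(z)),
$$

which is exactly what the line eL[n] = (aL[n] - Y[n]) * sigmoid_prime(zL[n]) computes in backprop() below.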
The interesting part is the math; the coding itself is well within reach of any ordinary C/C++ engineer.
One more thing - Bacon's "writing makes an exact man" has served me well: many impressions and factual details can only be pinned down once you write them out. Understanding the details of an algorithm is no different - code it out!
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>
typedef std::vector<double> vec;
typedef std::vector<vec> matrix;
typedef std::vector<matrix> matrix_vec;
typedef std::pair<vec, vec> sample;
typedef std::vector<sample> sample_data;
class Network {
public:
Network(const std::vector<int>& layer_sizes_, double eta_):
layer_num(layer_sizes_.size()), layer_sizes(layer_sizes_), eta(eta_)
{
// Generate initial values for weights & bias from a normal distribution
// with mean 0 and standard deviation 0.4.
shape(weights);
shape(bias);
// Seed the generator once, outside the loop, so consecutive layers don't
// end up reusing the same time-based seed.
unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
std::default_random_engine generator(seed);
std::normal_distribution<double> distribution(0, 0.4);
for (int l = 1; l < layer_num; ++l) {
for (int i = 0; i < layer_sizes[l]; ++i) {
bias[l][i] = distribution(generator);
for (int j = 0; j < layer_sizes[l-1]; ++j)
weights[l][i][j] = distribution(generator);
}
}
}
void train(const sample_data& data, const sample_data& test_data) {
printf("Pre-epoch cost: %f\n", cost(test_data));
const int mini_data_size = 50;
int epoch = 1;
// Note: "epoch" here counts mini-batches, not full passes over the data.
for (auto p = data.begin(), q = data.end(); p < q; ++epoch) {
// Clamp the batch end so the iterator is never advanced past data.end().
auto batch_end = q - p > mini_data_size ? p + mini_data_size : q;
train_batch(p, batch_end);
printf("Epoch: %d: %f\n", epoch, cost(test_data));
p = batch_end;
}
}
void train_batch(sample_data::const_iterator start, sample_data::const_iterator end) {
const int n = end - start;
matrix_vec w_delta;
matrix b_delta;
shape(w_delta);
shape(b_delta);
for (auto i = start; i < end; ++i)
backprop(*i, w_delta, b_delta);
// now update weights and bias
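// Plain mini-batch gradient descent: each parameter moves by (eta / n) times
// the summed per-sample gradient accumulated in w_delta / b_delta.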
double t = eta / (double)n;
for (int l = 1; l < layer_num; ++l) {
for (int j = 0; j < layer_sizes[l]; ++j) {
bias[l][j] -= t * b_delta[l][j];
for (int k = 0; k < layer_sizes[l - 1]; ++k) {
weights[l][j][k] -= t * w_delta[l][j][k];
}
}
}
}
void print_weights_and_bias() {
for (int l = 1; l < layer_num; ++l) {
printf("Layer %d\n", l);
printf(" bias:");
for (int j = 0; j < layer_sizes[l]; ++j) {
printf(" %f", bias[l][j]);
}
printf("\n");
printf(" weights:\n");
for (int j = 0; j < layer_sizes[l]; ++j) {
for (int k = 0; k < layer_sizes[l - 1]; ++k)
printf(" %8f", weights[l][j][k]);
printf("\n");
}
}
}
// Given a single training sample (X, Y), accumulate its gradient into weights_delta & bias_delta.
void backprop(const sample& s, matrix_vec& weights_delta, matrix& bias_delta) {
// For convenience.
const vec& X = s.first;
const vec& Y = s.second;
assert(X.size() == layer_sizes[0]);
assert(Y.size() == layer_sizes[layer_num - 1]);
matrix z_values;
matrix a_values;
feed_forward(X, z_values, a_values);
// Now compute errors at the output layer.
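// delta^L = (a^L - Y) * sigmoid'(z^L), element-wise; (a^L - Y) is grad_a C
// for the squared-error cost.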
matrix errors;
shape(errors);
const vec& aL = a_values[layer_num - 1];
const vec& zL = z_values[layer_num - 1];
vec& eL = errors[layer_num - 1];
for (int n = 0; n < layer_sizes[layer_num - 1]; ++n) {
eL[n] = (aL[n] - Y[n]) * sigmoid_prime(zL[n]);
}
// Back propagate errors.
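// delta^l = ((w^{l+1})^T delta^{l+1}) * sigmoid'(z^l), element-wise.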
for (int l = layer_num - 2; l >= 1; --l) {
// Transpose weights[l + 1], which has shape
// (layer_sizes[l+1] x layer_sizes[l]),
// so the transposed matrix twl has shape
// (layer_sizes[l] x layer_sizes[l+1]).
matrix twl;
transpose(weights[l + 1], twl);
// Multiplying twl by errors[l + 1] (shape (layer_sizes[l+1], 1))
// yields a vector of shape (layer_sizes[l], 1).
vec result;
multiply(twl, errors[l + 1], result);
assert(result.size() == layer_sizes[l]);
for (int i = 0; i < errors[l].size(); ++i)
errors[l][i] = result[i] * sigmoid_prime(z_values[l][i]);
}
// Now that errors[1..L-1] and a_values[0..L-1] are ready, we can
// accumulate the gradient.
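// dC/db[l][a] = errors[l][a], dC/dw[l][a][b] = a_values[l-1][b] * errors[l][a];
// accumulated with += so train_batch can sum the gradients over a mini-batch.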
for (int l = 1; l < layer_num; ++l) {
for (int a = 0; a < layer_sizes[l]; ++a) {
bias_delta[l][a] += errors[l][a];
for (int b = 0; b < layer_sizes[l - 1]; ++b) {
weights_delta[l][a][b] += a_values[l - 1][b] * errors[l][a];
}
}
}
}
protected:
static void multiply(const matrix& m, const vec& v, vec& result) {
assert(m[0].size() == v.size());
result = vec(m.size());
for (int a = 0; a < m.size(); ++a) {
double t(0);
for (int b = 0; b < m[a].size(); ++b) {
t += m[a][b] * v[b];
}
result[a] = t;
}
}
void shape(matrix& m) const {
m = std::vector<vec>(layer_num);
for (int l = 0; l < layer_num; ++l)
shape(m[l], layer_sizes[l]);
}
void shape(matrix_vec& mv) const {
mv = std::vector<matrix>(layer_num);
for (int l = 1; l < layer_num; ++l) {
shape(mv[l], layer_sizes[l], layer_sizes[l - 1]);
}
}
// z_values[l][a] is the input value of layer[l], neuron a.
// a_values[l][a] is the activation value of layer[l], neuron a.
// In general, a_values = sigmoid(z_values).
// They are important values used in backprop.
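// Note: the input X is passed through the sigmoid as well, so
// a_values[0] = sigmoid(X) rather than X itself.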
void feed_forward(const vec& X, matrix& z_values, matrix& a_values) const {
shape(z_values);
shape(a_values);
z_values[0] = X;
for (int i = 0; i < z_values[0].size(); ++i) {
a_values[0][i] = sigmoid(z_values[0][i]);
}
for (int l = 1; l < layer_num; ++l) {
for (int n = 0; n < layer_sizes[l]; ++n) {
double z_sum = bias[l][n];
for (int m = 0; m < layer_sizes[l - 1]; ++m) {
z_sum += a_values[l - 1][m] * weights[l][n][m];
}
z_values[l][n] = z_sum;
a_values[l][n] = sigmoid(z_sum);
}
}
}
double cost(const sample_data& test_data) const {
double c = 0.0;
std::for_each(test_data.begin(), test_data.end(),
[&c, this](const sample& s) {
c += this->cost(s);
});
return c / test_data.size();
}
double cost(const sample& s) const {
matrix z_values;
matrix a_values;
const vec& X = s.first;
const vec& Y = s.second;
feed_forward(X, z_values, a_values);
double c = 0.0;
for (int k = 0, l = layer_num - 1; k < layer_sizes[layer_num - 1]; ++k) {
double t = (a_values[l][k] - Y[k]);
c += t * t;
}
return c;
}
static void shape(vec& v, int n) {
v = vec(n, 0.0f);
}
static void shape(matrix& m, int r, int c) {
m = matrix(r);
std::for_each(m.begin(), m.end(),
[c](vec& v) {v = vec(c, 0.0f);});
}
static double sigmoid(double z) {
return 1.0 / (1 + exp(-z));
}
static double sigmoid_prime(double z) {
return sigmoid(z)*(1 - sigmoid(z));
}
static void transpose(const matrix& m1, matrix& m2) {
shape(m2, m1[0].size(), m1.size());
for (int i = 0; i < m1.size(); ++i)
for (int j = 0; j < m1[i].size(); ++j)
m2[j][i] = m1[i][j];
}
private:
// How many layers does the network have?
const int layer_num;
// Each element is the number of neurons in the corresponding layer. For example,
// <1,8,7> means a 3-layer network with 1 neuron in the 1st layer, 8 in the 2nd,
// and 7 in the 3rd.
const std::vector<int> layer_sizes;
// bias[l][a] means bias of (layer l, neuron a). 1 <= l < layer_num.
matrix bias;
// weights[l][a][b] means: 1 <= l < layer_num;
// weight that connects: (layer l-1, neuron b) to (layer l, neuron a).
// Note the sequence of a, b. "a" is the neuron on layer l, "b" is on layer l - 1.
matrix_vec weights;
// Learning rate (a hyperparameter).
double eta;
};
// Squash d into [0, 1]: take |d| and use its reciprocal when it exceeds 1.
inline static double P(double d) {
if (d < 0) d = -d;
return d > 1 ? 1 / d : d;
}
// A target function to be learned.
double func(const vec& X) {
double v(0);
std::for_each(X.begin(), X.end(),
[&v](const double d) {
v += std::sin(d) / std::cos(1+d) + std::tan(d);
});
static std::default_random_engine generator(
std::chrono::system_clock::now().time_since_epoch().count());
static std::normal_distribution<double> distribution(0, 0.05);
return P(v + distribution(generator));
}
void generate_data(sample_data& training_data, double seed, const int n) {
training_data = sample_data(n);
std::for_each(training_data.begin(),
training_data.end(),
[&seed](sample& sam) {
double di(seed);
vec input = {P(di), P(di * seed), P(di / seed), P(di / 3)};
double output = func(input);
sam = sample(input, {output});
seed = output;
});
}
int main() {
sample_data training_data, test_data;
generate_data(training_data, 1, 50000);
generate_data(test_data, 0.01, 200);
Network network({(int)training_data[0].first.size(), 10, 9, 8, 7, 1}, 0.09);
network.train(training_data, test_data);
network.print_weights_and_bias();
return 0;
}
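To try it out (assuming the listing is saved as backprop.cpp - the filename is mine), something like g++ -std=c++11 -O2 backprop.cpp -o backprop && ./backprop should print the pre-training cost on test_data, the cost after each mini-batch, and finally the learned weights and biases.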