diff --git a/setup.py b/setup.py index 4912d2718228462ec26ff0dae418da436378b501..a36ca6a1accd4d20ecd5ad1d1e639eb23631ca42 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ include_dirs = [ xbob.core.get_include(), ] -packages = ['bob-machine >= 1.2.2', 'bob-trainer >= 1.2.2'] +packages = ['bob-machine >= 1.2.2'] version = '2.0.0a0' setup( @@ -63,6 +63,13 @@ setup( ), Extension("xbob.learn.mlp._library", [ + "xbob/learn/mlp/cxx/machine.cpp", + "xbob/learn/mlp/cxx/cross_entropy.cpp", + "xbob/learn/mlp/cxx/square_error.cpp", + "xbob/learn/mlp/cxx/shuffler.cpp", + "xbob/learn/mlp/cxx/base_trainer.cpp", + "xbob/learn/mlp/cxx/backprop.cpp", + "xbob/learn/mlp/cxx/rprop.cpp", "xbob/learn/mlp/shuffler.cpp", "xbob/learn/mlp/cost.cpp", "xbob/learn/mlp/machine.cpp", diff --git a/xbob/learn/mlp/cost.cpp b/xbob/learn/mlp/cost.cpp index a8e9b8c6881cb6be281d8f5495cb1971d648da8d..54bdbd8ece7739c81826780b67926b219c0d0322 100644 --- a/xbob/learn/mlp/cost.cpp +++ b/xbob/learn/mlp/cost.cpp @@ -300,10 +300,10 @@ static PyObject* PyBobLearnCost_f if (PyNumber_Check(arg) && !(PyArray_Check(arg) || PyBlitzArray_Check(arg))) return apply_scalar(self, s_f_str, - boost::bind(&bob::trainer::Cost::f, self->cxx, _1, _2), args, kwds); + boost::bind(&bob::learn::mlp::Cost::f, self->cxx, _1, _2), args, kwds); return apply_array(self, s_f_str, - boost::bind(&bob::trainer::Cost::f, self->cxx, _1, _2), args, kwds); + boost::bind(&bob::learn::mlp::Cost::f, self->cxx, _1, _2), args, kwds); } @@ -349,11 +349,11 @@ static PyObject* PyBobLearnCost_f_prime if (PyNumber_Check(arg) && !(PyArray_Check(arg) || PyBlitzArray_Check(arg))) return apply_scalar(self, s_f_prime_str, - boost::bind(&bob::trainer::Cost::f_prime, + boost::bind(&bob::learn::mlp::Cost::f_prime, self->cxx, _1, _2), args, kwds); return apply_array(self, s_f_prime_str, - boost::bind(&bob::trainer::Cost::f_prime, + boost::bind(&bob::learn::mlp::Cost::f_prime, self->cxx, _1, _2), args, kwds); } @@ -412,10 +412,10 @@ static PyObject* PyBobLearnCost_error if (PyNumber_Check(arg) && !(PyArray_Check(arg) || PyBlitzArray_Check(arg))) return apply_scalar(self, s_error_str, - boost::bind(&bob::trainer::Cost::error, self->cxx, _1, _2), args, kwds); + boost::bind(&bob::learn::mlp::Cost::error, self->cxx, _1, _2), args, kwds); return apply_array(self, s_error_str, - boost::bind(&bob::trainer::Cost::error, self->cxx, _1, _2), args, kwds); + boost::bind(&bob::learn::mlp::Cost::error, self->cxx, _1, _2), args, kwds); } @@ -516,7 +516,7 @@ static int PyBobLearnSquareError_init try { auto _actfun = reinterpret_cast<PyBobLearnActivationObject*>(actfun); - self->cxx = new bob::trainer::SquareError(_actfun->cxx); + self->cxx = new bob::learn::mlp::SquareError(_actfun->cxx); } catch (std::exception& ex) { PyErr_SetString(PyExc_RuntimeError, ex.what()); @@ -635,7 +635,7 @@ static int PyBobLearnCrossEntropyLoss_init try { auto _actfun = reinterpret_cast<PyBobLearnActivationObject*>(actfun); - self->cxx = new bob::trainer::CrossEntropyLoss(_actfun->cxx); + self->cxx = new bob::learn::mlp::CrossEntropyLoss(_actfun->cxx); } catch (std::exception& ex) { PyErr_SetString(PyExc_RuntimeError, ex.what()); diff --git a/xbob/learn/mlp/cxx/backprop.cpp b/xbob/learn/mlp/cxx/backprop.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f6b3388e7b5d5dfcf6586c4ee6e9d8c00cfb4cd3 --- /dev/null +++ b/xbob/learn/mlp/cxx/backprop.cpp @@ -0,0 +1,185 @@ +/** + * @date Mon Jul 18 18:11:22 2011 +0200 + * @author Andre Anjos <andre.anjos@idiap.ch> + * @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch> + * + * @brief Implementation of the BackProp algorithm for MLP training. + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#include <algorithm> +#include <bob/core/check.h> +#include <bob/math/linear.h> + +#include <xbob.learn.mlp/backprop.h> + +bob::learn::mlp::BackProp::BackProp(size_t batch_size, + boost::shared_ptr<bob::learn::mlp::Cost> cost): + bob::learn::mlp::BaseTrainer(batch_size, cost), + m_learning_rate(0.1), + m_momentum(0.0), + m_prev_deriv(numberOfHiddenLayers() + 1), + m_prev_deriv_bias(numberOfHiddenLayers() + 1) +{ + reset(); +} + +bob::learn::mlp::BackProp::BackProp(size_t batch_size, + boost::shared_ptr<bob::learn::mlp::Cost> cost, + const bob::learn::mlp::Machine& machine): + bob::learn::mlp::BaseTrainer(batch_size, cost, machine), + m_learning_rate(0.1), + m_momentum(0.0), + m_prev_deriv(numberOfHiddenLayers() + 1), + m_prev_deriv_bias(numberOfHiddenLayers() + 1) +{ + initialize(machine); +} + +bob::learn::mlp::BackProp::BackProp(size_t batch_size, + boost::shared_ptr<bob::learn::mlp::Cost> cost, + const bob::learn::mlp::Machine& machine, bool train_biases): + bob::learn::mlp::BaseTrainer(batch_size, cost, machine, train_biases), + m_learning_rate(0.1), + m_momentum(0.0), + m_prev_deriv(numberOfHiddenLayers() + 1), + m_prev_deriv_bias(numberOfHiddenLayers() + 1) +{ + initialize(machine); +} + +bob::learn::mlp::BackProp::~BackProp() { } + +bob::learn::mlp::BackProp::BackProp(const BackProp& other): + bob::learn::mlp::BaseTrainer(other), + m_learning_rate(other.m_learning_rate), + m_momentum(other.m_momentum) +{ + bob::core::array::ccopy(other.m_prev_deriv, m_prev_deriv); + bob::core::array::ccopy(other.m_prev_deriv_bias, m_prev_deriv_bias); +} + +bob::learn::mlp::BackProp& bob::learn::mlp::BackProp::operator= +(const bob::learn::mlp::BackProp& other) { + if (this != &other) + { + bob::learn::mlp::BaseTrainer::operator=(other); + m_learning_rate = other.m_learning_rate; + m_momentum = other.m_momentum; + + bob::core::array::ccopy(other.m_prev_deriv, m_prev_deriv); + bob::core::array::ccopy(other.m_prev_deriv_bias, m_prev_deriv_bias); + } + return *this; +} + +void bob::learn::mlp::BackProp::reset() { + for (size_t k=0; k<(numberOfHiddenLayers() + 1); ++k) { + m_prev_deriv[k] = 0; + m_prev_deriv_bias[k] = 0; + } +} + +void bob::learn::mlp::BackProp::backprop_weight_update(bob::learn::mlp::Machine& machine, + const blitz::Array<double,2>& input) +{ + std::vector<blitz::Array<double,2> >& machine_weight = + machine.updateWeights(); + std::vector<blitz::Array<double,1> >& machine_bias = + machine.updateBiases(); + const std::vector<blitz::Array<double,2> >& deriv = getDerivatives(); + for (size_t k=0; k<machine_weight.size(); ++k) { //for all layers + machine_weight[k] -= (((1-m_momentum)*m_learning_rate*deriv[k]) + + (m_momentum*m_prev_deriv[k])); + m_prev_deriv[k] = m_learning_rate*deriv[k]; + + // Here we decide if we should train the biases or not + if (!getTrainBiases()) continue; + + const std::vector<blitz::Array<double,1> >& deriv_bias = getBiasDerivatives(); + // We do the same for the biases, with the exception that biases can be + // considered as input neurons connecting the respective layers, with a + // fixed input = +1. This means we only need to probe for the error at + // layer k. + machine_bias[k] -= (((1-m_momentum)*m_learning_rate*deriv_bias[k]) + + (m_momentum*m_prev_deriv_bias[k])); + m_prev_deriv_bias[k] = m_learning_rate*deriv_bias[k]; + } +} + +void bob::learn::mlp::BackProp::setPreviousDerivatives(const std::vector<blitz::Array<double,2> >& v) { + bob::core::array::assertSameDimensionLength(v.size(), m_prev_deriv.size()); + for (size_t k=0; k<v.size(); ++k) { + bob::core::array::assertSameShape(v[k], m_prev_deriv[k]); + m_prev_deriv[k] = v[k]; + } +} + +void bob::learn::mlp::BackProp::setPreviousDerivative(const blitz::Array<double,2>& v, const size_t k) { + if (k >= m_prev_deriv.size()) { + boost::format m("MLPRPropTrainer: index for setting previous derivative array %lu is not on the expected range of [0, %lu]"); + m % k % (m_prev_deriv.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(v, m_prev_deriv[k]); + m_prev_deriv[k] = v; +} + +void bob::learn::mlp::BackProp::setPreviousBiasDerivatives(const std::vector<blitz::Array<double,1> >& v) { + bob::core::array::assertSameDimensionLength(v.size(), m_prev_deriv_bias.size()); + for (size_t k=0; k<v.size(); ++k) + { + bob::core::array::assertSameShape(v[k], m_prev_deriv_bias[k]); + m_prev_deriv_bias[k] = v[k]; + } +} + +void bob::learn::mlp::BackProp::setPreviousBiasDerivative(const blitz::Array<double,1>& v, const size_t k) { + if (k >= m_prev_deriv_bias.size()) { + boost::format m("MLPRPropTrainer: index for setting previous bias derivative array %lu is not on the expected range of [0, %lu]"); + m % k % (m_prev_deriv_bias.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(v, m_prev_deriv_bias[k]); + m_prev_deriv_bias[k] = v; +} + +void bob::learn::mlp::BackProp::initialize(const bob::learn::mlp::Machine& machine) +{ + bob::learn::mlp::BaseTrainer::initialize(machine); + + const std::vector<blitz::Array<double,2> >& machine_weight = + machine.getWeights(); + const std::vector<blitz::Array<double,1> >& machine_bias = + machine.getBiases(); + + m_prev_deriv.resize(numberOfHiddenLayers() + 1); + m_prev_deriv_bias.resize(numberOfHiddenLayers() + 1); + for (size_t k=0; k<(numberOfHiddenLayers() + 1); ++k) { + m_prev_deriv[k].reference(blitz::Array<double,2>(machine_weight[k].shape())); + m_prev_deriv_bias[k].reference(blitz::Array<double,1>(machine_bias[k].shape())); + } + + reset(); +} + +void bob::learn::mlp::BackProp::train(bob::learn::mlp::Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target) { + if (!isCompatible(machine)) { + throw std::runtime_error("input machine is incompatible with this trainer"); + } + bob::core::array::assertSameDimensionLength(getBatchSize(), input.extent(0)); + bob::core::array::assertSameDimensionLength(getBatchSize(), target.extent(0)); + train_(machine, input, target); +} + +void bob::learn::mlp::BackProp::train_(bob::learn::mlp::Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target) { + // To be called in this sequence for a general backprop algorithm + forward_step(machine, input); + backward_step(machine, input, target); + backprop_weight_update(machine, input); +} diff --git a/xbob/learn/mlp/cxx/base_trainer.cpp b/xbob/learn/mlp/cxx/base_trainer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df88929d71ef0c61013055c1122c396953cfaa62 --- /dev/null +++ b/xbob/learn/mlp/cxx/base_trainer.cpp @@ -0,0 +1,311 @@ +/** + * @date Tue May 14 12:04:51 CEST 2013 + * @author Andre Anjos <andre.anjos@idiap.ch> + * @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch> + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#include <algorithm> +#include <bob/core/assert.h> +#include <bob/core/check.h> +#include <bob/math/linear.h> + +#include <xbob.learn.mlp/base_trainer.h> + +bob::learn::mlp::BaseTrainer::BaseTrainer(size_t batch_size, + boost::shared_ptr<bob::learn::mlp::Cost> cost): + m_batch_size(batch_size), + m_cost(cost), + m_train_bias(true), + m_H(0), ///< handy! + m_deriv(1), + m_deriv_bias(1), + m_error(1), + m_output(1) +{ + m_deriv[0].reference(blitz::Array<double,2>(0,0)); + m_deriv_bias[0].reference(blitz::Array<double,1>(0)); + m_error[0].reference(blitz::Array<double,2>(0,0)); + m_output[0].reference(blitz::Array<double,2>(0,0)); + reset(); +} + +bob::learn::mlp::BaseTrainer::BaseTrainer(size_t batch_size, + boost::shared_ptr<bob::learn::mlp::Cost> cost, + const bob::learn::mlp::Machine& machine): + m_batch_size(batch_size), + m_cost(cost), + m_train_bias(true), + m_H(machine.numOfHiddenLayers()), ///< handy! + m_deriv(m_H + 1), + m_deriv_bias(m_H + 1), + m_error(m_H + 1), + m_output(m_H + 1) +{ + initialize(machine); +} + +bob::learn::mlp::BaseTrainer::BaseTrainer(size_t batch_size, + boost::shared_ptr<bob::learn::mlp::Cost> cost, + const bob::learn::mlp::Machine& machine, + bool train_biases): + m_batch_size(batch_size), + m_cost(cost), + m_train_bias(train_biases), + m_H(machine.numOfHiddenLayers()), ///< handy! + m_deriv(m_H + 1), + m_deriv_bias(m_H + 1), + m_error(m_H + 1), + m_output(m_H + 1) +{ + initialize(machine); +} + +bob::learn::mlp::BaseTrainer::~BaseTrainer() { } + +bob::learn::mlp::BaseTrainer::BaseTrainer(const BaseTrainer& other): + m_batch_size(other.m_batch_size), + m_cost(other.m_cost), + m_train_bias(other.m_train_bias), + m_H(other.m_H) +{ + bob::core::array::ccopy(other.m_deriv, m_deriv); + bob::core::array::ccopy(other.m_deriv_bias, m_deriv_bias); + bob::core::array::ccopy(other.m_error, m_error); + bob::core::array::ccopy(other.m_output, m_output); +} + +bob::learn::mlp::BaseTrainer& bob::learn::mlp::BaseTrainer::operator= +(const bob::learn::mlp::BaseTrainer& other) { + if (this != &other) + { + m_batch_size = other.m_batch_size; + m_cost = other.m_cost; + m_train_bias = other.m_train_bias; + m_H = other.m_H; + + bob::core::array::ccopy(other.m_deriv, m_deriv); + bob::core::array::ccopy(other.m_deriv_bias, m_deriv_bias); + bob::core::array::ccopy(other.m_error, m_error); + bob::core::array::ccopy(other.m_output, m_output); + } + return *this; +} + +void bob::learn::mlp::BaseTrainer::setBatchSize (size_t batch_size) { + // m_output: values after the activation function + // m_error: error values; + + m_batch_size = batch_size; + + for (size_t k=0; k<m_output.size(); ++k) { + m_output[k].resize(batch_size, m_deriv[k].extent(1)); + } + + for (size_t k=0; k<m_error.size(); ++k) { + m_error[k].resize(batch_size, m_deriv[k].extent(1)); + } +} + +bool bob::learn::mlp::BaseTrainer::isCompatible(const bob::learn::mlp::Machine& machine) const +{ + if (m_H != machine.numOfHiddenLayers()) return false; + + if (m_deriv.back().extent(1) != (int)machine.outputSize()) return false; + + if (m_deriv[0].extent(0) != (int)machine.inputSize()) return false; + + //also, each layer should be of the same size + for (size_t k=0; k<(m_H + 1); ++k) { + if (!bob::core::array::hasSameShape(m_deriv[k], machine.getWeights()[k])) return false; + } + + //if you get to this point, you can only return true + return true; +} + +void bob::learn::mlp::BaseTrainer::forward_step(const bob::learn::mlp::Machine& machine, + const blitz::Array<double,2>& input) +{ + const std::vector<blitz::Array<double,2> >& machine_weight = machine.getWeights(); + const std::vector<blitz::Array<double,1> >& machine_bias = machine.getBiases(); + + boost::shared_ptr<bob::machine::Activation> hidden_actfun = machine.getHiddenActivation(); + boost::shared_ptr<bob::machine::Activation> output_actfun = machine.getOutputActivation(); + + for (size_t k=0; k<machine_weight.size(); ++k) { //for all layers + if (k == 0) bob::math::prod_(input, machine_weight[k], m_output[k]); + else bob::math::prod_(m_output[k-1], machine_weight[k], m_output[k]); + boost::shared_ptr<bob::machine::Activation> cur_actfun = + (k == (machine_weight.size()-1) ? output_actfun : hidden_actfun ); + for (int i=0; i<(int)m_batch_size; ++i) { //for every example + for (int j=0; j<m_output[k].extent(1); ++j) { //for all variables + m_output[k](i,j) = cur_actfun->f(m_output[k](i,j) + machine_bias[k](j)); + } + } + } +} + +void bob::learn::mlp::BaseTrainer::backward_step +(const bob::learn::mlp::Machine& machine, + const blitz::Array<double,2>& input, const blitz::Array<double,2>& target) +{ + const std::vector<blitz::Array<double,2> >& machine_weight = machine.getWeights(); + + //last layer + boost::shared_ptr<bob::machine::Activation> output_actfun = machine.getOutputActivation(); + for (int i=0; i<(int)m_batch_size; ++i) { //for every example + for (int j=0; j<m_error[m_H].extent(1); ++j) { //for all variables + m_error[m_H](i,j) = m_cost->error(m_output[m_H](i,j), target(i,j)); + } + } + + //all other layers + boost::shared_ptr<bob::machine::Activation> hidden_actfun = machine.getHiddenActivation(); + for (size_t k=m_H; k>0; --k) { + bob::math::prod_(m_error[k], machine_weight[k].transpose(1,0), m_error[k-1]); + for (int i=0; i<(int)m_batch_size; ++i) { //for every example + for (int j=0; j<m_error[k-1].extent(1); ++j) { //for all variables + m_error[k-1](i,j) *= hidden_actfun->f_prime_from_f(m_output[k-1](i,j)); + } + } + } + + //calculate the derivatives of the cost w.r.t. the weights and biases + for (size_t k=0; k<machine_weight.size(); ++k) { //for all layers + // For the weights + if (k == 0) bob::math::prod_(input.transpose(1,0), m_error[k], m_deriv[k]); + else bob::math::prod_(m_output[k-1].transpose(1,0), m_error[k], m_deriv[k]); + m_deriv[k] /= m_batch_size; + // For the biases + blitz::secondIndex bj; + m_deriv_bias[k] = blitz::mean(m_error[k].transpose(1,0), bj); + } +} + +double bob::learn::mlp::BaseTrainer::cost +(const blitz::Array<double,2>& target) const { + bob::core::array::assertSameShape(m_output[m_H], target); + double retval = 0.0; + for (int i=0; i<target.extent(0); ++i) { //for every example + for (int j=0; j<target.extent(1); ++j) { //for all variables + retval += m_cost->f(m_output[m_H](i,j), target(i,j)); + } + } + return retval / target.extent(0); +} + +double bob::learn::mlp::BaseTrainer::cost +(const bob::learn::mlp::Machine& machine, const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target) { + forward_step(machine, input); + return cost(target); +} + +void bob::learn::mlp::BaseTrainer::initialize(const bob::learn::mlp::Machine& machine) +{ + const std::vector<blitz::Array<double,2> >& machine_weight = + machine.getWeights(); + const std::vector<blitz::Array<double,1> >& machine_bias = + machine.getBiases(); + + m_H = machine.numOfHiddenLayers(); + m_deriv.resize(m_H + 1); + m_deriv_bias.resize(m_H + 1); + m_output.resize(m_H + 1); + m_error.resize(m_H + 1); + for (size_t k=0; k<(m_H + 1); ++k) { + m_deriv[k].reference(blitz::Array<double,2>(machine_weight[k].shape())); + m_deriv_bias[k].reference(blitz::Array<double,1>(machine_bias[k].shape())); + m_output[k].resize(m_batch_size, m_deriv[k].extent(1)); + m_error[k].resize(m_batch_size, m_deriv[k].extent(1)); + } + + reset(); +} + +void bob::learn::mlp::BaseTrainer::setError(const std::vector<blitz::Array<double,2> >& error) { + bob::core::array::assertSameDimensionLength(error.size(), m_error.size()); + for (size_t k=0; k<error.size(); ++k) + { + bob::core::array::assertSameShape(error[k], m_error[k]); + m_error[k] = error[k]; + } +} + +void bob::learn::mlp::BaseTrainer::setError(const blitz::Array<double,2>& error, const size_t id) { + if (id >= m_error.size()) { + boost::format m("BaseTrainer: index for setting error array %lu is not on the expected range of [0, %lu]"); + m % id % (m_error.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(error, m_error[id]); + m_error[id] = error; +} + +void bob::learn::mlp::BaseTrainer::setOutput(const std::vector<blitz::Array<double,2> >& output) { + bob::core::array::assertSameDimensionLength(output.size(), m_output.size()); + for (size_t k=0; k<output.size(); ++k) + { + bob::core::array::assertSameShape(output[k], m_output[k]); + m_output[k] = output[k]; + } +} + +void bob::learn::mlp::BaseTrainer::setOutput(const blitz::Array<double,2>& output, const size_t id) { + if (id >= m_output.size()) { + boost::format m("BaseTrainer: index for setting output array %lu is not on the expected range of [0, %lu]"); + m % id % (m_output.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(output, m_output[id]); + m_output[id] = output; +} + +void bob::learn::mlp::BaseTrainer::setDerivatives(const std::vector<blitz::Array<double,2> >& deriv) { + bob::core::array::assertSameDimensionLength(deriv.size(), m_deriv.size()); + for (size_t k=0; k<deriv.size(); ++k) + { + bob::core::array::assertSameShape(deriv[k], m_deriv[k]); + m_deriv[k] = deriv[k]; + } +} + +void bob::learn::mlp::BaseTrainer::setDerivative(const blitz::Array<double,2>& deriv, const size_t id) { + if (id >= m_deriv.size()) { + boost::format m("BaseTrainer: index for setting derivative array %lu is not on the expected range of [0, %lu]"); + m % id % (m_deriv.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(deriv, m_deriv[id]); + m_deriv[id] = deriv; +} + +void bob::learn::mlp::BaseTrainer::setBiasDerivatives(const std::vector<blitz::Array<double,1> >& deriv_bias) { + bob::core::array::assertSameDimensionLength(deriv_bias.size(), m_deriv_bias.size()); + for (size_t k=0; k<deriv_bias.size(); ++k) + { + bob::core::array::assertSameShape(deriv_bias[k], m_deriv_bias[k]); + m_deriv_bias[k] = deriv_bias[k]; + } +} + +void bob::learn::mlp::BaseTrainer::setBiasDerivative(const blitz::Array<double,1>& deriv_bias, const size_t id) { + if (id >= m_deriv_bias.size()) { + boost::format m("BaseTrainer: index for setting bias derivative array %lu is not on the expected range of [0, %lu]"); + m % id % (m_deriv_bias.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(deriv_bias, m_deriv_bias[id]); + m_deriv_bias[id] = deriv_bias; +} + +void bob::learn::mlp::BaseTrainer::reset() { + for (size_t k=0; k<(m_H + 1); ++k) { + m_deriv[k] = 0.; + m_deriv_bias[k] = 0.; + m_error[k] = 0.; + m_output[k] = 0.; + } +} diff --git a/xbob/learn/mlp/cxx/cross_entropy.cpp b/xbob/learn/mlp/cxx/cross_entropy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..aa3c5443a60d6f10801f15c054916eadebaa4030 --- /dev/null +++ b/xbob/learn/mlp/cxx/cross_entropy.cpp @@ -0,0 +1,39 @@ +/** + * @author Andre Anjos <andre.anjos@idiap.ch> + * @date Fri 31 May 23:52:08 2013 CEST + * + * @brief Implementation of the cross entropy loss function + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#include <xbob.learn.mlp/cross_entropy.h> + +namespace bob { namespace learn { namespace mlp { + + CrossEntropyLoss::CrossEntropyLoss(boost::shared_ptr<bob::machine::Activation> actfun) + : m_actfun(actfun), + m_logistic_activation(m_actfun->unique_identifier() == "bob.machine.Activation.Logistic") {} + + CrossEntropyLoss::~CrossEntropyLoss() {} + + double CrossEntropyLoss::f (double output, double target) const { + return - (target * std::log(output)) - ((1-target)*std::log(1-output)); + } + + double CrossEntropyLoss::f_prime (double output, double target) const { + return (output-target) / (output * (1-output)); + } + + double CrossEntropyLoss::error (double output, double target) const { + return m_logistic_activation? (output - target) : m_actfun->f_prime_from_f(output) * f_prime(output, target); + } + + std::string CrossEntropyLoss::str() const { + std::string retval = "J = - target*log(output) - (1-target)*log(1-output) (cross-entropy loss)"; + if (m_logistic_activation) retval += " [+ logistic activation]"; + else retval += " [+ unknown activation]"; + return retval; + } + +}}} diff --git a/xbob/learn/mlp/cxx/machine.cpp b/xbob/learn/mlp/cxx/machine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..27ce9ef184b784477444418137ce7e9417ef5bbf --- /dev/null +++ b/xbob/learn/mlp/cxx/machine.cpp @@ -0,0 +1,438 @@ +/** + * @date Tue Jan 18 17:07:26 2011 +0100 + * @author André Anjos <andre.anjos@idiap.ch> + * + * @brief Implementation of MLPs + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#include <sys/time.h> +#include <cmath> +#include <boost/format.hpp> +#include <boost/make_shared.hpp> + +#include <bob/core/check.h> +#include <bob/core/array_copy.h> +#include <bob/core/assert.h> +#include <bob/math/linear.h> + +#include <xbob.learn.mlp/machine.h> + +bob::learn::mlp::Machine::Machine (size_t input, size_t output): + m_input_sub(input), + m_input_div(input), + m_weight(1), + m_bias(1), + m_hidden_activation(boost::make_shared<bob::machine::HyperbolicTangentActivation>()), + m_output_activation(m_hidden_activation), + m_buffer(1) +{ + resize(input, output); + m_input_sub = 0; + m_input_div = 1; + setWeights(0); + setBiases(0); +} + +bob::learn::mlp::Machine::Machine (size_t input, size_t hidden, size_t output): + m_input_sub(input), + m_input_div(input), + m_weight(2), + m_bias(2), + m_hidden_activation(boost::make_shared<bob::machine::HyperbolicTangentActivation>()), + m_output_activation(m_hidden_activation), + m_buffer(2) +{ + resize(input, hidden, output); + m_input_sub = 0; + m_input_div = 1; + setWeights(0); + setBiases(0); +} + +bob::learn::mlp::Machine::Machine (size_t input, const std::vector<size_t>& hidden, size_t output): + m_input_sub(input), + m_input_div(input), + m_weight(hidden.size()+1), + m_bias(hidden.size()+1), + m_hidden_activation(boost::make_shared<bob::machine::HyperbolicTangentActivation>()), + m_output_activation(m_hidden_activation), + m_buffer(hidden.size()+1) +{ + resize(input, hidden, output); + m_input_sub = 0; + m_input_div = 1; + setWeights(0); + setBiases(0); +} + +bob::learn::mlp::Machine::Machine (const std::vector<size_t>& shape): + m_hidden_activation(boost::make_shared<bob::machine::HyperbolicTangentActivation>()), + m_output_activation(m_hidden_activation) +{ + resize(shape); + m_input_sub = 0; + m_input_div = 1; + setWeights(0); + setBiases(0); +} + +bob::learn::mlp::Machine::Machine (const bob::learn::mlp::Machine& other): + m_input_sub(bob::core::array::ccopy(other.m_input_sub)), + m_input_div(bob::core::array::ccopy(other.m_input_div)), + m_weight(other.m_weight.size()), + m_bias(other.m_bias.size()), + m_hidden_activation(other.m_hidden_activation), + m_output_activation(other.m_output_activation), + m_buffer(other.m_buffer.size()) +{ + for (size_t i=0; i<other.m_weight.size(); ++i) { + m_weight[i].reference(bob::core::array::ccopy(other.m_weight[i])); + m_bias[i].reference(bob::core::array::ccopy(other.m_bias[i])); + m_buffer[i].reference(bob::core::array::ccopy(other.m_buffer[i])); + } +} + +bob::learn::mlp::Machine::Machine (bob::io::HDF5File& config) { + load(config); +} + +bob::learn::mlp::Machine::~Machine() { } + +bob::learn::mlp::Machine& bob::learn::mlp::Machine::operator= (const bob::learn::mlp::Machine& other) { + if (this != &other) + { + m_input_sub.reference(bob::core::array::ccopy(other.m_input_sub)); + m_input_div.reference(bob::core::array::ccopy(other.m_input_div)); + m_weight.resize(other.m_weight.size()); + m_bias.resize(other.m_bias.size()); + m_hidden_activation = other.m_hidden_activation; + m_output_activation = other.m_output_activation; + m_buffer.resize(other.m_buffer.size()); + for (size_t i=0; i<other.m_weight.size(); ++i) { + m_weight[i].reference(bob::core::array::ccopy(other.m_weight[i])); + m_bias[i].reference(bob::core::array::ccopy(other.m_bias[i])); + m_buffer[i].reference(bob::core::array::ccopy(other.m_buffer[i])); + } + } + return *this; +} + +bool bob::learn::mlp::Machine::operator== (const bob::learn::mlp::Machine& other) const { + return (bob::core::array::isEqual(m_input_sub, other.m_input_sub) && + bob::core::array::isEqual(m_input_div, other.m_input_div) && + bob::core::array::isEqual(m_weight, other.m_weight) && + bob::core::array::isEqual(m_bias, other.m_bias) && + m_hidden_activation->str() == other.m_hidden_activation->str() && + m_output_activation->str() == other.m_output_activation->str()); +} + +bool bob::learn::mlp::Machine::operator!= (const bob::learn::mlp::Machine& other) const { + return !(this->operator==(other)); +} + +bool bob::learn::mlp::Machine::is_similar_to(const bob::learn::mlp::Machine& other, + const double r_epsilon, const double a_epsilon) const +{ + return (bob::core::array::isClose(m_input_sub, other.m_input_sub, r_epsilon, a_epsilon) && + bob::core::array::isClose(m_input_div, other.m_input_div, r_epsilon, a_epsilon) && + bob::core::array::isClose(m_weight, other.m_weight, r_epsilon, a_epsilon) && + bob::core::array::isClose(m_bias, other.m_bias, r_epsilon, a_epsilon) && + m_hidden_activation->str() == other.m_hidden_activation->str() && + m_output_activation->str() == other.m_output_activation->str()); +} + + +void bob::learn::mlp::Machine::load (bob::io::HDF5File& config) { + uint8_t nhidden = config.read<uint8_t>("nhidden"); + m_weight.resize(nhidden+1); + m_bias.resize(nhidden+1); + m_buffer.resize(nhidden+1); + + //configures the input + m_input_sub.reference(config.readArray<double,1>("input_sub")); + m_input_div.reference(config.readArray<double,1>("input_div")); + + boost::format weight("weight_%d"); + boost::format bias("bias_%d"); + ++nhidden; + for (size_t i=0; i<nhidden; ++i) { + weight % i; + m_weight[i].reference(config.readArray<double,2>(weight.str())); + bias % i; + m_bias[i].reference(config.readArray<double,1>(bias.str())); + } + + //switch between different versions - support for version 2 + if (config.hasAttribute(".", "version")) { //new version + config.cd("hidden_activation"); + m_hidden_activation = bob::machine::load_activation(config); + config.cd("../output_activation"); + m_output_activation = bob::machine::load_activation(config); + config.cd(".."); + } + else { //old version + uint32_t act = config.read<uint32_t>("activation"); + m_hidden_activation = bob::machine::make_deprecated_activation(act); + m_output_activation = m_hidden_activation; + } + + //setup buffers: first, input + m_buffer[0].reference(blitz::Array<double,1>(m_input_sub.shape())); + for (size_t i=1; i<m_weight.size(); ++i) { + //buffers have to be sized the same as the input for the next layer + m_buffer[i].reference(blitz::Array<double,1>(m_weight[i].extent(0))); + } +} + +void bob::learn::mlp::Machine::save (bob::io::HDF5File& config) const { + config.setAttribute(".", "version", 1); + config.setArray("input_sub", m_input_sub); + config.setArray("input_div", m_input_div); + config.set("nhidden", (uint8_t)(m_weight.size()-1)); + boost::format weight("weight_%d"); + boost::format bias("bias_%d"); + for (size_t i=0; i<m_weight.size(); ++i) { + weight % i; + bias % i; + config.setArray(weight.str(), m_weight[i]); + config.setArray(bias.str(), m_bias[i]); + } + config.createGroup("hidden_activation"); + config.cd("hidden_activation"); + m_hidden_activation->save(config); + config.cd(".."); + config.createGroup("output_activation"); + config.cd("output_activation"); + m_output_activation->save(config); + config.cd(".."); +} + +void bob::learn::mlp::Machine::forward_ (const blitz::Array<double,1>& input, + blitz::Array<double,1>& output) { + + //doesn't check input, just computes + m_buffer[0] = (input - m_input_sub) / m_input_div; + + //input -> hidden[0]; hidden[0] -> hidden[1], ..., hidden[N-2] -> hidden[N-1] + for (size_t j=1; j<m_weight.size(); ++j) { + bob::math::prod_(m_buffer[j-1], m_weight[j-1], m_buffer[j]); + m_buffer[j] += m_bias[j-1]; + for (int i=0; i<m_buffer[j].extent(0); ++i) { + m_buffer[j](i) = m_hidden_activation->f(m_buffer[j](i)); + } + } + + //hidden[N-1] -> output + bob::math::prod_(m_buffer.back(), m_weight.back(), output); + output += m_bias.back(); + for (int i=0; i<output.extent(0); ++i) { + output(i) = m_output_activation->f(output(i)); + } +} + +void bob::learn::mlp::Machine::forward (const blitz::Array<double,1>& input, + blitz::Array<double,1>& output) { + + //checks input + if (m_weight.front().extent(0) != input.extent(0)) {//checks input + boost::format m("mismatch on the input dimension: expected a vector with %d positions, but you input %d"); + m % m_weight.front().extent(0) % input.extent(0); + throw std::runtime_error(m.str()); + } + if (m_weight.back().extent(1) != output.extent(0)) {//checks output + boost::format m("mismatch on the output dimension: expected a vector with %d positions, but you input %d"); + m % m_weight.back().extent(1) % output.extent(0); + throw std::runtime_error(m.str()); + } + forward_(input, output); +} + +void bob::learn::mlp::Machine::forward_ (const blitz::Array<double,2>& input, + blitz::Array<double,2>& output) { + + blitz::Range all = blitz::Range::all(); + for (int i=0; i<input.extent(0); ++i) { + blitz::Array<double,1> inref(input(i,all)); + blitz::Array<double,1> outref(output(i,all)); + forward_(inref, outref); + } +} + +void bob::learn::mlp::Machine::forward (const blitz::Array<double,2>& input, + blitz::Array<double,2>& output) { + + //checks input + if (m_weight.front().extent(0) != input.extent(1)) {//checks input + boost::format m("mismatch on the input dimension: expected a vector with %d positions, but you input %d"); + m % m_weight.front().extent(0) % input.extent(1); + throw std::runtime_error(m.str()); + } + if (m_weight.back().extent(1) != output.extent(1)) {//checks output + boost::format m("mismatch on the output dimension: expected a vector with %d positions, but you input %d"); + m % m_weight.back().extent(1) % output.extent(1); + throw std::runtime_error(m.str()); + } + //checks output + bob::core::array::assertSameDimensionLength(input.extent(0), output.extent(0)); + forward_(input, output); +} + +void bob::learn::mlp::Machine::resize (size_t input, size_t output) { + m_input_sub.resize(input); + m_input_sub = 0; + m_input_div.resize(input); + m_input_div = 1; + m_weight.resize(1); + m_weight[0].reference(blitz::Array<double,2>(input, output)); + m_bias.resize(1); + m_bias[0].reference(blitz::Array<double,1>(output)); + m_buffer.resize(1); + m_buffer[0].reference(blitz::Array<double,1>(input)); + setWeights(0); + setBiases(0); +} + +void bob::learn::mlp::Machine::resize (size_t input, size_t hidden, size_t output) { + std::vector<size_t> vhidden(1, hidden); + resize(input, vhidden, output); +} + +void bob::learn::mlp::Machine::resize (size_t input, const std::vector<size_t>& hidden, + size_t output) { + + if (hidden.size() == 0) { + resize(input, output); + return; + } + + m_input_sub.resize(input); + m_input_sub = 0; + m_input_div.resize(input); + m_input_div = 1; + m_weight.resize(hidden.size()+1); + m_bias.resize(hidden.size()+1); + m_buffer.resize(hidden.size()+1); + + //initializes first layer + m_weight[0].reference(blitz::Array<double,2>(input, hidden[0])); + m_bias[0].reference(blitz::Array<double,1>(hidden[0])); + m_buffer[0].reference(blitz::Array<double,1>(input)); + + //initializes hidden layers + const size_t NH1 = hidden.size()-1; + for (size_t i=0; i<NH1; ++i) { + m_weight[i+1].reference(blitz::Array<double,2>(hidden[i], hidden[i+1])); + m_bias[i+1].reference(blitz::Array<double,1>(hidden[i+1])); + m_buffer[i+1].reference(blitz::Array<double,1>(hidden[i])); + } + + //initializes the last layer + m_weight.back().reference(blitz::Array<double,2>(hidden.back(), output)); + m_bias.back().reference(blitz::Array<double,1>(output)); + m_buffer.back().reference(blitz::Array<double,1>(hidden.back())); + + setWeights(0); + setBiases(0); +} + +void bob::learn::mlp::Machine::resize (const std::vector<size_t>& shape) { + + if (shape.size() < 2) { + boost::format m("invalid shape for MLP: %d"); + m % shape.size(); + throw std::runtime_error(m.str()); + } + + if (shape.size() == 2) { + resize(shape[0], shape[1]); + return; + } + + //falls back to the normal case + size_t input = shape.front(); + size_t output = shape.back(); + std::vector<size_t> vhidden(shape.size()-2); + for (size_t i=1; i<(shape.size()-1); ++i) vhidden[i-1] = shape[i]; + resize(input, vhidden, output); +} + +void bob::learn::mlp::Machine::setInputSubtraction(const blitz::Array<double,1>& v) { + if (m_weight.front().extent(0) != v.extent(0)) { + boost::format m("mismatch on the input subtraction dimension: expected a vector with %d positions, but you input %d"); + m % m_weight.front().extent(0) % v.extent(0); + throw std::runtime_error(m.str()); + } + m_input_sub.reference(bob::core::array::ccopy(v)); +} + +void bob::learn::mlp::Machine::setInputDivision(const blitz::Array<double,1>& v) { + if (m_weight.front().extent(0) != v.extent(0)) { + boost::format m("mismatch on the input division dimension: expected a vector with %d positions, but you input %d"); + m % m_weight.front().extent(0) % v.extent(0); + throw std::runtime_error(m.str()); + } + m_input_div.reference(bob::core::array::ccopy(v)); +} + +void bob::learn::mlp::Machine::setWeights(const std::vector<blitz::Array<double,2> >& weight) { + if (m_weight.size() != weight.size()) { + boost::format m("mismatch on the number of weight layers to set: expected %d layers, but you input %d"); + m % m_weight.size() % weight.size(); + } + for (size_t i=0; i<m_weight.size(); ++i) { + if (!bob::core::array::hasSameShape(m_weight[i], weight[i])) { + boost::format m("mismatch on the shape of weight layer %d"); + m % i; + throw std::runtime_error(m.str()); + } + } + //if you got to this point, the sizes are correct, just set + for (size_t i=0; i<m_weight.size(); ++i) m_weight[i] = weight[i]; +} + +void bob::learn::mlp::Machine::setWeights(double v) { + for (size_t i=0; i<m_weight.size(); ++i) m_weight[i] = v; +} + +void bob::learn::mlp::Machine::setBiases(const std::vector<blitz::Array<double,1> >& bias) { + if (m_bias.size() != bias.size()) { + boost::format m("mismatch on the number of bias layers to set: expected %d layers, but you input %d"); + m % m_bias.size() % bias.size(); + throw std::runtime_error(m.str()); + } + for (size_t i=0; i<m_bias.size(); ++i) { + if (!bob::core::array::hasSameShape(m_bias[i], bias[i])) { + boost::format m("mismatch on the shape of bias layer %d: expected a vector with length %d, but you input %d"); + m % i % m_bias[i].shape()[0] % bias[i].shape()[0]; + throw std::runtime_error(m.str()); + } + } + //if you got to this point, the sizes are correct, just set + for (size_t i=0; i<m_bias.size(); ++i) m_bias[i] = bias[i]; +} + +void bob::learn::mlp::Machine::setBiases(double v) { + for (size_t i=0; i<m_bias.size(); ++i) m_bias[i] = v; +} + +void bob::learn::mlp::Machine::randomize(boost::mt19937& rng, double lower_bound, double upper_bound) { + boost::uniform_real<double> draw(lower_bound, upper_bound); + + for (size_t k=0; k<m_weight.size(); ++k) { + for (int i=0; i<m_weight[k].extent(0); ++i) { + for (int j=0; j<m_weight[k].extent(1); ++j) { + m_weight[k](i,j) = draw(rng); + } + } + for (int i=0; i<m_bias[k].extent(0); ++i) m_bias[k](i) = draw(rng); + } +} + +void bob::learn::mlp::Machine::randomize(double lower_bound, double upper_bound) { + struct timeval tv; + gettimeofday(&tv, 0); + boost::mt19937 rng(tv.tv_sec + tv.tv_usec); + randomize(rng, lower_bound, upper_bound); +} diff --git a/xbob/learn/mlp/cxx/rprop.cpp b/xbob/learn/mlp/cxx/rprop.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1f080bfd084a57e135dd82e02ad337e46bc56392 --- /dev/null +++ b/xbob/learn/mlp/cxx/rprop.cpp @@ -0,0 +1,304 @@ +/** + * @date Mon Jul 11 16:19:08 2011 +0200 + * @author Andre Anjos <andre.anjos@idiap.ch> + * @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch> + * + * @brief Implementation of the RProp algorithm for MLP training. + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#include <algorithm> +#include <bob/core/check.h> +#include <bob/core/array_copy.h> +#include <bob/math/linear.h> + +#include <xbob.learn.mlp/rprop.h> + +bob::learn::mlp::RProp::RProp(size_t batch_size, + boost::shared_ptr<bob::learn::mlp::Cost> cost): + bob::learn::mlp::BaseTrainer(batch_size, cost), + m_eta_minus(0.5), + m_eta_plus(1.2), + m_delta_zero(0.1), + m_delta_min(1e-6), + m_delta_max(50.0), + m_delta(numberOfHiddenLayers() + 1), + m_delta_bias(numberOfHiddenLayers() + 1), + m_prev_deriv(numberOfHiddenLayers() + 1), + m_prev_deriv_bias(numberOfHiddenLayers() + 1) +{ + reset(); +} + + +bob::learn::mlp::RProp::RProp(size_t batch_size, + boost::shared_ptr<bob::learn::mlp::Cost> cost, + const bob::learn::mlp::Machine& machine): + bob::learn::mlp::BaseTrainer(batch_size, cost, machine), + m_eta_minus(0.5), + m_eta_plus(1.2), + m_delta_zero(0.1), + m_delta_min(1e-6), + m_delta_max(50.0), + m_delta(numberOfHiddenLayers() + 1), + m_delta_bias(numberOfHiddenLayers() + 1), + m_prev_deriv(numberOfHiddenLayers() + 1), + m_prev_deriv_bias(numberOfHiddenLayers() + 1) +{ + initialize(machine); +} + +bob::learn::mlp::RProp::RProp(size_t batch_size, + boost::shared_ptr<bob::learn::mlp::Cost> cost, + const bob::learn::mlp::Machine& machine, + bool train_biases): + bob::learn::mlp::BaseTrainer(batch_size, cost, machine, train_biases), + m_eta_minus(0.5), + m_eta_plus(1.2), + m_delta_zero(0.1), + m_delta_min(1e-6), + m_delta_max(50.0), + m_delta(numberOfHiddenLayers() + 1), + m_delta_bias(numberOfHiddenLayers() + 1), + m_prev_deriv(numberOfHiddenLayers() + 1), + m_prev_deriv_bias(numberOfHiddenLayers() + 1) +{ + initialize(machine); +} + +bob::learn::mlp::RProp::~RProp() { } + +bob::learn::mlp::RProp::RProp(const RProp& other): + bob::learn::mlp::BaseTrainer(other), + m_eta_minus(other.m_eta_minus), + m_eta_plus(other.m_eta_plus), + m_delta_zero(other.m_delta_zero), + m_delta_min(other.m_delta_min), + m_delta_max(other.m_delta_max), + m_delta(numberOfHiddenLayers() + 1), + m_delta_bias(numberOfHiddenLayers() + 1), + m_prev_deriv(numberOfHiddenLayers() + 1), + m_prev_deriv_bias(numberOfHiddenLayers() + 1) +{ + bob::core::array::ccopy(other.m_delta, m_delta); + bob::core::array::ccopy(other.m_delta_bias, m_delta_bias); + bob::core::array::ccopy(other.m_prev_deriv, m_prev_deriv); + bob::core::array::ccopy(other.m_prev_deriv_bias, m_prev_deriv_bias); +} + +bob::learn::mlp::RProp& bob::learn::mlp::RProp::operator= +(const bob::learn::mlp::RProp& other) { + if (this != &other) + { + bob::learn::mlp::BaseTrainer::operator=(other); + + m_eta_minus = other.m_eta_minus; + m_eta_plus = other.m_eta_plus; + m_delta_zero = other.m_delta_zero; + m_delta_min = other.m_delta_min; + m_delta_max = other.m_delta_max; + + bob::core::array::ccopy(other.m_delta, m_delta); + bob::core::array::ccopy(other.m_delta_bias, m_delta_bias); + bob::core::array::ccopy(other.m_prev_deriv, m_prev_deriv); + bob::core::array::ccopy(other.m_prev_deriv_bias, m_prev_deriv_bias); + } + return *this; +} + +void bob::learn::mlp::RProp::reset() { + for (size_t k=0; k<(numberOfHiddenLayers() + 1); ++k) { + m_delta[k] = m_delta_zero; + m_delta_bias[k] = m_delta_zero; + m_prev_deriv[k] = 0; + m_prev_deriv_bias[k] = 0; + } +} + +/** + * A function that returns the sign of a double number (zero if the value is + * 0). + */ +static int8_t sign (double x) { + if (x > 0) return +1; + return (x == 0)? 0 : -1; +} + +void bob::learn::mlp::RProp::rprop_weight_update(bob::learn::mlp::Machine& machine, + const blitz::Array<double,2>& input) +{ + std::vector<blitz::Array<double,2> >& machine_weight = machine.updateWeights(); + std::vector<blitz::Array<double,1> >& machine_bias = machine.updateBiases(); + const std::vector<blitz::Array<double,2> >& deriv = getDerivatives(); + + for (size_t k=0; k<machine_weight.size(); ++k) { //for all layers + // Calculates the sign change as prescribed on the RProp paper. Depending + // on the sign change, we update the "weight_update" matrix and apply the + // updates on the respective weights. + for (int i=0; i<deriv[k].extent(0); ++i) { + for (int j=0; j<deriv[k].extent(1); ++j) { + int8_t M = sign(deriv[k](i,j) * m_prev_deriv[k](i,j)); + // Implementations equations (4-6) on the RProp paper: + if (M > 0) { + m_delta[k](i,j) = std::min(m_delta[k](i,j)*m_eta_plus, m_delta_max); + machine_weight[k](i,j) -= sign(deriv[k](i,j)) * m_delta[k](i,j); + m_prev_deriv[k](i,j) = deriv[k](i,j); + } + else if (M < 0) { + m_delta[k](i,j) = std::max(m_delta[k](i,j)*m_eta_minus, m_delta_min); + m_prev_deriv[k](i,j) = 0; + } + else { //M == 0 + machine_weight[k](i,j) -= sign(deriv[k](i,j)) * m_delta[k](i,j); + m_prev_deriv[k](i,j) = deriv[k](i,j); + } + } + } + + // Here we decide if we should train the biases or not + if (!getTrainBiases()) continue; + + const std::vector<blitz::Array<double,1> >& deriv_bias = getBiasDerivatives(); + + // We do the same for the biases, with the exception that biases can be + // considered as input neurons connecting the respective layers, with a + // fixed input = +1. This means we only need to probe for the error at + // layer k. + for (int i=0; i<deriv_bias[k].extent(0); ++i) { + int8_t M = sign(deriv_bias[k](i) * m_prev_deriv_bias[k](i)); + // Implementations equations (4-6) on the RProp paper: + if (M > 0) { + m_delta_bias[k](i) = std::min(m_delta_bias[k](i)*m_eta_plus, m_delta_max); + machine_bias[k](i) -= sign(deriv_bias[k](i)) * m_delta_bias[k](i); + m_prev_deriv_bias[k](i) = deriv_bias[k](i); + } + else if (M < 0) { + m_delta_bias[k](i) = std::max(m_delta_bias[k](i)*m_eta_minus, m_delta_min); + m_prev_deriv_bias[k](i) = 0; + } + else { //M == 0 + machine_bias[k](i) -= sign(deriv_bias[k](i)) * m_delta_bias[k](i); + m_prev_deriv_bias[k](i) = deriv_bias[k](i); + } + } + } +} + +void bob::learn::mlp::RProp::initialize(const bob::learn::mlp::Machine& machine) +{ + bob::learn::mlp::BaseTrainer::initialize(machine); + + const std::vector<blitz::Array<double,2> >& machine_weight = + machine.getWeights(); + const std::vector<blitz::Array<double,1> >& machine_bias = + machine.getBiases(); + + m_delta.resize(numberOfHiddenLayers() + 1); + m_delta_bias.resize(numberOfHiddenLayers() + 1); + m_prev_deriv.resize(numberOfHiddenLayers() + 1); + m_prev_deriv_bias.resize(numberOfHiddenLayers() + 1); + for (size_t k=0; k<(numberOfHiddenLayers() + 1); ++k) { + m_delta[k].reference(blitz::Array<double,2>(machine_weight[k].shape())); + m_delta_bias[k].reference(blitz::Array<double,1>(machine_bias[k].shape())); + m_prev_deriv[k].reference(blitz::Array<double,2>(machine_weight[k].shape())); + m_prev_deriv_bias[k].reference(blitz::Array<double,1>(machine_bias[k].shape())); + } + + reset(); +} + +void bob::learn::mlp::RProp::train(bob::learn::mlp::Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target) { + if (!isCompatible(machine)) { + throw std::runtime_error("input machine is incompatible with this trainer"); + } + bob::core::array::assertSameDimensionLength(getBatchSize(), input.extent(0)); + bob::core::array::assertSameDimensionLength(getBatchSize(), target.extent(0)); + train_(machine, input, target); +} + +void bob::learn::mlp::RProp::train_(bob::learn::mlp::Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target) { + + // To be called in this sequence for a general backprop algorithm + forward_step(machine, input); + backward_step(machine, input, target); + rprop_weight_update(machine, input); +} + +void bob::learn::mlp::RProp::setPreviousDerivatives(const std::vector<blitz::Array<double,2> >& v) { + bob::core::array::assertSameDimensionLength(v.size(), m_prev_deriv.size()); + for (size_t k=0; k<v.size(); ++k) { + bob::core::array::assertSameShape(v[k], m_prev_deriv[k]); + m_prev_deriv[k] = v[k]; + } +} + +void bob::learn::mlp::RProp::setPreviousDerivative(const blitz::Array<double,2>& v, const size_t k) { + if (k >= m_prev_deriv.size()) { + boost::format m("RProp: index for setting derivative array %lu is not on the expected range of [0, %lu]"); + m % k % (m_prev_deriv.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(v, m_prev_deriv[k]); + m_prev_deriv[k] = v; +} + +void bob::learn::mlp::RProp::setPreviousBiasDerivatives(const std::vector<blitz::Array<double,1> >& v) { + bob::core::array::assertSameDimensionLength(v.size(), m_prev_deriv_bias.size()); + for (size_t k=0; k<v.size(); ++k) + { + bob::core::array::assertSameShape(v[k], m_prev_deriv_bias[k]); + m_prev_deriv_bias[k] = v[k]; + } +} + +void bob::learn::mlp::RProp::setPreviousBiasDerivative(const blitz::Array<double,1>& v, const size_t k) { + if (k >= m_prev_deriv_bias.size()) { + boost::format m("RProp: index for setting derivative bias array %lu is not on the expected range of [0, %lu]"); + m % k % (m_prev_deriv_bias.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(v, m_prev_deriv_bias[k]); + m_prev_deriv_bias[k] = v; +} + +void bob::learn::mlp::RProp::setDeltas(const std::vector<blitz::Array<double,2> >& v) { + bob::core::array::assertSameDimensionLength(v.size(), m_delta.size()); + for (size_t k=0; k<v.size(); ++k) { + bob::core::array::assertSameShape(v[k], m_delta[k]); + m_delta[k] = v[k]; + } +} + +void bob::learn::mlp::RProp::setDelta(const blitz::Array<double,2>& v, const size_t k) { + if (k >= m_delta.size()) { + boost::format m("RProp: index for setting delta array %lu is not on the expected range of [0, %lu]"); + m % k % (m_delta.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(v, m_delta[k]); + m_delta[k] = v; +} + +void bob::learn::mlp::RProp::setBiasDeltas(const std::vector<blitz::Array<double,1> >& v) { + bob::core::array::assertSameDimensionLength(v.size(), m_delta_bias.size()); + for (size_t k=0; k<v.size(); ++k) + { + bob::core::array::assertSameShape(v[k], m_delta_bias[k]); + m_delta_bias[k] = v[k]; + } +} + +void bob::learn::mlp::RProp::setBiasDelta(const blitz::Array<double,1>& v, const size_t k) { + if (k >= m_delta_bias.size()) { + boost::format m("RProp: index for setting delta bias array %lu is not on the expected range of [0, %lu]"); + m % k % (m_delta_bias.size()-1); + throw std::runtime_error(m.str()); + } + bob::core::array::assertSameShape(v, m_delta_bias[k]); + m_delta_bias[k] = v; +} diff --git a/xbob/learn/mlp/cxx/shuffler.cpp b/xbob/learn/mlp/cxx/shuffler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..63cccadd28b4de9c642f97b7be87872b5b027615 --- /dev/null +++ b/xbob/learn/mlp/cxx/shuffler.cpp @@ -0,0 +1,206 @@ +/** + * @date Wed Jul 13 16:58:26 2011 +0200 + * @author Andre Anjos <andre.anjos@idiap.ch> + * + * @brief Implementation of the DataShuffler. + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#include <stdexcept> +#include <sys/time.h> +#include <boost/format.hpp> + +#include <bob/core/assert.h> +#include <bob/core/array_copy.h> + +#include <xbob.learn.mlp/shuffler.h> + +bob::learn::mlp::DataShuffler::DataShuffler +(const std::vector<blitz::Array<double,2> >& data, + const std::vector<blitz::Array<double,1> >& target): + m_data(data.size()), + m_target(target.size()), + m_range(), + m_do_stdnorm(false), + m_mean(), + m_stddev() +{ + if (data.size() == 0) + throw std::runtime_error("data vector cannot be empty"); + if (target.size() == 0) + throw std::runtime_error("target vector cannot be empty"); + + bob::core::array::assertSameDimensionLength(data.size(), target.size()); + + // checks shapes, minimum number of examples + for (size_t k=0; k<data.size(); ++k) { + if (data[k].size() == 0) { + boost::format m("class %u has no samples"); + m % k; + throw std::runtime_error(m.str()); + } + //this may also trigger if I cannot get doubles from the Arrayset + bob::core::array::assertSameDimensionLength(data[0].extent(1), data[k].extent(1)); + bob::core::array::assertSameShape(target[0], target[k]); + } + + // set save values for the mean and stddev (even if not used at start) + m_mean.resize(data[0].extent(1)); + m_mean = 0.; + m_stddev.resize(data[0].extent(1)); + m_stddev = 1.; + + // copies the target data to my own variable + for (size_t k=0; k<target.size(); ++k) { + m_data[k].reference(bob::core::array::ccopy(data[k])); + m_target[k].reference(bob::core::array::ccopy(target[k])); + } + + // creates one range tailored for the range of each data object + for (size_t i=0; i<data.size(); ++i) { + m_range.push_back(boost::uniform_int<size_t>(0, m_data[i].extent(0)-1)); + } +} + +bob::learn::mlp::DataShuffler::DataShuffler(const bob::learn::mlp::DataShuffler& other): + m_data(other.m_data.size()), + m_target(other.m_target.size()), + m_range(other.m_range), + m_do_stdnorm(other.m_do_stdnorm), + m_mean(bob::core::array::ccopy(other.m_mean)), + m_stddev(bob::core::array::ccopy(other.m_stddev)) +{ + for (size_t k=0; k<m_target.size(); ++k) { + m_data[k].reference(bob::core::array::ccopy(other.m_data[k])); + m_target[k].reference(bob::core::array::ccopy(other.m_target[k])); + } +} + +bob::learn::mlp::DataShuffler::~DataShuffler() { } + +bob::learn::mlp::DataShuffler& bob::learn::mlp::DataShuffler::operator=(const bob::learn::mlp::DataShuffler& other) { + + m_data.resize(other.m_data.size()); + m_target.resize(other.m_target.size()); + + for (size_t k=0; k<m_target.size(); ++k) { + m_data[k].reference(bob::core::array::ccopy(other.m_data[k])); + m_target[k].reference(bob::core::array::ccopy(other.m_target[k])); + } + + m_range = other.m_range; + + m_mean.reference(bob::core::array::ccopy(other.m_mean)); + m_stddev.reference(bob::core::array::ccopy(other.m_stddev)); + m_do_stdnorm = other.m_do_stdnorm; + + return *this; +} + +/** + * Calculates mean and std.dev. in a single loop. + * see: http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + */ +void evaluateStdNormParameters(const std::vector<blitz::Array<double,2> >& data, + blitz::Array<double,1>& mean, blitz::Array<double,1>& stddev) { + + mean = 0.; + stddev = 0.; ///< temporarily used to accumulate square sum! + double samples = 0; + + blitz::Range all = blitz::Range::all(); + for (size_t k=0; k<data.size(); ++k) { + for (int i=0; i<data[k].extent(0); ++i) { + mean += data[k](i,all); + stddev += blitz::pow2(data[k](i,all)); + ++samples; + } + } + stddev -= blitz::pow2(mean) / samples; + stddev /= (samples-1); ///< note: unbiased sample variance + stddev = blitz::sqrt(stddev); + + mean /= (samples); +} + +/** + * Applies standard normalization parameters to all data arrays given + */ +void applyStdNormParameters(std::vector<blitz::Array<double,2> >& data, + const blitz::Array<double,1>& mean, const blitz::Array<double,1>& stddev) { + blitz::Range all = blitz::Range::all(); + for (size_t k=0; k<data.size(); ++k) { + for (int i=0; i<data[k].extent(0); ++i) { + data[k](i,all) = (data[k](i,all) - mean) / stddev; + } + } +} + +/** + * Inverts the application of std normalization parameters + */ +void invertApplyStdNormParameters(std::vector<blitz::Array<double,2> >& data, + const blitz::Array<double,1>& mean, const blitz::Array<double,1>& stddev) { + blitz::Range all = blitz::Range::all(); + for (size_t k=0; k<data.size(); ++k) { + for (int i=0; i<data[k].extent(0); ++i) { + data[k](i,all) = (data[k](i,all) * stddev) + mean; + } + } +} + +void bob::learn::mlp::DataShuffler::setAutoStdNorm(bool s) { + if (s && !m_do_stdnorm) { + evaluateStdNormParameters(m_data, m_mean, m_stddev); + applyStdNormParameters(m_data, m_mean, m_stddev); + } + if (!s && m_do_stdnorm) { + invertApplyStdNormParameters(m_data, m_mean, m_stddev); + m_mean = 0.; + m_stddev = 1.; + } + m_do_stdnorm = s; +} + +void bob::learn::mlp::DataShuffler::getStdNorm(blitz::Array<double,1>& mean, + blitz::Array<double,1>& stddev) const { + bob::core::array::assertSameShape(mean, m_mean); + bob::core::array::assertSameShape(stddev, m_stddev); + if (m_do_stdnorm) { + mean = m_mean; + stddev = m_stddev; + } + else { + evaluateStdNormParameters(m_data, mean, stddev); + } +} + +void bob::learn::mlp::DataShuffler::operator() (boost::mt19937& rng, + blitz::Array<double,2>& data, blitz::Array<double,2>& target) { + + bob::core::array::assertSameDimensionLength(data.extent(0), target.extent(0)); + + size_t counter = 0; + size_t max = data.extent(0); + blitz::Range all = blitz::Range::all(); + while (true) { + for (size_t i=0; i<m_data.size(); ++i) { //for all classes + size_t index = m_range[i](rng); //pick a random position within class + data(counter,all) = m_data[i](index,all); + target(counter,all) = m_target[i]; + ++counter; + if (counter >= max) break; + } + if (counter >= max) break; + } + +} + +void bob::learn::mlp::DataShuffler::operator() (blitz::Array<double,2>& data, + blitz::Array<double,2>& target) { + struct timeval tv; + gettimeofday(&tv, 0); + boost::mt19937 rng(tv.tv_sec + tv.tv_usec); + operator()(rng, data, target); +} diff --git a/xbob/learn/mlp/cxx/square_error.cpp b/xbob/learn/mlp/cxx/square_error.cpp new file mode 100644 index 0000000000000000000000000000000000000000..def386417ecf72a6cc078f4adb0aac06fce94900 --- /dev/null +++ b/xbob/learn/mlp/cxx/square_error.cpp @@ -0,0 +1,37 @@ +/** + * @author Andre Anjos <andre.anjos@idiap.ch> + * @date Fri 31 May 18:07:53 2013 + * + * @brief Implementation of the squared error cost function + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#include <cmath> + +#include <xbob.learn.mlp/square_error.h> + +namespace bob { namespace learn { namespace mlp { + + SquareError::SquareError(boost::shared_ptr<bob::machine::Activation> actfun): + m_actfun(actfun) {} + + SquareError::~SquareError() {} + + double SquareError::f (double output, double target) const { + return 0.5 * std::pow(output-target, 2); + } + + double SquareError::f_prime (double output, double target) const { + return output - target; + } + + double SquareError::error (double output, double target) const { + return m_actfun->f_prime_from_f(output) * f_prime(output, target); + } + + std::string SquareError::str() const { + return "J = (output-target)^2 / 2 (square error)"; + } + +}}} diff --git a/xbob/learn/mlp/include/xbob.learn.mlp/api.h b/xbob/learn/mlp/include/xbob.learn.mlp/api.h index 8271870e26d4a6f0fcb9d2af685b2e5289a1baba..140df1d39d143869ccc907e1d34734568f04200e 100644 --- a/xbob/learn/mlp/include/xbob.learn.mlp/api.h +++ b/xbob/learn/mlp/include/xbob.learn.mlp/api.h @@ -1,8 +1,6 @@ /** * @author Andre Anjos <andre.anjos@idiap.ch> * @date Thu 24 Apr 17:32:07 2014 CEST - * - * @brief C/C++ API for bob::machine */ #ifndef XBOB_LEARN_MLP_H @@ -10,11 +8,12 @@ #include <Python.h> #include <xbob.learn.mlp/config.h> -#include <bob/machine/MLP.h> -#include <bob/trainer/Cost.h> -#include <bob/trainer/SquareError.h> -#include <bob/trainer/CrossEntropyLoss.h> -#include <bob/trainer/DataShuffler.h> + +#include "machine.h" +#include "cost.h" +#include "square_error.h" +#include "cross_entropy.h" +#include "shuffler.h" #define XBOB_LEARN_MLP_MODULE_PREFIX xbob.learn.mlp #define XBOB_LEARN_MLP_MODULE_NAME _library @@ -54,7 +53,7 @@ enum _PyBobLearnMLP_ENUM{ typedef struct { PyObject_HEAD - bob::machine::MLP* cxx; + bob::learn::mlp::Machine* cxx; } PyBobLearnMLPMachineObject; #define PyBobLearnMLPMachine_Type_TYPE PyTypeObject @@ -67,7 +66,7 @@ typedef struct { typedef struct { PyObject_HEAD - bob::trainer::Cost* cxx; + bob::learn::mlp::Cost* cxx; } PyBobLearnCostObject; #define PyBobLearnCost_Type_TYPE PyTypeObject @@ -77,21 +76,21 @@ typedef struct { typedef struct { PyBobLearnCostObject parent; - bob::trainer::SquareError* cxx; + bob::learn::mlp::SquareError* cxx; } PyBobLearnSquareErrorObject; #define PyBobLearnSquareError_Type_TYPE PyTypeObject typedef struct { PyBobLearnCostObject parent; - bob::trainer::CrossEntropyLoss* cxx; + bob::learn::mlp::CrossEntropyLoss* cxx; } PyBobLearnCrossEntropyLossObject; #define PyBobLearnCrossEntropyLoss_Type_TYPE PyTypeObject typedef struct { PyObject_HEAD - bob::trainer::DataShuffler* cxx; + bob::learn::mlp::DataShuffler* cxx; } PyBobLearnDataShufflerObject; #define PyBobLearnDataShuffler_Type_TYPE PyTypeObject diff --git a/xbob/learn/mlp/include/xbob.learn.mlp/backprop.h b/xbob/learn/mlp/include/xbob.learn.mlp/backprop.h new file mode 100644 index 0000000000000000000000000000000000000000..25c879f9d0dd67f75fa7e42ce18dfe100390f474 --- /dev/null +++ b/xbob/learn/mlp/include/xbob.learn.mlp/backprop.h @@ -0,0 +1,246 @@ +/** + * @date Mon Jul 18 18:11:22 2011 +0200 + * @author Andre Anjos <andre.anjos@idiap.ch> + * @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch> + * + * @brief A MLP trainer based on vanilla back-propagation. You can get an + * overview of this method at "Pattern Recognition and Machine Learning" + * by C.M. Bishop (Chapter 5). + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#ifndef BOB_LEARN_MLP_BACKPROP_H +#define BOB_LEARN_MLP_BACKPROP_H + +#include <vector> +#include <boost/function.hpp> + +#include "machine.h" +#include "base_trainer.h" + +namespace bob { namespace learn { namespace mlp { + + /** + * @brief Sets an MLP to perform discrimination based on vanilla error + * back-propagation as defined in "Pattern Recognition and Machine Learning" + * by C.M. Bishop, chapter 5. + */ + class BackProp: public BaseTrainer { + + public: //api + + /** + * @brief Initializes a new BackProp trainer according to a + * given machine settings and a training batch size. + * + * @param batch_size The number of examples passed at each iteration. If + * you set this to 1, then you are implementing stochastic training. + * + * @param cost This is the cost function to use for the current training. + * + * @note Using this constructor, the internals of the trainer remain + * uninitialized. You must call <code>initialize()</code> with a proper + * Machine to initialize the trainer before using it. + * + * @note Using this constructor, you set biases training to + * <code>true</code> + * + * @note Good values for batch sizes are tens of samples. This may affect + * the convergence. + * + * You can also change default values for the learning rate and momentum. + * By default we train w/o any momenta. + * + * If you want to adjust a potential learning rate decay, you can and + * should do it outside the scope of this trainer, in your own way. + */ + BackProp(size_t batch_size, boost::shared_ptr<Cost> cost); + + /** + * @brief Initializes a new BackProp trainer according to a + * given machine settings and a training batch size. + * + * @param batch_size The number of examples passed at each iteration. If + * you set this to 1, then you are implementing stochastic training. + * + * @param cost This is the cost function to use for the current training. + * + * @param machine Clone this machine weights and prepare the trainer + * internally mirroring machine properties. + * + * @note Using this constructor, you set biases training to + * <code>true</code> + * + * @note Good values for batch sizes are tens of samples. This may affect + * the convergence. + * + * You can also change default values for the learning rate and momentum. + * By default we train w/o any momenta. + * + * If you want to adjust a potential learning rate decay, you can and + * should do it outside the scope of this trainer, in your own way. + */ + BackProp(size_t batch_size, boost::shared_ptr<Cost> cost, + const Machine& machine); + + /** + * @brief Initializes a new BackProp trainer according to a + * given machine settings and a training batch size. + * + * @param batch_size The number of examples passed at each iteration. If + * you set this to 1, then you are implementing stochastic training. + * + * @param cost This is the cost function to use for the current training. + * + * @param machine Clone this machine weights and prepare the trainer + * internally mirroring machine properties. + * + * @note Good values for batch sizes are tens of samples. BackProp is not + * necessarily a "batch" training algorithm, but performs in a smoother + * if the batch size is larger. This may also affect the convergence. + * + * @param train_biases A boolean, indicating if we need to train the + * biases or not. + * + * You can also change default values for the learning rate and momentum. + * By default we train w/o any momenta. + * + * If you want to adjust a potential learning rate decay, you can and + * should do it outside the scope of this trainer, in your own way. + */ + BackProp(size_t batch_size, boost::shared_ptr<Cost> cost, + const Machine& machine, bool train_biases); + + /** + * @brief Destructor virtualisation + */ + virtual ~BackProp(); + + /** + * @brief Copy construction. + */ + BackProp(const BackProp& other); + + /** + * @brief Copy operator + */ + BackProp& operator=(const BackProp& other); + + /** + * @brief Re-initializes the whole training apparatus to start training a + * new machine. This will effectively reset all Delta matrices to their + * intial values and set the previous derivatives to zero. + */ + void reset(); + + /** + * @brief Gets the current learning rate + */ + double getLearningRate() const { return m_learning_rate; } + + /** + * @brief Sets the current learning rate + */ + void setLearningRate(double v) { m_learning_rate = v; } + + /** + * @brief Gets the current momentum + */ + double getMomentum() const { return m_momentum; } + + /** + * @brief Sets the current momentum + */ + void setMomentum(double v) { m_momentum = v; } + + /** + * @brief Returns the derivatives of the cost wrt. the weights + */ + const std::vector<blitz::Array<double,2> >& getPreviousDerivatives() const { return m_prev_deriv; } + + /** + * @brief Returns the derivatives of the cost wrt. the biases + */ + const std::vector<blitz::Array<double,1> >& getPreviousBiasDerivatives() const { return m_prev_deriv_bias; } + + /** + * @brief Sets the previous derivatives of the cost + */ + void setPreviousDerivatives(const std::vector<blitz::Array<double,2> >& v); + + /** + * @brief Sets the previous derivatives of the cost of a given index + */ + void setPreviousDerivative(const blitz::Array<double,2>& v, const size_t index); + + /** + * @brief Sets the previous derivatives of the cost (biases) + */ + void setPreviousBiasDerivatives(const std::vector<blitz::Array<double,1> >& v); + + /** + * @brief Sets the previous derivatives of the cost (biases) of a given + * index + */ + void setPreviousBiasDerivative(const blitz::Array<double,1>& v, const size_t index); + + /** + * @brief Initialize the internal buffers for the current machine + */ + virtual void initialize(const Machine& machine); + + /** + * @brief Trains the MLP to perform discrimination. The training is + * executed outside the machine context, but uses all the current + * machine layout. The given machine is updated with new weights and + * biases on the end of the training that is performed a single time. + * Iterate as much as you want to refine the training. + * + * The machine given as input is checked for compatibility with the + * current initialized settings. If the two are not compatible, an + * exception is thrown. + * + * Note: In BackProp, training may be done in batches. The number of rows + * in the input (and target) determines the batch size. If the batch size + * currently set is incompatible with the given data an exception is + * raised. + * + * Note2: The machine is not initialized randomly at each train() call. + * It is your task to call MLP::randomize() once on the machine you want + * to train and then call train() as many times as you think are + * necessary. This design allows for a training criteria to be encoded + * outside the scope of this trainer and to this type to focus only on + * input, target applying the training when requested to. + */ + void train(Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target); + + /** + * @brief This is a version of the train() method above, which does no + * compatibility check on the input machine. + */ + void train_(Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target); + + private: + /** + * Weight update -- calculates the weight-update using derivatives as + * required by back-prop. + */ + void backprop_weight_update(Machine& machine, + const blitz::Array<double,2>& input); + + /// training parameters: + double m_learning_rate; + double m_momentum; + + std::vector<blitz::Array<double,2> > m_prev_deriv; ///< prev.weight derivs + std::vector<blitz::Array<double,1> > m_prev_deriv_bias; ///< prev. bias derivs + }; + +}}} + +#endif /* BOB_LEARN_MLP_BACKPROP_H */ diff --git a/xbob/learn/mlp/include/xbob.learn.mlp/base_trainer.h b/xbob/learn/mlp/include/xbob.learn.mlp/base_trainer.h new file mode 100644 index 0000000000000000000000000000000000000000..221921b337f6bb26b53d66858cc900d99d3cdeec --- /dev/null +++ b/xbob/learn/mlp/include/xbob.learn.mlp/base_trainer.h @@ -0,0 +1,305 @@ +/** + * @date Tue May 14 12:00:03 CEST 2013 + * @author Andre Anjos <andre.anjos@idiap.ch> + * @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch> + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#ifndef BOB_LEARN_MLP_BASE_TRAINER_H +#define BOB_LEARN_MLP_BASE_TRAINER_H + +#include <vector> +#include <boost/shared_ptr.hpp> + +#include "machine.h" +#include "cost.h" + +namespace bob { namespace learn { namespace mlp { + + /** + * @brief Base class for training MLP. This provides forward and backward + * functions over a batch of samples, as well as accessors to the internal + * states of the networks. + * + * Here is an overview of the backprop algorithm executed by this trainer: + * + * -# Take the <em>local gradient</em> of a neuron + * @f[ b^{(l)} @f] + * + * -# Multiply that value by the <em>output</em> of the previous layer; + * @f[ + * b^{(l)} \times a^{(l-1)} + * @f] + * + * -# Multiply the result of the previous step by the learning rate; + * @f[ + * \eta \times b^{(l)} \times a^{(l-1)} + * @f] + * + * -# Add the result of the previous setup to the current weight, + * possibly weighting the sum with a momentum ponderator. + * @f[ + * w_{n+1} = (1-\mu) \times (w_{n} + \eta \times b^{(l)} + * \times a^{(l-1)}) + (\mu) \times w_{n-1} + * @f] + */ + class BaseTrainer { + + public: //api + + /** + * @brief Initializes a new BaseTrainer trainer according to a given + * training batch size. + * + * @param batch_size The number of examples passed at each iteration. If + * you set this to 1, then you are implementing stochastic training. + * + * @param cost This is the cost function to use for the current training. + * + * @note Using this constructor, the internals of the trainer remain + * uninitialized. You must call <code>initialize()</code> with a proper + * Machine to initialize the trainer before using it. + * + * @note Using this constructor, you set biases training to + * <code>true</code> + * + * @note Good values for batch sizes are tens of samples. This may affect + * the convergence. + */ + BaseTrainer(size_t batch_size, + boost::shared_ptr<Cost> cost); + + /** + * @brief Initializes a new BaseTrainer trainer according to a given + * machine settings and a training batch size. + * + * @param batch_size The number of examples passed at each iteration. If + * you set this to 1, then you are implementing stochastic training. + * + * @param cost This is the cost function to use for the current training. + * + * @param machine Clone this machine weights and prepare the trainer + * internally mirroring machine properties. + * + * @note Using this constructor, you set biases training to + * <code>true</code> + * + * @note Good values for batch sizes are tens of samples. This may affect + * the convergence. + */ + BaseTrainer(size_t batch_size, + boost::shared_ptr<Cost> cost, + const Machine& machine); + + /** + * @brief Initializes a new BaseTrainer trainer according to a given + * machine settings and a training batch size. + * + * @param batch_size The number of examples passed at each iteration. If + * you set this to 1, then you are implementing stochastic training. + * + * @param cost This is the cost function to use for the current training. + * + * @param machine Clone this machine weights and prepare the trainer + * internally mirroring machine properties. + * + * @param train_biases A boolean, indicating if we need to train the + * biases or not. + * + * @note Good values for batch sizes are tens of samples. This may affect + * the convergence. + */ + BaseTrainer(size_t batch_size, + boost::shared_ptr<Cost> cost, + const Machine& machine, + bool train_biases); + + /** + * @brief Destructor virtualisation + */ + virtual ~BaseTrainer(); + + /** + * @brief Copy construction. + */ + BaseTrainer(const BaseTrainer& other); + + /** + * @brief Copy operator + */ + BaseTrainer& operator=(const BaseTrainer& other); + + /** + * @brief Gets the batch size + */ + size_t getBatchSize() const { return m_batch_size; } + + /** + * @brief Sets the batch size + */ + void setBatchSize(size_t batch_size); + + /** + * @brief Gets the cost to be minimized + */ + boost::shared_ptr<Cost> getCost() const { return m_cost; } + + /** + * @brief Sets the cost to be minimized + */ + void setCost(boost::shared_ptr<Cost> cost) { m_cost = cost; } + + /** + * @brief Gets the current settings for bias training (defaults to true) + */ + inline bool getTrainBiases() const { return m_train_bias; } + + /** + * @brief Sets the bias training option + */ + inline void setTrainBiases(bool v) { m_train_bias = v; } + + /** + * @brief Checks if a given machine is compatible with my inner settings. + */ + bool isCompatible(const Machine& machine) const; + + /** + * @brief Returns the number of hidden layers on the target machine + */ + size_t numberOfHiddenLayers() const { return m_H; } + + /** + * @brief Forward step -- this is a second implementation of that used on + * the MLP itself to allow access to some internal buffers. In our + * current setup, we keep the "m_output"'s of every individual layer + * separately as we are going to need them for the weight update. + * + * Another factor is the normalization normally applied at MLPs. We + * ignore that here as the DataShuffler should be capable of handling + * this in a more efficient way. You should make sure that the final MLP + * does have the standard normalization settings applied if it was set to + * automatically apply the standard normalization before giving me the + * data. + */ + void forward_step(const Machine& machine, + const blitz::Array<double,2>& input); + + /** + * @brief Backward step -- back-propagates the calculated error up to each + * neuron on the first layer and calculates the cost w.r.t. to each + * weight and bias on the network. This is explained on Bishop's formula + * 5.55 and 5.56, at page 244 (see also figure 5.7 for a graphical + * representation). + */ + void backward_step(const Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target); + + /** + * @brief Calculates the cost for a given target. + * + * The cost for a given target is the sum of the individually calculated + * costs for every output, averaged for all examples. + * + * This method assumes you have already called forward_step() before. If + * that is not the case, use the next variant. + * + * @return The cost averaged over all targets + */ + double cost(const blitz::Array<double,2>& target) const; + + /** + * @brief Calculates the cost for a given target. + * + * The cost for a given target is the sum of the individually calculated + * costs for every output, averaged for all examples. + * + * This method also calls forward_step(), so you can call backward_step() + * just after it, if you wish to do so. + * + * @return The cost averaged over all targets + */ + double cost(const Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target); + + /** + * @brief Initialize the internal buffers for the current machine + */ + virtual void initialize(const Machine& machine); + + /** + * @brief Returns the errors + */ + const std::vector<blitz::Array<double,2> >& getError() const { return m_error; } + /** + * @brief Returns the outputs + */ + const std::vector<blitz::Array<double,2> >& getOutput() const { return m_output; } + /** + * @brief Returns the derivatives of the cost wrt. the weights + */ + const std::vector<blitz::Array<double,2> >& getDerivatives() const { return m_deriv; } + /** + * @brief Returns the derivatives of the cost wrt. the biases + */ + const std::vector<blitz::Array<double,1> >& getBiasDerivatives() const { return m_deriv_bias; } + /** + * @brief Sets the error + */ + void setError(const std::vector<blitz::Array<double,2> >& error); + /** + * @brief Sets the error of a given index + */ + void setError(const blitz::Array<double,2>& error, const size_t index); + /** + * @brief Sets the outputs + */ + void setOutput(const std::vector<blitz::Array<double,2> >& output); + /** + * @brief Sets the output of a given index + */ + void setOutput(const blitz::Array<double,2>& output, const size_t index); + /** + * @brief Sets the derivatives of the cost + */ + void setDerivatives(const std::vector<blitz::Array<double,2> >& deriv); + /** + * @brief Sets the derivatives of the cost of a given index + */ + void setDerivative(const blitz::Array<double,2>& deriv, const size_t index); + /** + * @brief Sets the derivatives of the cost (biases) + */ + void setBiasDerivatives(const std::vector<blitz::Array<double,1> >& deriv_bias); + /** + * @brief Sets the derivatives of the cost (biases) of a given index + */ + void setBiasDerivative(const blitz::Array<double,1>& deriv_bias, const size_t index); + + private: //representation + + /** + * @brief Resets the buffer to 0 value + */ + void reset(); + + /// training parameters: + size_t m_batch_size; ///< the batch size + boost::shared_ptr<Cost> m_cost; ///< cost function to be minimized + bool m_train_bias; ///< shall we be training biases? (default: true) + size_t m_H; ///< number of hidden layers on the target machine + + std::vector<blitz::Array<double,2> > m_deriv; ///< derivatives of the cost wrt. the weights + std::vector<blitz::Array<double,1> > m_deriv_bias; ///< derivatives of the cost wrt. the biases + + /// buffers that are dependent on the batch_size + std::vector<blitz::Array<double,2> > m_error; ///< error (+deltas) + std::vector<blitz::Array<double,2> > m_output; ///< layer output + }; + +}}} + +#endif /* BOB_LEARN_MLP_BASE_TRAINER_H */ diff --git a/xbob/learn/mlp/include/xbob.learn.mlp/cost.h b/xbob/learn/mlp/include/xbob.learn.mlp/cost.h new file mode 100644 index 0000000000000000000000000000000000000000..8614bacd2f85f246c1463158c40492ab0a099540 --- /dev/null +++ b/xbob/learn/mlp/include/xbob.learn.mlp/cost.h @@ -0,0 +1,80 @@ +/** + * @author Andre Anjos <andre.anjos@idiap.ch> + * @date Fri 31 May 15:08:46 2013 + * + * @brief Implements the concept of a 'cost' function for MLP training + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#ifndef BOB_LEARN_MLP_COST_H +#define BOB_LEARN_MLP_COST_H + +#include <string> +#include <boost/shared_ptr.hpp> +#include "bob/machine/Activation.h" + +namespace bob { namespace learn { namespace mlp { + + /** + * Base class for cost function used for Linear machine or MLP training + * from this one. + */ + class Cost { + + public: + + /** + * Computes cost, given the current output of the linear machine or MLP + * and the expected output. + * + * @param output Real output from the linear machine or MLP + * + * @param target Target output you are training to achieve + * + * @return The cost + */ + virtual double f (double output, double target) const =0; + + /** + * Computes the derivative of the cost w.r.t. output. + * + * @param output Real output from the linear machine or MLP + * + * @param target Target output you are training to achieve + * + * @return The calculated error + */ + virtual double f_prime (double output, double target) const =0; + + /** + * Computes the back-propagated error for a given MLP <b>output</b> + * layer, given its activation function and outputs - i.e., the + * error back-propagated through the last layer neuron up to the + * synapse connecting the last hidden layer to the output layer. + * + * This entry point allows for optimization in the calculation of the + * back-propagated errors in cases where there is a possibility of + * mathematical simplification when using a certain combination of + * cost-function and activation. For example, using a ML-cost and a + * logistic activation function. + * + * @param output Real output from the linear machine or MLP + * + * @param target Target output you are training to achieve + * + * @return The calculated error, backpropagated to before the output + * neuron. + */ + virtual double error (double output, double target) const =0; + + /** + * Returns a stringified representation for this Activation function + */ + virtual std::string str() const =0; + + }; + +}}} + +#endif /* BOB_LEARN_MLP_COST_H */ diff --git a/xbob/learn/mlp/include/xbob.learn.mlp/cross_entropy.h b/xbob/learn/mlp/include/xbob.learn.mlp/cross_entropy.h new file mode 100644 index 0000000000000000000000000000000000000000..b36b0ab97474b1ce46563453dce5d0d4c5351e73 --- /dev/null +++ b/xbob/learn/mlp/include/xbob.learn.mlp/cross_entropy.h @@ -0,0 +1,128 @@ +/** + * @author Andre Anjos <andre.anjos@idiap.ch> + * @date Fri 31 May 15:08:46 2013 + * + * @brief Implements the Cross Entropy Loss function + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#ifndef BOB_LEARN_MLP_CROSSENTROPYLOSS_H +#define BOB_LEARN_MLP_CROSSENTROPYLOSS_H + +#include "cost.h" + +namespace bob { namespace learn { namespace mlp { + + /** + * Calculates the Cross-Entropy Loss between output and target. The cross + * entropy loss is defined as follows: + * + * \f[ + * J = - y \cdot \log{(\hat{y})} - (1-y) \log{(1-\hat{y})} + * \f] + * + * where \f$\hat{y}\f$ is the output estimated by your machine and \f$y\f$ is + * the expected output. + */ + class CrossEntropyLoss: public Cost { + + public: + + /** + * Constructor + * + * @param actfun Sets the underlying activation function used for error + * calculation. A special case is foreseen for using this loss function + * with a logistic activation. In this case, a mathematical + * simplification is possible in which error() can benefit increasing the + * numerical stability of the training process. The simplification goes + * as follows: + * + * \f[ + * b = \delta \cdot \varphi'(z) + * \f] + * + * But, for the CrossEntropyLoss: + * + * \f[ + * \delta = \frac{\hat{y} - y}{\hat{y}(1 - \hat{y}} + * \f] + * + * and \f$\varphi'(z) = \hat{y} - (1 - \hat{y})\f$, so: + * + * \f[ + * b = \hat{y} - y + * \f] + */ + CrossEntropyLoss(boost::shared_ptr<bob::machine::Activation> actfun); + + /** + * Virtualized destructor + */ + virtual ~CrossEntropyLoss(); + + /** + * Tells if this CrossEntropyLoss is set to operate together with a + * bob::machine::LogisticActivation. + */ + bool logistic_activation() const { return m_logistic_activation; } + + /** + * Computes cost, given the current output of the linear machine or MLP + * and the expected output. + * + * @param output Real output from the linear machine or MLP + * + * @param target Target output you are training to achieve + * + * @return The cost + */ + virtual double f (double output, double target) const; + + /** + * Computes the derivative of the cost w.r.t. output. + * + * @param output Real output from the linear machine or MLP + * + * @param target Target output you are training to achieve + * + * @return The calculated error + */ + virtual double f_prime (double output, double target) const; + + /** + * Computes the back-propagated errors for a given MLP <b>output</b> + * layer, given its activation function and activation values - i.e., the + * error back-propagated through the last layer neurons up to the + * synapses connecting the last hidden layer to the output layer. + * + * This entry point allows for optimization in the calculation of the + * back-propagated errors in cases where there is a possibility of + * mathematical simplification when using a certain combination of + * cost-function and activation. For example, using a ML-cost and a + * logistic activation function. + * + * @param output Real output from the linear machine or MLP + * @param target Target output you are training to achieve + * + * @return The calculated error, backpropagated to before the output + * neuron. + */ + virtual double error (double output, double target) const; + + /** + * Returns a stringified representation for this Cost function + */ + virtual std::string str() const; + + private: //representation + + boost::shared_ptr<bob::machine::Activation> m_actfun; //act. function + bool m_logistic_activation; ///< if 'true', simplify backprop_error() + + }; + +}}} + +#endif /* BOB_LEARN_MLP_CROSSENTROPYLOSS_H */ diff --git a/xbob/learn/mlp/include/xbob.learn.mlp/machine.h b/xbob/learn/mlp/include/xbob.learn.mlp/machine.h new file mode 100644 index 0000000000000000000000000000000000000000..b4a5ab2eb96746e5ae8649cfcf0020d9ef2eb00b --- /dev/null +++ b/xbob/learn/mlp/include/xbob.learn.mlp/machine.h @@ -0,0 +1,377 @@ +/** + * @date Tue Jan 18 17:07:26 2011 +0100 + * @author André Anjos <andre.anjos@idiap.ch> + * @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch> + * + * @brief The representation of a Multi-Layer Perceptron (MLP). + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#ifndef BOB_LEARN_MLP_MACHINE_H +#define BOB_LEARN_MLP_MACHINE_H + +#include <boost/random.hpp> +#include <boost/shared_ptr.hpp> +#include <blitz/array.h> + +#include <bob/io/HDF5File.h> +#include <bob/machine/Activation.h> + +namespace bob { namespace learn { namespace mlp { + + /** + * An MLP object is a representation of a Multi-Layer Perceptron. This + * implementation is feed-forward and fully-connected. The implementation + * allows setting of input normalization values and a global activation + * function. References to fully-connected feed-forward networks: Bishop's + * Pattern Recognition and Machine Learning, Chapter 5. Figure 5.1 shows what + * we mean. + * + * MLPs normally are multi-layered systems, with 1 or more hidden layers. As + * a special case, this implementation also supports connecting the input + * directly to the output by means of a single weight matrix. This is + * equivalent of a LinearMachine, with the advantage it can be trained by MLP + * trainers. + */ + class Machine { + + public: //api + + /** + * Constructor, builds a new MLP. Internal values are uninitialized. In + * this case, there are no hidden layers and the resulting machine is + * equivalent to a linear machine except, perhaps for the activation + * function which is set to be a hyperbolic tangent. + * + * @param input Size of input vector + * @param output Size of output vector + */ + Machine (size_t input, size_t output); + + /** + * Constructor, builds a new MLP. Internal values are uninitialized. In + * this case, the number of hidden layers equals 1 and its size can be + * defined by the middle parameter. The default activation function will + * be set to hyperbolic tangent. + * + * @param input Size of input vector + * @param hidden Size of the hidden layer + * @param output Size of output vector + */ + Machine (size_t input, size_t hidden, size_t output); + + /** + * Constructor, builds a new MLP. Internal values are uninitialized. With + * this constructor you can control the number of hidden layers your MLP + * will have. The default activation function will be set to hyperbolic + * tangent. + * + * @param input Size of input vector + * @param hidden The number and size of each hidden layer + * @param output Size of output vector + */ + Machine (size_t input, const std::vector<size_t>& hidden, size_t output); + + /** + * Builds a new MLP with a shape containing the number of inputs (first + * element), number of outputs (last element) and the number of neurons + * in each hidden layer (elements between the first and last element of + * the vector). The default activation function will be set to hyperbolic + * tangent. + */ + Machine (const std::vector<size_t>& shape); + + /** + * Copies another machine + */ + Machine (const Machine& other); + + /** + * Starts a new MLP from an existing Configuration object. + */ + Machine (bob::io::HDF5File& config); + + /** + * Just to virtualise the destructor + */ + virtual ~Machine(); + + /** + * Assigns from a different machine + */ + Machine& operator= (const Machine& other); + + /** + * @brief Equal to + */ + bool operator== (const Machine& other) const; + + /** + * @brief Not equal to + */ + bool operator!= (const Machine& other) const; + + /** + * @brief Similar to + */ + bool is_similar_to(const Machine& other, const double r_epsilon=1e-5, + const double a_epsilon=1e-8) const; + + + /** + * Loads data from an existing configuration object. Resets the current + * state. + */ + void load (bob::io::HDF5File& config); + + /** + * Saves an existing machine to a Configuration object. + */ + void save (bob::io::HDF5File& config) const; + + /** + * Forwards data through the network, outputs the values of each output + * neuron. + * + * The input and output are NOT checked for compatibility each time. It + * is your responsibility to do it. + */ + void forward_ (const blitz::Array<double,1>& input, + blitz::Array<double,1>& output); + + /** + * Forwards data through the network, outputs the values of each output + * neuron. + * + * The input and output are checked for compatibility each time the + * forward method is applied. + */ + void forward (const blitz::Array<double,1>& input, + blitz::Array<double,1>& output); + + /** + * Forwards data through the network, outputs the values of each output + * neuron. This variant will take a number of inputs in one single input + * matrix with inputs arranged row-wise (i.e., every row contains an + * individual input). + * + * The input and output are NOT checked for compatibility each time. It + * is your responsibility to do it. + */ + void forward_ (const blitz::Array<double,2>& input, + blitz::Array<double,2>& output); + + /** + * Forwards data through the network, outputs the values of each output + * neuron. This variant will take a number of inputs in one single input + * matrix with inputs arranged row-wise (i.e., every row contains an + * individual input). + * + * The input and output are checked for compatibility each time the + * forward method is applied. + */ + void forward (const blitz::Array<double,2>& input, + blitz::Array<double,2>& output); + + /** + * Resizes the machine. This causes this MLP to be completely + * re-initialized and should be considered invalid for calculation after + * this operation. Using this method there will be no hidden layers in + * the resized machine. + */ + void resize (size_t input, size_t output); + + /** + * Resizes the machine. This causes this MLP to be completely + * re-initialized and should be considered invalid for calculation after + * this operation. Using this method there will be precisely 1 hidden + * layer in the resized machine. + */ + void resize (size_t input, size_t hidden, size_t output); + + /** + * Resizes the machine. This causes this MLP to be completely + * re-initialized and should be considered invalid for calculation after + * this operation. Using this method there will be as many hidden layers + * as there are size_t's in the vector parameter "hidden". + */ + void resize (size_t input, const std::vector<size_t>& hidden, + size_t output); + + /** + * Resizes the machine. This causes this MLP to be completely + * re-initialized and should be considered invalid for calculation after + * this operation. Using this method there will be as many hidden layers + * as there are size_t's in the vector parameter "hidden". + */ + void resize (const std::vector<size_t>& shape); + + /** + * Returns the number of inputs expected by this machine + */ + size_t inputSize () const { return m_weight.front().extent(0); } + + /** + * Returns the number of hidden layers this MLP has + */ + size_t numOfHiddenLayers() const { return m_weight.size() - 1; } + + /** + * Returns the number of outputs generated by this machine + */ + size_t outputSize () const { return m_weight.back().extent(1); } + + /** + * Returns the input subtraction factor + */ + const blitz::Array<double, 1>& getInputSubtraction() const + { return m_input_sub; } + + /** + * Sets the current input subtraction factor. We will check that the + * number of inputs (first dimension of weights) matches the number of + * values currently set and will raise an exception if that is not the + * case. + */ + void setInputSubtraction(const blitz::Array<double,1>& v); + + /** + * Sets all input subtraction values to a specific value. + */ + void setInputSubtraction(double v) { m_input_sub = v; } + + /** + * Returns the input division factor + */ + const blitz::Array<double, 1>& getInputDivision() const + { return m_input_div; } + + /** + * Sets the current input division factor. We will check that the number + * of inputs (first dimension of weights) matches the number of values + * currently set and will raise an exception if that is not the case. + */ + void setInputDivision(const blitz::Array<double,1>& v); + + /** + * Sets all input division values to a specific value. + */ + void setInputDivision(double v) { m_input_div = v; } + + /** + * Returns the weights of all layers. + */ + const std::vector<blitz::Array<double, 2> >& getWeights() const + { return m_weight; } + + /** + * @brief Returns the weights of all layers in order to be updated. + * This method should only be used by trainers. + */ + std::vector<blitz::Array<double, 2> >& updateWeights() + { return m_weight; } + + /** + * Sets weights for all layers. The number of inputs, outputs and total + * number of weights should be the same as set before, or this method + * will raise. If you would like to set this MLP to a different weight + * configuration, consider first using resize(). + */ + void setWeights(const std::vector<blitz::Array<double,2> >& weight); + + /** + * Sets all weights to a single specific value. + */ + void setWeights(double v); + + /** + * Returns the biases of this classifier, for every hidden layer and + * output layer we have. + */ + const std::vector<blitz::Array<double, 1> >& getBiases() const + { return m_bias; } + + /** + * @brief Returns the biases of this classifier, for every hidden layer + * and output layer we have, in order to be updated. + * This method should only be used by trainers. + */ + std::vector<blitz::Array<double, 1> >& updateBiases() + { return m_bias; } + + /** + * Sets the current biases. We will check that the number of biases + * matches the number of weights (first dimension) currently set and + * will raise an exception if that is not the case. + */ + void setBiases(const std::vector<blitz::Array<double,1> >& bias); + + /** + * Sets all output bias values to a specific value. + */ + void setBiases(double v); + + /** + * Returns the currently set activation function for the hidden layers + */ + boost::shared_ptr<bob::machine::Activation> getHiddenActivation() const + { return m_hidden_activation; } + + /** + * Sets the activation function for each of the hidden layers. + */ + void setHiddenActivation(boost::shared_ptr<bob::machine::Activation> a) { + m_hidden_activation = a; + } + + /** + * Returns the currently set output activation function + */ + boost::shared_ptr<bob::machine::Activation> getOutputActivation() const + { return m_output_activation; } + + /** + * Sets the activation function for the outputs of the last layer. + */ + void setOutputActivation(boost::shared_ptr<bob::machine::Activation> a) { + m_output_activation = a; + } + + /** + * Reset all weights and biases. You can (optionally) specify the + * lower and upper bound for the uniform distribution that will be used + * to draw values from. The default values are the ones recommended by + * most implementations. Be sure of what you are doing before training to + * change this too radically. + * + * Values are drawn using boost::uniform_real class. Values are taken + * from the range [lower_bound, upper_bound) according to the + * boost::random documentation. + */ + void randomize(boost::mt19937& rng, double lower_bound=-0.1, + double upper_bound=+0.1); + + /** + * This is equivalent to randomize() above, but we will create the boost + * random number generator ourselves using a time-based seed. Results + * after each call will be probably different as long as they are + * separated by at least 1 microsecond (from the machine clock). + */ + void randomize(double lower_bound=-0.1, double upper_bound=+0.1); + + private: //representation + + blitz::Array<double, 1> m_input_sub; ///< input subtraction + blitz::Array<double, 1> m_input_div; ///< input division + std::vector<blitz::Array<double, 2> > m_weight; ///< weights + std::vector<blitz::Array<double, 1> > m_bias; ///< biases for the output + boost::shared_ptr<bob::machine::Activation> m_hidden_activation; ///< currently set activation type + boost::shared_ptr<bob::machine::Activation> m_output_activation; ///< currently set activation type + mutable std::vector<blitz::Array<double, 1> > m_buffer; ///< buffer for the outputs of each layer + + }; + +}}} + +#endif /* BOB_LEARN_MLP_MACHINE_H */ diff --git a/xbob/learn/mlp/include/xbob.learn.mlp/rprop.h b/xbob/learn/mlp/include/xbob.learn.mlp/rprop.h new file mode 100644 index 0000000000000000000000000000000000000000..ad34f632ceb3f15689cc85afe57380d877b42bd4 --- /dev/null +++ b/xbob/learn/mlp/include/xbob.learn.mlp/rprop.h @@ -0,0 +1,316 @@ +/** + * @date Wed Jul 6 17:32:35 2011 +0200 + * @author Andre Anjos <andre.anjos@idiap.ch> + * @author Laurent El Shafey<Laurent.El-Shafey@idiap.ch> + * + * @brief A MLP trainer based on resilient back-propagation: A Direct Adaptive + * Method for Faster Backpropagation Learning: The RPROP Algorithm, by Martin + * Riedmiller and Heinrich Braun on IEEE International Conference on Neural + * Networks, pp. 586--591, 1993. + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#ifndef BOB_LEARN_MLP_RPROP_H +#define BOB_LEARN_MLP_RPROP_H + +#include <vector> +#include <boost/function.hpp> + +#include "machine.h" +#include "base_trainer.h" + +namespace bob { namespace learn { namespace mlp { + + /** + * @brief Sets an MLP to perform discrimination based on RProp: A Direct + * Adaptive Method for Faster Backpropagation Learning: The RPROP Algorithm, + * by Martin Riedmiller and Heinrich Braun on IEEE International Conference + * on Neural Networks, pp. 586--591, 1993. + */ + class RProp: public BaseTrainer { + + public: //api + + /** + * @brief Initializes a new RProp trainer according to a given + * training batch size. + * + * @param batch_size The number of examples passed at each iteration. + * This should be a big number (tens of samples) - Resilient + * Back-propagation is a <b>batch</b> algorithm, it requires large sample + * sizes + * + * @param cost This is the cost function to use for the current training. + * + * @note Good values for batch sizes are tens of samples. This may affect + * the convergence. + */ + RProp(size_t batch_size, + boost::shared_ptr<Cost> cost); + + /** + * @brief Initializes a new RProp trainer according to a given + * machine settings and a training batch size. + * + * @param batch_size The number of examples passed at each iteration. + * This should be a big number (tens of samples) - Resilient + * Back-propagation is a <b>batch</b> algorithm, it requires large sample + * sizes + * + * @param cost This is the cost function to use for the current training. + * + * @param machine Clone this machine weights and prepare the trainer + * internally mirroring machine properties. + * + * @note Good values for batch sizes are tens of samples. This may affect + * the convergence. + */ + RProp(size_t batch_size, + boost::shared_ptr<Cost> cost, + const Machine& machine); + + /** + * @brief Initializes a new RProp trainer according to a + * given machine settings and a training batch size. + * + * @param batch_size The number of examples passed at each iteration. + * This should be a big number (tens of samples) - Resilient + * Back-propagation is a <b>batch</b> algorithm, it requires large sample + * sizes + * + * @param cost This is the cost function to use for the current training. + * + * @param machine Clone this machine weights and prepare the trainer + * internally mirroring machine properties. + * + * @note Good values for batch sizes are tens of samples. BackProp is not + * necessarily a "batch" training algorithm, but performs in a smoother + * if the batch size is larger. This may also affect the convergence. + * + * @param train_biases A boolean, indicating if we need to train the + * biases or not. + * + * You can also change default values for the learning rate and momentum. + * By default we train w/o any momenta. + * + * If you want to adjust a potential learning rate decay, you can and + * should do it outside the scope of this trainer, in your own way. + */ + RProp(size_t batch_size, boost::shared_ptr<Cost> cost, + const Machine& machine, bool train_biases); + + /** + * @brief Destructor virtualisation + */ + virtual ~RProp(); + + /** + * @brief Copy construction. + */ + RProp(const RProp& other); + + /** + * @brief Copy operator + */ + RProp& operator=(const RProp& other); + + /** + * @brief Re-initializes the whole training apparatus to start training + * a new machine. This will effectively reset all Delta matrices to their + * intial values and set the previous derivatives to zero as described on + * the section II.C of the RProp paper. + */ + void reset(); + + /** + * @brief Initialize the internal buffers for the current machine + */ + virtual void initialize(const Machine& machine); + + /** + * @brief Trains the MLP to perform discrimination. The training is + * executed outside the machine context, but uses all the current machine + * layout. The given machine is updated with new weights and biases on + * the end of the training that is performed a single time. Iterate as + * much as you want to refine the training. + * + * The machine given as input is checked for compatibility with the + * current initialized settings. If the two are not compatible, an + * exception is thrown. + * + * Note: In RProp, training is done in batches. The number of rows in the + * input (and target) determines the batch size. If the batch size + * currently set is incompatible with the given data an exception is + * raised. + * + * Note2: The machine is not initialized randomly at each train() call. + * It is your task to call MLP::randomize() once on the machine you + * want to train and then call train() as many times as you think are + * necessary. This design allows for a training criteria to be encoded + * outside the scope of this trainer and to this type to focus only on + input, target applying the training when requested to. + */ + void train(Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target); + + /** + * @brief This is a version of the train() method above, which does no + * compatibility check on the input machine. + */ + void train_(Machine& machine, + const blitz::Array<double,2>& input, + const blitz::Array<double,2>& target); + + /** + * Accessors for algorithm parameters + */ + + /** + * @brief Gets the de-enforcement parameter (default is 0.5) + */ + double getEtaMinus() const { return m_eta_minus; } + + /** + * @brief Sets the de-enforcement parameter (default is 0.5) + */ + void setEtaMinus(double v) { m_eta_minus = v; } + + /** + * @brief Gets the enforcement parameter (default is 1.2) + */ + double getEtaPlus() const { return m_eta_plus; } + + /** + * @brief Sets the enforcement parameter (default is 1.2) + */ + void setEtaPlus(double v) { m_eta_plus = v; } + + /** + * @brief Gets the initial weight update (default is 0.1) + */ + double getDeltaZero() const { return m_delta_zero; } + + /** + * @brief Sets the initial weight update (default is 0.1) + */ + void setDeltaZero(double v) { m_delta_zero = v; } + + /** + * @brief Gets the minimal weight update (default is 1e-6) + */ + double getDeltaMin() const { return m_delta_min; } + + /** + * @brief Sets the minimal weight update (default is 1e-6) + */ + void setDeltaMin(double v) { m_delta_min = v; } + + /** + * @brief Gets the maximal weight update (default is 50.0) + */ + double getDeltaMax() const { return m_delta_max; } + + /** + * @brief Sets the maximal weight update (default is 50.0) + */ + void setDeltaMax(double v) { m_delta_max = v; } + + /** + * @brief Returns the deltas + */ + const std::vector<blitz::Array<double,2> >& getDeltas() const { return m_delta; } + + /** + * @brief Returns the deltas + */ + const std::vector<blitz::Array<double,1> >& getBiasDeltas() const { return m_delta_bias; } + + /** + * @brief Sets the deltas + */ + void setDeltas(const std::vector<blitz::Array<double,2> >& v); + + /** + * @brief Sets the deltas for a given index + */ + void setDelta(const blitz::Array<double,2>& v, const size_t index); + + /** + * @brief Sets the bias deltas + */ + void setBiasDeltas(const std::vector<blitz::Array<double,1> >& v); + + /** + * @brief Sets the bias deltas for a given index + */ + void setBiasDelta(const blitz::Array<double,1>& v, const size_t index); + + /** + * @brief Returns the derivatives of the cost wrt. the weights + */ + const std::vector<blitz::Array<double,2> >& getPreviousDerivatives() const { return m_prev_deriv; } + + /** + * @brief Returns the derivatives of the cost wrt. the biases + */ + const std::vector<blitz::Array<double,1> >& getPreviousBiasDerivatives() const { return m_prev_deriv_bias; } + + /** + * @brief Sets the previous derivatives of the cost + */ + void setPreviousDerivatives(const std::vector<blitz::Array<double,2> >& v); + + /** + * @brief Sets the previous derivatives of the cost of a given index + */ + void setPreviousDerivative(const blitz::Array<double,2>& v, const size_t index); + + /** + * @brief Sets the previous derivatives of the cost (biases) + */ + void setPreviousBiasDerivatives(const std::vector<blitz::Array<double,1> >& v); + + /** + * @brief Sets the previous derivatives of the cost (biases) of a given + * index + */ + void setPreviousBiasDerivative(const blitz::Array<double,1>& v, const size_t index); + + private: //representation + + /** + * Weight update -- calculates the weight-update using derivatives as + * explained in Bishop's formula 5.53, page 243. + * + * Note: For RProp, specifically, we only care about the derivative's + * sign, current and the previous. This is the place where standard + * backprop and rprop diverge. + * + * For extra insight, double-check the Technical Report entitled "Rprop - + * Description and Implementation Details" by Martin Riedmiller, 1994. + * Just browse the internet for it. Keep it under your pillow ;-) + */ + void rprop_weight_update(Machine& machine, + const blitz::Array<double,2>& input); + + double m_eta_minus; ///< de-enforcement parameter (0.5) + double m_eta_plus; ///< enforcement parameter (1.2) + double m_delta_zero;///< initial value for the weight change (0.1) + double m_delta_min; ///< minimum value for the weight change (1e-6) + double m_delta_max; ///< maximum value for the weight change (50.0) + + std::vector<blitz::Array<double,2> > m_delta; ///< R-prop weights deltas + std::vector<blitz::Array<double,1> > m_delta_bias; ///< R-prop biases deltas + + std::vector<blitz::Array<double,2> > m_prev_deriv; ///< prev.weight deriv. + std::vector<blitz::Array<double,1> > m_prev_deriv_bias; ///< pr.bias der. + }; + + /** + * @} + */ +}}} + +#endif /* BOB_LEARN_MLP_RPROP_H */ diff --git a/xbob/learn/mlp/include/xbob.learn.mlp/shuffler.h b/xbob/learn/mlp/include/xbob.learn.mlp/shuffler.h new file mode 100644 index 0000000000000000000000000000000000000000..e4a4571736e454bf7d19910c815078989ed0578e --- /dev/null +++ b/xbob/learn/mlp/include/xbob.learn.mlp/shuffler.h @@ -0,0 +1,129 @@ +/** + * @date Wed Jul 13 16:58:26 2011 +0200 + * @author Andre Anjos <andre.anjos@idiap.ch> + * + * @brief A class that implements data shuffling for multi-class supervised and + * unsupervised training. + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#ifndef BOB_LEARN_MLP_DATASHUFFLER_H +#define BOB_LEARN_MLP_DATASHUFFLER_H + +#include <vector> +#include <blitz/array.h> +#include <boost/shared_ptr.hpp> +#include <boost/random.hpp> + +namespace bob { namespace learn { namespace mlp { + + /** + * A data shuffler is capable of being populated with data from one or + * multiple classes and matching target values. Once setup, the shuffer can + * randomly select a number of vectors and accompaning targets for the + * different classes, filling up user containers. + * + * Data shufflers are particular useful for training neural networks. + */ + class DataShuffler { + + public: //api + + /** + * Initializes the shuffler with some data classes and corresponding + * targets. The data is read by considering examples are lying on + * different rows of the input data. Data is copied internally. + */ + DataShuffler(const std::vector<blitz::Array<double,2> >& data, + const std::vector<blitz::Array<double,1> >& target); + + /** + * Copy constructor + */ + DataShuffler(const DataShuffler& other); + + /** + * D'tor virtualization + */ + virtual ~DataShuffler(); + + /** + * Assignment. This will also copy seeds set on the other shuffler. + */ + DataShuffler& operator= (const DataShuffler& other); + + /** + * Calculates and returns mean and standard deviation from the input + * data. + */ + void getStdNorm(blitz::Array<double,1>& mean, + blitz::Array<double,1>& stddev) const; + + /** + * Set automatic standard normalization. + */ + void setAutoStdNorm(bool s); + + /** + * Gets current automatic standard normalization settings + */ + inline bool getAutoStdNorm() const { return m_do_stdnorm; } + + /** + * The data shape + */ + inline size_t getDataWidth() const { return m_data[0].extent(1); } + + /** + * The target shape + */ + inline size_t getTargetWidth() const { return m_target[0].extent(0); } + + /** + * Populates the output matrices by randomly selecting N arrays from the + * input arraysets and matching targets in the most possible fair way. + * The 'data' and 'target' matrices will contain N rows and the number of + * columns that are dependent on input arraysets and target arrays. + * + * We check don't 'data' and 'target' for size compatibility and is your + * responsibility to do so. + * + * Note this operation is non-const - we do alter the state of our ranges + * internally. + */ + void operator() (boost::mt19937& rng, blitz::Array<double,2>& data, + blitz::Array<double,2>& target); + + /** + * Populates the output matrices by randomly selecting N arrays from the + * input arraysets and matching targets in the most possible fair way. + * The 'data' and 'target' matrices will contain N rows and the number of + * columns that are dependent on input arraysets and target arrays. + * + * We check don't 'data' and 'target' for size compatibility and is your + * responsibility to do so. + * + * This version is a shortcut to the previous declaration of operator() + * that actually instantiates its own random number generator and seed it + * a time-based variable. We guarantee two calls will lead to different + * results if they are at least 1 microsecond appart (procedure uses the + * machine clock). + */ + void operator() (blitz::Array<double,2>& data, + blitz::Array<double,2>& target); + + private: //representation + + std::vector<blitz::Array<double,2> > m_data; + std::vector<blitz::Array<double,1> > m_target; + std::vector<boost::uniform_int<size_t> > m_range; + bool m_do_stdnorm; ///< should we apply standard normalization + blitz::Array<double,1> m_mean; ///< mean to be used for std. norm. + blitz::Array<double,1> m_stddev; ///< std.dev for std. norm. + + }; + +}}} + +#endif /* BOB_LEARN_MLP_DATASHUFFLER_H */ diff --git a/xbob/learn/mlp/include/xbob.learn.mlp/square_error.h b/xbob/learn/mlp/include/xbob.learn.mlp/square_error.h new file mode 100644 index 0000000000000000000000000000000000000000..eb247e2e8df56d3ffe4523685290ea6e6b5aa16c --- /dev/null +++ b/xbob/learn/mlp/include/xbob.learn.mlp/square_error.h @@ -0,0 +1,98 @@ +/** + * @author Andre Anjos <andre.anjos@idiap.ch> + * @date Fri 31 May 15:08:46 2013 + * + * @brief Implements the Square Error Cost function + * + * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland + */ + +#ifndef BOB_LEARN_MLP_SQUAREERROR_H +#define BOB_LEARN_MLP_SQUAREERROR_H + +#include "cost.h" + +namespace bob { namespace learn { namespace mlp { + + /** + * Calculates the Square-Error between output and target. The square error is + * defined as follows: + * + * \f[ + * J = \frac{(\hat{y} - y)^2}{2} + * \f] + * + * where \f$\hat{y}\f$ is the output estimated by your machine and \f$y\f$ is + * the expected output. + */ + class SquareError: public Cost { + + public: + + /** + * Builds a SquareError functor with an existing activation function. + */ + SquareError(boost::shared_ptr<bob::machine::Activation> actfun); + + /** + * Virtualized destructor + */ + virtual ~SquareError(); + + /** + * Computes cost, given the current output of the linear machine or MLP + * and the expected output. + * + * @param output Real output from the linear machine or MLP + * + * @param target Target output you are training to achieve + * + * @return The cost + */ + virtual double f (double output, double target) const; + + /** + * Computes the derivative of the cost w.r.t. output. + * + * @param output Real output from the linear machine or MLP + * + * @param target Target output you are training to achieve + * + * @return The calculated error + */ + virtual double f_prime (double output, double target) const; + + /** + * Computes the back-propagated errors for a given MLP <b>output</b> + * layer, given its activation function and activation values - i.e., the + * error back-propagated through the last layer neurons up to the + * synapses connecting the last hidden layer to the output layer. + * + * This entry point allows for optimization in the calculation of the + * back-propagated errors in cases where there is a possibility of + * mathematical simplification when using a certain combination of + * cost-function and activation. For example, using a ML-cost and a + * logistic activation function. + * + * @param output Real output from the linear machine or MLP + * @param target Target output you are training to achieve + * + * @return The calculated error, backpropagated to before the output + * neuron. + */ + virtual double error (double output, double target) const; + + /** + * Returns a stringified representation for this Cost function + */ + virtual std::string str() const; + + private: //representation + + boost::shared_ptr<bob::machine::Activation> m_actfun; //act. function + + }; + +}}} + +#endif /* BOB_LEARN_MLP_SQUAREERROR_H */ diff --git a/xbob/learn/mlp/machine.cpp b/xbob/learn/mlp/machine.cpp index b6ba2b19219230b5ad23dd641380be8b64c10855..d2eb4f5ade29cad420bd17aa6691273e18a37c0d 100644 --- a/xbob/learn/mlp/machine.cpp +++ b/xbob/learn/mlp/machine.cpp @@ -85,7 +85,7 @@ static int PyBobLearnMLPMachine_init_sizes } try { - self->cxx = new bob::machine::MLP(cxx_shape); + self->cxx = new bob::learn::mlp::Machine(cxx_shape); } catch (std::exception& ex) { PyErr_SetString(PyExc_RuntimeError, ex.what()); @@ -115,7 +115,7 @@ static int PyBobLearnMLPMachine_init_hdf5(PyBobLearnMLPMachineObject* self, auto h5f = reinterpret_cast<PyBobIoHDF5FileObject*>(config); try { - self->cxx = new bob::machine::MLP(*(h5f->f)); + self->cxx = new bob::learn::mlp::Machine(*(h5f->f)); } catch (std::exception& ex) { PyErr_SetString(PyExc_RuntimeError, ex.what()); @@ -145,7 +145,7 @@ static int PyBobLearnMLPMachine_init_copy auto copy = reinterpret_cast<PyBobLearnMLPMachineObject*>(other); try { - self->cxx = new bob::machine::MLP(*(copy->cxx)); + self->cxx = new bob::learn::mlp::Machine(*(copy->cxx)); } catch (std::exception& ex) { PyErr_SetString(PyExc_RuntimeError, ex.what()); @@ -1041,7 +1041,7 @@ PyObject* PyBobLearnMLPMachine_NewFromSize PyBobLearnMLPMachineObject* retval = (PyBobLearnMLPMachineObject*)PyBobLearnMLPMachine_new(&PyBobLearnMLPMachine_Type, 0, 0); - retval->cxx = new bob::machine::MLP(input, output); + retval->cxx = new bob::learn::mlp::Machine(input, output); return reinterpret_cast<PyObject*>(retval); diff --git a/xbob/learn/mlp/shuffler.cpp b/xbob/learn/mlp/shuffler.cpp index c282bd86f1a5c96f3e5e2c6935eaafc19ccd4abf..4e08a233917b78a4c02b61c7eebbd091fb9ffd20 100644 --- a/xbob/learn/mlp/shuffler.cpp +++ b/xbob/learn/mlp/shuffler.cpp @@ -137,7 +137,7 @@ static int PyBobLearnDataShuffler_init // proceed to object initialization try { - self->cxx = new bob::trainer::DataShuffler(data_seq, target_seq); + self->cxx = new bob::learn::mlp::DataShuffler(data_seq, target_seq); } catch (std::exception& ex) { PyErr_SetString(PyExc_RuntimeError, ex.what()); @@ -271,7 +271,7 @@ static PyObject* PyBobLearnDataShuffler_Call if (!target) { Py_ssize_t shape[2]; shape[0] = n; - shape[1] = self->cxx->getDataWidth(); + shape[1] = self->cxx->getTargetWidth(); target = (PyBlitzArrayObject*)PyBlitzArray_SimpleNew(NPY_FLOAT64, 2, shape); if (!target) return 0; target_ = make_safe(target); diff --git a/xbob/learn/mlp/test_shuffler.py b/xbob/learn/mlp/test_shuffler.py index 3bff296756f7f01e6dbf4ecb94d01827396032f7..918d4a4a94362078c036d431ba6e3848e81eb90f 100644 --- a/xbob/learn/mlp/test_shuffler.py +++ b/xbob/learn/mlp/test_shuffler.py @@ -217,5 +217,5 @@ def test_normalization_big(): #but the std normalization values remain the same... shuffle.auto_stdnorm = False back_mean, back_stddev = shuffle.stdnorm() - assert abs( (back_mean - prev_mean ).sum() < 1e-10) - assert abs( (back_stddev - prev_stddev).sum() < 1e-10) + assert abs(back_mean - prev_mean).sum() < 1e-1 + assert abs(back_stddev - prev_stddev).sum() < 1e-10