trainer.h 10.4 KB
 1 2 3 4 5 6 7 8 9 10 11 12 13 14 /** * @date Tue May 14 12:00:03 CEST 2013 * @author Andre Anjos * @author Laurent El Shafey * * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland */ #ifndef BOB_LEARN_MLP_BASE_TRAINER_H #define BOB_LEARN_MLP_BASE_TRAINER_H #include #include  Manuel Günther committed Aug 19, 2014 15 16 #include #include  17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46  namespace bob { namespace learn { namespace mlp { /** * @brief Base class for training MLP. This provides forward and backward * functions over a batch of samples, as well as accessors to the internal * states of the networks. * * Here is an overview of the backprop algorithm executed by this trainer: * * -# Take the local gradient of a neuron * @f[ b^{(l)} @f] * * -# Multiply that value by the output of the previous layer; * @f[ * b^{(l)} \times a^{(l-1)} * @f] * * -# Multiply the result of the previous step by the learning rate; * @f[ * \eta \times b^{(l)} \times a^{(l-1)} * @f] * * -# Add the result of the previous setup to the current weight, * possibly weighting the sum with a momentum ponderator. * @f[ * w_{n+1} = (1-\mu) \times (w_{n} + \eta \times b^{(l)} * \times a^{(l-1)}) + (\mu) \times w_{n-1} * @f] */  André Anjos committed May 07, 2014 47  class Trainer {  48 49 50 51  public: //api /**  André Anjos committed May 07, 2014 52  * @brief Initializes a new Trainer trainer according to a given  53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69  * training batch size. * * @param batch_size The number of examples passed at each iteration. If * you set this to 1, then you are implementing stochastic training. * * @param cost This is the cost function to use for the current training. * * @note Using this constructor, the internals of the trainer remain * uninitialized. You must call initialize() with a proper * Machine to initialize the trainer before using it. * * @note Using this constructor, you set biases training to * true * * @note Good values for batch sizes are tens of samples. This may affect * the convergence. */  André Anjos committed May 07, 2014 70  Trainer(size_t batch_size, boost::shared_ptr cost);  71 72  /**  André Anjos committed May 07, 2014 73  * @brief Initializes a new Trainer trainer according to a given  74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89  * machine settings and a training batch size. * * @param batch_size The number of examples passed at each iteration. If * you set this to 1, then you are implementing stochastic training. * * @param cost This is the cost function to use for the current training. * * @param machine Clone this machine weights and prepare the trainer * internally mirroring machine properties. * * @note Using this constructor, you set biases training to * true * * @note Good values for batch sizes are tens of samples. This may affect * the convergence. */  André Anjos committed May 07, 2014 90  Trainer(size_t batch_size, boost::shared_ptr cost,  91 92 93  const Machine& machine); /**  André Anjos committed May 07, 2014 94  * @brief Initializes a new Trainer trainer according to a given  95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110  * machine settings and a training batch size. * * @param batch_size The number of examples passed at each iteration. If * you set this to 1, then you are implementing stochastic training. * * @param cost This is the cost function to use for the current training. * * @param machine Clone this machine weights and prepare the trainer * internally mirroring machine properties. * * @param train_biases A boolean, indicating if we need to train the * biases or not. * * @note Good values for batch sizes are tens of samples. This may affect * the convergence. */  André Anjos committed May 07, 2014 111 112  Trainer(size_t batch_size, boost::shared_ptr cost, const Machine& machine, bool train_biases);  113 114 115 116  /** * @brief Destructor virtualisation */  André Anjos committed May 07, 2014 117  virtual ~Trainer();  118 119 120 121  /** * @brief Copy construction. */  André Anjos committed May 07, 2014 122  Trainer(const Trainer& other);  123 124 125 126  /** * @brief Copy operator */  André Anjos committed May 07, 2014 127  Trainer& operator=(const Trainer& other);  128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301  /** * @brief Gets the batch size */ size_t getBatchSize() const { return m_batch_size; } /** * @brief Sets the batch size */ void setBatchSize(size_t batch_size); /** * @brief Gets the cost to be minimized */ boost::shared_ptr getCost() const { return m_cost; } /** * @brief Sets the cost to be minimized */ void setCost(boost::shared_ptr cost) { m_cost = cost; } /** * @brief Gets the current settings for bias training (defaults to true) */ inline bool getTrainBiases() const { return m_train_bias; } /** * @brief Sets the bias training option */ inline void setTrainBiases(bool v) { m_train_bias = v; } /** * @brief Checks if a given machine is compatible with my inner settings. */ bool isCompatible(const Machine& machine) const; /** * @brief Returns the number of hidden layers on the target machine */ size_t numberOfHiddenLayers() const { return m_H; } /** * @brief Forward step -- this is a second implementation of that used on * the MLP itself to allow access to some internal buffers. In our * current setup, we keep the "m_output"'s of every individual layer * separately as we are going to need them for the weight update. * * Another factor is the normalization normally applied at MLPs. We * ignore that here as the DataShuffler should be capable of handling * this in a more efficient way. You should make sure that the final MLP * does have the standard normalization settings applied if it was set to * automatically apply the standard normalization before giving me the * data. */ void forward_step(const Machine& machine, const blitz::Array& input); /** * @brief Backward step -- back-propagates the calculated error up to each * neuron on the first layer and calculates the cost w.r.t. to each * weight and bias on the network. This is explained on Bishop's formula * 5.55 and 5.56, at page 244 (see also figure 5.7 for a graphical * representation). */ void backward_step(const Machine& machine, const blitz::Array& input, const blitz::Array& target); /** * @brief Calculates the cost for a given target. * * The cost for a given target is the sum of the individually calculated * costs for every output, averaged for all examples. * * This method assumes you have already called forward_step() before. If * that is not the case, use the next variant. * * @return The cost averaged over all targets */ double cost(const blitz::Array& target) const; /** * @brief Calculates the cost for a given target. * * The cost for a given target is the sum of the individually calculated * costs for every output, averaged for all examples. * * This method also calls forward_step(), so you can call backward_step() * just after it, if you wish to do so. * * @return The cost averaged over all targets */ double cost(const Machine& machine, const blitz::Array& input, const blitz::Array& target); /** * @brief Initialize the internal buffers for the current machine */ virtual void initialize(const Machine& machine); /** * @brief Returns the errors */ const std::vector >& getError() const { return m_error; } /** * @brief Returns the outputs */ const std::vector >& getOutput() const { return m_output; } /** * @brief Returns the derivatives of the cost wrt. the weights */ const std::vector >& getDerivatives() const { return m_deriv; } /** * @brief Returns the derivatives of the cost wrt. the biases */ const std::vector >& getBiasDerivatives() const { return m_deriv_bias; } /** * @brief Sets the error */ void setError(const std::vector >& error); /** * @brief Sets the error of a given index */ void setError(const blitz::Array& error, const size_t index); /** * @brief Sets the outputs */ void setOutput(const std::vector >& output); /** * @brief Sets the output of a given index */ void setOutput(const blitz::Array& output, const size_t index); /** * @brief Sets the derivatives of the cost */ void setDerivatives(const std::vector >& deriv); /** * @brief Sets the derivatives of the cost of a given index */ void setDerivative(const blitz::Array& deriv, const size_t index); /** * @brief Sets the derivatives of the cost (biases) */ void setBiasDerivatives(const std::vector >& deriv_bias); /** * @brief Sets the derivatives of the cost (biases) of a given index */ void setBiasDerivative(const blitz::Array& deriv_bias, const size_t index); private: //representation /** * @brief Resets the buffer to 0 value */ void reset(); /// training parameters: size_t m_batch_size; ///< the batch size boost::shared_ptr m_cost; ///< cost function to be minimized bool m_train_bias; ///< shall we be training biases? (default: true) size_t m_H; ///< number of hidden layers on the target machine std::vector > m_deriv; ///< derivatives of the cost wrt. the weights std::vector > m_deriv_bias; ///< derivatives of the cost wrt. the biases /// buffers that are dependent on the batch_size std::vector > m_error; ///< error (+deltas) std::vector > m_output; ///< layer output }; }}} #endif /* BOB_LEARN_MLP_BASE_TRAINER_H */