/**
 * @date Tue May 14 12:00:03 CEST 2013
 * @author Andre Anjos <andre.anjos@idiap.ch>
 * @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
 *
 * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland
 */

#ifndef BOB_LEARN_MLP_BASE_TRAINER_H
#define BOB_LEARN_MLP_BASE_TRAINER_H

#include <vector>
#include <boost/shared_ptr.hpp>

#include <bob.learn.mlp/machine.h>
#include <bob.learn.mlp/cost.h>

namespace bob { namespace learn { namespace mlp {

  /**
   * @brief Base class for training MLP. This provides forward and backward
   * functions over a batch of samples, as well as accessors to the internal
   * states of the networks.
   *
   * Here is an overview of the backprop algorithm executed by this trainer:
   *
   * -# Take the <em>local gradient</em> of a neuron
   *    @f[ b^{(l)} @f]
   *
   * -# Multiply that value by the <em>output</em> of the previous layer;
   *    @f[
   *    b^{(l)} \times a^{(l-1)}
   *    @f]
   *
   * -# Multiply the result of the previous step by the learning rate;
   *    @f[
   *    \eta \times b^{(l)} \times a^{(l-1)}
   *    @f]
   *
   * -# Add the result of the previous step to the current weight,
   *    possibly weighting the sum with a momentum term.
   *    @f[
   *    w_{n+1} = (1-\mu) \times (w_{n} + \eta \times b^{(l)}
   *    \times a^{(l-1)}) + (\mu) \times w_{n-1}
   *    @f]
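   *
   * For illustration only, here is a minimal sketch of this update rule in
   * plain C++; the names <code>weight</code>, <code>previous_weight</code>,
   * <code>delta</code>, <code>prev_output</code>, <code>eta</code> and
   * <code>mu</code> are hypothetical and not part of this API:
   *
   * @code
   * // illustrative sketch only -- variable names are hypothetical
   * double gradient = delta * prev_output;      // b^(l) * a^(l-1)
   * double update = eta * gradient;             // eta * b^(l) * a^(l-1)
   * // momentum-weighted sum with the previous weight value
   * double new_weight = (1 - mu) * (weight + update) + mu * previous_weight;
   * @endcode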
   */
  class Trainer {

    public: //api

      /**
       * @brief Initializes a new Trainer according to a given
       * training batch size.
       *
       * @param batch_size The number of examples passed at each iteration. If
       * you set this to 1, then you are implementing stochastic training.
       *
       * @param cost This is the cost function to use for the current training.
       *
       * @note Using this constructor, the internals of the trainer remain
       * uninitialized. You must call <code>initialize()</code> with a proper
       * Machine to initialize the trainer before using it.
       *
       * @note Using this constructor, you set biases training to
       * <code>true</code>
       *
       * @note Good values for batch sizes are tens of samples. This may affect
       * the convergence.
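       *
       * A possible usage sketch, assuming a <code>cost</code> object already
       * exists and <code>machine</code> is the Machine to be trained:
       *
       * @code
       * // mini-batches of 10 samples; bind to a machine before training
       * bob::learn::mlp::Trainer trainer(10, cost);
       * trainer.initialize(machine);
       * @endcode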
       */
      Trainer(size_t batch_size, boost::shared_ptr<Cost> cost);

      /**
       * @brief Initializes a new Trainer according to a given
       * machine settings and a training batch size.
       *
       * @param batch_size The number of examples passed at each iteration. If
       * you set this to 1, then you are implementing stochastic training.
       *
       * @param cost This is the cost function to use for the current training.
       *
       * @param machine Clone this machine's weights and prepare the trainer
       * internals, mirroring the machine's properties.
       *
       * @note Using this constructor, you set biases training to
       * <code>true</code>
       *
       * @note Good values for batch sizes are tens of samples. This may affect
       * the convergence.
       */
      Trainer(size_t batch_size, boost::shared_ptr<Cost> cost,
          const Machine& machine);

      /**
       * @brief Initializes a new Trainer according to a given
       * machine settings and a training batch size.
       *
       * @param batch_size The number of examples passed at each iteration. If
       * you set this to 1, then you are implementing stochastic training.
       *
       * @param cost This is the cost function to use for the current training.
       *
       * @param machine Clone this machine's weights and prepare the trainer
       * internals, mirroring the machine's properties.
       *
       * @param train_biases A boolean, indicating if we need to train the
       * biases or not.
       *
       * @note Good values for batch sizes are tens of samples. This may affect
       * the convergence.
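       *
       * A possible usage sketch, assuming <code>cost</code> and
       * <code>machine</code> already exist:
       *
       * @code
       * // mini-batches of 10 samples, mirror the machine internals and do
       * // not train the biases
       * bob::learn::mlp::Trainer trainer(10, cost, machine, false);
       * @endcode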
       */
      Trainer(size_t batch_size, boost::shared_ptr<Cost> cost,
          const Machine& machine, bool train_biases);

      /**
       * @brief Virtual destructor
       */
      virtual ~Trainer();

      /**
       * @brief Copy construction.
       */
      Trainer(const Trainer& other);

      /**
       * @brief Copy operator
       */
      Trainer& operator=(const Trainer& other);

      /**
       * @brief Gets the batch size
       */
      size_t getBatchSize() const { return m_batch_size; }

      /**
       * @brief Sets the batch size
       */
      void setBatchSize(size_t batch_size);

      /**
       * @brief Gets the cost to be minimized
       */
      boost::shared_ptr<Cost> getCost() const { return m_cost; }

      /**
       * @brief Sets the cost to be minimized
       */
      void setCost(boost::shared_ptr<Cost> cost) { m_cost = cost; }

      /**
       * @brief Gets the current settings for bias training (defaults to true)
       */
      inline bool getTrainBiases() const { return m_train_bias; }

      /**
       * @brief Sets the bias training option
       */
      inline void setTrainBiases(bool v) { m_train_bias = v; }

      /**
       * @brief Checks if a given machine is compatible with my inner settings.
       */
      bool isCompatible(const Machine& machine) const;

      /**
       * @brief Returns the number of hidden layers on the target machine
       */
      size_t numberOfHiddenLayers() const { return m_H; }

      /**
       * @brief Forward step -- this is a second implementation of the one used
       * by the MLP itself, kept here to allow access to some internal buffers.
       * In our current setup, we keep the <code>m_output</code> of every
       * individual layer separately, as they are needed for the weight update.
       *
       * Another aspect is the input normalization normally applied by MLPs.
       * We ignore it here, since the DataShuffler can handle it more
       * efficiently. If the data was normalized before being handed to this
       * trainer, make sure the final MLP carries the corresponding standard
       * normalization settings.
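       *
       * A possible usage sketch, assuming <code>trainer</code> was initialized
       * with a compatible <code>machine</code> and <code>input</code> holds the
       * current batch:
       *
       * @code
       * trainer.forward_step(machine, input);
       * // the per-layer outputs are now buffered and can be inspected
       * const std::vector<blitz::Array<double,2> >& outputs = trainer.getOutput();
       * @endcode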
       */
      void forward_step(const Machine& machine,
        const blitz::Array<double,2>& input);

      /**
       * @brief Backward step -- back-propagates the calculated error up to each
       * neuron on the first layer and computes the derivative of the cost
       * w.r.t. each weight and bias of the network. This is explained by
       * Bishop's formulae 5.55 and 5.56, on page 244 (see also figure 5.7 for
       * a graphical representation).
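       *
       * In the usual notation (with @f$\delta_j@f$ the error at neuron
       * @f$j@f$, @f$h@f$ the activation function, @f$a_j@f$ its activation
       * input and @f$z_i@f$ the output of neuron @f$i@f$ in the previous
       * layer), the quantities propagated by this step are:
       * @f[
       * \delta_j = h'(a_j) \sum_k w_{kj} \delta_k
       * \quad\textrm{and}\quad
       * \frac{\partial E}{\partial w_{ji}} = \delta_j z_i
       * @f]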
       */
      void backward_step(const Machine& machine,
        const blitz::Array<double,2>& input,
        const blitz::Array<double,2>& target);

      /**
       * @brief Calculates the cost for a given target.
       *
       * The cost for a given target is the sum of the individually calculated
       * costs for every output, averaged over all examples.
       *
       * This method assumes you have already called forward_step() before. If
       * that is not the case, use the next variant.
       *
       * @return The cost averaged over all targets
       */
      double cost(const blitz::Array<double,2>& target) const;

      /**
       * @brief Calculates the cost for a given target.
       *
       * The cost for a given target is the sum of the individually calculated
       * costs for every output, averaged over all examples.
       *
       * This method also calls forward_step(), so you can call backward_step()
       * just after it, if you wish to do so.
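       *
       * A possible usage sketch, assuming <code>machine</code>,
       * <code>input</code> and <code>target</code> are already prepared:
       *
       * @code
       * double J = trainer.cost(machine, input, target); // also runs forward_step()
       * trainer.backward_step(machine, input, target);   // then back-propagate
       * @endcode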
       *
       * @return The cost averaged over all targets
       */
      double cost(const Machine& machine,
        const blitz::Array<double,2>& input,
        const blitz::Array<double,2>& target);

      /**
       * @brief Initialize the internal buffers for the current machine
       */
      virtual void initialize(const Machine& machine);

      /**
       * @brief Returns the errors
       */
      const std::vector<blitz::Array<double,2> >& getError() const { return m_error; }
      /**
       * @brief Returns the outputs
       */
      const std::vector<blitz::Array<double,2> >& getOutput() const { return m_output; }
      /**
       * @brief Returns the derivatives of the cost wrt. the weights
       */
      const std::vector<blitz::Array<double,2> >& getDerivatives() const { return m_deriv; }
      /**
       * @brief Returns the derivatives of the cost wrt. the biases
       */
      const std::vector<blitz::Array<double,1> >& getBiasDerivatives() const { return m_deriv_bias; }
      /**
       * @brief Sets the error
       */
      void setError(const std::vector<blitz::Array<double,2> >& error);
      /**
       * @brief Sets the error of a given index
       */
      void setError(const blitz::Array<double,2>& error, const size_t index);
      /**
       * @brief Sets the outputs
       */
      void setOutput(const std::vector<blitz::Array<double,2> >& output);
      /**
       * @brief Sets the output of a given index
       */
      void setOutput(const blitz::Array<double,2>& output, const size_t index);
      /**
       * @brief Sets the derivatives of the cost
       */
      void setDerivatives(const std::vector<blitz::Array<double,2> >& deriv);
      /**
       * @brief Sets the derivatives of the cost of a given index
       */
      void setDerivative(const blitz::Array<double,2>& deriv, const size_t index);
      /**
       * @brief Sets the derivatives of the cost (biases)
       */
      void setBiasDerivatives(const std::vector<blitz::Array<double,1> >& deriv_bias);
      /**
       * @brief Sets the derivatives of the cost (biases) of a given index
       */
      void setBiasDerivative(const blitz::Array<double,1>& deriv_bias, const size_t index);

    private: //representation

      /**
       * @brief Resets the buffer to 0 value
       */
      void reset();

      /// training parameters:
      size_t m_batch_size; ///< the batch size
      boost::shared_ptr<Cost> m_cost; ///< cost function to be minimized
      bool m_train_bias; ///< shall we be training biases? (default: true)
      size_t m_H; ///< number of hidden layers on the target machine

      std::vector<blitz::Array<double,2> > m_deriv; ///< derivatives of the cost wrt. the weights
      std::vector<blitz::Array<double,1> > m_deriv_bias; ///< derivatives of the cost wrt. the biases

      /// buffers that are dependent on the batch_size
      std::vector<blitz::Array<double,2> > m_error; ///< error (+deltas)
      std::vector<blitz::Array<double,2> > m_output; ///< layer output
  };

}}}

#endif /* BOB_LEARN_MLP_BASE_TRAINER_H */