/**
 * @date Tue May 14 12:04:51 CEST 2013
 * @author Andre Anjos <andre.anjos@idiap.ch>
 * @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
 *
 * Copyright (C) 2011-2014 Idiap Research Institute, Martigny, Switzerland
 */

#include <algorithm>
#include <stdexcept>

#include <boost/format.hpp>
#include <bob.core/assert.h>
#include <bob.core/check.h>
#include <bob.math/linear.h>

#include <bob.learn.mlp/trainer.h>

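/**
 * Starts a trainer for the given batch size and cost functor only. The
 * internal buffers are left 0-sized here; they only get their final shapes
 * once initialize() is called with a concrete machine.
 */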
bob::learn::mlp::Trainer::Trainer(size_t batch_size,
    boost::shared_ptr<bob::learn::mlp::Cost> cost):
  m_batch_size(batch_size),
  m_cost(cost),
  m_train_bias(true),
  m_H(0), ///< handy!
  m_deriv(1),
  m_deriv_bias(1),
  m_error(1),
  m_output(1)
{
  m_deriv[0].reference(blitz::Array<double,2>(0,0));
  m_deriv_bias[0].reference(blitz::Array<double,1>(0));
  m_error[0].reference(blitz::Array<double,2>(0,0));
  m_output[0].reference(blitz::Array<double,2>(0,0));
  reset();
}

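/**
 * Starts a trainer whose buffers are immediately shaped to match the given
 * machine; bias terms are trained by default.
 */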
bob::learn::mlp::Trainer::Trainer(size_t batch_size,
    boost::shared_ptr<bob::learn::mlp::Cost> cost,
    const bob::learn::mlp::Machine& machine):
  m_batch_size(batch_size),
  m_cost(cost),
  m_train_bias(true),
  m_H(machine.numOfHiddenLayers()), ///< handy!
  m_deriv(m_H + 1),
  m_deriv_bias(m_H + 1),
  m_error(m_H + 1),
  m_output(m_H + 1)
{
  initialize(machine);
}

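/**
 * Same as above, but lets the caller decide whether the bias terms should be
 * trained as well.
 */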
bob::learn::mlp::Trainer::Trainer(size_t batch_size,
    boost::shared_ptr<bob::learn::mlp::Cost> cost,
    const bob::learn::mlp::Machine& machine,
    bool train_biases):
  m_batch_size(batch_size),
  m_cost(cost),
  m_train_bias(train_biases),
  m_H(machine.numOfHiddenLayers()), ///< handy!
  m_deriv(m_H + 1),
  m_deriv_bias(m_H + 1),
  m_error(m_H + 1),
  m_output(m_H + 1)
{
  initialize(machine);
}

bob::learn::mlp::Trainer::~Trainer() { }

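/**
 * Copy construction deep-copies (ccopy) the blitz buffers instead of just
 * referencing the other trainer's data.
 */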
bob::learn::mlp::Trainer::Trainer(const Trainer& other):
  m_batch_size(other.m_batch_size),
  m_cost(other.m_cost),
  m_train_bias(other.m_train_bias),
  m_H(other.m_H)
{
  bob::core::array::ccopy(other.m_deriv, m_deriv);
  bob::core::array::ccopy(other.m_deriv_bias, m_deriv_bias);
  bob::core::array::ccopy(other.m_error, m_error);
  bob::core::array::ccopy(other.m_output, m_output);
}

bob::learn::mlp::Trainer& bob::learn::mlp::Trainer::operator=
(const bob::learn::mlp::Trainer& other) {
  if (this != &other)
  {
    m_batch_size = other.m_batch_size;
    m_cost = other.m_cost;
    m_train_bias = other.m_train_bias;
    m_H = other.m_H;

    bob::core::array::ccopy(other.m_deriv, m_deriv);
    bob::core::array::ccopy(other.m_deriv_bias, m_deriv_bias);
    bob::core::array::ccopy(other.m_error, m_error);
    bob::core::array::ccopy(other.m_output, m_output);
  }
  return *this;
}

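/**
 * Changing the batch size only affects the per-example buffers (m_output and
 * m_error); the derivative buffers depend on the layer shapes, not on the
 * batch size.
 */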
void bob::learn::mlp::Trainer::setBatchSize (size_t batch_size) {
  // m_output: values after the activation function
  // m_error: error values;

  m_batch_size = batch_size;

  for (size_t k=0; k<m_output.size(); ++k) {
    m_output[k].resize(batch_size, m_deriv[k].extent(1));
  }

  for (size_t k=0; k<m_error.size(); ++k) {
    m_error[k].resize(batch_size, m_deriv[k].extent(1));
  }
}

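/**
 * A machine is compatible with this trainer if it has the same number of
 * hidden layers and if every weight matrix has the same shape as the
 * corresponding derivative buffer (which also fixes input and output sizes).
 */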
bool bob::learn::mlp::Trainer::isCompatible(const bob::learn::mlp::Machine& machine) const
{
  if (m_H != machine.numOfHiddenLayers()) return false;

  if (m_deriv.back().extent(1) != (int)machine.outputSize()) return false;

  if (m_deriv[0].extent(0) != (int)machine.inputSize()) return false;

  //also, each layer should be of the same size
  for (size_t k=0; k<(m_H + 1); ++k) {
    if (!bob::core::array::hasSameShape(m_deriv[k], machine.getWeights()[k])) return false;
  }

  //if you get to this point, you can only return true
  return true;
}

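/**
 * Forward pass: for every layer k, m_output[k] = actfun(a * W[k] + b[k]),
 * where `a` is the batch input for the first layer and m_output[k-1]
 * otherwise; the output activation is applied on the last layer and the
 * hidden activation everywhere else.
 */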
void bob::learn::mlp::Trainer::forward_step(const bob::learn::mlp::Machine& machine,
  const blitz::Array<double,2>& input)
{
  const std::vector<blitz::Array<double,2> >& machine_weight = machine.getWeights();
  const std::vector<blitz::Array<double,1> >& machine_bias = machine.getBiases();

  boost::shared_ptr<bob::learn::activation::Activation> hidden_actfun = machine.getHiddenActivation();
  boost::shared_ptr<bob::learn::activation::Activation> output_actfun = machine.getOutputActivation();

  for (size_t k=0; k<machine_weight.size(); ++k) { //for all layers
    if (k == 0) bob::math::prod(input, machine_weight[k], m_output[k]);
    else bob::math::prod(m_output[k-1], machine_weight[k], m_output[k]);
    boost::shared_ptr<bob::learn::activation::Activation> cur_actfun =
      (k == (machine_weight.size()-1) ? output_actfun : hidden_actfun );
    for (int i=0; i<(int)m_batch_size; ++i) { //for every example
      for (int j=0; j<m_output[k].extent(1); ++j) { //for all variables
        m_output[k](i,j) = cur_actfun->f(m_output[k](i,j) + machine_bias[k](j));
      }
    }
  }
}

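/**
 * Backward pass (error back-propagation):
 *  - the last-layer error is delegated to the cost functor's error() method;
 *  - hidden-layer errors are propagated through the transposed weights and
 *    scaled by the activation derivative evaluated from the stored outputs
 *    (f_prime_from_f);
 *  - the cost derivatives are then averaged over the batch:
 *    m_deriv[k] = a^T * m_error[k] / batch_size and m_deriv_bias[k] is the
 *    per-variable mean of m_error[k] over the examples.
 */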
void bob::learn::mlp::Trainer::backward_step
(const bob::learn::mlp::Machine& machine,
 const blitz::Array<double,2>& input, const blitz::Array<double,2>& target)
{
  const std::vector<blitz::Array<double,2> >& machine_weight = machine.getWeights();

  //last layer
  boost::shared_ptr<bob::learn::activation::Activation> output_actfun = machine.getOutputActivation();
  for (int i=0; i<(int)m_batch_size; ++i) { //for every example
    for (int j=0; j<m_error[m_H].extent(1); ++j) { //for all variables
      m_error[m_H](i,j) = m_cost->error(m_output[m_H](i,j), target(i,j));
    }
  }

  //all other layers
  boost::shared_ptr<bob::learn::activation::Activation> hidden_actfun = machine.getHiddenActivation();
  for (size_t k=m_H; k>0; --k) {
    bob::math::prod(m_error[k], machine_weight[k].transpose(1,0), m_error[k-1]);
    for (int i=0; i<(int)m_batch_size; ++i) { //for every example
      for (int j=0; j<m_error[k-1].extent(1); ++j) { //for all variables
        m_error[k-1](i,j) *= hidden_actfun->f_prime_from_f(m_output[k-1](i,j));
      }
    }
  }

  //calculate the derivatives of the cost w.r.t. the weights and biases
  for (size_t k=0; k<machine_weight.size(); ++k) { //for all layers
    // For the weights
    if (k == 0) bob::math::prod(input.transpose(1,0), m_error[k], m_deriv[k]);
    else bob::math::prod(m_output[k-1].transpose(1,0), m_error[k], m_deriv[k]);
    m_deriv[k] /= m_batch_size;
    // For the biases
    blitz::secondIndex bj;
    m_deriv_bias[k] = blitz::mean(m_error[k].transpose(1,0), bj);
  }
}

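/**
 * Average cost over the current batch: the cost is summed over all output
 * variables of every example and divided by the number of examples.
 */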
double bob::learn::mlp::Trainer::cost
(const blitz::Array<double,2>& target) const {
  bob::core::array::assertSameShape(m_output[m_H], target);
  double retval = 0.0;
  for (int i=0; i<target.extent(0); ++i) { //for every example
    for (int j=0; j<target.extent(1); ++j) { //for all variables
      retval += m_cost->f(m_output[m_H](i,j), target(i,j));
    }
  }
  return retval / target.extent(0);
}

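/**
 * Convenience overload: runs a forward pass on the given input before
 * evaluating the cost against the target.
 */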
double bob::learn::mlp::Trainer::cost
(const bob::learn::mlp::Machine& machine, const blitz::Array<double,2>& input,
 const blitz::Array<double,2>& target) {
  forward_step(machine, input);
  return cost(target);
}

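/**
 * (Re-)shapes every internal buffer to match the weight and bias geometry of
 * the given machine and zeroes them through reset().
 */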
void bob::learn::mlp::Trainer::initialize(const bob::learn::mlp::Machine& machine)
{
  const std::vector<blitz::Array<double,2> >& machine_weight =
    machine.getWeights();
  const std::vector<blitz::Array<double,1> >& machine_bias =
    machine.getBiases();

  m_H = machine.numOfHiddenLayers();
  m_deriv.resize(m_H + 1);
  m_deriv_bias.resize(m_H + 1);
  m_output.resize(m_H + 1);
  m_error.resize(m_H + 1);
  for (size_t k=0; k<(m_H + 1); ++k) {
    m_deriv[k].reference(blitz::Array<double,2>(machine_weight[k].shape()));
    m_deriv_bias[k].reference(blitz::Array<double,1>(machine_bias[k].shape()));
    m_output[k].resize(m_batch_size, m_deriv[k].extent(1));
    m_error[k].resize(m_batch_size, m_deriv[k].extent(1));
  }

  reset();
}

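/**
 * The setters below let callers overwrite the internal buffers directly; each
 * variant checks the shapes (and, for the indexed variants, the bounds of the
 * index) before copying the data in.
 */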
void bob::learn::mlp::Trainer::setError(const std::vector<blitz::Array<double,2> >& error) {
  bob::core::array::assertSameDimensionLength(error.size(), m_error.size());
  for (size_t k=0; k<error.size(); ++k)
  {
    bob::core::array::assertSameShape(error[k], m_error[k]);
    m_error[k] = error[k];
  }
}

void bob::learn::mlp::Trainer::setError(const blitz::Array<double,2>& error, const size_t id) {
  if (id >= m_error.size()) {
    boost::format m("Trainer: index for setting error array %lu is not on the expected range of [0, %lu]");
    m % id % (m_error.size()-1);
    throw std::runtime_error(m.str());
  }
  bob::core::array::assertSameShape(error, m_error[id]);
  m_error[id] = error;
}

void bob::learn::mlp::Trainer::setOutput(const std::vector<blitz::Array<double,2> >& output) {
  bob::core::array::assertSameDimensionLength(output.size(), m_output.size());
  for (size_t k=0; k<output.size(); ++k)
  {
    bob::core::array::assertSameShape(output[k], m_output[k]);
    m_output[k] = output[k];
  }
}

void bob::learn::mlp::Trainer::setOutput(const blitz::Array<double,2>& output, const size_t id) {
  if (id >= m_output.size()) {
    boost::format m("Trainer: index for setting output array %lu is not on the expected range of [0, %lu]");
    m % id % (m_output.size()-1);
    throw std::runtime_error(m.str());
  }
  bob::core::array::assertSameShape(output, m_output[id]);
  m_output[id] = output;
}

void bob::learn::mlp::Trainer::setDerivatives(const std::vector<blitz::Array<double,2> >& deriv) {
  bob::core::array::assertSameDimensionLength(deriv.size(), m_deriv.size());
  for (size_t k=0; k<deriv.size(); ++k)
  {
    bob::core::array::assertSameShape(deriv[k], m_deriv[k]);
    m_deriv[k] = deriv[k];
  }
}

void bob::learn::mlp::Trainer::setDerivative(const blitz::Array<double,2>& deriv, const size_t id) {
  if (id >= m_deriv.size()) {
    boost::format m("Trainer: index for setting derivative array %lu is not on the expected range of [0, %lu]");
    m % id % (m_deriv.size()-1);
    throw std::runtime_error(m.str());
  }
  bob::core::array::assertSameShape(deriv, m_deriv[id]);
  m_deriv[id] = deriv;
}

void bob::learn::mlp::Trainer::setBiasDerivatives(const std::vector<blitz::Array<double,1> >& deriv_bias) {
  bob::core::array::assertSameDimensionLength(deriv_bias.size(), m_deriv_bias.size());
  for (size_t k=0; k<deriv_bias.size(); ++k)
  {
    bob::core::array::assertSameShape(deriv_bias[k], m_deriv_bias[k]);
    m_deriv_bias[k] = deriv_bias[k];
  }
}

void bob::learn::mlp::Trainer::setBiasDerivative(const blitz::Array<double,1>& deriv_bias, const size_t id) {
  if (id >= m_deriv_bias.size()) {
    boost::format m("Trainer: index for setting bias derivative array %lu is not on the expected range of [0, %lu]");
    m % id % (m_deriv_bias.size()-1);
    throw std::runtime_error(m.str());
  }
  bob::core::array::assertSameShape(deriv_bias, m_deriv_bias[id]);
  m_deriv_bias[id] = deriv_bias;
}

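/**
 * Zeroes all internal buffers (derivatives, bias derivatives, errors and
 * outputs).
 */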
void bob::learn::mlp::Trainer::reset() {
  for (size_t k=0; k<(m_H + 1); ++k) {
    m_deriv[k] = 0.;
    m_deriv_bias[k] = 0.;
    m_error[k] = 0.;
    m_output[k] = 0.;
  }
}