# ScikitClassifier.py
# -*- coding: utf-8 -*-
# @author: Anjith George

from bob.pad.base.algorithm import Algorithm
from bob.bio.video.utils import FrameContainer
import numpy as np
import pickle
import logging
from bob.pad.base.utils import convert_frame_cont_to_array, convert_list_of_frame_cont_to_array


class ScikitClassifier(Algorithm):
    """
    Train and apply a generic scikit-learn binary classifier or anomaly
    detector (one-class classifier) on Frame Containers holding features of
    the real and attack classes. The procedure is the following:

    1. First, the input data is normalized using the scaler object, which
       should follow the API of scikit-learn preprocessors (``fit`` /
       ``transform``). If no scaler is given the features are used as they are.

    2. Second, the scikit-learn estimator is trained on the normalized input
       features (either two-class or one-class), and the classifier and the
       scaler are saved.

    3. At test time, input features are scored using the pre-trained model.

    Parameters
    ----------

    clf : object
        An sklearn binary classifier or outlier detector instance, which is
        initialized in the config file. It must expose ``predict_proba``
        (binary case) or ``score_samples`` (one-class case).

    scaler : object
        An sklearn scaler instance which is initialized in the config file.
        If ``None`` no normalization is applied.

    frame_level_scores_flag : bool
        Return scores for each frame individually if True. Otherwise, return a
        single score per video. Default: False.

    subsample_train_data_flag : bool
        Uniformly subsample the training data if True. Default: False.

    subsampling_step : int
        Training data subsampling step, only valid if
        subsample_train_data_flag = True. Default: 10 .

    subsample_videos_flag : bool
        Uniformly subsample the training videos if True. Default: False.

    video_subsampling_step : int
        Training videos subsampling step, only valid if
        subsample_videos_flag = True. Default: 3 .

    norm_on_bonafide : bool
        If set to `True` the normalization parameters are found from bonafide
        samples only. If set to `False`, both bonafide and attacks will be
        used to find normalization parameters. Default: True.

    one_class : bool
        If set to `True`, the classifier is assumed to be one class, and
        training and scoring are performed according to this assumption.
        The type of classifier, either binary or one class, must be specified
        with this argument. Default: False.

    Raises
    ------

    AssertionError
        If ``clf`` does not provide the scoring method required by the chosen
        mode (this includes the default ``clf=None``).
    """

    def __init__(self,
                 clf=None,
                 scaler=None,
                 frame_level_scores_flag=False,
                 subsample_train_data_flag=False,
                 subsampling_step=10,
                 subsample_videos_flag=False,
                 video_subsampling_step=3,
                 norm_on_bonafide=True, one_class=False):

        Algorithm.__init__(self,
                           clf=clf,
                           scaler=scaler,
                           frame_level_scores_flag=frame_level_scores_flag,
                           subsample_train_data_flag=subsample_train_data_flag,
                           subsampling_step=subsampling_step,
                           subsample_videos_flag=subsample_videos_flag,
                           video_subsampling_step=video_subsampling_step,
                           performs_projection=True,
                           requires_projector_training=True,
                           norm_on_bonafide=norm_on_bonafide,
                           one_class=one_class)

        self.clf = clf
        self.scaler = scaler
        self.frame_level_scores_flag = frame_level_scores_flag
        self.subsample_train_data_flag = subsample_train_data_flag
        self.subsampling_step = subsampling_step
        self.subsample_videos_flag = subsample_videos_flag
        self.video_subsampling_step = video_subsampling_step
        self.norm_on_bonafide = norm_on_bonafide
        self.one_class = one_class

        # project() calls score_samples() for one-class models and
        # predict_proba() for binary ones, so fail early if the estimator
        # cannot provide the method. The error is raised explicitly instead of
        # through an ``assert`` statement so that the check is not stripped
        # under ``python -O``; the exception type is kept for compatibility.
        required_method = 'score_samples' if self.one_class else 'predict_proba'
        if required_method not in dir(clf):
            raise AssertionError(
                "clf must provide a '{}' method when one_class={}".format(
                    required_method, bool(self.one_class)))

    def _normalize(self, features, train=False):
        """
        The features in the input 2D array are normalized.
        The rows are samples, the columns are features. If train==True then
        the scaler is trained first, else the already trained scaler is used
        for the normalization.

        Parameters
        ----------

        features : numpy.ndarray
            Array of features to be normalized.

        train : bool
            If True, ``self.scaler`` is fitted on ``features`` before the
            transformation is applied. Default: False.

        Returns
        -------

        features_norm : numpy.ndarray
            Normalized array of features. When ``self.scaler`` is ``None``
            this is an unmodified copy of the input.
        """

        if self.scaler is None:
            # No scaler configured: hand back a copy so that callers never
            # end up sharing memory with the input array.
            return features.copy()

        if train:
            self.scaler.fit(features)

        return self.scaler.transform(features)

    def norm_train_data(self, real, attack):
        """
        Normalization of the training arrays with ``self.scaler``. The scaler
        is fitted on the real class only if ``self.norm_on_bonafide`` is
        `True`; otherwise it is fitted on real and attack samples together.

        Parameters
        ----------

        real : numpy.ndarray
            Training features for the real class.

        attack : numpy.ndarray
            Training features for the attack class.

        Returns
        -------

        real_norm : numpy.ndarray
            Normalized training features for the real class.

        attack_norm : numpy.ndarray
            Normalized training features for the attack class.
        """

        if self.norm_on_bonafide:
            # normalization parameters calculated from bonafide only
            real_norm = self._normalize(real, train=True)
        else:
            # normalization parameters calculated from both classes; only the
            # fit is needed here, each class is transformed separately below
            if self.scaler is not None:
                self.scaler.fit(np.vstack([real, attack]))
            real_norm = self._normalize(real, train=False)

        attack_norm = self._normalize(attack, train=False)

        return real_norm, attack_norm

    def train_clf(self, real, attack):
        """
        Train the scikit-learn classifier given real and attack classes.
        Prior to training the data is normalized with ``self.scaler``.
        In one-class mode only the real class is used to fit the classifier.

        Parameters
        ----------

        real : numpy.ndarray
            Training features for the real class.

        attack : numpy.ndarray
            Training features for the attack class.

        Returns
        -------

        bool
            Always ``True``.

        Raises
        ------

        AssertionError
            If no classifier is set, or if ``one_class`` is enabled while
            ``norm_on_bonafide`` is disabled.
        """

        # Validate the configuration before the scaler gets fitted. Explicit
        # raises are used so that the checks survive ``python -O``.
        if self.clf is None:
            raise AssertionError("clf must be set before training")

        if self.one_class and not self.norm_on_bonafide:
            raise AssertionError(
                "norm_on_bonafide must be True when one_class is True")

        # real and attack - are normalized from here on
        real, attack = self.norm_train_data(real, attack)

        if self.one_class:
            # anomaly detectors are fitted on bonafide samples only
            self.clf.fit(real)
        else:
            X = np.vstack([real, attack])
            # label 1 - real class, label 0 - attack class
            Y = np.hstack([np.ones(len(real)), np.zeros(len(attack))])
            self.clf.fit(X, Y)

        return True

    @staticmethod
    def _model_file_names(projector_file):
        """
        Derive the names of the classifier and scaler files from the name of
        the projector file.

        Parameters
        ----------

        projector_file : str
            Absolute name of the projector file, as returned by the
            bob.pad.base framework.

        Returns
        -------

        clf_file : str
            Name of the file holding the pickled classifier.

        scaler_file : str
            Name of the file holding the pickled scaler.
        """

        # The last five characters are dropped unconditionally.
        # NOTE(review): presumably this removes a '.hdf5' extension; a name
        # with another extension would be truncated differently - confirm.
        base_name = projector_file[:-5]

        return base_name + '_skmodel.obj', base_name + '_scaler.obj'

    def save_clf_and_mean_std(self, projector_file):
        """
        Saves the scikit-learn classifier and the scaler to '.obj' files
        using pickle. The names of the files are derived from the
        projector_file string.

        Parameters
        ----------

        projector_file : str
            Absolute name of the file to save the data to, as returned by
            bob.pad.base framework.
        """

        clf_file, scaler_file = self._model_file_names(projector_file)

        # Saving scikit classifier
        with open(clf_file, 'wb') as fp:
            pickle.dump(self.clf, fp)

        # Saving the scaler
        with open(scaler_file, 'wb') as fp:
            pickle.dump(self.scaler, fp)

    def subsample_train_videos(self, training_features, step):
        """
        Uniformly select a subset of frame containers from the input list.

        Parameters
        ----------

        training_features : [FrameContainer]
            A list of FrameContainers

        step : `int`
            Data selection step.

        Returns
        -------

        training_features_subset : [FrameContainer]
            A list with selected FrameContainers
        """

        return [training_features[index]
                for index in range(0, len(training_features), step)]

    def _training_array(self, frame_containers):
        """
        Convert the Frame Containers of one class into a feature array,
        applying the video and sample subsampling configured for training.

        Parameters
        ----------

        frame_containers : [FrameContainer]
            A list of Frame Containers with feature vectors of one class.

        Returns
        -------

        features : numpy.ndarray
            Array with the selected training feature vectors as rows.
        """

        if self.subsample_videos_flag:
            # keep every video_subsampling_step-th video
            frame_containers = self.subsample_train_videos(
                frame_containers, self.video_subsampling_step)

        features = convert_list_of_frame_cont_to_array(frame_containers)

        if self.subsample_train_data_flag:
            # keep every subsampling_step-th feature vector
            features = features[range(0, len(features), self.subsampling_step), :]

        return features

    def train_projector(self, training_features, projector_file):
        """
        Train the scikit-learn classifier for feature projection and save the
        classifier and the scaler to files.
        The ``requires_projector_training = True`` flag must be set
        to enable this function.

        Parameters
        ----------

        training_features : [[FrameContainer], [FrameContainer]]
            A list containing two elements: [0] - a list of Frame Containers
            with feature vectors for the real class; [1] - a list of Frame
            Containers with feature vectors for the attack class.

        projector_file : `str`
            The file to save the trained projector to, as returned by the
            bob.pad.base framework.
        """

        # training_features[0] - training features for the REAL class.
        # training_features[1] - training features for the ATTACK class.
        real = self._training_array(training_features[0])
        attack = self._training_array(training_features[1])

        # Train the Scikit Classifier and fit the scaler:
        self.train_clf(real=real, attack=attack)

        # Save the Scikit Classifier and the scaler:
        self.save_clf_and_mean_std(projector_file)

    def load_clf_and_mean_std(self, projector_file):
        """
        Loads the pickled classifier and scaler from the '.obj' files written
        by :py:meth:`save_clf_and_mean_std`, and stores them in ``self.clf``
        and ``self.scaler``. The names of the files are derived from the
        projector_file string.

        Parameters
        ----------

        projector_file : str
            Absolute name of the file to load the trained projector from, as
            returned by bob.pad.base framework.
        """

        clf_file, scaler_file = self._model_file_names(projector_file)

        # NOTE: unpickling can execute arbitrary code, so these files must
        # come from a trusted source.

        # Load the classifier:
        with open(clf_file, 'rb') as fp:
            self.clf = pickle.load(fp)

        # Load the scaler:
        with open(scaler_file, 'rb') as fp:
            self.scaler = pickle.load(fp)

    def load_projector(self, projector_file):
        """
        The absolute name of the file is specified in projector_file string.

        This function sets the attributes ``self.clf`` and ``self.scaler``
        with the loaded objects.

        Parameters
        ----------

        projector_file : str
            The file to read the projector from, as returned by the
            bob.pad.base framework.
        """

        self.load_clf_and_mean_std(projector_file)

    def project(self, feature):
        """
        This function computes a vector of scores for each sample in the input
        array of features. The following steps are applied:

        1. First, the input data is normalized using the trained scaler
           (skipped when no scaler is set).

        2. The input features are next scored using the pre-trained
           scikit-learn classifier.

        Set ``performs_projection = True`` in the constructor to enable this
        function. It is assumed that the :py:meth:`load_projector` was
        **called before** the project function is executed.

        Parameters
        ----------

        feature : FrameContainer or numpy.ndarray
            Two types of inputs are accepted.
            A Frame Container containing the features of an individual,
            see bob.bio.video.utils.FrameContainer.
            Or a 2D feature array of the size (N_samples x N_features).

        Returns
        -------

        scores : numpy.ndarray
            Vector of scores. Scores for the real class are expected to be
            higher than the scores of the negative / attack class.
            For a binary classifier this is the second column of
            ``predict_proba``; for a one-class classifier it is the output
            of ``score_samples``.
        """

        # 1. Convert input to a 2D numpy array if necessary.
        if isinstance(feature, FrameContainer):
            features_array = convert_frame_cont_to_array(feature)
        else:
            features_array = feature.copy()

        # 2. Normalize with the already trained scaler.
        features_array_norm = self._normalize(features_array, train=False)

        # 3. Score the samples.
        if self.one_class:
            return self.clf.score_samples(features_array_norm)

        # Column 1 is presumably the class labelled 1 (real) in train_clf,
        # given that labels are 0/1 - verify against clf.classes_.
        return self.clf.predict_proba(features_array_norm)[:, 1]

    def score(self, toscore):
        """
        Returns the score(s) of a sample, as computed by :py:meth:`project`.

        Parameters
        ----------

        toscore : numpy.ndarray
            Vector with scores for each frame/sample; higher values are
            expected for the real class.

        Returns
        -------

        score : [float]
            If frame_level_scores_flag = False a single score is returned:
            the mean of the input scores, one score per video. This score is
            placed into a list, because the score must be an iterable.
            If frame_level_scores_flag = True a list of scores is returned.
            One score per frame/sample.
        """

        if self.frame_level_scores_flag:
            return list(toscore)

        # compute a single score per video
        return [np.mean(toscore)]