From 4151b396e48cecbd1edb05d17804f1467f5f3ed2 Mon Sep 17 00:00:00 2001 From: Amir MOHAMMADI <amir.mohammadi@idiap.ch> Date: Thu, 19 Oct 2017 19:09:56 +0200 Subject: [PATCH] use 4 spaces. Implement IO for sequential preprocessors and extractors --- bob/bio/base/extractor/stacks.py | 234 ++++++++++++++-------------- bob/bio/base/preprocessor/stacks.py | 103 ++++++------ bob/bio/base/utils/processors.py | 122 +++++++-------- 3 files changed, 238 insertions(+), 221 deletions(-) diff --git a/bob/bio/base/extractor/stacks.py b/bob/bio/base/extractor/stacks.py index 98cb83ce..0f85207a 100644 --- a/bob/bio/base/extractor/stacks.py +++ b/bob/bio/base/extractor/stacks.py @@ -4,133 +4,141 @@ from bob.io.base import HDF5File class MultipleExtractor(Extractor): - """Base class for SequentialExtractor and ParallelExtractor. This class is - not meant to be used directly.""" - - def get_attributes(self, processors): - requires_training = any(p.requires_training for p in processors) - split_training_data_by_client = any(p.split_training_data_by_client for p - in processors) - min_extractor_file_size = min(p.min_extractor_file_size for p in - processors) - min_feature_file_size = min(p.min_feature_file_size for p in processors) - return (requires_training, split_training_data_by_client, - min_extractor_file_size, min_feature_file_size) - - def get_extractor_groups(self): - groups = ['E_{}'.format(i + 1) for i in range(len(self.processors))] - return groups - - def train_one(self, e, training_data, extractor_file, apply=False): - if not e.requires_training: - return - # if any of the extractors require splitting the data, the - # split_training_data_by_client is True. - if e.split_training_data_by_client: - e.train(training_data, extractor_file) - if not apply: - return - training_data = [[e(d) for d in datalist] for datalist in training_data] - # when no extractor needs splitting - elif not self.split_training_data_by_client: - e.train(training_data, extractor_file) - if not apply: - return - training_data = [e(d) for d in training_data] - # when e here wants it flat but the data is split - else: - # make training_data flat - training_data_len = [len(datalist) for datalist in training_data] - training_data = [d for datalist in training_data for d in datalist] - e.train(training_data, extractor_file) - if not apply: - return - # split training data - new_training_data, i = [], 0 - for length in training_data_len: - class_data = [] - for _ in range(length): - class_data.append(e(training_data[i])) - i += 1 - new_training_data.append(class_data) - training_data = new_training_data - return training_data - - def load(self, extractor_file): - with HDF5File(extractor_file) as f: - groups = self.get_extractor_groups() - for e, group in zip(self.processors, groups): - f.cd(group) - e.load(f) - f.cd('..') + """Base class for SequentialExtractor and ParallelExtractor. 
This class is
+    not meant to be used directly."""
+
+    def get_attributes(self, processors):
+        requires_training = any(p.requires_training for p in processors)
+        split_training_data_by_client = any(p.split_training_data_by_client for
+                                            p in processors)
+        min_extractor_file_size = min(p.min_extractor_file_size for p in
+                                      processors)
+        min_feature_file_size = min(
+            p.min_feature_file_size for p in processors)
+        return (requires_training, split_training_data_by_client,
+                min_extractor_file_size, min_feature_file_size)
+
+    def get_extractor_groups(self):
+        groups = ['E_{}'.format(i + 1) for i in range(len(self.processors))]
+        return groups
+
+    def train_one(self, e, training_data, extractor_file, apply=False):
+        if not e.requires_training:
+            return
+        # if any extractor requires the training data to be split by client,
+        # self.split_training_data_by_client is True.
+        if e.split_training_data_by_client:
+            e.train(training_data, extractor_file)
+            if not apply:
+                return
+            training_data = [[e(d) for d in datalist]
+                             for datalist in training_data]
+        # when no extractor in the stack requires split data
+        elif not self.split_training_data_by_client:
+            e.train(training_data, extractor_file)
+            if not apply:
+                return
+            training_data = [e(d) for d in training_data]
+        # when this extractor wants flat data but the data is split by client
+        else:
+            # make training_data flat
+            training_data_len = [len(datalist) for datalist in training_data]
+            training_data = [d for datalist in training_data for d in datalist]
+            e.train(training_data, extractor_file)
+            if not apply:
+                return
+            # split the training data by client again
+            new_training_data, i = [], 0
+            for length in training_data_len:
+                class_data = []
+                for _ in range(length):
+                    class_data.append(e(training_data[i]))
+                    i += 1
+                new_training_data.append(class_data)
+            training_data = new_training_data
+        return training_data
+
+    def load(self, extractor_file):
+        with HDF5File(extractor_file) as f:
+            groups = self.get_extractor_groups()
+            for e, group in zip(self.processors, groups):
+                f.cd(group)
+                e.load(f)
+                f.cd('..')


 class SequentialExtractor(SequentialProcessor, MultipleExtractor):
-  __doc__ = SequentialProcessor.__doc__
+    __doc__ = SequentialProcessor.__doc__

-  def __init__(self, processors):
+    def __init__(self, processors):

-    (requires_training, split_training_data_by_client,
-     min_extractor_file_size, min_feature_file_size) = \
-        self.get_attributes(processors)
+        (requires_training, split_training_data_by_client,
+         min_extractor_file_size, min_feature_file_size) = \
+            self.get_attributes(processors)

-    super(SequentialExtractor, self).__init__(
-        processors=processors,
-        requires_training=requires_training,
-        split_training_data_by_client=split_training_data_by_client,
-        min_extractor_file_size=min_extractor_file_size,
-        min_feature_file_size=min_feature_file_size)
+        super(SequentialExtractor, self).__init__(
+            processors=processors,
+            requires_training=requires_training,
+            split_training_data_by_client=split_training_data_by_client,
+            min_extractor_file_size=min_extractor_file_size,
+            min_feature_file_size=min_feature_file_size)

-  def train(self, training_data, extractor_file):
-    with HDF5File(extractor_file, 'w') as f:
-      groups = self.get_extractor_groups()
-      for e, group in zip(self.processors, groups):
-        f.create_group(group)
-        f.cd(group)
-        training_data = self.train_one(e, training_data, f, apply=True)
-        f.cd('..')
+    def train(self, training_data, extractor_file):
+        with HDF5File(extractor_file, 'w') as f:
+            groups = self.get_extractor_groups()
+            for e, group in zip(self.processors, groups):
+                f.create_group(group)
+                f.cd(group)
+                training_data = self.train_one(e, training_data, f, apply=True)
+                f.cd('..')
+
+    def read_feature(self, feature_file):
+        return self.processors[-1].read_feature(feature_file)
+
+    def write_feature(self, feature, feature_file):
+        return self.processors[-1].write_feature(feature, feature_file)


 class ParallelExtractor(ParallelProcessor, MultipleExtractor):
-  __doc__ = ParallelProcessor.__doc__
+    __doc__ = ParallelProcessor.__doc__

-  def __init__(self, processors):
+    def __init__(self, processors):

-    (requires_training, split_training_data_by_client,
-     min_extractor_file_size, min_feature_file_size) = self.get_attributes(
-        processors)
+        (requires_training, split_training_data_by_client,
+         min_extractor_file_size, min_feature_file_size) = self.get_attributes(
+            processors)

-    super(ParallelExtractor, self).__init__(
-        processors=processors,
-        requires_training=requires_training,
-        split_training_data_by_client=split_training_data_by_client,
-        min_extractor_file_size=min_extractor_file_size,
-        min_feature_file_size=min_feature_file_size)
+        super(ParallelExtractor, self).__init__(
+            processors=processors,
+            requires_training=requires_training,
+            split_training_data_by_client=split_training_data_by_client,
+            min_extractor_file_size=min_extractor_file_size,
+            min_feature_file_size=min_feature_file_size)

-  def train(self, training_data, extractor_file):
-    with HDF5File(extractor_file, 'w') as f:
-      groups = self.get_extractor_groups()
-      for e, group in zip(self.processors, groups):
-        f.create_group(group)
-        f.cd(group)
-        self.train_one(e, training_data, f, apply=False)
-        f.cd('..')
+    def train(self, training_data, extractor_file):
+        with HDF5File(extractor_file, 'w') as f:
+            groups = self.get_extractor_groups()
+            for e, group in zip(self.processors, groups):
+                f.create_group(group)
+                f.cd(group)
+                self.train_one(e, training_data, f, apply=False)
+                f.cd('..')


 class CallableExtractor(Extractor):
-  """A simple extractor that takes a callable and applies that callable to the
-  input.
-
-  Attributes
-  ----------
-  callable : object
-    Anything that is callable. It will be used as an extractor in
-    bob.bio.base.
-  """
-
-  def __init__(self, callable, **kwargs):
-    super(CallableExtractor, self).__init__(**kwargs)
-    self.callable = callable
-
-  def __call__(self, data):
-    return self.callable(data)
+    """A simple extractor that takes a callable and applies that callable to
+    the input.
+
+    Attributes
+    ----------
+    callable : object
+        Anything that is callable. It will be used as an extractor in
+        bob.bio.base.
+ """ + + def __init__(self, callable, **kwargs): + super(CallableExtractor, self).__init__(**kwargs) + self.callable = callable + + def __call__(self, data): + return self.callable(data) diff --git a/bob/bio/base/preprocessor/stacks.py b/bob/bio/base/preprocessor/stacks.py index d7822915..1a4b3100 100644 --- a/bob/bio/base/preprocessor/stacks.py +++ b/bob/bio/base/preprocessor/stacks.py @@ -3,62 +3,71 @@ from .Preprocessor import Preprocessor class SequentialPreprocessor(SequentialProcessor, Preprocessor): - __doc__ = SequentialProcessor.__doc__ + __doc__ = SequentialProcessor.__doc__ - def __init__(self, processors, **kwargs): - min_preprocessed_file_size = 1000 - try: - min_preprocessed_file_size = min( - (p.min_preprocessed_file_size for p in processors)) - except AttributeError: - pass + def __init__(self, processors, **kwargs): + min_preprocessed_file_size = 1000 + try: + min_preprocessed_file_size = min( + (p.min_preprocessed_file_size for p in processors)) + except AttributeError: + pass - SequentialProcessor.__init__(self, processors) - Preprocessor.__init__( - self, min_preprocessed_file_size=min_preprocessed_file_size, **kwargs) + SequentialProcessor.__init__(self, processors) + Preprocessor.__init__( + self, min_preprocessed_file_size=min_preprocessed_file_size, + **kwargs) - def __call__(self, data, annotations): - return super(SequentialPreprocessor, self).__call__( - data, annotations=annotations) + def __call__(self, data, annotations): + return super(SequentialPreprocessor, self).__call__( + data, annotations=annotations) + + def read_data(self, data_file): + return self.processors[-1].read_data(data_file) + + def write_data(self, data, data_file): + return self.processors[-1].write_data(data, data_file) class ParallelPreprocessor(ParallelProcessor, Preprocessor): - __doc__ = ParallelProcessor.__doc__ + __doc__ = ParallelProcessor.__doc__ - def __init__(self, processors, **kwargs): - min_preprocessed_file_size = min(p.min_preprocessed_file_size for p in - processors) + def __init__(self, processors, **kwargs): + min_preprocessed_file_size = min(p.min_preprocessed_file_size for p in + processors) - ParallelProcessor.__init__(self, processors) - Preprocessor.__init__( - self, min_preprocessed_file_size=min_preprocessed_file_size, **kwargs) + ParallelProcessor.__init__(self, processors) + Preprocessor.__init__( + self, min_preprocessed_file_size=min_preprocessed_file_size, + **kwargs) - def __call__(self, data, annotations): - return super(ParallelPreprocessor, self).__call__( - data, annotations=annotations) + def __call__(self, data, annotations): + return super(ParallelPreprocessor, self).__call__( + data, annotations=annotations) class CallablePreprocessor(Preprocessor): - """A simple preprocessor that takes a callable and applies that callable to - the input. - - Attributes - ---------- - accepts_annotations : bool - If False, annotations are not passed to the callable. - callable : object - Anything that is callable. It will be used as a preprocessor in - bob.bio.base. 
- """ - - def __init__(self, callable, accepts_annotations=True, **kwargs): - super(CallablePreprocessor, self).__init__( - callable=callable, accepts_annotations=accepts_annotations, **kwargs) - self.callable = callable - self.accepts_annotations = accepts_annotations - - def __call__(self, data, annotations): - if self.accepts_annotations: - return self.callable(data, annotations) - else: - return self.callable(data) + """A simple preprocessor that takes a callable and applies that callable to + the input. + + Attributes + ---------- + accepts_annotations : bool + If False, annotations are not passed to the callable. + callable : object + Anything that is callable. It will be used as a preprocessor in + bob.bio.base. + """ + + def __init__(self, callable, accepts_annotations=True, **kwargs): + super(CallablePreprocessor, self).__init__( + callable=callable, accepts_annotations=accepts_annotations, + **kwargs) + self.callable = callable + self.accepts_annotations = accepts_annotations + + def __call__(self, data, annotations): + if self.accepts_annotations: + return self.callable(data, annotations) + else: + return self.callable(data) diff --git a/bob/bio/base/utils/processors.py b/bob/bio/base/utils/processors.py index 03dd2e98..1939d860 100644 --- a/bob/bio/base/utils/processors.py +++ b/bob/bio/base/utils/processors.py @@ -2,77 +2,77 @@ import numpy class SequentialProcessor(object): - """A helper class which takes several processors and applies them one by one - sequentially + """A helper class which takes several processors and applies them one by + one sequentially - Attributes - ---------- - processors : list - A list of processors to apply. - """ + Attributes + ---------- + processors : list + A list of processors to apply. + """ - def __init__(self, processors, **kwargs): - super(SequentialProcessor, self).__init__() - self.processors = processors + def __init__(self, processors, **kwargs): + super(SequentialProcessor, self).__init__() + self.processors = processors - def __call__(self, data, **kwargs): - """Applies the processors on the data sequentially. The output of the first - one goes as input to the next one. + def __call__(self, data, **kwargs): + """Applies the processors on the data sequentially. The output of the + first one goes as input to the next one. - Parameters - ---------- - data : object - The data that needs to be processed. - **kwargs - Any kwargs are passed to the processors. + Parameters + ---------- + data : object + The data that needs to be processed. + **kwargs + Any kwargs are passed to the processors. - Returns - ------- - object - The processed data. - """ - for processor in self.processors: - data = processor(data, **kwargs) - return data + Returns + ------- + object + The processed data. + """ + for processor in self.processors: + data = processor(data, **kwargs) + return data class ParallelProcessor(object): - """A helper class which takes several processors and applies them on each - processor separately and outputs a list of their outputs in the end. + """A helper class which takes several processors and applies them on each + processor separately and outputs a list of their outputs in the end. - Attributes - ---------- - processors : list - A list of processors to apply. - stack : bool - If True (default), :any:`numpy.hstack` is called on the list of outputs. - """ + Attributes + ---------- + processors : list + A list of processors to apply. 
+    stack : bool
+        If True (default), :any:`numpy.hstack` is called on the list of outputs.
+    """

-  def __init__(self, processors, stack=True, **kwargs):
-    super(ParallelProcessor, self).__init__()
-    self.processors = processors
-    self.stack = stack
+    def __init__(self, processors, stack=True, **kwargs):
+        super(ParallelProcessor, self).__init__()
+        self.processors = processors
+        self.stack = stack

-  def __call__(self, data, **kwargs):
-    """Applies the processors on the data independently and outputs a list of
-    their outputs.
+    def __call__(self, data, **kwargs):
+        """Applies the processors on the data independently and outputs a list of
+        their outputs.

-    Parameters
-    ----------
-    data : object
-      The data that needs to be processed.
-    **kwargs
-      Any kwargs are passed to the processors.
+        Parameters
+        ----------
+        data : object
+            The data that needs to be processed.
+        **kwargs
+            Any kwargs are passed to the processors.

-    Returns
-    -------
-    object
-      The processed data.
-    """
-    output = []
-    for processor in self.processors:
-      out = processor(data, **kwargs)
-      output.append(out)
-    if self.stack:
-      output = numpy.hstack(output)
-    return output
+        Returns
+        -------
+        object
+            The processed data.
+        """
+        output = []
+        for processor in self.processors:
+            out = processor(data, **kwargs)
+            output.append(out)
+        if self.stack:
+            output = numpy.hstack(output)
+        return output
-- 
GitLab
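
Usage note (reviewer sketch, not part of the patch): the snippet below shows how the
stacked classes compose after this change. Only the imports and class names come from
the files touched above; the lambdas and the sample array are illustrative assumptions.

import numpy
from bob.bio.base.preprocessor.stacks import (
    CallablePreprocessor, SequentialPreprocessor)
from bob.bio.base.extractor.stacks import (
    CallableExtractor, SequentialExtractor)

# Chain two trivial preprocessors; the callables receive only the data
# because accepts_annotations is False.
preprocessor = SequentialPreprocessor([
    CallablePreprocessor(lambda x: x.astype('float64'),
                         accepts_annotations=False),
    CallablePreprocessor(lambda x: x / 2., accepts_annotations=False),
])
data = preprocessor(numpy.array([2, 4, 6]), annotations=None)

# Extractors chain the same way; with this patch, read_feature and
# write_feature delegate to the last extractor in the chain.
extractor = SequentialExtractor([
    CallableExtractor(lambda x: x ** 2),
    CallableExtractor(numpy.sum),
])
feature = extractor(data)  # sum of squares of the preprocessed data: 14.0

Because the new IO methods delegate to processors[-1], a stacked preprocessor or
extractor reads and writes data in whatever format its final stage uses, keeping the
stacks compatible with the single-processor API.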