Commit 4151b396 authored by Amir MOHAMMADI
use 4 spaces. Implement IO for sequential preprocessors and extractors

parent 8b031ce9
Merge request !102: Add sequential and parallel processors, pre-processors, and extractors
@@ -4,133 +4,141 @@ from bob.io.base import HDF5File
class MultipleExtractor(Extractor):
    """Base class for SequentialExtractor and ParallelExtractor. This class is
    not meant to be used directly."""

    def get_attributes(self, processors):
        requires_training = any(p.requires_training for p in processors)
        split_training_data_by_client = any(
            p.split_training_data_by_client for p in processors)
        min_extractor_file_size = min(
            p.min_extractor_file_size for p in processors)
        min_feature_file_size = min(
            p.min_feature_file_size for p in processors)
        return (requires_training, split_training_data_by_client,
                min_extractor_file_size, min_feature_file_size)

    def get_extractor_groups(self):
        groups = ['E_{}'.format(i + 1) for i in range(len(self.processors))]
        return groups

    def train_one(self, e, training_data, extractor_file, apply=False):
        """Trains one extractor and, if requested, applies it to the training
        data, returning the transformed data for the next extractor."""
        if not e.requires_training:
            if not apply:
                return
            # the extractor needs no training, but the data must still pass
            # through it so that the next extractor trains on its output.
            if self.split_training_data_by_client:
                return [[e(d) for d in datalist]
                        for datalist in training_data]
            return [e(d) for d in training_data]
        # if any of the extractors requires the data to be split by client,
        # self.split_training_data_by_client is True.
        if e.split_training_data_by_client:
            e.train(training_data, extractor_file)
            if not apply:
                return
            training_data = [[e(d) for d in datalist]
                             for datalist in training_data]
        # when no extractor needs splitting
        elif not self.split_training_data_by_client:
            e.train(training_data, extractor_file)
            if not apply:
                return
            training_data = [e(d) for d in training_data]
        # when e wants the data flat but the data is split by client
        else:
            # make training_data flat
            training_data_len = [len(datalist) for datalist in training_data]
            training_data = [d for datalist in training_data for d in datalist]
            e.train(training_data, extractor_file)
            if not apply:
                return
            # split the training data back by client
            new_training_data, i = [], 0
            for length in training_data_len:
                class_data = []
                for _ in range(length):
                    class_data.append(e(training_data[i]))
                    i += 1
                new_training_data.append(class_data)
            training_data = new_training_data
        return training_data

    def load(self, extractor_file):
        with HDF5File(extractor_file) as f:
            groups = self.get_extractor_groups()
            for e, group in zip(self.processors, groups):
                f.cd(group)
                e.load(f)
                f.cd('..')
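For illustration, the two training-data layouts that train_one converts between can be shown in a minimal, self-contained sketch; the variable names below are illustrative only, not part of the API:

import numpy

# Flat layout: a single list of samples; used when no extractor in the
# stack requires the data to be split by client.
flat_data = [numpy.ones(5), numpy.zeros(5), numpy.arange(5)]

# Split-by-client layout: one list of samples per client; used when any
# extractor has split_training_data_by_client set.
split_data = [[numpy.ones(5), numpy.zeros(5)],  # client 1
              [numpy.arange(5)]]                # client 2

# Flattening and re-splitting, mirroring the last branch of train_one:
lengths = [len(datalist) for datalist in split_data]
flat = [d for datalist in split_data for d in datalist]
resplit, i = [], 0
for length in lengths:
    resplit.append(flat[i:i + length])
    i += length
assert [len(c) for c in resplit] == lengths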
class SequentialExtractor(SequentialProcessor, MultipleExtractor):
    __doc__ = SequentialProcessor.__doc__

    def __init__(self, processors):
        (requires_training, split_training_data_by_client,
         min_extractor_file_size, min_feature_file_size) = \
            self.get_attributes(processors)

        super(SequentialExtractor, self).__init__(
            processors=processors,
            requires_training=requires_training,
            split_training_data_by_client=split_training_data_by_client,
            min_extractor_file_size=min_extractor_file_size,
            min_feature_file_size=min_feature_file_size)

    def train(self, training_data, extractor_file):
        with HDF5File(extractor_file, 'w') as f:
            groups = self.get_extractor_groups()
            for e, group in zip(self.processors, groups):
                f.create_group(group)
                f.cd(group)
                training_data = self.train_one(e, training_data, f,
                                               apply=True)
                f.cd('..')

    def read_feature(self, feature_file):
        return self.processors[-1].read_feature(feature_file)

    def write_feature(self, feature, feature_file):
        return self.processors[-1].write_feature(feature, feature_file)
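A possible usage sketch, assuming the classes are exported from bob.bio.base.extractor as in this merge request (the normalization lambda is just an example):

import numpy
from bob.bio.base.extractor import SequentialExtractor, CallableExtractor

# Chain two stateless extractors; neither requires training, so the
# stack's requires_training attribute is False.
extractor = SequentialExtractor([
    CallableExtractor(numpy.ravel),
    CallableExtractor(lambda x: x / numpy.linalg.norm(x)),
])

# __call__ comes from SequentialProcessor: ravel first, then normalize.
features = extractor(numpy.random.rand(4, 4))
assert features.shape == (16,)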
class ParallelExtractor(ParallelProcessor, MultipleExtractor):
    __doc__ = ParallelProcessor.__doc__

    def __init__(self, processors):
        (requires_training, split_training_data_by_client,
         min_extractor_file_size, min_feature_file_size) = \
            self.get_attributes(processors)

        super(ParallelExtractor, self).__init__(
            processors=processors,
            requires_training=requires_training,
            split_training_data_by_client=split_training_data_by_client,
            min_extractor_file_size=min_extractor_file_size,
            min_feature_file_size=min_feature_file_size)

    def train(self, training_data, extractor_file):
        with HDF5File(extractor_file, 'w') as f:
            groups = self.get_extractor_groups()
            for e, group in zip(self.processors, groups):
                f.create_group(group)
                f.cd(group)
                self.train_one(e, training_data, f, apply=False)
                f.cd('..')
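Similarly for the parallel variant, a short sketch under the same import-path assumption as above:

import numpy
from bob.bio.base.extractor import ParallelExtractor, CallableExtractor

# Both callables see the same input; ParallelProcessor.__call__ then
# hstacks the two scalar outputs into a single feature vector.
extractor = ParallelExtractor([
    CallableExtractor(numpy.mean),
    CallableExtractor(numpy.std),
])
features = extractor(numpy.array([0.0, 1.0, 2.0]))  # array([1., 0.8164...])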
class CallableExtractor(Extractor):
    """A simple extractor that takes a callable and applies that callable to
    the input.

    Attributes
    ----------
    callable : object
        Anything that is callable. It will be used as an extractor in
        bob.bio.base.
    """

    def __init__(self, callable, **kwargs):
        super(CallableExtractor, self).__init__(**kwargs)
        self.callable = callable

    def __call__(self, data):
        return self.callable(data)
@@ -3,62 +3,71 @@ from .Preprocessor import Preprocessor
class SequentialPreprocessor(SequentialProcessor, Preprocessor):
    __doc__ = SequentialProcessor.__doc__

    def __init__(self, processors, **kwargs):
        min_preprocessed_file_size = 1000
        try:
            min_preprocessed_file_size = min(
                p.min_preprocessed_file_size for p in processors)
        except AttributeError:
            pass

        SequentialProcessor.__init__(self, processors)
        Preprocessor.__init__(
            self, min_preprocessed_file_size=min_preprocessed_file_size,
            **kwargs)

    def __call__(self, data, annotations):
        return super(SequentialPreprocessor, self).__call__(
            data, annotations=annotations)

    def read_data(self, data_file):
        return self.processors[-1].read_data(data_file)

    def write_data(self, data, data_file):
        return self.processors[-1].write_data(data, data_file)
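A usage sketch for the sequential preprocessor, again assuming the bob.bio.base.preprocessor import path from this merge request:

import numpy
from bob.bio.base.preprocessor import (
    SequentialPreprocessor, CallablePreprocessor)

# Neither step uses annotations, so accepts_annotations is disabled.
preprocessor = SequentialPreprocessor([
    CallablePreprocessor(numpy.abs, accepts_annotations=False),
    CallablePreprocessor(lambda x: x * 2, accepts_annotations=False),
])
data = preprocessor(numpy.array([-1.0, 2.0]), annotations=None)  # [2., 4.]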
class ParallelPreprocessor(ParallelProcessor, Preprocessor):
    __doc__ = ParallelProcessor.__doc__

    def __init__(self, processors, **kwargs):
        min_preprocessed_file_size = min(
            p.min_preprocessed_file_size for p in processors)

        ParallelProcessor.__init__(self, processors)
        Preprocessor.__init__(
            self, min_preprocessed_file_size=min_preprocessed_file_size,
            **kwargs)

    def __call__(self, data, annotations):
        return super(ParallelPreprocessor, self).__call__(
            data, annotations=annotations)
class CallablePreprocessor(Preprocessor):
    """A simple preprocessor that takes a callable and applies that callable
    to the input.

    Attributes
    ----------
    accepts_annotations : bool
        If False, annotations are not passed to the callable.
    callable : object
        Anything that is callable. It will be used as a preprocessor in
        bob.bio.base.
    """

    def __init__(self, callable, accepts_annotations=True, **kwargs):
        super(CallablePreprocessor, self).__init__(
            callable=callable, accepts_annotations=accepts_annotations,
            **kwargs)
        self.callable = callable
        self.accepts_annotations = accepts_annotations

    def __call__(self, data, annotations):
        if self.accepts_annotations:
            return self.callable(data, annotations)
        else:
            return self.callable(data)
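When the callable does use annotations, they are passed through as its second argument; a sketch with a hypothetical crop function (the 'top' annotation key is made up for illustration):

from bob.bio.base.preprocessor import CallablePreprocessor

def crop(data, annotations):
    # hypothetical preprocessor: drop samples before the 'top' mark
    top = annotations['top'] if annotations else 0
    return data[top:]

preprocessor = CallablePreprocessor(crop)  # accepts_annotations=True
assert preprocessor([1, 2, 3, 4], {'top': 2}) == [3, 4]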
@@ -2,77 +2,77 @@ import numpy
class SequentialProcessor(object):
    """A helper class which takes several processors and applies them one by
    one, sequentially.

    Attributes
    ----------
    processors : list
        A list of processors to apply.
    """

    def __init__(self, processors, **kwargs):
        super(SequentialProcessor, self).__init__()
        self.processors = processors

    def __call__(self, data, **kwargs):
        """Applies the processors on the data sequentially. The output of the
        first one goes as input to the next one.

        Parameters
        ----------
        data : object
            The data that needs to be processed.
        **kwargs
            Any kwargs are passed to the processors.

        Returns
        -------
        object
            The processed data.
        """
        for processor in self.processors:
            data = processor(data, **kwargs)
        return data
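Since SequentialProcessor is a plain helper, it can be exercised with any callables; a minimal sketch (the utils.processors import path is an assumption based on this file):

import numpy
from bob.bio.base.utils.processors import SequentialProcessor

# flatten, sort ascending, then reverse: each output feeds the next.
pipeline = SequentialProcessor([numpy.ravel, numpy.sort, numpy.flipud])
print(pipeline(numpy.array([[3, 1], [2, 4]])))  # -> [4 3 2 1]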
class ParallelProcessor(object):
    """A helper class which takes several processors, applies each of them
    separately to the same input, and returns the list of their outputs.

    Attributes
    ----------
    processors : list
        A list of processors to apply.
    stack : bool
        If True (default), :any:`numpy.hstack` is called on the list of
        outputs.
    """

    def __init__(self, processors, stack=True, **kwargs):
        super(ParallelProcessor, self).__init__()
        self.processors = processors
        self.stack = stack

    def __call__(self, data, **kwargs):
        """Applies the processors on the data independently and outputs a
        list of their outputs.

        Parameters
        ----------
        data : object
            The data that needs to be processed.
        **kwargs
            Any kwargs are passed to the processors.

        Returns
        -------
        object
            The processed data.
        """
        output = []
        for processor in self.processors:
            out = processor(data, **kwargs)
            output.append(out)
        if self.stack:
            output = numpy.hstack(output)
        return output
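And the effect of the stack flag, under the same import-path assumption:

import numpy
from bob.bio.base.utils.processors import ParallelProcessor

data = numpy.arange(4.0)
# stack=True (default): the two 4-element outputs are hstacked into 8.
stacked = ParallelProcessor([numpy.cos, numpy.sin])(data)
assert stacked.shape == (8,)
# stack=False: the raw list of per-processor outputs is returned.
as_list = ParallelProcessor([numpy.cos, numpy.sin], stack=False)(data)
assert len(as_list) == 2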