bob / bob.bio.base · Commit 9ae5f1c9

Add a function to read features with generators

Authored Jun 12, 2017 by Amir MOHAMMADI
Parent 720c1052
Pipeline #10516 failed with stages in 3 minutes and 6 seconds
Changes: 8 · Pipelines: 1
bob/bio/base/algorithm/BIC.py

@@ -103,7 +103,7 @@ class BIC(Algorithm):
   def _trainset_for(self, pairs):
     """Computes the array containing the comparison results for the given set of image pairs."""
-    return numpy.vstack([self.comparison_function(f1, f2) for (f1, f2) in pairs])
+    return numpy.vstack(self.comparison_function(f1, f2) for (f1, f2) in pairs)

   def train_enroller(self, train_features, enroller_file):
     """Trains the BIC by computing intra-personal and extra-personal subspaces.
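The change above, repeated across the algorithm files below, swaps a list comprehension for a generator expression inside numpy.vstack, so the intermediate Python list of per-pair rows is never materialized. A minimal sketch of the pattern (a caveat: later NumPy releases deprecated and eventually rejected generator input to the stacking functions, so the generator form relies on the NumPy version in use at the time of this commit):

import numpy

rows = [numpy.arange(3) for _ in range(4)]

# Before: the list comprehension materializes every transformed row in a
# Python list before stacking.
stacked_from_list = numpy.vstack([r * 2 for r in rows])

# After: a generator expression hands the rows to vstack lazily, avoiding
# the intermediate list (NumPy-version dependent, as noted above).
stacked_from_gen = numpy.vstack(r * 2 for r in rows)

assert numpy.array_equal(stacked_from_list, stacked_from_gen)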
bob/bio/base/algorithm/Distance.py

@@ -70,7 +70,7 @@ class Distance (Algorithm):
     assert len(enroll_features)
     [self._check_feature(feature) for feature in enroll_features]
     # just store all the features
-    return numpy.vstack([f.flatten() for f in enroll_features])
+    return numpy.vstack(f.flatten() for f in enroll_features)

   def score(self, model, probe):
bob/bio/base/algorithm/LDA.py

@@ -109,7 +109,7 @@ class LDA (Algorithm):
       if len(client_files) < 2:
         logger.warn("Skipping one client since the number of client files is only %d", len(client_files))
         continue
-      data.append(numpy.vstack([feature.flatten() for feature in client_files]))
+      data.append(numpy.vstack(feature.flatten() for feature in client_files))

     # Returns the list of lists of arrays
     return data

@@ -117,7 +117,7 @@ class LDA (Algorithm):
   def _train_pca(self, training_set):
     """Trains and returns a LinearMachine that is trained using PCA"""
-    data_list = [feature for client in training_set for feature in client]
+    data_list = (feature for client in training_set for feature in client)
     data = numpy.vstack(data_list)

     logger.info("  -> Training Linear Machine using PCA")

@@ -145,7 +145,7 @@ class LDA (Algorithm):
   def _perform_pca(self, machine, training_set):
     """Perform PCA on data of the training set"""
-    return [numpy.vstack([machine(feature) for feature in client_features]) for client_features in training_set]
+    return [numpy.vstack(machine(feature) for feature in client_features) for client_features in training_set]

   def train_projector(self, training_features, projector_file):
bob/bio/base/algorithm/PLDA.py

@@ -72,7 +72,7 @@ class PLDA (Algorithm):
   def _train_pca(self, training_set):
     """Trains and returns a LinearMachine that is trained using PCA"""
-    data = numpy.vstack([feature for feature in training_set])
+    data = numpy.vstack(feature for feature in training_set)

     logger.info("  -> Training LinearMachine using PCA ")
     trainer = bob.learn.linear.PCATrainer()

@@ -103,7 +103,7 @@ class PLDA (Algorithm):
       if len(client_files) < 2:
         logger.warn("Skipping one client since the number of client files is only %d", len(client_files))
         continue
-      data.append(numpy.vstack([feature.flatten() for feature in client_files]))
+      data.append(numpy.vstack(feature.flatten() for feature in client_files))

     # Returns the list of lists of arrays
     return data

@@ -179,7 +179,7 @@ class PLDA (Algorithm):
     In this base class implementation, it computes the scores for each probe file using the 'score' method,
     and fuses the scores using the fusion method specified in the constructor of this class."""
     if self.pca_machine is not None:
-      probes = [self.pca_machine(probe) for probe in probes]
+      probes = (self.pca_machine(probe) for probe in probes)
     # forward
     if self.score_set == 'joint_likelihood':
       return model.log_likelihood_ratio(numpy.vstack(probes))
bob/bio/base/script/fuse_scores.py

@@ -70,8 +70,8 @@ def main(command_line_options = None):
   import numpy

   trainer = bob.learn.linear.CGLogRegTrainer(0.5, args.convergence_threshold, args.max_iterations, mean_std_norm=not args.no_whitening)
-  data_neg = numpy.vstack([data[k][0] for k in range(n_systems)]).T
-  data_pos = numpy.vstack([data[k][1] for k in range(n_systems)]).T
+  data_neg = numpy.vstack(data[k][0] for k in range(n_systems)).T
+  data_pos = numpy.vstack(data[k][1] for k in range(n_systems)).T
   machine = trainer.train(data_neg, data_pos)

   # fuse development scores
bob/bio/base/test/test_utils.py

@@ -3,7 +3,7 @@ import bob.learn.linear
 import pkg_resources
 import os
 import numpy
+import nose
 import bob.io.base.test_utils

 from . import utils

@@ -84,6 +84,81 @@ def test_io():
     if os.path.exists(filename):
       os.remove(filename)

+
+def test_io_vstack():
+  paths = [1, 2, 3, 4, 5]
+
+  def oracle(reader, paths):
+    return numpy.vstack([reader(p) for p in paths])
+
+  def reader_same_size_C(path):
+    return numpy.arange(10).reshape(5, 2)
+
+  def reader_different_size_C(path):
+    return numpy.arange(2 * path).reshape(path, 2)
+
+  def reader_same_size_F(path):
+    return numpy.asfortranarray(numpy.arange(10).reshape(5, 2))
+
+  def reader_different_size_F(path):
+    return numpy.asfortranarray(numpy.arange(2 * path).reshape(path, 2))
+
+  def reader_same_size_C2(path):
+    return numpy.arange(30).reshape(5, 2, 3)
+
+  def reader_different_size_C2(path):
+    return numpy.arange(6 * path).reshape(path, 2, 3)
+
+  def reader_same_size_F2(path):
+    return numpy.asfortranarray(numpy.arange(30).reshape(5, 2, 3))
+
+  def reader_different_size_F2(path):
+    return numpy.asfortranarray(numpy.arange(6 * path).reshape(path, 2, 3))
+
+  def reader_wrong_size(path):
+    return numpy.arange(2 * path).reshape(2, path)
+
+  # test C and F readers
+  numpy.all(bob.bio.base.vstack_features(reader_different_size_C, paths, False) ==
+            oracle(reader_different_size_C, paths))
+  numpy.all(bob.bio.base.vstack_features(reader_different_size_F, paths, False) ==
+            oracle(reader_different_size_F, paths))
+
+  numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, False) ==
+            oracle(reader_same_size_C, paths))
+  numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, False) ==
+            oracle(reader_same_size_F, paths))
+
+  numpy.all(bob.bio.base.vstack_features(reader_same_size_C, paths, True) ==
+            oracle(reader_same_size_C, paths))
+  numpy.all(bob.bio.base.vstack_features(reader_same_size_F, paths, True) ==
+            oracle(reader_same_size_F, paths))
+
+  # test 3 dimensional readers
+  numpy.all(bob.bio.base.vstack_features(reader_different_size_C2, paths, False) ==
+            oracle(reader_different_size_C2, paths))
+  numpy.all(bob.bio.base.vstack_features(reader_different_size_F2, paths, False) ==
+            oracle(reader_different_size_F2, paths))
+
+  numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, False) ==
+            oracle(reader_same_size_C2, paths))
+  numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, False) ==
+            oracle(reader_same_size_F2, paths))
+
+  numpy.all(bob.bio.base.vstack_features(reader_same_size_C2, paths, True) ==
+            oracle(reader_same_size_C2, paths))
+  numpy.all(bob.bio.base.vstack_features(reader_same_size_F2, paths, True) ==
+            oracle(reader_same_size_F2, paths))
+
+  with nose.tools.assert_raises(AssertionError):
+    bob.bio.base.vstack_features(reader_wrong_size, paths)
+

 def test_sampling():
   # test selection of elements
   indices = bob.bio.base.selected_indices(100, 10)
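The new test checks vstack_features against a plain numpy.vstack oracle for C- and Fortran-ordered, 2- and 3-dimensional, same- and different-sized readers. Note that the numpy.all(...) comparisons are not wrapped in assert, so only the final assert_raises check can actually fail. To run just this test with nose (assuming the package is installed in development mode), an invocation like the following should work:

nosetests bob.bio.base.test.test_utils:test_io_vstack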
bob/bio/base/utils/io.py

 import os
-import tempfile, tarfile
+import tempfile
+import tarfile
+import collections  # this is needed for the sphinx documentation
+import functools  # this is needed for the sphinx documentation
 import numpy

 import logging
-logger = logging.getLogger("bob.bio.base")
+logger = logging.getLogger(__name__)

 from .. import database
 import bob.io.base


 def filter_missing_files(file_names, split_by_client=False, allow_missing_files=True):
   """This function filters out files that do not exist, but only if ``allow_missing_files`` is set to ``True``, otherwise the list of ``file_names`` is returned unaltered."""
@@ -15,8 +19,10 @@ def filter_missing_files(file_names, split_by_client=False, allow_missing_files=
   if split_by_client:
     # filter out missing files and empty clients
-    existing_files = [[f for f in client_files if os.path.exists(f)] for client_files in file_names]
-    existing_files = [client_files for client_files in existing_files if client_files]
+    existing_files = [
+        [f for f in client_files if os.path.exists(f)] for client_files in file_names]
+    existing_files = [
+        client_files for client_files in existing_files if client_files]
   else:
     # filter out missing files
     existing_files = [f for f in file_names if os.path.exists(f)]
@@ -28,17 +34,17 @@ def filter_none(data, split_by_client=False):
   if split_by_client:
     # filter out missing files and empty clients
-    existing_data = [[d for d in client_data if d is not None] for client_data in data]
-    existing_data = [client_data for client_data in existing_data if client_data]
+    existing_data = [[d for d in client_data if d is not None]
+                     for client_data in data]
+    existing_data = [client_data for client_data in existing_data if client_data]
   else:
     # filter out missing files
     existing_data = [d for d in data if d is not None]
   return existing_data


-def check_file(filename, force, expected_file_size = 1):
+def check_file(filename, force, expected_file_size=1):
   """Checks if the file with the given ``filename`` exists and has size greater or equal to ``expected_file_size``.
   If the file is too small, **or** if the ``force`` option is set to ``True``, the file is removed.
   This function returns ``True`` if the file exists (and has not been removed), otherwise ``False``"""
@@ -86,18 +92,20 @@ def load(file):
   else:
     return bob.io.base.load(file)


 def save(data, file, compression=0):
   """Saves the data to file using HDF5. The given file might be an HDF5 file open for writing, or a string.
   If the given data contains a ``save`` method, this method is called with the given HDF5 file.
   Otherwise the data is written to the HDF5 file using the given compression."""
-  f = file if isinstance(file, bob.io.base.HDF5File) else bob.io.base.HDF5File(file, 'w')
+  f = file if isinstance(file, bob.io.base.HDF5File) else bob.io.base.HDF5File(
+      file, 'w')
   if hasattr(data, 'save'):
     data.save(f)
   else:
     f.set("array", data, compression=compression)


-def open_compressed(filename, open_flag = 'r', compression_type='bz2'):
+def open_compressed(filename, open_flag='r', compression_type='bz2'):
   """Opens a compressed HDF5File with the given opening flags.
   For the 'r' flag, the given compressed file will be extracted to a local space.
   For 'w', an empty HDF5File is created.
@@ -108,7 +116,7 @@ def open_compressed(filename, open_flag = 'r', compression_type='bz2'):
   if open_flag == 'r':
     # extract the HDF5 file from the given file name into a temporary file name
-    tar = tarfile.open(filename, mode = "r:" + compression_type)
+    tar = tarfile.open(filename, mode="r:" + compression_type)
     memory_file = tar.extractfile(tar.next())
     real_file = open(hdf5_file_name, 'wb')
     real_file.write(memory_file.read())
@@ -130,13 +138,14 @@ def close_compressed(filename, hdf5_file, compression_type='bz2', create_link=Fa
   if is_writable:
     # create compressed tar file
-    tar = tarfile.open(filename, mode = "w:" + compression_type)
+    tar = tarfile.open(filename, mode="w:" + compression_type)
     tar.add(hdf5_file_name, os.path.basename(filename))
     tar.close()

   if create_link:
-    extension = {'': '.tar', 'bz2': '.tar.bz2', 'gz': 'tar.gz'}[compression_type]
-    link_file = filename + extension
+    extension = {'': '.tar', 'bz2': '.tar.bz2',
+                 'gz': 'tar.gz'}[compression_type]
+    link_file = filename + extension
     if not os.path.exists(link_file):
       os.symlink(os.path.basename(filename), link_file)
@@ -165,3 +174,132 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
   hdf5 = open_compressed(filename, 'w')
   save(data, hdf5)
   close_compressed(filename, hdf5, compression_type, create_link)
+
+
+def _generate_features(reader, paths):
+  """Loads and stacks features in a memory-efficient way. This function is
+  meant to be used inside :py:func:`vstack_features`.
+
+  Parameters
+  ----------
+  reader : collections.Callable
+      See the documentation of :py:func:`vstack_features`.
+  paths : collections.Iterable
+      See the documentation of :py:func:`vstack_features`.
+
+  Yields
+  ------
+  object
+      The first object returned is the :py:type:`numpy.dtype` of the
+      features. The second object returned is the shape of the first
+      feature. The rest of the objects are the actual values in the
+      features. The features are returned in C order.
+
+  Examples
+  --------
+  This function can be used with :py:func:`numpy.fromiter`:
+
+  >>> def reader(path):
+  ...     # in each file, there are 5 samples and features are 2 dimensional.
+  ...     return numpy.arange(10).reshape(5, 2)
+  >>> paths = ['path1', 'path2']
+  >>> iterator = _generate_features(reader, paths)
+  >>> dtype = next(iterator)
+  >>> dtype
+  dtype('int64')
+  >>> first_feature_shape = next(iterator)
+  >>> first_feature_shape
+  (5, 2)
+  >>> all_features_flat = numpy.fromiter(iterator, dtype)
+  >>> all_features_flat
+  array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+  >>> all_features = all_features_flat.reshape(-1, first_feature_shape[1])
+  >>> all_features
+  array([[0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9],
+         [0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9]])
+  >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths])
+  >>> assert numpy.allclose(all_features, all_features_with_more_memory)
+
+  You can allocate the array at once to improve the performance if you know
+  that all features in paths have the same shape and you know the total
+  number of the paths:
+
+  >>> iterator = _generate_features(reader, paths)
+  >>> dtype = next(iterator)
+  >>> first_feature_shape = next(iterator)
+  >>> total_size = len(paths) * numpy.prod(first_feature_shape)
+  >>> all_features_flat = numpy.fromiter(iterator, dtype, total_size)
+  >>> all_features = all_features_flat.reshape(-1, first_feature_shape[1])
+  >>> all_features
+  array([[0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9],
+         [0, 1],
+         [2, 3],
+         [4, 5],
+         [6, 7],
+         [8, 9]])
+  """
+  for i, path in enumerate(paths):
+    feature = numpy.atleast_2d(reader(path))
+    feature = numpy.ascontiguousarray(feature)
+    if i == 0:
+      dtype = feature.dtype
+      shape = list(feature.shape)
+      yield dtype
+      yield shape
+    else:
+      # make sure all features have the same shape[1:]
+      assert shape[1:] == list(feature.shape[1:])
+    for value in feature.flat:
+      yield value
+
+
+def vstack_features(reader, paths, same_size=False):
+  """Stacks all features in a memory-efficient way.
+
+  Parameters
+  ----------
+  reader : collections.Callable
+      The function to load the features. The function should only take one
+      argument: the path to the features. Use :py:type:`functools.partial`
+      to accommodate your reader to this format. The features returned by
+      ``reader`` are expected to have the same :py:type:`numpy.dtype` and
+      the same shape except for their first dimension. The first dimension
+      should correspond to the number of samples.
+  paths : collections.Iterable
+      An iterable of paths to iterate on. Whatever is inside path is given
+      to ``reader``. If ``same_size`` is ``True``, ``len(paths)`` must be
+      valid.
+  same_size : :obj:`bool`, optional
+      If ``True``, it assumes that arrays inside all the paths are the same
+      shape. If you know the features are the same size in all paths, set
+      this to ``True`` to improve the performance.
+
+  Returns
+  -------
+  numpy.ndarray
+      The read features with the shape (n_samples, *features_shape[1:]).
+  """
+  iterable = _generate_features(reader, paths)
+  dtype = next(iterable)
+  shape = next(iterable)
+  if same_size:
+    total_size = int(len(paths) * numpy.prod(shape))
+    all_features = numpy.fromiter(iterable, dtype, total_size)
+  else:
+    all_features = numpy.fromiter(iterable, dtype)
+
+  # the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 3).
+  shape = list(shape)
+  shape[0] = -1
+  return numpy.reshape(all_features, shape, order='C')
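The helper is exposed at the package level as bob.bio.base.vstack_features (that is how the new tests call it). A minimal usage sketch; the two-argument loader below is hypothetical and only illustrates the functools.partial adaptation the docstring recommends:

import functools

import numpy
import bob.bio.base

def load_scaled(path, scale):
  # hypothetical stand-in for a real loader such as bob.io.base.load(path);
  # returns 5 two-dimensional samples per "file" and ignores the path
  return numpy.arange(10, dtype=float).reshape(5, 2) * scale

# adapt the two-argument loader to the single-argument signature expected
reader = functools.partial(load_scaled, scale=0.5)

# the paths are dummies here; whatever they are is handed to the reader
features = bob.bio.base.vstack_features(reader, ['a.hdf5', 'b.hdf5'], same_size=True)
assert features.shape == (10, 2)  # 2 files x 5 samples, 2-dimensional features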
version.txt

-3.1.3b0
\ No newline at end of file
+3.2.0b0