Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
bob.bio.base
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
bob
bob.bio.base
Commits
3f78e620
There was a problem fetching the pipeline summary.
Commit
3f78e620
authored
7 years ago
by
Amir MOHAMMADI
Browse files
Options
Downloads
Patches
Plain Diff
improve allow_missing_files option and add tests
parent
68412583
No related branches found
No related tags found
1 merge request
!103
Propagated the option --allow-missing-files .....
Pipeline
#
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
bob/bio/base/test/test_utils.py
+73
-35
73 additions, 35 deletions
bob/bio/base/test/test_utils.py
bob/bio/base/utils/io.py
+36
-19
36 additions, 19 deletions
bob/bio/base/utils/io.py
with
109 additions
and
54 deletions
bob/bio/base/test/test_utils.py
+
73
−
35
View file @
3f78e620
...
@@ -119,45 +119,83 @@ def test_io_vstack():
...
@@ -119,45 +119,83 @@ def test_io_vstack():
def
reader_wrong_size
(
path
):
def
reader_wrong_size
(
path
):
return
numpy
.
arange
(
2
*
path
).
reshape
(
2
,
path
)
return
numpy
.
arange
(
2
*
path
).
reshape
(
2
,
path
)
# test C and F readers
# when same_size is False
Tiago de Freitas Pereira
@tiago.pereira
·
Oct 19, 2017
Owner
Awesome, thanks for adding that.
Awesome, thanks for adding that.
Please
register
or
sign in
to reply
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_different_size_C
,
for
reader
in
[
paths
,
False
)
==
reader_different_size_C
,
oracle
(
reader_different_size_C
,
paths
))
reader_different_size_F
,
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_different_size_F
,
reader_same_size_C
,
paths
,
False
)
==
reader_same_size_F
,
oracle
(
reader_different_size_F
,
paths
))
reader_different_size_C2
,
reader_different_size_F2
,
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_same_size_C
,
paths
,
False
)
==
reader_same_size_C2
,
oracle
(
reader_same_size_C
,
paths
))
reader_same_size_F2
,
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_same_size_F
,
paths
,
False
)
==
]:
oracle
(
reader_same_size_F
,
paths
))
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader
,
paths
)
==
oracle
(
reader
,
paths
))
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_same_size_C
,
paths
,
True
)
==
oracle
(
reader_same_size_C
,
paths
))
# when same_size is True
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_same_size_F
,
paths
,
True
)
==
for
reader
in
[
oracle
(
reader_same_size_F
,
paths
))
reader_same_size_C
,
reader_same_size_F
,
# test 3 dimensional readers
reader_same_size_C2
,
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_different_size_C2
,
reader_same_size_F2
,
paths
,
False
)
==
]:
oracle
(
reader_different_size_C2
,
paths
))
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader
,
paths
,
True
)
==
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_different_size_F2
,
oracle
(
reader
,
paths
))
paths
,
False
)
==
oracle
(
reader_different_size_F2
,
paths
))
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_same_size_C2
,
paths
,
False
)
==
oracle
(
reader_same_size_C2
,
paths
))
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_same_size_F2
,
paths
,
False
)
==
oracle
(
reader_same_size_F2
,
paths
))
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_same_size_C2
,
paths
,
True
)
==
oracle
(
reader_same_size_C2
,
paths
))
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
reader_same_size_F2
,
paths
,
True
)
==
oracle
(
reader_same_size_F2
,
paths
))
with
nose
.
tools
.
assert_raises
(
AssertionError
):
with
nose
.
tools
.
assert_raises
(
AssertionError
):
bob
.
bio
.
base
.
vstack_features
(
reader_wrong_size
,
paths
)
bob
.
bio
.
base
.
vstack_features
(
reader_wrong_size
,
paths
)
# test actual files
paths
=
[
bob
.
io
.
base
.
test_utils
.
temporary_filename
(),
bob
.
io
.
base
.
test_utils
.
temporary_filename
(),
bob
.
io
.
base
.
test_utils
.
temporary_filename
()]
try
:
# try different readers:
for
reader
in
[
reader_different_size_C
,
reader_different_size_F
,
reader_same_size_C
,
reader_same_size_F
,
reader_different_size_C2
,
reader_different_size_F2
,
reader_same_size_C2
,
reader_same_size_F2
,
]:
# save some data in files
for
i
,
path
in
enumerate
(
paths
):
bob
.
bio
.
base
.
save
(
reader
(
i
+
1
),
path
)
# test when all data is present
reference
=
oracle
(
bob
.
bio
.
base
.
load
,
paths
)
numpy
.
all
(
bob
.
bio
.
base
.
vstack_features
(
bob
.
bio
.
base
.
load
,
paths
)
==
reference
)
# delete the first one
os
.
remove
(
paths
[
0
])
reference
=
oracle
(
bob
.
bio
.
base
.
load
,
paths
[
1
:])
target
=
bob
.
bio
.
base
.
vstack_features
(
bob
.
bio
.
base
.
load
,
paths
,
False
,
True
)
numpy
.
all
(
target
==
reference
)
# save back first one and delete second one
bob
.
bio
.
base
.
save
(
reader
(
1
),
paths
[
0
])
os
.
remove
(
paths
[
1
])
reference
=
oracle
(
bob
.
bio
.
base
.
load
,
paths
[:
1
]
+
paths
[
2
:])
target
=
bob
.
bio
.
base
.
vstack_features
(
bob
.
bio
.
base
.
load
,
paths
,
False
,
True
)
numpy
.
all
(
target
==
reference
)
# Check if RuntimeError is raised when one of the files is missing and
# allow_missing_files is False
with
nose
.
tools
.
assert_raises
(
RuntimeError
):
bob
.
bio
.
base
.
vstack_features
(
bob
.
bio
.
base
.
load
,
paths
)
# Check if ValueError is raised.
with
nose
.
tools
.
assert_raises
(
ValueError
):
bob
.
bio
.
base
.
vstack_features
(
bob
.
bio
.
base
.
load
,
paths
,
True
,
True
)
finally
:
try
:
for
path
in
paths
:
os
.
remove
(
path
)
except
Exception
:
pass
def
test_sampling
():
def
test_sampling
():
# test selection of elements
# test selection of elements
...
...
This diff is collapsed.
Click to expand it.
bob/bio/base/utils/io.py
+
36
−
19
View file @
3f78e620
...
@@ -175,9 +175,10 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
...
@@ -175,9 +175,10 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
close_compressed
(
filename
,
hdf5
,
compression_type
,
create_link
)
close_compressed
(
filename
,
hdf5
,
compression_type
,
create_link
)
def
_generate_features
(
reader
,
paths
,
allow_missing_files
=
False
):
def
_generate_features
(
reader
,
paths
,
same_size
=
False
,
"""
Load and stack features a memory efficient way. This function is meant to
allow_missing_files
=
False
):
be used inside :py:func:`vstack_features`.
"""
Load and stack features in a memory efficient way. This function is meant
to be used inside :py:func:`vstack_features`.
Parameters
Parameters
----------
----------
...
@@ -185,8 +186,10 @@ def _generate_features(reader, paths, allow_missing_files=False):
...
@@ -185,8 +186,10 @@ def _generate_features(reader, paths, allow_missing_files=False):
See the documentation of :py:func:`vstack_features`.
See the documentation of :py:func:`vstack_features`.
paths : ``collections.Iterable``
paths : ``collections.Iterable``
See the documentation of :py:func:`vstack_features`.
See the documentation of :py:func:`vstack_features`.
same_size : bool, optional
See the documentation of :py:func:`vstack_features`.
allow_missing_files : :obj:`bool`, optional
allow_missing_files : :obj:`bool`, optional
If ``True``, it ignores files that doesn
'
t exists
See the documentation of :py:func:`vstack_features`.
Yields
Yields
------
------
...
@@ -195,23 +198,26 @@ def _generate_features(reader, paths, allow_missing_files=False):
...
@@ -195,23 +198,26 @@ def _generate_features(reader, paths, allow_missing_files=False):
features and the shape of the first feature. The rest of objects are
features and the shape of the first feature. The rest of objects are
the actual values in features. The features are returned in C order.
the actual values in features. The features are returned in C order.
"""
"""
shape_
check
=
False
shape_
determined
=
False
for
i
,
path
in
enumerate
(
paths
):
for
i
,
path
in
enumerate
(
paths
):
if
allow_missing_files
and
not
os
.
path
.
isfile
(
path
):
if
allow_missing_files
and
not
os
.
path
.
isfile
(
path
):
logger
.
debug
(
"
...
The f
ile
{0}
, that does not exist, has been ignored
.
"
.
format
(
path
)
)
logger
.
debug
(
"
...
F
ile
%s
, that does not exist, has been ignored
.
"
,
path
)
continue
continue
feature
=
numpy
.
atleast_2d
(
reader
(
path
))
feature
=
numpy
.
atleast_2d
(
reader
(
path
))
feature
=
numpy
.
ascontiguousarray
(
feature
)
feature
=
numpy
.
ascontiguousarray
(
feature
)
if
not
shape_
check
:
if
not
shape_
determined
:
shape_
check
=
True
shape_
determined
=
True
dtype
=
feature
.
dtype
dtype
=
feature
.
dtype
shape
=
list
(
feature
.
shape
)
shape
=
list
(
feature
.
shape
)
yield
(
dtype
,
shape
)
yield
(
dtype
,
shape
)
else
:
else
:
# make sure all features have the same shape[1:] and dtype
# make sure all features have the same shape and dtype
assert
shape
[
1
:]
==
list
(
feature
.
shape
[
1
:])
if
same_size
:
assert
shape
==
list
(
feature
.
shape
)
else
:
assert
shape
[
1
:]
==
list
(
feature
.
shape
[
1
:])
assert
dtype
==
feature
.
dtype
assert
dtype
==
feature
.
dtype
for
value
in
feature
.
flat
:
for
value
in
feature
.
flat
:
...
@@ -232,23 +238,29 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
...
@@ -232,23 +238,29 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
dimension. First dimension should correspond to the number of samples.
dimension. First dimension should correspond to the number of samples.
paths : ``collections.Iterable``
paths : ``collections.Iterable``
An iterable of paths to iterate on. Whatever is inside path is given to
An iterable of paths to iterate on. Whatever is inside path is given to
``reader``. If ``same_size`` is ``True``, ``len(paths)`` must be valid.
``reader`` so they do not need to be necessarily paths to actual files.
If ``same_size`` is ``True``, ``len(paths)`` must be valid.
same_size : :obj:`bool`, optional
same_size : :obj:`bool`, optional
If ``True``, it assumes that arrays inside all the paths are the same
If ``True``, it assumes that arrays inside all the paths are the same
shape. If you know the features are the same size in all paths, set this
shape. If you know the features are the same size in all paths, set this
to ``True`` to improve the performance.
to ``True`` to improve the performance.
allow_missing_files : :obj:`bool`, optional
allow_missing_files : :obj:`bool`, optional
If ``True``, it
ignores files that doesn
'
t exists
If ``True``, it
assumes that the items inside paths are actual files and
ignores the ones that do not exist.
Returns
Returns
-------
-------
numpy.ndarray
numpy.ndarray
The read features with the shape (n_samples, \*features_shape[1:]).
The read features with the shape (n_samples, \*features_shape[1:]).
Raises
------
ValueError
If both same_size and allow_missing_files are ``True``.
Examples
Examples
--------
--------
This function is equivalent to calling
This function
in a simple way
is equivalent to calling
``numpy.vstack(reader(p) for p in paths)``.
``numpy.vstack(reader(p) for p in paths)``.
>>>
import
numpy
>>>
import
numpy
...
@@ -288,8 +300,13 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
...
@@ -288,8 +300,13 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
[
4
,
5
],
[
4
,
5
],
[
6
,
7
],
[
6
,
7
],
[
8
,
9
]])
[
8
,
9
]])
"""
"""
iterable
=
_generate_features
(
reader
,
paths
,
allow_missing_files
=
allow_missing_files
)
if
same_size
and
allow_missing_files
:
raise
ValueError
(
"
Both same_size and allow_missing_files cannot be True at
"
"
the same time.
"
)
iterable
=
_generate_features
(
reader
,
paths
,
allow_missing_files
=
allow_missing_files
)
dtype
,
shape
=
next
(
iterable
)
dtype
,
shape
=
next
(
iterable
)
if
same_size
:
if
same_size
:
total_size
=
int
(
len
(
paths
)
*
numpy
.
prod
(
shape
))
total_size
=
int
(
len
(
paths
)
*
numpy
.
prod
(
shape
))
...
@@ -297,7 +314,7 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
...
@@ -297,7 +314,7 @@ def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
else
:
else
:
all_features
=
numpy
.
fromiter
(
iterable
,
dtype
)
all_features
=
numpy
.
fromiter
(
iterable
,
dtype
)
# the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3,
3
).
# the shape is assumed to be (n_samples, ...) it can be (5, 2) or (5, 3,
4
).
shape
=
list
(
shape
)
shape
=
list
(
shape
)
shape
[
0
]
=
-
1
shape
[
0
]
=
-
1
return
numpy
.
reshape
(
all_features
,
shape
,
order
=
'
C
'
)
return
numpy
.
reshape
(
all_features
,
shape
,
order
=
'
C
'
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment