42f23502
Commit
42f23502
authored
4 years ago
by
Amir MOHAMMADI
Browse files
Remove vstack_features -> use the one in bob.pipelines
Parent: c95c92d6
2 merge requests: !187 Remove vstack_features -> use the one in bob.pipelines, !180 [dask] Preparing bob.bio.base for dask pipelines
Pipeline #39681 passed 4 years ago (stage: build)
Showing 3 changed files with 0 additions and 261 deletions:

  bob/bio/base/test/test_utils.py   0 additions, 113 deletions
  bob/bio/base/utils/io.py          0 additions, 147 deletions
  doc/py_api.rst                    0 additions, 1 deletion
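The commit message points callers of the removed helper to bob.pipelines. Below is a minimal migration sketch, not part of this commit: the import path inside bob.pipelines is an assumption and may differ in your version, and the toy reader simply mirrors the example from the removed docstring.

    # Hypothetical migration sketch -- not part of this commit.
    # The import path below is an assumption; check where your version of
    # bob.pipelines exposes vstack_features.
    import numpy

    # before this commit:
    #   from bob.bio.base import vstack_features
    # after this commit (assumed location):
    from bob.pipelines.utils import vstack_features

    def reader(path):
        # toy reader from the removed docstring: 5 samples of 2D features per "path"
        return numpy.arange(10).reshape(5, 2)

    all_features = vstack_features(reader, ["path1", "path2"])
    assert all_features.shape == (10, 2)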
bob/bio/base/test/test_utils.py (+0 −113)
...
@@ -3,7 +3,6 @@ import bob.learn.linear
import pkg_resources
import os
import numpy
import nose
import bob.io.base.test_utils
from . import utils
...
@@ -67,118 +66,6 @@ def test_io():
    os.remove(filename)


def test_io_vstack():
    paths = [1, 2, 3, 4, 5]

    def oracle(reader, paths):
        return numpy.vstack([reader(p) for p in paths])

    def reader_same_size_C(path):
        return numpy.arange(10).reshape(5, 2)

    def reader_different_size_C(path):
        return numpy.arange(2 * path).reshape(path, 2)

    def reader_same_size_F(path):
        return numpy.asfortranarray(numpy.arange(10).reshape(5, 2))

    def reader_different_size_F(path):
        return numpy.asfortranarray(numpy.arange(2 * path).reshape(path, 2))

    def reader_same_size_C2(path):
        return numpy.arange(30).reshape(5, 2, 3)

    def reader_different_size_C2(path):
        return numpy.arange(6 * path).reshape(path, 2, 3)

    def reader_same_size_F2(path):
        return numpy.asfortranarray(numpy.arange(30).reshape(5, 2, 3))

    def reader_different_size_F2(path):
        return numpy.asfortranarray(numpy.arange(6 * path).reshape(path, 2, 3))

    def reader_wrong_size(path):
        return numpy.arange(2 * path).reshape(2, path)

    # when same_size is False
    for reader in [
        reader_different_size_C,
        reader_different_size_F,
        reader_same_size_C,
        reader_same_size_F,
        reader_different_size_C2,
        reader_different_size_F2,
        reader_same_size_C2,
        reader_same_size_F2,
    ]:
        numpy.all(bob.bio.base.vstack_features(reader, paths) == oracle(reader, paths))

    # when same_size is True
    for reader in [
        reader_same_size_C,
        reader_same_size_F,
        reader_same_size_C2,
        reader_same_size_F2,
    ]:
        numpy.all(bob.bio.base.vstack_features(reader, paths, True) == oracle(reader, paths))

    with nose.tools.assert_raises(AssertionError):
        bob.bio.base.vstack_features(reader_wrong_size, paths)

    # test actual files
    paths = [bob.io.base.test_utils.temporary_filename(),
             bob.io.base.test_utils.temporary_filename(),
             bob.io.base.test_utils.temporary_filename()]
    try:
        # try different readers:
        for reader in [
            reader_different_size_C,
            reader_different_size_F,
            reader_same_size_C,
            reader_same_size_F,
            reader_different_size_C2,
            reader_different_size_F2,
            reader_same_size_C2,
            reader_same_size_F2,
        ]:
            # save some data in files
            for i, path in enumerate(paths):
                bob.bio.base.save(reader(i + 1), path)
            # test when all data is present
            reference = oracle(bob.bio.base.load, paths)
            numpy.all(bob.bio.base.vstack_features(bob.bio.base.load, paths) == reference)
            # delete the first one
            os.remove(paths[0])
            reference = oracle(bob.bio.base.load, paths[1:])
            target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False, True)
            numpy.all(target == reference)
            # save back first one and delete second one
            bob.bio.base.save(reader(1), paths[0])
            os.remove(paths[1])
            reference = oracle(bob.bio.base.load, paths[:1] + paths[2:])
            target = bob.bio.base.vstack_features(bob.bio.base.load, paths, False, True)
            numpy.all(target == reference)
            # Check if RuntimeError is raised when one of the files is missing and
            # allow_missing_files is False
            with nose.tools.assert_raises(RuntimeError):
                bob.bio.base.vstack_features(bob.bio.base.load, paths)
            # Check if ValueError is raised.
            with nose.tools.assert_raises(ValueError):
                bob.bio.base.vstack_features(bob.bio.base.load, paths, True, True)
    finally:
        try:
            for path in paths:
                os.remove(path)
        except Exception:
            pass


def test_sampling():
    # test selection of elements
    indices = bob.bio.base.selected_indices(100, 10)
...
bob/bio/base/utils/io.py (+0 −147)
...
@@ -173,150 +173,3 @@ def save_compressed(data, filename, compression_type='bz2', create_link=False):
    hdf5 = open_compressed(filename, 'w')
    save(data, hdf5)
    close_compressed(filename, hdf5, compression_type, create_link)


def _generate_features(reader, paths, same_size=False, allow_missing_files=False):
    """Load and stack features in a memory efficient way. This function is meant
    to be used inside :py:func:`vstack_features`.

    Parameters
    ----------
    reader : ``collections.Callable``
        See the documentation of :py:func:`vstack_features`.
    paths : ``collections.Iterable``
        See the documentation of :py:func:`vstack_features`.
    same_size : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.
    allow_missing_files : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.

    Yields
    ------
    object
        The first object returned is a tuple of :py:class:`numpy.dtype` of
        features and the shape of the first feature. The rest of the objects are
        the actual values in features. The features are returned in C order.
    """
    shape_determined = False
    for i, path in enumerate(paths):
        if allow_missing_files and not os.path.isfile(path):
            logger.debug("... File %s, that does not exist, has been ignored.", path)
            continue

        feature = numpy.atleast_2d(reader(path))
        feature = numpy.ascontiguousarray(feature)
        if not shape_determined:
            shape_determined = True
            dtype = feature.dtype
            shape = list(feature.shape)
            yield (dtype, shape)
        else:
            # make sure all features have the same shape and dtype
            if same_size:
                assert shape == list(feature.shape)
            else:
                assert shape[1:] == list(feature.shape[1:])
            assert dtype == feature.dtype

        for value in feature.flat:
            yield value


def vstack_features(reader, paths, same_size=False, allow_missing_files=False):
    """Stacks all features in a memory efficient way.

    Parameters
    ----------
    reader : ``collections.Callable``
        The function to load the features. The function should only take one
        argument, being the path to the features. Use :any:`functools.partial`
        to accommodate your reader to this format. The features returned by
        ``reader`` are expected to have the same :py:class:`numpy.dtype` and the
        same shape except for their first dimension. The first dimension should
        correspond to the number of samples.
    paths : ``collections.Iterable``
        An iterable of paths to iterate on. Whatever is inside path is given to
        ``reader``, so they do not necessarily need to be paths to actual files.
        If ``same_size`` is ``True``, ``len(paths)`` must be valid.
    same_size : :obj:`bool`, optional
        If ``True``, it assumes that arrays inside all the paths are the same
        shape. If you know the features are the same size in all paths, set this
        to ``True`` to improve the performance.
    allow_missing_files : :obj:`bool`, optional
        If ``True``, it assumes that the items inside paths are actual files and
        ignores the ones that do not exist.

    Returns
    -------
    numpy.ndarray
        The read features with the shape (n_samples, \*features_shape[1:]).

    Raises
    ------
    ValueError
        If both ``same_size`` and ``allow_missing_files`` are ``True``.

    Examples
    --------
    In a simple way, this function is equivalent to calling
    ``numpy.vstack(reader(p) for p in paths)``.

    >>> import numpy
    >>> from bob.bio.base import vstack_features
    >>> def reader(path):
    ...     # in each file, there are 5 samples and features are 2 dimensional.
    ...     return numpy.arange(10).reshape(5, 2)
    >>> paths = ['path1', 'path2']
    >>> all_features = vstack_features(reader, paths)
    >>> all_features
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9],
           [0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> all_features_with_more_memory = numpy.vstack(reader(p) for p in paths)
    >>> numpy.allclose(all_features, all_features_with_more_memory)
    True

    You can allocate the array at once to improve the performance if you know
    that all features in paths have the same shape and you know the total number
    of the paths:

    >>> vstack_features(reader, paths, same_size=True)
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9],
           [0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    """
    if same_size and allow_missing_files:
        raise ValueError(
            "Both same_size and allow_missing_files cannot be True at "
            "the same time.")
    iterable = _generate_features(reader, paths, same_size, allow_missing_files)
    try:
        dtype, shape = next(iterable)
    except StopIteration:
        return numpy.array([])
    if same_size:
        total_size = int(len(paths) * numpy.prod(shape))
        all_features = numpy.fromiter(iterable, dtype, total_size)
    else:
        all_features = numpy.fromiter(iterable, dtype)

    # the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 4).
    shape = list(shape)
    shape[0] = -1
    return numpy.reshape(all_features, shape, order='C')
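The removed docstring recommends functools.partial for readers that take more than a path argument. The short, self-contained sketch below illustrates that adaptation; load_with_dtype is a hypothetical loader (not from this repository), and numpy.vstack is used as the plain, non-streaming equivalent the docstring describes.

    import functools
    import numpy

    def load_with_dtype(path, dtype):
        # hypothetical loader that needs an extra argument besides the path
        return numpy.arange(10, dtype=dtype).reshape(5, 2)

    # bind the extra argument so the reader matches the expected one-argument form
    reader = functools.partial(load_with_dtype, dtype="float64")

    # equivalent, non-streaming stacking, as described in the removed docstring
    stacked = numpy.vstack([reader(p) for p in ["path1", "path2"]])
    assert stacked.shape == (10, 2)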
doc/py_api.rst (+0 −1)
...
@@ -17,7 +17,6 @@ IO-related functions
   bob.bio.base.open_compressed
   bob.bio.base.close_compressed
   bob.bio.base.check_file
   bob.bio.base.vstack_features

Pipelines
...