Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bob
bob.learn.tensorflow
Commits
5884afff
Commit
5884afff
authored
Sep 07, 2016
by
Tiago de Freitas Pereira
Browse files
Added prefetching and text loading
parent
65a91493
Changes
11
Hide whitespace changes
Inline
Side-by-side
bob/learn/tensorflow/data/BaseDataShuffler.py
0 → 100644
View file @
5884afff
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import
numpy
import
tensorflow
as
tf
class BaseDataShuffler(object):
    """
    Base class providing functionality to shuffle a dataset and split it
    into a train and a validation partition.

    **Parameters**

    data:
        Input samples (indexable along the first axis).
    labels:
        Label of each sample; assumed to be integers from 0 to N-1
        (see TODO below — this is not verified).
    input_shape:
        Shape of one sample as a list, e.g. ``[width, height, channels]``.
    perc_train:
        Fraction of the samples assigned to the training partition.
    scale:
        If True, subclasses scale the data by ``scale_value``.
    train_batch_size:
        Number of samples per training batch.
    validation_batch_size:
        Number of samples per validation batch.
    """

    def __init__(self, data, labels,
                 input_shape,
                 perc_train=0.9,
                 scale=True,
                 train_batch_size=1,
                 validation_batch_size=300):

        self.scale = scale
        self.scale_value = 0.00390625  # 1/256: maps 8-bit pixel values to [0, 1)

        # TODO: Check if the batch size is higher than the input data
        self.train_batch_size = train_batch_size
        self.validation_batch_size = validation_batch_size

        self.data = data
        # Batch-shaped tuples: (batch_size, *input_shape)
        self.train_shape = tuple([train_batch_size] + input_shape)
        self.validation_shape = tuple([validation_batch_size] + input_shape)

        # TODO: Check if the labels go from 0 to N-1
        self.labels = labels
        self.total_labels = max(labels) + 1

        # Splitting in train and validation
        self.n_samples = len(self.labels)
        self.n_train_samples = int(round(self.n_samples * perc_train))
        self.n_validation_samples = self.n_samples - self.n_train_samples

        # Shuffling all the indexes once; subclasses use them to split the data
        self.indexes = numpy.arange(self.n_samples)
        numpy.random.shuffle(self.indexes)

    def get_placeholders_forprefetch(self, name="", train_dataset=True):
        """
        Returns data/label placeholders whose batch dimension is left
        undefined (``None``), suitable for feeding a prefetch queue.

        **Parameters**

        name: Name given to the data placeholder.
        train_dataset: If True use the train shape, else the validation shape.

        **Return**
        data, labels placeholders
        """
        shape = self.train_shape if train_dataset else self.validation_shape
        data = tf.placeholder(tf.float32,
                              shape=tuple([None] + list(shape[1:])),
                              name=name)
        labels = tf.placeholder(tf.int64, shape=[None, ])
        return data, labels

    def get_placeholders(self, name="", train_dataset=True):
        """
        Returns data/label placeholders with a fixed batch dimension.

        **Parameters**

        name: Name given to the data placeholder.
        train_dataset: If True use the train shape, else the validation shape.

        **Return**
        data, labels placeholders
        """
        shape = self.train_shape if train_dataset else self.validation_shape
        data = tf.placeholder(tf.float32, shape=shape, name=name)
        labels = tf.placeholder(tf.int64, shape=shape[0])
        return data, labels
bob/learn/tensorflow/data/DataShuffler.py
→
bob/learn/tensorflow/data/
Memory
DataShuffler.py
View file @
5884afff
...
...
@@ -6,6 +6,8 @@
import
numpy
import
tensorflow
as
tf
from
.BaseDataShuffler
import
BaseDataShuffler
def scale_mean_norm(data, scale=0.00390625):
    """Center *data* on its global mean and multiply by *scale*.

    Returns a tuple ``(normalized_data, mean)`` so the caller can later
    apply the very same mean to another data split.
    """
    center = numpy.mean(data)
    normalized = (data - center) * scale
    return normalized, center
class
DataShuffler
(
object
):
def
__init__
(
self
,
data
,
labels
,
perc_train
=
0.9
,
scale
=
True
,
train_batch_size
=
1
,
validation_batch_size
=
300
):
"""
The class provide some functionalities for shuffling data
**Parameters**
data:
"""
self
.
perc_train
=
perc_train
self
.
scale
=
scale
self
.
scale_value
=
0.00390625
self
.
train_batch_size
=
train_batch_size
self
.
validation_batch_size
=
validation_batch_size
self
.
data
=
data
self
.
labels
=
labels
# From O to N-1
self
.
total_labels
=
max
(
labels
)
+
1
self
.
n_samples
=
self
.
data
.
shape
[
0
]
self
.
width
=
self
.
data
.
shape
[
1
]
self
.
height
=
self
.
data
.
shape
[
2
]
self
.
channels
=
self
.
data
.
shape
[
3
]
self
.
start_shuffler
()
def
get_placeholders
(
self
,
name
=
""
,
train_dataset
=
True
):
"""
"""
batch
=
self
.
train_batch_size
if
train_dataset
else
self
.
validation_batch_size
data
=
tf
.
placeholder
(
tf
.
float32
,
shape
=
(
batch
,
self
.
width
,
self
.
height
,
self
.
channels
),
name
=
name
)
labels
=
tf
.
placeholder
(
tf
.
int64
,
shape
=
batch
)
return
data
,
labels
def
start_shuffler
(
self
):
class MemoryDataShuffler(BaseDataShuffler):
    """
    Shuffler that deals with datasets held entirely in memory.

    **Parameters**

    data:
        4D numpy array of samples (n_samples, width, height, channels).
    labels:
        Label of each sample.
    input_shape:
        Shape of one sample as a list.
    perc_train:
        Fraction of samples used for training.
    scale:
        If True, mean-center and scale the data.
    train_batch_size:
        Number of samples per training batch.
    validation_batch_size:
        Number of samples per validation batch.
    """

    def __init__(self, data, labels,
                 input_shape,
                 perc_train=0.9,
                 scale=True,
                 train_batch_size=1,
                 validation_batch_size=300):

        super(MemoryDataShuffler, self).__init__(
            data=data,
            labels=labels,
            input_shape=input_shape,
            perc_train=perc_train,
            scale=scale,
            train_batch_size=train_batch_size,
            validation_batch_size=validation_batch_size
        )

        # Splitting between train and test using the indexes shuffled
        # by the base class
        train_idx = self.indexes[0:self.n_train_samples]
        val_idx = self.indexes[self.n_train_samples:
                               self.n_train_samples + self.n_validation_samples]

        self.train_data = self.data[train_idx, ...]
        self.train_labels = self.labels[train_idx]
        self.validation_data = self.data[val_idx, ...]
        self.validation_labels = self.labels[val_idx]

        if self.scale:
            # data = scale_minmax_norm(data,lower_bound = -1, upper_bound = 1)
            # The train mean is reused on the validation split so both are
            # normalized consistently.
            self.train_data, self.mean = scale_mean_norm(self.train_data)
            self.validation_data = (self.validation_data - self.mean) * self.scale_value
...
...
@@ -80,13 +55,10 @@ class DataShuffler(object):
if
train_dataset
:
n_samples
=
self
.
train_batch_size
else
:
n_samples
=
self
.
validation_batch_size
if
train_dataset
:
data
=
self
.
train_data
label
=
self
.
train_labels
else
:
n_samples
=
self
.
validation_batch_size
data
=
self
.
validation_data
label
=
self
.
validation_labels
...
...
bob/learn/tensorflow/data/MemoryPairDataShuffler.py
0 → 100644
View file @
5884afff
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import
numpy
from
.MemoryDataShuffler
import
MemoryDataShuffler
class MemoryPairDataShuffler(MemoryDataShuffler):
    """
    Memory data shuffler that provides pairs (for siamese training) and
    triplets (for triplet-loss training) of samples.

    **Parameters**

    data:
        4D numpy array of samples (n_samples, width, height, channels).
    labels:
        Label of each sample.
    input_shape:
        Shape of one sample as a list.
    perc_train:
        Fraction of samples used for training.
    scale:
        If True, mean-center and scale the data.
    train_batch_size:
        Number of PAIRS per training batch (the underlying batch is doubled).
    validation_batch_size:
        Number of samples per validation batch.
    """

    def __init__(self, data, labels,
                 input_shape,
                 perc_train=0.9,
                 scale=True,
                 train_batch_size=1,
                 validation_batch_size=300):
        # NOTE: the original code re-assigned every argument to itself
        # (e.g. ``data = data``); those no-op statements were removed.
        # The train batch size is doubled because each element of a pair
        # consumes one slot of the batch.
        super(MemoryPairDataShuffler, self).__init__(
            data, labels,
            input_shape=input_shape,
            perc_train=perc_train,
            scale=scale,
            train_batch_size=train_batch_size * 2,
            validation_batch_size=validation_batch_size
        )

    def get_pair(self, train_dataset=True, zero_one_labels=True):
        """
        Get a random batch of pairs of samples.

        **Parameters**

        train_dataset: Defining the target set to get the batch
        zero_one_labels: If True, genuine pairs are labeled 0 and impostor
            pairs 1; otherwise -1 (genuine) / +1 (impostor).

        **Return**
        data, data_p, labels_siamese
        """

        def get_genuine_or_not(input_data, input_labels, genuine=True):
            # Draw one pair: same client when genuine, two clients otherwise.
            if genuine:
                # TODO: THIS KEY SELECTION NEEDS TO BE MORE EFFICIENT
                # Getting a client
                index = numpy.random.randint(self.total_labels)

                # Getting the indexes of the data from a particular client
                indexes = numpy.where(input_labels == index)[0]
                numpy.random.shuffle(indexes)

                # Picking a pair
                data = input_data[indexes[0], ...]
                data_p = input_data[indexes[1], ...]
            else:
                # Picking a pair from different clients
                index = numpy.random.choice(self.total_labels, 2, replace=False)

                # Getting the indexes of the two clients
                indexes = numpy.where(input_labels == index[0])[0]
                indexes_p = numpy.where(input_labels == index[1])[0]
                numpy.random.shuffle(indexes)
                numpy.random.shuffle(indexes_p)

                # Picking a pair
                data = input_data[indexes[0], ...]
                data_p = input_data[indexes_p[0], ...]

            return data, data_p

        if train_dataset:
            target_data = self.train_data
            target_labels = self.train_labels
            shape = self.train_shape
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels
            shape = self.validation_shape

        data = numpy.zeros(shape=shape, dtype='float32')
        data_p = numpy.zeros(shape=shape, dtype='float32')
        labels_siamese = numpy.zeros(shape=shape[0], dtype='float32')

        # Alternate genuine / impostor pairs across the batch
        genuine = True
        for i in range(shape[0]):
            data[i, ...], data_p[i, ...] = get_genuine_or_not(
                target_data, target_labels, genuine=genuine)
            if zero_one_labels:
                labels_siamese[i] = not genuine
            else:
                labels_siamese[i] = -1 if genuine else +1
            genuine = not genuine

        return data, data_p, labels_siamese

    def get_triplet(self, n_labels, n_triplets=1, is_target_set_train=True):
        """
        Get a random batch of triplets (anchor, positive, negative).

        **Parameters**

        n_labels: Number of distinct labels to sample clients from.
        n_triplets: Number of triplets in the batch.
        is_target_set_train: Defining the target set to get the batch

        **Return**
        data_a, data_p, data_n, labels_a, labels_p, labels_n
        """

        def get_one_triplet(input_data, input_labels):
            # Getting a pair of clients
            index = numpy.random.choice(n_labels, 2, replace=False)
            label_positive = index[0]
            label_negative = index[1]

            # Getting the indexes of the data from a particular client
            indexes = numpy.where(input_labels == index[0])[0]
            numpy.random.shuffle(indexes)

            # Picking a positive pair (anchor and positive share a client)
            data_anchor = input_data[indexes[0], :, :, :]
            data_positive = input_data[indexes[1], :, :, :]

            # Picking a negative sample
            indexes = numpy.where(input_labels == index[1])[0]
            numpy.random.shuffle(indexes)
            data_negative = input_data[indexes[0], :, :, :]

            # label_positive is returned twice on purpose: anchor and
            # positive carry the same label.
            return (data_anchor, data_positive, data_negative,
                    label_positive, label_positive, label_negative)

        if is_target_set_train:
            target_data = self.train_data
            target_labels = self.train_labels
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels

        c = target_data.shape[3]
        w = target_data.shape[1]
        h = target_data.shape[2]

        data_a = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_p = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_n = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        labels_a = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_p = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_n = numpy.zeros(shape=n_triplets, dtype='float32')

        for i in range(n_triplets):
            data_a[i, :, :, :], data_p[i, :, :, :], data_n[i, :, :, :], \
                labels_a[i], labels_p[i], labels_n[i] = \
                get_one_triplet(target_data, target_labels)

        return data_a, data_p, data_n, labels_a, labels_p, labels_n
bob/learn/tensorflow/data/TextDataShuffler.py
0 → 100644
View file @
5884afff
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import
numpy
import
bob.io.base
import
bob.io.image
import
tensorflow
as
tf
from
.BaseDataShuffler
import
BaseDataShuffler
#def scale_mean_norm(data, scale=0.00390625):
# mean = numpy.mean(data)
# data = (data - mean) * scale
# return data, mean
class TextDataShuffler(BaseDataShuffler):
    """
    Shuffler that deals with a file list: each sample is a file name and
    the actual image is loaded with ``bob.io.base`` at batch time.

    **Parameters**

    data:
        File names of the samples (list or numpy array of strings).
    labels:
        Label of each sample.
    input_shape:
        Shape of one sample as a list.
    perc_train:
        Fraction of samples used for training.
    scale:
        Multiplicative factor applied to each loaded sample, or None to
        skip scaling.
        NOTE(review): the sibling shufflers default ``scale`` to True and
        treat it as a flag, while here it defaults to a number used as a
        factor — confirm this asymmetry is intentional.
    train_batch_size:
        Number of samples per training batch.
    validation_batch_size:
        Number of samples per validation batch.
    """

    def __init__(self, data, labels,
                 input_shape,
                 perc_train=0.9,
                 scale=0.00390625,
                 train_batch_size=1,
                 validation_batch_size=300):

        super(TextDataShuffler, self).__init__(
            data=data,
            labels=labels,
            input_shape=input_shape,
            perc_train=perc_train,
            scale=scale,
            train_batch_size=train_batch_size,
            validation_batch_size=validation_batch_size
        )

        # File lists may arrive as plain python lists; numpy arrays are
        # needed for fancy-indexing with the shuffled indexes.
        if isinstance(self.data, list):
            self.data = numpy.array(self.data)
        if isinstance(self.labels, list):
            self.labels = numpy.array(self.labels)

        # Splitting between train and test
        train_idx = self.indexes[0:self.n_train_samples]
        val_idx = self.indexes[self.n_train_samples:
                               self.n_train_samples + self.n_validation_samples]
        self.train_data = self.data[train_idx]
        self.train_labels = self.labels[train_idx]
        self.validation_data = self.data[val_idx]
        self.validation_labels = self.labels[val_idx]

    def get_batch(self, train_dataset=True):
        """
        Load one batch of images from disk and return ``(data, labels)``.

        **Parameters**

        train_dataset: If True draw from the train split, else validation.
        """
        if train_dataset:
            batch_size = self.train_batch_size
            shape = self.train_shape
            files_names = self.train_data
            label = self.train_labels
        else:
            batch_size = self.validation_batch_size
            shape = self.validation_shape
            files_names = self.validation_data
            label = self.validation_labels

        # Shuffling samples
        indexes = numpy.arange(files_names.shape[0])
        numpy.random.shuffle(indexes)

        selected_data = numpy.zeros(shape=shape)
        for i in range(batch_size):
            file_name = files_names[indexes[i]]
            d = bob.io.base.load(file_name)
            if len(d.shape) == 2:
                # Gray-scale image: place it in the first channel
                data = numpy.zeros(shape=tuple(shape[1:]))
                data[:, :, 0] = d
            else:
                data = d
            selected_data[i, ...] = data
            if self.scale is not None:
                selected_data[i, ...] *= self.scale

        selected_labels = label[indexes[0:batch_size]]

        return selected_data.astype("float32"), selected_labels
bob/learn/tensorflow/data/TextPairDataShuffler.py
0 → 100644
View file @
5884afff
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import
numpy
from
.TextDataShuffler
import
TextDataShuffler
class
TextPairDataShuffler
(
TextDataShuffler
):
def
__init__
(
self
,
data
,
labels
,
input_shape
,
perc_train
=
0.9
,
scale
=
True
,
train_batch_size
=
1
,
validation_batch_size
=
300
):
"""
The class provide some functionalities for shuffling data
**Parameters**
data:
"""
data
=
data
labels
=
labels
input_shape
=
input_shape
perc_train
=
perc_train
scale
=
scale
train_batch_size
=
train_batch_size
validation_batch_size
=
validation_batch_size
super
(
TextPairDataShuffler
,
self
).
__init__
(
data
,
labels
,
input_shape
=
input_shape
,
perc_train
=
perc_train
,
scale
=
scale
,
train_batch_size
=
train_batch_size
*
2
,
validation_batch_size
=
validation_batch_size
)
def
get_pair
(
self
,
train_dataset
=
True
,
zero_one_labels
=
True
):
"""
Get a random pair of samples
**Parameters**
is_target_set_train: Defining the target set to get the batch
**Return**
"""
def
get_genuine_or_not
(
input_data
,
input_labels
,
genuine
=
True
):
if
genuine
:
# TODO: THIS KEY SELECTION NEEDS TO BE MORE EFFICIENT
# Getting a client
index
=
numpy
.
random
.
randint
(
self
.
total_labels
)
# Getting the indexes of the data from a particular client
indexes
=
numpy
.
where
(
input_labels
==
index
)[
0
]
numpy
.
random
.
shuffle
(
indexes
)
# Picking a pair
data
=
input_data
[
indexes
[
0
]]
data_p
=
input_data
[
indexes
[
1
]]
else
:
# Picking a pair from different clients
index
=
numpy
.
random
.
choice
(
self
.
total_labels
,
2
,
replace
=
False
)
# Getting the indexes of the two clients
indexes
=
numpy
.
where
(
input_labels
==
index
[
0
])[
0
]
indexes_p
=
numpy
.
where
(
input_labels
==
index
[
1
])[
0
]
numpy
.
random
.
shuffle
(
indexes
)
numpy
.
random
.
shuffle
(
indexes_p
)
# Picking a pair
data
=
input_data
[
indexes
[
0
]]
data_p
=
input_data
[
indexes_p
[
0
]]
return
data
,
data_p
if
train_dataset
:
target_data
=
self
.
train_data
target_labels
=
self
.
train_labels
shape
=
self
.
train_shape
else
:
target_data
=
self
.
validation_data
target_labels
=
self
.
validation_labels
shape
=
self
.
validation_shape
data
=
numpy
.
zeros
(
shape
=
shape
,
dtype
=
'float32'
)
data_p
=
numpy
.
zeros
(
shape
=
shape
,
dtype
=
'float32'
)
labels_siamese
=
numpy
.
zeros
(
shape
=
shape
[
0
],
dtype
=
'float32'
)
genuine
=
True
for
i
in
range
(
shape
[
0
]):
data
[
i
,
...],
data_p
[
i
,
...]
=
get_genuine_or_not
(
target_data
,
target_labels
,
genuine
=
genuine
)
if
zero_one_labels
:
labels_siamese
[
i
]
=
not
genuine
else
:
labels_siamese
[
i
]
=
-
1
if
genuine
else
+
1
genuine
=
not
genuine
return
data
,
data_p
,
labels_siamese
def
get_triplet
(
self
,
n_labels
,
n_triplets
=
1
,
is_target_set_train
=
True
):
"""
Get a triplet
**Parameters**
is_target_set_train: Defining the target set to get the batch
**Return**
"""
def
get_one_triplet
(
input_data
,
input_labels
):
# Getting a pair of clients
index
=
numpy
.
random
.
choice
(
n_labels
,
2
,
replace
=
False
)
label_positive
=
index
[
0
]
label_negative
=
index
[
1
]
# Getting the indexes of the data from a particular client
indexes
=
numpy
.
where
(
input_labels
==
index
[
0
])[
0
]
numpy
.
random
.
shuffle
(
indexes
)
# Picking a positive pair
data_anchor
=
input_data
[
indexes
[
0
],
:,
:,
:]
data_positive
=
input_data
[
indexes
[
1
],
:,
:,
:]
# Picking a negative sample
indexes
=
numpy
.
where
(
input_labels
==
index
[
1
])[
0
]
numpy
.
random
.
shuffle
(
indexes
)
data_negative
=
input_data
[
indexes
[
0
],
:,
:,
:]
return
data_anchor
,
data_positive
,
data_negative
,
label_positive
,
label_positive
,
label_negative
if
is_target_set_train
:
target_data
=
self
.
train_data
target_labels
=
self
.
train_labels
else
:
target_data
=
self
.
validation_data
target_labels
=
self
.
validation_labels
c
=
target_data
.
shape
[
3
]
w
=
target_data
.
shape
[
1
]
h
=
target_data
.
shape
[
2
]
data_a
=
numpy
.
zeros
(
shape
=
(
n_triplets
,
w
,
h
,
c
),
dtype
=
'float32'
)
data_p
=
numpy
.
zeros
(
shape
=
(
n_triplets
,
w
,
h
,
c
),
dtype
=
'float32'
)
data_n
=
numpy
.
zeros
(
shape
=
(
n_triplets
,
w
,
h
,
c
),
dtype
=
'float32'
)
labels_a
=
numpy
.
zeros
(
shape
=
n_triplets
,
dtype
=
'float32'
)
labels_p
=
numpy
.
zeros
(
shape
=
n_triplets
,
dtype
=
'float32'
)
labels_n
=
numpy
.
zeros
(
shape
=
n_triplets
,
dtype
=
'float32'
)