Commit dafa7052 authored by Yannick DAYER

[py] Update ISV to accept probe templates.

parent 49c8696f
Merge request !73: Update ISV to accept probe templates. Update QA.
Pipeline #69695 failed
@@ -2,20 +2,20 @@
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
   - repo: https://github.com/timothycrosley/isort
-    rev: 5.10.1
+    rev: 5.12.0
     hooks:
       - id: isort
         args: [--settings-path, "pyproject.toml"]
   - repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 22.12.0
     hooks:
       - id: black
   - repo: https://github.com/pycqa/flake8
-    rev: 3.9.2
+    rev: 6.0.0
     hooks:
       - id: flake8
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.2.0
+    rev: v4.4.0
     hooks:
       - id: check-ast
       - id: check-case-conflict
...
@@ -148,7 +148,6 @@ class FactorAnalysisBase(BaseEstimator):
     @property
     def feature_dimension(self):
         """Get the UBM Dimension"""
         # TODO: Add this on the GMMMachine class
         return self.ubm.means.shape[1]
@@ -159,16 +158,12 @@ class FactorAnalysisBase(BaseEstimator):
     @property
     def mean_supervector(self):
-        """
-        Returns the mean supervector
-        """
+        """Returns the mean supervector"""
         return self.ubm.means.flatten()

     @property
     def variance_supervector(self):
-        """
-        Returns the variance supervector
-        """
+        """Returns the variance supervector"""
         return self.ubm.variances.flatten()

     @property
@@ -199,10 +194,7 @@ class FactorAnalysisBase(BaseEstimator):
         self._V = np.array(value)

     def estimate_number_of_classes(self, y):
-        """
-        Estimates the number of classes given the labels
-        """
+        """Estimates the number of classes given the labels"""
         return len(unique_labels(y))

     def initialize_using_array(self, X):
@@ -279,6 +271,7 @@ class FactorAnalysisBase(BaseEstimator):
         D: (n_gaussians*feature_dimension) represents the client offset vector
         """
         if self.random_state is not None:
             np.random.seed(self.random_state)
@@ -319,6 +312,7 @@ class FactorAnalysisBase(BaseEstimator):
         (n_classes, n_gaussians) representing the accumulated 0th order statistics
         """
         # 0th order stats
         n_acc = np.zeros((n_classes, self.ubm.n_gaussians), like=X[0].n)
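The `like=` argument above is what keeps the accumulators backend-agnostic: when `X[0].n` is a dask array, the zeros are allocated as a dask array too. A minimal sketch of this dispatch (NEP 35, available since NumPy 1.20); `reference` is an illustrative stand-in for `X[0].n`:

import numpy as np
import dask.array as da

# With a dask array as `like`, np.zeros dispatches to dask and the
# accumulator matches the backing of the statistics.
reference = da.ones(3)
acc = np.zeros((2, 3), like=reference)
print(type(acc))  # dask.array.core.Array

# With a plain ndarray (or no `like` at all), it stays a numpy array.
acc_np = np.zeros((2, 3), like=np.ones(3))
print(type(acc_np))  # numpy.ndarray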
@@ -380,12 +374,11 @@ class FactorAnalysisBase(BaseEstimator):
         i: int
             Class id to return the statistics for
         """
         indices = np.where(np.array(y) == i)[0]
         return [X[i] for i in indices]

-    """
-    Estimating U and x
-    """
+    # Estimating U and x #

     def _compute_id_plus_u_prod_ih(self, x_i, UProd):
         """
@@ -430,6 +423,7 @@ class FactorAnalysisBase(BaseEstimator):
             E[y_i] for class `i`
         """
         f_i = x_i.sum_px
         n_i = x_i.n
         n_ic = np.repeat(n_i, self.feature_dimension)
@@ -447,9 +441,8 @@ class FactorAnalysisBase(BaseEstimator):
             self.mean_supervector + self._D * latent_z_i
         )

-        """
-        # JFA Part (eq 29)
-        """
+        # JFA Part (eq 29) #
         V_dot_v = V @ latent_y_i if latent_y_i is not None else 0
         fn_x_ih -= n_ic * V_dot_v if latent_y_i is not None else 0
@@ -545,6 +538,7 @@ class FactorAnalysisBase(BaseEstimator):
             Accumulated statistics for U_A2(n_gaussians* feature_dimension, r_U)
         """
         # Inverting A1 over the zero axis
         # https://stackoverflow.com/questions/11972102/is-there-a-way-to-efficiently-invert-an-array-of-matrices-with-numpy
         inv_A1 = np.linalg.inv(acc_U_A1)
@@ -568,6 +562,7 @@ class FactorAnalysisBase(BaseEstimator):
         https://gitlab.idiap.ch/bob/bob.learn.em/-/blob/da92d0e5799d018f311f1bf5cdd5a80e19e142ca/bob/learn/em/cpp/FABaseTrainer.cpp#L325
         """
         # UProd = (self.ubm.n_gaussians, self.r_U, self.r_U)
         Uc = self._U.reshape(
@@ -662,9 +657,7 @@ class FactorAnalysisBase(BaseEstimator):
         return acc_U_A1, acc_U_A2

-    """
-    Estimating D and z
-    """
+    # Estimating D and z #

     def update_z(self, X, y, latent_x, latent_y, latent_z, n_acc, f_acc):
         """
@@ -878,6 +871,7 @@ class FactorAnalysisBase(BaseEstimator):
             latent_x = (n_classes, r_U, n_sessions)
         """
         kw = dict(like=like) if isinstance(like, dask.array.core.Array) else {}

         # x (Eq. 36)
@@ -897,9 +891,7 @@ class FactorAnalysisBase(BaseEstimator):
         return latent_x, latent_y, latent_z

-    """
-    Estimating V and y
-    """
+    # Estimating V and y #

     def update_y(
         self,
@@ -948,6 +940,7 @@ class FactorAnalysisBase(BaseEstimator):
             Accumulated 1st order statistics for each class (math:`F_{i}`)
         """
         # V.T / sigma
         VTinvSigma = self._V.T / self.variance_supervector
@@ -1019,7 +1012,7 @@ class FactorAnalysisBase(BaseEstimator):
         I = np.eye(self.r_V, self.r_V)  # noqa: E741
-        # TODO: make the invertion matrix function as a parameter
+        # TODO: make the inversion matrix function as a parameter
         return np.linalg.inv(I + (VProd * n_acc_i[:, None, None]).sum(axis=0))

     def _compute_vprod(self):
@@ -1176,17 +1169,15 @@ class FactorAnalysisBase(BaseEstimator):
         )  # Fn_yi = sum_{sessions h}(N_{i,h}*(o_{i,h} - m - D*z_{i})

         # Looping over the sessions of a client
-        for session_id in range(len(X_i)):
-            n_i = X_i[session_id].n
+        for session_id, x_i_s in enumerate(X_i):
+            n_i = x_i_s.n
             U_dot_x = U @ latent_x_i[:, session_id]
             tmp_CD = np.repeat(n_i, self.feature_dimension)
             fn_y_i -= tmp_CD * U_dot_x

         return fn_y_i

-    """
-    Scoring
-    """
+    # Scoring #

     def estimate_x(self, X):
@@ -1212,6 +1203,7 @@ class FactorAnalysisBase(BaseEstimator):
         X_i: list of :py:class:`bob.learn.em.GMMStats`
             List of statistics for a class
         """
         I = np.eye(self.r_U, self.r_U)  # noqa: E741
         Uc = self._U.reshape(
@@ -1319,10 +1311,11 @@ class FactorAnalysisBase(BaseEstimator):
             z
         """
         return self.enroll([self.ubm.acc_stats(X)])

     def _prepare_dask_input(self, X, y):
-        """Perpare the input for the fit method"""
+        """Prepare the input for the fit method"""
         logger.info(
             "Rechunking bag of stats to delayed list of stats per class. If your worker runs "
             "out of memory in this training step, you have to use workers with more memory."
@@ -1406,9 +1399,7 @@ class ISVMachine(FactorAnalysisBase):
         )

     def e_step(self, X, y, n_samples_per_class, n_acc, f_acc):
-        """
-        E-step of the EM algorithm
-        """
+        """E-step of the EM algorithm."""
         # self.initialize_XYZ(y)
         UProd = self._compute_uprod()
         _, _, latent_z = self.initialize_XYZ(
@@ -1452,6 +1443,7 @@ class ISVMachine(FactorAnalysisBase):
             Accumulated statistics for U_A2(n_gaussians* feature_dimension, r_U)
         """
         acc_U_A1 = [acc[0] for acc in acc_U_A1_acc_U_A2_list]
         acc_U_A2 = [acc[1] for acc in acc_U_A1_acc_U_A2_list]
@@ -1476,6 +1468,7 @@ class ISVMachine(FactorAnalysisBase):
             Returns self.
         """
         if isinstance(X, dask.bag.Bag):
             X, y = self._prepare_dask_input(X, y)
@@ -1535,6 +1528,7 @@ class ISVMachine(FactorAnalysisBase):
             z
         """
         iterations = self.enroll_iterations
         # We have only one class for enrollment
         y = list(np.zeros(len(X), dtype=np.int32))
@@ -1584,6 +1578,7 @@ class ISVMachine(FactorAnalysisBase):
             z
         """
         return self.enroll([self.ubm.acc_stats(X)])

     def score(self, latent_z, data):
@@ -1596,7 +1591,7 @@ class ISVMachine(FactorAnalysisBase):
             Latent representation of the client (E[z_i])
         data : list of :py:class:`bob.learn.em.GMMStats`
-            List of statistics to be scored
+            List of statistics of one probe template to be scored

         Returns
         -------
@@ -1604,7 +1599,14 @@ class ISVMachine(FactorAnalysisBase):
             The linear scored
         """
-        x = self.estimate_x(data)
+        template_data = data[0]
+        if len(data) > 1:
+            for d in data[1:]:
+                template_data += d
+
+        x = self.estimate_x(template_data)
         Ux = self._U @ x

         # TODO: I don't know why this is not the enrolled model
@@ -1615,10 +1617,10 @@ class ISVMachine(FactorAnalysisBase):
         return linear_scoring(
             z.reshape((self.ubm.n_gaussians, self.feature_dimension)),
             self.ubm,
-            data,
+            template_data,
             Ux.reshape((self.ubm.n_gaussians, self.feature_dimension)),
             frame_length_normalization=True,
-        )[0]
+        )[0][0]


 class JFAMachine(FactorAnalysisBase):
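For context, a minimal usage sketch of the new probe-template scoring. `isv` is assumed to be an already-fitted ISVMachine, and `enroll_samples`/`probe_samples` are illustrative feature arrays, not names from this repository:

# Turn each raw probe sample into a GMMStats object; `score` now accepts
# the whole list as a single probe template and accumulates the stats
# internally before linear scoring.
probe_stats = [isv.ubm.acc_stats(s) for s in probe_samples]

# Enroll a client model (the latent z) from its enrollment statistics.
latent_z = isv.enroll([isv.ubm.acc_stats(s) for s in enroll_samples])

# One scalar score for the whole template (the new `[0][0]` indexing in
# `score` unpacks the single-model, single-template result).
score = isv.score(latent_z, probe_stats)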
@@ -1759,6 +1761,7 @@ class JFAMachine(FactorAnalysisBase):
             Accumulated statistics for V_A2(n_gaussians* feature_dimension, r_V)
         """
         acc_V_A1 = [acc[0] for acc in acc_V_A1_acc_V_A2_list]
         acc_V_A2 = [acc[1] for acc in acc_V_A1_acc_V_A2_list]
@@ -1807,6 +1810,7 @@ class JFAMachine(FactorAnalysisBase):
             E[y]
         """
         VProd = self._compute_vprod()
         latent_x, latent_y, latent_z = self.initialize_XYZ(
@@ -1856,6 +1860,7 @@ class JFAMachine(FactorAnalysisBase):
             Accumulated statistics for U_A2(n_gaussians* feature_dimension, r_U)
         """
         # self.initialize_XYZ(y)
         UProd = self._compute_uprod()
         latent_x, _, latent_z = self.initialize_XYZ(
@@ -1897,6 +1902,7 @@ class JFAMachine(FactorAnalysisBase):
             Accumulated statistics for V_A2(n_gaussians* feature_dimension, r_V)
         """
         acc_U_A1 = [acc[0] for acc in acc_U_A1_acc_U_A2_list]
         acc_U_A2 = [acc[1] for acc in acc_U_A1_acc_U_A2_list]
@@ -1995,6 +2001,7 @@ class JFAMachine(FactorAnalysisBase):
             Accumulated statistics for D_A2(n_gaussians* feature_dimension, )
         """
         _, _, latent_z = self.initialize_XYZ(
             n_samples_per_class=n_samples_per_class
         )
@@ -2030,6 +2037,7 @@ class JFAMachine(FactorAnalysisBase):
             Accumulated statistics for D_A2(n_gaussians* feature_dimension, )
         """
         acc_D_A1 = [acc[0] for acc in acc_D_A1_acc_D_A2_list]
         acc_D_A2 = [acc[1] for acc in acc_D_A1_acc_D_A2_list]
@@ -2055,6 +2063,7 @@ class JFAMachine(FactorAnalysisBase):
             z, y latent variables
         """
         iterations = self.enroll_iterations
         # We have only one class for enrollment
         y = list(np.zeros(len(X), dtype=np.int32))
@@ -2118,6 +2127,7 @@ class JFAMachine(FactorAnalysisBase):
             Returns self.
         """
         if isinstance(X, dask.bag.Bag):
             X, y = self._prepare_dask_input(X, y)
@@ -2240,9 +2250,16 @@ class JFAMachine(FactorAnalysisBase):
             The linear scored
         """
         latent_y = model[0]
         latent_z = model[1]

-        x = self.estimate_x(data)
+        template_data = data[0]
+        if len(data) > 1:
+            for d in data[1:]:
+                template_data += d
+
+        x = self.estimate_x(template_data)
         Ux = self._U @ x
...
@@ -287,8 +287,13 @@ class IVectorMachine(BaseEstimator):
         )
         self.sigma = copy.deepcopy(self.ubm.variances)

+        logger.info("Training I-Vector...")
         for step in range(self.max_iterations):
+            logger.info(
+                f"IVector step {step+1:{len(str(self.max_iterations))}d}/{self.max_iterations}."
+            )
             if chunky:
+                # Compute the IVectorStats of each chunk
                 stats = [
                     dask.delayed(e_step)(
                         machine=self,
@@ -300,27 +305,27 @@ class IVectorMachine(BaseEstimator):
                 # Workaround to prevent memory issues at compute with too many chunks.
                 # This adds pairs of stats together instead of sending all the stats to
                 # one worker.
-                while (l := len(stats)) > 1:
+                while (length := len(stats)) > 1:
                     last = stats[-1]
                     stats = [
-                        dask.delayed(operator.add)(stats[i], stats[l // 2 + i])
-                        for i in range(l // 2)
+                        dask.delayed(operator.add)(
+                            stats[i], stats[length // 2 + i]
+                        )
+                        for i in range(length // 2)
                     ]
-                    if l % 2 != 0:
+                    if length % 2 != 0:
                         stats.append(last)
                 stats_sum = stats[0]

+                # Update the machine parameters with the aggregated stats
                 new_machine = dask.compute(
                     dask.delayed(m_step)(self, stats_sum)
                 )[0]
                 for attr in ["T", "sigma"]:
                     setattr(self, attr, getattr(new_machine, attr))
-            else:
+            else:  # Working directly on numpy array, not dask.Bags
                 stats = e_step(machine=self, data=X)
                 _ = m_step(self, stats)
-            logger.info(
-                f"IVector step {step+1:{len(str(self.max_iterations))}d}/{self.max_iterations}."
-            )
         logger.info(f"Reached {step+1} steps.")

         return self
...
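The pairwise reduction in the chunky branch above is worth isolating: rather than sending every chunk's statistics to one worker to be summed, delayed pairs are added until a single object remains, which bounds the memory any one task needs. A self-contained sketch of the same pattern on toy delayed integers (not code from this repository):

import operator

import dask

stats = [dask.delayed(i) for i in range(5)]  # stand-ins for per-chunk stats

while (length := len(stats)) > 1:
    last = stats[-1]
    # Add element i to element length//2 + i, halving the list each round.
    stats = [
        dask.delayed(operator.add)(stats[i], stats[length // 2 + i])
        for i in range(length // 2)
    ]
    if length % 2 != 0:
        stats.append(last)  # carry the odd element over to the next round

assert dask.compute(stats[0])[0] == sum(range(5))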