Commit e2755d8e authored by Amir MOHAMMADI's avatar Amir MOHAMMADI

Merge branch 'compute-vad' into 'master'

Compute vad

See merge request !3
parents c830347a 5639642c
Pipeline #9407 passed with stages
in 12 minutes and 23 seconds
from .mfcc import mfcc
from .mfcc import mfcc_from_path
from .mfcc import compute_vad
from .gmm import ubm_train
from .gmm import ubm_full_train
from .gmm import ubm_enroll
......
......@@ -57,7 +57,7 @@ def ubm_train(feats, ubmname, num_threads=4, num_frames=500000,
Returns
-------
str
A path to the trained ubm model.
A text formatted trained Kaldi global DiagGMM model.
"""
......@@ -66,6 +66,7 @@ def ubm_train(feats, ubmname, num_threads=4, num_frames=500000,
binary3 = 'gmm-gselect'
binary4 = 'gmm-global-acc-stats'
binary5 = 'gmm-global-est'
binary6 = 'gmm-global-copy'
# 1. Initialize a single diagonal GMM
cmd1 = [binary1] # gmm-global-init-from-feats
......@@ -175,16 +176,34 @@ def ubm_train(feats, ubmname, num_threads=4, num_frames=500000,
os.unlink(inModel)
inModel = estfile.name
os.unlink(gselfile.name)
# 6. Copy a single diagonal GMM as text string (for the BEAT platform)
ret = ""
with tempfile.NamedTemporaryFile(suffix='.txt') as txtfile, \
tempfile.NamedTemporaryFile(suffix='.log') as logfile:
cmd = [binary6] # gmm-global-copy
cmd += [
'--binary=false',
estfile.name,
txtfile.name,
]
pipe = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=logfile)
pipe.communicate()
with open(logfile.name) as fp:
logtxt = fp.read()
logger.debug("%s", logtxt)
with open(txtfile.name, 'rt') as f:
ubmtxt = f.read()
ret = ubmtxt
shutil.copyfile(estfile.name, ubmname)
shutil.copyfile(estfile.name, ubmname + '.dubm')
os.unlink(estfile.name)
return ubmname + '.dubm'
os.unlink(gselfile.name)
return ret
def ubm_full_train(feats, dubmname, num_gselect=20, num_iters=4,
def ubm_full_train(feats, dubm, fubmfile, num_gselect=20, num_iters=4,
min_gaussian_weight=1.0e-04):
""" Implements Kaldi egs/sre10/v1/train_full_ubm.sh
......@@ -192,10 +211,10 @@ def ubm_full_train(feats, dubmname, num_gselect=20, num_iters=4,
----------
feats : numpy.ndarray
A 2D numpy ndarray object containing MFCCs.
dubmname : str
A path to the UBM model.
dubm : str
A text formatted trained Kaldi global DiagGMM model.
fubmfile : str
A path to the full covariance UBM model.
num_gselect : :obj:`int`, optional
Number of Gaussians to keep per frame.
num_iters : :obj:`int`, optional
......@@ -207,19 +226,22 @@ def ubm_full_train(feats, dubmname, num_gselect=20, num_iters=4,
Returns
-------
str
A path to the trained full covariance UBM model.
A path to the full covariance UBM model.
"""
binary1 = 'gmm-global-to-fgmm'
binary2 = 'fgmm-global-to-gmm'
# binary2 = 'fgmm-global-to-gmm'
binary3 = 'subsample-feats'
binary4 = 'gmm-gselect'
binary5 = 'fgmm-global-acc-stats'
binary6 = 'fgmm-global-est'
origdubm = dubmname
dubmname += '.dubm'
# Convert UBM string to a file
with tempfile.NamedTemporaryFile(
delete=False, suffix='.dump') as dubmfile:
with open(dubmfile.name, 'wt') as fp:
fp.write(dubm)
# 1. Init (diagonal GMM to full-cov. GMM)
# gmm-global-to-fgmm $srcdir/final.dubm $dir/0.ubm || exit 1;
......@@ -229,7 +251,7 @@ def ubm_full_train(feats, dubmname, num_gselect=20, num_iters=4,
initfile, tempfile.NamedTemporaryFile(suffix='.log') as logfile:
inModel = initfile.name
cmd1 += [
dubmname,
dubmfile.name,
inModel,
]
pipe1 = Popen(cmd1, stdin=PIPE, stdout=PIPE, stderr=logfile)
......@@ -243,20 +265,10 @@ def ubm_full_train(feats, dubmname, num_gselect=20, num_iters=4,
# gmm-gselect --n=$num_gselect "fgmm-global-to-gmm $dir/0.ubm - \
# |" "$feats" \
# "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
cmd2 = [binary2] # fgmm-global-to-gmm
with tempfile.NamedTemporaryFile(suffix='.dubm') as dubmfile, \
tempfile.NamedTemporaryFile(suffix='.ark') as arkfile, \
# cmd2 = [binary2] # fgmm-global-to-gmm
# with tempfile.NamedTemporaryFile(suffix='.dubm') as dubmfile, \
with tempfile.NamedTemporaryFile(suffix='.ark') as arkfile, \
tempfile.NamedTemporaryFile(suffix='.gz') as gselfile:
cmd2 += [
inModel,
dubmfile.name,
]
with tempfile.NamedTemporaryFile(suffix='.log') as logfile:
pipe2 = Popen(cmd2, stdin=PIPE, stdout=PIPE, stderr=logfile)
pipe2.communicate()
with open(logfile.name) as fp:
logtxt = fp.read()
logger.debug("%s", logtxt)
# subsample-feats --n=$subsample ark:- ark:- |"
cmd = [binary3] # subsample-feats
cmd += [
......@@ -337,36 +349,43 @@ def ubm_full_train(feats, dubmname, num_gselect=20, num_iters=4,
os.unlink(inModel)
inModel = estfile.name
shutil.copyfile(estfile.name, origdubm + '.fubm')
shutil.copyfile(estfile.name, fubmfile)
os.unlink(estfile.name)
os.unlink(dubmfile.name)
return origdubm + '.fubm'
return fubmfile # ToDo : convert to a text format
def ubm_enroll(feats, ubm_file):
def ubm_enroll(feats, ubm):
"""Performs MAP adaptation of a GMM-UBM model.
Parameters
----------
feats : numpy.ndarray
A 2D numpy ndarray object containing MFCCs.
ubm_file : str
A path to the Kaldi global GMM.
ubm : str
A text formatted Kaldi global DiagGMM.
Returns
-------
str
A path to the enrolled GMM.
A text formatted Kaldi enrolled DiagGMM.
"""
binary1 = 'gmm-global-acc-stats'
binary2 = 'global-gmm-adapt-map'
binary3 = 'gmm-global-copy'
with tempfile.NamedTemporaryFile(
delete=False, suffix='.dump') as ubmfile:
with open(ubmfile.name, 'wt') as fp:
fp.write(ubm)
# 1. Accumulate stats for training a diagonal-covariance GMM.
cmd1 = [binary1] # gmm-global-acc-stats
cmd1 += [
ubm_file,
ubmfile.name,
'ark:-',
'-',
]
......@@ -375,7 +394,7 @@ def ubm_enroll(feats, ubm_file):
estfile, tempfile.NamedTemporaryFile(suffix='.log') as logfile:
cmd2 += [
'--update-flags=m',
ubm_file,
ubmfile.name,
'-',
estfile.name,
]
......@@ -392,9 +411,33 @@ def ubm_enroll(feats, ubm_file):
logtxt = fp.read()
logger.debug("%s", logtxt)
return estfile.name
def gmm_score(feats, gmm_file, ubm_file):
# 3. Copy adapted diagonal GMM as text string (for the BEAT platform)
ret = ""
with tempfile.NamedTemporaryFile(suffix='.txt') as txtfile, \
tempfile.NamedTemporaryFile(suffix='.log') as logfile:
cmd = [binary3] # gmm-global-copy
cmd += [
'--binary=false',
estfile.name,
txtfile.name,
]
pipe = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=logfile)
pipe.communicate()
with open(logfile.name) as fp:
logtxt = fp.read()
logger.debug("%s", logtxt)
with open(txtfile.name, 'rt') as f:
ubmtxt = f.read()
ret = ubmtxt
os.unlink(ubmfile.name)
os.unlink(estfile.name)
return ret
def gmm_score(feats, spkubm, ubm):
"""Print out per-frame log-likelihoods for input utterance.
Parameters
......@@ -402,10 +445,10 @@ def gmm_score(feats, gmm_file, ubm_file):
feats : numpy.ndarray
A 2D numpy ndarray object containing MFCCs.
gmm_file : str
A path to Kaldi adapted global GMM.
ubm_file : str
A path to Kaldi global GMM.
spkubm : str
A text formatted Kaldi adapted global DiagGMM.
ubm : str
A text formatted Kaldi global DiagGMM.
Returns
......@@ -416,10 +459,22 @@ def gmm_score(feats, gmm_file, ubm_file):
"""
binary1 = 'gmm-global-get-frame-likes'
# Convert UBM string to a file
with tempfile.NamedTemporaryFile(
delete=False, suffix='.dubm') as ubmfile:
with open(ubmfile.name, 'wt') as fp:
fp.write(ubm)
# Convert speaker UBM string to a file
with tempfile.NamedTemporaryFile(
delete=False, suffix='.dubm') as spkubmfile:
with open(spkubmfile.name, 'wt') as fp:
fp.write(spkubm)
models = [
gmm_file,
ubm_file
spkubmfile.name,
ubmfile.name
]
ret = [0, 0]
# import ipdb; ipdb.set_trace()
......@@ -449,6 +504,8 @@ def gmm_score(feats, gmm_file, ubm_file):
logtxt = fp.read()
logger.debug("%s", logtxt)
os.unlink(ubmfile.name)
os.unlink(spkubmfile.name)
return ret[0] - ret[1]
# def gmm_score_fast(feats, gmm_file, ubm_file):
......
......@@ -16,18 +16,20 @@ import logging
logger = logging.getLogger(__name__)
def ivector_train(feats, projector_file, num_gselect=20, ivector_dim=600,
use_weights=False, num_iters=5, min_post=0.025,
num_samples_for_weights=3, posterior_scale=1.0):
def ivector_train(feats, fubm, ivector_extractor, num_gselect=20,
ivector_dim=600, use_weights=False, num_iters=5,
min_post=0.025, num_samples_for_weights=3,
posterior_scale=1.0):
"""Implements Kaldi egs/sre10/v1/train_ivector_extractor.sh
Parameters
----------
feats : numpy.ndarray
A 2D numpy ndarray object containing MFCCs.
projector_file : str
A path to global GMM file
fubm : str
A path to the full-covariance UBM file
ivector_extractor : str
A path to the ivector extractor
num_gselect : :obj:`int`, optional
Number of Gaussians to keep per frame.
......@@ -61,8 +63,6 @@ def ivector_train(feats, projector_file, num_gselect=20, ivector_dim=600,
binary6 = 'ivector-extractor-acc-stats'
binary7 = 'ivector-extractor-est'
fgmm_model = projector_file + '.fubm'
# 1. Create Kaldi training data structure
# ToDo: implement Bob's function for that
with tempfile.NamedTemporaryFile(delete=False, suffix='.ark') as arkfile:
......@@ -79,7 +79,7 @@ def ivector_train(feats, projector_file, num_gselect=20, ivector_dim=600,
with tempfile.NamedTemporaryFile(delete=False, suffix='.dubm') as \
dubmfile, tempfile.NamedTemporaryFile(suffix='.log') as logfile:
cmd1 += [
fgmm_model,
fubm,
dubmfile.name,
]
pipe1 = Popen(cmd1, stdin=PIPE, stdout=PIPE, stderr=logfile)
......@@ -94,7 +94,7 @@ def ivector_train(feats, projector_file, num_gselect=20, ivector_dim=600,
cmd2 += [
'--ivector-dim=' + str(ivector_dim),
'--use-weights=' + str(use_weights).lower(),
fgmm_model,
fubm,
iefile.name,
]
pipe2 = Popen(cmd2, stdin=PIPE, stdout=PIPE, stderr=logfile)
......@@ -130,7 +130,7 @@ def ivector_train(feats, projector_file, num_gselect=20, ivector_dim=600,
cmd4 = [binary4] # fgmm-global-gselect-to-post
cmd4 += [
'--min-post=' + str(min_post),
fgmm_model,
fubm,
'ark:' + arkfile.name,
'ark:' + gselfile.name,
'ark:-',
......@@ -201,21 +201,23 @@ def ivector_train(feats, projector_file, num_gselect=20, ivector_dim=600,
os.unlink(inModel)
inModel = estfile.name
shutil.copyfile(inModel, projector_file + '.ie')
shutil.copyfile(inModel, ivector_extractor)
os.unlink(inModel)
return projector_file + '.ie'
return ivector_extractor # ToDo: convert to a string
def ivector_extract(feats, projector_file, num_gselect=20, min_post=0.025,
posterior_scale=1.0):
def ivector_extract(feats, fubm, ivector_extractor, num_gselect=20,
min_post=0.025, posterior_scale=1.0):
"""Implements Kaldi egs/sre10/v1/extract_ivectors.sh
Parameters
----------
feats : numpy.ndarray
A 2D numpy ndarray object containing MFCCs.
projector_file : str
fubm : str
A path to the full-covariance UBM file
ivector_extractor : str
A path to the ivector extractor.
num_gselect : :obj:`int`, optional
Number of Gaussians to keep per frame.
......@@ -241,14 +243,13 @@ def ivector_extract(feats, projector_file, num_gselect=20, min_post=0.025,
# import ipdb; ipdb.set_trace()
# ivector-extract --verbose=2 $srcdir/final.ie "$feats" ark,s,cs:- \
# ark,scp,t:$dir/ivector.JOB.ark,$dir/ivector.JOB.scp || exit 1;
fgmm_model = projector_file + '.fubm'
# Initialize the i-vector extractor using the FGMM input
cmd1 = [binary1] # fgmm-global-to-gmm
with tempfile.NamedTemporaryFile(delete=False, suffix='.dubm') as \
dubmfile, tempfile.NamedTemporaryFile(suffix='.log') as logfile:
cmd1 += [
fgmm_model,
fubm,
dubmfile.name,
]
pipe1 = Popen(cmd1, stdin=PIPE, stdout=PIPE, stderr=logfile)
......@@ -277,7 +278,7 @@ def ivector_extract(feats, projector_file, num_gselect=20, min_post=0.025,
cmd2 = [binary3] # fgmm-global-gselect-to-post
cmd2 += [
'--min-post=' + str(min_post),
fgmm_model,
fubm,
'ark:-',
'ark,s,cs:' + gselfile.name,
'ark:-',
......@@ -302,7 +303,7 @@ def ivector_extract(feats, projector_file, num_gselect=20, min_post=0.025,
cmd4 = [binary5] # ivector-extract
cmd4 += [
projector_file + '.ie',
ivector_extractor,
'ark:-',
'ark,s,cs:' + postfile.name,
'ark:-',
......
......@@ -10,6 +10,7 @@ import numpy as np
from . import io
from subprocess import PIPE, Popen
from os.path import isfile
import tempfile
import logging
logger = logging.getLogger(__name__)
......@@ -18,7 +19,7 @@ def mfcc(data, rate=8000, preemphasis_coefficient=0.97, raw_energy=True,
frame_length=25, frame_shift=10, num_ceps=13, num_mel_bins=23,
cepstral_lifter=22, low_freq=20, high_freq=0, dither=1.0,
snip_edges=True, normalization=True):
"""Computes the MFCCs for a given input signal
"""Computes the MFCCs for given speech samples.
Parameters
----------
......@@ -66,7 +67,7 @@ def mfcc(data, rate=8000, preemphasis_coefficient=0.97, raw_energy=True,
32-bit floats).
"""
binary1 = 'compute-mfcc-feats'
cmd1 = [binary1]
binary2 = 'add-deltas'
......@@ -116,11 +117,9 @@ def mfcc(data, rate=8000, preemphasis_coefficient=0.97, raw_energy=True,
io.write_wav(pipe1.stdin, data, rate)
pipe1.stdin.close()
# read ark from pipe3.stdout
ret = [mat for name, mat in io.read_mat_ark(pipe3.stdout)][0]
return ret
def mfcc_from_path(filename, channel=0, preemphasis_coefficient=0.97,
raw_energy=True, frame_length=25, frame_shift=10,
num_ceps=13, num_mel_bins=23, cepstral_lifter=22,
......@@ -222,68 +221,69 @@ def mfcc_from_path(filename, channel=0, preemphasis_coefficient=0.97,
ret = [mat for name, mat in io.read_mat_ark(pipe3.stdout)][0]
return ret
# def compute_vad(feats, vad_energy_mean_scale=0.5, vad_energy_threshold=5,
# vad_frames_context=0, vad_proportion_threshold=0.6):
# """Computes speech/non-speech segments given a Kaldi feature matrix
# Parameters:
# feats (matrix): A 2-D numpy array, with log-energy being in the first
# component of each feature vector
# Returns:
# A list of speech segments as a int32 numpy array with start and end times
def compute_vad(samples, rate, vad_energy_mean_scale=0.5, vad_energy_th=5,
vad_frames_context=0, vad_proportion_th=0.6):
"""Performs Voice Activity Detection on the given speech samples.
# Raises:
# RuntimeError: if any problem was detected during the conversion.
# IOError: if the binary to be executed does not exist
Parameters
----------
samples : numpy.ndarray
The speech samples of the input signal; they are peak-normalized
in place to [-1, 1] before feature extraction.
rate : float
The sampling rate of the input signal in ``samples``.
vad_energy_mean_scale: :obj:`float`, optional
If this is set to s, to get the actual threshold we let m be the mean
log-energy of the file, and use s*m + vad-energy-th
vad_energy_th: :obj:`float`, optional
Constant term in energy threshold for MFCC0 for VAD.
vad_frames_context: :obj:`int`, optional
Number of frames of context on each side of central frame,
in window for which energy is monitored
vad_proportion_th: :obj:`float`, optional
Parameter controlling the proportion of frames within the window that
need to have more energy than the threshold
# """
Returns
-------
# name = 'abc'
# binary1 = utils.kaldi_path(['src', 'ivectorbin', 'compute-vad'])
# cmd1 = [binary1]
numpy.ndarray
The labels [1/0] of voiced features (1D array of floats).
"""
# # compute features into the ark file
# cmd1 += [
# '--vad-energy-mean-scale=' + str(vad_energy_mean_scale),
# '--vad-energy-threshold=' + str(vad_energy_threshold),
# '--vad-frames-context=' + str(vad_frames_context),
# '--vad-proportion-threshold=' + str(vad_proportion_threshold),
# 'ark:-',
# 'ark:-',
# ]
binary1 = 'compute-mfcc-feats'
cmd1 = [binary1]
binary2 = 'compute-vad'
cmd2 = [binary2]
# with tempfile.NamedTemporaryFile(suffix='.seg') as segfile:
# binary2 = utils.kaldi_path(
# ['src', 'ivectorbin', 'create-split-from-vad'])
# cmd2 = [binary2]
cmd1 += [
'--sample-frequency=' + str(rate),
'ark:-',
'ark:-',
]
cmd2 += [
'--vad-energy-mean-scale=' + str(vad_energy_mean_scale),
'--vad-energy-threshold=' + str(vad_energy_th),
'--vad-frames-context=' + str(vad_frames_context),
'--vad-proportion-threshold=' + str(vad_proportion_th),
'ark:-',
'ark:-',
]
# cmd2 += [
# 'ark:-',
# segfile.name,
# ]
samples /= np.max(np.abs(samples), axis=0) # normalize to [-1,1]
# with open(os.devnull, "w") as fnull:
# # pipe1 numpy matrix -> compute-vad
# pipe1 = Popen(cmd1, stdout=PIPE, stdin=PIPE, stderr=fnull)
# pipe2 = Popen(cmd2, stdout=PIPE, stdin=pipe1.stdout, stderr=fnull)
with tempfile.NamedTemporaryFile(suffix='.log') as logfile:
pipe1 = Popen(cmd1, stdin=PIPE, stdout=PIPE, stderr=logfile)
pipe2 = Popen(cmd2, stdin=pipe1.stdout, stdout=PIPE, stderr=logfile)
# # write ark file into pipe.stdin
# io.write_mat(pipe1.stdin, feats, key='abc')
# pipe1.stdin.close()
pipe1.stdin.write(b'abc ')
io.write_wav(pipe1.stdin, samples, rate)
pipe1.stdin.close()
# # wait for piped execution to finish
# pipe2.communicate()
with open(logfile.name) as fp:
logtxt = fp.read()
logger.debug("%s", logtxt)
# # segfile should have the segmented output. read the file
# segs = []
# with open(segfile.name) as fp:
# for l in fp.readlines():
# start, end = l.split()[2:]
# segs.append([start, end])
# return np.array(segs, dtype='int32')
# read ark from pipe2.stdout
ret = [mat for name, mat in io.read_vec_flt_ark(pipe2.stdout)][0]
return ret
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1