Commit 428b4d2f authored by Milos CERNAK's avatar Milos CERNAK

Add speech recognition monophone training

parent 69bed322
Pipeline #12349 passed with stages
in 7 minutes and 46 seconds
include LICENSE README.rst buildout.cfg develop.cfg version.txt requirements.txt include LICENSE README.rst buildout.cfg develop.cfg version.txt requirements.txt
recursive-include doc conf.py *.rst recursive-include doc conf.py *.rst
recursive-include bob/kaldi/test/dnn *.txt recursive-include bob/kaldi/test/dnn *.txt
recursive-include bob/kaldi/test/hmm *.txt *.fst
recursive-include bob/kaldi/test/data *.wav *.txt *.npy *.ivector *.ie recursive-include bob/kaldi/test/data *.wav *.txt *.npy *.ivector *.ie
...@@ -14,6 +14,7 @@ from .ivector import plda_enroll ...@@ -14,6 +14,7 @@ from .ivector import plda_enroll
from .ivector import plda_score from .ivector import plda_score
from .dnn import nnet_forward from .dnn import nnet_forward
from .dnn import compute_dnn_vad from .dnn import compute_dnn_vad
from .hmm import train_mono
def get_config(): def get_config():
"""Returns a string containing the configuration information. """Returns a string containing the configuration information.
......
#!/usr/bin/zsh
#
# Runs Kaldi's nnet-forward with the AMI network (dnn/ami.nnet) and its
# feature transform on $out/sample16k.cmvn.deltas.ark, writing the result in
# text archive format to $out/sample16k.posteriors.ark.
#
# # Allow setshell
# software=/idiap/resource/software
# source $software/initfiles/shrc $software
# SETSHELL kaldi
out=test/kaldi
# -p also creates the missing parent directory ("test") and is a no-op when
# the directory already exists, so no explicit existence check is needed.
mkdir -p "$out"
nnet-forward --feature-transform=dnn/ami.feature_transform \
    dnn/ami.nnet \
    "ark,t:$out/sample16k.cmvn.deltas.ark" \
    "ark,t:$out/sample16k.posteriors.ark"
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Milos Cernak <milos.cernak@idiap.ch>
# September 9, 2017
import os
import numpy as np
from . import io
from subprocess import PIPE, Popen
from os.path import isfile
import tempfile
import logging
import pkg_resources
import shutil
import bob.kaldi
logger = logging.getLogger(__name__)
def _run_kaldi_tool(cmd):
    """Run one Kaldi command-line tool and forward its stderr to the log.

    Parameters
    ----------
    cmd : list
        The program name followed by its arguments, as accepted by
        ``subprocess.Popen``.

    Returns
    -------
    int
        The exit status of the process.
    """
    with tempfile.NamedTemporaryFile(suffix='.log') as logfile:
        pipe = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=logfile)
        pipe.communicate()
        with open(logfile.name) as fp:
            logtxt = fp.read()
            logger.debug("%s", logtxt)
    if pipe.returncode != 0:
        # Only report the failure; callers historically carried on after a
        # failed step, so raising here would change their behaviour.
        logger.warning("%s exited with status %d", cmd[0], pipe.returncode)
    return pipe.returncode


def train_mono(feats, trans_words, fst_L, topology_in, shared_phones='',
               numgauss=1000, power=0.25, num_iters=40, beam=6):
    """Monophone model training.

    The training chains the Kaldi tools ``gmm-init-mono``,
    ``compile-train-graphs``, ``align-equal-compiled``,
    ``gmm-acc-stats-ali`` and ``gmm-est`` to obtain a first model, and then
    runs `num_iters` passes of ``gmm-align-compiled``,
    ``gmm-acc-stats-ali`` and ``gmm-est``.

    Parameters
    ----------
    feats : dict
        The input cepstral features: a mapping from utterance id (str) to
        a 2D array of features. The feature dimension is taken from the
        second axis of the first array.
    trans_words : str
        Text transcription of the `feats` (the word labels), written
        verbatim to a temporary file that is read as a Kaldi text archive.
    fst_L : str
        A filename of the lexicon compiled as FST.
    topology_in : str
        The contents of a topology file that specifies 3-state
        left-to-right HMM, and default transition probs.
    shared_phones : :obj:`str`, optional
        A filename of the sets of phones whose pdfs should be shared.
        The option is omitted when the string is empty.
    numgauss : :obj:`int`, optional
        A number of Gaussians of GMMs (passed as ``--mix-up``).
    power : :obj:`float`, optional
        Power to allocate Gaussians to states.
    num_iters : :obj:`int`, optional
        A number of iterations for re-estimation of GMMs.
    beam : :obj:`float`, optional
        Decoding beam used in alignment; the retry beam is four times
        this value.

    Returns
    -------
    str
        The mono-phones acoustic models, in Kaldi text format.
    """
    # Every temporary file created with delete=False is recorded here so
    # that it is removed on success and on failure alike.
    scratch = []
    try:
        feat_dim = -1
        with tempfile.NamedTemporaryFile(delete=False, suffix='.ark') as arkf:
            scratch.append(arkf.name)
            with open(arkf.name, 'wb') as f:
                for uttid, mat in feats.items():
                    io.write_mat(f, mat, key=uttid.encode('utf-8'))
                    if feat_dim < 1:
                        (_, feat_dim) = mat.shape

        with tempfile.NamedTemporaryFile(delete=False, suffix='.top') as topof:
            scratch.append(topof.name)
            with open(topof.name, 'wt') as f:
                f.write(topology_in)

        with tempfile.NamedTemporaryFile(delete=False, suffix='.tra') as traf:
            scratch.append(traf.name)
            with open(traf.name, 'wt') as f:
                f.write(trans_words)

        with tempfile.NamedTemporaryFile(suffix='.mdl') as initf, \
                tempfile.NamedTemporaryFile(suffix='.tree') as treef, \
                tempfile.NamedTemporaryFile(suffix='.fst') as fstf, \
                tempfile.NamedTemporaryFile(suffix='.ali') as alif, \
                tempfile.NamedTemporaryFile(suffix='.acc') as accf, \
                tempfile.NamedTemporaryFile(suffix='.est') as estf:

            # Model and tree initialisation.
            init_cmd = ['gmm-init-mono']
            if shared_phones != '':
                init_cmd.append('--shared-phones=' + str(shared_phones))
            init_cmd += [
                '--train-feats=ark:copy-feats ark:' + arkf.name + ' ark:-|',
                topof.name,
                str(feat_dim),
                initf.name,
                treef.name,
            ]
            _run_kaldi_tool(init_cmd)

            # One training graph per utterance.
            _run_kaldi_tool([
                'compile-train-graphs',
                treef.name,
                initf.name,
                str(fst_L),
                'ark,t:' + traf.name,
                'ark,t:' + fstf.name,
            ])

            # Equally spaced alignments for the first estimate.
            _run_kaldi_tool([
                'align-equal-compiled',
                'ark,t:' + fstf.name,
                'ark:' + arkf.name,
                'ark,t:' + alif.name,
            ])

            _run_kaldi_tool([
                'gmm-acc-stats-ali',
                initf.name,
                'ark:' + arkf.name,
                'ark,t:' + alif.name,
                accf.name,
            ])

            _run_kaldi_tool([
                'gmm-est',
                '--min-gaussian-occupancy=3',
                '--mix-up=' + str(numgauss),
                '--power=' + str(power),
                '--binary=false',
                initf.name,
                accf.name,
                estf.name,
            ])

            in_model = estf.name
            for x in range(0, num_iters):
                logger.info("Training pass %s", x)

                _run_kaldi_tool([
                    'gmm-align-compiled',
                    '--transition-scale=1.0',
                    '--acoustic-scale=0.1',
                    '--self-loop-scale=0.1',
                    '--beam=' + str(beam),
                    '--retry-beam=' + str(beam * 4),
                    '--careful=false',
                    in_model,
                    'ark,t:' + fstf.name,
                    'ark:' + arkf.name,
                    'ark:' + alif.name,
                ])

                _run_kaldi_tool([
                    'gmm-acc-stats-ali',
                    in_model,
                    'ark:' + arkf.name,
                    'ark:' + alif.name,
                    accf.name,
                ])

                with tempfile.NamedTemporaryFile(delete=False,
                                                 suffix='.est') as itf:
                    scratch.append(itf.name)
                    _run_kaldi_tool([
                        'gmm-est',
                        '--binary=false',
                        '--mix-up=' + str(numgauss),
                        '--power=' + str(power),
                        in_model,
                        accf.name,
                        itf.name,
                    ])

                # The model of the previous pass is no longer needed.
                # estf (pass 0 input) is removed by its context manager.
                if x > 0:
                    os.unlink(in_model)
                in_model = itf.name

            # Read while estf still exists: with num_iters == 0 the final
            # model is estf itself.
            with open(in_model) as fp:
                hmmtxt = fp.read()

        return hmmtxt
    finally:
        for path in scratch:
            try:
                os.unlink(path)
            except OSError:
                # Best-effort cleanup: the file was already removed
                # (intermediate models are deleted inside the loop).
                pass
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
<Topology>
<TopologyEntry>
<ForPhones>
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
1 2 3 4
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.75 <Transition> 5 0.25 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>
This diff is collapsed.
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Milos Cernak <milos.cernak@idiap.ch>
# September 1, 2017
'''Tests for Kaldi bindings'''
import pkg_resources
import os.path
import numpy as np
import bob.io.audio
import bob.kaldi
def test_train_mono():
    """Train a small monophone model on the bundled sample utterance.

    MFCC features of ``data/sample16k.wav`` are used as a one-utterance
    training set together with the lexicon FST, the HMM topology and the
    phone sets shipped in ``hmm/``. The returned model text must contain a
    ``TransitionModel`` section.
    """
    sample = pkg_resources.resource_filename(__name__, 'data/sample16k.wav')
    fstfile = pkg_resources.resource_filename(__name__, 'hmm/L.fst')
    topofile = pkg_resources.resource_filename(__name__, 'hmm/topo.txt')
    phfile = pkg_resources.resource_filename(__name__, 'hmm/sets.txt')

    # word labels
    uttid = 'test'
    labels = uttid + ' 27312 27312 27312'

    data = bob.io.audio.reader(sample)
    feats = bob.kaldi.cepstral(data.load()[0], 'mfcc', data.rate,
                               normalization=False)

    train_set = {uttid: feats}

    with open(topofile) as topof:
        topo = topof.read()

    out = bob.kaldi.train_mono(train_set, labels, fstfile, topo,
                               phfile, numgauss=2, num_iters=2)

    # str.find() returns -1 (truthy) when the substring is missing, so it
    # cannot be asserted on directly; test membership instead.
    assert 'TransitionModel' in out
...@@ -149,5 +149,66 @@ but might be used also for the laughter and noise detection as well. ...@@ -149,5 +149,66 @@ but might be used also for the laughter and noise detection as well.
>>> print (ours.shape) >>> print (ours.shape)
(317, 43) (317, 43)
===================
Speech recognition
===================
Speech recognition is a process that generates a text transcript
given speech audio. Most current Automatic Speech Recognition
(ASR) systems use the following pipeline:
.. image:: img/ASR.png
The ASR system first has to be trained. More specifically, its key
statistical models have to be trained:
* Pronunciation model, the lexicon, that associates written and spoken
form of words. The lexicon contains words :math:`W` and defines them
as sequences of phonemes (the speech sounds) :math:`Q`.
* Acoustic model, GMMs or DNNs, that associates the speech features
:math:`O` with the spoken form of words :math:`Q`.
* Language model, usually an n-gram model, that captures the most probable
sequences of words :math:`W` of a particular language.
The transcript of the input audio waveform :math:`X` is then generated
by transformation of :math:`X` to features :math:`O` (for example
cepstral features computed by :py:func:`bob.kaldi.cepstral`), and an
ASR decoder that outputs the most probable transcript :math:`W^*`
using the pre-trained statistical models.
Acoustic models
---------------
The basic acoustic model is called the monophone model, where :math:`Q`
consists just of the phonemes, which are considered contextually
independent. The training of such a model has the following pipeline:
* Model initialization for a given Hidden Markov Model (HMM)
structure, usually 3-state left-to-right model.
* Compilation of training graphs, i.e., Finite State Transducers
(FSTs), one for each training utterance. This requires the lexicon and
the word transcription of the training data.
* First alignment and update stage that produces a transition-model
and GMM objects for equally spaced alignments.
* Iterative alignment and update stage.
.. doctest::
>>> fstfile = pkg_resources.resource_filename('bob.kaldi', 'test/hmm/L.fst')
>>> topofile = pkg_resources.resource_filename('bob.kaldi', 'test/hmm/topo.txt')
>>> phfile = pkg_resources.resource_filename('bob.kaldi', 'test/hmm/sets.txt')
>>> # word labels
>>> uttid='test'
>>> labels = uttid + ' 27312 27312 27312'
>>> train_set={}
>>> train_set[uttid]=feats
>>> topof = open(topofile)
>>> topo = topof.read()
>>> topof.close()
>>> model = bob.kaldi.train_mono(train_set, labels, fstfile, topo, phfile , numgauss=2, num_iters=2)
>>> print (model.find('TransitionModel'))
1
.. include:: links.rst .. include:: links.rst
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment