Commit 2c1de5ad authored by Milos CERNAK

Merge branch 'phone-decoding' into 'master'

Add phone frame decoding

See merge request !10
parents 9b0f5be6 bcd118a0
@@ -14,6 +14,7 @@ from .ivector import plda_enroll
from .ivector import plda_score
from .dnn import nnet_forward
from .dnn import compute_dnn_vad
from .dnn import compute_dnn_phone
from .hmm import train_mono
def get_config():
@@ -149,3 +149,43 @@ def compute_dnn_vad(samples, rate, silence_threshold=0.9, posterior=0):
vad.append(1.0)
return vad
def compute_dnn_phone(samples, rate):
    """Computes phone posteriors of audio samples with a pre-trained DNN.

    Parameters
    ----------
    samples : numpy.ndarray
        The audio samples (1-D array).
    rate : float
        The sampling rate of the input signal in ``samples``.

    Returns
    -------
    list
        The phone posterior matrix (one row per frame) and the
        corresponding phone labels.
    """
    nnetfile = pkg_resources.resource_filename(__name__,
                                               'test/dnn/ami.nnet.txt')
    transfile = pkg_resources.resource_filename(__name__,
                                                'test/dnn/ami.feature_transform.txt')
    labfile = pkg_resources.resource_filename(__name__,
                                              'test/dnn/ami.phones.txt')
    # extract MFCC features from the raw samples
    feats = bob.kaldi.cepstral(samples, 'mfcc', rate,
                               normalization=False)
    with open(nnetfile) as nnetf, \
            open(transfile) as trnf:
        dnn = nnetf.read()
        trn = trnf.read()
    # forward pass through the DNN to obtain per-frame posteriors
    post = bob.kaldi.nnet_forward(feats, dnn, trn)
    # read the phone labels, skipping the first line (<eps>)
    labels = np.genfromtxt(labfile, dtype='str', skip_header=1)
    lab = []
    for l in labels:
        lab.append(l[0])  # keep the phone symbol, drop its index
    return [post, lab]
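
For context, a minimal usage sketch of the new function (the audio file name is a placeholder; any WAV readable by bob.io.audio works). ``post`` has one row per frame and one column per entry of ``lab``, so per-frame phone symbols come from an argmax over the columns:

    import numpy as np
    import bob.io.audio
    import bob.kaldi

    # placeholder path; any mono recording readable by bob.io.audio
    data = bob.io.audio.reader('speech16k.wav')
    post, lab = bob.kaldi.compute_dnn_phone(data.load()[0], data.rate)

    # post: (num_frames, num_phones); lab[i] names the phone of column i
    frame_phones = [lab[i] for i in np.argmax(post, axis=1)]
    print(frame_phones[:10])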
#!/usr/bin/zsh
#
# # Allow setshell
# software=/idiap/resource/software
# source $software/initfiles/shrc $software
# SETSHELL kaldi
out=test/kaldi
if [[ ! -e $out ]]; then
mkdir $out
fi
nnet-forward --feature-transform=dnn/ami.feature_transform \
dnn/ami.nnet \
ark,t:$out/sample16k.cmvn.deltas.ark \
ark,t:$out/sample16k.posteriors.ark
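
As a side note, the posteriors written by ``nnet-forward`` with the ``ark,t:`` specifier are in Kaldi's text-archive format (an utterance key, an opening bracket, one row of floats per frame, and a closing bracket). A rough sketch of reading such a file back into numpy for comparison against ``bob.kaldi.nnet_forward`` output; the helper name is illustrative and not part of the package or of Kaldi:

    import numpy as np

    def read_text_ark_matrix(path):
        """Read the first matrix from a Kaldi text archive ("key [", rows, "]")."""
        rows = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                if line.endswith('['):      # "utt_key  [" opens the matrix
                    continue
                closing = line.endswith(']')
                line = line.rstrip(']').strip()
                if line:
                    rows.append([float(x) for x in line.split()])
                if closing:
                    break
        return np.array(rows)

    # e.g. reference = read_text_ark_matrix('test/kaldi/sample16k.posteriors.ark')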
<eps> 0
sil 1
laughter 2
noise 3
oov 4
AA 5
AE 6
AH 7
AO 8
AW 9
AY 10
B 11
CH 12
D 13
DH 14
EH 15
ER 16
EY 17
F 18
G 19
HH 20
IH 21
IY 22
JH 23
K 24
L 25
M 26
N 27
NG 28
OW 29
OY 30
P 31
R 32
S 33
SH 34
T 35
TH 36
UH 37
UW 38
V 39
W 40
Y 41
Z 42
ZH 43
#0 44
#1 45
#2 46
#3 47
#4 48
#5 49
#6 50
#7 51
#8 52
#9 53
@@ -52,5 +52,15 @@ def test_compute_dnn_vad():
assert np.allclose(ours, theirs)
def test_compute_dnn_phone():

    sample = pkg_resources.resource_filename(__name__, 'data/librivox.wav')
    data = bob.io.audio.reader(sample)

    post, labs = bob.kaldi.compute_dnn_phone(data.load()[0], data.rate)
    mdecoding = np.argmax(post, axis=1)  # max decoding
    # check that the last spoken sound at frame 250 is 'N' (word DOMAIN)
    assert labs[mdecoding[250]] == 'N'
@@ -209,6 +209,24 @@ independent. The training of such model has following pipeline:
>>> model = bob.kaldi.train_mono(train_set, labels, fstfile, topo, phfile , numgauss=2, num_iters=2)
>>> print (model.find('TransitionModel'))
1
Phone frame decoding
--------------------
Simple frame decoding can be done by finding the indices of the
maximum values along the frame axis. The following example performs
a forward pass with a pre-trained phone DNN and finds the :math:`argmax()`
of the output posterior features. Using the DNN labels, the phones
are then decoded per frame.
.. doctest::

   >>> sample = pkg_resources.resource_filename('bob.kaldi', 'test/data/librivox.wav')
   >>> data = bob.io.audio.reader(sample)
   >>> post, labs = bob.kaldi.compute_dnn_phone(data.load()[0], data.rate)
   >>> mdecoding = numpy.argmax(post, axis=1) # max decoding
   >>> print (labs[mdecoding[250]]) # the last spoken sound of the sample is 'N' (of the word DOMAIN)
   N
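
Frame-level decisions can then be collapsed into a phone sequence by
merging runs of identical labels. The snippet below is only a sketch on
top of the example above, not a bob.kaldi API:

.. code-block:: python

   import itertools

   frame_phones = [labs[i] for i in mdecoding]
   # merge consecutive identical frame labels into one phone token
   phone_sequence = [p for p, _ in itertools.groupby(frame_phones)]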
.. include:: links.rst