Commit 2c1de5ad authored by Milos CERNAK's avatar Milos CERNAK

Merge branch 'phone-decoding' into 'master'

Add phone frame decoding

See merge request !10
parents 9b0f5be6 bcd118a0
Pipeline #12982 passed with stages
in 6 minutes and 17 seconds
......@@ -14,6 +14,7 @@ from .ivector import plda_enroll
from .ivector import plda_score
from .dnn import nnet_forward
from .dnn import compute_dnn_vad
from .dnn import compute_dnn_phone
from .hmm import train_mono
def get_config():
......
......@@ -149,3 +149,43 @@ def compute_dnn_vad(samples, rate, silence_threshold=0.9, posterior=0):
vad.append(1.0)
return vad
def compute_dnn_phone(samples, rate):
    """Computes per-frame phone posteriors for an audio signal.

    MFCC features are extracted from ``samples`` and forwarded through
    the pre-trained AMI phone DNN shipped with the package.

    Parameters
    ----------
    samples : numpy.ndarray
        The raw audio signal to decode.
    rate : float
        The sampling rate of the input signal in ``samples``.

    Returns
    -------
    list
        A two-element list ``[post, lab]`` where ``post`` is a 2-D
        numpy array of per-frame phone posteriors and ``lab`` is the
        list of phone symbols indexing its columns.
    """
    # Pre-trained AMI models and phone symbol table bundled with the package.
    nnetfile = pkg_resources.resource_filename(__name__,
                                               'test/dnn/ami.nnet.txt')
    transfile = pkg_resources.resource_filename(__name__,
                                                'test/dnn/ami.feature_transform.txt')
    labfile = pkg_resources.resource_filename(__name__,
                                              'test/dnn/ami.phones.txt')

    feats = bob.kaldi.cepstral(samples, 'mfcc', rate,
                               normalization=False)
    with open(nnetfile) as nnetf, \
            open(transfile) as trnf:
        dnn = nnetf.read()
        trn = trnf.read()
    post = bob.kaldi.nnet_forward(feats, dnn, trn)

    # was: `labels = a=np.genfromtxt(...)` — the chained `a=` was a typo
    # leaking an unused local; the first column of each row is the symbol.
    labels = np.genfromtxt(labfile, dtype='str', skip_header=1)
    lab = [row[0] for row in labels]

    return [post, lab]
#!/usr/bin/zsh
#
# Run a forward pass of the pre-trained AMI phone DNN over the sample
# feature archive, writing per-frame posteriors as a Kaldi text archive.
#
# # Allow setshell
# software=/idiap/resource/software
# source $software/initfiles/shrc $software
# SETSHELL kaldi

out=test/kaldi

# Create the output directory on first use.
[[ -e $out ]] || mkdir $out

nnet-forward --feature-transform=dnn/ami.feature_transform \
  dnn/ami.nnet \
  ark,t:$out/sample16k.cmvn.deltas.ark \
  ark,t:$out/sample16k.posteriors.ark
<eps> 0
sil 1
laughter 2
noise 3
oov 4
AA 5
AE 6
AH 7
AO 8
AW 9
AY 10
B 11
CH 12
D 13
DH 14
EH 15
ER 16
EY 17
F 18
G 19
HH 20
IH 21
IY 22
JH 23
K 24
L 25
M 26
N 27
NG 28
OW 29
OY 30
P 31
R 32
S 33
SH 34
T 35
TH 36
UH 37
UW 38
V 39
W 40
Y 41
Z 42
ZH 43
#0 44
#1 45
#2 46
#3 47
#4 48
#5 49
#6 50
#7 51
#8 52
#9 53
......@@ -52,5 +52,15 @@ def test_compute_dnn_vad():
assert np.allclose(ours, theirs)
def test_compute_dnn_phone():
    """Checks frame-level phone decoding on the bundled sample."""

    sample = pkg_resources.resource_filename(__name__, 'data/librivox.wav')
    reader = bob.io.audio.reader(sample)

    posteriors, labels = bob.kaldi.compute_dnn_phone(reader.load()[0],
                                                     reader.rate)
    # Frame-wise max decoding of the posterior matrix.
    decoded = np.argmax(posteriors, axis=1)
    # The last spoken sound at frame 250 is 'N' (from the word DOMAIN).
    assert labels[decoded[250]] == 'N'
......@@ -210,5 +210,23 @@ independent. The training of such model has following pipeline:
>>> print (model.find('TransitionModel'))
1
Phone frame decoding
--------------------
Simple frame decoding can be done by finding the indices of the
maximum values along the frame axis. The following example performs
a forward pass with a pre-trained phone DNN, and finds :math:`argmax()`
of the output posterior features. Looking at the DNN labels, the
phones are decoded per frame.
.. doctest::
>>> sample = pkg_resources.resource_filename('bob.kaldi', 'test/data/librivox.wav')
>>> data = bob.io.audio.reader(sample)
>>> post, labs = bob.kaldi.compute_dnn_phone(data.load()[0], data.rate)
>>> mdecoding = numpy.argmax(post,axis=1) # max decoding
>>> print (labs[mdecoding[250]]) # the last spoken sound of sample is N (of the word DOMAIN)
N
.. include:: links.rst
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment