Amir MOHAMMADI · 2c3117ed
--- a/bob/learn/pytorch/preprocessor/audio/DltResNet.py

+ 24

− 15

View file @ 2c3117ed

Open in Web IDE
+++ b/bob/learn/pytorch/preprocessor/audio/DltResNet.py

+ 24

− 15

View file @ 2c3117ed

Open in Web IDE
 @@ -13,6 +13,8 @@ from bob.extension import rc
 from bob.extension.download import download_file
 import os
 from bob.ap import Spectrogram
+from scipy.signal import resample
+

 class DltResNetExtractor(Preprocessor):
  """ The class for implementing the feature extraction of DltResNet embeddings.
 @@ -25,7 +27,7 @@ class DltResNetExtractor(Preprocessor):
      Use gpu for extracting the embeddings.

  """
-  
+
  def __init__(self, model_file=None, num_classes=1211, bn_dim=128, cuda_flag=0, test=False, use_res=False):
    """ Init method

 @@ -43,11 +45,11 @@ class DltResNetExtractor(Preprocessor):
        Called for unit test (Default: False).
    use_res: bool
        Use ResNet model instead of DltResNet model (Default: False).
-    
+
    """
-        
+
    Preprocessor.__init__(self, min_preprocessed_file_size=bn_dim)
-    
+
    # model
    model_type = "emb"
    if use_res:
 @@ -77,9 +79,9 @@ class DltResNetExtractor(Preprocessor):
        model_file = model_path

      else:
-        # do nothing (used mainly for unit testing) 
+        # do nothing (used mainly for unit testing)
        pass
-    
+
    if test:
      self.network = torch.nn.DataParallel(self.network)
    else:
 @@ -118,8 +120,8 @@ class DltResNetExtractor(Preprocessor):
    Returns
    -------
    feature : 2D :py:class:`numpy.ndarray` (floats)
-      The extracted features as a 1d array of size 128 
-    
+      The extracted features as a 2D array, one embedding of size 128 per row
+
    """

    feats = []
 @@ -139,10 +141,10 @@ class DltResNetExtractor(Preprocessor):
            k = specs[i*L:(i+1)*L,:]
        else:
            k = specs[i*L:,:]
-        
+
        X = (k - k.mean(axis=0)) / (k.std(axis=0) + 1e-7)
        batch.append(X)
-        if len(batch) == batch_size or i == embedding_count - 2 or i == embedding_count - 1: 
+        if len(batch) == batch_size or i == embedding_count - 2 or i == embedding_count - 1:
          with torch.no_grad():
            in_batch = numpy.array(batch)
            if cuda_flag == 1:
 @@ -152,7 +154,7 @@ class DltResNetExtractor(Preprocessor):
            fX = model(input_var).data.cpu().numpy()
            feats.append(fX)
            batch = []
-  
+
    return numpy.vstack(feats)

  def __call__(self, audio, annotations=None):
 @@ -161,22 +163,29 @@ class DltResNetExtractor(Preprocessor):
    Parameters
    ----------
    audio : :py:class:`numpy.ndarray` (floats)
-      The audio file to extract the features from. 
+      The audio file to extract the features from.
    annotations : None
      Apply annotations if needed

    Returns
    -------
    feature : 2D :py:class:`numpy.ndarray` (floats)
-      The extracted features as a 1d array of size 128 
-    
+      The extracted features as a 1d array of size 128
+
    """
    data = audio[1]
    if data.dtype =='int16':
      data = numpy.cast['float'](data)
    if numpy.max(numpy.abs(data)) < 1:
      data = data * 2**15
+
    rate = audio[0]
+    # Resample the audio to 16 kHz so it can work with the model
+    if rate != 16000:
+      samps = round(len(data) * 16000 / rate)  # number of samples after resampling
+      data = resample(data, samps)
+      rate = 16000
+
    win_length_ms = 25
    win_shift_ms = 10
    normalize_mean = True
 @@ -204,4 +213,4 @@ class DltResNetExtractor(Preprocessor):
    """
    f= bob.io.base.HDF5File(data_file)
    feats = f.read("feats")
-    return feats
 \ No newline at end of file
+    return feats