Commit 1020ce81 authored by Amir MOHAMMADI's avatar Amir MOHAMMADI

Merge branch 'voxforge-csv' into 'master'

VoxForge port to CSVDataset

See merge request !49
parents 20026b0a b1f2618b
Pipeline #51902 failed with stages
in 7 minutes and 24 seconds
#!/usr/bin/env python
# @author: Yannick Dayer <>
# @date: Wed 16 Jun 2021 17:20:16 UTC+02
"""VoxForge CSV database interface default configuration
VoxForge is an open speech dataset that was set up to collect transcribed speech for
use with Free and Open Source Speech Recognition Engines.
This database interface uses a subset of the full dataset used for speaker recognition.
The list of data files in the subset is defined by a CSV file for each protocol. Use
the ``bob db download-voxforge`` command to retrieve those data files if needed, and
set the config with the correct path with ``bob config set``.
The protocol definition files are available on the bob data server and are downloaded
automatically (by default into ``bob_data_folder``, which is configurable with ``bob config``).
Feed this file (also defined as a ``voxforge`` resource) to ``bob bio pipelines`` as
$ bob bio pipelines vanilla-biometrics voxforge <pipeline_name> -vv
from import VoxforgeBioDatabase
default_protocol = "Default"
if "protocol" not in locals():
protocol = default_protocol
database = VoxforgeBioDatabase(
......@@ -26,6 +26,7 @@ from .asvspoof2017 import ASVspoof2017BioDatabase
from .avspoof import AVspoofBioDatabase
from .mobio import MobioBioDatabase
from .voicepa import VoicePABioDatabase
from .voxforge import VoxforgeBioDatabase
# gets sphinx autodoc done right - don't remove it
......@@ -51,5 +52,6 @@ __appropriate__(
__all__ = [_ for _ in dir() if not _.startswith("_")]
#!/usr/bin/env python
# @author: Yannick Dayer <>
# @date: Wed 16 Jun 2021 17:21:47 UTC+02
import csv
import logging
from pathlib import Path
import click
from tqdm import tqdm
from import CSVDataset
from import CSVToSampleLoaderBiometrics
from bob.extension import rc
from import download_and_unzip
from import get_file
from import search_file
from bob.extension.scripts.click_helper import verbosity_option
logger = logging.getLogger(__name__)
def get_voxforge_protocol_file():
"""Returns the protocol definition archive, downloading it if necessary.
Looks into ``bob_data_folder``, in the ``datasets`` folder, for the file, and
downloads it from the bob data server if necessary.
proto_def_hash = "dc84ac65"
proto_def_name = f"database-protocols-voxforge-{proto_def_hash}.tar.gz"
proto_def_urls = [
]"Retrieving protocol definition file '{proto_def_name}'.")
return get_file(
def VoxforgeBioDatabase(
protocol="Default", dataset_protocol_path=None, data_path=None, **kwargs
"""Database interface for the VoxForge dataset subset for speaker recognition.
This database interface is meant to be used with the vanilla-biometrics pipeline.
Given a series of CSV files (or downloading them from the bob data server), it
creates the Sample objects for each role needed by the pipeline (enroll, probe),
for different groups (dev, eval).
`protocol definition` files are not the `data` files:
- `protocol definition` files are a list of paths and corresponding reference
name. They are available on the bob data server.
- `data` files are the actual files of the dataset (pointed to by the definition
files). They are not provided by bob.
Although not provided by bob, the VoxForge data is freely available online.
If you don't already have the data, download it and set the bob configuration using
the following commands:
``$ bob db download-voxforge -d your_path_to_data``
``$ bob config set your_path_to_data``
protocol: str
Name of the protocol to use (subfolder in protocol definition).
dataset_protocol_path: str or None
Path to an existing protocol definition folder structure.
If None: will download the definition files to the path pointed by the
``bob_data_folder`` config (see :py:func:``).
data_path: str or None
Path to the data files of VoxForge.
If None: will use the path in the ```` config.
if dataset_protocol_path is None:
dataset_protocol_path = get_voxforge_protocol_file()
if data_path is None:
data_path = rc.get("")
if data_path is None:
"No data path was provided! Either set "
"'' with the 'bob config set' command, or "
"provide a 'data_path' to VoxforgeBioDatabase."
f"Database: Will read the CSV protocol definitions in '{dataset_protocol_path}'."
)"Database: Will read raw data files in '{data_path}'.")
return CSVDataset(
$ bob db download-voxforge ./data/
$ bob db download-voxforge --protocol-definition bio-spear-voxforge.tar ./data/
"A path to a the protocol definition file of VoxForge. "
"If omitted, will use the default protocol definition file at "
help="Download a file even if it already exists locally.",
def download_voxforge(protocol_definition, destination, force_download, **kwargs):
"""Downloads a series of VoxForge data files from their repository and untar them.
The files will be downloaded and saved in the `destination` folder then extracted.
The list of URLs is provided in the protocol definition file of Voxforge.
destination = Path(destination)
if protocol_definition is None:
protocol_definition = get_voxforge_protocol_file()
# Use the `Default` protocol
protocol = "Default"
# Open the list file
list_file = f"{protocol}/data_files_urls.csv"
open_list_file = search_file(protocol_definition, [list_file])
num_files = sum(1 for _ in open_list_file) - 1, 0)"{num_files} files are listed in {list_file}. Downloading...")
csv_list_file = csv.DictReader(open_list_file)
for row in tqdm(csv_list_file, total=num_files):
full_filename = destination / row["filename"]
if force_download or not full_filename.exists():
logger.debug(f"Downloading {row['filename']} from {row['url']}")
download_and_unzip(urls=[row["url"]], filename=full_filename)
logger.debug(f"Downloaded to {full_filename}")"Download of {num_files} files completed.")
......@@ -26,6 +26,8 @@ import
from import check_database
from import check_database_zt
from import db_available
from bob.pipelines import DelayedSample
from bob.pipelines import SampleSet
......@@ -143,3 +145,29 @@ def test_timit():
"The database could not queried; probably the db.sql3 file is missing. Here is the error: '%s'"
% e
def test_voxforge():
database =
"voxforge", "database", preferred_package=""
dev_ref = database.references(group="dev")
eval_ref = database.references(group="eval")
dev_pro = database.probes(group="dev")
eval_pro = database.probes(group="eval")
train = database.background_model_samples()
assert len(dev_ref) == 10, len(dev_ref)
assert all(isinstance(s, SampleSet) for s in dev_ref)
assert all(isinstance(s, DelayedSample) for s in dev_ref[0])
assert len(dev_pro) == 300, len(dev_pro)
assert all(len(s) == 1 for s in dev_pro)
assert all(isinstance(s[0], DelayedSample) for s in dev_pro)
assert len(eval_ref) == 10, len(eval_ref)
assert all(isinstance(s, SampleSet) for s in eval_ref)
assert all(isinstance(s, DelayedSample) for s in eval_ref[0])
assert len(eval_pro) == 300, len(eval_pro)
assert all(len(s) == 1 for s in eval_pro)
assert all(isinstance(s[0], DelayedSample) for s in eval_pro)
assert len(train) == 3148, len(train)
......@@ -40,6 +40,7 @@ requirements:
- python
- setuptools
- matplotlib
- tqdm
......@@ -14,5 +14,6 @@ bob.db.base
matplotlib # for plotting
scikit-learn # for pipelines Transformers
......@@ -95,17 +95,18 @@ setup(
# the version of bob.
"": [
"timit =",
"mobio-audio-male =",
"mobio-audio-female =",
"avspoof-licit =",
"avspoof-spoof =",
"asvspoof-licit =",
"asvspoof-spoof =",
"voicepa-licit =",
"voicepa-spoof =",
"asvspoof2017-licit =",
"asvspoof2017-spoof =",
"timit =",
"mobio-audio-male =",
"mobio-audio-female =",
"avspoof-licit =",
"avspoof-spoof =",
"asvspoof-licit =",
"asvspoof-spoof =",
"voicepa-licit =",
"voicepa-spoof =",
"asvspoof2017-licit =",
"asvspoof2017-spoof =",
"voxforge =",
"": [
"cqcc20p =", # Empty preprocessor for CQCC features
......@@ -156,6 +157,9 @@ setup(
"": [
"modest =",
"bob.db.cli": [
"download-voxforge =",
# Classifiers are important if you plan to distribute this package through
# PyPI. You can find the complete list of classifiers that are valid and
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment