Commit ceaca41f authored by Yannick DAYER's avatar Yannick DAYER

VoxForge data download as separate bob db command.

parent db13ac7f
Pipeline #51739 failed with stage
in 8 minutes and 16 seconds
......@@ -7,14 +7,22 @@
VoxForge is an open speech dataset that was set up to collect transcribed speech for
use with Free and Open Source Speech Recognition Engines. (http://www.voxforge.org/)
This database interface contains a subset used for speaker recognition defined by
default in a set of CSV files available at
https://www.idiap.ch/software/bob/data/bob/bob.bio.spear/
This database interface uses a subset of the full dataset used for speaker recognition.
The list of data files in a subset is defined by a CSV file for each protocol. Use the
``bob db download-voxforge`` command to retrieve those data files if needed.
Feed this file (also defined as resource: ``voxforge``) to ``bob bio pipelines`` as
The protocol definition files are available at
https://www.idiap.ch/software/bob/data/bob/bob.bio.spear/ and downloaded automatically
by default (into ``bob_data_folder`` which is configurable with ``bob config``).
Usage
-----
Feed this file (also defined as a ``voxforge`` resource) to ``bob bio pipelines`` as
configuration:
$ bob bio pipelines vanilla-biometrics -v voxforge <pipeline_name>
$ bob bio pipelines vanilla-biometrics voxforge <pipeline_name> -vv
"""
from bob.bio.spear.database import VoxforgeBioDatabase
......@@ -27,5 +35,5 @@ if "protocol" not in locals():
database = VoxforgeBioDatabase(
protocol=protocol,
dataset_protocol_path=None, # Get from config, or download the protocol definitions
data_path=None, # Get from config, or download the data from VoxForge
data_path=None, # Get from config
)
......@@ -5,58 +5,32 @@
import logging
import os
from tqdm import tqdm
from bob.bio.base.database import CSVDataset
from bob.extension import rc
from bob.extension.download import get_file
from bob.extension.download import search_file
logger = logging.getLogger(__name__)
def download_voxforge_data(list_file: str):
"""Downloads a series of VoxForge data files from their repository and untar them.
The files will be retrieved by :py:func:`bob.extension.download.get_file` and saved
in the ``data/voxforge`` subdirectory of `bob_data_folder`.
Parameters
----------
def get_voxforge_protocol_file():
"""Returns the protocol definition archive, downloading it if necessary.
list_file: str
A path to a text file with one line per file to download.
Looks for the file in the ``datasets`` subfolder of ``bob_data_folder``, and
downloads it if it is not already there.
"""
if ":" in list_file:
tar_base, in_file = list_file.split(":", maxsplit=1)
list_file = search_file(tar_base, [in_file]) # Returns an open file
else:
list_file = open(list_file, "r")
voxforge_repo = "http://www.repository.voxforge1.org"
base_url = f"{voxforge_repo}/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit"
num_files = sum(1 for _ in list_file)
list_file.seek(0, 0)
if num_files > 20:
logger.warning(f"Downloading {num_files} will take some time.")
logger.info(
f"{num_files} files are listed in {list_file}. Downloading from {base_url}..."
proto_def_hash = "dc84ac65"
proto_def_name = f"bio-spear-voxforge-{proto_def_hash}.tar.gz"
proto_def_urls = [
f"https://www.idiap.ch/software/bob/data/bob/bob.bio.spear/{proto_def_name}",
f"http://www.idiap.ch/software/bob/data/bob/bob.bio.spear/{proto_def_name}",
]
logger.info(f"Retrieving protocol definition file '{proto_def_name}'.")
return get_file(
filename=proto_def_name,
urls=proto_def_urls,
file_hash=proto_def_hash,
cache_subdir="datasets",
)
for line in tqdm(list_file, total=num_files):
tar_file = line.strip()
file_name = os.path.basename(tar_file)
data_file_url = f"{base_url}/{tar_file}"
logger.debug(f"Downloading {file_name} from {data_file_url}")
final_file = get_file(
filename=file_name,
urls=[data_file_url],
cache_subdir=os.path.join("data", "voxforge"),
extract=True,
force=False,
)
logger.debug(f"Downloaded to {final_file}")
logger.info(f"Download of {num_files} completed.")
list_file.close()
class VoxforgeBioDatabase(CSVDataset):
......@@ -101,18 +75,7 @@ class VoxforgeBioDatabase(CSVDataset):
def __init__(self, protocol, dataset_protocol_path=None, data_path=None, **kwargs):
if dataset_protocol_path is None:
proto_def_hash = "xxxxxxxx" # TODO bob dav
proto_def_name = f"bio-spear-voxforge-{proto_def_hash}.tar.gz"
proto_def_urls = [
f"https://www.idiap.ch/software/bob/data/bob/bob.bio.spear/{proto_def_name}",
f"http://www.idiap.ch/software/bob/data/bob/bob.bio.spear/{proto_def_name}",
]
dataset_protocol_path = get_file(
filename=proto_def_name,
urls=proto_def_urls,
file_hash=proto_def_hash,
cache_subdir="datasets",
)
dataset_protocol_path = get_voxforge_protocol_file()
if data_path is None:
data_path = rc.get("bob.db.voxforge.directory", None)
......@@ -122,9 +85,6 @@ class VoxforgeBioDatabase(CSVDataset):
"bob_data_folder", os.path.join(os.path.expanduser("~"), "bob_data")
)
data_path = os.path.join(bob_data_path, "data")
download_voxforge_data(
f"{dataset_protocol_path}:{protocol}/list_of_data_files.lst"
)
logger.info(
f"Database: Will read the CSV protocol definitions in '{dataset_protocol_path}'."
......
#!/usr/bin/env python
# @author: Yannick Dayer <yannick.dayer@idiap.ch>
# Tue 22 Jun 2021 14:53:03 UTC+02
import csv
import logging
from pathlib import Path
import click
from tqdm import tqdm
from bob.bio.spear.database.voxforge import get_voxforge_protocol_file
from bob.extension.download import download_and_unzip
from bob.extension.download import search_file
from bob.extension.scripts.click_helper import verbosity_option
logger = logging.getLogger(__name__)
@click.command(
    epilog="""Examples:

\b
    $ bob db download-voxforge --list-file my_urls.csv --destination my_datasets/data/

The file list can be in a tar archive:

\b
    $ bob db download-voxforge --list-file voxforge.tar.gz:Default/data_files_urls.csv
""",
)
@click.option(
    "--list-file",
    "-l",
    default=None,
    help=(
        "A path to a text file with one line per file to download. "
        "Can be in a tar file: use a ``:`` to point inside the archive. "
        "If ``--list-file`` is omitted, will look for the protocol definition file in "
        "``bob_data_folder``, and download the file from "
        "`https://www.idiap.ch/software/bob/data/bob/bob.bio.spear` if needed."
    ),
)
@click.option(
    "--destination",
    "-d",
    default=None,
    help=(
        "Where to store the downloaded data files. "
        "If omitted, will download to the bob_data/data folder."
    ),
)
@click.option(
    "--force-download",
    "-f",
    is_flag=True,
    help=("Download a file even if it already exists locally."),
)
@verbosity_option()
def download_voxforge(list_file, destination, force_download, verbose, **kwargs):
    """Downloads a series of VoxForge data files from their repository and untar them.

    The files will be downloaded and saved in the `destination` folder then extracted.

    A list of files is required in the form of a csv file with a ``url`` column as well
    as a ``filename`` column that specifies the local name of the file. If the csv file
    is not specified as ``--list-file`` option, it will be looked up in
    ``bob_data_folder`` (see
    :py:func:`bob.bio.spear.database.voxforge.get_voxforge_protocol_file`).

    Parameters
    ----------
    list_file: str or None
        Path to the csv list, optionally ``<archive>:<member>`` to read it from inside
        a tar archive. ``None`` selects ``Default/data_files_urls.csv`` in the protocol
        definition archive.
    destination: str or None
        Folder receiving the downloaded files. ``None`` selects the ``data`` subfolder
        of the configured ``bob_data_folder`` (``~/bob_data`` when not configured).
    force_download: bool
        When set, files are downloaded again even if they already exist locally.
    verbose: int
        Verbosity count provided by ``verbosity_option``; not used directly here.
        NOTE(review): presumably the decorator configures the loggers -- confirm.
    """
    # Honour the documented default: without --destination, fall back to the
    # "data" subfolder of bob_data_folder instead of crashing on Path(None).
    if destination is None:
        from bob.extension import rc

        bob_data_folder = rc.get("bob_data_folder", str(Path.home() / "bob_data"))
        destination = Path(bob_data_folder) / "data"
    else:
        destination = Path(destination)
    # parents=True so a nested, not yet existing destination can be created.
    destination.mkdir(parents=True, exist_ok=True)

    # Defaults to list in protocol definition
    if list_file is None:
        protocol_file = get_voxforge_protocol_file()
        list_file = f"{protocol_file}:Default/data_files_urls.csv"

    # Open the list file
    if ":" in list_file:
        tar_base, in_file = list_file.split(":", maxsplit=1)
        open_list_file = search_file(tar_base, [in_file])  # Returns an open file
        # NOTE(review): search_file presumably returns None when the member is not
        # found -- confirm against bob.extension. Fail with a clear message then.
        if open_list_file is None:
            raise click.ClickException(
                f"Could not find '{in_file}' in '{tar_base}'."
            )
    else:
        open_list_file = open(list_file, "r")

    # try/finally guarantees the list file is closed even when a download fails.
    try:
        # One line is the csv header; never report a negative count on an empty file.
        num_files = max(sum(1 for _ in open_list_file) - 1, 0)
        open_list_file.seek(0, 0)
        logger.info(f"{num_files} files are listed in {list_file}. Downloading...")

        csv_list_file = csv.DictReader(open_list_file)

        for row in tqdm(csv_list_file, total=num_files):
            full_filename = destination / row["filename"]
            if force_download or not full_filename.exists():
                logger.debug(f"Downloading {row['filename']} from {row['url']}")
                download_and_unzip(urls=[row["url"]], filename=full_filename)
                logger.debug(f"Downloaded to {full_filename}")
        logger.info(f"Download of {num_files} completed.")
    finally:
        open_list_file.close()
......@@ -157,6 +157,9 @@ setup(
"bob.bio.grid": [
"modest = bob.bio.spear.config.grid.modest:grid",
],
"bob.db.cli": [
"download-voxforge = bob.bio.spear.script.db_download:download_voxforge",
],
},
# Classifiers are important if you plan to distribute this package through
# PyPI. You can find the complete list of classifiers that are valid and
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment