Commit 2d399621 authored by André Anjos

Merge branch 'update_avspoof' into 'master'

Version 2 of AVspoof database

Created files for version 2 of the AVspoof database. A new set of protocols was added that allows using the train set in verification evaluations.

See merge request !6
parents 2d0c1a16 5ed80abb
###############################################################################
# #
# Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ #
# Contact: beat.support@idiap.ch #
# #
# This file is part of the beat.examples module of the BEAT platform. #
# #
# Commercial License Usage #
# Licensees holding valid commercial BEAT licenses may use this file in #
# accordance with the terms contained in a written agreement between you #
# and Idiap. For further information contact tto@idiap.ch #
# #
# Alternatively, this file may be used under the terms of the GNU Affero #
# Public License version 3 as published by the Free Software and appearing #
# in the file LICENSE.AGPL included in the packaging of this file. #
# The BEAT platform is distributed in the hope that it will be useful, but #
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
# or FITNESS FOR A PARTICULAR PURPOSE. #
# #
# You should have received a copy of the GNU Affero Public License along #
# with the BEAT platform. If not, see http://www.gnu.org/licenses/. #
# #
###############################################################################
import numpy
import bob.io.base
import bob.db.avspoof
import scipy.io.wavfile
from bob.db.avspoof.driver import Interface
INFO = Interface()
SQLITE_FILE = INFO.files()[0]
class RecognitionTraining:

    def setup(self, root_folder, outputs, parameters, force_start_index=None, force_end_index=None):
        self.root_folder = root_folder
        self.outputs = outputs
        self.parameters = parameters

        # verification.Database implements the bob.db.verification.utils interface
        self.db = bob.db.avspoof.verification.Database(original_directory=self.root_folder, original_extension='.wav')

        # retrieve all file objects for the given parameters, sorted by id
        self.objs = sorted(self.db.objects(protocol=parameters['protocol'], groups=parameters['group']), key=lambda x: x.id)

        self.next_index = 0
        self.force_start_index = force_start_index
        self.force_end_index = force_end_index

        # Retrieve only 'useful' data, i.e. ignore everything before
        # force_start_index and after force_end_index

        ### Start index
        if self.force_start_index is not None:
            self.next_index = self.force_start_index

        ### End index
        if self.force_end_index is not None:
            self.objs = self.objs[:self.force_end_index + 1]

        return True

    def done(self):
        return (self.next_index >= len(self.objs))

    def next(self):
        obj = self.objs[self.next_index]

        if self.outputs['file_id'].isConnected():
            self.outputs['file_id'].write({'text': obj.id}, self.next_index)

        if self.outputs['client_id'].isConnected():
            self.outputs['client_id'].write({'text': obj.client_id}, self.next_index)

        # read audio data
        if self.outputs['speech'].isConnected():
            rate, audio = scipy.io.wavfile.read(obj.make_path(self.root_folder, self.db.original_extension))
            data = {
                'value': numpy.cast['float'](audio)
            }
            self.outputs['speech'].write(data, self.next_index)

        self.next_index += 1
        return True
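
# ---------------------------------------------------------------------------
# Minimal sketch of how a view like the one above is driven through its
# setup()/done()/next() cycle. The stub output class, the root folder and the
# 'protocol'/'group' values below are hypothetical, for illustration only; on
# the platform, outputs are provided by the BEAT runtime.

class _StubOutput:
    """Collects written data in memory instead of streaming it."""
    def __init__(self):
        self.data = []
    def isConnected(self):
        return True
    def write(self, data, end_index):
        self.data.append((end_index, data))

def _demo_recognition_training(root_folder='/path/to/avspoof'):
    outputs = dict((name, _StubOutput()) for name in ('file_id', 'client_id', 'speech'))
    view = RecognitionTraining()
    # parameter values are illustrative
    view.setup(root_folder, outputs, {'protocol': 'grandtest', 'group': 'train'},
               force_start_index=0, force_end_index=9)  # first 10 samples only
    while not view.done():
        view.next()
    return outputs['file_id'].data  # 10 (index, datum) pairs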
class RecognitionTemplates:
    # Reasoning: Each client may have a number of models in certain databases.
    # So, each model receives a unique identifier. Those identifiers are linked
    # to the client identifier and contain a number of samples to generate the
    # model from.

    def setup(self, root_folder, outputs, parameters, force_start_index=None, force_end_index=None):
        self.root_folder = root_folder
        self.outputs = outputs
        self.parameters = parameters

        # verification.Database implements the bob.db.verification.utils interface
        self.db = bob.db.avspoof.verification.Database(original_directory=self.root_folder, original_extension='.wav')

        # template ids are model ids (which again are just ids of clients) in the terminology of the bob.db interfaces
        self.template_ids = sorted(self.db.model_ids(groups=parameters['group']))

        # store the File objects for each template
        self.objs = []
        for template_id in self.template_ids:
            self.objs.append(sorted(self.db.objects(protocol=parameters['protocol'], purposes=parameters['purpose'], groups=parameters['group'], model_ids=(template_id,)), key=lambda x: x.id))

        self.current_template_index = 0
        self.current_obj_index = 0
        self.next_index = 0
        self.force_start_index = force_start_index
        self.force_end_index = force_end_index

        # Retrieve only 'useful' data

        ### End index: enforced in done() instead of truncating self.objs here,
        ### since templates are nested lists of files

        ### Start index: skip forward without writing anything
        if self.force_start_index is None:
            self.force_start_index = 0
        while self.next_index < self.force_start_index:
            self.next()

        return True

    def done(self):
        return (self.current_template_index >= len(self.template_ids)) or \
               (self.force_end_index is not None and self.force_end_index < self.next_index)

    def next(self):
        # the number of File objects in the current template
        len_template_objs = len(self.objs[self.current_template_index])
        # current file object that we want to read and output
        obj = self.objs[self.current_template_index][self.current_obj_index]

        if self.current_obj_index == 0:
            # we update 'template_id' and 'client_id' only when the current template/model changes
            if self.force_start_index <= self.next_index and (self.force_end_index is None or self.force_end_index >= self.next_index):
                # For this database, 'self.template_ids[self.current_template_index]' corresponds to 'obj.client_id'
                if self.outputs['template_id'].isConnected():
                    self.outputs['template_id'].write({'text': self.template_ids[self.current_template_index]}, self.next_index + len_template_objs - 1)
                if self.outputs['client_id'].isConnected():
                    self.outputs['client_id'].write({'text': obj.client_id}, self.next_index + len_template_objs - 1)

        if self.outputs['file_id'].isConnected() or self.outputs['speech'].isConnected():
            if self.force_start_index <= self.next_index and (self.force_end_index is None or self.force_end_index >= self.next_index):
                if self.outputs['file_id'].isConnected():
                    self.outputs['file_id'].write({'text': obj.id}, self.next_index)
                if self.outputs['speech'].isConnected():
                    rate, audio = scipy.io.wavfile.read(obj.make_path(self.root_folder, self.db.original_extension))
                    data = {
                        'value': numpy.cast['float'](audio)
                    }
                    self.outputs['speech'].write(data, self.next_index)
            self.next_index += 1
            self.current_obj_index += 1
        else:
            self.next_index += len_template_objs
            self.current_obj_index = len_template_objs

        if self.current_obj_index == len_template_objs:
            self.current_obj_index = 0
            self.current_template_index += 1

        return True
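
# ---------------------------------------------------------------------------
# Note on the write-index bookkeeping in RecognitionTemplates.next() above,
# assuming the BEAT convention that the index passed to write() is the last
# sample index the datum covers. A hypothetical worked example:
def _template_write_indices(next_index, len_template_objs):
    # e.g. with next_index=6 and a template of 3 files, the files are written
    # at indices [6, 7, 8], while template_id/client_id are written once at
    # 6 + 3 - 1 = 8, the end index of the block they cover
    file_indices = [next_index + i for i in range(len_template_objs)]
    template_end_index = next_index + len_template_objs - 1
    return file_indices, template_end_index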
class ProbesReal:
    # Reasoning: Each client may have a number of probes. Each probe may be
    # composed of any number of samples. So, each probe receives a unique
    # identifier. Those identifiers are linked to the client identifier and
    # contain a number of samples to generate the match/comparison score. Each
    # probe must be matched against a number of models defined by a list of
    # client identifiers.

    def setup(self, root_folder, outputs, parameters, force_start_index=None, force_end_index=None):
        self.root_folder = root_folder
        self.outputs = outputs
        self.parameters = parameters

        # verification.Database implements the bob.db.verification.utils interface
        self.db = bob.db.avspoof.verification.Database(original_directory=self.root_folder, original_extension='.wav')

        # template ids are model ids (which again are just ids of clients) in the terminology of the bob.db interfaces
        self.template_ids = sorted(self.db.model_ids(groups=parameters['group']))

        self.objs = sorted(self.db.objects(protocol=parameters['protocol'],
                                           purposes=parameters['purpose'],
                                           groups=parameters['group']), key=lambda x: x.id)

        self.next_index = 0
        self.force_start_index = force_start_index
        self.force_end_index = force_end_index

        # Retrieve only 'useful' data

        ### Start index
        if self.force_start_index is not None:
            self.next_index = self.force_start_index

        ### End index
        if self.force_end_index is not None and self.force_end_index < len(self.objs):
            self.objs = self.objs[:self.force_end_index + 1]

        return True

    def done(self):
        return (self.next_index >= len(self.objs))

    def next(self):
        obj = self.objs[self.next_index]

        if self.outputs['file_id'].isConnected():
            self.outputs['file_id'].write({'text': obj.id}, self.next_index)

        if self.outputs['probe_id'].isConnected():
            self.outputs['probe_id'].write({'text': obj.id}, self.next_index)

        if self.outputs['client_id'].isConnected():
            self.outputs['client_id'].write({'text': obj.client_id}, self.next_index)

        if self.outputs['template_ids'].isConnected():
            # each probe is compared against all templates of the group
            data = {
                'text': numpy.array(self.template_ids, dtype=numpy.string_)
            }
            self.outputs['template_ids'].write(data, self.next_index)

        # read audio data
        if self.outputs['speech'].isConnected():
            rate, audio = scipy.io.wavfile.read(obj.make_path(self.root_folder, self.db.original_extension))
            data = {
                'value': numpy.cast['float'](audio)
            }
            self.outputs['speech'].write(data, self.next_index)

        self.next_index += 1
        return True
class SimpleAntispoofing:

    def setup(self, root_folder, outputs, parameters, force_start_index=None, force_end_index=None):
        self.root_folder = root_folder
        self.outputs = outputs
        self.parameters = parameters

        # Ideally, we would use bob.db.avspoof.spoofing.Database(), which
        # implements the antispoofing.utils.db interface, but that interface
        # does not support ids for file objects (see antispoofing.utils.File).
        # That is a problem here, since we need to sort files by their ids and
        # also to yield the id of each file. Hence, we access the database
        # directly.
        self.db = bob.db.avspoof.Database()

        # retrieve all file objects for the given parameters, sorted by id
        self.objs_real = sorted(self.db.objects(protocol=parameters['protocol'], groups=parameters['group'], cls='real'), key=lambda x: x.id)
        self.objs_attack = sorted(self.db.objects(protocol=parameters['protocol'], groups=parameters['group'], cls='attack'), key=lambda x: x.id)

        # we have two sets of data; to serve them sequentially, we need to know
        # their sizes and where the second set starts, hence this approach with
        # offsets
        self.offset_real = 0
        self.offset_attack = self.offset_real + len(self.objs_real)

        self.next_index = 0
        self.force_start_index = force_start_index
        self.force_end_index = force_end_index

        # Retrieve only 'useful' data, i.e. ignore everything before
        # force_start_index and after force_end_index

        ### Start index
        if self.force_start_index is not None:
            self.next_index = self.force_start_index

        ### End index
        if self.force_end_index is not None:
            if self.force_end_index < self.offset_attack:
                # the end index falls within the real objects, so no attack
                # samples are served at all
                self.objs_real = self.objs_real[:self.force_end_index + 1]
                self.objs_attack = []
            elif self.force_end_index < self.offset_attack + len(self.objs_attack) - 1:
                self.objs_attack = self.objs_attack[:self.force_end_index + 1 - self.offset_attack]

        return True

    def done(self):
        return (self.next_index >= len(self.objs_real) + len(self.objs_attack))

    def next(self):
        # if we are inside the set of attacks, we yield an attack object
        if self.next_index >= self.offset_attack:
            obj = self.objs_attack[self.next_index - self.offset_attack]
            cls = 'attack'
            # get the attack and compose the type of the attack
            attack = obj.get_attack()
            attack_type = attack.attack_support + attack.attack_device
        else:
            obj = self.objs_real[self.next_index - self.offset_real]
            cls = 'real'
            attack_type = 'human'

        if self.outputs['file_id'].isConnected():
            self.outputs['file_id'].write({'text': obj.id}, self.next_index)

        if self.outputs['client_id'].isConnected():
            self.outputs['client_id'].write({'text': obj.client_id}, self.next_index)

        # read audio data
        if self.outputs['speech'].isConnected():
            rate, audio = scipy.io.wavfile.read(obj.make_path(directory=self.root_folder, extension='.wav'))
            data = {
                'value': numpy.cast['float'](audio)
            }
            self.outputs['speech'].write(data, self.next_index)

        if self.outputs['attack_type'].isConnected():
            self.outputs['attack_type'].write({'text': attack_type}, self.next_index)

        if self.outputs['class'].isConnected():
            self.outputs['class'].write({'text': cls}, self.next_index)

        self.next_index += 1
        return True
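
# ---------------------------------------------------------------------------
# Standalone illustration of the audio-reading step shared by all the views
# above; the wav path is hypothetical.
def _read_speech(path='/path/to/sample.wav'):
    rate, audio = scipy.io.wavfile.read(path)  # sampling rate in Hz, integer samples
    # same float cast the views apply before writing to the 'speech' output
    # (equivalent to audio.astype(float))
    return rate, numpy.cast['float'](audio)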
.. Copyright (c) 2016 Idiap Research Institute, http://www.idiap.ch/ ..
.. Contact: beat.support@idiap.ch ..
.. ..
.. This file is part of the beat.examples module of the BEAT platform. ..
.. ..
.. Commercial License Usage ..
.. Licensees holding valid commercial BEAT licenses may use this file in ..
.. accordance with the terms contained in a written agreement between you ..
.. and Idiap. For further information contact tto@idiap.ch ..
.. ..
.. Alternatively, this file may be used under the terms of the GNU Affero ..
.. Public License version 3 as published by the Free Software and appearing ..
.. in the file LICENSE.AGPL included in the packaging of this file. ..
.. The BEAT platform is distributed in the hope that it will be useful, but ..
.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ..
.. or FITNESS FOR A PARTICULAR PURPOSE. ..
.. ..
.. You should have received a copy of the GNU Affero Public License along ..
.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. ..
The AVspoof Database
--------------------
Changelog
=========
* **Version 2**, 24/Mar/2016:

  - Added ``_verify_train`` protocols that allow using the train set in
    verification evaluations.

* **Version 1**, 09/Feb/2016:

  - Initial release
Description
===========
The `AVspoof Database <https://www.idiap.ch/dataset/avspoof>`_ provides
non-biased spoofing attacks so that researchers can test both their ASV
systems and their anti-spoofing algorithms. The attacks were created from
newly acquired audio recordings. The data acquisition process lasted
approximately two months and involved 44 participants, each taking part in
several sessions configured in different environmental conditions and setups.
After the collection of the data, the attacks (more precisely: replay, voice
conversion and speech synthesis attacks) were generated. This database was
produced at the Idiap Research Institute, in Switzerland.
Acknowledgements
================
If you use this database, please cite the following publication in your paper::
  @INPROCEEDINGS{KucurErgunay_IEEEBTAS_2015,
    author = {Kucur Ergunay, Serife and Khoury, Elie and Lazaridis, Alexandros and Marcel, S{\'{e}}bastien},
    projects = {Idiap, SNSF-LOBI, BEAT},
    month = sep,
    title = {On the Vulnerability of Speaker Verification to Realistic Voice Spoofing},
    booktitle = {IEEE International Conference on Biometrics: Theory, Applications and Systems},
    year = {2015},
    pdf = {http://publications.idiap.ch/downloads/papers/2015/KucurErgunay_IEEEBTAS_2015.pdf}
  }
Database Description
====================
The data acquisition process is divided into four sessions, each scheduled
several days apart in different setups and environmental conditions (e.g.
different in terms of background noise, reverberation, etc.) for each of the
31 male and 13 female participants. The first session, intended to serve as
the training set when creating the attacks, was performed in the most
controlled conditions. In contrast, the conditions for the last three
sessions, dedicated to test trials, were more relaxed in order to capture
more challenging scenarios. The audio data were recorded by three different
devices: (a) one good-quality microphone, AT2020USB+, (b) a Samsung Galaxy S4
(phone1) and (c) an iPhone 3GS (phone2). The positioning of the devices was
kept fixed for each session and each participant in order to standardize the
recording settings.
For each session, the participant was subjected to three different data acquisition protocols, as follows:
* **Reading part (read)**: 10/40 pre-defined sentences are read by the participant.
* **Pass-phrases part (pass)**: 5 short prompts are read by the participant.
* **Free speech part (free)**: The participant speaks freely about any topic for 3 to 10 minutes.
The number, the length and the content of the sentences for the reading and
pass-phrases parts were carefully selected to satisfy constraints on
readability, data acquisition and attack quality. Similarly, the minimum
duration of the free speech part was determined according to our preliminary
investigations, mostly on the voice conversion attacks, for which the free
speech data would be included in the training set.
Spoofing Attacks
================
In the spoofing attack creation phase, we considered creating spoofing trials
for the text-dependent utterances of the test data, i.e. the reading parts of
sessions 2-4 and the pass-phrases of all four sessions. As a preliminary step
before the creation of the attacks, the speech data, originally recorded at a
44.1 kHz sampling rate, was down-sampled to 16 kHz.
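For illustration, such a 44.1 kHz to 16 kHz down-sampling can be reproduced
with a polyphase resampler like the one below (a sketch only: the database
already ships down-sampled data, the file names are hypothetical, and the
exact resampler used by the authors is not specified)::

    import scipy.io.wavfile
    import scipy.signal

    rate, audio = scipy.io.wavfile.read('original_44k.wav')  # hypothetical file
    assert rate == 44100
    # 16000 / 44100 reduces to 160 / 441, so resample by that rational factor
    resampled = scipy.signal.resample_poly(audio, up=160, down=441)
    scipy.io.wavfile.write('downsampled_16k.wav', 16000, resampled.astype(audio.dtype))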
There are four main types of spoofing attacks against ASV systems:
impersonation, replay, speech synthesis and voice conversion. As impersonation
is known not to be a serious threat to ASV systems, we did not include it in
our database. For the remaining three spoofing types, we designed ten
different scenarios (see the table below). We gave special attention to
physical access attacks. These attacks are more realistic than logical access
attacks, considering that the attacker often has no direct access to the
system, while the acquisition devices (sensors) are open to anyone and
therefore more exposed to such attacks.
+------------------------+-------------+-------------+----------+-----------+
|                        | Num. of trials per speaker| Total num. of trials |
| Attacks                +-------------+-------------+----------+-----------+
|                        | Male        | Female      | Male     | Female    |
+========================+=============+=============+==========+===========+
| Replay-phone1          | 50          | 50          | 1550     | 650       |
+------------------------+-------------+-------------+----------+-----------+
| Replay-phone2          | 50          | 50          | 1550     | 650       |
+------------------------+-------------+-------------+----------+-----------+
| Replay-laptop          | 50          | 50          | 1550     | 650       |
+------------------------+-------------+-------------+----------+-----------+
| Replay-laptop-HQ       | 50          | 50          | 1550     | 650       |
+------------------------+-------------+-------------+----------+-----------+
| Speech-Synthesis-LA    | 35          | 35          | 1085     | 455       |
+------------------------+-------------+-------------+----------+-----------+
| Speech-Synthesis-PA    | 35          | 35          | 1085     | 455       |
+------------------------+-------------+-------------+----------+-----------+
| Speech-Synthesis-PA-HQ | 35          | 35          | 1085     | 455       |
+------------------------+-------------+-------------+----------+-----------+
| Voice-Conversion-LA    | 1500        | 600         | 46500    | 7800      |
+------------------------+-------------+-------------+----------+-----------+
| Voice-Conversion-PA    | 1500        | 600         | 46500    | 7800      |
+------------------------+-------------+-------------+----------+-----------+
| Voice-Conversion-PA-HQ | 1500        | 600         | 46500    | 7800      |
+------------------------+-------------+-------------+----------+-----------+
Replay Attacks
--------------
A replay attack consists of replaying pre-recorded speech to an ASV system.
We assume that the ASV system has a good-quality microphone and that the
replay attack targets this sensor:
* **Replay-phone1**: Replay attack using the data captured by the Samsung mobile. The speech recorded by this mobile is replayed using its own speakers and re-recorded by the microphone of the ASV system.
* **Replay-phone2**: Replay attack using the data captured by the iPhone mobile. The speech recorded by this mobile is replayed using its own speakers and re-recorded by the microphone of the ASV system.
* **Replay-laptop**: Replay attack using the data captured by the microphone of the ASV system. The speech recorded by this microphone is replayed using the laptop speakers and re-recorded again by the microphone of the system.
* **Replay-laptop-HQ**: Replay attack using the data captured by the microphone of the ASV system. The speech recorded by this microphone is replayed using external high-quality loudspeakers and re-recorded using the microphone of the ASV system.
Speech Synthesis Attacks
------------------------
The speech synthesis attacks were based on statistical parametric speech
synthesis (SPSS). More specifically, a hidden Markov model (HMM)-based speech
synthesis technique was used.
* **Speech-Synthesis-LA**: Speech synthesis via logical access. The synthesized speech is directly presented to the ASV system without being re-recorded.
* **Speech-Synthesis-PA**: Speech synthesis via physical access. The synthesized speech is replayed using the laptop speakers and re-recorded by the microphone of the ASV system.
* **Speech-Synthesis-PA-HQ**: Speech synthesis via high-quality physical access. The synthesized speech is replayed using external high-quality loudspeakers and re-recorded by the microphone of the ASV system.
Voice Conversion Attacks
------------------------
The voice conversion attacks were created using Festvox. A conversion function
for each source-target speaker pair is estimated from GMM models/parameters
learned on the training data of the source and target speakers. We did not
consider cross-gender voice conversion attacks; that is, only male-to-male and
female-to-female conversions were taken into account. As in the case of speech
synthesis, three scenarios are involved:
* **Voice-Conversion-LA**: Voice conversion via logical access. The converted speech is directly presented to the system without being re-recorded.
* **Voice-Conversion-PA**: Voice conversion via physical access. The converted speech is replayed using the speakers of the laptop and re-recorded by the microphone of the ASV system.
* **Voice-Conversion-PA-HQ**: Voice conversion via high-quality physical access. The converted speech is replayed using external high-quality loudspeakers and re-recorded by the microphone of the ASV system.
Specificities of the BEAT View
==============================
Spoofing and genuine samples are each labelled with a text field that defines
the class of the sample: ``"attack"`` or ``"real"``, for simple binary
anti-spoofing classification systems. Code using this database's views may use
the ``class`` field to differentiate samples.
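For instance, a minimal sketch of turning the ``class`` field into binary
labels for such a classifier (``samples`` is a hypothetical list of class
labels as they would arrive from the view's ``class`` output)::

    samples = ['real', 'attack', 'real', 'attack', 'attack']
    labels = [1 if cls == 'real' else 0 for cls in samples]  # -> [1, 0, 1, 0, 0]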
The view supports the following protocols, which are also available in the
database: ``smalltest`` (for proof-of-concept experiments only, as only a
subset of three clients is provided for each set), ``grandtest`` (the data of
the whole database is provided), ``physical_access`` (only replay/presentation
attacks are provided) and ``logical_access`` (only logical access attacks,
with no replay attacks, are provided).
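As a sketch, the protocols above can be queried directly through the
underlying ``bob.db.avspoof`` package (the group name ``'train'`` is
illustrative, and the package with its SQLite file must be installed
locally)::

    import bob.db.avspoof

    db = bob.db.avspoof.Database()
    real = db.objects(protocol='physical_access', groups='train', cls='real')
    attacks = db.objects(protocol='physical_access', groups='train', cls='attack')
    print(len(real), len(attacks))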