Commit 0a4b78ba authored by Tiago de Freitas Pereira
Browse files

Pushed some plots

parent ef8ff15a
Pipeline #51722 failed with stages
in 0 seconds
# Declare this package as a pkgutil-style namespace package: `extend_path`
# extends `__path__` with same-named package directories found on `sys.path`,
# so several distributions can contribute sub-packages under this name.
# see https://docs.python.org/3/library/pkgutil.html
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)
# Declare this package as a pkgutil-style namespace package: `extend_path`
# extends `__path__` with same-named package directories found on `sys.path`,
# so several distributions can contribute sub-packages under this name.
# see https://docs.python.org/3/library/pkgutil.html
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)
# Re-export the score helpers so they can be imported from the package itself.
from .scores import compute_fmr_thresholds, split_scores_by_variable
"""
Plotting mechanisms for demographic score analysis
"""
from . import split_scores_by_variable, compute_fmr_thresholds
import matplotlib.pyplot as plt
def plot_demographic_boxplot(
    negatives_dev,
    positives_dev,
    variable_suffix,
    negatives_eval=None,
    positives_eval=None,
    fmr_thresholds=None,
    label_lookup_table=None,
    percentile=None,
    title="",
):
    """
    Plot the box-plots of the score distributions, one row of boxes per cohort.

    Negative scores are drawn in red and positive scores in blue.  When both
    `negatives_eval` and `positives_eval` are given, the figure has two stacked
    axes: the development set on top and the evaluation set below.

    Parameters
    ----------

      negatives_dev: dataframe
         Pandas Dataframe containing the negative scores (or impostor scores, or even non-mated scores)
         of the development set

      positives_dev: dataframe
         Pandas Dataframe containing the positive scores (or genuines scores, or even mated scores)
         of the development set

      variable_suffix: str
         The suffix of a variable that will be appended to `bio_ref_[variable_suffix]` for biometric references
         and `probe_[variable_suffix]` that will be appended to probes.

      negatives_eval: dataframe
         Optional Pandas Dataframe containing the negative scores of the evaluation set

      positives_eval: dataframe
         Optional Pandas Dataframe containing the positive scores of the evaluation set

      fmr_thresholds: list
         List containing the FMR operational points.  If given, the matching
         decision thresholds are computed on `negatives_dev` and drawn as
         dashed vertical lines on the development axes only.

      label_lookup_table: dict
         Lookup table mapping the cohort keys returned by
         `split_scores_by_variable` to the labels shown on the plot

      percentile: float
         Optional value between 0 and 1.  If given, only the tails of the
         distributions are plotted: negatives strictly above their
         `1 - percentile` quantile and positives strictly below their
         `percentile` quantile.

      title: str
         Title of the figure

    Returns
    -------

      fig:
         The matplotlib figure containing the plots

    Raises
    ------

      ValueError
         If `percentile` is provided and lies outside of the [0, 1] interval
    """

    # The original check used `and`, which can never be true for a number,
    # so out-of-range percentiles were silently accepted.
    if percentile is not None and (percentile < 0 or percentile > 1):
        raise ValueError(
            f"Percentile needs to be between 0 and 1, {percentile} is provided"
        )

    # Computing decision thresholds if we have any FMR
    taus = (
        compute_fmr_thresholds(negatives_dev, fmr_thresholds)
        if fmr_thresholds is not None
        else None
    )

    # Splitting the scores by cohort
    negatives_dev_as_dict, positives_dev_as_dict = split_scores_by_variable(
        negatives_dev, positives_dev, variable_suffix
    )

    # Splitting the scores by cohort IF we have an evaluation set
    has_eval = negatives_eval is not None and positives_eval is not None
    if has_eval:
        negatives_eval_as_dict, positives_eval_as_dict = split_scores_by_variable(
            negatives_eval, positives_eval, variable_suffix
        )

    # Matching the variable values to
    # the actual labels for readability.
    # The labels come from the dev cohorts and are reused for the eval plot.
    labels = list(negatives_dev_as_dict.keys())
    if label_lookup_table is not None:
        labels = [label_lookup_table[key] for key in labels]

    def _collect_scores(negatives_as_dict, positives_as_dict):
        """
        Getting the scores as numpy arrays,
        so we can plot using matplotlib
        """
        scores = dict()
        for cohort, negatives in negatives_as_dict.items():
            negative_scores = negatives["score"]
            # A cohort may have no mated comparisons at all
            positive_scores = (
                positives_as_dict[cohort]["score"]
                if cohort in positives_as_dict
                else None
            )

            # Filtering out by percentile (keeps only the overlapping tails)
            if percentile is not None:
                negative_scores = negative_scores[
                    negative_scores > negative_scores.quantile(1 - percentile)
                ]
                if positive_scores is not None:
                    positive_scores = positive_scores[
                        positive_scores < positive_scores.quantile(percentile)
                    ]

            scores[cohort] = [
                negative_scores.to_numpy(),
                positive_scores.to_numpy() if positive_scores is not None else [],
            ]
        return scores

    def _draw_boxplots(scores, axes):
        """Draw the negatives (red) and then the positives (blue) of every cohort."""
        for column, color in ((0, "tab:red"), (1, "tab:blue")):
            boxes = axes.boxplot(
                [pair[column] for pair in scores.values()],
                patch_artist=True,
                labels=labels,
                showfliers=False,
                vert=False,
            )
            for patch in boxes["boxes"]:
                patch.set_facecolor(color)

    # Creating exactly the axes we need, so no unused full-size axes is left
    # underneath the two stacked plots.
    n_rows = 2 if has_eval else 1
    fig, axes_grid = plt.subplots(n_rows, 1, figsize=(16, 8), squeeze=False)
    title = title if percentile is None else title + f" percentile = {percentile}"
    fig.suptitle(title)

    # Plotting the development set, with the FMR thresholds on top of it
    dev_axes = axes_grid[0][0]
    _draw_boxplots(
        _collect_scores(negatives_dev_as_dict, positives_dev_as_dict), dev_axes
    )
    if taus is not None:
        colors = plt.get_cmap("tab20").colors
        for tau, color, fmr in zip(taus, colors, fmr_thresholds):
            dev_axes.axvline(
                tau, linestyle="--", label=f"FMR {fmr} in the dev set", color=color
            )

    # Plotting the evaluation set (no FMR lines are drawn there)
    if has_eval:
        _draw_boxplots(
            _collect_scores(negatives_eval_as_dict, positives_eval_as_dict),
            axes_grid[1][0],
        )

    fig.legend()

    return fig
"""
Manipulations with scores
"""
import bob.measure
def compute_fmr_thresholds(negatives, fmrs=(0.1, 0.01, 0.001)):
    """
    Compute FMR thresholds given the impostor scores using the function :any:`bob.measure.far_threshold`

    Parameters
    ----------

      negatives: dataframe
         Pandas Dataframe containing the negative scores (or impostor scores, or even non-mated scores).
         Only its `score` column is used.

      fmrs: iterable
         The FMR operational points (defaults to 0.1, 0.01 and 0.001)

    Returns
    -------

      taus: list
         List containing one decision threshold per entry of `fmrs`, in the same order

    """
    # The default is an immutable tuple so it can never be shared and mutated
    # across calls.
    negatives_as_np = negatives["score"].to_numpy().astype("float64")

    # Positives are irrelevant for an FMR threshold, hence the empty list
    taus = [
        bob.measure.far_threshold(negatives_as_np, [], far_value=fmr) for fmr in fmrs
    ]

    return taus
def split_scores_by_variable(negatives, positives, variable_suffix):
    """
    Split positives and negatives dataframe scores by a variable.

    For both positives and negatives scores, it will return all possible combinations
    of comparisons between biometric references and probes permutating using all possible
    values `variable` has.

    For example, if your variable is `gender` and in your dataset, this support only two
    possible values for this variable (`M` or `F`) this function will return for negatives
    a dictionary containing the following keys (`[bio_ref value]__[probe value]`)::

        negatives_as_dict = {'M__M': ..., 'M__F': ..., 'F__M': ..., 'F__F': ...}

    This corresponds to all possible comparisons possible with negative (non-mated or impostor scores)

    For the positives, it will return a dictionary containing the following keys::

        positives_as_dict = {'M__M': ..., 'F__F': ...}


    Parameters
    ----------

      negatives: dataframe
         Pandas Dataframe containing the negative scores (or impostor scores, or even non-mated scores)

      positives: dataframe
         Pandas Dataframe containing the positive scores (or genuines scores, or even mated scores)

      variable_suffix: str
         The suffix of a variable that will be appended to `bio_ref_[variable_suffix]` for biometric references
         and `probe_[variable_suffix]` that will be appended to probes.


    Returns
    -------

      negatives_as_dict: dict
         Dict mapping each cohort key to the pandas Dataframe of its negative scores

      positives_as_dict: dict
         Dict mapping each cohort key to the pandas Dataframe of its positive scores

    """
    bio_ref_variable = f"bio_ref_{variable_suffix}"
    probe_variable = f"probe_{variable_suffix}"

    # Getting all possible values (using the negatives as a reference)
    bio_ref_cohorts = negatives[bio_ref_variable].unique()
    probe_cohorts = negatives[probe_variable].unique()

    def _select_cohort(df, bio_ref_value, probe_value):
        """Return the rows of `df` comparing `bio_ref_value` against `probe_value`."""
        return df.loc[
            (df[probe_variable] == probe_value)
            & (df[bio_ref_variable] == bio_ref_value)
        ]

    negatives_as_dict = dict()
    positives_as_dict = dict()

    for b in bio_ref_cohorts:
        for p in probe_cohorts:
            key = f"{b}__{p}"
            negatives_as_dict[key] = _select_cohort(negatives, b, p)

            # In Mated scores, probes and biometric references have
            # the same variable_suffix. IF NOT, SOMETHING IS WRONG
            # WITH YOUR DATA
            if b == p:
                positives_as_dict[key] = _select_cohort(positives, b, p)

    return negatives_as_dict, positives_as_dict
# Declare this package as a pkgutil-style namespace package: `extend_path`
# extends `__path__` with same-named package directories found on `sys.path`,
# so several distributions can contribute sub-packages under this name.
# see https://docs.python.org/3/library/pkgutil.html
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)
This diff is collapsed.
"""
Base tests
"""
import pandas as pd
from bob.bio.base.score.load import get_dataframe
from bob.bio.demographics import compute_fmr_thresholds, split_scores_by_variable
import pkg_resources
import numpy as np
def test_fmr_thresholds():
    """Check the FMR decision thresholds computed from the bundled score file."""
    scores_path = pkg_resources.resource_filename(
        "bob.bio.demographics.test", "data/test_scores.csv"
    )
    # Only the negatives are needed to compute FMR thresholds
    negatives, _ = get_dataframe(scores_path)

    expected = [-0.8789046186862524, -0.7802872029246775, -0.6941374890187405]
    computed = compute_fmr_thresholds(negatives, fmrs=[0.1, 0.01, 0.001])

    assert np.allclose(computed, expected, rtol=1e-05, atol=1e-05)
def test_split_scores_by_variable():
    """Check the cohort keys and the number of scores per cohort after splitting."""
    scores_path = pkg_resources.resource_filename(
        "bob.bio.demographics.test", "data/test_scores.csv"
    )
    negatives, positives = get_dataframe(scores_path)

    # `rac` takes the values `W` or `B` in this dataset
    negatives_by_cohort, positives_by_cohort = split_scores_by_variable(
        negatives, positives, "rac"
    )

    # Negatives cover every bio_ref/probe combination: `W__W`, `W__B`, `B__W`, `B__B`
    assert len(negatives_by_cohort) == 4

    # Positives only exist for matching cohorts: `W__W` and `B__B`
    assert len(positives_by_cohort) == 2

    expected_negatives = {"W__W": 7236, "W__B": 9845, "B__B": 9845, "B__W": 7504}
    for cohort, n_scores in expected_negatives.items():
        assert len(negatives_by_cohort[cohort]) == n_scores

    expected_positives = {"B__B": 179, "W__W": 134}
    for cohort, n_scores in expected_positives.items():
        assert len(positives_by_cohort[cohort]) == n_scores
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment