From e19c229e85882d5fc1e938c25ddc36c1949117e2 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Wed, 15 Jul 2020 15:27:19 +0200
Subject: [PATCH] [engine.significance] Close figures to avoid memory leaks in
 mpl

---
 bob/ip/binseg/engine/significance.py | 202 +++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)

diff --git a/bob/ip/binseg/engine/significance.py b/bob/ip/binseg/engine/significance.py
index 1bc7b960..f162d7e8 100644
--- a/bob/ip/binseg/engine/significance.py
+++ b/bob/ip/binseg/engine/significance.py
@@ -10,6 +10,7 @@ from tqdm import tqdm
 import numpy
 import pandas
 import torch.nn
+import scipy.stats
 
 from .evaluator import _sample_measures_for_threshold
 
@@ -284,6 +285,7 @@ def _visual_dataset_performance(stem, img, n, avg, std, outdir):
     fname = os.path.join(outdir, stem + ".pdf")
     os.makedirs(os.path.dirname(fname), exist_ok=True)
     fig.savefig(fname)
+    plt.close(fig)
 
 
 def _patch_performances_for_sample(
@@ -688,3 +690,203 @@ def visual_performances(
                 data.append(df)
 
     return dict(data)
+
+
+def index_of_outliers(c):
+    """Flags outliers (beyond 1.5*IQR from the quartiles) on a dataframe column
+
+    The IQR (interquartile range) is the spread between the first and third
+    quartiles, where the central 50% of the data sits.  Values lying more than
+    1.5 IQR below the first quartile, or above the third one, are flagged as
+    outliers.  Check out `this Wikipedia page
+    <https://en.wikipedia.org/wiki/Interquartile_range>`_ for more details.
+
+
+    Parameters
+    ----------
+
+    c : pandas.Series
+        A **single** column of a pandas dataframe (any object implementing
+        the ``quantile`` method and element-wise comparisons should work)
+
+
+    Returns
+    -------
+
+    mask : pandas.Series
+        A boolean mask, aligned with ``c``, that is ``True`` at the positions
+        holding outliers.  Use ``c[mask].index`` to get their indexes.
+
+    """
+
+    q1, q3 = c.quantile(0.25), c.quantile(0.75)
+    margin = 1.5 * (q3 - q1)  # tolerated distance beyond each quartile
+    return (c < q1 - margin) | (c > q3 + margin)
+
+
+def write_analysis_text(names, da, db, f):
+    """Writes a plain-text report with summary statistics and tests
+
+    Summarizes the patch performances in ``da`` and ``db`` and their paired
+    differences (``da - db``), then runs significance tests (paired and
+    independent T-tests, two- and one-sided Wilcoxon signed-rank tests) to
+    check if the observed differences between both systems are significant.
+
+
+    Parameters
+    ----------
+
+    names : tuple
+        Two strings: the names of the first and second systems being
+        compared, in this order
+
+    da : numpy.ndarray
+        1D array with the performance figures, per patch analyzed, of the
+        first system (first entry of ``names``).  Patches must follow the
+        same (raster) order used in ``db``, as samples are paired.
+
+    db : numpy.ndarray
+        1D array with the performance figures, per patch analyzed, of the
+        second system (second entry of ``names``).  Patches must follow the
+        same (raster) order used in ``da``, as samples are paired.
+
+    f : file
+        An open, writable text file to which the analysis is dumped
+
+    """
+
+    def _summary(label, x):
+        # "* <label>: N / median / mean / sample std. dev. (ddof=1)"
+        return (
+            f"* {label}: {len(x)}"
+            f" / {numpy.median(x):.3f}"
+            f" / {numpy.mean(x):.3f}"
+            f" / {numpy.std(x, ddof=1):.3f}"
+        )
+
+    diff = da - db
+    f.write("#Samples/Median/Avg/Std.Dev./Normality Conf. F1-scores:\n")
+    f.write(_summary(names[0], da) + "\n")
+    f.write(_summary(names[1], db) + "\n")
+    # only the paired differences get a normality check appended
+    f.write(
+        _summary(f"{names[0]}-{names[1]}", diff)
+        + f" / gaussian? p={scipy.stats.normaltest(diff)[1]:.3f}\n"
+    )
+
+    # two-sided tests: is there any difference between both systems?
+    stat, pvalue = scipy.stats.ttest_rel(da, db)
+    f.write(
+        "Paired T-test (is the difference zero?): "
+        f"S = {stat:g}, p = {pvalue:.5f}\n"
+    )
+
+    # equal_var=False: variances of both systems are not assumed equal
+    stat, pvalue = scipy.stats.ttest_ind(da, db, equal_var=False)
+    f.write(
+        "Ind. T-test (is the difference zero?): "
+        f"S = {stat:g}, p = {pvalue:.5f}\n"
+    )
+
+    stat, pvalue = scipy.stats.wilcoxon(diff)
+    f.write(
+        "Wilcoxon test (is the difference zero?): "
+        f"W = {stat:g}, p = {pvalue:.5f}\n"
+    )
+
+    # one-sided tests: each label states what a low p-value speaks against
+    for alternative, sign in (("greater", "<"), ("less", ">")):
+        stat, pvalue = scipy.stats.wilcoxon(diff, alternative=alternative)
+        f.write(
+            f"Wilcoxon test (md({names[0]}) {sign} md({names[1]})?): "
+            f"W = {stat:g}, p = {pvalue:.5f}\n"
+        )
+
+
+def write_analysis_figures(names, da, db, fname):
+    """Writes a multi-page PDF with the most important plots for analysis
+
+    Pages: histograms of ``da`` and ``db``, box plots of both and of their
+    differences (``da - db``), a histogram of the differences and, finally,
+    a scatter plot of ``da`` against ``db`` with their Pearson correlation.
+
+
+    Parameters
+    ----------
+
+    names : tuple
+        Two strings: the names of the first and second systems analyzed
+
+    da : numpy.ndarray
+        1D array with the performance figures, per patch analyzed, of the
+        first system.  Must be in the same (raster) order as ``db``.
+
+    db : numpy.ndarray
+        1D array with the performance figures, per patch analyzed, of the
+        second system.  Must be in the same (raster) order as ``da``.
+
+    fname : str
+        The filename to use for storing the summarized performance figures
+
+    """
+
+    from matplotlib.backends.backend_pdf import PdfPages
+    import matplotlib.pyplot as plt
+
+    def _stats(x):
+        # raw f-strings keep the mathtext backslashes (\mu, \sigma) verbatim
+        return (
+            rf"(N={len(x)}; M={numpy.median(x):.3f}; "
+            rf"$\mu$={numpy.mean(x):.3f}; "
+            rf"$\sigma$={numpy.std(x, ddof=1):.3f})"
+        )
+
+    diff = da - db
+    bins = 50
+
+    with PdfPages(fname) as pdf:
+
+        # every figure is closed once saved so matplotlib can free its memory
+        fig = plt.figure()
+        plt.grid()
+        plt.hist(da, bins=bins)
+        plt.title(f"{names[0]} - scores {_stats(da)}")
+        pdf.savefig()
+        plt.close(fig)
+
+        fig = plt.figure()
+        plt.grid()
+        plt.hist(db, bins=bins)
+        plt.title(f"{names[1]} - scores {_stats(db)}")
+        pdf.savefig()
+        plt.close(fig)
+
+        fig = plt.figure()
+        plt.boxplot([da, db])
+        plt.title(f"{names[0]} and {names[1]} (N={len(da)})")
+        pdf.savefig()
+        plt.close(fig)
+
+        fig = plt.figure()
+        plt.boxplot(diff)
+        plt.title(f"Differences ({names[0]} - {names[1]}) (N={len(da)})")
+        pdf.savefig()
+        plt.close(fig)
+
+        fig = plt.figure()
+        plt.grid()
+        plt.hist(diff, bins=bins)
+        plt.title(f"Systems ({names[0]} - {names[1]}) {_stats(diff)}")
+        pdf.savefig()
+        plt.close(fig)
+
+        # p[0] is Pearson's correlation coefficient (p[1], the p-value, is unused)
+        p = scipy.stats.pearsonr(da, db)
+        fig = plt.figure()
+        plt.grid()
+        plt.scatter(da, db, marker=".", color="black")
+        plt.xlabel(f"{names[0]}")
+        plt.ylabel(f"{names[1]}")
+        plt.title(f"Scatter (p={p[0]:.3f})")
+        pdf.savefig()
+        plt.close(fig)
-- 
GitLab