From cc0b6c3bf52273081f6bf3c982812a0685ae8c3a Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Fri, 18 Aug 2023 16:10:26 +0200
Subject: [PATCH] [scripts.train_analysis] Simplify and remove pandas
 requirements

---
 conda/meta.yaml                        |   2 -
 doc/conf.py                            |   1 -
 pyproject.toml                         |   1 -
 src/ptbench/scripts/train_analysis.py | 121 +++++++++++++++-----------
 src/ptbench/utils/tensorboard.py      |  54 ++++++------
 5 files changed, 98 insertions(+), 81 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index 58d7ab74..fd299908 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -26,7 +26,6 @@ requirements:
     - click {{ click }}
     - matplotlib {{ matplotlib }}
     - numpy {{ numpy }}
-    - pandas {{ pandas }}
    - pillow {{ pillow }}
     - psutil {{ psutil }}
     - pytorch {{ pytorch }}
@@ -43,7 +42,6 @@ requirements:
     - {{ pin_compatible('click') }}
     - {{ pin_compatible('matplotlib') }}
     - {{ pin_compatible('numpy') }}
-    - {{ pin_compatible('pandas') }}
     - {{ pin_compatible('pillow') }}
     - {{ pin_compatible('psutil') }}
     - {{ pin_compatible('pytorch') }}
diff --git a/doc/conf.py b/doc/conf.py
index b69ef1b9..e9d570d2 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -117,7 +117,6 @@ autodoc_default_options = {
 auto_intersphinx_packages = [
     "matplotlib",
     "numpy",
-    "pandas",
     "pillow",
     "psutil",
     "scipy",
diff --git a/pyproject.toml b/pyproject.toml
index 30741ef8..16082e2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,6 @@ dependencies = [
     "clapper",
     "click",
     "numpy",
-    "pandas",
     "scipy",
     "scikit-learn",
     "tqdm",
diff --git a/src/ptbench/scripts/train_analysis.py b/src/ptbench/scripts/train_analysis.py
index 7fd99a75..f108f00d 100644
--- a/src/ptbench/scripts/train_analysis.py
+++ b/src/ptbench/scripts/train_analysis.py
@@ -5,18 +5,35 @@
 import pathlib

 import click
-import matplotlib.figure
-import matplotlib.pyplot as plt
-import pandas

 from clapper.click import verbosity_option
 from clapper.logging import setup
-from matplotlib.ticker import MaxNLocator
+
+# avoids X11/graphical desktop requirement when creating plots
+__import__("matplotlib").use("agg")

 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")


-def create_figures(df: pandas.DataFrame) -> list[matplotlib.figure.Figure]:
+def create_figures(
+    data: dict[str, tuple[list[int], list[float]]],
+    groups: list[str] = [
+        "total-execution-time-seconds",
+        "loss/*",
+        "learning-rate",
+        "memory-used-GB/cpu/*",
+        "rss-GB/cpu/*",
+        "vms-GB/cpu/*",
+        "num-open-files/cpu/*",
+        "num-processes/cpu/*",
+        "percent-usage/cpu/*",
+        # nvidia gpu
+        "memory-percent/gpu/*",
+        "memory-used-GB/gpu/*",
+        "memory-free-GB/gpu/*",
+        "percent-usage/gpu/*",
+    ],
+) -> list:
-    """Generates figures for each metric in the dataframe.
-
-    Each row of the dataframe correspond to an epoch and each column to a
-    metric.
+    """Generates figures for each metric in the input data.
+
+    Each key of the input data names a metric; its value holds the epochs at
+    which the metric was recorded, and the recorded values themselves.
@@ -27,8 +44,15 @@
     Parameters
     ----------
-    df:
-        Pandas dataframe containing the data to plot.
+    data:
+        A dictionary where keys represent all scalar names, and values
+        correspond to a tuple that contains an array with epoch numbers (when
+        values were taken) and the monitored values themselves. These lists
+        are pre-sorted by epoch number.
+    groups:
+        A list of scalar globs to match against the existing tensorboard
+        data for plotting. Values with multiple matches are drawn on the
+        same plot. Values that do not exist are ignored.

     Returns
@@ -37,49 +61,46 @@
     figures:
         List of matplotlib figures, one per metric.
     """
+    import fnmatch
+    import typing
+
+    import matplotlib.pyplot as plt
+
+    from matplotlib.ticker import MaxNLocator

     figures = []

-    labels = sorted(df.columns)
-
-    from collections import defaultdict
-
-    # Dict of metric: subset. Subset can be None.
-    metrics_groups = defaultdict(list)
-
-    for label in labels:
-        # Separate the name of the subset from the metric
-        split_label = label.rsplit("/", 1)
-        metric = split_label[0]
-        subset = split_label[1] if len(split_label) > 1 else None
-        metrics_groups[metric].append(subset)
-
-    for metric, subsets in metrics_groups.items():
-        figure = plt.figure()
-        axes = figure.gca()
-
-        for subset in subsets:
-            if subset is None:
-                axes.plot(
-                    df["step"].values,
-                    df[metric],
-                    label=metric,
-                )
-            else:
-                axes.plot(
-                    df["step"].values,
-                    df[metric + "/" + subset],
-                    label=subset,
-                )
-
-        axes.xaxis.set_major_locator(MaxNLocator(integer=True))
-        axes.set_title(metric)
-        axes.set_xlabel("Epoch")
-
-        axes.legend(loc="best")
-        axes.grid(alpha=0.3)
-        figure.set_layout_engine("tight")
-
-        figures.append(figure)
+    for group in groups:
+        curves = {k: data[k] for k in fnmatch.filter(data.keys(), group)}
+
+        if len(curves) == 0:
+            continue
+
+        fig, ax = plt.subplots(1, 1)
+        ax = typing.cast(plt.Axes, ax)
+        fig = typing.cast(plt.Figure, fig)
+
+        if len(curves) == 1:
+            # there is only one curve, just plot it
+            title, (epochs, values) = next(iter(curves.items()))
+            ax.plot(epochs, values)
+
+        else:
+            # this is an aggregate plot, name each curve by its glob suffix
+            labels = {k: k[len(group) - 1 :] for k in curves.keys()}
+            title = group.rstrip("*").rstrip("/")
+            for key, (epochs, values) in curves.items():
+                ax.plot(epochs, values, label=labels[key])
+            ax.legend(loc="best")
+
+        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
+        ax.set_title(title)
+        ax.set_xlabel("Epoch")
+        ax.set_ylabel(title)
+
+        ax.grid(alpha=0.3)
+        fig.tight_layout()
+        figures.append(fig)

     return figures

@@ -116,11 +137,13 @@ def train_analysis(
     """Creates a plot for each metric in the training logs and saves them in a
     pdf file."""

+    import matplotlib.pyplot as plt
+
     from matplotlib.backends.backend_pdf import PdfPages

-    from ..utils.tensorboard import get_scalars
+    from ..utils.tensorboard import scalars_to_dict

-    data = get_scalars(logdir)
+    data = scalars_to_dict(logdir)

     output.parent.mkdir(parents=True, exist_ok=True)
diff --git a/src/ptbench/utils/tensorboard.py b/src/ptbench/utils/tensorboard.py
index 7e2feaa4..e41b2c07 100644
--- a/src/ptbench/utils/tensorboard.py
+++ b/src/ptbench/utils/tensorboard.py
@@ -3,52 +3,50 @@
 # SPDX-License-Identifier: GPL-3.0-or-later

 import pathlib
-import typing
-
-import pandas

 from tensorboard.backend.event_processing.event_accumulator import (
     EventAccumulator,
 )


-def get_scalars(logdir: pathlib.Path) -> pandas.DataFrame:
+def scalars_to_dict(
+    logdir: pathlib.Path,
+) -> dict[str, tuple[list[int], list[float]]]:
     """Returns scalars stored in tensorboard event files.

+    This function gathers all tensorboard event files produced by a training
+    run, and returns a dictionary with all collected scalars, ready for
+    plotting.
+
+
     Parameters
     ----------
-
-    logdir:
+    logdir
         Directory containing the event files.

+
     Returns
     -------
-
-    data:
-        Pandas dataframe containing the results. Rows correspond to an epoch,
-        columns to the metrics.
+        A dictionary where keys represent all scalar names, and values
+        correspond to a tuple that contains an array with epoch numbers (when
+        values were taken) and the monitored values themselves. The lists
+        are pre-sorted by epoch number.
     """

-    tensorboard_logs = sorted(logdir.glob("events.out.tfevents.*"))
+    retval: dict[str, tuple[list[int], list[float]]] = {}

-    data: dict[str, dict[str, typing.Any]] = {}
-    headers = {"step"}
-
-    for logfile in tensorboard_logs:
+    for logfile in sorted(logdir.glob("events.out.tfevents.*")):
         event_accumulator = EventAccumulator(str(logfile))
         event_accumulator.Reload()

-        tags = event_accumulator.Tags()
-        # Can cause issues if different logfiles don't have the same tags
-
-        for scalar_tag in tags["scalars"]:
-            headers.add(scalar_tag)
-            tag_list = event_accumulator.Scalars(scalar_tag)
-            for tag_data in tag_list:
-                _ = tag_data.wall_time
-                step = tag_data.step
-                value = tag_data.value
+        for tag in event_accumulator.Tags()["scalars"]:
+            steps, values = retval.setdefault(tag, ([], []))
+            for data_point in event_accumulator.Scalars(tag):
+                steps.append(data_point.step)
+                values.append(data_point.value)

-                data.setdefault(step, {"step": step})["step"] = step
-                data.setdefault(step, {scalar_tag: value})[scalar_tag] = value
+    # reorder according to step number
+    for key, (steps, values) in retval.items():
+        _steps, _values = zip(*sorted(zip(steps, values)))
+        retval[key] = (list(_steps), list(_values))  # type: ignore

-    return pandas.DataFrame.from_dict(data, orient="index")
+    return retval
--
GitLab
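
For reference, a minimal usage sketch of the API reworked by this patch. It
is illustrative only and not part of the patch itself: the "logs/" directory
and the "trainlog.pdf" output name are hypothetical, and the imports assume
the ptbench package is installed with the layout shown in the tree above.

    import pathlib

    from matplotlib.backends.backend_pdf import PdfPages

    from ptbench.scripts.train_analysis import create_figures
    from ptbench.utils.tensorboard import scalars_to_dict

    # gather every scalar logged during training: {tag: (epochs, values)}
    data = scalars_to_dict(pathlib.Path("logs"))

    # one figure per matching glob group, saved as successive PDF pages
    with PdfPages("trainlog.pdf") as pdf:
        for figure in create_figures(data):
            pdf.savefig(figure)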