Skip to content
Snippets Groups Projects
Commit cc0b6c3b authored by André Anjos's avatar André Anjos :speech_balloon:
Browse files

[scripts.train_analysis] Simplify and remove pandas requirements

parent c15ea640
No related branches found
No related tags found
1 merge request!6Making use of LightningDataModule and simplification of data loading
......@@ -26,7 +26,6 @@ requirements:
- click {{ click }}
- matplotlib {{ matplotlib }}
- numpy {{ numpy }}
- pandas {{ pandas }}
- pillow {{ pillow }}
- psutil {{ psutil }}
- pytorch {{ pytorch }}
......@@ -43,7 +42,6 @@ requirements:
- {{ pin_compatible('click') }}
- {{ pin_compatible('matplotlib') }}
- {{ pin_compatible('numpy') }}
- {{ pin_compatible('pandas') }}
- {{ pin_compatible('pillow') }}
- {{ pin_compatible('psutil') }}
- {{ pin_compatible('pytorch') }}
......
......@@ -117,7 +117,6 @@ autodoc_default_options = {
auto_intersphinx_packages = [
"matplotlib",
"numpy",
"pandas",
"pillow",
"psutil",
"scipy",
......
......@@ -30,7 +30,6 @@ dependencies = [
"clapper",
"click",
"numpy",
"pandas",
"scipy",
"scikit-learn",
"tqdm",
......
......@@ -5,18 +5,35 @@
import pathlib
import click
import matplotlib.figure
import matplotlib.pyplot as plt
import pandas
from clapper.click import verbosity_option
from clapper.logging import setup
from matplotlib.ticker import MaxNLocator
# avoids X11/graphical desktop requirement when creating plots
__import__("matplotlib").use("agg")
logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
def create_figures(
    data: dict[str, tuple[list[int], list[float]]],
    groups: tuple[str, ...] = (
        "total-execution-time-seconds",
        "loss/*",
        "learning-rate",
        # psutil (host/cpu) metrics
        "memory-used-GB/cpu/*",
        "rss-GB/cpu/*",
        "vms-GB/cpu/*",
        "num-open-files/cpu/*",
        "num-processes/cpu/*",
        "percent-usage/cpu/*",
        # nvidia gpu metrics
        "memory-percent/gpu/*",
        "memory-used-GB/gpu/*",
        "memory-free-GB/gpu/*",
        "percent-usage/gpu/*",
    ),
) -> list:
    """Generates one figure per scalar group found in the input data.

    Parameters
    ----------
    data
        A dictionary where keys represent all scalar names, and values
        correspond to a tuple that contains a list with epoch numbers (when
        values were taken), and the monitored values themselves.  These
        lists are pre-sorted by epoch number.
    groups
        Scalar-name globs (:py:mod:`fnmatch` syntax) we are interested on,
        from the existing tensorboard data, for plotting.  Globs with
        multiple matches are drawn on the same plot.  Globs that match
        nothing are ignored.  The default is an (immutable) tuple so it
        cannot be shared and mutated across calls.

    Returns
    -------
    figures
        List of matplotlib figures, one per matched group.
    """
    import fnmatch
    import typing

    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator

    figures = []

    for group in groups:
        # every recorded scalar whose name matches this glob
        curves = {k: data[k] for k in fnmatch.filter(data.keys(), group)}

        if len(curves) == 0:  # nothing recorded for this glob
            continue

        fig, ax = plt.subplots(1, 1)
        ax = typing.cast(plt.Axes, ax)
        fig = typing.cast(plt.Figure, fig)

        if len(curves) == 1:
            # there is only one curve, just plot it
            title, (epochs, values) = next(iter(curves.items()))
            ax.plot(epochs, values)
        else:
            # this is an aggregate plot, name things consistently: drop the
            # common glob prefix from each legend label, and the "/*" tail
            # from the plot title
            labels = {k: k[len(group) - 1 :] for k in curves.keys()}
            title = group.rstrip("*").rstrip("/")
            for key, (epochs, values) in curves.items():
                ax.plot(epochs, values, label=labels[key])
            ax.legend(loc="best")

        # epochs are integral - avoid fractional ticks on the x axis
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.set_title(title)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(title)
        ax.grid(alpha=0.3)
        fig.tight_layout()

        figures.append(fig)

    return figures
......@@ -116,11 +137,13 @@ def train_analysis(
"""Creates a plot for each metric in the training logs and saves them in a
pdf file."""
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from ..utils.tensorboard import get_scalars
from ..utils.tensorboard import scalars_to_dict
data = get_scalars(logdir)
data = scalars_to_dict(logdir)
output.parent.mkdir(parents=True, exist_ok=True)
......
......@@ -3,52 +3,50 @@
# SPDX-License-Identifier: GPL-3.0-or-later
import pathlib
import typing
import pandas
from tensorboard.backend.event_processing.event_accumulator import (
EventAccumulator,
)
def scalars_to_dict(
    logdir: pathlib.Path,
) -> dict[str, tuple[list[int], list[float]]]:
    """Returns scalars stored in tensorboard event files.

    This method will gather all tensorboard event files produced by a
    training run, and will return a dictionary with all collected scalars,
    ready for plotting.

    Parameters
    ----------
    logdir
        Directory containing the event files
        (``events.out.tfevents.*``).

    Returns
    -------
    data
        A dictionary where keys represent all scalar names, and values
        correspond to a tuple that contains a list with epoch numbers (when
        values were taken), and the monitored values themselves.  Both
        lists are sorted by epoch number.
    """
    retval: dict[str, tuple[list[int], list[float]]] = {}

    # event files are visited in lexicographic order, so points from later
    # files are appended after earlier ones before the final sort below
    for logfile in sorted(logdir.glob("events.out.tfevents.*")):
        event_accumulator = EventAccumulator(str(logfile))
        event_accumulator.Reload()  # actually parses the event file

        for tag in event_accumulator.Tags()["scalars"]:
            steps, values = retval.setdefault(tag, ([], []))
            for data_point in event_accumulator.Scalars(tag):
                steps.append(data_point.step)
                values.append(data_point.value)

    # reorder according to step number; the stable sort is keyed on the
    # step only, so values recorded at the same step keep their original
    # relative order (sorting (step, value) pairs would not guarantee this)
    for key, (steps, values) in retval.items():
        if not steps:
            # defensive: a tag listed without any data points would make
            # the index-based reordering pointless (and, previously, a
            # zip(*[]) unpacking would raise ValueError)
            continue
        order = sorted(range(len(steps)), key=steps.__getitem__)
        retval[key] = (
            [steps[i] for i in order],
            [values[i] for i in order],
        )

    return retval
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment