From cc0b6c3bf52273081f6bf3c982812a0685ae8c3a Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Fri, 18 Aug 2023 16:10:26 +0200
Subject: [PATCH] [scripts.train_analysis] Simplify and remove pandas
 requirements

---
 conda/meta.yaml                        |   2 -
 doc/conf.py                            |   1 -
 pyproject.toml                         |   1 -
 src/ptbench/scripts/train_analysis.py | 121 +++++++++++++++-----------
 src/ptbench/utils/tensorboard.py      |  54 ++++++------
 5 files changed, 98 insertions(+), 81 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index 58d7ab74..fd299908 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -26,7 +26,6 @@ requirements:
     - click {{ click }}
     - matplotlib {{ matplotlib }}
     - numpy {{ numpy }}
-    - pandas {{ pandas }}
    - pillow {{ pillow }}
     - psutil {{ psutil }}
     - pytorch {{ pytorch }}
@@ -43,7 +42,6 @@ requirements:
     - {{ pin_compatible('click') }}
     - {{ pin_compatible('matplotlib') }}
     - {{ pin_compatible('numpy') }}
-    - {{ pin_compatible('pandas') }}
     - {{ pin_compatible('pillow') }}
     - {{ pin_compatible('psutil') }}
     - {{ pin_compatible('pytorch') }}
diff --git a/doc/conf.py b/doc/conf.py
index b69ef1b9..e9d570d2 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -117,7 +117,6 @@ autodoc_default_options = {
 auto_intersphinx_packages = [
     "matplotlib",
     "numpy",
-    "pandas",
     "pillow",
     "psutil",
     "scipy",
diff --git a/pyproject.toml b/pyproject.toml
index 30741ef8..16082e2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,6 @@ dependencies = [
     "clapper",
     "click",
     "numpy",
-    "pandas",
     "scipy",
     "scikit-learn",
     "tqdm",
diff --git a/src/ptbench/scripts/train_analysis.py b/src/ptbench/scripts/train_analysis.py
index 7fd99a75..f108f00d 100644
--- a/src/ptbench/scripts/train_analysis.py
+++ b/src/ptbench/scripts/train_analysis.py
@@ -5,18 +5,35 @@
 import pathlib

 import click
-import matplotlib.figure
-import matplotlib.pyplot as plt
-import pandas

 from clapper.click import verbosity_option
 from clapper.logging import setup
-from matplotlib.ticker import MaxNLocator
+
+# avoids X11/graphical desktop requirement when creating plots
+__import__("matplotlib").use("agg")

 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")


-def create_figures(df: pandas.DataFrame) -> list[matplotlib.figure.Figure]:
+def create_figures(
+    data: dict[str, tuple[list[int], list[float]]],
+    groups: list[str] = [
+        "total-execution-time-seconds",
+        "loss/*",
+        "learning-rate",
+        "memory-used-GB/cpu/*",
+        "rss-GB/cpu/*",
+        "vms-GB/cpu/*",
+        "num-open-files/cpu/*",
+        "num-processes/cpu/*",
+        "percent-usage/cpu/*",
+        # nvidia gpu
+        "memory-percent/gpu/*",
+        "memory-used-GB/gpu/*",
+        "memory-free-GB/gpu/*",
+        "percent-usage/gpu/*",
+    ],
+) -> list:
-    """Generates figures for each metric in the dataframe.
-
-    Each row of the dataframe correspond to an epoch and each column to a
-    metric.
+    """Generates figures for each metric in the input data.
+
+    Each key of the input data names a metric; its value holds the epochs at
+    which the metric was recorded, and the recorded values themselves.
@@ -27,8 +44,15 @@
     Parameters
     ----------
-    df:
-        Pandas dataframe containing the data to plot.
+    data:
+        A dictionary where keys represent all scalar names, and values
+        correspond to a tuple that contains an array with epoch numbers (when
+        values were taken) and the monitored values themselves. These lists
+        are pre-sorted by epoch number.
+    groups:
+        A list of scalar globs to match against the existing tensorboard
+        data for plotting. Values with multiple matches are drawn on the
+        same plot. Values that do not exist are ignored.

     Returns
@@ -37,49 +61,46 @@
     figures:
         List of matplotlib figures, one per metric.
     """
+    import fnmatch
+    import typing
+
+    import matplotlib.pyplot as plt
+
+    from matplotlib.ticker import MaxNLocator

     figures = []

-    labels = sorted(df.columns)
-
-    from collections import defaultdict
-
-    # Dict of metric: subset. Subset can be None.
-    metrics_groups = defaultdict(list)
-
-    for label in labels:
-        # Separate the name of the subset from the metric
-        split_label = label.rsplit("/", 1)
-        metric = split_label[0]
-        subset = split_label[1] if len(split_label) > 1 else None
-        metrics_groups[metric].append(subset)
-
-    for metric, subsets in metrics_groups.items():
-        figure = plt.figure()
-        axes = figure.gca()
-
-        for subset in subsets:
-            if subset is None:
-                axes.plot(
-                    df["step"].values,
-                    df[metric],
-                    label=metric,
-                )
-            else:
-                axes.plot(
-                    df["step"].values,
-                    df[metric + "/" + subset],
-                    label=subset,
-                )
-
-        axes.xaxis.set_major_locator(MaxNLocator(integer=True))
-        axes.set_title(metric)
-        axes.set_xlabel("Epoch")
-
-        axes.legend(loc="best")
-        axes.grid(alpha=0.3)
-        figure.set_layout_engine("tight")
-
-        figures.append(figure)
+    for group in groups:
+        curves = {k: data[k] for k in fnmatch.filter(data.keys(), group)}
+
+        if len(curves) == 0:
+            continue
+
+        fig, ax = plt.subplots(1, 1)
+        ax = typing.cast(plt.Axes, ax)
+        fig = typing.cast(plt.Figure, fig)
+
+        if len(curves) == 1:
+            # there is only one curve, just plot it
+            title, (epochs, values) = next(iter(curves.items()))
+            ax.plot(epochs, values)
+
+        else:
+            # this is an aggregate plot, name each curve by its glob suffix
+            labels = {k: k[len(group) - 1 :] for k in curves.keys()}
+            title = group.rstrip("*").rstrip("/")
+            for key, (epochs, values) in curves.items():
+                ax.plot(epochs, values, label=labels[key])
+            ax.legend(loc="best")
+
+        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
+        ax.set_title(title)
+        ax.set_xlabel("Epoch")
+        ax.set_ylabel(title)
+
+        ax.grid(alpha=0.3)
+        fig.tight_layout()
+        figures.append(fig)

     return figures

@@ -116,11 +137,13 @@ def train_analysis(
     """Creates a plot for each metric in the training logs and saves them in a
     pdf file."""

+    import matplotlib.pyplot as plt
+
     from matplotlib.backends.backend_pdf import PdfPages

-    from ..utils.tensorboard import get_scalars
+    from ..utils.tensorboard import scalars_to_dict

-    data = get_scalars(logdir)
+    data = scalars_to_dict(logdir)

     output.parent.mkdir(parents=True, exist_ok=True)
diff --git a/src/ptbench/utils/tensorboard.py b/src/ptbench/utils/tensorboard.py
index 7e2feaa4..e41b2c07 100644
--- a/src/ptbench/utils/tensorboard.py
+++ b/src/ptbench/utils/tensorboard.py
@@ -3,52 +3,50 @@
 # SPDX-License-Identifier: GPL-3.0-or-later

 import pathlib
-import typing
-
-import pandas

 from tensorboard.backend.event_processing.event_accumulator import (
     EventAccumulator,
 )


-def get_scalars(logdir: pathlib.Path) -> pandas.DataFrame:
+def scalars_to_dict(
+    logdir: pathlib.Path,
+) -> dict[str, tuple[list[int], list[float]]]:
     """Returns scalars stored in tensorboard event files.

+    This function gathers all tensorboard event files produced by a training
+    run, and returns a dictionary with all collected scalars, ready for
+    plotting.
+
+
     Parameters
     ----------
-
-    logdir:
+    logdir
         Directory containing the event files.

+
     Returns
     -------
-
-    data:
-        Pandas dataframe containing the results. Rows correspond to an epoch,
-        columns to the metrics.
+        A dictionary where keys represent all scalar names, and values
+        correspond to a tuple that contains an array with epoch numbers (when
+        values were taken) and the monitored values themselves. The lists
+        are pre-sorted by epoch number.
     """

-    tensorboard_logs = sorted(logdir.glob("events.out.tfevents.*"))
+    retval: dict[str, tuple[list[int], list[float]]] = {}

-    data: dict[str, dict[str, typing.Any]] = {}
-    headers = {"step"}
-
-    for logfile in tensorboard_logs:
+    for logfile in sorted(logdir.glob("events.out.tfevents.*")):
         event_accumulator = EventAccumulator(str(logfile))
         event_accumulator.Reload()

-        tags = event_accumulator.Tags()
-        # Can cause issues if different logfiles don't have the same tags
-
-        for scalar_tag in tags["scalars"]:
-            headers.add(scalar_tag)
-            tag_list = event_accumulator.Scalars(scalar_tag)
-            for tag_data in tag_list:
-                _ = tag_data.wall_time
-                step = tag_data.step
-                value = tag_data.value
+        for tag in event_accumulator.Tags()["scalars"]:
+            steps, values = retval.setdefault(tag, ([], []))
+            for data_point in event_accumulator.Scalars(tag):
+                steps.append(data_point.step)
+                values.append(data_point.value)

-                data.setdefault(step, {"step": step})["step"] = step
-                data.setdefault(step, {scalar_tag: value})[scalar_tag] = value
+    # reorder according to step number
+    for key, (steps, values) in retval.items():
+        _steps, _values = zip(*sorted(zip(steps, values)))
+        retval[key] = (list(_steps), list(_values))  # type: ignore

-    return pandas.DataFrame.from_dict(data, orient="index")
+    return retval
--
GitLab
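
For reference, a minimal usage sketch of the API reworked by this patch. It
is illustrative only and not part of the patch itself: the "logs/" directory
and the "trainlog.pdf" output name are hypothetical, and the imports assume
the ptbench package is installed with the layout shown in the tree above.

    import pathlib

    from matplotlib.backends.backend_pdf import PdfPages

    from ptbench.scripts.train_analysis import create_figures
    from ptbench.utils.tensorboard import scalars_to_dict

    # gather every scalar logged during training: {tag: (epochs, values)}
    data = scalars_to_dict(pathlib.Path("logs"))

    # one figure per matching glob group, saved as successive PDF pages
    with PdfPages("trainlog.pdf") as pdf:
        for figure in create_figures(data):
            pdf.savefig(figure)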