Skip to content
Snippets Groups Projects
Commit b46b71b0 authored by Daniel CARRON's avatar Daniel CARRON :b: Committed by André Anjos
Browse files

Update train_analysis, extract values from tensorboard logs

parent eff1554d
No related branches found
No related tags found
1 merge request: !6 "Making use of LightningDataModule and simplification of data loading"
...@@ -297,8 +297,22 @@ def experiment( ...@@ -297,8 +297,22 @@ def experiment(
resume_from=resume_from, resume_from=resume_from,
balance_classes=balance_classes, balance_classes=balance_classes,
) )
logger.info("Ended training") logger.info("Ended training")
logger.info("Started train analysis")
from .train_analysis import train_analysis
logdir = os.path.join(train_output_folder, "logs")
output_pdf = os.path.join(train_output_folder, "train_analysis.pdf")
ctx.invoke(
train_analysis,
logdir=logdir,
output_pdf=output_pdf,
)
logger.info("Ended train analysis")
logger.info("Started predicting") logger.info("Started predicting")
from .predict import predict from .predict import predict
......
...@@ -8,126 +8,79 @@ import os ...@@ -8,126 +8,79 @@ import os
import click import click
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas
from clapper.click import ConfigCommand, ResourceOption, verbosity_option from clapper.click import ConfigCommand, ResourceOption, verbosity_option
from clapper.logging import setup from clapper.logging import setup
from matplotlib.ticker import MaxNLocator
logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s") logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")
def _loss_evolution(df): def create_figures(df: pandas.DataFrame) -> list[plt.figure]:
"""Plots the loss evolution over time (epochs) """Generates figures for each metric in the dataframe.
Each row of the dataframe correspond to an epoch and each column to a metric.
It is assumed that some metric names are of the form <metric>/<subset>.
All subsets for a metric will be displayed on the same figure.
Parameters Parameters
---------- ----------
df : pandas.DataFrame df:
dataframe containing the training logs Pandas dataframe containing the data to plot.
Returns Returns
------- -------
matplotlib.figure.Figure: Figure to be displayed or saved to file figures:
List of matplotlib figures, one per metric.
""" """
import numpy
figure = plt.figure()
axes = figure.gca()
axes.plot(df.epoch.values, df.loss.values, label="Training")
if "validation_loss" in df.columns:
axes.plot(
df.epoch.values, df.validation_loss.values, label="Validation"
)
# shows a red dot on the location with the minima on the validation set
lowest_index = numpy.argmin(df["validation_loss"])
axes.plot(
df.epoch.values[lowest_index],
df.validation_loss[lowest_index],
"mo",
label=f"Lowest validation ({df.validation_loss[lowest_index]:.3f}@{df.epoch[lowest_index]})",
)
if "extra_validation_losses" in df.columns:
# These losses are in array format. So, we read all rows, then create a
# 2d array. We transpose the array to iterate over each column and
# plot the losses individually. They are numbered from 1.
df["extra_validation_losses"] = df["extra_validation_losses"].apply(
lambda x: numpy.fromstring(x.strip("[]"), sep=" ")
)
losses = numpy.vstack(df.extra_validation_losses.values).T
for n, k in enumerate(losses):
axes.plot(df.epoch.values, k, label=f"Extra validation {n+1}")
axes.set_title("Loss over time")
axes.set_xlabel("Epoch")
axes.set_ylabel("Loss")
axes.legend(loc="best")
axes.grid(alpha=0.3)
figure.set_layout_engine("tight")
return figure
def _hardware_utilisation(df, const):
"""Plot the CPU utilisation over time (epochs).
Parameters figures = []
----------
df : pandas.DataFrame labels = sorted(df.columns)
dataframe containing the training logs from collections import defaultdict
const : dict # Dict of metric: subset. Subset can be None.
training and hardware constants metrics_groups = defaultdict(list)
for label in labels:
# Separate the name of the subset from the metric
split_label = label.rsplit("/", 1)
metric = split_label[0]
subset = split_label[1] if len(split_label) > 1 else None
metrics_groups[metric].append(subset)
Returns for metric, subsets in metrics_groups.items():
------- figure = plt.figure()
axes = figure.gca()
matplotlib.figure.Figure: figure to be displayed or saved to file for subset in subsets:
""" if subset is None:
figure = plt.figure() axes.plot(
axes = figure.gca() df["step"].values,
df[metric],
cpu_percent = df.cpu_percent.values / const["cpu_count"] label=metric,
cpu_memory = 100 * df.cpu_rss / const["cpu_memory_total"] )
else:
axes.plot( axes.plot(
df.epoch.values, df["step"].values,
cpu_percent, df[metric + "/" + subset],
label=f"CPU usage (cores: {const['cpu_count']})", label=subset,
) )
axes.plot(
df.epoch.values, axes.xaxis.set_major_locator(MaxNLocator(integer=True))
cpu_memory, axes.set_title(metric)
label=f"CPU memory (total: {const['cpu_memory_total']:.1f} Gb)", axes.set_xlabel("Epoch")
)
if "gpu_percent" in df: axes.legend(loc="best")
axes.plot( axes.grid(alpha=0.3)
df.epoch.values, figure.set_layout_engine("tight")
df.gpu_percent.values,
label=f"GPU usage (type: {const['gpu_name']})", figures.append(figure)
)
if "gpu_memory_percent" in df: return figures
axes.plot(
df.epoch.values,
df.gpu_memory_percent.values,
label=f"GPU memory (total: {const['gpu_memory_total']:.1f} Gb)",
)
axes.set_title("Hardware utilisation over time")
axes.set_xlabel("Epoch")
axes.set_ylabel("Relative utilisation (%)")
axes.set_ylim([0, 100])
axes.legend(loc="best")
axes.grid(alpha=0.3)
figure.set_layout_engine("tight")
return figure
@click.command( @click.command(
...@@ -140,17 +93,12 @@ def _hardware_utilisation(df, const): ...@@ -140,17 +93,12 @@ def _hardware_utilisation(df, const):
.. code:: sh .. code:: sh
ptbench train-analysis -vv log.csv constants.csv ptbench train-analysis -vv results/logs
""", """,
) )
@click.argument( @click.argument(
"log", "logdir",
type=click.Path(dir_okay=False, exists=True), type=click.Path(dir_okay=True, exists=True),
)
@click.argument(
"constants",
type=click.Path(dir_okay=False, exists=True),
) )
@click.option( @click.option(
"--output-pdf", "--output-pdf",
...@@ -162,33 +110,35 @@ def _hardware_utilisation(df, const): ...@@ -162,33 +110,35 @@ def _hardware_utilisation(df, const):
) )
@verbosity_option(logger=logger, cls=ResourceOption, expose_value=False) @verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
def train_analysis( def train_analysis(
log, logdir: str,
constants, output_pdf: str,
output_pdf,
**_, **_,
): ) -> None:
"""Analyzes the training logs for loss evolution and resource """Creates a plot for each metric in the training logs and saves them in a
utilisation.""" pdf file.
Parameters
----------
logdir:
Directory containing tensorboard event files.
import pandas output_pdf:
The pdf file in which to save the plots.
"""
from matplotlib.backends.backend_pdf import PdfPages from matplotlib.backends.backend_pdf import PdfPages
constants = pandas.read_csv(constants) from ..utils.tensorboard import get_scalars
constants = dict(zip(constants.keys(), constants.values[0]))
data = pandas.read_csv(log) data = get_scalars(logdir)
# makes sure the directory to save the output PDF is there # makes sure the directory to save the output PDF is there
dirname = os.path.dirname(os.path.realpath(output_pdf)) dirname = os.path.dirname(os.path.realpath(output_pdf))
if not os.path.exists(dirname): if not os.path.exists(dirname):
os.makedirs(dirname) os.makedirs(dirname)
# now, do the analysis
with PdfPages(output_pdf) as pdf: with PdfPages(output_pdf) as pdf:
figure = _loss_evolution(data) for figure in create_figures(data):
pdf.savefig(figure) pdf.savefig(figure)
plt.close(figure) plt.close(figure)
figure = _hardware_utilisation(data, constants)
pdf.savefig(figure)
plt.close(figure)
import glob
import os
from collections import defaultdict
from typing import Any
import pandas
from tensorboard.backend.event_processing.event_accumulator import (
EventAccumulator,
)
def get_scalars(logdir: str) -> pandas.DataFrame:
    """Returns scalars stored in tensorboard event files.

    Every file matching ``events.out.tfevents.*`` directly inside ``logdir``
    is loaded, in sorted filename order, and all of its scalar tags are
    merged into a single table indexed by step.  When the same tag is logged
    for the same step more than once (e.g. in two event files), the value
    read last wins.

    Parameters
    ----------
    logdir:
        Directory containing the event files.

    Returns
    -------
    data:
        Pandas dataframe containing the results.  Each row corresponds to a
        logged step (also stored in the ``step`` column) and each remaining
        column to a scalar tag.  Rows are sorted by increasing step.  Tags
        that were not logged at a given step are left as missing values by
        pandas.  The dataframe is empty if no event file is found.
    """
    tensorboard_logs = sorted(
        glob.glob(os.path.join(logdir, "events.out.tfevents.*"))
    )

    # One record per step; keys are the integer steps reported by tensorboard.
    rows: dict[int, dict[str, Any]] = defaultdict(dict)

    for logfile in tensorboard_logs:
        event_accumulator = EventAccumulator(logfile)
        event_accumulator.Reload()

        # Different logfiles (or tags) may not cover the same steps; the
        # resulting gaps show up as missing values in the dataframe.
        for scalar_tag in event_accumulator.Tags()["scalars"]:
            for event in event_accumulator.Scalars(scalar_tag):
                row = rows[event.step]
                row["step"] = event.step
                row[scalar_tag] = event.value

    frame = pandas.DataFrame.from_dict(rows, orient="index")

    # Records are created in first-seen order, which is not monotonic when
    # tags are logged at different steps or come from several files.  Sort
    # so that consumers plotting against ``step`` get a left-to-right curve.
    return frame.sort_index()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment