medai / software / mednet · Commits · 367ca62f

Commit 367ca62f, authored 1 year ago by Daniel CARRON

    Update train_analysis, extract values from tensorboard logs

Parent: fdcf8e84
No related branches, tags, or merge requests found.
Pipeline #76501 failed · 1 year ago · stages: qa, test, doc, dist

Showing 3 changed files, with 143 additions and 124 deletions:

- src/ptbench/scripts/experiment.py (+14, −0)
- src/ptbench/scripts/train_analysis.py (+74, −124)
- src/ptbench/utils/tensorboard.py (+55, −0)
src/ptbench/scripts/experiment.py (+14, −0)

```diff
@@ -297,8 +297,22 @@ def experiment(
         resume_from=resume_from,
         balance_classes=balance_classes,
     )
     logger.info("Ended training")
+
+    logger.info("Started train analysis")
+
+    from .train_analysis import train_analysis
+
+    logdir = os.path.join(train_output_folder, "logs")
+    output_pdf = os.path.join(train_output_folder, "train_analysis.pdf")
+    ctx.invoke(
+        train_analysis,
+        logdir=logdir,
+        output_pdf=output_pdf,
+    )
+
+    logger.info("Ended train analysis")
     logger.info("Started predicting")

     from .predict import predict

```
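For reference, the `ctx.invoke(...)` pattern used above is standard click: one command's callback runs another command with explicit parameter values. A minimal, self-contained sketch of that pattern (the command names here are hypothetical, not the real mednet entry points):

```python
import click


@click.command()
@click.option("--output-pdf", default="analysis.pdf")
def analysis(output_pdf):
    """Stand-in for the real train_analysis command."""
    click.echo(f"writing plots to {output_pdf}")


@click.command()
@click.pass_context
def experiment(ctx):
    """Runs a (pretend) training step, then chains into the analysis step."""
    click.echo("training done")
    # forwards explicit parameter values to the other command's callback
    ctx.invoke(analysis, output_pdf="results/train_analysis.pdf")


if __name__ == "__main__":
    experiment()
```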
src/ptbench/scripts/train_analysis.py (+74, −124)

```diff
@@ -8,126 +8,79 @@ import os

 import click
 import matplotlib.pyplot as plt
+import pandas

 from clapper.click import ConfigCommand, ResourceOption, verbosity_option
 from clapper.logging import setup
+from matplotlib.ticker import MaxNLocator

 logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")


-def _loss_evolution(df):
-    """Plots the loss evolution over time (epochs)
+def create_figures(df: pandas.DataFrame) -> list[plt.figure]:
+    """Generates figures for each metric in the dataframe.
+
+    Each row of the dataframe correspond to an epoch and each column to a metric.
+    It is assumed that some metric names are of the form <metric>/<subset>.
+    All subsets for a metric will be displayed on the same figure.

     Parameters
     ----------

-    df : pandas.DataFrame
-        dataframe containing the training logs
+    df:
+        Pandas dataframe containing the data to plot.

     Returns
     -------

-    matplotlib.figure.Figure: Figure to be displayed or saved to file
+    figures:
+        List of matplotlib figures, one per metric.
     """
-    import numpy
-
-    figure = plt.figure()
-    axes = figure.gca()
-
-    axes.plot(df.epoch.values, df.loss.values, label="Training")
-    if "validation_loss" in df.columns:
-        axes.plot(
-            df.epoch.values, df.validation_loss.values, label="Validation"
-        )
-        # shows a red dot on the location with the minima on the validation set
-        lowest_index = numpy.argmin(df["validation_loss"])
-        axes.plot(
-            df.epoch.values[lowest_index],
-            df.validation_loss[lowest_index],
-            "mo",
-            label=f"Lowest validation ({df.validation_loss[lowest_index]:.3f} @ {df.epoch[lowest_index]})",
-        )
-
-    if "extra_validation_losses" in df.columns:
-        # These losses are in array format. So, we read all rows, then create a
-        # 2d array. We transpose the array to iterate over each column and
-        # plot the losses individually. They are numbered from 1.
-        df["extra_validation_losses"] = df["extra_validation_losses"].apply(
-            lambda x: numpy.fromstring(x.strip("[]"), sep=" ")
-        )
-        losses = numpy.vstack(df.extra_validation_losses.values).T
-        for n, k in enumerate(losses):
-            axes.plot(df.epoch.values, k, label=f"Extra validation {n+1}")
-
-    axes.set_title("Loss over time")
-    axes.set_xlabel("Epoch")
-    axes.set_ylabel("Loss")
-
-    axes.legend(loc="best")
-    axes.grid(alpha=0.3)
-    figure.set_layout_engine("tight")
-
-    return figure
-
-
-def _hardware_utilisation(df, const):
-    """Plot the CPU utilisation over time (epochs).
-
-    Parameters
-    ----------
-
-    df : pandas.DataFrame
-        dataframe containing the training logs
-
-    const : dict
-        training and hardware constants
-
-    Returns
-    -------
-
-    matplotlib.figure.Figure: figure to be displayed or saved to file
-    """
-    figure = plt.figure()
-    axes = figure.gca()
-
-    cpu_percent = df.cpu_percent.values / const["cpu_count"]
-    cpu_memory = 100 * df.cpu_rss / const["cpu_memory_total"]
-
-    axes.plot(
-        df.epoch.values,
-        cpu_percent,
-        label=f"CPU usage (cores: {const['cpu_count']})",
-    )
-    axes.plot(
-        df.epoch.values,
-        cpu_memory,
-        label=f"CPU memory (total: {const['cpu_memory_total']:.1f} Gb)",
-    )
-    if "gpu_percent" in df:
-        axes.plot(
-            df.epoch.values,
-            df.gpu_percent.values,
-            label=f"GPU usage (type: {const['gpu_name']})",
-        )
-    if "gpu_memory_percent" in df:
-        axes.plot(
-            df.epoch.values,
-            df.gpu_memory_percent.values,
-            label=f"GPU memory (total: {const['gpu_memory_total']:.1f} Gb)",
-        )
-    axes.set_title("Hardware utilisation over time")
-    axes.set_xlabel("Epoch")
-    axes.set_ylabel("Relative utilisation (%)")
-    axes.set_ylim([0, 100])
-
-    axes.legend(loc="best")
-    axes.grid(alpha=0.3)
-    figure.set_layout_engine("tight")
-    return figure
+    figures = []
+
+    labels = sorted(df.columns)
+
+    from collections import defaultdict
+
+    # Dict of metric: subset. Subset can be None.
+    metrics_groups = defaultdict(list)
+
+    for label in labels:
+        # Separate the name of the subset from the metric
+        split_label = label.rsplit("/", 1)
+        metric = split_label[0]
+        subset = split_label[1] if len(split_label) > 1 else None
+        metrics_groups[metric].append(subset)
+
+    for metric, subsets in metrics_groups.items():
+        figure = plt.figure()
+        axes = figure.gca()
+
+        for subset in subsets:
+            if subset is None:
+                axes.plot(
+                    df["step"].values,
+                    df[metric],
+                    label=metric,
+                )
+            else:
+                axes.plot(
+                    df["step"].values,
+                    df[metric + "/" + subset],
+                    label=subset,
+                )
+
+        axes.xaxis.set_major_locator(MaxNLocator(integer=True))
+
+        axes.set_title(metric)
+        axes.set_xlabel("Epoch")
+
+        axes.legend(loc="best")
+        axes.grid(alpha=0.3)
+        figure.set_layout_engine("tight")
+
+        figures.append(figure)
+
+    return figures
@@ -140,17 +93,12 @@ def _hardware_utilisation(df, const):

     .. code:: sh

-       ptbench train-analysis -vv log.csv constants.csv
+       ptbench train-analysis -vv results/logs
 """,
 )
 @click.argument(
-    "log",
-    type=click.Path(dir_okay=False, exists=True),
-)
-@click.argument(
-    "constants",
-    type=click.Path(dir_okay=False, exists=True),
+    "logdir",
+    type=click.Path(dir_okay=True, exists=True),
 )
 @click.option(
     "--output-pdf",
@@ -162,33 +110,35 @@ def _hardware_utilisation(df, const):
 )
 @verbosity_option(logger=logger, cls=ResourceOption, expose_value=False)
 def train_analysis(
-    log,
-    constants,
-    output_pdf,
+    logdir: str,
+    output_pdf: str,
     **_,
-):
-    """Analyzes the training logs for loss evolution and resource
-    utilisation."""
-
-    import pandas
-
+) -> None:
+    """Creates a plot for each metric in the training logs and saves them in a
+    pdf file.
+
+    Parameters
+    ----------
+
+    logdir:
+        Directory containing tensorboard event files.
+
+    output_pdf:
+        The pdf file in which to save the plots.
+    """
     from matplotlib.backends.backend_pdf import PdfPages

-    constants = pandas.read_csv(constants)
-    constants = dict(zip(constants.keys(), constants.values[0]))
-    data = pandas.read_csv(log)
+    from ..utils.tensorboard import get_scalars
+
+    data = get_scalars(logdir)

     # makes sure the directory to save the output PDF is there
     dirname = os.path.dirname(os.path.realpath(output_pdf))
     if not os.path.exists(dirname):
         os.makedirs(dirname)

     # now, do the analysis
     with PdfPages(output_pdf) as pdf:
-        figure = _loss_evolution(data)
-        pdf.savefig(figure)
-        plt.close(figure)
-
-        figure = _hardware_utilisation(data, constants)
-        pdf.savefig(figure)
-        plt.close(figure)
+        for figure in create_figures(data):
+            pdf.savefig(figure)
+            plt.close(figure)
```
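The heart of the new `create_figures` is the tag-naming convention: columns named `<metric>/<subset>` share one figure per metric, while bare columns get their own. A toy end-to-end sketch of that flow (synthetic dataframe and a hypothetical output path, not mednet's real logs):

```python
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas
from matplotlib.backends.backend_pdf import PdfPages

# synthetic stand-in for the dataframe returned by get_scalars()
df = pandas.DataFrame(
    {
        "step": [0, 1, 2],
        "loss/train": [0.9, 0.5, 0.3],
        "loss/validation": [1.0, 0.6, 0.45],
        "learning_rate": [0.01, 0.01, 0.001],  # no "/": gets its own figure
    }
)

# group subsets under their metric, mirroring the rsplit("/", 1) logic
metrics_groups = defaultdict(list)
for label in sorted(c for c in df.columns if c != "step"):
    split_label = label.rsplit("/", 1)
    metrics_groups[split_label[0]].append(
        split_label[1] if len(split_label) > 1 else None
    )
# -> {"learning_rate": [None], "loss": ["train", "validation"]}

# one figure per metric, one curve per subset, all saved to a single PDF
with PdfPages("/tmp/train_analysis_demo.pdf") as pdf:  # hypothetical path
    for metric, subsets in metrics_groups.items():
        figure = plt.figure()
        axes = figure.gca()
        for subset in subsets:
            column = metric if subset is None else f"{metric}/{subset}"
            axes.plot(df["step"].values, df[column], label=subset or metric)
        axes.set_title(metric)
        axes.legend(loc="best")
        pdf.savefig(figure)
        plt.close(figure)
```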
src/ptbench/utils/tensorboard.py (new file, +55, −0)

```python
import glob
import os

from collections import defaultdict
from typing import Any

import pandas

from tensorboard.backend.event_processing.event_accumulator import (
    EventAccumulator,
)


def get_scalars(logdir: str) -> pandas.DataFrame:
    """Returns scalars stored in tensorboard event files.

    Parameters
    ----------

    logdir:
        Directory containing the event files.

    Returns
    -------

    data:
        Pandas dataframe containing the results. Rows correspond to an epoch,
        columns to the metrics.
    """
    tensorboard_logs = sorted(
        glob.glob(os.path.join(logdir, "events.out.tfevents.*"))
    )

    data: dict[str, dict[str, Any]] = defaultdict(dict)
    headers = {"step"}

    for logfile in tensorboard_logs:
        event_accumulator = EventAccumulator(logfile)
        event_accumulator.Reload()

        tags = event_accumulator.Tags()
        # Can cause issues if different logfiles don't have the same tags
        for scalar_tag in tags["scalars"]:
            headers.add(scalar_tag)
            tag_list = event_accumulator.Scalars(scalar_tag)
            for tag_data in tag_list:
                _ = tag_data.wall_time
                step = tag_data.step
                value = tag_data.value

                data[step]["step"] = step
                data[step][scalar_tag] = value

    data = pandas.DataFrame.from_dict(data, orient="index")

    return data
```
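To see `get_scalars` round-trip, one can write a few scalars with PyTorch's `SummaryWriter` (ptbench trains with PyTorch, so torch should already be installed; the paths and values below are made up) and read them back as a dataframe:

```python
from torch.utils.tensorboard import SummaryWriter

from ptbench.utils.tensorboard import get_scalars

logdir = "/tmp/demo-logs"  # hypothetical location

# write three steps of two scalar tags into event files under logdir
writer = SummaryWriter(log_dir=logdir)
train, valid = [0.9, 0.5, 0.3], [1.0, 0.6, 0.45]
for step in range(3):
    writer.add_scalar("loss/train", train[step], step)
    writer.add_scalar("loss/validation", valid[step], step)
writer.close()

# read them back; rows are steps, columns are the scalar tags
df = get_scalars(logdir)
print(df)
# roughly (up to float32 rounding):
#    step  loss/train  loss/validation
# 0     0         0.9             1.00
# 1     1         0.5             0.60
# 2     2         0.3             0.45
```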
Daniel CARRON (@dcarron) mentioned in issue #19 (closed) · 1 year ago