medai / software / mednet · Commits · 64fda27e

Commit 64fda27e, authored 1 year ago by André Anjos
[engine.callbacks] Fix tensorboard logging; Remove CSV logging (closes #10)
parent 4fc45f2a
1 merge request: !6 Making use of LightningDataModule and simplification of data loading
Showing 3 changed files with 41 additions and 33 deletions:

  src/ptbench/engine/callbacks.py  (+16 −12)
  src/ptbench/engine/trainer.py    (+10 −4)
  src/ptbench/utils/resources.py   (+15 −17)
src/ptbench/engine/callbacks.py (+16 −12)
@@ -139,18 +139,18 @@ class LoggingCallback(lightning.pytorch.Callback):
         # We disconsider accumulate_grad_batches and assume they were all of
         # the same size. This way, the average of averages is the overall
         # average.
-        self._to_log["train_loss"] = torch.mean(
+        self._to_log["loss/train"] = torch.mean(
             torch.tensor(self._training_epoch_loss[0])
             * torch.tensor(self._training_epoch_loss[1])
         ).item()
-        self._to_log["train_epoch_time"] = epoch_time
-        self._to_log["learning_rate"] = pl_module.optimizers().defaults["lr"]
+        self._to_log["epoch-duration-seconds/train"] = epoch_time
+        self._to_log["learning-rate"] = pl_module.optimizers().defaults["lr"]

         metrics = self._resource_monitor.data
         if metrics is not None:
             for metric_name, metric_value in metrics.items():
-                self._to_log[f"train_{metric_name}"] = float(metric_value)
+                self._to_log[f"{metric_name}/train"] = float(metric_value)
         else:
             logger.warning(
                 "Unable to fetch monitoring information from "

@@ -261,12 +261,12 @@ class LoggingCallback(lightning.pytorch.Callback):
         self._resource_monitor.checkpoint()
         epoch_time = time.time() - self._start_validation_epoch_time
-        self._to_log["validation_epoch_time"] = epoch_time
+        self._to_log["epoch-duration-seconds/validation"] = epoch_time

         metrics = self._resource_monitor.data
         if metrics is not None:
             for metric_name, metric_value in metrics.items():
-                self._to_log[f"validation_{metric_name}"] = float(metric_value)
+                self._to_log[f"{metric_name}/validation"] = float(metric_value)
         else:
             logger.warning(
                 "Unable to fetch monitoring information from "

@@ -280,9 +280,9 @@ class LoggingCallback(lightning.pytorch.Callback):
         # overall average.
         for key in sorted(self._validation_epoch_loss.keys()):
             if key == 0:
-                name = "validation_loss"
+                name = "loss/validation"
             else:
-                name = f"validation_loss_{key}"
+                name = f"loss/validation-{key}"
             self._to_log[name] = torch.mean(
                 torch.tensor(self._validation_epoch_loss[key][0])

@@ -365,16 +365,20 @@ class LoggingCallback(lightning.pytorch.Callback):
         # Note: logging should happen at on_validation_end(), but
         # apparently you can't log from there
         overall_cycle_time = time.time() - self._start_training_epoch_time
-        self._to_log["train_cycle_time"] = overall_cycle_time
-        self._to_log["total_time"] = time.time() - self._start_training_time
-        self._to_log["eta"] = overall_cycle_time * (
+        self._to_log["cycle-time-seconds/train"] = overall_cycle_time
+        self._to_log["total-execution-time-seconds"] = (
+            time.time() - self._start_training_time
+        )
+        self._to_log["eta-seconds"] = overall_cycle_time * (
             trainer.max_epochs - trainer.current_epoch  # type: ignore
         )

         # Do not log during sanity check as results are not relevant
         if not trainer.sanity_checking:
             for k in sorted(self._to_log.keys()):
-                pl_module.log(k, self._to_log[k])
+                pl_module.log_dict(
+                    {k: self._to_log[k], "step": float(trainer.current_epoch)}
+                )
             self._to_log = {}
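Note for context (not part of the commit): the new scalar names follow TensorBoard's slash convention, so tags sharing a prefix, such as loss/train and loss/validation, are grouped into a single panel section, and logging through log_dict() with a "step" entry is the Lightning-documented way to plot against the epoch number instead of the global batch step. Below is a minimal, self-contained sketch of the tag convention only; the loss values and the "example-logs" directory are invented for illustration.

# Illustrative stand-alone example of slash-separated TensorBoard tags;
# values and the output directory are made up.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("example-logs")
history = [(0.9, 1.1), (0.6, 0.9), (0.4, 0.8)]  # (train, validation) losses
for epoch, (train_loss, validation_loss) in enumerate(history):
    # Tags sharing the prefix before "/" land in one "loss" panel group,
    # so both curves can be compared side by side.
    writer.add_scalar("loss/train", train_loss, global_step=epoch)
    writer.add_scalar("loss/validation", validation_loss, global_step=epoch)
writer.close()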
src/ptbench/engine/trainer.py (+10 −4)
@@ -156,9 +156,15 @@ def run(
     # Save model summary
     _, no_of_parameters = save_model_summary(output_folder, model)

-    csv_logger = lightning.pytorch.loggers.CSVLogger(output_folder, "logs_csv")
+    log_dir = "logs"
     tensorboard_logger = lightning.pytorch.loggers.TensorBoardLogger(
-        output_folder, "logs_tensorboard"
+        output_folder,
+        log_dir,
+    )
+    logger.info(
+        f"Monitor experiment with `tensorboard serve "
+        f"--logdir={output_folder}/{log_dir}/version_*/`. "
+        f"Then, open a browser on the printed address."
     )

     resource_monitor = ResourceMonitor(

@@ -172,7 +178,7 @@ def run(
         output_folder,
         "model_lowest_valid_loss",
         save_last=True,
-        monitor="validation_loss",
+        monitor="loss/validation",
         mode="min",
         save_on_train_epoch_end=True,
         every_n_epochs=checkpoint_period,

@@ -195,7 +201,7 @@ def run(
         devices=devices,
         max_epochs=max_epochs,
         accumulate_grad_batches=batch_chunk_count,
-        logger=[csv_logger, tensorboard_logger],
+        logger=tensorboard_logger,
         check_val_every_n_epoch=1,
         callbacks=[LoggingCallback(resource_monitor), checkpoint_callback],
     )
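For context only, not part of the commit: with CSV logging removed, the Trainer receives a single logger object rather than a list. A minimal sketch of that wiring follows; "results" and "logs" are illustrative stand-ins for output_folder and log_dir, and it assumes lightning 2.x with tensorboard installed.

# Minimal sketch of a single-logger Trainer setup; names are illustrative.
from lightning.pytorch import Trainer
from lightning.pytorch.loggers import TensorBoardLogger

output_folder = "results"  # illustrative output path
log_dir = "logs"
tensorboard_logger = TensorBoardLogger(output_folder, log_dir)

trainer = Trainer(
    max_epochs=1,
    # a single logger replaces the previous [csv_logger, tensorboard_logger]
    logger=tensorboard_logger,
)
# Scalars written during training can then be inspected with, e.g.:
#   tensorboard serve --logdir=results/logs/version_*/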
src/ptbench/utils/resources.py (+15 −17)
@@ -98,8 +98,8 @@ def gpu_constants() -> dict[str, str | int | float] | None:
         return retval

     # else, just update with more generic names
-    retval["gpu_driver_version"] = retval.pop("driver_version")
-    retval["gpu_memory_used_GB"] = retval.pop("memory.total")
+    retval["driver-version/gpu"] = retval.pop("driver_version")
+    retval["total-memory-GB/gpu"] = retval.pop("memory.total")
     return retval

@@ -135,12 +135,12 @@ def gpu_log() -> dict[str, float] | None:
         return result

     return {
-        "gpu_memory_used_GB": float(result["memory.used"]),
-        "gpu_memory_free_GB": float(result["memory.free"]),
-        "gpu_memory_percent": 100
+        "memory-used-GB/gpu": float(result["memory.used"]),
+        "memory-free-GB/gpu": float(result["memory.free"]),
+        "memory-percent/gpu": 100
         * float(result["memory.used"])
         / float(result["memory.total"]),
-        "gpu_percent": float(result["utilization.gpu"]),
+        "percent-usage/gpu": float(result["utilization.gpu"]),
     }

@@ -158,8 +158,8 @@ def cpu_constants() -> dict[str, int | float]:
     1. ``cpu_count`` (:py:class:`int`): number of logical CPUs available
     """
     return {
-        "cpu_memory_total_GB": psutil.virtual_memory().total / GB,
-        "cpu_count": psutil.cpu_count(logical=True),
+        "memory-total-GB/cpu": psutil.virtual_memory().total / GB,
+        "count/cpu": psutil.cpu_count(logical=True),
     }

@@ -238,12 +238,12 @@ class CPULogger:
                 # at this point, but ensures to update counts later on
                 gone.add(k)

         return {
-            "cpu_memory_used_GB": psutil.virtual_memory().used / GB,
-            "cpu_rss_GB": sum([k.rss for k in memory_info]) / GB,
-            "cpu_vms_GB": sum([k.vms for k in memory_info]) / GB,
-            "cpu_percent": sum(cpu_percent),
-            "cpu_processes": len(self.cluster) - len(gone),
-            "cpu_open_files": sum(open_files),
+            "memory-used-GB/cpu": psutil.virtual_memory().used / GB,
+            "rss-GB/cpu": sum([k.rss for k in memory_info]) / GB,
+            "vms-GB/cpu": sum([k.vms for k in memory_info]) / GB,
+            "percent-usage/cpu": sum(cpu_percent),
+            "num-processes/cpu": len(self.cluster) - len(gone),
+            "num-open-files/cpu": sum(open_files),
         }

@@ -342,10 +342,8 @@ def _monitor_worker(
         ra.acc()  # guarantees at least an entry will be available

         if summary_event.is_set():
-            summary = ra.summary().copy()
-            queue.put(summary)
+            queue.put(ra.summary().copy())
             ra.clear()
-            print(queue.get())
             summary_event.clear()

         time.sleep(interval)
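One observation added here, not part of the commit: the renamed monitoring keys end in "/cpu" or "/gpu", and LoggingCallback (see callbacks.py above) appends a further "/train" or "/validation" suffix, so the resulting TensorBoard tags are hierarchical. An illustrative, self-contained sketch with invented values:

# Values are made up; only the key-composition pattern comes from the diff.
sample = {
    "memory-used-GB/gpu": 3.2,
    "percent-usage/gpu": 81.0,
    "percent-usage/cpu": 240.0,
}
# Mirrors the f"{metric_name}/train" construction in LoggingCallback.
to_log = {f"{name}/train": float(value) for name, value in sample.items()}
print(to_log)
# {'memory-used-GB/gpu/train': 3.2, 'percent-usage/gpu/train': 81.0,
#  'percent-usage/cpu/train': 240.0}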