Commit dfbc4f22, authored 8 years ago by André Anjos

[backend] Improve worker job handling (cannot use multiprocessing)

Parent: cf4e7bca
Merge request: !194 Scheduler

Showing 1 changed file: beat/web/backend/schedule.py (+144 additions, −48 deletions)
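This commit swaps the `multiprocessing`-based launcher for plain `subprocess` calls: forking the scheduler process would make children inherit open database connections and other live Django state, so the worker now spawns a standalone `process.py` interpreter and hands it only serializable data, namely the path to the `execute` program and the job split's primary key. A minimal sketch of the new launch shape (names are illustrative; `process.py` is assumed to be an executable script, as in the diff below):

    import subprocess

    def launch_split(process_script, execute, split_pk):
        # Spawn a fresh interpreter for the user code.  Only strings
        # cross the process boundary: no ORM objects and no open DB
        # connections, which is why multiprocessing.Process had to go.
        return subprocess.Popen([process_script, execute, str(split_pk)])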
@@ -27,12 +27,16 @@
 '''
 Scheduling functions and utilities
 '''
 
+import os
+import sys
 import time
 import socket
+import signal
+import subprocess
 
 import logging
 logger = logging.getLogger(__name__)
 
-import multiprocessing
 import psutil
 import simplejson
 
 from django.conf import settings
@@ -242,7 +246,8 @@ def schedule():
 
   return assigned_splits
 
-def process(split, environments, cpulimit=None, cache=settings.CACHE_ROOT):
+@transaction.atomic
+def process(split_pk, execute, cpulimit=None, cache=settings.CACHE_ROOT):
   '''
   Process assigned job splits using beat.core
 
   This task executes the user algorithm on a subprocess. It also serves the
@@ -265,10 +270,10 @@ def process(split, environments, cpulimit=None, cache=settings.CACHE_ROOT):
 
   Parameters:
 
-    split (:py:class:`JobSplit`): The JobSplit to process
+    split_pk (int): The primary-key of the JobSplit to process
 
-    environments (dict): A dictionary mapping environment objects from the
-      Django database to their actual location on the file system
+    execute (str): The path to the ``execute`` program to use for running
+      the user code associated with this job split.
 
     cpulimit (str, Optional): The path to the ``cpulimit`` program to use for
       limiting the user code in CPU usage. If not set, then don't use it,
@@ -280,7 +285,8 @@ def process(split, environments, cpulimit=None, cache=settings.CACHE_ROOT):
 
   '''
 
-  split.start() #officially starts the split
+  # lock split
+  split = JobSplit.objects.select_for_update().get(pk=split_pk)
 
   config = simplejson.loads(split.job.block.command)
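The rewritten `process()` above shows the hand-off discipline that replaces pickled objects: the caller passes a primary key, and the callee re-fetches and locks the row itself. A minimal sketch of that lock-by-pk pattern, assuming a Django model named `JobSplit` with a `status` field (illustrative, not beat.web's exact schema):

    from django.db import transaction

    from .models import JobSplit  # hypothetical app-local import

    @transaction.atomic
    def start_split(split_pk):
        # Re-fetch by primary key inside a transaction;
        # select_for_update() row-locks the split until commit, so two
        # workers racing on the same split serialize instead of both
        # starting it.
        split = JobSplit.objects.select_for_update().get(pk=split_pk)
        split.status = 'processing'  # illustrative state transition
        split.save()
        return split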
@@ -288,11 +294,11 @@ def process(split, environments, cpulimit=None, cache=settings.CACHE_ROOT):
 
   if split.job.block.required_slots > 1:
 
     if (split.start_index) is None or (split.end_index is None):
-      message = "The split %d/%d running on worker `%s' for " \
+      message = "The split %d/%d (pid=%d) running on worker `%s' for " \
           "block `%s' of experiment `%s' could not " \
           "be completed: indexes are missing!" % \
           (split.split_index+1, split.job.block.required_slots,
-          split.worker, split.job.block.name,
+          split.process_id, split.worker, split.job.block.name,
           split.job.block.experiment.fullname())
       logger.error(message)
       split.end(Result(status=1, usrerr=settings.DEFAULT_USER_ERROR,
@@ -323,25 +329,6 @@ def process(split, environments, cpulimit=None, cache=settings.CACHE_ROOT):
           )
       raise RuntimeError(message)
 
-  # Check we have a compatible environment to execute the user algorithm
-  envkey = config['environment']
-  envinfo = environments.get('%(name)s (%(version)s)' % envkey)
-  execute_path = envinfo['execute'] if envinfo else None
-
-  if execute_path is None:
-    message = "Environment `%s' is not available for split %d/%d " \
-        "running at worker `%s', for block `%s' of experiment " \
-        "`%s': %s" % (split.job.block.environment,
-        split.split_index+1, split.job.block.required_slots,
-        split.worker, split.job.block.name,
-        split.job.block.experiment.fullname(),
-        "Available environments are `%s'" % \
-            '|'.join(environments.keys()),
-        )
-    raise RuntimeError(message)
-
   queue = split.job.block.queue
   nb_cores = queue.cores_per_slot
   if (nb_cores > 0) and (cpulimit is None):
@@ -355,7 +342,7 @@ def process(split, environments, cpulimit=None, cache=settings.CACHE_ROOT):
 
   # n.b.: with executor may crash on the database view setup
   with executor:
-    result = executor.process(execute_path=execute_path,
+    result = executor.process(execute_path=execute,
        virtual_memory_in_megabytes=queue.memory_limit,
        max_cpu_percent=int(100*float(nb_cores)), #allows for 150%
        cpulimit_path=cpulimit,
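The `max_cpu_percent` argument above converts the queue's per-slot core allowance into `cpulimit`'s percentage convention, where each core counts as 100%. A quick check of the arithmetic used in the diff:

    def max_cpu_percent(cores_per_slot):
        # cpulimit counts 100% per core: 1.5 cores "allows for 150%",
        # as the inline comment in the diff notes; int() truncates.
        return int(100 * float(cores_per_slot))

    assert max_cpu_percent(1) == 100
    assert max_cpu_percent(1.5) == 150
    assert max_cpu_percent(4) == 400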
@@ -371,35 +358,54 @@ def process(split, environments, cpulimit=None, cache=settings.CACHE_ROOT):
             syserr=result['system_error'],
             _stats=simplejson.dumps(result['statistics'], indent=2),
             ))
     logger.info("Process `%s' for split `%s' ended gracefully",
         split.process_id, split)
 
   except:
     from traceback import format_exc
     from beat.core.stats import Statistics
     logger.error(format_exc())
     logger.warn("Process `%s' for split `%s' ended with an error",
         split.process_id, split)
     split.end(Result(status=1, usrerr=settings.DEFAULT_USER_ERROR,
         syserr=format_exc(),))
 
-def multiprocess(*args, **kwargs):
-  '''
-  Runs :py:func:`process` through a forked subprocess
-
-  Input arguments are the same as for :py:func:`process`.
-
-  Returns:
-
-    multiprocessing.Process: an instance of a Process you can call
-    ``join()`` at.
-
-  '''
-
-  retval = multiprocessing.Process(target=process, args=args, kwargs=kwargs)
-  retval.start()
-  return retval
+@transaction.atomic
+def worker_update():
+  '''
+  Updates the worker state in the database from local readings
+  '''
+
+  # myself, raises if I cannot find me
+  worker = Worker.objects.select_for_update().get(name=socket.gethostname())
+
+  # check I have at least all cores and memory I'm supposed to have
+  cores = psutil.cpu_count()
+  ram = psutil.virtual_memory().total/(1024*1024)
+  worker.info = ''
+
+  if cores < worker.cores:
+    logger.warn("Worker `%s' only has %d cores which is less then the " \
+        "value declared on the database - it's not a problem, but note " \
+        "this worker may get overloaded", worker, cores)
+    worker.info += 'only %d cores; ' % cores
+
+  if ram < worker.memory:
+    logger.warn("Worker `%s' only has %d Mb of RAM which is less then " \
+        "the value declared on the database - it's not a problem, but " \
+        "note this worker may get overloaded", worker, ram)
+    worker.info += 'only %d Mb of RAM; ' % ram
+
+  # update process and memory usage
+  worker.used_cores = int(psutil.cpu_percent())
+  worker.used_memory = int(psutil.virtual_memory().percent)
+
+  # save current worker state
+  worker.info += 'updated: ' + time.asctime()
+  worker.active = True
+  worker.save()
 
-def work(environments, cpulimit):
+@transaction.atomic
+def work(environments, cpulimit, process):
   '''
   Launches user code on isolated processes
 
   This function is supposed to be called asynchronously, by a scheduling
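The new `worker_update()` acts as a heartbeat: each pass re-reads the host's capacity and load with `psutil` and writes it back to the worker's database row. The readings themselves are plain `psutil` calls and can be tried standalone:

    import time

    import psutil

    def read_host_state():
        # The same local readings worker_update() pushes to the
        # database: capacity (cores, RAM) plus instantaneous
        # utilization percentages.
        return {
            'cores': psutil.cpu_count(),
            'ram_mb': psutil.virtual_memory().total // (1024 * 1024),
            'used_cores': int(psutil.cpu_percent()),
            'used_memory': int(psutil.virtual_memory().percent),
            'updated': time.asctime(),
        }

    if __name__ == '__main__':
        print(read_host_state())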
@@ -409,26 +415,74 @@ def work(environments, cpulimit):
 
   Job split cancelling is executed by setting the split state as
   ``CANCELLED`` and waiting for this function to handle it.
 
 
   Parameters:
 
     environments (dict): A dictionary containing installed environments,
       their description and execute-file paths.
 
     cpulimit (str): The path to the ``cpulimit`` program to use for
       limiting the user code in CPU usage. If set to ``None``, then don't use
       it, even if the select user queue has limits.
 
+    process (str): The path to the ``process.py`` program to use for running
+      the user code on isolated processes.
+
   '''
 
   # myself, raises if I cannot find me
-  worker = Worker.objects.select_for_update().get(name=socket.gethostname())
+  worker = Worker.objects.get(name=socket.gethostname())
 
   # cancel job splits
   for j in JobSplit.objects.select_for_update().filter(worker=worker,
       status=Job.CANCELLED, end_date__isnull=True,
       process_id__isnull=False):
-    import signal
     signal.signal(signal.SIGKILL, j.process_id)
     j.end(None, Job.CANCELLED)
 
+  # cmdline base argument
+  cmdline = [process]
+  if cpulimit is not None:
+    cmdline += ['--cpulimit=%s' % cpulimit]
+  if settings.DEBUG:
+    cmdline += ['-vv']
+
   # start newly assigned job splits
   for j in JobSplit.objects.select_for_update().filter(worker=worker,
-      status=Job.SCHEDULED, start_date__isnull=True,
-      process_id__isnull=True):
-    multiprocess(j, environments, cpulimit)
+      status=Job.QUEUED, start_date__isnull=True,
+      process_id__isnull=True):
+
+    execute = pick_execute(j, environments)
+    if execute is None:
+      message = "Environment `%s' is not available for split %d/%d " \
+          "running at worker `%s', for block `%s' of experiment " \
+          "`%s': %s" % (split.job.block.environment,
+          split.split_index+1, split.job.block.required_slots,
+          split.worker, split.job.block.name,
+          split.job.block.experiment.fullname(),
+          "Available environments are `%s'" % \
+              '|'.join(environments.keys()),
+          )
+      j.end(Result(status=1, usrerr=settings.DEFAULT_USER_ERROR,
+          syserr=message))
+
+    # if we get to this point, then we launch the user process
+    subprocess.Popen(cmdline + [execute, str(j.pk)])
 
-def refresh_environments(paths=None):
-  '''
-  Refresh current list of known environments
+def resolve_process_path():
+  '''
+  Returns the path to cpulimit
+  '''
+
+  basedir = os.path.dirname(os.path.realpath(sys.argv[0]))
+  r = os.path.join(basedir, 'process.py')
+  if not os.path.exists(r):
+    raise RuntimeError("Cannot find `process.py' at `%s' - please check " \
+        "your installation" % basedir)
+  return r
+
+
+def find_environments(paths=None):
+  '''
+  Finds list of known environments
 
   Parameters:
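`work()` now builds the child's command line incrementally: the `process.py` script first, optional flags next, then the per-split positional arguments at `Popen` time. A sketch of that construction, reusing the flag spellings from the diff (`--cpulimit=...`, `-vv`); the helper itself is hypothetical:

    import subprocess

    def build_and_launch(process_script, execute, split_pk,
                         cpulimit=None, debug=False):
        # Base command line, shared across all splits this worker starts.
        cmdline = [process_script]
        if cpulimit is not None:
            cmdline += ['--cpulimit=%s' % cpulimit]
        if debug:
            cmdline += ['-vv']
        # Per-split positional arguments are appended at launch time.
        return subprocess.Popen(cmdline + [execute, str(split_pk)])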
@@ -467,3 +521,45 @@ def refresh_environments(paths=None):
     path = pkg_resources.resource_filename(__name__, 'environments')
 
   logger.debug("Search for environments at `%s'", path)
 
   return discover_environments([path])
+
+
+@transaction.atomic
+def worker_shutdown():
+  """
+  Standard worker shutdown procedure
+
+  Stop all running/assigned splits and then mark the worker as inactive.
+  """
+
+  # myself, raises if I cannot find me
+  worker = Worker.objects.select_for_update().get(name=socket.gethostname())
+
+  message = 'Cancelled on forced worker shutdown (maintenance)' \
+      ' - you may retry submitting your experiment shortly'
+
+  # cancel job splits which are running
+  for j in JobSplit.objects.select_for_update().filter(worker=worker,
+      status=(Job.CANCELLED, Job.PROCESSING), end_date__isnull=True,
+      process_id__isnull=False):
+    signal.signal(signal.SIGKILL, j.process_id)
+    j.end(Result(status=1, usrerr=message))
+
+  # cancel job splits which were not yet started
+  for j in JobSplit.objects.select_for_update().filter(worker=worker,
+      status=Job.QUEUED, start_date__isnull=True,
+      process_id__isnull=True):
+    j.end(Result(status=1, usrerr=message))
+
+  # lock worker and modify it
+  worker.active = False
+  worker.used_cores = 0
+  worker.used_memory = 0
+  worker.info = 'Worker shutdown by system administrator'
+  worker.save()
+
+
+def pick_execute(split, environments):
+  """
+  Resolves the path to the ``execute`` program to use for the split
+  """
+
+  # Check we have a compatible environment to execute the user algorithm
+  envinfo = environments.get(split.job.block.environment.natural_key())
+  return envinfo['execute'] if envinfo else None
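`pick_execute()` centralizes the environment lookup the old `process()` did inline: `environments` maps an environment's `natural_key()` to its metadata, including the `execute` path, and a missing key signals that the environment is not installed on this worker. A standalone illustration of that lookup (the tuple key shape follows Django's natural-key convention and is an assumption, not beat.web's documented format):

    # Hypothetical registry: environment natural key -> metadata.
    ENVIRONMENTS = {
        ('Scientific Python', '0.0.4'): {'execute': '/opt/beat/bin/execute'},
    }

    def pick_execute(natural_key, environments):
        # Returns the execute path for a known environment, or None so
        # the caller can fail the split with a readable error message.
        envinfo = environments.get(natural_key)
        return envinfo['execute'] if envinfo else None

    assert pick_execute(('Scientific Python', '0.0.4'),
                        ENVIRONMENTS) == '/opt/beat/bin/execute'
    assert pick_execute(('Matlab', '1.0'), ENVIRONMENTS) is None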