Commit 078cf0fd authored Dec 06, 2017 by Philip ABBET
Refactoring of the 'CachedDataSink' class
parent 73fcb84a
Changes 15
beat/backend/python/algorithm.py
...
...
@@ -355,6 +355,7 @@ class Algorithm(object):
        self.data = simplejson.load(f)

        self.code_path = self.storage.code.path
        self.code = self.storage.code.load()

        self.groups = self.data['groups']
...
...
@@ -772,3 +773,88 @@ class Algorithm(object):
            raise #just re-raise the user exception

        return Runner(self.__module, klass, self, exc)

    def json_dumps(self, indent=4):
        """Dumps the JSON declaration of this object in a string

        Parameters:

          indent (int): The number of indentation spaces at every indentation level

        Returns:

          str: The JSON representation for this object

        """

        return simplejson.dumps(self.data, indent=indent,
                                cls=utils.NumpyJSONEncoder)

    def __str__(self):
        return self.json_dumps()

    def write(self, storage=None):
        """Writes contents to prefix location

        Parameters:

          storage (Storage, optional): If you pass a new storage, then this object
            will be written to that storage point rather than its default.

        """

        if self.data['language'] == 'unknown':
            raise RuntimeError("algorithm has no programming language set")

        if storage is None:
            if not self._name:
                raise RuntimeError("algorithm has no name")
            storage = self.storage #overwrite

        storage.save(str(self), self.code, self.description)

    def export(self, prefix):
        """Recursively exports itself into another prefix

        Dataformats and associated libraries are also copied.

        Parameters:

          prefix (str): A path to a prefix that must different then my own.

        Returns:

          None

        Raises:

          RuntimeError: If prefix and self.prefix point to the same directory.

        """

        if not self._name:
            raise RuntimeError("algorithm has no name")

        if not self.valid:
            raise RuntimeError("algorithm is not valid")

        if os.path.samefile(prefix, self.prefix):
            raise RuntimeError("Cannot export algorithm to the same prefix (%s == " \
                               "%s)" % (prefix, self.prefix))

        for k in self.libraries.values():
            k.export(prefix)

        for k in self.dataformats.values():
            k.export(prefix)

        self.write(Storage(prefix, self.name, self.language))
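The hunk above adds JSON serialization and export helpers to Algorithm. The following is a minimal usage sketch and is not part of the commit: the prefix paths and the algorithm name are hypothetical, and the constructor is assumed to accept a prefix and an algorithm name as elsewhere in beat.backend.python.

# Usage sketch only -- paths and the algorithm name below are hypothetical.
from beat.backend.python.algorithm import Algorithm

algorithm = Algorithm('/path/to/prefix', 'user/my_algorithm/1')

print(algorithm.json_dumps(indent=2))      # JSON declaration as a string
print(str(algorithm))                      # __str__() delegates to json_dumps()

# export() copies the algorithm together with its dataformats and libraries
algorithm.export('/path/to/other/prefix')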
beat/backend/python/data.py
...
...
@@ -298,6 +298,9 @@ class CachedFileLoader(object):
        (self.filenames, indices_filenames, data_checksum_filenames, indices_checksum_filenames) = \
                getAllFilenames(filename, start_index, end_index)

        if len(self.filenames) == 0:
            return False

        check_consistency(self.filenames, data_checksum_filenames)
...
...
@@ -487,6 +490,10 @@ class DataSink(object):
        pass

    def close(self):
        pass


#----------------------------------------------------------
...
...
@@ -633,91 +640,21 @@ class CachedDataSink(DataSink):
"""
def
__init__
(
self
):
self
.
filename
=
None
self
.
process_id
=
None
self
.
split_id
=
None
self
.
max_size
=
None
self
.
_nb_bytes_written
=
0
self
.
_write_duration
=
0
self
.
_nb_bytes_written_split
=
0
self
.
_new_file
=
False
self
.
_cur_filename
=
None
self
.
_cur_file
=
None
self
.
_cur_indexname
=
None
self
.
_cur_index
=
None
self
.
_cur_start_index
=
None
self
.
_cur_end_index
=
None
self
.
_filenames
=
[]
self
.
_filenames_tmp
=
[]
self
.
_tmp_ext
=
'.tmp'
self
.
encoding
=
None
self
.
dataformat
=
None
self
.
start_index
=
None
self
.
end_index
=
None
def
_curTmpFilenameWithSplit
(
self
):
filename
,
data_ext
=
os
.
path
.
splitext
(
self
.
filename
)
dirname
=
os
.
path
.
dirname
(
filename
)
basename
=
os
.
path
.
basename
(
filename
)
fd
,
tmp_file
=
tempfile
.
mkstemp
(
dir
=
dirname
,
prefix
=
basename
+
'.'
+
str
(
self
.
process_id
)
+
'.'
+
str
(
self
.
split_id
)
+
'_'
,
suffix
=
data_ext
+
self
.
_tmp_ext
,
)
os
.
close
(
fd
)
# Preserve only the name
os
.
unlink
(
tmp_file
)
return
tmp_file
def
_curFilenameWithIndices
(
self
):
self
.
data_file
=
None
self
.
index_file
=
None
self
.
last_written_data_index
=
None
basename
=
os
.
path
.
basename
(
self
.
filename
)
basename
,
data_ext
=
os
.
path
.
splitext
(
basename
)
dirname
=
os
.
path
.
dirname
(
self
.
filename
)
return
os
.
path
.
join
(
dirname
,
basename
+
'.'
+
str
(
self
.
_cur_start_index
)
+
'.'
+
str
(
self
.
_cur_end_index
)
+
data_ext
)
self
.
nb_bytes_written
=
0
self
.
write_duration
=
0
def
_tmpIndexFilenameFromTmpFilename
(
self
,
tmp_filename
):
return
os
.
path
.
splitext
(
os
.
path
.
splitext
(
tmp_filename
)[
0
])[
0
]
+
'.index'
+
self
.
_tmp_ext
def
_indexFilenameFromFilename
(
self
,
filename
):
return
os
.
path
.
splitext
(
filename
)[
0
]
+
'.index'
def
_openAndWriteHeader
(
self
):
"""Write the header of the current file"""
# Close current file if open
self
.
_close_current
()
# Open new file in writing mode
self
.
_cur_filename
=
self
.
_curTmpFilenameWithSplit
()
self
.
_cur_indexname
=
\
self
.
_tmpIndexFilenameFromTmpFilename
(
self
.
_cur_filename
)
self
.
_filenames_tmp
.
append
(
self
.
_cur_filename
)
try
:
self
.
_cur_file
=
open
(
self
.
_cur_filename
,
'wb'
)
self
.
_cur_index
=
open
(
self
.
_cur_indexname
,
'wt'
)
except
:
return
# Write dataformat
self
.
_cur_file
.
write
(
six
.
b
(
'%s
\n
%s
\n
'
%
\
(
self
.
encoding
,
self
.
dataformat
.
name
)))
self
.
_cur_file
.
flush
()
# Reset few flags
self
.
_cur_start_index
=
None
self
.
_cur_end_index
=
None
self
.
_new_file
=
False
self
.
_nb_bytes_written_split
=
0
def
setup
(
self
,
filename
,
dataformat
,
encoding
=
'binary'
,
process_id
=
0
,
max_size
=
0
):
def
setup
(
self
,
filename
,
dataformat
,
start_index
,
end_index
,
encoding
=
'binary'
):
"""Configures the data sink
Parameters:
...
...
@@ -734,127 +671,82 @@ class CachedDataSink(DataSink):
"""
# Close current file if open
self
.
close
()
if
encoding
not
in
(
'binary'
,
'json'
):
raise
RuntimeError
(
"valid formats for data writ
t
ing are 'binary' "
raise
RuntimeError
(
"valid formats for data writing are 'binary' "
"or 'json': the format `%s' is invalid"
%
format
)
if
dataformat
.
name
==
'__unnamed_dataformat__'
:
raise
RuntimeError
(
"cannot record data using an unnam
m
ed data format"
)
raise
RuntimeError
(
"cannot record data using an unnamed data format"
)
self
.
filename
=
filename
self
.
process_id
=
process_id
self
.
split_id
=
0
self
.
max_size
=
max_size
filename
,
data_ext
=
os
.
path
.
splitext
(
filename
)
self
.
_nb_bytes_written
=
0
self
.
_write_duration
=
0
self
.
_new_file
=
True
self
.
filename
=
'%s.%d.%d%s'
%
(
filename
,
start_index
,
end_index
,
data_ext
)
self
.
encoding
=
encoding
self
.
dataformat
=
dataformat
self
.
start_index
=
start_index
self
.
end_index
=
end_index
self
.
_cur_filename
=
None
self
.
_cur_file
=
None
self
.
_cur_indexname
=
None
self
.
_cur_index
=
None
self
.
_cur_start_index
=
None
self
.
_cur_end_index
=
None
self
.
nb_bytes_written
=
0
self
.
write_duration
=
0
self
.
last_written_data_index
=
None
self
.
_filenames
=
[]
self
.
_filenames_tmp
=
[]
try
:
self
.
data_file
=
open
(
self
.
filename
,
'wb'
)
self
.
index_file
=
open
(
self
.
filename
.
replace
(
'.data'
,
'.index'
),
'wt'
)
except
:
return
False
self
.
dataformat
=
dataformat
self
.
encoding
=
encoding
# Write the dataformat
self
.
data_file
.
write
(
six
.
b
(
'%s
\n
%s
\n
'
%
(
self
.
encoding
,
self
.
dataformat
.
name
)))
self
.
data_file
.
flush
()
return
True
def
_close_current
(
self
):
def
close
(
self
):
"""Closes the data sink
"""
if
self
.
_cur
_file
is
not
None
:
self
.
_cur
_file
.
close
()
self
.
_cur_
index
.
close
()
if
self
.
data
_file
is
not
None
:
self
.
data
_file
.
close
()
self
.
index
_file
.
close
()
# If file is empty, remove it
if
self
.
_cur_start_index
is
None
or
self
.
_cur_end_index
is
None
:
# If file is not complete, delete it
if
(
self
.
last_written_data_index
is
None
)
or
\
(
self
.
last_written_data_index
<
self
.
end_index
):
try
:
os
.
remove
(
self
.
_cur_filename
)
os
.
remove
(
self
.
_cur_index
)
os
.
remove
(
self
.
filename
)
os
.
remove
(
self
.
filename
.
replace
(
'.data'
,
'.index'
))
return
True
except
:
return
False
self
.
_filenames_tmp
.
pop
()
# Otherwise, append final filename to list
else
:
self
.
_filenames
.
append
(
self
.
_curFilenameWithIndices
())
self
.
_cur_filename
=
None
self
.
_cur_file
=
None
self
.
_cur_indexname
=
None
self
.
_cur_index
=
None
def
close
(
self
):
"""Move the files to final location
"""
self
.
_close_current
()
assert
len
(
self
.
_filenames_tmp
)
==
len
(
self
.
_filenames
)
for
i
in
range
(
len
(
self
.
_filenames_tmp
)):
os
.
rename
(
self
.
_filenames_tmp
[
i
],
self
.
_filenames
[
i
])
tmp_indexname
=
\
self
.
_tmpIndexFilenameFromTmpFilename
(
self
.
_filenames_tmp
[
i
])
final_indexname
=
self
.
_indexFilenameFromFilename
(
self
.
_filenames
[
i
])
os
.
rename
(
tmp_indexname
,
final_indexname
)
# creates the checksums for all data and indexes
chksum_data
=
hashFileContents
(
self
.
_filenames
[
i
])
with
open
(
self
.
_filenames
[
i
]
+
'.checksum'
,
'wt'
)
as
f
:
# Creates the checksums for all data and indexes
chksum_data
=
hashFileContents
(
self
.
filename
)
with
open
(
self
.
filename
+
'.checksum'
,
'wt'
)
as
f
:
f
.
write
(
chksum_data
)
chksum_index
=
hashFileContents
(
final_indexname
)
with
open
(
final_indexname
+
'.checksum'
,
'wt'
)
as
f
:
f
.
write
(
chksum_index
)
self
.
_cur_filename
=
None
self
.
_cur_file
=
None
self
.
_cur_indexname
=
None
self
.
_cur_index
=
None
self
.
_cur_start_index
=
None
self
.
_cur_end_index
=
None
self
.
_filenames
=
[]
self
.
_filenames_tmp
=
[]
index_filename
=
self
.
filename
.
replace
(
'.data'
,
'.index'
)
chksum_index
=
hashFileContents
(
index_filename
)
with
open
(
index_filename
+
'.checksum'
,
'wt'
)
as
f
:
f
.
write
(
chksum_index
)
def
reset
(
self
):
"""Move the files to final locati
on
"""
self
.
data_file
=
None
self
.
index_file
=
N
on
e
self
.
last_written_data_index
=
None
self
.
_close_current
()
assert
len
(
self
.
_filenames_tmp
)
==
len
(
self
.
_filenames
)
for
i
in
range
(
len
(
self
.
_filenames_tmp
)):
try
:
os
.
remove
(
self
.
_filenames_tmp
[
i
])
tmp_indexname
=
\
self
.
_tmpIndexFilenameFromTmpFilename
(
self
.
_filenames_tmp
[
i
])
os
.
remove
(
tmp_indexname
)
except
:
return
False
self
.
_cur_filename
=
None
self
.
_cur_file
=
None
self
.
_cur_indexname
=
None
self
.
_cur_index
=
None
return
True
self
.
_cur_start_index
=
None
self
.
_cur_end_index
=
None
self
.
_filenames
=
[]
self
.
_filenames_tmp
=
[]
def
__del__
(
self
):
"""Make sure the files are close
and rename
d when the object is deleted
"""Make sure the files are closed when the object is deleted
"""
self
.
close
()
def
write
(
self
,
data
,
start_data_index
,
end_data_index
):
"""Writes a block of data to the filesystem
...
...
@@ -868,8 +760,8 @@ class CachedDataSink(DataSink):
"""
# If the user passed a dictionary - convert it
if
isinstance
(
data
,
dict
):
# the user passed a dictionary - must convert
data
=
self
.
dataformat
.
type
(
**
data
)
else
:
# Checks that the input data conforms to the expected format
...
...
@@ -877,57 +769,43 @@ class CachedDataSink(DataSink):
            raise TypeError("input data uses format `%s' while this sink "
                            "expects `%s'" % (data.__class__._name, self.dataformat))

        # If the flag new_file is set, open new file and write header
        if self._new_file:
            self._openAndWriteHeader()

        if self._cur_file is None:
            raise RuntimeError("no destination file")
        if self.data_file is None:
            raise RuntimeError("No destination file")

        # encoding happens here
        # Encoding
        if self.encoding == 'binary':
            encoded_data = data.pack()
        else:
            from .utils import NumpyJSONEncoder
            encoded_data = json.dumps(data.as_dict(), indent=4, cls=NumpyJSONEncoder)

        # adds a new line by the end of the encoded data, for clarity
        # Adds a new line by the end of the encoded data
        encoded_data += six.b('\n')

        informations = six.b('%d %d %d\n' % (start_data_index,
                                             end_data_index, len(encoded_data)))

        t1 = time.time()

        self._cur_file.write(informations + encoded_data)
        self._cur_file.flush()
        self.data_file.write(informations + encoded_data)
        self.data_file.flush()

        indexes = '%d %d\n' % (start_data_index, end_data_index)
        self._cur_index.write(indexes)
        self._cur_index.flush()
        self.index_file.write(indexes)
        self.index_file.flush()

        t2 = time.time()

        self._nb_bytes_written += \
            len(informations) + len(encoded_data) + len(indexes)
        self._nb_bytes_written_split += \
            len(informations) + len(encoded_data) + len(indexes)
        self._write_duration += t2 - t1
        self.nb_bytes_written += len(informations) + len(encoded_data) + len(indexes)
        self.write_duration += t2 - t1

        # Update start and end indices
        if self._cur_start_index is None:
            self._cur_start_index = start_data_index
        self._cur_end_index = end_data_index
        self.last_written_data_index = end_data_index

        # If file size exceeds max, sets the flag to create a new file
        if self.max_size != 0 and self._nb_bytes_written >= self.max_size:
            self._new_file = True
            self.split_id += 1

    def statistics(self):
        """Return the statistics about the number of bytes written to the cache"""
        return (self.nb_bytes_written, self.write_duration)
        return (self._nb_bytes_written, self._write_duration)

    def isConnected(self):
        return (self.filename is not None)
...
...
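The added/removed markers of this diff were not preserved in the listing above, so the direction of the CachedDataSink refactoring has to be inferred from the commit message. Assuming the retained setup() signature is the one taking start_index/end_index and writing a single data/index file pair, a sink would be driven roughly as follows. This is a sketch, not code from the commit; the cache path, prefix and dataformat name are hypothetical.

# Sketch under the assumptions stated above -- not part of the commit.
from beat.backend.python.data import CachedDataSink
from beat.backend.python.dataformat import DataFormat

prefix = '/path/to/prefix'                                 # hypothetical prefix
dataformat = DataFormat(prefix, 'user/single_integer/1')   # hypothetical format with a 'value' field

sink = CachedDataSink()
ok = sink.setup('/path/to/cache/output.data', dataformat,
                start_index=0, end_index=9, encoding='binary')
if ok:
    for i in range(10):
        # dictionaries are converted to the dataformat type by write()
        sink.write({'value': i}, start_data_index=i, end_data_index=i)
    sink.close()   # also writes the .checksum files next to the data/index pair

nb_bytes, seconds = sink.statistics()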
beat/backend/python/dataformat.py
100644 → 100755
...
...
@@ -430,3 +430,81 @@ class DataFormat(object):
            return self.isparent(other.referenced[other.extends])

        return False

    def json_dumps(self, indent=4):
        """Dumps the JSON declaration of this object in a string

        Parameters:

          indent (int): The number of indentation spaces at every indentation level

        Returns:

          str: The JSON representation for this object

        """

        return simplejson.dumps(self.data, indent=indent,
                                cls=utils.NumpyJSONEncoder)

    def __str__(self):
        return self.json_dumps()

    def write(self, storage=None):
        """Writes contents to prefix location

        Parameters:

          storage (Storage, optional): If you pass a new storage, then this object
            will be written to that storage point rather than its default.

        """

        if storage is None:
            if not self._name:
                raise RuntimeError("dataformat has no name")
            storage = self.storage #overwrite

        storage.save(str(self), self.description)

    def export(self, prefix):
        """Recursively exports itself into another prefix

        Other required dataformats are also copied.

        Parameters:

          prefix (str): A path to a prefix that must different then my own.

        Returns:

          None

        Raises:

          RuntimeError: If prefix and self.prefix point to the same directory.

        """

        if not self._name:
            raise RuntimeError("dataformat has no name")

        if not self.valid:
            raise RuntimeError("dataformat is not valid")

        if os.path.samefile(prefix, self.prefix):
            raise RuntimeError("Cannot dataformat object to the same prefix (%s " \
                               "== %s)" % (prefix, self.prefix))

        for k in self.referenced.values():
            k.export(prefix)

        self.write(Storage(prefix, self.name))
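DataFormat gains the same json_dumps/__str__/write/export helpers as Algorithm above. A short usage sketch, with hypothetical prefix paths and format name, not part of the commit:

# Usage sketch only -- the prefix and format name are hypothetical.
from beat.backend.python.dataformat import DataFormat

df = DataFormat('/path/to/prefix', 'user/single_integer/1')
print(df.json_dumps())               # JSON declaration of the format
df.export('/path/to/other/prefix')   # also exports referenced dataformats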
beat/backend/python/dbexecution.py
...
...
@@ -260,9 +260,8 @@ class DBExecutor(object):
            group.add(inputs.Input(name,
                                   self.dataformat_cache[input_dataformat_name],
                                   data_source))

    def process(self, zmq_context, zmq_socket):
        self.handler = message_handler.MessageHandler(self.input_list, zmq_context, zmq_socket)
    def process(self, address):
        self.handler = message_handler.MessageHandler(address, inputs=self.input_list)
        self.handler.start()
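The DBExecutor.process() signature changes here from taking a ZeroMQ context and socket to taking an address that is handed to the MessageHandler. Assuming the address-based variant is the one kept by this commit, a caller would drive the executor roughly like the sketch below; the helper name and address are illustrative, and construction of the executor itself is left out.

# Sketch under the assumption stated above -- not part of the commit.
def run_db_executor(db_executor, address='tcp://127.0.0.1:5555'):
    """Drive an already-constructed DBExecutor with the address-based process()."""
    db_executor.process(address)   # creates and starts the MessageHandler for this address
    # ... a remote client now requests database data over ZeroMQ ...
    db_executor.wait()             # joins the handler thread and clears it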
...
...
@@ -275,6 +274,7 @@ class DBExecutor(object):
    def wait(self):
        self.handler.join()
        self.handler.destroy()
        self.handler = None
...
...
beat/backend/python/executor.py
...
...
@@ -136,16 +136,23 @@ class Executor(object):
        if not self.input_list or not self.output_list:
            raise RuntimeError("I/O for execution block has not yet been set up")

        using_output = self.output_list[0] if self.analysis else self.output_list

        while self.input_list.hasMoreData():
            main_group = self.input_list.main_group
            main_group.restricted_access = False
            main_group.next()
            main_group.restricted_access = True

            if not self.runner.process(self.input_list, using_output):
            if self.analysis:
                result = self.runner.process(inputs=self.input_list, output=self.output_list[0])
            else:
                result = self.runner.process(inputs=self.input_list, outputs=self.output_list)

            if not result:
                return False

        for output in self.output_list:
            output.close()

        missing_data_outputs = [x for x in self.output_list if x.isDataMissing()]

        if missing_data_outputs:
...
...
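In the Executor hunk above, the single runner.process(self.input_list, using_output) call is replaced by keyword-based calls that pass one 'output' to analyzers and the full 'outputs' list to regular algorithms. A hedged sketch of the two user-algorithm signatures this dispatch implies; class, input and output names are illustrative only, not part of the commit.

# Illustrative sketch -- not part of the commit.
class MyAnalyzer:
    def process(self, inputs, output):
        # analyzers receive exactly one output object
        output.write({'score': 1.0})
        return True

class MyAlgorithm:
    def process(self, inputs, outputs):
        # regular algorithms receive the full output list, addressed by name
        outputs['out'].write(inputs['in'].data)
        return True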
beat/backend/python/helpers.py
...
...
@@ -247,14 +247,30 @@ def create_outputs_from_configuration(config, algorithm, prefix, cache_root, inp
            if exception.errno != errno.EEXIST:
                raise

        if start_index is None: