Commit 078cf0fd authored by Philip ABBET

Refactoring of the 'CachedDataSink' class

parent 73fcb84a
@@ -355,6 +355,7 @@ class Algorithm(object):
self.data = simplejson.load(f)
self.code_path = self.storage.code.path
self.code = self.storage.code.load()
self.groups = self.data['groups']
@@ -772,3 +773,88 @@ class Algorithm(object):
raise #just re-raise the user exception
return Runner(self.__module, klass, self, exc)
def json_dumps(self, indent=4):
"""Dumps the JSON declaration of this object in a string
Parameters:
indent (int): The number of indentation spaces at every indentation level
Returns:
str: The JSON representation for this object
"""
return simplejson.dumps(self.data, indent=indent,
cls=utils.NumpyJSONEncoder)
def __str__(self):
return self.json_dumps()
def write(self, storage=None):
"""Writes contents to prefix location
Parameters:
storage (Storage, optional): If you pass a new storage, then this object
will be written to that storage point rather than its default.
"""
if self.data['language'] == 'unknown':
raise RuntimeError("algorithm has no programming language set")
if storage is None:
if not self._name:
raise RuntimeError("algorithm has no name")
storage = self.storage #overwrite
storage.save(str(self), self.code, self.description)
def export(self, prefix):
"""Recursively exports itself into another prefix
Dataformats and associated libraries are also copied.
Parameters:
prefix (str): A path to a prefix that must be different from my own.
Returns:
None
Raises:
RuntimeError: If prefix and self.prefix point to the same directory.
"""
if not self._name:
raise RuntimeError("algorithm has no name")
if not self.valid:
raise RuntimeError("algorithm is not valid")
if os.path.samefile(prefix, self.prefix):
raise RuntimeError("Cannot export algorithm to the same prefix (%s == " \
"%s)" % (prefix, self.prefix))
for k in self.libraries.values():
k.export(prefix)
for k in self.dataformats.values():
k.export(prefix)
self.write(Storage(prefix, self.name, self.language))
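For reviewers unfamiliar with these new helpers, here is a minimal usage sketch. The prefix paths and the algorithm name are hypothetical; only json_dumps(), __str__(), write() and export() come from this change:

    # Hypothetical prefix and algorithm name, for illustration only.
    algorithm = Algorithm('/path/to/prefix', 'user/my_algorithm/1')

    declaration = algorithm.json_dumps(indent=2)  # JSON declaration as a string
    print(algorithm)                              # __str__ delegates to json_dumps()

    # Recursively copies the algorithm together with its dataformats and
    # libraries; raises RuntimeError if the target prefix is the same directory.
    algorithm.export('/path/to/another/prefix')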
@@ -298,6 +298,9 @@ class CachedFileLoader(object):
(self.filenames, indices_filenames, data_checksum_filenames, indices_checksum_filenames) = \
getAllFilenames(filename, start_index, end_index)
if len(self.filenames) == 0:
return False
check_consistency(self.filenames, data_checksum_filenames)
@@ -487,6 +490,10 @@ class DataSink(object):
pass
def close(self):
pass
#----------------------------------------------------------
@@ -633,91 +640,21 @@ class CachedDataSink(DataSink):
"""
def __init__(self):
self.filename = None
self.process_id = None
self.split_id = None
self.max_size = None
self._nb_bytes_written = 0
self._write_duration = 0
self._nb_bytes_written_split = 0
self._new_file = False
self._cur_filename = None
self._cur_file = None
self._cur_indexname = None
self._cur_index = None
self._cur_start_index = None
self._cur_end_index = None
self._filenames = []
self._filenames_tmp = []
self._tmp_ext = '.tmp'
self.encoding = None
self.dataformat = None
self.start_index = None
self.end_index = None
def _curTmpFilenameWithSplit(self):
filename, data_ext = os.path.splitext(self.filename)
dirname = os.path.dirname(filename)
basename = os.path.basename(filename)
fd, tmp_file = tempfile.mkstemp(
dir=dirname,
prefix=basename+'.' + str(self.process_id)+'.'+ str(self.split_id)+'_',
suffix=data_ext + self._tmp_ext,
)
os.close(fd) # Preserve only the name
os.unlink(tmp_file)
return tmp_file
def _curFilenameWithIndices(self):
self.data_file = None
self.index_file = None
self.last_written_data_index = None
basename = os.path.basename(self.filename)
basename, data_ext = os.path.splitext(basename)
dirname = os.path.dirname(self.filename)
return os.path.join(dirname, basename + '.' + str(self._cur_start_index) + '.' + str(self._cur_end_index) + data_ext)
self.nb_bytes_written = 0
self.write_duration = 0
def _tmpIndexFilenameFromTmpFilename(self, tmp_filename):
return os.path.splitext(os.path.splitext(tmp_filename)[0])[0] + '.index' + self._tmp_ext
def _indexFilenameFromFilename(self, filename):
return os.path.splitext(filename)[0] + '.index'
def _openAndWriteHeader(self):
"""Write the header of the current file"""
# Close current file if open
self._close_current()
# Open new file in writing mode
self._cur_filename = self._curTmpFilenameWithSplit()
self._cur_indexname = \
self._tmpIndexFilenameFromTmpFilename(self._cur_filename)
self._filenames_tmp.append(self._cur_filename)
try:
self._cur_file = open(self._cur_filename, 'wb')
self._cur_index = open(self._cur_indexname, 'wt')
except:
return
# Write dataformat
self._cur_file.write(six.b('%s\n%s\n' % \
(self.encoding, self.dataformat.name)))
self._cur_file.flush()
# Reset few flags
self._cur_start_index = None
self._cur_end_index = None
self._new_file = False
self._nb_bytes_written_split = 0
def setup(self, filename, dataformat, encoding='binary', process_id=0,
max_size=0):
def setup(self, filename, dataformat, start_index, end_index, encoding='binary'):
"""Configures the data sink
Parameters:
@@ -734,127 +671,82 @@ class CachedDataSink(DataSink):
"""
# Close current file if open
self.close()
if encoding not in ('binary', 'json'):
raise RuntimeError("valid formats for data writting are 'binary' "
raise RuntimeError("valid formats for data writing are 'binary' "
"or 'json': the format `%s' is invalid" % format)
if dataformat.name == '__unnamed_dataformat__':
raise RuntimeError("cannot record data using an unnammed data format")
raise RuntimeError("cannot record data using an unnamed data format")
self.filename = filename
self.process_id = process_id
self.split_id = 0
self.max_size = max_size
filename, data_ext = os.path.splitext(filename)
self._nb_bytes_written = 0
self._write_duration = 0
self._new_file = True
self.filename = '%s.%d.%d%s' % (filename, start_index, end_index, data_ext)
self.encoding = encoding
self.dataformat = dataformat
self.start_index = start_index
self.end_index = end_index
self._cur_filename = None
self._cur_file = None
self._cur_indexname = None
self._cur_index = None
self._cur_start_index = None
self._cur_end_index = None
self.nb_bytes_written = 0
self.write_duration = 0
self.last_written_data_index = None
self._filenames = []
self._filenames_tmp = []
try:
self.data_file = open(self.filename, 'wb')
self.index_file = open(self.filename.replace('.data', '.index'), 'wt')
except:
return False
self.dataformat = dataformat
self.encoding = encoding
# Write the dataformat
self.data_file.write(six.b('%s\n%s\n' % (self.encoding, self.dataformat.name)))
self.data_file.flush()
return True
def _close_current(self):
def close(self):
"""Closes the data sink
"""
if self._cur_file is not None:
self._cur_file.close()
self._cur_index.close()
if self.data_file is not None:
self.data_file.close()
self.index_file.close()
# If file is empty, remove it
if self._cur_start_index is None or self._cur_end_index is None:
# If file is not complete, delete it
if (self.last_written_data_index is None) or \
(self.last_written_data_index < self.end_index):
try:
os.remove(self._cur_filename)
os.remove(self._cur_index)
os.remove(self.filename)
os.remove(self.filename.replace('.data', '.index'))
return True
except:
return False
self._filenames_tmp.pop()
# Otherwise, append final filename to list
else:
self._filenames.append(self._curFilenameWithIndices())
self._cur_filename = None
self._cur_file = None
self._cur_indexname = None
self._cur_index = None
def close(self):
"""Move the files to final location
"""
self._close_current()
assert len(self._filenames_tmp) == len(self._filenames)
for i in range(len(self._filenames_tmp)):
os.rename(self._filenames_tmp[i], self._filenames[i])
tmp_indexname = \
self._tmpIndexFilenameFromTmpFilename(self._filenames_tmp[i])
final_indexname = self._indexFilenameFromFilename(self._filenames[i])
os.rename(tmp_indexname, final_indexname)
# creates the checksums for all data and indexes
chksum_data = hashFileContents(self._filenames[i])
with open(self._filenames[i] + '.checksum', 'wt') as f:
# Creates the checksums for all data and indexes
chksum_data = hashFileContents(self.filename)
with open(self.filename + '.checksum', 'wt') as f:
f.write(chksum_data)
chksum_index = hashFileContents(final_indexname)
with open(final_indexname + '.checksum', 'wt') as f:
f.write(chksum_index)
self._cur_filename = None
self._cur_file = None
self._cur_indexname = None
self._cur_index = None
self._cur_start_index = None
self._cur_end_index = None
self._filenames = []
self._filenames_tmp = []
index_filename = self.filename.replace('.data', '.index')
chksum_index = hashFileContents(index_filename)
with open(index_filename + '.checksum', 'wt') as f:
f.write(chksum_index)
def reset(self):
"""Move the files to final location
"""
self.data_file = None
self.index_file = None
self.last_written_data_index = None
self._close_current()
assert len(self._filenames_tmp) == len(self._filenames)
for i in range(len(self._filenames_tmp)):
try:
os.remove(self._filenames_tmp[i])
tmp_indexname = \
self._tmpIndexFilenameFromTmpFilename(self._filenames_tmp[i])
os.remove(tmp_indexname)
except:
return False
self._cur_filename = None
self._cur_file = None
self._cur_indexname = None
self._cur_index = None
return True
self._cur_start_index = None
self._cur_end_index = None
self._filenames = []
self._filenames_tmp = []
def __del__(self):
"""Make sure the files are close and renamed when the object is deleted
"""Make sure the files are closed when the object is deleted
"""
self.close()
def write(self, data, start_data_index, end_data_index):
"""Writes a block of data to the filesystem
@@ -868,8 +760,8 @@ class CachedDataSink(DataSink):
"""
# If the user passed a dictionary - convert it
if isinstance(data, dict):
# the user passed a dictionary - must convert
data = self.dataformat.type(**data)
else:
# Checks that the input data conforms to the expected format
@@ -877,57 +769,43 @@ class CachedDataSink(DataSink):
raise TypeError("input data uses format `%s' while this sink "
"expects `%s'" % (data.__class__._name, self.dataformat))
# If the flag new_file is set, open new file and write header
if self._new_file:
self._openAndWriteHeader()
if self._cur_file is None:
raise RuntimeError("no destination file")
if self.data_file is None:
raise RuntimeError("No destination file")
# encoding happens here
# Encoding
if self.encoding == 'binary':
encoded_data = data.pack()
else:
from .utils import NumpyJSONEncoder
encoded_data = json.dumps(data.as_dict(), indent=4, cls=NumpyJSONEncoder)
# adds a new line by the end of the encoded data, for clarity
# Adds a new line by the end of the encoded data
encoded_data += six.b('\n')
informations = six.b('%d %d %d\n' % (start_data_index,
end_data_index, len(encoded_data)))
end_data_index, len(encoded_data)))
t1 = time.time()
self._cur_file.write(informations + encoded_data)
self._cur_file.flush()
self.data_file.write(informations + encoded_data)
self.data_file.flush()
indexes = '%d %d\n' % (start_data_index, end_data_index)
self._cur_index.write(indexes)
self._cur_index.flush()
self.index_file.write(indexes)
self.index_file.flush()
t2 = time.time()
self._nb_bytes_written += \
len(informations) + len(encoded_data) + len(indexes)
self._nb_bytes_written_split += \
len(informations) + len(encoded_data) + len(indexes)
self._write_duration += t2 - t1
self.nb_bytes_written += len(informations) + len(encoded_data) + len(indexes)
self.write_duration += t2 - t1
# Update start and end indices
if self._cur_start_index is None:
self._cur_start_index = start_data_index
self._cur_end_index = end_data_index
self.last_written_data_index = end_data_index
# If file size exceeds max, sets the flag to create a new file
if self.max_size != 0 and self._nb_bytes_written >= self.max_size:
self._new_file = True
self.split_id += 1
def statistics(self):
"""Return the statistics about the number of bytes written to the cache"""
return (self.nb_bytes_written, self.write_duration)
return (self._nb_bytes_written, self._write_duration)
def isConnected(self):
return (self.filename is not None)
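Putting the pieces together, a minimal sketch of how the refactored sink is meant to be driven. The cache path, the dataformat object and the index range below are made up for illustration; the calls follow the new setup()/write()/close()/statistics() API shown above:

    sink = CachedDataSink()

    # The index range is now given up front and the final
    # '<name>.<start>.<end>.data' filename is derived from it; there is no
    # process_id/split_id/max_size bookkeeping and no temporary files anymore.
    # 'dataformat' is assumed to be an already-loaded DataFormat object.
    if not sink.setup('/path/to/cache/0123456789abcdef.data', dataformat,
                      start_index=0, end_index=9, encoding='binary'):
        raise RuntimeError("cannot set up the data sink")

    for i in range(10):
        # dictionaries are converted with dataformat.type(**data) before packing
        sink.write({'value': i}, start_data_index=i, end_data_index=i)

    sink.close()  # writes the .checksum files; an incomplete file is deleted instead
    nb_bytes, seconds = sink.statistics()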
@@ -430,3 +430,81 @@ class DataFormat(object):
return self.isparent(other.referenced[other.extends])
return False
def json_dumps(self, indent=4):
"""Dumps the JSON declaration of this object in a string
Parameters:
indent (int): The number of indentation spaces at every indentation level
Returns:
str: The JSON representation for this object
"""
return simplejson.dumps(self.data, indent=indent,
cls=utils.NumpyJSONEncoder)
def __str__(self):
return self.json_dumps()
def write(self, storage=None):
"""Writes contents to prefix location
Parameters:
storage (Storage, optional): If you pass a new storage, then this object
will be written to that storage point rather than its default.
"""
if storage is None:
if not self._name:
raise RuntimeError("dataformat has no name")
storage = self.storage #overwrite
storage.save(str(self), self.description)
def export(self, prefix):
"""Recursively exports itself into another prefix
Other required dataformats are also copied.
Parameters:
prefix (str): A path to a prefix that must be different from my own.
Returns:
None
Raises:
RuntimeError: If prefix and self.prefix point to the same directory.
"""
if not self._name:
raise RuntimeError("dataformat has no name")
if not self.valid:
raise RuntimeError("dataformat is not valid")
if os.path.samefile(prefix, self.prefix):
raise RuntimeError("Cannot dataformat object to the same prefix (%s " \
"== %s)" % (prefix, self.prefix))
for k in self.referenced.values():
k.export(prefix)
self.write(Storage(prefix, self.name))
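The same serialization and export pattern is mirrored on DataFormat; a hypothetical sketch (prefix paths and the format name are invented):

    dataformat = DataFormat('/path/to/prefix', 'user/my_format/1')

    print(dataformat.json_dumps())                # JSON declaration as a string
    dataformat.export('/path/to/another/prefix')  # also copies referenced formats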
@@ -260,9 +260,8 @@ class DBExecutor(object):
group.add(inputs.Input(name, self.dataformat_cache[input_dataformat_name], data_source))
def process(self, zmq_context, zmq_socket):
self.handler = message_handler.MessageHandler(self.input_list, zmq_context, zmq_socket)
def process(self, address):
self.handler = message_handler.MessageHandler(address, inputs=self.input_list)
self.handler.start()
@@ -275,6 +274,7 @@ class DBExecutor(object):
def wait(self):
self.handler.join()
self.handler.destroy()
self.handler = None
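In caller terms, the new DBExecutor.process() takes an address string instead of a ready-made ZeroMQ context and socket, and lets the MessageHandler own the connection. A hypothetical call site (the address value is illustrative):

    # before this change, the caller owned the socket:
    #   db_executor.process(zmq_context, zmq_socket)

    # after this change, the MessageHandler owns the socket:
    db_executor.process('tcp://127.0.0.1:5555')
    db_executor.wait()  # joins the handler, destroys it and drops the reference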
@@ -136,16 +136,23 @@ class Executor(object):
if not self.input_list or not self.output_list:
raise RuntimeError("I/O for execution block has not yet been set up")
using_output = self.output_list[0] if self.analysis else self.output_list
while self.input_list.hasMoreData():
main_group = self.input_list.main_group
main_group.restricted_access = False
main_group.next()
main_group.restricted_access = True
if not self.runner.process(self.input_list, using_output):
if self.analysis:
result = self.runner.process(inputs=self.input_list, output=self.output_list[0])
else:
result = self.runner.process(inputs=self.input_list, outputs=self.output_list)
if not result:
return False
for output in self.output_list:
output.close()
missing_data_outputs = [x for x in self.output_list if x.isDataMissing()]
if missing_data_outputs:
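In user-code terms, the loop above now calls the algorithm with keyword arguments and hands analyzers a single output object: process(inputs=..., output=...) for analysis blocks and process(inputs=..., outputs=...) otherwise. A hypothetical pair of user algorithms matching these two signatures:

    class MyAlgorithm(object):          # regular block (hypothetical user code)
        def process(self, inputs, outputs):
            outputs['out'].write(inputs['in'].data)
            return True

    class MyAnalyzer(object):           # analysis block (hypothetical user code)
        def process(self, inputs, output):
            output.write({'score': 0.5})
            return True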
@@ -247,14 +247,30 @@ def create_outputs_from_configuration(config, algorithm, prefix, cache_root, inp
if exception.errno != errno.EEXIST:
raise
if start_index is None: