Fixing [sphinx] warnings

parent d8e1fb4d
# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')]
# see https://docs.python.org/3/library/pkgutil.html
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)
@@ -164,58 +164,64 @@ class SGEIdiapCluster(JobQueueCluster):
tag: Mark this worker with a specific tag so the dask scheduler can place specific tasks on it (https://distributed.dask.org/en/latest/resources.html)
Example
-------
Below follows a vanilla example that will create a set of jobs on all.q:
>>> from bob.pipelines.distributed.sge import SGEIdiapCluster
>>> from dask.distributed import Client
>>> cluster = SGEIdiapCluster()
>>> cluster.scale_up(10)
>>> client = Client(cluster)
It's also possible to define the resource specification yourself:
>>> Q_1DAY_IO_BIG_SPEC = {
...     "default": {
...         "queue": "q_1day",
...         "memory": "8GB",
...         "io_big": True,
...         "resource_spec": "",
...         "resources": "",
...     }
... }
>>> cluster = SGEIdiapCluster(sge_job_spec=Q_1DAY_IO_BIG_SPEC)
>>> cluster.scale_up(10)
>>> client = Client(cluster)
More than one job spec can be set:
>>> Q_1DAY_GPU_SPEC = {
...     "default": {
...         "queue": "q_1day",
...         "memory": "8GB",
...         "io_big": True,
...         "resource_spec": "",
...         "resources": "",
...     },
...     "gpu": {
...         "queue": "q_gpu",
...         "memory": "12GB",
...         "io_big": False,
...         "resource_spec": "",
...         "resources": {"GPU": 1},
...     },
... }
>>> cluster = SGEIdiapCluster(sge_job_spec=Q_1DAY_GPU_SPEC)
>>> cluster.scale_up(10)
>>> cluster.scale_up(1, sge_job_spec_key="gpu")
>>> client = Client(cluster)
Adaptive job allocation can also be used via the `AdaptiveIdiap` extension:
>>> cluster = SGEIdiapCluster(sge_job_spec=Q_1DAY_GPU_SPEC)
>>> cluster.adapt(Adaptive=AdaptiveIdiap, minimum=2, maximum=10)
>>> client = Client(cluster)
"""
def __init__(
@@ -473,4 +479,4 @@ class SchedulerIdiap(Scheduler):
):
resource_restrictions.append(self.tasks[k].resource_restrictions)
return resource_restrictions
@@ -20,27 +20,27 @@ def estimator_dask_it(
o, fit_tag=None, transform_tag=None, npartitions=None,
):
"""
Mix up any :py:class:`sklearn.pipeline.Pipeline` or :py:class:`sklearn.base.BaseEstimator` with
:py:class:`DaskEstimatorMixin`
Parameters
----------
o: :py:class:`sklearn.pipeline.Pipeline` or :py:class:`sklearn.base.BaseEstimator`
Any :py:class:`sklearn.pipeline.Pipeline` or :py:class:`sklearn.base.BaseEstimator` to be dask mixed
fit_tag: list of tuple or str
Tag the `fit` method. This is useful to tag dask tasks to run in specific workers (https://distributed.dask.org/en/latest/resources.html).
If `o` is :py:class:`sklearn.pipeline.Pipeline`, this parameter should contain a list of tuples
containing the pipeline.step index and the `str` tag for `fit`.
If `o` is :py:class:`sklearn.base.BaseEstimator`, this parameter should contain just the tag for `fit`
transform_tag: list of tuple or str
Tag the `transform` method. This is useful to tag dask tasks to run in specific workers (https://distributed.dask.org/en/latest/resources.html).
If `o` is :py:class:`sklearn.pipeline.Pipeline`, this parameter should contain a list of tuples
containing the pipeline.step index and the `str` tag for `transform`.
If `o` is :py:class:`sklearn.base.BaseEstimator`, this parameter should contain just the tag for `transform`
Examples
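For instance, to tag the `fit` of the second pipeline step so it lands on workers exposing a ``gpu`` resource (an illustrative sketch; the transformers, step index and tag below are assumptions):
>>> from sklearn.pipeline import make_pipeline
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.decomposition import PCA
>>> from bob.pipelines.mixins import estimator_dask_it
>>> pipeline = make_pipeline(StandardScaler(), PCA(n_components=2))
>>> dask_pipeline = estimator_dask_it(pipeline, fit_tag=[(1, "gpu")], npartitions=10)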
@@ -254,8 +254,8 @@ class SampleMixin:
return self
# if the estimator needs to be fitted.
kwargs = _make_kwargs_from_samples(samples, self.fit_extra_arguments)
X = [s.data for s in samples]
return super().fit(X, **kwargs)
@@ -474,8 +474,8 @@ class DaskBagMixin(TransformerMixin):
"""Transform an arbitrary iterator into a :py:class:`dask.bag`
Parameters
----------
npartitions: int
Number of partitions (see :py:attr:`dask.bag.Bag.npartitions`)
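For instance, a plain sequence can be split into partitions with :py:func:`dask.bag.from_sequence` (a minimal illustration of the partitioning this mixin relies on):
>>> import dask.bag
>>> bag = dask.bag.from_sequence(range(100), npartitions=10)
>>> bag.npartitions
10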
@@ -2,3 +2,6 @@
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)
# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')]
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
def is_picklable(obj):
"""
Test if an object is picklable or not
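    """
    # Illustrative sketch (an assumption, not necessarily the exact body of
    # this function): the standard approach is to attempt `pickle.dumps` and
    # report whether serialization succeeds.
    import pickle

    try:
        pickle.dumps(obj)
        return True
    except Exception:
        return False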
@@ -8,14 +8,16 @@ Mechanism that allows checkpointing of :py:class:`bob.pipelines.sample.Sample` d
Very often, when processing a :py:class:`sklearn.pipeline.Pipeline` with big chunks of data, it is useful to checkpoint some steps of the pipeline to disk.
This is useful for several purposes:
- Reuse samples that are expensive to re-compute
- Inspection of algorithms
Scikit-learn has a caching mechanism that allows the caching of `estimators <https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline>`_ that can be used for this purpose.
Although useful, this structure is not user friendly.
As in :ref:`sample`, this can be approached with the :py:class:`bob.pipelines.mixins.CheckpointMixin` mixin, where a new class can be created either dynamically with the :py:func:`bob.pipelines.mixins.mix_me_up` function:
.. code:: python
@@ -63,4 +65,6 @@ Those can be checkpointed too as can be observed in the example below.
The keyword arguments `features_dir` and `model_path` defined in lines 52 to 55 set the absolute paths where samples and the model trained after fit will be saved.
.. note::

   SampleSets can be checkpointed in the exact same way as Samples.
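For instance, a rough sketch of such a checkpointed transformer (the transformer itself, the directory paths and the exact ``mix_me_up`` call are illustrative assumptions):

.. code:: python

   from sklearn.base import BaseEstimator, TransformerMixin

   from bob.pipelines.mixins import CheckpointMixin, SampleMixin, mix_me_up


   class MyTransformer(TransformerMixin, BaseEstimator):
       """Hypothetical transformer used only for illustration."""

       def fit(self, X, y=None):
           return self

       def transform(self, X):
           return X


   # Dynamically mix the checkpointing behaviour into the transformer
   # (sketch; the exact `mix_me_up` signature may differ).
   MyCheckpointedTransformer = mix_me_up((CheckpointMixin, SampleMixin), MyTransformer)

   # `features_dir` and `model_path` set where transformed samples and the
   # fitted model are saved, as described above.
   transformer = MyCheckpointedTransformer(
       features_dir="./checkpoints/features",
       model_path="./checkpoints/model.pkl",
   )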
@@ -2,4 +2,43 @@
========================================
Dask: Scale your scikit-learn pipelines
========================================
`Dask <https://dask.org/>`_ is "a flexible library for parallel computing in Python".
The purpose of this guide is not to describe how dask works; for that, refer to its documentation.
Moreover, there are plenty of tutorials online.
For instance, `this official one <https://github.com/dask/dask-tutorial>`_; there is also a nice overview presented at `AnacondaCon 2018 <https://www.youtube.com/watch?v=tQBovBvSDvA>`_ and even one crafted for `Idiap <https://github.com/tiagofrepereira2012/tam-dask>`_.
The purpose of this guide is to describe:
1. The integration of dask with scikit-learn pipelines and samples
2. The specificities of `Dask` under the Idiap SGE
Dask + scikit-learn pipelines
-----------------------------
An arbitrary scikit-learn pipeline can be transformed into a `dask graph <https://docs.dask.org/en/latest/graphs.html>`_ and further executed using the :py:class:`bob.pipelines.mixins.DaskEstimatorMixin` mixin.
This mixing can be done with the :py:func:`bob.pipelines.mixins.estimator_dask_it` function.
This function does two things:
1. Edit the input pipeline, adding a new first step where input samples are wrapped in `Dask Bags <https://docs.dask.org/en/latest/bag.html>`_
2. Create a Dask graph for each step in your pipeline
.. todo::

   Provide code sample
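In the meantime, a rough sketch of the intended usage (the transformers, toy data and ``npartitions`` value are illustrative assumptions, not the final example):

.. code:: python

   import numpy as np

   from sklearn.decomposition import PCA
   from sklearn.pipeline import make_pipeline
   from sklearn.preprocessing import StandardScaler

   from bob.pipelines.mixins import estimator_dask_it

   # A plain scikit-learn pipeline...
   pipeline = make_pipeline(StandardScaler(), PCA(n_components=2))

   # ...dask-mixed: a first step wrapping the input in a dask bag is
   # prepended and every following step becomes a node of the dask graph.
   dask_pipeline = estimator_dask_it(pipeline, npartitions=5)

   X = np.random.rand(50, 10)  # toy data, for illustration only

   # `fit`/`transform` now assemble a dask graph; calling `.compute()` on
   # the result (optionally with a distributed scheduler) executes it.
   dask_pipeline.fit(X)
   transformed = dask_pipeline.transform(X).compute()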
Dask + Idiap SGE
----------------
.. todo::

   Provide code sample
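In the meantime, a rough sketch mirroring the :py:class:`bob.pipelines.distributed.sge.SGEIdiapCluster` docstring (queue names, memory and scaling numbers are only examples):

.. code:: python

   from dask.distributed import Client

   from bob.pipelines.distributed.sge import SGEIdiapCluster

   Q_1DAY_GPU_SPEC = {
       "default": {
           "queue": "q_1day",
           "memory": "8GB",
           "io_big": True,
           "resource_spec": "",
           "resources": "",
       },
       "gpu": {
           "queue": "q_gpu",
           "memory": "12GB",
           "io_big": False,
           "resource_spec": "",
           "resources": {"GPU": 1},
       },
   }

   # One job spec per kind of worker: 10 regular workers plus 1 GPU worker.
   cluster = SGEIdiapCluster(sge_job_spec=Q_1DAY_GPU_SPEC)
   cluster.scale_up(10)
   cluster.scale_up(1, sge_job_spec_key="gpu")

   # Any dask graph computed through this client runs on the SGE workers.
   client = Client(cluster)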
@@ -6,7 +6,21 @@ Python API for bob.pipelines
Samples
-----------------
.. autosummary::

   bob.pipelines.sample.Sample
   bob.pipelines.sample.SampleSet
   bob.pipelines.sample.DelayedSample
.. automodule:: bob.pipelines.sample
Mixins
-----------------
.. automodule:: bob.pipelines.mixins
Idiap SGE Support
-----------------
.. automodule:: bob.pipelines.distributed.sge
.. automodule:: bob.pipelines.distributed.local
Transformers
------------
.. automodule:: bob.pipelines.transformers
@@ -49,7 +49,7 @@ The example below shows a simple snippet on how to build a scikit learn transfor
.. literalinclude:: ./python/pipeline_example.py
   :linenos:
As can be observed, `MyTransformer` supports one keyword argument called `metadata` that can't be used by :py:class:`sklearn.pipeline.Pipeline`.
This can be approached with the mixin class :py:class:`bob.pipelines.mixins.SampleMixin`.
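A rough sketch of the idea (the transformer, the ``mix_me_up`` call and the ``transform_extra_arguments`` mapping below are illustrative assumptions; see the snippet included above for the actual example):

.. code:: python

   from sklearn.base import BaseEstimator, TransformerMixin

   from bob.pipelines.mixins import SampleMixin, mix_me_up
   from bob.pipelines.sample import Sample


   class MyTransformer(TransformerMixin, BaseEstimator):
       """Hypothetical transformer whose `transform` needs extra metadata."""

       def fit(self, X, y=None):
           return self

       def transform(self, X, metadata=None):
           return X


   # Mixing `SampleMixin` in lets the transformer consume `Sample` objects,
   # so `metadata` can travel with the data through the pipeline (sketch;
   # the extra-argument mapping is assumed by analogy with
   # `fit_extra_arguments`).
   MySampleTransformer = mix_me_up((SampleMixin,), MyTransformer)
   transformer = MySampleTransformer(
       transform_extra_arguments=(("metadata", "metadata"),)
   )

   samples = [Sample(data=[1.0, 2.0], metadata="some-metadata")]
   transformed = transformer.transform(samples)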