From 4bb0e089b2c90c03e2c229d8bf060b80511dea08 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Mon, 21 Oct 2019 13:32:07 +0200
Subject: [PATCH] [mirror] Own implementation of conda channel mirroring

---
 bob/devtools/mirror.py         | 260 +++++++++++++++++++++++++++++++++
 bob/devtools/scripts/mirror.py | 151 +++++++++++++++++++
 setup.py                       |   1 +
 3 files changed, 412 insertions(+)
 create mode 100644 bob/devtools/mirror.py
 create mode 100644 bob/devtools/scripts/mirror.py

diff --git a/bob/devtools/mirror.py b/bob/devtools/mirror.py
new file mode 100644
index 00000000..660ddbd5
--- /dev/null
+++ b/bob/devtools/mirror.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python
+# vim: set fileencoding=utf-8 :
+
+
+'''Mirroring functionality for conda channels
+
+Some constructs are bluntly copied from
+https://github.com/valassis-digital-media/conda-mirror
+'''
+
+import os
+import bz2
+import json
+import hashlib
+import fnmatch
+import tempfile
+
+import requests
+
+from .log import get_logger
+logger = get_logger(__name__)
+
+
+
+def _download(url, target_directory):
+    """Download `url` to `target_directory`
+
+    Parameters
+    ----------
+    url : str
+        The url to download
+    target_directory : str
+        The path to a directory where `url` should be downloaded
+
+    Returns
+    -------
+    file_size: int
+        The size in bytes of the file that was downloaded
+    """
+
+    chunk_size = 1024  # 1KB chunks
+    logger.info("Download %s -> %s", url, target_directory)
+    # save using the same basename as the remote file
+    target_filename = url.split('/')[-1]
+    download_filename = os.path.join(target_directory, target_filename)
+    with open(download_filename, 'w+b') as tf:
+        ret = requests.get(url, stream=True)
+        ret.raise_for_status()  # fail early instead of saving an error page
+        logger.debug('Saving to %s (%s bytes)', download_filename,
+                ret.headers['Content-length'])
+        for data in ret.iter_content(chunk_size):
+            tf.write(data)
+        file_size = os.path.getsize(download_filename)
+    return file_size
+
+
+def _list_conda_packages(local_dir):
+    """Lists all conda packages (``*.conda`` or ``*.tar.bz2``) in a directory
+
+    Parameters
+    ----------
+    local_dir : str
+        A local directory that may contain conda packages
+
+    Returns
+    -------
+    list
+        The names of the conda package files found in ``local_dir``
+    """
+    entries = os.listdir(local_dir)
+    matches = fnmatch.filter(entries, "*.conda")
+    return matches + fnmatch.filter(entries, "*.tar.bz2")
+
+
+def get_json(channel, platform, name):
+    """Get a JSON file for a channel/platform combo on conda channel
+
+    Parameters
+    ----------
+    channel : str
+        Complete channel URL
+    platform : {'linux-64', 'osx-64', 'noarch'}
+        The platform of interest
+    name : str
+        The name of the file to retrieve.  If the name ends in '.bz2', then it
+        is auto-decompressed
+
+    Returns
+    -------
+    repodata : dict
+        contents of repodata.json
+    """
+
+    url = channel + '/' + platform + '/' + name
+    logger.debug('[checking] %s...', url)
+    r = requests.get(url, allow_redirects=True, stream=True)
+    logger.info('[download] %s (%s bytes)...', url, r.headers['Content-length'])
+
+    if name.endswith('.bz2'):
+        # just in case transport encoding was applied
+        r.raw.decode_content = True
+        data = bz2.decompress(r.raw.read())
+    else:
+        data = r.content  # Response objects have no read() method
+
+    return json.loads(data)
+
+
+def get_local_contents(path, arch):
+    """Returns the set of package files currently mirrored for ``arch``"""
+
+    subdir = os.path.join(path, arch)
+    if not os.path.exists(subdir):
+        return set()
+
+    # the architecture sub-directory exists - enumerate packages inside it
+    logger.info('Listing package contents of %s...', subdir)
+    entries = os.listdir(subdir)
+    found = set(fnmatch.filter(entries, '*.tar.bz2'))
+    return found | set(fnmatch.filter(entries, '*.conda'))
+
+
+def load_glob_list(path):
+    """Loads a list of globs from a configuration file
+
+    Excludes comments ('#'), removal markers ('-') and empty lines
+    """
+    with open(path, "rt") as f:  # close the handle instead of leaking it
+        retval = [str(k.strip()) for k in f]
+    return [k for k in retval if k and k[0] not in ("#", "-")]
+
+
+def blacklist_filter(packages, globs):
+    """Returns a copy of ``packages`` with all glob-matched entries removed"""
+
+    excluded = set()
+    for pattern in globs:
+        excluded.update(fnmatch.filter(packages, pattern))
+    return packages - excluded
+
+
+def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run):
+    """Downloads remote packages to a download directory
+
+    Packages are downloaded first to a temporary directory, then validated
+    according to the expected sha256/md5 sum and then moved, one by one, to the
+    destination directory.  An error is raised if the package cannot be
+    correctly downloaded.
+
+    Parameters
+    ----------
+    packages : list of str
+        List of packages to download from the remote channel
+    repodata: dict
+        A dictionary containing the remote repodata.json contents
+    channel_url: str
+        The complete channel URL
+    dest_dir: str
+        The local directory where the channel is being mirrored
+    arch: str
+        The current architecture which we are mirroring
+    dry_run: bool
+        A boolean flag indicating if this is just a dry-run (simulation),
+        flagging so we don't really do anything (set to ``True``).
+
+    """
+
+    def _sha256sum(filename):
+        h  = hashlib.sha256()
+        b  = bytearray(128*1024)
+        mv = memoryview(b)
+        with open(filename, 'rb', buffering=0) as f:
+            for n in iter(lambda : f.readinto(mv), 0):
+                h.update(mv[:n])
+        return h.hexdigest()
+
+
+    def _md5sum(filename):
+        h  = hashlib.md5()
+        b  = bytearray(128*1024)
+        mv = memoryview(b)
+        with open(filename, 'rb', buffering=0) as f:
+            for n in iter(lambda : f.readinto(mv), 0):
+                h.update(mv[:n])
+        return h.hexdigest()
+
+
+    # download files into temporary directory, that is removed by the end of
+    # the procedure, or if something bad occurs
+    with tempfile.TemporaryDirectory() as download_dir:
+
+        total = len(packages)
+        for k, p in enumerate(packages):
+
+            k+=1 #adjust to produce correct order on printouts
+
+            # checksum to verify
+            if p.endswith('.tar.bz2'):
+                expected_hash = repodata['packages'][p].get('sha256',
+                        repodata['packages'][p]['md5'])
+            else:
+                expected_hash = repodata['packages.conda'][p].get('sha256',
+                        repodata['packages.conda'][p]['md5'])
+
+            # download package to file in our temporary directory
+            url = channel_url + '/' + arch + '/' + p
+            temp_dest = os.path.join(download_dir, p)
+            logger.info('[download: %d/%d] %s -> %s', k, total, url, temp_dest)
+
+            if not dry_run:
+                logger.debug('[checking: %d/%d] %s', k, total, url)
+                r = requests.get(url, stream=True, allow_redirects=True)
+                logger.info('[download: %d/%d] %s -> %s (%s bytes)', k, total,
+                        url, temp_dest, r.headers['Content-length'])
+                with open(temp_dest, 'wb') as f: f.write(r.content)
+
+            # verify that checksum matches
+            if len(expected_hash) == 32:  #md5
+                logger.info('[verify: %d/%d] md5(%s) == %s?', k, total,
+                        temp_dest, expected_hash)
+            else:  #sha256
+                logger.info('[verify: %d/%d] sha256(%s) == %s?', k, total,
+                        temp_dest, expected_hash)
+
+            if not dry_run:
+                if len(expected_hash) == 32:  #md5
+                    actual_hash = _md5sum(temp_dest)
+                else:  #sha256
+                    actual_hash = _sha256sum(temp_dest)
+                if actual_hash != expected_hash:  # raise even under python -O
+                    raise RuntimeError('Checksum of locally downloaded '
+                            'version of %s does not match (actual:%r != '
+                            '%r:expected)' % (url, actual_hash, expected_hash))
+
+            # move
+            local_dest = os.path.join(dest_dir, arch, p)
+            logger.info('[move: %d/%d] %s -> %s', k, total, temp_dest,
+                    local_dest)
+
+            # check local directory is available before moving
+            dirname = os.path.dirname(local_dest)
+            if not os.path.exists(dirname):
+                logger.info('[mkdir] %s', dirname)
+                if not dry_run:
+                    os.makedirs(dirname)
+
+            if not dry_run:
+                os.rename(temp_dest, local_dest)
+
+
+def remove_packages(packages, dest_dir, arch, dry_run):
+    """Deletes the given package files from the local mirror directory"""
+
+    total = len(packages)
+    # start counting at 1 so printouts read "1/N" through "N/N"
+    for index, name in enumerate(packages, 1):
+        path = os.path.join(dest_dir, arch, name)
+        logger.info('[remove: %d/%d] %s', index, total, path)
+        if not dry_run:
+            os.unlink(path)
diff --git a/bob/devtools/scripts/mirror.py b/bob/devtools/scripts/mirror.py
new file mode 100644
index 00000000..2568f18f
--- /dev/null
+++ b/bob/devtools/scripts/mirror.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+# vim: set fileencoding=utf-8 :
+
+
+import os
+import click
+
+import conda_build.api
+
+from . import bdt
+from ..mirror import (
+        get_json,
+        get_local_contents,
+        load_glob_list,
+        blacklist_filter,
+        download_packages,
+        remove_packages,
+        )
+from ..log import verbosity_option, get_logger, echo_info, echo_warning
+
+logger = get_logger(__name__)
+
+
+@click.command(
+    epilog="""
+Examples:
+
+  1. Mirrors a conda channel:
+
+\b
+     $ bdt mirror -vv https://www.idiap.ch/software/bob/label/beta
+
+    """
+)
+@click.argument(
+    "channel-url",
+    required=True,
+)
+@click.argument(
+    "dest-dir",
+    type=click.Path(exists=False, dir_okay=True, file_okay=False,
+        writable=True, readable=True, resolve_path=True),
+    required=True,
+)
+@click.option(
+    "-b",
+    "--blacklist",
+    type=click.Path(exists=True, dir_okay=False, file_okay=True,
+        readable=True, resolve_path=True),
+    help="A file containing a list of globs to exclude from local " \
+            "mirroring, one per line",
+)
+@click.option(
+    "-m",
+    "--check-md5/--no-check-md5",
+    default=False,
+    help="If set, then check MD5 sums of all packages during conda-index",
+)
+@click.option(
+    "-d",
+    "--dry-run/--no-dry-run",
+    default=False,
+    help="Only goes through the actions, but does not execute them "
+    "(combine with the verbosity flags - e.g. ``-vvv``) to enable "
+    "printing to help you understand what will be done",
+)
+@verbosity_option()
+@bdt.raise_on_error
+def mirror(
+        channel_url,
+        dest_dir,
+        blacklist,
+        check_md5,
+        dry_run,
+        ):
+    """Mirrors a conda channel to a particular local destination
+
+    This command is capable of completely mirroring a valid conda channel,
+    excluding packages that you may not be interested on via globs.  It works
+    to minimize channel usage by first downloading the channel repository data
+    (in compressed format), analysing what is available locally and what is
+    available on the channel, and only downloading the missing files.
+    """
+
+    # if we are in a dry-run mode, let's let it be known
+    if dry_run:
+        logger.warning("!!!! DRY RUN MODE !!!!")
+        logger.warning("Nothing will be really mirrored")
+
+
+    DEFAULT_SUBDIRS = ['noarch', 'linux-64', 'osx-64']
+
+    noarch = os.path.join(dest_dir, 'noarch')
+    if not os.path.exists(noarch):  #first time
+        # calls conda index to create basic infrastructure
+        logger.info("Creating conda channel at %s...", dest_dir)
+        if not dry_run:
+            conda_build.api.update_index([dest_dir], subdir=DEFAULT_SUBDIRS,
+                    progress=False)
+
+
+    for arch in DEFAULT_SUBDIRS:
+
+        remote_repodata = get_json(channel_url, arch, 'repodata.json.bz2')
+        logger.info('%d packages available in remote index',
+                len(remote_repodata.get('packages', {})))
+        local_packages = get_local_contents(dest_dir, arch)
+        logger.info('%d packages available in local mirror', len(local_packages))
+
+        remote_packages = set(list(remote_repodata.get('packages', {}).keys()) +
+                list(remote_repodata.get('packages.conda', {}).keys()))
+
+        if blacklist is not None and os.path.exists(blacklist):
+            globs_to_remove = set(load_glob_list(blacklist))
+        else:
+            globs_to_remove = set()
+
+        # in the remote packages, subset those that need to be downloaded
+        # according to our own interest
+        to_download = blacklist_filter(remote_packages - local_packages,
+                globs_to_remove)
+
+        # in the local packages, subset those that we no longer need, be it
+        # because they have been removed from the remote repository, or because
+        # we decided to blacklist them.
+        disappeared_remotely = local_packages - remote_packages
+        to_keep = blacklist_filter(local_packages, globs_to_remove)
+        to_delete_locally = (local_packages - to_keep) | disappeared_remotely
+
+        # execute the transaction
+        if to_download:
+            download_packages(to_download, remote_repodata, channel_url, dest_dir,
+                    arch, dry_run)
+        else:
+            echo_info("Mirror at %s/%s is up-to-date w.r.t. %s/%s. " \
+                    "No packages to download." % (dest_dir, arch, channel_url,
+                        arch))
+
+        if to_delete_locally:
+            echo_warning("%d packages will be removed at %s/%s" % \
+                    (len(to_delete_locally), dest_dir, arch))
+            remove_packages(to_delete_locally, dest_dir, arch, dry_run)
+        else:
+            echo_info("Mirror at %s/%s is up-to-date w.r.t. blacklist. " \
+                    "No packages to be removed." % (dest_dir, arch))
+
+    # re-indexes the channel to produce a conda-compatible setup
+    echo_info("Re-indexing %s..." % dest_dir)
+    if not dry_run:
+        conda_build.api.update_index([dest_dir], check_md5=check_md5,
+                progress=True)
diff --git a/setup.py b/setup.py
index a3d490ec..ab67bfde 100644
--- a/setup.py
+++ b/setup.py
@@ -48,6 +48,7 @@ setup(
           'dumpsphinx = bob.devtools.scripts.dumpsphinx:dumpsphinx',
           'create = bob.devtools.scripts.create:create',
           'build = bob.devtools.scripts.build:build',
+          'mirror = bob.devtools.scripts.mirror:mirror',
           'rebuild = bob.devtools.scripts.rebuild:rebuild',
           'test = bob.devtools.scripts.test:test',
           'caupdate = bob.devtools.scripts.caupdate:caupdate',
-- 
GitLab