def _download(url, target_directory):
    """Download `url` into `target_directory`

    The file is stored under the last path component of `url` and is streamed
    to disk in small chunks, so large packages are never fully held in memory.

    Parameters
    ----------
    url : str
        The url to download
    target_directory : str
        The path to an existing directory where `url` should be downloaded

    Returns
    -------
    file_size: int
        The size in bytes of the file that was downloaded

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.  Nothing is written
        to `target_directory` in that case.
    """

    chunk_size = 1024  # 1KB chunks
    logger.info("Download %s -> %s", url, target_directory)
    target_filename = url.split('/')[-1]
    download_filename = os.path.join(target_directory, target_filename)

    # using the response as a context manager guarantees the underlying
    # connection is released, even if writing to disk fails
    with requests.get(url, stream=True) as ret:
        # fail *before* creating the local file, so an HTML error page is
        # never stored as if it was the requested package
        ret.raise_for_status()

        # the Content-Length header is optional (e.g. chunked transfers)
        logger.debug('Saving to %s (%s bytes)', download_filename,
                     ret.headers.get('Content-length', 'unknown'))

        with open(download_filename, 'wb') as tf:
            for data in ret.iter_content(chunk_size):
                tf.write(data)

    # measured only after the file is closed: while it is open, buffered data
    # may not have reached the disk and the size would be under-reported
    return os.path.getsize(download_filename)


def _list_conda_packages(local_dir):
    """List the conda packages (*.tar.bz2 or *.conda files) in `local_dir`

    Parameters
    ----------
    local_dir : str
        Some local directory with (hopefully) some conda packages in it

    Returns
    -------
    list
        File names (not full paths) of the conda packages in `local_dir`:
        first all ``*.conda`` entries, then all ``*.tar.bz2`` entries
    """

    contents = os.listdir(local_dir)
    conda_files = fnmatch.filter(contents, "*.conda")
    tarballs = fnmatch.filter(contents, "*.tar.bz2")
    return conda_files + tarballs
def get_json(channel, platform, name):
    """Get a JSON file for a channel/platform combo on conda channel

    Parameters
    ----------
    channel : str
        Complete channel URL
    platform : {'linux-64', 'osx-64', 'noarch'}
        The platform of interest
    name : str
        The name of the file to retrieve.  If the name ends in '.bz2', then it
        is auto-decompressed

    Returns
    -------
    repodata : dict
        The decoded contents of the requested JSON file (e.g. repodata.json)

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code
    """

    url = channel + '/' + platform + '/' + name
    logger.debug('[checking] %s...', url)
    r = requests.get(url, allow_redirects=True, stream=True)

    # stop here on 4xx/5xx instead of trying to parse an error page
    r.raise_for_status()

    # the Content-Length header is optional (e.g. chunked transfers)
    logger.info('[download] %s (%s bytes)...', url,
                r.headers.get('Content-length', 'unknown'))

    # ``Response.content`` reads the whole body and undoes any transport
    # encoding; a ``requests`` response has no ``read()`` method of its own
    data = r.content
    if name.endswith('.bz2'):
        data = bz2.decompress(data)

    return json.loads(data)


def get_local_contents(path, arch):
    """Returns the local package contents as a set

    Parameters
    ----------
    path : str
        Root directory of the local mirror
    arch : str
        Name of the sub-directory (architecture) to inspect

    Returns
    -------
    set of str
        File names of the ``*.tar.bz2`` and ``*.conda`` packages found in
        ``path/arch``; an empty set if that directory does not exist
    """

    path_arch = os.path.join(path, arch)
    if not os.path.exists(path_arch):
        return set()

    # path exists, lists currently available packages
    logger.info('Listing package contents of %s...', path_arch)
    contents = os.listdir(path_arch)
    return set(fnmatch.filter(contents, '*.tar.bz2') +
               fnmatch.filter(contents, '*.conda'))


def load_glob_list(path):
    """Loads a list of globs from a configuration file

    Surrounding whitespace is stripped from every line.  Empty lines and
    lines whose first character is ``#`` or ``-`` are excluded.

    Parameters
    ----------
    path : str
        Path to the text file to read, one glob per line

    Returns
    -------
    list of str
        The globs, in file order
    """

    # the context manager closes the file deterministically
    with open(path, "rt") as f:
        stripped = [str(line.strip()) for line in f]
    return [k for k in stripped if k and k[0] not in ("#", "-")]


def blacklist_filter(packages, globs):
    """Filters **out** the input package set with the glob list

    Parameters
    ----------
    packages : set of str
        Package file names to be filtered
    globs : iterable of str
        ``fnmatch``-style patterns; any package matching at least one of them
        is removed

    Returns
    -------
    set of str
        The packages that match none of the globs
    """

    to_remove = set()
    for k in globs:
        to_remove.update(fnmatch.filter(packages, k))
    return packages - to_remove
def _expected_checksum(repodata, package):
    """Returns ``(algorithm, hexdigest)`` as published in `repodata`

    ``*.tar.bz2`` packages are described under ``repodata['packages']``, any
    other package name is looked up under ``repodata['packages.conda']``.
    The sha256 sum is preferred; md5 is only consulted when sha256 is absent.
    """

    section = 'packages' if package.endswith('.tar.bz2') else 'packages.conda'
    info = repodata[section][package]
    if 'sha256' in info:
        return 'sha256', info['sha256']
    return 'md5', info['md5']


def _file_checksum(filename, algorithm):
    """Returns the hex digest of `filename` computed with hashlib `algorithm`"""

    h = hashlib.new(algorithm)
    b = bytearray(128 * 1024)
    mv = memoryview(b)  # reusable buffer: no per-chunk allocations
    with open(filename, 'rb', buffering=0) as f:
        for n in iter(lambda: f.readinto(mv), 0):
            h.update(mv[:n])
    return h.hexdigest()


def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run):
    """Downloads remote packages to a download directory

    Packages are downloaded first to a temporary directory, then validated
    according to the expected sha256/md5 sum and then moved, one by one, to the
    destination directory.  An error is raised if the package cannot be
    correctly downloaded.

    Parameters
    ----------
    packages : list of str
        List of packages to download from the remote channel
    repodata: dict
        A dictionary containing the remote repodata.json contents
    channel_url: str
        The complete channel URL
    dest_dir: str
        The local directory where the channel is being mirrored
    arch: str
        The current architecture which we are mirroring
    dry_run: bool
        A boolean flag indicating if this is just a dry-run (simulation),
        flagging so we don't really do anything (set to ``True``).

    Raises
    ------
    KeyError
        If a package is not described in `repodata`, or has no checksum
    requests.HTTPError
        If the server answers with an error status code
    AssertionError
        If the checksum of a downloaded file does not match the expected one
    """

    # function-scope import: ``shutil`` is not among the module-level imports
    import shutil

    # download files into temporary directory, that is removed by the end of
    # the procedure, or if something bad occurs
    with tempfile.TemporaryDirectory() as download_dir:

        total = len(packages)
        for k, p in enumerate(packages, 1):  # 1-based counter for printouts

            # checksum to verify
            algorithm, expected_hash = _expected_checksum(repodata, p)

            # download package to file in our temporary directory
            url = channel_url + '/' + arch + '/' + p
            temp_dest = os.path.join(download_dir, p)
            logger.info('[download: %d/%d] %s -> %s', k, total, url, temp_dest)

            if not dry_run:
                logger.debug('[checking: %d/%d] %s', k, total, url)
                with requests.get(url, stream=True,
                                  allow_redirects=True) as r:
                    # never store an error page as if it was a package
                    r.raise_for_status()
                    logger.info('[download: %d/%d] %s -> %s (%s bytes)', k,
                                total, url, temp_dest,
                                r.headers.get('Content-length', 'unknown'))
                    # stream to disk in chunks instead of loading the whole
                    # package in memory
                    with open(temp_dest, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=128 * 1024):
                            f.write(chunk)

            # verify that checksum matches
            logger.info('[verify: %d/%d] %s(%s) == %s?', k, total, algorithm,
                        temp_dest, expected_hash)

            if not dry_run:
                actual_hash = _file_checksum(temp_dest, algorithm)
                if actual_hash != expected_hash:
                    # raised explicitly (instead of using ``assert``) so the
                    # check survives ``python -O``; the exception type is
                    # kept for backward compatibility
                    raise AssertionError(
                        'Checksum of locally'
                        ' downloaded version of %s does not match '
                        '(actual:%r != %r:expected)' % (url, actual_hash,
                                                        expected_hash))

            # move
            local_dest = os.path.join(dest_dir, arch, p)
            logger.info('[move: %d/%d] %s -> %s', k, total, temp_dest,
                        local_dest)

            # check local directory is available before moving
            dirname = os.path.dirname(local_dest)
            if not os.path.exists(dirname):
                logger.info('[mkdir] %s', dirname)
                if not dry_run:
                    os.makedirs(dirname)

            if not dry_run:
                # the temporary directory may live on another filesystem than
                # the mirror, where a plain ``os.rename`` would fail
                shutil.move(temp_dest, local_dest)


def remove_packages(packages, dest_dir, arch, dry_run):
    """Removes local packages that no longer matter

    Parameters
    ----------
    packages : iterable of str
        File names of the packages to delete from ``dest_dir/arch``
    dest_dir : str
        The local directory where the channel is being mirrored
    arch : str
        The architecture (sub-directory) the packages belong to
    dry_run : bool
        If ``True``, only logs what would be removed
    """

    total = len(packages)
    for k, p in enumerate(packages, 1):  # 1-based counter for printouts
        path = os.path.join(dest_dir, arch, p)
        logger.info('[remove: %d/%d] %s', k, total, path)
        if not dry_run:
            os.unlink(path)
@click.command(
    epilog="""
Examples:

  1. Mirrors a conda channel:

\b
     $ bdt mirror -vv https://www.idiap.ch/software/bob/label/beta

    """
)
@click.argument(
    "channel-url",
    required=True,
)
@click.argument(
    "dest-dir",
    type=click.Path(exists=False, dir_okay=True, file_okay=False,
                    writable=True, readable=True, resolve_path=True),
    required=True,
)
@click.option(
    "-b",
    "--blacklist",
    type=click.Path(exists=True, dir_okay=False, file_okay=True,
                    readable=True, resolve_path=True),
    help="A file containing a list of globs to exclude from local "
    "mirroring, one per line",
)
@click.option(
    "-m",
    "--check-md5/--no-check-md5",
    default=False,
    help="If set, then check MD5 sums of all packages during conda-index",
)
@click.option(
    "-d",
    "--dry-run/--no-dry-run",
    default=False,
    help="Only goes through the actions, but does not execute them "
    "(combine with the verbosity flags - e.g. ``-vvv``) to enable "
    "printing to help you understand what will be done",
)
@verbosity_option()
@bdt.raise_on_error
def mirror(
    channel_url,
    dest_dir,
    blacklist,
    check_md5,
    dry_run,
):
    """Mirrors a conda channel to a particular local destination

    This command is capable of completely mirroring a valid conda channel,
    excluding packages that you may not be interested on via globs.  It works
    to minimize channel usage by first downloading the channel repository data
    (in compressed format), analysing what is available locally and what is
    available on the channel, and only downloading the missing files.
    """

    # if we are in a dry-run mode, let's let it be known
    if dry_run:
        # ``Logger.warn`` is a deprecated alias of ``Logger.warning``
        logger.warning("!!!! DRY RUN MODE !!!!")
        logger.warning("Nothing will be really mirrored")

    DEFAULT_SUBDIRS = ['noarch', 'linux-64', 'osx-64']

    noarch = os.path.join(dest_dir, 'noarch')
    if not os.path.exists(noarch):  # first time
        # calls conda index to create basic infrastructure
        logger.info("Creating conda channel at %s...", dest_dir)
        if not dry_run:
            conda_build.api.update_index([dest_dir], subdir=DEFAULT_SUBDIRS,
                                         progress=False)

    # the blacklist does not depend on the architecture: read it only once
    if blacklist is not None and os.path.exists(blacklist):
        globs_to_remove = set(load_glob_list(blacklist))
    else:
        globs_to_remove = set()

    for arch in DEFAULT_SUBDIRS:

        remote_repodata = get_json(channel_url, arch, 'repodata.json.bz2')

        # both package formats count as "available remotely"
        remote_packages = set(remote_repodata.get('packages', {})) | \
            set(remote_repodata.get('packages.conda', {}))
        logger.info('%d packages available in remote index',
                    len(remote_packages))

        local_packages = get_local_contents(dest_dir, arch)
        logger.info('%d packages available in local mirror',
                    len(local_packages))

        # in the remote packages, subset those that need to be downloaded
        # according to our own interest
        to_download = blacklist_filter(remote_packages - local_packages,
                                       globs_to_remove)

        # in the local packages, subset those that we no longer need, be it
        # because they have been removed from the remote repository, or because
        # we decided to blacklist them.
        disappeared_remotely = local_packages - remote_packages
        to_keep = blacklist_filter(local_packages, globs_to_remove)
        to_delete_locally = (local_packages - to_keep) | disappeared_remotely

        # execute the transaction
        if to_download:
            download_packages(to_download, remote_repodata, channel_url,
                              dest_dir, arch, dry_run)
        else:
            echo_info("Mirror at %s/%s is up-to-date w.r.t. %s/%s. "
                      "No packages to download." % (dest_dir, arch,
                                                    channel_url, arch))

        if to_delete_locally:
            echo_warning("%d packages will be removed at %s/%s" %
                         (len(to_delete_locally), dest_dir, arch))
            remove_packages(to_delete_locally, dest_dir, arch, dry_run)
        else:
            echo_info("Mirror at %s/%s is up-to-date w.r.t. blacklist. "
                      "No packages to be removed." % (dest_dir, arch))

    # re-indexes the channel to produce a conda-compatible setup
    echo_info("Re-indexing %s..." % dest_dir)
    if not dry_run:
        conda_build.api.update_index([dest_dir], check_md5=check_md5,
                                     progress=True)