Commit 2c30b848 authored by André Anjos

Merge branch 'conda-mirror' into 'master'

Own implementation of conda channel mirroring

See merge request !123
parents bd288e10 58f5e18a
Pipeline #34580 passed with stages in 6 minutes and 31 seconds
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
'''Mirroring functionality for conda channels
Some constructs are bluntly copied from
https://github.com/valassis-digital-media/conda-mirror
'''
import os
import bz2
import json
import hashlib
import fnmatch
import tempfile
import requests
from .log import get_logger
logger = get_logger(__name__)
def _download(url, target_directory):
    """Download `url` to `target_directory`

    The file is saved under the last path component of `url`.

    Parameters
    ----------
    url : str
        The url to download
    target_directory : str
        The path to a directory where `url` should be downloaded

    Returns
    -------
    file_size: int
        The size in bytes of the file that was downloaded

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx or 5xx)
    """
    chunk_size = 1024  # 1KB chunks
    logger.info("Download %s -> %s", url, target_directory)

    target_filename = url.split('/')[-1]
    download_filename = os.path.join(target_directory, target_filename)

    # Issue the request *before* creating the local file, so a failed request
    # does not leave an empty (or error-page) file behind.  The context
    # manager guarantees the connection is released.
    with requests.get(url, stream=True) as ret:
        ret.raise_for_status()
        # Content-length is optional (e.g. chunked transfers): do not crash
        # a download just because the size cannot be logged
        logger.debug('Saving to %s (%s bytes)', download_filename,
                     ret.headers.get('Content-length', 'unknown'))
        with open(download_filename, 'w+b') as tf:
            for data in ret.iter_content(chunk_size):
                tf.write(data)

    # measured once the file is closed, so all buffered data is on disk
    return os.path.getsize(download_filename)
def _list_conda_packages(local_dir):
    """Enumerate the conda packages stored directly inside `local_dir`

    A conda package is any directory entry whose name matches ``*.conda`` or
    ``*.tar.bz2``.

    Parameters
    ----------
    local_dir : str
        Some local directory with (hopefully) some conda packages in it

    Returns
    -------
    list
        Names of the conda packages in `local_dir`: all ``*.conda`` entries
        first, followed by all ``*.tar.bz2`` entries
    """
    entries = os.listdir(local_dir)
    matches = []
    for pattern in ("*.conda", "*.tar.bz2"):
        matches.extend(fnmatch.filter(entries, pattern))
    return matches
def get_json(channel, platform, name):
    """Get a JSON file for a channel/platform combo on conda channel

    Parameters
    ----------
    channel : str
        Complete channel URL
    platform : {'linux-64', 'osx-64', 'noarch'}
        The platform of interest
    name : str
        The name of the file to retrieve.  If the name ends in '.bz2', then it
        is auto-decompressed

    Returns
    -------
    repodata : dict
        The decoded JSON contents of the requested file (e.g. the contents of
        ``repodata.json``)

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx or 5xx)
    """
    url = channel + '/' + platform + '/' + name
    logger.debug('[checking] %s...', url)
    r = requests.get(url, allow_redirects=True, stream=True)
    # fail early with a clear HTTP error instead of trying to parse an
    # error page as (compressed) JSON
    r.raise_for_status()
    # Content-length is optional (e.g. chunked transfers)
    logger.info('[download] %s (%s bytes)...', url,
                r.headers.get('Content-length', 'unknown'))

    # ``Response.content`` reads the whole body and transparently undoes any
    # transport encoding that may have been applied.  (``Response`` objects
    # have no ``read()`` method.)
    data = r.content
    if name.endswith('.bz2'):
        data = bz2.decompress(data)
    return json.loads(data)
def get_local_contents(path, arch):
    """Return the conda packages locally available for `arch`, as a set

    Looks into the directory ``path/arch`` and collects the names of all
    entries matching ``*.tar.bz2`` or ``*.conda``.  If that directory does
    not exist, an empty set is returned.
    """
    arch_dir = os.path.join(path, arch)

    if os.path.exists(arch_dir):
        # directory is there: enumerate what is currently available
        logger.info('Listing package contents of %s...', arch_dir)
        entries = os.listdir(arch_dir)
        found = set(fnmatch.filter(entries, '*.tar.bz2'))
        found.update(fnmatch.filter(entries, '*.conda'))
        return found

    return set()
def load_glob_list(path):
    """Loads a list of globs from a configuration file

    Each line is stripped of surrounding whitespace.  Empty lines and lines
    whose first character is ``#`` (comments) or ``-`` are excluded.

    Parameters
    ----------
    path : str
        Path to the text file containing one glob per line

    Returns
    -------
    list of str
        The globs, in the order they appear in the file
    """
    # use a context manager so the file handle is closed deterministically
    with open(path, "rt") as f:
        stripped = [line.strip() for line in f]
    return [k for k in stripped if k and k[0] not in ("#", "-")]
def blacklist_filter(packages, globs):
    """Drop from the `packages` set every name matching one of `globs`

    Returns a new set holding the packages that match **none** of the globs.
    """
    excluded = {
        name
        for pattern in globs
        for name in fnmatch.filter(packages, pattern)
    }
    return packages - excluded
def _file_hexdigest(filename, algorithm):
    """Returns the hex digest of the contents of ``filename``

    ``algorithm`` is any name accepted by :py:func:`hashlib.new` (e.g.
    ``"md5"`` or ``"sha256"``).  The file is read in 128KB chunks through a
    reusable buffer, so large packages are never fully loaded in memory.
    """
    h = hashlib.new(algorithm)
    b = bytearray(128 * 1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        # readinto() returns 0 at end-of-file, which stops the iteration
        for n in iter(lambda: f.readinto(mv), 0):
            h.update(mv[:n])
    return h.hexdigest()


def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run):
    """Downloads remote packages to a download directory

    Packages are downloaded first to a temporary directory, then validated
    according to the expected sha256/md5 sum and then moved, one by one, to the
    destination directory.  An error is raised if the package cannot be
    correctly downloaded.

    Parameters
    ----------
    packages : list of str
        List of packages to download from the remote channel
    repodata: dict
        A dictionary containing the remote repodata.json contents
    channel_url: str
        The complete channel URL
    dest_dir: str
        The local directory where the channel is being mirrored
    arch: str
        The current architecture which we are mirroring
    dry_run: bool
        A boolean flag indicating if this is just a dry-run (simulation),
        flagging so we don't really do anything (set to ``True``).

    Raises
    ------
    KeyError
        If a package is not listed in ``repodata`` or has neither a
        ``sha256`` nor an ``md5`` entry
    requests.HTTPError
        If the server answers with an error status (4xx or 5xx)
    AssertionError
        If the checksum of a downloaded file does not match the expected one
    """
    # local import: only this function needs it (see the move step below)
    import shutil

    # download files into temporary directory, that is removed by the end of
    # the procedure, or if something bad occurs
    with tempfile.TemporaryDirectory() as download_dir:

        total = len(packages)
        for k, p in enumerate(packages, 1):  # 1-based for printouts

            # checksum to verify: ``.tar.bz2`` and ``.conda`` packages are
            # listed in different sections of the repodata.  Prefer sha256
            # and only look up md5 when sha256 is not available.
            if p.endswith('.tar.bz2'):
                info = repodata['packages'][p]
            else:
                info = repodata['packages.conda'][p]
            expected_hash = info.get('sha256') or info['md5']
            # an md5 hex digest has 32 characters; anything else is sha256
            algorithm = 'md5' if len(expected_hash) == 32 else 'sha256'

            # download package to file in our temporary directory
            url = channel_url + '/' + arch + '/' + p
            temp_dest = os.path.join(download_dir, p)
            logger.info('[download: %d/%d] %s -> %s', k, total, url, temp_dest)

            if not dry_run:
                logger.debug('[checking: %d/%d] %s', k, total, url)
                r = requests.get(url, stream=True, allow_redirects=True)
                # do not save an HTTP error page as if it were the package
                r.raise_for_status()
                # Content-length is optional (e.g. chunked transfers)
                logger.info('[download: %d/%d] %s -> %s (%s bytes)', k, total,
                            url, temp_dest,
                            r.headers.get('Content-length', 'unknown'))
                # stream to disk in chunks: packages can be large
                with open(temp_dest, 'wb') as f:
                    for chunk in r.iter_content(128 * 1024):
                        f.write(chunk)

            # verify that checksum matches
            logger.info('[verify: %d/%d] %s(%s) == %s?', k, total, algorithm,
                        temp_dest, expected_hash)

            if not dry_run:
                actual_hash = _file_hexdigest(temp_dest, algorithm)
                if actual_hash != expected_hash:
                    # raised explicitly (instead of using ``assert``) so the
                    # check is not stripped when running with ``python -O``
                    raise AssertionError(
                        'Checksum of locally downloaded version of %s does '
                        'not match (actual:%r != %r:expected)'
                        % (url, actual_hash, expected_hash)
                    )

            # move
            local_dest = os.path.join(dest_dir, arch, p)
            logger.info('[move: %d/%d] %s -> %s', k, total, temp_dest,
                        local_dest)

            # check local directory is available before moving
            dirname = os.path.dirname(local_dest)
            if not os.path.exists(dirname):
                logger.info('[mkdir] %s', dirname)
                if not dry_run:
                    os.makedirs(dirname, exist_ok=True)

            if not dry_run:
                # shutil.move() falls back to copy+delete when the temporary
                # directory and the mirror live on different filesystems,
                # where a plain os.rename() would fail
                shutil.move(temp_dest, local_dest)
def remove_packages(packages, dest_dir, arch, dry_run):
    """Delete the given package files from the ``dest_dir/arch`` directory

    Every removal is logged.  When `dry_run` is set, files are only listed
    and nothing is deleted.
    """
    count = len(packages)
    # positions start at 1 so printouts read "1/N" ... "N/N"
    for position, name in enumerate(packages, start=1):
        target = os.path.join(dest_dir, arch, name)
        logger.info('[remove: %d/%d] %s', position, count, target)
        if dry_run:
            continue
        os.unlink(target)
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
import os
import click
import conda_build.api
from . import bdt
from ..mirror import (
get_json,
get_local_contents,
load_glob_list,
blacklist_filter,
download_packages,
remove_packages,
)
from ..log import verbosity_option, get_logger, echo_info, echo_warning
logger = get_logger(__name__)
@click.command(
    epilog="""
Examples:

  1. Mirrors a conda channel:

\b
     $ bdt mirror -vv https://www.idiap.ch/software/bob/label/beta /path/to/mirror

"""
)
@click.argument(
    "channel-url",
    required=True,
)
@click.argument(
    "dest-dir",
    type=click.Path(exists=False, dir_okay=True, file_okay=False,
                    writable=True, readable=True, resolve_path=True),
    required=True,
)
@click.option(
    "-b",
    "--blacklist",
    type=click.Path(exists=True, dir_okay=False, file_okay=True,
                    readable=True, resolve_path=True),
    help="A file containing a list of globs to exclude from local "
    "mirroring, one per line",
)
@click.option(
    "-m",
    "--check-md5/--no-check-md5",
    default=False,
    help="If set, then check MD5 sums of all packages during conda-index",
)
@click.option(
    "-d",
    "--dry-run/--no-dry-run",
    default=False,
    help="Only goes through the actions, but does not execute them "
    "(combine with the verbosity flags - e.g. ``-vvv``) to enable "
    "printing to help you understand what will be done",
)
@verbosity_option()
@bdt.raise_on_error
def mirror(
    channel_url,
    dest_dir,
    blacklist,
    check_md5,
    dry_run,
):
    """Mirrors a conda channel to a particular local destination

    This command is capable of completely mirroring a valid conda channel,
    excluding packages that you may not be interested in via globs.  It works
    to minimize channel usage by first downloading the channel repository data
    (in compressed format), analysing what is available locally and what is
    available on the channel, and only downloading the missing files.
    """

    # if we are in a dry-run mode, let's let it be known
    if dry_run:
        # Logger.warn() is a deprecated alias of Logger.warning()
        logger.warning("!!!! DRY RUN MODE !!!!")
        logger.warning("Nothing will be really mirrored")

    DEFAULT_SUBDIRS = ['noarch', 'linux-64', 'osx-64']

    noarch = os.path.join(dest_dir, 'noarch')
    if not os.path.exists(noarch):  # first time
        # calls conda index to create basic infrastructure
        logger.info("Creating conda channel at %s...", dest_dir)
        if not dry_run:
            conda_build.api.update_index([dest_dir], subdir=DEFAULT_SUBDIRS,
                                         progress=False)

    # the blacklist does not depend on the architecture: load it only once
    if blacklist is not None and os.path.exists(blacklist):
        globs_to_remove = set(load_glob_list(blacklist))
    else:
        globs_to_remove = set()

    for arch in DEFAULT_SUBDIRS:

        remote_repodata = get_json(channel_url, arch, 'repodata.json.bz2')

        # both package formats (.tar.bz2 and .conda) count as remote packages
        remote_packages = set(remote_repodata.get('packages', {})) | \
            set(remote_repodata.get('packages.conda', {}))
        logger.info('%d packages available in remote index',
                    len(remote_packages))

        local_packages = get_local_contents(dest_dir, arch)
        logger.info('%d packages available in local mirror',
                    len(local_packages))

        # in the remote packages, subset those that need to be downloaded
        # according to our own interest
        to_download = blacklist_filter(remote_packages - local_packages,
                                       globs_to_remove)

        # in the local packages, subset those that we no longer need, be it
        # because they have been removed from the remote repository, or because
        # we decided to blacklist them.
        disappeared_remotely = local_packages - remote_packages
        to_keep = blacklist_filter(local_packages, globs_to_remove)
        to_delete_locally = (local_packages - to_keep) | disappeared_remotely

        # execute the transaction
        if to_download:
            download_packages(to_download, remote_repodata, channel_url,
                              dest_dir, arch, dry_run)
        else:
            echo_info("Mirror at %s/%s is up-to-date w.r.t. %s/%s. "
                      "No packages to download." % (dest_dir, arch,
                                                    channel_url, arch))

        if to_delete_locally:
            echo_warning("%d packages will be removed at %s/%s" %
                         (len(to_delete_locally), dest_dir, arch))
            remove_packages(to_delete_locally, dest_dir, arch, dry_run)
        else:
            echo_info("Mirror at %s/%s is up-to-date w.r.t. blacklist. "
                      "No packages to be removed." % (dest_dir, arch))

    # re-indexes the channel to produce a conda-compatible setup; done once,
    # after all architectures have been synchronized
    echo_info("Re-indexing %s..." % dest_dir)
    if not dry_run:
        conda_build.api.update_index([dest_dir], check_md5=check_md5,
                                     progress=True)
......@@ -62,6 +62,7 @@ test:
- bdt dumpsphinx https://docs.python.org/3/objects.inv > /dev/null
- bdt create --help
- bdt build --help
- bdt mirror --help
- bdt rebuild --help
- bdt test --help
- bdt caupdate --help
......
......@@ -48,6 +48,7 @@ setup(
'dumpsphinx = bob.devtools.scripts.dumpsphinx:dumpsphinx',
'create = bob.devtools.scripts.create:create',
'build = bob.devtools.scripts.build:build',
'mirror = bob.devtools.scripts.mirror:mirror',
'rebuild = bob.devtools.scripts.rebuild:rebuild',
'test = bob.devtools.scripts.test:test',
'caupdate = bob.devtools.scripts.caupdate:caupdate',
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment