From 3a065d15a83b9d154675163ac02c1a6c8f1363de Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Wed, 23 Oct 2019 19:48:59 +0200
Subject: [PATCH] [mirror] Implement patch and whitelist support

---
 bob/devtools/mirror.py         | 68 ++++++++++++++++++++++++++++++----
 bob/devtools/scripts/mirror.py | 35 +++++++++++++++++
 2 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/bob/devtools/mirror.py b/bob/devtools/mirror.py
index 5aa6d526..fc826c29 100644
--- a/bob/devtools/mirror.py
+++ b/bob/devtools/mirror.py
@@ -48,8 +48,8 @@ def _download(url, target_directory):
     download_filename = os.path.join(target_directory, target_filename)
     with open(download_filename, 'w+b') as tf:
         ret = requests.get(url, stream=True)
-        logger.debug('Saving to %s (%s bytes)', download_filename,
-                ret.headers['Content-length'])
+        size = ret.headers.get('Content-length', '??')
+        logger.debug('Saving to %s (%s bytes)', download_filename, size)
         for data in ret.iter_content(chunk_size):
             tf.write(data)
     file_size = os.path.getsize(download_filename)
@@ -96,16 +96,17 @@ def get_json(channel, platform, name):
     url = channel + '/' + platform + '/' + name
     logger.debug('[checking] %s...', url)
     r = requests.get(url, allow_redirects=True, stream=True)
-    logger.info('[download] %s (%s bytes)...', url, r.headers['Content-length'])
+    size = r.headers.get('Content-length', '??')
+    logger.info('[download] %s (%s bytes)...', url, size)
 
     if name.endswith('.bz2'):
         # just in case transport encoding was applied
        r.raw.decode_content = True
         data = bz2.decompress(r.raw.read())
-    else:
-        data = r.read()
+        return json.loads(data)
 
-    return json.loads(data)
+    # otherwise, just decode the response as JSON
+    return r.json()
 
 
 def get_local_contents(path, arch):
@@ -141,6 +142,15 @@ def blacklist_filter(packages, globs):
     return packages - to_remove
 
 
+def whitelist_filter(packages, globs):
+    """Filters **in** the input package set with the glob list"""
+
+    to_keep = set()
+    for k in globs:
+        to_keep |= set(fnmatch.filter(packages, k))
+    return to_keep
+
+
 def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run):
     """Downloads remote packages to a download directory
 
@@ -215,8 +225,9 @@ def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run):
             if not dry_run:
                 logger.debug('[checking: %d/%d] %s', k, total, url)
                 r = requests.get(url, stream=True, allow_redirects=True)
+                size = r.headers.get('Content-length', '??')
                 logger.info('[download: %d/%d] %s -> %s (%s bytes)', k,
-                        total, url, temp_dest, r.headers['Content-length'])
+                        total, url, temp_dest, size)
                 open(temp_dest, 'wb').write(r.raw.read())
 
             # verify that checksum matches
@@ -279,3 +290,46 @@ def remove_packages(packages, dest_dir, arch, dry_run):
         logger.info('[remove: %d/%d] %s', k, total, path)
         if not dry_run:
             os.unlink(path)
+
+
+def _cleanup_json(data, packages):
+    """Cleans up the contents of a conda JSON file based on existing packages"""
+
+    # only keys to clean-up here, other keys remain unchanged
+    for key in ('packages', 'packages.conda'):
+        if key not in data: continue
+        data[key] = dict((k,v) for k,v in data[key].items() if k in packages)
+
+    return data
+
+
+def _save_json(data, dest_dir, arch, name):
+    """Saves contents of conda JSON"""
+
+    destfile = os.path.join(dest_dir, arch, name)
+    with open(destfile, 'w') as outfile:
+        json.dump(data, outfile, ensure_ascii=True, indent=2)
+    return destfile
+
+
+def copy_and_clean_json(url, dest_dir, arch, name):
+    """Copies and cleans conda JSON file"""
+
+    data = get_json(url, arch, name)
+    packages = get_local_contents(dest_dir, arch)
+    data = _cleanup_json(data, packages)
+    return _save_json(data, dest_dir, arch, name)
+
+
+def copy_and_clean_patch(url, dest_dir, arch, name):
+    """Copies and cleans the conda patch_instructions JSON file"""
+
+    data = get_json(url, arch, name)
+    packages = get_local_contents(dest_dir, arch)
+    data = _cleanup_json(data, packages)
+
+    # clean-up specific patch_instructions.json fields
+    for key in ["remove", "revoke"]:
+        data[key] = [k for k in data[key] if k in packages]
+
+    return _save_json(data, dest_dir, arch, name)
diff --git a/bob/devtools/scripts/mirror.py b/bob/devtools/scripts/mirror.py
index ec32e2f9..96866813 100644
--- a/bob/devtools/scripts/mirror.py
+++ b/bob/devtools/scripts/mirror.py
@@ -14,8 +14,10 @@ from ..mirror import (
     get_local_contents,
     load_glob_list,
     blacklist_filter,
+    whitelist_filter,
     download_packages,
     remove_packages,
+    copy_and_clean_patch,
 )
 from ..log import verbosity_option, get_logger, echo_info, echo_warning
 
@@ -51,6 +53,15 @@ Examples:
     help="A file containing a list of globs to exclude from local " \
             "mirroring, one per line",
 )
+@click.option(
+    "-w",
+    "--whitelist",
+    type=click.Path(exists=True, dir_okay=False, file_okay=True,
+        readable=True, resolve_path=True),
+    help="A file containing a list of globs to include in local " \
+            "mirroring, one per line. This is considered *after* " \
+            "the blacklisting. It is here just for testing purposes",
+)
 @click.option(
     "-m",
     "--check-md5/--no-check-md5",
@@ -72,15 +83,25 @@ Examples:
         readable=True, writable=True, resolve_path=True),
     help="A directory where to store temporary files",
 )
+@click.option(
+    "-p",
+    "--patch/--no-patch",
+    default=False,
+    help="If set, then consider we are mirroring the defaults channel "
+    "where a patch_instructions.json exists and must be downloaded and "
+    "pruned so the mirror works adequately",
+)
 @verbosity_option()
 @bdt.raise_on_error
 def mirror(
         channel_url,
         dest_dir,
         blacklist,
+        whitelist,
         check_md5,
         dry_run,
         tmpdir,
+        patch,
 ):
     """Mirrors a conda channel to a particular local destination
 
@@ -133,6 +154,10 @@ def mirror(
     to_download = blacklist_filter(remote_packages - local_packages,
             globs_to_remove)
 
+    if whitelist is not None and os.path.exists(whitelist):
+        globs_to_consider = set(load_glob_list(whitelist))
+        to_download = whitelist_filter(to_download, globs_to_consider)
+
     # in the local packages, subset those that we no longer need, be it
     # because they have been removed from the remote repository, or because
     # we decided to blacklist them.
@@ -157,6 +182,16 @@ def mirror(
         echo_info("Mirror at %s/%s is up-to-date w.r.t. blacklist. " \
                 "No packages to be removed." % (dest_dir, arch))
 
+    if patch:
+        # download/clean-up patch instructions, otherwise conda installations
+        # may misbehave. Do this before the indexing, which will use that
+        # file to do its magic.
+        patch_file = 'patch_instructions.json'
+        name = copy_and_clean_patch(channel_url, dest_dir, arch,
+                patch_file)
+        echo_info("Cleaned copy of %s/%s/%s installed at %s" %
+                (channel_url, arch, patch_file, name))
+
     # re-indexes the channel to produce a conda-compatible setup
     echo_info("Re-indexing %s..." % dest_dir)
     if not dry_run:
-- 
GitLab
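
Note (reviewer sketch, not part of the patch): the snippet below illustrates
how the two glob filters compose in the mirror command above: blacklisting is
applied to the remote package list first, and whitelisting is then applied to
the result. The filter functions are copied from bob/devtools/mirror.py as
changed by this patch; the package file names are made up for illustration:

    import fnmatch

    def blacklist_filter(packages, globs):
        # filters *out* any package matching one of the globs
        to_remove = set()
        for k in globs:
            to_remove |= set(fnmatch.filter(packages, k))
        return packages - to_remove

    def whitelist_filter(packages, globs):
        # filters *in* only the packages matching one of the globs
        to_keep = set()
        for k in globs:
            to_keep |= set(fnmatch.filter(packages, k))
        return to_keep

    remote = {
        'bob-1.0-py36_0.tar.bz2',
        'bob-2.0-py36_0.tar.bz2',
        'nose-1.3.7-py36_0.tar.bz2',
    }
    step1 = blacklist_filter(remote, ['nose-*'])  # drops the nose package
    step2 = whitelist_filter(step1, ['bob-2*'])   # keeps only bob 2.0
    assert step2 == {'bob-2.0-py36_0.tar.bz2'}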