From 3a065d15a83b9d154675163ac02c1a6c8f1363de Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Wed, 23 Oct 2019 19:48:59 +0200
Subject: [PATCH] [mirror] Implement patch and whitelist support
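
Mirroring the "defaults" channel requires downloading its
patch_instructions.json and pruning it so it only references packages
that exist in the local mirror.  This patch adds a --patch option doing
exactly that, plus glob-based whitelist support (applied after
blacklisting, mostly for testing purposes), and makes downloads
tolerate servers that do not send a Content-length header.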

---
 bob/devtools/mirror.py         | 68 ++++++++++++++++++++++++++++++----
 bob/devtools/scripts/mirror.py | 35 +++++++++++++++++
 2 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/bob/devtools/mirror.py b/bob/devtools/mirror.py
index 5aa6d526..fc826c29 100644
--- a/bob/devtools/mirror.py
+++ b/bob/devtools/mirror.py
@@ -48,8 +48,8 @@ def _download(url, target_directory):
     download_filename = os.path.join(target_directory, target_filename)
     with open(download_filename, 'w+b') as tf:
         ret = requests.get(url, stream=True)
-        logger.debug('Saving to %s (%s bytes)', download_filename,
-                ret.headers['Content-length'])
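+        # NB: some servers do not send a Content-length header (e.g. with
+        # chunked transfer encoding), so fall back to a placeholder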
+        size = ret.headers.get('Content-length', '??')
+        logger.debug('Saving to %s (%s bytes)', download_filename, size)
         for data in ret.iter_content(chunk_size):
             tf.write(data)
         file_size = os.path.getsize(download_filename)
@@ -96,16 +96,17 @@ def get_json(channel, platform, name):
     url = channel + '/' + platform + '/' + name
     logger.debug('[checking] %s...', url)
     r = requests.get(url, allow_redirects=True, stream=True)
-    logger.info('[download] %s (%s bytes)...', url, r.headers['Content-length'])
+    size = r.headers.get('Content-length', '??')
+    logger.info('[download] %s (%s bytes)...', url, size)
 
     if name.endswith('.bz2'):
         # just in case transport encoding was applied
         r.raw.decode_content = True
         data = bz2.decompress(r.raw.read())
-    else:
-        data = r.read()
+        return json.loads(data)
 
-    return json.loads(data)
+    # otherwise, just decode the response as JSON
+    return r.json()
 
 
 def get_local_contents(path, arch):
@@ -141,6 +142,15 @@ def blacklist_filter(packages, globs):
     return packages - to_remove
 
 
+def whitelist_filter(packages, globs):
+    """Filters **in** the input package set with the glob list"""
+
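+    # NB: fnmatch uses shell-style wildcards; e.g. a (hypothetical) glob
+    # 'bob-devel-*' keeps 'bob-devel-2019.10.23-py36_0.tar.bz2' and drops
+    # any package filename matching none of the globs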
+    to_keep = set()
+    for k in globs:
+        to_keep |= set(fnmatch.filter(packages, k))
+    return to_keep
+
+
 def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run):
     """Downloads remote packages to a download directory
 
@@ -215,8 +225,9 @@ def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run):
                 if not dry_run:
                     logger.debug('[checking: %d/%d] %s', k, total, url)
                     r = requests.get(url, stream=True, allow_redirects=True)
+                    size = r.headers.get('Content-length', '??')
                     logger.info('[download: %d/%d] %s -> %s (%s bytes)', k,
-                            total, url, temp_dest, r.headers['Content-length'])
+                            total, url, temp_dest, size)
                     open(temp_dest, 'wb').write(r.raw.read())
 
                 # verify that checksum matches
@@ -279,3 +290,46 @@ def remove_packages(packages, dest_dir, arch, dry_run):
         logger.info('[remove: %d/%d] %s', k, total, path)
         if not dry_run:
             os.unlink(path)
+
+
+def _cleanup_json(data, packages):
+    """Cleans-up the contents of conda JSON looking at existing packages"""
+
+    # only these keys are cleaned up here; other keys remain unchanged
+    for key in ('packages', 'packages.conda'):
+        if key not in data: continue
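+        # keep only entries referring to packages that actually exist in
+        # the local mirror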
+        data[key] = {k: v for k, v in data[key].items() if k in packages}
+
+    return data
+
+
+def _save_json(data, dest_dir, arch, name):
+    """Saves contents of conda JSON"""
+
+    destfile = os.path.join(dest_dir, arch, name)
+    with open(destfile, 'w') as outfile:
+        json.dump(data, outfile, ensure_ascii=True, indent=2)
+    return destfile
+
+
+def copy_and_clean_json(url, dest_dir, arch, name):
+    """Copies and cleans conda JSON file"""
+
+    data = get_json(url, arch, name)
+    packages = get_local_contents(dest_dir, arch)
+    data = _cleanup_json(data, packages)
+    return _save_json(data, dest_dir, arch, name)
+
+
+def copy_and_clean_patch(url, dest_dir, arch, name):
+    """Copies and cleans conda JSON file"""
+
+    data = get_json(url, arch, name)
+    packages = get_local_contents(dest_dir, arch)
+    data = _cleanup_json(data, packages)
+
+    # clean up fields specific to patch_instructions.json
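+    # (the "remove" and "revoke" lists reference package filenames that
+    # may not exist in the pruned local mirror)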
+    for key in ["remove", "revoke"]:
+        data[key] = [k for k in data[key] if k in packages]
+
+    return _save_json(data, dest_dir, arch, name)
diff --git a/bob/devtools/scripts/mirror.py b/bob/devtools/scripts/mirror.py
index ec32e2f9..96866813 100644
--- a/bob/devtools/scripts/mirror.py
+++ b/bob/devtools/scripts/mirror.py
@@ -14,8 +14,10 @@ from ..mirror import (
         get_local_contents,
         load_glob_list,
         blacklist_filter,
+        whitelist_filter,
         download_packages,
         remove_packages,
+        copy_and_clean_patch,
         )
 from ..log import verbosity_option, get_logger, echo_info, echo_warning
 
@@ -51,6 +53,15 @@ Examples:
     help="A file containing a list of globs to exclude from local " \
             "mirroring, one per line",
 )
+@click.option(
+    "-w",
+    "--whitelist",
+    type=click.Path(exists=True, dir_okay=False, file_okay=True,
+        readable=True, resolve_path=True),
+    help="A file containing a list of globs to include at local " \
+            "mirroring, one per line.  This is considered *after* " \
+            "the blacklisting.  It is here just for testing purposes",
+)
 @click.option(
     "-m",
     "--check-md5/--no-check-md5",
@@ -72,15 +83,25 @@ Examples:
         readable=True, writable=True, resolve_path=True),
     help="A directory where to store temporary files",
 )
+@click.option(
+    "-p",
+    "--patch/--no-patch",
+    default=False,
+    help="If set, then consider we are mirroring the defaults channel "
+    "where a patch_instructions.json exists and must be downloaded and "
+    "prunned so the mirror works adequately",
+)
 @verbosity_option()
 @bdt.raise_on_error
 def mirror(
         channel_url,
         dest_dir,
         blacklist,
+        whitelist,
         check_md5,
         dry_run,
         tmpdir,
+        patch,
         ):
     """Mirrors a conda channel to a particular local destination
 
@@ -133,6 +154,10 @@ def mirror(
         to_download = blacklist_filter(remote_packages - local_packages,
                 globs_to_remove)
 
+        if whitelist is not None and os.path.exists(whitelist):
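+            # NB: the whitelist only restricts candidate downloads; package
+            # removals below are still governed by the blacklist alone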
+            globs_to_consider = set(load_glob_list(whitelist))
+            to_download = whitelist_filter(to_download, globs_to_consider)
+
         # in the local packages, subset those that we no longer need, be it
         # because they have been removed from the remote repository, or because
         # we decided to blacklist them.
@@ -157,6 +182,16 @@ def mirror(
             echo_info("Mirror at %s/%s is up-to-date w.r.t. blacklist. " \
                     "No packages to be removed." % (dest_dir, arch))
 
+        if patch:
+            # download and clean up the patch instructions, otherwise conda
+            # installations from the mirror may misbehave.  Do this before
+            # indexing, which uses this file to patch the generated repodata.
+            patch_file = 'patch_instructions.json'
+            name = copy_and_clean_patch(channel_url, dest_dir, arch,
+                    patch_file)
+            echo_info("Cleaned copy of %s/%s/%s installed at %s" %
+                    (channel_url, arch, patch_file, name))
+
     # re-indexes the channel to produce a conda-compatible setup
     echo_info("Re-indexing %s..." % dest_dir)
     if not dry_run:
-- 
GitLab