From 0d05b38c9656658748904b3b62e19258e0d0118b Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Thu, 24 Oct 2019 13:34:41 +0200
Subject: [PATCH] [mirror] Implement thorough optional checksum in existing
 mirrored packages (closes #42)

---
 bob/devtools/mirror.py         | 98 +++++++++++++++++++++++++++-------
 bob/devtools/scripts/mirror.py | 18 +++++++
 2 files changed, 96 insertions(+), 20 deletions(-)

diff --git a/bob/devtools/mirror.py b/bob/devtools/mirror.py
index 6493e90c..778ea60a 100644
--- a/bob/devtools/mirror.py
+++ b/bob/devtools/mirror.py
@@ -151,6 +151,30 @@ def whitelist_filter(packages, globs):
     return to_keep
 
 
+def _sha256sum(filename):
+    """Returns the hex-encoded SHA-256 digest of the file ``filename``"""
+
+    digest = hashlib.sha256()
+    view = memoryview(bytearray(128*1024))  # 128 KiB buffer, reused per read
+    with open(filename, 'rb', buffering=0) as stream:
+        # readinto() reports 0 bytes at end-of-file, which ends the iteration
+        for count in iter(lambda: stream.readinto(view), 0):
+            digest.update(view[:count])
+    return digest.hexdigest()
+
+
+def _md5sum(filename):
+    """Returns the hex-encoded MD5 digest of the file ``filename``"""
+
+    digest = hashlib.md5()
+    view = memoryview(bytearray(128*1024))  # 128 KiB buffer, reused per read
+    with open(filename, 'rb', buffering=0) as stream:
+        # readinto() reports 0 bytes at end-of-file, which ends the iteration
+        for count in iter(lambda: stream.readinto(view), 0):
+            digest.update(view[:count])
+    return digest.hexdigest()
+
+
 def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run):
     """Downloads remote packages to a download directory
 
@@ -177,26 +201,6 @@ def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run):
 
     """
 
-    def _sha256sum(filename):
-        h  = hashlib.sha256()
-        b  = bytearray(128*1024)
-        mv = memoryview(b)
-        with open(filename, 'rb', buffering=0) as f:
-            for n in iter(lambda : f.readinto(mv), 0):
-                h.update(mv[:n])
-        return h.hexdigest()
-
-
-    def _md5sum(filename):
-        h  = hashlib.md5()
-        b  = bytearray(128*1024)
-        mv = memoryview(b)
-        with open(filename, 'rb', buffering=0) as f:
-            for n in iter(lambda : f.readinto(mv), 0):
-                h.update(mv[:n])
-        return h.hexdigest()
-
-
     # download files into temporary directory, that is removed by the end of
     # the procedure, or if something bad occurs
     with tempfile.TemporaryDirectory() as download_dir:
@@ -319,3 +323,57 @@ def copy_and_clean_json(url, dest_dir, arch, name):
     packages = get_local_contents(dest_dir, arch)
     data = _cleanup_json(data, packages)
     return _save_json(data, dest_dir, arch, name)
+
+
+def checksum(repodata, basepath, packages):
+    """Checksums packages on the local mirror and compares to remote repository
+
+    Parameters
+    ----------
+    repodata : dict
+        Data loaded from `repodata.json` on the remote repository
+    basepath : str
+        Path leading to the packages in the package list
+    packages : list
+        List of packages that are available locally, by name
+
+    Returns
+    -------
+    issues : list
+        Names of the packages whose local checksum differs from ``repodata``
+
+    Raises
+    ------
+    KeyError
+        If a package has no entry (or no checksum) in ``repodata``
+    """
+
+    issues = []
+    total = len(packages)
+    # count from 1 so that progress reads "1/N" up to "N/N" in the logs
+    for k, p in enumerate(packages, start=1):
+        path_to_package = os.path.join(basepath, p)
+
+        # '.tar.bz2' names are under "packages", others under "packages.conda"
+        section = 'packages' if p.endswith('.tar.bz2') else 'packages.conda'
+        info = repodata[section][p]
+
+        # md5 is only looked up when needed: sha256-only entries must not fail
+        expected_hash = info['sha256'] if 'sha256' in info else info['md5']
+
+        # a 32-character digest is md5, anything else is handled as sha256
+        if len(expected_hash) == 32:
+            name, hasher = 'md5', _md5sum
+        else:
+            name, hasher = 'sha256', _sha256sum
+        logger.debug('[verify: %d/%d] %s(%s) == %s?', k, total, name,
+                path_to_package, expected_hash)
+        actual_hash = hasher(path_to_package)
+
+        if actual_hash != expected_hash:
+            logger.warning('Checksum of %s does not match remote ' \
+                    'repository description (actual:%r != %r:expected)',
+                    path_to_package, actual_hash, expected_hash)
+            issues.append(p)
+
+    return issues
diff --git a/bob/devtools/scripts/mirror.py b/bob/devtools/scripts/mirror.py
index 054eda8a..cdca9a5c 100644
--- a/bob/devtools/scripts/mirror.py
+++ b/bob/devtools/scripts/mirror.py
@@ -18,6 +18,7 @@ from ..mirror import (
         download_packages,
         remove_packages,
         copy_and_clean_json,
+        checksum,
         )
 from ..log import verbosity_option, get_logger, echo_info, echo_warning
 
@@ -91,6 +92,15 @@ Examples:
     "where a patch_instructions.json exists and must be downloaded and "
     "prunned so the mirror works adequately",
 )
+@click.option(
+    "-c",
+    "--checksum/--no-checksum",
+    default=False,
+    help="If set, then packages that are supposed to be kept locally "
+    "will be checksummed against the remote repository repodata.json "
+    "expections.  Errors will be reported and packages will be "
+    "removed from the local repository",
+)
 @verbosity_option()
 @bdt.raise_on_error
 def mirror(
@@ -166,6 +176,14 @@ def mirror(
         to_delete_locally = (local_packages - to_keep) | disappeared_remotely
 
         # execute the transaction
+        if checksum:
+            # double-check if, among packages I should keep, everything looks
+            # alright with respect to expected checksums from the remote repo
+            issues = checksum(remote_repodata, os.path.join(dest_dir, arch),
+                    to_keep)
+            remove_packages(issues, dest_dir, arch, dry_run)
+            to_download |= issues
+
         if to_download:
             download_packages(to_download, remote_repodata, channel_url, dest_dir,
                     arch, dry_run)
-- 
GitLab