From 0d05b38c9656658748904b3b62e19258e0d0118b Mon Sep 17 00:00:00 2001 From: Andre Anjos <andre.dos.anjos@gmail.com> Date: Thu, 24 Oct 2019 13:34:41 +0200 Subject: [PATCH] [mirror] Implement thorough optional checksum in existing mirrored packages (closes #42) --- bob/devtools/mirror.py | 98 +++++++++++++++++++++++++++------- bob/devtools/scripts/mirror.py | 18 +++++++ 2 files changed, 96 insertions(+), 20 deletions(-) diff --git a/bob/devtools/mirror.py b/bob/devtools/mirror.py index 6493e90c..778ea60a 100644 --- a/bob/devtools/mirror.py +++ b/bob/devtools/mirror.py @@ -151,6 +151,30 @@ def whitelist_filter(packages, globs): return to_keep +def _sha256sum(filename): + """Calculates and returns the sha-256 sum given a file name""" + + h = hashlib.sha256() + b = bytearray(128*1024) + mv = memoryview(b) + with open(filename, 'rb', buffering=0) as f: + for n in iter(lambda : f.readinto(mv), 0): + h.update(mv[:n]) + return h.hexdigest() + + +def _md5sum(filename): + """Calculates and returns the md5 sum given a file name""" + + h = hashlib.md5() + b = bytearray(128*1024) + mv = memoryview(b) + with open(filename, 'rb', buffering=0) as f: + for n in iter(lambda : f.readinto(mv), 0): + h.update(mv[:n]) + return h.hexdigest() + + def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run): """Downloads remote packages to a download directory @@ -177,26 +201,6 @@ def download_packages(packages, repodata, channel_url, dest_dir, arch, dry_run): """ - def _sha256sum(filename): - h = hashlib.sha256() - b = bytearray(128*1024) - mv = memoryview(b) - with open(filename, 'rb', buffering=0) as f: - for n in iter(lambda : f.readinto(mv), 0): - h.update(mv[:n]) - return h.hexdigest() - - - def _md5sum(filename): - h = hashlib.md5() - b = bytearray(128*1024) - mv = memoryview(b) - with open(filename, 'rb', buffering=0) as f: - for n in iter(lambda : f.readinto(mv), 0): - h.update(mv[:n]) - return h.hexdigest() - - # download files into temporary directory, that is 
removed by the end of # the procedure, or if something bad occurs with tempfile.TemporaryDirectory() as download_dir: @@ -319,3 +323,57 @@ def copy_and_clean_json(url, dest_dir, arch, name): packages = get_local_contents(dest_dir, arch) data = _cleanup_json(data, packages) return _save_json(data, dest_dir, arch, name) + + +def checksum(repodata, basepath, packages): + """Checksums packages on the local mirror and compare to remote repository + + Parameters + ---------- + repodata : dict + Data loaded from `repodata.json` on the remote repository + basepath : str + Path leading to the packages in the package list + packages : list + List of packages that are available locally, by name + + Returns + ------- + issues : list + List of matching errors + """ + + issues = [] + total = len(packages) + for k, p in enumerate(packages): + + path_to_package = os.path.join(basepath, p) + + # checksum to verify + if p.endswith('.tar.bz2'): + expected_hash = repodata['packages'][p].get('sha256', + repodata['packages'][p]['md5']) + else: + expected_hash = repodata['packages.conda'][p].get('sha256', + repodata['packages.conda'][p]['md5']) + + # verify that checksum matches + if len(expected_hash) == 32: #md5 + logger.debug('[verify: %d/%d] md5(%s) == %s?', k, total, + path_to_package, expected_hash) + else: #sha256 + logger.debug('[verify: %d/%d] sha256(%s) == %s?', k, total, + path_to_package, expected_hash) + + if len(expected_hash) == 32: #md5 + actual_hash = _md5sum(path_to_package) + else: #sha256 + actual_hash = _sha256sum(path_to_package) + + if actual_hash != expected_hash: + logger.warning('Checksum of %s does not match remote ' \ + 'repository description (actual:%r != %r:expected)', + path_to_package, actual_hash, expected_hash) + issues.append(p) + + return issues diff --git a/bob/devtools/scripts/mirror.py b/bob/devtools/scripts/mirror.py index 054eda8a..cdca9a5c 100644 --- a/bob/devtools/scripts/mirror.py +++ b/bob/devtools/scripts/mirror.py @@ -18,6 +18,7 @@ from 
..mirror import ( download_packages, remove_packages, copy_and_clean_json, + checksum, ) from ..log import verbosity_option, get_logger, echo_info, echo_warning @@ -91,6 +92,15 @@ Examples: "where a patch_instructions.json exists and must be downloaded and " "prunned so the mirror works adequately", ) +@click.option( + "-c", + "--checksum/--no-checksum", + default=False, + help="If set, then packages that are supposed to be kept locally " + "will be checksummed against the remote repository repodata.json " + "expectations. Errors will be reported and packages will be " + "removed from the local repository", +) @verbosity_option() @bdt.raise_on_error def mirror( @@ -166,6 +176,14 @@ def mirror( to_delete_locally = (local_packages - to_keep) | disappeared_remotely # execute the transaction + if checksum: + # double-check if, among packages I should keep, everything looks + # alright with respect to expected checksums from the remote repo + issues = checksum(remote_repodata, os.path.join(dest_dir, arch), + to_keep) + remove_packages(issues, dest_dir, arch, dry_run) + to_download |= issues + if to_download: download_packages(to_download, remote_repodata, channel_url, dest_dir, arch, dry_run) -- GitLab