Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bob
bob.devtools
Commits
4bb0e089
Commit
4bb0e089
authored
Oct 21, 2019
by
André Anjos
💬
Browse files
[mirror] Own implementation of conda channel mirroring
parent
bd288e10
Pipeline
#34578
passed with stage
in 5 minutes and 15 seconds
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
bob/devtools/mirror.py
0 → 100644
View file @
4bb0e089
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
'''Mirroring functionality for conda channels
Some constructs are bluntly copied from
https://github.com/valassis-digital-media/conda-mirror
'''
import
os
import
bz2
import
json
import
hashlib
import
fnmatch
import
tempfile
import
requests
from
.log
import
get_logger
logger
=
get_logger
(
__name__
)
def _download(url, target_directory):
    """Download `url` to `target_directory`

    Parameters
    ----------
    url : str
        The url to download
    target_directory : str
        The path to a directory where `url` should be downloaded

    Returns
    -------
    file_size: int
        The size in bytes of the file that was downloaded

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code
    """

    chunk_size = 1024  # 1KB chunks

    logger.info("Download %s -> %s", url, target_directory)

    # the local file is named after the last component of the URL path
    target_filename = url.split('/')[-1]
    download_filename = os.path.join(target_directory, target_filename)

    with open(download_filename, 'w+b') as tf:
        ret = requests.get(url, stream=True)
        # fail early on HTTP errors instead of silently saving an error
        # page as if it were the requested file
        ret.raise_for_status()
        logger.debug('Saving to %s (%s bytes)', download_filename,
                ret.headers['Content-length'])
        for data in ret.iter_content(chunk_size):
            tf.write(data)

    return os.path.getsize(download_filename)
def _list_conda_packages(local_dir):
    """List the conda packages (*.tar.bz2 or *.conda files) in `local_dir`

    Parameters
    ----------
    local_dir : str
        Some local directory with (hopefully) some conda packages in it

    Returns
    -------
    list
        List of conda packages in `local_dir`
    """

    entries = os.listdir(local_dir)
    # collect *.conda entries first, then *.tar.bz2, matching the
    # historical ordering of this function's return value
    packages = []
    for pattern in ("*.conda", "*.tar.bz2"):
        packages.extend(fnmatch.filter(entries, pattern))
    return packages
def get_json(channel, platform, name):
    """Get a JSON file for a channel/platform combo on conda channel

    Parameters
    ----------
    channel : str
        Complete channel URL
    platform : {'linux-64', 'osx-64', 'noarch'}
        The platform of interest
    name : str
        The name of the file to retrieve.  If the name ends in '.bz2', then it
        is auto-decompressed

    Returns
    -------
    repodata : dict
        contents of repodata.json
    """

    url = channel + '/' + platform + '/' + name
    logger.debug('[checking] %s...', url)
    r = requests.get(url, allow_redirects=True, stream=True)
    logger.info('[download] %s (%s bytes)...', url,
            r.headers['Content-length'])

    # just in case transport encoding was applied
    r.raw.decode_content = True

    if name.endswith('.bz2'):
        data = bz2.decompress(r.raw.read())
    else:
        # fix: ``requests.Response`` has no ``read()`` method — the
        # original ``r.read()`` raised AttributeError; read the payload
        # from the underlying raw stream instead
        data = r.raw.read()

    return json.loads(data)
def get_local_contents(path, arch):
    """Returns the local package contents as a set"""

    path_arch = os.path.join(path, arch)
    if not os.path.exists(path_arch):
        return set()

    # path exists, lists currently available packages
    logger.info('Listing package contents of %s...', path_arch)
    available = os.listdir(path_arch)
    packages = set()
    for pattern in ('*.tar.bz2', '*.conda'):
        packages.update(fnmatch.filter(available, pattern))
    return packages
def load_glob_list(path):
    """Loads a list of globs from a configuration file

    Excludes comments and empty lines

    Parameters
    ----------
    path : str
        Path to a text file with one glob per line; lines starting with
        ``#`` or ``-`` and empty lines are skipped

    Returns
    -------
    list
        The globs found in the file, stripped of surrounding whitespace
    """

    # fix: the original left the file handle open; use a context manager
    # so it is always closed (the redundant str() wrapper is also gone)
    with open(path, "rt") as f:
        stripped = [k.strip() for k in f]
    return [k for k in stripped if k and k[0] not in ("#", "-")]
def blacklist_filter(packages, globs):
    """Filters **out** the input package set with the glob list"""

    excluded = set()
    for pattern in globs:
        excluded.update(fnmatch.filter(packages, pattern))
    return packages - excluded
def download_packages(packages, repodata, channel_url, dest_dir, arch,
        dry_run):
    """Downloads remote packages to a download directory

    Packages are downloaded first to a temporary directory, then validated
    according to the expected sha256/md5 sum and then moved, one by one, to
    the destination directory.  An error is raised if the package cannot be
    correctly downloaded.

    Parameters
    ----------
    packages : list of str
        List of packages to download from the remote channel
    repodata: dict
        A dictionary containing the remote repodata.json contents
    channel_url: str
        The complete channel URL
    dest_dir: str
        The local directory where the channel is being mirrored
    arch: str
        The current architecture which we are mirroring
    dry_run: bool
        A boolean flag indicating if this is just a dry-run (simulation),
        flagging so we don't really do anything (set to ``True``).
    """

    def _hash_file(filename, hasher):
        # streams ``filename`` through ``hasher`` in 128KB chunks and
        # returns the hexadecimal digest (avoids loading the whole
        # package in memory)
        buf = bytearray(128 * 1024)
        view = memoryview(buf)
        with open(filename, 'rb', buffering=0) as f:
            for n in iter(lambda: f.readinto(view), 0):
                hasher.update(view[:n])
        return hasher.hexdigest()

    # download files into temporary directory, that is removed by the end of
    # the procedure, or if something bad occurs
    with tempfile.TemporaryDirectory() as download_dir:

        total = len(packages)
        for k, p in enumerate(packages):

            k += 1  # adjust to produce correct order on printouts

            # checksum to verify: prefer sha256 and fall back to md5;
            # fix: the ``.conda`` branch previously looked up the md5
            # fall-back in repodata['packages'] instead of
            # repodata['packages.conda'], raising KeyError for any
            # .conda package without a sha256 entry
            if p.endswith('.tar.bz2'):
                index = repodata['packages']
            else:
                index = repodata['packages.conda']
            expected_hash = index[p].get('sha256', index[p]['md5'])

            # download package to file in our temporary directory
            url = channel_url + '/' + arch + '/' + p
            temp_dest = os.path.join(download_dir, p)
            logger.info('[download: %d/%d] %s -> %s', k, total, url,
                    temp_dest)

            if not dry_run:
                logger.debug('[checking: %d/%d] %s', k, total, url)
                r = requests.get(url, stream=True, allow_redirects=True)
                logger.info('[download: %d/%d] %s -> %s (%s bytes)', k,
                        total, url, temp_dest, r.headers['Content-length'])
                # fix: close the output file deterministically instead of
                # relying on garbage collection of the anonymous handle
                with open(temp_dest, 'wb') as f:
                    f.write(r.raw.read())

            # verify that checksum matches; a 32-character digest is md5,
            # anything else is treated as sha256
            if len(expected_hash) == 32:  # md5
                logger.info('[verify: %d/%d] md5(%s) == %s?', k, total,
                        temp_dest, expected_hash)
            else:  # sha256
                logger.info('[verify: %d/%d] sha256(%s) == %s?', k, total,
                        temp_dest, expected_hash)

            if not dry_run:
                if len(expected_hash) == 32:  # md5
                    actual_hash = _hash_file(temp_dest, hashlib.md5())
                else:  # sha256
                    actual_hash = _hash_file(temp_dest, hashlib.sha256())
                # fix: use an explicit raise rather than ``assert``, which
                # is silently stripped when running under ``python -O``
                if actual_hash != expected_hash:
                    raise RuntimeError('Checksum of locally downloaded '
                            'version of %s does not match '
                            '(actual:%r != %r:expected)' % (url,
                                actual_hash, expected_hash))

            # move
            local_dest = os.path.join(dest_dir, arch, p)
            logger.info('[move: %d/%d] %s -> %s', k, total, temp_dest,
                    local_dest)

            # check local directory is available before moving
            dirname = os.path.dirname(local_dest)
            if not os.path.exists(dirname):
                logger.info('[mkdir] %s', dirname)
                if not dry_run:
                    os.makedirs(dirname)

            if not dry_run:
                os.rename(temp_dest, local_dest)
def remove_packages(packages, dest_dir, arch, dry_run):
    """Removes local packages that no longer matter"""

    total = len(packages)
    # counting from 1 produces correct "n/total" printouts
    for index, name in enumerate(packages, start=1):
        target = os.path.join(dest_dir, arch, name)
        logger.info('[remove: %d/%d] %s', index, total, target)
        if not dry_run:
            os.unlink(target)
bob/devtools/scripts/mirror.py
0 → 100644
View file @
4bb0e089
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
import
os
import
click
import
conda_build.api
from
.
import
bdt
from
..mirror
import
(
get_json
,
get_local_contents
,
load_glob_list
,
blacklist_filter
,
download_packages
,
remove_packages
,
)
from
..log
import
verbosity_option
,
get_logger
,
echo_info
,
echo_warning
logger
=
get_logger
(
__name__
)
@click.command(
    epilog="""
Examples:

  1. Mirrors a conda channel:

\b
     $ bdt mirror -vv https://www.idiap.ch/software/bob/label/beta
"""
)
@click.argument(
    "channel-url",
    required=True,
)
@click.argument(
    "dest-dir",
    type=click.Path(
        exists=False,
        dir_okay=True,
        file_okay=False,
        writable=True,
        readable=True,
        resolve_path=True,
    ),
    required=True,
)
@click.option(
    "-b",
    "--blacklist",
    type=click.Path(
        exists=True,
        dir_okay=False,
        file_okay=True,
        readable=True,
        resolve_path=True,
    ),
    help="A file containing a list of globs to exclude from local "
    "mirroring, one per line",
)
@click.option(
    "-m",
    "--check-md5/--no-check-md5",
    default=False,
    help="If set, then check MD5 sums of all packages during conda-index",
)
@click.option(
    "-d",
    "--dry-run/--no-dry-run",
    default=False,
    help="Only goes through the actions, but does not execute them "
    "(combine with the verbosity flags - e.g. ``-vvv``) to enable "
    "printing to help you understand what will be done",
)
@verbosity_option()
@bdt.raise_on_error
def mirror(
    channel_url,
    dest_dir,
    blacklist,
    check_md5,
    dry_run,
):
    """Mirrors a conda channel to a particular local destination

    This command is capable of completely mirroring a valid conda channel,
    excluding packages that you may not be interested on via globs.  It works
    to minimize channel usage by first downloading the channel repository data
    (in compressed format), analysing what is available locally and what is
    available on the channel, and only downloading the missing files.
    """

    # if we are in a dry-run mode, let's let it be known
    if dry_run:
        # fix: Logger.warn is a deprecated alias of Logger.warning
        logger.warning("!!!! DRY RUN MODE !!!!")
        logger.warning("Nothing will be really mirrored")

    DEFAULT_SUBDIRS = ['noarch', 'linux-64', 'osx-64']

    noarch = os.path.join(dest_dir, 'noarch')
    if not os.path.exists(noarch):  # first time
        # calls conda index to create basic infrastructure
        logger.info("Creating conda channel at %s...", dest_dir)
        if not dry_run:
            conda_build.api.update_index([dest_dir],
                    subdir=DEFAULT_SUBDIRS, progress=False)

    # fix: the blacklist file is loop-invariant — load it once instead of
    # re-reading it for every architecture
    if blacklist is not None and os.path.exists(blacklist):
        globs_to_remove = set(load_glob_list(blacklist))
    else:
        globs_to_remove = set()

    for arch in DEFAULT_SUBDIRS:

        remote_repodata = get_json(channel_url, arch, 'repodata.json.bz2')
        logger.info('%d packages available in remote index',
                len(remote_repodata.get('packages', {})))
        local_packages = get_local_contents(dest_dir, arch)
        logger.info('%d packages available in local mirror',
                len(local_packages))

        remote_packages = set(
                list(remote_repodata.get('packages', {}).keys()) +
                list(remote_repodata.get('packages.conda', {}).keys()))

        # in the remote packages, subset those that need to be downloaded
        # according to our own interest
        to_download = blacklist_filter(remote_packages - local_packages,
                globs_to_remove)

        # in the local packages, subset those that we no longer need, be it
        # because they have been removed from the remote repository, or
        # because we decided to blacklist them.
        disappeared_remotely = local_packages - remote_packages
        to_keep = blacklist_filter(local_packages, globs_to_remove)
        to_delete_locally = (local_packages - to_keep) | disappeared_remotely

        # execute the transaction
        if to_download:
            download_packages(to_download, remote_repodata, channel_url,
                    dest_dir, arch, dry_run)
        else:
            echo_info("Mirror at %s/%s is up-to-date w.r.t. %s/%s. "
                    "No packages to download." % (dest_dir, arch,
                        channel_url, arch))

        if to_delete_locally:
            echo_warning("%d packages will be removed at %s/%s" %
                    (len(to_delete_locally), dest_dir, arch))
            remove_packages(to_delete_locally, dest_dir, arch, dry_run)
        else:
            echo_info("Mirror at %s/%s is up-to-date w.r.t. blacklist. "
                    "No packages to be removed." % (dest_dir, arch))

    # re-indexes the channel to produce a conda-compatible setup
    echo_info("Re-indexing %s..." % dest_dir)
    if not dry_run:
        conda_build.api.update_index([dest_dir], check_md5=check_md5,
                progress=True)
setup.py
View file @
4bb0e089
...
...
@@ -48,6 +48,7 @@ setup(
'dumpsphinx = bob.devtools.scripts.dumpsphinx:dumpsphinx'
,
'create = bob.devtools.scripts.create:create'
,
'build = bob.devtools.scripts.build:build'
,
'mirror = bob.devtools.scripts.mirror:mirror'
,
'rebuild = bob.devtools.scripts.rebuild:rebuild'
,
'test = bob.devtools.scripts.test:test'
,
'caupdate = bob.devtools.scripts.caupdate:caupdate'
,
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment