scripts/dl_github_archive.py: rename from download.py

- Make the code more GitHub-specific
 - Requires mirror hash to work with .gitattributes
 - Use different API depending on whether PKG_SOURCE_VERSION is a
   complete commit id or other ref types like tags
 - Fix removing symbolic link
 - pre-clean dir_untar for possible leftovers from previous run

Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
This commit is contained in:
Yousong Zhou 2018-06-28 18:27:27 +08:00
parent e48ea13b3b
commit 04b9f85873
2 changed files with 130 additions and 125 deletions

View File

@ -172,25 +172,25 @@ endef
define DownloadMethod/git define DownloadMethod/git
$(call wrap_mirror,$(1),$(2), \ $(call wrap_mirror,$(1),$(2), \
$(call DownloadMethod/git-raw) \ $(call DownloadMethod/rawgit) \
) )
endef endef
define DownloadMethod/github-tarball define DownloadMethod/github_archive
$(call wrap_mirror,$(1),$(2), \ $(call wrap_mirror,$(1),$(2), \
$(SCRIPT_DIR)/download.py dl \ $(SCRIPT_DIR)/dl_github_archive.py \
--dl-dir="$(DL_DIR)" \ --dl-dir="$(DL_DIR)" \
--url $(foreach url,$(URL),"$(url)") \ --url="$(URL)" \
--proto="$(PROTO)" \
--version="$(VERSION)" \ --version="$(VERSION)" \
--subdir="$(SUBDIR)" \ --subdir="$(SUBDIR)" \
--source="$(FILE)" \ --source="$(FILE)" \
|| ( $(call DownloadMethod/git-raw) ); \ --hash="$(MIRROR_HASH)" \
|| ( $(call DownloadMethod/rawgit) ); \
) )
endef endef
# Only intends to be called as a submethod from other DownloadMethod # Only intends to be called as a submethod from other DownloadMethod
define DownloadMethod/git-raw define DownloadMethod/rawgit
echo "Checking out files from the git repository..."; \ echo "Checking out files from the git repository..."; \
mkdir -p $(TMP_DIR)/dl && \ mkdir -p $(TMP_DIR)/dl && \
cd $(TMP_DIR)/dl && \ cd $(TMP_DIR)/dl && \

View File

@ -10,6 +10,7 @@ import calendar
import datetime import datetime
import errno import errno
import fcntl import fcntl
import hashlib
import json import json
import os import os
import os.path import os.path
@ -23,26 +24,31 @@ import urllib2
TMPDIR = os.environ.get('TMP_DIR') or '/tmp' TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
TMPDIR_DL = os.path.join(TMPDIR, 'dl') TMPDIR_DL = os.path.join(TMPDIR, 'dl')
DOWNLOAD_METHODS = []
class PathException(Exception): pass class PathException(Exception): pass
class DownloadException(Exception): pass class DownloadGitHubError(Exception): pass
class Path(object): class Path(object):
"""Context class for preparing and cleaning up directories. """Context class for preparing and cleaning up directories.
If ``preclean`` is ``False``, ``path`` will NOT be removed on context enter.
If ``path`` ``isdir``, then it will be created on context enter. If ``path`` ``isdir``, then it will be created on context enter.
If ``keep`` is True, then ``path`` will NOT be removed on context exit If ``keep`` is True, then ``path`` will NOT be removed on context exit
""" """
def __init__(self, path, isdir=True, keep=False): def __init__(self, path, isdir=True, preclean=False, keep=False):
self.path = path self.path = path
self.isdir = isdir self.isdir = isdir
self.preclean = preclean
self.keep = keep self.keep = keep
def __enter__(self): def __enter__(self):
if self.preclean:
self.rm_all(self.path)
if self.isdir: if self.isdir:
self.mkdir_all(self.path) self.mkdir_all(self.path)
return self return self
@ -61,14 +67,11 @@ class Path(object):
Path._mkdir(p) Path._mkdir(p)
@staticmethod @staticmethod
def _rmdir_all(dir_): def _rmdir_dir(dir_):
names = Path._listdir(dir_) names = Path._listdir(dir_)
for name in names: for name in names:
p = os.path.join(dir_, name) p = os.path.join(dir_, name)
if os.path.isdir(p): Path.rm_all(p)
Path._rmdir_all(p)
else:
Path._remove(p)
Path._rmdir(dir_) Path._rmdir(dir_)
@staticmethod @staticmethod
@ -105,8 +108,10 @@ class Path(object):
@staticmethod @staticmethod
def rm_all(path): def rm_all(path):
"""Same as rm -r.""" """Same as rm -r."""
if os.path.isdir(path): if os.path.islink(path):
Path._rmdir_all(path) Path._remove(path)
elif os.path.isdir(path):
Path._rmdir_dir(path)
else: else:
Path._remove(path) Path._remove(path)
@ -201,60 +206,47 @@ class GitHubCommitTsCache(object):
fout.write(line) fout.write(line)
class DownloadMethod(object): class DownloadGitHubTarball(object):
"""Base class of all download method.""" """Download and repack archive tarball from GitHub.
def __init__(self, args): Compared with the method of packing after cloning the whole repo, this
self.args = args method is more friendly to users with fragile internet connection.
self.urls = args.urls
self.url = self.urls[0]
self.dl_dir = args.dl_dir
@classmethod However, there are limitations with this method
def resolve(cls, args):
"""Resolve download method to use.
return instance of subclass of DownloadMethod - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
This affects fetching commit date for reproducible tarballs. Download
through the archive link is not affected.
- GitHub archives do not contain source code for submodules.
- GitHub archives seem to respect .gitattributes and ignore paths with
export-ignore attributes.
For the first two issues, the method will fail loudly to allow fallback to
clone-then-pack method.
As for the 3rd issue, to make sure that this method only produces tarballs
identical to those of the fallback method, we require the expected hash value
to be supplied. That means the first tarball will need to be prepared by the
clone-then-pack method.
""" """
for c in DOWNLOAD_METHODS:
if c.match(args):
return c(args)
@staticmethod
def match(args):
"""Return True if it can do the download."""
return NotImplemented
def download(self):
"""Do the download and put it into the download dir."""
return NotImplemented
class DownloadMethodGitHubTarball(DownloadMethod):
"""Download and repack archive tarabll from GitHub."""
__repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)') __repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')
def __init__(self, args): def __init__(self, args):
super(DownloadMethodGitHubTarball, self).__init__(args) self.dl_dir = args.dl_dir
self._init_owner_repo()
self.version = args.version self.version = args.version
self.subdir = args.subdir self.subdir = args.subdir
self.source = args.source self.source = args.source
self.url = args.url
self._init_owner_repo()
self.xhash = args.hash
self._init_hasher()
self.commit_ts = None # lazy load commit timestamp self.commit_ts = None # lazy load commit timestamp
self.commit_ts_cache = GitHubCommitTsCache() self.commit_ts_cache = GitHubCommitTsCache()
self.name = 'github-tarball' self.name = 'github-tarball'
@staticmethod
def match(args):
"""Match if it's a GitHub clone url."""
url = args.urls[0]
proto = args.proto
if proto == 'git' and isinstance(url, basestring) \
and (url.startswith('https://github.com/') or url.startswith('git://github.com/')):
return True
return False
def download(self): def download(self):
"""Download and repack GitHub archive tarball.""" """Download and repack GitHub archive tarball."""
self._init_commit_ts() self._init_commit_ts()
@ -265,18 +257,23 @@ class DownloadMethodGitHubTarball(DownloadMethod):
self._fetch(tarball_path) self._fetch(tarball_path)
# unpack # unpack
d = os.path.join(dir_dl.path, self.subdir + '.untar') d = os.path.join(dir_dl.path, self.subdir + '.untar')
with Path(d) as dir_untar: with Path(d, preclean=True) as dir_untar:
tarball_prefix = Path.untar(tarball_path, into=dir_untar.path) tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
dir0 = os.path.join(dir_untar.path, tarball_prefix) dir0 = os.path.join(dir_untar.path, tarball_prefix)
dir1 = os.path.join(dir_untar.path, self.subdir) dir1 = os.path.join(dir_untar.path, self.subdir)
# submodules check # submodules check
if self._has_submodule(dir0): if self._has_submodule(dir0):
raise DownloadException('unable to fetch submodules\' source code') raise self._error('Fetching submodules is not yet supported')
# rename subdir # rename subdir
os.rename(dir0, dir1) os.rename(dir0, dir1)
# repack # repack
into=os.path.join(TMPDIR_DL, self.source) into=os.path.join(TMPDIR_DL, self.source)
Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts) Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
try:
self._hash_check(into)
except Exception:
Path.rm_all(into)
raise
# move to target location # move to target location
file1 = os.path.join(self.dl_dir, self.source) file1 = os.path.join(self.dl_dir, self.source)
if into != file1: if into != file1:
@ -291,10 +288,9 @@ class DownloadMethodGitHubTarball(DownloadMethod):
return e.errno != errno.ENOENT return e.errno != errno.ENOENT
def _init_owner_repo(self): def _init_owner_repo(self):
url = self.url m = self.__repo_url_regex.search(self.url)
m = self.__repo_url_regex.search(url)
if m is None: if m is None:
raise DownloadException('invalid github url: %s' % url) raise self._error('Invalid github url: {}'.format(self.url))
owner = m.group('owner') owner = m.group('owner')
repo = m.group('repo') repo = m.group('repo')
if repo.endswith('.git'): if repo.endswith('.git'):
@ -302,23 +298,79 @@ class DownloadMethodGitHubTarball(DownloadMethod):
self.owner = owner self.owner = owner
self.repo = repo self.repo = repo
def _init_hasher(self):
xhash = self.xhash
if len(xhash) == 64:
self.hasher = hashlib.sha256()
elif len(xhash) == 32:
self.hasher = hashlib.md5()
else:
raise self._error('Requires sha256sum for verification')
self.xhash = xhash
def _hash_check(self, f):
with open(f, 'rb') as fin:
while True:
d = fin.read(4096)
if not d:
break
self.hasher.update(d)
xhash = self.hasher.hexdigest()
if xhash != self.xhash:
raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))
def _init_commit_ts(self): def _init_commit_ts(self):
if self.commit_ts is not None: if self.commit_ts is not None:
return return
url = self._make_repo_url_path('git', 'commits', self.version) # GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is more
# terse while API[2] provides more verbose info such as commit diff
# etc. That's the main reason why API[1] is preferred: the response
# size is predictable.
#
# However, API[1] only accepts complete commit sha1sum as the parameter
# while API[2] is more liberal accepting also partial commit id and
# tags, etc.
#
# [1] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
# [2] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
apis = [
{
'url': self._make_repo_url_path('git', 'commits', self.version),
'attr_path': ('committer', 'date'),
}, {
'url': self._make_repo_url_path('commits', self.version),
'attr_path': ('commit', 'committer', 'date'),
},
]
version_is_sha1sum = len(self.version) == 40
if not version_is_sha1sum:
apis.insert(0, apis.pop())
for api in apis:
url = api['url']
attr_path = api['attr_path']
try:
ct = self.commit_ts_cache.get(url) ct = self.commit_ts_cache.get(url)
if ct is not None: if ct is not None:
self.commit_ts = ct self.commit_ts = ct
return return
ct = self._init_commit_ts_remote_get(url, attr_path)
self.commit_ts = ct
self.commit_ts_cache.set(url, ct)
return
except Exception:
pass
raise self._error('Cannot fetch commit ts: {}'.format(url))
def _init_commit_ts_remote_get(self, url, attrpath):
resp = self._make_request(url) resp = self._make_request(url)
data = resp.read() data = resp.read()
data = json.loads(data) date = json.loads(data)
date = data['committer']['date'] for attr in attrpath:
date = date[attr]
date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ') date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
date = date.timetuple() date = date.timetuple()
ct = calendar.timegm(date) ct = calendar.timegm(date)
self.commit_ts = ct return ct
self.commit_ts_cache.set(url, ct)
def _fetch(self, path): def _fetch(self, path):
"""Fetch tarball of the specified version ref.""" """Fetch tarball of the specified version ref."""
@ -350,71 +402,24 @@ class DownloadMethodGitHubTarball(DownloadMethod):
fileobj = urllib2.urlopen(req, context=sslcontext) fileobj = urllib2.urlopen(req, context=sslcontext)
return fileobj return fileobj
def _error(self, msg):
class DownloadMethodCatchall(DownloadMethod): return DownloadGitHubError('{}: {}'.format(self.source, msg))
"""Dummy method that knows names but not ways of download."""
def __init__(self, args):
super(DownloadMethodCatchall, self).__init__(args)
self.args = args
self.proto = args.proto
self.name = self._resolve_name()
def _resolve_name(self):
if self.proto:
return self.proto
methods_map = (
('default', ('@APACHE/', '@GITHUB/', '@GNOME/', '@GNU/',
'@KERNEL/', '@SF/', '@SAVANNAH/', 'ftp://', 'http://',
'https://', 'file://')),
('git', ('git://', )),
('svn', ('svn://', )),
('cvs', ('cvs://', )),
('bzr', ('sftp://', )),
('bzr', ('sftp://', )),
('unknown', ('', )),
)
for name, prefixes in methods_map:
if any(url.startswith(prefix) for prefix in prefixes for url in self.urls):
return name
@staticmethod
def match(args):
"""Return True."""
return True
def download(self):
"""Not implemented.
raise DownloadException
"""
raise DownloadException('download method for %s is not yet implemented' % self.name)
# order matters
DOWNLOAD_METHODS = [
DownloadMethodGitHubTarball,
DownloadMethodCatchall,
]
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('action', choices=('dl_method', 'dl'), help='Action to take') parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
parser.add_argument('--urls', nargs='+', metavar='URL', help='Download URLs') parser.add_argument('--url', help='Download URL')
parser.add_argument('--proto', help='Download proto')
parser.add_argument('--subdir', help='Source code subdir name') parser.add_argument('--subdir', help='Source code subdir name')
parser.add_argument('--version', help='Source code version') parser.add_argument('--version', help='Source code version')
parser.add_argument('--source', help='Source tarball filename') parser.add_argument('--source', help='Source tarball filename')
parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir') parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
args = parser.parse_args() args = parser.parse_args()
if args.action == 'dl_method': method = DownloadGitHubTarball(args)
method = DownloadMethod.resolve(args)
sys.stdout.write(method.name + '\n')
elif args.action == 'dl':
method = DownloadMethod.resolve(args)
try: try:
method.download() method.download()
except Exception: except Exception:
sys.stderr.write('download {} from {} failed\n'.format(args.source, args.url))
raise raise
if __name__ == '__main__': if __name__ == '__main__':