scripts/dl_github_archive: add support for packing zstd archive
scripts/dl_github_archive.py
#!/usr/bin/env python3
#
# Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
#
# This is free software, licensed under the GNU General Public License v2.
# See /LICENSE for more information.

import argparse
import calendar
import datetime
import errno
import fcntl
import hashlib
import json
import os
import os.path
import re
import shutil
import ssl
import subprocess
import sys
import time
import urllib.request

TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
TMPDIR_DL = os.path.join(TMPDIR, 'dl')
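
# Typical invocation (illustrative values; in the OpenWrt tree this script is
# normally driven by the build system's download logic rather than run by hand):
#
#   python3 scripts/dl_github_archive.py --dl-dir dl \
#       --url https://github.com/openwrt/openwrt \
#       --version 0123456789abcdef0123456789abcdef01234567 \
#       --subdir openwrt-snapshot --source openwrt-snapshot.tar.xz \
#       --hash <expected-sha256sum>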


class PathException(Exception): pass
class DownloadGitHubError(Exception): pass


class Path(object):
    """Context class for preparing and cleaning up directories.

    If ``preclean`` is ``True``, ``path`` will be removed on context enter.

    If ``isdir`` is ``True``, ``path`` will be created as a directory on
    context enter.

    If ``keep`` is ``True``, ``path`` will NOT be removed on context exit.
    """

    def __init__(self, path, isdir=True, preclean=False, keep=False):
        self.path = path
        self.isdir = isdir
        self.preclean = preclean
        self.keep = keep

    def __enter__(self):
        if self.preclean:
            self.rm_all(self.path)
        if self.isdir:
            self.mkdir_all(self.path)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if not self.keep:
            self.rm_all(self.path)

    @staticmethod
    def mkdir_all(path):
        """Same as mkdir -p."""
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def _rmdir_dir(dir_):
        names = Path._listdir(dir_)
        for name in names:
            p = os.path.join(dir_, name)
            Path.rm_all(p)
        Path._rmdir(dir_)

    @staticmethod
    def _mkdir(path):
        Path._os_func(os.mkdir, path, errno.EEXIST)

    @staticmethod
    def _rmdir(path):
        Path._os_func(os.rmdir, path, errno.ENOENT)

    @staticmethod
    def _remove(path):
        Path._os_func(os.remove, path, errno.ENOENT)

    @staticmethod
    def _listdir(path):
        return Path._os_func(os.listdir, path, errno.ENOENT, default=[])

    @staticmethod
    def _os_func(func, path, errno_, default=None):
        """Call func(path) in an idempotent way.

        On exception ``ex``, if the type is OSError and ``ex.errno == errno_``,
        return ``default``; otherwise re-raise.
        """
        try:
            return func(path)
        except OSError as e:
            if e.errno == errno_:
                return default
            else:
                raise

    @staticmethod
    def rm_all(path):
        """Same as rm -rf."""
        if os.path.islink(path):
            Path._remove(path)
        elif os.path.isdir(path):
            Path._rmdir_dir(path)
        else:
            Path._remove(path)

    @staticmethod
    def untar(path, into=None):
        """Extract tarball at ``path`` into directory ``into``.

        Return the name of the tarball's top-level subdir if and only if
        there is exactly one; otherwise raise PathException.
        """
        args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
        subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
        dirs = os.listdir(into)
        if len(dirs) == 1:
            return dirs[0]
        else:
            raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))

    @staticmethod
    def tar(path, subdir, into=None, ts=None):
        """Pack ``subdir`` under ``path`` into tarball ``into``.

        The compressor is chosen from the filename extension of ``into``;
        ``ts``, if given, is used as the fixed mtime to keep the output
        reproducible.
        """
        # --sort=name requires a recent build of GNU tar
        args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name', '--mode=a-s']
        args += ['-C', path, '-cf', into, subdir]
        envs = os.environ.copy()
        if ts is not None:
            args.append('--mtime=@%d' % ts)
        if into.endswith('.zst'):
            envs['ZSTD_CLEVEL'] = '20'
            envs['ZSTD_NBTHREADS'] = '0'
            args.append('--zstd')
        elif into.endswith('.xz'):
            envs['XZ_OPT'] = '-7e'
            args.append('-J')
        elif into.endswith('.bz2'):
            args.append('-j')
        elif into.endswith('.gz'):
            args.append('-z')
            envs['GZIP'] = '-n'
        else:
            raise PathException('unknown compression type %s' % into)
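        # Illustratively, for a *.tar.zst target the invocation below is
        # roughly:
        #   ZSTD_CLEVEL=20 ZSTD_NBTHREADS=0 \
        #   tar --numeric-owner --owner=0 --group=0 --sort=name --mode=a-s \
        #       -C <path> -cf <into> --mtime=@<ts> --zstd <subdir>
        # ZSTD_NBTHREADS=0 lets zstd auto-detect the worker thread count.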
        subprocess.check_call(args, env=envs)


class GitHubCommitTsCache(object):
    __cachef = 'github.commit.ts.cache'
    __cachen = 2048
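
    # On-disk format (see _cache_init/_cache_flush): one whitespace-separated
    # entry per line,
    #   <key> <commit-timestamp> <last-updated-timestamp>
    # trimmed to at most __cachen entries on each flush.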

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        self.cache = {}

    def get(self, k):
        """Get cached timestamp for key ``k``, or None if absent."""
        fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
        with os.fdopen(fileno) as fin:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_SH)
                self._cache_init(fin)
                if k in self.cache:
                    ts = self.cache[k][0]
                    return ts
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)
        return None

    def set(self, k, v):
        """Set timestamp ``v`` for key ``k``."""
        fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
        with os.fdopen(fileno, 'w+') as f:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_EX)
                self._cache_init(f)
                self.cache[k] = (v, int(time.time()))
                self._cache_flush(f)
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)

    def _cache_init(self, fin):
        for line in fin:
            k, ts, updated = line.split()
            ts = int(ts)
            updated = int(updated)
            self.cache[k] = (ts, updated)

    def _cache_flush(self, fout):
        cache = sorted(self.cache.items(), key=lambda a: a[1][1])
        cache = cache[:self.__cachen]
        self.cache = {}
        os.ftruncate(fout.fileno(), 0)
        fout.seek(0, os.SEEK_SET)
        for k, ent in cache:
            ts = ent[0]
            updated = ent[1]
            line = '{0} {1} {2}\n'.format(k, ts, updated)
            fout.write(line)


class DownloadGitHubTarball(object):
    """Download and repack an archive tarball from GitHub.

    Compared with packing after cloning the whole repo, this method is
    friendlier to users with fragile internet connections.

    However, this method has limitations:

    - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
      This affects fetching the commit date for reproducible tarballs.
      Downloads through the archive link are not affected.

    - GitHub archives do not contain the source code of submodules.

    - GitHub archives seem to respect .gitattributes and ignore paths with
      export-ignore attributes.

    For the first two issues, the method fails loudly to allow fallback to
    the clone-then-pack method.

    As for the third issue, to make sure this method only produces tarballs
    identical to those of the fallback method, the expected hash value must
    be supplied. That means the first tarball still needs to be prepared by
    the clone-then-pack method.
    """

    __repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

    def __init__(self, args):
        self.dl_dir = args.dl_dir
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        self.submodules = args.submodules
        self.url = args.url
        self._init_owner_repo()
        self.xhash = args.hash
        self._init_hasher()
        self.commit_ts = None  # lazy-loaded commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'

    def download(self):
        """Download and repack GitHub archive tarball."""
        if self.submodules and self.submodules != ['skip']:
            raise self._error('Fetching submodules is not yet supported')
        self._init_commit_ts()
        with Path(TMPDIR_DL, keep=True) as dir_dl:
            # fetch tarball from GitHub
            tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
            with Path(tarball_path, isdir=False):
                self._fetch(tarball_path)
                # unpack
                d = os.path.join(dir_dl.path, self.subdir + '.untar')
                with Path(d, preclean=True) as dir_untar:
                    tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                    dir0 = os.path.join(dir_untar.path, tarball_prefix)
                    dir1 = os.path.join(dir_untar.path, self.subdir)
                    # submodules check
                    if self.submodules != ['skip'] and self._has_submodule(dir0):
                        raise self._error('Fetching submodules is not yet supported')
                    # rename subdir
                    os.rename(dir0, dir1)
                    # repack
                    into = os.path.join(TMPDIR_DL, self.source)
                    Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                    try:
                        self._hash_check(into)
                    except Exception:
                        Path.rm_all(into)
                        raise
                    # move to target location
                    file1 = os.path.join(self.dl_dir, self.source)
                    if into != file1:
                        shutil.move(into, file1)

    def _has_submodule(self, dir_):
        m = os.path.join(dir_, '.gitmodules')
        try:
            st = os.stat(m)
            return st.st_size > 0
        except OSError as e:
            return e.errno != errno.ENOENT

    def _init_owner_repo(self):
        m = self.__repo_url_regex.search(self.url)
        if m is None:
            raise self._error('Invalid github url: {}'.format(self.url))
        owner = m.group('owner')
        repo = m.group('repo')
        if repo.endswith('.git'):
            repo = repo[:-4]
        self.owner = owner
        self.repo = repo

    def _init_hasher(self):
        xhash = self.xhash
        if len(xhash) == 64:
            self.hasher = hashlib.sha256()
        elif len(xhash) == 32:
            self.hasher = hashlib.md5()
        else:
            raise self._error('Requires sha256sum (or md5sum) for verification')
        self.xhash = xhash

    def _hash_check(self, f):
        with open(f, 'rb') as fin:
            while True:
                d = fin.read(4096)
                if not d:
                    break
                self.hasher.update(d)
        xhash = self.hasher.hexdigest()
        if xhash != self.xhash:
            raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))

    def _init_commit_ts(self):
        if self.commit_ts is not None:
            return
        # GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is
        # more terse while API[2] provides more verbose info such as the
        # commit diff etc. That's the main reason why API[1] is preferred:
        # the response size is predictable.
        #
        # However, API[1] only accepts a complete commit sha1sum as the
        # parameter, while API[2] is more liberal, also accepting partial
        # commit ids, tags, etc.
        #
        # [1] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
        # [2] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
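        #
        # e.g. for this repo the two candidate request paths are (see
        # _make_repo_url_path):
        #   /repos/<owner>/<repo>/git/commits/<sha1>  - API[1]
        #   /repos/<owner>/<repo>/commits/<ref>       - API[2]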
        apis = [
            {
                'url': self._make_repo_url_path('git', 'commits', self.version),
                'attr_path': ('committer', 'date'),
            }, {
                'url': self._make_repo_url_path('commits', self.version),
                'attr_path': ('commit', 'committer', 'date'),
            },
        ]
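        # API[1] needs a full 40-hex commit id; for tags and other refs, try
        # the more liberal API[2] first.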
        version_is_sha1sum = len(self.version) == 40
        if not version_is_sha1sum:
            apis.insert(0, apis.pop())
        reasons = ''
        for api in apis:
            url = api['url']
            attr_path = api['attr_path']
            try:
                ct = self.commit_ts_cache.get(url)
                if ct is not None:
                    self.commit_ts = ct
                    return
                ct = self._init_commit_ts_remote_get(url, attr_path)
                self.commit_ts = ct
                self.commit_ts_cache.set(url, ct)
                return
            except Exception as e:
                reasons += '\n' + ('  {}: {}'.format(url, e))
        raise self._error('Cannot fetch commit ts:{}'.format(reasons))

    def _init_commit_ts_remote_get(self, url, attrpath):
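        """Fetch commit data from API ``url`` and return the committer date
        at ``attrpath`` as a Unix timestamp."""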
        resp = self._make_request(url)
        data = resp.read()
        date = json.loads(data)
        for attr in attrpath:
            date = date[attr]
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        date = date.timetuple()
        ct = calendar.timegm(date)
        return ct

    def _fetch(self, path):
        """Fetch tarball of the specified version ref."""
        ref = self.version
        url = self._make_repo_url_path('tarball', ref)
        resp = self._make_request(url)
        with open(path, 'wb') as fout:
            while True:
                d = resp.read(4096)
                if not d:
                    break
                fout.write(d)

    def _make_repo_url_path(self, *args):
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        if args:
            url += '/' + '/'.join(args)
        return url

    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``."""
        url = 'https://api.github.com' + path
        headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'OpenWrt',
        }
        req = urllib.request.Request(url, headers=headers)
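        # Note: certificate verification is skipped here
        # (ssl._create_unverified_context() is a private stdlib helper);
        # tarball integrity is instead ensured by the expected-hash check.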
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib.request.urlopen(req, context=sslcontext)
        return fileobj

    def _error(self, msg):
        return DownloadGitHubError('{}: {}'.format(self.source, msg))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
    parser.add_argument('--url', help='Download URL')
    parser.add_argument('--subdir', help='Source code subdir name')
    parser.add_argument('--version', help='Source code version')
    parser.add_argument('--source', help='Source tarball filename')
    parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
    parser.add_argument('--submodules', nargs='*', help='List of submodules, or "skip"')
    args = parser.parse_args()
    try:
        method = DownloadGitHubTarball(args)
        method.download()
    except Exception as ex:
        sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url))
        sys.stderr.write('{}\n'.format(ex))
        sys.exit(1)


if __name__ == '__main__':
    main()