scripts: sercomm-pid.py: use uppercase hwid in pid
[openwrt/staging/stintel.git] / scripts / dl_github_archive.py
1 #!/usr/bin/env python3
2 #
3 # Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
4 #
5 # This is free software, licensed under the GNU General Public License v2.
6 # See /LICENSE for more information.
7
8 import argparse
9 import calendar
10 import datetime
11 import errno
12 import fcntl
13 import hashlib
14 import json
15 import os
16 import os.path
17 import re
18 import shutil
19 import ssl
20 import subprocess
21 import sys
22 import time
23 import urllib.request
24
# Scratch area for downloads; honours the build system's TMP_DIR when set
# (falls back to /tmp when TMP_DIR is unset or empty).
TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
TMPDIR_DL = os.path.join(TMPDIR, 'dl')
27
28
29 class PathException(Exception): pass
30 class DownloadGitHubError(Exception): pass
31
32
class Path(object):
    """Context class for preparing and cleaning up directories.

    If ``preclean`` is ``True``, ``path`` will be removed on context enter.

    If ``isdir`` is ``True``, ``path`` will be created as a directory on
    context enter.

    If ``keep`` is ``True``, ``path`` will NOT be removed on context exit.
    """

    def __init__(self, path, isdir=True, preclean=False, keep=False):
        self.path = path
        self.isdir = isdir
        self.preclean = preclean
        self.keep = keep

    def __enter__(self):
        if self.preclean:
            self.rm_all(self.path)
        if self.isdir:
            self.mkdir_all(self.path)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if not self.keep:
            self.rm_all(self.path)

    @staticmethod
    def mkdir_all(path):
        """Same as mkdir -p: create ``path`` and all missing ancestors.

        Note: os.path.split() only splits off the last component, so the
        old per-component loop could not create more than one missing
        level; os.makedirs handles arbitrary depth, and EEXIST is ignored
        to keep the call idempotent.
        """
        Path._os_func(os.makedirs, path, errno.EEXIST)

    @staticmethod
    def _rmdir_dir(dir_):
        """Recursively remove directory ``dir_`` and everything below it."""
        names = Path._listdir(dir_)
        for name in names:
            p = os.path.join(dir_, name)
            Path.rm_all(p)
        Path._rmdir(dir_)

    @staticmethod
    def _mkdir(path):
        """mkdir(path); already existing is not an error."""
        Path._os_func(os.mkdir, path, errno.EEXIST)

    @staticmethod
    def _rmdir(path):
        """rmdir(path); already missing is not an error."""
        Path._os_func(os.rmdir, path, errno.ENOENT)

    @staticmethod
    def _remove(path):
        """remove(path); already missing is not an error."""
        Path._os_func(os.remove, path, errno.ENOENT)

    @staticmethod
    def _listdir(path):
        """listdir(path); a missing directory lists as empty."""
        return Path._os_func(os.listdir, path, errno.ENOENT, default=[])

    @staticmethod
    def _os_func(func, path, expected_errno, default=None):
        """Call func(path) in an idempotent way.

        On exception ``ex``, if the type is OSError and ``ex.errno ==
        expected_errno``, return ``default``, otherwise re-raise.

        (The third parameter was renamed from ``errno`` to avoid shadowing
        the ``errno`` module.)
        """
        try:
            return func(path)
        except OSError as e:
            if e.errno == expected_errno:
                return default
            raise

    @staticmethod
    def rm_all(path):
        """Same as rm -r."""
        if os.path.islink(path):
            # remove the link itself; never follow it
            Path._remove(path)
        elif os.path.isdir(path):
            Path._rmdir_dir(path)
        else:
            Path._remove(path)

    @staticmethod
    def untar(path, into=None):
        """Extract tarball at ``path`` into subdir ``into``.

        return subdir name if and only if there exists one, otherwise raise
        PathException
        """
        args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
        # fix the umask so extracted permissions are predictable
        subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
        dirs = os.listdir(into)
        if len(dirs) == 1:
            return dirs[0]
        raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))

    @staticmethod
    def tar(path, subdir, into=None, ts=None):
        """Pack ``subdir`` under ``path`` into tarball ``into``.

        ``ts``, when given, is the fixed mtime applied to every member so
        the output tarball is reproducible.
        """
        # --sort=name requires a recent build of GNU tar
        args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name', '--mode=a-s']
        args += ['-C', path, '-cf', into, subdir]
        envs = os.environ.copy()
        if ts is not None:
            args.append('--mtime=@%d' % ts)
        if into.endswith('.xz'):
            envs['XZ_OPT'] = '-7e'
            args.append('-J')
        elif into.endswith('.bz2'):
            args.append('-j')
        elif into.endswith('.gz'):
            args.append('-z')
            # -n keeps the original name/timestamp out of the gzip header
            envs['GZIP'] = '-n'
        else:
            raise PathException('unknown compression type %s' % into)
        subprocess.check_call(args, env=envs)
152
153
class GitHubCommitTsCache(object):
    """Small file-backed cache mapping a GitHub API URL to a commit timestamp.

    Readers and writers coordinate through fcntl.lockf() locks on the cache
    file, so concurrent script invocations can share it safely.
    """
    # cache file name, created under TMPDIR_DL
    __cachef = 'github.commit.ts.cache'
    # maximum number of entries retained when flushing to disk
    __cachen = 2048

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        self.cache = {}

    def get(self, k):
        """Get timestamp with key ``k``; return None on a miss.

        O_CREAT makes the first-ever run succeed on a missing cache file;
        the read happens under a shared lock.
        """
        fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
        with os.fdopen(fileno) as fin:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_SH)
                self._cache_init(fin)
                if k in self.cache:
                    ts = self.cache[k][0]
                    return ts
            finally:
                # release before the fd is closed by the with-block
                fcntl.lockf(fileno, fcntl.LOCK_UN)
        return None

    def set(self, k, v):
        """Update timestamp with ``k``.

        The on-disk contents are re-read under an exclusive lock before
        merging in the new entry, so concurrent writers do not clobber
        each other's updates.
        """
        fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
        with os.fdopen(fileno, 'w+') as f:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_EX)
                self._cache_init(f)
                self.cache[k] = (v, int(time.time()))
                self._cache_flush(f)
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)

    def _cache_init(self, fin):
        # File format: one entry per line, "<key> <timestamp> <last-updated>"
        for line in fin:
            k, ts, updated = line.split()
            ts = int(ts)
            updated = int(updated)
            self.cache[k] = (ts, updated)

    def _cache_flush(self, fout):
        # Sort entries by their last-updated time (ascending) and cap the
        # count before rewriting the file in place.
        # NOTE(review): the [:__cachen] slice keeps the OLDEST-updated
        # entries; an LRU-style cap would keep the newest -- confirm intent.
        cache = sorted(self.cache.items(), key=lambda a: a[1][1])
        cache = cache[:self.__cachen]
        self.cache = {}
        os.ftruncate(fout.fileno(), 0)
        fout.seek(0, os.SEEK_SET)
        for k, ent in cache:
            ts = ent[0]
            updated = ent[1]
            line = '{0} {1} {2}\n'.format(k, ts, updated)
            fout.write(line)
207
208
class DownloadGitHubTarball(object):
    """Download and repack archive tarball from GitHub.

    Compared with the method of packing after cloning the whole repo, this
    method is more friendly to users with fragile internet connection.

    However, there are limitations with this method

    - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
      This affects fetching commit date for reproducible tarballs.  Download
      through the archive link is not affected.

    - GitHub archives do not contain source codes for submodules.

    - GitHub archives seem to respect .gitattributes and ignore paths with
      export-ignore attributes.

    For the first two issues, the method will fail loudly to allow fallback to
    clone-then-pack method.

    As for the 3rd issue, to make sure that this method only produces identical
    tarballs as the fallback method, we require the expected hash value to be
    supplied.  That means the first tarball will need to be prepared by the
    clone-then-pack method
    """

    # The dot in "github\.com" is escaped so hosts like "githubXcom" no
    # longer match (an unescaped "." matched any character).
    __repo_url_regex = re.compile(r'^(?:https|git)://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

    def __init__(self, args):
        """``args`` is the argparse namespace built by main()."""
        self.dl_dir = args.dl_dir
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        self.submodules = args.submodules
        self.url = args.url
        self._init_owner_repo()
        self.xhash = args.hash
        self._init_hasher()
        self.commit_ts = None  # lazy load commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'

    def download(self):
        """Download and repack GitHub archive tarball.

        Raises DownloadGitHubError (via _error) on any failure so callers
        can fall back to the clone-then-pack method.
        """
        if self.submodules and self.submodules != ['skip']:
            raise self._error('Fetching submodules is not yet supported')
        self._init_commit_ts()
        with Path(TMPDIR_DL, keep=True) as dir_dl:
            # fetch tarball from GitHub
            tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
            with Path(tarball_path, isdir=False):
                self._fetch(tarball_path)
                # unpack
                d = os.path.join(dir_dl.path, self.subdir + '.untar')
                with Path(d, preclean=True) as dir_untar:
                    tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                    dir0 = os.path.join(dir_untar.path, tarball_prefix)
                    dir1 = os.path.join(dir_untar.path, self.subdir)
                    # submodules check
                    if self.submodules != ['skip'] and self._has_submodule(dir0):
                        raise self._error('Fetching submodules is not yet supported')
                    # rename subdir
                    os.rename(dir0, dir1)
                    # repack with a fixed mtime for reproducibility
                    into = os.path.join(TMPDIR_DL, self.source)
                    Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                    try:
                        self._hash_check(into)
                    except Exception:
                        # do not leave a bad tarball behind
                        Path.rm_all(into)
                        raise
                    # move to target location
                    file1 = os.path.join(self.dl_dir, self.source)
                    if into != file1:
                        shutil.move(into, file1)

    def _has_submodule(self, dir_):
        """Return True if ``dir_`` has a non-empty .gitmodules file.

        Any stat error other than ENOENT is conservatively treated as
        "has submodules" so the caller fails loudly.
        """
        m = os.path.join(dir_, '.gitmodules')
        try:
            st = os.stat(m)
            return st.st_size > 0
        except OSError as e:
            return e.errno != errno.ENOENT

    def _init_owner_repo(self):
        """Parse owner/repo out of the GitHub URL into self.owner/self.repo."""
        m = self.__repo_url_regex.search(self.url)
        if m is None:
            raise self._error('Invalid github url: {}'.format(self.url))
        owner = m.group('owner')
        repo = m.group('repo')
        if repo.endswith('.git'):
            repo = repo[:-4]
        self.owner = owner
        self.repo = repo

    def _init_hasher(self):
        """Pick the hash algorithm from the expected digest's length."""
        xhash = self.xhash
        if len(xhash) == 64:
            self.hasher = hashlib.sha256()
        elif len(xhash) == 32:
            # md5 is accepted only for legacy package definitions
            self.hasher = hashlib.md5()
        else:
            raise self._error('Requires sha256sum for verification')

    def _hash_check(self, f):
        """Verify that file ``f`` hashes to the expected digest, else raise."""
        with open(f, 'rb') as fin:
            while True:
                d = fin.read(4096)
                if not d:
                    break
                self.hasher.update(d)
        xhash = self.hasher.hexdigest()
        if xhash != self.xhash:
            raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))

    def _init_commit_ts(self):
        """Resolve the commit timestamp for self.version (cached when possible)."""
        if self.commit_ts is not None:
            return
        # GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is more
        # terse while API[2] provides more verbose info such as commit diff
        # etc. That's the main reason why API[1] is preferred: the response
        # size is predictable.
        #
        # However, API[1] only accepts complete commit sha1sum as the parameter
        # while API[2] is more liberal accepting also partial commit id and
        # tags, etc.
        #
        # [1] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
        # [2] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
        apis = [
            {
                'url': self._make_repo_url_path('git', 'commits', self.version),
                'attr_path': ('committer', 'date'),
            }, {
                'url': self._make_repo_url_path('commits', self.version),
                'attr_path': ('commit', 'committer', 'date'),
            },
        ]
        version_is_sha1sum = len(self.version) == 40
        if not version_is_sha1sum:
            # try the liberal API first when version is a tag/partial id
            apis.insert(0, apis.pop())
        reasons = ''
        for api in apis:
            url = api['url']
            attr_path = api['attr_path']
            try:
                ct = self.commit_ts_cache.get(url)
                if ct is not None:
                    self.commit_ts = ct
                    return
                ct = self._init_commit_ts_remote_get(url, attr_path)
                self.commit_ts = ct
                self.commit_ts_cache.set(url, ct)
                return
            except Exception as e:
                # collect per-API failure reasons for the final error message
                reasons += '\n' + ("    {}: {}".format(url, e))
        raise self._error('Cannot fetch commit ts:{}'.format(reasons))

    def _init_commit_ts_remote_get(self, url, attrpath):
        """Fetch ``url`` and walk ``attrpath`` through the JSON response.

        Returns the committer date as a Unix timestamp (the API emits UTC
        ISO-8601, hence calendar.timegm on the parsed struct_time).
        """
        resp = self._make_request(url)
        data = resp.read()
        date = json.loads(data)
        for attr in attrpath:
            date = date[attr]
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        date = date.timetuple()
        ct = calendar.timegm(date)
        return ct

    def _fetch(self, path):
        """Fetch tarball of the specified version ref into file ``path``."""
        ref = self.version
        url = self._make_repo_url_path('tarball', ref)
        resp = self._make_request(url)
        with open(path, 'wb') as fout:
            while True:
                d = resp.read(4096)
                if not d:
                    break
                fout.write(d)

    def _make_repo_url_path(self, *args):
        """Build an API path like /repos/<owner>/<repo>[/<args>...]."""
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        if args:
            url += '/' + '/'.join(args)
        return url

    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``; return the response object."""
        url = 'https://api.github.com' + path
        headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'OpenWrt',
        }
        req = urllib.request.Request(url, headers=headers)
        # SECURITY: certificate verification is deliberately disabled here
        # (private ssl API); integrity is guaranteed by _hash_check instead.
        # Consider ssl.create_default_context() if cert validation is wanted.
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib.request.urlopen(req, context=sslcontext)
        return fileobj

    def _error(self, msg):
        """Return a DownloadGitHubError prefixed with the tarball name."""
        return DownloadGitHubError('{}: {}'.format(self.source, msg))
411
412
def main():
    """Entry point: parse command-line options, run the download, exit 1 on failure."""
    parser = argparse.ArgumentParser()
    options = (
        ('--dl-dir', {'default': os.getcwd(), 'help': 'Download dir'}),
        ('--url', {'help': 'Download URL'}),
        ('--subdir', {'help': 'Source code subdir name'}),
        ('--version', {'help': 'Source code version'}),
        ('--source', {'help': 'Source tarball filename'}),
        ('--hash', {'help': "Source tarball's expected sha256sum"}),
        ('--submodules', {'nargs': '*', 'help': 'List of submodules, or "skip"'}),
    )
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    args = parser.parse_args()
    try:
        DownloadGitHubTarball(args).download()
    except Exception as ex:
        sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url))
        sys.stderr.write('{}\n'.format(ex))
        sys.exit(1)


if __name__ == '__main__':
    main()