ebtables: fix compilation with GCC14
[openwrt/openwrt.git] / scripts / dl_github_archive.py
1 #!/usr/bin/env python3
2 #
3 # Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
4 #
5 # This is free software, licensed under the GNU General Public License v2.
6 # See /LICENSE for more information.
7
8 import argparse
9 import calendar
10 import datetime
11 import errno
12 import fcntl
13 import hashlib
14 import json
15 import os
16 import os.path
17 import re
18 import shutil
19 import ssl
20 import subprocess
21 import sys
22 import time
23 import urllib.request
24
25 TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
26 TMPDIR_DL = os.path.join(TMPDIR, 'dl')
27
28
class PathException(Exception):
    """Raised by ``Path`` helpers (``untar``/``tar``) on filesystem errors."""
class DownloadGitHubError(Exception):
    """Raised when fetching or repacking a GitHub archive tarball fails."""
31
32
class Path(object):
    """Context manager for preparing and cleaning up filesystem paths.

    If ``preclean`` is ``True``, ``path`` is removed on context enter.

    If ``isdir`` is ``True``, ``path`` is created as a directory (like
    ``mkdir -p``) on context enter.

    If ``keep`` is ``True``, ``path`` will NOT be removed on context exit.
    """

    def __init__(self, path, isdir=True, preclean=False, keep=False):
        self.path = path
        self.isdir = isdir
        self.preclean = preclean
        self.keep = keep

    def __enter__(self):
        if self.preclean:
            self.rm_all(self.path)
        if self.isdir:
            self.mkdir_all(self.path)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Clean up unconditionally (even on exception) unless told to keep.
        if not self.keep:
            self.rm_all(self.path)

    @staticmethod
    def mkdir_all(path):
        """Same as mkdir -p.

        Bugfix: the previous implementation iterated ``os.path.split(path)``,
        which yields only (head, tail), so paths with more than one missing
        parent level raised ENOENT instead of being created.  ``os.makedirs``
        creates the whole chain; EEXIST on the leaf is suppressed so the call
        stays idempotent.
        """
        Path._os_func(os.makedirs, path, errno.EEXIST)

    @staticmethod
    def _rmdir_dir(dir_):
        # Recursively delete directory contents, then the directory itself.
        names = Path._listdir(dir_)
        for name in names:
            p = os.path.join(dir_, name)
            Path.rm_all(p)
        Path._rmdir(dir_)

    @staticmethod
    def _mkdir(path):
        # Create a single directory level; an existing one is not an error.
        Path._os_func(os.mkdir, path, errno.EEXIST)

    @staticmethod
    def _rmdir(path):
        Path._os_func(os.rmdir, path, errno.ENOENT)

    @staticmethod
    def _remove(path):
        Path._os_func(os.remove, path, errno.ENOENT)

    @staticmethod
    def _listdir(path):
        return Path._os_func(os.listdir, path, errno.ENOENT, default=[])

    @staticmethod
    def _os_func(func, path, expected_errno, default=None):
        """Call func(path) in an idempotent way.

        On exception ``ex``, if the type is OSError and
        ``ex.errno == expected_errno``, return ``default``; otherwise
        re-raise.  (Parameter renamed from ``errno``, which shadowed the
        module; all call sites pass it positionally.)
        """
        try:
            return func(path)
        except OSError as e:
            if e.errno == expected_errno:
                return default
            else:
                raise

    @staticmethod
    def rm_all(path):
        """Same as rm -r: remove the file, symlink, or tree at ``path``."""
        # Check islink first: a symlink to a directory must be unlinked,
        # not recursed into.
        if os.path.islink(path):
            Path._remove(path)
        elif os.path.isdir(path):
            Path._rmdir_dir(path)
        else:
            Path._remove(path)

    @staticmethod
    def untar(path, into=None):
        """Extract tarball at ``path`` into subdir ``into``.

        return subdir name if and only if there exists one, otherwise raise
        PathException
        """
        args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
        # umask 022 in the child so extracted files get sane permissions.
        subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
        dirs = os.listdir(into)
        if len(dirs) == 1:
            return dirs[0]
        else:
            raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))

    @staticmethod
    def tar(path, subdir, into=None, ts=None):
        """Pack ``path`` into tarball ``into`` reproducibly.

        ``ts`` (epoch seconds), when given, pins every member's mtime so the
        output is byte-stable.  Compression is chosen from the extension of
        ``into``; an unknown extension raises PathException.
        """
        # --sort=name requires a recent build of GNU tar
        args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name', '--mode=a-s']
        args += ['-C', path, '-cf', into, subdir]
        envs = os.environ.copy()
        if ts is not None:
            args.append('--mtime=@%d' % ts)
        if into.endswith('.zst'):
            # One argv token: -I with its compressor command line inline.
            args.append('-I zstd -T0 --ultra -20')
        elif into.endswith('.xz'):
            envs['XZ_OPT'] = '-7e'
            args.append('-J')
        elif into.endswith('.bz2'):
            args.append('-j')
        elif into.endswith('.gz'):
            args.append('-z')
            envs['GZIP'] = '-n'  # -n: keep the timestamp out of the gzip header
        else:
            raise PathException('unknown compression type %s' % into)
        subprocess.check_call(args, env=envs)
154
155
class GitHubCommitTsCache(object):
    """File-backed cache of GitHub commit timestamps, shared across runs.

    Entries live in one text file under ``TMPDIR_DL``; concurrent readers
    and writers are coordinated with POSIX record locks (fcntl.lockf).
    """

    # Cache file name (created under TMPDIR_DL) and maximum entry count.
    __cachef = 'github.commit.ts.cache'
    __cachen = 2048

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        self.cache = {}

    def get(self, k):
        """Get timestamp with key ``k``; return None when not cached."""
        fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
        with os.fdopen(fileno) as fin:
            try:
                # Shared lock: concurrent readers are fine.
                fcntl.lockf(fileno, fcntl.LOCK_SH)
                self._cache_init(fin)
                if k in self.cache:
                    ts = self.cache[k][0]
                    return ts
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)
        return None

    def set(self, k, v):
        """Update timestamp ``v`` for key ``k`` and persist the cache."""
        fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
        with os.fdopen(fileno, 'w+') as f:
            try:
                # Exclusive lock: the read-modify-write must be atomic.
                fcntl.lockf(fileno, fcntl.LOCK_EX)
                self._cache_init(f)
                self.cache[k] = (v, int(time.time()))
                self._cache_flush(f)
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)

    def _cache_init(self, fin):
        # Each line holds "<key> <commit-ts> <last-updated-ts>".
        for line in fin:
            k, ts, updated = line.split()
            ts = int(ts)
            updated = int(updated)
            self.cache[k] = (ts, updated)

    def _cache_flush(self, fout):
        """Write the cache out, keeping at most ``__cachen`` entries.

        Bugfix: the old code kept the FIRST ``__cachen`` entries of the
        ascending-by-update-time ordering, i.e. it evicted the most
        recently updated entries and retained the stalest ones.  Keep the
        tail (newest) instead.
        """
        cache = sorted(self.cache.items(), key=lambda a: a[1][1])
        cache = cache[-self.__cachen:]
        self.cache = {}
        os.ftruncate(fout.fileno(), 0)
        fout.seek(0, os.SEEK_SET)
        for k, ent in cache:
            ts = ent[0]
            updated = ent[1]
            line = '{0} {1} {2}\n'.format(k, ts, updated)
            fout.write(line)
209
210
class DownloadGitHubTarball(object):
    """Download and repack archive tarball from GitHub.

    Compared with the method of packing after cloning the whole repo, this
    method is more friendly to users with fragile internet connection.

    However, there are limitations with this method

     - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
       This affects fetching commit date for reproducible tarballs.  Download
       through the archive link is not affected.

     - GitHub archives do not contain source codes for submodules.

     - GitHub archives seem to respect .gitattributes and ignore paths with
       export-ignore attributes.

    For the first two issues, the method will fail loudly to allow fallback to
    clone-then-pack method.

    As for the 3rd issue, to make sure that this method only produces identical
    tarballs as the fallback method, we require the expected hash value to be
    supplied.  That means the first tarball will need to be prepared by the
    clone-then-pack method
    """

    # Matches https:// and git:// GitHub repo URLs, capturing owner and repo.
    __repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

    def __init__(self, args):
        # ``args`` is the argparse.Namespace built in main().
        self.dl_dir = args.dl_dir
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        self.submodules = args.submodules
        self.url = args.url
        self._init_owner_repo()
        self.xhash = args.hash
        self._init_hasher()
        self.commit_ts = None  # lazy load commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'

    def download(self):
        """Download and repack GitHub archive tarball.

        Raises DownloadGitHubError when submodules are required, the commit
        timestamp cannot be fetched, or the repacked tarball's hash does not
        match the expected one.
        """
        if self.submodules and self.submodules != ['skip']:
            raise self._error('Fetching submodules is not yet supported')
        self._init_commit_ts()
        with Path(TMPDIR_DL, keep=True) as dir_dl:
            # fetch tarball from GitHub
            tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
            # The .dl file context removes the raw download on scope exit.
            with Path(tarball_path, isdir=False):
                self._fetch(tarball_path)
                # unpack
                d = os.path.join(dir_dl.path, self.subdir + '.untar')
                with Path(d, preclean=True) as dir_untar:
                    tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                    dir0 = os.path.join(dir_untar.path, tarball_prefix)
                    dir1 = os.path.join(dir_untar.path, self.subdir)
                    # submodules check
                    if self.submodules != ['skip'] and self._has_submodule(dir0):
                        raise self._error('Fetching submodules is not yet supported')
                    # rename subdir
                    os.rename(dir0, dir1)
                    # repack
                    into = os.path.join(TMPDIR_DL, self.source)
                    Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                    try:
                        self._hash_check(into)
                    except Exception:
                        # Don't leave a bad tarball behind for later runs.
                        Path.rm_all(into)
                        raise
                    # move to target location
                    file1 = os.path.join(self.dl_dir, self.source)
                    if into != file1:
                        shutil.move(into, file1)

    def _has_submodule(self, dir_):
        # True when .gitmodules exists and is non-empty; stat errors other
        # than ENOENT are conservatively treated as "has submodules".
        m = os.path.join(dir_, '.gitmodules')
        try:
            st = os.stat(m)
            return st.st_size > 0
        except OSError as e:
            return e.errno != errno.ENOENT

    def _init_owner_repo(self):
        """Parse owner and repo name out of the configured GitHub URL."""
        m = self.__repo_url_regex.search(self.url)
        if m is None:
            raise self._error('Invalid github url: {}'.format(self.url))
        owner = m.group('owner')
        repo = m.group('repo')
        if repo.endswith('.git'):
            repo = repo[:-4]
        self.owner = owner
        self.repo = repo

    def _init_hasher(self):
        """Pick the hash algorithm from the expected digest's length."""
        xhash = self.xhash
        if len(xhash) == 64:
            self.hasher = hashlib.sha256()
        elif len(xhash) == 32:
            # 32 hex chars: legacy md5 checksums are also accepted.
            self.hasher = hashlib.md5()
        else:
            raise self._error('Requires sha256sum for verification')
        self.xhash = xhash

    def _hash_check(self, f):
        """Stream file ``f`` through the hasher and compare hex digests."""
        with open(f, 'rb') as fin:
            while True:
                d = fin.read(4096)
                if not d:
                    break
                self.hasher.update(d)
        xhash = self.hasher.hexdigest()
        if xhash != self.xhash:
            raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))

    def _init_commit_ts(self):
        """Resolve the commit timestamp for ``self.version``, using the
        on-disk cache before hitting the GitHub API."""
        if self.commit_ts is not None:
            return
        # GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is more
        # terse while API[2] provides more verbose info such as commit diff
        # etc. That's the main reason why API[1] is preferred: the response
        # size is predictable.
        #
        # However, API[1] only accepts complete commit sha1sum as the parameter
        # while API[2] is more liberal accepting also partial commit id and
        # tags, etc.
        #
        # [1] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
        # [2] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
        apis = [
            {
                'url': self._make_repo_url_path('git', 'commits', self.version),
                'attr_path': ('committer', 'date'),
            }, {
                'url': self._make_repo_url_path('commits', self.version),
                'attr_path': ('commit', 'committer', 'date'),
            },
        ]
        version_is_sha1sum = len(self.version) == 40
        if not version_is_sha1sum:
            # API[1] needs a full sha1, so try the liberal API[2] first.
            apis.insert(0, apis.pop())
        reasons = ''
        for api in apis:
            url = api['url']
            attr_path = api['attr_path']
            try:
                ct = self.commit_ts_cache.get(url)
                if ct is not None:
                    self.commit_ts = ct
                    return
                ct = self._init_commit_ts_remote_get(url, attr_path)
                self.commit_ts = ct
                self.commit_ts_cache.set(url, ct)
                return
            except Exception as e:
                # Collect per-API failure reasons for the final error.
                reasons += '\n' + ("    {}: {}".format(url, e))
        raise self._error('Cannot fetch commit ts:{}'.format(reasons))

    def _init_commit_ts_remote_get(self, url, attrpath):
        """Fetch the commit date from the API response at ``url`` and
        convert it to a UTC epoch timestamp via ``attrpath`` lookups."""
        resp = self._make_request(url)
        data = resp.read()
        date = json.loads(data)
        for attr in attrpath:
            date = date[attr]
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        date = date.timetuple()
        ct = calendar.timegm(date)
        return ct

    def _fetch(self, path):
        """Fetch tarball of the specified version ref."""
        ref = self.version
        url = self._make_repo_url_path('tarball', ref)
        resp = self._make_request(url)
        with open(path, 'wb') as fout:
            while True:
                d = resp.read(4096)
                if not d:
                    break
                fout.write(d)

    def _make_repo_url_path(self, *args):
        # Build "/repos/<owner>/<repo>[/<args...>]" API paths.
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        if args:
            url += '/' + '/'.join(args)
        return url

    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``."""
        url = 'https://api.github.com' + path
        headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'OpenWrt',
        }
        req = urllib.request.Request(url, headers=headers)
        # NOTE(review): TLS certificate verification is disabled here via the
        # private ssl._create_unverified_context() helper — presumably to
        # tolerate build hosts without CA bundles; confirm.  Integrity of the
        # result is guarded by _hash_check, not by TLS.
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib.request.urlopen(req, context=sslcontext)
        return fileobj

    def _error(self, msg):
        # Prefix errors with the tarball name for readable diagnostics.
        return DownloadGitHubError('{}: {}'.format(self.source, msg))
413
414
def main():
    """Parse command-line options and run the GitHub tarball download.

    Exits with status 1 (after printing the failure to stderr) when the
    download or repack fails for any reason.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
    parser.add_argument('--url', help='Download URL')
    parser.add_argument('--subdir', help='Source code subdir name')
    parser.add_argument('--version', help='Source code version')
    parser.add_argument('--source', help='Source tarball filename')
    parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
    parser.add_argument('--submodules', nargs='*', help='List of submodules, or "skip"')
    args = parser.parse_args()
    try:
        DownloadGitHubTarball(args).download()
    except Exception as ex:
        sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url))
        sys.stderr.write('{}\n'.format(ex))
        sys.exit(1)


if __name__ == '__main__':
    main()