jail: don't assume positive return value of creat
[project/procd.git] / jail / jail.c
1 /*
2 * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
3 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License version 2.1
7 * as published by the Free Software Foundation
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 */
14
15 #define _GNU_SOURCE
16 #include <sys/mount.h>
17 #include <sys/prctl.h>
18 #include <sys/wait.h>
19 #include <sys/types.h>
20 #include <sys/time.h>
21 #include <sys/resource.h>
22 #include <sys/stat.h>
23 #include <sys/sysmacros.h>
24
25 /* musl only defined 15 limit types, make sure all 16 are supported */
26 #ifndef RLIMIT_RTTIME
27 #define RLIMIT_RTTIME 15
28 #undef RLIMIT_NLIMITS
29 #define RLIMIT_NLIMITS 16
30 #undef RLIM_NLIMITS
31 #define RLIM_NLIMITS 16
32 #endif
33
34 #include <assert.h>
35 #include <stdlib.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <pwd.h>
39 #include <grp.h>
40 #include <string.h>
41 #include <fcntl.h>
42 #include <sched.h>
43 #include <linux/filter.h>
44 #include <linux/limits.h>
45 #include <linux/nsfs.h>
46 #include <linux/securebits.h>
47 #include <signal.h>
48 #include <inttypes.h>
49
50 #include "capabilities.h"
51 #include "elf.h"
52 #include "fs.h"
53 #include "jail.h"
54 #include "log.h"
55 #include "seccomp-oci.h"
56 #include "cgroups.h"
57 #include "netifd.h"
58
59 #include <libubox/blobmsg.h>
60 #include <libubox/blobmsg_json.h>
61 #include <libubox/list.h>
62 #include <libubox/vlist.h>
63 #include <libubox/uloop.h>
64 #include <libubox/utils.h>
65 #include <libubus.h>
66
67 #ifndef CLONE_NEWCGROUP
68 #define CLONE_NEWCGROUP 0x02000000
69 #endif
70
71 #define STACK_SIZE (1024 * 1024)
72 #define OPT_ARGS "cC:d:e:EfFG:h:ij:J:ln:NoO:pP:r:R:sS:uU:w:t:T:y"
73
74 #define OCI_VERSION_STRING "1.0.2"
75
/* Description of one hook program executed by run_hooklist(). */
struct hook_execvpe {
	char *file;	/* path passed to stat() and execve() */
	char **argv;	/* NULL-terminated vector, released via free_oci_envp() */
	char **envp;	/* NULL-terminated vector, released via free_oci_envp() */
	int timeout;	/* when > 0, multiplied by 1000 to arm the hook kill timer */
};
82
/* One sysctl assignment; apply_sysctl() writes value to <jail>/proc/sys/<entry>. */
struct sysctl_val {
	char *entry;	/* path relative to /proc/sys */
	char *value;	/* string written verbatim to the entry */
};
87
/* Arguments for creating one device node, see create_devices(). */
struct mknod_args {
	char *path;	/* node path handed to mknod() */
	mode_t mode;	/* file type and permission bits for mknod() */
	dev_t dev;	/* device number for mknod() */
	uid_t uid;	/* chown() is only called when uid or gid is non-zero */
	gid_t gid;
};
95
/*
 * Process-wide jail configuration. Heap members are released by free_opts().
 */
static struct {
	char *name;		/* jail name, sent to the "container" ubus object */
	char *hostname;		/* applied with sethostname() when non-empty */
	char **jail_argv;
	char *cwd;
	char *seccomp;
	struct sock_fprog *ociseccomp;
	char *capabilities;
	struct jail_capset capset;
	char *user;		/* looked up with getpwnam() */
	char *group;		/* looked up with getgrnam() */
	char *extroot;		/* bind-mounted onto the jail root when set */
	char *overlaydir;	/* overlay upper/work parent, wins over tmpoverlaysize */
	char *tmpoverlaysize;	/* size= option of the tmpfs backing the overlay */
	char **envp;
	char *uidmap;
	char *gidmap;
	char *pidfile;
	struct sysctl_val **sysctl;	/* NULL-terminated, see apply_sysctl() */
	int no_new_privs;
	int namespace;		/* bitmask tested against CLONE_NEW* flags */
	/* namespace file descriptors to join; -1 means "not set" */
	struct {
		int pid;
		int net;
		int ns;
		int ipc;
		int uts;
		int user;
		int cgroup;
#ifdef CLONE_NEWTIME
		int time;
#endif
	} setns;
	int procfs;
	int ronly;		/* remount "/" read-only in enter_jail_fs() */
	int sysfs;
	int console;		/* create a PTY backed /dev/console */
	int pw_uid;
	int pw_gid;
	int gr_gid;
	int root_map_uid;
	gid_t *additional_gids;
	size_t num_additional_gids;
	mode_t umask;
	bool set_umask;
	int require_jail;
	/* hook lists are NULL-terminated arrays walked by run_hooklist() */
	struct {
		struct hook_execvpe **createRuntime;
		struct hook_execvpe **createContainer;
		struct hook_execvpe **startContainer;
		struct hook_execvpe **poststart;
		struct hook_execvpe **poststop;
	} hooks;
	struct rlimit *rlimits[RLIM_NLIMITS];	/* indexed by resource; NULL = unchanged */
	int oom_score_adj;
	bool set_oom_score_adj;
	struct mknod_args **devices;	/* NULL-terminated, see create_devices() */
	char *ocibundle;
	bool immediately;
	struct blob_attr *annotations;
	int term_timeout;	/* multiplied by 1000 for the SIGKILL escalation timer */
} opts;
158
static struct blob_buf ocibuf;

/* declared here by hand; called from enter_jail_fs() */
extern int pivot_root(const char *new_root, const char *put_old);

/* verbosity level; build_envp() exports it and enables LD_DEBUG when > 1 */
int debug = 0;

static char child_stack[STACK_SIZE];

/* ubus connection released by free_and_exit() unless exiting from the child */
static struct ubus_context *parent_ctx;

/* PTY master opened by create_dev_console() */
int console_fd;
170
171
172 static inline bool has_namespaces(void)
173 {
174 return ((opts.setns.pid != -1) ||
175 (opts.setns.net != -1) ||
176 (opts.setns.ns != -1) ||
177 (opts.setns.ipc != -1) ||
178 (opts.setns.uts != -1) ||
179 (opts.setns.user != -1) ||
180 (opts.setns.cgroup != -1) ||
181 #ifdef CLONE_NEWTIME
182 (opts.setns.time != -1) ||
183 #endif
184 opts.namespace);
185 }
186
/*
 * Release a NULL-terminated, heap-allocated string vector together with
 * every string it holds. A NULL vector is accepted and ignored.
 */
static void free_oci_envp(char **p)
{
	size_t idx;

	if (!p)
		return;

	for (idx = 0; p[idx]; ++idx)
		free(p[idx]);

	free(p);
}
198
199 static void free_hooklist(struct hook_execvpe **hooklist)
200 {
201 struct hook_execvpe *cur;
202
203 if (!hooklist)
204 return;
205
206 cur = *hooklist;
207 while (cur) {
208 free_oci_envp(cur->argv);
209 free_oci_envp(cur->envp);
210 free(cur->file);
211 free(cur++);
212 }
213 free(hooklist);
214 }
215
216 static void free_sysctl(void) {
217 struct sysctl_val *cur;
218
219 if (!opts.sysctl)
220 return;
221
222 cur = *opts.sysctl;
223
224 while (cur) {
225 free(cur->entry);
226 free(cur->value);
227 free(cur++);
228 }
229 free(opts.sysctl);
230 }
231
232 static void free_devices(void) {
233 struct mknod_args **cur;
234
235 if (!opts.devices)
236 return;
237
238 cur = opts.devices;
239
240 while (*cur) {
241 free((*cur)->path);
242 free(*(cur++));
243 }
244 free(opts.devices);
245 }
246
247 static void free_rlimits(void) {
248 int type;
249
250 for (type = 0; type < RLIM_NLIMITS; ++type)
251 free(opts.rlimits[type]);
252 }
253
254 static void free_opts(bool parent) {
255
256 free_library_search();
257 mount_free();
258 cgroups_free();
259
260 /* we need to keep argv, envp and seccomp filter in child */
261 if (parent) { /* parent-only */
262 if (opts.ociseccomp) {
263 free(opts.ociseccomp->filter);
264 free(opts.ociseccomp);
265 }
266
267 free_oci_envp(opts.jail_argv);
268 free_oci_envp(opts.envp);
269 }
270
271 free_rlimits();
272 free_sysctl();
273 free_devices();
274 free(opts.hostname);
275 free(opts.cwd);
276 free(opts.uidmap);
277 free(opts.gidmap);
278 free(opts.annotations);
279 free(opts.extroot);
280 free(opts.overlaydir);
281 free_hooklist(opts.hooks.createRuntime);
282 free_hooklist(opts.hooks.createContainer);
283 free_hooklist(opts.hooks.startContainer);
284 free_hooklist(opts.hooks.poststart);
285 free_hooklist(opts.hooks.poststop);
286 }
287
/*
 * Mount an overlay filesystem on top of jail_root, using
 * <overlaydir>/upper and <overlaydir>/work as upper and work directories.
 *
 * Returns 0 on success and -1 on any failure.
 */
static int mount_overlay(char *jail_root, char *overlaydir)
{
	static const char mount_fmt[] = "lowerdir=%s,upperdir=%s,workdir=%s";
	char *upper, *work, *mountdata, *etcdir, *resolvconf;
	int rc = -1;
	int fd;

	if (asprintf(&upper, "%s%s", overlaydir, "/upper") < 0)
		goto done;

	if (asprintf(&work, "%s%s", overlaydir, "/work") < 0)
		goto free_upper;

	if (asprintf(&mountdata, mount_fmt, jail_root, upper, work) < 0)
		goto free_work;

	if (mkdir_p(upper, 0755) || mkdir_p(work, 0755))
		goto free_mountdata;

	/*
	 * make sure /etc/resolv.conf exists in overlay and is owned by jail userns root
	 * this is to work-around a bug in overlayfs described in the overlayfs-userns
	 * patch:
	 * 3. modification of a file 'hithere' which is in l but not yet
	 * in u, and which is not owned by T, is not allowed, even if
	 * writes to u are allowed. This may be a bug in overlayfs,
	 * but it is safe behavior.
	 */
	if (asprintf(&etcdir, "%s/etc", upper) < 0)
		goto free_mountdata;

	if (mkdir_p(etcdir, 0755))
		goto free_etcdir;

	if (asprintf(&resolvconf, "%s/resolv.conf", etcdir) < 0)
		goto free_etcdir;

	/* creating the file is best effort; a failure is logged, not fatal */
	fd = creat(resolvconf, 0644);
	if (fd >= 0)
		close(fd);
	else if (errno != EEXIST)
		ERROR("creat(%s) failed: %m\n", resolvconf);

	DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, mountdata);

	if (!mount(jail_root, jail_root, "overlay", MS_NOATIME, mountdata))
		rc = 0;

	free(resolvconf);
free_etcdir:
	free(etcdir);
free_mountdata:
	free(mountdata);
free_work:
	free(work);
free_upper:
	free(upper);
done:
	return rc;
}
350
351 static void pass_console(int console_fd)
352 {
353 struct ubus_context *child_ctx = ubus_connect(NULL);
354 static struct blob_buf req;
355 uint32_t id;
356
357 if (!child_ctx)
358 return;
359
360 blob_buf_init(&req, 0);
361 blobmsg_add_string(&req, "name", opts.name);
362
363 if (ubus_lookup_id(child_ctx, "container", &id) ||
364 ubus_invoke_fd(child_ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd))
365 INFO("ubus request failed\n");
366 else
367 close(console_fd);
368
369 blob_buf_free(&req);
370 ubus_free(child_ctx);
371 }
372
373 static int create_dev_console(const char *jail_root)
374 {
375 char *console_fname;
376 char dev_console_path[PATH_MAX];
377 int slave_console_fd, dev_console_dummy;
378
379 /* Open UNIX/98 virtual console */
380 console_fd = posix_openpt(O_RDWR | O_NOCTTY);
381 if (console_fd < 0)
382 return -1;
383
384 console_fname = ptsname(console_fd);
385 DEBUG("got console fd %d and PTS client name %s\n", console_fd, console_fname);
386 if (!console_fname)
387 goto no_console;
388
389 grantpt(console_fd);
390 unlockpt(console_fd);
391
392 /* pass PTY master to procd */
393 pass_console(console_fd);
394
395 /* mount-bind PTY slave to /dev/console in jail */
396 snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root);
397 dev_console_dummy = creat(dev_console_path, 0620);
398 if (dev_console_dummy < 0)
399 goto no_console;
400
401 close(dev_console_dummy);
402
403 if (mount(console_fname, dev_console_path, "bind", MS_BIND, NULL))
404 goto no_console;
405
406 /* use PTY slave for stdio */
407 slave_console_fd = open(console_fname, O_RDWR); /* | O_NOCTTY */
408 if (slave_console_fd < 0)
409 goto no_console;
410
411 dup2(slave_console_fd, 0);
412 dup2(slave_console_fd, 1);
413 dup2(slave_console_fd, 2);
414 close(slave_console_fd);
415
416 INFO("using guest console %s\n", console_fname);
417
418 return 0;
419
420 no_console:
421 close(console_fd);
422 return 1;
423 }
424
/* non-zero while a forked hook process has not been reaped yet */
static int hook_running = 0;
/* exit status or terminating signal of the most recent hook */
static int hook_return_code = 0;
/* cursor into the NULL-terminated hook list currently being executed */
static struct hook_execvpe **current_hook = NULL;
typedef void (*hook_return_handler)(void);
/* invoked by run_hooklist() once the cursor reaches the end of the list */
static hook_return_handler hook_return_cb = NULL;

static void hook_process_timeout_cb(struct uloop_timeout *t);
/* timer that kills a hook which runs too long, see hook_process_timeout_cb() */
static struct uloop_timeout hook_process_timeout = {
	.cb = hook_process_timeout_cb,
};
435
436 static void run_hooklist(void);
437 static void hook_process_handler(struct uloop_process *c, int ret)
438 {
439 uloop_timeout_cancel(&hook_process_timeout);
440
441 if (WIFEXITED(ret)) {
442 hook_return_code = WEXITSTATUS(ret);
443 if (hook_return_code)
444 ERROR("hook (%d) exited with exit: %d\n", c->pid, hook_return_code);
445 else
446 DEBUG("hook (%d) exited with exit: %d\n", c->pid, hook_return_code);
447
448 } else {
449 hook_return_code = WTERMSIG(ret);
450 ERROR("hook (%d) exited with signal: %d\n", c->pid, hook_return_code);
451 }
452 hook_running = 0;
453 ++current_hook;
454 run_hooklist();
455 }
456
457 static struct uloop_process hook_process = {
458 .cb = hook_process_handler,
459 };
460
461 static void hook_process_timeout_cb(struct uloop_timeout *t)
462 {
463 DEBUG("hook process failed to stop, sending SIGKILL\n");
464 kill(hook_process.pid, SIGKILL);
465 }
466
467 static void run_hooklist(void)
468 {
469 struct hook_execvpe *hook = *current_hook;
470 struct stat s;
471
472 if (!hook)
473 return hook_return_cb();
474
475 DEBUG("executing hook %s\n", hook->file);
476
477 if (stat(hook->file, &s))
478 hook_process_handler(&hook_process, ENOENT);
479
480 if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
481 hook_process_handler(&hook_process, EPERM);
482
483 hook_running = 1;
484 hook_process.pid = fork();
485 if (hook_process.pid == 0) {
486 /* child */
487 execve(hook->file, hook->argv, hook->envp);
488 ERROR("execve error %m\n");
489 _exit(errno);
490 } else if (hook_process.pid < 0) {
491 /* fork error */
492 ERROR("hook fork error\n");
493 hook_running = 0;
494 hook_process_handler(&hook_process, errno);
495 }
496
497 /* parent */
498 uloop_process_add(&hook_process);
499
500 if (hook->timeout > 0)
501 uloop_timeout_set(&hook_process_timeout, 1000 * hook->timeout);
502
503 uloop_run();
504 if (hook_running) {
505 DEBUG("uloop interrupted, killing jail process\n");
506 kill(hook_process.pid, SIGTERM);
507 uloop_timeout_set(&hook_process_timeout, 1000);
508 uloop_run();
509 }
510 }
511
512 static void run_hooks(struct hook_execvpe **hooklist, hook_return_handler return_cb)
513 {
514 if (!hooklist)
515 return_cb();
516
517 current_hook = hooklist;
518 hook_return_cb = return_cb;
519
520 run_hooklist();
521 }
522
523 static int apply_sysctl(const char *jail_root)
524 {
525 struct sysctl_val **cur;
526 char *procdir, *fname;
527 int f;
528
529 if (!opts.sysctl)
530 return 0;
531
532 if (asprintf(&procdir, "%s/proc", jail_root) < 0)
533 return ENOMEM;
534
535 mkdir(procdir, 0700);
536 if (mount("proc", procdir, "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0))
537 return EPERM;
538
539 cur = opts.sysctl;
540
541 while (*cur) {
542 if (asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry) < 0)
543 return ENOMEM;
544
545 DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname);
546
547 f = open(fname, O_WRONLY);
548 if (f < 0) {
549 ERROR("sysctl: can't open %s\n", fname);
550 free(fname);
551 return errno;
552 }
553 if (write(f, (*cur)->value, strlen((*cur)->value)) < 0) {
554 ERROR("sysctl: write to %s\n", fname);
555 free(fname);
556 close(f);
557 return errno;
558 }
559
560 free(fname);
561 close(f);
562 ++cur;
563 }
564 umount(procdir);
565 rmdir(procdir);
566 free(procdir);
567
568 return 0;
569 }
570
/*
 * glibc defines makedev calling a function. make sure it's a pure macro,
 * it is used below inside a static initializer.
 */
#if defined(__GLIBC__)
#undef makedev
/* from musl's sys/sysmacros.h */
#define makedev(x,y) ( \
	(((x)&0xfffff000ULL) << 32) | \
	(((x)&0x00000fffULL) << 8) | \
	(((y)&0xffffff00ULL) << 12) | \
	(((y)&0x000000ffULL)) )
#endif

/*
 * Device nodes create_devices() always tries to create. The list ends with
 * an all-zero entry (path == NULL).
 */
static struct mknod_args default_devices[] = {
	{ .path = "/dev/null", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 3) },
	{ .path = "/dev/zero", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 5) },
	{ .path = "/dev/full", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 7) },
	{ .path = "/dev/random", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 8) },
	{ .path = "/dev/urandom", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 9) },
	{ .path = "/dev/tty", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP), .dev = makedev(5, 0), .gid = 5 },
	{ 0 },
};
591
592 static int create_devices(void)
593 {
594 struct mknod_args **cur, *curdef;
595 char *path, *tmp;
596 int ret;
597
598 if (!opts.devices)
599 goto only_default_devices;
600
601 cur = opts.devices;
602
603 while (*cur) {
604 path = (*cur)->path;
605 /* don't allow devices outside of /dev */
606 if (strncmp(path, "/dev", 4))
607 return EPERM;
608
609 /* make sure parent folder exists */
610 tmp = strrchr(path, '/');
611 if (!tmp)
612 return EINVAL;
613
614 *tmp = '\0';
615 if (strcmp(path, "/dev")) {
616 DEBUG("creating directory %s\n", path);
617
618 mkdir_p(path, 0755);
619 }
620 *tmp = '/';
621
622 DEBUG("creating %s (mode=%08o)\n", path, (*cur)->mode);
623
624 /* create device */
625 if (mknod(path, (*cur)->mode, (*cur)->dev))
626 return errno;
627
628 /* change owner, if needed */
629 if (((*cur)->uid || (*cur)->gid) &&
630 chown(path, (*cur)->uid, (*cur)->gid))
631 return errno;
632
633 ++cur;
634 }
635
636 only_default_devices:
637 curdef = default_devices;
638 while(curdef->path) {
639 DEBUG("creating %s (mode=%08o)\n", curdef->path, curdef->mode);
640 if (mknod(curdef->path, curdef->mode, curdef->dev)) {
641 ++curdef;
642 continue; /* may already exist, eg. due to a bind-mount */
643 }
644 if ((curdef->uid || curdef->gid) &&
645 chown(curdef->path, curdef->uid, curdef->gid))
646 return errno;
647
648 ++curdef;
649 }
650
651 /* Dev symbolic links as defined in OCI spec */
652 ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
653 if (ret < 0)
654 WARNING("symlink() failed to create link to /dev/pts/ptmx");
655
656 ret = symlink("/proc/self/fd", "/dev/fd");
657 if (ret < 0)
658 WARNING("symlink() failed to create link to /proc/self/fd");
659
660 ret = symlink("/proc/self/fd/0", "/dev/stdin");
661 if (ret < 0)
662 WARNING("symlink() failed to create link to /proc/self/fd/0");
663
664 ret = symlink("/proc/self/fd/1", "/dev/stdout");
665 if (ret < 0)
666 WARNING("symlink() failed to create link to /proc/self/fd/1");
667
668 ret = symlink("/proc/self/fd/2", "/dev/stderr");
669 if (ret < 0)
670 WARNING("symlink() failed to create link to /proc/self/fd/2");
671
672 return 0;
673 }
674
675 static char jail_root[] = "/tmp/ujail-XXXXXX";
676 static char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX";
677 static mode_t old_umask;
678 static void enter_jail_fs(void);
679 static int build_jail_fs(void)
680 {
681 char *overlaydir = NULL;
682 int ret;
683
684 old_umask = umask(0);
685
686 if (mkdtemp(jail_root) == NULL) {
687 ERROR("mkdtemp(%s) failed: %m\n", jail_root);
688 return -1;
689 }
690
691 if (apply_sysctl(jail_root)) {
692 ERROR("failed to apply sysctl values\n");
693 return -1;
694 }
695
696 /* oldroot can't be MS_SHARED else pivot_root() fails */
697 if (mount("none", "/", "none", MS_REC|MS_PRIVATE, NULL)) {
698 ERROR("private mount failed %m\n");
699 return -1;
700 }
701
702 if (opts.extroot) {
703 if (mount(opts.extroot, jail_root, "bind", MS_BIND, NULL)) {
704 ERROR("extroot mount failed %m\n");
705 return -1;
706 }
707 } else {
708 if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) {
709 ERROR("tmpfs mount failed %m\n");
710 return -1;
711 }
712 }
713
714 if (opts.tmpoverlaysize) {
715 char mountoptsstr[] = "mode=0755,size=XXXXXXXX";
716
717 snprintf(mountoptsstr, sizeof(mountoptsstr),
718 "mode=0755,size=%s", opts.tmpoverlaysize);
719 if (mkdtemp(tmpovdir) == NULL) {
720 ERROR("mkdtemp(%s) failed: %m\n", jail_root);
721 return -1;
722 }
723 if (mount("tmpfs", tmpovdir, "tmpfs", MS_NOATIME,
724 mountoptsstr)) {
725 ERROR("failed to mount tmpfs for overlay (size=%s)\n", opts.tmpoverlaysize);
726 return -1;
727 }
728 overlaydir = tmpovdir;
729 }
730
731 if (opts.overlaydir)
732 overlaydir = opts.overlaydir;
733
734 if (overlaydir) {
735 ret = mount_overlay(jail_root, overlaydir);
736 if (ret)
737 return ret;
738 }
739
740 if (chdir(jail_root)) {
741 ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root);
742 return -1;
743 }
744
745 if (mount_all(jail_root)) {
746 ERROR("mount_all() failed\n");
747 return -1;
748 }
749
750 if (opts.console)
751 create_dev_console(jail_root);
752
753 /* make sure /etc/resolv.conf exists if in new network namespace */
754 if (opts.namespace & CLONE_NEWNET) {
755 char jailetc[PATH_MAX], jaillink[PATH_MAX];
756
757 snprintf(jailetc, PATH_MAX, "%s/etc", jail_root);
758 mkdir_p(jailetc, 0755);
759 snprintf(jaillink, PATH_MAX, "%s/etc/resolv.conf", jail_root);
760 if (overlaydir)
761 unlink(jaillink);
762
763 ret = symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink);
764 if (ret < 0)
765 WARNING("symlink() failed to create link to ../dev/resolv.conf.d/resolv.conf.auto");
766 }
767
768 run_hooks(opts.hooks.createContainer, enter_jail_fs);
769
770 return 0;
771 }
772
773 static bool exit_from_child;
774 static void free_and_exit(int ret)
775 {
776 if (!exit_from_child && opts.ocibundle)
777 cgroups_free();
778
779 if (!exit_from_child && parent_ctx)
780 ubus_free(parent_ctx);
781
782 free_opts(!exit_from_child);
783
784 exit(ret);
785 }
786
787 static void post_jail_fs(void);
788 static void enter_jail_fs(void)
789 {
790 char dirbuf[sizeof(jail_root) + 4];
791
792 snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root);
793 mkdir(dirbuf, 0755);
794
795 if (pivot_root(jail_root, dirbuf) == -1) {
796 ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf);
797 free_and_exit(-1);
798 }
799 if (chdir("/")) {
800 ERROR("chdir(/) (after pivot_root) failed: %m\n");
801 free_and_exit(-1);
802 }
803
804 snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root);
805 umount2(dirbuf, MNT_DETACH);
806 rmdir(dirbuf);
807 if (opts.tmpoverlaysize) {
808 char tmpdirbuf[sizeof(tmpovdir) + 4];
809 snprintf(tmpdirbuf, sizeof(tmpdirbuf), "/old%s", tmpovdir);
810 umount2(tmpdirbuf, MNT_DETACH);
811 rmdir(tmpdirbuf);
812 }
813
814 umount2("/old", MNT_DETACH);
815 rmdir("/old");
816
817 if (create_devices()) {
818 ERROR("create_devices() failed\n");
819 free_and_exit(-1);
820 }
821 if (opts.ronly)
822 mount(NULL, "/", "bind", MS_REMOUNT | MS_BIND | MS_RDONLY, 0);
823
824 umask(old_umask);
825 post_jail_fs();
826 }
827
/*
 * Write a complete uid or gid mapping for a child process.
 *
 * @child_pid: process whose /proc/<pid>/{uid,gid}_map is written
 * @gidmap:    true to write gid_map, false to write uid_map
 * @mapstr:    mapping text, written verbatim
 *
 * Returns 0 on success and -1 on failure. dprintf() returns the number of
 * bytes written, so only a negative result is an error.
 */
static int write_uid_gid_map(pid_t child_pid, bool gidmap, char *mapstr)
{
	int map_file;
	char map_path[64];

	if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s",
		     child_pid, gidmap?"gid_map":"uid_map") < 0)
		return -1;

	if ((map_file = open(map_path, O_WRONLY)) < 0)
		return -1;

	if (dprintf(map_file, "%s", mapstr) < 0) {
		close(map_file);
		return -1;
	}

	close(map_file);
	return 0;
}
848
/*
 * Write a one-line mapping "0 <id> 1" to the uid_map or gid_map of a child
 * process.
 *
 * @child_pid: process whose map file is written
 * @gidmap:    true for gid_map, false for uid_map
 * @id:        second field of the mapping line
 *
 * Returns 0 on success and -1 on failure.
 */
static int write_single_uid_gid_map(pid_t child_pid, bool gidmap, int id)
{
	char path[64];
	const char *which = gidmap ? "gid_map" : "uid_map";
	int fd, rc;

	if (snprintf(path, sizeof(path), "/proc/%d/%s", child_pid, which) < 0)
		return -1;

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	rc = (dprintf(fd, "%d %d %d\n", 0, id, 1) < 0) ? -1 : 0;

	close(fd);
	return rc;
}
869
/*
 * Write "allow" or "deny" to /proc/<child_pid>/setgroups.
 *
 * Returns 0 on success and -1 on failure.
 */
static int write_setgroups(pid_t child_pid, bool allow)
{
	char path[64];
	const char *policy = allow ? "allow" : "deny";
	int fd, rc = 0;

	if (snprintf(path, sizeof(path), "/proc/%d/setgroups", child_pid) < 0)
		return -1;

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	if (dprintf(fd, "%s", policy) == -1)
		rc = -1;

	close(fd);
	return rc;
}
892
893 static void get_jail_user(int *user, int *user_gid, int *gr_gid)
894 {
895 struct passwd *p = NULL;
896 struct group *g = NULL;
897
898 if (opts.user) {
899 p = getpwnam(opts.user);
900 if (!p) {
901 ERROR("failed to get uid/gid for user %s: %d (%s)\n",
902 opts.user, errno, strerror(errno));
903 free_and_exit(EXIT_FAILURE);
904 }
905 *user = p->pw_uid;
906 *user_gid = p->pw_gid;
907 } else {
908 *user = -1;
909 *user_gid = -1;
910 }
911
912 if (opts.group) {
913 g = getgrnam(opts.group);
914 if (!g) {
915 ERROR("failed to get gid for group %s: %m\n", opts.group);
916 free_and_exit(EXIT_FAILURE);
917 }
918 *gr_gid = g->gr_gid;
919 } else {
920 *gr_gid = -1;
921 }
922 };
923
924 static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
925 {
926 if (opts.user && (user_gid != -1) && initgroups(opts.user, user_gid)) {
927 ERROR("failed to initgroups() for user %s: %m\n", opts.user);
928 free_and_exit(EXIT_FAILURE);
929 }
930
931 if ((gr_gid != -1) && setregid(gr_gid, gr_gid)) {
932 ERROR("failed to set group id %d: %m\n", gr_gid);
933 free_and_exit(EXIT_FAILURE);
934 }
935
936 if ((pw_uid != -1) && setreuid(pw_uid, pw_uid)) {
937 ERROR("failed to set user id %d: %m\n", pw_uid);
938 free_and_exit(EXIT_FAILURE);
939 }
940 }
941
942 static int apply_rlimits(void)
943 {
944 int resource;
945
946 for (resource = 0; resource < RLIM_NLIMITS; ++resource) {
947 if (opts.rlimits[resource])
948 DEBUG("applying limits to resource %u\n", resource);
949
950 if (opts.rlimits[resource] &&
951 setrlimit(resource, opts.rlimits[resource]))
952 return errno;
953 }
954
955 return 0;
956 }
957
958 #define MAX_ENVP 64
959 static char** build_envp(const char *seccomp, char **ocienvp)
960 {
961 static char *envp[MAX_ENVP];
962 static char preload_var[PATH_MAX];
963 static char seccomp_var[PATH_MAX];
964 static char seccomp_debug_var[20];
965 static char debug_var[] = "LD_DEBUG=all";
966 static char container_var[] = "container=ujail";
967 const char *preload_lib = find_lib("libpreload-seccomp.so");
968 char **addenv;
969
970 int count = 0;
971
972 if (seccomp && !preload_lib) {
973 ERROR("failed to add preload-lib to env\n");
974 return NULL;
975 }
976 if (seccomp) {
977 snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp);
978 envp[count++] = seccomp_var;
979 snprintf(seccomp_debug_var, sizeof(seccomp_debug_var), "SECCOMP_DEBUG=%2d", debug);
980 envp[count++] = seccomp_debug_var;
981 snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
982 envp[count++] = preload_var;
983 }
984
985 envp[count++] = container_var;
986
987 if (debug > 1)
988 envp[count++] = debug_var;
989
990 addenv = ocienvp;
991 while (addenv && *addenv) {
992 envp[count++] = *(addenv++);
993 if (count >= MAX_ENVP) {
994 ERROR("environment limited to %d extra records, truncating\n", MAX_ENVP);
995 break;
996 }
997 }
998 return envp;
999 }
1000
/* Print the command line help to stderr. */
static void usage(void)
{
	fprintf(stderr, "ujail <options> -- <binary> <params ...>\n");
	fprintf(stderr, " -d <num>\tshow debug log (increase num to increase verbosity)\n");
	fprintf(stderr, " -S <file>\tseccomp filter config\n");
	fprintf(stderr, " -C <file>\tcapabilities drop config\n");
	fprintf(stderr, " -c\t\tset PR_SET_NO_NEW_PRIVS\n");
	fprintf(stderr, " -n <name>\tthe name of the jail\n");
	fprintf(stderr, " -e <var>\timport environment variable\n");
	fprintf(stderr, "namespace jail options:\n");
	fprintf(stderr, " -h <hostname>\tchange the hostname of the jail\n");
	fprintf(stderr, " -N\t\tjail has network namespace\n");
	fprintf(stderr, " -f\t\tjail has user namespace\n");
	fprintf(stderr, " -F\t\tjail has cgroups namespace\n");
	fprintf(stderr, " -r <file>\treadonly files that should be staged\n");
	fprintf(stderr, " -w <file>\twriteable files that should be staged\n");
	fprintf(stderr, " -p\t\tjail has /proc\n");
	fprintf(stderr, " -s\t\tjail has /sys\n");
	fprintf(stderr, " -l\t\tjail has /dev/log\n");
	fprintf(stderr, " -u\t\tjail has a ubus socket\n");
	fprintf(stderr, " -U <name>\tuser to run jailed process\n");
	fprintf(stderr, " -G <name>\tgroup to run jailed process\n");
	fprintf(stderr, " -o\t\tremount jail root (/) read only\n");
	fprintf(stderr, " -R <dir>\texternal jail rootfs (system container)\n");
	fprintf(stderr, " -O <dir>\tdirectory for r/w overlayfs\n");
	fprintf(stderr, " -T <size>\tuse tmpfs r/w overlayfs with <size>\n");
	fprintf(stderr, " -E\t\tfail if jail cannot be setup\n");
	fprintf(stderr, " -y\t\tprovide jail console\n");
	fprintf(stderr, " -J <dir>\tcreate container from OCI bundle\n");
	fprintf(stderr, " -i\t\tstart container immediately\n");
	fprintf(stderr, " -P <pidfile>\tcreate <pidfile>\n");
	fprintf(stderr, "\nWarning: by default root inside the jail is the same\n"
		"and he has the same powers as root outside the jail,\n"
		"thus he can escape the jail and/or break stuff.\n"
		"Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n"
		"If you use none of the namespace jail options,\n"
		"ujail will not use namespace/build a jail,\n"
		"and will only drop capabilities/apply seccomp filter.\n\n");
}
1040
1041 static int* get_namespace_fd(const unsigned int nstype)
1042 {
1043 switch (nstype) {
1044 case CLONE_NEWPID:
1045 return &opts.setns.pid;
1046 case CLONE_NEWNET:
1047 return &opts.setns.net;
1048 case CLONE_NEWNS:
1049 return &opts.setns.ns;
1050 case CLONE_NEWIPC:
1051 return &opts.setns.ipc;
1052 case CLONE_NEWUTS:
1053 return &opts.setns.uts;
1054 case CLONE_NEWUSER:
1055 return &opts.setns.user;
1056 case CLONE_NEWCGROUP:
1057 return &opts.setns.cgroup;
1058 #ifdef CLONE_NEWTIME
1059 case CLONE_NEWTIME:
1060 return &opts.setns.time;
1061 #endif
1062 default:
1063 return NULL;
1064 }
1065 }
1066
/*
 * Join the namespace of the given type if a file descriptor for it was
 * configured, then close that descriptor.
 *
 * Returns 0 on success or when nothing was configured, otherwise the errno
 * reported by setns(). The error is saved before close() because close()
 * may overwrite errno.
 */
static int setns_open(unsigned long nstype)
{
	int *fd = get_namespace_fd(nstype);
	int err = 0;

	assert(fd != NULL);

	if (*fd < 0)
		return 0;

	if (setns(*fd, nstype) == -1)
		err = errno;

	close(*fd);
	return err;
}
1084
1085 static int jail_running = 0;
1086 static int jail_return_code = 0;
1087
1088 static void jail_process_timeout_cb(struct uloop_timeout *t);
1089 static struct uloop_timeout jail_process_timeout = {
1090 .cb = jail_process_timeout_cb,
1091 };
1092 static void poststop(void);
1093 static void jail_process_handler(struct uloop_process *c, int ret)
1094 {
1095 uloop_timeout_cancel(&jail_process_timeout);
1096 if (WIFEXITED(ret)) {
1097 jail_return_code = WEXITSTATUS(ret);
1098 INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
1099 } else {
1100 jail_return_code = WTERMSIG(ret);
1101 INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code);
1102 }
1103 jail_running = 0;
1104 poststop();
1105 }
1106
1107 static struct uloop_process jail_process = {
1108 .cb = jail_process_handler,
1109 };
1110
1111 static void jail_process_timeout_cb(struct uloop_timeout *t)
1112 {
1113 DEBUG("jail process failed to stop, sending SIGKILL\n");
1114 kill(jail_process.pid, SIGKILL);
1115 }
1116
1117 static void jail_handle_signal(int signo)
1118 {
1119 if (hook_running) {
1120 DEBUG("forwarding signal %d to the hook process\n", signo);
1121 kill(hook_process.pid, signo);
1122 /* set timeout to send SIGKILL hook process in case SIGTERM doesn't succeed */
1123 if (signo == SIGTERM)
1124 uloop_timeout_set(&hook_process_timeout, opts.term_timeout * 1000);
1125 }
1126
1127 if (jail_running) {
1128 DEBUG("forwarding signal %d to the jailed process\n", signo);
1129 kill(jail_process.pid, signo);
1130 /* set timeout to send SIGKILL jail process in case SIGTERM doesn't succeed */
1131 if (signo == SIGTERM)
1132 uloop_timeout_set(&jail_process_timeout, opts.term_timeout * 1000);
1133 }
1134 }
1135
1136 static void signals_init(void)
1137 {
1138 int i;
1139 sigset_t sigmask;
1140
1141 sigfillset(&sigmask);
1142 for (i = 0; i < _NSIG; i++) {
1143 struct sigaction s = { 0 };
1144
1145 if (!sigismember(&sigmask, i))
1146 continue;
1147 if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV) || (i == SIGSTOP) || (i == SIGKILL))
1148 continue;
1149
1150 s.sa_handler = jail_handle_signal;
1151 sigaction(i, &s, NULL);
1152 }
1153 }
1154
static void pre_exec_jail(struct uloop_timeout *t);
/* timer added by exec_jail() so that pre_exec_jail() runs from the uloop */
static struct uloop_timeout pre_exec_timeout = {
	.cb = pre_exec_jail,
};
1159
/*
 * Descriptors used for the parent/child handshake. The child closes [0] and
 * [3], writes to [1] and reads from [2].
 */
int pipes[4];

/*
 * Entry point of the child process.
 *
 * Joins the configured namespaces, performs the first handshake with the
 * parent (sends 'i', expects 'O'), handles the cgroup namespace, becomes
 * root inside a user namespace, sets the hostname and then continues in
 * pre_exec_jail() from the uloop.
 *
 * @arg: unused
 *
 * Returns EXIT_FAILURE if the handshake fails. Otherwise it does not return
 * normally: if uloop_run() comes back, the process exits with -1.
 */
static int exec_jail(void *arg)
{
	char buf[1];

	exit_from_child = true;
	prctl(PR_SET_SECUREBITS, 0);

	uloop_init();
	signals_init();

	close(pipes[0]);
	close(pipes[3]);

	/* NOTE(review): the results of these setns_open() calls are ignored */
	setns_open(CLONE_NEWUSER);
	setns_open(CLONE_NEWNET);
	setns_open(CLONE_NEWNS);
	setns_open(CLONE_NEWIPC);
	setns_open(CLONE_NEWUTS);

	/* tell the parent we got this far, then wait for its answer */
	buf[0] = 'i';
	if (write(pipes[1], buf, 1) < 1) {
		ERROR("can't write to parent\n");
		return EXIT_FAILURE;
	}
	close(pipes[1]);
	if (read(pipes[2], buf, 1) < 1) {
		ERROR("can't read from parent\n");
		return EXIT_FAILURE;
	}
	if (buf[0] != 'O') {
		ERROR("parent had an error, child exiting\n");
		return EXIT_FAILURE;
	}

	/* the cgroup namespace is only handled after the parent's answer */
	if (opts.namespace & CLONE_NEWCGROUP)
		unshare(CLONE_NEWCGROUP);

	setns_open(CLONE_NEWCGROUP);

	/* inside a user namespace: switch to uid/gid 0 and drop extra groups */
	if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) {
		if (setregid(0, 0) < 0) {
			ERROR("setgid\n");
			free_and_exit(EXIT_FAILURE);
		}
		if (setreuid(0, 0) < 0) {
			ERROR("setuid\n");
			free_and_exit(EXIT_FAILURE);
		}
		if (setgroups(0, NULL) < 0) {
			ERROR("setgroups\n");
			free_and_exit(EXIT_FAILURE);
		}
	}

	if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
	    && sethostname(opts.hostname, strlen(opts.hostname))) {
		ERROR("sethostname(%s) failed: %m\n", opts.hostname);
		free_and_exit(EXIT_FAILURE);
	}

	/* continue in pre_exec_jail() from within the event loop */
	uloop_timeout_add(&pre_exec_timeout);
	uloop_run();

	free_and_exit(-1);
	return -1;
}
1227
1228 static void pre_exec_jail(struct uloop_timeout *t)
1229 {
1230 if ((opts.namespace & CLONE_NEWNS) && build_jail_fs()) {
1231 ERROR("failed to build jail fs\n");
1232 free_and_exit(EXIT_FAILURE);
1233 } else {
1234 run_hooks(opts.hooks.createContainer, post_jail_fs);
1235 }
1236 }
1237
1238 static void post_start_hook(void);
1239 static void post_jail_fs(void)
1240 {
1241 char buf[1];
1242
1243 if (read(pipes[2], buf, 1) < 1) {
1244 ERROR("can't read from parent\n");
1245 free_and_exit(EXIT_FAILURE);
1246 }
1247 if (buf[0] != '!') {
1248 ERROR("parent had an error, child exiting\n");
1249 free_and_exit(EXIT_FAILURE);
1250 }
1251 close(pipes[2]);
1252
1253 run_hooks(opts.hooks.startContainer, post_start_hook);
1254 }
1255
1256 static void post_start_hook(void)
1257 {
1258 int pw_uid, pw_gid, gr_gid;
1259
1260 /*
1261 * make sure setuid/setgid won't drop capabilities in case capabilities
1262 * have been specified explicitely.
1263 */
1264 if (opts.capset.apply) {
1265 if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) {
1266 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
1267 free_and_exit(EXIT_FAILURE);
1268 }
1269 }
1270
1271 /* drop capabilities, retain those still needed to further setup jail */
1272 if (applyOCIcapabilities(opts.capset, (1LLU << CAP_SETGID) | (1LLU << CAP_SETUID) | (1LLU << CAP_SETPCAP)))
1273 free_and_exit(EXIT_FAILURE);
1274
1275 /* use either cmdline-supplied user/group or uid/gid from OCI spec */
1276 get_jail_user(&pw_uid, &pw_gid, &gr_gid);
1277 set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid);
1278
1279 if (opts.additional_gids &&
1280 (setgroups(opts.num_additional_gids, opts.additional_gids) < 0)) {
1281 ERROR("setgroups failed: %m\n");
1282 free_and_exit(EXIT_FAILURE);
1283 }
1284
1285 if (opts.set_umask)
1286 umask(opts.umask);
1287
1288 /* restore securebits back to normal (and lock them if not in userns) */
1289 if (opts.capset.apply) {
1290 if (prctl(PR_SET_SECUREBITS, (opts.namespace & CLONE_NEWUSER)?0:
1291 SECBIT_KEEP_CAPS_LOCKED|SECBIT_NO_SETUID_FIXUP_LOCKED|SECBIT_NOROOT_LOCKED)) {
1292 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
1293 free_and_exit(EXIT_FAILURE);
1294 }
1295 }
1296
1297 /* drop remaining capabilities to end up with specified sets */
1298 if (applyOCIcapabilities(opts.capset, 0))
1299 free_and_exit(EXIT_FAILURE);
1300
1301 if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
1302 ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
1303 free_and_exit(EXIT_FAILURE);
1304 }
1305
1306 char **envp = build_envp(opts.seccomp, opts.envp);
1307 if (!envp)
1308 free_and_exit(EXIT_FAILURE);
1309
1310 if (opts.cwd && chdir(opts.cwd))
1311 free_and_exit(EXIT_FAILURE);
1312
1313 if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp))
1314 free_and_exit(EXIT_FAILURE);
1315
1316 uloop_end();
1317 free_opts(false);
1318 INFO("exec-ing %s\n", *opts.jail_argv);
1319 if (opts.envp) /* respect PATH if potentially set in ENV */
1320 execvpe(*opts.jail_argv, opts.jail_argv, envp);
1321 else
1322 execve(*opts.jail_argv, opts.jail_argv, envp);
1323
1324 /* we get there only if execve fails */
1325 ERROR("failed to execve %s: %m\n", *opts.jail_argv);
1326 exit(EXIT_FAILURE);
1327 }
1328
/*
 * Open the namespace file /proc/<target_ns>/ns/<nstype> read-only.
 *
 * Returns the descriptor from open(), i.e. -1 on failure.
 */
int ns_open_pid(const char *nstype, const pid_t target_ns)
{
	char ns_file[PATH_MAX];
	int fd;

	snprintf(ns_file, sizeof(ns_file), "/proc/%u/ns/%s", target_ns, nstype);
	fd = open(ns_file, O_RDONLY);

	return fd;
}
1337
1338 static int parseOCIenvarray(struct blob_attr *msg, char ***envp)
1339 {
1340 struct blob_attr *cur;
1341 int sz = 0, rem;
1342
1343 blobmsg_for_each_attr(cur, msg, rem)
1344 ++sz;
1345
1346 if (sz > 0) {
1347 *envp = calloc(1 + sz, sizeof(char*));
1348 if (!(*envp))
1349 return ENOMEM;
1350 } else {
1351 *envp = NULL;
1352 return 0;
1353 }
1354
1355 sz = 0;
1356 blobmsg_for_each_attr(cur, msg, rem)
1357 (*envp)[sz++] = strdup(blobmsg_get_string(cur));
1358
1359 if (sz)
1360 (*envp)[sz] = NULL;
1361
1362 return 0;
1363 }
1364
1365 enum {
1366 OCI_ROOT_PATH,
1367 OCI_ROOT_READONLY,
1368 __OCI_ROOT_MAX,
1369 };
1370
1371 static const struct blobmsg_policy oci_root_policy[] = {
1372 [OCI_ROOT_PATH] = { "path", BLOBMSG_TYPE_STRING },
1373 [OCI_ROOT_READONLY] = { "readonly", BLOBMSG_TYPE_BOOL },
1374 };
1375
1376 static int parseOCIroot(const char *jsonfile, struct blob_attr *msg)
1377 {
1378 char extroot[PATH_MAX] = { 0 };
1379 struct blob_attr *tb[__OCI_ROOT_MAX];
1380 char *cur;
1381 char *root_path;
1382
1383 blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1384
1385 if (!tb[OCI_ROOT_PATH])
1386 return ENODATA;
1387
1388 root_path = blobmsg_get_string(tb[OCI_ROOT_PATH]);
1389
1390 /* prepend bundle directory in case of relative paths */
1391 if (root_path[0] != '/') {
1392 strncpy(extroot, jsonfile, PATH_MAX - 1);
1393
1394 cur = strrchr(extroot, '/');
1395
1396 if (!cur)
1397 return ENOTDIR;
1398
1399 *(++cur) = '\0';
1400 }
1401
1402 strncat(extroot, root_path, PATH_MAX - (strlen(extroot) + 1));
1403
1404 /* follow symbolic link(s) */
1405 opts.extroot = realpath(extroot, NULL);
1406 if (!opts.extroot)
1407 return errno;
1408
1409 if (tb[OCI_ROOT_READONLY])
1410 opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]);
1411
1412 return 0;
1413 }
1414
1415
1416 enum {
1417 OCI_HOOK_PATH,
1418 OCI_HOOK_ARGS,
1419 OCI_HOOK_ENV,
1420 OCI_HOOK_TIMEOUT,
1421 __OCI_HOOK_MAX,
1422 };
1423
1424 static const struct blobmsg_policy oci_hook_policy[] = {
1425 [OCI_HOOK_PATH] = { "path", BLOBMSG_TYPE_STRING },
1426 [OCI_HOOK_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
1427 [OCI_HOOK_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
1428 [OCI_HOOK_TIMEOUT] = { "timeout", BLOBMSG_TYPE_INT32 },
1429 };
1430
1431
1432 static int parseOCIhook(struct hook_execvpe ***hooklist, struct blob_attr *msg)
1433 {
1434 struct blob_attr *tb[__OCI_HOOK_MAX];
1435 struct blob_attr *cur;
1436 int rem, ret = 0;
1437 int idx = 0;
1438
1439 blobmsg_for_each_attr(cur, msg, rem)
1440 ++idx;
1441
1442 if (!idx)
1443 return 0;
1444
1445 *hooklist = calloc(idx + 1, sizeof(struct hook_execvpe *));
1446 idx = 0;
1447
1448 if (!(*hooklist))
1449 return ENOMEM;
1450
1451 blobmsg_for_each_attr(cur, msg, rem) {
1452 blobmsg_parse(oci_hook_policy, __OCI_HOOK_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
1453
1454 if (!tb[OCI_HOOK_PATH]) {
1455 ret = EINVAL;
1456 goto errout;
1457 }
1458
1459 (*hooklist)[idx] = calloc(1, sizeof(struct hook_execvpe));
1460 if (tb[OCI_HOOK_ARGS]) {
1461 ret = parseOCIenvarray(tb[OCI_HOOK_ARGS], &((*hooklist)[idx]->argv));
1462 if (ret)
1463 goto errout;
1464 } else {
1465 (*hooklist)[idx]->argv = calloc(2, sizeof(char *));
1466 ((*hooklist)[idx]->argv)[0] = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH]));
1467 ((*hooklist)[idx]->argv)[1] = NULL;
1468 };
1469
1470
1471 if (tb[OCI_HOOK_ENV]) {
1472 ret = parseOCIenvarray(tb[OCI_HOOK_ENV], &((*hooklist)[idx]->envp));
1473 if (ret)
1474 goto errout;
1475 }
1476
1477 if (tb[OCI_HOOK_TIMEOUT])
1478 (*hooklist)[idx]->timeout = blobmsg_get_u32(tb[OCI_HOOK_TIMEOUT]);
1479
1480 (*hooklist)[idx]->file = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH]));
1481
1482 ++idx;
1483 }
1484
1485 (*hooklist)[idx] = NULL;
1486
1487 DEBUG("added %d hooks\n", idx);
1488
1489 return 0;
1490
1491 errout:
1492 free_hooklist(*hooklist);
1493 *hooklist = NULL;
1494
1495 return ret;
1496 };
1497
1498
1499 enum {
1500 OCI_HOOKS_PRESTART,
1501 OCI_HOOKS_CREATERUNTIME,
1502 OCI_HOOKS_CREATECONTAINER,
1503 OCI_HOOKS_STARTCONTAINER,
1504 OCI_HOOKS_POSTSTART,
1505 OCI_HOOKS_POSTSTOP,
1506 __OCI_HOOKS_MAX,
1507 };
1508
1509 static const struct blobmsg_policy oci_hooks_policy[] = {
1510 [OCI_HOOKS_PRESTART] = { "prestart", BLOBMSG_TYPE_ARRAY },
1511 [OCI_HOOKS_CREATERUNTIME] = { "createRuntime", BLOBMSG_TYPE_ARRAY },
1512 [OCI_HOOKS_CREATECONTAINER] = { "createContainer", BLOBMSG_TYPE_ARRAY },
1513 [OCI_HOOKS_STARTCONTAINER] = { "startContainer", BLOBMSG_TYPE_ARRAY },
1514 [OCI_HOOKS_POSTSTART] = { "poststart", BLOBMSG_TYPE_ARRAY },
1515 [OCI_HOOKS_POSTSTOP] = { "poststop", BLOBMSG_TYPE_ARRAY },
1516 };
1517
1518 static int parseOCIhooks(struct blob_attr *msg)
1519 {
1520 struct blob_attr *tb[__OCI_HOOKS_MAX];
1521 int ret;
1522
1523 blobmsg_parse(oci_hooks_policy, __OCI_HOOKS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1524
1525 if (tb[OCI_HOOKS_PRESTART])
1526 INFO("warning: ignoring deprecated prestart hook\n");
1527
1528 if (tb[OCI_HOOKS_CREATERUNTIME]) {
1529 ret = parseOCIhook(&opts.hooks.createRuntime, tb[OCI_HOOKS_CREATERUNTIME]);
1530 if (ret)
1531 return ret;
1532 }
1533
1534 if (tb[OCI_HOOKS_CREATECONTAINER]) {
1535 ret = parseOCIhook(&opts.hooks.createContainer, tb[OCI_HOOKS_CREATECONTAINER]);
1536 if (ret)
1537 goto out_createruntime;
1538 }
1539
1540 if (tb[OCI_HOOKS_STARTCONTAINER]) {
1541 ret = parseOCIhook(&opts.hooks.startContainer, tb[OCI_HOOKS_STARTCONTAINER]);
1542 if (ret)
1543 goto out_createcontainer;
1544 }
1545
1546 if (tb[OCI_HOOKS_POSTSTART]) {
1547 ret = parseOCIhook(&opts.hooks.poststart, tb[OCI_HOOKS_POSTSTART]);
1548 if (ret)
1549 goto out_startcontainer;
1550 }
1551
1552 if (tb[OCI_HOOKS_POSTSTOP]) {
1553 ret = parseOCIhook(&opts.hooks.poststop, tb[OCI_HOOKS_POSTSTOP]);
1554 if (ret)
1555 goto out_poststart;
1556 }
1557
1558 return 0;
1559
1560 out_poststart:
1561 free_hooklist(opts.hooks.poststart);
1562 out_startcontainer:
1563 free_hooklist(opts.hooks.startContainer);
1564 out_createcontainer:
1565 free_hooklist(opts.hooks.createContainer);
1566 out_createruntime:
1567 free_hooklist(opts.hooks.createRuntime);
1568
1569 return ret;
1570 };
1571
1572
1573 enum {
1574 OCI_PROCESS_USER_UID,
1575 OCI_PROCESS_USER_GID,
1576 OCI_PROCESS_USER_UMASK,
1577 OCI_PROCESS_USER_ADDITIONALGIDS,
1578 __OCI_PROCESS_USER_MAX,
1579 };
1580
1581 static const struct blobmsg_policy oci_process_user_policy[] = {
1582 [OCI_PROCESS_USER_UID] = { "uid", BLOBMSG_TYPE_INT32 },
1583 [OCI_PROCESS_USER_GID] = { "gid", BLOBMSG_TYPE_INT32 },
1584 [OCI_PROCESS_USER_UMASK] = { "umask", BLOBMSG_TYPE_INT32 },
1585 [OCI_PROCESS_USER_ADDITIONALGIDS] = { "additionalGids", BLOBMSG_TYPE_ARRAY },
1586 };
1587
1588 static int parseOCIprocessuser(struct blob_attr *msg) {
1589 struct blob_attr *tb[__OCI_PROCESS_USER_MAX];
1590 struct blob_attr *cur;
1591 int rem;
1592 int has_gid = 0;
1593
1594 blobmsg_parse(oci_process_user_policy, __OCI_PROCESS_USER_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1595
1596 if (tb[OCI_PROCESS_USER_UID])
1597 opts.pw_uid = blobmsg_get_u32(tb[OCI_PROCESS_USER_UID]);
1598
1599 if (tb[OCI_PROCESS_USER_GID]) {
1600 opts.pw_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]);
1601 opts.gr_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]);
1602 has_gid = 1;
1603 }
1604
1605 if (tb[OCI_PROCESS_USER_ADDITIONALGIDS]) {
1606 size_t gidcnt = 0;
1607
1608 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) {
1609 ++gidcnt;
1610 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid))
1611 continue;
1612 }
1613
1614 if (gidcnt) {
1615 opts.additional_gids = calloc(gidcnt + has_gid, sizeof(gid_t));
1616 gidcnt = 0;
1617
1618 /* always add primary GID to set of GIDs if set */
1619 if (has_gid)
1620 opts.additional_gids[gidcnt++] = opts.gr_gid;
1621
1622 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) {
1623 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid))
1624 continue;
1625 opts.additional_gids[gidcnt++] = blobmsg_get_u32(cur);
1626 }
1627 opts.num_additional_gids = gidcnt;
1628 }
1629 DEBUG("read %zu additional groups\n", gidcnt);
1630 }
1631
1632 if (tb[OCI_PROCESS_USER_UMASK]) {
1633 opts.umask = blobmsg_get_u32(tb[OCI_PROCESS_USER_UMASK]);
1634 opts.set_umask = true;
1635 }
1636
1637 return 0;
1638 }
1639
1640 enum {
1641 OCI_PROCESS_RLIMIT_TYPE,
1642 OCI_PROCESS_RLIMIT_SOFT,
1643 OCI_PROCESS_RLIMIT_HARD,
1644 __OCI_PROCESS_RLIMIT_MAX,
1645 };
1646
1647 static const struct blobmsg_policy oci_process_rlimit_policy[] = {
1648 [OCI_PROCESS_RLIMIT_TYPE] = { "type", BLOBMSG_TYPE_STRING },
1649 [OCI_PROCESS_RLIMIT_SOFT] = { "soft", BLOBMSG_CAST_INT64 },
1650 [OCI_PROCESS_RLIMIT_HARD] = { "hard", BLOBMSG_CAST_INT64 },
1651 };
1652
/* resource names without their "RLIMIT_" prefix, from manpage GETRLIMIT(2) */
static const char* const rlimit_names[RLIM_NLIMITS] = {
	[RLIMIT_AS]         = "AS",
	[RLIMIT_CORE]       = "CORE",
	[RLIMIT_CPU]        = "CPU",
	[RLIMIT_DATA]       = "DATA",
	[RLIMIT_FSIZE]      = "FSIZE",
	[RLIMIT_LOCKS]      = "LOCKS",
	[RLIMIT_MEMLOCK]    = "MEMLOCK",
	[RLIMIT_MSGQUEUE]   = "MSGQUEUE",
	[RLIMIT_NICE]       = "NICE",
	[RLIMIT_NOFILE]     = "NOFILE",
	[RLIMIT_NPROC]      = "NPROC",
	[RLIMIT_RSS]        = "RSS",
	[RLIMIT_RTPRIO]     = "RTPRIO",
	[RLIMIT_RTTIME]     = "RTTIME",
	[RLIMIT_SIGPENDING] = "SIGPENDING",
	[RLIMIT_STACK]      = "STACK",
};

/*
 * Map a name of the form "RLIMIT_<NAME>" to its resource number.
 *
 * Returns the index into rlimit_names whose entry equals <NAME>, or -1
 * when the "RLIMIT_" prefix is missing or the name is unknown. The
 * comparison is case-sensitive.
 */
static int resolve_rlimit(char *type) {
	static const char prefix[] = "RLIMIT_";
	const size_t prefix_len = sizeof(prefix) - 1;
	const char *name;
	unsigned int idx;

	/* without the prefix no table entry can ever match */
	if (strncmp(prefix, type, prefix_len) != 0)
		return -1;

	name = type + prefix_len;
	for (idx = 0; idx < RLIM_NLIMITS; ++idx) {
		if (!rlimit_names[idx])
			continue;
		if (strcmp(rlimit_names[idx], name) == 0)
			return idx;
	}

	return -1;
}
1684
1685
1686 static int parseOCIrlimit(struct blob_attr *msg)
1687 {
1688 struct blob_attr *tb[__OCI_PROCESS_RLIMIT_MAX];
1689 int limtype = -1;
1690 struct rlimit *curlim;
1691
1692 blobmsg_parse(oci_process_rlimit_policy, __OCI_PROCESS_RLIMIT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1693
1694 if (!tb[OCI_PROCESS_RLIMIT_TYPE] ||
1695 !tb[OCI_PROCESS_RLIMIT_SOFT] ||
1696 !tb[OCI_PROCESS_RLIMIT_HARD])
1697 return ENODATA;
1698
1699 limtype = resolve_rlimit(blobmsg_get_string(tb[OCI_PROCESS_RLIMIT_TYPE]));
1700
1701 if (limtype < 0)
1702 return EINVAL;
1703
1704 if (opts.rlimits[limtype])
1705 return ENOTUNIQ;
1706
1707 curlim = malloc(sizeof(struct rlimit));
1708 curlim->rlim_cur = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_SOFT]);
1709 curlim->rlim_max = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_HARD]);
1710
1711 opts.rlimits[limtype] = curlim;
1712
1713 return 0;
1714 };
1715
1716 enum {
1717 OCI_PROCESS_ARGS,
1718 OCI_PROCESS_CAPABILITIES,
1719 OCI_PROCESS_CWD,
1720 OCI_PROCESS_ENV,
1721 OCI_PROCESS_OOMSCOREADJ,
1722 OCI_PROCESS_NONEWPRIVILEGES,
1723 OCI_PROCESS_RLIMITS,
1724 OCI_PROCESS_TERMINAL,
1725 OCI_PROCESS_USER,
1726 __OCI_PROCESS_MAX,
1727 };
1728
1729 static const struct blobmsg_policy oci_process_policy[] = {
1730 [OCI_PROCESS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
1731 [OCI_PROCESS_CAPABILITIES] = { "capabilities", BLOBMSG_TYPE_TABLE },
1732 [OCI_PROCESS_CWD] = { "cwd", BLOBMSG_TYPE_STRING },
1733 [OCI_PROCESS_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
1734 [OCI_PROCESS_OOMSCOREADJ] = { "oomScoreAdj", BLOBMSG_TYPE_INT32 },
1735 [OCI_PROCESS_NONEWPRIVILEGES] = { "noNewPrivileges", BLOBMSG_TYPE_BOOL },
1736 [OCI_PROCESS_RLIMITS] = { "rlimits", BLOBMSG_TYPE_ARRAY },
1737 [OCI_PROCESS_TERMINAL] = { "terminal", BLOBMSG_TYPE_BOOL },
1738 [OCI_PROCESS_USER] = { "user", BLOBMSG_TYPE_TABLE },
1739 };
1740
1741
1742 static int parseOCIprocess(struct blob_attr *msg)
1743 {
1744 struct blob_attr *tb[__OCI_PROCESS_MAX], *cur;
1745 int rem, res;
1746
1747 blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1748
1749 if (!tb[OCI_PROCESS_ARGS])
1750 return ENOENT;
1751
1752 res = parseOCIenvarray(tb[OCI_PROCESS_ARGS], &opts.jail_argv);
1753 if (res)
1754 return res;
1755
1756 if (tb[OCI_PROCESS_TERMINAL])
1757 opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]);
1758
1759 if (tb[OCI_PROCESS_NONEWPRIVILEGES])
1760 opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]);
1761
1762 if (tb[OCI_PROCESS_CWD])
1763 opts.cwd = strdup(blobmsg_get_string(tb[OCI_PROCESS_CWD]));
1764
1765 if (tb[OCI_PROCESS_ENV]) {
1766 res = parseOCIenvarray(tb[OCI_PROCESS_ENV], &opts.envp);
1767 if (res)
1768 return res;
1769 }
1770
1771 if (tb[OCI_PROCESS_USER] && (res = parseOCIprocessuser(tb[OCI_PROCESS_USER])))
1772 return res;
1773
1774 if (tb[OCI_PROCESS_CAPABILITIES] &&
1775 (res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES])))
1776 return res;
1777
1778 if (tb[OCI_PROCESS_RLIMITS]) {
1779 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_RLIMITS], rem) {
1780 res = parseOCIrlimit(cur);
1781 if (res)
1782 return res;
1783 }
1784 }
1785
1786 if (tb[OCI_PROCESS_OOMSCOREADJ]) {
1787 opts.oom_score_adj = blobmsg_get_u32(tb[OCI_PROCESS_OOMSCOREADJ]);
1788 opts.set_oom_score_adj = true;
1789 }
1790
1791 return 0;
1792 }
1793
1794 enum {
1795 OCI_LINUX_NAMESPACE_TYPE,
1796 OCI_LINUX_NAMESPACE_PATH,
1797 __OCI_LINUX_NAMESPACE_MAX,
1798 };
1799
1800 static const struct blobmsg_policy oci_linux_namespace_policy[] = {
1801 [OCI_LINUX_NAMESPACE_TYPE] = { "type", BLOBMSG_TYPE_STRING },
1802 [OCI_LINUX_NAMESPACE_PATH] = { "path", BLOBMSG_TYPE_STRING },
1803 };
1804
/*
 * Translate a namespace type name into its CLONE_NEW* flag.
 *
 * Both "network" and "net" map to CLONE_NEWNET; "time" is only known
 * when CLONE_NEWTIME is defined at build time. Matching is exact and
 * case-sensitive. Returns 0 for an unknown name.
 */
static int resolve_nstype(char *type) {
	static const struct {
		const char *name;
		int flag;
	} known_types[] = {
		{ "pid",     CLONE_NEWPID },
		{ "network", CLONE_NEWNET },
		{ "net",     CLONE_NEWNET },
		{ "mount",   CLONE_NEWNS },
		{ "ipc",     CLONE_NEWIPC },
		{ "uts",     CLONE_NEWUTS },
		{ "user",    CLONE_NEWUSER },
		{ "cgroup",  CLONE_NEWCGROUP },
#ifdef CLONE_NEWTIME
		{ "time",    CLONE_NEWTIME },
#endif
	};
	size_t i;

	for (i = 0; i < sizeof(known_types) / sizeof(known_types[0]); ++i) {
		if (strcmp(known_types[i].name, type) == 0)
			return known_types[i].flag;
	}

	return 0;
}
1829
1830 static int parseOCIlinuxns(struct blob_attr *msg)
1831 {
1832 struct blob_attr *tb[__OCI_LINUX_NAMESPACE_MAX];
1833 int nstype;
1834 int *setns;
1835 int fd;
1836
1837 blobmsg_parse(oci_linux_namespace_policy, __OCI_LINUX_NAMESPACE_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1838
1839 if (!tb[OCI_LINUX_NAMESPACE_TYPE])
1840 return EINVAL;
1841
1842 nstype = resolve_nstype(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]));
1843 if (!nstype)
1844 return EINVAL;
1845
1846 if (opts.namespace & nstype)
1847 return ENOTUNIQ;
1848
1849 setns = get_namespace_fd(nstype);
1850
1851 if (!setns)
1852 return EFAULT;
1853
1854 if (*setns != -1)
1855 return ENOTUNIQ;
1856
1857 if (tb[OCI_LINUX_NAMESPACE_PATH]) {
1858 DEBUG("opening existing %s namespace from path %s\n",
1859 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]),
1860 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]));
1861
1862 fd = open(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]), O_RDONLY);
1863 if (fd < 0)
1864 return errno?:ESTALE;
1865
1866 if (ioctl(fd, NS_GET_NSTYPE) != nstype) {
1867 close(fd);
1868 return EINVAL;
1869 }
1870
1871 DEBUG("opened existing %s namespace got filehandler %u\n",
1872 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]),
1873 fd);
1874
1875 *setns = fd;
1876 } else {
1877 opts.namespace |= nstype;
1878 }
1879
1880 return 0;
1881 }
1882
1883 /*
1884 * join namespace of existing PID
1885 * The string argument is the reference PID followed by ':' and a
1886 * ',' separated list of namespaces to to join.
1887 */
1888 static int jail_join_ns(char *arg)
1889 {
1890 pid_t pid;
1891 int fd;
1892 int nstype;
1893 char *tmp, *etmp, *nspath;
1894 int *setns;
1895
1896 tmp = strchr(arg, ':');
1897 if (!tmp)
1898 return EINVAL;
1899
1900 *tmp = '\0';
1901 pid = atoi(arg);
1902
1903 do {
1904 ++tmp;
1905 etmp = strchr(tmp, ',');
1906 if (etmp)
1907 *etmp = '\0';
1908
1909 nstype = resolve_nstype(tmp);
1910 if (!nstype)
1911 return EINVAL;
1912
1913 if (opts.namespace & nstype)
1914 return ENOTUNIQ;
1915
1916 setns = get_namespace_fd(nstype);
1917
1918 if (!setns)
1919 return EFAULT;
1920
1921 if (*setns != -1)
1922 return ENOTUNIQ;
1923
1924 if (asprintf(&nspath, "/proc/%d/ns/%s", pid, tmp) < 0)
1925 return ENOMEM;
1926
1927 fd = open(nspath, O_RDONLY);
1928 free(nspath);
1929
1930 if (fd < 0)
1931 return errno?:ESTALE;
1932
1933 *setns = fd;
1934
1935 if (etmp)
1936 tmp = etmp;
1937 else
1938 tmp = NULL;
1939 } while (tmp);
1940
1941 return 0;
1942 }
1943
1944 static void get_jail_root_user(bool is_gidmap, uint32_t container_id, uint32_t host_id, uint32_t size)
1945 {
1946 if (container_id == 0 && size >= 1)
1947 if (!is_gidmap)
1948 opts.root_map_uid = host_id;
1949 }
1950
1951 enum {
1952 OCI_LINUX_UIDGIDMAP_CONTAINERID,
1953 OCI_LINUX_UIDGIDMAP_HOSTID,
1954 OCI_LINUX_UIDGIDMAP_SIZE,
1955 __OCI_LINUX_UIDGIDMAP_MAX,
1956 };
1957
1958 static const struct blobmsg_policy oci_linux_uidgidmap_policy[] = {
1959 [OCI_LINUX_UIDGIDMAP_CONTAINERID] = { "containerID", BLOBMSG_TYPE_INT32 },
1960 [OCI_LINUX_UIDGIDMAP_HOSTID] = { "hostID", BLOBMSG_TYPE_INT32 },
1961 [OCI_LINUX_UIDGIDMAP_SIZE] = { "size", BLOBMSG_TYPE_INT32 },
1962 };
1963
1964 static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap)
1965 {
1966 struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX];
1967 struct blob_attr *cur;
1968 int rem;
1969 char *map;
1970 size_t len, pos, totallen = 0;
1971
1972 blobmsg_for_each_attr(cur, msg, rem) {
1973 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
1974
1975 if (!tb[OCI_LINUX_UIDGIDMAP_CONTAINERID] ||
1976 !tb[OCI_LINUX_UIDGIDMAP_HOSTID] ||
1977 !tb[OCI_LINUX_UIDGIDMAP_SIZE])
1978 return EINVAL;
1979
1980 /* count length */
1981 totallen += snprintf(NULL, 0, "%d %d %d\n",
1982 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
1983 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
1984 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
1985 }
1986
1987 /* allocate combined mapping string */
1988 map = malloc(totallen + 1);
1989 if (!map)
1990 return ENOMEM;
1991
1992 pos = 0;
1993 blobmsg_for_each_attr(cur, msg, rem) {
1994 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
1995
1996 get_jail_root_user(is_gidmap, blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
1997 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
1998 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
1999
2000 /* write mapping line into pre-allocated string */
2001 len = snprintf(&map[pos], totallen + 1, "%d %d %d\n",
2002 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
2003 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
2004 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
2005 pos += len;
2006 totallen -= len;
2007 }
2008
2009 assert(totallen == 0);
2010
2011 if (is_gidmap)
2012 opts.gidmap = map;
2013 else
2014 opts.uidmap = map;
2015
2016 return 0;
2017 }
2018
2019 enum {
2020 OCI_DEVICES_TYPE,
2021 OCI_DEVICES_PATH,
2022 OCI_DEVICES_MAJOR,
2023 OCI_DEVICES_MINOR,
2024 OCI_DEVICES_FILEMODE,
2025 OCI_DEVICES_UID,
2026 OCI_DEVICES_GID,
2027 __OCI_DEVICES_MAX,
2028 };
2029
2030 static const struct blobmsg_policy oci_devices_policy[] = {
2031 [OCI_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING },
2032 [OCI_DEVICES_PATH] = { "path", BLOBMSG_TYPE_STRING },
2033 [OCI_DEVICES_MAJOR] = { "major", BLOBMSG_TYPE_INT32 },
2034 [OCI_DEVICES_MINOR] = { "minor", BLOBMSG_TYPE_INT32 },
2035 [OCI_DEVICES_FILEMODE] = { "fileMode", BLOBMSG_TYPE_INT32 },
2036 [OCI_DEVICES_UID] = { "uid", BLOBMSG_TYPE_INT32 },
2037 [OCI_DEVICES_GID] = { "uid", BLOBMSG_TYPE_INT32 },
2038 };
2039
/*
 * Translate an OCI device type string into a file type for mknod():
 * "c" and "u" give S_IFCHR, "b" gives S_IFBLK, "p" gives S_IFIFO.
 * Any other string (including the empty one) yields 0.
 */
static mode_t resolve_devtype(char *tstr)
{
	/* every known type is exactly one character long */
	if (tstr[0] == '\0' || tstr[1] != '\0')
		return 0;

	switch (tstr[0]) {
	case 'c':
	case 'u':
		return S_IFCHR;
	case 'b':
		return S_IFBLK;
	case 'p':
		return S_IFIFO;
	default:
		return 0;
	}
}
2052
2053 static int parseOCIdevices(struct blob_attr *msg)
2054 {
2055 struct blob_attr *tb[__OCI_DEVICES_MAX];
2056 struct blob_attr *cur;
2057 int rem;
2058 size_t cnt = 0;
2059 struct mknod_args *tmp;
2060
2061 blobmsg_for_each_attr(cur, msg, rem)
2062 ++cnt;
2063
2064 opts.devices = calloc(cnt + 1, sizeof(struct mknod_args *));
2065
2066 cnt = 0;
2067 blobmsg_for_each_attr(cur, msg, rem) {
2068 blobmsg_parse(oci_devices_policy, __OCI_DEVICES_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
2069 if (!tb[OCI_DEVICES_TYPE] ||
2070 !tb[OCI_DEVICES_PATH])
2071 return ENODATA;
2072
2073 tmp = calloc(1, sizeof(struct mknod_args));
2074 if (!tmp)
2075 return ENOMEM;
2076
2077 tmp->mode = resolve_devtype(blobmsg_get_string(tb[OCI_DEVICES_TYPE]));
2078 if (!tmp->mode) {
2079 free(tmp);
2080 return EINVAL;
2081 }
2082
2083 if (tmp->mode != S_IFIFO) {
2084 if (!tb[OCI_DEVICES_MAJOR] || !tb[OCI_DEVICES_MINOR]) {
2085 free(tmp);
2086 return ENODATA;
2087 }
2088
2089 tmp->dev = makedev(blobmsg_get_u32(tb[OCI_DEVICES_MAJOR]),
2090 blobmsg_get_u32(tb[OCI_DEVICES_MINOR]));
2091 }
2092
2093 if (tb[OCI_DEVICES_FILEMODE]) {
2094 if (~(S_IRWXU|S_IRWXG|S_IRWXO) & blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE])) {
2095 free(tmp);
2096 return EINVAL;
2097 }
2098
2099 tmp->mode |= blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]);
2100 } else {
2101 tmp->mode |= (S_IRUSR|S_IWUSR); /* 0600 */
2102 }
2103
2104 tmp->path = strdup(blobmsg_get_string(tb[OCI_DEVICES_PATH]));
2105
2106 if (tb[OCI_DEVICES_UID])
2107 tmp->uid = blobmsg_get_u32(tb[OCI_DEVICES_UID]);
2108 else
2109 tmp->uid = -1;
2110
2111 if (tb[OCI_DEVICES_GID])
2112 tmp->gid = blobmsg_get_u32(tb[OCI_DEVICES_GID]);
2113 else
2114 tmp->gid = -1;
2115
2116 DEBUG("read device %s (%s)\n", blobmsg_get_string(tb[OCI_DEVICES_PATH]), blobmsg_get_string(tb[OCI_DEVICES_TYPE]));
2117 opts.devices[cnt++] = tmp;
2118 }
2119
2120 opts.devices[cnt] = NULL;
2121
2122 return 0;
2123 }
2124
2125 static int parseOCIsysctl(struct blob_attr *msg)
2126 {
2127 struct blob_attr *cur;
2128 int rem;
2129 char *tmp, *tc;
2130 size_t cnt = 0;
2131
2132 blobmsg_for_each_attr(cur, msg, rem) {
2133 if (!blobmsg_name(cur) || !blobmsg_get_string(cur))
2134 return EINVAL;
2135
2136 ++cnt;
2137 }
2138
2139 if (!cnt)
2140 return 0;
2141
2142 opts.sysctl = calloc(cnt + 1, sizeof(struct sysctl_val *));
2143 if (!opts.sysctl)
2144 return ENOMEM;
2145
2146 cnt = 0;
2147 blobmsg_for_each_attr(cur, msg, rem) {
2148 opts.sysctl[cnt] = malloc(sizeof(struct sysctl_val));
2149 if (!opts.sysctl[cnt])
2150 return ENOMEM;
2151
2152 /* replace '.' with '/' in entry name */
2153 tc = tmp = strdup(blobmsg_name(cur));
2154 while ((tc = strchr(tc, '.')))
2155 *tc = '/';
2156
2157 opts.sysctl[cnt]->value = strdup(blobmsg_get_string(cur));
2158 opts.sysctl[cnt]->entry = tmp;
2159
2160 ++cnt;
2161 }
2162
2163 opts.sysctl[cnt] = NULL;
2164
2165 return 0;
2166 }
2167
2168
2169 enum {
2170 OCI_LINUX_CGROUPSPATH,
2171 OCI_LINUX_RESOURCES,
2172 OCI_LINUX_SECCOMP,
2173 OCI_LINUX_SYSCTL,
2174 OCI_LINUX_NAMESPACES,
2175 OCI_LINUX_DEVICES,
2176 OCI_LINUX_UIDMAPPINGS,
2177 OCI_LINUX_GIDMAPPINGS,
2178 OCI_LINUX_MASKEDPATHS,
2179 OCI_LINUX_READONLYPATHS,
2180 OCI_LINUX_ROOTFSPROPAGATION,
2181 __OCI_LINUX_MAX,
2182 };
2183
2184 static const struct blobmsg_policy oci_linux_policy[] = {
2185 [OCI_LINUX_CGROUPSPATH] = { "cgroupsPath", BLOBMSG_TYPE_STRING },
2186 [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE },
2187 [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE },
2188 [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE },
2189 [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY },
2190 [OCI_LINUX_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
2191 [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY },
2192 [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY },
2193 [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY },
2194 [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY },
2195 [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING },
2196 };
2197
2198 static int parseOCIlinux(struct blob_attr *msg)
2199 {
2200 struct blob_attr *tb[__OCI_LINUX_MAX];
2201 struct blob_attr *cur;
2202 int rem;
2203 int res = 0;
2204 char *cgpath;
2205 char cgfullpath[256] = "/sys/fs/cgroup";
2206
2207 blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
2208
2209 if (tb[OCI_LINUX_NAMESPACES]) {
2210 blobmsg_for_each_attr(cur, tb[OCI_LINUX_NAMESPACES], rem) {
2211 res = parseOCIlinuxns(cur);
2212 if (res)
2213 return res;
2214 }
2215 }
2216
2217 if (tb[OCI_LINUX_UIDMAPPINGS]) {
2218 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 0);
2219 if (res)
2220 return res;
2221 }
2222
2223 if (tb[OCI_LINUX_GIDMAPPINGS]) {
2224 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 1);
2225 if (res)
2226 return res;
2227 }
2228
2229 if (tb[OCI_LINUX_READONLYPATHS]) {
2230 blobmsg_for_each_attr(cur, tb[OCI_LINUX_READONLYPATHS], rem) {
2231 res = add_mount(NULL, blobmsg_get_string(cur), NULL, MS_BIND | MS_REC | MS_RDONLY, 0, NULL, 0);
2232 if (res)
2233 return res;
2234 }
2235 }
2236
2237 if (tb[OCI_LINUX_MASKEDPATHS]) {
2238 blobmsg_for_each_attr(cur, tb[OCI_LINUX_MASKEDPATHS], rem) {
2239 res = add_mount((void *)(-1), blobmsg_get_string(cur), NULL, 0, 0, NULL, 0);
2240 if (res)
2241 return res;
2242 }
2243 }
2244
2245 if (tb[OCI_LINUX_SYSCTL]) {
2246 res = parseOCIsysctl(tb[OCI_LINUX_SYSCTL]);
2247 if (res)
2248 return res;
2249 }
2250
2251 if (tb[OCI_LINUX_SECCOMP]) {
2252 opts.ociseccomp = parseOCIlinuxseccomp(tb[OCI_LINUX_SECCOMP]);
2253 if (!opts.ociseccomp)
2254 return EINVAL;
2255 }
2256
2257 if (tb[OCI_LINUX_DEVICES]) {
2258 res = parseOCIdevices(tb[OCI_LINUX_DEVICES]);
2259 if (res)
2260 return res;
2261 }
2262
2263 if (tb[OCI_LINUX_CGROUPSPATH]) {
2264 cgpath = blobmsg_get_string(tb[OCI_LINUX_CGROUPSPATH]);
2265 if (cgpath[0] == '/') {
2266 if (strlen(cgpath) + 1 >= (sizeof(cgfullpath) - strlen(cgfullpath)))
2267 return E2BIG;
2268
2269 strcat(cgfullpath, cgpath);
2270 } else {
2271 strcat(cgfullpath, "/containers/");
2272 if (strlen(opts.name) + strlen(cgpath) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath)))
2273 return E2BIG;
2274
2275 strcat(cgfullpath, opts.name); /* should be container name rather than jail name */
2276 strcat(cgfullpath, "/");
2277 strcat(cgfullpath, cgpath);
2278 }
2279 } else {
2280 strcat(cgfullpath, "/containers/");
2281 if (2 * strlen(opts.name) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath)))
2282 return E2BIG;
2283
2284 strcat(cgfullpath, opts.name); /* should be container name rather than jail name */
2285 strcat(cgfullpath, "/");
2286 strcat(cgfullpath, opts.name); /* should be container instance name rather than jail name */
2287 }
2288
2289 cgroups_init(cgfullpath);
2290
2291 if (tb[OCI_LINUX_RESOURCES]) {
2292 res = parseOCIlinuxcgroups(tb[OCI_LINUX_RESOURCES]);
2293 if (res)
2294 return res;
2295 }
2296
2297 return 0;
2298 }
2299
2300 enum {
2301 OCI_VERSION,
2302 OCI_HOSTNAME,
2303 OCI_PROCESS,
2304 OCI_ROOT,
2305 OCI_MOUNTS,
2306 OCI_HOOKS,
2307 OCI_LINUX,
2308 OCI_ANNOTATIONS,
2309 __OCI_MAX,
2310 };
2311
2312 static const struct blobmsg_policy oci_policy[] = {
2313 [OCI_VERSION] = { "ociVersion", BLOBMSG_TYPE_STRING },
2314 [OCI_HOSTNAME] = { "hostname", BLOBMSG_TYPE_STRING },
2315 [OCI_PROCESS] = { "process", BLOBMSG_TYPE_TABLE },
2316 [OCI_ROOT] = { "root", BLOBMSG_TYPE_TABLE },
2317 [OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY },
2318 [OCI_HOOKS] = { "hooks", BLOBMSG_TYPE_TABLE },
2319 [OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE },
2320 [OCI_ANNOTATIONS] = { "annotations", BLOBMSG_TYPE_TABLE },
2321 };
2322
2323 static int parseOCI(const char *jsonfile)
2324 {
2325 struct blob_attr *tb[__OCI_MAX];
2326 struct blob_attr *cur;
2327 int rem;
2328 int res;
2329
2330 blob_buf_init(&ocibuf, 0);
2331
2332 if (!blobmsg_add_json_from_file(&ocibuf, jsonfile)) {
2333 res=ENOENT;
2334 goto errout;
2335 }
2336
2337 blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head));
2338
2339 if (!tb[OCI_VERSION]) {
2340 res=ENOMSG;
2341 goto errout;
2342 }
2343
2344 if (strncmp("1.0", blobmsg_get_string(tb[OCI_VERSION]), 3)) {
2345 ERROR("unsupported ociVersion %s\n", blobmsg_get_string(tb[OCI_VERSION]));
2346 res=ENOTSUP;
2347 goto errout;
2348 }
2349
2350 if (tb[OCI_HOSTNAME])
2351 opts.hostname = strdup(blobmsg_get_string(tb[OCI_HOSTNAME]));
2352
2353 if (!tb[OCI_PROCESS]) {
2354 res=ENODATA;
2355 goto errout;
2356 }
2357
2358 if ((res = parseOCIprocess(tb[OCI_PROCESS])))
2359 goto errout;
2360
2361 if (!tb[OCI_ROOT]) {
2362 res=ENODATA;
2363 goto errout;
2364 }
2365 if ((res = parseOCIroot(jsonfile, tb[OCI_ROOT])))
2366 goto errout;
2367
2368 if (!tb[OCI_MOUNTS]) {
2369 res=ENODATA;
2370 goto errout;
2371 }
2372
2373 blobmsg_for_each_attr(cur, tb[OCI_MOUNTS], rem)
2374 if ((res = parseOCImount(cur)))
2375 goto errout;
2376
2377 if (tb[OCI_LINUX] && (res = parseOCIlinux(tb[OCI_LINUX])))
2378 goto errout;
2379
2380 if (tb[OCI_HOOKS] && (res = parseOCIhooks(tb[OCI_HOOKS])))
2381 goto errout;
2382
2383 if (tb[OCI_ANNOTATIONS])
2384 opts.annotations = blob_memdup(tb[OCI_ANNOTATIONS]);
2385
2386 errout:
2387 blob_buf_free(&ocibuf);
2388
2389 return res;
2390 }
2391
2392 static int set_oom_score_adj(void)
2393 {
2394 int f;
2395 char fname[32];
2396
2397 if (!opts.set_oom_score_adj)
2398 return 0;
2399
2400 snprintf(fname, sizeof(fname), "/proc/%u/oom_score_adj", jail_process.pid);
2401 f = open(fname, O_WRONLY | O_TRUNC);
2402 if (f < 0)
2403 return errno;
2404
2405 dprintf(f, "%d", opts.oom_score_adj);
2406 close(f);
2407
2408 return 0;
2409 }
2410
2411
/* container lifecycle states as reported by the ubus "state" method */
enum {
	OCI_STATE_CREATING = 0,	/* reported as "creating" */
	OCI_STATE_CREATED = 1,	/* reported as "created" */
	OCI_STATE_RUNNING = 2,	/* reported as "running" */
	OCI_STATE_STOPPED = 3,	/* reported as "stopped" */
};
2418
2419 static int jail_oci_state = OCI_STATE_CREATED;
2420 static void pipe_send_start_container(struct uloop_timeout *t);
2421 static struct uloop_timeout start_container_timeout = {
2422 .cb = pipe_send_start_container,
2423 };
2424
2425 static int handle_start(struct ubus_context *ctx, struct ubus_object *obj,
2426 struct ubus_request_data *req, const char *method,
2427 struct blob_attr *msg)
2428 {
2429 if (jail_oci_state != OCI_STATE_CREATED)
2430 return UBUS_STATUS_INVALID_ARGUMENT;
2431
2432 uloop_timeout_add(&start_container_timeout);
2433
2434 return UBUS_STATUS_OK;
2435 }
2436
2437 static struct blob_buf bb;
2438 static int handle_state(struct ubus_context *ctx, struct ubus_object *obj,
2439 struct ubus_request_data *req, const char *method,
2440 struct blob_attr *msg)
2441 {
2442 char *statusstr;
2443
2444 switch (jail_oci_state) {
2445 case OCI_STATE_CREATING:
2446 statusstr = "creating";
2447 break;
2448 case OCI_STATE_CREATED:
2449 statusstr = "created";
2450 break;
2451 case OCI_STATE_RUNNING:
2452 statusstr = "running";
2453 break;
2454 case OCI_STATE_STOPPED:
2455 statusstr = "stopped";
2456 break;
2457 default:
2458 statusstr = "unknown";
2459 }
2460
2461 blob_buf_init(&bb, 0);
2462 blobmsg_add_string(&bb, "ociVersion", OCI_VERSION_STRING);
2463 blobmsg_add_string(&bb, "id", opts.name);
2464 blobmsg_add_string(&bb, "status", statusstr);
2465 if (jail_oci_state == OCI_STATE_CREATED ||
2466 jail_oci_state == OCI_STATE_RUNNING)
2467 blobmsg_add_u32(&bb, "pid", jail_process.pid);
2468
2469 blobmsg_add_string(&bb, "bundle", opts.ocibundle);
2470
2471 if (opts.annotations)
2472 blobmsg_add_blob(&bb, opts.annotations);
2473
2474 ubus_send_reply(ctx, req, bb.head);
2475
2476 return UBUS_STATUS_OK;
2477 }
2478
/* argument indices of the ubus "kill" method */
enum {
	CONTAINER_KILL_ATTR_SIGNAL = 0,		/* "signal" */
	__CONTAINER_KILL_ATTR_MAX = 1,		/* number of arguments */
};
2483
2484 static const struct blobmsg_policy container_kill_attrs[__CONTAINER_KILL_ATTR_MAX] = {
2485 [CONTAINER_KILL_ATTR_SIGNAL] = { "signal", BLOBMSG_TYPE_INT32 },
2486 };
2487
2488 static int
2489 container_handle_kill(struct ubus_context *ctx, struct ubus_object *obj,
2490 struct ubus_request_data *req, const char *method,
2491 struct blob_attr *msg)
2492 {
2493 struct blob_attr *tb[__CONTAINER_KILL_ATTR_MAX], *cur;
2494 int sig = SIGTERM;
2495
2496 blobmsg_parse(container_kill_attrs, __CONTAINER_KILL_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
2497
2498 cur = tb[CONTAINER_KILL_ATTR_SIGNAL];
2499 if (cur)
2500 sig = blobmsg_get_u32(cur);
2501
2502 if (jail_oci_state == OCI_STATE_CREATING)
2503 return UBUS_STATUS_NOT_FOUND;
2504
2505 if (kill(jail_process.pid, sig) == 0)
2506 return 0;
2507
2508 switch (errno) {
2509 case EINVAL: return UBUS_STATUS_INVALID_ARGUMENT;
2510 case EPERM: return UBUS_STATUS_PERMISSION_DENIED;
2511 case ESRCH: return UBUS_STATUS_NOT_FOUND;
2512 }
2513
2514 return UBUS_STATUS_UNKNOWN_ERROR;
2515 }
2516
2517 static int
2518 jail_writepid(pid_t pid)
2519 {
2520 FILE *_pidfile;
2521
2522 if (!opts.pidfile)
2523 return 0;
2524
2525 _pidfile = fopen(opts.pidfile, "w");
2526 if (_pidfile == NULL)
2527 return errno;
2528
2529 if (fprintf(_pidfile, "%d\n", pid) < 0) {
2530 fclose(_pidfile);
2531 return errno;
2532 }
2533
2534 if (fclose(_pidfile))
2535 return errno;
2536
2537 return 0;
2538 }
2539
/*
 * Check that path can be opened as a directory.
 * Returns 0 if it can, -1 (after logging the reason) if it cannot.
 */
static int checkpath(const char *path)
{
	int fd;

	fd = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
	if (fd >= 0) {
		/* the descriptor was only needed for the check itself */
		close(fd);
		return 0;
	}

	ERROR("path %s open failed %m\n", path);
	return -1;
}
2551
/* ubus methods of a container object: "start", "state" and "kill" */
static struct ubus_method container_methods[] = {
	UBUS_METHOD_NOARG("start", handle_start),
	UBUS_METHOD_NOARG("state", handle_state),
	UBUS_METHOD("kill", container_handle_kill, container_kill_attrs),
};

static struct ubus_object_type container_object_type =
	UBUS_OBJECT_TYPE("container", container_methods);

/*
 * The ubus object registered by main() for OCI containers. Its .name is
 * filled in there ("container.<jail name>") right before registration.
 */
static struct ubus_object container_object = {
	.type = &container_object_type,
	.methods = container_methods,
	.n_methods = ARRAY_SIZE(container_methods),
};
2566
static void post_main(struct uloop_timeout *t);
/* armed at the end of main() so that post_main() runs from the uloop */
static struct uloop_timeout post_main_timeout = {
	.cb = post_main,
};
/*
 * Network namespace fd that poststop() enters and closes.
 * NOTE(review): no assignment to netns_fd is visible in this part of the
 * file; confirm where it is set, otherwise poststop() operates on fd 0.
 */
static int netns_fd;
/*
 * Result of ns_open_pid("pid", getpid()) taken in post_main() before
 * setns_open(CLONE_NEWPID); the parent hands it to setns() again after the
 * jail process was created. -1 when opts.setns.pid is unused.
 */
static int pidns_fd;
#ifdef CLONE_NEWTIME
/* same as pidns_fd, for the time namespace */
static int timens_fd;
#endif
static void post_create_runtime(void);
2577
/* one "-e <name>" command line option collected by main() */
struct env_e {
	struct list_head list;	/* link in main()'s list of requested variables */
	char *envarg;		/* variable name; points at optarg, not owned */
};
2582
2583 int main(int argc, char **argv)
2584 {
2585 uid_t uid = getuid();
2586 const char log[] = "/dev/log";
2587 const char ubus[] = "/var/run/ubus/ubus.sock";
2588 int ret = EXIT_FAILURE;
2589 int ch;
2590 char *tmp;
2591 struct list_head envl = LIST_HEAD_INIT(envl);
2592 struct env_e *enve, *tmpenve;
2593 unsigned short int envn = 0, envc = 0;
2594
2595 if (uid) {
2596 ERROR("not root, aborting: %m\n");
2597 return EXIT_FAILURE;
2598 }
2599
2600 /* those are filehandlers, so -1 indicates unused */
2601 opts.setns.pid = -1;
2602 opts.setns.net = -1;
2603 opts.setns.ns = -1;
2604 opts.setns.ipc = -1;
2605 opts.setns.uts = -1;
2606 opts.setns.user = -1;
2607 opts.setns.cgroup = -1;
2608 #ifdef CLONE_NEWTIME
2609 opts.setns.time = -1;
2610 #endif
2611
2612 /* default 5 seconds timeout after SIGTERM before SIGKILL is sent */
2613 opts.term_timeout = 5;
2614
2615 umask(022);
2616 mount_list_init();
2617 init_library_search();
2618 cgroups_prepare();
2619 exit_from_child = false;
2620
2621 while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) {
2622 switch (ch) {
2623 case 'd':
2624 debug = atoi(optarg);
2625 break;
2626 case 'e':
2627 enve = calloc(1, sizeof(*enve));
2628 enve->envarg = optarg;
2629 list_add_tail(&enve->list, &envl);
2630 break;
2631 case 'p':
2632 opts.namespace |= CLONE_NEWNS;
2633 opts.procfs = 1;
2634 break;
2635 case 'o':
2636 opts.namespace |= CLONE_NEWNS;
2637 opts.ronly = 1;
2638 break;
2639 case 'f':
2640 opts.namespace |= CLONE_NEWUSER;
2641 break;
2642 case 'F':
2643 opts.namespace |= CLONE_NEWCGROUP;
2644 break;
2645 case 'R':
2646 opts.extroot = realpath(optarg, NULL);
2647 break;
2648 case 's':
2649 opts.namespace |= CLONE_NEWNS;
2650 opts.sysfs = 1;
2651 break;
2652 case 'S':
2653 opts.seccomp = optarg;
2654 add_mount_bind(optarg, 1, -1);
2655 break;
2656 case 'C':
2657 opts.capabilities = optarg;
2658 break;
2659 case 'c':
2660 opts.no_new_privs = 1;
2661 break;
2662 case 'n':
2663 opts.name = optarg;
2664 break;
2665 case 'N':
2666 opts.namespace |= CLONE_NEWNET;
2667 break;
2668 case 'h':
2669 opts.namespace |= CLONE_NEWUTS;
2670 opts.hostname = strdup(optarg);
2671 break;
2672 case 'j':
2673 jail_join_ns(optarg);
2674 break;
2675 case 'r':
2676 opts.namespace |= CLONE_NEWNS;
2677 tmp = strchr(optarg, ':');
2678 if (tmp) {
2679 *(tmp++) = '\0';
2680 add_2paths_and_deps(optarg, tmp, 1, 0, 0);
2681 } else {
2682 add_path_and_deps(optarg, 1, 0, 0);
2683 }
2684 break;
2685 case 'w':
2686 opts.namespace |= CLONE_NEWNS;
2687 tmp = strchr(optarg, ':');
2688 if (tmp) {
2689 *(tmp++) = '\0';
2690 add_2paths_and_deps(optarg, tmp, 0, 0, 0);
2691 } else {
2692 add_path_and_deps(optarg, 0, 0, 0);
2693 }
2694 break;
2695 case 'u':
2696 opts.namespace |= CLONE_NEWNS;
2697 add_mount_bind(ubus, 0, -1);
2698 break;
2699 case 'l':
2700 opts.namespace |= CLONE_NEWNS;
2701 add_mount_bind(log, 0, -1);
2702 break;
2703 case 'U':
2704 opts.user = optarg;
2705 break;
2706 case 'G':
2707 opts.group = optarg;
2708 break;
2709 case 'O':
2710 opts.overlaydir = realpath(optarg, NULL);
2711 break;
2712 case 't':
2713 opts.term_timeout = atoi(optarg);
2714 break;
2715 case 'T':
2716 opts.tmpoverlaysize = optarg;
2717 break;
2718 case 'E':
2719 opts.require_jail = 1;
2720 break;
2721 case 'y':
2722 opts.console = 1;
2723 break;
2724 case 'J':
2725 opts.ocibundle = optarg;
2726 break;
2727 case 'i':
2728 opts.immediately = true;
2729 break;
2730 case 'P':
2731 opts.pidfile = optarg;
2732 break;
2733 }
2734 }
2735
2736 if (opts.namespace && !opts.ocibundle)
2737 opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID;
2738
2739 /*
2740 * env import from cmdline is not available for OCI containers
2741 */
2742 if (opts.ocibundle && !list_empty(&envl)) {
2743 ret=-ENOTSUP;
2744 goto errout;
2745 }
2746
2747 /*
2748 * prepare list of env variables to import for slim containers
2749 */
2750 if (!list_empty(&envl)) {
2751 list_for_each_entry(enve, &envl, list)
2752 ++envn;
2753
2754 opts.envp = calloc(1 + envn, sizeof(char*));
2755 list_for_each_entry_safe(enve, tmpenve, &envl, list) {
2756 tmp = getenv(enve->envarg);
2757 if (tmp) {
2758 ret = asprintf(&opts.envp[envc++], "%s=%s", enve->envarg, tmp);
2759 if (ret < 0) {
2760 ERROR("filed to handle envargs %s\n", tmp);
2761 free(enve);
2762 goto errout;
2763 }
2764 }
2765
2766 list_del(&enve->list);
2767 free(enve);
2768 }
2769
2770 opts.envp[envc] = NULL;
2771 }
2772
2773 /*
2774 * uid in parent user namespace representing root user in new
2775 * user namespace, defaults to nobody unless specified in uidMappings
2776 */
2777 opts.root_map_uid = 65534;
2778
2779 if (opts.capabilities && parseOCIcapabilities_from_file(&opts.capset, opts.capabilities)) {
2780 ERROR("failed to read capabilities from file %s\n", opts.capabilities);
2781 ret=-1;
2782 goto errout;
2783 }
2784
2785 if (opts.ocibundle) {
2786 char *jsonfile;
2787 int ocires;
2788
2789 if (!opts.name) {
2790 ERROR("OCI bundle needs a named jail\n");
2791 ret=-1;
2792 goto errout;
2793 }
2794 if (asprintf(&jsonfile, "%s/config.json", opts.ocibundle) < 0) {
2795 ret=-ENOMEM;
2796 goto errout;
2797 }
2798 ocires = parseOCI(jsonfile);
2799 free(jsonfile);
2800 if (ocires) {
2801 ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires);
2802 ret=ocires;
2803 goto errout;
2804 }
2805 }
2806
2807 if (opts.namespace & CLONE_NEWNET) {
2808 if (!opts.name) {
2809 ERROR("netns needs a named jail\n");
2810 ret=-1;
2811 goto errout;
2812 }
2813 }
2814
2815
2816 if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) {
2817 ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize);
2818 ret=-1;
2819 goto errout;
2820 }
2821
2822 if (opts.extroot && checkpath(opts.extroot)) {
2823 ERROR("invalid rootfs path '%s'", opts.extroot);
2824 ret=-1;
2825 goto errout;
2826 }
2827
2828 if (opts.overlaydir && checkpath(opts.overlaydir)) {
2829 ERROR("invalid rootfs overlay path '%s'", opts.overlaydir);
2830 ret=-1;
2831 goto errout;
2832 }
2833
2834 /* no <binary> param found */
2835 if (!opts.ocibundle && (argc - optind < 1)) {
2836 usage();
2837 ret=EXIT_FAILURE;
2838 goto errout;
2839 }
2840 if (!(opts.ocibundle||opts.namespace||opts.capabilities||opts.seccomp||
2841 (opts.setns.net != -1) ||
2842 (opts.setns.ns != -1) ||
2843 (opts.setns.ipc != -1) ||
2844 (opts.setns.uts != -1) ||
2845 (opts.setns.user != -1) ||
2846 (opts.setns.cgroup != -1))) {
2847 ERROR("Not using namespaces, capabilities or seccomp !!!\n\n");
2848 usage();
2849 ret=EXIT_FAILURE;
2850 goto errout;
2851 }
2852 DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n",
2853 opts.namespace,
2854 opts.capset.apply,
2855 opts.seccomp != 0 || opts.ociseccomp != 0);
2856
2857 uloop_init();
2858 signals_init();
2859
2860 parent_ctx = ubus_connect(NULL);
2861 ubus_add_uloop(parent_ctx);
2862
2863 if (opts.ocibundle) {
2864 char *objname;
2865 if (asprintf(&objname, "container.%s", opts.name) < 0) {
2866 ret=-ENOMEM;
2867 goto errout;
2868 }
2869
2870 container_object.name = objname;
2871 ret = ubus_add_object(parent_ctx, &container_object);
2872 if (ret) {
2873 ERROR("Failed to add object: %s\n", ubus_strerror(ret));
2874 ret=-1;
2875 goto errout;
2876 }
2877 }
2878
2879 /* deliberately not using 'else' on unrelated conditional branches */
2880 if (!opts.ocibundle) {
2881 /* allocate NULL-terminated array for argv */
2882 opts.jail_argv = calloc(1 + argc - optind, sizeof(void *));
2883 if (!opts.jail_argv) {
2884 ret=EXIT_FAILURE;
2885 goto errout;
2886 }
2887 for (size_t s = optind; s < argc; s++)
2888 opts.jail_argv[s - optind] = strdup(argv[s]);
2889
2890 if (opts.namespace & CLONE_NEWUSER)
2891 get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
2892 }
2893
2894 if (!opts.extroot) {
2895 if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
2896 ERROR("failed to load dependencies\n");
2897 ret=-1;
2898 goto errout;
2899 }
2900 }
2901
2902 if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) {
2903 ERROR("failed to load libpreload-seccomp.so\n");
2904 opts.seccomp = 0;
2905 if (opts.require_jail) {
2906 ret=-1;
2907 goto errout;
2908 }
2909 }
2910
2911 uloop_timeout_add(&post_main_timeout);
2912 uloop_run();
2913
2914 errout:
2915 if (opts.ocibundle)
2916 cgroups_free();
2917
2918 free_opts(true);
2919
2920 return ret;
2921 }
2922
2923 static void post_main(struct uloop_timeout *t)
2924 {
2925 if (apply_rlimits()) {
2926 ERROR("error applying resource limits\n");
2927 free_and_exit(EXIT_FAILURE);
2928 }
2929
2930 if (opts.name)
2931 prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
2932
2933 if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0)
2934 free_and_exit(-1);
2935
2936 if (has_namespaces()) {
2937 if (opts.namespace & CLONE_NEWNS) {
2938 if (!opts.extroot && (opts.user || opts.group)) {
2939 add_mount_bind("/etc/passwd", 1, -1);
2940 add_mount_bind("/etc/group", 1, -1);
2941 }
2942
2943 #if defined(__GLIBC__)
2944 if (!opts.extroot)
2945 add_mount_bind("/etc/nsswitch.conf", 1, -1);
2946 #endif
2947 if (opts.setns.ns == -1) {
2948 if (!(opts.namespace & CLONE_NEWNET)) {
2949 add_mount_bind("/etc/resolv.conf", 1, 0);
2950 } else {
2951 /* new mount namespace to provide /dev/resolv.conf.d */
2952 char hostdir[PATH_MAX];
2953
2954 snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name);
2955 mkdir_p(hostdir, 0755);
2956 add_mount(hostdir, "/dev/resolv.conf.d", NULL,
2957 MS_BIND | MS_NOEXEC | MS_NOATIME | MS_NOSUID | MS_NODEV | MS_RDONLY, 0, NULL, 0);
2958 }
2959 }
2960 /* default mounts */
2961 add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "size=1M", -1);
2962 add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0);
2963
2964 if (opts.procfs || opts.ocibundle) {
2965 add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0, NULL, -1);
2966
2967 /*
2968 * hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only
2969 * which cannot be expressed with OCI spec, but happends to be very useful.
2970 * Only apply it if '/proc/sys' is not already listed as mount, maskedPath or
2971 * readonlyPath.
2972 * If not running in a new network namespace, only make /proc/sys read-only.
2973 * If running in a new network namespace, temporarily stash (ie. mount-bind)
2974 * /proc/sys/net into (totally unrelated, but surely existing) /proc/self/net.
2975 * Then we mount-bind /proc/sys read-only and then mount-move /proc/self/net into
2976 * /proc/sys/net.
2977 * This works because mounts are executed in incrementing strcmp() order and
2978 * /proc/self/net appears there before /proc/sys/net and hence the operation
2979 * succeeds as the bind-mount of /proc/self/net is performed first and then
2980 * move-mount of /proc/sys/net follows because 'e' preceeds 'y' in the ASCII
2981 * table (and in the alphabet).
2982 */
2983 if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, 0, NULL, -1))
2984 if (opts.namespace & CLONE_NEWNET)
2985 if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, 0, NULL, -1))
2986 add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, 0, NULL, -1);
2987
2988 }
2989 if (opts.sysfs || opts.ocibundle)
2990 add_mount("sysfs", "/sys", "sysfs", MS_RELATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0, NULL, -1);
2991
2992 if (opts.ocibundle)
2993 add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, 0, "mode=1777", -1);
2994
2995 }
2996
2997 if (opts.setns.pid != -1) {
2998 pidns_fd = ns_open_pid("pid", getpid());
2999 setns_open(CLONE_NEWPID);
3000 } else {
3001 pidns_fd = -1;
3002 }
3003
3004 #ifdef CLONE_NEWTIME
3005 if (opts.setns.time != -1) {
3006 timens_fd = ns_open_pid("time", getpid());
3007 setns_open(CLONE_NEWTIME);
3008 } else {
3009 timens_fd = -1;
3010 }
3011 #endif
3012
3013 if (opts.namespace & CLONE_NEWUSER) {
3014 if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) {
3015 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
3016 free_and_exit(EXIT_FAILURE);
3017 }
3018 if (seteuid(opts.root_map_uid)) {
3019 ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid);
3020 free_and_exit(EXIT_FAILURE);
3021 }
3022 }
3023
3024 jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | (opts.namespace & (~CLONE_NEWCGROUP)), NULL);
3025 } else {
3026 jail_process.pid = fork();
3027 }
3028
3029 if (jail_process.pid > 0) {
3030 /* parent process */
3031 char sig_buf[1];
3032
3033 uloop_process_add(&jail_process);
3034 jail_running = 1;
3035 if (seteuid(0)) {
3036 ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid);
3037 free_and_exit(EXIT_FAILURE);
3038 }
3039
3040 prctl(PR_SET_SECUREBITS, 0);
3041
3042 if (pidns_fd != -1) {
3043 setns(pidns_fd, CLONE_NEWPID);
3044 close(pidns_fd);
3045 }
3046 #ifdef CLONE_NEWTIME
3047 if (timens_fd != -1) {
3048 setns(timens_fd, CLONE_NEWTIME);
3049 close(timens_fd);
3050 }
3051 #endif
3052 if (opts.setns.net != -1)
3053 close(opts.setns.net);
3054 if (opts.setns.ns != -1)
3055 close(opts.setns.ns);
3056 if (opts.setns.ipc != -1)
3057 close(opts.setns.ipc);
3058 if (opts.setns.uts != -1)
3059 close(opts.setns.uts);
3060 if (opts.setns.user != -1)
3061 close(opts.setns.user);
3062 if (opts.setns.cgroup != -1)
3063 close(opts.setns.cgroup);
3064 close(pipes[1]);
3065 close(pipes[2]);
3066 if (read(pipes[0], sig_buf, 1) < 1) {
3067 ERROR("can't read from child\n");
3068 free_and_exit(-1);
3069 }
3070 close(pipes[0]);
3071 set_oom_score_adj();
3072
3073 if (opts.ocibundle)
3074 cgroups_apply(jail_process.pid);
3075
3076 if (opts.namespace & CLONE_NEWUSER) {
3077 if (write_setgroups(jail_process.pid, true)) {
3078 ERROR("can't write setgroups\n");
3079 free_and_exit(-1);
3080 }
3081 if (!opts.uidmap) {
3082 bool has_gr = (opts.gr_gid != -1);
3083 if (opts.pw_uid != -1) {
3084 write_single_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
3085 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
3086 } else {
3087 write_single_uid_gid_map(jail_process.pid, 0, 65534);
3088 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
3089 }
3090 } else {
3091 write_uid_gid_map(jail_process.pid, 0, opts.uidmap);
3092 if (opts.gidmap)
3093 write_uid_gid_map(jail_process.pid, 1, opts.gidmap);
3094 }
3095 }
3096
3097 if (opts.namespace & CLONE_NEWNET)
3098 jail_network_start(parent_ctx, opts.name, jail_process.pid);
3099
3100 if (jail_writepid(jail_process.pid)) {
3101 ERROR("failed to write pidfile: %m\n");
3102 free_and_exit(-1);
3103 }
3104 } else if (jail_process.pid == 0) {
3105 /* fork child process */
3106 free_and_exit(exec_jail(NULL));
3107 } else {
3108 ERROR("failed to clone/fork: %m\n");
3109 free_and_exit(EXIT_FAILURE);
3110 }
3111 run_hooks(opts.hooks.createRuntime, post_create_runtime);
3112 }
3113
3114 static void post_poststart(void);
3115 static void post_create_runtime(void)
3116 {
3117 char sig_buf[1];
3118
3119 sig_buf[0] = 'O';
3120 if (write(pipes[3], sig_buf, 1) < 0) {
3121 ERROR("can't write to child\n");
3122 free_and_exit(-1);
3123 }
3124
3125 jail_oci_state = OCI_STATE_CREATED;
3126 if (opts.ocibundle && !opts.immediately)
3127 uloop_run(); /* wait for 'start' command via ubus */
3128 else
3129 pipe_send_start_container(NULL);
3130 }
3131
3132 static void pipe_send_start_container(struct uloop_timeout *t)
3133 {
3134 char sig_buf[1];
3135
3136 jail_oci_state = OCI_STATE_RUNNING;
3137 sig_buf[0] = '!';
3138 if (write(pipes[3], sig_buf, 1) < 0) {
3139 ERROR("can't write to child\n");
3140 free_and_exit(-1);
3141 }
3142 close(pipes[3]);
3143
3144 run_hooks(opts.hooks.poststart, post_poststart);
3145 }
3146
3147 static void post_poststart(void)
3148 {
3149 uloop_run(); /* idle here while jail is running */
3150 if (jail_running) {
3151 DEBUG("uloop interrupted, killing jail process\n");
3152 kill(jail_process.pid, SIGTERM);
3153 uloop_timeout_set(&jail_process_timeout, 1000);
3154 uloop_run();
3155 }
3156 uloop_done();
3157 poststop();
3158 }
3159
3160 static void post_poststop(void);
3161 static void poststop(void) {
3162 if (opts.namespace & CLONE_NEWNET) {
3163 setns(netns_fd, CLONE_NEWNET);
3164 jail_network_stop();
3165 close(netns_fd);
3166 }
3167 run_hooks(opts.hooks.poststop, post_poststop);
3168 }
3169
3170 static void post_poststop(void)
3171 {
3172 free_opts(true);
3173 if (parent_ctx)
3174 ubus_free(parent_ctx);
3175
3176 exit(jail_return_code);
3177 }