jail: cgroups-bpf: fix compile with musl 1.2
[project/procd.git] / jail / cgroups-bpf.c
1 /*
2 * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License version 2.1
6 * as published by the Free Software Foundation
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * somehow emulate devices.allow/devices.deny using eBPF
14 *
15 * OCI run-time spec defines the syntax for allowing/denying access
16 * to devices according to the definition of cgroup-v1 in the Kernel
17 * as described in Documentation/admin-guide/cgroup-v1.
18 */
19
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

#include <linux/bpf.h>
#ifdef __GLIBC__
#include <sys/cdefs.h>
#else
#include <sys/reg.h>
#endif
#include <sys/syscall.h>

#include <libubox/blobmsg.h>
#include <libubox/blobmsg_json.h>
#include <libubox/list.h>

#include "cgroups.h"
#include "cgroups-bpf.h"
#include "log.h"
35 #include "log.h"
36
/* generated eBPF program; stays NULL until parseOCIlinuxcgroups_devices() built one */
static struct bpf_insn *program;
/* number of instructions stored in 'program' */
static int bpf_total_insn;
/* license string handed to the kernel together with the program */
static const char *license = "GPL";
40
/*
 * Invoke the raw bpf system call.
 *
 * cmd, attr and size are passed to syscall(__NR_bpf, ...) unchanged;
 * the result of syscall() is narrowed to int and returned.
 */
static int
syscall_bpf (int cmd, union bpf_attr *attr, unsigned int size)
{
	long rc = syscall (__NR_bpf, cmd, attr, size);

	return (int) rc;
}
46
/*
 * eBPF instruction builders, from crun/src/libcrun/ebpf.c
 *
 * Each macro expands to a 'struct bpf_insn' compound literal with every
 * field set explicitly.
 */

/* 32-bit ALU operation OP on register DST with immediate IMM */
#define BPF_ALU32_IMM(OP, DST, IMM)				\
	((struct bpf_insn){					\
		.code = BPF_ALU | BPF_OP (OP) | BPF_K,		\
		.dst_reg = (DST),				\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = (IMM) })

/* load SIZE bytes from memory at SRC + OFF into DST */
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)			\
	((struct bpf_insn){					\
		.code = BPF_LDX | BPF_SIZE (SIZE) | BPF_MEM,	\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = (OFF),					\
		.imm = 0 })

/* 64-bit register to register move */
#define BPF_MOV64_REG(DST, SRC)					\
	((struct bpf_insn){					\
		.code = BPF_ALU64 | BPF_MOV | BPF_X,		\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = 0,					\
		.imm = 0 })

/* unconditional jump by OFF instructions */
#define BPF_JMP_A(OFF)						\
	((struct bpf_insn){					\
		.code = BPF_JMP | BPF_JA,			\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off = (OFF),					\
		.imm = 0 })

/* conditional jump by OFF, comparing register DST with immediate IMM */
#define BPF_JMP_IMM(OP, DST, IMM, OFF)				\
	((struct bpf_insn){					\
		.code = BPF_JMP | BPF_OP (OP) | BPF_K,		\
		.dst_reg = (DST),				\
		.src_reg = 0,					\
		.off = (OFF),					\
		.imm = (IMM) })

/* conditional jump by OFF, comparing register DST with register SRC */
#define BPF_JMP_REG(OP, DST, SRC, OFF)				\
	((struct bpf_insn){					\
		.code = BPF_JMP | BPF_OP (OP) | BPF_X,		\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = (OFF),					\
		.imm = 0 })

/* 64-bit move of immediate IMM into DST */
#define BPF_MOV64_IMM(DST, IMM)					\
	((struct bpf_insn){					\
		.code = BPF_ALU64 | BPF_MOV | BPF_K,		\
		.dst_reg = (DST),				\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = (IMM) })

/* 32-bit register to register move */
#define BPF_MOV32_REG(DST, SRC)					\
	((struct bpf_insn){					\
		.code = BPF_ALU | BPF_MOV | BPF_X,		\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = 0,					\
		.imm = 0 })

/* program exit */
#define BPF_EXIT_INSN()						\
	((struct bpf_insn){					\
		.code = BPF_JMP | BPF_EXIT,			\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = 0 })
75
76 /* taken from systemd. */
77 static const struct bpf_insn pre_insn[] = {
78 /* type -> R2. */
79 BPF_LDX_MEM (BPF_W, BPF_REG_2, BPF_REG_1, 0),
80 BPF_ALU32_IMM (BPF_AND, BPF_REG_2, 0xFFFF),
81 /* access -> R3. */
82 BPF_LDX_MEM (BPF_W, BPF_REG_3, BPF_REG_1, 0),
83 BPF_ALU32_IMM (BPF_RSH, BPF_REG_3, 16),
84 /* major -> R4. */
85 BPF_LDX_MEM (BPF_W, BPF_REG_4, BPF_REG_1, 4),
86 /* minor -> R5. */
87 BPF_LDX_MEM (BPF_W, BPF_REG_5, BPF_REG_1, 8),
88 };
89
/* indices into the parsed attribute table of one OCI devices rule */
enum {
	OCI_LINUX_CGROUPS_DEVICES_ALLOW = 0,
	OCI_LINUX_CGROUPS_DEVICES_TYPE = 1,
	OCI_LINUX_CGROUPS_DEVICES_MAJOR = 2,
	OCI_LINUX_CGROUPS_DEVICES_MINOR = 3,
	OCI_LINUX_CGROUPS_DEVICES_ACCESS = 4,
	/* number of attributes, used to size the table */
	__OCI_LINUX_CGROUPS_DEVICES_MAX = 5,
};
98
99 static const struct blobmsg_policy oci_linux_cgroups_devices_policy[] = {
100 [OCI_LINUX_CGROUPS_DEVICES_ALLOW] = { "allow", BLOBMSG_TYPE_BOOL },
101 [OCI_LINUX_CGROUPS_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING },
102 [OCI_LINUX_CGROUPS_DEVICES_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
103 [OCI_LINUX_CGROUPS_DEVICES_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
104 [OCI_LINUX_CGROUPS_DEVICES_ACCESS] = { "access", BLOBMSG_TYPE_STRING },
105 };
106
/*
 * The cgroup-v1 devices controller has a default behaviour plus a list of
 * exceptions. The data types below are modelled after the legacy kernel code.
 */

/* every device type: character and block */
#define DEVCG_DEV_ALL \
	(BPF_DEVCG_DEV_CHAR | BPF_DEVCG_DEV_BLOCK)

/* every access kind: mknod, read and write */
#define DEVCG_ACC_ALL \
	(BPF_DEVCG_ACC_MKNOD | BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE)
113
/* verdict used when no exception matches a request */
enum devcg_behavior {
	DEVCG_DEFAULT_NONE = 0,
	DEVCG_DEFAULT_ALLOW = 1,
	DEVCG_DEFAULT_DENY = 2,
};
119
120 struct dev_exception_item {
121 uint32_t major, minor;
122 short type;
123 short access;
124 struct list_head list;
125 bool allow;
126 };
127
128 /*
129 * add a bunch of default rules
130 */
131 static int add_default_exceptions(struct list_head *exceptions)
132 {
133 int i, ret = 0;
134 struct dev_exception_item *cur;
135 /* from crun/src/libcrun/cgroup.c */
136 const struct dev_exception_item defrules[] = {
137 /* always allow mknod */
138 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = ~0, .minor = ~0, .access = BPF_DEVCG_ACC_MKNOD },
139 { .allow = true, .type = BPF_DEVCG_DEV_BLOCK, .major = ~0, .minor = ~0, .access = BPF_DEVCG_ACC_MKNOD },
140 /* /dev/null */
141 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 3, .access = DEVCG_ACC_ALL },
142 /* /dev/random */
143 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 8, .access = DEVCG_ACC_ALL },
144 /* /dev/full */
145 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 7, .access = DEVCG_ACC_ALL },
146 /* /dev/tty */
147 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 0, .access = DEVCG_ACC_ALL },
148 /* /dev/zero */
149 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 5, .access = DEVCG_ACC_ALL },
150 /* /dev/urandom */
151 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 9, .access = DEVCG_ACC_ALL },
152 /* /dev/console */
153 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 1, .access = DEVCG_ACC_ALL },
154 /* /dev/pts/[0-255] */
155 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 136, .minor = ~0, .access = DEVCG_ACC_ALL },
156 /* /dev/ptmx */
157 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 2, .access = DEVCG_ACC_ALL },
158 /* /dev/net/tun */
159 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 10, .minor = 200, .access = DEVCG_ACC_ALL },
160 };
161
162 for (i = 0; i < (sizeof(defrules) / sizeof(struct dev_exception_item)); ++i) {
163 cur = malloc(sizeof(struct dev_exception_item));
164 if (!cur) {
165 ret = ENOMEM;
166 break;
167 }
168 /* add defaults to list in reverse order (last item will be first in list) */
169 memcpy(cur, &defrules[i], sizeof(struct dev_exception_item));
170 list_add(&cur->list, exceptions);
171 }
172
173 return ret;
174 }
175
176 /*
177 * free all exceptions in the list
178 */
179 static void flush_exceptions(struct list_head *freelist)
180 {
181 struct dev_exception_item *dl, *dln;
182
183 if (!list_empty(freelist))
184 list_for_each_entry_safe(dl, dln, freelist, list) {
185 list_del(&dl->list);
186 free(dl);
187 }
188 }
189
190 /*
191 * parse OCI cgroups devices and translate into cgroups-v2 eBPF program
192 */
193 int parseOCIlinuxcgroups_devices(struct blob_attr *msg)
194 {
195 struct blob_attr *tb[__OCI_LINUX_CGROUPS_DEVICES_MAX];
196 struct blob_attr *cur;
197 int rem, ret = 0;
198 int bpf_type, bpf_access;
199 unsigned char acidx;
200 bool allow = false,
201 has_access = false,
202 has_type = false,
203 has_major = false,
204 has_minor = false;
205 int total_ins = 0,
206 cur_ins = 0,
207 pre_insn_len = sizeof(pre_insn) / sizeof(struct bpf_insn),
208 next_ins;
209 char *access, *devtype;
210 uint32_t devmajor, devminor;
211 struct dev_exception_item *dl;
212 struct list_head exceptions;
213 enum devcg_behavior behavior = DEVCG_DEFAULT_ALLOW;
214 INIT_LIST_HEAD(&exceptions);
215
216 /* parse according to OCI spec */
217 blobmsg_for_each_attr(cur, msg, rem) {
218 blobmsg_parse(oci_linux_cgroups_devices_policy, __OCI_LINUX_CGROUPS_DEVICES_MAX,
219 tb, blobmsg_data(cur), blobmsg_len(cur));
220
221 if (!tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]) {
222 ret = EINVAL;
223 goto out;
224 }
225
226 allow = blobmsg_get_bool(tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]);
227
228 bpf_access = 0;
229 if (tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]) {
230 access = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]);
231 if ((strlen(access) > 3) || (strlen(access) == 0)) {
232 ret = EINVAL;
233 goto out;
234 }
235
236 for (acidx = 0; acidx < strlen(access); ++acidx) {
237 switch (access[acidx]) {
238 case 'r':
239 bpf_access |= BPF_DEVCG_ACC_READ;
240 break;
241 case 'w':
242 bpf_access |= BPF_DEVCG_ACC_WRITE;
243 break;
244 case 'm':
245 bpf_access |= BPF_DEVCG_ACC_MKNOD;
246 break;
247 default:
248 ret = EINVAL;
249 goto out;
250 }
251 }
252 }
253
254 if (!bpf_access)
255 bpf_access = DEVCG_ACC_ALL;
256
257 bpf_type = 0;
258 if (tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]) {
259 devtype = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]);
260
261 switch (devtype[0]) {
262 case 'c':
263 bpf_type = BPF_DEVCG_DEV_CHAR;
264 break;
265 case 'b':
266 bpf_type = BPF_DEVCG_DEV_BLOCK;
267 break;
268 case 'a':
269 bpf_type = DEVCG_DEV_ALL;
270 break;
271 default:
272 ret = EINVAL;
273 goto out;
274 }
275 }
276
277 if (!bpf_type)
278 bpf_type = DEVCG_DEV_ALL;
279
280 if (tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR])
281 devmajor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR]);
282 else
283 devmajor = ~0;
284
285 if (tb[OCI_LINUX_CGROUPS_DEVICES_MINOR])
286 devminor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MINOR]);
287 else
288 devminor = ~0;
289
290 if (bpf_type == DEVCG_DEV_ALL) {
291 /* wildcard => change default policy and flush all existing rules */
292 flush_exceptions(&exceptions);
293 behavior = allow?DEVCG_DEFAULT_ALLOW:DEVCG_DEFAULT_DENY;
294 } else {
295 /* allocate and populate record for exception */
296 dl = malloc(sizeof(struct dev_exception_item));
297 if (!dl) {
298 ret = ENOSPC;
299 break;
300 }
301 dl->allow = allow;
302 dl->type = bpf_type;
303 dl->access = bpf_access;
304 dl->major = devmajor;
305 dl->minor = devminor;
306
307 /* push to exceptions list, last goes first */
308 list_add(&dl->list, &exceptions);
309 }
310 }
311 if (ret)
312 goto out;
313
314 /* add default rules */
315 ret = add_default_exceptions(&exceptions);
316 if (ret)
317 goto out;
318
319 /* calculate number of instructions to allocate */
320 list_for_each_entry(dl, &exceptions, list) {
321 has_access = dl->access != DEVCG_ACC_ALL;
322 has_type = dl->type != DEVCG_DEV_ALL;
323 has_major = dl->major != ~0;
324 has_minor = dl->minor != ~0;
325
326 total_ins += (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 2;
327 }
328
329 /* acccount for loader instructions */
330 total_ins += pre_insn_len;
331
332 /* final accept/deny block */
333 total_ins += 2;
334
335 /* allocate memory for eBPF program */
336 program = calloc(total_ins, sizeof(struct bpf_insn));
337 if (!program) {
338 ret = ENOMEM;
339 goto out;
340 }
341
342 /* copy program loader instructions */
343 memcpy(program, &pre_insn, sizeof(pre_insn));
344 cur_ins = pre_insn_len;
345
346 /* generate eBPF program */
347 list_for_each_entry(dl, &exceptions, list) {
348 has_access = dl->access != DEVCG_ACC_ALL;
349 has_type = dl->type != DEVCG_DEV_ALL;
350 has_major = dl->major != ~0;
351 has_minor = dl->minor != ~0;
352
353 next_ins = (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 1;
354
355 if (has_type) {
356 program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dl->type, next_ins);
357 --next_ins;
358 }
359
360 if (has_access) {
361 program[cur_ins++] = BPF_MOV32_REG(BPF_REG_1, BPF_REG_3);
362 program[cur_ins++] = BPF_ALU32_IMM(BPF_AND, BPF_REG_1, dl->access);
363 program[cur_ins++] = BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, next_ins - 2);
364 next_ins -= 3;
365 }
366
367 if (has_major) {
368 program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_4, dl->major, next_ins);
369 --next_ins;
370 }
371
372 if (has_minor) {
373 program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_5, dl->minor, next_ins);
374 --next_ins;
375 }
376
377 program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, dl->allow ? 1 : 0);
378 program[cur_ins++] = BPF_EXIT_INSN();
379 }
380
381 /* default behavior */
382 program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, (behavior == DEVCG_DEFAULT_ALLOW)?1:0);
383 program[cur_ins++] = BPF_EXIT_INSN();
384
385 if (debug) {
386 fprintf(stderr, "cgroup devices:\na > devices.%s\n",
387 (behavior == DEVCG_DEFAULT_ALLOW)?"allow":"deny");
388
389 list_for_each_entry(dl, &exceptions, list)
390 fprintf(stderr, "%c %d:%d %s%s%s > devices.%s\n",
391 (dl->type == DEVCG_DEV_ALL)?'a':
392 (dl->type == BPF_DEVCG_DEV_CHAR)?'c':'b',
393 (dl->major == ~0)?-1:dl->major,
394 (dl->minor == ~0)?-1:dl->minor,
395 (dl->access & BPF_DEVCG_ACC_READ)?"r":"",
396 (dl->access & BPF_DEVCG_ACC_WRITE)?"w":"",
397 (dl->access & BPF_DEVCG_ACC_MKNOD)?"m":"",
398 (dl->allow)?"allow":"deny");
399
400 fprintf(stderr, "generated cgroup-devices eBPF program:\n");
401 fprintf(stderr, " [idx]\tcode\t dest\t src\t off\t imm\n");
402 for (cur_ins=0; cur_ins<total_ins; cur_ins++)
403 fprintf(stderr, " [%03d]\t%02hhx\t%3hhu\t%3hhu\t%04hx\t%d\n", cur_ins,
404 program[cur_ins].code,
405 program[cur_ins].dst_reg,
406 program[cur_ins].src_reg,
407 program[cur_ins].off,
408 program[cur_ins].imm);
409 }
410
411 assert(cur_ins == total_ins);
412 bpf_total_insn = total_ins;
413 ret = 0;
414
415 out:
416 flush_exceptions(&exceptions);
417 return ret;
418 }
419
420 /*
421 * attach eBPF program to cgroup
422 */
423 int attach_cgroups_ebpf(int cgroup_dirfd) {
424 int prog_fd;
425 #if ( __WORDSIZE == 64 )
426 uint64_t program_ptr = (uint64_t)program;
427 uint64_t license_ptr = (uint64_t)license;
428 #elif ( __WORDSIZE == 32 )
429 uint32_t program_ptr = (uint32_t)program;
430 uint32_t license_ptr = (uint32_t)license;
431 #else
432 #error
433 #endif
434 union bpf_attr load_attr = {
435 .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE,
436 .license = license_ptr,
437 .insns = program_ptr,
438 .insn_cnt = bpf_total_insn,
439 };
440
441 if (!program)
442 return 0;
443
444 prog_fd = syscall_bpf(BPF_PROG_LOAD, &load_attr, sizeof(load_attr));
445 if (prog_fd < 0)
446 return EIO;
447
448 union bpf_attr attach_attr = {
449 .attach_type = BPF_CGROUP_DEVICE,
450 .target_fd = cgroup_dirfd,
451 .attach_bpf_fd = prog_fd,
452 };
453
454 return syscall_bpf(BPF_PROG_ATTACH, &attach_attr, sizeof (attach_attr));
455 }