jail: fix several issues discovered by Coverity
[project/procd.git] / jail / cgroups.c
1 /*
2 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License version 2.1
6 * as published by the Free Software Foundation
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * reads unified cgroup config as proposed in
14 * https://github.com/opencontainers/runtime-spec/pull/1040
15 * attempt conversion from cgroup1 -> cgroup2
16 * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
17 *
18 * ToDo:
19 * - convert cgroup1 net_prio and net_cls to eBPF program
20 * - rdma (anyone?) intelrdt (anyone?)
21 */
22
23 #define _GNU_SOURCE
24
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <sys/stat.h>
31 #include <sys/mman.h>
32 #include <unistd.h>
33 #include <libgen.h>
34 #include <inttypes.h>
35
36 #include <libubox/avl.h>
37 #include <libubox/avl-cmp.h>
38 #include <libubox/blobmsg.h>
39 #include <libubox/list.h>
40 #include <libubox/utils.h>
41
42 #include "log.h"
43 #include "cgroups.h"
44 #include "cgroups-bpf.h"
45
46 #define CGROUP_ROOT "/sys/fs/cgroup/"
47 #define CGROUP_IO_WEIGHT_MAX 10000
48
49 struct cgval {
50 struct avl_node avl;
51 char *val;
52 };
53
54 struct avl_tree cgvals;
55 static char *cgroup_path;
56 static bool initialized;
57
58 void cgroups_prepare(void) {
59 initialized = false;
60 }
61
62 void cgroups_init(const char *p) {
63 avl_init(&cgvals, avl_strcmp, false, NULL);
64 cgroup_path = strdup(p);
65 initialized = true;
66 }
67
68 static void cgroups_set(const char *key, const char *val)
69 {
70 struct cgval *valp;
71
72 valp = avl_find_element(&cgvals, key, valp, avl);
73 if (!valp) {
74 valp = malloc(sizeof(struct cgval));
75 if (!valp)
76 exit(ENOMEM);
77
78 valp->avl.key = strdup(key);
79 avl_insert(&cgvals, &valp->avl);
80 } else {
81 DEBUG("overwriting previous cgroup2 assignment %s=\"%s\"!\n", key, valp->val);
82 free(valp->val);
83 }
84
85 valp->val = strdup(val);
86 }
87
88 void cgroups_free(void)
89 {
90 struct cgval *valp, *tmp;
91
92 if (initialized) {
93 avl_remove_all_elements(&cgvals, valp, avl, tmp) {
94 free((void *)(valp->avl.key));
95 free(valp->val);
96 free(valp);
97 }
98 free(cgroup_path);
99 }
100 }
101
102 void cgroups_apply(pid_t pid)
103 {
104 struct cgval *valp;
105 char *cdir, *ent;
106 int fd;
107 size_t maxlen = strlen("cgroup.subtree_control");
108
109 bool cpuset = false,
110 cpu = false,
111 hugetlb = false,
112 io = false,
113 memory = false,
114 pids = false,
115 rdma = false;
116
117 char subtree_control[64] = { 0 };
118
119 DEBUG("using cgroup path %s\n", cgroup_path);
120 mkdir_p(cgroup_path, 0700);
121
122 /* find which controllers need to be enabled */
123 avl_for_each_element(&cgvals, valp, avl) {
124 ent = (char *)valp->avl.key;
125 if (strlen(ent) > maxlen)
126 maxlen = strlen(ent);
127
128 if (!strncmp("cpuset.", ent, 7))
129 cpuset = true;
130 else if (!strncmp("cpu.", ent, 4))
131 cpu = true;
132 else if (!strncmp("hugetlb.", ent, 8))
133 hugetlb = true;
134 else if (!strncmp("io.", ent, 3))
135 io = true;
136 else if (!strncmp("memory.", ent, 7))
137 memory = true;
138 else if (!strncmp("pids.", ent, 5))
139 pids = true;
140 else if (!strncmp("rdma.", ent, 5))
141 pids = true;
142 }
143
144 maxlen += strlen(cgroup_path) + 2;
145
146 if (cpuset)
147 strcat(subtree_control, "+cpuset ");
148
149 if (cpu)
150 strcat(subtree_control, "+cpu ");
151
152 if (hugetlb)
153 strcat(subtree_control, "+hugetlb ");
154
155 if (io)
156 strcat(subtree_control, "+io ");
157
158 if (memory)
159 strcat(subtree_control, "+memory ");
160
161 if (pids)
162 strcat(subtree_control, "+pids ");
163
164 if (rdma)
165 strcat(subtree_control, "+rdma ");
166
167 /* remove trailing space */
168 ent = strchr(subtree_control, '\0') - 1;
169 *ent = '\0';
170
171 ent = malloc(maxlen);
172 if (!ent)
173 exit(ENOMEM);
174
175 DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
176 cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
177 while ((cdir = strchr(cdir + 1, '/'))) {
178 *cdir = '\0';
179 snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
180 DEBUG(" * %s\n", ent);
181 if ((fd = open(ent, O_WRONLY)) < 0) {
182 ERROR("can't open %s: %m\n", ent);
183 continue;
184 }
185
186 if (write(fd, subtree_control, strlen(subtree_control)) == -1) {
187 ERROR("can't write to %s: %m\n", ent);
188 close(fd);
189 continue;
190 }
191
192 close(fd);
193 *cdir = '/';
194 }
195
196 avl_for_each_element(&cgvals, valp, avl) {
197 DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
198 snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
199 fd = open(ent, O_WRONLY);
200 if (fd < 0) {
201 ERROR("can't open %s: %m\n", ent);
202 continue;
203 }
204 if (dprintf(fd, "%s", valp->val) < 0) {
205 ERROR("can't write to %s: %m\n", ent);
206 };
207 close(fd);
208 }
209
210 int dirfd = open(cgroup_path, O_DIRECTORY);
211 if (dirfd < 0) {
212 ERROR("can't open %s: %m\n", cgroup_path);
213 } else {
214 attach_cgroups_ebpf(dirfd);
215 close(dirfd);
216 }
217
218 snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
219 fd = open(ent, O_WRONLY);
220 if (fd < 0) {
221 ERROR("can't open %s: %m\n", cgroup_path);
222 } else {
223 dprintf(fd, "%d", pid);
224 close(fd);
225 }
226
227 free(ent);
228 }
229
230 enum {
231 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR,
232 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR,
233 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT,
234 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT,
235 __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX,
236 };
237
238 static const struct blobmsg_policy oci_linux_cgroups_blockio_weightdevice_policy[] = {
239 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
240 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
241 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
242 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
243 };
244
245 enum {
246 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR,
247 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR,
248 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE,
249 __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX,
250 };
251
252 static const struct blobmsg_policy oci_linux_cgroups_blockio_throttledevice_policy[] = {
253 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
254 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
255 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE] = { "rate", BLOBMSG_CAST_INT64 },
256 };
257
258 enum {
259 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT,
260 OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT,
261 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE,
262 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE,
263 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE,
264 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE,
265 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE,
266 __OCI_LINUX_CGROUPS_BLOCKIO_MAX,
267 };
268
269 static const struct blobmsg_policy oci_linux_cgroups_blockio_policy[] = {
270 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
271 [OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
272 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE] = { "weightDevice", BLOBMSG_TYPE_ARRAY },
273 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE] = { "throttleReadBpsDevice", BLOBMSG_TYPE_ARRAY },
274 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE] = { "throttleWriteBpsDevice", BLOBMSG_TYPE_ARRAY },
275 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE] = { "throttleReadIOPSDevice", BLOBMSG_TYPE_ARRAY },
276 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE] = { "throttleWriteIOPSDevice", BLOBMSG_TYPE_ARRAY },
277 };
278
/* device number pair; used as the sort key of the io.max line tree */
struct posix_dev {
	uint64_t major;
	uint64_t minor;
};
283
284 struct iomax_line {
285 struct avl_node avl;
286 struct posix_dev dev;
287 uint64_t rbps;
288 uint64_t wbps;
289 uint64_t riops;
290 uint64_t wiops;
291 };
292
293 static int avl_devcmp(const void *k1, const void *k2, void *ptr)
294 {
295 struct posix_dev *d1 = (struct posix_dev *)k1, *d2 = (struct posix_dev *)k2;
296
297 if (d1->major < d2->major)
298 return -1;
299
300 if (d1->major > d2->major)
301 return 1;
302
303 if (d1->minor < d2->minor)
304 return -1;
305
306 if (d1->minor > d2->minor)
307 return 1;
308
309 return 0;
310 }
311
312 static struct iomax_line *get_iomax_line(struct avl_tree *iomax, uint64_t major, uint64_t minor)
313 {
314 struct iomax_line *l;
315 struct posix_dev d;
316 d.major = major;
317 d.minor = minor;
318 l = avl_find_element(iomax, &d, l, avl);
319 if (!l) {
320 l = malloc(sizeof(struct iomax_line));
321 if (!l)
322 exit(ENOMEM);
323
324 l->dev.major = d.major;
325 l->dev.minor = d.minor;
326 l->avl.key = &l->dev;
327 l->rbps = -1;
328 l->wbps = -1;
329 l->riops = -1;
330 l->wiops = -1;
331 avl_insert(iomax, &l->avl);
332 }
333
334 return l;
335 }
336
337 static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
338 {
339 struct blob_attr *tb[__OCI_LINUX_CGROUPS_BLOCKIO_MAX],
340 *tbwd[__OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX],
341 *tbtd[__OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX],
342 *cur;
343 int rem;
344 int weight = -1, leafweight = -1;
345 size_t numweightstrs = 0, numiomaxstrs = 0, strtotlen = 1;
346 char **weightstrs = NULL, **iomaxstrs = NULL, **curstr;
347 char *weightstr, *iomaxstr;
348 struct avl_tree iomax;
349 struct iomax_line *curiomax, *tmp;
350
351 blobmsg_parse(oci_linux_cgroups_blockio_policy, __OCI_LINUX_CGROUPS_BLOCKIO_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
352
353 if (tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]) {
354 weight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
355 ++numweightstrs;
356 }
357
358 if (weight > CGROUP_IO_WEIGHT_MAX)
359 return ERANGE;
360
361 if (tb[OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT])
362 leafweight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
363
364 if (leafweight > CGROUP_IO_WEIGHT_MAX)
365 return ERANGE;
366
367 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem)
368 ++numweightstrs;
369
370 weightstrs = calloc(numweightstrs + 1, sizeof(char *));
371 if (!weightstrs)
372 exit(ENOMEM);
373
374 numweightstrs = 0;
375
376 if (weight > -1)
377 if (asprintf(&weightstrs[numweightstrs++], "default %d", weight) < 0)
378 return ENOMEM;
379
380 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
381 uint64_t major, minor;
382 int devweight = weight, devleafweight = leafweight;
383
384 blobmsg_parse(oci_linux_cgroups_blockio_weightdevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX, tbwd, blobmsg_data(cur), blobmsg_len(cur));
385 if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] ||
386 !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR])
387 return ENODATA;
388
389 if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] &&
390 !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
391 return ENODATA;
392
393 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT])
394 devweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT]);
395
396 if (devweight > CGROUP_IO_WEIGHT_MAX)
397 return ERANGE;
398
399 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
400 devleafweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]);
401
402 if (devleafweight > CGROUP_IO_WEIGHT_MAX)
403 return ERANGE;
404
405 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
406 return ENOTSUP;
407
408 major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
409 minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
410
411 if (asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight) < 0)
412 return ENOMEM;
413 }
414
415 if (numweightstrs) {
416 curstr = weightstrs;
417 while (*curstr)
418 strtotlen += strlen(*(curstr++)) + 1;
419
420 weightstr = calloc(strtotlen, sizeof(char));
421 if (!weightstr)
422 exit(ENOMEM);
423
424 curstr = weightstrs;
425 while (*curstr) {
426 strcat(weightstr, *curstr);
427 strcat(weightstr, "\n");
428 free(*(curstr++));
429 }
430
431 cgroups_set("io.bfq.weight", weightstr);
432 free(weightstr);
433 };
434
435 free(weightstrs);
436
437 avl_init(&iomax, avl_devcmp, false, NULL);
438
439 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE], rem) {
440 struct iomax_line *l;
441
442 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
443
444 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
445 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
446 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
447 return ENODATA;
448
449 l = get_iomax_line(&iomax,
450 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
451 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
452
453 l->rbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
454 }
455
456 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE], rem) {
457 struct iomax_line *l;
458
459 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
460
461 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
462 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
463 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
464 return ENODATA;
465
466 l = get_iomax_line(&iomax,
467 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
468 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
469
470 l->wbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
471 }
472
473 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE], rem) {
474 struct iomax_line *l;
475
476 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
477
478 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
479 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
480 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
481 return ENODATA;
482
483 l = get_iomax_line(&iomax,
484 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
485 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
486
487 l->riops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
488 }
489
490 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE], rem) {
491 struct iomax_line *l;
492
493 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
494
495 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
496 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
497 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
498 return ENODATA;
499
500 l = get_iomax_line(&iomax,
501 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
502 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
503
504 l->wiops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
505 }
506
507 avl_for_each_element(&iomax, curiomax, avl)
508 ++numiomaxstrs;
509
510 if (!numiomaxstrs)
511 return 0;
512
513 iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
514 if (!iomaxstrs)
515 exit(ENOMEM);
516
517 numiomaxstrs = 0;
518
519 avl_for_each_element(&iomax, curiomax, avl) {
520 char iomaxlstr[160];
521 char lstr[32];
522
523 sprintf(iomaxlstr, "%" PRIu64 ":%" PRIu64 " ", curiomax->dev.major, curiomax->dev.minor);
524
525 if (curiomax->rbps != -1) {
526 sprintf(lstr, "rbps=%" PRIu64 " ", curiomax->rbps);
527 strcat(iomaxlstr, lstr);
528 }
529 if (curiomax->wbps != -1) {
530 sprintf(lstr, "wbps=%" PRIu64 " ", curiomax->wbps);
531 strcat(iomaxlstr, lstr);
532 }
533 if (curiomax->riops != -1) {
534 sprintf(lstr, "riops=%" PRIu64 " ", curiomax->riops);
535 strcat(iomaxlstr, lstr);
536 }
537 if (curiomax->wiops != -1) {
538 sprintf(lstr, "wiops=%" PRIu64 " ", curiomax->wiops);
539 strcat(iomaxlstr, lstr);
540 }
541
542 iomaxstrs[numiomaxstrs++] = strdup(iomaxlstr);
543 }
544
545 avl_for_each_element_safe(&iomax, curiomax, avl, tmp) {
546 avl_delete(&iomax, &curiomax->avl);
547 free(curiomax);
548 }
549
550 strtotlen = 1; /* 1 accounts for \0 at end of string */
551 if (numiomaxstrs) {
552 curstr = iomaxstrs;
553 while (*curstr)
554 strtotlen += strlen(*(curstr++)) + 1; /* +1 accounts for \n at end of line */
555
556 iomaxstr = calloc(strtotlen, sizeof(char));
557 if (!iomaxstr)
558 exit(ENOMEM);
559
560 curstr = iomaxstrs;
561
562 while (*curstr) {
563 strcat(iomaxstr, *curstr);
564 strcat(iomaxstr, "\n");
565 free(*(curstr++));
566 }
567
568 cgroups_set("io.max", iomaxstr);
569 free(iomaxstr);
570 };
571
572 free(iomaxstrs);
573
574 return 0;
575 }
576
577
578 enum {
579 OCI_LINUX_CGROUPS_CPU_SHARES,
580 OCI_LINUX_CGROUPS_CPU_PERIOD,
581 OCI_LINUX_CGROUPS_CPU_QUOTA,
582 OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME,
583 OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD,
584 OCI_LINUX_CGROUPS_CPU_CPUS,
585 OCI_LINUX_CGROUPS_CPU_MEMS,
586 __OCI_LINUX_CGROUPS_CPU_MAX,
587 };
588
589 static const struct blobmsg_policy oci_linux_cgroups_cpu_policy[] = {
590 [OCI_LINUX_CGROUPS_CPU_SHARES] = { "shares", BLOBMSG_CAST_INT64 },
591 [OCI_LINUX_CGROUPS_CPU_PERIOD] = { "period", BLOBMSG_CAST_INT64 },
592 [OCI_LINUX_CGROUPS_CPU_QUOTA] = { "quota", BLOBMSG_CAST_INT64 }, /* signed int64! */
593 [OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] = { "realtimePeriod", BLOBMSG_CAST_INT64 },
594 [OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME] = { "realtimeRuntime", BLOBMSG_CAST_INT64 },
595 [OCI_LINUX_CGROUPS_CPU_CPUS] = { "cpus", BLOBMSG_TYPE_STRING },
596 [OCI_LINUX_CGROUPS_CPU_MEMS] = { "mems", BLOBMSG_TYPE_STRING },
597 };
598
599 static int parseOCIlinuxcgroups_legacy_cpu(struct blob_attr *msg)
600 {
601 struct blob_attr *tb[__OCI_LINUX_CGROUPS_CPU_MAX];
602 uint64_t shares, period = 0;
603 int64_t quota = -2; /* unset */
604 char tmp[32] = { 0 };
605
606 blobmsg_parse(oci_linux_cgroups_cpu_policy, __OCI_LINUX_CGROUPS_CPU_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
607
608 if (tb[OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] ||
609 tb[OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME])
610 return ENOTSUP; /* no equivalent in cgroup2 */
611
612 if (tb[OCI_LINUX_CGROUPS_CPU_SHARES]) {
613 shares = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_SHARES]);
614 if ((shares < 2) || (shares > 262144))
615 return ERANGE;
616
617 snprintf(tmp, sizeof(tmp), "%" PRIu64, (((uint64_t)1) + ((shares - 2) * 9999) / 262142));
618 cgroups_set("cpu.weight", tmp);
619 tmp[0] = '\0';
620 }
621
622 if (tb[OCI_LINUX_CGROUPS_CPU_QUOTA])
623 quota = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_CPU_QUOTA]);
624
625 if (tb[OCI_LINUX_CGROUPS_CPU_PERIOD])
626 period = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_PERIOD]);
627
628 if (period) {
629 if (quota >= 0)
630 snprintf(tmp, sizeof(tmp), "%" PRId64 " %" PRIu64 , quota, period);
631 else
632 snprintf(tmp, sizeof(tmp), "max %" PRIu64, period); /* assume default */
633 } else if (quota >= 0) {
634 snprintf(tmp, sizeof(tmp), "%" PRId64, quota);
635 } else if (quota == -1) {
636 strcpy(tmp, "max");
637 }
638
639 if (tmp[0])
640 cgroups_set("cpu.max", tmp);
641
642 if (tb[OCI_LINUX_CGROUPS_CPU_CPUS])
643 cgroups_set("cpuset.cpus", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_CPUS]));
644
645 if (tb[OCI_LINUX_CGROUPS_CPU_MEMS])
646 cgroups_set("cpuset.mems", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_MEMS]));
647
648 return 0;
649 }
650
651
652 enum {
653 OCI_LINUX_CGROUPS_MEMORY_LIMIT,
654 OCI_LINUX_CGROUPS_MEMORY_RESERVATION,
655 OCI_LINUX_CGROUPS_MEMORY_SWAP,
656 OCI_LINUX_CGROUPS_MEMORY_KERNEL,
657 OCI_LINUX_CGROUPS_MEMORY_KERNELTCP,
658 OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS,
659 OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER,
660 OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY,
661 __OCI_LINUX_CGROUPS_MEMORY_MAX,
662 };
663
664 static const struct blobmsg_policy oci_linux_cgroups_memory_policy[] = {
665 [OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
666 [OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
667 [OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
668 [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
669 [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
670 [OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
671 [OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
672 [OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] { "useHierarchy", BLOBMSG_TYPE_BOOL },
673 };
674
675 static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
676 {
677 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
678 char tmp[32] = { 0 };
679 int64_t limit = -1, swap, reservation;
680
681 blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
682
683 /*
684 * not all properties of the OCI memory section can be mapped to cgroup2
685 * kernel memory accounting is always enabled and included in the set
686 * memory limit, hence these options can be ignored
687 * disableOOMKiller could be emulated using oom_score_adj + seccomp eBPF
688 * preventing self-upgrade (but allow downgrade)
689 *
690 * see also https://github.com/opencontainers/runtime-spec/issues/1005
691 */
692 if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
693 tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
694 tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
695 return ENOTSUP;
696
697
698 if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
699 limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
700 if (limit == -1)
701 strcpy(tmp, "max");
702 else
703 snprintf(tmp, sizeof(tmp), "%" PRId64, limit);
704
705 cgroups_set("memory.max", tmp);
706 }
707
708 if (tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]) {
709 reservation = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]);
710
711 if (reservation == -1)
712 strcpy(tmp, "max");
713 else
714 snprintf(tmp, sizeof(tmp), "%" PRId64, reservation);
715
716 cgroups_set("memory.low", tmp);
717 }
718
719 /* OCI 'swap' acounts for memory+swap */
720 if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
721 swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
722
723 if (swap == -1)
724 strcpy(tmp, "max");
725 else if (limit == -1 || (limit < swap))
726 snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
727 else
728 snprintf(tmp, sizeof(tmp), "%" PRId64, limit - swap);
729
730 cgroups_set("memory.swap_max", tmp);
731 }
732
733 return 0;
734 }
735
736
737 enum {
738 OCI_LINUX_CGROUPS_PIDS_LIMIT,
739 __OCI_LINUX_CGROUPS_PIDS_MAX,
740 };
741
742 static const struct blobmsg_policy oci_linux_cgroups_pids_policy[] = {
743 [OCI_LINUX_CGROUPS_PIDS_LIMIT] = { "limit", BLOBMSG_CAST_INT64 },
744 };
745
746 static int parseOCIlinuxcgroups_legacy_pids(struct blob_attr *msg)
747 {
748 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
749 char tmp[32] = { 0 };
750
751 blobmsg_parse(oci_linux_cgroups_pids_policy, __OCI_LINUX_CGROUPS_PIDS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
752
753 if (!tb[OCI_LINUX_CGROUPS_PIDS_LIMIT])
754 return EINVAL;
755
756 snprintf(tmp, sizeof(tmp), "%" PRIu64, blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_PIDS_LIMIT]));
757
758 cgroups_set("pids.max", tmp);
759
760 return 0;
761 }
762
763 static int parseOCIlinuxcgroups_unified(struct blob_attr *msg)
764 {
765 struct blob_attr *cur;
766 int rem;
767
768 blobmsg_for_each_attr(cur, msg, rem) {
769 if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
770 return EINVAL;
771
772 /* restrict keys */
773 if (strchr(blobmsg_name(cur), '/') ||
774 !strcmp(blobmsg_name(cur), "cgroup.subtree_control") ||
775 !strcmp(blobmsg_name(cur), "cgroup.procs") ||
776 !strcmp(blobmsg_name(cur), "cgroup.threads") ||
777 !strcmp(blobmsg_name(cur), "cgroup.freeze"))
778 return EINVAL;
779
780 cgroups_set(blobmsg_name(cur), blobmsg_get_string(cur));
781 }
782
783 return 0;
784 }
785
786 enum {
787 OCI_LINUX_CGROUPS_BLOCKIO,
788 OCI_LINUX_CGROUPS_CPU,
789 OCI_LINUX_CGROUPS_DEVICES,
790 OCI_LINUX_CGROUPS_HUGEPAGELIMITS,
791 OCI_LINUX_CGROUPS_INTELRDT,
792 OCI_LINUX_CGROUPS_MEMORY,
793 OCI_LINUX_CGROUPS_NETWORK,
794 OCI_LINUX_CGROUPS_PIDS,
795 OCI_LINUX_CGROUPS_RDMA,
796 OCI_LINUX_CGROUPS_UNIFIED,
797 __OCI_LINUX_CGROUPS_MAX,
798 };
799
800 static const struct blobmsg_policy oci_linux_cgroups_policy[] = {
801 [OCI_LINUX_CGROUPS_BLOCKIO] = { "blockIO", BLOBMSG_TYPE_TABLE },
802 [OCI_LINUX_CGROUPS_CPU] = { "cpu", BLOBMSG_TYPE_TABLE },
803 [OCI_LINUX_CGROUPS_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
804 [OCI_LINUX_CGROUPS_HUGEPAGELIMITS] = { "hugepageLimits", BLOBMSG_TYPE_ARRAY },
805 [OCI_LINUX_CGROUPS_INTELRDT] = { "intelRdt", BLOBMSG_TYPE_TABLE },
806 [OCI_LINUX_CGROUPS_MEMORY] = { "memory", BLOBMSG_TYPE_TABLE },
807 [OCI_LINUX_CGROUPS_NETWORK] = { "network", BLOBMSG_TYPE_TABLE },
808 [OCI_LINUX_CGROUPS_PIDS] = { "pids", BLOBMSG_TYPE_TABLE },
809 [OCI_LINUX_CGROUPS_RDMA] = { "rdma", BLOBMSG_TYPE_TABLE },
810 [OCI_LINUX_CGROUPS_UNIFIED] = { "unified", BLOBMSG_TYPE_TABLE },
811 };
812
813 int parseOCIlinuxcgroups(struct blob_attr *msg)
814 {
815 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MAX];
816 int ret;
817
818 blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
819
820 if (tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
821 tb[OCI_LINUX_CGROUPS_INTELRDT] ||
822 tb[OCI_LINUX_CGROUPS_NETWORK] ||
823 tb[OCI_LINUX_CGROUPS_RDMA])
824 return ENOTSUP;
825
826 if (tb[OCI_LINUX_CGROUPS_BLOCKIO]) {
827 ret = parseOCIlinuxcgroups_legacy_blockio(tb[OCI_LINUX_CGROUPS_BLOCKIO]);
828 if (ret)
829 return ret;
830 }
831
832 if (tb[OCI_LINUX_CGROUPS_CPU]) {
833 ret = parseOCIlinuxcgroups_legacy_cpu(tb[OCI_LINUX_CGROUPS_CPU]);
834 if (ret)
835 return ret;
836 }
837
838 if (tb[OCI_LINUX_CGROUPS_DEVICES]) {
839 ret = parseOCIlinuxcgroups_devices(tb[OCI_LINUX_CGROUPS_DEVICES]);
840 if (ret)
841 return ret;
842 }
843
844 if (tb[OCI_LINUX_CGROUPS_MEMORY]) {
845 ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
846 if (ret)
847 return ret;
848 }
849
850 if (tb[OCI_LINUX_CGROUPS_PIDS]) {
851 ret = parseOCIlinuxcgroups_legacy_pids(tb[OCI_LINUX_CGROUPS_PIDS]);
852 if (ret)
853 return ret;
854 }
855
856 if (tb[OCI_LINUX_CGROUPS_UNIFIED]) {
857 ret = parseOCIlinuxcgroups_unified(tb[OCI_LINUX_CGROUPS_UNIFIED]);
858 if (ret)
859 return ret;
860 }
861
862 return 0;
863 }