3e53dd5e71410c7ee0ac5a420b1a06b4acd13539
[project/procd.git] / jail / cgroups.c
1 /*
2 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License version 2.1
6 * as published by the Free Software Foundation
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * reads unified cgroup config as proposed in
14 * https://github.com/opencontainers/runtime-spec/pull/1040
15 * attempt conversion from cgroup1 -> cgroup2
16 * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
17 *
18 * ToDo:
19 * - convert cgroup1 net_prio and net_cls to eBPF program
20 * - rdma (anyone?) intelrdt (anyone?)
21 */
22
23 #define _GNU_SOURCE
24
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <sys/stat.h>
31 #include <sys/mman.h>
32 #include <unistd.h>
33 #include <libgen.h>
34 #include <inttypes.h>
35
36 #include <libubox/avl.h>
37 #include <libubox/avl-cmp.h>
38 #include <libubox/blobmsg.h>
39 #include <libubox/list.h>
40 #include <libubox/utils.h>
41
42 #include "log.h"
43 #include "cgroups.h"
44 #include "cgroups-bpf.h"
45
46 #define CGROUP_ROOT "/sys/fs/cgroup/"
47 #define CGROUP_IO_WEIGHT_MAX 10000
48
49 struct cgval {
50 struct avl_node avl;
51 char *val;
52 };
53
54 struct avl_tree cgvals;
55 static char *cgroup_path;
56 static bool initialized;
57
58 void cgroups_prepare(void) {
59 initialized = false;
60 }
61
62 void cgroups_init(const char *p) {
63 avl_init(&cgvals, avl_strcmp, false, NULL);
64 cgroup_path = strdup(p);
65 initialized = true;
66 }
67
68 static void cgroups_set(const char *key, const char *val)
69 {
70 struct cgval *valp;
71
72 valp = avl_find_element(&cgvals, key, valp, avl);
73 if (!valp) {
74 valp = malloc(sizeof(struct cgval));
75 if (!valp)
76 exit(ENOMEM);
77
78 valp->avl.key = strdup(key);
79 avl_insert(&cgvals, &valp->avl);
80 } else {
81 DEBUG("overwriting previous cgroup2 assignment %s=\"%s\"!\n", key, valp->val);
82 free(valp->val);
83 }
84
85 valp->val = strdup(val);
86 }
87
88 void cgroups_free(void)
89 {
90 struct cgval *valp, *tmp;
91
92 if (initialized) {
93 avl_remove_all_elements(&cgvals, valp, avl, tmp) {
94 free((void *)(valp->avl.key));
95 free(valp->val);
96 free(valp);
97 }
98 free(cgroup_path);
99 }
100 }
101
102 void cgroups_apply(pid_t pid)
103 {
104 struct cgval *valp;
105 char *cdir, *ent;
106 int fd;
107 size_t maxlen = strlen("cgroup.subtree_control");
108
109 bool cpuset = false,
110 cpu = false,
111 hugetlb = false,
112 io = false,
113 memory = false,
114 pids = false,
115 rdma = false;
116
117 char subtree_control[64] = { 0 };
118
119 DEBUG("using cgroup path %s\n", cgroup_path);
120 mkdir_p(cgroup_path, 0700);
121
122 /* find which controllers need to be enabled */
123 avl_for_each_element(&cgvals, valp, avl) {
124 ent = (char *)valp->avl.key;
125 if (strlen(ent) > maxlen)
126 maxlen = strlen(ent);
127
128 if (!strncmp("cpuset.", ent, 7))
129 cpuset = true;
130 else if (!strncmp("cpu.", ent, 4))
131 cpu = true;
132 else if (!strncmp("hugetlb.", ent, 8))
133 hugetlb = true;
134 else if (!strncmp("io.", ent, 3))
135 io = true;
136 else if (!strncmp("memory.", ent, 7))
137 memory = true;
138 else if (!strncmp("pids.", ent, 5))
139 pids = true;
140 else if (!strncmp("rdma.", ent, 5))
141 pids = true;
142 }
143
144 maxlen += strlen(cgroup_path) + 2;
145
146 if (cpuset)
147 strcat(subtree_control, "+cpuset ");
148
149 if (cpu)
150 strcat(subtree_control, "+cpu ");
151
152 if (hugetlb)
153 strcat(subtree_control, "+hugetlb ");
154
155 if (io)
156 strcat(subtree_control, "+io ");
157
158 if (memory)
159 strcat(subtree_control, "+memory ");
160
161 if (pids)
162 strcat(subtree_control, "+pids ");
163
164 if (rdma)
165 strcat(subtree_control, "+rdma ");
166
167 /* remove trailing space */
168 ent = strchr(subtree_control, '\0') - 1;
169 *ent = '\0';
170
171 ent = malloc(maxlen);
172 if (!ent)
173 exit(ENOMEM);
174
175 DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
176 cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
177 while ((cdir = strchr(cdir + 1, '/'))) {
178 *cdir = '\0';
179 snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
180 DEBUG(" * %s\n", ent);
181 if ((fd = open(ent, O_WRONLY)) == -1) {
182 ERROR("can't open %s: %m\n", ent);
183 continue;
184 }
185
186 if (write(fd, subtree_control, strlen(subtree_control)) == -1) {
187 ERROR("can't write to %s: %m\n", ent);
188 continue;
189 }
190
191 close(fd);
192 *cdir = '/';
193 }
194
195 avl_for_each_element(&cgvals, valp, avl) {
196 DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
197 snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
198 fd = open(ent, O_WRONLY);
199 if (fd == -1) {
200 ERROR("can't open %s: %m\n", ent);
201 continue;
202 }
203 if (dprintf(fd, "%s", valp->val) < 0) {
204 ERROR("can't write to %s: %m\n", ent);
205 };
206 close(fd);
207 }
208
209 int dirfd = open(cgroup_path, O_DIRECTORY);
210 if (dirfd < 0) {
211 ERROR("can't open %s: %m\n", cgroup_path);
212 } else {
213 attach_cgroups_ebpf(dirfd);
214 close(dirfd);
215 }
216
217 snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
218 fd = open(ent, O_WRONLY);
219 if (fd < 0) {
220 ERROR("can't open %s: %m\n", cgroup_path);
221 } else {
222 dprintf(fd, "%d", pid);
223 close(fd);
224 }
225
226 free(ent);
227 }
228
229 enum {
230 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR,
231 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR,
232 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT,
233 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT,
234 __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX,
235 };
236
237 static const struct blobmsg_policy oci_linux_cgroups_blockio_weightdevice_policy[] = {
238 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
239 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
240 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
241 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
242 };
243
244 enum {
245 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR,
246 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR,
247 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE,
248 __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX,
249 };
250
251 static const struct blobmsg_policy oci_linux_cgroups_blockio_throttledevice_policy[] = {
252 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
253 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
254 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE] = { "rate", BLOBMSG_CAST_INT64 },
255 };
256
257 enum {
258 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT,
259 OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT,
260 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE,
261 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE,
262 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE,
263 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE,
264 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE,
265 __OCI_LINUX_CGROUPS_BLOCKIO_MAX,
266 };
267
268 static const struct blobmsg_policy oci_linux_cgroups_blockio_policy[] = {
269 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
270 [OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
271 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE] = { "weightDevice", BLOBMSG_TYPE_ARRAY },
272 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE] = { "throttleReadBpsDevice", BLOBMSG_TYPE_ARRAY },
273 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE] = { "throttleWriteBpsDevice", BLOBMSG_TYPE_ARRAY },
274 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE] = { "throttleReadIOPSDevice", BLOBMSG_TYPE_ARRAY },
275 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE] = { "throttleWriteIOPSDevice", BLOBMSG_TYPE_ARRAY },
276 };
277
/* Device number pair; serves as the AVL key of struct iomax_line. */
struct posix_dev {
	uint64_t major;	/* compared first by avl_devcmp() */
	uint64_t minor;	/* tie-breaker when majors are equal */
};
282
283 struct iomax_line {
284 struct avl_node avl;
285 struct posix_dev dev;
286 uint64_t rbps;
287 uint64_t wbps;
288 uint64_t riops;
289 uint64_t wiops;
290 };
291
292 static int avl_devcmp(const void *k1, const void *k2, void *ptr)
293 {
294 struct posix_dev *d1 = (struct posix_dev *)k1, *d2 = (struct posix_dev *)k2;
295
296 if (d1->major < d2->major)
297 return -1;
298
299 if (d1->major > d2->major)
300 return 1;
301
302 if (d1->minor < d2->minor)
303 return -1;
304
305 if (d1->minor > d2->minor)
306 return 1;
307
308 return 0;
309 }
310
311 static struct iomax_line *get_iomax_line(struct avl_tree *iomax, uint64_t major, uint64_t minor)
312 {
313 struct iomax_line *l;
314 struct posix_dev d;
315 d.major = major;
316 d.minor = minor;
317 l = avl_find_element(iomax, &d, l, avl);
318 if (!l) {
319 l = malloc(sizeof(struct iomax_line));
320 if (!l)
321 exit(ENOMEM);
322
323 l->dev.major = d.major;
324 l->dev.minor = d.minor;
325 l->avl.key = &l->dev;
326 l->rbps = -1;
327 l->wbps = -1;
328 l->riops = -1;
329 l->wiops = -1;
330 avl_insert(iomax, &l->avl);
331 }
332
333 return l;
334 }
335
336 static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
337 {
338 struct blob_attr *tb[__OCI_LINUX_CGROUPS_BLOCKIO_MAX],
339 *tbwd[__OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX],
340 *tbtd[__OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX],
341 *cur;
342 int rem;
343 int weight = -1, leafweight = -1;
344 size_t numweightstrs = 0, numiomaxstrs = 0, strtotlen = 1;
345 char **weightstrs = NULL, **iomaxstrs = NULL, **curstr;
346 char *weightstr, *iomaxstr;
347 struct avl_tree iomax;
348 struct iomax_line *curiomax, *tmp;
349
350 blobmsg_parse(oci_linux_cgroups_blockio_policy, __OCI_LINUX_CGROUPS_BLOCKIO_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
351
352 if (tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]) {
353 weight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
354 ++numweightstrs;
355 }
356
357 if (weight > CGROUP_IO_WEIGHT_MAX)
358 return ERANGE;
359
360 if (tb[OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT])
361 leafweight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
362
363 if (leafweight > CGROUP_IO_WEIGHT_MAX)
364 return ERANGE;
365
366 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem)
367 ++numweightstrs;
368
369 weightstrs = calloc(numweightstrs + 1, sizeof(char *));
370 if (!weightstrs)
371 exit(ENOMEM);
372
373 numweightstrs = 0;
374
375 if (weight > -1)
376 if (asprintf(&weightstrs[numweightstrs++], "default %d", weight) < 0)
377 return ENOMEM;
378
379 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
380 uint64_t major, minor;
381 int devweight = weight, devleafweight = leafweight;
382
383 blobmsg_parse(oci_linux_cgroups_blockio_weightdevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX, tbwd, blobmsg_data(cur), blobmsg_len(cur));
384 if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] ||
385 !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR])
386 return ENODATA;
387
388 if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] &&
389 !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
390 return ENODATA;
391
392 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT])
393 devweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT]);
394
395 if (devweight > CGROUP_IO_WEIGHT_MAX)
396 return ERANGE;
397
398 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
399 devleafweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]);
400
401 if (devleafweight > CGROUP_IO_WEIGHT_MAX)
402 return ERANGE;
403
404 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
405 return ENOTSUP;
406
407 major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
408 minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
409
410 if (asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight) < 0)
411 return ENOMEM;
412 }
413
414 if (numweightstrs) {
415 curstr = weightstrs;
416 while (*curstr)
417 strtotlen += strlen(*(curstr++)) + 1;
418
419 weightstr = calloc(strtotlen, sizeof(char));
420 if (!weightstr)
421 exit(ENOMEM);
422
423 curstr = weightstrs;
424 while (*curstr) {
425 strcat(weightstr, *curstr);
426 strcat(weightstr, "\n");
427 free(*(curstr++));
428 }
429
430 cgroups_set("io.bfq.weight", weightstr);
431 free(weightstr);
432 };
433
434 free(weightstrs);
435
436 avl_init(&iomax, avl_devcmp, false, NULL);
437
438 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE], rem) {
439 struct iomax_line *l;
440
441 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
442
443 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
444 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
445 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
446 return ENODATA;
447
448 l = get_iomax_line(&iomax,
449 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
450 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
451
452 l->rbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
453 }
454
455 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE], rem) {
456 struct iomax_line *l;
457
458 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
459
460 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
461 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
462 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
463 return ENODATA;
464
465 l = get_iomax_line(&iomax,
466 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
467 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
468
469 l->wbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
470 }
471
472 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE], rem) {
473 struct iomax_line *l;
474
475 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
476
477 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
478 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
479 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
480 return ENODATA;
481
482 l = get_iomax_line(&iomax,
483 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
484 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
485
486 l->riops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
487 }
488
489 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE], rem) {
490 struct iomax_line *l;
491
492 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
493
494 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
495 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
496 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
497 return ENODATA;
498
499 l = get_iomax_line(&iomax,
500 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
501 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
502
503 l->wiops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
504 }
505
506 avl_for_each_element(&iomax, curiomax, avl)
507 ++numiomaxstrs;
508
509 if (!numiomaxstrs)
510 return 0;
511
512 iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
513 if (!iomaxstrs)
514 exit(ENOMEM);
515
516 numiomaxstrs = 0;
517
518 avl_for_each_element(&iomax, curiomax, avl) {
519 char iomaxlstr[160];
520 char lstr[32];
521
522 sprintf(iomaxlstr, "%" PRIu64 ":%" PRIu64 " ", curiomax->dev.major, curiomax->dev.minor);
523
524 if (curiomax->rbps != -1) {
525 sprintf(lstr, "rbps=%" PRIu64 " ", curiomax->rbps);
526 strcat(iomaxlstr, lstr);
527 }
528 if (curiomax->wbps != -1) {
529 sprintf(lstr, "wbps=%" PRIu64 " ", curiomax->wbps);
530 strcat(iomaxlstr, lstr);
531 }
532 if (curiomax->riops != -1) {
533 sprintf(lstr, "riops=%" PRIu64 " ", curiomax->riops);
534 strcat(iomaxlstr, lstr);
535 }
536 if (curiomax->wiops != -1) {
537 sprintf(lstr, "wiops=%" PRIu64 " ", curiomax->wiops);
538 strcat(iomaxlstr, lstr);
539 }
540
541 iomaxstrs[numiomaxstrs++] = strdup(iomaxlstr);
542 }
543
544 avl_for_each_element_safe(&iomax, curiomax, avl, tmp) {
545 avl_delete(&iomax, &curiomax->avl);
546 free(curiomax);
547 }
548
549 strtotlen = 1; /* 1 accounts for \0 at end of string */
550 if (numiomaxstrs) {
551 curstr = iomaxstrs;
552 while (*curstr)
553 strtotlen += strlen(*(curstr++)) + 1; /* +1 accounts for \n at end of line */
554
555 iomaxstr = calloc(strtotlen, sizeof(char));
556 if (!iomaxstr)
557 exit(ENOMEM);
558
559 curstr = iomaxstrs;
560
561 while (*curstr) {
562 strcat(iomaxstr, *curstr);
563 strcat(iomaxstr, "\n");
564 free(*(curstr++));
565 }
566
567 cgroups_set("io.max", iomaxstr);
568 free(iomaxstr);
569 };
570
571 free(iomaxstrs);
572
573 return 0;
574 }
575
576
577 enum {
578 OCI_LINUX_CGROUPS_CPU_SHARES,
579 OCI_LINUX_CGROUPS_CPU_PERIOD,
580 OCI_LINUX_CGROUPS_CPU_QUOTA,
581 OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME,
582 OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD,
583 OCI_LINUX_CGROUPS_CPU_CPUS,
584 OCI_LINUX_CGROUPS_CPU_MEMS,
585 __OCI_LINUX_CGROUPS_CPU_MAX,
586 };
587
588 static const struct blobmsg_policy oci_linux_cgroups_cpu_policy[] = {
589 [OCI_LINUX_CGROUPS_CPU_SHARES] = { "shares", BLOBMSG_CAST_INT64 },
590 [OCI_LINUX_CGROUPS_CPU_PERIOD] = { "period", BLOBMSG_CAST_INT64 },
591 [OCI_LINUX_CGROUPS_CPU_QUOTA] = { "quota", BLOBMSG_CAST_INT64 }, /* signed int64! */
592 [OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] = { "realtimePeriod", BLOBMSG_CAST_INT64 },
593 [OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME] = { "realtimeRuntime", BLOBMSG_CAST_INT64 },
594 [OCI_LINUX_CGROUPS_CPU_CPUS] = { "cpus", BLOBMSG_TYPE_STRING },
595 [OCI_LINUX_CGROUPS_CPU_MEMS] = { "mems", BLOBMSG_TYPE_STRING },
596 };
597
598 static int parseOCIlinuxcgroups_legacy_cpu(struct blob_attr *msg)
599 {
600 struct blob_attr *tb[__OCI_LINUX_CGROUPS_CPU_MAX];
601 uint64_t shares, period = 0;
602 int64_t quota = -2; /* unset */
603 char tmp[32] = { 0 };
604
605 blobmsg_parse(oci_linux_cgroups_cpu_policy, __OCI_LINUX_CGROUPS_CPU_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
606
607 if (tb[OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] ||
608 tb[OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME])
609 return ENOTSUP; /* no equivalent in cgroup2 */
610
611 if (tb[OCI_LINUX_CGROUPS_CPU_SHARES]) {
612 shares = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_SHARES]);
613 if ((shares < 2) || (shares > 262144))
614 return ERANGE;
615
616 snprintf(tmp, sizeof(tmp), "%" PRIu64, (((uint64_t)1) + ((shares - 2) * 9999) / 262142));
617 cgroups_set("cpu.weight", tmp);
618 tmp[0] = '\0';
619 }
620
621 if (tb[OCI_LINUX_CGROUPS_CPU_QUOTA])
622 quota = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_CPU_QUOTA]);
623
624 if (tb[OCI_LINUX_CGROUPS_CPU_PERIOD])
625 period = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_PERIOD]);
626
627 if (period) {
628 if (quota >= 0)
629 snprintf(tmp, sizeof(tmp), "%" PRId64 " %" PRIu64 , quota, period);
630 else
631 snprintf(tmp, sizeof(tmp), "max %" PRIu64, period); /* assume default */
632 } else if (quota >= 0) {
633 snprintf(tmp, sizeof(tmp), "%" PRId64, quota);
634 } else if (quota == -1) {
635 strcpy(tmp, "max");
636 }
637
638 if (tmp[0])
639 cgroups_set("cpu.max", tmp);
640
641 if (tb[OCI_LINUX_CGROUPS_CPU_CPUS])
642 cgroups_set("cpuset.cpus", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_CPUS]));
643
644 if (tb[OCI_LINUX_CGROUPS_CPU_MEMS])
645 cgroups_set("cpuset.mems", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_MEMS]));
646
647 return 0;
648 }
649
650
651 enum {
652 OCI_LINUX_CGROUPS_MEMORY_LIMIT,
653 OCI_LINUX_CGROUPS_MEMORY_RESERVATION,
654 OCI_LINUX_CGROUPS_MEMORY_SWAP,
655 OCI_LINUX_CGROUPS_MEMORY_KERNEL,
656 OCI_LINUX_CGROUPS_MEMORY_KERNELTCP,
657 OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS,
658 OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER,
659 OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY,
660 __OCI_LINUX_CGROUPS_MEMORY_MAX,
661 };
662
663 static const struct blobmsg_policy oci_linux_cgroups_memory_policy[] = {
664 [OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
665 [OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
666 [OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
667 [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
668 [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
669 [OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
670 [OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
671 [OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] { "useHierarchy", BLOBMSG_TYPE_BOOL },
672 };
673
674 static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
675 {
676 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
677 char tmp[32] = { 0 };
678 int64_t limit = -1, swap, reservation;
679
680 blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
681
682 /*
683 * not all properties of the OCI memory section can be mapped to cgroup2
684 * kernel memory accounting is always enabled and included in the set
685 * memory limit, hence these options can be ignored
686 * disableOOMKiller could be emulated using oom_score_adj + seccomp eBPF
687 * preventing self-upgrade (but allow downgrade)
688 *
689 * see also https://github.com/opencontainers/runtime-spec/issues/1005
690 */
691 if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
692 tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
693 tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
694 return ENOTSUP;
695
696
697 if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
698 limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
699 if (limit == -1)
700 strcpy(tmp, "max");
701 else
702 snprintf(tmp, sizeof(tmp), "%" PRId64, limit);
703
704 cgroups_set("memory.max", tmp);
705 }
706
707 if (tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]) {
708 reservation = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]);
709
710 if (reservation == -1)
711 strcpy(tmp, "max");
712 else
713 snprintf(tmp, sizeof(tmp), "%" PRId64, reservation);
714
715 cgroups_set("memory.low", tmp);
716 }
717
718 /* OCI 'swap' acounts for memory+swap */
719 if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
720 swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
721
722 if (swap == -1)
723 strcpy(tmp, "max");
724 else if (limit == -1 || (limit < swap))
725 snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
726 else
727 snprintf(tmp, sizeof(tmp), "%" PRId64, limit - swap);
728
729 cgroups_set("memory.swap_max", tmp);
730 }
731
732 return 0;
733 }
734
735
736 enum {
737 OCI_LINUX_CGROUPS_PIDS_LIMIT,
738 __OCI_LINUX_CGROUPS_PIDS_MAX,
739 };
740
741 static const struct blobmsg_policy oci_linux_cgroups_pids_policy[] = {
742 [OCI_LINUX_CGROUPS_PIDS_LIMIT] = { "limit", BLOBMSG_CAST_INT64 },
743 };
744
745 static int parseOCIlinuxcgroups_legacy_pids(struct blob_attr *msg)
746 {
747 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
748 char tmp[32] = { 0 };
749
750 blobmsg_parse(oci_linux_cgroups_pids_policy, __OCI_LINUX_CGROUPS_PIDS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
751
752 if (!tb[OCI_LINUX_CGROUPS_PIDS_LIMIT])
753 return EINVAL;
754
755 snprintf(tmp, sizeof(tmp), "%" PRIu64, blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_PIDS_LIMIT]));
756
757 cgroups_set("pids.max", tmp);
758
759 return 0;
760 }
761
762 static int parseOCIlinuxcgroups_unified(struct blob_attr *msg)
763 {
764 struct blob_attr *cur;
765 int rem;
766
767 blobmsg_for_each_attr(cur, msg, rem) {
768 if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
769 return EINVAL;
770
771 /* restrict keys */
772 if (strchr(blobmsg_name(cur), '/') ||
773 !strcmp(blobmsg_name(cur), "cgroup.subtree_control") ||
774 !strcmp(blobmsg_name(cur), "cgroup.procs") ||
775 !strcmp(blobmsg_name(cur), "cgroup.threads") ||
776 !strcmp(blobmsg_name(cur), "cgroup.freeze"))
777 return EINVAL;
778
779 cgroups_set(blobmsg_name(cur), blobmsg_get_string(cur));
780 }
781
782 return 0;
783 }
784
785 enum {
786 OCI_LINUX_CGROUPS_BLOCKIO,
787 OCI_LINUX_CGROUPS_CPU,
788 OCI_LINUX_CGROUPS_DEVICES,
789 OCI_LINUX_CGROUPS_HUGEPAGELIMITS,
790 OCI_LINUX_CGROUPS_INTELRDT,
791 OCI_LINUX_CGROUPS_MEMORY,
792 OCI_LINUX_CGROUPS_NETWORK,
793 OCI_LINUX_CGROUPS_PIDS,
794 OCI_LINUX_CGROUPS_RDMA,
795 OCI_LINUX_CGROUPS_UNIFIED,
796 __OCI_LINUX_CGROUPS_MAX,
797 };
798
799 static const struct blobmsg_policy oci_linux_cgroups_policy[] = {
800 [OCI_LINUX_CGROUPS_BLOCKIO] = { "blockIO", BLOBMSG_TYPE_TABLE },
801 [OCI_LINUX_CGROUPS_CPU] = { "cpu", BLOBMSG_TYPE_TABLE },
802 [OCI_LINUX_CGROUPS_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
803 [OCI_LINUX_CGROUPS_HUGEPAGELIMITS] = { "hugepageLimits", BLOBMSG_TYPE_ARRAY },
804 [OCI_LINUX_CGROUPS_INTELRDT] = { "intelRdt", BLOBMSG_TYPE_TABLE },
805 [OCI_LINUX_CGROUPS_MEMORY] = { "memory", BLOBMSG_TYPE_TABLE },
806 [OCI_LINUX_CGROUPS_NETWORK] = { "network", BLOBMSG_TYPE_TABLE },
807 [OCI_LINUX_CGROUPS_PIDS] = { "pids", BLOBMSG_TYPE_TABLE },
808 [OCI_LINUX_CGROUPS_RDMA] = { "rdma", BLOBMSG_TYPE_TABLE },
809 [OCI_LINUX_CGROUPS_UNIFIED] = { "unified", BLOBMSG_TYPE_TABLE },
810 };
811
812 int parseOCIlinuxcgroups(struct blob_attr *msg)
813 {
814 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MAX];
815 int ret;
816
817 blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
818
819 if (tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
820 tb[OCI_LINUX_CGROUPS_INTELRDT] ||
821 tb[OCI_LINUX_CGROUPS_NETWORK] ||
822 tb[OCI_LINUX_CGROUPS_RDMA])
823 return ENOTSUP;
824
825 if (tb[OCI_LINUX_CGROUPS_BLOCKIO]) {
826 ret = parseOCIlinuxcgroups_legacy_blockio(tb[OCI_LINUX_CGROUPS_BLOCKIO]);
827 if (ret)
828 return ret;
829 }
830
831 if (tb[OCI_LINUX_CGROUPS_CPU]) {
832 ret = parseOCIlinuxcgroups_legacy_cpu(tb[OCI_LINUX_CGROUPS_CPU]);
833 if (ret)
834 return ret;
835 }
836
837 if (tb[OCI_LINUX_CGROUPS_DEVICES]) {
838 ret = parseOCIlinuxcgroups_devices(tb[OCI_LINUX_CGROUPS_DEVICES]);
839 if (ret)
840 return ret;
841 }
842
843 if (tb[OCI_LINUX_CGROUPS_MEMORY]) {
844 ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
845 if (ret)
846 return ret;
847 }
848
849 if (tb[OCI_LINUX_CGROUPS_PIDS]) {
850 ret = parseOCIlinuxcgroups_legacy_pids(tb[OCI_LINUX_CGROUPS_PIDS]);
851 if (ret)
852 return ret;
853 }
854
855 if (tb[OCI_LINUX_CGROUPS_UNIFIED]) {
856 ret = parseOCIlinuxcgroups_unified(tb[OCI_LINUX_CGROUPS_UNIFIED]);
857 if (ret)
858 return ret;
859 }
860
861 return 0;
862 }