1
2
3#define _GNU_SOURCE
4
5#include <errno.h>
6#include <fcntl.h>
7#include <linux/limits.h>
8#include <poll.h>
9#include <signal.h>
10#include <stdio.h>
11#include <stdlib.h>
12#include <string.h>
13#include <sys/inotify.h>
14#include <sys/stat.h>
15#include <sys/types.h>
16#include <sys/wait.h>
17#include <unistd.h>
18
19#include "cgroup_util.h"
20#include "../clone3/clone3_selftests.h"
21
22
/*
 * Read the beginning of the file at @path into @buf as a C string.
 *
 * At most @max_len - 1 bytes are read with a single read() call and the
 * result is always NUL-terminated on success.
 *
 * Returns the number of bytes read, or a negative errno value when @max_len
 * is zero or when open()/read() fails.
 */
static ssize_t read_text(const char *path, char *buf, size_t max_len)
{
	ssize_t len;
	int fd, err;

	/* Room for the terminating NUL is required; max_len - 1 would wrap. */
	if (!max_len)
		return -EINVAL;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -errno;

	len = read(fd, buf, max_len - 1);
	/* Capture errno now: close() below is allowed to overwrite it. */
	err = errno;

	if (len >= 0)
		buf[len] = 0;

	close(fd);
	return len < 0 ? -err : len;
}
40
41
/*
 * Append @len bytes from @buf to the existing file at @path.
 *
 * The file is opened with O_WRONLY | O_APPEND (it is not created) and a
 * single write() is issued.
 *
 * Returns the number of bytes written, or a negative errno value when
 * open()/write() fails.
 */
static ssize_t write_text(const char *path, char *buf, ssize_t len)
{
	int fd, err;

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return -errno;

	len = write(fd, buf, len);
	/* Capture errno now: close() below is allowed to overwrite it. */
	err = errno;

	close(fd);
	return len < 0 ? -err : len;
}
54
/*
 * Build the path "<root>/<name>" in a freshly allocated buffer.
 *
 * Returns the malloc()ed string, which the caller must free(), or NULL if
 * the allocation fails.
 */
char *cg_name(const char *root, const char *name)
{
	/* +2: one byte for the '/' separator, one for the terminating NUL. */
	size_t len = strlen(root) + strlen(name) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", root, name);

	return ret;
}
64
/*
 * Build the path "<root>/<name>_<index>" in a freshly allocated buffer.
 *
 * The required size is measured with snprintf() first, so the result is
 * never truncated regardless of how many digits @index needs.
 *
 * Returns the malloc()ed string, which the caller must free(), or NULL if
 * formatting or the allocation fails.
 */
char *cg_name_indexed(const char *root, const char *name, int index)
{
	int need = snprintf(NULL, 0, "%s/%s_%d", root, name, index);
	size_t len;
	char *ret;

	if (need < 0)
		return NULL;

	len = (size_t)need + 1;
	ret = malloc(len);
	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s_%d", root, name, index);

	return ret;
}
74
/*
 * Build the path "<cgroup>/<control>" in a freshly allocated buffer.
 *
 * Returns the malloc()ed string, which the caller must free(), or NULL if
 * the allocation fails.
 */
char *cg_control(const char *cgroup, const char *control)
{
	/* +2: one byte for the '/' separator, one for the terminating NUL. */
	size_t len = strlen(cgroup) + strlen(control) + 2;
	char *ret = malloc(len);

	if (!ret)
		return NULL;

	snprintf(ret, len, "%s/%s", cgroup, control);

	return ret;
}
84
85
/*
 * Read the file "<cgroup>/<control>" into @buf (of size @len) via
 * read_text().
 *
 * Returns 0 when the read succeeds (the byte count is discarded), otherwise
 * the negative value reported by read_text().
 */
int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
{
	char file[PATH_MAX];
	ssize_t nread;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);
	nread = read_text(file, buf, len);

	if (nread < 0)
		return nread;

	return 0;
}
96
/*
 * Compare the contents of "<cgroup>/<control>" against @expected.
 *
 * The scratch buffer is sized strlen(expected) + 1, so no more than that
 * many bytes (including the terminator) of the file take part in the
 * comparison.
 *
 * Returns the strcmp() result (0 on match), or -1 if @expected is NULL, the
 * buffer cannot be allocated, or the file cannot be read.
 */
int cg_read_strcmp(const char *cgroup, const char *control,
		   const char *expected)
{
	size_t buflen;
	char *content;
	int cmp = -1;

	if (!expected)
		return -1;

	buflen = strlen(expected) + 1;
	content = malloc(buflen);
	if (!content)
		return -1;

	/* cmp keeps its -1 default when the read fails. */
	if (!cg_read(cgroup, control, content, buflen))
		cmp = strcmp(expected, content);

	free(content);
	return cmp;
}
123
124int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
125{
126 char buf[PAGE_SIZE];
127
128 if (cg_read(cgroup, control, buf, sizeof(buf)))
129 return -1;
130
131 return strstr(buf, needle) ? 0 : -1;
132}
133
/*
 * Read "<cgroup>/<control>" and convert its leading text with atol().
 *
 * Returns the converted value, or -1 if the file cannot be read. Note that
 * a file legitimately holding -1 is indistinguishable from a read failure.
 */
long cg_read_long(const char *cgroup, const char *control)
{
	char text[128];
	int err = cg_read(cgroup, control, text, sizeof(text));

	return err ? -1 : atol(text);
}
143
144long cg_read_key_long(const char *cgroup, const char *control, const char *key)
145{
146 char buf[PAGE_SIZE];
147 char *ptr;
148
149 if (cg_read(cgroup, control, buf, sizeof(buf)))
150 return -1;
151
152 ptr = strstr(buf, key);
153 if (!ptr)
154 return -1;
155
156 return atol(ptr + strlen(key));
157}
158
159long cg_read_lc(const char *cgroup, const char *control)
160{
161 char buf[PAGE_SIZE];
162 const char delim[] = "\n";
163 char *line;
164 long cnt = 0;
165
166 if (cg_read(cgroup, control, buf, sizeof(buf)))
167 return -1;
168
169 for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
170 cnt++;
171
172 return cnt;
173}
174
175
/*
 * Write the NUL-terminated string @buf to "<cgroup>/<control>" via
 * write_text().
 *
 * Returns 0 when exactly strlen(buf) bytes were written; otherwise the value
 * returned by write_text() (a negative errno value, or a short byte count).
 */
int cg_write(const char *cgroup, const char *control, char *buf)
{
	char file[PATH_MAX];
	ssize_t want = strlen(buf);
	ssize_t written;

	snprintf(file, sizeof(file), "%s/%s", cgroup, control);
	written = write_text(file, buf, want);

	if (written == want)
		return 0;

	return written;
}
185
/*
 * Format @value as a decimal number and write it to "<cgroup>/<control>".
 *
 * @value is a signed long, so it is formatted with "%ld"; negative values
 * are written with their sign instead of being reinterpreted as unsigned.
 *
 * Returns the negative snprintf() result if formatting fails, otherwise
 * whatever cg_write() returns (0 on success).
 */
int cg_write_numeric(const char *cgroup, const char *control, long value)
{
	char buf[64];
	int ret;

	ret = snprintf(buf, sizeof(buf), "%ld", value);
	if (ret < 0)
		return ret;

	return cg_write(cgroup, control, buf);
}
197
198int cg_find_unified_root(char *root, size_t len)
199{
200 char buf[10 * PAGE_SIZE];
201 char *fs, *mount, *type;
202 const char delim[] = "\n\t ";
203
204 if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
205 return -1;
206
207
208
209
210
211 for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
212 mount = strtok(NULL, delim);
213 type = strtok(NULL, delim);
214 strtok(NULL, delim);
215 strtok(NULL, delim);
216 strtok(NULL, delim);
217
218 if (strcmp(type, "cgroup2") == 0) {
219 strncpy(root, mount, len);
220 return 0;
221 }
222 }
223
224 return -1;
225}
226
/*
 * Create the directory @cgroup with mode 0755.
 *
 * Returns the mkdir() result: 0 on success, -1 with errno set on failure.
 */
int cg_create(const char *cgroup)
{
	const mode_t dir_mode = 0755;
	int err = mkdir(cgroup, dir_mode);

	return err;
}
231
232int cg_wait_for_proc_count(const char *cgroup, int count)
233{
234 char buf[10 * PAGE_SIZE] = {0};
235 int attempts;
236 char *ptr;
237
238 for (attempts = 10; attempts >= 0; attempts--) {
239 int nr = 0;
240
241 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
242 break;
243
244 for (ptr = buf; *ptr; ptr++)
245 if (*ptr == '\n')
246 nr++;
247
248 if (nr >= count)
249 return 0;
250
251 usleep(100000);
252 }
253
254 return -1;
255}
256
257int cg_killall(const char *cgroup)
258{
259 char buf[PAGE_SIZE];
260 char *ptr = buf;
261
262
263 if (!cg_write(cgroup, "cgroup.kill", "1"))
264 return 0;
265
266 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
267 return -1;
268
269 while (ptr < buf + sizeof(buf)) {
270 int pid = strtol(ptr, &ptr, 10);
271
272 if (pid == 0)
273 break;
274 if (*ptr)
275 ptr++;
276 else
277 break;
278 if (kill(pid, SIGKILL))
279 return -1;
280 }
281
282 return 0;
283}
284
/*
 * Remove the directory @cgroup.
 *
 * While rmdir() fails with EBUSY, the cgroup's processes are killed with
 * cg_killall() and the removal is retried after a 100 us pause; there is no
 * upper bound on the number of retries.
 *
 * Returns 0 on success or if the directory does not exist (ENOENT),
 * otherwise the failing rmdir() result.
 */
int cg_destroy(const char *cgroup)
{
	int ret;

	for (;;) {
		ret = rmdir(cgroup);
		if (ret == 0 || errno != EBUSY)
			break;

		cg_killall(cgroup);
		usleep(100);
	}

	/* An already-missing cgroup counts as destroyed. */
	if (ret && errno == ENOENT)
		ret = 0;

	return ret;
}
302
/*
 * Write @pid, formatted as a decimal string, to "<cgroup>/cgroup.procs".
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter(const char *cgroup, int pid)
{
	char pid_text[64];
	int err;

	snprintf(pid_text, sizeof(pid_text), "%d", pid);
	err = cg_write(cgroup, "cgroup.procs", pid_text);

	return err;
}
310
/*
 * Write "0" to "<cgroup>/cgroup.procs".
 *
 * NOTE(review): presumably "0" designates the calling process, as the
 * function name suggests - confirm against the cgroup documentation.
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.procs", self);
}
315
/*
 * Write "0" to "<cgroup>/cgroup.threads".
 *
 * NOTE(review): presumably "0" designates the calling thread, as the
 * function name suggests - confirm against the cgroup documentation.
 *
 * Returns the cg_write() result (0 on success).
 */
int cg_enter_current_thread(const char *cgroup)
{
	char self[] = "0";

	return cg_write(cgroup, "cgroup.threads", self);
}
320
/*
 * Fork a child, move it into @cgroup, run @fn(@cgroup, @arg) in it and wait
 * for it to finish.
 *
 * The child writes its own PID to "<cgroup>/cgroup.procs" and exits with
 * EXIT_FAILURE if that write fails; otherwise it exits with the value
 * returned by @fn.
 *
 * Returns the child's exit status if it exited normally, the negative
 * fork() result if the fork fails, or -1 if waiting fails or the child did
 * not exit normally.
 */
int cg_run(const char *cgroup,
	   int (*fn)(const char *cgroup, void *arg),
	   void *arg)
{
	int pid, retcode;

	pid = fork();
	if (pid < 0)
		return pid;

	if (pid == 0) {
		char buf[64];

		snprintf(buf, sizeof(buf), "%d", getpid());
		if (cg_write(cgroup, "cgroup.procs", buf))
			exit(EXIT_FAILURE);
		exit(fn(cgroup, arg));
	}

	/*
	 * retcode is only meaningful when waitpid() succeeds, so retry on
	 * EINTR and give up on any other error instead of inspecting an
	 * unset status.
	 */
	while (waitpid(pid, &retcode, 0) < 0) {
		if (errno != EINTR)
			return -1;
	}

	if (WIFEXITED(retcode))
		return WEXITSTATUS(retcode);

	return -1;
}
345
/*
 * Create a child process directly inside the cgroup referred to by
 * @cgroup_fd, using clone3() with CLONE_INTO_CGROUP.
 *
 * When CLONE_ARGS_SIZE_VER2 is not defined at build time, or when clone3()
 * fails with ENOSYS or E2BIG, the call is reported as unsupported: errno is
 * set to ENOSYS and -ENOSYS is returned.
 *
 * Otherwise returns the sys_clone3() result unchanged.
 */
pid_t clone_into_cgroup(int cgroup_fd)
{
#ifdef CLONE_ARGS_SIZE_VER2
	struct __clone_args args = {
		.flags = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup = cgroup_fd,
	};
	pid_t child = sys_clone3(&args, sizeof(args));

	/* Anything but the two "unsupported" errors is handed back as-is. */
	if (child >= 0 || (errno != ENOSYS && errno != E2BIG))
		return child;
#endif
	errno = ENOSYS;
	return -ENOSYS;
}
373
374int clone_reap(pid_t pid, int options)
375{
376 int ret;
377 siginfo_t info = {
378 .si_signo = 0,
379 };
380
381again:
382 ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
383 if (ret < 0) {
384 if (errno == EINTR)
385 goto again;
386 return -1;
387 }
388
389 if (options & WEXITED) {
390 if (WIFEXITED(info.si_status))
391 return WEXITSTATUS(info.si_status);
392 }
393
394 if (options & WSTOPPED) {
395 if (WIFSTOPPED(info.si_status))
396 return WSTOPSIG(info.si_status);
397 }
398
399 if (options & WCONTINUED) {
400 if (WIFCONTINUED(info.si_status))
401 return 0;
402 }
403
404 return -1;
405}
406
407int dirfd_open_opath(const char *dir)
408{
409 return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
410}
411
/*
 * Close @fd if it is non-negative, leaving errno exactly as it was before
 * the call so a preceding failure's error code is not lost.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe in unbraced if/else bodies). @fd is evaluated more than once, so it
 * must not have side effects.
 */
#define close_prot_errno(fd)			\
	do {					\
		if ((fd) >= 0) {		\
			int _e_ = errno;	\
			close(fd);		\
			errno = _e_;		\
		}				\
	} while (0)
418
/*
 * Spawn a child directly inside @cgroup with clone_into_cgroup() and have it
 * run @fn(@cgroup, @arg); the child exits with @fn's return value. The
 * parent does not wait for it.
 *
 * Returns, in the parent, the clone_into_cgroup() result (child PID or a
 * negative value), or -1 if the cgroup directory cannot be opened. errno
 * from a failed clone is preserved across closing the directory descriptor.
 */
static int clone_into_cgroup_run_nowait(const char *cgroup,
					int (*fn)(const char *cgroup, void *arg),
					void *arg)
{
	pid_t child;
	int dirfd = dirfd_open_opath(cgroup);

	if (dirfd < 0)
		return -1;

	child = clone_into_cgroup(dirfd);
	close_prot_errno(dirfd);

	/* Parent, or clone failure: report the result to the caller. */
	if (child != 0)
		return child;

	exit(fn(cgroup, arg));
}
437
/*
 * Start @fn(@cgroup, @arg) in a child process placed in @cgroup, without
 * waiting for it.
 *
 * clone_into_cgroup_run_nowait() is tried first. If it fails with errno set
 * to ENOSYS, the function falls back to fork(): the child writes its own
 * PID to "<cgroup>/cgroup.procs" (exiting with EXIT_FAILURE if that fails)
 * and then exits with @fn's return value.
 *
 * Returns the child's PID on success, -1 if the clone path fails for a
 * reason other than ENOSYS, or the negative fork() result.
 */
int cg_run_nowait(const char *cgroup,
		  int (*fn)(const char *cgroup, void *arg),
		  void *arg)
{
	char self[64];
	int pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);

	if (pid > 0)
		return pid;

	/* Any failure other than "not supported" is final. */
	if (pid < 0 && errno != ENOSYS)
		return -1;

	pid = fork();
	if (pid != 0)
		return pid;

	/* Child: join the cgroup, then run the payload. */
	snprintf(self, sizeof(self), "%d", getpid());
	if (cg_write(cgroup, "cgroup.procs", self))
		exit(EXIT_FAILURE);
	exit(fn(cgroup, arg));
}
464
465int get_temp_fd(void)
466{
467 return open(".", O_TMPFILE | O_RDWR | O_EXCL);
468}
469
470int alloc_pagecache(int fd, size_t size)
471{
472 char buf[PAGE_SIZE];
473 struct stat st;
474 int i;
475
476 if (fstat(fd, &st))
477 goto cleanup;
478
479 size += st.st_size;
480
481 if (ftruncate(fd, size))
482 goto cleanup;
483
484 for (i = 0; i < size; i += sizeof(buf))
485 read(fd, buf, sizeof(buf));
486
487 return 0;
488
489cleanup:
490 return -1;
491}
492
493int alloc_anon(const char *cgroup, void *arg)
494{
495 size_t size = (unsigned long)arg;
496 char *buf, *ptr;
497
498 buf = malloc(size);
499 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
500 *ptr = 0;
501
502 free(buf);
503 return 0;
504}
505
506int is_swap_enabled(void)
507{
508 char buf[PAGE_SIZE];
509 const char delim[] = "\n";
510 int cnt = 0;
511 char *line;
512
513 if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
514 return -1;
515
516 for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
517 cnt++;
518
519 return cnt > 1;
520}
521
/*
 * Write @score, formatted as a decimal number, to
 * /proc/<pid>/oom_score_adj.
 *
 * Returns 0 on success, the negative open() result if the file cannot be
 * opened, or the negative dprintf() result if the write fails.
 */
int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, written;

	sprintf(path, "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	written = dprintf(fd, "%d", score);
	close(fd);

	return written < 0 ? written : 0;
}
542
543int proc_mount_contains(const char *option)
544{
545 char buf[4 * PAGE_SIZE];
546 ssize_t read;
547
548 read = read_text("/proc/mounts", buf, sizeof(buf));
549 if (read < 0)
550 return read;
551
552 return strstr(buf, option) != NULL;
553}
554
/*
 * Read /proc/<pid>/<item> into @buf (of @size bytes) as a C string.
 *
 * When @pid is 0 the path is /proc/self/<item>, or /proc/thread-self/<item>
 * if @thread is true.
 *
 * Returns the number of bytes read, or -1 if the file cannot be read.
 */
ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{
	char path[PATH_MAX];
	ssize_t ret;

	if (!pid)
		snprintf(path, sizeof(path), "/proc/%s/%s",
			 thread ? "thread-self" : "self", item);
	else
		snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);

	/*
	 * Keep the result in a signed variable: stored in the unsigned @size
	 * a negative error could never compare below zero.
	 */
	ret = read_text(path, buf, size);
	return ret < 0 ? -1 : ret;
}
568
569int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
570{
571 char buf[PAGE_SIZE];
572
573 if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0)
574 return -1;
575
576 return strstr(buf, needle) ? 0 : -1;
577}
578
579int clone_into_cgroup_run_wait(const char *cgroup)
580{
581 int cgroup_fd;
582 pid_t pid;
583
584 cgroup_fd = dirfd_open_opath(cgroup);
585 if (cgroup_fd < 0)
586 return -1;
587
588 pid = clone_into_cgroup(cgroup_fd);
589 close_prot_errno(cgroup_fd);
590 if (pid < 0)
591 return -1;
592
593 if (pid == 0)
594 exit(EXIT_SUCCESS);
595
596
597
598
599
600 (void)clone_reap(pid, WEXITED);
601 return 0;
602}
603
/*
 * Create an inotify instance watching "<cgroup>/<filename>" for IN_MODIFY
 * events.
 *
 * The path string obtained from cg_control() is only needed while the watch
 * is being added, so it is freed before returning.
 *
 * Returns the inotify file descriptor (owned by the caller), or -1 if the
 * instance cannot be created, the path cannot be built, or the watch cannot
 * be added.
 */
static int __prepare_for_wait(const char *cgroup, const char *filename)
{
	char *path;
	int fd, ret;

	fd = inotify_init1(0);
	if (fd == -1)
		return fd;

	path = cg_control(cgroup, filename);
	if (!path) {
		close(fd);
		return -1;
	}

	ret = inotify_add_watch(fd, path, IN_MODIFY);
	free(path);
	if (ret == -1) {
		close(fd);
		fd = -1;
	}

	return fd;
}
620
/*
 * Set up an inotify watch on "<cgroup>/cgroup.events".
 *
 * Returns the __prepare_for_wait() result: an inotify descriptor, or -1.
 */
int cg_prepare_for_wait(const char *cgroup)
{
	int fd = __prepare_for_wait(cgroup, "cgroup.events");

	return fd;
}
625
/*
 * Set up an inotify watch on "<cgroup>/memory.events".
 *
 * Returns the __prepare_for_wait() result: an inotify descriptor, or -1.
 */
int memcg_prepare_for_wait(const char *cgroup)
{
	int fd = __prepare_for_wait(cgroup, "memory.events");

	return fd;
}
630
/*
 * Block until @fd becomes readable.
 *
 * poll() is called with a 10 second timeout; a timeout or an EINTR
 * interruption simply polls again, so the wait itself is unbounded.
 *
 * Returns 0 once POLLIN is reported, or -1 if poll() fails or reports an
 * event without POLLIN.
 */
int cg_wait_for(int fd)
{
	struct pollfd fds = {
		.fd = fd,
		.events = POLLIN,
	};

	for (;;) {
		int ret = poll(&fds, 1, 10000);

		if (ret == -1) {
			if (errno == EINTR)
				continue;

			return -1;
		}

		/* Timed out: keep waiting. */
		if (ret == 0)
			continue;

		if (fds.revents & POLLIN)
			return 0;

		/*
		 * An event without POLLIN (POLLERR, POLLHUP or POLLNVAL) does
		 * not go away, so polling again would spin forever.
		 */
		return -1;
	}
}
657