LXR linux/kernel/cgroup

   1/*
   2 * Process number limiting controller for cgroups.
   3 *
   4 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
   5 * after a certain limit is reached.
   6 *
   7 * Since it is trivial to hit the task limit without hitting any kmemcg limits
   8 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
   9 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
  10 * of the number of tasks in a cgroup.
  11 *
  12 * In order to use the `pids` controller, set the maximum number of tasks in
  13 * pids.max (this is not available in the root cgroup for obvious reasons). The
  14 * number of processes currently in the cgroup is given by pids.current.
  15 * Organisational operations are not blocked by cgroup policies, so it is
  16 * possible to have pids.current > pids.max. However, it is not possible to
  17 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
  18 * would cause a cgroup policy to be violated.
  19 *
  20 * To set a cgroup to have no limit, set pids.max to "max". This is the default
  21 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
  22 * stringent limit in the hierarchy is followed).
  23 *
  24 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
  25 * a superset of parent/child/pids.current.
  26 *
  27 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
  28 *
  29 * This file is subject to the terms and conditions of version 2 of the GNU
  30 * General Public License.  See the file COPYING in the main directory of the
  31 * Linux distribution for more details.
  32 */
  33
  34#include <linux/kernel.h>
  35#include <linux/threads.h>
  36#include <linux/atomic.h>
  37#include <linux/cgroup.h>
  38#include <linux/slab.h>
  39
  40#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
  41#define PIDS_MAX_STR "max"
  42
  43struct pids_cgroup {
  44        struct cgroup_subsys_state      css;
  45
  46        /*
  47         * Use 64-bit types so that we can safely represent "max" as
  48         * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
  49         */
  50        atomic64_t                      counter;
  51        int64_t                         limit;
  52};
  53
  54static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
  55{
  56        return container_of(css, struct pids_cgroup, css);
  57}
  58
  59static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
  60{
  61        return css_pids(pids->css.parent);
  62}
  63
  64static struct cgroup_subsys_state *
  65pids_css_alloc(struct cgroup_subsys_state *parent)
  66{
  67        struct pids_cgroup *pids;
  68
  69        pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
  70        if (!pids)
  71                return ERR_PTR(-ENOMEM);
  72
  73        pids->limit = PIDS_MAX;
  74        atomic64_set(&pids->counter, 0);
  75        return &pids->css;
  76}
  77
  78static void pids_css_free(struct cgroup_subsys_state *css)
  79{
  80        kfree(css_pids(css));
  81}
  82
  83/**
  84 * pids_cancel - uncharge the local pid count
  85 * @pids: the pid cgroup state
  86 * @num: the number of pids to cancel
  87 *
  88 * This function will WARN if the pid count goes under 0, because such a case is
  89 * a bug in the pids controller proper.
  90 */
  91static void pids_cancel(struct pids_cgroup *pids, int num)
  92{
  93        /*
  94         * A negative count (or overflow for that matter) is invalid,
  95         * and indicates a bug in the `pids` controller proper.
  96         */
  97        WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
  98}
  99
 100/**
 101 * pids_uncharge - hierarchically uncharge the pid count
 102 * @pids: the pid cgroup state
 103 * @num: the number of pids to uncharge
 104 */
 105static void pids_uncharge(struct pids_cgroup *pids, int num)
 106{
 107        struct pids_cgroup *p;
 108
 109        for (p = pids; parent_pids(p); p = parent_pids(p))
 110                pids_cancel(p, num);
 111}
 112
 113/**
 114 * pids_charge - hierarchically charge the pid count
 115 * @pids: the pid cgroup state
 116 * @num: the number of pids to charge
 117 *
 118 * This function does *not* follow the pid limit set. It cannot fail and the new
 119 * pid count may exceed the limit. This is only used for reverting failed
 120 * attaches, where there is no other way out than violating the limit.
 121 */
 122static void pids_charge(struct pids_cgroup *pids, int num)
 123{
 124        struct pids_cgroup *p;
 125
 126        for (p = pids; parent_pids(p); p = parent_pids(p))
 127                atomic64_add(num, &p->counter);
 128}
 129
 130/**
 131 * pids_try_charge - hierarchically try to charge the pid count
 132 * @pids: the pid cgroup state
 133 * @num: the number of pids to charge
 134 *
 135 * This function follows the set limit. It will fail if the charge would cause
 136 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 137 * succeeded, otherwise -EAGAIN.
 138 */
 139static int pids_try_charge(struct pids_cgroup *pids, int num)
 140{
 141        struct pids_cgroup *p, *q;
 142
 143        for (p = pids; parent_pids(p); p = parent_pids(p)) {
 144                int64_t new = atomic64_add_return(num, &p->counter);
 145
 146                /*
 147                 * Since new is capped to the maximum number of pid_t, if
 148                 * p->limit is %PIDS_MAX then we know that this test will never
 149                 * fail.
 150                 */
 151                if (new > p->limit)
 152                        goto revert;
 153        }
 154
 155        return 0;
 156
 157revert:
 158        for (q = pids; q != p; q = parent_pids(q))
 159                pids_cancel(q, num);
 160        pids_cancel(p, num);
 161
 162        return -EAGAIN;
 163}
 164
 165static int pids_can_attach(struct cgroup_taskset *tset)
 166{
 167        struct task_struct *task;
 168        struct cgroup_subsys_state *dst_css;
 169
 170        cgroup_taskset_for_each(task, dst_css, tset) {
 171                struct pids_cgroup *pids = css_pids(dst_css);
 172                struct cgroup_subsys_state *old_css;
 173                struct pids_cgroup *old_pids;
 174
 175                /*
 176                 * No need to pin @old_css between here and cancel_attach()
 177                 * because cgroup core protects it from being freed before
 178                 * the migration completes or fails.
 179                 */
 180                old_css = task_css(task, pids_cgrp_id);
 181                old_pids = css_pids(old_css);
 182
 183                pids_charge(pids, 1);
 184                pids_uncharge(old_pids, 1);
 185        }
 186
 187        return 0;
 188}
 189
 190static void pids_cancel_attach(struct cgroup_taskset *tset)
 191{
 192        struct task_struct *task;
 193        struct cgroup_subsys_state *dst_css;
 194
 195        cgroup_taskset_for_each(task, dst_css, tset) {
 196                struct pids_cgroup *pids = css_pids(dst_css);
 197                struct cgroup_subsys_state *old_css;
 198                struct pids_cgroup *old_pids;
 199
 200                old_css = task_css(task, pids_cgrp_id);
 201                old_pids = css_pids(old_css);
 202
 203                pids_charge(old_pids, 1);
 204                pids_uncharge(pids, 1);
 205        }
 206}
 207
 208/*
 209 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 210 * on threadgroup_change_begin() held by the copy_process().
 211 */
 212static int pids_can_fork(struct task_struct *task)
 213{
 214        struct cgroup_subsys_state *css;
 215        struct pids_cgroup *pids;
 216
 217        css = task_css_check(current, pids_cgrp_id, true);
 218        pids = css_pids(css);
 219        return pids_try_charge(pids, 1);
 220}
 221
 222static void pids_cancel_fork(struct task_struct *task)
 223{
 224        struct cgroup_subsys_state *css;
 225        struct pids_cgroup *pids;
 226
 227        css = task_css_check(current, pids_cgrp_id, true);
 228        pids = css_pids(css);
 229        pids_uncharge(pids, 1);
 230}
 231
 232static void pids_free(struct task_struct *task)
 233{
 234        struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
 235
 236        pids_uncharge(pids, 1);
 237}
 238
 239static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
 240                              size_t nbytes, loff_t off)
 241{
 242        struct cgroup_subsys_state *css = of_css(of);
 243        struct pids_cgroup *pids = css_pids(css);
 244        int64_t limit;
 245        int err;
 246
 247        buf = strstrip(buf);
 248        if (!strcmp(buf, PIDS_MAX_STR)) {
 249                limit = PIDS_MAX;
 250                goto set_limit;
 251        }
 252
 253        err = kstrtoll(buf, 0, &limit);
 254        if (err)
 255                return err;
 256
 257        if (limit < 0 || limit >= PIDS_MAX)
 258                return -EINVAL;
 259
 260set_limit:
 261        /*
 262         * Limit updates don't need to be mutex'd, since it isn't
 263         * critical that any racing fork()s follow the new limit.
 264         */
 265        pids->limit = limit;
 266        return nbytes;
 267}
 268
 269static int pids_max_show(struct seq_file *sf, void *v)
 270{
 271        struct cgroup_subsys_state *css = seq_css(sf);
 272        struct pids_cgroup *pids = css_pids(css);
 273        int64_t limit = pids->limit;
 274
 275        if (limit >= PIDS_MAX)
 276                seq_printf(sf, "%s\n", PIDS_MAX_STR);
 277        else
 278                seq_printf(sf, "%lld\n", limit);
 279
 280        return 0;
 281}
 282
 283static s64 pids_current_read(struct cgroup_subsys_state *css,
 284                             struct cftype *cft)
 285{
 286        struct pids_cgroup *pids = css_pids(css);
 287
 288        return atomic64_read(&pids->counter);
 289}
 290
 291static struct cftype pids_files[] = {
 292        {
 293                .name = "max",
 294                .write = pids_max_write,
 295                .seq_show = pids_max_show,
 296                .flags = CFTYPE_NOT_ON_ROOT,
 297        },
 298        {
 299                .name = "current",
 300                .read_s64 = pids_current_read,
 301                .flags = CFTYPE_NOT_ON_ROOT,
 302        },
 303        { }     /* terminate */
 304};
 305
 306struct cgroup_subsys pids_cgrp_subsys = {
 307        .css_alloc      = pids_css_alloc,
 308        .css_free       = pids_css_free,
 309        .can_attach     = pids_can_attach,
 310        .cancel_attach  = pids_cancel_attach,
 311        .can_fork       = pids_can_fork,
 312        .cancel_fork    = pids_cancel_fork,
 313        .free           = pids_free,
 314        .legacy_cftypes = pids_files,
 315        .dfl_cftypes    = pids_files,
 316};
 317