linux/kernel/cgroup_pids.c
/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License.  See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */
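
/*
 * Example usage from userspace (a sketch: it assumes the pids hierarchy is
 * mounted at /sys/fs/cgroup/pids, and the "sandbox" cgroup name is purely
 * illustrative):
 *
 *   # mkdir /sys/fs/cgroup/pids/sandbox
 *   # echo 16 > /sys/fs/cgroup/pids/sandbox/pids.max
 *   # echo $$ > /sys/fs/cgroup/pids/sandbox/cgroup.procs
 *
 * The shell is now in "sandbox", and fork() will fail with -EAGAIN once the
 * hierarchy holds 16 tasks.
 */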

#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/string.h>

#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

struct pids_cgroup {
        struct cgroup_subsys_state      css;

        /*
         * Use 64-bit types so that we can safely represent "max" as
         * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
         */
        atomic64_t                      counter;
        int64_t                         limit;
};

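/*
 * Converters from the generic cgroup structures to our pids-specific state,
 * plus a helper for walking up the hierarchy (parent_pids() returns NULL at
 * the root).
 */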
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
        return container_of(css, struct pids_cgroup, css);
}

static inline struct pids_cgroup *cgroup_pids(struct cgroup *cgroup)
{
        return css_pids(cgroup_subsys_state(cgroup, pids_subsys_id));
}

static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
        struct cgroup *pcg = pids->css.cgroup->parent;

        return pcg ? cgroup_pids(pcg) : NULL;
}

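/*
 * Allocate and initialise the per-cgroup pids state. New cgroups start with
 * no charges and an effectively unlimited pids.max (%PIDS_MAX, which is
 * displayed as "max").
 */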
static struct cgroup_subsys_state *pids_css_alloc(struct cgroup *cgroup)
{
        struct pids_cgroup *pids;

        pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
        if (!pids)
                return ERR_PTR(-ENOMEM);

        pids->limit = PIDS_MAX;
        atomic64_set(&pids->counter, 0);
        return &pids->css;
}

static void pids_css_free(struct cgroup *cgroup)
{
        kfree(cgroup_pids(cgroup));
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
        /*
         * A negative count (or overflow for that matter) is invalid,
         * and indicates a bug in the `pids` controller proper.
         */
        WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; p; p = parent_pids(p))
                pids_cancel(p, num);
}

/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function does *not* follow the pid limit set. It cannot fail and the new
 * pid count may exceed the limit. This is only used for reverting failed
 * attaches, where there is no other way out than violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; p; p = parent_pids(p))
                atomic64_add(num, &p->counter);
}

/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p, *q;

        for (p = pids; p; p = parent_pids(p)) {
                int64_t new = atomic64_add_return(num, &p->counter);

                /*
                 * Since new is capped to the maximum number of pid_t, if
                 * p->limit is %PIDS_MAX then we know that this test will never
                 * fail.
                 */
                if (new > p->limit)
                        goto revert;
        }

        return 0;

revert:
        for (q = pids; q != p; q = parent_pids(q))
                pids_cancel(q, num);
        pids_cancel(p, num);

        return -EAGAIN;
}

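/**
 * pids_can_attach - charge the destination cgroup for attaching tasks
 * @cgrp: the destination cgroup
 * @tset: the tasks being attached
 *
 * Attaches cannot fail due to the pids limit: organisational operations are
 * never blocked by this controller, so the charge is applied with
 * pids_charge() and may push pids.current above pids.max.
 */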
static int pids_can_attach(struct cgroup *cgrp,
                           struct cgroup_taskset *tset)
{
        struct pids_cgroup *pids = cgroup_pids(cgrp);
        struct task_struct *task;

        cgroup_taskset_for_each(task, cgrp, tset) {
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                /*
                 * No need to pin @old_css between here and cancel_attach()
                 * because cgroup core protects it from being freed before
                 * the migration completes or fails.
                 */
                old_css = task_subsys_state(task, pids_subsys_id);
                old_pids = css_pids(old_css);

                pids_charge(pids, 1);
                pids_uncharge(old_pids, 1);
        }

        return 0;
}

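/**
 * pids_cancel_attach - revert the charges made by pids_can_attach
 * @cgrp: the destination cgroup of the failed attach
 * @tset: the tasks whose migration was aborted
 */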
static void pids_cancel_attach(struct cgroup *cgrp,
                               struct cgroup_taskset *tset)
{
        struct pids_cgroup *pids = cgroup_pids(cgrp);
        struct task_struct *task;

        cgroup_taskset_for_each(task, cgrp, tset) {
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                old_css = task_subsys_state(task, pids_subsys_id);
                old_pids = css_pids(old_css);

                pids_charge(old_pids, 1);
                pids_uncharge(pids, 1);
        }
}

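/**
 * pids_can_fork - tentatively charge a hierarchy for a new task
 * @task: the task being forked
 * @priv_p: used to pass the charged css on to pids_fork()/pids_cancel_fork()
 *
 * Returns 0 on success, or -EAGAIN if the charge would breach a pids.max
 * limit somewhere in the hierarchy (this is the error that fork() reports to
 * userspace).
 */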
static int pids_can_fork(struct task_struct *task, void **priv_p)
{
        struct cgroup_subsys_state *css;
        struct pids_cgroup *pids;
        int err;

        /*
         * Use the "current" task_css for the pids subsystem as the tentative
         * css. It is possible we will charge the wrong hierarchy, in which
         * case we will forcefully revert/reapply the charge on the right
         * hierarchy after it is committed to the task proper.
         */
        css = task_get_css(current, pids_subsys_id);
        pids = css_pids(css);

        err = pids_try_charge(pids, 1);
        if (err)
                goto err_css_put;

        *priv_p = css;
        return 0;

err_css_put:
        css_put(css);
        return err;
}

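/**
 * pids_cancel_fork - undo the charge made in pids_can_fork
 * @task: the task whose fork failed
 * @priv: the css charged by pids_can_fork()
 *
 * Called when the fork fails after pids_can_fork() has already charged a
 * hierarchy.
 */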
static void pids_cancel_fork(struct task_struct *task, void *priv)
{
        struct cgroup_subsys_state *css = priv;
        struct pids_cgroup *pids = css_pids(css);

        pids_uncharge(pids, 1);
        css_put(css);
}

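/**
 * pids_fork - fix up the charge once the new task is committed
 * @task: the newly forked task
 * @priv: the css charged by pids_can_fork()
 */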
static void pids_fork(struct task_struct *task, void *priv)
{
        struct cgroup_subsys_state *css;
        struct cgroup_subsys_state *old_css = priv;
        struct pids_cgroup *pids;
        struct pids_cgroup *old_pids = css_pids(old_css);

        css = task_get_css(task, pids_subsys_id);
        pids = css_pids(css);

        /*
         * If the association has changed, we have to revert the charge on the
         * wrong hierarchy and reapply it on the current one. Since the
         * association can only change due to an organisation event, it's okay
         * for us to ignore the limit in this case.
         */
        if (pids != old_pids) {
                pids_uncharge(old_pids, 1);
                pids_charge(pids, 1);
        }

        css_put(css);
        css_put(old_css);
}

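/*
 * Drop the charge for an exited task. The css to uncharge was stashed in
 * task->cg_list.prev by pids_exit(); see the comment there.
 */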
void cgroup_pids_release(struct task_struct *task)
{
        struct list_head *cg_list = &task->cg_list;
        struct cgroup_subsys_state *css;

        if (WARN_ON(!list_empty(cg_list)))
                return;
        if (WARN_ON(cg_list->prev == cg_list))
                return;

        css = (void *)cg_list->prev;
        pids_uncharge(css_pids(css), 1);
        css_put(css);
}

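/*
 * Called from cgroup_exit(). The task is still charged to @old_cgroup at
 * this point; the uncharge itself is deferred to cgroup_pids_release().
 */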
static void pids_exit(struct cgroup *cgroup,
                      struct cgroup *old_cgroup,
                      struct task_struct *task)
{
        struct list_head *cg_list = &task->cg_list;
        struct cgroup_subsys_state *css;

        if (WARN_ON(cg_list->prev != cg_list))
                return;
        /*
         * This preserves list_empty(cg_list) == T and nobody else can use
         * ->cg_list after cgroup_exit(). Abuse cg_list->prev to pass this
         * css to cgroup_pids_release().
         */
        css = cgroup_subsys_state(old_cgroup, pids_subsys_id);
        cg_list->prev = (void *)css;
        css_get(css);
}

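/*
 * Write handler for pids.max: accepts either the string "max" or an integer
 * in [0, %PIDS_MAX).
 */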
static int pids_max_write(struct cgroup *cgroup, struct cftype *cft,
                          const char *buf)
{
        struct pids_cgroup *pids = cgroup_pids(cgroup);
        int64_t limit;
        int err;

        buf = strstrip((char *)buf);
        if (!strcmp(buf, PIDS_MAX_STR)) {
                limit = PIDS_MAX;
                goto set_limit;
        }

        err = kstrtoll(buf, 0, &limit);
        if (err)
                return err;

        if (limit < 0 || limit >= PIDS_MAX)
                return -EINVAL;

set_limit:
        /*
         * Limit updates don't need to be mutex'd, since it isn't
         * critical that any racing fork()s follow the new limit.
         */
        pids->limit = limit;
        return 0;
}

static int pids_max_show(struct cgroup *cgroup, struct cftype *cft,
                         struct seq_file *sf)
{
        struct pids_cgroup *pids = cgroup_pids(cgroup);
        int64_t limit = pids->limit;

        if (limit >= PIDS_MAX)
                seq_printf(sf, "%s\n", PIDS_MAX_STR);
        else
                seq_printf(sf, "%lld\n", limit);

        return 0;
}

static s64 pids_current_read(struct cgroup *cgroup,
                             struct cftype *cft)
{
        struct pids_cgroup *pids = cgroup_pids(cgroup);

        return atomic64_read(&pids->counter);
}

static struct cftype pids_files[] = {
        {
                .name = "max",
                .write_string = pids_max_write,
                .read_seq_string = pids_max_show,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "current",
                .read_s64 = pids_current_read,
        },
        { }     /* terminate */
};

struct cgroup_subsys pids_subsys = {
        .name           = "pids",
        .subsys_id      = pids_subsys_id,
        .css_alloc      = pids_css_alloc,
        .css_free       = pids_css_free,
        .can_attach     = pids_can_attach,
        .cancel_attach  = pids_cancel_attach,
        .can_fork       = pids_can_fork,
        .cancel_fork    = pids_cancel_fork,
        .fork           = pids_fork,
        .exit           = pids_exit,
        .base_cftypes   = pids_files,
};