linux/kernel/cgroup/cgroup.c
   1/*
   2 *  Generic process-grouping system.
   3 *
   4 *  Based originally on the cpuset system, extracted by Paul Menage
   5 *  Copyright (C) 2006 Google, Inc
   6 *
   7 *  Notifications support
   8 *  Copyright (C) 2009 Nokia Corporation
   9 *  Author: Kirill A. Shutemov
  10 *
  11 *  Copyright notices from the original cpuset code:
  12 *  --------------------------------------------------
  13 *  Copyright (C) 2003 BULL SA.
  14 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  15 *
  16 *  Portions derived from Patrick Mochel's sysfs code.
  17 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  18 *
  19 *  2003-10-10 Written by Simon Derr.
  20 *  2003-10-22 Updates by Stephen Hemminger.
  21 *  2004 May-July Rework by Paul Jackson.
  22 *  ---------------------------------------------------
  23 *
  24 *  This file is subject to the terms and conditions of the GNU General Public
  25 *  License.  See the file COPYING in the main directory of the Linux
  26 *  distribution for more details.
  27 */
  28
  29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  30
  31#include "cgroup-internal.h"
  32
  33#include <linux/cred.h>
  34#include <linux/errno.h>
  35#include <linux/init_task.h>
  36#include <linux/kernel.h>
  37#include <linux/magic.h>
  38#include <linux/mutex.h>
  39#include <linux/mount.h>
  40#include <linux/pagemap.h>
  41#include <linux/proc_fs.h>
  42#include <linux/rcupdate.h>
  43#include <linux/sched.h>
  44#include <linux/sched/task.h>
  45#include <linux/slab.h>
  46#include <linux/spinlock.h>
  47#include <linux/percpu-rwsem.h>
  48#include <linux/string.h>
  49#include <linux/hashtable.h>
  50#include <linux/idr.h>
  51#include <linux/kthread.h>
  52#include <linux/atomic.h>
  53#include <linux/cpuset.h>
  54#include <linux/proc_ns.h>
  55#include <linux/nsproxy.h>
  56#include <linux/file.h>
  57#include <net/sock.h>
  58
  59#define CREATE_TRACE_POINTS
  60#include <trace/events/cgroup.h>
  61
  62#define CGROUP_FILE_NAME_MAX            (MAX_CGROUP_TYPE_NAMELEN +      \
  63                                         MAX_CFTYPE_NAME + 2)
  64
  65/*
  66 * cgroup_mutex is the master lock.  Any modification to cgroup or its
  67 * hierarchy must be performed while holding it.
  68 *
  69 * css_set_lock protects task->cgroups pointer, the list of css_set
  70 * objects, and the chain of tasks off each css_set.
  71 *
  72 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  73 * cgroup.h can use them for lockdep annotations.
  74 */
  75DEFINE_MUTEX(cgroup_mutex);
  76DEFINE_SPINLOCK(css_set_lock);
  77
  78#ifdef CONFIG_PROVE_RCU
  79EXPORT_SYMBOL_GPL(cgroup_mutex);
  80EXPORT_SYMBOL_GPL(css_set_lock);
  81#endif
  82
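/*
 * Illustrative sketch of how such an accessor can use the exported locks,
 * simplified from task_css_set_check() in cgroup.h (the real macro checks
 * additional conditions):
 *
 *	rcu_dereference_check((task)->cgroups,
 *			      lockdep_is_held(&cgroup_mutex) ||
 *			      lockdep_is_held(&css_set_lock));
 */
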
  83/*
  84 * Protects cgroup_idr and css_idr so that IDs can be released without
  85 * grabbing cgroup_mutex.
  86 */
  87static DEFINE_SPINLOCK(cgroup_idr_lock);
  88
  89/*
  90 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
  91 * against file removal/re-creation across css hiding.
  92 */
  93static DEFINE_SPINLOCK(cgroup_file_kn_lock);
  94
  95struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
  96
  97#define cgroup_assert_mutex_or_rcu_locked()                             \
  98        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
  99                           !lockdep_is_held(&cgroup_mutex),             \
 100                           "cgroup_mutex or RCU read lock required");
 101
 102/*
 103 * cgroup destruction makes heavy use of work items and there can be a lot
 104 * of concurrent destructions.  Use a separate workqueue so that cgroup
 105 * destruction work items don't end up filling up max_active of system_wq
 106 * which may lead to deadlock.
 107 */
 108static struct workqueue_struct *cgroup_destroy_wq;
 109
 110/* generate an array of cgroup subsystem pointers */
 111#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
 112struct cgroup_subsys *cgroup_subsys[] = {
 113#include <linux/cgroup_subsys.h>
 114};
 115#undef SUBSYS
 116
 117/* array of cgroup subsystem names */
 118#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
 119static const char *cgroup_subsys_name[] = {
 120#include <linux/cgroup_subsys.h>
 121};
 122#undef SUBSYS
 123
 124/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
 125#define SUBSYS(_x)                                                              \
 126        DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);                 \
 127        DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);                  \
 128        EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);                      \
 129        EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
 130#include <linux/cgroup_subsys.h>
 131#undef SUBSYS
 132
 133#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
 134static struct static_key_true *cgroup_subsys_enabled_key[] = {
 135#include <linux/cgroup_subsys.h>
 136};
 137#undef SUBSYS
 138
 139#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
 140static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 141#include <linux/cgroup_subsys.h>
 142};
 143#undef SUBSYS
 144
 145/*
 146 * The default hierarchy, reserved for the subsystems that are otherwise
 147 * unattached - it never has more than a single cgroup, and all tasks are
 148 * part of that cgroup.
 149 */
 150struct cgroup_root cgrp_dfl_root;
 151EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 152
 153/*
 154 * The default hierarchy always exists but is hidden until mounted for the
 155 * first time.  This is for backward compatibility.
 156 */
 157static bool cgrp_dfl_visible;
 158
 159/* some controllers are not supported in the default hierarchy */
 160static u16 cgrp_dfl_inhibit_ss_mask;
 161
 162/* some controllers are implicitly enabled on the default hierarchy */
 163static u16 cgrp_dfl_implicit_ss_mask;
 164
 165/* The list of hierarchy roots */
 166LIST_HEAD(cgroup_roots);
 167static int cgroup_root_count;
 168
 169/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
 170static DEFINE_IDR(cgroup_hierarchy_idr);
 171
 172/*
 173 * Assign a monotonically increasing serial number to csses.  It guarantees
 174 * cgroups with bigger numbers are newer than those with smaller numbers.
 175 * Also, as csses are always appended to the parent's ->children list, it
 176 * guarantees that sibling csses are always sorted in the ascending serial
 177 * number order on the list.  Protected by cgroup_mutex.
 178 */
 179static u64 css_serial_nr_next = 1;
 180
 181/*
 182 * These bitmasks identify subsystems with specific features to avoid
 183 * having to do iterative checks repeatedly.
 184 */
 185static u16 have_fork_callback __read_mostly;
 186static u16 have_exit_callback __read_mostly;
 187static u16 have_free_callback __read_mostly;
 188static u16 have_canfork_callback __read_mostly;
 189
 190/* cgroup namespace for init task */
 191struct cgroup_namespace init_cgroup_ns = {
 192        .count          = REFCOUNT_INIT(2),
 193        .user_ns        = &init_user_ns,
 194        .ns.ops         = &cgroupns_operations,
 195        .ns.inum        = PROC_CGROUP_INIT_INO,
 196        .root_cset      = &init_css_set,
 197};
 198
 199static struct file_system_type cgroup2_fs_type;
 200static struct cftype cgroup_base_files[];
 201
 202static int cgroup_apply_control(struct cgroup *cgrp);
 203static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 204static void css_task_iter_advance(struct css_task_iter *it);
 205static int cgroup_destroy_locked(struct cgroup *cgrp);
 206static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 207                                              struct cgroup_subsys *ss);
 208static void css_release(struct percpu_ref *ref);
 209static void kill_css(struct cgroup_subsys_state *css);
 210static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 211                              struct cgroup *cgrp, struct cftype cfts[],
 212                              bool is_add);
 213
 214/**
 215 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 216 * @ssid: subsys ID of interest
 217 *
 218 * cgroup_subsys_enabled() can only be used with literal subsys names which
 219 * is fine for individual subsystems but unsuitable for cgroup core.  This
  220 * is a slower static_key_enabled() based test indexed by @ssid.
 221 */
 222bool cgroup_ssid_enabled(int ssid)
 223{
 224        if (CGROUP_SUBSYS_COUNT == 0)
 225                return false;
 226
 227        return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 228}
 229
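/*
 * Illustrative usage sketch (hypothetical caller): skip subsystems that
 * were disabled at boot, e.g. with cgroup_disable=memory.
 *
 *	struct cgroup_subsys *ss;
 *	int ssid;
 *
 *	for_each_subsys(ss, ssid) {
 *		if (!cgroup_ssid_enabled(ssid))
 *			continue;
 *		pr_info("%s is enabled\n", ss->name);
 *	}
 */
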
 230/**
 231 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 232 * @cgrp: the cgroup of interest
 233 *
 234 * The default hierarchy is the v2 interface of cgroup and this function
 235 * can be used to test whether a cgroup is on the default hierarchy for
  236 * cases where a subsystem should behave differently depending on the
 237 * interface version.
 238 *
 239 * The set of behaviors which change on the default hierarchy are still
 240 * being determined and the mount option is prefixed with __DEVEL__.
 241 *
 242 * List of changed behaviors:
 243 *
 244 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 245 *   and "name" are disallowed.
 246 *
 247 * - When mounting an existing superblock, mount options should match.
 248 *
 249 * - Remount is disallowed.
 250 *
 251 * - rename(2) is disallowed.
 252 *
 253 * - "tasks" is removed.  Everything should be at process granularity.  Use
 254 *   "cgroup.procs" instead.
 255 *
 256 * - "cgroup.procs" is not sorted.  pids will be unique unless they got
  257 *   recycled in between reads.
 258 *
 259 * - "release_agent" and "notify_on_release" are removed.  Replacement
 260 *   notification mechanism will be implemented.
 261 *
 262 * - "cgroup.clone_children" is removed.
 263 *
 264 * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 265 *   and its descendants contain no task; otherwise, 1.  The file also
 266 *   generates kernfs notification which can be monitored through poll and
 267 *   [di]notify when the value of the file changes.
 268 *
 269 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 270 *   take masks of ancestors with non-empty cpus/mems, instead of being
 271 *   moved to an ancestor.
 272 *
 273 * - cpuset: a task can be moved into an empty cpuset, and again it takes
 274 *   masks of ancestors.
 275 *
 276 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 277 *   is not created.
 278 *
 279 * - blkcg: blk-throttle becomes properly hierarchical.
 280 *
 281 * - debug: disallowed on the default hierarchy.
 282 */
 283bool cgroup_on_dfl(const struct cgroup *cgrp)
 284{
 285        return cgrp->root == &cgrp_dfl_root;
 286}
 287
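/*
 * Illustrative usage sketch, in the style of cgroup_file_name() further
 * down in this file ("name" and "cgrp" are hypothetical locals):
 *
 *	// v2 uses the canonical controller name; v1 may use a legacy one
 *	// (e.g. "blkio" for "io").
 *	if (cgroup_on_dfl(cgrp))
 *		name = ss->name;
 *	else
 *		name = ss->legacy_name;
 */
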
 288/* IDR wrappers which synchronize using cgroup_idr_lock */
 289static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 290                            gfp_t gfp_mask)
 291{
 292        int ret;
 293
 294        idr_preload(gfp_mask);
 295        spin_lock_bh(&cgroup_idr_lock);
 296        ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
 297        spin_unlock_bh(&cgroup_idr_lock);
 298        idr_preload_end();
 299        return ret;
 300}
 301
 302static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
 303{
 304        void *ret;
 305
 306        spin_lock_bh(&cgroup_idr_lock);
 307        ret = idr_replace(idr, ptr, id);
 308        spin_unlock_bh(&cgroup_idr_lock);
 309        return ret;
 310}
 311
 312static void cgroup_idr_remove(struct idr *idr, int id)
 313{
 314        spin_lock_bh(&cgroup_idr_lock);
 315        idr_remove(idr, id);
 316        spin_unlock_bh(&cgroup_idr_lock);
 317}
 318
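/*
 * Note on the pattern above: idr_preload() pre-allocates with the caller's
 * gfp_mask while sleeping is still allowed; under the BH-disabling spinlock
 * the actual idr_alloc() has __GFP_DIRECT_RECLAIM masked out, so it can only
 * consume the preloaded node or fail and never sleeps with the lock held.
 * A caller allocating a css ID might look like (illustrative sketch):
 *
 *	id = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
 *	if (id < 0)
 *		return id;
 */
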
 319static struct cgroup *cgroup_parent(struct cgroup *cgrp)
 320{
 321        struct cgroup_subsys_state *parent_css = cgrp->self.parent;
 322
 323        if (parent_css)
 324                return container_of(parent_css, struct cgroup, self);
 325        return NULL;
 326}
 327
 328/* subsystems visibly enabled on a cgroup */
 329static u16 cgroup_control(struct cgroup *cgrp)
 330{
 331        struct cgroup *parent = cgroup_parent(cgrp);
 332        u16 root_ss_mask = cgrp->root->subsys_mask;
 333
 334        if (parent)
 335                return parent->subtree_control;
 336
 337        if (cgroup_on_dfl(cgrp))
 338                root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
 339                                  cgrp_dfl_implicit_ss_mask);
 340        return root_ss_mask;
 341}
 342
 343/* subsystems enabled on a cgroup */
 344static u16 cgroup_ss_mask(struct cgroup *cgrp)
 345{
 346        struct cgroup *parent = cgroup_parent(cgrp);
 347
 348        if (parent)
 349                return parent->subtree_ss_mask;
 350
 351        return cgrp->root->subsys_mask;
 352}
 353
 354/**
 355 * cgroup_css - obtain a cgroup's css for the specified subsystem
 356 * @cgrp: the cgroup of interest
 357 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 358 *
 359 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 360 * function must be called either under cgroup_mutex or rcu_read_lock() and
 361 * the caller is responsible for pinning the returned css if it wants to
 362 * keep accessing it outside the said locks.  This function may return
  363 * %NULL if @cgrp doesn't have @ss enabled.
 364 */
 365static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 366                                              struct cgroup_subsys *ss)
 367{
 368        if (ss)
 369                return rcu_dereference_check(cgrp->subsys[ss->id],
 370                                        lockdep_is_held(&cgroup_mutex));
 371        else
 372                return &cgrp->self;
 373}
 374
 375/**
 376 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 377 * @cgrp: the cgroup of interest
 378 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 379 *
 380 * Similar to cgroup_css() but returns the effective css, which is defined
 381 * as the matching css of the nearest ancestor including self which has @ss
 382 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 383 * function is guaranteed to return non-NULL css.
 384 */
 385static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 386                                                struct cgroup_subsys *ss)
 387{
 388        lockdep_assert_held(&cgroup_mutex);
 389
 390        if (!ss)
 391                return &cgrp->self;
 392
 393        /*
 394         * This function is used while updating css associations and thus
 395         * can't test the csses directly.  Test ss_mask.
 396         */
 397        while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
 398                cgrp = cgroup_parent(cgrp);
 399                if (!cgrp)
 400                        return NULL;
 401        }
 402
 403        return cgroup_css(cgrp, ss);
 404}
 405
 406/**
 407 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 408 * @cgrp: the cgroup of interest
 409 * @ss: the subsystem of interest
 410 *
 411 * Find and get the effective css of @cgrp for @ss.  The effective css is
 412 * defined as the matching css of the nearest ancestor including self which
 413 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 414 * the root css is returned, so this function always returns a valid css.
 415 * The returned css must be put using css_put().
 416 */
 417struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
 418                                             struct cgroup_subsys *ss)
 419{
 420        struct cgroup_subsys_state *css;
 421
 422        rcu_read_lock();
 423
 424        do {
 425                css = cgroup_css(cgrp, ss);
 426
 427                if (css && css_tryget_online(css))
 428                        goto out_unlock;
 429                cgrp = cgroup_parent(cgrp);
 430        } while (cgrp);
 431
 432        css = init_css_set.subsys[ss->id];
 433        css_get(css);
 434out_unlock:
 435        rcu_read_unlock();
 436        return css;
 437}
 438
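/*
 * Illustrative usage sketch (hypothetical caller, assumes CONFIG_MEMCG):
 * pin the effective memory css of a cgroup, use it, then drop the ref.
 *
 *	struct cgroup_subsys_state *css;
 *
 *	css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
 *	... use css; the caller may sleep while holding the reference ...
 *	css_put(css);
 */
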
 439static void __maybe_unused cgroup_get(struct cgroup *cgrp)
 440{
 441        css_get(&cgrp->self);
 442}
 443
 444static void cgroup_get_live(struct cgroup *cgrp)
 445{
 446        WARN_ON_ONCE(cgroup_is_dead(cgrp));
 447        css_get(&cgrp->self);
 448}
 449
 450static bool cgroup_tryget(struct cgroup *cgrp)
 451{
 452        return css_tryget(&cgrp->self);
 453}
 454
 455struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 456{
 457        struct cgroup *cgrp = of->kn->parent->priv;
 458        struct cftype *cft = of_cft(of);
 459
 460        /*
  461         * This is an open and unprotected implementation of cgroup_css().
 462         * seq_css() is only called from a kernfs file operation which has
 463         * an active reference on the file.  Because all the subsystem
 464         * files are drained before a css is disassociated with a cgroup,
 465         * the matching css from the cgroup's subsys table is guaranteed to
 466         * be and stay valid until the enclosing operation is complete.
 467         */
 468        if (cft->ss)
 469                return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 470        else
 471                return &cgrp->self;
 472}
 473EXPORT_SYMBOL_GPL(of_css);
 474
 475/**
 476 * for_each_css - iterate all css's of a cgroup
 477 * @css: the iteration cursor
 478 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 479 * @cgrp: the target cgroup to iterate css's of
 480 *
 481 * Should be called under cgroup_[tree_]mutex.
 482 */
 483#define for_each_css(css, ssid, cgrp)                                   \
 484        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
 485                if (!((css) = rcu_dereference_check(                    \
 486                                (cgrp)->subsys[(ssid)],                 \
 487                                lockdep_is_held(&cgroup_mutex)))) { }   \
 488                else
 489
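/*
 * Illustrative usage sketch: kill every css attached to @cgrp, in the
 * style of cgroup_destroy_locked() further down in this file.
 *
 *	struct cgroup_subsys_state *css;
 *	int ssid;
 *
 *	for_each_css(css, ssid, cgrp)
 *		kill_css(css);
 */
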
 490/**
 491 * for_each_e_css - iterate all effective css's of a cgroup
 492 * @css: the iteration cursor
 493 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 494 * @cgrp: the target cgroup to iterate css's of
 495 *
 496 * Should be called under cgroup_[tree_]mutex.
 497 */
 498#define for_each_e_css(css, ssid, cgrp)                                 \
 499        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
 500                if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
 501                        ;                                               \
 502                else
 503
 504/**
 505 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 506 * @ss: the iteration cursor
 507 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 508 * @ss_mask: the bitmask
 509 *
 510 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 511 * @ss_mask is set.
 512 */
 513#define do_each_subsys_mask(ss, ssid, ss_mask) do {                     \
 514        unsigned long __ss_mask = (ss_mask);                            \
 515        if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
 516                (ssid) = 0;                                             \
 517                break;                                                  \
 518        }                                                               \
 519        for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {       \
 520                (ss) = cgroup_subsys[ssid];                             \
 521                {
 522
 523#define while_each_subsys_mask()                                        \
 524                }                                                       \
 525        }                                                               \
 526} while (false)
 527
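/*
 * Illustrative usage sketch: invoke the ->fork() callback of every
 * subsystem that declared one, in the style of cgroup_post_fork()
 * ("child" is a hypothetical task pointer).
 *
 *	struct cgroup_subsys *ss;
 *	int ssid;
 *
 *	do_each_subsys_mask(ss, ssid, have_fork_callback) {
 *		ss->fork(child);
 *	} while_each_subsys_mask();
 */
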
 528/* iterate over child cgrps, lock should be held throughout iteration */
 529#define cgroup_for_each_live_child(child, cgrp)                         \
 530        list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
 531                if (({ lockdep_assert_held(&cgroup_mutex);              \
 532                       cgroup_is_dead(child); }))                       \
 533                        ;                                               \
 534                else
 535
 536/* walk live descendants in preorder */
 537#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)          \
 538        css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))  \
 539                if (({ lockdep_assert_held(&cgroup_mutex);              \
 540                       (dsct) = (d_css)->cgroup;                        \
 541                       cgroup_is_dead(dsct); }))                        \
 542                        ;                                               \
 543                else
 544
 545/* walk live descendants in postorder */
 546#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)         \
 547        css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
 548                if (({ lockdep_assert_held(&cgroup_mutex);              \
 549                       (dsct) = (d_css)->cgroup;                        \
 550                       cgroup_is_dead(dsct); }))                        \
 551                        ;                                               \
 552                else
 553
 554/*
 555 * The default css_set - used by init and its children prior to any
 556 * hierarchies being mounted. It contains a pointer to the root state
 557 * for each subsystem. Also used to anchor the list of css_sets. Not
 558 * reference-counted, to improve performance when child cgroups
 559 * haven't been created.
 560 */
 561struct css_set init_css_set = {
 562        .refcount               = REFCOUNT_INIT(1),
 563        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
 564        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
 565        .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
 566        .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
 567        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
 568        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
 569};
 570
 571static int css_set_count        = 1;    /* 1 for init_css_set */
 572
 573/**
 574 * css_set_populated - does a css_set contain any tasks?
 575 * @cset: target css_set
 576 *
 577 * css_set_populated() should be the same as !!cset->nr_tasks at steady
 578 * state. However, css_set_populated() can be called while a task is being
 579 * added to or removed from the linked list before the nr_tasks is
 580 * properly updated. Hence, we can't just look at ->nr_tasks here.
 581 */
 582static bool css_set_populated(struct css_set *cset)
 583{
 584        lockdep_assert_held(&css_set_lock);
 585
 586        return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
 587}
 588
 589/**
  590 * cgroup_update_populated - update populated count of a cgroup
 591 * @cgrp: the target cgroup
 592 * @populated: inc or dec populated count
 593 *
 594 * One of the css_sets associated with @cgrp is either getting its first
 595 * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
 596 * count is propagated towards root so that a given cgroup's populated_cnt
 597 * is zero iff the cgroup and all its descendants don't contain any tasks.
 598 *
 599 * @cgrp's interface file "cgroup.populated" is zero if
 600 * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
 601 * changes from or to zero, userland is notified that the content of the
 602 * interface file has changed.  This can be used to detect when @cgrp and
 603 * its descendants become populated or empty.
 604 */
 605static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 606{
 607        lockdep_assert_held(&css_set_lock);
 608
 609        do {
 610                bool trigger;
 611
 612                if (populated)
 613                        trigger = !cgrp->populated_cnt++;
 614                else
 615                        trigger = !--cgrp->populated_cnt;
 616
 617                if (!trigger)
 618                        break;
 619
 620                cgroup1_check_for_release(cgrp);
 621                cgroup_file_notify(&cgrp->events_file);
 622
 623                cgrp = cgroup_parent(cgrp);
 624        } while (cgrp);
 625}
 626
 627/**
 628 * css_set_update_populated - update populated state of a css_set
 629 * @cset: target css_set
 630 * @populated: whether @cset is populated or depopulated
 631 *
 632 * @cset is either getting the first task or losing the last.  Update the
 633 * ->populated_cnt of all associated cgroups accordingly.
 634 */
 635static void css_set_update_populated(struct css_set *cset, bool populated)
 636{
 637        struct cgrp_cset_link *link;
 638
 639        lockdep_assert_held(&css_set_lock);
 640
 641        list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
 642                cgroup_update_populated(link->cgrp, populated);
 643}
 644
 645/**
 646 * css_set_move_task - move a task from one css_set to another
 647 * @task: task being moved
 648 * @from_cset: css_set @task currently belongs to (may be NULL)
 649 * @to_cset: new css_set @task is being moved to (may be NULL)
 650 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 651 *
 652 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 653 * css_set, @from_cset can be NULL.  If @task is being disassociated
 654 * instead of moved, @to_cset can be NULL.
 655 *
 656 * This function automatically handles populated_cnt updates and
 657 * css_task_iter adjustments but the caller is responsible for managing
 658 * @from_cset and @to_cset's reference counts.
 659 */
 660static void css_set_move_task(struct task_struct *task,
 661                              struct css_set *from_cset, struct css_set *to_cset,
 662                              bool use_mg_tasks)
 663{
 664        lockdep_assert_held(&css_set_lock);
 665
 666        if (to_cset && !css_set_populated(to_cset))
 667                css_set_update_populated(to_cset, true);
 668
 669        if (from_cset) {
 670                struct css_task_iter *it, *pos;
 671
 672                WARN_ON_ONCE(list_empty(&task->cg_list));
 673
 674                /*
 675                 * @task is leaving, advance task iterators which are
 676                 * pointing to it so that they can resume at the next
 677                 * position.  Advancing an iterator might remove it from
 678                 * the list, use safe walk.  See css_task_iter_advance*()
 679                 * for details.
 680                 */
 681                list_for_each_entry_safe(it, pos, &from_cset->task_iters,
 682                                         iters_node)
 683                        if (it->task_pos == &task->cg_list)
 684                                css_task_iter_advance(it);
 685
 686                list_del_init(&task->cg_list);
 687                if (!css_set_populated(from_cset))
 688                        css_set_update_populated(from_cset, false);
 689        } else {
 690                WARN_ON_ONCE(!list_empty(&task->cg_list));
 691        }
 692
 693        if (to_cset) {
 694                /*
 695                 * We are synchronized through cgroup_threadgroup_rwsem
 696                 * against PF_EXITING setting such that we can't race
 697                 * against cgroup_exit() changing the css_set to
 698                 * init_css_set and dropping the old one.
 699                 */
 700                WARN_ON_ONCE(task->flags & PF_EXITING);
 701
 702                rcu_assign_pointer(task->cgroups, to_cset);
 703                list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 704                                                             &to_cset->tasks);
 705        }
 706}
 707
 708/*
  709 * hash table for cgroup groups. This improves the performance of finding
 710 * an existing css_set. This hash doesn't (currently) take into
 711 * account cgroups in empty hierarchies.
 712 */
 713#define CSS_SET_HASH_BITS       7
 714static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 715
 716static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 717{
 718        unsigned long key = 0UL;
 719        struct cgroup_subsys *ss;
 720        int i;
 721
 722        for_each_subsys(ss, i)
 723                key += (unsigned long)css[i];
 724        key = (key >> 16) ^ key;
 725
 726        return key;
 727}
 728
 729void put_css_set_locked(struct css_set *cset)
 730{
 731        struct cgrp_cset_link *link, *tmp_link;
 732        struct cgroup_subsys *ss;
 733        int ssid;
 734
 735        lockdep_assert_held(&css_set_lock);
 736
 737        if (!refcount_dec_and_test(&cset->refcount))
 738                return;
 739
 740        /* This css_set is dead. unlink it and release cgroup and css refs */
 741        for_each_subsys(ss, ssid) {
 742                list_del(&cset->e_cset_node[ssid]);
 743                css_put(cset->subsys[ssid]);
 744        }
 745        hash_del(&cset->hlist);
 746        css_set_count--;
 747
 748        list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 749                list_del(&link->cset_link);
 750                list_del(&link->cgrp_link);
 751                if (cgroup_parent(link->cgrp))
 752                        cgroup_put(link->cgrp);
 753                kfree(link);
 754        }
 755
 756        kfree_rcu(cset, rcu_head);
 757}
 758
 759/**
 760 * compare_css_sets - helper function for find_existing_css_set().
 761 * @cset: candidate css_set being tested
 762 * @old_cset: existing css_set for a task
 763 * @new_cgrp: cgroup that's being entered by the task
 764 * @template: desired set of css pointers in css_set (pre-calculated)
 765 *
 766 * Returns true if "cset" matches "old_cset" except for the hierarchy
 767 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 768 */
 769static bool compare_css_sets(struct css_set *cset,
 770                             struct css_set *old_cset,
 771                             struct cgroup *new_cgrp,
 772                             struct cgroup_subsys_state *template[])
 773{
 774        struct list_head *l1, *l2;
 775
 776        /*
 777         * On the default hierarchy, there can be csets which are
 778         * associated with the same set of cgroups but different csses.
 779         * Let's first ensure that csses match.
 780         */
 781        if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
 782                return false;
 783
 784        /*
 785         * Compare cgroup pointers in order to distinguish between
 786         * different cgroups in hierarchies.  As different cgroups may
 787         * share the same effective css, this comparison is always
 788         * necessary.
 789         */
 790        l1 = &cset->cgrp_links;
 791        l2 = &old_cset->cgrp_links;
 792        while (1) {
 793                struct cgrp_cset_link *link1, *link2;
 794                struct cgroup *cgrp1, *cgrp2;
 795
 796                l1 = l1->next;
 797                l2 = l2->next;
 798                /* See if we reached the end - both lists are equal length. */
 799                if (l1 == &cset->cgrp_links) {
 800                        BUG_ON(l2 != &old_cset->cgrp_links);
 801                        break;
 802                } else {
 803                        BUG_ON(l2 == &old_cset->cgrp_links);
 804                }
 805                /* Locate the cgroups associated with these links. */
 806                link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
 807                link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
 808                cgrp1 = link1->cgrp;
 809                cgrp2 = link2->cgrp;
 810                /* Hierarchies should be linked in the same order. */
 811                BUG_ON(cgrp1->root != cgrp2->root);
 812
 813                /*
 814                 * If this hierarchy is the hierarchy of the cgroup
 815                 * that's changing, then we need to check that this
 816                 * css_set points to the new cgroup; if it's any other
 817                 * hierarchy, then this css_set should point to the
 818                 * same cgroup as the old css_set.
 819                 */
 820                if (cgrp1->root == new_cgrp->root) {
 821                        if (cgrp1 != new_cgrp)
 822                                return false;
 823                } else {
 824                        if (cgrp1 != cgrp2)
 825                                return false;
 826                }
 827        }
 828        return true;
 829}
 830
 831/**
 832 * find_existing_css_set - init css array and find the matching css_set
 833 * @old_cset: the css_set that we're using before the cgroup transition
 834 * @cgrp: the cgroup that we're moving into
 835 * @template: out param for the new set of csses, should be clear on entry
 836 */
 837static struct css_set *find_existing_css_set(struct css_set *old_cset,
 838                                        struct cgroup *cgrp,
 839                                        struct cgroup_subsys_state *template[])
 840{
 841        struct cgroup_root *root = cgrp->root;
 842        struct cgroup_subsys *ss;
 843        struct css_set *cset;
 844        unsigned long key;
 845        int i;
 846
 847        /*
 848         * Build the set of subsystem state objects that we want to see in the
  849         * new css_set.  While subsystems can change globally, the entries here
 850         * won't change, so no need for locking.
 851         */
 852        for_each_subsys(ss, i) {
 853                if (root->subsys_mask & (1UL << i)) {
 854                        /*
 855                         * @ss is in this hierarchy, so we want the
 856                         * effective css from @cgrp.
 857                         */
 858                        template[i] = cgroup_e_css(cgrp, ss);
 859                } else {
 860                        /*
 861                         * @ss is not in this hierarchy, so we don't want
 862                         * to change the css.
 863                         */
 864                        template[i] = old_cset->subsys[i];
 865                }
 866        }
 867
 868        key = css_set_hash(template);
 869        hash_for_each_possible(css_set_table, cset, hlist, key) {
 870                if (!compare_css_sets(cset, old_cset, cgrp, template))
 871                        continue;
 872
 873                /* This css_set matches what we need */
 874                return cset;
 875        }
 876
 877        /* No existing cgroup group matched */
 878        return NULL;
 879}
 880
 881static void free_cgrp_cset_links(struct list_head *links_to_free)
 882{
 883        struct cgrp_cset_link *link, *tmp_link;
 884
 885        list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
 886                list_del(&link->cset_link);
 887                kfree(link);
 888        }
 889}
 890
 891/**
 892 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 893 * @count: the number of links to allocate
 894 * @tmp_links: list_head the allocated links are put on
 895 *
 896 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 897 * through ->cset_link.  Returns 0 on success or -errno.
 898 */
 899static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 900{
 901        struct cgrp_cset_link *link;
 902        int i;
 903
 904        INIT_LIST_HEAD(tmp_links);
 905
 906        for (i = 0; i < count; i++) {
 907                link = kzalloc(sizeof(*link), GFP_KERNEL);
 908                if (!link) {
 909                        free_cgrp_cset_links(tmp_links);
 910                        return -ENOMEM;
 911                }
 912                list_add(&link->cset_link, tmp_links);
 913        }
 914        return 0;
 915}
 916
 917/**
 918 * link_css_set - a helper function to link a css_set to a cgroup
 919 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 920 * @cset: the css_set to be linked
 921 * @cgrp: the destination cgroup
 922 */
 923static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 924                         struct cgroup *cgrp)
 925{
 926        struct cgrp_cset_link *link;
 927
 928        BUG_ON(list_empty(tmp_links));
 929
 930        if (cgroup_on_dfl(cgrp))
 931                cset->dfl_cgrp = cgrp;
 932
 933        link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
 934        link->cset = cset;
 935        link->cgrp = cgrp;
 936
 937        /*
 938         * Always add links to the tail of the lists so that the lists are
  939         * in chronological order.
 940         */
 941        list_move_tail(&link->cset_link, &cgrp->cset_links);
 942        list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 943
 944        if (cgroup_parent(cgrp))
 945                cgroup_get_live(cgrp);
 946}
 947
 948/**
 949 * find_css_set - return a new css_set with one cgroup updated
 950 * @old_cset: the baseline css_set
 951 * @cgrp: the cgroup to be updated
 952 *
 953 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 954 * substituted into the appropriate hierarchy.
 955 */
 956static struct css_set *find_css_set(struct css_set *old_cset,
 957                                    struct cgroup *cgrp)
 958{
 959        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
 960        struct css_set *cset;
 961        struct list_head tmp_links;
 962        struct cgrp_cset_link *link;
 963        struct cgroup_subsys *ss;
 964        unsigned long key;
 965        int ssid;
 966
 967        lockdep_assert_held(&cgroup_mutex);
 968
 969        /* First see if we already have a cgroup group that matches
 970         * the desired set */
 971        spin_lock_irq(&css_set_lock);
 972        cset = find_existing_css_set(old_cset, cgrp, template);
 973        if (cset)
 974                get_css_set(cset);
 975        spin_unlock_irq(&css_set_lock);
 976
 977        if (cset)
 978                return cset;
 979
 980        cset = kzalloc(sizeof(*cset), GFP_KERNEL);
 981        if (!cset)
 982                return NULL;
 983
 984        /* Allocate all the cgrp_cset_link objects that we'll need */
 985        if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
 986                kfree(cset);
 987                return NULL;
 988        }
 989
 990        refcount_set(&cset->refcount, 1);
 991        INIT_LIST_HEAD(&cset->tasks);
 992        INIT_LIST_HEAD(&cset->mg_tasks);
 993        INIT_LIST_HEAD(&cset->task_iters);
 994        INIT_HLIST_NODE(&cset->hlist);
 995        INIT_LIST_HEAD(&cset->cgrp_links);
 996        INIT_LIST_HEAD(&cset->mg_preload_node);
 997        INIT_LIST_HEAD(&cset->mg_node);
 998
 999        /* Copy the set of subsystem state objects generated in
1000         * find_existing_css_set() */
1001        memcpy(cset->subsys, template, sizeof(cset->subsys));
1002
1003        spin_lock_irq(&css_set_lock);
1004        /* Add reference counts and links from the new css_set. */
1005        list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1006                struct cgroup *c = link->cgrp;
1007
1008                if (c->root == cgrp->root)
1009                        c = cgrp;
1010                link_css_set(&tmp_links, cset, c);
1011        }
1012
1013        BUG_ON(!list_empty(&tmp_links));
1014
1015        css_set_count++;
1016
1017        /* Add @cset to the hash table */
1018        key = css_set_hash(cset->subsys);
1019        hash_add(css_set_table, &cset->hlist, key);
1020
1021        for_each_subsys(ss, ssid) {
1022                struct cgroup_subsys_state *css = cset->subsys[ssid];
1023
1024                list_add_tail(&cset->e_cset_node[ssid],
1025                              &css->cgroup->e_csets[ssid]);
1026                css_get(css);
1027        }
1028
1029        spin_unlock_irq(&css_set_lock);
1030
1031        return cset;
1032}
1033
1034struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1035{
1036        struct cgroup *root_cgrp = kf_root->kn->priv;
1037
1038        return root_cgrp->root;
1039}
1040
1041static int cgroup_init_root_id(struct cgroup_root *root)
1042{
1043        int id;
1044
1045        lockdep_assert_held(&cgroup_mutex);
1046
1047        id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1048        if (id < 0)
1049                return id;
1050
1051        root->hierarchy_id = id;
1052        return 0;
1053}
1054
1055static void cgroup_exit_root_id(struct cgroup_root *root)
1056{
1057        lockdep_assert_held(&cgroup_mutex);
1058
1059        idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1060}
1061
1062void cgroup_free_root(struct cgroup_root *root)
1063{
1064        if (root) {
1065                idr_destroy(&root->cgroup_idr);
1066                kfree(root);
1067        }
1068}
1069
1070static void cgroup_destroy_root(struct cgroup_root *root)
1071{
1072        struct cgroup *cgrp = &root->cgrp;
1073        struct cgrp_cset_link *link, *tmp_link;
1074
1075        trace_cgroup_destroy_root(root);
1076
1077        cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1078
1079        BUG_ON(atomic_read(&root->nr_cgrps));
1080        BUG_ON(!list_empty(&cgrp->self.children));
1081
1082        /* Rebind all subsystems back to the default hierarchy */
1083        WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
1084
1085        /*
1086         * Release all the links from cset_links to this hierarchy's
1087         * root cgroup
1088         */
1089        spin_lock_irq(&css_set_lock);
1090
1091        list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1092                list_del(&link->cset_link);
1093                list_del(&link->cgrp_link);
1094                kfree(link);
1095        }
1096
1097        spin_unlock_irq(&css_set_lock);
1098
1099        if (!list_empty(&root->root_list)) {
1100                list_del(&root->root_list);
1101                cgroup_root_count--;
1102        }
1103
1104        cgroup_exit_root_id(root);
1105
1106        mutex_unlock(&cgroup_mutex);
1107
1108        kernfs_destroy_root(root->kf_root);
1109        cgroup_free_root(root);
1110}
1111
1112/*
1113 * look up cgroup associated with current task's cgroup namespace on the
1114 * specified hierarchy
1115 */
1116static struct cgroup *
1117current_cgns_cgroup_from_root(struct cgroup_root *root)
1118{
1119        struct cgroup *res = NULL;
1120        struct css_set *cset;
1121
1122        lockdep_assert_held(&css_set_lock);
1123
1124        rcu_read_lock();
1125
1126        cset = current->nsproxy->cgroup_ns->root_cset;
1127        if (cset == &init_css_set) {
1128                res = &root->cgrp;
1129        } else {
1130                struct cgrp_cset_link *link;
1131
1132                list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1133                        struct cgroup *c = link->cgrp;
1134
1135                        if (c->root == root) {
1136                                res = c;
1137                                break;
1138                        }
1139                }
1140        }
1141        rcu_read_unlock();
1142
1143        BUG_ON(!res);
1144        return res;
1145}
1146
1147/* look up cgroup associated with given css_set on the specified hierarchy */
1148static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1149                                            struct cgroup_root *root)
1150{
1151        struct cgroup *res = NULL;
1152
1153        lockdep_assert_held(&cgroup_mutex);
1154        lockdep_assert_held(&css_set_lock);
1155
1156        if (cset == &init_css_set) {
1157                res = &root->cgrp;
1158        } else {
1159                struct cgrp_cset_link *link;
1160
1161                list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1162                        struct cgroup *c = link->cgrp;
1163
1164                        if (c->root == root) {
1165                                res = c;
1166                                break;
1167                        }
1168                }
1169        }
1170
1171        BUG_ON(!res);
1172        return res;
1173}
1174
1175/*
1176 * Return the cgroup for "task" from the given hierarchy. Must be
1177 * called with cgroup_mutex and css_set_lock held.
1178 */
1179struct cgroup *task_cgroup_from_root(struct task_struct *task,
1180                                     struct cgroup_root *root)
1181{
1182        /*
1183         * No need to lock the task - since we hold cgroup_mutex the
1184         * task can't change groups, so the only thing that can happen
1185         * is that it exits and its css is set back to init_css_set.
1186         */
1187        return cset_cgroup_from_root(task_css_set(task), root);
1188}
1189
1190/*
1191 * A task must hold cgroup_mutex to modify cgroups.
1192 *
1193 * Any task can increment and decrement the count field without lock.
1194 * So in general, code holding cgroup_mutex can't rely on the count
1195 * field not changing.  However, if the count goes to zero, then only
1196 * cgroup_attach_task() can increment it again.  Because a count of zero
1197 * means that no tasks are currently attached, therefore there is no
1198 * way a task attached to that cgroup can fork (the other way to
1199 * increment the count).  So code holding cgroup_mutex can safely
1200 * assume that if the count is zero, it will stay zero. Similarly, if
1201 * a task holds cgroup_mutex on a cgroup with zero count, it
1202 * knows that the cgroup won't be removed, as cgroup_rmdir()
1203 * needs that mutex.
1204 *
1205 * A cgroup can only be deleted if both its 'count' of using tasks
1206 * is zero, and its list of 'children' cgroups is empty.  Since all
1207 * tasks in the system use _some_ cgroup, and since there is always at
1208 * least one task in the system (init, pid == 1), therefore, root cgroup
1209 * always has either children cgroups and/or using tasks.  So we don't
1210 * need a special hack to ensure that root cgroup cannot be deleted.
1211 *
1212 * P.S.  One more locking exception.  RCU is used to guard the
1213 * update of a tasks cgroup pointer by cgroup_attach_task()
1214 */
1215
1216static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1217
1218static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1219                              char *buf)
1220{
1221        struct cgroup_subsys *ss = cft->ss;
1222
1223        if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1224            !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1225                snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1226                         cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1227                         cft->name);
1228        else
1229                strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1230        return buf;
1231}
1232
1233/**
1234 * cgroup_file_mode - deduce file mode of a control file
1235 * @cft: the control file in question
1236 *
1237 * S_IRUGO for read, S_IWUSR for write.
1238 */
1239static umode_t cgroup_file_mode(const struct cftype *cft)
1240{
1241        umode_t mode = 0;
1242
1243        if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1244                mode |= S_IRUGO;
1245
1246        if (cft->write_u64 || cft->write_s64 || cft->write) {
1247                if (cft->flags & CFTYPE_WORLD_WRITABLE)
1248                        mode |= S_IWUGO;
1249                else
1250                        mode |= S_IWUSR;
1251        }
1252
1253        return mode;
1254}
1255
1256/**
1257 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
1258 * @subtree_control: the new subtree_control mask to consider
1259 * @this_ss_mask: available subsystems
1260 *
1261 * On the default hierarchy, a subsystem may request other subsystems to be
1262 * enabled together through its ->depends_on mask.  In such cases, more
1263 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1264 *
1265 * This function calculates which subsystems need to be enabled if
1266 * @subtree_control is to be applied while restricted to @this_ss_mask.
1267 */
1268static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1269{
1270        u16 cur_ss_mask = subtree_control;
1271        struct cgroup_subsys *ss;
1272        int ssid;
1273
1274        lockdep_assert_held(&cgroup_mutex);
1275
1276        cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
1277
1278        while (true) {
1279                u16 new_ss_mask = cur_ss_mask;
1280
1281                do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1282                        new_ss_mask |= ss->depends_on;
1283                } while_each_subsys_mask();
1284
1285                /*
1286                 * Mask out subsystems which aren't available.  This can
1287                 * happen only if some depended-upon subsystems were bound
1288                 * to non-default hierarchies.
1289                 */
1290                new_ss_mask &= this_ss_mask;
1291
1292                if (new_ss_mask == cur_ss_mask)
1293                        break;
1294                cur_ss_mask = new_ss_mask;
1295        }
1296
1297        return cur_ss_mask;
1298}
1299
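/*
 * Worked example (hypothetical controllers A and B): if @subtree_control
 * enables only A and A's ->depends_on names B, the closure converges in
 * two passes, with anything missing from @this_ss_mask dropped again:
 *
 *	subtree_control = BIT(A), A->depends_on = BIT(B), B->depends_on = 0
 *	this_ss_mask    = BIT(A) | BIT(B)
 *
 *	pass 1: cur = BIT(A)          -> new = BIT(A) | BIT(B)
 *	pass 2: cur = BIT(A) | BIT(B) -> new == cur, loop terminates
 *	result: BIT(A) | BIT(B)
 */
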
1300/**
1301 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1302 * @kn: the kernfs_node being serviced
1303 *
1304 * This helper undoes cgroup_kn_lock_live() and should be invoked before
1305 * the method finishes if locking succeeded.  Note that once this function
1306 * returns the cgroup returned by cgroup_kn_lock_live() may become
1307 * inaccessible any time.  If the caller intends to continue to access the
1308 * cgroup, it should pin it before invoking this function.
1309 */
1310void cgroup_kn_unlock(struct kernfs_node *kn)
1311{
1312        struct cgroup *cgrp;
1313
1314        if (kernfs_type(kn) == KERNFS_DIR)
1315                cgrp = kn->priv;
1316        else
1317                cgrp = kn->parent->priv;
1318
1319        mutex_unlock(&cgroup_mutex);
1320
1321        kernfs_unbreak_active_protection(kn);
1322        cgroup_put(cgrp);
1323}
1324
1325/**
1326 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1327 * @kn: the kernfs_node being serviced
1328 * @drain_offline: perform offline draining on the cgroup
1329 *
1330 * This helper is to be used by a cgroup kernfs method currently servicing
1331 * @kn.  It breaks the active protection, performs cgroup locking and
1332 * verifies that the associated cgroup is alive.  Returns the cgroup if
1333 * alive; otherwise, %NULL.  A successful return should be undone by a
1334 * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
1335 * cgroup is drained of offlining csses before return.
1336 *
1337 * Any cgroup kernfs method implementation which requires locking the
1338 * associated cgroup should use this helper.  It avoids nesting cgroup
1339 * locking under kernfs active protection and allows all kernfs operations
1340 * including self-removal.
1341 */
1342struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
1343{
1344        struct cgroup *cgrp;
1345
1346        if (kernfs_type(kn) == KERNFS_DIR)
1347                cgrp = kn->priv;
1348        else
1349                cgrp = kn->parent->priv;
1350
1351        /*
1352         * We're gonna grab cgroup_mutex which nests outside kernfs
 1353         * active_ref.  cgroup liveness check alone provides enough
1354         * protection against removal.  Ensure @cgrp stays accessible and
1355         * break the active_ref protection.
1356         */
1357        if (!cgroup_tryget(cgrp))
1358                return NULL;
1359        kernfs_break_active_protection(kn);
1360
1361        if (drain_offline)
1362                cgroup_lock_and_drain_offline(cgrp);
1363        else
1364                mutex_lock(&cgroup_mutex);
1365
1366        if (!cgroup_is_dead(cgrp))
1367                return cgrp;
1368
1369        cgroup_kn_unlock(kn);
1370        return NULL;
1371}
1372
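/*
 * Illustrative usage sketch (hypothetical kernfs write method) showing the
 * expected pairing with cgroup_kn_unlock():
 *
 *	static ssize_t my_write(struct kernfs_open_file *of, char *buf,
 *				size_t nbytes, loff_t off)
 *	{
 *		struct cgroup *cgrp = cgroup_kn_lock_live(of->kn, false);
 *
 *		if (!cgrp)
 *			return -ENODEV;
 *		... operate on cgrp while holding cgroup_mutex ...
 *		cgroup_kn_unlock(of->kn);
 *		return nbytes;
 *	}
 */
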
1373static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1374{
1375        char name[CGROUP_FILE_NAME_MAX];
1376
1377        lockdep_assert_held(&cgroup_mutex);
1378
1379        if (cft->file_offset) {
1380                struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1381                struct cgroup_file *cfile = (void *)css + cft->file_offset;
1382
1383                spin_lock_irq(&cgroup_file_kn_lock);
1384                cfile->kn = NULL;
1385                spin_unlock_irq(&cgroup_file_kn_lock);
1386        }
1387
1388        kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1389}
1390
1391/**
1392 * css_clear_dir - remove subsys files in a cgroup directory
 1393 * @css: target css
1394 */
1395static void css_clear_dir(struct cgroup_subsys_state *css)
1396{
1397        struct cgroup *cgrp = css->cgroup;
1398        struct cftype *cfts;
1399
1400        if (!(css->flags & CSS_VISIBLE))
1401                return;
1402
1403        css->flags &= ~CSS_VISIBLE;
1404
1405        list_for_each_entry(cfts, &css->ss->cfts, node)
1406                cgroup_addrm_files(css, cgrp, cfts, false);
1407}
1408
1409/**
1410 * css_populate_dir - create subsys files in a cgroup directory
1411 * @css: target css
1412 *
1413 * On failure, no file is added.
1414 */
1415static int css_populate_dir(struct cgroup_subsys_state *css)
1416{
1417        struct cgroup *cgrp = css->cgroup;
1418        struct cftype *cfts, *failed_cfts;
1419        int ret;
1420
1421        if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1422                return 0;
1423
1424        if (!css->ss) {
1425                if (cgroup_on_dfl(cgrp))
1426                        cfts = cgroup_base_files;
1427                else
1428                        cfts = cgroup1_base_files;
1429
1430                return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1431        }
1432
1433        list_for_each_entry(cfts, &css->ss->cfts, node) {
1434                ret = cgroup_addrm_files(css, cgrp, cfts, true);
1435                if (ret < 0) {
1436                        failed_cfts = cfts;
1437                        goto err;
1438                }
1439        }
1440
1441        css->flags |= CSS_VISIBLE;
1442
1443        return 0;
1444err:
1445        list_for_each_entry(cfts, &css->ss->cfts, node) {
1446                if (cfts == failed_cfts)
1447                        break;
1448                cgroup_addrm_files(css, cgrp, cfts, false);
1449        }
1450        return ret;
1451}
1452
1453int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1454{
1455        struct cgroup *dcgrp = &dst_root->cgrp;
1456        struct cgroup_subsys *ss;
1457        int ssid, i, ret;
1458
1459        lockdep_assert_held(&cgroup_mutex);
1460
1461        do_each_subsys_mask(ss, ssid, ss_mask) {
1462                /*
1463                 * If @ss has non-root csses attached to it, can't move.
1464                 * If @ss is an implicit controller, it is exempt from this
1465                 * rule and can be stolen.
1466                 */
1467                if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
1468                    !ss->implicit_on_dfl)
1469                        return -EBUSY;
1470
1471                /* can't move between two non-dummy roots either */
1472                if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1473                        return -EBUSY;
1474        } while_each_subsys_mask();
1475
1476        do_each_subsys_mask(ss, ssid, ss_mask) {
1477                struct cgroup_root *src_root = ss->root;
1478                struct cgroup *scgrp = &src_root->cgrp;
1479                struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1480                struct css_set *cset;
1481
1482                WARN_ON(!css || cgroup_css(dcgrp, ss));
1483
1484                /* disable from the source */
1485                src_root->subsys_mask &= ~(1 << ssid);
1486                WARN_ON(cgroup_apply_control(scgrp));
1487                cgroup_finalize_control(scgrp, 0);
1488
1489                /* rebind */
1490                RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
1491                rcu_assign_pointer(dcgrp->subsys[ssid], css);
1492                ss->root = dst_root;
1493                css->cgroup = dcgrp;
1494
1495                spin_lock_irq(&css_set_lock);
1496                hash_for_each(css_set_table, i, cset, hlist)
1497                        list_move_tail(&cset->e_cset_node[ss->id],
1498                                       &dcgrp->e_csets[ss->id]);
1499                spin_unlock_irq(&css_set_lock);
1500
1501                /* default hierarchy doesn't enable controllers by default */
1502                dst_root->subsys_mask |= 1 << ssid;
1503                if (dst_root == &cgrp_dfl_root) {
1504                        static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
1505                } else {
1506                        dcgrp->subtree_control |= 1 << ssid;
1507                        static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
1508                }
1509
1510                ret = cgroup_apply_control(dcgrp);
1511                if (ret)
1512                        pr_warn("partial failure to rebind %s controller (err=%d)\n",
1513                                ss->name, ret);
1514
1515                if (ss->bind)
1516                        ss->bind(css);
1517        } while_each_subsys_mask();
1518
1519        kernfs_activate(dcgrp->kn);
1520        return 0;
1521}
1522
1523int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1524                     struct kernfs_root *kf_root)
1525{
1526        int len = 0;
1527        char *buf = NULL;
1528        struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1529        struct cgroup *ns_cgroup;
1530
1531        buf = kmalloc(PATH_MAX, GFP_KERNEL);
1532        if (!buf)
1533                return -ENOMEM;
1534
1535        spin_lock_irq(&css_set_lock);
1536        ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1537        len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1538        spin_unlock_irq(&css_set_lock);
1539
1540        if (len >= PATH_MAX)
1541                len = -ERANGE;
1542        else if (len > 0) {
1543                seq_escape(sf, buf, " \t\n\\");
1544                len = 0;
1545        }
1546        kfree(buf);
1547        return len;
1548}
1549
1550static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
1551{
1552        char *token;
1553
1554        *root_flags = 0;
1555
1556        if (!data)
1557                return 0;
1558
1559        while ((token = strsep(&data, ",")) != NULL) {
1560                if (!strcmp(token, "nsdelegate")) {
1561                        *root_flags |= CGRP_ROOT_NS_DELEGATE;
1562                        continue;
1563                }
1564
1565                pr_err("cgroup2: unknown option \"%s\"\n", token);
1566                return -EINVAL;
1567        }
1568
1569        return 0;
1570}
1571
1572static void apply_cgroup_root_flags(unsigned int root_flags)
1573{
1574        if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1575                if (root_flags & CGRP_ROOT_NS_DELEGATE)
1576                        cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1577                else
1578                        cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1579        }
1580}
1581
1582static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1583{
1584        if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1585                seq_puts(seq, ",nsdelegate");
1586        return 0;
1587}
1588
1589static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1590{
1591        unsigned int root_flags;
1592        int ret;
1593
1594        ret = parse_cgroup_root_flags(data, &root_flags);
1595        if (ret)
1596                return ret;
1597
1598        apply_cgroup_root_flags(root_flags);
1599        return 0;
1600}
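/*
 * Usage note: "nsdelegate" is the only cgroup2 mount/remount option handled
 * above.  An illustrative invocation from userspace (mount point assumed):
 *
 *	mount -t cgroup2 -o nsdelegate none /sys/fs/cgroup
 *	mount -o remount,nsdelegate none /sys/fs/cgroup
 *
 * The flag only takes effect when the request comes from the init cgroup
 * namespace - see apply_cgroup_root_flags() above.
 */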
1601
1602/*
1603 * To reduce the fork() overhead for systems that are not actually using
1604 * their cgroups capability, we don't maintain the lists running through
1605 * each css_set to its tasks until we see the list actually used - in other
1606 * words after the first mount.
1607 */
1608static bool use_task_css_set_links __read_mostly;
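/*
 * Callers that need the per-css_set task lists test the flag and enable
 * them lazily, as cgroup_mount() does further below:
 *
 *	if (!use_task_css_set_links)
 *		cgroup_enable_task_cg_lists();
 */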
1609
1610static void cgroup_enable_task_cg_lists(void)
1611{
1612        struct task_struct *p, *g;
1613
1614        spin_lock_irq(&css_set_lock);
1615
1616        if (use_task_css_set_links)
1617                goto out_unlock;
1618
1619        use_task_css_set_links = true;
1620
1621        /*
1622         * We need tasklist_lock because RCU is not safe against
1623         * while_each_thread(). Besides, a forking task that has passed
1624         * cgroup_post_fork() without seeing use_task_css_set_links = 1
1625         * is not guaranteed to have its child immediately visible in the
1626         * tasklist if we walk through it with RCU.
1627         */
1628        read_lock(&tasklist_lock);
1629        do_each_thread(g, p) {
1630                WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1631                             task_css_set(p) != &init_css_set);
1632
1633                /*
1634                 * We should check if the process is exiting, otherwise
1635                 * it will race with cgroup_exit() in that the list
1636                 * entry won't be deleted though the process has exited.
1637                 * Do it while holding siglock so that we don't end up
1638                 * racing against cgroup_exit().
1639                 *
1640                 * Interrupts were already disabled while acquiring
1641                 * the css_set_lock, so we do not need to disable it
1642                 * again when acquiring the sighand->siglock here.
1643                 */
1644                spin_lock(&p->sighand->siglock);
1645                if (!(p->flags & PF_EXITING)) {
1646                        struct css_set *cset = task_css_set(p);
1647
1648                        if (!css_set_populated(cset))
1649                                css_set_update_populated(cset, true);
1650                        list_add_tail(&p->cg_list, &cset->tasks);
1651                        get_css_set(cset);
1652                        cset->nr_tasks++;
1653                }
1654                spin_unlock(&p->sighand->siglock);
1655        } while_each_thread(g, p);
1656        read_unlock(&tasklist_lock);
1657out_unlock:
1658        spin_unlock_irq(&css_set_lock);
1659}
1660
1661static void init_cgroup_housekeeping(struct cgroup *cgrp)
1662{
1663        struct cgroup_subsys *ss;
1664        int ssid;
1665
1666        INIT_LIST_HEAD(&cgrp->self.sibling);
1667        INIT_LIST_HEAD(&cgrp->self.children);
1668        INIT_LIST_HEAD(&cgrp->cset_links);
1669        INIT_LIST_HEAD(&cgrp->pidlists);
1670        mutex_init(&cgrp->pidlist_mutex);
1671        cgrp->self.cgroup = cgrp;
1672        cgrp->self.flags |= CSS_ONLINE;
1673
1674        for_each_subsys(ss, ssid)
1675                INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1676
1677        init_waitqueue_head(&cgrp->offline_waitq);
1678        INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
1679}
1680
1681void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1682{
1683        struct cgroup *cgrp = &root->cgrp;
1684
1685        INIT_LIST_HEAD(&root->root_list);
1686        atomic_set(&root->nr_cgrps, 1);
1687        cgrp->root = root;
1688        init_cgroup_housekeeping(cgrp);
1689        idr_init(&root->cgroup_idr);
1690
1691        root->flags = opts->flags;
1692        if (opts->release_agent)
1693                strcpy(root->release_agent_path, opts->release_agent);
1694        if (opts->name)
1695                strcpy(root->name, opts->name);
1696        if (opts->cpuset_clone_children)
1697                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1698}
1699
1700int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
1701{
1702        LIST_HEAD(tmp_links);
1703        struct cgroup *root_cgrp = &root->cgrp;
1704        struct kernfs_syscall_ops *kf_sops;
1705        struct css_set *cset;
1706        int i, ret;
1707
1708        lockdep_assert_held(&cgroup_mutex);
1709
1710        ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1711        if (ret < 0)
1712                goto out;
1713        root_cgrp->id = ret;
1714        root_cgrp->ancestor_ids[0] = ret;
1715
1716        ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
1717                              ref_flags, GFP_KERNEL);
1718        if (ret)
1719                goto out;
1720
1721        /*
1722         * We're accessing css_set_count without locking css_set_lock here,
1723         * but that's OK - it can only be increased by someone holding
1724         * cgroup_lock, and that's us.  Later rebinding may disable
1725         * controllers on the default hierarchy and thus create new csets,
1726         * which can't be more than the existing ones.  Allocate 2x.
1727         */
1728        ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
1729        if (ret)
1730                goto cancel_ref;
1731
1732        ret = cgroup_init_root_id(root);
1733        if (ret)
1734                goto cancel_ref;
1735
1736        kf_sops = root == &cgrp_dfl_root ?
1737                &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
1738
1739        root->kf_root = kernfs_create_root(kf_sops,
1740                                           KERNFS_ROOT_CREATE_DEACTIVATED,
1741                                           root_cgrp);
1742        if (IS_ERR(root->kf_root)) {
1743                ret = PTR_ERR(root->kf_root);
1744                goto exit_root_id;
1745        }
1746        root_cgrp->kn = root->kf_root->kn;
1747
1748        ret = css_populate_dir(&root_cgrp->self);
1749        if (ret)
1750                goto destroy_root;
1751
1752        ret = rebind_subsystems(root, ss_mask);
1753        if (ret)
1754                goto destroy_root;
1755
1756        trace_cgroup_setup_root(root);
1757
1758        /*
1759         * There must be no failure case after here, since rebinding takes
1760         * care of subsystems' refcounts, which are explicitly dropped in
1761         * the failure exit path.
1762         */
1763        list_add(&root->root_list, &cgroup_roots);
1764        cgroup_root_count++;
1765
1766        /*
1767         * Link the root cgroup in this hierarchy into all the css_set
1768         * objects.
1769         */
1770        spin_lock_irq(&css_set_lock);
1771        hash_for_each(css_set_table, i, cset, hlist) {
1772                link_css_set(&tmp_links, cset, root_cgrp);
1773                if (css_set_populated(cset))
1774                        cgroup_update_populated(root_cgrp, true);
1775        }
1776        spin_unlock_irq(&css_set_lock);
1777
1778        BUG_ON(!list_empty(&root_cgrp->self.children));
1779        BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1780
1781        kernfs_activate(root_cgrp->kn);
1782        ret = 0;
1783        goto out;
1784
1785destroy_root:
1786        kernfs_destroy_root(root->kf_root);
1787        root->kf_root = NULL;
1788exit_root_id:
1789        cgroup_exit_root_id(root);
1790cancel_ref:
1791        percpu_ref_exit(&root_cgrp->self.refcnt);
1792out:
1793        free_cgrp_cset_links(&tmp_links);
1794        return ret;
1795}
1796
1797struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
1798                               struct cgroup_root *root, unsigned long magic,
1799                               struct cgroup_namespace *ns)
1800{
1801        struct dentry *dentry;
1802        bool new_sb;
1803
1804        dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
1805
1806        /*
1807         * In a non-init cgroup namespace, instead of the root cgroup's
1808         * dentry, we return the dentry corresponding to the cgroupns->root_cgrp.
1809         */
1810        if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
1811                struct dentry *nsdentry;
1812                struct cgroup *cgrp;
1813
1814                mutex_lock(&cgroup_mutex);
1815                spin_lock_irq(&css_set_lock);
1816
1817                cgrp = cset_cgroup_from_root(ns->root_cset, root);
1818
1819                spin_unlock_irq(&css_set_lock);
1820                mutex_unlock(&cgroup_mutex);
1821
1822                nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
1823                dput(dentry);
1824                dentry = nsdentry;
1825        }
1826
1827        if (IS_ERR(dentry) || !new_sb)
1828                cgroup_put(&root->cgrp);
1829
1830        return dentry;
1831}
1832
1833static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1834                         int flags, const char *unused_dev_name,
1835                         void *data)
1836{
1837        struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
1838        struct dentry *dentry;
1839        int ret;
1840
1841        get_cgroup_ns(ns);
1842
1843        /* Check if the caller has permission to mount. */
1844        if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
1845                put_cgroup_ns(ns);
1846                return ERR_PTR(-EPERM);
1847        }
1848
1849        /*
1850         * The first time anyone tries to mount a cgroup, enable the list
1851         * linking each css_set to its tasks and fix up all existing tasks.
1852         */
1853        if (!use_task_css_set_links)
1854                cgroup_enable_task_cg_lists();
1855
1856        if (fs_type == &cgroup2_fs_type) {
1857                unsigned int root_flags;
1858
1859                ret = parse_cgroup_root_flags(data, &root_flags);
1860                if (ret) {
1861                        put_cgroup_ns(ns);
1862                        return ERR_PTR(ret);
1863                }
1864
1865                cgrp_dfl_visible = true;
1866                cgroup_get_live(&cgrp_dfl_root.cgrp);
1867
1868                dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
1869                                         CGROUP2_SUPER_MAGIC, ns);
1870                if (!IS_ERR(dentry))
1871                        apply_cgroup_root_flags(root_flags);
1872        } else {
1873                dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
1874                                       CGROUP_SUPER_MAGIC, ns);
1875        }
1876
1877        put_cgroup_ns(ns);
1878        return dentry;
1879}
1880
1881static void cgroup_kill_sb(struct super_block *sb)
1882{
1883        struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1884        struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1885
1886        /*
1887         * If @root doesn't have any mounts or children, start killing it.
1888         * This prevents new mounts by disabling percpu_ref_tryget_live().
1889         * cgroup_mount() may wait for @root's release.
1890         *
1891         * And don't kill the default root.
1892         */
1893        if (!list_empty(&root->cgrp.self.children) ||
1894            root == &cgrp_dfl_root)
1895                cgroup_put(&root->cgrp);
1896        else
1897                percpu_ref_kill(&root->cgrp.self.refcnt);
1898
1899        kernfs_kill_sb(sb);
1900}
1901
1902struct file_system_type cgroup_fs_type = {
1903        .name = "cgroup",
1904        .mount = cgroup_mount,
1905        .kill_sb = cgroup_kill_sb,
1906        .fs_flags = FS_USERNS_MOUNT,
1907};
1908
1909static struct file_system_type cgroup2_fs_type = {
1910        .name = "cgroup2",
1911        .mount = cgroup_mount,
1912        .kill_sb = cgroup_kill_sb,
1913        .fs_flags = FS_USERNS_MOUNT,
1914};
1915
1916int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
1917                          struct cgroup_namespace *ns)
1918{
1919        struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
1920
1921        return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
1922}
1923
1924int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
1925                   struct cgroup_namespace *ns)
1926{
1927        int ret;
1928
1929        mutex_lock(&cgroup_mutex);
1930        spin_lock_irq(&css_set_lock);
1931
1932        ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
1933
1934        spin_unlock_irq(&css_set_lock);
1935        mutex_unlock(&cgroup_mutex);
1936
1937        return ret;
1938}
1939EXPORT_SYMBOL_GPL(cgroup_path_ns);
1940
1941/**
1942 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1943 * @task: target task
1944 * @buf: the buffer to write the path into
1945 * @buflen: the length of the buffer
1946 *
1947 * Determine @task's cgroup on the first (the one with the lowest non-zero
1948 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
1949 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1950 * cgroup controller callbacks.
1951 *
1952 * Return value is the same as kernfs_path().
1953 */
1954int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1955{
1956        struct cgroup_root *root;
1957        struct cgroup *cgrp;
1958        int hierarchy_id = 1;
1959        int ret;
1960
1961        mutex_lock(&cgroup_mutex);
1962        spin_lock_irq(&css_set_lock);
1963
1964        root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1965
1966        if (root) {
1967                cgrp = task_cgroup_from_root(task, root);
1968                ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
1969        } else {
1970                /* if no hierarchy exists, everyone is in "/" */
1971                ret = strlcpy(buf, "/", buflen);
1972        }
1973
1974        spin_unlock_irq(&css_set_lock);
1975        mutex_unlock(&cgroup_mutex);
1976        return ret;
1977}
1978EXPORT_SYMBOL_GPL(task_cgroup_path);
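/*
 * Illustrative caller sketch (not from this file): as with kernfs_path(),
 * a negative return is an error and a return value >= @buflen means the
 * path was truncated.
 *
 *	char buf[PATH_MAX];
 *	int len = task_cgroup_path(current, buf, sizeof(buf));
 *
 *	if (len >= 0 && len < sizeof(buf))
 *		pr_debug("first-hierarchy cgroup: %s\n", buf);
 */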
1979
1980/**
1981 * cgroup_migrate_add_task - add a migration target task to a migration context
1982 * @task: target task
1983 * @mgctx: target migration context
1984 *
1985 * Add @task, which is a migration target, to @mgctx->tset.  This function
1986 * becomes a noop if @task doesn't need to be migrated.  @task's css_set
1987 * should have been added as a migration source and @task->cg_list will be
1988 * moved from the css_set's tasks list to the mg_tasks one.
1989 */
1990static void cgroup_migrate_add_task(struct task_struct *task,
1991                                    struct cgroup_mgctx *mgctx)
1992{
1993        struct css_set *cset;
1994
1995        lockdep_assert_held(&css_set_lock);
1996
1997        /* @task either already exited or can't exit until the end */
1998        if (task->flags & PF_EXITING)
1999                return;
2000
2001        /* leave @task alone if post_fork() hasn't linked it yet */
2002        if (list_empty(&task->cg_list))
2003                return;
2004
2005        cset = task_css_set(task);
2006        if (!cset->mg_src_cgrp)
2007                return;
2008
2009        mgctx->tset.nr_tasks++;
2010
2011        list_move_tail(&task->cg_list, &cset->mg_tasks);
2012        if (list_empty(&cset->mg_node))
2013                list_add_tail(&cset->mg_node,
2014                              &mgctx->tset.src_csets);
2015        if (list_empty(&cset->mg_dst_cset->mg_node))
2016                list_add_tail(&cset->mg_dst_cset->mg_node,
2017                              &mgctx->tset.dst_csets);
2018}
2019
2020/**
2021 * cgroup_taskset_first - reset taskset and return the first task
2022 * @tset: taskset of interest
2023 * @dst_cssp: output variable for the destination css
2024 *
2025 * @tset iteration is initialized and the first task is returned.
2026 */
2027struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2028                                         struct cgroup_subsys_state **dst_cssp)
2029{
2030        tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2031        tset->cur_task = NULL;
2032
2033        return cgroup_taskset_next(tset, dst_cssp);
2034}
2035
2036/**
2037 * cgroup_taskset_next - iterate to the next task in taskset
2038 * @tset: taskset of interest
2039 * @dst_cssp: output variable for the destination css
2040 *
2041 * Return the next task in @tset.  Iteration must have been initialized
2042 * with cgroup_taskset_first().
2043 */
2044struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2045                                        struct cgroup_subsys_state **dst_cssp)
2046{
2047        struct css_set *cset = tset->cur_cset;
2048        struct task_struct *task = tset->cur_task;
2049
2050        while (&cset->mg_node != tset->csets) {
2051                if (!task)
2052                        task = list_first_entry(&cset->mg_tasks,
2053                                                struct task_struct, cg_list);
2054                else
2055                        task = list_next_entry(task, cg_list);
2056
2057                if (&task->cg_list != &cset->mg_tasks) {
2058                        tset->cur_cset = cset;
2059                        tset->cur_task = task;
2060
2061                        /*
2062                         * This function may be called both before and
2063                         * after cgroup_taskset_migrate().  The two cases
2064                         * can be distinguished by looking at whether @cset
2065                         * has its ->mg_dst_cset set.
2066                         */
2067                        if (cset->mg_dst_cset)
2068                                *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2069                        else
2070                                *dst_cssp = cset->subsys[tset->ssid];
2071
2072                        return task;
2073                }
2074
2075                cset = list_next_entry(cset, mg_node);
2076                task = NULL;
2077        }
2078
2079        return NULL;
2080}
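/*
 * Typical use by a controller's ->can_attach()/->attach() callback - a
 * sketch of the loop that the cgroup_taskset_for_each() helper in
 * linux/cgroup.h wraps around the two functions above:
 *
 *	struct task_struct *task;
 *	struct cgroup_subsys_state *dst_css;
 *
 *	task = cgroup_taskset_first(tset, &dst_css);
 *	while (task) {
 *		... per-task work against @dst_css ...
 *		task = cgroup_taskset_next(tset, &dst_css);
 *	}
 */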
2081
2082/**
2083 * cgroup_migrate_execute - migrate a taskset
2084 * @mgctx: migration context
2085 *
2086 * Migrate tasks in @mgctx as setup by migration preparation functions.
2087 * This function fails iff one of the ->can_attach callbacks fails and
2088 * guarantees that either all or none of the tasks in @mgctx are migrated.
2089 * @mgctx is consumed regardless of success.
2090 */
2091static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2092{
2093        struct cgroup_taskset *tset = &mgctx->tset;
2094        struct cgroup_subsys *ss;
2095        struct task_struct *task, *tmp_task;
2096        struct css_set *cset, *tmp_cset;
2097        int ssid, failed_ssid, ret;
2098
2099        /* check that we can legitimately attach to the cgroup */
2100        if (tset->nr_tasks) {
2101                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2102                        if (ss->can_attach) {
2103                                tset->ssid = ssid;
2104                                ret = ss->can_attach(tset);
2105                                if (ret) {
2106                                        failed_ssid = ssid;
2107                                        goto out_cancel_attach;
2108                                }
2109                        }
2110                } while_each_subsys_mask();
2111        }
2112
2113        /*
2114         * Now that we're guaranteed success, proceed to move all tasks to
2115         * the new cgroup.  There are no failure cases after here, so this
2116         * is the commit point.
2117         */
2118        spin_lock_irq(&css_set_lock);
2119        list_for_each_entry(cset, &tset->src_csets, mg_node) {
2120                list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2121                        struct css_set *from_cset = task_css_set(task);
2122                        struct css_set *to_cset = cset->mg_dst_cset;
2123
2124                        get_css_set(to_cset);
2125                        to_cset->nr_tasks++;
2126                        css_set_move_task(task, from_cset, to_cset, true);
2127                        put_css_set_locked(from_cset);
2128                        from_cset->nr_tasks--;
2129                }
2130        }
2131        spin_unlock_irq(&css_set_lock);
2132
2133        /*
2134         * Migration is committed, all target tasks are now on dst_csets.
2135         * Nothing is sensitive to fork() after this point.  Notify
2136         * controllers that migration is complete.
2137         */
2138        tset->csets = &tset->dst_csets;
2139
2140        if (tset->nr_tasks) {
2141                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2142                        if (ss->attach) {
2143                                tset->ssid = ssid;
2144                                ss->attach(tset);
2145                        }
2146                } while_each_subsys_mask();
2147        }
2148
2149        ret = 0;
2150        goto out_release_tset;
2151
2152out_cancel_attach:
2153        if (tset->nr_tasks) {
2154                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2155                        if (ssid == failed_ssid)
2156                                break;
2157                        if (ss->cancel_attach) {
2158                                tset->ssid = ssid;
2159                                ss->cancel_attach(tset);
2160                        }
2161                } while_each_subsys_mask();
2162        }
2163out_release_tset:
2164        spin_lock_irq(&css_set_lock);
2165        list_splice_init(&tset->dst_csets, &tset->src_csets);
2166        list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2167                list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2168                list_del_init(&cset->mg_node);
2169        }
2170        spin_unlock_irq(&css_set_lock);
2171        return ret;
2172}
2173
2174/**
2175 * cgroup_may_migrate_to - verify whether a cgroup can be a migration destination
2176 * @dst_cgrp: destination cgroup to test
2177 *
2178 * On the default hierarchy, except for the root, subtree_control must be
2179 * zero for migration destination cgroups with tasks so that child cgroups
2180 * don't compete against tasks.
2181 */
2182bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
2183{
2184        return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
2185                !dst_cgrp->subtree_control;
2186}
2187
2188/**
2189 * cgroup_migrate_finish - cleanup after attach
2190 * @mgctx: migration context
2191 *
2192 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
2193 * those functions for details.
2194 */
2195void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2196{
2197        LIST_HEAD(preloaded);
2198        struct css_set *cset, *tmp_cset;
2199
2200        lockdep_assert_held(&cgroup_mutex);
2201
2202        spin_lock_irq(&css_set_lock);
2203
2204        list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2205        list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2206
2207        list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2208                cset->mg_src_cgrp = NULL;
2209                cset->mg_dst_cgrp = NULL;
2210                cset->mg_dst_cset = NULL;
2211                list_del_init(&cset->mg_preload_node);
2212                put_css_set_locked(cset);
2213        }
2214
2215        spin_unlock_irq(&css_set_lock);
2216}
2217
2218/**
2219 * cgroup_migrate_add_src - add a migration source css_set
2220 * @src_cset: the source css_set to add
2221 * @dst_cgrp: the destination cgroup
2222 * @mgctx: migration context
2223 *
2224 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
2225 * @src_cset and add it to @mgctx->preloaded_src_csets, which should later
2226 * be cleaned up by cgroup_migrate_finish().
2227 *
2228 * This function may be called without holding cgroup_threadgroup_rwsem
2229 * even if the target is a process.  Threads may be created and destroyed
2230 * but as long as cgroup_mutex is not dropped, no new css_set can be put
2231 * into play and the preloaded css_sets are guaranteed to cover all
2232 * migrations.
2233 */
2234void cgroup_migrate_add_src(struct css_set *src_cset,
2235                            struct cgroup *dst_cgrp,
2236                            struct cgroup_mgctx *mgctx)
2237{
2238        struct cgroup *src_cgrp;
2239
2240        lockdep_assert_held(&cgroup_mutex);
2241        lockdep_assert_held(&css_set_lock);
2242
2243        /*
2244         * If ->dead, @src_cset is associated with one or more dead cgroups
2245         * and doesn't contain any migratable tasks.  Ignore it early so
2246         * that the rest of migration path doesn't get confused by it.
2247         */
2248        if (src_cset->dead)
2249                return;
2250
2251        src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2252
2253        if (!list_empty(&src_cset->mg_preload_node))
2254                return;
2255
2256        WARN_ON(src_cset->mg_src_cgrp);
2257        WARN_ON(src_cset->mg_dst_cgrp);
2258        WARN_ON(!list_empty(&src_cset->mg_tasks));
2259        WARN_ON(!list_empty(&src_cset->mg_node));
2260
2261        src_cset->mg_src_cgrp = src_cgrp;
2262        src_cset->mg_dst_cgrp = dst_cgrp;
2263        get_css_set(src_cset);
2264        list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2265}
2266
2267/**
2268 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2269 * @mgctx: migration context
2270 *
2271 * Tasks are about to be moved and all the source css_sets have been
2272 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
2273 * pins all destination css_sets, links each to its source, and appends them
2274 * to @mgctx->preloaded_dst_csets.
2275 *
2276 * This function must be called after cgroup_migrate_add_src() has been
2277 * called on each migration source css_set.  After migration is performed
2278 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2279 * @mgctx.
2280 */
2281int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2282{
2283        struct css_set *src_cset, *tmp_cset;
2284
2285        lockdep_assert_held(&cgroup_mutex);
2286
2287        /* look up the dst cset for each src cset and link it to src */
2288        list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2289                                 mg_preload_node) {
2290                struct css_set *dst_cset;
2291                struct cgroup_subsys *ss;
2292                int ssid;
2293
2294                dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2295                if (!dst_cset)
2296                        goto err;
2297
2298                WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2299
2300                /*
2301                 * If src cset equals dst, it's noop.  Drop the src.
2302                 * cgroup_migrate() will skip the cset too.  Note that we
2303                 * can't handle src == dst as some nodes are used by both.
2304                 */
2305                if (src_cset == dst_cset) {
2306                        src_cset->mg_src_cgrp = NULL;
2307                        src_cset->mg_dst_cgrp = NULL;
2308                        list_del_init(&src_cset->mg_preload_node);
2309                        put_css_set(src_cset);
2310                        put_css_set(dst_cset);
2311                        continue;
2312                }
2313
2314                src_cset->mg_dst_cset = dst_cset;
2315
2316                if (list_empty(&dst_cset->mg_preload_node))
2317                        list_add_tail(&dst_cset->mg_preload_node,
2318                                      &mgctx->preloaded_dst_csets);
2319                else
2320                        put_css_set(dst_cset);
2321
2322                for_each_subsys(ss, ssid)
2323                        if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
2324                                mgctx->ss_mask |= 1 << ssid;
2325        }
2326
2327        return 0;
2328err:
2329        cgroup_migrate_finish(mgctx);
2330        return -ENOMEM;
2331}
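/*
 * A minimal sketch of the whole prepare/migrate/cleanup sequence, mirroring
 * cgroup_attach_task() below (error handling trimmed; cgroup_mutex and, for
 * process moves, cgroup_threadgroup_rwsem are held by the caller):
 *
 *	DEFINE_CGROUP_MGCTX(mgctx);
 *	int ret;
 *
 *	spin_lock_irq(&css_set_lock);
 *	cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
 *	spin_unlock_irq(&css_set_lock);
 *
 *	ret = cgroup_migrate_prepare_dst(&mgctx);
 *	if (!ret)
 *		ret = cgroup_migrate(task, false, &mgctx);
 *	cgroup_migrate_finish(&mgctx);
 */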
2332
2333/**
2334 * cgroup_migrate - migrate a process or task to a cgroup
2335 * @leader: the leader of the process or the task to migrate
2336 * @threadgroup: whether @leader points to the whole process or a single task
2337 * @mgctx: migration context
2338 *
2339 * Migrate a process or task denoted by @leader.  If migrating a process,
2340 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
2341 * responsible for invoking cgroup_migrate_add_src() and
2342 * cgroup_migrate_prepare_dst() on the targets before invoking this
2343 * function and following up with cgroup_migrate_finish().
2344 *
2345 * As long as a controller's ->can_attach() doesn't fail, this function is
2346 * guaranteed to succeed.  This means that, excluding ->can_attach()
2347 * failure, when migrating multiple targets, the success or failure can be
2348 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
2349 * actually starting migrating.
2350 */
2351int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2352                   struct cgroup_mgctx *mgctx)
2353{
2354        struct task_struct *task;
2355
2356        /*
2357         * Prevent freeing of tasks while we take a snapshot. Tasks that are
2358         * already PF_EXITING could be freed from underneath us unless we
2359         * take an rcu_read_lock.
2360         */
2361        spin_lock_irq(&css_set_lock);
2362        rcu_read_lock();
2363        task = leader;
2364        do {
2365                cgroup_migrate_add_task(task, mgctx);
2366                if (!threadgroup)
2367                        break;
2368        } while_each_thread(leader, task);
2369        rcu_read_unlock();
2370        spin_unlock_irq(&css_set_lock);
2371
2372        return cgroup_migrate_execute(mgctx);
2373}
2374
2375/**
2376 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2377 * @dst_cgrp: the cgroup to attach to
2378 * @leader: the task or the leader of the threadgroup to be attached
2379 * @threadgroup: attach the whole threadgroup?
2380 *
2381 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
2382 */
2383int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2384                       bool threadgroup)
2385{
2386        DEFINE_CGROUP_MGCTX(mgctx);
2387        struct task_struct *task;
2388        int ret;
2389
2390        if (!cgroup_may_migrate_to(dst_cgrp))
2391                return -EBUSY;
2392
2393        /* look up all src csets */
2394        spin_lock_irq(&css_set_lock);
2395        rcu_read_lock();
2396        task = leader;
2397        do {
2398                cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2399                if (!threadgroup)
2400                        break;
2401        } while_each_thread(leader, task);
2402        rcu_read_unlock();
2403        spin_unlock_irq(&css_set_lock);
2404
2405        /* prepare dst csets and commit */
2406        ret = cgroup_migrate_prepare_dst(&mgctx);
2407        if (!ret)
2408                ret = cgroup_migrate(leader, threadgroup, &mgctx);
2409
2410        cgroup_migrate_finish(&mgctx);
2411
2412        if (!ret)
2413                trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
2414
2415        return ret;
2416}
2417
2418static int cgroup_procs_write_permission(struct task_struct *task,
2419                                         struct cgroup *dst_cgrp,
2420                                         struct kernfs_open_file *of)
2421{
2422        struct super_block *sb = of->file->f_path.dentry->d_sb;
2423        struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2424        struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
2425        struct cgroup *src_cgrp, *com_cgrp;
2426        struct inode *inode;
2427        int ret;
2428
2429        if (!cgroup_on_dfl(dst_cgrp)) {
2430                const struct cred *cred = current_cred();
2431                const struct cred *tcred = get_task_cred(task);
2432
2433                /*
2434                 * even if we're attaching all tasks in the thread group,
2435                 * we only need to check permissions on one of them.
2436                 */
2437                if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
2438                    uid_eq(cred->euid, tcred->uid) ||
2439                    uid_eq(cred->euid, tcred->suid))
2440                        ret = 0;
2441                else
2442                        ret = -EACCES;
2443
2444                put_cred(tcred);
2445                return ret;
2446        }
2447
2448        /* find the source cgroup */
2449        spin_lock_irq(&css_set_lock);
2450        src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2451        spin_unlock_irq(&css_set_lock);
2452
2453        /* and the common ancestor */
2454        com_cgrp = src_cgrp;
2455        while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
2456                com_cgrp = cgroup_parent(com_cgrp);
2457
2458        /* %current should be authorized to migrate to the common ancestor */
2459        inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
2460        if (!inode)
2461                return -ENOMEM;
2462
2463        ret = inode_permission(inode, MAY_WRITE);
2464        iput(inode);
2465        if (ret)
2466                return ret;
2467
2468        /*
2469         * If namespaces are delegation boundaries, %current must be able
2470         * to see both source and destination cgroups from its namespace.
2471         */
2472        if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
2473            (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
2474             !cgroup_is_descendant(dst_cgrp, root_cgrp)))
2475                return -ENOENT;
2476
2477        return 0;
2478}
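/*
 * Delegation example (illustrative paths): on the default hierarchy, a
 * write such as
 *
 *	echo $PID > /sys/fs/cgroup/b/cgroup.procs
 *
 * moving a task currently in /sys/fs/cgroup/a is permitted only if, besides
 * write access to b's cgroup.procs itself, the writer can also write to
 * cgroup.procs of the common ancestor of a and b (here the root), and, with
 * nsdelegate, both cgroups are visible from the writer's cgroup namespace.
 */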
2479
2480/*
2481 * Find the task_struct of the task to attach by vpid and pass it along to the
2482 * function to attach either it or all tasks in its threadgroup. Will lock
2483 * cgroup_mutex and threadgroup.
2484 */
2485ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2486                             size_t nbytes, loff_t off, bool threadgroup)
2487{
2488        struct task_struct *tsk;
2489        struct cgroup_subsys *ss;
2490        struct cgroup *cgrp;
2491        pid_t pid;
2492        int ssid, ret;
2493
2494        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2495                return -EINVAL;
2496
2497        cgrp = cgroup_kn_lock_live(of->kn, false);
2498        if (!cgrp)
2499                return -ENODEV;
2500
2501        percpu_down_write(&cgroup_threadgroup_rwsem);
2502        rcu_read_lock();
2503        if (pid) {
2504                tsk = find_task_by_vpid(pid);
2505                if (!tsk) {
2506                        ret = -ESRCH;
2507                        goto out_unlock_rcu;
2508                }
2509        } else {
2510                tsk = current;
2511        }
2512
2513        if (threadgroup)
2514                tsk = tsk->group_leader;
2515
2516        /*
2517         * kthreads may acquire PF_NO_SETAFFINITY during initialization.
2518         * If userland migrates such a kthread to a non-root cgroup, it can
2519         * become trapped in a cpuset, or an RT kthread may be born in a
2520         * cgroup with no rt_runtime allocated.  Just say no.
2521         */
2522        if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2523                ret = -EINVAL;
2524                goto out_unlock_rcu;
2525        }
2526
2527        get_task_struct(tsk);
2528        rcu_read_unlock();
2529
2530        ret = cgroup_procs_write_permission(tsk, cgrp, of);
2531        if (!ret)
2532                ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2533
2534        put_task_struct(tsk);
2535        goto out_unlock_threadgroup;
2536
2537out_unlock_rcu:
2538        rcu_read_unlock();
2539out_unlock_threadgroup:
2540        percpu_up_write(&cgroup_threadgroup_rwsem);
2541        for_each_subsys(ss, ssid)
2542                if (ss->post_attach)
2543                        ss->post_attach();
2544        cgroup_kn_unlock(of->kn);
2545        return ret ?: nbytes;
2546}
2547
2548ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
2549                           loff_t off)
2550{
2551        return __cgroup_procs_write(of, buf, nbytes, off, true);
2552}
2553
2554static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2555{
2556        struct cgroup_subsys *ss;
2557        bool printed = false;
2558        int ssid;
2559
2560        do_each_subsys_mask(ss, ssid, ss_mask) {
2561                if (printed)
2562                        seq_putc(seq, ' ');
2563                seq_printf(seq, "%s", ss->name);
2564                printed = true;
2565        } while_each_subsys_mask();
2566        if (printed)
2567                seq_putc(seq, '\n');
2568}
2569
2570/* show controllers which are enabled from the parent */
2571static int cgroup_controllers_show(struct seq_file *seq, void *v)
2572{
2573        struct cgroup *cgrp = seq_css(seq)->cgroup;
2574
2575        cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2576        return 0;
2577}
2578
2579/* show controllers which are enabled for a given cgroup's children */
2580static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2581{
2582        struct cgroup *cgrp = seq_css(seq)->cgroup;
2583
2584        cgroup_print_ss_mask(seq, cgrp->subtree_control);
2585        return 0;
2586}
2587
2588/**
2589 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2590 * @cgrp: root of the subtree to update csses for
2591 *
2592 * @cgrp's control masks have changed and its subtree's css associations
2593 * need to be updated accordingly.  This function looks up all css_sets
2594 * which are attached to the subtree, creates the matching updated css_sets
2595 * and migrates the tasks to the new ones.
2596 */
2597static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2598{
2599        DEFINE_CGROUP_MGCTX(mgctx);
2600        struct cgroup_subsys_state *d_css;
2601        struct cgroup *dsct;
2602        struct css_set *src_cset;
2603        int ret;
2604
2605        lockdep_assert_held(&cgroup_mutex);
2606
2607        percpu_down_write(&cgroup_threadgroup_rwsem);
2608
2609        /* look up all csses currently attached to @cgrp's subtree */
2610        spin_lock_irq(&css_set_lock);
2611        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2612                struct cgrp_cset_link *link;
2613
2614                list_for_each_entry(link, &dsct->cset_links, cset_link)
2615                        cgroup_migrate_add_src(link->cset, dsct, &mgctx);
2616        }
2617        spin_unlock_irq(&css_set_lock);
2618
2619        /* NULL dst indicates self on default hierarchy */
2620        ret = cgroup_migrate_prepare_dst(&mgctx);
2621        if (ret)
2622                goto out_finish;
2623
2624        spin_lock_irq(&css_set_lock);
2625        list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
2626                struct task_struct *task, *ntask;
2627
2628                /* all tasks in src_csets need to be migrated */
2629                list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2630                        cgroup_migrate_add_task(task, &mgctx);
2631        }
2632        spin_unlock_irq(&css_set_lock);
2633
2634        ret = cgroup_migrate_execute(&mgctx);
2635out_finish:
2636        cgroup_migrate_finish(&mgctx);
2637        percpu_up_write(&cgroup_threadgroup_rwsem);
2638        return ret;
2639}
2640
2641/**
2642 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
2643 * @cgrp: root of the target subtree
2644 *
2645 * Because css offlining is asynchronous, userland may try to re-enable a
2646 * controller while the previous css is still around.  This function grabs
2647 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
2648 */
2649void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
2650        __acquires(&cgroup_mutex)
2651{
2652        struct cgroup *dsct;
2653        struct cgroup_subsys_state *d_css;
2654        struct cgroup_subsys *ss;
2655        int ssid;
2656
2657restart:
2658        mutex_lock(&cgroup_mutex);
2659
2660        cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2661                for_each_subsys(ss, ssid) {
2662                        struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2663                        DEFINE_WAIT(wait);
2664
2665                        if (!css || !percpu_ref_is_dying(&css->refcnt))
2666                                continue;
2667
2668                        cgroup_get_live(dsct);
2669                        prepare_to_wait(&dsct->offline_waitq, &wait,
2670                                        TASK_UNINTERRUPTIBLE);
2671
2672                        mutex_unlock(&cgroup_mutex);
2673                        schedule();
2674                        finish_wait(&dsct->offline_waitq, &wait);
2675
2676                        cgroup_put(dsct);
2677                        goto restart;
2678                }
2679        }
2680}
2681
2682/**
2683 * cgroup_save_control - save control masks of a subtree
2684 * @cgrp: root of the target subtree
2685 *
2686 * Save ->subtree_control and ->subtree_ss_mask to the respective old_
2687 * prefixed fields for @cgrp's subtree including @cgrp itself.
2688 */
2689static void cgroup_save_control(struct cgroup *cgrp)
2690{
2691        struct cgroup *dsct;
2692        struct cgroup_subsys_state *d_css;
2693
2694        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2695                dsct->old_subtree_control = dsct->subtree_control;
2696                dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
2697        }
2698}
2699
2700/**
2701 * cgroup_propagate_control - refresh control masks of a subtree
2702 * @cgrp: root of the target subtree
2703 *
2704 * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
2705 * ->subtree_control and propagate controller availability through the
2706 * subtree so that descendants don't have unavailable controllers enabled.
2707 */
2708static void cgroup_propagate_control(struct cgroup *cgrp)
2709{
2710        struct cgroup *dsct;
2711        struct cgroup_subsys_state *d_css;
2712
2713        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2714                dsct->subtree_control &= cgroup_control(dsct);
2715                dsct->subtree_ss_mask =
2716                        cgroup_calc_subtree_ss_mask(dsct->subtree_control,
2717                                                    cgroup_ss_mask(dsct));
2718        }
2719}
2720
2721/**
2722 * cgroup_restore_control - restore control masks of a subtree
2723 * @cgrp: root of the target subtree
2724 *
2725 * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
2726 * prefixed fields for @cgrp's subtree including @cgrp itself.
2727 */
2728static void cgroup_restore_control(struct cgroup *cgrp)
2729{
2730        struct cgroup *dsct;
2731        struct cgroup_subsys_state *d_css;
2732
2733        cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2734                dsct->subtree_control = dsct->old_subtree_control;
2735                dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
2736        }
2737}
2738
2739static bool css_visible(struct cgroup_subsys_state *css)
2740{
2741        struct cgroup_subsys *ss = css->ss;
2742        struct cgroup *cgrp = css->cgroup;
2743
2744        if (cgroup_control(cgrp) & (1 << ss->id))
2745                return true;
2746        if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
2747                return false;
2748        return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
2749}
2750
2751/**
2752 * cgroup_apply_control_enable - enable or show csses according to control
2753 * @cgrp: root of the target subtree
2754 *
2755 * Walk @cgrp's subtree and create new csses or make the existing ones
2756 * visible.  A css is created invisible if it's being implicitly enabled
2757 * through dependency.  An invisible css is made visible when the userland
2758 * explicitly enables it.
2759 *
2760 * Returns 0 on success, -errno on failure.  On failure, csses which have
2761 * been processed already aren't cleaned up.  The caller is responsible for
2762 * cleaning up with cgroup_apply_control_disable().
2763 */
2764static int cgroup_apply_control_enable(struct cgroup *cgrp)
2765{
2766        struct cgroup *dsct;
2767        struct cgroup_subsys_state *d_css;
2768        struct cgroup_subsys *ss;
2769        int ssid, ret;
2770
2771        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
2772                for_each_subsys(ss, ssid) {
2773                        struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2774
2775                        WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
2776
2777                        if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
2778                                continue;
2779
2780                        if (!css) {
2781                                css = css_create(dsct, ss);
2782                                if (IS_ERR(css))
2783                                        return PTR_ERR(css);
2784                        }
2785
2786                        if (css_visible(css)) {
2787                                ret = css_populate_dir(css);
2788                                if (ret)
2789                                        return ret;
2790                        }
2791                }
2792        }
2793
2794        return 0;
2795}
2796
2797/**
2798 * cgroup_apply_control_disable - kill or hide csses according to control
2799 * @cgrp: root of the target subtree
2800 *
2801 * Walk @cgrp's subtree and kill and hide csses so that they match
2802 * cgroup_ss_mask() and css_visible().
2803 *
2804 * A css is hidden when the userland requests it to be disabled while other
2805 * subsystems are still depending on it.  The css must not actively control
2806 * resources and must be in the vanilla state if it's made visible again later.
2807 * Controllers which may be depended upon should provide ->css_reset() for
2808 * this purpose.
2809 */
2810static void cgroup_apply_control_disable(struct cgroup *cgrp)
2811{
2812        struct cgroup *dsct;
2813        struct cgroup_subsys_state *d_css;
2814        struct cgroup_subsys *ss;
2815        int ssid;
2816
2817        cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
2818                for_each_subsys(ss, ssid) {
2819                        struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2820
2821                        WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
2822
2823                        if (!css)
2824                                continue;
2825
2826                        if (css->parent &&
2827                            !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
2828                                kill_css(css);
2829                        } else if (!css_visible(css)) {
2830                                css_clear_dir(css);
2831                                if (ss->css_reset)
2832                                        ss->css_reset(css);
2833                        }
2834                }
2835        }
2836}
2837
2838/**
2839 * cgroup_apply_control - apply control mask updates to the subtree
2840 * @cgrp: root of the target subtree
2841 *
2842 * subsystems can be enabled and disabled in a subtree using the following
2843 * steps.
2844 *
2845 * 1. Call cgroup_save_control() to stash the current state.
2846 * 2. Update ->subtree_control masks in the subtree as desired.
2847 * 3. Call cgroup_apply_control() to apply the changes.
2848 * 4. Optionally perform other related operations.
2849 * 5. Call cgroup_finalize_control() to finish up.
2850 *
2851 * This function implements step 3 and propagates the mask changes
2852 * throughout @cgrp's subtree, updates csses accordingly and perform
2853 * throughout @cgrp's subtree, updates csses accordingly and performs
2854 * process migrations.
 */
2855static int cgroup_apply_control(struct cgroup *cgrp)
2856{
2857        int ret;
2858
2859        cgroup_propagate_control(cgrp);
2860
2861        ret = cgroup_apply_control_enable(cgrp);
2862        if (ret)
2863                return ret;
2864
2865        /*
2866         * At this point, cgroup_e_css() results reflect the new csses
2867         * making the following cgroup_update_dfl_csses() properly update
2868         * css associations of all tasks in the subtree.
2869         */
2870        ret = cgroup_update_dfl_csses(cgrp);
2871        if (ret)
2872                return ret;
2873
2874        return 0;
2875}
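/*
 * A compact sketch of steps 1-5 above as performed by
 * cgroup_subtree_control_write() below (error handling trimmed):
 *
 *	cgroup_save_control(cgrp);
 *
 *	cgrp->subtree_control |= enable;
 *	cgrp->subtree_control &= ~disable;
 *
 *	ret = cgroup_apply_control(cgrp);
 *	cgroup_finalize_control(cgrp, ret);
 */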
2876
2877/**
2878 * cgroup_finalize_control - finalize control mask update
2879 * @cgrp: root of the target subtree
2880 * @ret: the result of the update
2881 *
2882 * Finalize control mask update.  See cgroup_apply_control() for more info.
2883 */
2884static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
2885{
2886        if (ret) {
2887                cgroup_restore_control(cgrp);
2888                cgroup_propagate_control(cgrp);
2889        }
2890
2891        cgroup_apply_control_disable(cgrp);
2892}
2893
2894/* change the enabled child controllers for a cgroup in the default hierarchy */
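/*
 * From userspace this corresponds to writes such as (controller names and
 * mount point illustrative):
 *
 *	echo "+memory -io" > /sys/fs/cgroup/parent/cgroup.subtree_control
 *
 * which enables the memory controller and disables the io controller for
 * the children of "parent".
 */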
2895static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2896                                            char *buf, size_t nbytes,
2897                                            loff_t off)
2898{
2899        u16 enable = 0, disable = 0;
2900        struct cgroup *cgrp, *child;
2901        struct cgroup_subsys *ss;
2902        char *tok;
2903        int ssid, ret;
2904
2905        /*
2906         * Parse input - space separated list of subsystem names prefixed
2907         * with either + or -.
2908         */
2909        buf = strstrip(buf);
2910        while ((tok = strsep(&buf, " "))) {
2911                if (tok[0] == '\0')
2912                        continue;
2913                do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
2914                        if (!cgroup_ssid_enabled(ssid) ||
2915                            strcmp(tok + 1, ss->name))
2916                                continue;
2917
2918                        if (*tok == '+') {
2919                                enable |= 1 << ssid;
2920                                disable &= ~(1 << ssid);
2921                        } else if (*tok == '-') {
2922                                disable |= 1 << ssid;
2923                                enable &= ~(1 << ssid);
2924                        } else {
2925                                return -EINVAL;
2926                        }
2927                        break;
2928                } while_each_subsys_mask();
2929                if (ssid == CGROUP_SUBSYS_COUNT)
2930                        return -EINVAL;
2931        }
2932
2933        cgrp = cgroup_kn_lock_live(of->kn, true);
2934        if (!cgrp)
2935                return -ENODEV;
2936
2937        for_each_subsys(ss, ssid) {
2938                if (enable & (1 << ssid)) {
2939                        if (cgrp->subtree_control & (1 << ssid)) {
2940                                enable &= ~(1 << ssid);
2941                                continue;
2942                        }
2943
2944                        if (!(cgroup_control(cgrp) & (1 << ssid))) {
2945                                ret = -ENOENT;
2946                                goto out_unlock;
2947                        }
2948                } else if (disable & (1 << ssid)) {
2949                        if (!(cgrp->subtree_control & (1 << ssid))) {
2950                                disable &= ~(1 << ssid);
2951                                continue;
2952                        }
2953
2954                        /* a child has it enabled? */
2955                        cgroup_for_each_live_child(child, cgrp) {
2956                                if (child->subtree_control & (1 << ssid)) {
2957                                        ret = -EBUSY;
2958                                        goto out_unlock;
2959                                }
2960                        }
2961                }
2962        }
2963
2964        if (!enable && !disable) {
2965                ret = 0;
2966                goto out_unlock;
2967        }
2968
2969        /*
2970         * Except for the root, subtree_control must be zero for a cgroup
2971         * with tasks so that child cgroups don't compete against tasks.
2972         */
2973        if (enable && cgroup_parent(cgrp)) {
2974                struct cgrp_cset_link *link;
2975
2976                /*
2977                 * Because namespaces pin csets too, @cgrp->cset_links
2978                 * might not be empty even when @cgrp is empty.  Walk and
2979                 * verify each cset.
2980                 */
2981                spin_lock_irq(&css_set_lock);
2982
2983                ret = 0;
2984                list_for_each_entry(link, &cgrp->cset_links, cset_link) {
2985                        if (css_set_populated(link->cset)) {
2986                                ret = -EBUSY;
2987                                break;
2988                        }
2989                }
2990
2991                spin_unlock_irq(&css_set_lock);
2992
2993                if (ret)
2994                        goto out_unlock;
2995        }
2996
2997        /* save and update control masks and prepare csses */
2998        cgroup_save_control(cgrp);
2999
3000        cgrp->subtree_control |= enable;
3001        cgrp->subtree_control &= ~disable;
3002
3003        ret = cgroup_apply_control(cgrp);
3004        cgroup_finalize_control(cgrp, ret);
3005        if (ret)
3006                goto out_unlock;
3007
3008        kernfs_activate(cgrp->kn);
3009out_unlock:
3010        cgroup_kn_unlock(of->kn);
3011        return ret ?: nbytes;
3012}
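
/*
 * Illustration only: the handler above consumes a space separated list
 * of "+controller"/"-controller" tokens.  A minimal userspace sketch of
 * driving it, assuming a v2 hierarchy mounted at /sys/fs/cgroup with an
 * existing child group "mygrp" (both paths are assumptions, not
 * dictated by this file).
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int example_enable_memory_for_children(void)
{
        const char *req = "+memory -pids";      /* parsed token by token above */
        int fd, ret;

        fd = open("/sys/fs/cgroup/mygrp/cgroup.subtree_control", O_WRONLY);
        if (fd < 0)
                return -1;
        ret = write(fd, req, strlen(req)) < 0 ? -1 : 0;
        close(fd);
        return ret;
}
#endif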
3013
3014static int cgroup_events_show(struct seq_file *seq, void *v)
3015{
3016        seq_printf(seq, "populated %d\n",
3017                   cgroup_is_populated(seq_css(seq)->cgroup));
3018        return 0;
3019}
3020
3021static int cgroup_file_open(struct kernfs_open_file *of)
3022{
3023        struct cftype *cft = of->kn->priv;
3024
3025        if (cft->open)
3026                return cft->open(of);
3027        return 0;
3028}
3029
3030static void cgroup_file_release(struct kernfs_open_file *of)
3031{
3032        struct cftype *cft = of->kn->priv;
3033
3034        if (cft->release)
3035                cft->release(of);
3036}
3037
3038static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3039                                 size_t nbytes, loff_t off)
3040{
3041        struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3042        struct cgroup *cgrp = of->kn->parent->priv;
3043        struct cftype *cft = of->kn->priv;
3044        struct cgroup_subsys_state *css;
3045        int ret;
3046
3047        /*
3048         * If namespaces are delegation boundaries, disallow writes to
3049         * files in a non-init namespace root from inside the namespace
3050         * except for the files explicitly marked delegatable -
3051         * cgroup.procs and cgroup.subtree_control.
3052         */
3053        if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3054            !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3055            ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3056                return -EPERM;
3057
3058        if (cft->write)
3059                return cft->write(of, buf, nbytes, off);
3060
3061        /*
3062         * kernfs guarantees that a file isn't deleted with operations in
3063         * flight, which means that the matching css is and stays alive and
3064         * doesn't need to be pinned.  The RCU locking is not necessary
3065         * either.  It's just for the convenience of using cgroup_css().
3066         */
3067        rcu_read_lock();
3068        css = cgroup_css(cgrp, cft->ss);
3069        rcu_read_unlock();
3070
3071        if (cft->write_u64) {
3072                unsigned long long v;
3073                ret = kstrtoull(buf, 0, &v);
3074                if (!ret)
3075                        ret = cft->write_u64(css, cft, v);
3076        } else if (cft->write_s64) {
3077                long long v;
3078                ret = kstrtoll(buf, 0, &v);
3079                if (!ret)
3080                        ret = cft->write_s64(css, cft, v);
3081        } else {
3082                ret = -EINVAL;
3083        }
3084
3085        return ret ?: nbytes;
3086}
3087
3088static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3089{
3090        return seq_cft(seq)->seq_start(seq, ppos);
3091}
3092
3093static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3094{
3095        return seq_cft(seq)->seq_next(seq, v, ppos);
3096}
3097
3098static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3099{
3100        if (seq_cft(seq)->seq_stop)
3101                seq_cft(seq)->seq_stop(seq, v);
3102}
3103
3104static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3105{
3106        struct cftype *cft = seq_cft(m);
3107        struct cgroup_subsys_state *css = seq_css(m);
3108
3109        if (cft->seq_show)
3110                return cft->seq_show(m, arg);
3111
3112        if (cft->read_u64)
3113                seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3114        else if (cft->read_s64)
3115                seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3116        else
3117                return -EINVAL;
3118        return 0;
3119}
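
/*
 * Illustration only: a minimal controller-side cftype wired to the
 * ->read_u64/->write_u64 paths dispatched by cgroup_seqfile_show() and
 * cgroup_file_write() above.  The example_* identifiers and the global
 * example_limit are hypothetical; a real controller would keep such
 * state per css and typically register the array through its
 * cgroup_subsys ->dfl_cftypes/->legacy_cftypes.
 */
#if 0
static u64 example_limit;               /* hypothetical controller state */

static u64 example_limit_read(struct cgroup_subsys_state *css,
                              struct cftype *cft)
{
        return example_limit;           /* printed as "%llu\n" by seq_show */
}

static int example_limit_write(struct cgroup_subsys_state *css,
                               struct cftype *cft, u64 val)
{
        example_limit = val;            /* @val already parsed by kstrtoull() */
        return 0;
}

static struct cftype example_files[] = {
        {
                .name = "limit",
                .read_u64 = example_limit_read,
                .write_u64 = example_limit_write,
        },
        { }     /* terminate */
};
#endif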
3120
3121static struct kernfs_ops cgroup_kf_single_ops = {
3122        .atomic_write_len       = PAGE_SIZE,
3123        .open                   = cgroup_file_open,
3124        .release                = cgroup_file_release,
3125        .write                  = cgroup_file_write,
3126        .seq_show               = cgroup_seqfile_show,
3127};
3128
3129static struct kernfs_ops cgroup_kf_ops = {
3130        .atomic_write_len       = PAGE_SIZE,
3131        .open                   = cgroup_file_open,
3132        .release                = cgroup_file_release,
3133        .write                  = cgroup_file_write,
3134        .seq_start              = cgroup_seqfile_start,
3135        .seq_next               = cgroup_seqfile_next,
3136        .seq_stop               = cgroup_seqfile_stop,
3137        .seq_show               = cgroup_seqfile_show,
3138};
3139
3140/* set uid and gid of cgroup dirs and files to those of the creator */
3141static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3142{
3143        struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3144                               .ia_uid = current_fsuid(),
3145                               .ia_gid = current_fsgid(), };
3146
3147        if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3148            gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3149                return 0;
3150
3151        return kernfs_setattr(kn, &iattr);
3152}
3153
3154static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3155                           struct cftype *cft)
3156{
3157        char name[CGROUP_FILE_NAME_MAX];
3158        struct kernfs_node *kn;
3159        struct lock_class_key *key = NULL;
3160        int ret;
3161
3162#ifdef CONFIG_DEBUG_LOCK_ALLOC
3163        key = &cft->lockdep_key;
3164#endif
3165        kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3166                                  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
3167                                  NULL, key);
3168        if (IS_ERR(kn))
3169                return PTR_ERR(kn);
3170
3171        ret = cgroup_kn_set_ugid(kn);
3172        if (ret) {
3173                kernfs_remove(kn);
3174                return ret;
3175        }
3176
3177        if (cft->file_offset) {
3178                struct cgroup_file *cfile = (void *)css + cft->file_offset;
3179
3180                spin_lock_irq(&cgroup_file_kn_lock);
3181                cfile->kn = kn;
3182                spin_unlock_irq(&cgroup_file_kn_lock);
3183        }
3184
3185        return 0;
3186}
3187
3188/**
3189 * cgroup_addrm_files - add or remove files in a cgroup directory
3190 * @css: the target css
3191 * @cgrp: the target cgroup (usually css->cgroup)
3192 * @cfts: array of cftypes to be added
3193 * @is_add: whether to add or remove
3194 *
3195 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
3196 * For removals, this function never fails.
3197 */
3198static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3199                              struct cgroup *cgrp, struct cftype cfts[],
3200                              bool is_add)
3201{
3202        struct cftype *cft, *cft_end = NULL;
3203        int ret = 0;
3204
3205        lockdep_assert_held(&cgroup_mutex);
3206
3207restart:
3208        for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3209                /* does cft->flags tell us to skip this file on @cgrp? */
3210                if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3211                        continue;
3212                if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3213                        continue;
3214                if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3215                        continue;
3216                if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3217                        continue;
3218
3219                if (is_add) {
3220                        ret = cgroup_add_file(css, cgrp, cft);
3221                        if (ret) {
3222                                pr_warn("%s: failed to add %s, err=%d\n",
3223                                        __func__, cft->name, ret);
3224                                cft_end = cft;
3225                                is_add = false;
3226                                goto restart;
3227                        }
3228                } else {
3229                        cgroup_rm_file(cgrp, cft);
3230                }
3231        }
3232        return ret;
3233}
3234
3235static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3236{
3237        LIST_HEAD(pending);
3238        struct cgroup_subsys *ss = cfts[0].ss;
3239        struct cgroup *root = &ss->root->cgrp;
3240        struct cgroup_subsys_state *css;
3241        int ret = 0;
3242
3243        lockdep_assert_held(&cgroup_mutex);
3244
3245        /* add/rm files for all cgroups created before */
3246        css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3247                struct cgroup *cgrp = css->cgroup;
3248
3249                if (!(css->flags & CSS_VISIBLE))
3250                        continue;
3251
3252                ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3253                if (ret)
3254                        break;
3255        }
3256
3257        if (is_add && !ret)
3258                kernfs_activate(root->kn);
3259        return ret;
3260}
3261
3262static void cgroup_exit_cftypes(struct cftype *cfts)
3263{
3264        struct cftype *cft;
3265
3266        for (cft = cfts; cft->name[0] != '\0'; cft++) {
3267                /* free copy for custom atomic_write_len, see init_cftypes() */
3268                if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3269                        kfree(cft->kf_ops);
3270                cft->kf_ops = NULL;
3271                cft->ss = NULL;
3272
3273                /* revert flags set by cgroup core while adding @cfts */
3274                cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3275        }
3276}
3277
3278static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3279{
3280        struct cftype *cft;
3281
3282        for (cft = cfts; cft->name[0] != '\0'; cft++) {
3283                struct kernfs_ops *kf_ops;
3284
3285                WARN_ON(cft->ss || cft->kf_ops);
3286
3287                if (cft->seq_start)
3288                        kf_ops = &cgroup_kf_ops;
3289                else
3290                        kf_ops = &cgroup_kf_single_ops;
3291
3292                /*
3293                 * Ugh... if @cft wants a custom max_write_len, we need to
3294                 * make a copy of kf_ops to set its atomic_write_len.
3295                 */
3296                if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3297                        kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3298                        if (!kf_ops) {
3299                                cgroup_exit_cftypes(cfts);
3300                                return -ENOMEM;
3301                        }
3302                        kf_ops->atomic_write_len = cft->max_write_len;
3303                }
3304
3305                cft->kf_ops = kf_ops;
3306                cft->ss = ss;
3307        }
3308
3309        return 0;
3310}
3311
3312static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3313{
3314        lockdep_assert_held(&cgroup_mutex);
3315
3316        if (!cfts || !cfts[0].ss)
3317                return -ENOENT;
3318
3319        list_del(&cfts->node);
3320        cgroup_apply_cftypes(cfts, false);
3321        cgroup_exit_cftypes(cfts);
3322        return 0;
3323}
3324
3325/**
3326 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
3327 * @cfts: zero-length name terminated array of cftypes
3328 *
3329 * Unregister @cfts.  Files described by @cfts are removed from all
3330 * existing cgroups and all future cgroups won't have them either.  This
3331 * function can be called anytime whether @cfts' subsys is attached or not.
3332 *
3333 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
3334 * registered.
3335 */
3336int cgroup_rm_cftypes(struct cftype *cfts)
3337{
3338        int ret;
3339
3340        mutex_lock(&cgroup_mutex);
3341        ret = cgroup_rm_cftypes_locked(cfts);
3342        mutex_unlock(&cgroup_mutex);
3343        return ret;
3344}
3345
3346/**
3347 * cgroup_add_cftypes - add an array of cftypes to a subsystem
3348 * @ss: target cgroup subsystem
3349 * @cfts: zero-length name terminated array of cftypes
3350 *
3351 * Register @cfts to @ss.  Files described by @cfts are created for all
3352 * existing cgroups to which @ss is attached and all future cgroups will
3353 * have them too.  This function can be called anytime whether @ss is
3354 * attached or not.
3355 *
3356 * Returns 0 on successful registration, -errno on failure.  Note that this
3357 * function currently returns 0 as long as @cfts registration is successful
3358 * even if some file creation attempts on existing cgroups fail.
3359 */
3360static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3361{
3362        int ret;
3363
3364        if (!cgroup_ssid_enabled(ss->id))
3365                return 0;
3366
3367        if (!cfts || cfts[0].name[0] == '\0')
3368                return 0;
3369
3370        ret = cgroup_init_cftypes(ss, cfts);
3371        if (ret)
3372                return ret;
3373
3374        mutex_lock(&cgroup_mutex);
3375
3376        list_add_tail(&cfts->node, &ss->cfts);
3377        ret = cgroup_apply_cftypes(cfts, true);
3378        if (ret)
3379                cgroup_rm_cftypes_locked(cfts);
3380
3381        mutex_unlock(&cgroup_mutex);
3382        return ret;
3383}
3384
3385/**
3386 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
3387 * @ss: target cgroup subsystem
3388 * @cfts: zero-length name terminated array of cftypes
3389 *
3390 * Similar to cgroup_add_cftypes() but the added files are only used for
3391 * the default hierarchy.
3392 */
3393int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3394{
3395        struct cftype *cft;
3396
3397        for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3398                cft->flags |= __CFTYPE_ONLY_ON_DFL;
3399        return cgroup_add_cftypes(ss, cfts);
3400}
3401
3402/**
3403 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
3404 * @ss: target cgroup subsystem
3405 * @cfts: zero-length name terminated array of cftypes
3406 *
3407 * Similar to cgroup_add_cftypes() but the added files are only used for
3408 * the legacy hierarchies.
3409 */
3410int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3411{
3412        struct cftype *cft;
3413
3414        for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3415                cft->flags |= __CFTYPE_NOT_ON_DFL;
3416        return cgroup_add_cftypes(ss, cfts);
3417}
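
/*
 * Illustration only: a sketch of registering one cftype array for both
 * hierarchy types using the two helpers above.  example_cgrp_subsys and
 * example_files are hypothetical; in-tree controllers usually just set
 * ->dfl_cftypes/->legacy_cftypes in their cgroup_subsys and let
 * cgroup_init() perform the registration.
 */
#if 0
static int __init example_register_cftypes(void)
{
        int ret;

        /* example_cgrp_subsys / example_files are hypothetical */
        ret = cgroup_add_dfl_cftypes(&example_cgrp_subsys, example_files);
        if (ret)
                return ret;
        return cgroup_add_legacy_cftypes(&example_cgrp_subsys, example_files);
}
#endif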
3418
3419/**
3420 * cgroup_file_notify - generate a file modified event for a cgroup_file
3421 * @cfile: target cgroup_file
3422 *
3423 * @cfile must have been obtained by setting cftype->file_offset.
3424 */
3425void cgroup_file_notify(struct cgroup_file *cfile)
3426{
3427        unsigned long flags;
3428
3429        spin_lock_irqsave(&cgroup_file_kn_lock, flags);
3430        if (cfile->kn)
3431                kernfs_notify(cfile->kn);
3432        spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
3433}
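
/*
 * Illustration only: how a controller might use cgroup_file_notify().
 * The struct and helper below are hypothetical; the pattern assumes the
 * corresponding cftype set .file_offset = offsetof(struct example_css,
 * events_file) with the css embedded first, so that the offset is taken
 * relative to the css as done in cgroup_add_file() above.
 */
#if 0
struct example_css {
        struct cgroup_subsys_state css;         /* must come first here */
        struct cgroup_file events_file;         /* bound via ->file_offset */
};

static void example_signal_event(struct example_css *ecss)
{
        /* wakes up poll/inotify waiters on the interface file */
        cgroup_file_notify(&ecss->events_file);
}
#endif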
3434
3435/**
3436 * css_next_child - find the next child of a given css
3437 * @pos: the current position (%NULL to initiate traversal)
3438 * @parent: css whose children to walk
3439 *
3440 * This function returns the next child of @parent and should be called
3441 * under either cgroup_mutex or RCU read lock.  The only requirement is
3442 * that @parent and @pos are accessible.  The next sibling is guaranteed to
3443 * be returned regardless of their states.
3444 *
3445 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3446 * css which finished ->css_online() is guaranteed to be visible in the
3447 * future iterations and will stay visible until the last reference is put.
3448 * A css which hasn't finished ->css_online() or already finished
3449 * ->css_offline() may show up during traversal.  It's each subsystem's
3450 * responsibility to synchronize against on/offlining.
3451 */
3452struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
3453                                           struct cgroup_subsys_state *parent)
3454{
3455        struct cgroup_subsys_state *next;
3456
3457        cgroup_assert_mutex_or_rcu_locked();
3458
3459        /*
3460         * @pos could already have been unlinked from the sibling list.
3461         * Once a cgroup is removed, its ->sibling.next is no longer
3462         * updated when its next sibling changes.  CSS_RELEASED is set when
3463         * @pos is taken off list, at which time its next pointer is valid,
3464         * and, as releases are serialized, the one pointed to by the next
3465         * pointer is guaranteed to not have started release yet.  This
3466         * implies that if we observe !CSS_RELEASED on @pos in this RCU
3467         * critical section, the one pointed to by its next pointer is
3468         * guaranteed to not have finished its RCU grace period even if we
3469         * have dropped rcu_read_lock() in between iterations.
3470         *
3471         * If @pos has CSS_RELEASED set, its next pointer can't be
3472         * dereferenced; however, as each css is given a monotonically
3473         * increasing unique serial number and always appended to the
3474         * sibling list, the next one can be found by walking the parent's
3475         * children until the first css with higher serial number than
3476         * @pos's.  While this path can be slower, it happens iff iteration
3477         * races against release and the race window is very small.
3478         */
3479        if (!pos) {
3480                next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
3481        } else if (likely(!(pos->flags & CSS_RELEASED))) {
3482                next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
3483        } else {
3484                list_for_each_entry_rcu(next, &parent->children, sibling)
3485                        if (next->serial_nr > pos->serial_nr)
3486                                break;
3487        }
3488
3489        /*
3490         * @next, if not pointing to the head, can be dereferenced and is
3491         * the next sibling.
3492         */
3493        if (&next->sibling != &parent->children)
3494                return next;
3495        return NULL;
3496}
3497
3498/**
3499 * css_next_descendant_pre - find the next descendant for pre-order walk
3500 * @pos: the current position (%NULL to initiate traversal)
3501 * @root: css whose descendants to walk
3502 *
3503 * To be used by css_for_each_descendant_pre().  Find the next descendant
3504 * to visit for pre-order traversal of @root's descendants.  @root is
3505 * included in the iteration and the first node to be visited.
3506 *
3507 * While this function requires cgroup_mutex or RCU read locking, it
3508 * doesn't require the whole traversal to be contained in a single critical
3509 * section.  This function will return the correct next descendant as long
3510 * as both @pos and @root are accessible and @pos is a descendant of @root.
3511 *
3512 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3513 * css which finished ->css_online() is guaranteed to be visible in the
3514 * future iterations and will stay visible until the last reference is put.
3515 * A css which hasn't finished ->css_online() or already finished
3516 * ->css_offline() may show up during traversal.  It's each subsystem's
3517 * responsibility to synchronize against on/offlining.
3518 */
3519struct cgroup_subsys_state *
3520css_next_descendant_pre(struct cgroup_subsys_state *pos,
3521                        struct cgroup_subsys_state *root)
3522{
3523        struct cgroup_subsys_state *next;
3524
3525        cgroup_assert_mutex_or_rcu_locked();
3526
3527        /* if first iteration, visit @root */
3528        if (!pos)
3529                return root;
3530
3531        /* visit the first child if exists */
3532        next = css_next_child(NULL, pos);
3533        if (next)
3534                return next;
3535
3536        /* no child, visit my or the closest ancestor's next sibling */
3537        while (pos != root) {
3538                next = css_next_child(pos, pos->parent);
3539                if (next)
3540                        return next;
3541                pos = pos->parent;
3542        }
3543
3544        return NULL;
3545}
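
/*
 * Illustration only: a typical top-down walk built on the iterator
 * above, via the css_for_each_descendant_pre() wrapper.  The skeleton
 * assumes the caller's subsystem handles on/offlining itself, as the
 * comment above requires; example_walk_subtree() is hypothetical.
 */
#if 0
static void example_walk_subtree(struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, root) {
                if (!(pos->flags & CSS_ONLINE))
                        continue;
                /* top-down per-css work goes here */
        }
        rcu_read_unlock();
}
#endif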
3546
3547/**
3548 * css_rightmost_descendant - return the rightmost descendant of a css
3549 * @pos: css of interest
3550 *
3551 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
3552 * is returned.  This can be used during pre-order traversal to skip
3553 * subtree of @pos.
3554 *
3555 * While this function requires cgroup_mutex or RCU read locking, it
3556 * doesn't require the whole traversal to be contained in a single critical
3557 * section.  This function will return the correct rightmost descendant as
3558 * long as @pos is accessible.
3559 */
3560struct cgroup_subsys_state *
3561css_rightmost_descendant(struct cgroup_subsys_state *pos)
3562{
3563        struct cgroup_subsys_state *last, *tmp;
3564
3565        cgroup_assert_mutex_or_rcu_locked();
3566
3567        do {
3568                last = pos;
3569                /* ->prev isn't RCU safe, walk ->next till the end */
3570                pos = NULL;
3571                css_for_each_child(tmp, last)
3572                        pos = tmp;
3573        } while (pos);
3574
3575        return last;
3576}
3577
3578static struct cgroup_subsys_state *
3579css_leftmost_descendant(struct cgroup_subsys_state *pos)
3580{
3581        struct cgroup_subsys_state *last;
3582
3583        do {
3584                last = pos;
3585                pos = css_next_child(NULL, pos);
3586        } while (pos);
3587
3588        return last;
3589}
3590
3591/**
3592 * css_next_descendant_post - find the next descendant for post-order walk
3593 * @pos: the current position (%NULL to initiate traversal)
3594 * @root: css whose descendants to walk
3595 *
3596 * To be used by css_for_each_descendant_post().  Find the next descendant
3597 * to visit for post-order traversal of @root's descendants.  @root is
3598 * included in the iteration and the last node to be visited.
3599 *
3600 * While this function requires cgroup_mutex or RCU read locking, it
3601 * doesn't require the whole traversal to be contained in a single critical
3602 * section.  This function will return the correct next descendant as long
3603 * as both @pos and @root are accessible and @pos is a descendant of
3604 * @root.
3605 *
3606 * If a subsystem synchronizes ->css_online() and the start of iteration, a
3607 * css which finished ->css_online() is guaranteed to be visible in the
3608 * future iterations and will stay visible until the last reference is put.
3609 * A css which hasn't finished ->css_online() or already finished
3610 * ->css_offline() may show up during traversal.  It's each subsystem's
3611 * responsibility to synchronize against on/offlining.
3612 */
3613struct cgroup_subsys_state *
3614css_next_descendant_post(struct cgroup_subsys_state *pos,
3615                         struct cgroup_subsys_state *root)
3616{
3617        struct cgroup_subsys_state *next;
3618
3619        cgroup_assert_mutex_or_rcu_locked();
3620
3621        /* if first iteration, visit leftmost descendant which may be @root */
3622        if (!pos)
3623                return css_leftmost_descendant(root);
3624
3625        /* if we visited @root, we're done */
3626        if (pos == root)
3627                return NULL;
3628
3629        /* if there's an unvisited sibling, visit its leftmost descendant */
3630        next = css_next_child(pos, pos->parent);
3631        if (next)
3632                return css_leftmost_descendant(next);
3633
3634        /* no sibling left, visit parent */
3635        return pos->parent;
3636}
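
/*
 * Illustration only: the post-order counterpart of the walk sketched
 * after css_next_descendant_pre(), useful when children must be
 * processed before their parent.  example_drain_subtree() is
 * hypothetical.
 */
#if 0
static void example_drain_subtree(struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_post(pos, root) {
                /* all children of @pos have already been visited here */
        }
        rcu_read_unlock();
}
#endif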
3637
3638/**
3639 * css_has_online_children - does a css have online children
3640 * @css: the target css
3641 *
3642 * Returns %true if @css has any online children; otherwise, %false.  This
3643 * function can be called from any context but the caller is responsible
3644 * for synchronizing against on/offlining as necessary.
3645 */
3646bool css_has_online_children(struct cgroup_subsys_state *css)
3647{
3648        struct cgroup_subsys_state *child;
3649        bool ret = false;
3650
3651        rcu_read_lock();
3652        css_for_each_child(child, css) {
3653                if (child->flags & CSS_ONLINE) {
3654                        ret = true;
3655                        break;
3656                }
3657        }
3658        rcu_read_unlock();
3659        return ret;
3660}
3661
3662/**
3663 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
3664 * @it: the iterator to advance
3665 *
3666 * Advance @it to the next css_set to walk.
3667 */
3668static void css_task_iter_advance_css_set(struct css_task_iter *it)
3669{
3670        struct list_head *l = it->cset_pos;
3671        struct cgrp_cset_link *link;
3672        struct css_set *cset;
3673
3674        lockdep_assert_held(&css_set_lock);
3675
3676        /* Advance to the next non-empty css_set */
3677        do {
3678                l = l->next;
3679                if (l == it->cset_head) {
3680                        it->cset_pos = NULL;
3681                        it->task_pos = NULL;
3682                        return;
3683                }
3684
3685                if (it->ss) {
3686                        cset = container_of(l, struct css_set,
3687                                            e_cset_node[it->ss->id]);
3688                } else {
3689                        link = list_entry(l, struct cgrp_cset_link, cset_link);
3690                        cset = link->cset;
3691                }
3692        } while (!css_set_populated(cset));
3693
3694        it->cset_pos = l;
3695
3696        if (!list_empty(&cset->tasks))
3697                it->task_pos = cset->tasks.next;
3698        else
3699                it->task_pos = cset->mg_tasks.next;
3700
3701        it->tasks_head = &cset->tasks;
3702        it->mg_tasks_head = &cset->mg_tasks;
3703
3704        /*
3705         * We don't keep css_sets locked across iteration steps and thus
3706         * need to take steps to ensure that iteration can be resumed after
3707         * the lock is re-acquired.  Iteration is performed at two levels -
3708         * css_sets and tasks in them.
3709         *
3710         * Once created, a css_set never leaves its cgroup lists, so a
3711         * pinned css_set is guaranteed to stay put and we can resume
3712         * iteration afterwards.
3713         *
3714         * Tasks may leave @cset across iteration steps.  This is resolved
3715         * by registering each iterator with the css_set currently being
3716         * walked and making css_set_move_task() advance iterators whose
3717         * next task is leaving.
3718         */
3719        if (it->cur_cset) {
3720                list_del(&it->iters_node);
3721                put_css_set_locked(it->cur_cset);
3722        }
3723        get_css_set(cset);
3724        it->cur_cset = cset;
3725        list_add(&it->iters_node, &cset->task_iters);
3726}
3727
3728static void css_task_iter_advance(struct css_task_iter *it)
3729{
3730        struct list_head *l = it->task_pos;
3731
3732        lockdep_assert_held(&css_set_lock);
3733        WARN_ON_ONCE(!l);
3734
3735        /*
3736         * Advance iterator to find next entry.  cset->tasks is consumed
3737         * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
3738         * next cset.
3739         */
3740        l = l->next;
3741
3742        if (l == it->tasks_head)
3743                l = it->mg_tasks_head->next;
3744
3745        if (l == it->mg_tasks_head)
3746                css_task_iter_advance_css_set(it);
3747        else
3748                it->task_pos = l;
3749}
3750
3751/**
3752 * css_task_iter_start - initiate task iteration
3753 * @css: the css to walk tasks of
3754 * @it: the task iterator to use
3755 *
3756 * Initiate iteration through the tasks of @css.  The caller can call
3757 * css_task_iter_next() to walk through the tasks until the function
3758 * returns NULL.  On completion of iteration, css_task_iter_end() must be
3759 * called.
3760 */
3761void css_task_iter_start(struct cgroup_subsys_state *css,
3762                         struct css_task_iter *it)
3763{
3764        /* no one should try to iterate before mounting cgroups */
3765        WARN_ON_ONCE(!use_task_css_set_links);
3766
3767        memset(it, 0, sizeof(*it));
3768
3769        spin_lock_irq(&css_set_lock);
3770
3771        it->ss = css->ss;
3772
3773        if (it->ss)
3774                it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3775        else
3776                it->cset_pos = &css->cgroup->cset_links;
3777
3778        it->cset_head = it->cset_pos;
3779
3780        css_task_iter_advance_css_set(it);
3781
3782        spin_unlock_irq(&css_set_lock);
3783}
3784
3785/**
3786 * css_task_iter_next - return the next task for the iterator
3787 * @it: the task iterator being iterated
3788 *
3789 * The "next" function for task iteration.  @it should have been
3790 * initialized via css_task_iter_start().  Returns NULL when the iteration
3791 * reaches the end.
3792 */
3793struct task_struct *css_task_iter_next(struct css_task_iter *it)
3794{
3795        if (it->cur_task) {
3796                put_task_struct(it->cur_task);
3797                it->cur_task = NULL;
3798        }
3799
3800        spin_lock_irq(&css_set_lock);
3801
3802        if (it->task_pos) {
3803                it->cur_task = list_entry(it->task_pos, struct task_struct,
3804                                          cg_list);
3805                get_task_struct(it->cur_task);
3806                css_task_iter_advance(it);
3807        }
3808
3809        spin_unlock_irq(&css_set_lock);
3810
3811        return it->cur_task;
3812}
3813
3814/**
3815 * css_task_iter_end - finish task iteration
3816 * @it: the task iterator to finish
3817 *
3818 * Finish task iteration started by css_task_iter_start().
3819 */
3820void css_task_iter_end(struct css_task_iter *it)
3821{
3822        if (it->cur_cset) {
3823                spin_lock_irq(&css_set_lock);
3824                list_del(&it->iters_node);
3825                put_css_set_locked(it->cur_cset);
3826                spin_unlock_irq(&css_set_lock);
3827        }
3828
3829        if (it->cur_task)
3830                put_task_struct(it->cur_task);
3831}
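
/*
 * Illustration only: the canonical start/next/end usage of the task
 * iterator defined above.  example_count_tasks() is hypothetical; note
 * that each task returned by css_task_iter_next() stays pinned until
 * the following call, so it may be inspected without taking extra refs.
 */
#if 0
static unsigned int example_count_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;
        unsigned int nr = 0;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it)))
                nr++;
        css_task_iter_end(&it);

        return nr;
}
#endif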
3832
3833static void cgroup_procs_release(struct kernfs_open_file *of)
3834{
3835        if (of->priv) {
3836                css_task_iter_end(of->priv);
3837                kfree(of->priv);
3838        }
3839}
3840
3841static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
3842{
3843        struct kernfs_open_file *of = s->private;
3844        struct css_task_iter *it = of->priv;
3845        struct task_struct *task;
3846
3847        do {
3848                task = css_task_iter_next(it);
3849        } while (task && !thread_group_leader(task));
3850
3851        return task;
3852}
3853
3854static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
3855{
3856        struct kernfs_open_file *of = s->private;
3857        struct cgroup *cgrp = seq_css(s)->cgroup;
3858        struct css_task_iter *it = of->priv;
3859
3860        /*
3861         * When a seq_file is seeked, it's always traversed sequentially
3862         * from position 0, so we can simply keep iterating on !0 *pos.
3863         */
3864        if (!it) {
3865                if (WARN_ON_ONCE((*pos)++))
3866                        return ERR_PTR(-EINVAL);
3867
3868                it = kzalloc(sizeof(*it), GFP_KERNEL);
3869                if (!it)
3870                        return ERR_PTR(-ENOMEM);
3871                of->priv = it;
3872                css_task_iter_start(&cgrp->self, it);
3873        } else if (!(*pos)++) {
3874                css_task_iter_end(it);
3875                css_task_iter_start(&cgrp->self, it);
3876        }
3877
3878        return cgroup_procs_next(s, NULL, NULL);
3879}
3880
3881static int cgroup_procs_show(struct seq_file *s, void *v)
3882{
3883        seq_printf(s, "%d\n", task_tgid_vnr(v));
3884        return 0;
3885}
3886
3887/* cgroup core interface files for the default hierarchy */
3888static struct cftype cgroup_base_files[] = {
3889        {
3890                .name = "cgroup.procs",
3891                .flags = CFTYPE_NS_DELEGATABLE,
3892                .file_offset = offsetof(struct cgroup, procs_file),
3893                .release = cgroup_procs_release,
3894                .seq_start = cgroup_procs_start,
3895                .seq_next = cgroup_procs_next,
3896                .seq_show = cgroup_procs_show,
3897                .write = cgroup_procs_write,
3898        },
3899        {
3900                .name = "cgroup.controllers",
3901                .seq_show = cgroup_controllers_show,
3902        },
3903        {
3904                .name = "cgroup.subtree_control",
3905                .flags = CFTYPE_NS_DELEGATABLE,
3906                .seq_show = cgroup_subtree_control_show,
3907                .write = cgroup_subtree_control_write,
3908        },
3909        {
3910                .name = "cgroup.events",
3911                .flags = CFTYPE_NOT_ON_ROOT,
3912                .file_offset = offsetof(struct cgroup, events_file),
3913                .seq_show = cgroup_events_show,
3914        },
3915        { }     /* terminate */
3916};
3917
3918/*
3919 * css destruction is a four-stage process.
3920 *
3921 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
3922 *    Implemented in kill_css().
3923 *
3924 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3925 *    and thus css_tryget_online() is guaranteed to fail, the css can be
3926 *    offlined by invoking offline_css().  After offlining, the base ref is
3927 *    put.  Implemented in css_killed_work_fn().
3928 *
3929 * 3. When the percpu_ref reaches zero, the only possible remaining
3930 *    accessors are inside RCU read sections.  css_release() schedules the
3931 *    RCU callback.
3932 *
3933 * 4. After the grace period, the css can be freed.  Implemented in
3934 *    css_free_work_fn().
3935 *
3936 * It is actually hairier because both steps 2 and 4 require process context
3937 * and thus involve punting to css->destroy_work, adding two additional
3938 * steps to the already complex sequence.
3939 */
3940static void css_free_work_fn(struct work_struct *work)
3941{
3942        struct cgroup_subsys_state *css =
3943                container_of(work, struct cgroup_subsys_state, destroy_work);
3944        struct cgroup_subsys *ss = css->ss;
3945        struct cgroup *cgrp = css->cgroup;
3946
3947        percpu_ref_exit(&css->refcnt);
3948
3949        if (ss) {
3950                /* css free path */
3951                struct cgroup_subsys_state *parent = css->parent;
3952                int id = css->id;
3953
3954                ss->css_free(css);
3955                cgroup_idr_remove(&ss->css_idr, id);
3956                cgroup_put(cgrp);
3957
3958                if (parent)
3959                        css_put(parent);
3960        } else {
3961                /* cgroup free path */
3962                atomic_dec(&cgrp->root->nr_cgrps);
3963                cgroup1_pidlist_destroy_all(cgrp);
3964                cancel_work_sync(&cgrp->release_agent_work);
3965
3966                if (cgroup_parent(cgrp)) {
3967                        /*
3968                         * We get a ref to the parent, and put the ref when
3969                         * this cgroup is being freed, so it's guaranteed
3970                         * that the parent won't be destroyed before its
3971                         * children.
3972                         */
3973                        cgroup_put(cgroup_parent(cgrp));
3974                        kernfs_put(cgrp->kn);
3975                        kfree(cgrp);
3976                } else {
3977                        /*
3978                         * This is root cgroup's refcnt reaching zero,
3979                         * which indicates that the root should be
3980                         * released.
3981                         */
3982                        cgroup_destroy_root(cgrp->root);
3983                }
3984        }
3985}
3986
3987static void css_free_rcu_fn(struct rcu_head *rcu_head)
3988{
3989        struct cgroup_subsys_state *css =
3990                container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
3991
3992        INIT_WORK(&css->destroy_work, css_free_work_fn);
3993        queue_work(cgroup_destroy_wq, &css->destroy_work);
3994}
3995
3996static void css_release_work_fn(struct work_struct *work)
3997{
3998        struct cgroup_subsys_state *css =
3999                container_of(work, struct cgroup_subsys_state, destroy_work);
4000        struct cgroup_subsys *ss = css->ss;
4001        struct cgroup *cgrp = css->cgroup;
4002
4003        mutex_lock(&cgroup_mutex);
4004
4005        css->flags |= CSS_RELEASED;
4006        list_del_rcu(&css->sibling);
4007
4008        if (ss) {
4009                /* css release path */
4010                cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4011                if (ss->css_released)
4012                        ss->css_released(css);
4013        } else {
4014                /* cgroup release path */
4015                trace_cgroup_release(cgrp);
4016
4017                cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4018                cgrp->id = -1;
4019
4020                /*
4021                 * There are two control paths which try to determine
4022                 * cgroup from dentry without going through kernfs -
4023                 * cgroupstats_build() and css_tryget_online_from_dir().
4024                 * Those are supported by RCU protecting clearing of
4025                 * cgrp->kn->priv backpointer.
4026                 */
4027                if (cgrp->kn)
4028                        RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
4029                                         NULL);
4030
4031                cgroup_bpf_put(cgrp);
4032        }
4033
4034        mutex_unlock(&cgroup_mutex);
4035
4036        call_rcu(&css->rcu_head, css_free_rcu_fn);
4037}
4038
4039static void css_release(struct percpu_ref *ref)
4040{
4041        struct cgroup_subsys_state *css =
4042                container_of(ref, struct cgroup_subsys_state, refcnt);
4043
4044        INIT_WORK(&css->destroy_work, css_release_work_fn);
4045        queue_work(cgroup_destroy_wq, &css->destroy_work);
4046}
4047
4048static void init_and_link_css(struct cgroup_subsys_state *css,
4049                              struct cgroup_subsys *ss, struct cgroup *cgrp)
4050{
4051        lockdep_assert_held(&cgroup_mutex);
4052
4053        cgroup_get_live(cgrp);
4054
4055        memset(css, 0, sizeof(*css));
4056        css->cgroup = cgrp;
4057        css->ss = ss;
4058        css->id = -1;
4059        INIT_LIST_HEAD(&css->sibling);
4060        INIT_LIST_HEAD(&css->children);
4061        css->serial_nr = css_serial_nr_next++;
4062        atomic_set(&css->online_cnt, 0);
4063
4064        if (cgroup_parent(cgrp)) {
4065                css->parent = cgroup_css(cgroup_parent(cgrp), ss);
4066                css_get(css->parent);
4067        }
4068
4069        BUG_ON(cgroup_css(cgrp, ss));
4070}
4071
4072/* invoke ->css_online() on a new CSS and mark it online if successful */
4073static int online_css(struct cgroup_subsys_state *css)
4074{
4075        struct cgroup_subsys *ss = css->ss;
4076        int ret = 0;
4077
4078        lockdep_assert_held(&cgroup_mutex);
4079
4080        if (ss->css_online)
4081                ret = ss->css_online(css);
4082        if (!ret) {
4083                css->flags |= CSS_ONLINE;
4084                rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4085
4086                atomic_inc(&css->online_cnt);
4087                if (css->parent)
4088                        atomic_inc(&css->parent->online_cnt);
4089        }
4090        return ret;
4091}
4092
4093/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4094static void offline_css(struct cgroup_subsys_state *css)
4095{
4096        struct cgroup_subsys *ss = css->ss;
4097
4098        lockdep_assert_held(&cgroup_mutex);
4099
4100        if (!(css->flags & CSS_ONLINE))
4101                return;
4102
4103        if (ss->css_reset)
4104                ss->css_reset(css);
4105
4106        if (ss->css_offline)
4107                ss->css_offline(css);
4108
4109        css->flags &= ~CSS_ONLINE;
4110        RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4111
4112        wake_up_all(&css->cgroup->offline_waitq);
4113}
4114
4115/**
4116 * css_create - create a cgroup_subsys_state
4117 * @cgrp: the cgroup new css will be associated with
4118 * @ss: the subsys of new css
4119 *
4120 * Create a new css associated with @cgrp - @ss pair.  On success, the new
4121 * css is online and installed in @cgrp.  This function doesn't create the
4122 * interface files.  Returns the new css on success, ERR_PTR(-errno) on failure.
4123 */
4124static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
4125                                              struct cgroup_subsys *ss)
4126{
4127        struct cgroup *parent = cgroup_parent(cgrp);
4128        struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
4129        struct cgroup_subsys_state *css;
4130        int err;
4131
4132        lockdep_assert_held(&cgroup_mutex);
4133
4134        css = ss->css_alloc(parent_css);
4135        if (!css)
4136                css = ERR_PTR(-ENOMEM);
4137        if (IS_ERR(css))
4138                return css;
4139
4140        init_and_link_css(css, ss, cgrp);
4141
4142        err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4143        if (err)
4144                goto err_free_css;
4145
4146        err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
4147        if (err < 0)
4148                goto err_free_css;
4149        css->id = err;
4150
4151        /* @css is ready to be brought online now, make it visible */
4152        list_add_tail_rcu(&css->sibling, &parent_css->children);
4153        cgroup_idr_replace(&ss->css_idr, css, css->id);
4154
4155        err = online_css(css);
4156        if (err)
4157                goto err_list_del;
4158
4159        if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4160            cgroup_parent(parent)) {
4161                pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4162                        current->comm, current->pid, ss->name);
4163                if (!strcmp(ss->name, "memory"))
4164                        pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4165                ss->warned_broken_hierarchy = true;
4166        }
4167
4168        return css;
4169
4170err_list_del:
4171        list_del_rcu(&css->sibling);
4172err_free_css:
4173        call_rcu(&css->rcu_head, css_free_rcu_fn);
4174        return ERR_PTR(err);
4175}
4176
4177/*
4178 * The returned cgroup is fully initialized including its control mask, but
4179 * it isn't associated with its kernfs_node and doesn't have the control
4180 * mask applied.
4181 */
4182static struct cgroup *cgroup_create(struct cgroup *parent)
4183{
4184        struct cgroup_root *root = parent->root;
4185        struct cgroup *cgrp, *tcgrp;
4186        int level = parent->level + 1;
4187        int ret;
4188
4189        /* allocate the cgroup and its ID, 0 is reserved for the root */
4190        cgrp = kzalloc(sizeof(*cgrp) +
4191                       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
4192        if (!cgrp)
4193                return ERR_PTR(-ENOMEM);
4194
4195        ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4196        if (ret)
4197                goto out_free_cgrp;
4198
4199        /*
4200         * Temporarily set the pointer to NULL, so idr_find() won't return
4201         * a half-baked cgroup.
4202         */
4203        cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
4204        if (cgrp->id < 0) {
4205                ret = -ENOMEM;
4206                goto out_cancel_ref;
4207        }
4208
4209        init_cgroup_housekeeping(cgrp);
4210
4211        cgrp->self.parent = &parent->self;
4212        cgrp->root = root;
4213        cgrp->level = level;
4214
4215        for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
4216                cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
4217
4218        if (notify_on_release(parent))
4219                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4220
4221        if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4222                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4223
4224        cgrp->self.serial_nr = css_serial_nr_next++;
4225
4226        /* allocation complete, commit to creation */
4227        list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
4228        atomic_inc(&root->nr_cgrps);
4229        cgroup_get_live(parent);
4230
4231        /*
4232         * @cgrp is now fully operational.  If something fails after this
4233         * point, it'll be released via the normal destruction path.
4234         */
4235        cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4236
4237        /*
4238         * On the default hierarchy, a child doesn't automatically inherit
4239         * subtree_control from the parent.  Each is configured manually.
4240         */
4241        if (!cgroup_on_dfl(cgrp))
4242                cgrp->subtree_control = cgroup_control(cgrp);
4243
4244        if (parent)
4245                cgroup_bpf_inherit(cgrp, parent);
4246
4247        cgroup_propagate_control(cgrp);
4248
4249        return cgrp;
4250
4251out_cancel_ref:
4252        percpu_ref_exit(&cgrp->self.refcnt);
4253out_free_cgrp:
4254        kfree(cgrp);
4255        return ERR_PTR(ret);
4256}
4257
4258int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
4259{
4260        struct cgroup *parent, *cgrp;
4261        struct kernfs_node *kn;
4262        int ret;
4263
4264        /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
4265        if (strchr(name, '\n'))
4266                return -EINVAL;
4267
4268        parent = cgroup_kn_lock_live(parent_kn, false);
4269        if (!parent)
4270                return -ENODEV;
4271
4272        cgrp = cgroup_create(parent);
4273        if (IS_ERR(cgrp)) {
4274                ret = PTR_ERR(cgrp);
4275                goto out_unlock;
4276        }
4277
4278        /* create the directory */
4279        kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
4280        if (IS_ERR(kn)) {
4281                ret = PTR_ERR(kn);
4282                goto out_destroy;
4283        }
4284        cgrp->kn = kn;
4285
4286        /*
4287         * This extra ref will be put in css_free_work_fn() and guarantees
4288         * that @cgrp->kn is always accessible.
4289         */
4290        kernfs_get(kn);
4291
4292        ret = cgroup_kn_set_ugid(kn);
4293        if (ret)
4294                goto out_destroy;
4295
4296        ret = css_populate_dir(&cgrp->self);
4297        if (ret)
4298                goto out_destroy;
4299
4300        ret = cgroup_apply_control_enable(cgrp);
4301        if (ret)
4302                goto out_destroy;
4303
4304        trace_cgroup_mkdir(cgrp);
4305
4306        /* let's create and online css's */
4307        kernfs_activate(kn);
4308
4309        ret = 0;
4310        goto out_unlock;
4311
4312out_destroy:
4313        cgroup_destroy_locked(cgrp);
4314out_unlock:
4315        cgroup_kn_unlock(parent_kn);
4316        return ret;
4317}
4318
4319/*
4320 * This is called when the refcnt of a css is confirmed to be killed.
4321 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
4322 * initate destruction and put the css ref from kill_css().
4323 */
4324static void css_killed_work_fn(struct work_struct *work)
4325{
4326        struct cgroup_subsys_state *css =
4327                container_of(work, struct cgroup_subsys_state, destroy_work);
4328
4329        mutex_lock(&cgroup_mutex);
4330
4331        do {
4332                offline_css(css);
4333                css_put(css);
4334                /* @css can't go away while we're holding cgroup_mutex */
4335                css = css->parent;
4336        } while (css && atomic_dec_and_test(&css->online_cnt));
4337
4338        mutex_unlock(&cgroup_mutex);
4339}
4340
4341/* css kill confirmation processing requires process context, bounce */
4342static void css_killed_ref_fn(struct percpu_ref *ref)
4343{
4344        struct cgroup_subsys_state *css =
4345                container_of(ref, struct cgroup_subsys_state, refcnt);
4346
4347        if (atomic_dec_and_test(&css->online_cnt)) {
4348                INIT_WORK(&css->destroy_work, css_killed_work_fn);
4349                queue_work(cgroup_destroy_wq, &css->destroy_work);
4350        }
4351}
4352
4353/**
4354 * kill_css - destroy a css
4355 * @css: css to destroy
4356 *
4357 * This function initiates destruction of @css by removing cgroup interface
4358 * files and putting its base reference.  ->css_offline() will be invoked
4359 * asynchronously once css_tryget_online() is guaranteed to fail and when
4360 * the reference count reaches zero, @css will be released.
4361 */
4362static void kill_css(struct cgroup_subsys_state *css)
4363{
4364        lockdep_assert_held(&cgroup_mutex);
4365
4366        if (css->flags & CSS_DYING)
4367                return;
4368
4369        css->flags |= CSS_DYING;
4370
4371        /*
4372         * This must happen before css is disassociated with its cgroup.
4373         * See seq_css() for details.
4374         */
4375        css_clear_dir(css);
4376
4377        /*
4378         * Killing would put the base ref, but we need to keep it alive
4379         * until after ->css_offline().
4380         */
4381        css_get(css);
4382
4383        /*
4384         * cgroup core guarantees that, by the time ->css_offline() is
4385         * invoked, no new css reference will be given out via
4386         * css_tryget_online().  We can't simply call percpu_ref_kill() and
4387         * proceed to offlining css's because percpu_ref_kill() doesn't
4388         * guarantee that the ref is seen as killed on all CPUs on return.
4389         *
4390         * Use percpu_ref_kill_and_confirm() to get notifications as each
4391         * css is confirmed to be seen as killed on all CPUs.
4392         */
4393        percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4394}
4395
4396/**
4397 * cgroup_destroy_locked - the first stage of cgroup destruction
4398 * @cgrp: cgroup to be destroyed
4399 *
4400 * css's make use of percpu refcnts whose killing latency shouldn't be
4401 * exposed to userland and are RCU protected.  Also, cgroup core needs to
4402 * guarantee that css_tryget_online() won't succeed by the time
4403 * ->css_offline() is invoked.  To satisfy all the requirements,
4404 * destruction is implemented in the following two steps.
4405 *
4406 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
4407 *     userland visible parts and start killing the percpu refcnts of
4408 *     css's.  Set up so that the next stage will be kicked off once all
4409 *     the percpu refcnts are confirmed to be killed.
4410 *
4411 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4412 *     rest of destruction.  Once all cgroup references are gone, the
4413 *     cgroup is RCU-freed.
4414 *
4415 * This function implements s1.  After this step, @cgrp is gone as far as
4416 * the userland is concerned and a new cgroup with the same name may be
4417 * created.  As cgroup doesn't care about the names internally, this
4418 * doesn't cause any problem.
4419 */
4420static int cgroup_destroy_locked(struct cgroup *cgrp)
4421        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4422{
4423        struct cgroup_subsys_state *css;
4424        struct cgrp_cset_link *link;
4425        int ssid;
4426
4427        lockdep_assert_held(&cgroup_mutex);
4428
4429        /*
4430         * Only migration can raise populated from zero and we're already
4431         * holding cgroup_mutex.
4432         */
4433        if (cgroup_is_populated(cgrp))
4434                return -EBUSY;
4435
4436        /*
4437         * Make sure there are no live children.  We can't test emptiness of
4438         * ->self.children as dead children linger on it while being
4439         * drained; otherwise, "rmdir parent/child parent" may fail.
4440         */
4441        if (css_has_online_children(&cgrp->self))
4442                return -EBUSY;
4443
4444        /*
4445         * Mark @cgrp and the associated csets dead.  The former prevents
4446         * further task migration and child creation by disabling
4447         * cgroup_lock_live_group().  The latter makes the csets ignored by
4448         * the migration path.
4449         */
4450        cgrp->self.flags &= ~CSS_ONLINE;
4451
4452        spin_lock_irq(&css_set_lock);
4453        list_for_each_entry(link, &cgrp->cset_links, cset_link)
4454                link->cset->dead = true;
4455        spin_unlock_irq(&css_set_lock);
4456
4457        /* initiate massacre of all css's */
4458        for_each_css(css, ssid, cgrp)
4459                kill_css(css);
4460
4461        /*
4462         * Remove @cgrp directory along with the base files.  @cgrp has an
4463         * extra ref on its kn.
4464         */
4465        kernfs_remove(cgrp->kn);
4466
4467        cgroup1_check_for_release(cgroup_parent(cgrp));
4468
4469        /* put the base reference */
4470        percpu_ref_kill(&cgrp->self.refcnt);
4471
4472        return 0;
4473};
4474
4475int cgroup_rmdir(struct kernfs_node *kn)
4476{
4477        struct cgroup *cgrp;
4478        int ret = 0;
4479
4480        cgrp = cgroup_kn_lock_live(kn, false);
4481        if (!cgrp)
4482                return 0;
4483
4484        ret = cgroup_destroy_locked(cgrp);
4485
4486        if (!ret)
4487                trace_cgroup_rmdir(cgrp);
4488
4489        cgroup_kn_unlock(kn);
4490        return ret;
4491}
4492
4493static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4494        .show_options           = cgroup_show_options,
4495        .remount_fs             = cgroup_remount,
4496        .mkdir                  = cgroup_mkdir,
4497        .rmdir                  = cgroup_rmdir,
4498        .show_path              = cgroup_show_path,
4499};
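
/*
 * Illustration only: the ->mkdir/->rmdir callbacks registered above are
 * what back plain mkdir(2)/rmdir(2) on a cgroup mount.  A minimal
 * userspace sketch, assuming a hierarchy mounted at /sys/fs/cgroup (the
 * path and group name are assumptions).
 */
#if 0
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static int example_make_and_remove_group(void)
{
        if (mkdir("/sys/fs/cgroup/mygrp", 0755))        /* -> cgroup_mkdir() */
                return -1;
        return rmdir("/sys/fs/cgroup/mygrp");           /* -> cgroup_rmdir() */
}
#endif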
4500
4501static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4502{
4503        struct cgroup_subsys_state *css;
4504
4505        pr_debug("Initializing cgroup subsys %s\n", ss->name);
4506
4507        mutex_lock(&cgroup_mutex);
4508
4509        idr_init(&ss->css_idr);
4510        INIT_LIST_HEAD(&ss->cfts);
4511
4512        /* Create the root cgroup state for this subsystem */
4513        ss->root = &cgrp_dfl_root;
4514        css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4515        /* We don't handle early failures gracefully */
4516        BUG_ON(IS_ERR(css));
4517        init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4518
4519        /*
4520         * Root csses are never destroyed and we can't initialize
4521         * percpu_ref during early init.  Disable refcnting.
4522         */
4523        css->flags |= CSS_NO_REF;
4524
4525        if (early) {
4526                /* allocation can't be done safely during early init */
4527                css->id = 1;
4528        } else {
4529                css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4530                BUG_ON(css->id < 0);
4531        }
4532
4533        /* Update the init_css_set to contain a subsys
4534         * pointer to this state - since the subsystem is
4535         * newly registered, all tasks and hence the
4536         * init_css_set are in the subsystem's root cgroup. */
4537        init_css_set.subsys[ss->id] = css;
4538
4539        have_fork_callback |= (bool)ss->fork << ss->id;
4540        have_exit_callback |= (bool)ss->exit << ss->id;
4541        have_free_callback |= (bool)ss->free << ss->id;
4542        have_canfork_callback |= (bool)ss->can_fork << ss->id;
4543
4544        /* At system boot, before all subsystems have been
4545         * registered, no tasks have been forked, so we don't
4546         * need to invoke fork callbacks here. */
4547        BUG_ON(!list_empty(&init_task.tasks));
4548
4549        BUG_ON(online_css(css));
4550
4551        mutex_unlock(&cgroup_mutex);
4552}
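
/*
 * Illustrative sketch (hypothetical "demo" controller): the minimal shape of
 * a struct cgroup_subsys that cgroup_init_subsys() expects - css_alloc() and
 * css_free() are mandatory (see the WARN in cgroup_init_early()), everything
 * else is optional.  Real controllers are additionally enumerated in
 * linux/cgroup_subsys.h.
 *
 *	static struct cgroup_subsys_state *
 *	demo_css_alloc(struct cgroup_subsys_state *parent_css)
 *	{
 *		struct cgroup_subsys_state *css;
 *
 *		css = kzalloc(sizeof(*css), GFP_KERNEL);
 *		return css ?: ERR_PTR(-ENOMEM);
 *	}
 *
 *	static void demo_css_free(struct cgroup_subsys_state *css)
 *	{
 *		kfree(css);
 *	}
 *
 *	struct cgroup_subsys demo_cgrp_subsys = {
 *		.css_alloc	= demo_css_alloc,
 *		.css_free	= demo_css_free,
 *		.early_init	= false,
 *	};
 */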
4553
4554/**
4555 * cgroup_init_early - cgroup initialization at system boot
4556 *
4557 * Initialize cgroups at system boot, and initialize any
4558 * subsystems that request early init.
4559 */
4560int __init cgroup_init_early(void)
4561{
4562        static struct cgroup_sb_opts __initdata opts;
4563        struct cgroup_subsys *ss;
4564        int i;
4565
4566        init_cgroup_root(&cgrp_dfl_root, &opts);
4567        cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4568
4569        RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4570
4571        for_each_subsys(ss, i) {
4572                WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4573                     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
4574                     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4575                     ss->id, ss->name);
4576                WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4577                     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4578
4579                ss->id = i;
4580                ss->name = cgroup_subsys_name[i];
4581                if (!ss->legacy_name)
4582                        ss->legacy_name = cgroup_subsys_name[i];
4583
4584                if (ss->early_init)
4585                        cgroup_init_subsys(ss, true);
4586        }
4587        return 0;
4588}
4589
4590static u16 cgroup_disable_mask __initdata;
4591
4592/**
4593 * cgroup_init - cgroup initialization
4594 *
4595 * Register cgroup filesystem and /proc file, and initialize
4596 * any subsystems that didn't request early init.
4597 */
4598int __init cgroup_init(void)
4599{
4600        struct cgroup_subsys *ss;
4601        int ssid;
4602
4603        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
4604        BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
4605        BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4606        BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
4607
4608        /*
4609         * The latency of the synchronize_sched() is too high for cgroups,
4610         * so avoid it at the cost of forcing all readers into the slow path.
4611         */
4612        rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
4613
4614        get_user_ns(init_cgroup_ns.user_ns);
4615
4616        mutex_lock(&cgroup_mutex);
4617
4618        /*
4619         * Add init_css_set to the hash table so that dfl_root can link to
4620         * it during init.
4621         */
4622        hash_add(css_set_table, &init_css_set.hlist,
4623                 css_set_hash(init_css_set.subsys));
4624
4625        BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
4626
4627        mutex_unlock(&cgroup_mutex);
4628
4629        for_each_subsys(ss, ssid) {
4630                if (ss->early_init) {
4631                        struct cgroup_subsys_state *css =
4632                                init_css_set.subsys[ss->id];
4633
4634                        css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4635                                                   GFP_KERNEL);
4636                        BUG_ON(css->id < 0);
4637                } else {
4638                        cgroup_init_subsys(ss, false);
4639                }
4640
4641                list_add_tail(&init_css_set.e_cset_node[ssid],
4642                              &cgrp_dfl_root.cgrp.e_csets[ssid]);
4643
4644                /*
4645                 * Setting dfl_root subsys_mask needs to consider the
4646                 * disabled flag and cftype registration needs kmalloc,
4647                 * both of which aren't available during early_init.
4648                 */
4649                if (cgroup_disable_mask & (1 << ssid)) {
4650                        static_branch_disable(cgroup_subsys_enabled_key[ssid]);
4651                        printk(KERN_INFO "Disabling %s control group subsystem\n",
4652                               ss->name);
4653                        continue;
4654                }
4655
4656                if (cgroup1_ssid_disabled(ssid))
4657                        printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
4658                               ss->name);
4659
4660                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4661
4662                if (ss->implicit_on_dfl)
4663                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
4664                else if (!ss->dfl_cftypes)
4665                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
4666
4667                if (ss->dfl_cftypes == ss->legacy_cftypes) {
4668                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
4669                } else {
4670                        WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
4671                        WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
4672                }
4673
4674                if (ss->bind)
4675                        ss->bind(init_css_set.subsys[ssid]);
4676
4677                mutex_lock(&cgroup_mutex);
4678                css_populate_dir(init_css_set.subsys[ssid]);
4679                mutex_unlock(&cgroup_mutex);
4680        }
4681
4682        /* init_css_set.subsys[] has been updated, re-hash */
4683        hash_del(&init_css_set.hlist);
4684        hash_add(css_set_table, &init_css_set.hlist,
4685                 css_set_hash(init_css_set.subsys));
4686
4687        WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
4688        WARN_ON(register_filesystem(&cgroup_fs_type));
4689        WARN_ON(register_filesystem(&cgroup2_fs_type));
4690        WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
4691
4692        return 0;
4693}
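
/*
 * Illustrative follow-up (paths and options are examples): with the
 * filesystem types registered above, userspace can now mount the
 * hierarchies, e.g.
 *
 *	mount -t cgroup2 none /sys/fs/cgroup
 *	mount -t cgroup -o cpu,cpuacct none /mnt/cpu
 *
 * and the controllers compiled into the kernel are listed by the
 * /proc/cgroups file created just above.
 */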
4694
4695static int __init cgroup_wq_init(void)
4696{
4697        /*
4698         * There isn't much point in executing the destruction path in
4699         * parallel.  A good chunk is serialized with cgroup_mutex anyway.
4700         * Use 1 for @max_active.
4701         *
4702         * We would prefer to do this in cgroup_init() above, but that
4703         * is called before init_workqueues(): so leave this until after.
4704         */
4705        cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4706        BUG_ON(!cgroup_destroy_wq);
4707        return 0;
4708}
4709core_initcall(cgroup_wq_init);
4710
4711/*
4712 * proc_cgroup_show()
4713 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
4714 *  - Used for /proc/<pid>/cgroup.
4715 */
4716int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
4717                     struct pid *pid, struct task_struct *tsk)
4718{
4719        char *buf;
4720        int retval;
4721        struct cgroup_root *root;
4722
4723        retval = -ENOMEM;
4724        buf = kmalloc(PATH_MAX, GFP_KERNEL);
4725        if (!buf)
4726                goto out;
4727
4728        mutex_lock(&cgroup_mutex);
4729        spin_lock_irq(&css_set_lock);
4730
4731        for_each_root(root) {
4732                struct cgroup_subsys *ss;
4733                struct cgroup *cgrp;
4734                int ssid, count = 0;
4735
4736                if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
4737                        continue;
4738
4739                seq_printf(m, "%d:", root->hierarchy_id);
4740                if (root != &cgrp_dfl_root)
4741                        for_each_subsys(ss, ssid)
4742                                if (root->subsys_mask & (1 << ssid))
4743                                        seq_printf(m, "%s%s", count++ ? "," : "",
4744                                                   ss->legacy_name);
4745                if (strlen(root->name))
4746                        seq_printf(m, "%sname=%s", count ? "," : "",
4747                                   root->name);
4748                seq_putc(m, ':');
4749
4750                cgrp = task_cgroup_from_root(tsk, root);
4751
4752                /*
4753                 * On traditional hierarchies, all zombie tasks show up as
4754                 * belonging to the root cgroup.  On the default hierarchy,
4755                 * while a zombie doesn't show up in "cgroup.procs" and
4756                 * thus can't be migrated, its /proc/PID/cgroup keeps
4757                 * reporting the cgroup it belonged to before exiting.  If
4758                 * the cgroup is removed before the zombie is reaped,
4759                 * " (deleted)" is appended to the cgroup path.
4760                 */
4761                if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
4762                        retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
4763                                                current->nsproxy->cgroup_ns);
4764                        if (retval >= PATH_MAX)
4765                                retval = -ENAMETOOLONG;
4766                        if (retval < 0)
4767                                goto out_unlock;
4768
4769                        seq_puts(m, buf);
4770                } else {
4771                        seq_puts(m, "/");
4772                }
4773
4774                if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
4775                        seq_puts(m, " (deleted)\n");
4776                else
4777                        seq_putc(m, '\n');
4778        }
4779
4780        retval = 0;
4781out_unlock:
4782        spin_unlock_irq(&css_set_lock);
4783        mutex_unlock(&cgroup_mutex);
4784        kfree(buf);
4785out:
4786        return retval;
4787}
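
/*
 * Illustrative output (values are made up): each line printed above is
 * "<hierarchy-id>:<controllers>[,name=<name>]:<path>", one per hierarchy,
 * e.g.
 *
 *	7:cpu,cpuacct:/user.slice
 *	1:name=systemd:/user.slice/user-1000.slice/session-2.scope
 *	0::/user.slice/user-1000.slice/session-2.scope
 *
 * "0" is the default (v2) hierarchy; " (deleted)" is appended there when a
 * zombie's cgroup has already been removed.
 */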
4788
4789/**
4790 * cgroup_fork - initialize cgroup related fields during copy_process()
4791 * @child: pointer to task_struct of the child (newly forked) process.
4792 *
4793 * A task is associated with the init_css_set until cgroup_post_fork()
4794 * attaches it to the parent's css_set.  Empty cg_list indicates that
4795 * @child isn't holding a reference to its css_set.
4796 */
4797void cgroup_fork(struct task_struct *child)
4798{
4799        RCU_INIT_POINTER(child->cgroups, &init_css_set);
4800        INIT_LIST_HEAD(&child->cg_list);
4801}
4802
4803/**
4804 * cgroup_can_fork - called on a new task before the process is exposed
4805 * @child: the task in question.
4806 *
4807 * This calls the subsystem can_fork() callbacks. If the can_fork() callback
4808 * returns an error, the fork aborts with that error code. This allows
4809 * a cgroup subsystem to conditionally allow or deny new forks.
4810 */
4811int cgroup_can_fork(struct task_struct *child)
4812{
4813        struct cgroup_subsys *ss;
4814        int i, j, ret;
4815
4816        do_each_subsys_mask(ss, i, have_canfork_callback) {
4817                ret = ss->can_fork(child);
4818                if (ret)
4819                        goto out_revert;
4820        } while_each_subsys_mask();
4821
4822        return 0;
4823
4824out_revert:
4825        for_each_subsys(ss, j) {
4826                if (j >= i)
4827                        break;
4828                if (ss->cancel_fork)
4829                        ss->cancel_fork(child);
4830        }
4831
4832        return ret;
4833}
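
/*
 * Illustrative sketch (the demo_* helpers and demo_cgrp_id are hypothetical,
 * in the spirit of the pids controller): a ->can_fork()/->cancel_fork() pair
 * as invoked by the loop above.  A non-zero return from ->can_fork() aborts
 * the fork with that error; ->cancel_fork() undoes the charge if a later
 * step of the fork fails.
 *
 *	static int demo_can_fork(struct task_struct *task)
 *	{
 *		if (!demo_try_charge(task_css(task, demo_cgrp_id)))
 *			return -EAGAIN;
 *		return 0;
 *	}
 *
 *	static void demo_cancel_fork(struct task_struct *task)
 *	{
 *		demo_uncharge(task_css(task, demo_cgrp_id));
 *	}
 */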
4834
4835/**
4836 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
4837 * @child: the task in question
4838 *
4839 * This calls the cancel_fork() callbacks if a fork failed *after*
4840 * cgroup_can_fork() succeeded.
4841 */
4842void cgroup_cancel_fork(struct task_struct *child)
4843{
4844        struct cgroup_subsys *ss;
4845        int i;
4846
4847        for_each_subsys(ss, i)
4848                if (ss->cancel_fork)
4849                        ss->cancel_fork(child);
4850}
4851
4852/**
4853 * cgroup_post_fork - called on a new task after adding it to the task list
4854 * @child: the task in question
4855 *
4856 * Adds the task to the list running through its css_set if necessary and
4857 * calls the subsystem fork() callbacks.  This has to happen after the
4858 * task is visible on the task list in case we race with the first call to
4859 * css_task_iter_start() - to guarantee that the new task ends up on its
4860 * list.
4861 */
4862void cgroup_post_fork(struct task_struct *child)
4863{
4864        struct cgroup_subsys *ss;
4865        int i;
4866
4867        /*
4868         * This may race against cgroup_enable_task_cg_lists().  As that
4869         * function sets use_task_css_set_links before grabbing
4870         * tasklist_lock and we just went through tasklist_lock to add
4871         * @child, it's guaranteed that either we see the set
4872         * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
4873         * @child during its iteration.
4874         *
4875         * If we won the race, @child is associated with %current's
4876         * css_set.  Grabbing css_set_lock guarantees both that the
4877         * association is stable, and, on completion of the parent's
4878         * migration, @child is visible in the source of migration or
4879         * already in the destination cgroup.  This guarantee is necessary
4880         * when implementing operations which need to migrate all tasks of
4881         * a cgroup to another.
4882         *
4883         * Note that if we lose to cgroup_enable_task_cg_lists(), @child
4884         * will remain in init_css_set.  This is safe because all tasks are
4885         * in the init_css_set before cg_links is enabled and there's no
4886         * operation which transfers all tasks out of init_css_set.
4887         */
4888        if (use_task_css_set_links) {
4889                struct css_set *cset;
4890
4891                spin_lock_irq(&css_set_lock);
4892                cset = task_css_set(current);
4893                if (list_empty(&child->cg_list)) {
4894                        get_css_set(cset);
4895                        cset->nr_tasks++;
4896                        css_set_move_task(child, NULL, cset, false);
4897                }
4898                spin_unlock_irq(&css_set_lock);
4899        }
4900
4901        /*
4902         * Call ss->fork().  This must happen after @child is linked on
4903         * css_set; otherwise, @child might change state between ->fork()
4904         * and addition to css_set.
4905         */
4906        do_each_subsys_mask(ss, i, have_fork_callback) {
4907                ss->fork(child);
4908        } while_each_subsys_mask();
4909}
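
/*
 * Illustrative call order (a sketch of how copy_process() uses the hooks
 * above; the error label is schematic):
 *
 *	cgroup_fork(p);			p points at init_css_set
 *	...
 *	retval = cgroup_can_fork(p);	controllers may veto the fork
 *	if (retval)
 *		goto bad_fork;
 *	...				p is made visible on the tasklist
 *	cgroup_post_fork(p);		p joins its css_set, ->fork() runs
 *	...
 * bad_fork:
 *	cgroup_cancel_fork(p);		undo ->can_fork() side effects
 */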
4910
4911/**
4912 * cgroup_exit - detach cgroup from exiting task
4913 * @tsk: pointer to task_struct of exiting process
4914 *
4915 * Description: Detach cgroup from @tsk and release it.
4916 *
4917 * Note that cgroups marked notify_on_release force every task in
4918 * them to take the global cgroup_mutex mutex when exiting.
4919 * This could impact scaling on very large systems.  Be reluctant to
4920 * use notify_on_release cgroups where very high task exit scaling
4921 * is required on large systems.
4922 *
4923 * We set the exiting task's cgroup to the root cgroup (top_cgroup).  We
4924 * call cgroup_exit() while the task is still competent to handle
4925 * notify_on_release(), then leave the task attached to the root cgroup in
4926 * each hierarchy for the remainder of its exit.  No need to bother with
4927 * init_css_set refcnting.  init_css_set never goes away and we can't race
4928 * with migration path - PF_EXITING is visible to migration path.
4929 */
4930void cgroup_exit(struct task_struct *tsk)
4931{
4932        struct cgroup_subsys *ss;
4933        struct css_set *cset;
4934        int i;
4935
4936        /*
4937         * Unlink @tsk from its css_set.  As the migration path can't race
4938         * with us, we can check css_set and cg_list without synchronization.
4939         */
4940        cset = task_css_set(tsk);
4941
4942        if (!list_empty(&tsk->cg_list)) {
4943                spin_lock_irq(&css_set_lock);
4944                css_set_move_task(tsk, cset, NULL, false);
4945                cset->nr_tasks--;
4946                spin_unlock_irq(&css_set_lock);
4947        } else {
4948                get_css_set(cset);
4949        }
4950
4951        /* see cgroup_post_fork() for details */
4952        do_each_subsys_mask(ss, i, have_exit_callback) {
4953                ss->exit(tsk);
4954        } while_each_subsys_mask();
4955}
4956
4957void cgroup_free(struct task_struct *task)
4958{
4959        struct css_set *cset = task_css_set(task);
4960        struct cgroup_subsys *ss;
4961        int ssid;
4962
4963        do_each_subsys_mask(ss, ssid, have_free_callback) {
4964                ss->free(task);
4965        } while_each_subsys_mask();
4966
4967        put_css_set(cset);
4968}
4969
4970static int __init cgroup_disable(char *str)
4971{
4972        struct cgroup_subsys *ss;
4973        char *token;
4974        int i;
4975
4976        while ((token = strsep(&str, ",")) != NULL) {
4977                if (!*token)
4978                        continue;
4979
4980                for_each_subsys(ss, i) {
4981                        if (strcmp(token, ss->name) &&
4982                            strcmp(token, ss->legacy_name))
4983                                continue;
4984                        cgroup_disable_mask |= 1 << i;
4985                }
4986        }
4987        return 1;
4988}
4989__setup("cgroup_disable=", cgroup_disable);
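
/*
 * Illustrative boot parameter (controller names are examples): subsystems
 * can be turned off on the kernel command line by either their v2 or their
 * legacy name, e.g.
 *
 *	cgroup_disable=memory,cpuset
 *
 * which sets the corresponding bits in cgroup_disable_mask before
 * cgroup_init() consults it.
 */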
4990
4991/**
4992 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
4993 * @dentry: directory dentry of interest
4994 * @ss: subsystem of interest
4995 *
4996 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
4997 * to get the corresponding css and return it.  If such css doesn't exist
4998 * or can't be pinned, an ERR_PTR value is returned.
4999 */
5000struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
5001                                                       struct cgroup_subsys *ss)
5002{
5003        struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
5004        struct file_system_type *s_type = dentry->d_sb->s_type;
5005        struct cgroup_subsys_state *css = NULL;
5006        struct cgroup *cgrp;
5007
5008        /* is @dentry a cgroup dir? */
5009        if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
5010            !kn || kernfs_type(kn) != KERNFS_DIR)
5011                return ERR_PTR(-EBADF);
5012
5013        rcu_read_lock();
5014
5015        /*
5016         * This path doesn't originate from kernfs and @kn could already
5017         * have been or be removed at any point.  @kn->priv is RCU
5018         * protected for this access.  See css_release_work_fn() for details.
5019         */
5020        cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
5021        if (cgrp)
5022                css = cgroup_css(cgrp, ss);
5023
5024        if (!css || !css_tryget_online(css))
5025                css = ERR_PTR(-ENOENT);
5026
5027        rcu_read_unlock();
5028        return css;
5029}
5030
5031/**
5032 * css_from_id - lookup css by id
5033 * @id: the cgroup id
5034 * @ss: cgroup subsys to be looked into
5035 *
5036 * Returns the css if there's valid one with @id, otherwise returns NULL.
5037 * Should be called under rcu_read_lock().
5038 */
5039struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5040{
5041        WARN_ON_ONCE(!rcu_read_lock_held());
5042        return idr_find(&ss->css_idr, id);
5043}
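
/*
 * Illustrative usage (a sketch; memory_cgrp_subsys stands in for any
 * controller, and pinning is only needed if the css must outlive the RCU
 * section):
 *
 *	rcu_read_lock();
 *	css = css_from_id(id, &memory_cgrp_subsys);
 *	if (css && !css_tryget_online(css))
 *		css = NULL;
 *	rcu_read_unlock();
 */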
5044
5045/**
5046 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
5047 * @path: path on the default hierarchy
5048 *
5049 * Find the cgroup at @path on the default hierarchy, increment its
5050 * reference count and return it.  Returns pointer to the found cgroup on
5051 * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
5052 * if @path points to a non-directory.
5053 */
5054struct cgroup *cgroup_get_from_path(const char *path)
5055{
5056        struct kernfs_node *kn;
5057        struct cgroup *cgrp;
5058
5059        mutex_lock(&cgroup_mutex);
5060
5061        kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
5062        if (kn) {
5063                if (kernfs_type(kn) == KERNFS_DIR) {
5064                        cgrp = kn->priv;
5065                        cgroup_get_live(cgrp);
5066                } else {
5067                        cgrp = ERR_PTR(-ENOTDIR);
5068                }
5069                kernfs_put(kn);
5070        } else {
5071                cgrp = ERR_PTR(-ENOENT);
5072        }
5073
5074        mutex_unlock(&cgroup_mutex);
5075        return cgrp;
5076}
5077EXPORT_SYMBOL_GPL(cgroup_get_from_path);
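
/*
 * Illustrative usage (the path is made up): the returned cgroup is pinned
 * and must be released with cgroup_put() once the caller is done with it.
 *
 *	cgrp = cgroup_get_from_path("/my.slice/my.scope");
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);
 */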
5078
5079/**
5080 * cgroup_get_from_fd - get a cgroup pointer from a fd
5081 * @fd: fd obtained by open(cgroup2_dir)
5082 *
5083 * Find the cgroup from a fd which should be obtained
5084 * by opening a cgroup directory.  Returns a pointer to the
5085 * cgroup on success. ERR_PTR is returned if the cgroup
5086 * cannot be found.
5087 */
5088struct cgroup *cgroup_get_from_fd(int fd)
5089{
5090        struct cgroup_subsys_state *css;
5091        struct cgroup *cgrp;
5092        struct file *f;
5093
5094        f = fget_raw(fd);
5095        if (!f)
5096                return ERR_PTR(-EBADF);
5097
5098        css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
5099        fput(f);
5100        if (IS_ERR(css))
5101                return ERR_CAST(css);
5102
5103        cgrp = css->cgroup;
5104        if (!cgroup_on_dfl(cgrp)) {
5105                cgroup_put(cgrp);
5106                return ERR_PTR(-EBADF);
5107        }
5108
5109        return cgrp;
5110}
5111EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
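
/*
 * Illustrative usage (a sketch; @fd is assumed to have been handed in from
 * userspace, e.g. an open(2) of a cgroup2 directory passed to a syscall):
 *
 *	cgrp = cgroup_get_from_fd(fd);
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);
 */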
5112
5113/*
5114 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
5115 * definition in cgroup-defs.h.
5116 */
5117#ifdef CONFIG_SOCK_CGROUP_DATA
5118
5119#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
5120
5121DEFINE_SPINLOCK(cgroup_sk_update_lock);
5122static bool cgroup_sk_alloc_disabled __read_mostly;
5123
5124void cgroup_sk_alloc_disable(void)
5125{
5126        if (cgroup_sk_alloc_disabled)
5127                return;
5128        pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
5129        cgroup_sk_alloc_disabled = true;
5130}
5131
5132#else
5133
5134#define cgroup_sk_alloc_disabled        false
5135
5136#endif
5137
5138void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
5139{
5140        if (cgroup_sk_alloc_disabled)
5141                return;
5142
5143        /* Socket clone path */
5144        if (skcd->val) {
5145                /*
5146                 * We might be cloning a socket which is left in an empty
5147                 * cgroup and the cgroup might have already been rmdir'd.
5148                 * Don't use cgroup_get_live().
5149                 */
5150                cgroup_get(sock_cgroup_ptr(skcd));
5151                return;
5152        }
5153
5154        rcu_read_lock();
5155
5156        while (true) {
5157                struct css_set *cset;
5158
5159                cset = task_css_set(current);
5160                if (likely(cgroup_tryget(cset->dfl_cgrp))) {
5161                        skcd->val = (unsigned long)cset->dfl_cgrp;
5162                        break;
5163                }
5164                cpu_relax();
5165        }
5166
5167        rcu_read_unlock();
5168}
5169
5170void cgroup_sk_free(struct sock_cgroup_data *skcd)
5171{
5172        cgroup_put(sock_cgroup_ptr(skcd));
5173}
5174
5175#endif  /* CONFIG_SOCK_CGROUP_DATA */
5176
5177#ifdef CONFIG_CGROUP_BPF
5178int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
5179                      enum bpf_attach_type type, bool overridable)
5180{
5181        struct cgroup *parent = cgroup_parent(cgrp);
5182        int ret;
5183
5184        mutex_lock(&cgroup_mutex);
5185        ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
5186        mutex_unlock(&cgroup_mutex);
5187        return ret;
5188}
5189#endif /* CONFIG_CGROUP_BPF */
5190