linux/block/blk-cgroup.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Common Block IO controller cgroup interface
   4 *
   5 * Based on ideas and code from CFQ, CFS and BFQ:
   6 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   7 *
   8 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
   9 *                    Paolo Valente <paolo.valente@unimore.it>
  10 *
  11 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
  12 *                    Nauman Rafique <nauman@google.com>
  13 *
  14 * For policy-specific per-blkcg data:
  15 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
  16 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
  17 */
  18#include <linux/ioprio.h>
  19#include <linux/kdev_t.h>
  20#include <linux/module.h>
  21#include <linux/sched/signal.h>
  22#include <linux/err.h>
  23#include <linux/blkdev.h>
  24#include <linux/backing-dev.h>
  25#include <linux/slab.h>
  26#include <linux/genhd.h>
  27#include <linux/delay.h>
  28#include <linux/atomic.h>
  29#include <linux/ctype.h>
  30#include <linux/blk-cgroup.h>
  31#include <linux/tracehook.h>
  32#include "blk.h"
  33
  34#define MAX_KEY_LEN 100
  35
  36/*
  37 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
  38 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
  39 * policy [un]register operations including cgroup file additions /
  40 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
  41 * allows grabbing it from cgroup callbacks.
  42 */
  43static DEFINE_MUTEX(blkcg_pol_register_mutex);
  44static DEFINE_MUTEX(blkcg_pol_mutex);
  45
  46struct blkcg blkcg_root;
  47EXPORT_SYMBOL_GPL(blkcg_root);
  48
  49struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
  50
  51static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  52
  53static LIST_HEAD(all_blkcgs);           /* protected by blkcg_pol_mutex */
  54
  55static bool blkcg_debug_stats = false;
  56
  57static bool blkcg_policy_enabled(struct request_queue *q,
  58                                 const struct blkcg_policy *pol)
  59{
  60        return pol && test_bit(pol->plid, q->blkcg_pols);
  61}
  62
  63/**
  64 * blkg_free - free a blkg
  65 * @blkg: blkg to free
  66 *
  67 * Free @blkg which may be partially allocated.
  68 */
  69static void blkg_free(struct blkcg_gq *blkg)
  70{
  71        int i;
  72
  73        if (!blkg)
  74                return;
  75
  76        for (i = 0; i < BLKCG_MAX_POLS; i++)
  77                if (blkg->pd[i])
  78                        blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
  79
  80        blkg_rwstat_exit(&blkg->stat_ios);
  81        blkg_rwstat_exit(&blkg->stat_bytes);
  82        kfree(blkg);
  83}
  84
  85static void __blkg_release(struct rcu_head *rcu)
  86{
  87        struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
  88
  89        percpu_ref_exit(&blkg->refcnt);
  90
  91        /* release the blkcg and parent blkg refs this blkg has been holding */
  92        css_put(&blkg->blkcg->css);
  93        if (blkg->parent)
  94                blkg_put(blkg->parent);
  95
  96        wb_congested_put(blkg->wb_congested);
  97
  98        blkg_free(blkg);
  99}
 100
 101/*
 102 * A group is RCU protected, but having an rcu lock does not mean that one
 103 * can access all the fields of blkg and assume these are valid.  For
 104 * example, don't try to follow throtl_data and request queue links.
 105 *
  106 * Holding an RCU reference to a blkg only allows access to values local
  107 * to the group, such as group stats and group rate limits.
 108 */
 109static void blkg_release(struct percpu_ref *ref)
 110{
 111        struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
 112
 113        call_rcu(&blkg->rcu_head, __blkg_release);
 114}
 115
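/*
 * Example (illustrative sketch, not part of the original file): as the
 * comment above blkg_release() notes, the RCU read lock alone only makes
 * group-local fields safe to read.  blkg_lookup() and the use_delay
 * counter are existing interfaces; what is done with the value is left
 * open here.
 *
 *        rcu_read_lock();
 *        blkg = blkg_lookup(blkcg, q);
 *        if (blkg)
 *                delayed = atomic_read(&blkg->use_delay) > 0;
 *        rcu_read_unlock();
 */
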
 116/**
 117 * blkg_alloc - allocate a blkg
 118 * @blkcg: block cgroup the new blkg is associated with
 119 * @q: request_queue the new blkg is associated with
 120 * @gfp_mask: allocation mask to use
 121 *
  122 * Allocate a new blkg associating @blkcg and @q.
 123 */
 124static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 125                                   gfp_t gfp_mask)
 126{
 127        struct blkcg_gq *blkg;
 128        int i;
 129
 130        /* alloc and init base part */
 131        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
 132        if (!blkg)
 133                return NULL;
 134
 135        if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
 136            blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
 137                goto err_free;
 138
 139        blkg->q = q;
 140        INIT_LIST_HEAD(&blkg->q_node);
 141        blkg->blkcg = blkcg;
 142
 143        for (i = 0; i < BLKCG_MAX_POLS; i++) {
 144                struct blkcg_policy *pol = blkcg_policy[i];
 145                struct blkg_policy_data *pd;
 146
 147                if (!blkcg_policy_enabled(q, pol))
 148                        continue;
 149
 150                /* alloc per-policy data and attach it to blkg */
 151                pd = pol->pd_alloc_fn(gfp_mask, q->node);
 152                if (!pd)
 153                        goto err_free;
 154
 155                blkg->pd[i] = pd;
 156                pd->blkg = blkg;
 157                pd->plid = i;
 158        }
 159
 160        return blkg;
 161
 162err_free:
 163        blkg_free(blkg);
 164        return NULL;
 165}
 166
 167struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 168                                      struct request_queue *q, bool update_hint)
 169{
 170        struct blkcg_gq *blkg;
 171
 172        /*
 173         * Hint didn't match.  Look up from the radix tree.  Note that the
 174         * hint can only be updated under queue_lock as otherwise @blkg
 175         * could have already been removed from blkg_tree.  The caller is
 176         * responsible for grabbing queue_lock if @update_hint.
 177         */
 178        blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
 179        if (blkg && blkg->q == q) {
 180                if (update_hint) {
 181                        lockdep_assert_held(&q->queue_lock);
 182                        rcu_assign_pointer(blkcg->blkg_hint, blkg);
 183                }
 184                return blkg;
 185        }
 186
 187        return NULL;
 188}
 189EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
 190
 191/*
 192 * If @new_blkg is %NULL, this function tries to allocate a new one as
 193 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 194 */
 195static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 196                                    struct request_queue *q,
 197                                    struct blkcg_gq *new_blkg)
 198{
 199        struct blkcg_gq *blkg;
 200        struct bdi_writeback_congested *wb_congested;
 201        int i, ret;
 202
 203        WARN_ON_ONCE(!rcu_read_lock_held());
 204        lockdep_assert_held(&q->queue_lock);
 205
 206        /* request_queue is dying, do not create/recreate a blkg */
 207        if (blk_queue_dying(q)) {
 208                ret = -ENODEV;
 209                goto err_free_blkg;
 210        }
 211
 212        /* blkg holds a reference to blkcg */
 213        if (!css_tryget_online(&blkcg->css)) {
 214                ret = -ENODEV;
 215                goto err_free_blkg;
 216        }
 217
 218        wb_congested = wb_congested_get_create(q->backing_dev_info,
 219                                               blkcg->css.id,
 220                                               GFP_NOWAIT | __GFP_NOWARN);
 221        if (!wb_congested) {
 222                ret = -ENOMEM;
 223                goto err_put_css;
 224        }
 225
 226        /* allocate */
 227        if (!new_blkg) {
 228                new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
 229                if (unlikely(!new_blkg)) {
 230                        ret = -ENOMEM;
 231                        goto err_put_congested;
 232                }
 233        }
 234        blkg = new_blkg;
 235        blkg->wb_congested = wb_congested;
 236
 237        /* link parent */
 238        if (blkcg_parent(blkcg)) {
 239                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 240                if (WARN_ON_ONCE(!blkg->parent)) {
 241                        ret = -ENODEV;
 242                        goto err_put_congested;
 243                }
 244                blkg_get(blkg->parent);
 245        }
 246
 247        ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
 248                              GFP_NOWAIT | __GFP_NOWARN);
 249        if (ret)
 250                goto err_cancel_ref;
 251
 252        /* invoke per-policy init */
 253        for (i = 0; i < BLKCG_MAX_POLS; i++) {
 254                struct blkcg_policy *pol = blkcg_policy[i];
 255
 256                if (blkg->pd[i] && pol->pd_init_fn)
 257                        pol->pd_init_fn(blkg->pd[i]);
 258        }
 259
 260        /* insert */
 261        spin_lock(&blkcg->lock);
 262        ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
 263        if (likely(!ret)) {
 264                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 265                list_add(&blkg->q_node, &q->blkg_list);
 266
 267                for (i = 0; i < BLKCG_MAX_POLS; i++) {
 268                        struct blkcg_policy *pol = blkcg_policy[i];
 269
 270                        if (blkg->pd[i] && pol->pd_online_fn)
 271                                pol->pd_online_fn(blkg->pd[i]);
 272                }
 273        }
 274        blkg->online = true;
 275        spin_unlock(&blkcg->lock);
 276
 277        if (!ret)
 278                return blkg;
 279
  280        /* @blkg failed to be fully initialized, use the usual release path */
 281        blkg_put(blkg);
 282        return ERR_PTR(ret);
 283
 284err_cancel_ref:
 285        percpu_ref_exit(&blkg->refcnt);
 286err_put_congested:
 287        wb_congested_put(wb_congested);
 288err_put_css:
 289        css_put(&blkcg->css);
 290err_free_blkg:
 291        blkg_free(new_blkg);
 292        return ERR_PTR(ret);
 293}
 294
 295/**
 296 * __blkg_lookup_create - lookup blkg, try to create one if not there
 297 * @blkcg: blkcg of interest
 298 * @q: request_queue of interest
 299 *
 300 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 301 * create one.  blkg creation is performed recursively from blkcg_root such
 302 * that all non-root blkg's have access to the parent blkg.  This function
 303 * should be called under RCU read lock and @q->queue_lock.
 304 *
 305 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 306 * down from root.
 307 */
 308struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 309                                      struct request_queue *q)
 310{
 311        struct blkcg_gq *blkg;
 312
 313        WARN_ON_ONCE(!rcu_read_lock_held());
 314        lockdep_assert_held(&q->queue_lock);
 315
 316        blkg = __blkg_lookup(blkcg, q, true);
 317        if (blkg)
 318                return blkg;
 319
 320        /*
 321         * Create blkgs walking down from blkcg_root to @blkcg, so that all
 322         * non-root blkgs have access to their parents.  Returns the closest
 323         * blkg to the intended blkg should blkg_create() fail.
 324         */
 325        while (true) {
 326                struct blkcg *pos = blkcg;
 327                struct blkcg *parent = blkcg_parent(blkcg);
 328                struct blkcg_gq *ret_blkg = q->root_blkg;
 329
 330                while (parent) {
 331                        blkg = __blkg_lookup(parent, q, false);
 332                        if (blkg) {
 333                                /* remember closest blkg */
 334                                ret_blkg = blkg;
 335                                break;
 336                        }
 337                        pos = parent;
 338                        parent = blkcg_parent(parent);
 339                }
 340
 341                blkg = blkg_create(pos, q, NULL);
 342                if (IS_ERR(blkg))
 343                        return ret_blkg;
 344                if (pos == blkcg)
 345                        return blkg;
 346        }
 347}
 348
 349/**
 350 * blkg_lookup_create - find or create a blkg
 351 * @blkcg: target block cgroup
 352 * @q: target request_queue
 353 *
 354 * This looks up or creates the blkg representing the unique pair
 355 * of the blkcg and the request_queue.
 356 */
 357struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 358                                    struct request_queue *q)
 359{
 360        struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
 361
 362        if (unlikely(!blkg)) {
 363                unsigned long flags;
 364
 365                spin_lock_irqsave(&q->queue_lock, flags);
 366                blkg = __blkg_lookup_create(blkcg, q);
 367                spin_unlock_irqrestore(&q->queue_lock, flags);
 368        }
 369
 370        return blkg;
 371}
 372
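/*
 * Example (illustrative sketch, not part of the original file): a typical
 * IO-path caller resolves the current task's blkcg under RCU and relies on
 * blkg_lookup_create() falling back towards q->root_blkg if creation
 * fails.  What gets charged to the blkg is left open here.
 *
 *        rcu_read_lock();
 *        blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
 *        blkg = blkg_lookup_create(blkcg, q);
 *        if (blkg)
 *                ... account the IO against blkg ...
 *        rcu_read_unlock();
 */
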
 373static void blkg_destroy(struct blkcg_gq *blkg)
 374{
 375        struct blkcg *blkcg = blkg->blkcg;
 376        struct blkcg_gq *parent = blkg->parent;
 377        int i;
 378
 379        lockdep_assert_held(&blkg->q->queue_lock);
 380        lockdep_assert_held(&blkcg->lock);
 381
  382        /* Something is wrong if we are trying to remove the same group twice */
 383        WARN_ON_ONCE(list_empty(&blkg->q_node));
 384        WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 385
 386        for (i = 0; i < BLKCG_MAX_POLS; i++) {
 387                struct blkcg_policy *pol = blkcg_policy[i];
 388
 389                if (blkg->pd[i] && pol->pd_offline_fn)
 390                        pol->pd_offline_fn(blkg->pd[i]);
 391        }
 392
 393        if (parent) {
 394                blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
 395                blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
 396        }
 397
 398        blkg->online = false;
 399
 400        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
 401        list_del_init(&blkg->q_node);
 402        hlist_del_init_rcu(&blkg->blkcg_node);
 403
 404        /*
  405         * Both setting the lookup hint to @blkg and clearing it are done
 406         * under queue_lock.  If it's not pointing to @blkg now, it never
 407         * will.  Hint assignment itself can race safely.
 408         */
 409        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
 410                rcu_assign_pointer(blkcg->blkg_hint, NULL);
 411
 412        /*
 413         * Put the reference taken at the time of creation so that when all
  414         * queues are gone, the group can be destroyed.
 415         */
 416        percpu_ref_kill(&blkg->refcnt);
 417}
 418
 419/**
 420 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 421 * @q: request_queue of interest
 422 *
 423 * Destroy all blkgs associated with @q.
 424 */
 425static void blkg_destroy_all(struct request_queue *q)
 426{
 427        struct blkcg_gq *blkg, *n;
 428
 429        spin_lock_irq(&q->queue_lock);
 430        list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 431                struct blkcg *blkcg = blkg->blkcg;
 432
 433                spin_lock(&blkcg->lock);
 434                blkg_destroy(blkg);
 435                spin_unlock(&blkcg->lock);
 436        }
 437
 438        q->root_blkg = NULL;
 439        spin_unlock_irq(&q->queue_lock);
 440}
 441
 442static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 443                             struct cftype *cftype, u64 val)
 444{
 445        struct blkcg *blkcg = css_to_blkcg(css);
 446        struct blkcg_gq *blkg;
 447        int i;
 448
 449        mutex_lock(&blkcg_pol_mutex);
 450        spin_lock_irq(&blkcg->lock);
 451
 452        /*
 453         * Note that stat reset is racy - it doesn't synchronize against
 454         * stat updates.  This is a debug feature which shouldn't exist
 455         * anyway.  If you get hit by a race, retry.
 456         */
 457        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
 458                blkg_rwstat_reset(&blkg->stat_bytes);
 459                blkg_rwstat_reset(&blkg->stat_ios);
 460
 461                for (i = 0; i < BLKCG_MAX_POLS; i++) {
 462                        struct blkcg_policy *pol = blkcg_policy[i];
 463
 464                        if (blkg->pd[i] && pol->pd_reset_stats_fn)
 465                                pol->pd_reset_stats_fn(blkg->pd[i]);
 466                }
 467        }
 468
 469        spin_unlock_irq(&blkcg->lock);
 470        mutex_unlock(&blkcg_pol_mutex);
 471        return 0;
 472}
 473
 474const char *blkg_dev_name(struct blkcg_gq *blkg)
 475{
 476        /* some drivers (floppy) instantiate a queue w/o disk registered */
 477        if (blkg->q->backing_dev_info->dev)
 478                return dev_name(blkg->q->backing_dev_info->dev);
 479        return NULL;
 480}
 481
 482/**
 483 * blkcg_print_blkgs - helper for printing per-blkg data
 484 * @sf: seq_file to print to
 485 * @blkcg: blkcg of interest
 486 * @prfill: fill function to print out a blkg
 487 * @pol: policy in question
 488 * @data: data to be passed to @prfill
  490 * @show_total: whether to print the sum of @prfill return values
 490 *
 491 * This function invokes @prfill on each blkg of @blkcg if pd for the
 492 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 493 * policy data and @data and the matching queue lock held.  If @show_total
 494 * is %true, the sum of the return values from @prfill is printed with
 495 * "Total" label at the end.
 496 *
  497 * This is to be used to construct print functions for the
  498 * cftype->seq_show method.
 499 */
 500void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 501                       u64 (*prfill)(struct seq_file *,
 502                                     struct blkg_policy_data *, int),
 503                       const struct blkcg_policy *pol, int data,
 504                       bool show_total)
 505{
 506        struct blkcg_gq *blkg;
 507        u64 total = 0;
 508
 509        rcu_read_lock();
 510        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
 511                spin_lock_irq(&blkg->q->queue_lock);
 512                if (blkcg_policy_enabled(blkg->q, pol))
 513                        total += prfill(sf, blkg->pd[pol->plid], data);
 514                spin_unlock_irq(&blkg->q->queue_lock);
 515        }
 516        rcu_read_unlock();
 517
 518        if (show_total)
 519                seq_printf(sf, "Total %llu\n", (unsigned long long)total);
 520}
 521EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
 522
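/*
 * Example (illustrative sketch, not part of the original file): a policy's
 * cftype ->seq_show typically wraps blkcg_print_blkgs() with a prfill
 * callback.  "example_pol", "struct example_pd" and its "weight" field are
 * hypothetical; the helpers are the ones defined in this file.
 *
 *        static u64 example_prfill(struct seq_file *sf,
 *                                  struct blkg_policy_data *pd, int off)
 *        {
 *                struct example_pd *epd = container_of(pd, struct example_pd, pd);
 *
 *                return __blkg_prfill_u64(sf, pd, epd->weight);
 *        }
 *
 *        static int example_show(struct seq_file *sf, void *v)
 *        {
 *                blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), example_prfill,
 *                                  &example_pol, 0, false);
 *                return 0;
 *        }
 */
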
 523/**
 524 * __blkg_prfill_u64 - prfill helper for a single u64 value
 525 * @sf: seq_file to print to
 526 * @pd: policy private data of interest
 527 * @v: value to print
 528 *
  529 * Print @v to @sf for the device associated with @pd.
 530 */
 531u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
 532{
 533        const char *dname = blkg_dev_name(pd->blkg);
 534
 535        if (!dname)
 536                return 0;
 537
 538        seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
 539        return v;
 540}
 541EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
 542
 543/**
 544 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 545 * @sf: seq_file to print to
 546 * @pd: policy private data of interest
 547 * @rwstat: rwstat to print
 548 *
  549 * Print @rwstat to @sf for the device associated with @pd.
 550 */
 551u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 552                         const struct blkg_rwstat *rwstat)
 553{
 554        static const char *rwstr[] = {
 555                [BLKG_RWSTAT_READ]      = "Read",
 556                [BLKG_RWSTAT_WRITE]     = "Write",
 557                [BLKG_RWSTAT_SYNC]      = "Sync",
 558                [BLKG_RWSTAT_ASYNC]     = "Async",
 559                [BLKG_RWSTAT_DISCARD]   = "Discard",
 560        };
 561        const char *dname = blkg_dev_name(pd->blkg);
 562        u64 v;
 563        int i;
 564
 565        if (!dname)
 566                return 0;
 567
 568        for (i = 0; i < BLKG_RWSTAT_NR; i++)
 569                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
 570                           (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
 571
 572        v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
 573                atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
 574                atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
 575        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
 576        return v;
 577}
 578EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
 579
 580/**
 581 * blkg_prfill_stat - prfill callback for blkg_stat
 582 * @sf: seq_file to print to
 583 * @pd: policy private data of interest
 584 * @off: offset to the blkg_stat in @pd
 585 *
 586 * prfill callback for printing a blkg_stat.
 587 */
 588u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
 589{
 590        return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
 591}
 592EXPORT_SYMBOL_GPL(blkg_prfill_stat);
 593
 594/**
 595 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 596 * @sf: seq_file to print to
 597 * @pd: policy private data of interest
 598 * @off: offset to the blkg_rwstat in @pd
 599 *
 600 * prfill callback for printing a blkg_rwstat.
 601 */
 602u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 603                       int off)
 604{
 605        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
 606
 607        return __blkg_prfill_rwstat(sf, pd, &rwstat);
 608}
 609EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 610
 611static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
 612                                    struct blkg_policy_data *pd, int off)
 613{
 614        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
 615
 616        return __blkg_prfill_rwstat(sf, pd, &rwstat);
 617}
 618
 619/**
 620 * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
 621 * @sf: seq_file to print to
 622 * @v: unused
 623 *
 624 * To be used as cftype->seq_show to print blkg->stat_bytes.
 625 * cftype->private must be set to the blkcg_policy.
 626 */
 627int blkg_print_stat_bytes(struct seq_file *sf, void *v)
 628{
 629        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 630                          blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
 631                          offsetof(struct blkcg_gq, stat_bytes), true);
 632        return 0;
 633}
 634EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
 635
 636/**
  637 * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
 638 * @sf: seq_file to print to
 639 * @v: unused
 640 *
 641 * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
 642 * must be set to the blkcg_policy.
 643 */
 644int blkg_print_stat_ios(struct seq_file *sf, void *v)
 645{
 646        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 647                          blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
 648                          offsetof(struct blkcg_gq, stat_ios), true);
 649        return 0;
 650}
 651EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
 652
 653static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
 654                                              struct blkg_policy_data *pd,
 655                                              int off)
 656{
 657        struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
 658                                                              NULL, off);
 659        return __blkg_prfill_rwstat(sf, pd, &rwstat);
 660}
 661
 662/**
 663 * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
 664 * @sf: seq_file to print to
 665 * @v: unused
 666 */
 667int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
 668{
 669        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 670                          blkg_prfill_rwstat_field_recursive,
 671                          (void *)seq_cft(sf)->private,
 672                          offsetof(struct blkcg_gq, stat_bytes), true);
 673        return 0;
 674}
 675EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
 676
 677/**
 678 * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
 679 * @sf: seq_file to print to
 680 * @v: unused
 681 */
 682int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
 683{
 684        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 685                          blkg_prfill_rwstat_field_recursive,
 686                          (void *)seq_cft(sf)->private,
 687                          offsetof(struct blkcg_gq, stat_ios), true);
 688        return 0;
 689}
 690EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
 691
 692/**
 693 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
 694 * @blkg: blkg of interest
 695 * @pol: blkcg_policy which contains the blkg_stat
 696 * @off: offset to the blkg_stat in blkg_policy_data or @blkg
 697 *
 698 * Collect the blkg_stat specified by @blkg, @pol and @off and all its
 699 * online descendants and their aux counts.  The caller must be holding the
 700 * queue lock for online tests.
 701 *
 702 * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
 703 * at @off bytes into @blkg's blkg_policy_data of the policy.
 704 */
 705u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
 706                            struct blkcg_policy *pol, int off)
 707{
 708        struct blkcg_gq *pos_blkg;
 709        struct cgroup_subsys_state *pos_css;
 710        u64 sum = 0;
 711
 712        lockdep_assert_held(&blkg->q->queue_lock);
 713
 714        rcu_read_lock();
 715        blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
 716                struct blkg_stat *stat;
 717
 718                if (!pos_blkg->online)
 719                        continue;
 720
 721                if (pol)
 722                        stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
 723                else
 724                        stat = (void *)blkg + off;
 725
 726                sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
 727        }
 728        rcu_read_unlock();
 729
 730        return sum;
 731}
 732EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
 733
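/*
 * Example (illustrative sketch, not part of the original file): a policy
 * can print a hierarchical total by summing from a prfill callback, since
 * blkcg_print_blkgs() already holds the matching queue lock.  "example_pol"
 * is hypothetical; pd_to_blkg() comes from blk-cgroup.h.
 *
 *        static u64 example_prfill_recursive(struct seq_file *sf,
 *                                            struct blkg_policy_data *pd, int off)
 *        {
 *                u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
 *                                                  &example_pol, off);
 *
 *                return __blkg_prfill_u64(sf, pd, sum);
 *        }
 */
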
 734/**
 735 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
 736 * @blkg: blkg of interest
 737 * @pol: blkcg_policy which contains the blkg_rwstat
 738 * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
 739 *
 740 * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
 741 * online descendants and their aux counts.  The caller must be holding the
 742 * queue lock for online tests.
 743 *
 744 * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
 745 * is at @off bytes into @blkg's blkg_policy_data of the policy.
 746 */
 747struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
 748                                             struct blkcg_policy *pol, int off)
 749{
 750        struct blkcg_gq *pos_blkg;
 751        struct cgroup_subsys_state *pos_css;
 752        struct blkg_rwstat sum = { };
 753        int i;
 754
 755        lockdep_assert_held(&blkg->q->queue_lock);
 756
 757        rcu_read_lock();
 758        blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
 759                struct blkg_rwstat *rwstat;
 760
 761                if (!pos_blkg->online)
 762                        continue;
 763
 764                if (pol)
 765                        rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
 766                else
 767                        rwstat = (void *)pos_blkg + off;
 768
 769                for (i = 0; i < BLKG_RWSTAT_NR; i++)
 770                        atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
 771                                percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
 772                                &sum.aux_cnt[i]);
 773        }
 774        rcu_read_unlock();
 775
 776        return sum;
 777}
 778EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
 779
 780/* Performs queue bypass and policy enabled checks then looks up blkg. */
 781static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
 782                                          const struct blkcg_policy *pol,
 783                                          struct request_queue *q)
 784{
 785        WARN_ON_ONCE(!rcu_read_lock_held());
 786        lockdep_assert_held(&q->queue_lock);
 787
 788        if (!blkcg_policy_enabled(q, pol))
 789                return ERR_PTR(-EOPNOTSUPP);
 790        return __blkg_lookup(blkcg, q, true /* update_hint */);
 791}
 792
 793/**
 794 * blkg_conf_prep - parse and prepare for per-blkg config update
 795 * @blkcg: target block cgroup
 796 * @pol: target policy
 797 * @input: input string
 798 * @ctx: blkg_conf_ctx to be filled
 799 *
 800 * Parse per-blkg config update from @input and initialize @ctx with the
 801 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 802 * part of @input following MAJ:MIN.  This function returns with RCU read
 803 * lock and queue lock held and must be paired with blkg_conf_finish().
 804 */
 805int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 806                   char *input, struct blkg_conf_ctx *ctx)
 807        __acquires(rcu) __acquires(&disk->queue->queue_lock)
 808{
 809        struct gendisk *disk;
 810        struct request_queue *q;
 811        struct blkcg_gq *blkg;
 812        unsigned int major, minor;
 813        int key_len, part, ret;
 814        char *body;
 815
 816        if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
 817                return -EINVAL;
 818
 819        body = input + key_len;
 820        if (!isspace(*body))
 821                return -EINVAL;
 822        body = skip_spaces(body);
 823
 824        disk = get_gendisk(MKDEV(major, minor), &part);
 825        if (!disk)
 826                return -ENODEV;
 827        if (part) {
 828                ret = -ENODEV;
 829                goto fail;
 830        }
 831
 832        q = disk->queue;
 833
 834        rcu_read_lock();
 835        spin_lock_irq(&q->queue_lock);
 836
 837        blkg = blkg_lookup_check(blkcg, pol, q);
 838        if (IS_ERR(blkg)) {
 839                ret = PTR_ERR(blkg);
 840                goto fail_unlock;
 841        }
 842
 843        if (blkg)
 844                goto success;
 845
 846        /*
 847         * Create blkgs walking down from blkcg_root to @blkcg, so that all
 848         * non-root blkgs have access to their parents.
 849         */
 850        while (true) {
 851                struct blkcg *pos = blkcg;
 852                struct blkcg *parent;
 853                struct blkcg_gq *new_blkg;
 854
 855                parent = blkcg_parent(blkcg);
 856                while (parent && !__blkg_lookup(parent, q, false)) {
 857                        pos = parent;
 858                        parent = blkcg_parent(parent);
 859                }
 860
 861                /* Drop locks to do new blkg allocation with GFP_KERNEL. */
 862                spin_unlock_irq(&q->queue_lock);
 863                rcu_read_unlock();
 864
 865                new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
 866                if (unlikely(!new_blkg)) {
 867                        ret = -ENOMEM;
 868                        goto fail;
 869                }
 870
 871                rcu_read_lock();
 872                spin_lock_irq(&q->queue_lock);
 873
 874                blkg = blkg_lookup_check(pos, pol, q);
 875                if (IS_ERR(blkg)) {
 876                        ret = PTR_ERR(blkg);
 877                        goto fail_unlock;
 878                }
 879
 880                if (blkg) {
 881                        blkg_free(new_blkg);
 882                } else {
 883                        blkg = blkg_create(pos, q, new_blkg);
 884                        if (IS_ERR(blkg)) {
 885                                ret = PTR_ERR(blkg);
 886                                goto fail_unlock;
 887                        }
 888                }
 889
 890                if (pos == blkcg)
 891                        goto success;
 892        }
 893success:
 894        ctx->disk = disk;
 895        ctx->blkg = blkg;
 896        ctx->body = body;
 897        return 0;
 898
 899fail_unlock:
 900        spin_unlock_irq(&q->queue_lock);
 901        rcu_read_unlock();
 902fail:
 903        put_disk_and_module(disk);
 904        /*
 905         * If queue was bypassing, we should retry.  Do so after a
  906         * short msleep().  It isn't strictly necessary but the queue
 907         * can be bypassing for some time and it's always nice to
 908         * avoid busy looping.
 909         */
 910        if (ret == -EBUSY) {
 911                msleep(10);
 912                ret = restart_syscall();
 913        }
 914        return ret;
 915}
 916
 917/**
 918 * blkg_conf_finish - finish up per-blkg config update
  919 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 920 *
 921 * Finish up after per-blkg config update.  This function must be paired
 922 * with blkg_conf_prep().
 923 */
 924void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 925        __releases(&ctx->disk->queue->queue_lock) __releases(rcu)
 926{
 927        spin_unlock_irq(&ctx->disk->queue->queue_lock);
 928        rcu_read_unlock();
 929        put_disk_and_module(ctx->disk);
 930}
 931
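/*
 * Example (illustrative sketch, not part of the original file): a policy's
 * cftype ->write handler pairs blkg_conf_prep() with blkg_conf_finish().
 * "example_pol" and what is done with the parsed value are hypothetical;
 * the prep/finish calls and blkg_to_pd() are existing interfaces.
 *
 *        static ssize_t example_write(struct kernfs_open_file *of, char *buf,
 *                                     size_t nbytes, loff_t off)
 *        {
 *                struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *                struct blkg_conf_ctx ctx;
 *                u64 val;
 *                int ret;
 *
 *                ret = blkg_conf_prep(blkcg, &example_pol, buf, &ctx);
 *                if (ret)
 *                        return ret;
 *
 *                ret = kstrtou64(ctx.body, 0, &val);
 *                if (!ret)
 *                        ... apply val to blkg_to_pd(ctx.blkg, &example_pol) ...
 *
 *                blkg_conf_finish(&ctx);
 *                return ret ?: nbytes;
 *        }
 */
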
 932static int blkcg_print_stat(struct seq_file *sf, void *v)
 933{
 934        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
 935        struct blkcg_gq *blkg;
 936
 937        rcu_read_lock();
 938
 939        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
 940                const char *dname;
 941                char *buf;
 942                struct blkg_rwstat rwstat;
 943                u64 rbytes, wbytes, rios, wios, dbytes, dios;
 944                size_t size = seq_get_buf(sf, &buf), off = 0;
 945                int i;
 946                bool has_stats = false;
 947
 948                dname = blkg_dev_name(blkg);
 949                if (!dname)
 950                        continue;
 951
 952                /*
 953                 * Hooray string manipulation, count is the size written NOT
 954                 * INCLUDING THE \0, so size is now count+1 less than what we
 955                 * had before, but we want to start writing the next bit from
 956                 * the \0 so we only add count to buf.
 957                 */
 958                off += scnprintf(buf+off, size-off, "%s ", dname);
 959
 960                spin_lock_irq(&blkg->q->queue_lock);
 961
 962                rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
 963                                        offsetof(struct blkcg_gq, stat_bytes));
 964                rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
 965                wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
 966                dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
 967
 968                rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
 969                                        offsetof(struct blkcg_gq, stat_ios));
 970                rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
 971                wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
 972                dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
 973
 974                spin_unlock_irq(&blkg->q->queue_lock);
 975
 976                if (rbytes || wbytes || rios || wios) {
 977                        has_stats = true;
 978                        off += scnprintf(buf+off, size-off,
 979                                         "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
 980                                         rbytes, wbytes, rios, wios,
 981                                         dbytes, dios);
 982                }
 983
 984                if (!blkcg_debug_stats)
 985                        goto next;
 986
 987                if (atomic_read(&blkg->use_delay)) {
 988                        has_stats = true;
 989                        off += scnprintf(buf+off, size-off,
 990                                         " use_delay=%d delay_nsec=%llu",
 991                                         atomic_read(&blkg->use_delay),
 992                                        (unsigned long long)atomic64_read(&blkg->delay_nsec));
 993                }
 994
 995                for (i = 0; i < BLKCG_MAX_POLS; i++) {
 996                        struct blkcg_policy *pol = blkcg_policy[i];
 997                        size_t written;
 998
 999                        if (!blkg->pd[i] || !pol->pd_stat_fn)
1000                                continue;
1001
1002                        written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
1003                        if (written)
1004                                has_stats = true;
1005                        off += written;
1006                }
1007next:
1008                if (has_stats) {
1009                        off += scnprintf(buf+off, size-off, "\n");
1010                        seq_commit(sf, off);
1011                }
1012        }
1013
1014        rcu_read_unlock();
1015        return 0;
1016}
1017
1018static struct cftype blkcg_files[] = {
1019        {
1020                .name = "stat",
1021                .flags = CFTYPE_NOT_ON_ROOT,
1022                .seq_show = blkcg_print_stat,
1023        },
1024        { }     /* terminate */
1025};
1026
1027static struct cftype blkcg_legacy_files[] = {
1028        {
1029                .name = "reset_stats",
1030                .write_u64 = blkcg_reset_stats,
1031        },
1032        { }     /* terminate */
1033};
1034
1035/*
1036 * blkcg destruction is a three-stage process.
1037 *
1038 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
1039 *    which offlines writeback.  Here we tie the next stage of blkg destruction
1040 *    to the completion of writeback associated with the blkcg.  This lets us
1041 *    avoid punting potentially large amounts of outstanding writeback to root
1042 *    while maintaining any ongoing policies.  The next stage is triggered when
1043 *    the nr_cgwbs count goes to zero.
1044 *
1045 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
1046 *    and handles the destruction of blkgs.  Here the css reference held by
1047 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
1048 *    This work may occur in cgwb_release_workfn() on the cgwb_release
1049 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
1050 *    punted to the root_blkg.
1051 *
1052 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
1053 *    This finally frees the blkcg.
1054 */
1055
1056/**
1057 * blkcg_css_offline - cgroup css_offline callback
1058 * @css: css of interest
1059 *
1060 * This function is called when @css is about to go away.  Here the cgwbs are
1061 * offlined first and only once writeback associated with the blkcg has
1062 * finished do we start step 2 (see above).
1063 */
1064static void blkcg_css_offline(struct cgroup_subsys_state *css)
1065{
1066        struct blkcg *blkcg = css_to_blkcg(css);
1067
1068        /* this prevents anyone from attaching or migrating to this blkcg */
1069        wb_blkcg_offline(blkcg);
1070
1071        /* put the base cgwb reference allowing step 2 to be triggered */
1072        blkcg_cgwb_put(blkcg);
1073}
1074
1075/**
1076 * blkcg_destroy_blkgs - responsible for shooting down blkgs
1077 * @blkcg: blkcg of interest
1078 *
1079 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
1080 * is nested inside q lock, this function performs reverse double lock dancing.
1081 * Destroying the blkgs releases the reference held on the blkcg's css allowing
1082 * blkcg_css_free to eventually be called.
1083 *
1084 * This is the blkcg counterpart of ioc_release_fn().
1085 */
1086void blkcg_destroy_blkgs(struct blkcg *blkcg)
1087{
1088        spin_lock_irq(&blkcg->lock);
1089
1090        while (!hlist_empty(&blkcg->blkg_list)) {
1091                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
1092                                                struct blkcg_gq, blkcg_node);
1093                struct request_queue *q = blkg->q;
1094
1095                if (spin_trylock(&q->queue_lock)) {
1096                        blkg_destroy(blkg);
1097                        spin_unlock(&q->queue_lock);
1098                } else {
1099                        spin_unlock_irq(&blkcg->lock);
1100                        cpu_relax();
1101                        spin_lock_irq(&blkcg->lock);
1102                }
1103        }
1104
1105        spin_unlock_irq(&blkcg->lock);
1106}
1107
1108static void blkcg_css_free(struct cgroup_subsys_state *css)
1109{
1110        struct blkcg *blkcg = css_to_blkcg(css);
1111        int i;
1112
1113        mutex_lock(&blkcg_pol_mutex);
1114
1115        list_del(&blkcg->all_blkcgs_node);
1116
1117        for (i = 0; i < BLKCG_MAX_POLS; i++)
1118                if (blkcg->cpd[i])
1119                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
1120
1121        mutex_unlock(&blkcg_pol_mutex);
1122
1123        kfree(blkcg);
1124}
1125
1126static struct cgroup_subsys_state *
1127blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
1128{
1129        struct blkcg *blkcg;
1130        struct cgroup_subsys_state *ret;
1131        int i;
1132
1133        mutex_lock(&blkcg_pol_mutex);
1134
1135        if (!parent_css) {
1136                blkcg = &blkcg_root;
1137        } else {
1138                blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1139                if (!blkcg) {
1140                        ret = ERR_PTR(-ENOMEM);
1141                        goto unlock;
1142                }
1143        }
1144
1145        for (i = 0; i < BLKCG_MAX_POLS ; i++) {
1146                struct blkcg_policy *pol = blkcg_policy[i];
1147                struct blkcg_policy_data *cpd;
1148
1149                /*
 1150                 * If the policy hasn't been registered yet, skip it; its
 1151                 * per-cgroup data is allocated at registration time. Otherwise,
1152                 * check if the policy requires any specific per-cgroup
1153                 * data: if it does, allocate and initialize it.
1154                 */
1155                if (!pol || !pol->cpd_alloc_fn)
1156                        continue;
1157
1158                cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1159                if (!cpd) {
1160                        ret = ERR_PTR(-ENOMEM);
1161                        goto free_pd_blkcg;
1162                }
1163                blkcg->cpd[i] = cpd;
1164                cpd->blkcg = blkcg;
1165                cpd->plid = i;
1166                if (pol->cpd_init_fn)
1167                        pol->cpd_init_fn(cpd);
1168        }
1169
1170        spin_lock_init(&blkcg->lock);
1171        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
1172        INIT_HLIST_HEAD(&blkcg->blkg_list);
1173#ifdef CONFIG_CGROUP_WRITEBACK
1174        INIT_LIST_HEAD(&blkcg->cgwb_list);
1175        refcount_set(&blkcg->cgwb_refcnt, 1);
1176#endif
1177        list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
1178
1179        mutex_unlock(&blkcg_pol_mutex);
1180        return &blkcg->css;
1181
1182free_pd_blkcg:
1183        for (i--; i >= 0; i--)
1184                if (blkcg->cpd[i])
1185                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
1186
1187        if (blkcg != &blkcg_root)
1188                kfree(blkcg);
1189unlock:
1190        mutex_unlock(&blkcg_pol_mutex);
1191        return ret;
1192}
1193
1194/**
1195 * blkcg_init_queue - initialize blkcg part of request queue
1196 * @q: request_queue to initialize
1197 *
1198 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1199 * part of new request_queue @q.
1200 *
1201 * RETURNS:
1202 * 0 on success, -errno on failure.
1203 */
1204int blkcg_init_queue(struct request_queue *q)
1205{
1206        struct blkcg_gq *new_blkg, *blkg;
1207        bool preloaded;
1208        int ret;
1209
1210        new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
1211        if (!new_blkg)
1212                return -ENOMEM;
1213
1214        preloaded = !radix_tree_preload(GFP_KERNEL);
1215
1216        /* Make sure the root blkg exists. */
1217        rcu_read_lock();
1218        spin_lock_irq(&q->queue_lock);
1219        blkg = blkg_create(&blkcg_root, q, new_blkg);
1220        if (IS_ERR(blkg))
1221                goto err_unlock;
1222        q->root_blkg = blkg;
1223        spin_unlock_irq(&q->queue_lock);
1224        rcu_read_unlock();
1225
1226        if (preloaded)
1227                radix_tree_preload_end();
1228
1229        ret = blk_iolatency_init(q);
1230        if (ret)
1231                goto err_destroy_all;
1232
1233        ret = blk_throtl_init(q);
1234        if (ret)
1235                goto err_destroy_all;
1236        return 0;
1237
1238err_destroy_all:
1239        blkg_destroy_all(q);
1240        return ret;
1241err_unlock:
1242        spin_unlock_irq(&q->queue_lock);
1243        rcu_read_unlock();
1244        if (preloaded)
1245                radix_tree_preload_end();
1246        return PTR_ERR(blkg);
1247}
1248
1249/**
1250 * blkcg_drain_queue - drain blkcg part of request_queue
1251 * @q: request_queue to drain
1252 *
1253 * Called from blk_drain_queue().  Responsible for draining blkcg part.
1254 */
1255void blkcg_drain_queue(struct request_queue *q)
1256{
1257        lockdep_assert_held(&q->queue_lock);
1258
1259        /*
1260         * @q could be exiting and already have destroyed all blkgs as
1261         * indicated by NULL root_blkg.  If so, don't confuse policies.
1262         */
1263        if (!q->root_blkg)
1264                return;
1265
1266        blk_throtl_drain(q);
1267}
1268
1269/**
1270 * blkcg_exit_queue - exit and release blkcg part of request_queue
1271 * @q: request_queue being released
1272 *
1273 * Called from blk_exit_queue().  Responsible for exiting blkcg part.
1274 */
1275void blkcg_exit_queue(struct request_queue *q)
1276{
1277        blkg_destroy_all(q);
1278        blk_throtl_exit(q);
1279}
1280
1281/*
 1282 * We cannot support shared io contexts, as we have no means to support
1283 * two tasks with the same ioc in two different groups without major rework
1284 * of the main cic data structures.  For now we allow a task to change
1285 * its cgroup only if it's the only owner of its ioc.
1286 */
1287static int blkcg_can_attach(struct cgroup_taskset *tset)
1288{
1289        struct task_struct *task;
1290        struct cgroup_subsys_state *dst_css;
1291        struct io_context *ioc;
1292        int ret = 0;
1293
1294        /* task_lock() is needed to avoid races with exit_io_context() */
1295        cgroup_taskset_for_each(task, dst_css, tset) {
1296                task_lock(task);
1297                ioc = task->io_context;
1298                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1299                        ret = -EINVAL;
1300                task_unlock(task);
1301                if (ret)
1302                        break;
1303        }
1304        return ret;
1305}
1306
1307static void blkcg_bind(struct cgroup_subsys_state *root_css)
1308{
1309        int i;
1310
1311        mutex_lock(&blkcg_pol_mutex);
1312
1313        for (i = 0; i < BLKCG_MAX_POLS; i++) {
1314                struct blkcg_policy *pol = blkcg_policy[i];
1315                struct blkcg *blkcg;
1316
1317                if (!pol || !pol->cpd_bind_fn)
1318                        continue;
1319
1320                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
1321                        if (blkcg->cpd[pol->plid])
1322                                pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
1323        }
1324        mutex_unlock(&blkcg_pol_mutex);
1325}
1326
1327static void blkcg_exit(struct task_struct *tsk)
1328{
1329        if (tsk->throttle_queue)
1330                blk_put_queue(tsk->throttle_queue);
1331        tsk->throttle_queue = NULL;
1332}
1333
1334struct cgroup_subsys io_cgrp_subsys = {
1335        .css_alloc = blkcg_css_alloc,
1336        .css_offline = blkcg_css_offline,
1337        .css_free = blkcg_css_free,
1338        .can_attach = blkcg_can_attach,
1339        .bind = blkcg_bind,
1340        .dfl_cftypes = blkcg_files,
1341        .legacy_cftypes = blkcg_legacy_files,
1342        .legacy_name = "blkio",
1343        .exit = blkcg_exit,
1344#ifdef CONFIG_MEMCG
1345        /*
1346         * This ensures that, if available, memcg is automatically enabled
1347         * together on the default hierarchy so that the owner cgroup can
1348         * be retrieved from writeback pages.
1349         */
1350        .depends_on = 1 << memory_cgrp_id,
1351#endif
1352};
1353EXPORT_SYMBOL_GPL(io_cgrp_subsys);
1354
1355/**
1356 * blkcg_activate_policy - activate a blkcg policy on a request_queue
1357 * @q: request_queue of interest
1358 * @pol: blkcg policy to activate
1359 *
1360 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
1361 * bypass mode to populate its blkgs with policy_data for @pol.
1362 *
1363 * Activation happens with @q bypassed, so nobody would be accessing blkgs
1364 * from IO path.  Update of each blkg is protected by both queue and blkcg
1365 * locks so that holding either lock and testing blkcg_policy_enabled() is
1366 * always enough for dereferencing policy data.
1367 *
1368 * The caller is responsible for synchronizing [de]activations and policy
 1369 * [un]registrations.  Returns 0 on success, -errno on failure.
1370 */
1371int blkcg_activate_policy(struct request_queue *q,
1372                          const struct blkcg_policy *pol)
1373{
1374        struct blkg_policy_data *pd_prealloc = NULL;
1375        struct blkcg_gq *blkg;
1376        int ret;
1377
1378        if (blkcg_policy_enabled(q, pol))
1379                return 0;
1380
1381        if (queue_is_mq(q))
1382                blk_mq_freeze_queue(q);
1383pd_prealloc:
1384        if (!pd_prealloc) {
1385                pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
1386                if (!pd_prealloc) {
1387                        ret = -ENOMEM;
1388                        goto out_bypass_end;
1389                }
1390        }
1391
1392        spin_lock_irq(&q->queue_lock);
1393
1394        list_for_each_entry(blkg, &q->blkg_list, q_node) {
1395                struct blkg_policy_data *pd;
1396
1397                if (blkg->pd[pol->plid])
1398                        continue;
1399
1400                pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
1401                if (!pd)
1402                        swap(pd, pd_prealloc);
1403                if (!pd) {
1404                        spin_unlock_irq(&q->queue_lock);
1405                        goto pd_prealloc;
1406                }
1407
1408                blkg->pd[pol->plid] = pd;
1409                pd->blkg = blkg;
1410                pd->plid = pol->plid;
1411                if (pol->pd_init_fn)
1412                        pol->pd_init_fn(pd);
1413        }
1414
1415        __set_bit(pol->plid, q->blkcg_pols);
1416        ret = 0;
1417
1418        spin_unlock_irq(&q->queue_lock);
1419out_bypass_end:
1420        if (queue_is_mq(q))
1421                blk_mq_unfreeze_queue(q);
1422        if (pd_prealloc)
1423                pol->pd_free_fn(pd_prealloc);
1424        return ret;
1425}
1426EXPORT_SYMBOL_GPL(blkcg_activate_policy);
1427
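/*
 * Example (illustrative sketch, not part of the original file): a policy
 * normally activates itself on a queue when it starts managing it and
 * deactivates it again on teardown, e.g. from an elevator's init/exit
 * path.  "example_pol" is hypothetical.
 *
 *        ret = blkcg_activate_policy(q, &example_pol);
 *        if (ret)
 *                return ret;
 *        ...
 *        blkcg_deactivate_policy(q, &example_pol);
 */
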
1428/**
1429 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
1430 * @q: request_queue of interest
1431 * @pol: blkcg policy to deactivate
1432 *
1433 * Deactivate @pol on @q.  Follows the same synchronization rules as
1434 * blkcg_activate_policy().
1435 */
1436void blkcg_deactivate_policy(struct request_queue *q,
1437                             const struct blkcg_policy *pol)
1438{
1439        struct blkcg_gq *blkg;
1440
1441        if (!blkcg_policy_enabled(q, pol))
1442                return;
1443
1444        if (queue_is_mq(q))
1445                blk_mq_freeze_queue(q);
1446
1447        spin_lock_irq(&q->queue_lock);
1448
1449        __clear_bit(pol->plid, q->blkcg_pols);
1450
1451        list_for_each_entry(blkg, &q->blkg_list, q_node) {
1452                if (blkg->pd[pol->plid]) {
1453                        if (pol->pd_offline_fn)
1454                                pol->pd_offline_fn(blkg->pd[pol->plid]);
1455                        pol->pd_free_fn(blkg->pd[pol->plid]);
1456                        blkg->pd[pol->plid] = NULL;
1457                }
1458        }
1459
1460        spin_unlock_irq(&q->queue_lock);
1461
1462        if (queue_is_mq(q))
1463                blk_mq_unfreeze_queue(q);
1464}
1465EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
1466
1467/**
1468 * blkcg_policy_register - register a blkcg policy
1469 * @pol: blkcg policy to register
1470 *
1471 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
1472 * successful registration.  Returns 0 on success and -errno on failure.
1473 */
1474int blkcg_policy_register(struct blkcg_policy *pol)
1475{
1476        struct blkcg *blkcg;
1477        int i, ret;
1478
1479        mutex_lock(&blkcg_pol_register_mutex);
1480        mutex_lock(&blkcg_pol_mutex);
1481
1482        /* find an empty slot */
1483        ret = -ENOSPC;
1484        for (i = 0; i < BLKCG_MAX_POLS; i++)
1485                if (!blkcg_policy[i])
1486                        break;
1487        if (i >= BLKCG_MAX_POLS) {
1488                pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
1489                goto err_unlock;
1490        }
1491
 1492        /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn come in pairs */
1493        if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
1494                (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
1495                goto err_unlock;
1496
1497        /* register @pol */
1498        pol->plid = i;
1499        blkcg_policy[pol->plid] = pol;
1500
1501        /* allocate and install cpd's */
1502        if (pol->cpd_alloc_fn) {
1503                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1504                        struct blkcg_policy_data *cpd;
1505
1506                        cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1507                        if (!cpd)
1508                                goto err_free_cpds;
1509
1510                        blkcg->cpd[pol->plid] = cpd;
1511                        cpd->blkcg = blkcg;
1512                        cpd->plid = pol->plid;
1513                        pol->cpd_init_fn(cpd);
1514                }
1515        }
1516
1517        mutex_unlock(&blkcg_pol_mutex);
1518
1519        /* everything is in place, add intf files for the new policy */
1520        if (pol->dfl_cftypes)
1521                WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
1522                                               pol->dfl_cftypes));
1523        if (pol->legacy_cftypes)
1524                WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
1525                                                  pol->legacy_cftypes));
1526        mutex_unlock(&blkcg_pol_register_mutex);
1527        return 0;
1528
1529err_free_cpds:
1530        if (pol->cpd_free_fn) {
1531                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1532                        if (blkcg->cpd[pol->plid]) {
1533                                pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1534                                blkcg->cpd[pol->plid] = NULL;
1535                        }
1536                }
1537        }
1538        blkcg_policy[pol->plid] = NULL;
1539err_unlock:
1540        mutex_unlock(&blkcg_pol_mutex);
1541        mutex_unlock(&blkcg_pol_register_mutex);
1542        return ret;
1543}
1544EXPORT_SYMBOL_GPL(blkcg_policy_register);
1545
1546/**
1547 * blkcg_policy_unregister - unregister a blkcg policy
1548 * @pol: blkcg policy to unregister
1549 *
1550 * Undo blkcg_policy_register(@pol).  Might sleep.
1551 */
1552void blkcg_policy_unregister(struct blkcg_policy *pol)
1553{
1554        struct blkcg *blkcg;
1555
1556        mutex_lock(&blkcg_pol_register_mutex);
1557
1558        if (WARN_ON(blkcg_policy[pol->plid] != pol))
1559                goto out_unlock;
1560
1561        /* kill the intf files first */
1562        if (pol->dfl_cftypes)
1563                cgroup_rm_cftypes(pol->dfl_cftypes);
1564        if (pol->legacy_cftypes)
1565                cgroup_rm_cftypes(pol->legacy_cftypes);
1566
1567        /* remove cpds and unregister */
1568        mutex_lock(&blkcg_pol_mutex);
1569
1570        if (pol->cpd_free_fn) {
1571                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1572                        if (blkcg->cpd[pol->plid]) {
1573                                pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1574                                blkcg->cpd[pol->plid] = NULL;
1575                        }
1576                }
1577        }
1578        blkcg_policy[pol->plid] = NULL;
1579
1580        mutex_unlock(&blkcg_pol_mutex);
1581out_unlock:
1582        mutex_unlock(&blkcg_pol_register_mutex);
1583}
1584EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
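
/*
 * Example sketch (hypothetical, continuing the "foo" policy above): the
 * unregistration call mirrors registration from the module exit path; by the
 * time it returns, the policy's interface files are gone and every cpd it
 * allocated has been freed.
 *
 *      static void __exit foo_exit(void)
 *      {
 *              blkcg_policy_unregister(&blkcg_policy_foo);
 *      }
 *      module_exit(foo_exit);
 */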
1585
1586/*
1587 * Scale the accumulated delay based on how long it has been since we last
1588 * updated the delay.  We call this when we are adding delay, in case it has
1589 * been a while since delay was last added, and when we are checking whether
1590 * we need to delay a task, to account for any delay that has built up since.
1591 */
1592static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
1593{
1594        u64 old = atomic64_read(&blkg->delay_start);
1595
1596        /*
1597         * We only want to scale down once per second.  The idea is that we
1598         * want to delay tasks for min(delay_nsec, NSEC_PER_SEC) within a
1599         * given time window, and only for delay that has occurred recently,
1600         * in 1 second windows, since that is the longest a task can be
1601         * throttled for.  We save the current delay window in
1602         * blkg->last_delay so we know how much is still left to be charged
1603         * to the blkg from this point onward.  blkg->last_use tracks the
1604         * use_delay counter.  The idea is that if the blkg is being
1605         * unthrottled, we are ok with whatever is happening now, and we can
1606         * take away more of the accumulated delay, since we have already
1607         * throttled enough that everybody is happy with their IO latencies.
1608         */
1609        if (time_before64(old + NSEC_PER_SEC, now) &&
1610            atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
1611                u64 cur = atomic64_read(&blkg->delay_nsec);
1612                u64 sub = min_t(u64, blkg->last_delay, now - old);
1613                int cur_use = atomic_read(&blkg->use_delay);
1614
1615                /*
1616                 * We've been unthrottled, subtract a larger chunk of our
1617                 * accumulated delay.
1618                 */
1619                if (cur_use < blkg->last_use)
1620                        sub = max_t(u64, sub, blkg->last_delay >> 1);
1621
1622                /*
1623                 * This shouldn't happen, but handle it anyway.  Our delay_nsec
1624                 * should only ever be growing except here where we subtract out
1625                 * min(last_delay, 1 second), but lord knows bugs happen and I'd
1626                 * rather not end up with negative numbers.
1627                 */
1628                if (unlikely(cur < sub)) {
1629                        atomic64_set(&blkg->delay_nsec, 0);
1630                        blkg->last_delay = 0;
1631                } else {
1632                        atomic64_sub(sub, &blkg->delay_nsec);
1633                        blkg->last_delay = cur - sub;
1634                }
1635                blkg->last_use = cur_use;
1636        }
1637}
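
/*
 * Worked example of the scaling above (the numbers are made up): suppose a
 * blkg has delay_nsec = 3 * NSEC_PER_SEC, last_delay = NSEC_PER_SEC,
 * use_delay is unchanged, and blkcg_scale_delay() runs two seconds after
 * delay_start was last updated.  Then:
 *
 *      sub        = min(last_delay, now - old) = min(1s, 2s) = 1s
 *      delay_nsec = 3s - 1s = 2s
 *      last_delay = 2s
 *
 * If use_delay had instead dropped since last time (the blkg was being
 * unthrottled), sub would be bumped to at least last_delay / 2, so the
 * accumulated delay drains faster once the controller stops asking for
 * throttling.
 */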
1638
1639/*
1640 * This is called when we want to actually walk up the hierarchy and check to
1641 * see if we need to throttle, and then actually throttle if there is some
1642 * accumulated delay.  This should only be called upon return to user space so
1643 * we're not holding some lock that would induce a priority inversion.
1644 */
1645static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1646{
1647        u64 now = ktime_to_ns(ktime_get());
1648        u64 exp;
1649        u64 delay_nsec = 0;
1650        int tok;
1651
1652        while (blkg->parent) {
1653                if (atomic_read(&blkg->use_delay)) {
1654                        blkcg_scale_delay(blkg, now);
1655                        delay_nsec = max_t(u64, delay_nsec,
1656                                           atomic64_read(&blkg->delay_nsec));
1657                }
1658                blkg = blkg->parent;
1659        }
1660
1661        if (!delay_nsec)
1662                return;
1663
1664        /*
1665         * Let's not sleep for all eternity if we've amassed a huge delay.
1666         * Swapping or metadata IO can accumulate 10's of seconds worth of
1667         * delay, and we want userspace to be able to do _something_, so cap
1668         * the delays at 250ms.  If there's 10's of seconds worth of delay then
1669         * the tasks will be delayed for 250ms for every syscall.
1670         */
1671        delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1672
1673        /*
1674         * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
1675         * that hasn't landed upstream yet.  Once that stuff is in place we need
1676         * to do a psi_memstall_enter/leave if memdelay is set.
1677         */
1678
1679        exp = ktime_add_ns(now, delay_nsec);
1680        tok = io_schedule_prepare();
1681        do {
1682                __set_current_state(TASK_KILLABLE);
1683                if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
1684                        break;
1685        } while (!fatal_signal_pending(current));
1686        io_schedule_finish(tok);
1687}
1688
1689/**
1690 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
1691 *
1692 * This is only called if we've been marked with set_notify_resume().  Obviously
1693 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
1694 * check to see if current->throttle_queue is set and if not this doesn't do
1695 * anything.  This should only ever be called by the resume code; it's not meant
1696 * to be called willy-nilly, as it will actually do the work to throttle the
1697 * task if it is set up for throttling.
1698 */
1699void blkcg_maybe_throttle_current(void)
1700{
1701        struct request_queue *q = current->throttle_queue;
1702        struct cgroup_subsys_state *css;
1703        struct blkcg *blkcg;
1704        struct blkcg_gq *blkg;
1705        bool use_memdelay = current->use_memdelay;
1706
1707        if (!q)
1708                return;
1709
1710        current->throttle_queue = NULL;
1711        current->use_memdelay = false;
1712
1713        rcu_read_lock();
1714        css = kthread_blkcg();
1715        if (css)
1716                blkcg = css_to_blkcg(css);
1717        else
1718                blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
1719
1720        if (!blkcg)
1721                goto out;
1722        blkg = blkg_lookup(blkcg, q);
1723        if (!blkg)
1724                goto out;
1725        if (!blkg_tryget(blkg))
1726                goto out;
1727        rcu_read_unlock();
1728
1729        blkcg_maybe_throttle_blkg(blkg, use_memdelay);
1730        blkg_put(blkg);
1731        blk_put_queue(q);
1732        return;
1733out:
1734        rcu_read_unlock();
1735        blk_put_queue(q);
1736}
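
/*
 * Call-path sketch (informational, simplified): set_notify_resume() in
 * blkcg_schedule_throttle() below sets TIF_NOTIFY_RESUME on the task, and the
 * architecture's return-to-user path then runs tracehook_notify_resume(),
 * which is what ends up calling the function above.  In essence:
 *
 *      blkcg_schedule_throttle(q, use_memdelay)
 *          set_notify_resume(current)          // flags TIF_NOTIFY_RESUME
 *          ... task heads back toward user space ...
 *      tracehook_notify_resume(regs)
 *          blkcg_maybe_throttle_current()
 *              blkcg_maybe_throttle_blkg()     // sleeps off the delay
 */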
1737
1738/**
1739 * blkcg_schedule_throttle - this task needs to check for throttling
1740 * @q: the request queue IO was submitted on
1741 * @use_memdelay: do we charge this to memory delay for PSI
1742 *
1743 * This is called by the IO controller when we know there's delay accumulated
1744 * for the blkg for this task.  We do not pass the blkg because there are
1745 * places we call this that may not have that information; the swapping code,
1746 * for instance, will only have a request_queue at that point.  This sets the
1747 * notify_resume for the task to check and see if it requires throttling before
1748 * returning to user space.
1749 *
1750 * We will only schedule once per syscall.  You can call this over and over
1751 * again and it will only do the check once upon return to user space, and only
1752 * throttle once.  If the task needs to be throttled again it'll need to be
1753 * set up again the next time we see the task.
1754 */
1755void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
1756{
1757        if (unlikely(current->flags & PF_KTHREAD))
1758                return;
1759
1760        if (!blk_get_queue(q))
1761                return;
1762
1763        if (current->throttle_queue)
1764                blk_put_queue(current->throttle_queue);
1765        current->throttle_queue = q;
1766        if (use_memdelay)
1767                current->use_memdelay = use_memdelay;
1768        set_notify_resume(current);
1769}
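
/*
 * Usage sketch (hypothetical, for illustration only): a caller that only has
 * a request_queue, such as the swap path mentioned above, can still request
 * throttling.  The helper name below is made up; blk_cgroup_congested()
 * (declared in include/linux/blk-cgroup.h) checks whether the current task's
 * blkcg hierarchy is currently marked as wanting delay:
 *
 *      static void foo_account_swap_in(struct request_queue *q)
 *      {
 *              if (blk_cgroup_congested())
 *                      blkcg_schedule_throttle(q, true);
 *      }
 */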
1770
1771/**
1772 * blkcg_add_delay - add delay to this blkg
1773 * @blkg: blkg of interest
1774 * @now: the current time in nanoseconds
1775 * @delta: how many nanoseconds of delay to add
1776 *
1777 * Charge @delta to the blkg's current delay accumulation.  This is used to
1778 * throttle tasks if an IO controller thinks we need more throttling.
1779 */
1780void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1781{
1782        blkcg_scale_delay(blkg, now);
1783        atomic64_add(delta, &blkg->delay_nsec);
1784}
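
/*
 * Usage sketch (hypothetical, for illustration only): an IO controller that
 * has a blkg in hand and decides the issuing task is over its latency budget
 * would typically mark the group with blkcg_use_delay(), charge some delay,
 * and then arm the check on return to user space.  The helper name and the
 * 2ms charge are made up:
 *
 *      static void foo_charge_latency_miss(struct blkcg_gq *blkg,
 *                                          struct request_queue *q)
 *      {
 *              u64 now = ktime_to_ns(ktime_get());
 *
 *              blkcg_use_delay(blkg);
 *              blkcg_add_delay(blkg, now, 2 * NSEC_PER_MSEC);
 *              blkcg_schedule_throttle(q, false);
 *      }
 */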
1785
1786module_param(blkcg_debug_stats, bool, 0644);
1787MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
1788