linux/block/blk-cgroup.c
   1/*
   2 * Common Block IO controller cgroup interface
   3 *
   4 * Based on ideas and code from CFQ, CFS and BFQ:
   5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   6 *
   7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
   8 *                    Paolo Valente <paolo.valente@unimore.it>
   9 *
  10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
  11 *                    Nauman Rafique <nauman@google.com>
  12 *
  13 * For policy-specific per-blkcg data:
  14 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
  15 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
  16 */
  17#include <linux/ioprio.h>
  18#include <linux/kdev_t.h>
  19#include <linux/module.h>
  20#include <linux/err.h>
  21#include <linux/blkdev.h>
  22#include <linux/backing-dev.h>
  23#include <linux/slab.h>
  24#include <linux/genhd.h>
  25#include <linux/delay.h>
  26#include <linux/atomic.h>
  27#include <linux/blk-cgroup.h>
  28#include "blk.h"
  29
  30#define MAX_KEY_LEN 100
  31
  32/*
  33 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
  34 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
  35 * policy [un]register operations including cgroup file additions /
  36 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
  37 * allows grabbing it from cgroup callbacks.
  38 */
  39static DEFINE_MUTEX(blkcg_pol_register_mutex);
  40static DEFINE_MUTEX(blkcg_pol_mutex);
  41
  42struct blkcg blkcg_root;
  43EXPORT_SYMBOL_GPL(blkcg_root);
  44
  45struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
  46
  47static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  48
  49static LIST_HEAD(all_blkcgs);           /* protected by blkcg_pol_mutex */
  50
  51static bool blkcg_policy_enabled(struct request_queue *q,
  52                                 const struct blkcg_policy *pol)
  53{
  54        return pol && test_bit(pol->plid, q->blkcg_pols);
  55}
  56
  57/**
  58 * blkg_free - free a blkg
  59 * @blkg: blkg to free
  60 *
  61 * Free @blkg which may be partially allocated.
  62 */
  63static void blkg_free(struct blkcg_gq *blkg)
  64{
  65        int i;
  66
  67        if (!blkg)
  68                return;
  69
  70        for (i = 0; i < BLKCG_MAX_POLS; i++)
  71                kfree(blkg->pd[i]);
  72
  73        blk_exit_rl(&blkg->rl);
  74        kfree(blkg);
  75}
  76
  77/**
  78 * blkg_alloc - allocate a blkg
  79 * @blkcg: block cgroup the new blkg is associated with
  80 * @q: request_queue the new blkg is associated with
  81 * @gfp_mask: allocation mask to use
  82 *
  83 * Allocate a new blkg associating @blkcg and @q.
  84 */
  85static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
  86                                   gfp_t gfp_mask)
  87{
  88        struct blkcg_gq *blkg;
  89        int i;
  90
  91        /* alloc and init base part */
  92        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
  93        if (!blkg)
  94                return NULL;
  95
  96        blkg->q = q;
  97        INIT_LIST_HEAD(&blkg->q_node);
  98        blkg->blkcg = blkcg;
  99        atomic_set(&blkg->refcnt, 1);
 100
 101        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
 102        if (blkcg != &blkcg_root) {
 103                if (blk_init_rl(&blkg->rl, q, gfp_mask))
 104                        goto err_free;
 105                blkg->rl.blkg = blkg;
 106        }
 107
 108        for (i = 0; i < BLKCG_MAX_POLS; i++) {
 109                struct blkcg_policy *pol = blkcg_policy[i];
 110                struct blkg_policy_data *pd;
 111
 112                if (!blkcg_policy_enabled(q, pol))
 113                        continue;
 114
 115                /* alloc per-policy data and attach it to blkg */
 116                pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
 117                if (!pd)
 118                        goto err_free;
 119
 120                blkg->pd[i] = pd;
 121                pd->blkg = blkg;
 122                pd->plid = i;
 123        }
 124
 125        return blkg;
 126
 127err_free:
 128        blkg_free(blkg);
 129        return NULL;
 130}
 131
 132/**
 133 * __blkg_lookup - internal version of blkg_lookup()
 134 * @blkcg: blkcg of interest
 135 * @q: request_queue of interest
 136 * @update_hint: whether to update lookup hint with the result or not
 137 *
 138 * This is the internal version and shouldn't be used by policy
 139 * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
 140 * @q's bypass state.  If @update_hint is %true, the caller should be
 141 * holding @q->queue_lock and lookup hint is updated on success.
 142 */
 143struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
 144                               bool update_hint)
 145{
 146        struct blkcg_gq *blkg;
 147
 148        blkg = rcu_dereference(blkcg->blkg_hint);
 149        if (blkg && blkg->q == q)
 150                return blkg;
 151
 152        /*
 153         * Hint didn't match.  Look up from the radix tree.  Note that the
 154         * hint can only be updated under queue_lock as otherwise @blkg
 155         * could have already been removed from blkg_tree.  The caller is
 156         * responsible for grabbing queue_lock if @update_hint.
 157         */
 158        blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
 159        if (blkg && blkg->q == q) {
 160                if (update_hint) {
 161                        lockdep_assert_held(q->queue_lock);
 162                        rcu_assign_pointer(blkcg->blkg_hint, blkg);
 163                }
 164                return blkg;
 165        }
 166
 167        return NULL;
 168}
 169
 170/**
 171 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 172 * @blkcg: blkcg of interest
 173 * @q: request_queue of interest
 174 *
 175 * Lookup blkg for the @blkcg - @q pair.  This function should be called
 176 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
 177 * - see blk_queue_bypass_start() for details.
 178 */
 179struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
 180{
 181        WARN_ON_ONCE(!rcu_read_lock_held());
 182
 183        if (unlikely(blk_queue_bypass(q)))
 184                return NULL;
 185        return __blkg_lookup(blkcg, q, false);
 186}
 187EXPORT_SYMBOL_GPL(blkg_lookup);
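/*
 * Example (illustrative sketch, not part of the original file): how a
 * caller in the IO path might consult its blkg.  The "example_" name is
 * hypothetical; only blkg_lookup() and the locking rule documented above
 * are real.
 */
static bool __maybe_unused example_blkg_is_online(struct blkcg *blkcg,
						  struct request_queue *q)
{
	struct blkcg_gq *blkg;
	bool online = false;

	/* blkg_lookup() may only be called under the RCU read lock */
	rcu_read_lock();
	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		online = blkg->online;
	rcu_read_unlock();

	return online;
}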
 188
 189/*
 190 * If @new_blkg is %NULL, this function tries to allocate a new one as
 191 * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
 192 */
 193static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 194                                    struct request_queue *q,
 195                                    struct blkcg_gq *new_blkg)
 196{
 197        struct blkcg_gq *blkg;
 198        struct bdi_writeback_congested *wb_congested;
 199        int i, ret;
 200
 201        WARN_ON_ONCE(!rcu_read_lock_held());
 202        lockdep_assert_held(q->queue_lock);
 203
 204        /* blkg holds a reference to blkcg */
 205        if (!css_tryget_online(&blkcg->css)) {
 206                ret = -EINVAL;
 207                goto err_free_blkg;
 208        }
 209
 210        wb_congested = wb_congested_get_create(&q->backing_dev_info,
 211                                               blkcg->css.id, GFP_ATOMIC);
 212        if (!wb_congested) {
 213                ret = -ENOMEM;
 214                goto err_put_css;
 215        }
 216
 217        /* allocate */
 218        if (!new_blkg) {
 219                new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
 220                if (unlikely(!new_blkg)) {
 221                        ret = -ENOMEM;
 222                        goto err_put_congested;
 223                }
 224        }
 225        blkg = new_blkg;
 226        blkg->wb_congested = wb_congested;
 227
 228        /* link parent */
 229        if (blkcg_parent(blkcg)) {
 230                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 231                if (WARN_ON_ONCE(!blkg->parent)) {
 232                        ret = -EINVAL;
 233                        goto err_put_congested;
 234                }
 235                blkg_get(blkg->parent);
 236        }
 237
 238        /* invoke per-policy init */
 239        for (i = 0; i < BLKCG_MAX_POLS; i++) {
 240                struct blkcg_policy *pol = blkcg_policy[i];
 241
 242                if (blkg->pd[i] && pol->pd_init_fn)
 243                        pol->pd_init_fn(blkg);
 244        }
 245
 246        /* insert */
 247        spin_lock(&blkcg->lock);
 248        ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
 249        if (likely(!ret)) {
 250                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 251                list_add(&blkg->q_node, &q->blkg_list);
 252
 253                for (i = 0; i < BLKCG_MAX_POLS; i++) {
 254                        struct blkcg_policy *pol = blkcg_policy[i];
 255
 256                        if (blkg->pd[i] && pol->pd_online_fn)
 257                                pol->pd_online_fn(blkg);
 258                }
 259        }
 260        blkg->online = true;
 261        spin_unlock(&blkcg->lock);
 262
 263        if (!ret)
 264                return blkg;
 265
 266        /* @blkg failed to be fully initialized, use the usual release path */
 267        blkg_put(blkg);
 268        return ERR_PTR(ret);
 269
 270err_put_congested:
 271        wb_congested_put(wb_congested);
 272err_put_css:
 273        css_put(&blkcg->css);
 274err_free_blkg:
 275        blkg_free(new_blkg);
 276        return ERR_PTR(ret);
 277}
 278
 279/**
 280 * blkg_lookup_create - lookup blkg, try to create one if not there
 281 * @blkcg: blkcg of interest
 282 * @q: request_queue of interest
 283 *
 284 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 285 * create one.  blkg creation is performed recursively from blkcg_root such
 286 * that all non-root blkg's have access to the parent blkg.  This function
 287 * should be called under RCU read lock and @q->queue_lock.
 288 *
 289 * Returns pointer to the looked up or created blkg on success, ERR_PTR()
 290 * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
 291 * dead and bypassing, returns ERR_PTR(-EBUSY).
 292 */
 293struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 294                                    struct request_queue *q)
 295{
 296        struct blkcg_gq *blkg;
 297
 298        WARN_ON_ONCE(!rcu_read_lock_held());
 299        lockdep_assert_held(q->queue_lock);
 300
 301        /*
 302         * This could be the first entry point of blkcg implementation and
 303         * we shouldn't allow anything to go through for a bypassing queue.
 304         */
 305        if (unlikely(blk_queue_bypass(q)))
 306                return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
 307
 308        blkg = __blkg_lookup(blkcg, q, true);
 309        if (blkg)
 310                return blkg;
 311
 312        /*
 313         * Create blkgs walking down from blkcg_root to @blkcg, so that all
 314         * non-root blkgs have access to their parents.
 315         */
 316        while (true) {
 317                struct blkcg *pos = blkcg;
 318                struct blkcg *parent = blkcg_parent(blkcg);
 319
 320                while (parent && !__blkg_lookup(parent, q, false)) {
 321                        pos = parent;
 322                        parent = blkcg_parent(parent);
 323                }
 324
 325                blkg = blkg_create(pos, q, NULL);
 326                if (pos == blkcg || IS_ERR(blkg))
 327                        return blkg;
 328        }
 329}
 330EXPORT_SYMBOL_GPL(blkg_lookup_create);
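/*
 * Example (illustrative sketch, hypothetical "example_" name): resolving or
 * creating the blkg for a cgroup from a context that already holds
 * @q->queue_lock, similar to what blk-throttle and cfq do on first IO.
 */
static struct blkcg_gq * __maybe_unused
example_get_blkg(struct blkcg *blkcg, struct request_queue *q)
{
	struct blkcg_gq *blkg;

	lockdep_assert_held(q->queue_lock);

	/* both the RCU read lock and queue_lock must be held across the call */
	rcu_read_lock();
	blkg = blkg_lookup_create(blkcg, q);
	rcu_read_unlock();

	/*
	 * May be an ERR_PTR() if @q is dying or bypassing.  The blkg stays
	 * pinned as long as queue_lock is held since blkg_destroy() needs it.
	 */
	return blkg;
}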
 331
 332static void blkg_destroy(struct blkcg_gq *blkg)
 333{
 334        struct blkcg *blkcg = blkg->blkcg;
 335        int i;
 336
 337        lockdep_assert_held(blkg->q->queue_lock);
 338        lockdep_assert_held(&blkcg->lock);
 339
 340        /* Something is wrong if we are trying to remove the same group twice */
 341        WARN_ON_ONCE(list_empty(&blkg->q_node));
 342        WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 343
 344        for (i = 0; i < BLKCG_MAX_POLS; i++) {
 345                struct blkcg_policy *pol = blkcg_policy[i];
 346
 347                if (blkg->pd[i] && pol->pd_offline_fn)
 348                        pol->pd_offline_fn(blkg);
 349        }
 350        blkg->online = false;
 351
 352        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
 353        list_del_init(&blkg->q_node);
 354        hlist_del_init_rcu(&blkg->blkcg_node);
 355
 356        /*
 357         * Both setting lookup hint to and clearing it from @blkg are done
 358         * under queue_lock.  If it's not pointing to @blkg now, it never
 359         * will.  Hint assignment itself can race safely.
 360         */
 361        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
 362                rcu_assign_pointer(blkcg->blkg_hint, NULL);
 363
 364        /*
 365         * Put the reference taken at the time of creation so that when all
 366         * queues are gone, group can be destroyed.
 367         */
 368        blkg_put(blkg);
 369}
 370
 371/**
 372 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 373 * @q: request_queue of interest
 374 *
 375 * Destroy all blkgs associated with @q.
 376 */
 377static void blkg_destroy_all(struct request_queue *q)
 378{
 379        struct blkcg_gq *blkg, *n;
 380
 381        lockdep_assert_held(q->queue_lock);
 382
 383        list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 384                struct blkcg *blkcg = blkg->blkcg;
 385
 386                spin_lock(&blkcg->lock);
 387                blkg_destroy(blkg);
 388                spin_unlock(&blkcg->lock);
 389        }
 390}
 391
 392/*
 393 * A group is RCU protected, but holding the RCU read lock alone does not
 394 * mean that all fields of a blkg can be accessed and assumed valid.  For
 395 * example, don't try to follow the throtl_data and request_queue links.
 396 *
 397 * Holding an RCU reference to a blkg only allows access to values local
 398 * to the group, such as group stats and group rate limits.
 399 */
 400void __blkg_release_rcu(struct rcu_head *rcu_head)
 401{
 402        struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
 403        int i;
 404
 405        /* tell policies that this one is being freed */
 406        for (i = 0; i < BLKCG_MAX_POLS; i++) {
 407                struct blkcg_policy *pol = blkcg_policy[i];
 408
 409                if (blkg->pd[i] && pol->pd_exit_fn)
 410                        pol->pd_exit_fn(blkg);
 411        }
 412
 413        /* release the blkcg and parent blkg refs this blkg has been holding */
 414        css_put(&blkg->blkcg->css);
 415        if (blkg->parent)
 416                blkg_put(blkg->parent);
 417
 418        wb_congested_put(blkg->wb_congested);
 419
 420        blkg_free(blkg);
 421}
 422EXPORT_SYMBOL_GPL(__blkg_release_rcu);
 423
 424/*
 425 * The "next" iteration function used by blk_queue_for_each_rl().  It's a bit tricky
 426 * because the root blkg uses @q->root_rl instead of its own rl.
 427 */
 428struct request_list *__blk_queue_next_rl(struct request_list *rl,
 429                                         struct request_queue *q)
 430{
 431        struct list_head *ent;
 432        struct blkcg_gq *blkg;
 433
 434        /*
 435         * Determine the current blkg list_head.  The first entry is
 436         * root_rl which is off @q->blkg_list and mapped to the head.
 437         */
 438        if (rl == &q->root_rl) {
 439                ent = &q->blkg_list;
 440                /* There are no more block groups, hence no request lists */
 441                if (list_empty(ent))
 442                        return NULL;
 443        } else {
 444                blkg = container_of(rl, struct blkcg_gq, rl);
 445                ent = &blkg->q_node;
 446        }
 447
 448        /* walk to the next list_head, skip root blkcg */
 449        ent = ent->next;
 450        if (ent == &q->root_blkg->q_node)
 451                ent = ent->next;
 452        if (ent == &q->blkg_list)
 453                return NULL;
 454
 455        blkg = container_of(ent, struct blkcg_gq, q_node);
 456        return &blkg->rl;
 457}
 458
 459static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 460                             struct cftype *cftype, u64 val)
 461{
 462        struct blkcg *blkcg = css_to_blkcg(css);
 463        struct blkcg_gq *blkg;
 464        int i;
 465
 466        mutex_lock(&blkcg_pol_mutex);
 467        spin_lock_irq(&blkcg->lock);
 468
 469        /*
 470         * Note that stat reset is racy - it doesn't synchronize against
 471         * stat updates.  This is a debug feature which shouldn't exist
 472         * anyway.  If you get hit by a race, retry.
 473         */
 474        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
 475                for (i = 0; i < BLKCG_MAX_POLS; i++) {
 476                        struct blkcg_policy *pol = blkcg_policy[i];
 477
 478                        if (blkcg_policy_enabled(blkg->q, pol) &&
 479                            pol->pd_reset_stats_fn)
 480                                pol->pd_reset_stats_fn(blkg);
 481                }
 482        }
 483
 484        spin_unlock_irq(&blkcg->lock);
 485        mutex_unlock(&blkcg_pol_mutex);
 486        return 0;
 487}
 488
 489static const char *blkg_dev_name(struct blkcg_gq *blkg)
 490{
 491        /* some drivers (floppy) instantiate a queue w/o disk registered */
 492        if (blkg->q->backing_dev_info.dev)
 493                return dev_name(blkg->q->backing_dev_info.dev);
 494        return NULL;
 495}
 496
 497/**
 498 * blkcg_print_blkgs - helper for printing per-blkg data
 499 * @sf: seq_file to print to
 500 * @blkcg: blkcg of interest
 501 * @prfill: fill function to print out a blkg
 502 * @pol: policy in question
 503 * @data: data to be passed to @prfill
 504 * @show_total: to print out sum of prfill return values or not
 505 *
 506 * This function invokes @prfill on each blkg of @blkcg if pd for the
 507 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 508 * policy data and @data and the matching queue lock held.  If @show_total
 509 * is %true, the sum of the return values from @prfill is printed with
 510 * "Total" label at the end.
 511 *
 512 * This is to be used to construct print functions for the
 513 * cftype->seq_show method.
 514 */
 515void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 516                       u64 (*prfill)(struct seq_file *,
 517                                     struct blkg_policy_data *, int),
 518                       const struct blkcg_policy *pol, int data,
 519                       bool show_total)
 520{
 521        struct blkcg_gq *blkg;
 522        u64 total = 0;
 523
 524        rcu_read_lock();
 525        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
 526                spin_lock_irq(blkg->q->queue_lock);
 527                if (blkcg_policy_enabled(blkg->q, pol))
 528                        total += prfill(sf, blkg->pd[pol->plid], data);
 529                spin_unlock_irq(blkg->q->queue_lock);
 530        }
 531        rcu_read_unlock();
 532
 533        if (show_total)
 534                seq_printf(sf, "Total %llu\n", (unsigned long long)total);
 535}
 536EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
 537
 538/**
 539 * __blkg_prfill_u64 - prfill helper for a single u64 value
 540 * @sf: seq_file to print to
 541 * @pd: policy private data of interest
 542 * @v: value to print
 543 *
 544 * Print @v to @sf for the device associated with @pd.
 545 */
 546u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
 547{
 548        const char *dname = blkg_dev_name(pd->blkg);
 549
 550        if (!dname)
 551                return 0;
 552
 553        seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
 554        return v;
 555}
 556EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
 557
 558/**
 559 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 560 * @sf: seq_file to print to
 561 * @pd: policy private data of interest
 562 * @rwstat: rwstat to print
 563 *
 564 * Print @rwstat to @sf for the device associated with @pd.
 565 */
 566u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 567                         const struct blkg_rwstat *rwstat)
 568{
 569        static const char *rwstr[] = {
 570                [BLKG_RWSTAT_READ]      = "Read",
 571                [BLKG_RWSTAT_WRITE]     = "Write",
 572                [BLKG_RWSTAT_SYNC]      = "Sync",
 573                [BLKG_RWSTAT_ASYNC]     = "Async",
 574        };
 575        const char *dname = blkg_dev_name(pd->blkg);
 576        u64 v;
 577        int i;
 578
 579        if (!dname)
 580                return 0;
 581
 582        for (i = 0; i < BLKG_RWSTAT_NR; i++)
 583                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
 584                           (unsigned long long)rwstat->cnt[i]);
 585
 586        v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
 587        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
 588        return v;
 589}
 590EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
 591
 592/**
 593 * blkg_prfill_stat - prfill callback for blkg_stat
 594 * @sf: seq_file to print to
 595 * @pd: policy private data of interest
 596 * @off: offset to the blkg_stat in @pd
 597 *
 598 * prfill callback for printing a blkg_stat.
 599 */
 600u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
 601{
 602        return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
 603}
 604EXPORT_SYMBOL_GPL(blkg_prfill_stat);
 605
 606/**
 607 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 608 * @sf: seq_file to print to
 609 * @pd: policy private data of interest
 610 * @off: offset to the blkg_rwstat in @pd
 611 *
 612 * prfill callback for printing a blkg_rwstat.
 613 */
 614u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 615                       int off)
 616{
 617        struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
 618
 619        return __blkg_prfill_rwstat(sf, pd, &rwstat);
 620}
 621EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
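/*
 * Example (illustrative sketch, not part of the original file): a policy
 * typically wires blkcg_print_blkgs() and one of the prfill helpers above
 * into a cftype seq_show callback, with ->private carrying the offset of
 * the stat inside its policy data.  "example_print_policy" is hypothetical;
 * cfq-iosched and blk-throttle contain the real equivalents.
 */
static struct blkcg_policy example_print_policy;

static int __maybe_unused example_print_stat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
			  &example_print_policy, seq_cft(sf)->private, false);
	return 0;
}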
 622
 623/**
 624 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
 625 * @pd: policy private data of interest
 626 * @off: offset to the blkg_stat in @pd
 627 *
 628 * Collect the blkg_stat specified by @off from @pd and all its online
 629 * descendants and return the sum.  The caller must be holding the queue
 630 * lock for online tests.
 631 */
 632u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
 633{
 634        struct blkcg_policy *pol = blkcg_policy[pd->plid];
 635        struct blkcg_gq *pos_blkg;
 636        struct cgroup_subsys_state *pos_css;
 637        u64 sum = 0;
 638
 639        lockdep_assert_held(pd->blkg->q->queue_lock);
 640
 641        rcu_read_lock();
 642        blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
 643                struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
 644                struct blkg_stat *stat = (void *)pos_pd + off;
 645
 646                if (pos_blkg->online)
 647                        sum += blkg_stat_read(stat);
 648        }
 649        rcu_read_unlock();
 650
 651        return sum;
 652}
 653EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
 654
 655/**
 656 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
 657 * @pd: policy private data of interest
 658 * @off: offset to the blkg_rwstat in @pd
 659 *
 660 * Collect the blkg_rwstat specified by @off from @pd and all its online
 661 * descendants and return the sum.  The caller must be holding the queue
 662 * lock for online tests.
 663 */
 664struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
 665                                             int off)
 666{
 667        struct blkcg_policy *pol = blkcg_policy[pd->plid];
 668        struct blkcg_gq *pos_blkg;
 669        struct cgroup_subsys_state *pos_css;
 670        struct blkg_rwstat sum = { };
 671        int i;
 672
 673        lockdep_assert_held(pd->blkg->q->queue_lock);
 674
 675        rcu_read_lock();
 676        blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
 677                struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
 678                struct blkg_rwstat *rwstat = (void *)pos_pd + off;
 679                struct blkg_rwstat tmp;
 680
 681                if (!pos_blkg->online)
 682                        continue;
 683
 684                tmp = blkg_rwstat_read(rwstat);
 685
 686                for (i = 0; i < BLKG_RWSTAT_NR; i++)
 687                        sum.cnt[i] += tmp.cnt[i];
 688        }
 689        rcu_read_unlock();
 690
 691        return sum;
 692}
 693EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
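/*
 * Example (illustrative sketch, hypothetical "example_" names): hierarchical
 * stats are printed by pairing the recursive sum helpers above with the
 * __blkg_prfill_*() helpers in a prfill callback, much as cfq-iosched does
 * for its *_recursive files.
 */
static u64 __maybe_unused example_prfill_stat_recursive(struct seq_file *sf,
						struct blkg_policy_data *pd,
						int off)
{
	u64 sum = blkg_stat_recursive_sum(pd, off);

	return __blkg_prfill_u64(sf, pd, sum);
}

static u64 __maybe_unused example_prfill_rwstat_recursive(struct seq_file *sf,
						struct blkg_policy_data *pd,
						int off)
{
	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off);

	return __blkg_prfill_rwstat(sf, pd, &sum);
}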
 694
 695/**
 696 * blkg_conf_prep - parse and prepare for per-blkg config update
 697 * @blkcg: target block cgroup
 698 * @pol: target policy
 699 * @input: input string
 700 * @ctx: blkg_conf_ctx to be filled
 701 *
 702 * Parse per-blkg config update from @input and initialize @ctx with the
 703 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 704 * value.  This function returns with RCU read lock and queue lock held and
 705 * must be paired with blkg_conf_finish().
 706 */
 707int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 708                   const char *input, struct blkg_conf_ctx *ctx)
 709        __acquires(rcu) __acquires(disk->queue->queue_lock)
 710{
 711        struct gendisk *disk;
 712        struct blkcg_gq *blkg;
 713        unsigned int major, minor;
 714        unsigned long long v;
 715        int part, ret;
 716
 717        if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
 718                return -EINVAL;
 719
 720        disk = get_gendisk(MKDEV(major, minor), &part);
 721        if (!disk)
 722                return -EINVAL;
 723        if (part) {
 724                put_disk(disk);
 725                return -EINVAL;
 726        }
 727
 728        rcu_read_lock();
 729        spin_lock_irq(disk->queue->queue_lock);
 730
 731        if (blkcg_policy_enabled(disk->queue, pol))
 732                blkg = blkg_lookup_create(blkcg, disk->queue);
 733        else
 734                blkg = ERR_PTR(-EINVAL);
 735
 736        if (IS_ERR(blkg)) {
 737                ret = PTR_ERR(blkg);
 738                rcu_read_unlock();
 739                spin_unlock_irq(disk->queue->queue_lock);
 740                put_disk(disk);
 741                /*
 742                 * If queue was bypassing, we should retry.  Do so after a
 743                 * short msleep().  It isn't strictly necessary but queue
 744                 * can be bypassing for some time and it's always nice to
 745                 * avoid busy looping.
 746                 */
 747                if (ret == -EBUSY) {
 748                        msleep(10);
 749                        ret = restart_syscall();
 750                }
 751                return ret;
 752        }
 753
 754        ctx->disk = disk;
 755        ctx->blkg = blkg;
 756        ctx->v = v;
 757        return 0;
 758}
 759EXPORT_SYMBOL_GPL(blkg_conf_prep);
 760
 761/**
 762 * blkg_conf_finish - finish up per-blkg config update
 763 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 764 *
 765 * Finish up after per-blkg config update.  This function must be paired
 766 * with blkg_conf_prep().
 767 */
 768void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 769        __releases(ctx->disk->queue->queue_lock) __releases(rcu)
 770{
 771        spin_unlock_irq(ctx->disk->queue->queue_lock);
 772        rcu_read_unlock();
 773        put_disk(ctx->disk);
 774}
 775EXPORT_SYMBOL_GPL(blkg_conf_finish);
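/*
 * Example (illustrative sketch, not part of the original file): a cftype
 * write handler parses "MAJ:MIN VAL" input with blkg_conf_prep(), updates
 * its per-blkg policy data and drops the locks with blkg_conf_finish().
 * The "example_" names and pd layout below are hypothetical; blk-throttle
 * and cfq-iosched contain the real equivalents.
 */
struct example_conf_pd {
	struct blkg_policy_data pd;	/* must be the first member */
	u64 limit;
};

static int __maybe_unused example_set_limit(struct blkcg *blkcg,
					    const struct blkcg_policy *pol,
					    const char *buf)
{
	struct blkg_conf_ctx ctx;
	struct example_conf_pd *epd;
	int ret;

	ret = blkg_conf_prep(blkcg, pol, buf, &ctx);
	if (ret)
		return ret;

	/* RCU read lock and ctx.disk->queue->queue_lock are held here */
	epd = container_of(ctx.blkg->pd[pol->plid], struct example_conf_pd, pd);
	epd->limit = ctx.v;

	blkg_conf_finish(&ctx);
	return 0;
}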
 776
 777struct cftype blkcg_files[] = {
 778        {
 779                .name = "reset_stats",
 780                .write_u64 = blkcg_reset_stats,
 781        },
 782        { }     /* terminate */
 783};
 784
 785/**
 786 * blkcg_css_offline - cgroup css_offline callback
 787 * @css: css of interest
 788 *
 789 * This function is called when @css is about to go away and is responsible
 790 * for shooting down all blkgs associated with @css.  blkgs should be
 791 * removed while holding both q and blkcg locks.  As blkcg lock is nested
 792 * inside q lock, this function performs reverse double lock dancing.
 793 *
 794 * This is the blkcg counterpart of ioc_release_fn().
 795 */
 796static void blkcg_css_offline(struct cgroup_subsys_state *css)
 797{
 798        struct blkcg *blkcg = css_to_blkcg(css);
 799
 800        spin_lock_irq(&blkcg->lock);
 801
 802        while (!hlist_empty(&blkcg->blkg_list)) {
 803                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
 804                                                struct blkcg_gq, blkcg_node);
 805                struct request_queue *q = blkg->q;
 806
 807                if (spin_trylock(q->queue_lock)) {
 808                        blkg_destroy(blkg);
 809                        spin_unlock(q->queue_lock);
 810                } else {
 811                        spin_unlock_irq(&blkcg->lock);
 812                        cpu_relax();
 813                        spin_lock_irq(&blkcg->lock);
 814                }
 815        }
 816
 817        spin_unlock_irq(&blkcg->lock);
 818
 819        wb_blkcg_offline(blkcg);
 820}
 821
 822static void blkcg_css_free(struct cgroup_subsys_state *css)
 823{
 824        struct blkcg *blkcg = css_to_blkcg(css);
 825
 826        mutex_lock(&blkcg_pol_mutex);
 827        list_del(&blkcg->all_blkcgs_node);
 828        mutex_unlock(&blkcg_pol_mutex);
 829
 830        if (blkcg != &blkcg_root) {
 831                int i;
 832
 833                for (i = 0; i < BLKCG_MAX_POLS; i++)
 834                        kfree(blkcg->pd[i]);
 835                kfree(blkcg);
 836        }
 837}
 838
 839static struct cgroup_subsys_state *
 840blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 841{
 842        struct blkcg *blkcg;
 843        struct cgroup_subsys_state *ret;
 844        int i;
 845
 846        mutex_lock(&blkcg_pol_mutex);
 847
 848        if (!parent_css) {
 849                blkcg = &blkcg_root;
 850                goto done;
 851        }
 852
 853        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 854        if (!blkcg) {
 855                ret = ERR_PTR(-ENOMEM);
 856                goto free_blkcg;
 857        }
 858
 859        for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 860                struct blkcg_policy *pol = blkcg_policy[i];
 861                struct blkcg_policy_data *cpd;
 862
 863                /*
 864                 * If the policy hasn't been registered yet, its per-cgroup
 865                 * data will be allocated once the policy is registered (see
 866                 * blkcg_policy_register()). Otherwise, check whether the policy
 867                 * requires per-cgroup data: if it does, allocate and initialize it.
 868                 */
 869                if (!pol || !pol->cpd_size)
 870                        continue;
 871
 872                BUG_ON(blkcg->pd[i]);
 873                cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
 874                if (!cpd) {
 875                        ret = ERR_PTR(-ENOMEM);
 876                        goto free_pd_blkcg;
 877                }
 878                blkcg->pd[i] = cpd;
 879                cpd->plid = i;
 880                pol->cpd_init_fn(blkcg);
 881        }
 882
 883done:
 884        spin_lock_init(&blkcg->lock);
 885        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 886        INIT_HLIST_HEAD(&blkcg->blkg_list);
 887#ifdef CONFIG_CGROUP_WRITEBACK
 888        INIT_LIST_HEAD(&blkcg->cgwb_list);
 889#endif
 890        list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
 891
 892        mutex_unlock(&blkcg_pol_mutex);
 893        return &blkcg->css;
 894
 895free_pd_blkcg:
 896        for (i--; i >= 0; i--)
 897                kfree(blkcg->pd[i]);
 898free_blkcg:
 899        kfree(blkcg);
 900        mutex_unlock(&blkcg_pol_mutex);
 901        return ret;
 902}
 903
 904/**
 905 * blkcg_init_queue - initialize blkcg part of request queue
 906 * @q: request_queue to initialize
 907 *
 908 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 909 * part of new request_queue @q.
 910 *
 911 * RETURNS:
 912 * 0 on success, -errno on failure.
 913 */
 914int blkcg_init_queue(struct request_queue *q)
 915{
 916        struct blkcg_gq *new_blkg, *blkg;
 917        bool preloaded;
 918        int ret;
 919
 920        new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
 921        if (!new_blkg)
 922                return -ENOMEM;
 923
 924        preloaded = !radix_tree_preload(GFP_KERNEL);
 925
 926        /*
 927         * Make sure the root blkg exists.  As @q is bypassing at this
 928         * point, blkg_lookup_create() can't be used.  Open code the
 929         * insertion.
 930         */
 931        rcu_read_lock();
 932        spin_lock_irq(q->queue_lock);
 933        blkg = blkg_create(&blkcg_root, q, new_blkg);
 934        spin_unlock_irq(q->queue_lock);
 935        rcu_read_unlock();
 936
 937        if (preloaded)
 938                radix_tree_preload_end();
 939
 940        if (IS_ERR(blkg)) {
 941                /* @new_blkg was already consumed by blkg_create() */
 942                return PTR_ERR(blkg);
 943        }
 944
 945        q->root_blkg = blkg;
 946        q->root_rl.blkg = blkg;
 947
 948        ret = blk_throtl_init(q);
 949        if (ret) {
 950                spin_lock_irq(q->queue_lock);
 951                blkg_destroy_all(q);
 952                spin_unlock_irq(q->queue_lock);
 953        }
 954        return ret;
 955}
 956
 957/**
 958 * blkcg_drain_queue - drain blkcg part of request_queue
 959 * @q: request_queue to drain
 960 *
 961 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 962 */
 963void blkcg_drain_queue(struct request_queue *q)
 964{
 965        lockdep_assert_held(q->queue_lock);
 966
 967        /*
 968         * @q could be exiting and already have destroyed all blkgs as
 969         * indicated by NULL root_blkg.  If so, don't confuse policies.
 970         */
 971        if (!q->root_blkg)
 972                return;
 973
 974        blk_throtl_drain(q);
 975}
 976
 977/**
 978 * blkcg_exit_queue - exit and release blkcg part of request_queue
 979 * @q: request_queue being released
 980 *
 981 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 982 */
 983void blkcg_exit_queue(struct request_queue *q)
 984{
 985        spin_lock_irq(q->queue_lock);
 986        blkg_destroy_all(q);
 987        spin_unlock_irq(q->queue_lock);
 988
 989        blk_throtl_exit(q);
 990}
 991
 992/*
 993 * We cannot support shared io contexts, as we have no means to support
 994 * two tasks with the same ioc in two different groups without major rework
 995 * of the main cic data structures.  For now we allow a task to change
 996 * its cgroup only if it's the only owner of its ioc.
 997 */
 998static int blkcg_can_attach(struct cgroup_subsys_state *css,
 999                            struct cgroup_taskset *tset)
1000{
1001        struct task_struct *task;
1002        struct io_context *ioc;
1003        int ret = 0;
1004
1005        /* task_lock() is needed to avoid races with exit_io_context() */
1006        cgroup_taskset_for_each(task, tset) {
1007                task_lock(task);
1008                ioc = task->io_context;
1009                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1010                        ret = -EINVAL;
1011                task_unlock(task);
1012                if (ret)
1013                        break;
1014        }
1015        return ret;
1016}
1017
1018struct cgroup_subsys blkio_cgrp_subsys = {
1019        .css_alloc = blkcg_css_alloc,
1020        .css_offline = blkcg_css_offline,
1021        .css_free = blkcg_css_free,
1022        .can_attach = blkcg_can_attach,
1023        .legacy_cftypes = blkcg_files,
1024#ifdef CONFIG_MEMCG
1025        /*
1026         * This ensures that, if available, memcg is automatically enabled
1027         * together on the default hierarchy so that the owner cgroup can
1028         * be retrieved from writeback pages.
1029         */
1030        .depends_on = 1 << memory_cgrp_id,
1031#endif
1032};
1033EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
1034
1035/**
1036 * blkcg_activate_policy - activate a blkcg policy on a request_queue
1037 * @q: request_queue of interest
1038 * @pol: blkcg policy to activate
1039 *
1040 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
1041 * bypass mode to populate its blkgs with policy_data for @pol.
1042 *
1043 * Activation happens with @q bypassed, so nobody would be accessing blkgs
1044 * from IO path.  Update of each blkg is protected by both queue and blkcg
1045 * locks so that holding either lock and testing blkcg_policy_enabled() is
1046 * always enough for dereferencing policy data.
1047 *
1048 * The caller is responsible for synchronizing [de]activations and policy
1049 * [un]registerations.  Returns 0 on success, -errno on failure.
1050 */
1051int blkcg_activate_policy(struct request_queue *q,
1052                          const struct blkcg_policy *pol)
1053{
1054        LIST_HEAD(pds);
1055        struct blkcg_gq *blkg;
1056        struct blkg_policy_data *pd, *nd;
1057        int cnt = 0, ret;
1058
1059        if (blkcg_policy_enabled(q, pol))
1060                return 0;
1061
1062        /* count and allocate policy_data for all existing blkgs */
1063        blk_queue_bypass_start(q);
1064        spin_lock_irq(q->queue_lock);
1065        list_for_each_entry(blkg, &q->blkg_list, q_node)
1066                cnt++;
1067        spin_unlock_irq(q->queue_lock);
1068
1069        /* allocate per-blkg policy data for all existing blkgs */
1070        while (cnt--) {
1071                pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
1072                if (!pd) {
1073                        ret = -ENOMEM;
1074                        goto out_free;
1075                }
1076                list_add_tail(&pd->alloc_node, &pds);
1077        }
1078
1079        /*
1080         * Install the allocated pds.  With @q bypassing, no new blkg
1081         * should have been created while the queue lock was dropped.
1082         */
1083        spin_lock_irq(q->queue_lock);
1084
1085        list_for_each_entry(blkg, &q->blkg_list, q_node) {
1086                if (WARN_ON(list_empty(&pds))) {
1087                        /* umm... this shouldn't happen, just abort */
1088                        ret = -ENOMEM;
1089                        goto out_unlock;
1090                }
1091                pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
1092                list_del_init(&pd->alloc_node);
1093
1094                /* grab blkcg lock too while installing @pd on @blkg */
1095                spin_lock(&blkg->blkcg->lock);
1096
1097                blkg->pd[pol->plid] = pd;
1098                pd->blkg = blkg;
1099                pd->plid = pol->plid;
1100                pol->pd_init_fn(blkg);
1101
1102                spin_unlock(&blkg->blkcg->lock);
1103        }
1104
1105        __set_bit(pol->plid, q->blkcg_pols);
1106        ret = 0;
1107out_unlock:
1108        spin_unlock_irq(q->queue_lock);
1109out_free:
1110        blk_queue_bypass_end(q);
1111        list_for_each_entry_safe(pd, nd, &pds, alloc_node)
1112                kfree(pd);
1113        return ret;
1114}
1115EXPORT_SYMBOL_GPL(blkcg_activate_policy);
1116
1117/**
1118 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
1119 * @q: request_queue of interest
1120 * @pol: blkcg policy to deactivate
1121 *
1122 * Deactivate @pol on @q.  Follows the same synchronization rules as
1123 * blkcg_activate_policy().
1124 */
1125void blkcg_deactivate_policy(struct request_queue *q,
1126                             const struct blkcg_policy *pol)
1127{
1128        struct blkcg_gq *blkg;
1129
1130        if (!blkcg_policy_enabled(q, pol))
1131                return;
1132
1133        blk_queue_bypass_start(q);
1134        spin_lock_irq(q->queue_lock);
1135
1136        __clear_bit(pol->plid, q->blkcg_pols);
1137
1138        list_for_each_entry(blkg, &q->blkg_list, q_node) {
1139                /* grab blkcg lock too while removing @pd from @blkg */
1140                spin_lock(&blkg->blkcg->lock);
1141
1142                if (pol->pd_offline_fn)
1143                        pol->pd_offline_fn(blkg);
1144                if (pol->pd_exit_fn)
1145                        pol->pd_exit_fn(blkg);
1146
1147                kfree(blkg->pd[pol->plid]);
1148                blkg->pd[pol->plid] = NULL;
1149
1150                spin_unlock(&blkg->blkcg->lock);
1151        }
1152
1153        spin_unlock_irq(q->queue_lock);
1154        blk_queue_bypass_end(q);
1155}
1156EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
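/*
 * Example (illustrative sketch, hypothetical names): a policy enables itself
 * on a queue from its per-queue init path and undoes it on exit, the same
 * way blk-throttle and cfq handle their own blkcg_policy structs.
 */
static int __maybe_unused example_init_queue(struct request_queue *q,
					     const struct blkcg_policy *pol)
{
	/* populate every existing blkg on @q with this policy's pd */
	return blkcg_activate_policy(q, pol);
}

static void __maybe_unused example_exit_queue(struct request_queue *q,
					      const struct blkcg_policy *pol)
{
	/* free the per-blkg policy data installed above */
	blkcg_deactivate_policy(q, pol);
}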
1157
1158/**
1159 * blkcg_policy_register - register a blkcg policy
1160 * @pol: blkcg policy to register
1161 *
1162 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
1163 * successful registration.  Returns 0 on success and -errno on failure.
1164 */
1165int blkcg_policy_register(struct blkcg_policy *pol)
1166{
1167        struct blkcg *blkcg;
1168        int i, ret;
1169
1170        if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
1171                return -EINVAL;
1172
1173        mutex_lock(&blkcg_pol_register_mutex);
1174        mutex_lock(&blkcg_pol_mutex);
1175
1176        /* find an empty slot */
1177        ret = -ENOSPC;
1178        for (i = 0; i < BLKCG_MAX_POLS; i++)
1179                if (!blkcg_policy[i])
1180                        break;
1181        if (i >= BLKCG_MAX_POLS)
1182                goto err_unlock;
1183
1184        /* register @pol */
1185        pol->plid = i;
1186        blkcg_policy[pol->plid] = pol;
1187
1188        /* allocate and install cpd's */
1189        if (pol->cpd_size) {
1190                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1191                        struct blkcg_policy_data *cpd;
1192
1193                        cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
1194                        if (!cpd) {
1195                                mutex_unlock(&blkcg_pol_mutex);
1196                                goto err_free_cpds;
1197                        }
1198
1199                        blkcg->pd[pol->plid] = cpd;
1200                        cpd->plid = pol->plid;
1201                        pol->cpd_init_fn(blkcg);
1202                }
1203        }
1204
1205        mutex_unlock(&blkcg_pol_mutex);
1206
1207        /* everything is in place, add intf files for the new policy */
1208        if (pol->cftypes)
1209                WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
1210                                                  pol->cftypes));
1211        mutex_unlock(&blkcg_pol_register_mutex);
1212        return 0;
1213
1214err_free_cpds:
1215        if (pol->cpd_size) {
1216                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1217                        kfree(blkcg->pd[pol->plid]);
1218                        blkcg->pd[pol->plid] = NULL;
1219                }
1220        }
1221        blkcg_policy[pol->plid] = NULL;
1222err_unlock:
1223        mutex_unlock(&blkcg_pol_mutex);
1224        mutex_unlock(&blkcg_pol_register_mutex);
1225        return ret;
1226}
1227EXPORT_SYMBOL_GPL(blkcg_policy_register);
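/*
 * Example (illustrative sketch, not part of the original file): the minimal
 * registration boilerplate of a blkcg policy.  All "example_reg_*" names
 * are hypothetical; the field and callback names come from the real
 * struct blkcg_policy used throughout this file.
 */
static struct blkcg_policy example_reg_policy;

struct example_reg_pd {
	struct blkg_policy_data pd;	/* must be the first member */
	unsigned int weight;
};

static void example_reg_pd_init(struct blkcg_gq *blkg)
{
	struct example_reg_pd *epd =
		container_of(blkg->pd[example_reg_policy.plid],
			     struct example_reg_pd, pd);

	epd->weight = 100;
}

static struct blkcg_policy example_reg_policy = {
	.pd_size	= sizeof(struct example_reg_pd),
	.pd_init_fn	= example_reg_pd_init,
};

/*
 * A real policy would call blkcg_policy_register(&example_reg_policy) from
 * its module init, blkcg_policy_unregister() from its module exit, and then
 * enable itself per-queue with blkcg_activate_policy().
 */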
1228
1229/**
1230 * blkcg_policy_unregister - unregister a blkcg policy
1231 * @pol: blkcg policy to unregister
1232 *
1233 * Undo blkcg_policy_register(@pol).  Might sleep.
1234 */
1235void blkcg_policy_unregister(struct blkcg_policy *pol)
1236{
1237        struct blkcg *blkcg;
1238
1239        mutex_lock(&blkcg_pol_register_mutex);
1240
1241        if (WARN_ON(blkcg_policy[pol->plid] != pol))
1242                goto out_unlock;
1243
1244        /* kill the intf files first */
1245        if (pol->cftypes)
1246                cgroup_rm_cftypes(pol->cftypes);
1247
1248        /* remove cpds and unregister */
1249        mutex_lock(&blkcg_pol_mutex);
1250
1251        if (pol->cpd_size) {
1252                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1253                        kfree(blkcg->pd[pol->plid]);
1254                        blkcg->pd[pol->plid] = NULL;
1255                }
1256        }
1257        blkcg_policy[pol->plid] = NULL;
1258
1259        mutex_unlock(&blkcg_pol_mutex);
1260out_unlock:
1261        mutex_unlock(&blkcg_pol_register_mutex);
1262}
1263EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
1264