linux/include/linux/blk-cgroup.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BLK_CGROUP_H
#define _BLK_CGROUP_H
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */

#include <linux/cgroup.h>
#include <linux/percpu_counter.h>
#include <linux/seq_file.h>
#include <linux/radix-tree.h>
#include <linux/blkdev.h>
#include <linux/atomic.h>
#include <linux/kthread.h>

/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
#define BLKG_STAT_CPU_BATCH     (INT_MAX / 2)

/* Max limits for throttle policy */
#define THROTL_IOPS_MAX         UINT_MAX

#ifdef CONFIG_BLK_CGROUP

enum blkg_rwstat_type {
        BLKG_RWSTAT_READ,
        BLKG_RWSTAT_WRITE,
        BLKG_RWSTAT_SYNC,
        BLKG_RWSTAT_ASYNC,
        BLKG_RWSTAT_DISCARD,

        BLKG_RWSTAT_NR,
        BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
};

struct blkcg_gq;

struct blkcg {
        struct cgroup_subsys_state      css;
        spinlock_t                      lock;

        struct radix_tree_root          blkg_tree;
        struct blkcg_gq __rcu           *blkg_hint;
        struct hlist_head               blkg_list;

        struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];

        struct list_head                all_blkcgs_node;
#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head                cgwb_list;
        refcount_t                      cgwb_refcnt;
#endif
};

/*
 * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
 * recursive.  Used to carry stats of dead children, and, for blkg_rwstat,
 * to carry result values from read and sum operations.
 */
struct blkg_stat {
        struct percpu_counter           cpu_cnt;
        atomic64_t                      aux_cnt;
};

struct blkg_rwstat {
        struct percpu_counter           cpu_cnt[BLKG_RWSTAT_NR];
        atomic64_t                      aux_cnt[BLKG_RWSTAT_NR];
};

/*
 * A blkcg_gq (blkg) is an association between a block cgroup (blkcg) and a
 * request_queue (q).  This is used by blkcg policies which need to track
 * information per blkcg - q pair.
 *
 * There can be multiple active blkcg policies and each blkg:policy pair is
 * represented by a blkg_policy_data which is allocated and freed by each
 * policy's pd_alloc/free_fn() methods.  A policy can allocate a private data
 * area by allocating a larger data structure which embeds blkg_policy_data
 * at the beginning.
 */
struct blkg_policy_data {
        /* the blkg and policy id this per-policy data belongs to */
        struct blkcg_gq                 *blkg;
        int                             plid;
};
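
/*
 * Illustrative sketch (not part of this interface): a policy typically
 * embeds blkg_policy_data at the beginning of its own per-blkg structure
 * and converts between the two with container_of().  "foo_grp" and
 * "pd_to_foo" below are hypothetical names used only for this example.
 *
 *	struct foo_grp {
 *		struct blkg_policy_data pd;	// embedded at the beginning
 *		u64			budget;
 *	};
 *
 *	static struct foo_grp *pd_to_foo(struct blkg_policy_data *pd)
 *	{
 *		return pd ? container_of(pd, struct foo_grp, pd) : NULL;
 *	}
 *
 *	static struct foo_grp *blkg_to_foo(struct blkcg_gq *blkg,
 *					   struct blkcg_policy *pol)
 *	{
 *		return pd_to_foo(blkg_to_pd(blkg, pol));
 *	}
 */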

/*
 * Policies that need to keep per-blkcg data which is independent of any
 * request_queue associated with it should implement cpd_alloc/free_fn()
 * methods.  A policy can allocate a private data area by allocating a
 * larger data structure which embeds blkcg_policy_data at the beginning.
 * cpd_init() is invoked to let each policy handle per-blkcg data.
 */
struct blkcg_policy_data {
        /* the blkcg and policy id this per-policy data belongs to */
        struct blkcg                    *blkcg;
        int                             plid;
};

/* association between a blk cgroup and a request queue */
struct blkcg_gq {
        /* Pointer to the associated request_queue */
        struct request_queue            *q;
        struct list_head                q_node;
        struct hlist_node               blkcg_node;
        struct blkcg                    *blkcg;

        /*
         * Each blkg gets congested separately and the congestion state is
         * propagated to the matching bdi_writeback_congested.
         */
        struct bdi_writeback_congested  *wb_congested;

        /* all non-root blkcg_gq's are guaranteed to have access to parent */
        struct blkcg_gq                 *parent;

        /* request allocation list for this blkcg-q pair */
        struct request_list             rl;

        /* reference count */
        atomic_t                        refcnt;

        /* is this blkg online? protected by both blkcg and q locks */
        bool                            online;

        struct blkg_rwstat              stat_bytes;
        struct blkg_rwstat              stat_ios;

        struct blkg_policy_data         *pd[BLKCG_MAX_POLS];

        struct rcu_head                 rcu_head;

        atomic_t                        use_delay;
        atomic64_t                      delay_nsec;
        atomic64_t                      delay_start;
        u64                             last_delay;
        int                             last_use;
};

typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf,
                                      size_t size);

struct blkcg_policy {
        int                             plid;
        /* cgroup files for the policy */
        struct cftype                   *dfl_cftypes;
        struct cftype                   *legacy_cftypes;

        /* operations */
        blkcg_pol_alloc_cpd_fn          *cpd_alloc_fn;
        blkcg_pol_init_cpd_fn           *cpd_init_fn;
        blkcg_pol_free_cpd_fn           *cpd_free_fn;
        blkcg_pol_bind_cpd_fn           *cpd_bind_fn;

        blkcg_pol_alloc_pd_fn           *pd_alloc_fn;
        blkcg_pol_init_pd_fn            *pd_init_fn;
        blkcg_pol_online_pd_fn          *pd_online_fn;
        blkcg_pol_offline_pd_fn         *pd_offline_fn;
        blkcg_pol_free_pd_fn            *pd_free_fn;
        blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
        blkcg_pol_stat_pd_fn            *pd_stat_fn;
};
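
/*
 * Illustrative sketch (hypothetical "foo" policy, reusing the foo_grp and
 * pd_to_foo helpers sketched above): a policy fills in a blkcg_policy with
 * its pd_alloc/free callbacks and registers it once at init time.
 *
 *	static struct blkg_policy_data *foo_pd_alloc(gfp_t gfp, int node)
 *	{
 *		struct foo_grp *fg = kzalloc_node(sizeof(*fg), gfp, node);
 *
 *		return fg ? &fg->pd : NULL;
 *	}
 *
 *	static void foo_pd_free(struct blkg_policy_data *pd)
 *	{
 *		kfree(pd_to_foo(pd));
 *	}
 *
 *	static struct blkcg_policy blkcg_policy_foo = {
 *		.pd_alloc_fn	= foo_pd_alloc,
 *		.pd_free_fn	= foo_pd_free,
 *	};
 *
 *	// at init time:  ret = blkcg_policy_register(&blkcg_policy_foo);
 */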

extern struct blkcg blkcg_root;
extern struct cgroup_subsys_state * const blkcg_root_css;

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
                                      struct request_queue *q, bool update_hint);
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                    struct request_queue *q);
int blkcg_init_queue(struct request_queue *q);
void blkcg_drain_queue(struct request_queue *q);
void blkcg_exit_queue(struct request_queue *q);

/* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol);
void blkcg_policy_unregister(struct blkcg_policy *pol);
int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol);
void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkcg_policy *pol);

const char *blkg_dev_name(struct blkcg_gq *blkg);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total);
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                         const struct blkg_rwstat *rwstat);
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                       int off);
int blkg_print_stat_bytes(struct seq_file *sf, void *v);
int blkg_print_stat_ios(struct seq_file *sf, void *v);
int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);

u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
                            struct blkcg_policy *pol, int off);
struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
                                             struct blkcg_policy *pol, int off);

struct blkg_conf_ctx {
        struct gendisk                  *disk;
        struct blkcg_gq                 *blkg;
        char                            *body;
};

int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   char *input, struct blkg_conf_ctx *ctx);
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
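
/*
 * Illustrative sketch: a cgroup file write handler for the hypothetical
 * "foo" policy would typically parse per-device configuration like this
 * (error handling and parsing trimmed for brevity):
 *
 *	static ssize_t foo_set_limit(struct kernfs_open_file *of, char *buf,
 *				     size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		// ctx.blkg is the blkg for the device named in buf and
 *		// ctx.body points at the rest of the input line
 *		...parse ctx.body and update blkg_to_foo(ctx.blkg, ...)...
 *
 *		blkg_conf_finish(&ctx);
 *		return nbytes;
 *	}
 */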

static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
        return css ? container_of(css, struct blkcg, css) : NULL;
}

static inline struct blkcg *bio_blkcg(struct bio *bio)
{
        struct cgroup_subsys_state *css;

        if (bio && bio->bi_css)
                return css_to_blkcg(bio->bi_css);
        css = kthread_blkcg();
        if (css)
                return css_to_blkcg(css);
        return css_to_blkcg(task_css(current, io_cgrp_id));
}

static inline bool blk_cgroup_congested(void)
{
        struct cgroup_subsys_state *css;
        bool ret = false;

        rcu_read_lock();
        css = kthread_blkcg();
        if (!css)
                css = task_css(current, io_cgrp_id);
        while (css) {
                if (atomic_read(&css->cgroup->congestion_count)) {
                        ret = true;
                        break;
                }
                css = css->parent;
        }
        rcu_read_unlock();
        return ret;
}

/**
 * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
 * @bio: bio of interest
 *
 * Returns true if this bio needs to be submitted with the root blkg context.
 *
 * In order to avoid priority inversions we sometimes need to issue a bio as if
 * it were attached to the root blkg, and then backcharge to the actual owning
 * blkg.  The idea is we do bio_blkcg() to look up the actual context for the
 * bio and attach the appropriate blkg to the bio.  Then we call this helper
 * and, if it returns true, run with the root blkg for that queue and then do
 * any backcharging to the originating cgroup once the io is complete.
 */
static inline bool bio_issue_as_root_blkg(struct bio *bio)
{
        return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
}

/**
 * blkcg_parent - get the parent of a blkcg
 * @blkcg: blkcg of interest
 *
 * Return the parent blkcg of @blkcg.  Can be called anytime.
 */
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
        return css_to_blkcg(blkcg->css.parent);
}

/**
 * __blkg_lookup - internal version of blkg_lookup()
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 * @update_hint: whether to update lookup hint with the result or not
 *
 * This is the internal version and shouldn't be used by policy
 * implementations.  Looks up the blkg for the @blkcg - @q pair regardless of
 * @q's bypass state.  If @update_hint is %true, the caller should be
 * holding @q->queue_lock and the lookup hint is updated on success.
 */
static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
                                             struct request_queue *q,
                                             bool update_hint)
{
        struct blkcg_gq *blkg;

        if (blkcg == &blkcg_root)
                return q->root_blkg;

        blkg = rcu_dereference(blkcg->blkg_hint);
        if (blkg && blkg->q == q)
                return blkg;

        return blkg_lookup_slowpath(blkcg, q, update_hint);
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  This function should be called
 * under the RCU read lock and is guaranteed to return %NULL if @q is
 * bypassing - see blk_queue_bypass_start() for details.
 */
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
                                           struct request_queue *q)
{
        WARN_ON_ONCE(!rcu_read_lock_held());

        if (unlikely(blk_queue_bypass(q)))
                return NULL;
        return __blkg_lookup(blkcg, q, false);
}
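
/*
 * Illustrative sketch: a typical lookup of the per-policy data for the
 * cgroup a bio belongs to (hypothetical "foo" policy, error paths trimmed):
 *
 *	struct foo_grp *fg = NULL;
 *	struct blkcg_gq *blkg;
 *
 *	rcu_read_lock();
 *	blkg = blkg_lookup(bio_blkcg(bio), q);
 *	if (blkg)
 *		fg = blkg_to_foo(blkg, &blkcg_policy_foo);
 *	...use fg while still under RCU or after taking a reference...
 *	rcu_read_unlock();
 */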

/**
 * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
 * @q: request_queue of interest
 *
 * Lookup blkg for @q at the root level. See also blkg_lookup().
 */
static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
{
        return q->root_blkg;
}

/**
 * blkg_to_pd - get policy private data
 * @blkg: blkg of interest
 * @pol: policy of interest
 *
 * Return pointer to private data associated with the @blkg-@pol pair.
 */
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol)
{
        return blkg ? blkg->pd[pol->plid] : NULL;
}

static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
                                                     struct blkcg_policy *pol)
{
        return blkcg ? blkcg->cpd[pol->plid] : NULL;
}

/**
 * pd_to_blkg - get blkg associated with policy private data
 * @pd: policy private data of interest
 *
 * @pd is policy private data.  Determine the blkg it's associated with.
 */
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
{
        return pd ? pd->blkg : NULL;
}

static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
{
        return cpd ? cpd->blkcg : NULL;
}

extern void blkcg_destroy_blkgs(struct blkcg *blkcg);

#ifdef CONFIG_CGROUP_WRITEBACK

/**
 * blkcg_cgwb_get - get a reference for blkcg->cgwb_list
 * @blkcg: blkcg of interest
 *
 * This is used to track the number of active wb's related to a blkcg.
 */
static inline void blkcg_cgwb_get(struct blkcg *blkcg)
{
        refcount_inc(&blkcg->cgwb_refcnt);
}

/**
 * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list
 * @blkcg: blkcg of interest
 *
 * This is used to track the number of active wb's related to a blkcg.
 * When this count goes to zero, all active wbs have finished so the
 * blkcg can continue destruction by calling blkcg_destroy_blkgs().
 * This work may occur in cgwb_release_workfn() on the cgwb_release
 * workqueue.
 */
static inline void blkcg_cgwb_put(struct blkcg *blkcg)
{
        if (refcount_dec_and_test(&blkcg->cgwb_refcnt))
                blkcg_destroy_blkgs(blkcg);
}

#else

static inline void blkcg_cgwb_get(struct blkcg *blkcg) { }

static inline void blkcg_cgwb_put(struct blkcg *blkcg)
{
        /* wb isn't being accounted, so trigger destruction right away */
        blkcg_destroy_blkgs(blkcg);
}

#endif

/**
 * blkg_path - format cgroup path of blkg
 * @blkg: blkg of interest
 * @buf: target buffer
 * @buflen: target buffer length
 *
 * Format the path of the cgroup of @blkg into @buf.
 */
static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
{
        return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
}

/**
 * blkg_get - get a blkg reference
 * @blkg: blkg to get
 *
 * The caller should be holding an existing reference.
 */
static inline void blkg_get(struct blkcg_gq *blkg)
{
        WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
        atomic_inc(&blkg->refcnt);
}

/**
 * blkg_try_get - try and get a blkg reference
 * @blkg: blkg to get
 *
 * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
 * of freeing this blkg, so we can only use it if the refcnt is not zero.
 */
static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
{
        if (atomic_inc_not_zero(&blkg->refcnt))
                return blkg;
        return NULL;
}

void __blkg_release_rcu(struct rcu_head *rcu);

/**
 * blkg_put - put a blkg reference
 * @blkg: blkg to put
 */
static inline void blkg_put(struct blkcg_gq *blkg)
{
        WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
        if (atomic_dec_and_test(&blkg->refcnt))
                call_rcu(&blkg->rcu_head, __blkg_release_rcu);
}
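
/*
 * Illustrative sketch: pinning a blkg found via an RCU lookup so it can be
 * used after rcu_read_unlock() (assumes @blkcg and @q are valid):
 *
 *	struct blkcg_gq *blkg;
 *
 *	rcu_read_lock();
 *	blkg = blkg_lookup(blkcg, q);
 *	if (blkg)
 *		blkg = blkg_try_get(blkg);	// NULL if already being freed
 *	rcu_read_unlock();
 *
 *	if (blkg) {
 *		...use blkg outside the RCU read section...
 *		blkg_put(blkg);
 *	}
 */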
 485
 486/**
 487 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
 488 * @d_blkg: loop cursor pointing to the current descendant
 489 * @pos_css: used for iteration
 490 * @p_blkg: target blkg to walk descendants of
 491 *
 492 * Walk @c_blkg through the descendants of @p_blkg.  Must be used with RCU
 493 * read locked.  If called under either blkcg or queue lock, the iteration
 494 * is guaranteed to include all and only online blkgs.  The caller may
 495 * update @pos_css by calling css_rightmost_descendant() to skip subtree.
 496 * @p_blkg is included in the iteration and the first node to be visited.
 497 */
 498#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)           \
 499        css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)   \
 500                if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
 501                                              (p_blkg)->q, false)))
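
/*
 * Illustrative sketch: propagating a configuration change from @parent_blkg
 * to every descendant blkg on the same queue (hypothetical "foo" policy):
 *
 *	struct blkcg_gq *blkg;
 *	struct cgroup_subsys_state *pos_css;
 *
 *	rcu_read_lock();
 *	blkg_for_each_descendant_pre(blkg, pos_css, parent_blkg) {
 *		struct foo_grp *fg = blkg_to_foo(blkg, &blkcg_policy_foo);
 *
 *		if (fg)
 *			...recompute fg's effective limits...
 *	}
 *	rcu_read_unlock();
 */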

/**
 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Similar to blkg_for_each_descendant_pre() but performs post-order
 * traversal instead.  Synchronization rules are the same.  @p_blkg is
 * included in the iteration and the last node to be visited.
 */
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)          \
        css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)  \
                if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
                                              (p_blkg)->q, false)))

/**
 * blk_get_rl - get request_list to use
 * @q: request_queue of interest
 * @bio: bio which will be attached to the allocated request (may be %NULL)
 *
 * The caller wants to allocate a request from @q to use for @bio.  Find
 * the request_list to use and obtain a reference on it.  Should be called
 * under queue_lock.  This function is guaranteed to return a non-%NULL
 * request_list.
 */
static inline struct request_list *blk_get_rl(struct request_queue *q,
                                              struct bio *bio)
{
        struct blkcg *blkcg;
        struct blkcg_gq *blkg;

        rcu_read_lock();

        blkcg = bio_blkcg(bio);

        /* bypass blkg lookup and use @q->root_rl directly for root */
        if (blkcg == &blkcg_root)
                goto root_rl;

        /*
         * Try to use blkg->rl.  blkg lookup may fail under memory pressure
         * or if either the blkcg or queue is going away.  Fall back to
         * root_rl in such cases.
         */
        blkg = blkg_lookup(blkcg, q);
        if (unlikely(!blkg))
                goto root_rl;

        blkg_get(blkg);
        rcu_read_unlock();
        return &blkg->rl;
root_rl:
        rcu_read_unlock();
        return &q->root_rl;
}

/**
 * blk_put_rl - put request_list
 * @rl: request_list to put
 *
 * Put the reference acquired by blk_get_rl().  Should be called under
 * queue_lock.
 */
static inline void blk_put_rl(struct request_list *rl)
{
        if (rl->blkg->blkcg != &blkcg_root)
                blkg_put(rl->blkg);
}
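
/*
 * Illustrative sketch: the legacy request allocation path pairs these
 * helpers roughly as follows (simplified, queue_lock held across the calls):
 *
 *	struct request_list *rl;
 *	struct request *rq;
 *
 *	rl = blk_get_rl(q, bio);	// reference on blkg->rl or q->root_rl
 *	rq = ...allocate a request from rl...;
 *	if (!rq) {
 *		blk_put_rl(rl);		// drop the reference on failure
 *		return NULL;
 *	}
 *	blk_rq_set_rl(rq, rl);		// freeing the request puts rl later
 */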

/**
 * blk_rq_set_rl - associate a request with a request_list
 * @rq: request of interest
 * @rl: target request_list
 *
 * Associate @rq with @rl so that accounting and freeing can know the
 * request_list @rq came from.
 */
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
{
        rq->rl = rl;
}

/**
 * blk_rq_rl - return the request_list a request came from
 * @rq: request of interest
 *
 * Return the request_list @rq is allocated from.
 */
static inline struct request_list *blk_rq_rl(struct request *rq)
{
        return rq->rl;
}

struct request_list *__blk_queue_next_rl(struct request_list *rl,
                                         struct request_queue *q);
/**
 * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
 * @rl: request_list cursor
 * @q: request_queue of interest
 *
 * Should be used under queue_lock.
 */
#define blk_queue_for_each_rl(rl, q)    \
        for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
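
/*
 * Illustrative sketch: walking every request_list of a queue, roughly what
 * the block core does when waking waiters on queue teardown (queue_lock held):
 *
 *	struct request_list *rl;
 *
 *	blk_queue_for_each_rl(rl, q) {
 *		...wake up or drain waiters on rl...
 *	}
 */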

static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
{
        int ret;

        ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
        if (ret)
                return ret;

        atomic64_set(&stat->aux_cnt, 0);
        return 0;
}

static inline void blkg_stat_exit(struct blkg_stat *stat)
{
        percpu_counter_destroy(&stat->cpu_cnt);
}

/**
 * blkg_stat_add - add a value to a blkg_stat
 * @stat: target blkg_stat
 * @val: value to add
 *
 * Add @val to @stat.  The caller must ensure that IRQs on the same CPU
 * don't re-enter this function for the same counter.
 */
static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
{
        percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
}

/**
 * blkg_stat_read - read the current value of a blkg_stat
 * @stat: blkg_stat to read
 */
static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
{
        return percpu_counter_sum_positive(&stat->cpu_cnt);
}
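
/*
 * Illustrative sketch of the blkg_stat lifecycle, typically driven from a
 * policy's pd_alloc_fn()/pd_free_fn() (hypothetical field "fg->serviced"):
 *
 *	if (blkg_stat_init(&fg->serviced, gfp))
 *		return NULL;			// percpu counter allocation failed
 *
 *	blkg_stat_add(&fg->serviced, 1);	// in the hot path
 *
 *	seq_printf(sf, "%llu\n",		// when printing stats
 *		   (unsigned long long)blkg_stat_read(&fg->serviced));
 *
 *	blkg_stat_exit(&fg->serviced);		// in pd_free_fn()
 */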

/**
 * blkg_stat_reset - reset a blkg_stat
 * @stat: blkg_stat to reset
 */
static inline void blkg_stat_reset(struct blkg_stat *stat)
{
        percpu_counter_set(&stat->cpu_cnt, 0);
        atomic64_set(&stat->aux_cnt, 0);
}

/**
 * blkg_stat_add_aux - add a blkg_stat into another's aux count
 * @to: the destination blkg_stat
 * @from: the source
 *
 * Add @from's count including the aux one to @to's aux count.
 */
static inline void blkg_stat_add_aux(struct blkg_stat *to,
                                     struct blkg_stat *from)
{
        atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
                     &to->aux_cnt);
}

static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
{
        int i, ret;

        for (i = 0; i < BLKG_RWSTAT_NR; i++) {
                ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
                if (ret) {
                        while (--i >= 0)
                                percpu_counter_destroy(&rwstat->cpu_cnt[i]);
                        return ret;
                }
                atomic64_set(&rwstat->aux_cnt[i], 0);
        }
        return 0;
}

static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
{
        int i;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                percpu_counter_destroy(&rwstat->cpu_cnt[i]);
}

/**
 * blkg_rwstat_add - add a value to a blkg_rwstat
 * @rwstat: target blkg_rwstat
 * @op: REQ_OP and flags
 * @val: value to add
 *
 * Add @val to @rwstat.  The counters are chosen according to @op.  The
 * caller is responsible for synchronizing calls to this function.
 */
static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
                                   unsigned int op, uint64_t val)
{
        struct percpu_counter *cnt;

        if (op_is_discard(op))
                cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
        else if (op_is_write(op))
                cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
        else
                cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];

        percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);

        if (op_is_sync(op))
                cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
        else
                cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];

        percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
}
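
/*
 * Illustrative sketch: accounting a bio in a policy's own blkg_rwstat
 * counters (hypothetical fields "fg->bytes" and "fg->ios"); the direction
 * and sync/async buckets are derived from bio->bi_opf, exactly as done for
 * blkg->stat_bytes/stat_ios in blkcg_bio_issue_check() below:
 *
 *	blkg_rwstat_add(&fg->bytes, bio->bi_opf, bio->bi_iter.bi_size);
 *	blkg_rwstat_add(&fg->ios, bio->bi_opf, 1);
 */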

/**
 * blkg_rwstat_read - read the current values of a blkg_rwstat
 * @rwstat: blkg_rwstat to read
 *
 * Read the current snapshot of @rwstat and return it in the aux counts.
 */
static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
{
        struct blkg_rwstat result;
        int i;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                atomic64_set(&result.aux_cnt[i],
                             percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
        return result;
}

/**
 * blkg_rwstat_total - read the total count of a blkg_rwstat
 * @rwstat: blkg_rwstat to read
 *
 * Return the total count of @rwstat regardless of the IO direction.  This
 * function can be called without synchronization and takes care of u64
 * atomicity.
 */
static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
{
        struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);

        return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
                atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
}

/**
 * blkg_rwstat_reset - reset a blkg_rwstat
 * @rwstat: blkg_rwstat to reset
 */
static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
{
        int i;

        for (i = 0; i < BLKG_RWSTAT_NR; i++) {
                percpu_counter_set(&rwstat->cpu_cnt[i], 0);
                atomic64_set(&rwstat->aux_cnt[i], 0);
        }
}

/**
 * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
 * @to: the destination blkg_rwstat
 * @from: the source
 *
 * Add @from's count including the aux one to @to's aux count.
 */
static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
                                       struct blkg_rwstat *from)
{
        u64 sum[BLKG_RWSTAT_NR];
        int i;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
                             &to->aux_cnt[i]);
}

#ifdef CONFIG_BLK_DEV_THROTTLING
extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                           struct bio *bio);
#else
static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                                  struct bio *bio) { return false; }
#endif

static inline bool blkcg_bio_issue_check(struct request_queue *q,
                                         struct bio *bio)
{
        struct blkcg *blkcg;
        struct blkcg_gq *blkg;
        bool throtl = false;

        rcu_read_lock();
        blkcg = bio_blkcg(bio);

        /* associate blkcg if bio hasn't attached one */
        bio_associate_blkcg(bio, &blkcg->css);

        blkg = blkg_lookup(blkcg, q);
        if (unlikely(!blkg)) {
                spin_lock_irq(q->queue_lock);
                blkg = blkg_lookup_create(blkcg, q);
                if (IS_ERR(blkg))
                        blkg = NULL;
                spin_unlock_irq(q->queue_lock);
        }

        throtl = blk_throtl_bio(q, blkg, bio);

        if (!throtl) {
                blkg = blkg ?: q->root_blkg;
                /*
                 * If the bio is flagged with BIO_QUEUE_ENTERED it means this
                 * is a split bio and we would have already accounted for the
                 * size of the bio.
                 */
                if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
                        blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
                                        bio->bi_iter.bi_size);
                blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
        }

        rcu_read_unlock();
        return !throtl;
}

static inline void blkcg_use_delay(struct blkcg_gq *blkg)
{
        if (atomic_add_return(1, &blkg->use_delay) == 1)
                atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
}

static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
{
        int old = atomic_read(&blkg->use_delay);

        if (old == 0)
                return 0;

        /*
         * We do this song and dance because we can race with somebody else
         * adding or removing delay.  If we just did an atomic_dec we'd end up
         * negative and we'd already be in trouble.  We need to subtract 1 and
         * then check to see if we were the last delay so we can drop the
         * congestion count on the cgroup.
         */
        while (old) {
                int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
                if (cur == old)
                        break;
                old = cur;
        }

        if (old == 0)
                return 0;
        if (old == 1)
                atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
        return 1;
}

static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
{
        int old = atomic_read(&blkg->use_delay);
        if (!old)
                return;
        /* We only want 1 person clearing the congestion count for this blkg. */
        while (old) {
                int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
                if (cur == old) {
                        atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
                        break;
                }
                old = cur;
        }
}

void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
void blkcg_maybe_throttle_current(void);
#else   /* CONFIG_BLK_CGROUP */

struct blkcg {
};

struct blkg_policy_data {
};

struct blkcg_policy_data {
};

struct blkcg_gq {
};

struct blkcg_policy {
};

#define blkcg_root_css  ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))

static inline void blkcg_maybe_throttle_current(void) { }
static inline bool blk_cgroup_congested(void) { return false; }

#ifdef CONFIG_BLOCK

static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }

static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
                                           struct request_queue *q) { return NULL; }
static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
{ return NULL; }
static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
static inline void blkcg_drain_queue(struct request_queue *q) { }
static inline void blkcg_exit_queue(struct request_queue *q) { }
static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
static inline int blkcg_activate_policy(struct request_queue *q,
                                        const struct blkcg_policy *pol) { return 0; }
static inline void blkcg_deactivate_policy(struct request_queue *q,
                                           const struct blkcg_policy *pol) { }

static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }

static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { return 0; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }

static inline struct request_list *blk_get_rl(struct request_queue *q,
                                              struct bio *bio) { return &q->root_rl; }
static inline void blk_put_rl(struct request_list *rl) { }
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }

static inline bool blkcg_bio_issue_check(struct request_queue *q,
                                         struct bio *bio) { return true; }

#define blk_queue_for_each_rl(rl, q)    \
        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)

#endif  /* CONFIG_BLOCK */
#endif  /* CONFIG_BLK_CGROUP */
#endif  /* _BLK_CGROUP_H */