   1/* SPDX-License-Identifier: GPL-2.0
   2 *
   3 * IO cost model based controller.
   4 *
   5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
   6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
   7 * Copyright (C) 2019 Facebook
   8 *
    9 * One challenge of controlling IO resources is the lack of a trivially
  10 * observable cost metric.  This is distinguished from CPU and memory where
  11 * wallclock time and the number of bytes can serve as accurate enough
  12 * approximations.
  13 *
  14 * Bandwidth and iops are the most commonly used metrics for IO devices but
  15 * depending on the type and specifics of the device, different IO patterns
  16 * easily lead to multiple orders of magnitude variations rendering them
  17 * useless for the purpose of IO capacity distribution.  While on-device
   18 * time, with a lot of crutches, could serve as a useful approximation for
  19 * non-queued rotational devices, this is no longer viable with modern
  20 * devices, even the rotational ones.
  21 *
  22 * While there is no cost metric we can trivially observe, it isn't a
  23 * complete mystery.  For example, on a rotational device, seek cost
  24 * dominates while a contiguous transfer contributes a smaller amount
  25 * proportional to the size.  If we can characterize at least the relative
  26 * costs of these different types of IOs, it should be possible to
  27 * implement a reasonable work-conserving proportional IO resource
  28 * distribution.
  29 *
  30 * 1. IO Cost Model
  31 *
  32 * IO cost model estimates the cost of an IO given its basic parameters and
  33 * history (e.g. the end sector of the last IO).  The cost is measured in
  34 * device time.  If a given IO is estimated to cost 10ms, the device should
  35 * be able to process ~100 of those IOs in a second.
  36 *
  37 * Currently, there's only one builtin cost model - linear.  Each IO is
  38 * classified as sequential or random and given a base cost accordingly.
  39 * On top of that, a size cost proportional to the length of the IO is
  40 * added.  While simple, this model captures the operational
   41 * characteristics of a wide variety of devices well enough.  Default
  42 * parameters for several different classes of devices are provided and the
  43 * parameters can be configured from userspace via
  44 * /sys/fs/cgroup/io.cost.model.
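 *
 * In other words, the absolute cost of an IO under the linear model is
 * approximately
 *
 *   abs_cost = base_cost(seq|rand, read|write) + nr_4k_pages * page_cost
 *
 * with the coefficients derived from the configured maximum bandwidth and
 * iops values (see calc_lcoefs() below).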
  45 *
  46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47 * device-specific coefficients.
  48 *
  49 * 2. Control Strategy
  50 *
  51 * The device virtual time (vtime) is used as the primary control metric.
  52 * The control strategy is composed of the following three parts.
  53 *
  54 * 2-1. Vtime Distribution
  55 *
  56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57 * calculated.  Please consider the following hierarchy where the numbers
  58 * inside parentheses denote the configured weights.
  59 *
  60 *           root
  61 *         /       \
  62 *      A (w:100)  B (w:300)
  63 *      /       \
  64 *  A0 (w:100)  A1 (w:100)
  65 *
  66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67 * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
   68 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
  69 * 12.5% each.  The distribution mechanism only cares about these flattened
  70 * shares.  They're called hweights (hierarchical weights) and always add
   71 * up to 1 (WEIGHT_ONE).
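 *
 * For example, A0's hweight above is the product of its share at each
 * level: 100 / (100 + 100) * 100 / (100 + 300) = 1/2 * 1/4 = 12.5%.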
  72 *
  73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75 * against the device vtime - an IO which takes 10ms on the underlying
  76 * device is considered to take 80ms on A0.
  77 *
  78 * This constitutes the basis of IO capacity distribution.  Each cgroup's
  79 * vtime is running at a rate determined by its hweight.  A cgroup tracks
  80 * the vtime consumed by past IOs and can issue a new IO if doing so
  81 * wouldn't outrun the current device vtime.  Otherwise, the IO is
  82 * suspended until the vtime has progressed enough to cover it.
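 *
 * In rough pseudo-code, ignoring margins and debt handling, the issue-time
 * check looks like:
 *
 *   cost = abs_cost * WEIGHT_ONE / hweight_inuse;
 *   if (iocg_vtime + cost <= device_vnow)
 *           issue the IO and charge cost to iocg_vtime;
 *   else
 *           wait until device_vnow has advanced far enough;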
  83 *
  84 * 2-2. Vrate Adjustment
  85 *
  86 * It's unrealistic to expect the cost model to be perfect.  There are too
  87 * many devices and even on the same device the overall performance
  88 * fluctuates depending on numerous factors such as IO mixture and device
  89 * internal garbage collection.  The controller needs to adapt dynamically.
  90 *
  91 * This is achieved by adjusting the overall IO rate according to how busy
  92 * the device is.  If the device becomes overloaded, we're sending down too
  93 * many IOs and should generally slow down.  If there are waiting issuers
  94 * but the device isn't saturated, we're issuing too few and should
  95 * generally speed up.
  96 *
  97 * To slow down, we lower the vrate - the rate at which the device vtime
  98 * passes compared to the wall clock.  For example, if the vtime is running
  99 * at the vrate of 75%, all cgroups added up would only be able to issue
 100 * 750ms worth of IOs per second, and vice-versa for speeding up.
 101 *
  102 * Device busyness is determined using two criteria - rq wait and
 103 * completion latencies.
 104 *
 105 * When a device gets saturated, the on-device and then the request queues
 106 * fill up and a bio which is ready to be issued has to wait for a request
 107 * to become available.  When this delay becomes noticeable, it's a clear
 108 * indication that the device is saturated and we lower the vrate.  This
 109 * saturation signal is fairly conservative as it only triggers when both
 110 * hardware and software queues are filled up, and is used as the default
 111 * busy signal.
 112 *
 113 * As devices can have deep queues and be unfair in how the queued commands
  114 * are executed, solely depending on rq wait may not result in satisfactory
 115 * control quality.  For a better control quality, completion latency QoS
 116 * parameters can be configured so that the device is considered saturated
 117 * if N'th percentile completion latency rises above the set point.
 118 *
 119 * The completion latency requirements are a function of both the
 120 * underlying device characteristics and the desired IO latency quality of
 121 * service.  There is an inherent trade-off - the tighter the latency QoS,
 122 * the higher the bandwidth lossage.  Latency QoS is disabled by default
 123 * and can be set through /sys/fs/cgroup/io.cost.qos.
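 *
 * For example, the following (illustrative) configuration considers the
 * device saturated when the 95th percentile read completion latency
 * exceeds 5ms or the 95th percentile write completion latency exceeds 10ms:
 *
 *   echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=5000 wpct=95.00 wlat=10000" \
 *     > /sys/fs/cgroup/io.cost.qos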
 124 *
 125 * 2-3. Work Conservation
 126 *
 127 * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
 128 * periodically while B is sending out enough parallel IOs to saturate the
 129 * device on its own.  Let's say A's usage amounts to 100ms worth of IO
 130 * cost per second, i.e., 10% of the device capacity.  The naive
 131 * distribution of half and half would lead to 60% utilization of the
 132 * device, a significant reduction in the total amount of work done
 133 * compared to free-for-all competition.  This is too high a cost to pay
 134 * for IO control.
 135 *
 136 * To conserve the total amount of work done, we keep track of how much
 137 * each active cgroup is actually using and yield part of its weight if
 138 * there are other cgroups which can make use of it.  In the above case,
 139 * A's weight will be lowered so that it hovers above the actual usage and
 140 * B would be able to use the rest.
 141 *
 142 * As we don't want to penalize a cgroup for donating its weight, the
 143 * surplus weight adjustment factors in a margin and has an immediate
 144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
 145 *
 146 * Note that adjusting down surplus weights has the same effects as
 147 * accelerating vtime for other cgroups and work conservation can also be
  148 * implemented by adjusting vrate dynamically.  However, squaring away who
  149 * can donate and who should take back how much requires hweight
  150 * propagations anyway, making it easier to implement and understand as a
  151 * separate mechanism.
 152 *
 153 * 3. Monitoring
 154 *
 155 * Instead of debugfs or other clumsy monitoring mechanisms, this
 156 * controller uses a drgn based monitoring script -
 157 * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 158 * https://github.com/osandov/drgn.  The output looks like the following.
 159 *
 160 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 161 *                 active      weight      hweight% inflt% dbt  delay usages%
 162 *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 163 *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 164 *
 165 * - per        : Timer period
 166 * - cur_per    : Internal wall and device vtime clock
 167 * - vrate      : Device virtual time rate against wall clock
 168 * - weight     : Surplus-adjusted and configured weights
 169 * - hweight    : Surplus-adjusted and configured hierarchical weights
 170 * - inflt      : The percentage of in-flight IO cost at the end of last period
  171 * - delay      : Deferred issuer delay induction level and duration
 172 * - usages     : Usage history
 173 */
 174
 175#include <linux/kernel.h>
 176#include <linux/module.h>
 177#include <linux/timer.h>
 178#include <linux/time64.h>
 179#include <linux/parser.h>
 180#include <linux/sched/signal.h>
 181#include <linux/blk-cgroup.h>
 182#include <asm/local.h>
 183#include <asm/local64.h>
 184#include "blk-rq-qos.h"
 185#include "blk-stat.h"
 186#include "blk-wbt.h"
 187
 188#ifdef CONFIG_TRACEPOINTS
 189
 190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
 191#define TRACE_IOCG_PATH_LEN 1024
 192static DEFINE_SPINLOCK(trace_iocg_path_lock);
 193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
 194
 195#define TRACE_IOCG_PATH(type, iocg, ...)                                        \
 196        do {                                                                    \
 197                unsigned long flags;                                            \
 198                if (trace_iocost_##type##_enabled()) {                          \
 199                        spin_lock_irqsave(&trace_iocg_path_lock, flags);        \
 200                        cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,      \
 201                                    trace_iocg_path, TRACE_IOCG_PATH_LEN);      \
 202                        trace_iocost_##type(iocg, trace_iocg_path,              \
 203                                              ##__VA_ARGS__);                   \
 204                        spin_unlock_irqrestore(&trace_iocg_path_lock, flags);   \
 205                }                                                               \
 206        } while (0)
 207
  208#else   /* CONFIG_TRACEPOINTS */
  209#define TRACE_IOCG_PATH(type, iocg, ...)        do { } while (0)
  210#endif  /* CONFIG_TRACEPOINTS */
 211
 212enum {
 213        MILLION                 = 1000000,
 214
 215        /* timer period is calculated from latency requirements, bound it */
 216        MIN_PERIOD              = USEC_PER_MSEC,
 217        MAX_PERIOD              = USEC_PER_SEC,
 218
 219        /*
 220         * iocg->vtime is targeted at 50% behind the device vtime, which
 221         * serves as its IO credit buffer.  Surplus weight adjustment is
 222         * immediately canceled if the vtime margin runs below 10%.
 223         */
 224        MARGIN_MIN_PCT          = 10,
 225        MARGIN_LOW_PCT          = 20,
 226        MARGIN_TARGET_PCT       = 50,
 227
 228        INUSE_ADJ_STEP_PCT      = 25,
 229
 230        /* Have some play in timer operations */
 231        TIMER_SLACK_PCT         = 1,
 232
 233        /* 1/64k is granular enough and can easily be handled w/ u32 */
 234        WEIGHT_ONE              = 1 << 16,
 235
 236        /*
 237         * As vtime is used to calculate the cost of each IO, it needs to
 238         * be fairly high precision.  For example, it should be able to
 239         * represent the cost of a single page worth of discard with
  240         * sufficient accuracy.  At the same time, it should be able to
 241         * represent reasonably long enough durations to be useful and
 242         * convenient during operation.
 243         *
 244         * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
 245         * granularity and days of wrap-around time even at extreme vrates.
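         *
         * For example, VTIME_PER_USEC is 2^37 / 10^6 ~= 137438, i.e. one
         * microsecond of device time is worth roughly 137k vtime units.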
 246         */
 247        VTIME_PER_SEC_SHIFT     = 37,
 248        VTIME_PER_SEC           = 1LLU << VTIME_PER_SEC_SHIFT,
 249        VTIME_PER_USEC          = VTIME_PER_SEC / USEC_PER_SEC,
 250        VTIME_PER_NSEC          = VTIME_PER_SEC / NSEC_PER_SEC,
 251
 252        /* bound vrate adjustments within two orders of magnitude */
 253        VRATE_MIN_PPM           = 10000,        /* 1% */
 254        VRATE_MAX_PPM           = 100000000,    /* 10000% */
 255
 256        VRATE_MIN               = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
 257        VRATE_CLAMP_ADJ_PCT     = 4,
 258
 259        /* if IOs end up waiting for requests, issue less */
 260        RQ_WAIT_BUSY_PCT        = 5,
 261
  262        /* unbusy hysteresis */
 263        UNBUSY_THR_PCT          = 75,
 264
 265        /*
 266         * The effect of delay is indirect and non-linear and a huge amount of
 267         * future debt can accumulate abruptly while unthrottled. Linearly scale
 268         * up delay as debt is going up and then let it decay exponentially.
 269         * This gives us quick ramp ups while delay is accumulating and long
  270         * tails which can help reduce the frequency of debt explosions on
 271         * unthrottle. The parameters are experimentally determined.
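         *
         * Concretely (see iocg_kick_delay()): the target delay is zero while
         * the debt overage stays below MIN_DELAY_THR_PCT (5x a period's worth
         * of vtime), scales linearly from MIN_DELAY up to MAX_DELAY at
         * MAX_DELAY_THR_PCT, and a delay in effect is halved every second.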
 272         *
 273         * The delay mechanism provides adequate protection and behavior in many
  274         * cases. However, this is far from ideal and falls short on both
 275         * fronts. The debtors are often throttled too harshly costing a
 276         * significant level of fairness and possibly total work while the
 277         * protection against their impacts on the system can be choppy and
 278         * unreliable.
 279         *
 280         * The shortcoming primarily stems from the fact that, unlike for page
 281         * cache, the kernel doesn't have well-defined back-pressure propagation
 282         * mechanism and policies for anonymous memory. Fully addressing this
 283         * issue will likely require substantial improvements in the area.
 284         */
 285        MIN_DELAY_THR_PCT       = 500,
 286        MAX_DELAY_THR_PCT       = 25000,
 287        MIN_DELAY               = 250,
 288        MAX_DELAY               = 250 * USEC_PER_MSEC,
 289
 290        /* halve debts if avg usage over 100ms is under 50% */
 291        DFGV_USAGE_PCT          = 50,
 292        DFGV_PERIOD             = 100 * USEC_PER_MSEC,
 293
 294        /* don't let cmds which take a very long time pin lagging for too long */
 295        MAX_LAGGING_PERIODS     = 10,
 296
 297        /* switch iff the conditions are met for longer than this */
 298        AUTOP_CYCLE_NSEC        = 10LLU * NSEC_PER_SEC,
 299
 300        /*
  301         * Count IO size in 4k pages.  The 12-bit shift keeps the
  302         * size-proportional components of the cost calculation within a
  303         * similar number of digits as the per-IO cost components.
 304         */
 305        IOC_PAGE_SHIFT          = 12,
 306        IOC_PAGE_SIZE           = 1 << IOC_PAGE_SHIFT,
 307        IOC_SECT_TO_PAGE_SHIFT  = IOC_PAGE_SHIFT - SECTOR_SHIFT,
 308
 309        /* if apart further than 16M, consider randio for linear model */
 310        LCOEF_RANDIO_PAGES      = 4096,
 311};
 312
 313enum ioc_running {
 314        IOC_IDLE,
 315        IOC_RUNNING,
 316        IOC_STOP,
 317};
 318
 319/* io.cost.qos controls including per-dev enable of the whole controller */
 320enum {
 321        QOS_ENABLE,
 322        QOS_CTRL,
 323        NR_QOS_CTRL_PARAMS,
 324};
 325
 326/* io.cost.qos params */
 327enum {
 328        QOS_RPPM,
 329        QOS_RLAT,
 330        QOS_WPPM,
 331        QOS_WLAT,
 332        QOS_MIN,
 333        QOS_MAX,
 334        NR_QOS_PARAMS,
 335};
 336
 337/* io.cost.model controls */
 338enum {
 339        COST_CTRL,
 340        COST_MODEL,
 341        NR_COST_CTRL_PARAMS,
 342};
 343
 344/* builtin linear cost model coefficients */
 345enum {
 346        I_LCOEF_RBPS,
 347        I_LCOEF_RSEQIOPS,
 348        I_LCOEF_RRANDIOPS,
 349        I_LCOEF_WBPS,
 350        I_LCOEF_WSEQIOPS,
 351        I_LCOEF_WRANDIOPS,
 352        NR_I_LCOEFS,
 353};
 354
 355enum {
 356        LCOEF_RPAGE,
 357        LCOEF_RSEQIO,
 358        LCOEF_RRANDIO,
 359        LCOEF_WPAGE,
 360        LCOEF_WSEQIO,
 361        LCOEF_WRANDIO,
 362        NR_LCOEFS,
 363};
 364
 365enum {
 366        AUTOP_INVALID,
 367        AUTOP_HDD,
 368        AUTOP_SSD_QD1,
 369        AUTOP_SSD_DFL,
 370        AUTOP_SSD_FAST,
 371};
 372
 373struct ioc_params {
 374        u32                             qos[NR_QOS_PARAMS];
 375        u64                             i_lcoefs[NR_I_LCOEFS];
 376        u64                             lcoefs[NR_LCOEFS];
 377        u32                             too_fast_vrate_pct;
 378        u32                             too_slow_vrate_pct;
 379};
 380
 381struct ioc_margins {
 382        s64                             min;
 383        s64                             low;
 384        s64                             target;
 385};
 386
 387struct ioc_missed {
 388        local_t                         nr_met;
 389        local_t                         nr_missed;
 390        u32                             last_met;
 391        u32                             last_missed;
 392};
 393
 394struct ioc_pcpu_stat {
 395        struct ioc_missed               missed[2];
 396
 397        local64_t                       rq_wait_ns;
 398        u64                             last_rq_wait_ns;
 399};
 400
 401/* per device */
 402struct ioc {
 403        struct rq_qos                   rqos;
 404
 405        bool                            enabled;
 406
 407        struct ioc_params               params;
 408        struct ioc_margins              margins;
 409        u32                             period_us;
 410        u32                             timer_slack_ns;
 411        u64                             vrate_min;
 412        u64                             vrate_max;
 413
 414        spinlock_t                      lock;
 415        struct timer_list               timer;
 416        struct list_head                active_iocgs;   /* active cgroups */
 417        struct ioc_pcpu_stat __percpu   *pcpu_stat;
 418
 419        enum ioc_running                running;
 420        atomic64_t                      vtime_rate;
 421        u64                             vtime_base_rate;
 422        s64                             vtime_err;
 423
 424        seqcount_spinlock_t             period_seqcount;
 425        u64                             period_at;      /* wallclock starttime */
 426        u64                             period_at_vtime; /* vtime starttime */
 427
 428        atomic64_t                      cur_period;     /* inc'd each period */
 429        int                             busy_level;     /* saturation history */
 430
 431        bool                            weights_updated;
 432        atomic_t                        hweight_gen;    /* for lazy hweights */
 433
  434        /* debt forgiveness */
 435        u64                             dfgv_period_at;
 436        u64                             dfgv_period_rem;
 437        u64                             dfgv_usage_us_sum;
 438
 439        u64                             autop_too_fast_at;
 440        u64                             autop_too_slow_at;
 441        int                             autop_idx;
 442        bool                            user_qos_params:1;
 443        bool                            user_cost_model:1;
 444};
 445
 446struct iocg_pcpu_stat {
 447        local64_t                       abs_vusage;
 448};
 449
 450struct iocg_stat {
 451        u64                             usage_us;
 452        u64                             wait_us;
 453        u64                             indebt_us;
 454        u64                             indelay_us;
 455};
 456
 457/* per device-cgroup pair */
 458struct ioc_gq {
 459        struct blkg_policy_data         pd;
 460        struct ioc                      *ioc;
 461
 462        /*
  463         * An iocg can get its weight from two sources - an explicit
 464         * per-device-cgroup configuration or the default weight of the
 465         * cgroup.  `cfg_weight` is the explicit per-device-cgroup
  466         * configuration.  `weight` is the effective weight considering both
 467         * sources.
 468         *
 469         * When an idle cgroup becomes active its `active` goes from 0 to
 470         * `weight`.  `inuse` is the surplus adjusted active weight.
 471         * `active` and `inuse` are used to calculate `hweight_active` and
 472         * `hweight_inuse`.
 473         *
 474         * `last_inuse` remembers `inuse` while an iocg is idle to persist
 475         * surplus adjustments.
 476         *
  477         * `inuse` may be adjusted dynamically during a period. `saved_*` are used
 478         * to determine and track adjustments.
 479         */
 480        u32                             cfg_weight;
 481        u32                             weight;
 482        u32                             active;
 483        u32                             inuse;
 484
 485        u32                             last_inuse;
 486        s64                             saved_margin;
 487
 488        sector_t                        cursor;         /* to detect randio */
 489
 490        /*
 491         * `vtime` is this iocg's vtime cursor which progresses as IOs are
 492         * issued.  If lagging behind device vtime, the delta represents
 493         * the currently available IO budget.  If running ahead, the
 494         * overage.
 495         *
  496         * `done_vtime` is the same but progresses on completion rather
 497         * than issue.  The delta behind `vtime` represents the cost of
 498         * currently in-flight IOs.
 499         */
 500        atomic64_t                      vtime;
 501        atomic64_t                      done_vtime;
 502        u64                             abs_vdebt;
 503
 504        /* current delay in effect and when it started */
 505        u64                             delay;
 506        u64                             delay_at;
 507
 508        /*
 509         * The period this iocg was last active in.  Used for deactivation
 510         * and invalidating `vtime`.
 511         */
 512        atomic64_t                      active_period;
 513        struct list_head                active_list;
 514
 515        /* see __propagate_weights() and current_hweight() for details */
 516        u64                             child_active_sum;
 517        u64                             child_inuse_sum;
 518        u64                             child_adjusted_sum;
 519        int                             hweight_gen;
 520        u32                             hweight_active;
 521        u32                             hweight_inuse;
 522        u32                             hweight_donating;
 523        u32                             hweight_after_donation;
 524
 525        struct list_head                walk_list;
 526        struct list_head                surplus_list;
 527
 528        struct wait_queue_head          waitq;
 529        struct hrtimer                  waitq_timer;
 530
 531        /* timestamp at the latest activation */
 532        u64                             activated_at;
 533
 534        /* statistics */
 535        struct iocg_pcpu_stat __percpu  *pcpu_stat;
 536        struct iocg_stat                local_stat;
 537        struct iocg_stat                desc_stat;
 538        struct iocg_stat                last_stat;
 539        u64                             last_stat_abs_vusage;
 540        u64                             usage_delta_us;
 541        u64                             wait_since;
 542        u64                             indebt_since;
 543        u64                             indelay_since;
 544
 545        /* this iocg's depth in the hierarchy and ancestors including self */
 546        int                             level;
 547        struct ioc_gq                   *ancestors[];
 548};
 549
 550/* per cgroup */
 551struct ioc_cgrp {
 552        struct blkcg_policy_data        cpd;
 553        unsigned int                    dfl_weight;
 554};
 555
 556struct ioc_now {
 557        u64                             now_ns;
 558        u64                             now;
 559        u64                             vnow;
 560        u64                             vrate;
 561};
 562
 563struct iocg_wait {
 564        struct wait_queue_entry         wait;
 565        struct bio                      *bio;
 566        u64                             abs_cost;
 567        bool                            committed;
 568};
 569
 570struct iocg_wake_ctx {
 571        struct ioc_gq                   *iocg;
 572        u32                             hw_inuse;
 573        s64                             vbudget;
 574};
 575
 576static const struct ioc_params autop[] = {
 577        [AUTOP_HDD] = {
 578                .qos                            = {
 579                        [QOS_RLAT]              =        250000, /* 250ms */
 580                        [QOS_WLAT]              =        250000,
 581                        [QOS_MIN]               = VRATE_MIN_PPM,
 582                        [QOS_MAX]               = VRATE_MAX_PPM,
 583                },
 584                .i_lcoefs                       = {
 585                        [I_LCOEF_RBPS]          =     174019176,
 586                        [I_LCOEF_RSEQIOPS]      =         41708,
 587                        [I_LCOEF_RRANDIOPS]     =           370,
 588                        [I_LCOEF_WBPS]          =     178075866,
 589                        [I_LCOEF_WSEQIOPS]      =         42705,
 590                        [I_LCOEF_WRANDIOPS]     =           378,
 591                },
 592        },
 593        [AUTOP_SSD_QD1] = {
 594                .qos                            = {
 595                        [QOS_RLAT]              =         25000, /* 25ms */
 596                        [QOS_WLAT]              =         25000,
 597                        [QOS_MIN]               = VRATE_MIN_PPM,
 598                        [QOS_MAX]               = VRATE_MAX_PPM,
 599                },
 600                .i_lcoefs                       = {
 601                        [I_LCOEF_RBPS]          =     245855193,
 602                        [I_LCOEF_RSEQIOPS]      =         61575,
 603                        [I_LCOEF_RRANDIOPS]     =          6946,
 604                        [I_LCOEF_WBPS]          =     141365009,
 605                        [I_LCOEF_WSEQIOPS]      =         33716,
 606                        [I_LCOEF_WRANDIOPS]     =         26796,
 607                },
 608        },
 609        [AUTOP_SSD_DFL] = {
 610                .qos                            = {
 611                        [QOS_RLAT]              =         25000, /* 25ms */
 612                        [QOS_WLAT]              =         25000,
 613                        [QOS_MIN]               = VRATE_MIN_PPM,
 614                        [QOS_MAX]               = VRATE_MAX_PPM,
 615                },
 616                .i_lcoefs                       = {
 617                        [I_LCOEF_RBPS]          =     488636629,
 618                        [I_LCOEF_RSEQIOPS]      =          8932,
 619                        [I_LCOEF_RRANDIOPS]     =          8518,
 620                        [I_LCOEF_WBPS]          =     427891549,
 621                        [I_LCOEF_WSEQIOPS]      =         28755,
 622                        [I_LCOEF_WRANDIOPS]     =         21940,
 623                },
 624                .too_fast_vrate_pct             =           500,
 625        },
 626        [AUTOP_SSD_FAST] = {
 627                .qos                            = {
 628                        [QOS_RLAT]              =          5000, /* 5ms */
 629                        [QOS_WLAT]              =          5000,
 630                        [QOS_MIN]               = VRATE_MIN_PPM,
 631                        [QOS_MAX]               = VRATE_MAX_PPM,
 632                },
 633                .i_lcoefs                       = {
 634                        [I_LCOEF_RBPS]          =    3102524156LLU,
 635                        [I_LCOEF_RSEQIOPS]      =        724816,
 636                        [I_LCOEF_RRANDIOPS]     =        778122,
 637                        [I_LCOEF_WBPS]          =    1742780862LLU,
 638                        [I_LCOEF_WSEQIOPS]      =        425702,
 639                        [I_LCOEF_WRANDIOPS]     =        443193,
 640                },
 641                .too_slow_vrate_pct             =            10,
 642        },
 643};
 644
 645/*
 646 * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 647 * vtime credit shortage and down on device saturation.
 648 */
 649static u32 vrate_adj_pct[] =
 650        { 0, 0, 0, 0,
 651          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 652          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 653          4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
 654
 655static struct blkcg_policy blkcg_policy_iocost;
 656
 657/* accessors and helpers */
 658static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
 659{
 660        return container_of(rqos, struct ioc, rqos);
 661}
 662
 663static struct ioc *q_to_ioc(struct request_queue *q)
 664{
 665        return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
 666}
 667
 668static const char *q_name(struct request_queue *q)
 669{
 670        if (blk_queue_registered(q))
 671                return kobject_name(q->kobj.parent);
 672        else
 673                return "<unknown>";
 674}
 675
 676static const char __maybe_unused *ioc_name(struct ioc *ioc)
 677{
 678        return q_name(ioc->rqos.q);
 679}
 680
 681static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
 682{
 683        return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
 684}
 685
 686static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
 687{
 688        return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
 689}
 690
 691static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
 692{
 693        return pd_to_blkg(&iocg->pd);
 694}
 695
 696static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
 697{
 698        return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
 699                            struct ioc_cgrp, cpd);
 700}
 701
 702/*
 703 * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 704 * weight, the more expensive each IO.  Must round up.
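 * For example, at hw_inuse of WEIGHT_ONE / 4 (25%), the scaled cost is four
 * times the absolute cost.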
 705 */
 706static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
 707{
 708        return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
 709}
 710
 711/*
 712 * The inverse of abs_cost_to_cost().  Must round up.
 713 */
 714static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
 715{
 716        return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
 717}
 718
 719static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
 720                            u64 abs_cost, u64 cost)
 721{
 722        struct iocg_pcpu_stat *gcs;
 723
 724        bio->bi_iocost_cost = cost;
 725        atomic64_add(cost, &iocg->vtime);
 726
 727        gcs = get_cpu_ptr(iocg->pcpu_stat);
 728        local64_add(abs_cost, &gcs->abs_vusage);
 729        put_cpu_ptr(gcs);
 730}
 731
 732static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
 733{
 734        if (lock_ioc) {
 735                spin_lock_irqsave(&iocg->ioc->lock, *flags);
 736                spin_lock(&iocg->waitq.lock);
 737        } else {
 738                spin_lock_irqsave(&iocg->waitq.lock, *flags);
 739        }
 740}
 741
 742static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
 743{
 744        if (unlock_ioc) {
 745                spin_unlock(&iocg->waitq.lock);
 746                spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
 747        } else {
 748                spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
 749        }
 750}
 751
 752#define CREATE_TRACE_POINTS
 753#include <trace/events/iocost.h>
 754
 755static void ioc_refresh_margins(struct ioc *ioc)
 756{
 757        struct ioc_margins *margins = &ioc->margins;
 758        u32 period_us = ioc->period_us;
 759        u64 vrate = ioc->vtime_base_rate;
 760
 761        margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
 762        margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
 763        margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
 764}
 765
  766/* latency QoS params changed, update period_us and all the dependent params */
 767static void ioc_refresh_period_us(struct ioc *ioc)
 768{
 769        u32 ppm, lat, multi, period_us;
 770
 771        lockdep_assert_held(&ioc->lock);
 772
 773        /* pick the higher latency target */
 774        if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
 775                ppm = ioc->params.qos[QOS_RPPM];
 776                lat = ioc->params.qos[QOS_RLAT];
 777        } else {
 778                ppm = ioc->params.qos[QOS_WPPM];
 779                lat = ioc->params.qos[QOS_WLAT];
 780        }
 781
 782        /*
 783         * We want the period to be long enough to contain a healthy number
 784         * of IOs while short enough for granular control.  Define it as a
 785         * multiple of the latency target.  Ideally, the multiplier should
 786         * be scaled according to the percentile so that it would nominally
 787         * contain a certain number of requests.  Let's be simpler and
 788         * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
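         * E.g., a latency target at pct(90) (ppm=900000) gives multi =
         * (1000000 - 900000) / 50000 = 2 while pct(50) gives multi = 10.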
 789         */
 790        if (ppm)
 791                multi = max_t(u32, (MILLION - ppm) / 50000, 2);
 792        else
 793                multi = 2;
 794        period_us = multi * lat;
 795        period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
 796
 797        /* calculate dependent params */
 798        ioc->period_us = period_us;
 799        ioc->timer_slack_ns = div64_u64(
 800                (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
 801                100);
 802        ioc_refresh_margins(ioc);
 803}
 804
 805static int ioc_autop_idx(struct ioc *ioc)
 806{
 807        int idx = ioc->autop_idx;
 808        const struct ioc_params *p = &autop[idx];
 809        u32 vrate_pct;
 810        u64 now_ns;
 811
 812        /* rotational? */
 813        if (!blk_queue_nonrot(ioc->rqos.q))
 814                return AUTOP_HDD;
 815
 816        /* handle SATA SSDs w/ broken NCQ */
 817        if (blk_queue_depth(ioc->rqos.q) == 1)
 818                return AUTOP_SSD_QD1;
 819
 820        /* use one of the normal ssd sets */
 821        if (idx < AUTOP_SSD_DFL)
 822                return AUTOP_SSD_DFL;
 823
 824        /* if user is overriding anything, maintain what was there */
 825        if (ioc->user_qos_params || ioc->user_cost_model)
 826                return idx;
 827
 828        /* step up/down based on the vrate */
 829        vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
 830        now_ns = ktime_get_ns();
 831
 832        if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
 833                if (!ioc->autop_too_fast_at)
 834                        ioc->autop_too_fast_at = now_ns;
 835                if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
 836                        return idx + 1;
 837        } else {
 838                ioc->autop_too_fast_at = 0;
 839        }
 840
 841        if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
 842                if (!ioc->autop_too_slow_at)
 843                        ioc->autop_too_slow_at = now_ns;
 844                if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
 845                        return idx - 1;
 846        } else {
 847                ioc->autop_too_slow_at = 0;
 848        }
 849
 850        return idx;
 851}
 852
 853/*
  854 * Take the following as input
 855 *
 856 *  @bps        maximum sequential throughput
 857 *  @seqiops    maximum sequential 4k iops
 858 *  @randiops   maximum random 4k iops
 859 *
 860 * and calculate the linear model cost coefficients.
 861 *
 862 *  *@page      per-page cost           1s / (@bps / 4096)
 863 *  *@seqio     base cost of a seq IO   max((1s / @seqiops) - *@page, 0)
  864 *  *@randio    base cost of a rand IO  max((1s / @randiops) - *@page, 0)
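 *
 * For example, with the AUTOP_HDD parameters below (@bps ~174MB/s, @randiops
 * 370), a 4k page costs roughly VTIME_PER_SEC / 42486 vtime units and each
 * random IO adds a base cost of roughly VTIME_PER_SEC / 370 on top
 * (illustrative, rounded).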
 865 */
 866static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
 867                        u64 *page, u64 *seqio, u64 *randio)
 868{
 869        u64 v;
 870
 871        *page = *seqio = *randio = 0;
 872
 873        if (bps)
 874                *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
 875                                           DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
 876
 877        if (seqiops) {
 878                v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
 879                if (v > *page)
 880                        *seqio = v - *page;
 881        }
 882
 883        if (randiops) {
 884                v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
 885                if (v > *page)
 886                        *randio = v - *page;
 887        }
 888}
 889
 890static void ioc_refresh_lcoefs(struct ioc *ioc)
 891{
 892        u64 *u = ioc->params.i_lcoefs;
 893        u64 *c = ioc->params.lcoefs;
 894
 895        calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
 896                    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
 897        calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
 898                    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
 899}
 900
 901static bool ioc_refresh_params(struct ioc *ioc, bool force)
 902{
 903        const struct ioc_params *p;
 904        int idx;
 905
 906        lockdep_assert_held(&ioc->lock);
 907
 908        idx = ioc_autop_idx(ioc);
 909        p = &autop[idx];
 910
 911        if (idx == ioc->autop_idx && !force)
 912                return false;
 913
 914        if (idx != ioc->autop_idx)
 915                atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
 916
 917        ioc->autop_idx = idx;
 918        ioc->autop_too_fast_at = 0;
 919        ioc->autop_too_slow_at = 0;
 920
 921        if (!ioc->user_qos_params)
 922                memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
 923        if (!ioc->user_cost_model)
 924                memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
 925
 926        ioc_refresh_period_us(ioc);
 927        ioc_refresh_lcoefs(ioc);
 928
 929        ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
 930                                            VTIME_PER_USEC, MILLION);
 931        ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
 932                                   VTIME_PER_USEC, MILLION);
 933
 934        return true;
 935}
 936
 937/*
 938 * When an iocg accumulates too much vtime or gets deactivated, we throw away
 939 * some vtime, which lowers the overall device utilization. As the exact amount
 940 * which is being thrown away is known, we can compensate by accelerating the
 941 * vrate accordingly so that the extra vtime generated in the current period
 942 * matches what got lost.
 943 */
 944static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
 945{
 946        s64 pleft = ioc->period_at + ioc->period_us - now->now;
 947        s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
 948        s64 vcomp, vcomp_min, vcomp_max;
 949
 950        lockdep_assert_held(&ioc->lock);
 951
 952        /* we need some time left in this period */
 953        if (pleft <= 0)
 954                goto done;
 955
 956        /*
 957         * Calculate how much vrate should be adjusted to offset the error.
 958         * Limit the amount of adjustment and deduct the adjusted amount from
 959         * the error.
 960         */
 961        vcomp = -div64_s64(ioc->vtime_err, pleft);
 962        vcomp_min = -(ioc->vtime_base_rate >> 1);
 963        vcomp_max = ioc->vtime_base_rate;
 964        vcomp = clamp(vcomp, vcomp_min, vcomp_max);
 965
 966        ioc->vtime_err += vcomp * pleft;
 967
 968        atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
 969done:
 970        /* bound how much error can accumulate */
 971        ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
 972}
 973
 974static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
 975                                  int nr_lagging, int nr_shortages,
 976                                  int prev_busy_level, u32 *missed_ppm)
 977{
 978        u64 vrate = ioc->vtime_base_rate;
 979        u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
 980
 981        if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
 982                if (ioc->busy_level != prev_busy_level || nr_lagging)
 983                        trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
 984                                                   missed_ppm, rq_wait_pct,
 985                                                   nr_lagging, nr_shortages);
 986
 987                return;
 988        }
 989
 990        /*
 991         * If vrate is out of bounds, apply clamp gradually as the
 992         * bounds can change abruptly.  Otherwise, apply busy_level
 993         * based adjustment.
 994         */
 995        if (vrate < vrate_min) {
 996                vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
 997                vrate = min(vrate, vrate_min);
 998        } else if (vrate > vrate_max) {
 999                vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
1000                vrate = max(vrate, vrate_max);
1001        } else {
1002                int idx = min_t(int, abs(ioc->busy_level),
1003                                ARRAY_SIZE(vrate_adj_pct) - 1);
1004                u32 adj_pct = vrate_adj_pct[idx];
1005
1006                if (ioc->busy_level > 0)
1007                        adj_pct = 100 - adj_pct;
1008                else
1009                        adj_pct = 100 + adj_pct;
1010
1011                vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1012                              vrate_min, vrate_max);
1013        }
1014
1015        trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1016                                   nr_lagging, nr_shortages);
1017
1018        ioc->vtime_base_rate = vrate;
1019        ioc_refresh_margins(ioc);
1020}
1021
1022/* take a snapshot of the current [v]time and vrate */
1023static void ioc_now(struct ioc *ioc, struct ioc_now *now)
1024{
1025        unsigned seq;
1026
1027        now->now_ns = ktime_get();
1028        now->now = ktime_to_us(now->now_ns);
1029        now->vrate = atomic64_read(&ioc->vtime_rate);
1030
1031        /*
1032         * The current vtime is
1033         *
1034         *   vtime at period start + (wallclock time since the start) * vrate
1035         *
1036         * As a consistent snapshot of `period_at_vtime` and `period_at` is
1037         * needed, they're seqcount protected.
1038         */
1039        do {
1040                seq = read_seqcount_begin(&ioc->period_seqcount);
1041                now->vnow = ioc->period_at_vtime +
1042                        (now->now - ioc->period_at) * now->vrate;
1043        } while (read_seqcount_retry(&ioc->period_seqcount, seq));
1044}
1045
1046static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
1047{
1048        WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1049
1050        write_seqcount_begin(&ioc->period_seqcount);
1051        ioc->period_at = now->now;
1052        ioc->period_at_vtime = now->vnow;
1053        write_seqcount_end(&ioc->period_seqcount);
1054
1055        ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1056        add_timer(&ioc->timer);
1057}
1058
1059/*
1060 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
1061 * weight sums and propagate upwards accordingly. If @save, the current margin
1062 * is saved to be used as reference for later inuse in-period adjustments.
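 *
 * For example, if after an update a parent's child_active_sum is 200 and
 * child_inuse_sum is 100, the parent's own inuse becomes parent->weight *
 * 100 / 200, i.e. half of the parent's weight is in turn donated upwards.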
1063 */
1064static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1065                                bool save, struct ioc_now *now)
1066{
1067        struct ioc *ioc = iocg->ioc;
1068        int lvl;
1069
1070        lockdep_assert_held(&ioc->lock);
1071
1072        /*
1073         * For an active leaf node, its inuse shouldn't be zero or exceed
1074         * @active. An active internal node's inuse is solely determined by the
1075         * inuse to active ratio of its children regardless of @inuse.
1076         */
1077        if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
1078                inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
1079                                           iocg->child_active_sum);
1080        } else {
1081                inuse = clamp_t(u32, inuse, 1, active);
1082        }
1083
1084        iocg->last_inuse = iocg->inuse;
1085        if (save)
1086                iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1087
1088        if (active == iocg->active && inuse == iocg->inuse)
1089                return;
1090
1091        for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1092                struct ioc_gq *parent = iocg->ancestors[lvl];
1093                struct ioc_gq *child = iocg->ancestors[lvl + 1];
1094                u32 parent_active = 0, parent_inuse = 0;
1095
1096                /* update the level sums */
1097                parent->child_active_sum += (s32)(active - child->active);
1098                parent->child_inuse_sum += (s32)(inuse - child->inuse);
1099                /* apply the updates */
1100                child->active = active;
1101                child->inuse = inuse;
1102
1103                /*
 1104                 * The delta between the inuse and active sums indicates how
 1105                 * much weight is being given away.  The parent's inuse
1106                 * and active should reflect the ratio.
1107                 */
1108                if (parent->child_active_sum) {
1109                        parent_active = parent->weight;
1110                        parent_inuse = DIV64_U64_ROUND_UP(
1111                                parent_active * parent->child_inuse_sum,
1112                                parent->child_active_sum);
1113                }
1114
1115                /* do we need to keep walking up? */
1116                if (parent_active == parent->active &&
1117                    parent_inuse == parent->inuse)
1118                        break;
1119
1120                active = parent_active;
1121                inuse = parent_inuse;
1122        }
1123
1124        ioc->weights_updated = true;
1125}
1126
1127static void commit_weights(struct ioc *ioc)
1128{
1129        lockdep_assert_held(&ioc->lock);
1130
1131        if (ioc->weights_updated) {
1132                /* paired with rmb in current_hweight(), see there */
1133                smp_wmb();
1134                atomic_inc(&ioc->hweight_gen);
1135                ioc->weights_updated = false;
1136        }
1137}
1138
1139static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1140                              bool save, struct ioc_now *now)
1141{
1142        __propagate_weights(iocg, active, inuse, save, now);
1143        commit_weights(iocg->ioc);
1144}
1145
1146static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1147{
1148        struct ioc *ioc = iocg->ioc;
1149        int lvl;
1150        u32 hwa, hwi;
1151        int ioc_gen;
1152
1153        /* hot path - if uptodate, use cached */
1154        ioc_gen = atomic_read(&ioc->hweight_gen);
1155        if (ioc_gen == iocg->hweight_gen)
1156                goto out;
1157
1158        /*
1159         * Paired with wmb in commit_weights(). If we saw the updated
1160         * hweight_gen, all the weight updates from __propagate_weights() are
1161         * visible too.
1162         *
1163         * We can race with weight updates during calculation and get it
1164         * wrong.  However, hweight_gen would have changed and a future
1165         * reader will recalculate and we're guaranteed to discard the
1166         * wrong result soon.
1167         */
1168        smp_rmb();
1169
1170        hwa = hwi = WEIGHT_ONE;
1171        for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1172                struct ioc_gq *parent = iocg->ancestors[lvl];
1173                struct ioc_gq *child = iocg->ancestors[lvl + 1];
1174                u64 active_sum = READ_ONCE(parent->child_active_sum);
1175                u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
1176                u32 active = READ_ONCE(child->active);
1177                u32 inuse = READ_ONCE(child->inuse);
1178
1179                /* we can race with deactivations and either may read as zero */
1180                if (!active_sum || !inuse_sum)
1181                        continue;
1182
1183                active_sum = max_t(u64, active, active_sum);
1184                hwa = div64_u64((u64)hwa * active, active_sum);
1185
1186                inuse_sum = max_t(u64, inuse, inuse_sum);
1187                hwi = div64_u64((u64)hwi * inuse, inuse_sum);
1188        }
1189
1190        iocg->hweight_active = max_t(u32, hwa, 1);
1191        iocg->hweight_inuse = max_t(u32, hwi, 1);
1192        iocg->hweight_gen = ioc_gen;
1193out:
1194        if (hw_activep)
1195                *hw_activep = iocg->hweight_active;
1196        if (hw_inusep)
1197                *hw_inusep = iocg->hweight_inuse;
1198}
1199
1200/*
1201 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1202 * other weights stay unchanged.
1203 */
1204static u32 current_hweight_max(struct ioc_gq *iocg)
1205{
1206        u32 hwm = WEIGHT_ONE;
1207        u32 inuse = iocg->active;
1208        u64 child_inuse_sum;
1209        int lvl;
1210
1211        lockdep_assert_held(&iocg->ioc->lock);
1212
1213        for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1214                struct ioc_gq *parent = iocg->ancestors[lvl];
1215                struct ioc_gq *child = iocg->ancestors[lvl + 1];
1216
1217                child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1218                hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1219                inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1220                                           parent->child_active_sum);
1221        }
1222
1223        return max_t(u32, hwm, 1);
1224}
1225
1226static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
1227{
1228        struct ioc *ioc = iocg->ioc;
1229        struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1230        struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1231        u32 weight;
1232
1233        lockdep_assert_held(&ioc->lock);
1234
1235        weight = iocg->cfg_weight ?: iocc->dfl_weight;
1236        if (weight != iocg->weight && iocg->active)
1237                propagate_weights(iocg, weight, iocg->inuse, true, now);
1238        iocg->weight = weight;
1239}
1240
1241static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1242{
1243        struct ioc *ioc = iocg->ioc;
1244        u64 last_period, cur_period;
1245        u64 vtime, vtarget;
1246        int i;
1247
1248        /*
 1249         * If we seem to be already active, just update the stamp to tell the
 1250         * timer that we're still active.  We don't mind occasional races.
1251         */
1252        if (!list_empty(&iocg->active_list)) {
1253                ioc_now(ioc, now);
1254                cur_period = atomic64_read(&ioc->cur_period);
1255                if (atomic64_read(&iocg->active_period) != cur_period)
1256                        atomic64_set(&iocg->active_period, cur_period);
1257                return true;
1258        }
1259
1260        /* racy check on internal node IOs, treat as root level IOs */
1261        if (iocg->child_active_sum)
1262                return false;
1263
1264        spin_lock_irq(&ioc->lock);
1265
1266        ioc_now(ioc, now);
1267
1268        /* update period */
1269        cur_period = atomic64_read(&ioc->cur_period);
1270        last_period = atomic64_read(&iocg->active_period);
1271        atomic64_set(&iocg->active_period, cur_period);
1272
1273        /* already activated or breaking leaf-only constraint? */
1274        if (!list_empty(&iocg->active_list))
1275                goto succeed_unlock;
1276        for (i = iocg->level - 1; i > 0; i--)
1277                if (!list_empty(&iocg->ancestors[i]->active_list))
1278                        goto fail_unlock;
1279
1280        if (iocg->child_active_sum)
1281                goto fail_unlock;
1282
1283        /*
1284         * Always start with the target budget. On deactivation, we throw away
1285         * anything above it.
1286         */
1287        vtarget = now->vnow - ioc->margins.target;
1288        vtime = atomic64_read(&iocg->vtime);
1289
1290        atomic64_add(vtarget - vtime, &iocg->vtime);
1291        atomic64_add(vtarget - vtime, &iocg->done_vtime);
1292        vtime = vtarget;
1293
1294        /*
1295         * Activate, propagate weight and start period timer if not
1296         * running.  Reset hweight_gen to avoid accidental match from
1297         * wrapping.
1298         */
1299        iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1300        list_add(&iocg->active_list, &ioc->active_iocgs);
1301
1302        propagate_weights(iocg, iocg->weight,
1303                          iocg->last_inuse ?: iocg->weight, true, now);
1304
1305        TRACE_IOCG_PATH(iocg_activate, iocg, now,
1306                        last_period, cur_period, vtime);
1307
1308        iocg->activated_at = now->now;
1309
1310        if (ioc->running == IOC_IDLE) {
1311                ioc->running = IOC_RUNNING;
1312                ioc->dfgv_period_at = now->now;
1313                ioc->dfgv_period_rem = 0;
1314                ioc_start_period(ioc, now);
1315        }
1316
1317succeed_unlock:
1318        spin_unlock_irq(&ioc->lock);
1319        return true;
1320
1321fail_unlock:
1322        spin_unlock_irq(&ioc->lock);
1323        return false;
1324}
1325
1326static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1327{
1328        struct ioc *ioc = iocg->ioc;
1329        struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1330        u64 tdelta, delay, new_delay;
1331        s64 vover, vover_pct;
1332        u32 hwa;
1333
1334        lockdep_assert_held(&iocg->waitq.lock);
1335
1336        /* calculate the current delay in effect - 1/2 every second */
1337        tdelta = now->now - iocg->delay_at;
1338        if (iocg->delay)
1339                delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
1340        else
1341                delay = 0;
1342
1343        /* calculate the new delay from the debt amount */
1344        current_hweight(iocg, &hwa, NULL);
1345        vover = atomic64_read(&iocg->vtime) +
1346                abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
1347        vover_pct = div64_s64(100 * vover,
1348                              ioc->period_us * ioc->vtime_base_rate);
1349
1350        if (vover_pct <= MIN_DELAY_THR_PCT)
1351                new_delay = 0;
1352        else if (vover_pct >= MAX_DELAY_THR_PCT)
1353                new_delay = MAX_DELAY;
1354        else
1355                new_delay = MIN_DELAY +
1356                        div_u64((MAX_DELAY - MIN_DELAY) *
1357                                (vover_pct - MIN_DELAY_THR_PCT),
1358                                MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1359
1360        /* pick the higher one and apply */
1361        if (new_delay > delay) {
1362                iocg->delay = new_delay;
1363                iocg->delay_at = now->now;
1364                delay = new_delay;
1365        }
1366
1367        if (delay >= MIN_DELAY) {
1368                if (!iocg->indelay_since)
1369                        iocg->indelay_since = now->now;
1370                blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1371                return true;
1372        } else {
1373                if (iocg->indelay_since) {
1374                        iocg->local_stat.indelay_us += now->now - iocg->indelay_since;
1375                        iocg->indelay_since = 0;
1376                }
1377                iocg->delay = 0;
1378                blkcg_clear_delay(blkg);
1379                return false;
1380        }
1381}
1382
1383static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1384                            struct ioc_now *now)
1385{
1386        struct iocg_pcpu_stat *gcs;
1387
1388        lockdep_assert_held(&iocg->ioc->lock);
1389        lockdep_assert_held(&iocg->waitq.lock);
1390        WARN_ON_ONCE(list_empty(&iocg->active_list));
1391
1392        /*
1393         * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1394         * inuse, donating all of its share to others until its debt is paid off.
1395         */
1396        if (!iocg->abs_vdebt && abs_cost) {
1397                iocg->indebt_since = now->now;
1398                propagate_weights(iocg, iocg->active, 0, false, now);
1399        }
1400
1401        iocg->abs_vdebt += abs_cost;
1402
1403        gcs = get_cpu_ptr(iocg->pcpu_stat);
1404        local64_add(abs_cost, &gcs->abs_vusage);
1405        put_cpu_ptr(gcs);
1406}
1407
1408static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1409                          struct ioc_now *now)
1410{
1411        lockdep_assert_held(&iocg->ioc->lock);
1412        lockdep_assert_held(&iocg->waitq.lock);
1413
1414        /* make sure that nobody messed with @iocg */
1415        WARN_ON_ONCE(list_empty(&iocg->active_list));
1416        WARN_ON_ONCE(iocg->inuse > 1);
1417
1418        iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1419
1420        /* if debt is paid in full, restore inuse */
1421        if (!iocg->abs_vdebt) {
1422                iocg->local_stat.indebt_us += now->now - iocg->indebt_since;
1423                iocg->indebt_since = 0;
1424
1425                propagate_weights(iocg, iocg->active, iocg->last_inuse,
1426                                  false, now);
1427        }
1428}
1429
1430static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1431                        int flags, void *key)
1432{
1433        struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1434        struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1435        u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1436
1437        ctx->vbudget -= cost;
1438
1439        if (ctx->vbudget < 0)
1440                return -1;
1441
1442        iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1443        wait->committed = true;
1444
1445        /*
1446         * autoremove_wake_function() removes the wait entry only when it
1447         * actually changed the task state. We want the wait always removed.
1448         * Remove explicitly and use default_wake_function(). Note that the
1449         * order of operations is important as finish_wait() tests whether
1450         * @wq_entry is removed without grabbing the lock.
1451         */
1452        default_wake_function(wq_entry, mode, flags, key);
1453        list_del_init_careful(&wq_entry->entry);
1454        return 0;
1455}
1456
1457/*
1458 * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1459 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1460 * addition to iocg->waitq.lock.
1461 */
1462static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1463                            struct ioc_now *now)
1464{
1465        struct ioc *ioc = iocg->ioc;
1466        struct iocg_wake_ctx ctx = { .iocg = iocg };
1467        u64 vshortage, expires, oexpires;
1468        s64 vbudget;
1469        u32 hwa;
1470
1471        lockdep_assert_held(&iocg->waitq.lock);
1472
1473        current_hweight(iocg, &hwa, NULL);
1474        vbudget = now->vnow - atomic64_read(&iocg->vtime);
1475
1476        /* pay off debt */
1477        if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1478                u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1479                u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1480                u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
1481
1482                lockdep_assert_held(&ioc->lock);
1483
1484                atomic64_add(vpay, &iocg->vtime);
1485                atomic64_add(vpay, &iocg->done_vtime);
1486                iocg_pay_debt(iocg, abs_vpay, now);
1487                vbudget -= vpay;
1488        }
1489
1490        if (iocg->abs_vdebt || iocg->delay)
1491                iocg_kick_delay(iocg, now);
1492
1493        /*
1494         * Debt can still be outstanding if we haven't paid all yet or the
1495         * caller raced and called without @pay_debt. Shouldn't wake up waiters
1496         * under debt. Make sure @vbudget reflects the outstanding amount and is
1497         * not positive.
1498         */
1499        if (iocg->abs_vdebt) {
1500                s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
1501                vbudget = min_t(s64, 0, vbudget - vdebt);
1502        }
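            /*
             * e.g. (hypothetical numbers): a vbudget of 100 against an
             * outstanding vdebt of 300 leaves vbudget at -200, so the wakeup
             * below wakes nobody until the debt is paid down further.
             */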
1503
1504        /*
1505         * Wake up the ones which are due and see how much vtime we'll need for
1506         * the next one. As paying off debt restores hw_inuse, it must be read
1507         * after the above debt payment.
1508         */
1509        ctx.vbudget = vbudget;
1510        current_hweight(iocg, NULL, &ctx.hw_inuse);
1511
1512        __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1513
1514        if (!waitqueue_active(&iocg->waitq)) {
1515                if (iocg->wait_since) {
1516                        iocg->local_stat.wait_us += now->now - iocg->wait_since;
1517                        iocg->wait_since = 0;
1518                }
1519                return;
1520        }
1521
1522        if (!iocg->wait_since)
1523                iocg->wait_since = now->now;
1524
1525        if (WARN_ON_ONCE(ctx.vbudget >= 0))
1526                return;
1527
1528        /* determine next wakeup, add a timer margin to guarantee chunking */
1529        vshortage = -ctx.vbudget;
1530        expires = now->now_ns +
1531                DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1532                NSEC_PER_USEC;
1533        expires += ioc->timer_slack_ns;
1534
1535        /* if already active and close enough, don't bother */
1536        oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1537        if (hrtimer_is_queued(&iocg->waitq_timer) &&
1538            abs(oexpires - expires) <= ioc->timer_slack_ns)
1539                return;
1540
1541        hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1542                               ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1543}
1544
1545static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1546{
1547        struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1548        bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1549        struct ioc_now now;
1550        unsigned long flags;
1551
1552        ioc_now(iocg->ioc, &now);
1553
1554        iocg_lock(iocg, pay_debt, &flags);
1555        iocg_kick_waitq(iocg, pay_debt, &now);
1556        iocg_unlock(iocg, pay_debt, &flags);
1557
1558        return HRTIMER_NORESTART;
1559}
1560
1561static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1562{
1563        u32 nr_met[2] = { };
1564        u32 nr_missed[2] = { };
1565        u64 rq_wait_ns = 0;
1566        int cpu, rw;
1567
1568        for_each_online_cpu(cpu) {
1569                struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1570                u64 this_rq_wait_ns;
1571
1572                for (rw = READ; rw <= WRITE; rw++) {
1573                        u32 this_met = local_read(&stat->missed[rw].nr_met);
1574                        u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1575
1576                        nr_met[rw] += this_met - stat->missed[rw].last_met;
1577                        nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1578                        stat->missed[rw].last_met = this_met;
1579                        stat->missed[rw].last_missed = this_missed;
1580                }
1581
1582                this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1583                rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1584                stat->last_rq_wait_ns = this_rq_wait_ns;
1585        }
1586
1587        for (rw = READ; rw <= WRITE; rw++) {
1588                if (nr_met[rw] + nr_missed[rw])
1589                        missed_ppm_ar[rw] =
1590                                DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1591                                                   nr_met[rw] + nr_missed[rw]);
1592                else
1593                        missed_ppm_ar[rw] = 0;
1594        }
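            /*
             * e.g. (hypothetical numbers): 5 misses out of 1000 completions
             * read as 5000 missed ppm, and 25ms of cumulative rq wait over a
             * 50ms period yields rq_wait_pct = 50 below.
             */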
1595
1596        *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1597                                   ioc->period_us * NSEC_PER_USEC);
1598}
1599
1600/* was iocg idle this period? */
1601static bool iocg_is_idle(struct ioc_gq *iocg)
1602{
1603        struct ioc *ioc = iocg->ioc;
1604
1605        /* did something get issued this period? */
1606        if (atomic64_read(&iocg->active_period) ==
1607            atomic64_read(&ioc->cur_period))
1608                return false;
1609
1610        /* is something in flight? */
1611        if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1612                return false;
1613
1614        return true;
1615}
1616
1617/*
1618 * Call this function on the target leaf @iocg's to build a pre-order traversal
1619 * list of all the ancestors in @inner_walk. The inner nodes are linked through
1620 * ->walk_list and the caller is responsible for dissolving the list after use.
1621 */
1622static void iocg_build_inner_walk(struct ioc_gq *iocg,
1623                                  struct list_head *inner_walk)
1624{
1625        int lvl;
1626
1627        WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1628
1629        /* find the first ancestor which hasn't been visited yet */
1630        for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1631                if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1632                        break;
1633        }
1634
1635        /* walk down and visit the inner nodes to get pre-order traversal */
1636        while (++lvl <= iocg->level - 1) {
1637                struct ioc_gq *inner = iocg->ancestors[lvl];
1638
1639                /* record traversal order */
1640                list_add_tail(&inner->walk_list, inner_walk);
1641        }
1642}
1643
1644/* collect per-cpu counters and propagate the deltas to the parent */
1645static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
1646{
1647        struct ioc *ioc = iocg->ioc;
1648        struct iocg_stat new_stat;
1649        u64 abs_vusage = 0;
1650        u64 vusage_delta;
1651        int cpu;
1652
1653        lockdep_assert_held(&iocg->ioc->lock);
1654
1655        /* collect per-cpu counters */
1656        for_each_possible_cpu(cpu) {
1657                abs_vusage += local64_read(
1658                                per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1659        }
1660        vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1661        iocg->last_stat_abs_vusage = abs_vusage;
1662
1663        iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
1664        iocg->local_stat.usage_us += iocg->usage_delta_us;
1665
1666        /* propagate upwards */
1667        new_stat.usage_us =
1668                iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
1669        new_stat.wait_us =
1670                iocg->local_stat.wait_us + iocg->desc_stat.wait_us;
1671        new_stat.indebt_us =
1672                iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us;
1673        new_stat.indelay_us =
1674                iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us;
1675
1676        /* propagate the deltas to the parent */
1677        if (iocg->level > 0) {
1678                struct iocg_stat *parent_stat =
1679                        &iocg->ancestors[iocg->level - 1]->desc_stat;
1680
1681                parent_stat->usage_us +=
1682                        new_stat.usage_us - iocg->last_stat.usage_us;
1683                parent_stat->wait_us +=
1684                        new_stat.wait_us - iocg->last_stat.wait_us;
1685                parent_stat->indebt_us +=
1686                        new_stat.indebt_us - iocg->last_stat.indebt_us;
1687                parent_stat->indelay_us +=
1688                        new_stat.indelay_us - iocg->last_stat.indelay_us;
1689        }
1690
1691        iocg->last_stat = new_stat;
1692}
1693
1694/* get stat counters ready for reading on all active iocgs */
1695static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1696{
1697        LIST_HEAD(inner_walk);
1698        struct ioc_gq *iocg, *tiocg;
1699
1700        /* flush leaves and build inner node walk list */
1701        list_for_each_entry(iocg, target_iocgs, active_list) {
1702                iocg_flush_stat_one(iocg, now);
1703                iocg_build_inner_walk(iocg, &inner_walk);
1704        }
1705
1706        /* keep flushing upwards by walking the inner list backwards */
1707        list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1708                iocg_flush_stat_one(iocg, now);
1709                list_del_init(&iocg->walk_list);
1710        }
1711}
1712
1713/*
1714 * Determine what @iocg's hweight_inuse should be after donating unused
1715 * capacity. @hwm is the upper bound and used to signal no donation. This
1716 * function also throws away @iocg's excess budget.
1717 */
1718static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1719                                  u32 usage, struct ioc_now *now)
1720{
1721        struct ioc *ioc = iocg->ioc;
1722        u64 vtime = atomic64_read(&iocg->vtime);
1723        s64 excess, delta, target, new_hwi;
1724
1725        /* debt handling owns inuse for debtors */
1726        if (iocg->abs_vdebt)
1727                return 1;
1728
1729        /* see whether minimum margin requirement is met */
1730        if (waitqueue_active(&iocg->waitq) ||
1731            time_after64(vtime, now->vnow - ioc->margins.min))
1732                return hwm;
1733
1734        /* throw away excess above target */
1735        excess = now->vnow - vtime - ioc->margins.target;
1736        if (excess > 0) {
1737                atomic64_add(excess, &iocg->vtime);
1738                atomic64_add(excess, &iocg->done_vtime);
1739                vtime += excess;
1740                ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
1741        }
1742
1743        /*
1744         * Let delta be the distance between the iocg's and the device's vtimes
1745         * as a fraction of the period duration. Assuming that the iocg will
1746         * consume the usage determined above, we want to determine new_hwi so
1747         * that delta equals MARGIN_TARGET at the end of the next period.
1748         *
1749         * We need to execute usage worth of IOs while spending the sum of the
1750         * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1751         * (delta):
1752         *
1753         *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
1754         *
1755         * Therefore, the new_hwi is:
1756         *
1757         *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
1758         */
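            /*
             * A worked example with hypothetical numbers: usage = 50% of
             * WEIGHT_ONE, MARGIN_TARGET = 10% and delta = 30% give
             * new_hwi = 0.5 / (1 - 0.1 + 0.3) ~= 41.7% of WEIGHT_ONE.
             */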
1759        delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1760                          now->vnow - ioc->period_at_vtime);
1761        target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1762        new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
1763
1764        return clamp_t(s64, new_hwi, 1, hwm);
1765}
1766
1767/*
1768 * For work-conservation, an iocg which isn't using all of its share should
1769 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1770 * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1771 *
1772 * #1 is mathematically simpler but has the drawback of requiring synchronous
1773 * global hweight_inuse updates when idle iocg's get activated or inuse weights
1774 * change due to donation snapbacks as it has the possibility of grossly
1775 * overshooting what's allowed by the model and vrate.
1776 *
1777 * #2 is inherently safe with local operations. The donating iocg can easily
1778 * snap back to higher weights when needed without worrying about impacts on
1779 * other nodes as the impacts will be inherently correct. This also makes idle
1780 * iocg activations safe. The only effect activations have is decreasing
1781 * hweight_inuse of others, the right solution to which is for those iocgs to
1782 * snap back to higher weights.
1783 *
1784 * So, we go with #2. The challenge is calculating how each donating iocg's
1785 * inuse should be adjusted to achieve the target donation amounts. This is done
1786 * using Andy's method described in the following pdf.
1787 *
1788 *   https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1789 *
1790 * Given the weights and target after-donation hweight_inuse values, Andy's
1791 * method determines what the proportional distribution should look like at each
1792 * sibling level to maintain the relative relationship between all non-donating
1793 * pairs. To roughly summarize, it divides the tree into donating and
1794 * non-donating parts, calculates the global donation rate which is used to
1795 * determine the target hweight_inuse for each node, and then derives per-level
1796 * proportions.
1797 *
1798 * The following pdf shows that global distribution calculated this way can be
1799 * achieved by scaling inuse weights of donating leaves and propagating the
1800 * adjustments upwards proportionally.
1801 *
1802 *   https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1803 *
1804 * Combining the above two, we can determine how each leaf iocg's inuse should
1805 * be adjusted to achieve the target donation.
1806 *
1807 *   https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1808 *
1809 * The inline comments use symbols from the last pdf.
1810 *
1811 *   b is the sum of the absolute budgets in the subtree. 1 for the root node.
1812 *   f is the sum of the absolute budgets of non-donating nodes in the subtree.
1813 *   t is the sum of the absolute budgets of donating nodes in the subtree.
1814 *   w is the weight of the node. w = w_f + w_t
1815 *   w_f is the non-donating portion of w. w_f = w * f / b
1816 *   w_t is the donating portion of w. w_t = w * t / b
1817 *   s is the sum of all sibling weights. s = Sum(w) for siblings
1818 *   s_f and s_t are the non-donating and donating portions of s.
1819 *
1820 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1821 * w_pt is the donating portion of the parent's weight and w'_pt the same value
1822 * after adjustments. Subscript r denotes the root node's values.
1823 */
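    /*
     * To illustrate the notation with made-up numbers: for a node whose
     * subtree budget b is split 3:1 between non-donating and donating
     * descendants (f = 3b/4, t = b/4), a weight w of 400 splits into
     * w_f = 300 and w_t = 100.
     */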
1824static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1825{
1826        LIST_HEAD(over_hwa);
1827        LIST_HEAD(inner_walk);
1828        struct ioc_gq *iocg, *tiocg, *root_iocg;
1829        u32 after_sum, over_sum, over_target, gamma;
1830
1831        /*
1832         * It's pretty unlikely but possible for the total sum of
1833         * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1834         * confuse the following calculations. If such a condition is detected,
1835         * scale down everyone over its full share equally to keep the sum below
1836         * WEIGHT_ONE.
1837         */
1838        after_sum = 0;
1839        over_sum = 0;
1840        list_for_each_entry(iocg, surpluses, surplus_list) {
1841                u32 hwa;
1842
1843                current_hweight(iocg, &hwa, NULL);
1844                after_sum += iocg->hweight_after_donation;
1845
1846                if (iocg->hweight_after_donation > hwa) {
1847                        over_sum += iocg->hweight_after_donation;
1848                        list_add(&iocg->walk_list, &over_hwa);
1849                }
1850        }
1851
1852        if (after_sum >= WEIGHT_ONE) {
1853                /*
1854                 * The delta should be deducted from the over_sum; calculate the
1855                 * target over_sum value.
1856                 */
1857                u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1858                WARN_ON_ONCE(over_sum <= over_delta);
1859                over_target = over_sum - over_delta;
1860        } else {
1861                over_target = 0;
1862        }
1863
1864        list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1865                if (over_target)
1866                        iocg->hweight_after_donation =
1867                                div_u64((u64)iocg->hweight_after_donation *
1868                                        over_target, over_sum);
1869                list_del_init(&iocg->walk_list);
1870        }
1871
1872        /*
1873         * Build pre-order inner node walk list and prepare for donation
1874         * adjustment calculations.
1875         */
1876        list_for_each_entry(iocg, surpluses, surplus_list) {
1877                iocg_build_inner_walk(iocg, &inner_walk);
1878        }
1879
1880        root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1881        WARN_ON_ONCE(root_iocg->level > 0);
1882
1883        list_for_each_entry(iocg, &inner_walk, walk_list) {
1884                iocg->child_adjusted_sum = 0;
1885                iocg->hweight_donating = 0;
1886                iocg->hweight_after_donation = 0;
1887        }
1888
1889        /*
1890         * Propagate the donating budget (b_t) and after donation budget (b'_t)
1891         * up the hierarchy.
1892         */
1893        list_for_each_entry(iocg, surpluses, surplus_list) {
1894                struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1895
1896                parent->hweight_donating += iocg->hweight_donating;
1897                parent->hweight_after_donation += iocg->hweight_after_donation;
1898        }
1899
1900        list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1901                if (iocg->level > 0) {
1902                        struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1903
1904                        parent->hweight_donating += iocg->hweight_donating;
1905                        parent->hweight_after_donation += iocg->hweight_after_donation;
1906                }
1907        }
1908
1909        /*
1910         * Calculate inner hwa's (b) and make sure the donation values are
1911         * within the accepted ranges as we're doing low res calculations with
1912         * roundups.
1913         */
1914        list_for_each_entry(iocg, &inner_walk, walk_list) {
1915                if (iocg->level) {
1916                        struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1917
1918                        iocg->hweight_active = DIV64_U64_ROUND_UP(
1919                                (u64)parent->hweight_active * iocg->active,
1920                                parent->child_active_sum);
1921
1922                }
1923
1924                iocg->hweight_donating = min(iocg->hweight_donating,
1925                                             iocg->hweight_active);
1926                iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1927                                                   iocg->hweight_donating - 1);
1928                if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1929                                 iocg->hweight_donating <= 1 ||
1930                                 iocg->hweight_after_donation == 0)) {
1931                        pr_warn("iocg: invalid donation weights in ");
1932                        pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1933                        pr_cont(": active=%u donating=%u after=%u\n",
1934                                iocg->hweight_active, iocg->hweight_donating,
1935                                iocg->hweight_after_donation);
1936                }
1937        }
1938
1939        /*
1940         * Calculate the global donation rate (gamma) - the rate to adjust
1941         * non-donating budgets by.
1942         *
1943         * No need to use 64bit multiplication here as the first operand is
1944         * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1945         *
1946         * We know that there are beneficiary nodes and the sum of the donating
1947         * hweights can't be whole; however, due to the round-ups during hweight
1948         * calculations, root_iocg->hweight_donating might still end up equal to
1949         * or greater than whole. Limit the range when calculating the divider.
1950         *
1951         * gamma = (1 - t_r') / (1 - t_r)
1952         */
1953        gamma = DIV_ROUND_UP(
1954                (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1955                WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
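            /*
             * e.g. (hypothetical numbers): t_r = 40% and t_r' = 10% give
             * gamma = 0.9 / 0.6 = 1.5 in WEIGHT_ONE fixed point, i.e. the
             * non-donating hweight portions are scaled up by 50% to absorb
             * the donated capacity.
             */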
1956
1957        /*
1958         * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1959         * nodes.
1960         */
1961        list_for_each_entry(iocg, &inner_walk, walk_list) {
1962                struct ioc_gq *parent;
1963                u32 inuse, wpt, wptp;
1964                u64 st, sf;
1965
1966                if (iocg->level == 0) {
1967                        /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1968                        iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1969                                iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1970                                WEIGHT_ONE - iocg->hweight_after_donation);
1971                        continue;
1972                }
1973
1974                parent = iocg->ancestors[iocg->level - 1];
1975
1976                /* b' = gamma * b_f + b_t' */
1977                iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1978                        (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
1979                        WEIGHT_ONE) + iocg->hweight_after_donation;
1980
1981                /* w' = s' * b' / b'_p */
1982                inuse = DIV64_U64_ROUND_UP(
1983                        (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
1984                        parent->hweight_inuse);
1985
1986                /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
1987                st = DIV64_U64_ROUND_UP(
1988                        iocg->child_active_sum * iocg->hweight_donating,
1989                        iocg->hweight_active);
1990                sf = iocg->child_active_sum - st;
1991                wpt = DIV64_U64_ROUND_UP(
1992                        (u64)iocg->active * iocg->hweight_donating,
1993                        iocg->hweight_active);
1994                wptp = DIV64_U64_ROUND_UP(
1995                        (u64)inuse * iocg->hweight_after_donation,
1996                        iocg->hweight_inuse);
1997
1998                iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
1999        }
2000
2001        /*
2002         * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
2003         * we can finally determine leaf adjustments.
2004         */
2005        list_for_each_entry(iocg, surpluses, surplus_list) {
2006                struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
2007                u32 inuse;
2008
2009                /*
2010                 * In-debt iocgs participated in the donation calculation with
2011                 * the minimum target hweight_inuse. Configuring inuse
2012                 * accordingly would work fine but debt handling expects
2013                 * @iocg->inuse to stay at the minimum and we don't want to
2014                 * interfere.
2015                 */
2016                if (iocg->abs_vdebt) {
2017                        WARN_ON_ONCE(iocg->inuse > 1);
2018                        continue;
2019                }
2020
2021                /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
2022                inuse = DIV64_U64_ROUND_UP(
2023                        parent->child_adjusted_sum * iocg->hweight_after_donation,
2024                        parent->hweight_inuse);
2025
2026                TRACE_IOCG_PATH(inuse_transfer, iocg, now,
2027                                iocg->inuse, inuse,
2028                                iocg->hweight_inuse,
2029                                iocg->hweight_after_donation);
2030
2031                __propagate_weights(iocg, iocg->active, inuse, true, now);
2032        }
2033
2034        /* walk list should be dissolved after use */
2035        list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
2036                list_del_init(&iocg->walk_list);
2037}
2038
2039/*
2040 * A low weight iocg can amass a large amount of debt, for example, when
2041 * anonymous memory gets reclaimed aggressively. If the system has a lot of
2042 * memory paired with a slow IO device, the debt can span multiple seconds or
2043 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2044 * up blocked paying its debt while the IO device is idle.
2045 *
2046 * The following protects against such cases. If the device has been
2047 * sufficiently idle for a while, the debts are halved and delays are
2048 * recalculated.
2049 */
2050static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
2051                              struct ioc_now *now)
2052{
2053        struct ioc_gq *iocg;
2054        u64 dur, usage_pct, nr_cycles;
2055
2056        /* if no debtor, reset the cycle */
2057        if (!nr_debtors) {
2058                ioc->dfgv_period_at = now->now;
2059                ioc->dfgv_period_rem = 0;
2060                ioc->dfgv_usage_us_sum = 0;
2061                return;
2062        }
2063
2064        /*
2065         * Debtors can pass through a lot of writes choking the device and we
2066         * don't want to be forgiving debts while the device is struggling from
2067         * write bursts. If we're missing latency targets, consider the device
2068         * fully utilized.
2069         */
2070        if (ioc->busy_level > 0)
2071                usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2072
2073        ioc->dfgv_usage_us_sum += usage_us_sum;
2074        if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2075                return;
2076
2077        /*
2078         * At least DFGV_PERIOD has passed since the last period. Calculate the
2079         * average usage and reset the period counters.
2080         */
2081        dur = now->now - ioc->dfgv_period_at;
2082        usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2083
2084        ioc->dfgv_period_at = now->now;
2085        ioc->dfgv_usage_us_sum = 0;
2086
2087        /* if was too busy, reset everything */
2088        if (usage_pct > DFGV_USAGE_PCT) {
2089                ioc->dfgv_period_rem = 0;
2090                return;
2091        }
2092
2093        /*
2094         * Usage is lower than threshold. Let's forgive some debts. Debt
2095         * forgiveness runs off of the usual ioc timer but its period usually
2096         * doesn't match ioc's. Compensate the difference by performing the
2097         * reduction as many times as would fit in the duration since the last
2098         * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2099         * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
2100         * reductions is doubled.
2101         */
2102        nr_cycles = dur + ioc->dfgv_period_rem;
2103        ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
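            /*
             * e.g. if dur plus the carried-over remainder amounts to 2.5
             * DFGV_PERIODs, debts and delays are halved twice this run and
             * half a period is carried over to the next one.
             */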
2104
2105        list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2106                u64 __maybe_unused old_debt, __maybe_unused old_delay;
2107
2108                if (!iocg->abs_vdebt && !iocg->delay)
2109                        continue;
2110
2111                spin_lock(&iocg->waitq.lock);
2112
2113                old_debt = iocg->abs_vdebt;
2114                old_delay = iocg->delay;
2115
2116                if (iocg->abs_vdebt)
2117                        iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2118                if (iocg->delay)
2119                        iocg->delay = iocg->delay >> nr_cycles ?: 1;
2120
2121                iocg_kick_waitq(iocg, true, now);
2122
2123                TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2124                                old_debt, iocg->abs_vdebt,
2125                                old_delay, iocg->delay);
2126
2127                spin_unlock(&iocg->waitq.lock);
2128        }
2129}
2130
2131/*
2132 * Check the active iocgs' state to avoid oversleeping and deactivate
2133 * idle iocgs.
2134 *
2135 * Since waiters determine the sleep durations based on the vrate
2136 * they saw at the time of sleep, if vrate has increased, some
2137 * waiters could be sleeping for too long. Wake up tardy waiters
2138 * which should have woken up in the last period and expire idle
2139 * iocgs.
2140 */
2141static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
2142{
2143        int nr_debtors = 0;
2144        struct ioc_gq *iocg, *tiocg;
2145
2146        list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
2147                if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2148                    !iocg->delay && !iocg_is_idle(iocg))
2149                        continue;
2150
2151                spin_lock(&iocg->waitq.lock);
2152
2153                /* flush wait and indebt stat deltas */
2154                if (iocg->wait_since) {
2155                        iocg->local_stat.wait_us += now->now - iocg->wait_since;
2156                        iocg->wait_since = now->now;
2157                }
2158                if (iocg->indebt_since) {
2159                        iocg->local_stat.indebt_us +=
2160                                now->now - iocg->indebt_since;
2161                        iocg->indebt_since = now->now;
2162                }
2163                if (iocg->indelay_since) {
2164                        iocg->local_stat.indelay_us +=
2165                                now->now - iocg->indelay_since;
2166                        iocg->indelay_since = now->now;
2167                }
2168
2169                if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2170                    iocg->delay) {
2171                        /* might be oversleeping vtime / hweight changes, kick */
2172                        iocg_kick_waitq(iocg, true, now);
2173                        if (iocg->abs_vdebt || iocg->delay)
2174                                nr_debtors++;
2175                } else if (iocg_is_idle(iocg)) {
2176                        /* no waiter and idle, deactivate */
2177                        u64 vtime = atomic64_read(&iocg->vtime);
2178                        s64 excess;
2179
2180                        /*
2181                         * @iocg has been inactive for a full duration and will
2182                         * have a high budget. Account anything above target as
2183                         * error and throw away. On reactivation, it'll start
2184                         * with the target budget.
2185                         */
2186                        excess = now->vnow - vtime - ioc->margins.target;
2187                        if (excess > 0) {
2188                                u32 old_hwi;
2189
2190                                current_hweight(iocg, NULL, &old_hwi);
2191                                ioc->vtime_err -= div64_u64(excess * old_hwi,
2192                                                            WEIGHT_ONE);
2193                        }
2194
2195                        TRACE_IOCG_PATH(iocg_idle, iocg, now,
2196                                        atomic64_read(&iocg->active_period),
2197                                        atomic64_read(&ioc->cur_period), vtime);
2198                        __propagate_weights(iocg, 0, 0, false, now);
2199                        list_del_init(&iocg->active_list);
2200                }
2201
2202                spin_unlock(&iocg->waitq.lock);
2203        }
2204
2205        commit_weights(ioc);
2206        return nr_debtors;
2207}
2208
2209static void ioc_timer_fn(struct timer_list *timer)
2210{
2211        struct ioc *ioc = container_of(timer, struct ioc, timer);
2212        struct ioc_gq *iocg, *tiocg;
2213        struct ioc_now now;
2214        LIST_HEAD(surpluses);
2215        int nr_debtors, nr_shortages = 0, nr_lagging = 0;
2216        u64 usage_us_sum = 0;
2217        u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2218        u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2219        u32 missed_ppm[2], rq_wait_pct;
2220        u64 period_vtime;
2221        int prev_busy_level;
2222
2223        /* how were the latencies during the period? */
2224        ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2225
2226        /* take care of active iocgs */
2227        spin_lock_irq(&ioc->lock);
2228
2229        ioc_now(ioc, &now);
2230
2231        period_vtime = now.vnow - ioc->period_at_vtime;
2232        if (WARN_ON_ONCE(!period_vtime)) {
2233                spin_unlock_irq(&ioc->lock);
2234                return;
2235        }
2236
2237        nr_debtors = ioc_check_iocgs(ioc, &now);
2238
2239        /*
2240         * Wait and indebt stat are flushed above and the donation calculation
2241         * below needs updated usage stat. Let's bring stat up-to-date.
2242         */
2243        iocg_flush_stat(&ioc->active_iocgs, &now);
2244
2245        /* calc usage and see whether some weights need to be moved around */
2246        list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2247                u64 vdone, vtime, usage_us;
2248                u32 hw_active, hw_inuse;
2249
2250                /*
2251                 * Collect unused and wind vtime closer to vnow to prevent
2252                 * iocgs from accumulating a large amount of budget.
2253                 */
2254                vdone = atomic64_read(&iocg->done_vtime);
2255                vtime = atomic64_read(&iocg->vtime);
2256                current_hweight(iocg, &hw_active, &hw_inuse);
2257
2258                /*
2259                 * Latency QoS detection doesn't account for IOs which are
2260                 * in-flight for longer than a period.  Detect them by
2261                 * comparing vdone against period start.  If lagging behind
2262                 * IOs from past periods, don't increase vrate.
2263                 */
2264                if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2265                    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
2266                    time_after64(vtime, vdone) &&
2267                    time_after64(vtime, now.vnow -
2268                                 MAX_LAGGING_PERIODS * period_vtime) &&
2269                    time_before64(vdone, now.vnow - period_vtime))
2270                        nr_lagging++;
2271
2272                /*
2273                 * Determine absolute usage factoring in in-flight IOs to avoid
2274                 * high-latency completions appearing as idle.
2275                 */
2276                usage_us = iocg->usage_delta_us;
2277                usage_us_sum += usage_us;
2278
2279                /* see whether there's surplus vtime */
2280                WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2281                if (hw_inuse < hw_active ||
2282                    (!waitqueue_active(&iocg->waitq) &&
2283                     time_before64(vtime, now.vnow - ioc->margins.low))) {
2284                        u32 hwa, old_hwi, hwm, new_hwi, usage;
2285                        u64 usage_dur;
2286
2287                        if (vdone != vtime) {
2288                                u64 inflight_us = DIV64_U64_ROUND_UP(
2289                                        cost_to_abs_cost(vtime - vdone, hw_inuse),
2290                                        ioc->vtime_base_rate);
2291
2292                                usage_us = max(usage_us, inflight_us);
2293                        }
2294
2295                        /* convert to hweight based usage ratio */
2296                        if (time_after64(iocg->activated_at, ioc->period_at))
2297                                usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2298                        else
2299                                usage_dur = max_t(u64, now.now - ioc->period_at, 1);
2300
2301                        usage = clamp_t(u32,
2302                                DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2303                                                   usage_dur),
2304                                1, WEIGHT_ONE);
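                            /*
                             * e.g. (hypothetical numbers): 15ms of usage over
                             * a 50ms window maps to 30% of WEIGHT_ONE.
                             */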
2305
2306                        /*
2307                         * Already donating or accumulated enough to start.
2308                         * Determine the donation amount.
2309                         */
2310                        current_hweight(iocg, &hwa, &old_hwi);
2311                        hwm = current_hweight_max(iocg);
2312                        new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2313                                                         usage, &now);
2314                        if (new_hwi < hwm) {
2315                                iocg->hweight_donating = hwa;
2316                                iocg->hweight_after_donation = new_hwi;
2317                                list_add(&iocg->surplus_list, &surpluses);
2318                        } else {
2319                                TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2320                                                iocg->inuse, iocg->active,
2321                                                iocg->hweight_inuse, new_hwi);
2322
2323                                __propagate_weights(iocg, iocg->active,
2324                                                    iocg->active, true, &now);
2325                                nr_shortages++;
2326                        }
2327                } else {
2328                        /* genuinely short on vtime */
2329                        nr_shortages++;
2330                }
2331        }
2332
2333        if (!list_empty(&surpluses) && nr_shortages)
2334                transfer_surpluses(&surpluses, &now);
2335
2336        commit_weights(ioc);
2337
2338        /* surplus list should be dissolved after use */
2339        list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2340                list_del_init(&iocg->surplus_list);
2341
2342        /*
2343         * If q is getting clogged or we're missing too much, we're issuing
2344         * too much IO and should lower vtime rate.  If we're not missing
2345         * any targets but are seeing shortages without surpluses, we're too stingy
2346         * and should increase vtime rate.
2347         */
2348        prev_busy_level = ioc->busy_level;
2349        if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2350            missed_ppm[READ] > ppm_rthr ||
2351            missed_ppm[WRITE] > ppm_wthr) {
2352                /* clearly missing QoS targets, slow down vrate */
2353                ioc->busy_level = max(ioc->busy_level, 0);
2354                ioc->busy_level++;
2355        } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
2356                   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2357                   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
2358                /* QoS targets are being met with >25% margin */
2359                if (nr_shortages) {
2360                        /*
2361                         * We're throttling while the device has spare
2362                         * capacity.  If vrate was being slowed down, stop.
2363                         */
2364                        ioc->busy_level = min(ioc->busy_level, 0);
2365
2366                        /*
2367                         * If there are IOs spanning multiple periods, wait
2368                         * them out before pushing the device harder.
2369                         */
2370                        if (!nr_lagging)
2371                                ioc->busy_level--;
2372                } else {
2373                        /*
2374                         * Nobody is being throttled and the users aren't
2375                         * issuing enough IOs to saturate the device.  We
2376                         * simply don't know how close the device is to
2377                         * saturation.  Coast.
2378                         */
2379                        ioc->busy_level = 0;
2380                }
2381        } else {
2382                /* inside the hysteresis margin, we're good */
2383                ioc->busy_level = 0;
2384        }
2385
2386        ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2387
2388        ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
2389                              prev_busy_level, missed_ppm);
2390
2391        ioc_refresh_params(ioc, false);
2392
2393        ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2394
2395        /*
2396         * This period is done.  Move onto the next one.  If nothing's
2397         * going on with the device, stop the timer.
2398         */
2399        atomic64_inc(&ioc->cur_period);
2400
2401        if (ioc->running != IOC_STOP) {
2402                if (!list_empty(&ioc->active_iocgs)) {
2403                        ioc_start_period(ioc, &now);
2404                } else {
2405                        ioc->busy_level = 0;
2406                        ioc->vtime_err = 0;
2407                        ioc->running = IOC_IDLE;
2408                }
2409
2410                ioc_refresh_vrate(ioc, &now);
2411        }
2412
2413        spin_unlock_irq(&ioc->lock);
2414}
2415
2416static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2417                                      u64 abs_cost, struct ioc_now *now)
2418{
2419        struct ioc *ioc = iocg->ioc;
2420        struct ioc_margins *margins = &ioc->margins;
2421        u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
2422        u32 hwi, adj_step;
2423        s64 margin;
2424        u64 cost, new_inuse;
2425
2426        current_hweight(iocg, NULL, &hwi);
2427        old_hwi = hwi;
2428        cost = abs_cost_to_cost(abs_cost, hwi);
2429        margin = now->vnow - vtime - cost;
2430
2431        /* debt handling owns inuse for debtors */
2432        if (iocg->abs_vdebt)
2433                return cost;
2434
2435        /*
2436         * We only increase inuse during period and do so if the margin has
2437         * deteriorated since the previous adjustment.
2438         */
2439        if (margin >= iocg->saved_margin || margin >= margins->low ||
2440            iocg->inuse == iocg->active)
2441                return cost;
2442
2443        spin_lock_irq(&ioc->lock);
2444
2445        /* we own inuse only when @iocg is in the normal active state */
2446        if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
2447                spin_unlock_irq(&ioc->lock);
2448                return cost;
2449        }
2450
2451        /*
2452         * Bump up inuse till @abs_cost fits in the existing budget.
2453         * adj_step must be determined after acquiring ioc->lock - we might
2454         * have raced and lost to another thread for activation and could
2455         * be reading 0 iocg->active before ioc->lock which will lead to
2456         * an infinite loop.
2457         */
2458        new_inuse = iocg->inuse;
2459        adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
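            /*
             * e.g. with active = 1000 and a hypothetical INUSE_ADJ_STEP_PCT
             * of 25, inuse is raised in steps of 250 until the cost fits or
             * inuse reaches active.
             */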
2460        do {
2461                new_inuse = new_inuse + adj_step;
2462                propagate_weights(iocg, iocg->active, new_inuse, true, now);
2463                current_hweight(iocg, NULL, &hwi);
2464                cost = abs_cost_to_cost(abs_cost, hwi);
2465        } while (time_after64(vtime + cost, now->vnow) &&
2466                 iocg->inuse != iocg->active);
2467
2468        spin_unlock_irq(&ioc->lock);
2469
2470        TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2471                        old_inuse, iocg->inuse, old_hwi, hwi);
2472
2473        return cost;
2474}
2475
2476static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2477                                    bool is_merge, u64 *costp)
2478{
2479        struct ioc *ioc = iocg->ioc;
2480        u64 coef_seqio, coef_randio, coef_page;
2481        u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2482        u64 seek_pages = 0;
2483        u64 cost = 0;
2484
2485        switch (bio_op(bio)) {
2486        case REQ_OP_READ:
2487                coef_seqio      = ioc->params.lcoefs[LCOEF_RSEQIO];
2488                coef_randio     = ioc->params.lcoefs[LCOEF_RRANDIO];
2489                coef_page       = ioc->params.lcoefs[LCOEF_RPAGE];
2490                break;
2491        case REQ_OP_WRITE:
2492                coef_seqio      = ioc->params.lcoefs[LCOEF_WSEQIO];
2493                coef_randio     = ioc->params.lcoefs[LCOEF_WRANDIO];
2494                coef_page       = ioc->params.lcoefs[LCOEF_WPAGE];
2495                break;
2496        default:
2497                goto out;
2498        }
2499
2500        if (iocg->cursor) {
2501                seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2502                seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2503        }
2504
2505        if (!is_merge) {
2506                if (seek_pages > LCOEF_RANDIO_PAGES) {
2507                        cost += coef_randio;
2508                } else {
2509                        cost += coef_seqio;
2510                }
2511        }
2512        cost += pages * coef_page;
2513out:
2514        *costp = cost;
2515}
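    /*
     * To illustrate the linear model with made-up coefficients (coef_seqio =
     * 10, coef_randio = 80, coef_page = 2 in arbitrary vtime units): a
     * 16-page IO seeking farther than LCOEF_RANDIO_PAGES costs
     * 80 + 16 * 2 = 112 while the same IO issued sequentially costs
     * 10 + 32 = 42.
     */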
2516
2517static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2518{
2519        u64 cost;
2520
2521        calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2522        return cost;
2523}
2524
2525static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2526                                         u64 *costp)
2527{
2528        unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2529
2530        switch (req_op(rq)) {
2531        case REQ_OP_READ:
2532                *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2533                break;
2534        case REQ_OP_WRITE:
2535                *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2536                break;
2537        default:
2538                *costp = 0;
2539        }
2540}
2541
2542static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2543{
2544        u64 cost;
2545
2546        calc_size_vtime_cost_builtin(rq, ioc, &cost);
2547        return cost;
2548}
2549
2550static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2551{
2552        struct blkcg_gq *blkg = bio->bi_blkg;
2553        struct ioc *ioc = rqos_to_ioc(rqos);
2554        struct ioc_gq *iocg = blkg_to_iocg(blkg);
2555        struct ioc_now now;
2556        struct iocg_wait wait;
2557        u64 abs_cost, cost, vtime;
2558        bool use_debt, ioc_locked;
2559        unsigned long flags;
2560
2561        /* bypass IOs if disabled, still initializing, or for root cgroup */
2562        if (!ioc->enabled || !iocg || !iocg->level)
2563                return;
2564
2565        /* calculate the absolute vtime cost */
2566        abs_cost = calc_vtime_cost(bio, iocg, false);
2567        if (!abs_cost)
2568                return;
2569
2570        if (!iocg_activate(iocg, &now))
2571                return;
2572
2573        iocg->cursor = bio_end_sector(bio);
2574        vtime = atomic64_read(&iocg->vtime);
2575        cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2576
2577        /*
2578         * If no one's waiting and within budget, issue right away.  The
2579         * tests are racy but the races aren't systemic - we only miss once
2580         * in a while which is fine.
2581         */
2582        if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2583            time_before_eq64(vtime + cost, now.vnow)) {
2584                iocg_commit_bio(iocg, bio, abs_cost, cost);
2585                return;
2586        }
2587
2588        /*
2589         * We're over budget. This can be handled in two ways. IOs which may
2590         * cause priority inversions are punted to @ioc->aux_iocg and charged as
2591         * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2592         * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2593         * whether debt handling is needed and acquire locks accordingly.
2594         */
2595        use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2596        ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
2597retry_lock:
2598        iocg_lock(iocg, ioc_locked, &flags);
2599
2600        /*
2601         * @iocg must stay activated for debt and waitq handling. Deactivation
2602         * is synchronized against both ioc->lock and waitq.lock and we won't
2603         * get deactivated as long as we're waiting or have debt, so we're good
2604         * if we're activated here. In the unlikely cases that we aren't, just
2605         * issue the IO.
2606         */
2607        if (unlikely(list_empty(&iocg->active_list))) {
2608                iocg_unlock(iocg, ioc_locked, &flags);
2609                iocg_commit_bio(iocg, bio, abs_cost, cost);
2610                return;
2611        }
2612
2613        /*
2614         * We're over budget. If @bio has to be issued regardless, remember
2615         * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2616         * off the debt before waking more IOs.
2617         *
2618         * This way, the debt is continuously paid off each period with the
2619         * actual budget available to the cgroup. If we just wound vtime, we
2620         * would incorrectly use the current hw_inuse for the entire amount
2621         * which, for example, can lead to the cgroup staying blocked for a
2622         * long time even with substantially raised hw_inuse.
2623         *
2624         * An iocg with vdebt should stay online so that the timer can keep
2625         * deducting its vdebt and [de]activate use_delay mechanism
2626         * accordingly. We don't want to race against the timer trying to
2627         * clear them and leave @iocg inactive w/ dangling use_delay heavily
2628         * penalizing the cgroup and its descendants.
2629         */
2630        if (use_debt) {
2631                iocg_incur_debt(iocg, abs_cost, &now);
2632                if (iocg_kick_delay(iocg, &now))
2633                        blkcg_schedule_throttle(rqos->q,
2634                                        (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2635                iocg_unlock(iocg, ioc_locked, &flags);
2636                return;
2637        }
2638
2639        /* guarantee that iocgs w/ waiters have maximum inuse */
2640        if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
2641                if (!ioc_locked) {
2642                        iocg_unlock(iocg, false, &flags);
2643                        ioc_locked = true;
2644                        goto retry_lock;
2645                }
2646                propagate_weights(iocg, iocg->active, iocg->active, true,
2647                                  &now);
2648        }
2649
2650        /*
2651         * Append self to the waitq and schedule the wakeup timer if we're
2652         * the first waiter.  The timer duration is calculated based on the
2653         * current vrate.  vtime and hweight changes can make it too short
2654         * or too long.  Each wait entry records the absolute cost it's
2655         * waiting for to allow re-evaluation using a custom wait entry.
2656         *
2657         * If too short, the timer simply reschedules itself.  If too long,
2658         * the period timer will notice and trigger wakeups.
2659         *
2660         * All waiters are on iocg->waitq and the wait states are
2661         * synchronized using waitq.lock.
2662         */
2663        init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2664        wait.wait.private = current;
2665        wait.bio = bio;
2666        wait.abs_cost = abs_cost;
2667        wait.committed = false; /* will be set true by waker */
2668
2669        __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2670        iocg_kick_waitq(iocg, ioc_locked, &now);
2671
2672        iocg_unlock(iocg, ioc_locked, &flags);
2673
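        /*
         * Standard sleep/wakeup ordering: set TASK_UNINTERRUPTIBLE before
         * testing wait.committed so that a wakeup from iocg_wake_fn()
         * racing with the test leaves the task runnable and io_schedule()
         * returns immediately instead of sleeping indefinitely.
         */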
2674        while (true) {
2675                set_current_state(TASK_UNINTERRUPTIBLE);
2676                if (wait.committed)
2677                        break;
2678                io_schedule();
2679        }
2680
2681        /* waker already committed us, proceed */
2682        finish_wait(&iocg->waitq, &wait.wait);
2683}
2684
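/*
 * The block layer attempts merging before rq_qos throttling, so a bio that
 * gets merged into an existing request bypasses ioc_rqos_throttle().
 * Charge the merged bytes here instead, either directly if there's vtime
 * budget or as debt otherwise.
 */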
2685static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2686                           struct bio *bio)
2687{
2688        struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2689        struct ioc *ioc = rqos_to_ioc(rqos);
2690        sector_t bio_end = bio_end_sector(bio);
2691        struct ioc_now now;
2692        u64 vtime, abs_cost, cost;
2693        unsigned long flags;
2694
2695        /* bypass if disabled, still initializing, or for root cgroup */
2696        if (!ioc->enabled || !iocg || !iocg->level)
2697                return;
2698
2699        abs_cost = calc_vtime_cost(bio, iocg, true);
2700        if (!abs_cost)
2701                return;
2702
2703        ioc_now(ioc, &now);
2704
2705        vtime = atomic64_read(&iocg->vtime);
2706        cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2707
2708        /* update cursor if backmerging into the request at the cursor */
2709        if (blk_rq_pos(rq) < bio_end &&
2710            blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2711                iocg->cursor = bio_end;
2712
2713        /*
2714         * Charge if there's enough vtime budget and the existing request has
2715         * cost assigned.
2716         */
2717        if (rq->bio && rq->bio->bi_iocost_cost &&
2718            time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
2719                iocg_commit_bio(iocg, bio, abs_cost, cost);
2720                return;
2721        }
2722
2723        /*
2724         * Otherwise, account it as debt if @iocg is online, which it should
2725         * be for the vast majority of cases. See debt handling in
2726         * ioc_rqos_throttle() for details.
2727         */
2728        spin_lock_irqsave(&ioc->lock, flags);
2729        spin_lock(&iocg->waitq.lock);
2730
2731        if (likely(!list_empty(&iocg->active_list))) {
2732                iocg_incur_debt(iocg, abs_cost, &now);
2733                if (iocg_kick_delay(iocg, &now))
2734                        blkcg_schedule_throttle(rqos->q,
2735                                        (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2736        } else {
2737                iocg_commit_bio(iocg, bio, abs_cost, cost);
2738        }
2739
2740        spin_unlock(&iocg->waitq.lock);
2741        spin_unlock_irqrestore(&ioc->lock, flags);
2742}
2743
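/*
 * Bio completion.  Fold the charged cost into done_vtime; comparing
 * done_vtime against vtime lets the period timer tell whether an iocg
 * still has charged IOs in flight (see iocg_is_idle()).
 */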
2744static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2745{
2746        struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2747
2748        if (iocg && bio->bi_iocost_cost)
2749                atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2750}
2751
2752static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2753{
2754        struct ioc *ioc = rqos_to_ioc(rqos);
2755        struct ioc_pcpu_stat *ccs;
2756        u64 on_q_ns, rq_wait_ns, size_nsec;
2757        int pidx, rw;
2758
2759        if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2760                return;
2761
2762        switch (req_op(rq) & REQ_OP_MASK) {
2763        case REQ_OP_READ:
2764                pidx = QOS_RLAT;
2765                rw = READ;
2766                break;
2767        case REQ_OP_WRITE:
2768                pidx = QOS_WLAT;
2769                rw = WRITE;
2770                break;
2771        default:
2772                return;
2773        }
2774
2775        on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2776        rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2777        size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2778
2779        ccs = get_cpu_ptr(ioc->pcpu_stat);
2780
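        /*
         * A request meets its latency target when its total on-queue time
         * minus the expected transfer time (size_nsec) is within the
         * configured rlat/wlat.  Illustrative numbers: with rlat=5000
         * (5ms) and a 1ms size cost, completing within ~6ms of allocation
         * counts as met.
         */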
2781        if (on_q_ns <= size_nsec ||
2782            on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2783                local_inc(&ccs->missed[rw].nr_met);
2784        else
2785                local_inc(&ccs->missed[rw].nr_missed);
2786
2787        local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2788
2789        put_cpu_ptr(ccs);
2790}
2791
2792static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2793{
2794        struct ioc *ioc = rqos_to_ioc(rqos);
2795
2796        spin_lock_irq(&ioc->lock);
2797        ioc_refresh_params(ioc, false);
2798        spin_unlock_irq(&ioc->lock);
2799}
2800
2801static void ioc_rqos_exit(struct rq_qos *rqos)
2802{
2803        struct ioc *ioc = rqos_to_ioc(rqos);
2804
2805        blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2806
2807        spin_lock_irq(&ioc->lock);
2808        ioc->running = IOC_STOP;
2809        spin_unlock_irq(&ioc->lock);
2810
2811        del_timer_sync(&ioc->timer);
2812        free_percpu(ioc->pcpu_stat);
2813        kfree(ioc);
2814}
2815
2816static struct rq_qos_ops ioc_rqos_ops = {
2817        .throttle = ioc_rqos_throttle,
2818        .merge = ioc_rqos_merge,
2819        .done_bio = ioc_rqos_done_bio,
2820        .done = ioc_rqos_done,
2821        .queue_depth_changed = ioc_rqos_queue_depth_changed,
2822        .exit = ioc_rqos_exit,
2823};
2824
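/*
 * Set up the iocost state for @q.  This is not done at queue init;
 * ioc_qos_write() and ioc_cost_model_write() below call it lazily the
 * first time io.cost.qos or io.cost.model is configured for the device.
 */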
2825static int blk_iocost_init(struct request_queue *q)
2826{
2827        struct ioc *ioc;
2828        struct rq_qos *rqos;
2829        int i, cpu, ret;
2830
2831        ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2832        if (!ioc)
2833                return -ENOMEM;
2834
2835        ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2836        if (!ioc->pcpu_stat) {
2837                kfree(ioc);
2838                return -ENOMEM;
2839        }
2840
2841        for_each_possible_cpu(cpu) {
2842                struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2843
2844                for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2845                        local_set(&ccs->missed[i].nr_met, 0);
2846                        local_set(&ccs->missed[i].nr_missed, 0);
2847                }
2848                local64_set(&ccs->rq_wait_ns, 0);
2849        }
2850
2851        rqos = &ioc->rqos;
2852        rqos->id = RQ_QOS_COST;
2853        rqos->ops = &ioc_rqos_ops;
2854        rqos->q = q;
2855
2856        spin_lock_init(&ioc->lock);
2857        timer_setup(&ioc->timer, ioc_timer_fn, 0);
2858        INIT_LIST_HEAD(&ioc->active_iocgs);
2859
2860        ioc->running = IOC_IDLE;
2861        ioc->vtime_base_rate = VTIME_PER_USEC;
2862        atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2863        seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2864        ioc->period_at = ktime_to_us(ktime_get());
2865        atomic64_set(&ioc->cur_period, 0);
2866        atomic_set(&ioc->hweight_gen, 0);
2867
2868        spin_lock_irq(&ioc->lock);
2869        ioc->autop_idx = AUTOP_INVALID;
2870        ioc_refresh_params(ioc, true);
2871        spin_unlock_irq(&ioc->lock);
2872
2873        /*
2874         * rqos must be added before activation to allow iocg_pd_init() to
2875         * look up the ioc from q. This means that the rqos methods may get
2876         * called before policy activation completes, so they can't assume the
2877         * target bio has an iocg associated and need to test for NULL iocg.
2878         */
2879        rq_qos_add(q, rqos);
2880        ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2881        if (ret) {
2882                rq_qos_del(q, rqos);
2883                free_percpu(ioc->pcpu_stat);
2884                kfree(ioc);
2885                return ret;
2886        }
2887        return 0;
2888}
2889
2890static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2891{
2892        struct ioc_cgrp *iocc;
2893
2894        iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2895        if (!iocc)
2896                return NULL;
2897
2898        iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2899        return &iocc->cpd;
2900}
2901
2902static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2903{
2904        kfree(container_of(cpd, struct ioc_cgrp, cpd));
2905}
2906
2907static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2908                                             struct blkcg *blkcg)
2909{
2910        int levels = blkcg->css.cgroup->level + 1;
2911        struct ioc_gq *iocg;
2912
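        /*
         * ancestors[] gets one slot per hierarchy level from the root
         * (level 0) down to @blkcg itself, hence cgroup->level + 1 above.
         */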
2913        iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2914        if (!iocg)
2915                return NULL;
2916
2917        iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2918        if (!iocg->pcpu_stat) {
2919                kfree(iocg);
2920                return NULL;
2921        }
2922
2923        return &iocg->pd;
2924}
2925
2926static void ioc_pd_init(struct blkg_policy_data *pd)
2927{
2928        struct ioc_gq *iocg = pd_to_iocg(pd);
2929        struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2930        struct ioc *ioc = q_to_ioc(blkg->q);
2931        struct ioc_now now;
2932        struct blkcg_gq *tblkg;
2933        unsigned long flags;
2934
2935        ioc_now(ioc, &now);
2936
2937        iocg->ioc = ioc;
2938        atomic64_set(&iocg->vtime, now.vnow);
2939        atomic64_set(&iocg->done_vtime, now.vnow);
2940        atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2941        INIT_LIST_HEAD(&iocg->active_list);
2942        INIT_LIST_HEAD(&iocg->walk_list);
2943        INIT_LIST_HEAD(&iocg->surplus_list);
2944        iocg->hweight_active = WEIGHT_ONE;
2945        iocg->hweight_inuse = WEIGHT_ONE;
2946
2947        init_waitqueue_head(&iocg->waitq);
2948        hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2949        iocg->waitq_timer.function = iocg_waitq_timer_fn;
2950
2951        iocg->level = blkg->blkcg->css.cgroup->level;
2952
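        /*
         * Record the path from the root iocg down to this one, indexed by
         * level, so hweight calculation and weight propagation can walk
         * the ancestry without chasing blkg parent pointers.
         */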
2953        for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2954                struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2955                iocg->ancestors[tiocg->level] = tiocg;
2956        }
2957
2958        spin_lock_irqsave(&ioc->lock, flags);
2959        weight_updated(iocg, &now);
2960        spin_unlock_irqrestore(&ioc->lock, flags);
2961}
2962
2963static void ioc_pd_free(struct blkg_policy_data *pd)
2964{
2965        struct ioc_gq *iocg = pd_to_iocg(pd);
2966        struct ioc *ioc = iocg->ioc;
2967        unsigned long flags;
2968
2969        if (ioc) {
2970                spin_lock_irqsave(&ioc->lock, flags);
2971
2972                if (!list_empty(&iocg->active_list)) {
2973                        struct ioc_now now;
2974
2975                        ioc_now(ioc, &now);
2976                        propagate_weights(iocg, 0, 0, false, &now);
2977                        list_del_init(&iocg->active_list);
2978                }
2979
2980                WARN_ON_ONCE(!list_empty(&iocg->walk_list));
2981                WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2982
2983                spin_unlock_irqrestore(&ioc->lock, flags);
2984
2985                hrtimer_cancel(&iocg->waitq_timer);
2986        }
2987        free_percpu(iocg->pcpu_stat);
2988        kfree(iocg);
2989}
2990
2991static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
2992{
2993        struct ioc_gq *iocg = pd_to_iocg(pd);
2994        struct ioc *ioc = iocg->ioc;
2995
2996        if (!ioc->enabled)
2997                return false;
2998
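        /*
         * cost.vrate reports vtime_base_rate as a percentage of
         * VTIME_PER_USEC with two decimals; 125.00 means vtime currently
         * advances at 125% of the cost model's baseline speed.
         */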
2999        if (iocg->level == 0) {
3000                unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
3001                        ioc->vtime_base_rate * 10000,
3002                        VTIME_PER_USEC);
3003                seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
3004        }
3005
3006        seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
3007
3008        if (blkcg_debug_stats)
3009                seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
3010                        iocg->last_stat.wait_us,
3011                        iocg->last_stat.indebt_us,
3012                        iocg->last_stat.indelay_us);
3013        return true;
3014}
3015
3016static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3017                             int off)
3018{
3019        const char *dname = blkg_dev_name(pd->blkg);
3020        struct ioc_gq *iocg = pd_to_iocg(pd);
3021
3022        if (dname && iocg->cfg_weight)
3023                seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
3024        return 0;
3025}
3026
3027
3028static int ioc_weight_show(struct seq_file *sf, void *v)
3029{
3030        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3031        struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3032
3033        seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
3034        blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
3035                          &blkcg_policy_iocost, seq_cft(sf)->private, false);
3036        return 0;
3037}
3038
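/*
 * Accepted formats, mirroring the parsing below ("8:16" is an example
 * MAJ:MIN):
 *
 *   "default 100" or "100"  - set this cgroup's default weight
 *   "8:16 200"              - set the weight for device 8:16
 *   "8:16 default"          - drop the per-device weight for 8:16 and
 *                             fall back to the default
 */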
3039static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3040                                size_t nbytes, loff_t off)
3041{
3042        struct blkcg *blkcg = css_to_blkcg(of_css(of));
3043        struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3044        struct blkg_conf_ctx ctx;
3045        struct ioc_now now;
3046        struct ioc_gq *iocg;
3047        u32 v;
3048        int ret;
3049
3050        if (!strchr(buf, ':')) {
3051                struct blkcg_gq *blkg;
3052
3053                if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3054                        return -EINVAL;
3055
3056                if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3057                        return -EINVAL;
3058
3059                spin_lock_irq(&blkcg->lock);
3060                iocc->dfl_weight = v * WEIGHT_ONE;
3061                hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3062                        struct ioc_gq *iocg = blkg_to_iocg(blkg);
3063
3064                        if (iocg) {
3065                                spin_lock(&iocg->ioc->lock);
3066                                ioc_now(iocg->ioc, &now);
3067                                weight_updated(iocg, &now);
3068                                spin_unlock(&iocg->ioc->lock);
3069                        }
3070                }
3071                spin_unlock_irq(&blkcg->lock);
3072
3073                return nbytes;
3074        }
3075
3076        ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
3077        if (ret)
3078                return ret;
3079
3080        iocg = blkg_to_iocg(ctx.blkg);
3081
3082        if (!strncmp(ctx.body, "default", 7)) {
3083                v = 0;
3084        } else {
3085                if (!sscanf(ctx.body, "%u", &v))
3086                        goto einval;
3087                if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3088                        goto einval;
3089        }
3090
3091        spin_lock(&iocg->ioc->lock);
3092        iocg->cfg_weight = v * WEIGHT_ONE;
3093        ioc_now(iocg->ioc, &now);
3094        weight_updated(iocg, &now);
3095        spin_unlock(&iocg->ioc->lock);
3096
3097        blkg_conf_finish(&ctx);
3098        return nbytes;
3099
3100einval:
3101        blkg_conf_finish(&ctx);
3102        return -EINVAL;
3103}
3104
3105static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3106                          int off)
3107{
3108        const char *dname = blkg_dev_name(pd->blkg);
3109        struct ioc *ioc = pd_to_iocg(pd)->ioc;
3110
3111        if (!dname)
3112                return 0;
3113
3114        seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3115                   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3116                   ioc->params.qos[QOS_RPPM] / 10000,
3117                   ioc->params.qos[QOS_RPPM] % 10000 / 100,
3118                   ioc->params.qos[QOS_RLAT],
3119                   ioc->params.qos[QOS_WPPM] / 10000,
3120                   ioc->params.qos[QOS_WPPM] % 10000 / 100,
3121                   ioc->params.qos[QOS_WLAT],
3122                   ioc->params.qos[QOS_MIN] / 10000,
3123                   ioc->params.qos[QOS_MIN] % 10000 / 100,
3124                   ioc->params.qos[QOS_MAX] / 10000,
3125                   ioc->params.qos[QOS_MAX] % 10000 / 100);
3126        return 0;
3127}
3128
3129static int ioc_qos_show(struct seq_file *sf, void *v)
3130{
3131        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3132
3133        blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3134                          &blkcg_policy_iocost, seq_cft(sf)->private, false);
3135        return 0;
3136}
3137
3138static const match_table_t qos_ctrl_tokens = {
3139        { QOS_ENABLE,           "enable=%u"     },
3140        { QOS_CTRL,             "ctrl=%s"       },
3141        { NR_QOS_CTRL_PARAMS,   NULL            },
3142};
3143
3144static const match_table_t qos_tokens = {
3145        { QOS_RPPM,             "rpct=%s"       },
3146        { QOS_RLAT,             "rlat=%u"       },
3147        { QOS_WPPM,             "wpct=%s"       },
3148        { QOS_WLAT,             "wlat=%u"       },
3149        { QOS_MIN,              "min=%s"        },
3150        { QOS_MAX,              "max=%s"        },
3151        { NR_QOS_PARAMS,        NULL            },
3152};
3153
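/*
 * Example write matching the tokens above (the device number and values
 * are illustrative):
 *
 *   8:16 enable=1 ctrl=auto rpct=95.00 rlat=10000 wpct=95.00
 *        wlat=50000 min=50.00 max=150.00
 *
 * rpct/wpct take two decimal places, rlat/wlat are in microseconds, and
 * min/max bound the vrate as percentages of the device's estimated
 * native speed.
 */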
3154static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3155                             size_t nbytes, loff_t off)
3156{
3157        struct block_device *bdev;
3158        struct ioc *ioc;
3159        u32 qos[NR_QOS_PARAMS];
3160        bool enable, user;
3161        char *p;
3162        int ret;
3163
3164        bdev = blkcg_conf_open_bdev(&input);
3165        if (IS_ERR(bdev))
3166                return PTR_ERR(bdev);
3167
3168        ioc = q_to_ioc(bdev->bd_disk->queue);
3169        if (!ioc) {
3170                ret = blk_iocost_init(bdev->bd_disk->queue);
3171                if (ret)
3172                        goto err;
3173                ioc = q_to_ioc(bdev->bd_disk->queue);
3174        }
3175
3176        spin_lock_irq(&ioc->lock);
3177        memcpy(qos, ioc->params.qos, sizeof(qos));
3178        enable = ioc->enabled;
3179        user = ioc->user_qos_params;
3180        spin_unlock_irq(&ioc->lock);
3181
3182        while ((p = strsep(&input, " \t\n"))) {
3183                substring_t args[MAX_OPT_ARGS];
3184                char buf[32];
3185                int tok;
3186                s64 v;
3187
3188                if (!*p)
3189                        continue;
3190
3191                switch (match_token(p, qos_ctrl_tokens, args)) {
3192                case QOS_ENABLE:
3193                        match_u64(&args[0], &v);
3194                        enable = v;
3195                        continue;
3196                case QOS_CTRL:
3197                        match_strlcpy(buf, &args[0], sizeof(buf));
3198                        if (!strcmp(buf, "auto"))
3199                                user = false;
3200                        else if (!strcmp(buf, "user"))
3201                                user = true;
3202                        else
3203                                goto einval;
3204                        continue;
3205                }
3206
3207                tok = match_token(p, qos_tokens, args);
3208                switch (tok) {
3209                case QOS_RPPM:
3210                case QOS_WPPM:
3211                        if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3212                            sizeof(buf))
3213                                goto einval;
3214                        if (cgroup_parse_float(buf, 2, &v))
3215                                goto einval;
3216                        if (v < 0 || v > 10000)
3217                                goto einval;
3218                        qos[tok] = v * 100;
3219                        break;
3220                case QOS_RLAT:
3221                case QOS_WLAT:
3222                        if (match_u64(&args[0], &v))
3223                                goto einval;
3224                        qos[tok] = v;
3225                        break;
3226                case QOS_MIN:
3227                case QOS_MAX:
3228                        if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3229                            sizeof(buf))
3230                                goto einval;
3231                        if (cgroup_parse_float(buf, 2, &v))
3232                                goto einval;
3233                        if (v < 0)
3234                                goto einval;
3235                        qos[tok] = clamp_t(s64, v * 100,
3236                                           VRATE_MIN_PPM, VRATE_MAX_PPM);
3237                        break;
3238                default:
3239                        goto einval;
3240                }
3241                user = true;
3242        }
3243
3244        if (qos[QOS_MIN] > qos[QOS_MAX])
3245                goto einval;
3246
3247        spin_lock_irq(&ioc->lock);
3248
3249        if (enable) {
3250                blk_stat_enable_accounting(ioc->rqos.q);
3251                blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3252                ioc->enabled = true;
3253        } else {
3254                blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3255                ioc->enabled = false;
3256        }
3257
3258        if (user) {
3259                memcpy(ioc->params.qos, qos, sizeof(qos));
3260                ioc->user_qos_params = true;
3261        } else {
3262                ioc->user_qos_params = false;
3263        }
3264
3265        ioc_refresh_params(ioc, true);
3266        spin_unlock_irq(&ioc->lock);
3267
3268        blkdev_put_no_open(bdev);
3269        return nbytes;
3270einval:
3271        ret = -EINVAL;
3272err:
3273        blkdev_put_no_open(bdev);
3274        return ret;
3275}
3276
3277static u64 ioc_cost_model_prfill(struct seq_file *sf,
3278                                 struct blkg_policy_data *pd, int off)
3279{
3280        const char *dname = blkg_dev_name(pd->blkg);
3281        struct ioc *ioc = pd_to_iocg(pd)->ioc;
3282        u64 *u = ioc->params.i_lcoefs;
3283
3284        if (!dname)
3285                return 0;
3286
3287        seq_printf(sf, "%s ctrl=%s model=linear "
3288                   "rbps=%llu rseqiops=%llu rrandiops=%llu "
3289                   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3290                   dname, ioc->user_cost_model ? "user" : "auto",
3291                   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3292                   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
3293        return 0;
3294}
3295
3296static int ioc_cost_model_show(struct seq_file *sf, void *v)
3297{
3298        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3299
3300        blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3301                          &blkcg_policy_iocost, seq_cft(sf)->private, false);
3302        return 0;
3303}
3304
3305static const match_table_t cost_ctrl_tokens = {
3306        { COST_CTRL,            "ctrl=%s"       },
3307        { COST_MODEL,           "model=%s"      },
3308        { NR_COST_CTRL_PARAMS,  NULL            },
3309};
3310
3311static const match_table_t i_lcoef_tokens = {
3312        { I_LCOEF_RBPS,         "rbps=%u"       },
3313        { I_LCOEF_RSEQIOPS,     "rseqiops=%u"   },
3314        { I_LCOEF_RRANDIOPS,    "rrandiops=%u"  },
3315        { I_LCOEF_WBPS,         "wbps=%u"       },
3316        { I_LCOEF_WSEQIOPS,     "wseqiops=%u"   },
3317        { I_LCOEF_WRANDIOPS,    "wrandiops=%u"  },
3318        { NR_I_LCOEFS,          NULL            },
3319};
3320
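/*
 * Example write matching the tokens above (the device number and
 * coefficients are illustrative):
 *
 *   8:16 ctrl=user model=linear rbps=2000000000 rseqiops=200000
 *        rrandiops=60000 wbps=1000000000 wseqiops=100000 wrandiops=30000
 *
 * Only the linear model is recognized; the coefficients are plain
 * bytes-per-second and IOs-per-second values.
 */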
3321static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3322                                    size_t nbytes, loff_t off)
3323{
3324        struct block_device *bdev;
3325        struct ioc *ioc;
3326        u64 u[NR_I_LCOEFS];
3327        bool user;
3328        char *p;
3329        int ret;
3330
3331        bdev = blkcg_conf_open_bdev(&input);
3332        if (IS_ERR(bdev))
3333                return PTR_ERR(bdev);
3334
3335        ioc = q_to_ioc(bdev->bd_disk->queue);
3336        if (!ioc) {
3337                ret = blk_iocost_init(bdev->bd_disk->queue);
3338                if (ret)
3339                        goto err;
3340                ioc = q_to_ioc(bdev->bd_disk->queue);
3341        }
3342
3343        spin_lock_irq(&ioc->lock);
3344        memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3345        user = ioc->user_cost_model;
3346        spin_unlock_irq(&ioc->lock);
3347
3348        while ((p = strsep(&input, " \t\n"))) {
3349                substring_t args[MAX_OPT_ARGS];
3350                char buf[32];
3351                int tok;
3352                u64 v;
3353
3354                if (!*p)
3355                        continue;
3356
3357                switch (match_token(p, cost_ctrl_tokens, args)) {
3358                case COST_CTRL:
3359                        match_strlcpy(buf, &args[0], sizeof(buf));
3360                        if (!strcmp(buf, "auto"))
3361                                user = false;
3362                        else if (!strcmp(buf, "user"))
3363                                user = true;
3364                        else
3365                                goto einval;
3366                        continue;
3367                case COST_MODEL:
3368                        match_strlcpy(buf, &args[0], sizeof(buf));
3369                        if (strcmp(buf, "linear"))
3370                                goto einval;
3371                        continue;
3372                }
3373
3374                tok = match_token(p, i_lcoef_tokens, args);
3375                if (tok == NR_I_LCOEFS)
3376                        goto einval;
3377                if (match_u64(&args[0], &v))
3378                        goto einval;
3379                u[tok] = v;
3380                user = true;
3381        }
3382
3383        spin_lock_irq(&ioc->lock);
3384        if (user) {
3385                memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3386                ioc->user_cost_model = true;
3387        } else {
3388                ioc->user_cost_model = false;
3389        }
3390        ioc_refresh_params(ioc, true);
3391        spin_unlock_irq(&ioc->lock);
3392
3393        blkdev_put_no_open(bdev);
3394        return nbytes;
3395
3396einval:
3397        ret = -EINVAL;
3398err:
3399        blkdev_put_no_open(bdev);
3400        return ret;
3401}
3402
3403static struct cftype ioc_files[] = {
3404        {
3405                .name = "weight",
3406                .flags = CFTYPE_NOT_ON_ROOT,
3407                .seq_show = ioc_weight_show,
3408                .write = ioc_weight_write,
3409        },
3410        {
3411                .name = "cost.qos",
3412                .flags = CFTYPE_ONLY_ON_ROOT,
3413                .seq_show = ioc_qos_show,
3414                .write = ioc_qos_write,
3415        },
3416        {
3417                .name = "cost.model",
3418                .flags = CFTYPE_ONLY_ON_ROOT,
3419                .seq_show = ioc_cost_model_show,
3420                .write = ioc_cost_model_write,
3421        },
3422        {}
3423};
3424
3425static struct blkcg_policy blkcg_policy_iocost = {
3426        .dfl_cftypes    = ioc_files,
3427        .cpd_alloc_fn   = ioc_cpd_alloc,
3428        .cpd_free_fn    = ioc_cpd_free,
3429        .pd_alloc_fn    = ioc_pd_alloc,
3430        .pd_init_fn     = ioc_pd_init,
3431        .pd_free_fn     = ioc_pd_free,
3432        .pd_stat_fn     = ioc_pd_stat,
3433};
3434
3435static int __init ioc_init(void)
3436{
3437        return blkcg_policy_register(&blkcg_policy_iocost);
3438}
3439
3440static void __exit ioc_exit(void)
3441{
3442        blkcg_policy_unregister(&blkcg_policy_iocost);
3443}
3444
3445module_init(ioc_init);
3446module_exit(ioc_exit);
3447