linux/block/blk-iocost.c
   1/* SPDX-License-Identifier: GPL-2.0
   2 *
   3 * IO cost model based controller.
   4 *
   5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
   6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
   7 * Copyright (C) 2019 Facebook
   8 *
   9 * One challenge of controlling IO resources is the lack of a trivially
  10 * observable cost metric.  This is distinguished from CPU and memory where
  11 * wallclock time and the number of bytes can serve as accurate enough
  12 * approximations.
  13 *
  14 * Bandwidth and iops are the most commonly used metrics for IO devices but
  15 * depending on the type and specifics of the device, different IO patterns
  16 * easily lead to multiple orders of magnitude variations rendering them
  17 * useless for the purpose of IO capacity distribution.  While on-device
  18 * time, with a lot of crutches, could serve as a useful approximation for
  19 * non-queued rotational devices, this is no longer viable with modern
  20 * devices, even the rotational ones.
  21 *
  22 * While there is no cost metric we can trivially observe, it isn't a
  23 * complete mystery.  For example, on a rotational device, seek cost
  24 * dominates while a contiguous transfer contributes a smaller amount
  25 * proportional to the size.  If we can characterize at least the relative
  26 * costs of these different types of IOs, it should be possible to
  27 * implement a reasonable work-conserving proportional IO resource
  28 * distribution.
  29 *
  30 * 1. IO Cost Model
  31 *
  32 * IO cost model estimates the cost of an IO given its basic parameters and
  33 * history (e.g. the end sector of the last IO).  The cost is measured in
  34 * device time.  If a given IO is estimated to cost 10ms, the device should
  35 * be able to process ~100 of those IOs in a second.
  36 *
  37 * Currently, there's only one builtin cost model - linear.  Each IO is
  38 * classified as sequential or random and given a base cost accordingly.
  39 * On top of that, a size cost proportional to the length of the IO is
  40 * added.  While simple, this model captures the operational
  41 * characteristics of a wide variety of devices well enough.  Default
  42 * parameters for several different classes of devices are provided and the
  43 * parameters can be configured from userspace via
  44 * /sys/fs/cgroup/io.cost.model.
  45 *
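     * As a rough illustration with made-up numbers: on a device modeled
     * at 400 MB/s sequential bandwidth, one 4k page costs about 10.2us of
     * device time, so a 128k sequential read is charged its sequential
     * base cost plus 32 * 10.2us ~= 328us, while a 4k random read is
     * charged the (typically much larger) random base cost plus a single
     * 10.2us.
     *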
  46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47 * device-specific coefficients.
  48 *
  49 * 2. Control Strategy
  50 *
  51 * The device virtual time (vtime) is used as the primary control metric.
  52 * The control strategy is composed of the following three parts.
  53 *
  54 * 2-1. Vtime Distribution
  55 *
  56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57 * calculated.  Please consider the following hierarchy where the numbers
  58 * inside parentheses denote the configured weights.
  59 *
  60 *           root
  61 *         /       \
  62 *      A (w:100)  B (w:300)
  63 *      /       \
  64 *  A0 (w:100)  A1 (w:100)
  65 *
  66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67 * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
  68 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
  69 * 12.5% each.  The distribution mechanism only cares about these flattened
  70 * shares.  They're called hweights (hierarchical weights) and always add
  71 * up to 1 (HWEIGHT_WHOLE).
  72 *
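     * Concretely, a leaf's hweight is the product of its weight ratios at
     * each level: for A0 above, 100/(100+300) * 100/(100+100) = 1/4 * 1/2
     * = 12.5%, matching the share described above.
     *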
  73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75 * against the device vtime - an IO which takes 10ms on the underlying
  76 * device is considered to take 80ms on A0.
  77 *
  78 * This constitutes the basis of IO capacity distribution.  Each cgroup's
  79 * vtime is running at a rate determined by its hweight.  A cgroup tracks
  80 * the vtime consumed by past IOs and can issue a new IO iff doing so
  81 * wouldn't outrun the current device vtime.  Otherwise, the IO is
  82 * suspended until the vtime has progressed enough to cover it.
  83 *
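     * In other words, ignoring margins, an iocg may issue a new IO roughly
     * when
     *
     *   iocg vtime + cost of the IO <= current device vtime (vnow)
     *
     * and otherwise the bio waits until vnow catches up.
     *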
  84 * 2-2. Vrate Adjustment
  85 *
  86 * It's unrealistic to expect the cost model to be perfect.  There are too
  87 * many devices and even on the same device the overall performance
  88 * fluctuates depending on numerous factors such as IO mixture and device
  89 * internal garbage collection.  The controller needs to adapt dynamically.
  90 *
  91 * This is achieved by adjusting the overall IO rate according to how busy
  92 * the device is.  If the device becomes overloaded, we're sending down too
  93 * many IOs and should generally slow down.  If there are waiting issuers
  94 * but the device isn't saturated, we're issuing too few and should
  95 * generally speed up.
  96 *
  97 * To slow down, we lower the vrate - the rate at which the device vtime
  98 * passes compared to the wall clock.  For example, if the vtime is running
  99 * at the vrate of 75%, all cgroups added up would only be able to issue
 100 * 750ms worth of IOs per second, and vice-versa for speeding up.
 101 *
 102 * Device busyness is determined using two criteria - rq wait and
 103 * completion latencies.
 104 *
 105 * When a device gets saturated, the on-device and then the request queues
 106 * fill up and a bio which is ready to be issued has to wait for a request
 107 * to become available.  When this delay becomes noticeable, it's a clear
 108 * indication that the device is saturated and we lower the vrate.  This
 109 * saturation signal is fairly conservative as it only triggers when both
 110 * hardware and software queues are filled up, and is used as the default
 111 * busy signal.
 112 *
 113 * As devices can have deep queues and be unfair in how the queued commands
 114 * are executed, solely depending on rq wait may not result in satisfactory
 115 * control quality.  For a better control quality, completion latency QoS
 116 * parameters can be configured so that the device is considered saturated
 117 * if N'th percentile completion latency rises above the set point.
 118 *
 119 * The completion latency requirements are a function of both the
 120 * underlying device characteristics and the desired IO latency quality of
 121 * service.  There is an inherent trade-off - the tighter the latency QoS,
 122 * the higher the bandwidth lossage.  Latency QoS is disabled by default
 123 * and can be set through /sys/fs/cgroup/io.cost.qos.
 124 *
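     * For example, writing something along the lines of
     *
     *   8:16 enable=1 ctrl=user rpct=95.00 rlat=25000 wpct=95.00 wlat=25000
     *
     * to io.cost.qos would, for device 8:16, enable the controller and
     * treat it as saturated once the p95 read/write completion latencies
     * exceed 25ms; the keys correspond to the QOS_* parameters below.
     *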
 125 * 2-3. Work Conservation
 126 *
 127 * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
 128 * periodically while B is sending out enough parallel IOs to saturate the
 129 * device on its own.  Let's say A's usage amounts to 100ms worth of IO
 130 * cost per second, i.e., 10% of the device capacity.  The naive
 131 * distribution of half and half would lead to 60% utilization of the
 132 * device, a significant reduction in the total amount of work done
 133 * compared to free-for-all competition.  This is too high a cost to pay
 134 * for IO control.
 135 *
 136 * To conserve the total amount of work done, we keep track of how much
 137 * each active cgroup is actually using and yield part of its weight if
 138 * there are other cgroups which can make use of it.  In the above case,
 139 * A's weight will be lowered so that it hovers above the actual usage and
 140 * B would be able to use the rest.
 141 *
 142 * As we don't want to penalize a cgroup for donating its weight, the
 143 * surplus weight adjustment factors in a margin and has an immediate
 144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
 145 *
 146 * Note that adjusting down surplus weights has the same effect as
 147 * accelerating vtime for other cgroups, and work conservation could also
 148 * be implemented by adjusting vrate dynamically.  However, working out
 149 * who can donate and who should take back how much requires hweight
 150 * propagation anyway, which makes it easier to implement and understand
 151 * as a separate mechanism.
 152 *
 153 * 3. Monitoring
 154 *
 155 * Instead of debugfs or other clumsy monitoring mechanisms, this
 156 * controller uses a drgn based monitoring script -
 157 * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 158 * https://github.com/osandov/drgn.  The output looks like the following.
 159 *
 160 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 161 *                 active      weight      hweight% inflt% dbt  delay usages%
 162 *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 163 *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 164 *
 165 * - per        : Timer period
 166 * - cur_per    : Internal wall and device vtime clock
 167 * - vrate      : Device virtual time rate against wall clock
 168 * - weight     : Surplus-adjusted and configured weights
 169 * - hweight    : Surplus-adjusted and configured hierarchical weights
 170 * - inflt      : The percentage of in-flight IO cost at the end of last period
 171 * - del_ms     : Deferred issuer delay induction level and duration
 172 * - usages     : Usage history
 173 */
 174
 175#include <linux/kernel.h>
 176#include <linux/module.h>
 177#include <linux/timer.h>
 178#include <linux/time64.h>
 179#include <linux/parser.h>
 180#include <linux/sched/signal.h>
 181#include <linux/blk-cgroup.h>
 182#include "blk-rq-qos.h"
 183#include "blk-stat.h"
 184#include "blk-wbt.h"
 185#include "blk.h"
 186
 187#ifdef CONFIG_TRACEPOINTS
 188
 189/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
 190#define TRACE_IOCG_PATH_LEN 1024
 191static DEFINE_SPINLOCK(trace_iocg_path_lock);
 192static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
 193
 194#define TRACE_IOCG_PATH(type, iocg, ...)                                        \
 195        do {                                                                    \
 196                unsigned long flags;                                            \
 197                if (trace_iocost_##type##_enabled()) {                          \
 198                        spin_lock_irqsave(&trace_iocg_path_lock, flags);        \
 199                        cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,      \
 200                                    trace_iocg_path, TRACE_IOCG_PATH_LEN);      \
 201                        trace_iocost_##type(iocg, trace_iocg_path,              \
 202                                              ##__VA_ARGS__);                   \
 203                        spin_unlock_irqrestore(&trace_iocg_path_lock, flags);   \
 204                }                                                               \
 205        } while (0)
 206
 207#else   /* CONFIG_TRACEPOINTS */
 208#define TRACE_IOCG_PATH(type, iocg, ...)        do { } while (0)
 209#endif  /* CONFIG_TRACEPOINTS */
 210
 211enum {
 212        MILLION                 = 1000000,
 213
 214        /* timer period is calculated from latency requirements, bound it */
 215        MIN_PERIOD              = USEC_PER_MSEC,
 216        MAX_PERIOD              = USEC_PER_SEC,
 217
 218        /*
 219         * A cgroup's vtime can run 50% behind the device vtime, which
 220         * serves as its IO credit buffer.  Surplus weight adjustment is
 221         * immediately canceled if the vtime margin runs below 10%.
 222         */
 223        MARGIN_PCT              = 50,
 224        INUSE_MARGIN_PCT        = 10,
 225
 226        /* Have some play in waitq timer operations */
 227        WAITQ_TIMER_MARGIN_PCT  = 5,
 228
 229        /*
 230         * vtime can wrap well within a reasonable uptime when vrate is
 231         * consistently raised.  Don't trust recorded cgroup vtime if the
 232         * period counter indicates that it's older than 5mins.
 233         */
 234        VTIME_VALID_DUR         = 300 * USEC_PER_SEC,
 235
 236        /*
 237         * Remember the past three non-zero usages and use the max for
 238         * surplus calculation.  Three slots guarantee that we remember one
 239         * full period usage from the last active stretch even after
 240         * partial deactivation and re-activation periods.  Don't start
 241         * giving away weight before collecting two data points to prevent
 242         * hweight adjustments based on one partial activation period.
 243         */
 244        NR_USAGE_SLOTS          = 3,
 245        MIN_VALID_USAGES        = 2,
 246
 247        /* 1/64k is granular enough and can easily be handled w/ u32 */
 248        HWEIGHT_WHOLE           = 1 << 16,
 249
 250        /*
 251         * As vtime is used to calculate the cost of each IO, it needs to
 252         * be fairly high precision.  For example, it should be able to
 253         * represent the cost of a single page worth of discard with
 254         * sufficient accuracy.  At the same time, it should be able to
 255         * represent reasonably long enough durations to be useful and
 256         * convenient during operation.
 257         *
 258         * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
 259         * granularity and days of wrap-around time even at extreme vrates.
 260         */
 261        VTIME_PER_SEC_SHIFT     = 37,
 262        VTIME_PER_SEC           = 1LLU << VTIME_PER_SEC_SHIFT,
 263        VTIME_PER_USEC          = VTIME_PER_SEC / USEC_PER_SEC,
 264
 265        /* bound vrate adjustments within two orders of magnitude */
 266        VRATE_MIN_PPM           = 10000,        /* 1% */
 267        VRATE_MAX_PPM           = 100000000,    /* 10000% */
 268
 269        VRATE_MIN               = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
 270        VRATE_CLAMP_ADJ_PCT     = 4,
 271
 272        /* if IOs end up waiting for requests, issue less */
 273        RQ_WAIT_BUSY_PCT        = 5,
 274
 275        /* unbusy hysteresis */
 276        UNBUSY_THR_PCT          = 75,
 277
 278        /* don't let cmds which take a very long time pin lagging for too long */
 279        MAX_LAGGING_PERIODS     = 10,
 280
 281        /*
 282         * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
 283         * donate the surplus.
 284         */
 285        SURPLUS_SCALE_PCT       = 125,                  /* * 125% */
 286        SURPLUS_SCALE_ABS       = HWEIGHT_WHOLE / 50,   /* + 2% */
 287        SURPLUS_MIN_ADJ_DELTA   = HWEIGHT_WHOLE / 33,   /* 3% */
 288
 289        /* switch iff the conditions are met for longer than this */
 290        AUTOP_CYCLE_NSEC        = 10LLU * NSEC_PER_SEC,
 291
 292        /*
 293         * Count IO size in 4k pages.  The 12bit shift helps keep the
 294         * size-proportional components of the cost calculation within a
 295         * similar number of digits to the per-IO cost components.
 296         */
 297        IOC_PAGE_SHIFT          = 12,
 298        IOC_PAGE_SIZE           = 1 << IOC_PAGE_SHIFT,
 299        IOC_SECT_TO_PAGE_SHIFT  = IOC_PAGE_SHIFT - SECTOR_SHIFT,
 300
 301        /* if more than 16M apart, consider it randio for the linear model */
 302        LCOEF_RANDIO_PAGES      = 4096,
 303};
 304
 305enum ioc_running {
 306        IOC_IDLE,
 307        IOC_RUNNING,
 308        IOC_STOP,
 309};
 310
 311/* io.cost.qos controls including per-dev enable of the whole controller */
 312enum {
 313        QOS_ENABLE,
 314        QOS_CTRL,
 315        NR_QOS_CTRL_PARAMS,
 316};
 317
 318/* io.cost.qos params */
 319enum {
 320        QOS_RPPM,
 321        QOS_RLAT,
 322        QOS_WPPM,
 323        QOS_WLAT,
 324        QOS_MIN,
 325        QOS_MAX,
 326        NR_QOS_PARAMS,
 327};
 328
 329/* io.cost.model controls */
 330enum {
 331        COST_CTRL,
 332        COST_MODEL,
 333        NR_COST_CTRL_PARAMS,
 334};
 335
 336/* builtin linear cost model coefficients */
 337enum {
 338        I_LCOEF_RBPS,
 339        I_LCOEF_RSEQIOPS,
 340        I_LCOEF_RRANDIOPS,
 341        I_LCOEF_WBPS,
 342        I_LCOEF_WSEQIOPS,
 343        I_LCOEF_WRANDIOPS,
 344        NR_I_LCOEFS,
 345};
 346
 347enum {
 348        LCOEF_RPAGE,
 349        LCOEF_RSEQIO,
 350        LCOEF_RRANDIO,
 351        LCOEF_WPAGE,
 352        LCOEF_WSEQIO,
 353        LCOEF_WRANDIO,
 354        NR_LCOEFS,
 355};
 356
 357enum {
 358        AUTOP_INVALID,
 359        AUTOP_HDD,
 360        AUTOP_SSD_QD1,
 361        AUTOP_SSD_DFL,
 362        AUTOP_SSD_FAST,
 363};
 364
 365struct ioc_gq;
 366
 367struct ioc_params {
 368        u32                             qos[NR_QOS_PARAMS];
 369        u64                             i_lcoefs[NR_I_LCOEFS];
 370        u64                             lcoefs[NR_LCOEFS];
 371        u32                             too_fast_vrate_pct;
 372        u32                             too_slow_vrate_pct;
 373};
 374
 375struct ioc_missed {
 376        u32                             nr_met;
 377        u32                             nr_missed;
 378        u32                             last_met;
 379        u32                             last_missed;
 380};
 381
 382struct ioc_pcpu_stat {
 383        struct ioc_missed               missed[2];
 384
 385        u64                             rq_wait_ns;
 386        u64                             last_rq_wait_ns;
 387};
 388
 389/* per device */
 390struct ioc {
 391        struct rq_qos                   rqos;
 392
 393        bool                            enabled;
 394
 395        struct ioc_params               params;
 396        u32                             period_us;
 397        u32                             margin_us;
 398        u64                             vrate_min;
 399        u64                             vrate_max;
 400
 401        spinlock_t                      lock;
 402        struct timer_list               timer;
 403        struct list_head                active_iocgs;   /* active cgroups */
 404        struct ioc_pcpu_stat __percpu   *pcpu_stat;
 405
 406        enum ioc_running                running;
 407        atomic64_t                      vtime_rate;
 408
 409        seqcount_t                      period_seqcount;
 410        u32                             period_at;      /* wallclock starttime */
 411        u64                             period_at_vtime; /* vtime starttime */
 412
 413        atomic64_t                      cur_period;     /* inc'd each period */
 414        int                             busy_level;     /* saturation history */
 415
 416        u64                             inuse_margin_vtime;
 417        bool                            weights_updated;
 418        atomic_t                        hweight_gen;    /* for lazy hweights */
 419
 420        u64                             autop_too_fast_at;
 421        u64                             autop_too_slow_at;
 422        int                             autop_idx;
 423        bool                            user_qos_params:1;
 424        bool                            user_cost_model:1;
 425};
 426
 427/* per device-cgroup pair */
 428struct ioc_gq {
 429        struct blkg_policy_data         pd;
 430        struct ioc                      *ioc;
 431
 432        /*
 433         * An iocg can get its weight from two sources - an explicit
 434         * per-device-cgroup configuration or the default weight of the
 435         * cgroup.  `cfg_weight` is the explicit per-device-cgroup
 436         * configuration.  `weight` is the effective weight considering both
 437         * sources.
 438         *
 439         * When an idle cgroup becomes active its `active` goes from 0 to
 440         * `weight`.  `inuse` is the surplus adjusted active weight.
 441         * `active` and `inuse` are used to calculate `hweight_active` and
 442         * `hweight_inuse`.
 443         *
 444         * `last_inuse` remembers `inuse` while an iocg is idle to persist
 445         * surplus adjustments.
 446         */
 447        u32                             cfg_weight;
 448        u32                             weight;
 449        u32                             active;
 450        u32                             inuse;
 451        u32                             last_inuse;
 452
 453        sector_t                        cursor;         /* to detect randio */
 454
 455        /*
 456         * `vtime` is this iocg's vtime cursor which progresses as IOs are
 457         * issued.  If lagging behind device vtime, the delta represents
 458         * the currently available IO budget.  If running ahead, the
 459         * overage.
 460         *
 461         * `vtime_done` is the same but progressed on completion rather
 462         * than issue.  The delta behind `vtime` represents the cost of
 463         * currently in-flight IOs.
 464         *
 465         * `last_vtime` is used to remember `vtime` at the end of the last
 466         * period to calculate utilization.
 467         */
 468        atomic64_t                      vtime;
 469        atomic64_t                      done_vtime;
 470        u64                             abs_vdebt;
 471        u64                             last_vtime;
 472
 473        /*
 474         * The period this iocg was last active in.  Used for deactivation
 475         * and invalidating `vtime`.
 476         */
 477        atomic64_t                      active_period;
 478        struct list_head                active_list;
 479
 480        /* see __propagate_active_weight() and current_hweight() for details */
 481        u64                             child_active_sum;
 482        u64                             child_inuse_sum;
 483        int                             hweight_gen;
 484        u32                             hweight_active;
 485        u32                             hweight_inuse;
 486        bool                            has_surplus;
 487
 488        struct wait_queue_head          waitq;
 489        struct hrtimer                  waitq_timer;
 490        struct hrtimer                  delay_timer;
 491
 492        /* usage is recorded as fractions of HWEIGHT_WHOLE */
 493        int                             usage_idx;
 494        u32                             usages[NR_USAGE_SLOTS];
 495
 496        /* this iocg's depth in the hierarchy and ancestors including self */
 497        int                             level;
 498        struct ioc_gq                   *ancestors[];
 499};
 500
 501/* per cgroup */
 502struct ioc_cgrp {
 503        struct blkcg_policy_data        cpd;
 504        unsigned int                    dfl_weight;
 505};
 506
 507struct ioc_now {
 508        u64                             now_ns;
 509        u32                             now;
 510        u64                             vnow;
 511        u64                             vrate;
 512};
 513
 514struct iocg_wait {
 515        struct wait_queue_entry         wait;
 516        struct bio                      *bio;
 517        u64                             abs_cost;
 518        bool                            committed;
 519};
 520
 521struct iocg_wake_ctx {
 522        struct ioc_gq                   *iocg;
 523        u32                             hw_inuse;
 524        s64                             vbudget;
 525};
 526
 527static const struct ioc_params autop[] = {
 528        [AUTOP_HDD] = {
 529                .qos                            = {
 530                        [QOS_RLAT]              =        250000, /* 250ms */
 531                        [QOS_WLAT]              =        250000,
 532                        [QOS_MIN]               = VRATE_MIN_PPM,
 533                        [QOS_MAX]               = VRATE_MAX_PPM,
 534                },
 535                .i_lcoefs                       = {
 536                        [I_LCOEF_RBPS]          =     174019176,
 537                        [I_LCOEF_RSEQIOPS]      =         41708,
 538                        [I_LCOEF_RRANDIOPS]     =           370,
 539                        [I_LCOEF_WBPS]          =     178075866,
 540                        [I_LCOEF_WSEQIOPS]      =         42705,
 541                        [I_LCOEF_WRANDIOPS]     =           378,
 542                },
 543        },
 544        [AUTOP_SSD_QD1] = {
 545                .qos                            = {
 546                        [QOS_RLAT]              =         25000, /* 25ms */
 547                        [QOS_WLAT]              =         25000,
 548                        [QOS_MIN]               = VRATE_MIN_PPM,
 549                        [QOS_MAX]               = VRATE_MAX_PPM,
 550                },
 551                .i_lcoefs                       = {
 552                        [I_LCOEF_RBPS]          =     245855193,
 553                        [I_LCOEF_RSEQIOPS]      =         61575,
 554                        [I_LCOEF_RRANDIOPS]     =          6946,
 555                        [I_LCOEF_WBPS]          =     141365009,
 556                        [I_LCOEF_WSEQIOPS]      =         33716,
 557                        [I_LCOEF_WRANDIOPS]     =         26796,
 558                },
 559        },
 560        [AUTOP_SSD_DFL] = {
 561                .qos                            = {
 562                        [QOS_RLAT]              =         25000, /* 25ms */
 563                        [QOS_WLAT]              =         25000,
 564                        [QOS_MIN]               = VRATE_MIN_PPM,
 565                        [QOS_MAX]               = VRATE_MAX_PPM,
 566                },
 567                .i_lcoefs                       = {
 568                        [I_LCOEF_RBPS]          =     488636629,
 569                        [I_LCOEF_RSEQIOPS]      =          8932,
 570                        [I_LCOEF_RRANDIOPS]     =          8518,
 571                        [I_LCOEF_WBPS]          =     427891549,
 572                        [I_LCOEF_WSEQIOPS]      =         28755,
 573                        [I_LCOEF_WRANDIOPS]     =         21940,
 574                },
 575                .too_fast_vrate_pct             =           500,
 576        },
 577        [AUTOP_SSD_FAST] = {
 578                .qos                            = {
 579                        [QOS_RLAT]              =          5000, /* 5ms */
 580                        [QOS_WLAT]              =          5000,
 581                        [QOS_MIN]               = VRATE_MIN_PPM,
 582                        [QOS_MAX]               = VRATE_MAX_PPM,
 583                },
 584                .i_lcoefs                       = {
 585                        [I_LCOEF_RBPS]          =    3102524156LLU,
 586                        [I_LCOEF_RSEQIOPS]      =        724816,
 587                        [I_LCOEF_RRANDIOPS]     =        778122,
 588                        [I_LCOEF_WBPS]          =    1742780862LLU,
 589                        [I_LCOEF_WSEQIOPS]      =        425702,
 590                        [I_LCOEF_WRANDIOPS]     =        443193,
 591                },
 592                .too_slow_vrate_pct             =            10,
 593        },
 594};
 595
 596/*
 597 * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 598 * vtime credit shortage and down on device saturation.
 599 */
 600static u32 vrate_adj_pct[] =
 601        { 0, 0, 0, 0,
 602          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 603          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 604          4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
 605
 606static struct blkcg_policy blkcg_policy_iocost;
 607
 608/* accessors and helpers */
 609static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
 610{
 611        return container_of(rqos, struct ioc, rqos);
 612}
 613
 614static struct ioc *q_to_ioc(struct request_queue *q)
 615{
 616        return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
 617}
 618
 619static const char *q_name(struct request_queue *q)
 620{
 621        if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
 622                return kobject_name(q->kobj.parent);
 623        else
 624                return "<unknown>";
 625}
 626
 627static const char __maybe_unused *ioc_name(struct ioc *ioc)
 628{
 629        return q_name(ioc->rqos.q);
 630}
 631
 632static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
 633{
 634        return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
 635}
 636
 637static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
 638{
 639        return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
 640}
 641
 642static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
 643{
 644        return pd_to_blkg(&iocg->pd);
 645}
 646
 647static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
 648{
 649        return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
 650                            struct ioc_cgrp, cpd);
 651}
 652
 653/*
 654 * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 655 * weight, the more expensive each IO.  Must round up.
 656 */
 657static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
 658{
 659        return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
 660}
 661
 662/*
 663 * The inverse of abs_cost_to_cost().  Must round up.
 664 */
 665static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
 666{
 667        return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
 668}
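
    /*
     * E.g. with hw_inuse at 25% of HWEIGHT_WHOLE, abs_cost_to_cost()
     * inflates an absolute cost of 100 into a vtime cost of 400, and
     * cost_to_abs_cost(400, 25%) maps it back to 100.
     */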
 669
 670static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
 671{
 672        bio->bi_iocost_cost = cost;
 673        atomic64_add(cost, &iocg->vtime);
 674}
 675
 676#define CREATE_TRACE_POINTS
 677#include <trace/events/iocost.h>
 678
 679/* latency QoS params changed, update period_us and all the dependent params */
 680static void ioc_refresh_period_us(struct ioc *ioc)
 681{
 682        u32 ppm, lat, multi, period_us;
 683
 684        lockdep_assert_held(&ioc->lock);
 685
 686        /* pick the higher latency target */
 687        if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
 688                ppm = ioc->params.qos[QOS_RPPM];
 689                lat = ioc->params.qos[QOS_RLAT];
 690        } else {
 691                ppm = ioc->params.qos[QOS_WPPM];
 692                lat = ioc->params.qos[QOS_WLAT];
 693        }
 694
 695        /*
 696         * We want the period to be long enough to contain a healthy number
 697         * of IOs while short enough for granular control.  Define it as a
 698         * multiple of the latency target.  Ideally, the multiplier should
 699         * be scaled according to the percentile so that it would nominally
 700         * contain a certain number of requests.  Let's be simpler and
 701         * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
 702         */
 703        if (ppm)
 704                multi = max_t(u32, (MILLION - ppm) / 50000, 2);
 705        else
 706                multi = 2;
 707        period_us = multi * lat;
 708        period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
 709
 710        /* calculate dependent params */
 711        ioc->period_us = period_us;
 712        ioc->margin_us = period_us * MARGIN_PCT / 100;
 713        ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
 714                        period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
 715}
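
    /*
     * For example, with a 25ms read latency target at the 95th percentile
     * (ppm = 950000), multi = max((1000000 - 950000) / 50000, 2) = 2, so
     * period_us = 50000 and, with MARGIN_PCT, margin_us = 25000.
     */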
 716
 717static int ioc_autop_idx(struct ioc *ioc)
 718{
 719        int idx = ioc->autop_idx;
 720        const struct ioc_params *p = &autop[idx];
 721        u32 vrate_pct;
 722        u64 now_ns;
 723
 724        /* rotational? */
 725        if (!blk_queue_nonrot(ioc->rqos.q))
 726                return AUTOP_HDD;
 727
 728        /* handle SATA SSDs w/ broken NCQ */
 729        if (blk_queue_depth(ioc->rqos.q) == 1)
 730                return AUTOP_SSD_QD1;
 731
 732        /* use one of the normal ssd sets */
 733        if (idx < AUTOP_SSD_DFL)
 734                return AUTOP_SSD_DFL;
 735
 736        /* if user is overriding anything, maintain what was there */
 737        if (ioc->user_qos_params || ioc->user_cost_model)
 738                return idx;
 739
 740        /* step up/down based on the vrate */
 741        vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
 742                              VTIME_PER_USEC);
 743        now_ns = ktime_get_ns();
 744
 745        if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
 746                if (!ioc->autop_too_fast_at)
 747                        ioc->autop_too_fast_at = now_ns;
 748                if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
 749                        return idx + 1;
 750        } else {
 751                ioc->autop_too_fast_at = 0;
 752        }
 753
 754        if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
 755                if (!ioc->autop_too_slow_at)
 756                        ioc->autop_too_slow_at = now_ns;
 757                if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
 758                        return idx - 1;
 759        } else {
 760                ioc->autop_too_slow_at = 0;
 761        }
 762
 763        return idx;
 764}
 765
 766/*
 767 * Take the following as input
 768 *
 769 *  @bps        maximum sequential throughput
 770 *  @seqiops    maximum sequential 4k iops
 771 *  @randiops   maximum random 4k iops
 772 *
 773 * and calculate the linear model cost coefficients.
 774 *
 775 *  *@page      per-page cost           1s / (@bps / 4096)
 776 *  *@seqio     base cost of a seq IO   max((1s / @seqiops) - *@page, 0)
 777 *  *@randio    base cost of a rand IO  max((1s / @randiops) - *@page, 0)
 778 */
 779static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
 780                        u64 *page, u64 *seqio, u64 *randio)
 781{
 782        u64 v;
 783
 784        *page = *seqio = *randio = 0;
 785
 786        if (bps)
 787                *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
 788                                           DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
 789
 790        if (seqiops) {
 791                v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
 792                if (v > *page)
 793                        *seqio = v - *page;
 794        }
 795
 796        if (randiops) {
 797                v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
 798                if (v > *page)
 799                        *randio = v - *page;
 800        }
 801}
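
    /*
     * E.g. bps = 400 MB/s yields a per-page cost of roughly 1s / (400MB /
     * 4096) ~= 10.2us worth of vtime, and seqiops = 10000 then yields
     * *seqio ~= 100us - 10.2us ~= 90us per sequential IO.
     */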
 802
 803static void ioc_refresh_lcoefs(struct ioc *ioc)
 804{
 805        u64 *u = ioc->params.i_lcoefs;
 806        u64 *c = ioc->params.lcoefs;
 807
 808        calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
 809                    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
 810        calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
 811                    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
 812}
 813
 814static bool ioc_refresh_params(struct ioc *ioc, bool force)
 815{
 816        const struct ioc_params *p;
 817        int idx;
 818
 819        lockdep_assert_held(&ioc->lock);
 820
 821        idx = ioc_autop_idx(ioc);
 822        p = &autop[idx];
 823
 824        if (idx == ioc->autop_idx && !force)
 825                return false;
 826
 827        if (idx != ioc->autop_idx)
 828                atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
 829
 830        ioc->autop_idx = idx;
 831        ioc->autop_too_fast_at = 0;
 832        ioc->autop_too_slow_at = 0;
 833
 834        if (!ioc->user_qos_params)
 835                memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
 836        if (!ioc->user_cost_model)
 837                memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
 838
 839        ioc_refresh_period_us(ioc);
 840        ioc_refresh_lcoefs(ioc);
 841
 842        ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
 843                                            VTIME_PER_USEC, MILLION);
 844        ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
 845                                   VTIME_PER_USEC, MILLION);
 846
 847        return true;
 848}
 849
 850/* take a snapshot of the current [v]time and vrate */
 851static void ioc_now(struct ioc *ioc, struct ioc_now *now)
 852{
 853        unsigned seq;
 854
 855        now->now_ns = ktime_get();
 856        now->now = ktime_to_us(now->now_ns);
 857        now->vrate = atomic64_read(&ioc->vtime_rate);
 858
 859        /*
 860         * The current vtime is
 861         *
 862         *   vtime at period start + (wallclock time since the start) * vrate
 863         *
 864         * As a consistent snapshot of `period_at_vtime` and `period_at` is
 865         * needed, they're seqcount protected.
 866         */
 867        do {
 868                seq = read_seqcount_begin(&ioc->period_seqcount);
 869                now->vnow = ioc->period_at_vtime +
 870                        (now->now - ioc->period_at) * now->vrate;
 871        } while (read_seqcount_retry(&ioc->period_seqcount, seq));
 872}
 873
 874static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
 875{
 876        lockdep_assert_held(&ioc->lock);
 877        WARN_ON_ONCE(ioc->running != IOC_RUNNING);
 878
 879        write_seqcount_begin(&ioc->period_seqcount);
 880        ioc->period_at = now->now;
 881        ioc->period_at_vtime = now->vnow;
 882        write_seqcount_end(&ioc->period_seqcount);
 883
 884        ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
 885        add_timer(&ioc->timer);
 886}
 887
 888/*
 889 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
 890 * weight sums and propagate upwards accordingly.
 891 */
 892static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
 893{
 894        struct ioc *ioc = iocg->ioc;
 895        int lvl;
 896
 897        lockdep_assert_held(&ioc->lock);
 898
 899        inuse = min(active, inuse);
 900
 901        for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
 902                struct ioc_gq *parent = iocg->ancestors[lvl];
 903                struct ioc_gq *child = iocg->ancestors[lvl + 1];
 904                u32 parent_active = 0, parent_inuse = 0;
 905
 906                /* update the level sums */
 907                parent->child_active_sum += (s32)(active - child->active);
 908                parent->child_inuse_sum += (s32)(inuse - child->inuse);
 909                /* apply the updates */
 910                child->active = active;
 911                child->inuse = inuse;
 912
 913                /*
 914                 * The delta between the inuse and active sums indicates how
 915                 * much of the weight is being given away.  Parent's inuse
 916                 * and active should reflect the ratio.
 917                 */
 918                if (parent->child_active_sum) {
 919                        parent_active = parent->weight;
 920                        parent_inuse = DIV64_U64_ROUND_UP(
 921                                parent_active * parent->child_inuse_sum,
 922                                parent->child_active_sum);
 923                }
 924
 925                /* do we need to keep walking up? */
 926                if (parent_active == parent->active &&
 927                    parent_inuse == parent->inuse)
 928                        break;
 929
 930                active = parent_active;
 931                inuse = parent_inuse;
 932        }
 933
 934        ioc->weights_updated = true;
 935}
 936
 937static void commit_active_weights(struct ioc *ioc)
 938{
 939        lockdep_assert_held(&ioc->lock);
 940
 941        if (ioc->weights_updated) {
 942                /* paired with rmb in current_hweight(), see there */
 943                smp_wmb();
 944                atomic_inc(&ioc->hweight_gen);
 945                ioc->weights_updated = false;
 946        }
 947}
 948
 949static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
 950{
 951        __propagate_active_weight(iocg, active, inuse);
 952        commit_active_weights(iocg->ioc);
 953}
 954
 955static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
 956{
 957        struct ioc *ioc = iocg->ioc;
 958        int lvl;
 959        u32 hwa, hwi;
 960        int ioc_gen;
 961
 962        /* hot path - if uptodate, use cached */
 963        ioc_gen = atomic_read(&ioc->hweight_gen);
 964        if (ioc_gen == iocg->hweight_gen)
 965                goto out;
 966
 967        /*
 968         * Paired with wmb in commit_active_weights().  If we saw the
 969         * updated hweight_gen, all the weight updates from
 970         * __propagate_active_weight() are visible too.
 971         *
 972         * We can race with weight updates during calculation and get it
 973         * wrong.  However, hweight_gen would have changed and a future
 974         * reader will recalculate and we're guaranteed to discard the
 975         * wrong result soon.
 976         */
 977        smp_rmb();
 978
 979        hwa = hwi = HWEIGHT_WHOLE;
 980        for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
 981                struct ioc_gq *parent = iocg->ancestors[lvl];
 982                struct ioc_gq *child = iocg->ancestors[lvl + 1];
 983                u32 active_sum = READ_ONCE(parent->child_active_sum);
 984                u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
 985                u32 active = READ_ONCE(child->active);
 986                u32 inuse = READ_ONCE(child->inuse);
 987
 988                /* we can race with deactivations and either may read as zero */
 989                if (!active_sum || !inuse_sum)
 990                        continue;
 991
 992                active_sum = max(active, active_sum);
 993                hwa = hwa * active / active_sum;        /* max 16bits * 10000 */
 994
 995                inuse_sum = max(inuse, inuse_sum);
 996                hwi = hwi * inuse / inuse_sum;          /* max 16bits * 10000 */
 997        }
 998
 999        iocg->hweight_active = max_t(u32, hwa, 1);
1000        iocg->hweight_inuse = max_t(u32, hwi, 1);
1001        iocg->hweight_gen = ioc_gen;
1002out:
1003        if (hw_activep)
1004                *hw_activep = iocg->hweight_active;
1005        if (hw_inusep)
1006                *hw_inusep = iocg->hweight_inuse;
1007}
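
    /*
     * Using the hierarchy from the comment at the top of the file: for A0,
     * level 0 scales hwa by 100 / (100 + 300) and level 1 by
     * 100 / (100 + 100), leaving HWEIGHT_WHOLE / 8, i.e. the 12.5%
     * hierarchical share.
     */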
1008
1009static void weight_updated(struct ioc_gq *iocg)
1010{
1011        struct ioc *ioc = iocg->ioc;
1012        struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1013        struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1014        u32 weight;
1015
1016        lockdep_assert_held(&ioc->lock);
1017
1018        weight = iocg->cfg_weight ?: iocc->dfl_weight;
1019        if (weight != iocg->weight && iocg->active)
1020                propagate_active_weight(iocg, weight,
1021                        DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1022        iocg->weight = weight;
1023}
1024
1025static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1026{
1027        struct ioc *ioc = iocg->ioc;
1028        u64 last_period, cur_period, max_period_delta;
1029        u64 vtime, vmargin, vmin;
1030        int i;
1031
1032        /*
1033         * If we seem to be already active, just update the stamp to tell the
1034         * timer that we're still active.  We don't mind occasional races.
1035         */
1036        if (!list_empty(&iocg->active_list)) {
1037                ioc_now(ioc, now);
1038                cur_period = atomic64_read(&ioc->cur_period);
1039                if (atomic64_read(&iocg->active_period) != cur_period)
1040                        atomic64_set(&iocg->active_period, cur_period);
1041                return true;
1042        }
1043
1044        /* racy check on internal node IOs, treat as root level IOs */
1045        if (iocg->child_active_sum)
1046                return false;
1047
1048        spin_lock_irq(&ioc->lock);
1049
1050        ioc_now(ioc, now);
1051
1052        /* update period */
1053        cur_period = atomic64_read(&ioc->cur_period);
1054        last_period = atomic64_read(&iocg->active_period);
1055        atomic64_set(&iocg->active_period, cur_period);
1056
1057        /* already activated or breaking leaf-only constraint? */
1058        if (!list_empty(&iocg->active_list))
1059                goto succeed_unlock;
1060        for (i = iocg->level - 1; i > 0; i--)
1061                if (!list_empty(&iocg->ancestors[i]->active_list))
1062                        goto fail_unlock;
1063
1064        if (iocg->child_active_sum)
1065                goto fail_unlock;
1066
1067        /*
1068         * vtime may wrap when vrate is raised substantially due to
1069         * underestimated IO costs.  Look at the period and ignore its
1070         * vtime if the iocg has been idle for too long.  Also, cap the
1071         * budget it can start with to the margin.
1072         */
1073        max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1074        vtime = atomic64_read(&iocg->vtime);
1075        vmargin = ioc->margin_us * now->vrate;
1076        vmin = now->vnow - vmargin;
1077
1078        if (last_period + max_period_delta < cur_period ||
1079            time_before64(vtime, vmin)) {
1080                atomic64_add(vmin - vtime, &iocg->vtime);
1081                atomic64_add(vmin - vtime, &iocg->done_vtime);
1082                vtime = vmin;
1083        }
1084
1085        /*
1086         * Activate, propagate weight and start period timer if not
1087         * running.  Reset hweight_gen to avoid accidental match from
1088         * wrapping.
1089         */
1090        iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1091        list_add(&iocg->active_list, &ioc->active_iocgs);
1092        propagate_active_weight(iocg, iocg->weight,
1093                                iocg->last_inuse ?: iocg->weight);
1094
1095        TRACE_IOCG_PATH(iocg_activate, iocg, now,
1096                        last_period, cur_period, vtime);
1097
1098        iocg->last_vtime = vtime;
1099
1100        if (ioc->running == IOC_IDLE) {
1101                ioc->running = IOC_RUNNING;
1102                ioc_start_period(ioc, now);
1103        }
1104
1105succeed_unlock:
1106        spin_unlock_irq(&ioc->lock);
1107        return true;
1108
1109fail_unlock:
1110        spin_unlock_irq(&ioc->lock);
1111        return false;
1112}
1113
1114static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1115                        int flags, void *key)
1116{
1117        struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1118        struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1119        u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1120
1121        ctx->vbudget -= cost;
1122
1123        if (ctx->vbudget < 0)
1124                return -1;
1125
1126        iocg_commit_bio(ctx->iocg, wait->bio, cost);
1127
1128        /*
1129         * autoremove_wake_function() removes the wait entry only when it
1130         * actually changed the task state.  We want the wait always
1131         * removed.  Remove explicitly and use default_wake_function().
1132         */
1133        list_del_init(&wq_entry->entry);
1134        wait->committed = true;
1135
1136        default_wake_function(wq_entry, mode, flags, key);
1137        return 0;
1138}
1139
1140static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1141{
1142        struct ioc *ioc = iocg->ioc;
1143        struct iocg_wake_ctx ctx = { .iocg = iocg };
1144        u64 margin_ns = (u64)(ioc->period_us *
1145                              WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1146        u64 vdebt, vshortage, expires, oexpires;
1147        s64 vbudget;
1148        u32 hw_inuse;
1149
1150        lockdep_assert_held(&iocg->waitq.lock);
1151
1152        current_hweight(iocg, NULL, &hw_inuse);
1153        vbudget = now->vnow - atomic64_read(&iocg->vtime);
1154
1155        /* pay off debt */
1156        vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1157        if (vdebt && vbudget > 0) {
1158                u64 delta = min_t(u64, vbudget, vdebt);
1159                u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1160                                    iocg->abs_vdebt);
1161
1162                atomic64_add(delta, &iocg->vtime);
1163                atomic64_add(delta, &iocg->done_vtime);
1164                iocg->abs_vdebt -= abs_delta;
1165        }
1166
1167        /*
1168         * Wake up the ones which are due and see how much vtime we'll need
1169         * for the next one.
1170         */
1171        ctx.hw_inuse = hw_inuse;
1172        ctx.vbudget = vbudget - vdebt;
1173        __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1174        if (!waitqueue_active(&iocg->waitq))
1175                return;
1176        if (WARN_ON_ONCE(ctx.vbudget >= 0))
1177                return;
1178
1179        /* determine next wakeup, add a quarter margin to guarantee chunking */
1180        vshortage = -ctx.vbudget;
1181        expires = now->now_ns +
1182                DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1183        expires += margin_ns / 4;
1184
1185        /* if already active and close enough, don't bother */
1186        oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1187        if (hrtimer_is_queued(&iocg->waitq_timer) &&
1188            abs(oexpires - expires) <= margin_ns / 4)
1189                return;
1190
1191        hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1192                               margin_ns / 4, HRTIMER_MODE_ABS);
1193}
1194
1195static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1196{
1197        struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1198        struct ioc_now now;
1199        unsigned long flags;
1200
1201        ioc_now(iocg->ioc, &now);
1202
1203        spin_lock_irqsave(&iocg->waitq.lock, flags);
1204        iocg_kick_waitq(iocg, &now);
1205        spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1206
1207        return HRTIMER_NORESTART;
1208}
1209
1210static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1211{
1212        struct ioc *ioc = iocg->ioc;
1213        struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1214        u64 vtime = atomic64_read(&iocg->vtime);
1215        u64 vmargin = ioc->margin_us * now->vrate;
1216        u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1217        u64 expires, oexpires;
1218        u32 hw_inuse;
1219
1220        lockdep_assert_held(&iocg->waitq.lock);
1221
1222        /* debt-adjust vtime */
1223        current_hweight(iocg, NULL, &hw_inuse);
1224        vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1225
1226        /*
1227         * Clear or maintain depending on the overage. Non-zero vdebt is what
1228         * guarantees that @iocg is online and future iocg_kick_delay() will
1229         * clear use_delay. Don't leave it on when there's no vdebt.
1230         */
1231        if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1232                blkcg_clear_delay(blkg);
1233                return false;
1234        }
1235        if (!atomic_read(&blkg->use_delay) &&
1236            time_before_eq64(vtime, now->vnow + vmargin))
1237                return false;
1238
1239        /* use delay */
1240        if (cost) {
1241                u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1242                                                 now->vrate);
1243                blkcg_add_delay(blkg, now->now_ns, cost_ns);
1244        }
1245        blkcg_use_delay(blkg);
1246
1247        expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1248                                                   now->vrate) * NSEC_PER_USEC;
1249
1250        /* if already active and close enough, don't bother */
1251        oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1252        if (hrtimer_is_queued(&iocg->delay_timer) &&
1253            abs(oexpires - expires) <= margin_ns / 4)
1254                return true;
1255
1256        hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1257                               margin_ns / 4, HRTIMER_MODE_ABS);
1258        return true;
1259}
1260
1261static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1262{
1263        struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1264        struct ioc_now now;
1265        unsigned long flags;
1266
1267        spin_lock_irqsave(&iocg->waitq.lock, flags);
1268        ioc_now(iocg->ioc, &now);
1269        iocg_kick_delay(iocg, &now, 0);
1270        spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1271
1272        return HRTIMER_NORESTART;
1273}
1274
1275static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1276{
1277        u32 nr_met[2] = { };
1278        u32 nr_missed[2] = { };
1279        u64 rq_wait_ns = 0;
1280        int cpu, rw;
1281
1282        for_each_online_cpu(cpu) {
1283                struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1284                u64 this_rq_wait_ns;
1285
1286                for (rw = READ; rw <= WRITE; rw++) {
1287                        u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1288                        u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1289
1290                        nr_met[rw] += this_met - stat->missed[rw].last_met;
1291                        nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1292                        stat->missed[rw].last_met = this_met;
1293                        stat->missed[rw].last_missed = this_missed;
1294                }
1295
1296                this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1297                rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1298                stat->last_rq_wait_ns = this_rq_wait_ns;
1299        }
1300
1301        for (rw = READ; rw <= WRITE; rw++) {
1302                if (nr_met[rw] + nr_missed[rw])
1303                        missed_ppm_ar[rw] =
1304                                DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1305                                                   nr_met[rw] + nr_missed[rw]);
1306                else
1307                        missed_ppm_ar[rw] = 0;
1308        }
1309
1310        *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1311                                   ioc->period_us * NSEC_PER_USEC);
1312}
1313
1314/* was iocg idle this period? */
1315static bool iocg_is_idle(struct ioc_gq *iocg)
1316{
1317        struct ioc *ioc = iocg->ioc;
1318
1319        /* did something get issued this period? */
1320        if (atomic64_read(&iocg->active_period) ==
1321            atomic64_read(&ioc->cur_period))
1322                return false;
1323
1324        /* is something in flight? */
1325        if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1326                return false;
1327
1328        return true;
1329}
1330
1331/* returns usage with margin added if surplus is large enough */
1332static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1333{
1334        /* add margin */
1335        usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1336        usage += SURPLUS_SCALE_ABS;
1337
1338        /* don't bother if the surplus is too small */
1339        if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1340                return 0;
1341
1342        return usage;
1343}
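
    /*
     * E.g. an iocg using 20% of the device while its hw_inuse is 50%:
     * usage becomes 20% * 1.25 + 2% = 27%, and since 27% + 3% is still
     * below 50%, 27% is returned and the difference can be donated.
     */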
1344
1345static void ioc_timer_fn(struct timer_list *timer)
1346{
1347        struct ioc *ioc = container_of(timer, struct ioc, timer);
1348        struct ioc_gq *iocg, *tiocg;
1349        struct ioc_now now;
1350        int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1351        u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1352        u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1353        u32 missed_ppm[2], rq_wait_pct;
1354        u64 period_vtime;
1355        int prev_busy_level, i;
1356
1357        /* how were the latencies during the period? */
1358        ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1359
1360        /* take care of active iocgs */
1361        spin_lock_irq(&ioc->lock);
1362
1363        ioc_now(ioc, &now);
1364
1365        period_vtime = now.vnow - ioc->period_at_vtime;
1366        if (WARN_ON_ONCE(!period_vtime)) {
1367                spin_unlock_irq(&ioc->lock);
1368                return;
1369        }
1370
1371        /*
1372         * Waiters determine the sleep durations based on the vrate they
1373         * saw at the time of sleep.  If vrate has increased, some waiters
1374         * could be sleeping for too long.  Wake up tardy waiters which
1375         * should have woken up in the last period and expire idle iocgs.
1376         */
1377        list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1378                if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1379                    !iocg_is_idle(iocg))
1380                        continue;
1381
1382                spin_lock(&iocg->waitq.lock);
1383
1384                if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1385                        /* might be oversleeping vtime / hweight changes, kick */
1386                        iocg_kick_waitq(iocg, &now);
1387                        iocg_kick_delay(iocg, &now, 0);
1388                } else if (iocg_is_idle(iocg)) {
1389                        /* no waiter and idle, deactivate */
1390                        iocg->last_inuse = iocg->inuse;
1391                        __propagate_active_weight(iocg, 0, 0);
1392                        list_del_init(&iocg->active_list);
1393                }
1394
1395                spin_unlock(&iocg->waitq.lock);
1396        }
1397        commit_active_weights(ioc);
1398
1399        /* calc usages and see whether some weights need to be moved around */
1400        list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1401                u64 vdone, vtime, vusage, vmargin, vmin;
1402                u32 hw_active, hw_inuse, usage;
1403
1404                /*
1405                 * Collect unused and wind vtime closer to vnow to prevent
1406                 * iocgs from accumulating a large amount of budget.
1407                 */
1408                vdone = atomic64_read(&iocg->done_vtime);
1409                vtime = atomic64_read(&iocg->vtime);
1410                current_hweight(iocg, &hw_active, &hw_inuse);
1411
1412                /*
1413                 * Latency QoS detection doesn't account for IOs which are
1414                 * in-flight for longer than a period.  Detect them by
1415                 * comparing vdone against period start.  If lagging behind
1416                 * IOs from past periods, don't increase vrate.
1417                 */
1418                if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1419                    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1420                    time_after64(vtime, vdone) &&
1421                    time_after64(vtime, now.vnow -
1422                                 MAX_LAGGING_PERIODS * period_vtime) &&
1423                    time_before64(vdone, now.vnow - period_vtime))
1424                        nr_lagging++;
1425
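                /*
                 * Estimate how much vtime the iocg consumed this period:
                 * everything up to vnow if it still has waiters, otherwise
                 * how far vtime advanced past last_vtime, or zero if it
                 * hasn't advanced at all.  last_vtime is then moved forward
                 * by the consumed amount.
                 */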
1426                if (waitqueue_active(&iocg->waitq))
1427                        vusage = now.vnow - iocg->last_vtime;
1428                else if (time_before64(iocg->last_vtime, vtime))
1429                        vusage = vtime - iocg->last_vtime;
1430                else
1431                        vusage = 0;
1432
1433                iocg->last_vtime += vusage;
1434                /*
1435                 * Factor in in-flight vtime into vusage to avoid
1436                 * high-latency completions appearing as idle.  This should
1437                 * be done after the above ->last_vtime adjustment.
1438                 */
1439                vusage = max(vusage, vtime - vdone);
1440
1441                /* calculate hweight based usage ratio and record */
1442                if (vusage) {
1443                        usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1444                                                   period_vtime);
1445                        iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1446                        iocg->usages[iocg->usage_idx] = usage;
1447                } else {
1448                        usage = 0;
1449                }
1450
1451                /* see whether there's surplus vtime */
1452                vmargin = ioc->margin_us * now.vrate;
1453                vmin = now.vnow - vmargin;
1454
1455                iocg->has_surplus = false;
1456
1457                if (!waitqueue_active(&iocg->waitq) &&
1458                    time_before64(vtime, vmin)) {
1459                        u64 delta = vmin - vtime;
1460
1461                        /* throw away surplus vtime */
1462                        atomic64_add(delta, &iocg->vtime);
1463                        atomic64_add(delta, &iocg->done_vtime);
1464                        iocg->last_vtime += delta;
1465                        /* if usage is sufficiently low, maybe it can donate */
1466                        if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1467                                iocg->has_surplus = true;
1468                                nr_surpluses++;
1469                        }
1470                } else if (hw_inuse < hw_active) {
1471                        u32 new_hwi, new_inuse;
1472
1473                        /* was donating but might need to take back some */
1474                        if (waitqueue_active(&iocg->waitq)) {
1475                                new_hwi = hw_active;
1476                        } else {
1477                                new_hwi = max(hw_inuse,
1478                                              usage * SURPLUS_SCALE_PCT / 100 +
1479                                              SURPLUS_SCALE_ABS);
1480                        }
1481
1482                        new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1483                                              hw_inuse);
1484                        new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1485
1486                        if (new_inuse > iocg->inuse) {
1487                                TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1488                                                iocg->inuse, new_inuse,
1489                                                hw_inuse, new_hwi);
1490                                __propagate_active_weight(iocg, iocg->weight,
1491                                                          new_inuse);
1492                        }
1493                } else {
1494                        /* genuinely out of vtime */
1495                        nr_shortages++;
1496                }
1497        }
1498
1499        if (!nr_shortages || !nr_surpluses)
1500                goto skip_surplus_transfers;
1501
1502        /* there are both shortages and surpluses, transfer surpluses */
1503        list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1504                u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1505                int nr_valid = 0;
1506
1507                if (!iocg->has_surplus)
1508                        continue;
1509
1510                /* base the decision on max historical usage */
1511                for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1512                        if (iocg->usages[i]) {
1513                                usage = max(usage, iocg->usages[i]);
1514                                nr_valid++;
1515                        }
1516                }
1517                if (nr_valid < MIN_VALID_USAGES)
1518                        continue;
1519
1520                current_hweight(iocg, &hw_active, &hw_inuse);
1521                new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1522                if (!new_hwi)
1523                        continue;
1524
1525                new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1526                                               hw_inuse);
1527                if (new_inuse < iocg->inuse) {
1528                        TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1529                                        iocg->inuse, new_inuse,
1530                                        hw_inuse, new_hwi);
1531                        __propagate_active_weight(iocg, iocg->weight, new_inuse);
1532                }
1533        }
1534skip_surplus_transfers:
1535        commit_active_weights(ioc);
1536
1537        /*
1538         * If q is getting clogged or we're missing too much, we're issuing
1539         * too much IO and should lower vtime rate.  If we're meeting the
1540         * QoS targets and seeing shortages but no surpluses, we're being
1541         * too stingy and should increase vtime rate.
1542         */
1543        prev_busy_level = ioc->busy_level;
1544        if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1545            missed_ppm[READ] > ppm_rthr ||
1546            missed_ppm[WRITE] > ppm_wthr) {
1547                /* clearly missing QoS targets, slow down vrate */
1548                ioc->busy_level = max(ioc->busy_level, 0);
1549                ioc->busy_level++;
1550        } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1551                   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1552                   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1553                /* QoS targets are being met with >25% margin */
1554                if (nr_shortages) {
1555                        /*
1556                         * We're throttling while the device has spare
1557                         * capacity.  If vrate was being slowed down, stop.
1558                         */
1559                        ioc->busy_level = min(ioc->busy_level, 0);
1560
1561                        /*
1562                         * If there are IOs spanning multiple periods, wait
1563                         * them out before pushing the device harder.  If
1564                         * there are surpluses, let redistribution work it
1565                         * out first.
1566                         */
1567                        if (!nr_lagging && !nr_surpluses)
1568                                ioc->busy_level--;
1569                } else {
1570                        /*
1571                         * Nobody is being throttled and the users aren't
1572                         * issuing enough IOs to saturate the device.  We
1573                         * simply don't know how close the device is to
1574                         * saturation.  Coast.
1575                         */
1576                        ioc->busy_level = 0;
1577                }
1578        } else {
1579                /* inside the hysteresis margin, we're good */
1580                ioc->busy_level = 0;
1581        }
1582
1583        ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1584
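        /*
         * Adjust vrate when overloaded, or when underloaded with no lagging
         * IOs.  As an illustration, if vrate is already within bounds,
         * busy_level is 3 and vrate_adj_pct[3] happened to be 5, vrate
         * would be scaled down to 95% of its current value; at busy_level
         * -3 it would be scaled up to 105%, clamped to [vrate_min,
         * vrate_max] either way.
         */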
1585        if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1586                u64 vrate = atomic64_read(&ioc->vtime_rate);
1587                u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1588
1589                /* rq_wait signal is always reliable, ignore user vrate_min */
1590                if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1591                        vrate_min = VRATE_MIN;
1592
1593                /*
1594                 * If vrate is out of bounds, apply clamp gradually as the
1595                 * bounds can change abruptly.  Otherwise, apply busy_level
1596                 * based adjustment.
1597                 */
1598                if (vrate < vrate_min) {
1599                        vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1600                                          100);
1601                        vrate = min(vrate, vrate_min);
1602                } else if (vrate > vrate_max) {
1603                        vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1604                                          100);
1605                        vrate = max(vrate, vrate_max);
1606                } else {
1607                        int idx = min_t(int, abs(ioc->busy_level),
1608                                        ARRAY_SIZE(vrate_adj_pct) - 1);
1609                        u32 adj_pct = vrate_adj_pct[idx];
1610
1611                        if (ioc->busy_level > 0)
1612                                adj_pct = 100 - adj_pct;
1613                        else
1614                                adj_pct = 100 + adj_pct;
1615
1616                        vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1617                                      vrate_min, vrate_max);
1618                }
1619
1620                trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1621                                           nr_lagging, nr_shortages,
1622                                           nr_surpluses);
1623
1624                atomic64_set(&ioc->vtime_rate, vrate);
1625                ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1626                        ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1627        } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1628                trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1629                                           missed_ppm, rq_wait_pct, nr_lagging,
1630                                           nr_shortages, nr_surpluses);
1631        }
1632
1633        ioc_refresh_params(ioc, false);
1634
1635        /*
1636         * This period is done.  Move onto the next one.  If nothing's
1637         * going on with the device, stop the timer.
1638         */
1639        atomic64_inc(&ioc->cur_period);
1640
1641        if (ioc->running != IOC_STOP) {
1642                if (!list_empty(&ioc->active_iocgs)) {
1643                        ioc_start_period(ioc, &now);
1644                } else {
1645                        ioc->busy_level = 0;
1646                        ioc->running = IOC_IDLE;
1647                }
1648        }
1649
1650        spin_unlock_irq(&ioc->lock);
1651}
1652
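/*
 * Compute the abstract vtime cost of @bio with the builtin linear model: a
 * per-IO base cost, seqio or randio depending on how far @bio starts from
 * the end of the previous IO tracked in iocg->cursor, plus a per-page cost
 * proportional to the transfer size.  Merged bios only pay the size
 * component.
 */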
1653static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1654                                    bool is_merge, u64 *costp)
1655{
1656        struct ioc *ioc = iocg->ioc;
1657        u64 coef_seqio, coef_randio, coef_page;
1658        u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1659        u64 seek_pages = 0;
1660        u64 cost = 0;
1661
1662        switch (bio_op(bio)) {
1663        case REQ_OP_READ:
1664                coef_seqio      = ioc->params.lcoefs[LCOEF_RSEQIO];
1665                coef_randio     = ioc->params.lcoefs[LCOEF_RRANDIO];
1666                coef_page       = ioc->params.lcoefs[LCOEF_RPAGE];
1667                break;
1668        case REQ_OP_WRITE:
1669                coef_seqio      = ioc->params.lcoefs[LCOEF_WSEQIO];
1670                coef_randio     = ioc->params.lcoefs[LCOEF_WRANDIO];
1671                coef_page       = ioc->params.lcoefs[LCOEF_WPAGE];
1672                break;
1673        default:
1674                goto out;
1675        }
1676
1677        if (iocg->cursor) {
1678                seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1679                seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1680        }
1681
1682        if (!is_merge) {
1683                if (seek_pages > LCOEF_RANDIO_PAGES) {
1684                        cost += coef_randio;
1685                } else {
1686                        cost += coef_seqio;
1687                }
1688        }
1689        cost += pages * coef_page;
1690out:
1691        *costp = cost;
1692}
1693
1694static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1695{
1696        u64 cost;
1697
1698        calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1699        return cost;
1700}
1701
1702static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1703{
1704        struct blkcg_gq *blkg = bio->bi_blkg;
1705        struct ioc *ioc = rqos_to_ioc(rqos);
1706        struct ioc_gq *iocg = blkg_to_iocg(blkg);
1707        struct ioc_now now;
1708        struct iocg_wait wait;
1709        u32 hw_active, hw_inuse;
1710        u64 abs_cost, cost, vtime;
1711
1712        /* bypass IOs if disabled or for root cgroup */
1713        if (!ioc->enabled || !iocg->level)
1714                return;
1715
1716        /* always activate so that even 0 cost IOs get protected to some level */
1717        if (!iocg_activate(iocg, &now))
1718                return;
1719
1720        /* calculate the absolute vtime cost */
1721        abs_cost = calc_vtime_cost(bio, iocg, false);
1722        if (!abs_cost)
1723                return;
1724
1725        iocg->cursor = bio_end_sector(bio);
1726
1727        vtime = atomic64_read(&iocg->vtime);
1728        current_hweight(iocg, &hw_active, &hw_inuse);
1729
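        /*
         * The iocg has been donating inuse (hw_inuse < hw_active) but is
         * now issuing with little vtime budget margin left.  Take the
         * donation back by restoring inuse to the full weight before
         * charging so that the lowered hweight doesn't choke it.
         */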
1730        if (hw_inuse < hw_active &&
1731            time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1732                TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1733                                iocg->inuse, iocg->weight, hw_inuse, hw_active);
1734                spin_lock_irq(&ioc->lock);
1735                propagate_active_weight(iocg, iocg->weight, iocg->weight);
1736                spin_unlock_irq(&ioc->lock);
1737                current_hweight(iocg, &hw_active, &hw_inuse);
1738        }
1739
1740        cost = abs_cost_to_cost(abs_cost, hw_inuse);
1741
1742        /*
1743         * If no one's waiting and within budget, issue right away.  The
1744         * tests are racy but the races aren't systemic - we only miss once
1745         * in a while which is fine.
1746         */
1747        if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1748            time_before_eq64(vtime + cost, now.vnow)) {
1749                iocg_commit_bio(iocg, bio, cost);
1750                return;
1751        }
1752
1753        /*
1754         * We activated above but w/o any synchronization. Deactivation is
1755         * synchronized with waitq.lock and we won't get deactivated as long
1756         * as we're waiting or have debt, so we're good if we're activated
1757         * here. In the unlikely case that we aren't, just issue the IO.
1758         */
1759        spin_lock_irq(&iocg->waitq.lock);
1760
1761        if (unlikely(list_empty(&iocg->active_list))) {
1762                spin_unlock_irq(&iocg->waitq.lock);
1763                iocg_commit_bio(iocg, bio, cost);
1764                return;
1765        }
1766
1767        /*
1768         * We're over budget. If @bio has to be issued regardless, remember
1769         * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1770         * off the debt before waking more IOs.
1771         *
1772         * This way, the debt is continuously paid off each period with the
1773         * actual budget available to the cgroup. If we just wound vtime, we
1774         * would incorrectly use the current hw_inuse for the entire amount
1775         * which, for example, can lead to the cgroup staying blocked for a
1776         * long time even with substantially raised hw_inuse.
1777         *
1778         * An iocg with vdebt should stay online so that the timer can keep
1779         * deducting its vdebt and [de]activate the use_delay mechanism
1780         * accordingly. We don't want to race against the timer trying to
1781         * clear them and leave @iocg inactive w/ dangling use_delay heavily
1782         * penalizing the cgroup and its descendants.
1783         */
1784        if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1785                iocg->abs_vdebt += abs_cost;
1786                if (iocg_kick_delay(iocg, &now, cost))
1787                        blkcg_schedule_throttle(rqos->q,
1788                                        (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1789                spin_unlock_irq(&iocg->waitq.lock);
1790                return;
1791        }
1792
1793        /*
1794         * Append self to the waitq and schedule the wakeup timer if we're
1795         * the first waiter.  The timer duration is calculated based on the
1796         * current vrate.  vtime and hweight changes can make it too short
1797         * or too long.  Each wait entry records the absolute cost it's
1798         * waiting for to allow re-evaluation using a custom wait entry.
1799         *
1800         * If too short, the timer simply reschedules itself.  If too long,
1801         * the period timer will notice and trigger wakeups.
1802         *
1803         * All waiters are on iocg->waitq and the wait states are
1804         * synchronized using waitq.lock.
1805         */
1806        init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1807        wait.wait.private = current;
1808        wait.bio = bio;
1809        wait.abs_cost = abs_cost;
1810        wait.committed = false; /* will be set true by waker */
1811
1812        __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1813        iocg_kick_waitq(iocg, &now);
1814
1815        spin_unlock_irq(&iocg->waitq.lock);
1816
1817        while (true) {
1818                set_current_state(TASK_UNINTERRUPTIBLE);
1819                if (wait.committed)
1820                        break;
1821                io_schedule();
1822        }
1823
1824        /* waker already committed us, proceed */
1825        finish_wait(&iocg->waitq, &wait.wait);
1826}
1827
1828static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1829                           struct bio *bio)
1830{
1831        struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1832        struct ioc *ioc = iocg->ioc;
1833        sector_t bio_end = bio_end_sector(bio);
1834        struct ioc_now now;
1835        u32 hw_inuse;
1836        u64 abs_cost, cost;
1837        unsigned long flags;
1838
1839        /* bypass if disabled or for root cgroup */
1840        if (!ioc->enabled || !iocg->level)
1841                return;
1842
1843        abs_cost = calc_vtime_cost(bio, iocg, true);
1844        if (!abs_cost)
1845                return;
1846
1847        ioc_now(ioc, &now);
1848        current_hweight(iocg, NULL, &hw_inuse);
1849        cost = abs_cost_to_cost(abs_cost, hw_inuse);
1850
1851        /* update cursor if backmerging into the request at the cursor */
1852        if (blk_rq_pos(rq) < bio_end &&
1853            blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1854                iocg->cursor = bio_end;
1855
1856        /*
1857         * Charge if there's enough vtime budget and the existing request has
1858         * cost assigned.
1859         */
1860        if (rq->bio && rq->bio->bi_iocost_cost &&
1861            time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1862                iocg_commit_bio(iocg, bio, cost);
1863                return;
1864        }
1865
1866        /*
1867         * Otherwise, account it as debt if @iocg is online, which it should
1868         * be for the vast majority of cases. See debt handling in
1869         * ioc_rqos_throttle() for details.
1870         */
1871        spin_lock_irqsave(&iocg->waitq.lock, flags);
1872        if (likely(!list_empty(&iocg->active_list))) {
1873                iocg->abs_vdebt += abs_cost;
1874                iocg_kick_delay(iocg, &now, cost);
1875        } else {
1876                iocg_commit_bio(iocg, bio, cost);
1877        }
1878        spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1879}
1880
1881static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1882{
1883        struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1884
1885        if (iocg && bio->bi_iocost_cost)
1886                atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1887}
1888
1889static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1890{
1891        struct ioc *ioc = rqos_to_ioc(rqos);
1892        u64 on_q_ns, rq_wait_ns;
1893        int pidx, rw;
1894
1895        if (!ioc->enabled || !rq_aux(rq)->alloc_time_ns || !rq->start_time_ns)
1896                return;
1897
1898        switch (req_op(rq) & REQ_OP_MASK) {
1899        case REQ_OP_READ:
1900                pidx = QOS_RLAT;
1901                rw = READ;
1902                break;
1903        case REQ_OP_WRITE:
1904                pidx = QOS_WLAT;
1905                rw = WRITE;
1906                break;
1907        default:
1908                return;
1909        }
1910
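        /*
         * on_q_ns covers the whole allocation-to-completion time and is
         * compared against the rlat/wlat targets; rq_wait_ns is the portion
         * spent waiting for the request to be allocated and feeds the
         * rq_wait% busy signal summed up in ioc_lat_stat().
         */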
1911        on_q_ns = ktime_get_ns() - rq_aux(rq)->alloc_time_ns;
1912        rq_wait_ns = rq->start_time_ns - rq_aux(rq)->alloc_time_ns;
1913
1914        if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1915                this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1916        else
1917                this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1918
1919        this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1920}
1921
1922static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1923{
1924        struct ioc *ioc = rqos_to_ioc(rqos);
1925
1926        spin_lock_irq(&ioc->lock);
1927        ioc_refresh_params(ioc, false);
1928        spin_unlock_irq(&ioc->lock);
1929}
1930
1931static void ioc_rqos_exit(struct rq_qos *rqos)
1932{
1933        struct ioc *ioc = rqos_to_ioc(rqos);
1934
1935        blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1936
1937        spin_lock_irq(&ioc->lock);
1938        ioc->running = IOC_STOP;
1939        spin_unlock_irq(&ioc->lock);
1940
1941        del_timer_sync(&ioc->timer);
1942        free_percpu(ioc->pcpu_stat);
1943        kfree(ioc);
1944}
1945
1946static struct rq_qos_ops ioc_rqos_ops = {
1947        .throttle = ioc_rqos_throttle,
1948        .merge = ioc_rqos_merge,
1949        .done_bio = ioc_rqos_done_bio,
1950        .done = ioc_rqos_done,
1951        .queue_depth_changed = ioc_rqos_queue_depth_changed,
1952        .exit = ioc_rqos_exit,
1953};
1954
1955static int blk_iocost_init(struct request_queue *q)
1956{
1957        struct ioc *ioc;
1958        struct rq_qos *rqos;
1959        int ret;
1960
1961        ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1962        if (!ioc)
1963                return -ENOMEM;
1964
1965        ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1966        if (!ioc->pcpu_stat) {
1967                kfree(ioc);
1968                return -ENOMEM;
1969        }
1970
1971        rqos = &ioc->rqos;
1972        rqos->id = RQ_QOS_COST;
1973        rqos->ops = &ioc_rqos_ops;
1974        rqos->q = q;
1975
1976        spin_lock_init(&ioc->lock);
1977        timer_setup(&ioc->timer, ioc_timer_fn, 0);
1978        INIT_LIST_HEAD(&ioc->active_iocgs);
1979
1980        ioc->running = IOC_IDLE;
1981        atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1982        seqcount_init(&ioc->period_seqcount);
1983        ioc->period_at = ktime_to_us(ktime_get());
1984        atomic64_set(&ioc->cur_period, 0);
1985        atomic_set(&ioc->hweight_gen, 0);
1986
1987        spin_lock_irq(&ioc->lock);
1988        ioc->autop_idx = AUTOP_INVALID;
1989        ioc_refresh_params(ioc, true);
1990        spin_unlock_irq(&ioc->lock);
1991
1992        rq_qos_add(q, rqos);
1993        ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1994        if (ret) {
1995                rq_qos_del(q, rqos);
1996                free_percpu(ioc->pcpu_stat);
1997                kfree(ioc);
1998                return ret;
1999        }
2000        return 0;
2001}
2002
2003static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2004{
2005        struct ioc_cgrp *iocc;
2006
2007        iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2008        if (!iocc)
2009                return NULL;
2010
2011        iocc->dfl_weight = CGROUP_WEIGHT_DFL;
2012        return &iocc->cpd;
2013}
2014
2015static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2016{
2017        kfree(container_of(cpd, struct ioc_cgrp, cpd));
2018}
2019
2020static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2021                                             struct blkcg *blkcg)
2022{
2023        int levels = blkcg->css.cgroup->level + 1;
2024        struct ioc_gq *iocg;
2025
2026        iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2027        if (!iocg)
2028                return NULL;
2029
2030        return &iocg->pd;
2031}
2032
2033static void ioc_pd_init(struct blkg_policy_data *pd)
2034{
2035        struct ioc_gq *iocg = pd_to_iocg(pd);
2036        struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2037        struct ioc *ioc = q_to_ioc(blkg->q);
2038        struct ioc_now now;
2039        struct blkcg_gq *tblkg;
2040        unsigned long flags;
2041
2042        ioc_now(ioc, &now);
2043
2044        iocg->ioc = ioc;
2045        atomic64_set(&iocg->vtime, now.vnow);
2046        atomic64_set(&iocg->done_vtime, now.vnow);
2047        atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2048        INIT_LIST_HEAD(&iocg->active_list);
2049        iocg->hweight_active = HWEIGHT_WHOLE;
2050        iocg->hweight_inuse = HWEIGHT_WHOLE;
2051
2052        init_waitqueue_head(&iocg->waitq);
2053        hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2054        iocg->waitq_timer.function = iocg_waitq_timer_fn;
2055        hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2056        iocg->delay_timer.function = iocg_delay_timer_fn;
2057
2058        iocg->level = blkg->blkcg->css.cgroup->level;
2059
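        /*
         * Record the path from the root so that each ancestor's iocg can
         * be reached by its level when hierarchical weights are computed.
         */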
2060        for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2061                struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2062                iocg->ancestors[tiocg->level] = tiocg;
2063        }
2064
2065        spin_lock_irqsave(&ioc->lock, flags);
2066        weight_updated(iocg);
2067        spin_unlock_irqrestore(&ioc->lock, flags);
2068}
2069
2070static void ioc_pd_free(struct blkg_policy_data *pd)
2071{
2072        struct ioc_gq *iocg = pd_to_iocg(pd);
2073        struct ioc *ioc = iocg->ioc;
2074        unsigned long flags;
2075
2076        if (ioc) {
2077                spin_lock_irqsave(&ioc->lock, flags);
2078                if (!list_empty(&iocg->active_list)) {
2079                        propagate_active_weight(iocg, 0, 0);
2080                        list_del_init(&iocg->active_list);
2081                }
2082                spin_unlock_irqrestore(&ioc->lock, flags);
2083
2084                hrtimer_cancel(&iocg->waitq_timer);
2085                hrtimer_cancel(&iocg->delay_timer);
2086        }
2087        kfree(iocg);
2088}
2089
2090static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2091                             int off)
2092{
2093        const char *dname = blkg_dev_name(pd->blkg);
2094        struct ioc_gq *iocg = pd_to_iocg(pd);
2095
2096        if (dname && iocg->cfg_weight)
2097                seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2098        return 0;
2099}
2100
2101
2102static int ioc_weight_show(struct seq_file *sf, void *v)
2103{
2104        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2105        struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2106
2107        seq_printf(sf, "default %u\n", iocc->dfl_weight);
2108        blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2109                          &blkcg_policy_iocost, seq_cft(sf)->private, false);
2110        return 0;
2111}
2112
2113static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2114                                size_t nbytes, loff_t off)
2115{
2116        struct blkcg *blkcg = css_to_blkcg(of_css(of));
2117        struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2118        struct blkg_conf_ctx ctx;
2119        struct ioc_gq *iocg;
2120        u32 v;
2121        int ret;
2122
2123        if (!strchr(buf, ':')) {
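                /* no "MAJ:MIN" prefix - update the default weight */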
2124                struct blkcg_gq *blkg;
2125
2126                if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2127                        return -EINVAL;
2128
2129                if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2130                        return -EINVAL;
2131
2132                spin_lock(&blkcg->lock);
2133                iocc->dfl_weight = v;
2134                hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2135                        struct ioc_gq *iocg = blkg_to_iocg(blkg);
2136
2137                        if (iocg) {
2138                                spin_lock_irq(&iocg->ioc->lock);
2139                                weight_updated(iocg);
2140                                spin_unlock_irq(&iocg->ioc->lock);
2141                        }
2142                }
2143                spin_unlock(&blkcg->lock);
2144
2145                return nbytes;
2146        }
2147
2148        ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2149        if (ret)
2150                return ret;
2151
2152        iocg = blkg_to_iocg(ctx.blkg);
2153
2154        if (!strncmp(ctx.body, "default", 7)) {
2155                v = 0;
2156        } else {
2157                if (!sscanf(ctx.body, "%u", &v))
2158                        goto einval;
2159                if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2160                        goto einval;
2161        }
2162
2163        spin_lock(&iocg->ioc->lock);
2164        iocg->cfg_weight = v;
2165        weight_updated(iocg);
2166        spin_unlock(&iocg->ioc->lock);
2167
2168        blkg_conf_finish(&ctx);
2169        return nbytes;
2170
2171einval:
2172        blkg_conf_finish(&ctx);
2173        return -EINVAL;
2174}
2175
2176static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2177                          int off)
2178{
2179        const char *dname = blkg_dev_name(pd->blkg);
2180        struct ioc *ioc = pd_to_iocg(pd)->ioc;
2181
2182        if (!dname)
2183                return 0;
2184
2185        seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2186                   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2187                   ioc->params.qos[QOS_RPPM] / 10000,
2188                   ioc->params.qos[QOS_RPPM] % 10000 / 100,
2189                   ioc->params.qos[QOS_RLAT],
2190                   ioc->params.qos[QOS_WPPM] / 10000,
2191                   ioc->params.qos[QOS_WPPM] % 10000 / 100,
2192                   ioc->params.qos[QOS_WLAT],
2193                   ioc->params.qos[QOS_MIN] / 10000,
2194                   ioc->params.qos[QOS_MIN] % 10000 / 100,
2195                   ioc->params.qos[QOS_MAX] / 10000,
2196                   ioc->params.qos[QOS_MAX] % 10000 / 100);
2197        return 0;
2198}
2199
2200static int ioc_qos_show(struct seq_file *sf, void *v)
2201{
2202        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2203
2204        blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2205                          &blkcg_policy_iocost, seq_cft(sf)->private, false);
2206        return 0;
2207}
2208
2209static const match_table_t qos_ctrl_tokens = {
2210        { QOS_ENABLE,           "enable=%u"     },
2211        { QOS_CTRL,             "ctrl=%s"       },
2212        { NR_QOS_CTRL_PARAMS,   NULL            },
2213};
2214
2215static const match_table_t qos_tokens = {
2216        { QOS_RPPM,             "rpct=%s"       },
2217        { QOS_RLAT,             "rlat=%u"       },
2218        { QOS_WPPM,             "wpct=%s"       },
2219        { QOS_WLAT,             "wlat=%u"       },
2220        { QOS_MIN,              "min=%s"        },
2221        { QOS_MAX,              "max=%s"        },
2222        { NR_QOS_PARAMS,        NULL            },
2223};
2224
2225static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2226                             size_t nbytes, loff_t off)
2227{
2228        struct gendisk *disk;
2229        struct ioc *ioc;
2230        u32 qos[NR_QOS_PARAMS];
2231        bool enable, user;
2232        char *p;
2233        int ret;
2234
2235        disk = blkcg_conf_get_disk(&input);
2236        if (IS_ERR(disk))
2237                return PTR_ERR(disk);
2238
2239        ioc = q_to_ioc(disk->queue);
2240        if (!ioc) {
2241                ret = blk_iocost_init(disk->queue);
2242                if (ret)
2243                        goto err;
2244                ioc = q_to_ioc(disk->queue);
2245        }
2246
2247        spin_lock_irq(&ioc->lock);
2248        memcpy(qos, ioc->params.qos, sizeof(qos));
2249        enable = ioc->enabled;
2250        user = ioc->user_qos_params;
2251        spin_unlock_irq(&ioc->lock);
2252
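        /*
         * Parse whitespace separated key=value tokens.  rpct/wpct and
         * min/max take percentages with two decimal places and are stored
         * as parts-per-million internally, hence the "v * 100" below.
         */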
2253        while ((p = strsep(&input, " \t\n"))) {
2254                substring_t args[MAX_OPT_ARGS];
2255                char buf[32];
2256                int tok;
2257                s64 v;
2258
2259                if (!*p)
2260                        continue;
2261
2262                switch (match_token(p, qos_ctrl_tokens, args)) {
2263                case QOS_ENABLE:
2264                        match_u64(&args[0], &v);
2265                        enable = v;
2266                        continue;
2267                case QOS_CTRL:
2268                        match_strlcpy(buf, &args[0], sizeof(buf));
2269                        if (!strcmp(buf, "auto"))
2270                                user = false;
2271                        else if (!strcmp(buf, "user"))
2272                                user = true;
2273                        else
2274                                goto einval;
2275                        continue;
2276                }
2277
2278                tok = match_token(p, qos_tokens, args);
2279                switch (tok) {
2280                case QOS_RPPM:
2281                case QOS_WPPM:
2282                        if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2283                            sizeof(buf))
2284                                goto einval;
2285                        if (cgroup_parse_float(buf, 2, &v))
2286                                goto einval;
2287                        if (v < 0 || v > 10000)
2288                                goto einval;
2289                        qos[tok] = v * 100;
2290                        break;
2291                case QOS_RLAT:
2292                case QOS_WLAT:
2293                        if (match_u64(&args[0], &v))
2294                                goto einval;
2295                        qos[tok] = v;
2296                        break;
2297                case QOS_MIN:
2298                case QOS_MAX:
2299                        if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2300                            sizeof(buf))
2301                                goto einval;
2302                        if (cgroup_parse_float(buf, 2, &v))
2303                                goto einval;
2304                        if (v < 0)
2305                                goto einval;
2306                        qos[tok] = clamp_t(s64, v * 100,
2307                                           VRATE_MIN_PPM, VRATE_MAX_PPM);
2308                        break;
2309                default:
2310                        goto einval;
2311                }
2312                user = true;
2313        }
2314
2315        if (qos[QOS_MIN] > qos[QOS_MAX])
2316                goto einval;
2317
2318        spin_lock_irq(&ioc->lock);
2319
2320        if (enable) {
2321                blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2322                ioc->enabled = true;
2323        } else {
2324                blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2325                ioc->enabled = false;
2326        }
2327
2328        if (user) {
2329                memcpy(ioc->params.qos, qos, sizeof(qos));
2330                ioc->user_qos_params = true;
2331        } else {
2332                ioc->user_qos_params = false;
2333        }
2334
2335        ioc_refresh_params(ioc, true);
2336        spin_unlock_irq(&ioc->lock);
2337
2338        put_disk_and_module(disk);
2339        return nbytes;
2340einval:
2341        ret = -EINVAL;
2342err:
2343        put_disk_and_module(disk);
2344        return ret;
2345}
2346
2347static u64 ioc_cost_model_prfill(struct seq_file *sf,
2348                                 struct blkg_policy_data *pd, int off)
2349{
2350        const char *dname = blkg_dev_name(pd->blkg);
2351        struct ioc *ioc = pd_to_iocg(pd)->ioc;
2352        u64 *u = ioc->params.i_lcoefs;
2353
2354        if (!dname)
2355                return 0;
2356
2357        seq_printf(sf, "%s ctrl=%s model=linear "
2358                   "rbps=%llu rseqiops=%llu rrandiops=%llu "
2359                   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2360                   dname, ioc->user_cost_model ? "user" : "auto",
2361                   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2362                   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2363        return 0;
2364}
2365
2366static int ioc_cost_model_show(struct seq_file *sf, void *v)
2367{
2368        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2369
2370        blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2371                          &blkcg_policy_iocost, seq_cft(sf)->private, false);
2372        return 0;
2373}
2374
2375static const match_table_t cost_ctrl_tokens = {
2376        { COST_CTRL,            "ctrl=%s"       },
2377        { COST_MODEL,           "model=%s"      },
2378        { NR_COST_CTRL_PARAMS,  NULL            },
2379};
2380
2381static const match_table_t i_lcoef_tokens = {
2382        { I_LCOEF_RBPS,         "rbps=%u"       },
2383        { I_LCOEF_RSEQIOPS,     "rseqiops=%u"   },
2384        { I_LCOEF_RRANDIOPS,    "rrandiops=%u"  },
2385        { I_LCOEF_WBPS,         "wbps=%u"       },
2386        { I_LCOEF_WSEQIOPS,     "wseqiops=%u"   },
2387        { I_LCOEF_WRANDIOPS,    "wrandiops=%u"  },
2388        { NR_I_LCOEFS,          NULL            },
2389};
2390
2391static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2392                                    size_t nbytes, loff_t off)
2393{
2394        struct gendisk *disk;
2395        struct ioc *ioc;
2396        u64 u[NR_I_LCOEFS];
2397        bool user;
2398        char *p;
2399        int ret;
2400
2401        disk = blkcg_conf_get_disk(&input);
2402        if (IS_ERR(disk))
2403                return PTR_ERR(disk);
2404
2405        ioc = q_to_ioc(disk->queue);
2406        if (!ioc) {
2407                ret = blk_iocost_init(disk->queue);
2408                if (ret)
2409                        goto err;
2410                ioc = q_to_ioc(disk->queue);
2411        }
2412
2413        spin_lock_irq(&ioc->lock);
2414        memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2415        user = ioc->user_cost_model;
2416        spin_unlock_irq(&ioc->lock);
2417
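        /* an explicit coefficient switches the device to user mode */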
2418        while ((p = strsep(&input, " \t\n"))) {
2419                substring_t args[MAX_OPT_ARGS];
2420                char buf[32];
2421                int tok;
2422                u64 v;
2423
2424                if (!*p)
2425                        continue;
2426
2427                switch (match_token(p, cost_ctrl_tokens, args)) {
2428                case COST_CTRL:
2429                        match_strlcpy(buf, &args[0], sizeof(buf));
2430                        if (!strcmp(buf, "auto"))
2431                                user = false;
2432                        else if (!strcmp(buf, "user"))
2433                                user = true;
2434                        else
2435                                goto einval;
2436                        continue;
2437                case COST_MODEL:
2438                        match_strlcpy(buf, &args[0], sizeof(buf));
2439                        if (strcmp(buf, "linear"))
2440                                goto einval;
2441                        continue;
2442                }
2443
2444                tok = match_token(p, i_lcoef_tokens, args);
2445                if (tok == NR_I_LCOEFS)
2446                        goto einval;
2447                if (match_u64(&args[0], &v))
2448                        goto einval;
2449                u[tok] = v;
2450                user = true;
2451        }
2452
2453        spin_lock_irq(&ioc->lock);
2454        if (user) {
2455                memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2456                ioc->user_cost_model = true;
2457        } else {
2458                ioc->user_cost_model = false;
2459        }
2460        ioc_refresh_params(ioc, true);
2461        spin_unlock_irq(&ioc->lock);
2462
2463        put_disk_and_module(disk);
2464        return nbytes;
2465
2466einval:
2467        ret = -EINVAL;
2468err:
2469        put_disk_and_module(disk);
2470        return ret;
2471}
2472
2473static struct cftype ioc_files[] = {
2474        {
2475                .name = "weight",
2476                .flags = CFTYPE_NOT_ON_ROOT,
2477                .seq_show = ioc_weight_show,
2478                .write = ioc_weight_write,
2479        },
2480        {
2481                .name = "cost.qos",
2482                .flags = CFTYPE_ONLY_ON_ROOT,
2483                .seq_show = ioc_qos_show,
2484                .write = ioc_qos_write,
2485        },
2486        {
2487                .name = "cost.model",
2488                .flags = CFTYPE_ONLY_ON_ROOT,
2489                .seq_show = ioc_cost_model_show,
2490                .write = ioc_cost_model_write,
2491        },
2492        {}
2493};
2494
2495static struct blkcg_policy blkcg_policy_iocost = {
2496        .dfl_cftypes    = ioc_files,
2497        .cpd_alloc_fn   = ioc_cpd_alloc,
2498        .cpd_free_fn    = ioc_cpd_free,
2499        .pd_alloc_fn    = ioc_pd_alloc,
2500        .pd_init_fn     = ioc_pd_init,
2501        .pd_free_fn     = ioc_pd_free,
2502};
2503
2504static int __init ioc_init(void)
2505{
2506        return blkcg_policy_register(&blkcg_policy_iocost);
2507}
2508
2509static void __exit ioc_exit(void)
2510{
2511        return blkcg_policy_unregister(&blkcg_policy_iocost);
2512}
2513
2514module_init(ioc_init);
2515module_exit(ioc_exit);
2516