linux/block/cfq-iosched.c
<<
>>
Prefs
   1/*
   2 *  CFQ, or complete fairness queueing, disk scheduler.
   3 *
   4 *  Based on ideas from a previously unfinished io
   5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
   6 *
   7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   8 */
   9#include <linux/module.h>
  10#include <linux/slab.h>
  11#include <linux/sched/clock.h>
  12#include <linux/blkdev.h>
  13#include <linux/elevator.h>
  14#include <linux/ktime.h>
  15#include <linux/rbtree.h>
  16#include <linux/ioprio.h>
  17#include <linux/blktrace_api.h>
  18#include <linux/blk-cgroup.h>
  19#include "blk.h"
  20#include "blk-wbt.h"
  21
  22/*
  23 * tunables
  24 */
  25/* max queue in one round of service */
  26static const int cfq_quantum = 8;
  27static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
  28/* maximum backwards seek, in KiB */
  29static const int cfq_back_max = 16 * 1024;
  30/* penalty of a backwards seek */
  31static const int cfq_back_penalty = 2;
  32static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
  33static u64 cfq_slice_async = NSEC_PER_SEC / 25;
  34static const int cfq_slice_async_rq = 2;
  35static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
  36static u64 cfq_group_idle = NSEC_PER_SEC / 125;
  37static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
  38static const int cfq_hist_divisor = 4;
  39
  40/*
  41 * offset from end of queue service tree for idle class
  42 */
  43#define CFQ_IDLE_DELAY          (NSEC_PER_SEC / 5)
  44/* offset from end of group service tree under time slice mode */
  45#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
  46/* offset from end of group service under IOPS mode */
  47#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
  48
  49/*
  50 * below this threshold, we consider thinktime immediate
  51 */
  52#define CFQ_MIN_TT              (2 * NSEC_PER_SEC / HZ)
  53
  54#define CFQ_SLICE_SCALE         (5)
  55#define CFQ_HW_QUEUE_MIN        (5)
  56#define CFQ_SERVICE_SHIFT       12
  57
  58#define CFQQ_SEEK_THR           (sector_t)(8 * 100)
  59#define CFQQ_CLOSE_THR          (sector_t)(8 * 1024)
  60#define CFQQ_SECT_THR_NONROT    (sector_t)(2 * 32)
  61#define CFQQ_SEEKY(cfqq)        (hweight32(cfqq->seek_history) > 32/8)
  62
  63#define RQ_CIC(rq)              icq_to_cic((rq)->elv.icq)
  64#define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elv.priv[0])
  65#define RQ_CFQG(rq)             (struct cfq_group *) ((rq)->elv.priv[1])
  66
  67static struct kmem_cache *cfq_pool;
  68
  69#define CFQ_PRIO_LISTS          IOPRIO_BE_NR
  70#define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
  71#define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
  72
  73#define sample_valid(samples)   ((samples) > 80)
  74#define rb_entry_cfqg(node)     rb_entry((node), struct cfq_group, rb_node)
  75
  76/* blkio-related constants */
  77#define CFQ_WEIGHT_LEGACY_MIN   10
  78#define CFQ_WEIGHT_LEGACY_DFL   500
  79#define CFQ_WEIGHT_LEGACY_MAX   1000
  80
  81struct cfq_ttime {
  82        u64 last_end_request;
  83
  84        u64 ttime_total;
  85        u64 ttime_mean;
  86        unsigned long ttime_samples;
  87};
  88
  89/*
  90 * Most of our rbtree usage is for sorting with min extraction, so
  91 * if we cache the leftmost node we don't have to walk down the tree
  92 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
  93 * move this into the elevator for the rq sorting as well.
  94 */
  95struct cfq_rb_root {
  96        struct rb_root_cached rb;
  97        struct rb_node *rb_rightmost;
  98        unsigned count;
  99        u64 min_vdisktime;
 100        struct cfq_ttime ttime;
 101};
 102#define CFQ_RB_ROOT     (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
 103                        .rb_rightmost = NULL,                        \
 104                        .ttime = {.last_end_request = ktime_get_ns(),},}
 105
 106/*
 107 * Per process-grouping structure
 108 */
 109struct cfq_queue {
 110        /* reference count */
 111        int ref;
 112        /* various state flags, see below */
 113        unsigned int flags;
 114        /* parent cfq_data */
 115        struct cfq_data *cfqd;
 116        /* service_tree member */
 117        struct rb_node rb_node;
 118        /* service_tree key */
 119        u64 rb_key;
 120        /* prio tree member */
 121        struct rb_node p_node;
 122        /* prio tree root we belong to, if any */
 123        struct rb_root *p_root;
 124        /* sorted list of pending requests */
 125        struct rb_root sort_list;
 126        /* if fifo isn't expired, next request to serve */
 127        struct request *next_rq;
 128        /* requests queued in sort_list */
 129        int queued[2];
 130        /* currently allocated requests */
 131        int allocated[2];
 132        /* fifo list of requests in sort_list */
 133        struct list_head fifo;
 134
 135        /* time when queue got scheduled in to dispatch first request. */
 136        u64 dispatch_start;
 137        u64 allocated_slice;
 138        u64 slice_dispatch;
 139        /* time when first request from queue completed and slice started. */
 140        u64 slice_start;
 141        u64 slice_end;
 142        s64 slice_resid;
 143
 144        /* pending priority requests */
 145        int prio_pending;
 146        /* number of requests that are on the dispatch list or inside driver */
 147        int dispatched;
 148
 149        /* io prio of this group */
 150        unsigned short ioprio, org_ioprio;
 151        unsigned short ioprio_class, org_ioprio_class;
 152
 153        pid_t pid;
 154
 155        u32 seek_history;
 156        sector_t last_request_pos;
 157
 158        struct cfq_rb_root *service_tree;
 159        struct cfq_queue *new_cfqq;
 160        struct cfq_group *cfqg;
 161        /* Number of sectors dispatched from queue in single dispatch round */
 162        unsigned long nr_sectors;
 163};
 164
 165/*
 166 * First index in the service_trees.
 167 * IDLE is handled separately, so it has negative index
 168 */
 169enum wl_class_t {
 170        BE_WORKLOAD = 0,
 171        RT_WORKLOAD = 1,
 172        IDLE_WORKLOAD = 2,
 173        CFQ_PRIO_NR,
 174};
 175
 176/*
 177 * Second index in the service_trees.
 178 */
 179enum wl_type_t {
 180        ASYNC_WORKLOAD = 0,
 181        SYNC_NOIDLE_WORKLOAD = 1,
 182        SYNC_WORKLOAD = 2
 183};
 184
 185struct cfqg_stats {
 186#ifdef CONFIG_CFQ_GROUP_IOSCHED
 187        /* number of ios merged */
 188        struct blkg_rwstat              merged;
 189        /* total time spent on device in ns, may not be accurate w/ queueing */
 190        struct blkg_rwstat              service_time;
 191        /* total time spent waiting in scheduler queue in ns */
 192        struct blkg_rwstat              wait_time;
 193        /* number of IOs queued up */
 194        struct blkg_rwstat              queued;
 195        /* total disk time and nr sectors dispatched by this group */
 196        struct blkg_stat                time;
 197#ifdef CONFIG_DEBUG_BLK_CGROUP
 198        /* time not charged to this cgroup */
 199        struct blkg_stat                unaccounted_time;
 200        /* sum of number of ios queued across all samples */
 201        struct blkg_stat                avg_queue_size_sum;
 202        /* count of samples taken for average */
 203        struct blkg_stat                avg_queue_size_samples;
 204        /* how many times this group has been removed from service tree */
 205        struct blkg_stat                dequeue;
 206        /* total time spent waiting for it to be assigned a timeslice. */
 207        struct blkg_stat                group_wait_time;
 208        /* time spent idling for this blkcg_gq */
 209        struct blkg_stat                idle_time;
 210        /* total time with empty current active q with other requests queued */
 211        struct blkg_stat                empty_time;
 212        /* fields after this shouldn't be cleared on stat reset */
 213        u64                             start_group_wait_time;
 214        u64                             start_idle_time;
 215        u64                             start_empty_time;
 216        uint16_t                        flags;
 217#endif  /* CONFIG_DEBUG_BLK_CGROUP */
 218#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 219};
 220
 221/* Per-cgroup data */
 222struct cfq_group_data {
 223        /* must be the first member */
 224        struct blkcg_policy_data cpd;
 225
 226        unsigned int weight;
 227        unsigned int leaf_weight;
 228};
 229
 230/* This is per cgroup per device grouping structure */
 231struct cfq_group {
 232        /* must be the first member */
 233        struct blkg_policy_data pd;
 234
 235        /* group service_tree member */
 236        struct rb_node rb_node;
 237
 238        /* group service_tree key */
 239        u64 vdisktime;
 240
 241        /*
 242         * The number of active cfqgs and sum of their weights under this
 243         * cfqg.  This covers this cfqg's leaf_weight and all children's
 244         * weights, but does not cover weights of further descendants.
 245         *
 246         * If a cfqg is on the service tree, it's active.  An active cfqg
 247         * also activates its parent and contributes to the children_weight
 248         * of the parent.
 249         */
 250        int nr_active;
 251        unsigned int children_weight;
 252
 253        /*
 254         * vfraction is the fraction of vdisktime that the tasks in this
 255         * cfqg are entitled to.  This is determined by compounding the
 256         * ratios walking up from this cfqg to the root.
 257         *
 258         * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
 259         * vfractions on a service tree is approximately 1.  The sum may
 260         * deviate a bit due to rounding errors and fluctuations caused by
 261         * cfqgs entering and leaving the service tree.
 262         */
 263        unsigned int vfraction;
 264
 265        /*
 266         * There are two weights - (internal) weight is the weight of this
 267         * cfqg against the sibling cfqgs.  leaf_weight is the wight of
 268         * this cfqg against the child cfqgs.  For the root cfqg, both
 269         * weights are kept in sync for backward compatibility.
 270         */
 271        unsigned int weight;
 272        unsigned int new_weight;
 273        unsigned int dev_weight;
 274
 275        unsigned int leaf_weight;
 276        unsigned int new_leaf_weight;
 277        unsigned int dev_leaf_weight;
 278
 279        /* number of cfqq currently on this group */
 280        int nr_cfqq;
 281
 282        /*
 283         * Per group busy queues average. Useful for workload slice calc. We
 284         * create the array for each prio class but at run time it is used
 285         * only for RT and BE class and slot for IDLE class remains unused.
 286         * This is primarily done to avoid confusion and a gcc warning.
 287         */
 288        unsigned int busy_queues_avg[CFQ_PRIO_NR];
 289        /*
 290         * rr lists of queues with requests. We maintain service trees for
 291         * RT and BE classes. These trees are subdivided in subclasses
 292         * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
 293         * class there is no subclassification and all the cfq queues go on
 294         * a single tree service_tree_idle.
 295         * Counts are embedded in the cfq_rb_root
 296         */
 297        struct cfq_rb_root service_trees[2][3];
 298        struct cfq_rb_root service_tree_idle;
 299
 300        u64 saved_wl_slice;
 301        enum wl_type_t saved_wl_type;
 302        enum wl_class_t saved_wl_class;
 303
 304        /* number of requests that are on the dispatch list or inside driver */
 305        int dispatched;
 306        struct cfq_ttime ttime;
 307        struct cfqg_stats stats;        /* stats for this cfqg */
 308
 309        /* async queue for each priority case */
 310        struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
 311        struct cfq_queue *async_idle_cfqq;
 312
 313};
 314
 315struct cfq_io_cq {
 316        struct io_cq            icq;            /* must be the first member */
 317        struct cfq_queue        *cfqq[2];
 318        struct cfq_ttime        ttime;
 319        int                     ioprio;         /* the current ioprio */
 320#ifdef CONFIG_CFQ_GROUP_IOSCHED
 321        uint64_t                blkcg_serial_nr; /* the current blkcg serial */
 322#endif
 323};
 324
 325/*
 326 * Per block device queue structure
 327 */
 328struct cfq_data {
 329        struct request_queue *queue;
 330        /* Root service tree for cfq_groups */
 331        struct cfq_rb_root grp_service_tree;
 332        struct cfq_group *root_group;
 333
 334        /*
 335         * The priority currently being served
 336         */
 337        enum wl_class_t serving_wl_class;
 338        enum wl_type_t serving_wl_type;
 339        u64 workload_expires;
 340        struct cfq_group *serving_group;
 341
 342        /*
 343         * Each priority tree is sorted by next_request position.  These
 344         * trees are used when determining if two or more queues are
 345         * interleaving requests (see cfq_close_cooperator).
 346         */
 347        struct rb_root prio_trees[CFQ_PRIO_LISTS];
 348
 349        unsigned int busy_queues;
 350        unsigned int busy_sync_queues;
 351
 352        int rq_in_driver;
 353        int rq_in_flight[2];
 354
 355        /*
 356         * queue-depth detection
 357         */
 358        int rq_queued;
 359        int hw_tag;
 360        /*
 361         * hw_tag can be
 362         * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
 363         *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
 364         *  0 => no NCQ
 365         */
 366        int hw_tag_est_depth;
 367        unsigned int hw_tag_samples;
 368
 369        /*
 370         * idle window management
 371         */
 372        struct hrtimer idle_slice_timer;
 373        struct work_struct unplug_work;
 374
 375        struct cfq_queue *active_queue;
 376        struct cfq_io_cq *active_cic;
 377
 378        sector_t last_position;
 379
 380        /*
 381         * tunables, see top of file
 382         */
 383        unsigned int cfq_quantum;
 384        unsigned int cfq_back_penalty;
 385        unsigned int cfq_back_max;
 386        unsigned int cfq_slice_async_rq;
 387        unsigned int cfq_latency;
 388        u64 cfq_fifo_expire[2];
 389        u64 cfq_slice[2];
 390        u64 cfq_slice_idle;
 391        u64 cfq_group_idle;
 392        u64 cfq_target_latency;
 393
 394        /*
 395         * Fallback dummy cfqq for extreme OOM conditions
 396         */
 397        struct cfq_queue oom_cfqq;
 398
 399        u64 last_delayed_sync;
 400};
 401
 402static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 403static void cfq_put_queue(struct cfq_queue *cfqq);
 404
 405static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
 406                                            enum wl_class_t class,
 407                                            enum wl_type_t type)
 408{
 409        if (!cfqg)
 410                return NULL;
 411
 412        if (class == IDLE_WORKLOAD)
 413                return &cfqg->service_tree_idle;
 414
 415        return &cfqg->service_trees[class][type];
 416}
 417
 418enum cfqq_state_flags {
 419        CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
 420        CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
 421        CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
 422        CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
 423        CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
 424        CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
 425        CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
 426        CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
 427        CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
 428        CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
 429        CFQ_CFQQ_FLAG_split_coop,       /* shared cfqq will be splitted */
 430        CFQ_CFQQ_FLAG_deep,             /* sync cfqq experienced large depth */
 431        CFQ_CFQQ_FLAG_wait_busy,        /* Waiting for next request */
 432};
 433
 434#define CFQ_CFQQ_FNS(name)                                              \
 435static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
 436{                                                                       \
 437        (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
 438}                                                                       \
 439static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
 440{                                                                       \
 441        (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
 442}                                                                       \
 443static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
 444{                                                                       \
 445        return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
 446}
 447
 448CFQ_CFQQ_FNS(on_rr);
 449CFQ_CFQQ_FNS(wait_request);
 450CFQ_CFQQ_FNS(must_dispatch);
 451CFQ_CFQQ_FNS(must_alloc_slice);
 452CFQ_CFQQ_FNS(fifo_expire);
 453CFQ_CFQQ_FNS(idle_window);
 454CFQ_CFQQ_FNS(prio_changed);
 455CFQ_CFQQ_FNS(slice_new);
 456CFQ_CFQQ_FNS(sync);
 457CFQ_CFQQ_FNS(coop);
 458CFQ_CFQQ_FNS(split_coop);
 459CFQ_CFQQ_FNS(deep);
 460CFQ_CFQQ_FNS(wait_busy);
 461#undef CFQ_CFQQ_FNS
 462
 463#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 464
 465/* cfqg stats flags */
 466enum cfqg_stats_flags {
 467        CFQG_stats_waiting = 0,
 468        CFQG_stats_idling,
 469        CFQG_stats_empty,
 470};
 471
 472#define CFQG_FLAG_FNS(name)                                             \
 473static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)     \
 474{                                                                       \
 475        stats->flags |= (1 << CFQG_stats_##name);                       \
 476}                                                                       \
 477static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)    \
 478{                                                                       \
 479        stats->flags &= ~(1 << CFQG_stats_##name);                      \
 480}                                                                       \
 481static inline int cfqg_stats_##name(struct cfqg_stats *stats)           \
 482{                                                                       \
 483        return (stats->flags & (1 << CFQG_stats_##name)) != 0;          \
 484}                                                                       \
 485
 486CFQG_FLAG_FNS(waiting)
 487CFQG_FLAG_FNS(idling)
 488CFQG_FLAG_FNS(empty)
 489#undef CFQG_FLAG_FNS
 490
 491/* This should be called with the queue_lock held. */
 492static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
 493{
 494        u64 now;
 495
 496        if (!cfqg_stats_waiting(stats))
 497                return;
 498
 499        now = ktime_get_ns();
 500        if (now > stats->start_group_wait_time)
 501                blkg_stat_add(&stats->group_wait_time,
 502                              now - stats->start_group_wait_time);
 503        cfqg_stats_clear_waiting(stats);
 504}
 505
 506/* This should be called with the queue_lock held. */
 507static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
 508                                                 struct cfq_group *curr_cfqg)
 509{
 510        struct cfqg_stats *stats = &cfqg->stats;
 511
 512        if (cfqg_stats_waiting(stats))
 513                return;
 514        if (cfqg == curr_cfqg)
 515                return;
 516        stats->start_group_wait_time = ktime_get_ns();
 517        cfqg_stats_mark_waiting(stats);
 518}
 519
 520/* This should be called with the queue_lock held. */
 521static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
 522{
 523        u64 now;
 524
 525        if (!cfqg_stats_empty(stats))
 526                return;
 527
 528        now = ktime_get_ns();
 529        if (now > stats->start_empty_time)
 530                blkg_stat_add(&stats->empty_time,
 531                              now - stats->start_empty_time);
 532        cfqg_stats_clear_empty(stats);
 533}
 534
 535static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
 536{
 537        blkg_stat_add(&cfqg->stats.dequeue, 1);
 538}
 539
 540static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 541{
 542        struct cfqg_stats *stats = &cfqg->stats;
 543
 544        if (blkg_rwstat_total(&stats->queued))
 545                return;
 546
 547        /*
 548         * group is already marked empty. This can happen if cfqq got new
 549         * request in parent group and moved to this group while being added
 550         * to service tree. Just ignore the event and move on.
 551         */
 552        if (cfqg_stats_empty(stats))
 553                return;
 554
 555        stats->start_empty_time = ktime_get_ns();
 556        cfqg_stats_mark_empty(stats);
 557}
 558
 559static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
 560{
 561        struct cfqg_stats *stats = &cfqg->stats;
 562
 563        if (cfqg_stats_idling(stats)) {
 564                u64 now = ktime_get_ns();
 565
 566                if (now > stats->start_idle_time)
 567                        blkg_stat_add(&stats->idle_time,
 568                                      now - stats->start_idle_time);
 569                cfqg_stats_clear_idling(stats);
 570        }
 571}
 572
 573static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
 574{
 575        struct cfqg_stats *stats = &cfqg->stats;
 576
 577        BUG_ON(cfqg_stats_idling(stats));
 578
 579        stats->start_idle_time = ktime_get_ns();
 580        cfqg_stats_mark_idling(stats);
 581}
 582
 583static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
 584{
 585        struct cfqg_stats *stats = &cfqg->stats;
 586
 587        blkg_stat_add(&stats->avg_queue_size_sum,
 588                      blkg_rwstat_total(&stats->queued));
 589        blkg_stat_add(&stats->avg_queue_size_samples, 1);
 590        cfqg_stats_update_group_wait_time(stats);
 591}
 592
 593#else   /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 594
 595static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
 596static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
 597static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
 598static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
 599static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
 600static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
 601static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 602
 603#endif  /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 604
 605#ifdef CONFIG_CFQ_GROUP_IOSCHED
 606
 607static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 608{
 609        return pd ? container_of(pd, struct cfq_group, pd) : NULL;
 610}
 611
 612static struct cfq_group_data
 613*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
 614{
 615        return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
 616}
 617
 618static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 619{
 620        return pd_to_blkg(&cfqg->pd);
 621}
 622
 623static struct blkcg_policy blkcg_policy_cfq;
 624
 625static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 626{
 627        return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
 628}
 629
 630static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
 631{
 632        return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
 633}
 634
 635static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
 636{
 637        struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
 638
 639        return pblkg ? blkg_to_cfqg(pblkg) : NULL;
 640}
 641
 642static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
 643                                      struct cfq_group *ancestor)
 644{
 645        return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
 646                                    cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
 647}
 648
 649static inline void cfqg_get(struct cfq_group *cfqg)
 650{
 651        return blkg_get(cfqg_to_blkg(cfqg));
 652}
 653
 654static inline void cfqg_put(struct cfq_group *cfqg)
 655{
 656        return blkg_put(cfqg_to_blkg(cfqg));
 657}
 658
 659#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  do {                    \
 660        blk_add_cgroup_trace_msg((cfqd)->queue,                         \
 661                        cfqg_to_blkg((cfqq)->cfqg)->blkcg,              \
 662                        "cfq%d%c%c " fmt, (cfqq)->pid,                  \
 663                        cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
 664                        cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
 665                          ##args);                                      \
 666} while (0)
 667
 668#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)  do {                    \
 669        blk_add_cgroup_trace_msg((cfqd)->queue,                         \
 670                        cfqg_to_blkg(cfqg)->blkcg, fmt, ##args);        \
 671} while (0)
 672
 673static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 674                                            struct cfq_group *curr_cfqg,
 675                                            unsigned int op)
 676{
 677        blkg_rwstat_add(&cfqg->stats.queued, op, 1);
 678        cfqg_stats_end_empty_time(&cfqg->stats);
 679        cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
 680}
 681
 682static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 683                        uint64_t time, unsigned long unaccounted_time)
 684{
 685        blkg_stat_add(&cfqg->stats.time, time);
 686#ifdef CONFIG_DEBUG_BLK_CGROUP
 687        blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
 688#endif
 689}
 690
 691static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
 692                                               unsigned int op)
 693{
 694        blkg_rwstat_add(&cfqg->stats.queued, op, -1);
 695}
 696
 697static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
 698                                               unsigned int op)
 699{
 700        blkg_rwstat_add(&cfqg->stats.merged, op, 1);
 701}
 702
 703static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 704                                                u64 start_time_ns,
 705                                                u64 io_start_time_ns,
 706                                                unsigned int op)
 707{
 708        struct cfqg_stats *stats = &cfqg->stats;
 709        u64 now = ktime_get_ns();
 710
 711        if (now > io_start_time_ns)
 712                blkg_rwstat_add(&stats->service_time, op,
 713                                now - io_start_time_ns);
 714        if (io_start_time_ns > start_time_ns)
 715                blkg_rwstat_add(&stats->wait_time, op,
 716                                io_start_time_ns - start_time_ns);
 717}
 718
 719/* @stats = 0 */
 720static void cfqg_stats_reset(struct cfqg_stats *stats)
 721{
 722        /* queued stats shouldn't be cleared */
 723        blkg_rwstat_reset(&stats->merged);
 724        blkg_rwstat_reset(&stats->service_time);
 725        blkg_rwstat_reset(&stats->wait_time);
 726        blkg_stat_reset(&stats->time);
 727#ifdef CONFIG_DEBUG_BLK_CGROUP
 728        blkg_stat_reset(&stats->unaccounted_time);
 729        blkg_stat_reset(&stats->avg_queue_size_sum);
 730        blkg_stat_reset(&stats->avg_queue_size_samples);
 731        blkg_stat_reset(&stats->dequeue);
 732        blkg_stat_reset(&stats->group_wait_time);
 733        blkg_stat_reset(&stats->idle_time);
 734        blkg_stat_reset(&stats->empty_time);
 735#endif
 736}
 737
 738/* @to += @from */
 739static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
 740{
 741        /* queued stats shouldn't be cleared */
 742        blkg_rwstat_add_aux(&to->merged, &from->merged);
 743        blkg_rwstat_add_aux(&to->service_time, &from->service_time);
 744        blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
 745        blkg_stat_add_aux(&from->time, &from->time);
 746#ifdef CONFIG_DEBUG_BLK_CGROUP
 747        blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
 748        blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
 749        blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
 750        blkg_stat_add_aux(&to->dequeue, &from->dequeue);
 751        blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
 752        blkg_stat_add_aux(&to->idle_time, &from->idle_time);
 753        blkg_stat_add_aux(&to->empty_time, &from->empty_time);
 754#endif
 755}
 756
 757/*
 758 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
 759 * recursive stats can still account for the amount used by this cfqg after
 760 * it's gone.
 761 */
 762static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
 763{
 764        struct cfq_group *parent = cfqg_parent(cfqg);
 765
 766        lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
 767
 768        if (unlikely(!parent))
 769                return;
 770
 771        cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
 772        cfqg_stats_reset(&cfqg->stats);
 773}
 774
 775#else   /* CONFIG_CFQ_GROUP_IOSCHED */
 776
 777static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
 778static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
 779                                      struct cfq_group *ancestor)
 780{
 781        return true;
 782}
 783static inline void cfqg_get(struct cfq_group *cfqg) { }
 784static inline void cfqg_put(struct cfq_group *cfqg) { }
 785
 786#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
 787        blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
 788                        cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
 789                        cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
 790                                ##args)
 791#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)          do {} while (0)
 792
 793static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 794                        struct cfq_group *curr_cfqg, unsigned int op) { }
 795static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 796                        uint64_t time, unsigned long unaccounted_time) { }
 797static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
 798                        unsigned int op) { }
 799static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
 800                        unsigned int op) { }
 801static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 802                                                u64 start_time_ns,
 803                                                u64 io_start_time_ns,
 804                                                unsigned int op) { }
 805
 806#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 807
 808#define cfq_log(cfqd, fmt, args...)     \
 809        blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 810
 811/* Traverses through cfq group service trees */
 812#define for_each_cfqg_st(cfqg, i, j, st) \
 813        for (i = 0; i <= IDLE_WORKLOAD; i++) \
 814                for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
 815                        : &cfqg->service_tree_idle; \
 816                        (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
 817                        (i == IDLE_WORKLOAD && j == 0); \
 818                        j++, st = i < IDLE_WORKLOAD ? \
 819                        &cfqg->service_trees[i][j]: NULL) \
 820
 821static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
 822        struct cfq_ttime *ttime, bool group_idle)
 823{
 824        u64 slice;
 825        if (!sample_valid(ttime->ttime_samples))
 826                return false;
 827        if (group_idle)
 828                slice = cfqd->cfq_group_idle;
 829        else
 830                slice = cfqd->cfq_slice_idle;
 831        return ttime->ttime_mean > slice;
 832}
 833
 834static inline bool iops_mode(struct cfq_data *cfqd)
 835{
 836        /*
 837         * If we are not idling on queues and it is a NCQ drive, parallel
 838         * execution of requests is on and measuring time is not possible
 839         * in most of the cases until and unless we drive shallower queue
 840         * depths and that becomes a performance bottleneck. In such cases
 841         * switch to start providing fairness in terms of number of IOs.
 842         */
 843        if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
 844                return true;
 845        else
 846                return false;
 847}
 848
 849static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
 850{
 851        if (cfq_class_idle(cfqq))
 852                return IDLE_WORKLOAD;
 853        if (cfq_class_rt(cfqq))
 854                return RT_WORKLOAD;
 855        return BE_WORKLOAD;
 856}
 857
 858
 859static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
 860{
 861        if (!cfq_cfqq_sync(cfqq))
 862                return ASYNC_WORKLOAD;
 863        if (!cfq_cfqq_idle_window(cfqq))
 864                return SYNC_NOIDLE_WORKLOAD;
 865        return SYNC_WORKLOAD;
 866}
 867
 868static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
 869                                        struct cfq_data *cfqd,
 870                                        struct cfq_group *cfqg)
 871{
 872        if (wl_class == IDLE_WORKLOAD)
 873                return cfqg->service_tree_idle.count;
 874
 875        return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
 876                cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
 877                cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
 878}
 879
 880static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 881                                        struct cfq_group *cfqg)
 882{
 883        return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
 884                cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
 885}
 886
 887static void cfq_dispatch_insert(struct request_queue *, struct request *);
 888static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
 889                                       struct cfq_io_cq *cic, struct bio *bio);
 890
 891static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 892{
 893        /* cic->icq is the first member, %NULL will convert to %NULL */
 894        return container_of(icq, struct cfq_io_cq, icq);
 895}
 896
 897static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
 898                                               struct io_context *ioc)
 899{
 900        if (ioc)
 901                return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
 902        return NULL;
 903}
 904
 905static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
 906{
 907        return cic->cfqq[is_sync];
 908}
 909
 910static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
 911                                bool is_sync)
 912{
 913        cic->cfqq[is_sync] = cfqq;
 914}
 915
 916static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
 917{
 918        return cic->icq.q->elevator->elevator_data;
 919}
 920
 921/*
 922 * scheduler run of queue, if there are requests pending and no one in the
 923 * driver that will restart queueing
 924 */
 925static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 926{
 927        if (cfqd->busy_queues) {
 928                cfq_log(cfqd, "schedule dispatch");
 929                kblockd_schedule_work(&cfqd->unplug_work);
 930        }
 931}
 932
 933/*
 934 * Scale schedule slice based on io priority. Use the sync time slice only
 935 * if a queue is marked sync and has sync io queued. A sync queue with async
 936 * io only, should not get full sync slice length.
 937 */
 938static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
 939                                 unsigned short prio)
 940{
 941        u64 base_slice = cfqd->cfq_slice[sync];
 942        u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
 943
 944        WARN_ON(prio >= IOPRIO_BE_NR);
 945
 946        return base_slice + (slice * (4 - prio));
 947}
 948
 949static inline u64
 950cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 951{
 952        return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 953}
 954
 955/**
 956 * cfqg_scale_charge - scale disk time charge according to cfqg weight
 957 * @charge: disk time being charged
 958 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
 959 *
 960 * Scale @charge according to @vfraction, which is in range (0, 1].  The
 961 * scaling is inversely proportional.
 962 *
 963 * scaled = charge / vfraction
 964 *
 965 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
 966 */
 967static inline u64 cfqg_scale_charge(u64 charge,
 968                                    unsigned int vfraction)
 969{
 970        u64 c = charge << CFQ_SERVICE_SHIFT;    /* make it fixed point */
 971
 972        /* charge / vfraction */
 973        c <<= CFQ_SERVICE_SHIFT;
 974        return div_u64(c, vfraction);
 975}
 976
 977static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
 978{
 979        s64 delta = (s64)(vdisktime - min_vdisktime);
 980        if (delta > 0)
 981                min_vdisktime = vdisktime;
 982
 983        return min_vdisktime;
 984}
 985
 986static void update_min_vdisktime(struct cfq_rb_root *st)
 987{
 988        if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
 989                struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
 990
 991                st->min_vdisktime = max_vdisktime(st->min_vdisktime,
 992                                                  cfqg->vdisktime);
 993        }
 994}
 995
 996/*
 997 * get averaged number of queues of RT/BE priority.
 998 * average is updated, with a formula that gives more weight to higher numbers,
 999 * to quickly follows sudden increases and decrease slowly
1000 */
1001
1002static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
1003                                        struct cfq_group *cfqg, bool rt)
1004{
1005        unsigned min_q, max_q;
1006        unsigned mult  = cfq_hist_divisor - 1;
1007        unsigned round = cfq_hist_divisor / 2;
1008        unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
1009
1010        min_q = min(cfqg->busy_queues_avg[rt], busy);
1011        max_q = max(cfqg->busy_queues_avg[rt], busy);
1012        cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
1013                cfq_hist_divisor;
1014        return cfqg->busy_queues_avg[rt];
1015}
1016
1017static inline u64
1018cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
1019{
1020        return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
1021}
1022
1023static inline u64
1024cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1025{
1026        u64 slice = cfq_prio_to_slice(cfqd, cfqq);
1027        if (cfqd->cfq_latency) {
1028                /*
1029                 * interested queues (we consider only the ones with the same
1030                 * priority class in the cfq group)
1031                 */
1032                unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
1033                                                cfq_class_rt(cfqq));
1034                u64 sync_slice = cfqd->cfq_slice[1];
1035                u64 expect_latency = sync_slice * iq;
1036                u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1037
1038                if (expect_latency > group_slice) {
1039                        u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
1040                        u64 low_slice;
1041
1042                        /* scale low_slice according to IO priority
1043                         * and sync vs async */
1044                        low_slice = div64_u64(base_low_slice*slice, sync_slice);
1045                        low_slice = min(slice, low_slice);
1046                        /* the adapted slice value is scaled to fit all iqs
1047                         * into the target latency */
1048                        slice = div64_u64(slice*group_slice, expect_latency);
1049                        slice = max(slice, low_slice);
1050                }
1051        }
1052        return slice;
1053}
1054
1055static inline void
1056cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1057{
1058        u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1059        u64 now = ktime_get_ns();
1060
1061        cfqq->slice_start = now;
1062        cfqq->slice_end = now + slice;
1063        cfqq->allocated_slice = slice;
1064        cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
1065}
1066
1067/*
1068 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1069 * isn't valid until the first request from the dispatch is activated
1070 * and the slice time set.
1071 */
1072static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1073{
1074        if (cfq_cfqq_slice_new(cfqq))
1075                return false;
1076        if (ktime_get_ns() < cfqq->slice_end)
1077                return false;
1078
1079        return true;
1080}
1081
1082/*
1083 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1084 * We choose the request that is closest to the head right now. Distance
1085 * behind the head is penalized and only allowed to a certain extent.
1086 */
1087static struct request *
1088cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1089{
1090        sector_t s1, s2, d1 = 0, d2 = 0;
1091        unsigned long back_max;
1092#define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
1093#define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
1094        unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1095
1096        if (rq1 == NULL || rq1 == rq2)
1097                return rq2;
1098        if (rq2 == NULL)
1099                return rq1;
1100
1101        if (rq_is_sync(rq1) != rq_is_sync(rq2))
1102                return rq_is_sync(rq1) ? rq1 : rq2;
1103
1104        if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1105                return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1106
1107        s1 = blk_rq_pos(rq1);
1108        s2 = blk_rq_pos(rq2);
1109
1110        /*
1111         * by definition, 1KiB is 2 sectors
1112         */
1113        back_max = cfqd->cfq_back_max * 2;
1114
1115        /*
1116         * Strict one way elevator _except_ in the case where we allow
1117         * short backward seeks which are biased as twice the cost of a
1118         * similar forward seek.
1119         */
1120        if (s1 >= last)
1121                d1 = s1 - last;
1122        else if (s1 + back_max >= last)
1123                d1 = (last - s1) * cfqd->cfq_back_penalty;
1124        else
1125                wrap |= CFQ_RQ1_WRAP;
1126
1127        if (s2 >= last)
1128                d2 = s2 - last;
1129        else if (s2 + back_max >= last)
1130                d2 = (last - s2) * cfqd->cfq_back_penalty;
1131        else
1132                wrap |= CFQ_RQ2_WRAP;
1133
1134        /* Found required data */
1135
1136        /*
1137         * By doing switch() on the bit mask "wrap" we avoid having to
1138         * check two variables for all permutations: --> faster!
1139         */
1140        switch (wrap) {
1141        case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1142                if (d1 < d2)
1143                        return rq1;
1144                else if (d2 < d1)
1145                        return rq2;
1146                else {
1147                        if (s1 >= s2)
1148                                return rq1;
1149                        else
1150                                return rq2;
1151                }
1152
1153        case CFQ_RQ2_WRAP:
1154                return rq1;
1155        case CFQ_RQ1_WRAP:
1156                return rq2;
1157        case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1158        default:
1159                /*
1160                 * Since both rqs are wrapped,
1161                 * start with the one that's further behind head
1162                 * (--> only *one* back seek required),
1163                 * since back seek takes more time than forward.
1164                 */
1165                if (s1 <= s2)
1166                        return rq1;
1167                else
1168                        return rq2;
1169        }
1170}
1171
1172static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1173{
1174        /* Service tree is empty */
1175        if (!root->count)
1176                return NULL;
1177
1178        return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
1179}
1180
1181static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1182{
1183        return rb_entry_cfqg(rb_first_cached(&root->rb));
1184}
1185
1186static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1187{
1188        if (root->rb_rightmost == n)
1189                root->rb_rightmost = rb_prev(n);
1190
1191        rb_erase_cached(n, &root->rb);
1192        RB_CLEAR_NODE(n);
1193
1194        --root->count;
1195}
1196
1197/*
1198 * would be nice to take fifo expire time into account as well
1199 */
1200static struct request *
1201cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1202                  struct request *last)
1203{
1204        struct rb_node *rbnext = rb_next(&last->rb_node);
1205        struct rb_node *rbprev = rb_prev(&last->rb_node);
1206        struct request *next = NULL, *prev = NULL;
1207
1208        BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1209
1210        if (rbprev)
1211                prev = rb_entry_rq(rbprev);
1212
1213        if (rbnext)
1214                next = rb_entry_rq(rbnext);
1215        else {
1216                rbnext = rb_first(&cfqq->sort_list);
1217                if (rbnext && rbnext != &last->rb_node)
1218                        next = rb_entry_rq(rbnext);
1219        }
1220
1221        return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1222}
1223
1224static u64 cfq_slice_offset(struct cfq_data *cfqd,
1225                            struct cfq_queue *cfqq)
1226{
1227        /*
1228         * just an approximation, should be ok.
1229         */
1230        return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1231                       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1232}
1233
1234static inline s64
1235cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1236{
1237        return cfqg->vdisktime - st->min_vdisktime;
1238}
1239
1240static void
1241__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1242{
1243        struct rb_node **node = &st->rb.rb_root.rb_node;
1244        struct rb_node *parent = NULL;
1245        struct cfq_group *__cfqg;
1246        s64 key = cfqg_key(st, cfqg);
1247        bool leftmost = true, rightmost = true;
1248
1249        while (*node != NULL) {
1250                parent = *node;
1251                __cfqg = rb_entry_cfqg(parent);
1252
1253                if (key < cfqg_key(st, __cfqg)) {
1254                        node = &parent->rb_left;
1255                        rightmost = false;
1256                } else {
1257                        node = &parent->rb_right;
1258                        leftmost = false;
1259                }
1260        }
1261
1262        if (rightmost)
1263                st->rb_rightmost = &cfqg->rb_node;
1264
1265        rb_link_node(&cfqg->rb_node, parent, node);
1266        rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
1267}
1268
1269/*
1270 * This has to be called only on activation of cfqg
1271 */
1272static void
1273cfq_update_group_weight(struct cfq_group *cfqg)
1274{
1275        if (cfqg->new_weight) {
1276                cfqg->weight = cfqg->new_weight;
1277                cfqg->new_weight = 0;
1278        }
1279}
1280
1281static void
1282cfq_update_group_leaf_weight(struct cfq_group *cfqg)
1283{
1284        BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1285
1286        if (cfqg->new_leaf_weight) {
1287                cfqg->leaf_weight = cfqg->new_leaf_weight;
1288                cfqg->new_leaf_weight = 0;
1289        }
1290}
1291
1292static void
1293cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1294{
1295        unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;      /* start with 1 */
1296        struct cfq_group *pos = cfqg;
1297        struct cfq_group *parent;
1298        bool propagate;
1299
1300        /* add to the service tree */
1301        BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1302
1303        /*
1304         * Update leaf_weight.  We cannot update weight at this point
1305         * because cfqg might already have been activated and is
1306         * contributing its current weight to the parent's child_weight.
1307         */
1308        cfq_update_group_leaf_weight(cfqg);
1309        __cfq_group_service_tree_add(st, cfqg);
1310
1311        /*
1312         * Activate @cfqg and calculate the portion of vfraction @cfqg is
1313         * entitled to.  vfraction is calculated by walking the tree
1314         * towards the root calculating the fraction it has at each level.
1315         * The compounded ratio is how much vfraction @cfqg owns.
1316         *
1317         * Start with the proportion tasks in this cfqg has against active
1318         * children cfqgs - its leaf_weight against children_weight.
1319         */
1320        propagate = !pos->nr_active++;
1321        pos->children_weight += pos->leaf_weight;
1322        vfr = vfr * pos->leaf_weight / pos->children_weight;
1323
1324        /*
1325         * Compound ->weight walking up the tree.  Both activation and
1326         * vfraction calculation are done in the same loop.  Propagation
1327         * stops once an already activated node is met.  vfraction
1328         * calculation should always continue to the root.
1329         */
1330        while ((parent = cfqg_parent(pos))) {
1331                if (propagate) {
1332                        cfq_update_group_weight(pos);
1333                        propagate = !parent->nr_active++;
1334                        parent->children_weight += pos->weight;
1335                }
1336                vfr = vfr * pos->weight / parent->children_weight;
1337                pos = parent;
1338        }
1339
1340        cfqg->vfraction = max_t(unsigned, vfr, 1);
1341}
1342
1343static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
1344{
1345        if (!iops_mode(cfqd))
1346                return CFQ_SLICE_MODE_GROUP_DELAY;
1347        else
1348                return CFQ_IOPS_MODE_GROUP_DELAY;
1349}
1350
1351static void
1352cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1353{
1354        struct cfq_rb_root *st = &cfqd->grp_service_tree;
1355        struct cfq_group *__cfqg;
1356        struct rb_node *n;
1357
1358        cfqg->nr_cfqq++;
1359        if (!RB_EMPTY_NODE(&cfqg->rb_node))
1360                return;
1361
1362        /*
1363         * Currently put the group at the end. Later implement something
1364         * so that groups get lesser vtime based on their weights, so that
1365         * if group does not loose all if it was not continuously backlogged.
1366         */
1367        n = st->rb_rightmost;
1368        if (n) {
1369                __cfqg = rb_entry_cfqg(n);
1370                cfqg->vdisktime = __cfqg->vdisktime +
1371                        cfq_get_cfqg_vdisktime_delay(cfqd);
1372        } else
1373                cfqg->vdisktime = st->min_vdisktime;
1374        cfq_group_service_tree_add(st, cfqg);
1375}
1376
1377static void
1378cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1379{
1380        struct cfq_group *pos = cfqg;
1381        bool propagate;
1382
1383        /*
1384         * Undo activation from cfq_group_service_tree_add().  Deactivate
1385         * @cfqg and propagate deactivation upwards.
1386         */
1387        propagate = !--pos->nr_active;
1388        pos->children_weight -= pos->leaf_weight;
1389
1390        while (propagate) {
1391                struct cfq_group *parent = cfqg_parent(pos);
1392
1393                /* @pos has 0 nr_active at this point */
1394                WARN_ON_ONCE(pos->children_weight);
1395                pos->vfraction = 0;
1396
1397                if (!parent)
1398                        break;
1399
1400                propagate = !--parent->nr_active;
1401                parent->children_weight -= pos->weight;
1402                pos = parent;
1403        }
1404
1405        /* remove from the service tree */
1406        if (!RB_EMPTY_NODE(&cfqg->rb_node))
1407                cfq_rb_erase(&cfqg->rb_node, st);
1408}
1409
1410static void
1411cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1412{
1413        struct cfq_rb_root *st = &cfqd->grp_service_tree;
1414
1415        BUG_ON(cfqg->nr_cfqq < 1);
1416        cfqg->nr_cfqq--;
1417
1418        /* If there are other cfq queues under this group, don't delete it */
1419        if (cfqg->nr_cfqq)
1420                return;
1421
1422        cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1423        cfq_group_service_tree_del(st, cfqg);
1424        cfqg->saved_wl_slice = 0;
1425        cfqg_stats_update_dequeue(cfqg);
1426}
1427
1428static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1429                                       u64 *unaccounted_time)
1430{
1431        u64 slice_used;
1432        u64 now = ktime_get_ns();
1433
1434        /*
1435         * Queue got expired before even a single request completed or
1436         * got expired immediately after first request completion.
1437         */
1438        if (!cfqq->slice_start || cfqq->slice_start == now) {
1439                /*
1440                 * Also charge the seek time incurred to the group, otherwise
1441                 * if there are mutiple queues in the group, each can dispatch
1442                 * a single request on seeky media and cause lots of seek time
1443                 * and group will never know it.
1444                 */
1445                slice_used = max_t(u64, (now - cfqq->dispatch_start),
1446                                        jiffies_to_nsecs(1));
1447        } else {
1448                slice_used = now - cfqq->slice_start;
1449                if (slice_used > cfqq->allocated_slice) {
1450                        *unaccounted_time = slice_used - cfqq->allocated_slice;
1451                        slice_used = cfqq->allocated_slice;
1452                }
1453                if (cfqq->slice_start > cfqq->dispatch_start)
1454                        *unaccounted_time += cfqq->slice_start -
1455                                        cfqq->dispatch_start;
1456        }
1457
1458        return slice_used;
1459}
1460
1461static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1462                                struct cfq_queue *cfqq)
1463{
1464        struct cfq_rb_root *st = &cfqd->grp_service_tree;
1465        u64 used_sl, charge, unaccounted_sl = 0;
1466        int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1467                        - cfqg->service_tree_idle.count;
1468        unsigned int vfr;
1469        u64 now = ktime_get_ns();
1470
1471        BUG_ON(nr_sync < 0);
1472        used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1473
1474        if (iops_mode(cfqd))
1475                charge = cfqq->slice_dispatch;
1476        else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1477                charge = cfqq->allocated_slice;
1478
1479        /*
1480         * Can't update vdisktime while on service tree and cfqg->vfraction
1481         * is valid only while on it.  Cache vfr, leave the service tree,
1482         * update vdisktime and go back on.  The re-addition to the tree
1483         * will also update the weights as necessary.
1484         */
1485        vfr = cfqg->vfraction;
1486        cfq_group_service_tree_del(st, cfqg);
1487        cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1488        cfq_group_service_tree_add(st, cfqg);
1489
1490        /* This group is being expired. Save the context */
1491        if (cfqd->workload_expires > now) {
1492                cfqg->saved_wl_slice = cfqd->workload_expires - now;
1493                cfqg->saved_wl_type = cfqd->serving_wl_type;
1494                cfqg->saved_wl_class = cfqd->serving_wl_class;
1495        } else
1496                cfqg->saved_wl_slice = 0;
1497
1498        cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1499                                        st->min_vdisktime);
1500        cfq_log_cfqq(cfqq->cfqd, cfqq,
1501                     "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
1502                     used_sl, cfqq->slice_dispatch, charge,
1503                     iops_mode(cfqd), cfqq->nr_sectors);
1504        cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1505        cfqg_stats_set_start_empty_time(cfqg);
1506}
1507
1508/**
1509 * cfq_init_cfqg_base - initialize base part of a cfq_group
1510 * @cfqg: cfq_group to initialize
1511 *
1512 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1513 * is enabled or not.
1514 */
1515static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1516{
1517        struct cfq_rb_root *st;
1518        int i, j;
1519
1520        for_each_cfqg_st(cfqg, i, j, st)
1521                *st = CFQ_RB_ROOT;
1522        RB_CLEAR_NODE(&cfqg->rb_node);
1523
1524        cfqg->ttime.last_end_request = ktime_get_ns();
1525}
1526
1527#ifdef CONFIG_CFQ_GROUP_IOSCHED
1528static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1529                            bool on_dfl, bool reset_dev, bool is_leaf_weight);
1530
1531static void cfqg_stats_exit(struct cfqg_stats *stats)
1532{
1533        blkg_rwstat_exit(&stats->merged);
1534        blkg_rwstat_exit(&stats->service_time);
1535        blkg_rwstat_exit(&stats->wait_time);
1536        blkg_rwstat_exit(&stats->queued);
1537        blkg_stat_exit(&stats->time);
1538#ifdef CONFIG_DEBUG_BLK_CGROUP
1539        blkg_stat_exit(&stats->unaccounted_time);
1540        blkg_stat_exit(&stats->avg_queue_size_sum);
1541        blkg_stat_exit(&stats->avg_queue_size_samples);
1542        blkg_stat_exit(&stats->dequeue);
1543        blkg_stat_exit(&stats->group_wait_time);
1544        blkg_stat_exit(&stats->idle_time);
1545        blkg_stat_exit(&stats->empty_time);
1546#endif
1547}
1548
1549static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
1550{
1551        if (blkg_rwstat_init(&stats->merged, gfp) ||
1552            blkg_rwstat_init(&stats->service_time, gfp) ||
1553            blkg_rwstat_init(&stats->wait_time, gfp) ||
1554            blkg_rwstat_init(&stats->queued, gfp) ||
1555            blkg_stat_init(&stats->time, gfp))
1556                goto err;
1557
1558#ifdef CONFIG_DEBUG_BLK_CGROUP
1559        if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
1560            blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
1561            blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
1562            blkg_stat_init(&stats->dequeue, gfp) ||
1563            blkg_stat_init(&stats->group_wait_time, gfp) ||
1564            blkg_stat_init(&stats->idle_time, gfp) ||
1565            blkg_stat_init(&stats->empty_time, gfp))
1566                goto err;
1567#endif
1568        return 0;
1569err:
1570        cfqg_stats_exit(stats);
1571        return -ENOMEM;
1572}
1573
1574static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1575{
1576        struct cfq_group_data *cgd;
1577
1578        cgd = kzalloc(sizeof(*cgd), gfp);
1579        if (!cgd)
1580                return NULL;
1581        return &cgd->cpd;
1582}
1583
1584static void cfq_cpd_init(struct blkcg_policy_data *cpd)
1585{
1586        struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
1587        unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
1588                              CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1589
1590        if (cpd_to_blkcg(cpd) == &blkcg_root)
1591                weight *= 2;
1592
1593        cgd->weight = weight;
1594        cgd->leaf_weight = weight;
1595}
1596
1597static void cfq_cpd_free(struct blkcg_policy_data *cpd)
1598{
1599        kfree(cpd_to_cfqgd(cpd));
1600}
1601
1602static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1603{
1604        struct blkcg *blkcg = cpd_to_blkcg(cpd);
1605        bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
1606        unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1607
1608        if (blkcg == &blkcg_root)
1609                weight *= 2;
1610
1611        WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
1612        WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
1613}
1614
1615static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
1616{
1617        struct cfq_group *cfqg;
1618
1619        cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
1620        if (!cfqg)
1621                return NULL;
1622
1623        cfq_init_cfqg_base(cfqg);
1624        if (cfqg_stats_init(&cfqg->stats, gfp)) {
1625                kfree(cfqg);
1626                return NULL;
1627        }
1628
1629        return &cfqg->pd;
1630}
1631
1632static void cfq_pd_init(struct blkg_policy_data *pd)
1633{
1634        struct cfq_group *cfqg = pd_to_cfqg(pd);
1635        struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
1636
1637        cfqg->weight = cgd->weight;
1638        cfqg->leaf_weight = cgd->leaf_weight;
1639}
1640
1641static void cfq_pd_offline(struct blkg_policy_data *pd)
1642{
1643        struct cfq_group *cfqg = pd_to_cfqg(pd);
1644        int i;
1645
1646        for (i = 0; i < IOPRIO_BE_NR; i++) {
1647                if (cfqg->async_cfqq[0][i])
1648                        cfq_put_queue(cfqg->async_cfqq[0][i]);
1649                if (cfqg->async_cfqq[1][i])
1650                        cfq_put_queue(cfqg->async_cfqq[1][i]);
1651        }
1652
1653        if (cfqg->async_idle_cfqq)
1654                cfq_put_queue(cfqg->async_idle_cfqq);
1655
1656        /*
1657         * @blkg is going offline and will be ignored by
1658         * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
1659         * that they don't get lost.  If IOs complete after this point, the
1660         * stats for them will be lost.  Oh well...
1661         */
1662        cfqg_stats_xfer_dead(cfqg);
1663}
1664
1665static void cfq_pd_free(struct blkg_policy_data *pd)
1666{
1667        struct cfq_group *cfqg = pd_to_cfqg(pd);
1668
1669        cfqg_stats_exit(&cfqg->stats);
1670        return kfree(cfqg);
1671}
1672
1673static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
1674{
1675        struct cfq_group *cfqg = pd_to_cfqg(pd);
1676
1677        cfqg_stats_reset(&cfqg->stats);
1678}
1679
1680static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
1681                                         struct blkcg *blkcg)
1682{
1683        struct blkcg_gq *blkg;
1684
1685        blkg = blkg_lookup(blkcg, cfqd->queue);
1686        if (likely(blkg))
1687                return blkg_to_cfqg(blkg);
1688        return NULL;
1689}
1690
1691static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1692{
1693        cfqq->cfqg = cfqg;
1694        /* cfqq reference on cfqg */
1695        cfqg_get(cfqg);
1696}
1697
1698static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1699                                     struct blkg_policy_data *pd, int off)
1700{
1701        struct cfq_group *cfqg = pd_to_cfqg(pd);
1702
1703        if (!cfqg->dev_weight)
1704                return 0;
1705        return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1706}
1707
1708static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1709{
1710        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1711                          cfqg_prfill_weight_device, &blkcg_policy_cfq,
1712                          0, false);
1713        return 0;
1714}
1715
1716static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1717                                          struct blkg_policy_data *pd, int off)
1718{
1719        struct cfq_group *cfqg = pd_to_cfqg(pd);
1720
1721        if (!cfqg->dev_leaf_weight)
1722                return 0;
1723        return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1724}
1725
1726static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1727{
1728        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1729                          cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1730                          0, false);
1731        return 0;
1732}
1733
1734static int cfq_print_weight(struct seq_file *sf, void *v)
1735{
1736        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1737        struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1738        unsigned int val = 0;
1739
1740        if (cgd)
1741                val = cgd->weight;
1742
1743        seq_printf(sf, "%u\n", val);
1744        return 0;
1745}
1746
1747static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1748{
1749        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1750        struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1751        unsigned int val = 0;
1752
1753        if (cgd)
1754                val = cgd->leaf_weight;
1755
1756        seq_printf(sf, "%u\n", val);
1757        return 0;
1758}
1759
1760static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1761                                        char *buf, size_t nbytes, loff_t off,
1762                                        bool on_dfl, bool is_leaf_weight)
1763{
1764        unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1765        unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1766        struct blkcg *blkcg = css_to_blkcg(of_css(of));
1767        struct blkg_conf_ctx ctx;
1768        struct cfq_group *cfqg;
1769        struct cfq_group_data *cfqgd;
1770        int ret;
1771        u64 v;
1772
1773        ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1774        if (ret)
1775                return ret;
1776
1777        if (sscanf(ctx.body, "%llu", &v) == 1) {
1778                /* require "default" on dfl */
1779                ret = -ERANGE;
1780                if (!v && on_dfl)
1781                        goto out_finish;
1782        } else if (!strcmp(strim(ctx.body), "default")) {
1783                v = 0;
1784        } else {
1785                ret = -EINVAL;
1786                goto out_finish;
1787        }
1788
1789        cfqg = blkg_to_cfqg(ctx.blkg);
1790        cfqgd = blkcg_to_cfqgd(blkcg);
1791
1792        ret = -ERANGE;
1793        if (!v || (v >= min && v <= max)) {
1794                if (!is_leaf_weight) {
1795                        cfqg->dev_weight = v;
1796                        cfqg->new_weight = v ?: cfqgd->weight;
1797                } else {
1798                        cfqg->dev_leaf_weight = v;
1799                        cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
1800                }
1801                ret = 0;
1802        }
1803out_finish:
1804        blkg_conf_finish(&ctx);
1805        return ret ?: nbytes;
1806}
1807
1808static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1809                                      char *buf, size_t nbytes, loff_t off)
1810{
1811        return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
1812}
1813
1814static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1815                                           char *buf, size_t nbytes, loff_t off)
1816{
1817        return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
1818}
1819
1820static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1821                            bool on_dfl, bool reset_dev, bool is_leaf_weight)
1822{
1823        unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1824        unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1825        struct blkcg *blkcg = css_to_blkcg(css);
1826        struct blkcg_gq *blkg;
1827        struct cfq_group_data *cfqgd;
1828        int ret = 0;
1829
1830        if (val < min || val > max)
1831                return -ERANGE;
1832
1833        spin_lock_irq(&blkcg->lock);
1834        cfqgd = blkcg_to_cfqgd(blkcg);
1835        if (!cfqgd) {
1836                ret = -EINVAL;
1837                goto out;
1838        }
1839
1840        if (!is_leaf_weight)
1841                cfqgd->weight = val;
1842        else
1843                cfqgd->leaf_weight = val;
1844
1845        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1846                struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1847
1848                if (!cfqg)
1849                        continue;
1850
1851                if (!is_leaf_weight) {
1852                        if (reset_dev)
1853                                cfqg->dev_weight = 0;
1854                        if (!cfqg->dev_weight)
1855                                cfqg->new_weight = cfqgd->weight;
1856                } else {
1857                        if (reset_dev)
1858                                cfqg->dev_leaf_weight = 0;
1859                        if (!cfqg->dev_leaf_weight)
1860                                cfqg->new_leaf_weight = cfqgd->leaf_weight;
1861                }
1862        }
1863
1864out:
1865        spin_unlock_irq(&blkcg->lock);
1866        return ret;
1867}
1868
1869static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1870                          u64 val)
1871{
1872        return __cfq_set_weight(css, val, false, false, false);
1873}
1874
1875static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1876                               struct cftype *cft, u64 val)
1877{
1878        return __cfq_set_weight(css, val, false, false, true);
1879}
1880
1881static int cfqg_print_stat(struct seq_file *sf, void *v)
1882{
1883        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1884                          &blkcg_policy_cfq, seq_cft(sf)->private, false);
1885        return 0;
1886}
1887
1888static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1889{
1890        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1891                          &blkcg_policy_cfq, seq_cft(sf)->private, true);
1892        return 0;
1893}
1894
1895static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1896                                      struct blkg_policy_data *pd, int off)
1897{
1898        u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
1899                                          &blkcg_policy_cfq, off);
1900        return __blkg_prfill_u64(sf, pd, sum);
1901}
1902
1903static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1904                                        struct blkg_policy_data *pd, int off)
1905{
1906        struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
1907                                                        &blkcg_policy_cfq, off);
1908        return __blkg_prfill_rwstat(sf, pd, &sum);
1909}
1910
1911static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1912{
1913        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1914                          cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1915                          seq_cft(sf)->private, false);
1916        return 0;
1917}
1918
1919static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1920{
1921        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1922                          cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1923                          seq_cft(sf)->private, true);
1924        return 0;
1925}
1926
1927static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
1928                               int off)
1929{
1930        u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
1931
1932        return __blkg_prfill_u64(sf, pd, sum >> 9);
1933}
1934
1935static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
1936{
1937        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1938                          cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
1939        return 0;
1940}
1941
1942static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
1943                                         struct blkg_policy_data *pd, int off)
1944{
1945        struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
1946                                        offsetof(struct blkcg_gq, stat_bytes));
1947        u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
1948                atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
1949
1950        return __blkg_prfill_u64(sf, pd, sum >> 9);
1951}
1952
1953static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
1954{
1955        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1956                          cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
1957                          false);
1958        return 0;
1959}
1960
1961#ifdef CONFIG_DEBUG_BLK_CGROUP
1962static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1963                                      struct blkg_policy_data *pd, int off)
1964{
1965        struct cfq_group *cfqg = pd_to_cfqg(pd);
1966        u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1967        u64 v = 0;
1968
1969        if (samples) {
1970                v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1971                v = div64_u64(v, samples);
1972        }
1973        __blkg_prfill_u64(sf, pd, v);
1974        return 0;
1975}
1976
1977/* print avg_queue_size */
1978static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1979{
1980        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1981                          cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1982                          0, false);
1983        return 0;
1984}
1985#endif  /* CONFIG_DEBUG_BLK_CGROUP */
1986
1987static struct cftype cfq_blkcg_legacy_files[] = {
1988        /* on root, weight is mapped to leaf_weight */
1989        {
1990                .name = "weight_device",
1991                .flags = CFTYPE_ONLY_ON_ROOT,
1992                .seq_show = cfqg_print_leaf_weight_device,
1993                .write = cfqg_set_leaf_weight_device,
1994        },
1995        {
1996                .name = "weight",
1997                .flags = CFTYPE_ONLY_ON_ROOT,
1998                .seq_show = cfq_print_leaf_weight,
1999                .write_u64 = cfq_set_leaf_weight,
2000        },
2001
2002        /* no such mapping necessary for !roots */
2003        {
2004                .name = "weight_device",
2005                .flags = CFTYPE_NOT_ON_ROOT,
2006                .seq_show = cfqg_print_weight_device,
2007                .write = cfqg_set_weight_device,
2008        },
2009        {
2010                .name = "weight",
2011                .flags = CFTYPE_NOT_ON_ROOT,
2012                .seq_show = cfq_print_weight,
2013                .write_u64 = cfq_set_weight,
2014        },
2015
2016        {
2017                .name = "leaf_weight_device",
2018                .seq_show = cfqg_print_leaf_weight_device,
2019                .write = cfqg_set_leaf_weight_device,
2020        },
2021        {
2022                .name = "leaf_weight",
2023                .seq_show = cfq_print_leaf_weight,
2024                .write_u64 = cfq_set_leaf_weight,
2025        },
2026
2027        /* statistics, covers only the tasks in the cfqg */
2028        {
2029                .name = "time",
2030                .private = offsetof(struct cfq_group, stats.time),
2031                .seq_show = cfqg_print_stat,
2032        },
2033        {
2034                .name = "sectors",
2035                .seq_show = cfqg_print_stat_sectors,
2036        },
2037        {
2038                .name = "io_service_bytes",
2039                .private = (unsigned long)&blkcg_policy_cfq,
2040                .seq_show = blkg_print_stat_bytes,
2041        },
2042        {
2043                .name = "io_serviced",
2044                .private = (unsigned long)&blkcg_policy_cfq,
2045                .seq_show = blkg_print_stat_ios,
2046        },
2047        {
2048                .name = "io_service_time",
2049                .private = offsetof(struct cfq_group, stats.service_time),
2050                .seq_show = cfqg_print_rwstat,
2051        },
2052        {
2053                .name = "io_wait_time",
2054                .private = offsetof(struct cfq_group, stats.wait_time),
2055                .seq_show = cfqg_print_rwstat,
2056        },
2057        {
2058                .name = "io_merged",
2059                .private = offsetof(struct cfq_group, stats.merged),
2060                .seq_show = cfqg_print_rwstat,
2061        },
2062        {
2063                .name = "io_queued",
2064                .private = offsetof(struct cfq_group, stats.queued),
2065                .seq_show = cfqg_print_rwstat,
2066        },
2067
2068        /* the same statictics which cover the cfqg and its descendants */
2069        {
2070                .name = "time_recursive",
2071                .private = offsetof(struct cfq_group, stats.time),
2072                .seq_show = cfqg_print_stat_recursive,
2073        },
2074        {
2075                .name = "sectors_recursive",
2076                .seq_show = cfqg_print_stat_sectors_recursive,
2077        },
2078        {
2079                .name = "io_service_bytes_recursive",
2080                .private = (unsigned long)&blkcg_policy_cfq,
2081                .seq_show = blkg_print_stat_bytes_recursive,
2082        },
2083        {
2084                .name = "io_serviced_recursive",
2085                .private = (unsigned long)&blkcg_policy_cfq,
2086                .seq_show = blkg_print_stat_ios_recursive,
2087        },
2088        {
2089                .name = "io_service_time_recursive",
2090                .private = offsetof(struct cfq_group, stats.service_time),
2091                .seq_show = cfqg_print_rwstat_recursive,
2092        },
2093        {
2094                .name = "io_wait_time_recursive",
2095                .private = offsetof(struct cfq_group, stats.wait_time),
2096                .seq_show = cfqg_print_rwstat_recursive,
2097        },
2098        {
2099                .name = "io_merged_recursive",
2100                .private = offsetof(struct cfq_group, stats.merged),
2101                .seq_show = cfqg_print_rwstat_recursive,
2102        },
2103        {
2104                .name = "io_queued_recursive",
2105                .private = offsetof(struct cfq_group, stats.queued),
2106                .seq_show = cfqg_print_rwstat_recursive,
2107        },
2108#ifdef CONFIG_DEBUG_BLK_CGROUP
2109        {
2110                .name = "avg_queue_size",
2111                .seq_show = cfqg_print_avg_queue_size,
2112        },
2113        {
2114                .name = "group_wait_time",
2115                .private = offsetof(struct cfq_group, stats.group_wait_time),
2116                .seq_show = cfqg_print_stat,
2117        },
2118        {
2119                .name = "idle_time",
2120                .private = offsetof(struct cfq_group, stats.idle_time),
2121                .seq_show = cfqg_print_stat,
2122        },
2123        {
2124                .name = "empty_time",
2125                .private = offsetof(struct cfq_group, stats.empty_time),
2126                .seq_show = cfqg_print_stat,
2127        },
2128        {
2129                .name = "dequeue",
2130                .private = offsetof(struct cfq_group, stats.dequeue),
2131                .seq_show = cfqg_print_stat,
2132        },
2133        {
2134                .name = "unaccounted_time",
2135                .private = offsetof(struct cfq_group, stats.unaccounted_time),
2136                .seq_show = cfqg_print_stat,
2137        },
2138#endif  /* CONFIG_DEBUG_BLK_CGROUP */
2139        { }     /* terminate */
2140};
2141
2142static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
2143{
2144        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2145        struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
2146
2147        seq_printf(sf, "default %u\n", cgd->weight);
2148        blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
2149                          &blkcg_policy_cfq, 0, false);
2150        return 0;
2151}
2152
2153static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
2154                                     char *buf, size_t nbytes, loff_t off)
2155{
2156        char *endp;
2157        int ret;
2158        u64 v;
2159
2160        buf = strim(buf);
2161
2162        /* "WEIGHT" or "default WEIGHT" sets the default weight */
2163        v = simple_strtoull(buf, &endp, 0);
2164        if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
2165                ret = __cfq_set_weight(of_css(of), v, true, false, false);
2166                return ret ?: nbytes;
2167        }
2168
2169        /* "MAJ:MIN WEIGHT" */
2170        return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
2171}
2172
2173static struct cftype cfq_blkcg_files[] = {
2174        {
2175                .name = "weight",
2176                .flags = CFTYPE_NOT_ON_ROOT,
2177                .seq_show = cfq_print_weight_on_dfl,
2178                .write = cfq_set_weight_on_dfl,
2179        },
2180        { }     /* terminate */
2181};
2182
2183#else /* GROUP_IOSCHED */
2184static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
2185                                         struct blkcg *blkcg)
2186{
2187        return cfqd->root_group;
2188}
2189
2190static inline void
2191cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
2192        cfqq->cfqg = cfqg;
2193}
2194
2195#endif /* GROUP_IOSCHED */
2196
2197/*
2198 * The cfqd->service_trees holds all pending cfq_queue's that have
2199 * requests waiting to be processed. It is sorted in the order that
2200 * we will service the queues.
2201 */
2202static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2203                                 bool add_front)
2204{
2205        struct rb_node **p, *parent;
2206        struct cfq_queue *__cfqq;
2207        u64 rb_key;
2208        struct cfq_rb_root *st;
2209        bool leftmost = true;
2210        int new_cfqq = 1;
2211        u64 now = ktime_get_ns();
2212
2213        st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
2214        if (cfq_class_idle(cfqq)) {
2215                rb_key = CFQ_IDLE_DELAY;
2216                parent = st->rb_rightmost;
2217                if (parent && parent != &cfqq->rb_node) {
2218                        __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2219                        rb_key += __cfqq->rb_key;
2220                } else
2221                        rb_key += now;
2222        } else if (!add_front) {
2223                /*
2224                 * Get our rb key offset. Subtract any residual slice
2225                 * value carried from last service. A negative resid
2226                 * count indicates slice overrun, and this should position
2227                 * the next service time further away in the tree.
2228                 */
2229                rb_key = cfq_slice_offset(cfqd, cfqq) + now;
2230                rb_key -= cfqq->slice_resid;
2231                cfqq->slice_resid = 0;
2232        } else {
2233                rb_key = -NSEC_PER_SEC;
2234                __cfqq = cfq_rb_first(st);
2235                rb_key += __cfqq ? __cfqq->rb_key : now;
2236        }
2237
2238        if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2239                new_cfqq = 0;
2240                /*
2241                 * same position, nothing more to do
2242                 */
2243                if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
2244                        return;
2245
2246                cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2247                cfqq->service_tree = NULL;
2248        }
2249
2250        parent = NULL;
2251        cfqq->service_tree = st;
2252        p = &st->rb.rb_root.rb_node;
2253        while (*p) {
2254                parent = *p;
2255                __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2256
2257                /*
2258                 * sort by key, that represents service time.
2259                 */
2260                if (rb_key < __cfqq->rb_key)
2261                        p = &parent->rb_left;
2262                else {
2263                        p = &parent->rb_right;
2264                        leftmost = false;
2265                }
2266        }
2267
2268        cfqq->rb_key = rb_key;
2269        rb_link_node(&cfqq->rb_node, parent, p);
2270        rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
2271        st->count++;
2272        if (add_front || !new_cfqq)
2273                return;
2274        cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
2275}
2276
2277static struct cfq_queue *
2278cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
2279                     sector_t sector, struct rb_node **ret_parent,
2280                     struct rb_node ***rb_link)
2281{
2282        struct rb_node **p, *parent;
2283        struct cfq_queue *cfqq = NULL;
2284
2285        parent = NULL;
2286        p = &root->rb_node;
2287        while (*p) {
2288                struct rb_node **n;
2289
2290                parent = *p;
2291                cfqq = rb_entry(parent, struct cfq_queue, p_node);
2292
2293                /*
2294                 * Sort strictly based on sector.  Smallest to the left,
2295                 * largest to the right.
2296                 */
2297                if (sector > blk_rq_pos(cfqq->next_rq))
2298                        n = &(*p)->rb_right;
2299                else if (sector < blk_rq_pos(cfqq->next_rq))
2300                        n = &(*p)->rb_left;
2301                else
2302                        break;
2303                p = n;
2304                cfqq = NULL;
2305        }
2306
2307        *ret_parent = parent;
2308        if (rb_link)
2309                *rb_link = p;
2310        return cfqq;
2311}
2312
2313static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2314{
2315        struct rb_node **p, *parent;
2316        struct cfq_queue *__cfqq;
2317
2318        if (cfqq->p_root) {
2319                rb_erase(&cfqq->p_node, cfqq->p_root);
2320                cfqq->p_root = NULL;
2321        }
2322
2323        if (cfq_class_idle(cfqq))
2324                return;
2325        if (!cfqq->next_rq)
2326                return;
2327
2328        cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
2329        __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
2330                                      blk_rq_pos(cfqq->next_rq), &parent, &p);
2331        if (!__cfqq) {
2332                rb_link_node(&cfqq->p_node, parent, p);
2333                rb_insert_color(&cfqq->p_node, cfqq->p_root);
2334        } else
2335                cfqq->p_root = NULL;
2336}
2337
2338/*
2339 * Update cfqq's position in the service tree.
2340 */
2341static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2342{
2343        /*
2344         * Resorting requires the cfqq to be on the RR list already.
2345         */
2346        if (cfq_cfqq_on_rr(cfqq)) {
2347                cfq_service_tree_add(cfqd, cfqq, 0);
2348                cfq_prio_tree_add(cfqd, cfqq);
2349        }
2350}
2351
2352/*
2353 * add to busy list of queues for service, trying to be fair in ordering
2354 * the pending list according to last request service
2355 */
2356static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2357{
2358        cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2359        BUG_ON(cfq_cfqq_on_rr(cfqq));
2360        cfq_mark_cfqq_on_rr(cfqq);
2361        cfqd->busy_queues++;
2362        if (cfq_cfqq_sync(cfqq))
2363                cfqd->busy_sync_queues++;
2364
2365        cfq_resort_rr_list(cfqd, cfqq);
2366}
2367
2368/*
2369 * Called when the cfqq no longer has requests pending, remove it from
2370 * the service tree.
2371 */
2372static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2373{
2374        cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2375        BUG_ON(!cfq_cfqq_on_rr(cfqq));
2376        cfq_clear_cfqq_on_rr(cfqq);
2377
2378        if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2379                cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2380                cfqq->service_tree = NULL;
2381        }
2382        if (cfqq->p_root) {
2383                rb_erase(&cfqq->p_node, cfqq->p_root);
2384                cfqq->p_root = NULL;
2385        }
2386
2387        cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2388        BUG_ON(!cfqd->busy_queues);
2389        cfqd->busy_queues--;
2390        if (cfq_cfqq_sync(cfqq))
2391                cfqd->busy_sync_queues--;
2392}
2393
2394/*
2395 * rb tree support functions
2396 */
2397static void cfq_del_rq_rb(struct request *rq)
2398{
2399        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2400        const int sync = rq_is_sync(rq);
2401
2402        BUG_ON(!cfqq->queued[sync]);
2403        cfqq->queued[sync]--;
2404
2405        elv_rb_del(&cfqq->sort_list, rq);
2406
2407        if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2408                /*
2409                 * Queue will be deleted from service tree when we actually
2410                 * expire it later. Right now just remove it from prio tree
2411                 * as it is empty.
2412                 */
2413                if (cfqq->p_root) {
2414                        rb_erase(&cfqq->p_node, cfqq->p_root);
2415                        cfqq->p_root = NULL;
2416                }
2417        }
2418}
2419
2420static void cfq_add_rq_rb(struct request *rq)
2421{
2422        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2423        struct cfq_data *cfqd = cfqq->cfqd;
2424        struct request *prev;
2425
2426        cfqq->queued[rq_is_sync(rq)]++;
2427
2428        elv_rb_add(&cfqq->sort_list, rq);
2429
2430        if (!cfq_cfqq_on_rr(cfqq))
2431                cfq_add_cfqq_rr(cfqd, cfqq);
2432
2433        /*
2434         * check if this request is a better next-serve candidate
2435         */
2436        prev = cfqq->next_rq;
2437        cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2438
2439        /*
2440         * adjust priority tree position, if ->next_rq changes
2441         */
2442        if (prev != cfqq->next_rq)
2443                cfq_prio_tree_add(cfqd, cfqq);
2444
2445        BUG_ON(!cfqq->next_rq);
2446}
2447
2448static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2449{
2450        elv_rb_del(&cfqq->sort_list, rq);
2451        cfqq->queued[rq_is_sync(rq)]--;
2452        cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2453        cfq_add_rq_rb(rq);
2454        cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2455                                 rq->cmd_flags);
2456}
2457
2458static struct request *
2459cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2460{
2461        struct task_struct *tsk = current;
2462        struct cfq_io_cq *cic;
2463        struct cfq_queue *cfqq;
2464
2465        cic = cfq_cic_lookup(cfqd, tsk->io_context);
2466        if (!cic)
2467                return NULL;
2468
2469        cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
2470        if (cfqq)
2471                return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
2472
2473        return NULL;
2474}
2475
2476static void cfq_activate_request(struct request_queue *q, struct request *rq)
2477{
2478        struct cfq_data *cfqd = q->elevator->elevator_data;
2479
2480        cfqd->rq_in_driver++;
2481        cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2482                                                cfqd->rq_in_driver);
2483
2484        cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2485}
2486
2487static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2488{
2489        struct cfq_data *cfqd = q->elevator->elevator_data;
2490
2491        WARN_ON(!cfqd->rq_in_driver);
2492        cfqd->rq_in_driver--;
2493        cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2494                                                cfqd->rq_in_driver);
2495}
2496
2497static void cfq_remove_request(struct request *rq)
2498{
2499        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2500
2501        if (cfqq->next_rq == rq)
2502                cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2503
2504        list_del_init(&rq->queuelist);
2505        cfq_del_rq_rb(rq);
2506
2507        cfqq->cfqd->rq_queued--;
2508        cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2509        if (rq->cmd_flags & REQ_PRIO) {
2510                WARN_ON(!cfqq->prio_pending);
2511                cfqq->prio_pending--;
2512        }
2513}
2514
2515static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
2516                     struct bio *bio)
2517{
2518        struct cfq_data *cfqd = q->elevator->elevator_data;
2519        struct request *__rq;
2520
2521        __rq = cfq_find_rq_fmerge(cfqd, bio);
2522        if (__rq && elv_bio_merge_ok(__rq, bio)) {
2523                *req = __rq;
2524                return ELEVATOR_FRONT_MERGE;
2525        }
2526
2527        return ELEVATOR_NO_MERGE;
2528}
2529
2530static void cfq_merged_request(struct request_queue *q, struct request *req,
2531                               enum elv_merge type)
2532{
2533        if (type == ELEVATOR_FRONT_MERGE) {
2534                struct cfq_queue *cfqq = RQ_CFQQ(req);
2535
2536                cfq_reposition_rq_rb(cfqq, req);
2537        }
2538}
2539
2540static void cfq_bio_merged(struct request_queue *q, struct request *req,
2541                                struct bio *bio)
2542{
2543        cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
2544}
2545
2546static void
2547cfq_merged_requests(struct request_queue *q, struct request *rq,
2548                    struct request *next)
2549{
2550        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2551        struct cfq_data *cfqd = q->elevator->elevator_data;
2552
2553        /*
2554         * reposition in fifo if next is older than rq
2555         */
2556        if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2557            next->fifo_time < rq->fifo_time &&
2558            cfqq == RQ_CFQQ(next)) {
2559                list_move(&rq->queuelist, &next->queuelist);
2560                rq->fifo_time = next->fifo_time;
2561        }
2562
2563        if (cfqq->next_rq == next)
2564                cfqq->next_rq = rq;
2565        cfq_remove_request(next);
2566        cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2567
2568        cfqq = RQ_CFQQ(next);
2569        /*
2570         * all requests of this queue are merged to other queues, delete it
2571         * from the service tree. If it's the active_queue,
2572         * cfq_dispatch_requests() will choose to expire it or do idle
2573         */
2574        if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2575            cfqq != cfqd->active_queue)
2576                cfq_del_cfqq_rr(cfqd, cfqq);
2577}
2578
2579static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2580                               struct bio *bio)
2581{
2582        struct cfq_data *cfqd = q->elevator->elevator_data;
2583        bool is_sync = op_is_sync(bio->bi_opf);
2584        struct cfq_io_cq *cic;
2585        struct cfq_queue *cfqq;
2586
2587        /*
2588         * Disallow merge of a sync bio into an async request.
2589         */
2590        if (is_sync && !rq_is_sync(rq))
2591                return false;
2592
2593        /*
2594         * Lookup the cfqq that this bio will be queued with and allow
2595         * merge only if rq is queued there.
2596         */
2597        cic = cfq_cic_lookup(cfqd, current->io_context);
2598        if (!cic)
2599                return false;
2600
2601        cfqq = cic_to_cfqq(cic, is_sync);
2602        return cfqq == RQ_CFQQ(rq);
2603}
2604
2605static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
2606                              struct request *next)
2607{
2608        return RQ_CFQQ(rq) == RQ_CFQQ(next);
2609}
2610
2611static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2612{
2613        hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
2614        cfqg_stats_update_idle_time(cfqq->cfqg);
2615}
2616
2617static void __cfq_set_active_queue(struct cfq_data *cfqd,
2618                                   struct cfq_queue *cfqq)
2619{
2620        if (cfqq) {
2621                cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2622                                cfqd->serving_wl_class, cfqd->serving_wl_type);
2623                cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2624                cfqq->slice_start = 0;
2625                cfqq->dispatch_start = ktime_get_ns();
2626                cfqq->allocated_slice = 0;
2627                cfqq->slice_end = 0;
2628                cfqq->slice_dispatch = 0;
2629                cfqq->nr_sectors = 0;
2630
2631                cfq_clear_cfqq_wait_request(cfqq);
2632                cfq_clear_cfqq_must_dispatch(cfqq);
2633                cfq_clear_cfqq_must_alloc_slice(cfqq);
2634                cfq_clear_cfqq_fifo_expire(cfqq);
2635                cfq_mark_cfqq_slice_new(cfqq);
2636
2637                cfq_del_timer(cfqd, cfqq);
2638        }
2639
2640        cfqd->active_queue = cfqq;
2641}
2642
2643/*
2644 * current cfqq expired its slice (or was too idle), select new one
2645 */
2646static void
2647__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2648                    bool timed_out)
2649{
2650        cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2651
2652        if (cfq_cfqq_wait_request(cfqq))
2653                cfq_del_timer(cfqd, cfqq);
2654
2655        cfq_clear_cfqq_wait_request(cfqq);
2656        cfq_clear_cfqq_wait_busy(cfqq);
2657
2658        /*
2659         * If this cfqq is shared between multiple processes, check to
2660         * make sure that those processes are still issuing I/Os within
2661         * the mean seek distance.  If not, it may be time to break the
2662         * queues apart again.
2663         */
2664        if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2665                cfq_mark_cfqq_split_coop(cfqq);
2666
2667        /*
2668         * store what was left of this slice, if the queue idled/timed out
2669         */
2670        if (timed_out) {
2671                if (cfq_cfqq_slice_new(cfqq))
2672                        cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2673                else
2674                        cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
2675                cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
2676        }
2677
2678        cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2679
2680        if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2681                cfq_del_cfqq_rr(cfqd, cfqq);
2682
2683        cfq_resort_rr_list(cfqd, cfqq);
2684
2685        if (cfqq == cfqd->active_queue)
2686                cfqd->active_queue = NULL;
2687
2688        if (cfqd->active_cic) {
2689                put_io_context(cfqd->active_cic->icq.ioc);
2690                cfqd->active_cic = NULL;
2691        }
2692}
2693
2694static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2695{
2696        struct cfq_queue *cfqq = cfqd->active_queue;
2697
2698        if (cfqq)
2699                __cfq_slice_expired(cfqd, cfqq, timed_out);
2700}
2701
2702/*
2703 * Get next queue for service. Unless we have a queue preemption,
2704 * we'll simply select the first cfqq in the service tree.
2705 */
2706static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2707{
2708        struct cfq_rb_root *st = st_for(cfqd->serving_group,
2709                        cfqd->serving_wl_class, cfqd->serving_wl_type);
2710
2711        if (!cfqd->rq_queued)
2712                return NULL;
2713
2714        /* There is nothing to dispatch */
2715        if (!st)
2716                return NULL;
2717        if (RB_EMPTY_ROOT(&st->rb.rb_root))
2718                return NULL;
2719        return cfq_rb_first(st);
2720}
2721
2722static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2723{
2724        struct cfq_group *cfqg;
2725        struct cfq_queue *cfqq;
2726        int i, j;
2727        struct cfq_rb_root *st;
2728
2729        if (!cfqd->rq_queued)
2730                return NULL;
2731
2732        cfqg = cfq_get_next_cfqg(cfqd);
2733        if (!cfqg)
2734                return NULL;
2735
2736        for_each_cfqg_st(cfqg, i, j, st) {
2737                cfqq = cfq_rb_first(st);
2738                if (cfqq)
2739                        return cfqq;
2740        }
2741        return NULL;
2742}
2743
2744/*
2745 * Get and set a new active queue for service.
2746 */
2747static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2748                                              struct cfq_queue *cfqq)
2749{
2750        if (!cfqq)
2751                cfqq = cfq_get_next_queue(cfqd);
2752
2753        __cfq_set_active_queue(cfqd, cfqq);
2754        return cfqq;
2755}
2756
2757static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2758                                          struct request *rq)
2759{
2760        if (blk_rq_pos(rq) >= cfqd->last_position)
2761                return blk_rq_pos(rq) - cfqd->last_position;
2762        else
2763                return cfqd->last_position - blk_rq_pos(rq);
2764}
2765
2766static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2767                               struct request *rq)
2768{
2769        return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2770}
2771
2772static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2773                                    struct cfq_queue *cur_cfqq)
2774{
2775        struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2776        struct rb_node *parent, *node;
2777        struct cfq_queue *__cfqq;
2778        sector_t sector = cfqd->last_position;
2779
2780        if (RB_EMPTY_ROOT(root))
2781                return NULL;
2782
2783        /*
2784         * First, if we find a request starting at the end of the last
2785         * request, choose it.
2786         */
2787        __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2788        if (__cfqq)
2789                return __cfqq;
2790
2791        /*
2792         * If the exact sector wasn't found, the parent of the NULL leaf
2793         * will contain the closest sector.
2794         */
2795        __cfqq = rb_entry(parent, struct cfq_queue, p_node);
2796        if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2797                return __cfqq;
2798
2799        if (blk_rq_pos(__cfqq->next_rq) < sector)
2800                node = rb_next(&__cfqq->p_node);
2801        else
2802                node = rb_prev(&__cfqq->p_node);
2803        if (!node)
2804                return NULL;
2805
2806        __cfqq = rb_entry(node, struct cfq_queue, p_node);
2807        if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2808                return __cfqq;
2809
2810        return NULL;
2811}
2812
2813/*
2814 * cfqd - obvious
2815 * cur_cfqq - passed in so that we don't decide that the current queue is
2816 *            closely cooperating with itself.
2817 *
2818 * So, basically we're assuming that that cur_cfqq has dispatched at least
2819 * one request, and that cfqd->last_position reflects a position on the disk
2820 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
2821 * assumption.
2822 */
2823static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2824                                              struct cfq_queue *cur_cfqq)
2825{
2826        struct cfq_queue *cfqq;
2827
2828        if (cfq_class_idle(cur_cfqq))
2829                return NULL;
2830        if (!cfq_cfqq_sync(cur_cfqq))
2831                return NULL;
2832        if (CFQQ_SEEKY(cur_cfqq))
2833                return NULL;
2834
2835        /*
2836         * Don't search priority tree if it's the only queue in the group.
2837         */
2838        if (cur_cfqq->cfqg->nr_cfqq == 1)
2839                return NULL;
2840
2841        /*
2842         * We should notice if some of the queues are cooperating, eg
2843         * working closely on the same area of the disk. In that case,
2844         * we can group them together and don't waste time idling.
2845         */
2846        cfqq = cfqq_close(cfqd, cur_cfqq);
2847        if (!cfqq)
2848                return NULL;
2849
2850        /* If new queue belongs to different cfq_group, don't choose it */
2851        if (cur_cfqq->cfqg != cfqq->cfqg)
2852                return NULL;
2853
2854        /*
2855         * It only makes sense to merge sync queues.
2856         */
2857        if (!cfq_cfqq_sync(cfqq))
2858                return NULL;
2859        if (CFQQ_SEEKY(cfqq))
2860                return NULL;
2861
2862        /*
2863         * Do not merge queues of different priority classes
2864         */
2865        if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2866                return NULL;
2867
2868        return cfqq;
2869}
2870
2871/*
2872 * Determine whether we should enforce idle window for this queue.
2873 */
2874
2875static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2876{
2877        enum wl_class_t wl_class = cfqq_class(cfqq);
2878        struct cfq_rb_root *st = cfqq->service_tree;
2879
2880        BUG_ON(!st);
2881        BUG_ON(!st->count);
2882
2883        if (!cfqd->cfq_slice_idle)
2884                return false;
2885
2886        /* We never do for idle class queues. */
2887        if (wl_class == IDLE_WORKLOAD)
2888                return false;
2889
2890        /* We do for queues that were marked with idle window flag. */
2891        if (cfq_cfqq_idle_window(cfqq) &&
2892           !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2893                return true;
2894
2895        /*
2896         * Otherwise, we do only if they are the last ones
2897         * in their service tree.
2898         */
2899        if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2900           !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2901                return true;
2902        cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2903        return false;
2904}
2905
2906static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2907{
2908        struct cfq_queue *cfqq = cfqd->active_queue;
2909        struct cfq_rb_root *st = cfqq->service_tree;
2910        struct cfq_io_cq *cic;
2911        u64 sl, group_idle = 0;
2912        u64 now = ktime_get_ns();
2913
2914        /*
2915         * SSD device without seek penalty, disable idling. But only do so
2916         * for devices that support queuing, otherwise we still have a problem
2917         * with sync vs async workloads.
2918         */
2919        if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
2920                !cfqd->cfq_group_idle)
2921                return;
2922
2923        WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2924        WARN_ON(cfq_cfqq_slice_new(cfqq));
2925
2926        /*
2927         * idle is disabled, either manually or by past process history
2928         */
2929        if (!cfq_should_idle(cfqd, cfqq)) {
2930                /* no queue idling. Check for group idling */
2931                if (cfqd->cfq_group_idle)
2932                        group_idle = cfqd->cfq_group_idle;
2933                else
2934                        return;
2935        }
2936
2937        /*
2938         * still active requests from this queue, don't idle
2939         */
2940        if (cfqq->dispatched)
2941                return;
2942
2943        /*
2944         * task has exited, don't wait
2945         */
2946        cic = cfqd->active_cic;
2947        if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2948                return;
2949
2950        /*
2951         * If our average think time is larger than the remaining time
2952         * slice, then don't idle. This avoids overrunning the allotted
2953         * time slice.
2954         */
2955        if (sample_valid(cic->ttime.ttime_samples) &&
2956            (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
2957                cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
2958                             cic->ttime.ttime_mean);
2959                return;
2960        }
2961
2962        /*
2963         * There are other queues in the group or this is the only group and
2964         * it has too big thinktime, don't do group idle.
2965         */
2966        if (group_idle &&
2967            (cfqq->cfqg->nr_cfqq > 1 ||
2968             cfq_io_thinktime_big(cfqd, &st->ttime, true)))
2969                return;
2970
2971        cfq_mark_cfqq_wait_request(cfqq);
2972
2973        if (group_idle)
2974                sl = cfqd->cfq_group_idle;
2975        else
2976                sl = cfqd->cfq_slice_idle;
2977
2978        hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
2979                      HRTIMER_MODE_REL);
2980        cfqg_stats_set_start_idle_time(cfqq->cfqg);
2981        cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
2982                        group_idle ? 1 : 0);
2983}
2984
2985/*
2986 * Move request from internal lists to the request queue dispatch list.
2987 */
2988static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2989{
2990        struct cfq_data *cfqd = q->elevator->elevator_data;
2991        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2992
2993        cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2994
2995        cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2996        cfq_remove_request(rq);
2997        cfqq->dispatched++;
2998        (RQ_CFQG(rq))->dispatched++;
2999        elv_dispatch_sort(q, rq);
3000
3001        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
3002        cfqq->nr_sectors += blk_rq_sectors(rq);
3003}
3004
3005/*
3006 * return expired entry, or NULL to just start from scratch in rbtree
3007 */
3008static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
3009{
3010        struct request *rq = NULL;
3011
3012        if (cfq_cfqq_fifo_expire(cfqq))
3013                return NULL;
3014
3015        cfq_mark_cfqq_fifo_expire(cfqq);
3016
3017        if (list_empty(&cfqq->fifo))
3018                return NULL;
3019
3020        rq = rq_entry_fifo(cfqq->fifo.next);
3021        if (ktime_get_ns() < rq->fifo_time)
3022                rq = NULL;
3023
3024        return rq;
3025}
3026
3027static inline int
3028cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3029{
3030        const int base_rq = cfqd->cfq_slice_async_rq;
3031
3032        WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
3033
3034        return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
3035}
3036
3037/*
3038 * Must be called with the queue_lock held.
3039 */
3040static int cfqq_process_refs(struct cfq_queue *cfqq)
3041{
3042        int process_refs, io_refs;
3043
3044        io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
3045        process_refs = cfqq->ref - io_refs;
3046        BUG_ON(process_refs < 0);
3047        return process_refs;
3048}
3049
3050static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
3051{
3052        int process_refs, new_process_refs;
3053        struct cfq_queue *__cfqq;
3054
3055        /*
3056         * If there are no process references on the new_cfqq, then it is
3057         * unsafe to follow the ->new_cfqq chain as other cfqq's in the
3058         * chain may have dropped their last reference (not just their
3059         * last process reference).
3060         */
3061        if (!cfqq_process_refs(new_cfqq))
3062                return;
3063
3064        /* Avoid a circular list and skip interim queue merges */
3065        while ((__cfqq = new_cfqq->new_cfqq)) {
3066                if (__cfqq == cfqq)
3067                        return;
3068                new_cfqq = __cfqq;
3069        }
3070
3071        process_refs = cfqq_process_refs(cfqq);
3072        new_process_refs = cfqq_process_refs(new_cfqq);
3073        /*
3074         * If the process for the cfqq has gone away, there is no
3075         * sense in merging the queues.
3076         */
3077        if (process_refs == 0 || new_process_refs == 0)
3078                return;
3079
3080        /*
3081         * Merge in the direction of the lesser amount of work.
3082         */
3083        if (new_process_refs >= process_refs) {
3084                cfqq->new_cfqq = new_cfqq;
3085                new_cfqq->ref += process_refs;
3086        } else {
3087                new_cfqq->new_cfqq = cfqq;
3088                cfqq->ref += new_process_refs;
3089        }
3090}
3091
3092static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
3093                        struct cfq_group *cfqg, enum wl_class_t wl_class)
3094{
3095        struct cfq_queue *queue;
3096        int i;
3097        bool key_valid = false;
3098        u64 lowest_key = 0;
3099        enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
3100
3101        for (i = 0; i <= SYNC_WORKLOAD; ++i) {
3102                /* select the one with lowest rb_key */
3103                queue = cfq_rb_first(st_for(cfqg, wl_class, i));
3104                if (queue &&
3105                    (!key_valid || queue->rb_key < lowest_key)) {
3106                        lowest_key = queue->rb_key;
3107                        cur_best = i;
3108                        key_valid = true;
3109                }
3110        }
3111
3112        return cur_best;
3113}
3114
3115static void
3116choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
3117{
3118        u64 slice;
3119        unsigned count;
3120        struct cfq_rb_root *st;
3121        u64 group_slice;
3122        enum wl_class_t original_class = cfqd->serving_wl_class;
3123        u64 now = ktime_get_ns();
3124
3125        /* Choose next priority. RT > BE > IDLE */
3126        if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
3127                cfqd->serving_wl_class = RT_WORKLOAD;
3128        else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
3129                cfqd->serving_wl_class = BE_WORKLOAD;
3130        else {
3131                cfqd->serving_wl_class = IDLE_WORKLOAD;
3132                cfqd->workload_expires = now + jiffies_to_nsecs(1);
3133                return;
3134        }
3135
3136        if (original_class != cfqd->serving_wl_class)
3137                goto new_workload;
3138
3139        /*
3140         * For RT and BE, we have to choose also the type
3141         * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
3142         * expiration time
3143         */
3144        st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3145        count = st->count;
3146
3147        /*
3148         * check workload expiration, and that we still have other queues ready
3149         */
3150        if (count && !(now > cfqd->workload_expires))
3151                return;
3152
3153new_workload:
3154        /* otherwise select new workload type */
3155        cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
3156                                        cfqd->serving_wl_class);
3157        st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3158        count = st->count;
3159
3160        /*
3161         * the workload slice is computed as a fraction of target latency
3162         * proportional to the number of queues in that workload, over
3163         * all the queues in the same priority class
3164         */
3165        group_slice = cfq_group_slice(cfqd, cfqg);
3166
3167        slice = div_u64(group_slice * count,
3168                max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
3169                      cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
3170                                        cfqg)));
3171
3172        if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
3173                u64 tmp;
3174
3175                /*
3176                 * Async queues are currently system wide. Just taking
3177                 * proportion of queues with-in same group will lead to higher
3178                 * async ratio system wide as generally root group is going
3179                 * to have higher weight. A more accurate thing would be to
3180                 * calculate system wide asnc/sync ratio.
3181                 */
3182                tmp = cfqd->cfq_target_latency *
3183                        cfqg_busy_async_queues(cfqd, cfqg);
3184                tmp = div_u64(tmp, cfqd->busy_queues);
3185                slice = min_t(u64, slice, tmp);
3186
3187                /* async workload slice is scaled down according to
3188                 * the sync/async slice ratio. */
3189                slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
3190        } else
3191                /* sync workload slice is at least 2 * cfq_slice_idle */
3192                slice = max(slice, 2 * cfqd->cfq_slice_idle);
3193
3194        slice = max_t(u64, slice, CFQ_MIN_TT);
3195        cfq_log(cfqd, "workload slice:%llu", slice);
3196        cfqd->workload_expires = now + slice;
3197}
3198
3199static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
3200{
3201        struct cfq_rb_root *st = &cfqd->grp_service_tree;
3202        struct cfq_group *cfqg;
3203
3204        if (RB_EMPTY_ROOT(&st->rb.rb_root))
3205                return NULL;
3206        cfqg = cfq_rb_first_group(st);
3207        update_min_vdisktime(st);
3208        return cfqg;
3209}
3210
3211static void cfq_choose_cfqg(struct cfq_data *cfqd)
3212{
3213        struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
3214        u64 now = ktime_get_ns();
3215
3216        cfqd->serving_group = cfqg;
3217
3218        /* Restore the workload type data */
3219        if (cfqg->saved_wl_slice) {
3220                cfqd->workload_expires = now + cfqg->saved_wl_slice;
3221                cfqd->serving_wl_type = cfqg->saved_wl_type;
3222                cfqd->serving_wl_class = cfqg->saved_wl_class;
3223        } else
3224                cfqd->workload_expires = now - 1;
3225
3226        choose_wl_class_and_type(cfqd, cfqg);
3227}
3228
3229/*
3230 * Select a queue for service. If we have a current active queue,
3231 * check whether to continue servicing it, or retrieve and set a new one.
3232 */
3233static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
3234{
3235        struct cfq_queue *cfqq, *new_cfqq = NULL;
3236        u64 now = ktime_get_ns();
3237
3238        cfqq = cfqd->active_queue;
3239        if (!cfqq)
3240                goto new_queue;
3241
3242        if (!cfqd->rq_queued)
3243                return NULL;
3244
3245        /*
3246         * We were waiting for group to get backlogged. Expire the queue
3247         */
3248        if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
3249                goto expire;
3250
3251        /*
3252         * The active queue has run out of time, expire it and select new.
3253         */
3254        if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
3255                /*
3256                 * If slice had not expired at the completion of last request
3257                 * we might not have turned on wait_busy flag. Don't expire
3258                 * the queue yet. Allow the group to get backlogged.
3259                 *
3260                 * The very fact that we have used the slice, that means we
3261                 * have been idling all along on this queue and it should be
3262                 * ok to wait for this request to complete.
3263                 */
3264                if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
3265                    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3266                        cfqq = NULL;
3267                        goto keep_queue;
3268                } else
3269                        goto check_group_idle;
3270        }
3271
3272        /*
3273         * The active queue has requests and isn't expired, allow it to
3274         * dispatch.
3275         */
3276        if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3277                goto keep_queue;
3278
3279        /*
3280         * If another queue has a request waiting within our mean seek
3281         * distance, let it run.  The expire code will check for close
3282         * cooperators and put the close queue at the front of the service
3283         * tree.  If possible, merge the expiring queue with the new cfqq.
3284         */
3285        new_cfqq = cfq_close_cooperator(cfqd, cfqq);
3286        if (new_cfqq) {
3287                if (!cfqq->new_cfqq)
3288                        cfq_setup_merge(cfqq, new_cfqq);
3289                goto expire;
3290        }
3291
3292        /*
3293         * No requests pending. If the active queue still has requests in
3294         * flight or is idling for a new request, allow either of these
3295         * conditions to happen (or time out) before selecting a new queue.
3296         */
3297        if (hrtimer_active(&cfqd->idle_slice_timer)) {
3298                cfqq = NULL;
3299                goto keep_queue;
3300        }
3301
3302        /*
3303         * This is a deep seek queue, but the device is much faster than
3304         * the queue can deliver, don't idle
3305         **/
3306        if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
3307            (cfq_cfqq_slice_new(cfqq) ||
3308            (cfqq->slice_end - now > now - cfqq->slice_start))) {
3309                cfq_clear_cfqq_deep(cfqq);
3310                cfq_clear_cfqq_idle_window(cfqq);
3311        }
3312
3313        if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3314                cfqq = NULL;
3315                goto keep_queue;
3316        }
3317
3318        /*
3319         * If group idle is enabled and there are requests dispatched from
3320         * this group, wait for requests to complete.
3321         */
3322check_group_idle:
3323        if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
3324            cfqq->cfqg->dispatched &&
3325            !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
3326                cfqq = NULL;
3327                goto keep_queue;
3328        }
3329
3330expire:
3331        cfq_slice_expired(cfqd, 0);
3332new_queue:
3333        /*
3334         * Current queue expired. Check if we have to switch to a new
3335         * service tree
3336         */
3337        if (!new_cfqq)
3338                cfq_choose_cfqg(cfqd);
3339
3340        cfqq = cfq_set_active_queue(cfqd, new_cfqq);
3341keep_queue:
3342        return cfqq;
3343}
3344
3345static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
3346{
3347        int dispatched = 0;
3348
3349        while (cfqq->next_rq) {
3350                cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
3351                dispatched++;
3352        }
3353
3354        BUG_ON(!list_empty(&cfqq->fifo));
3355
3356        /* By default cfqq is not expired if it is empty. Do it explicitly */
3357        __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
3358        return dispatched;
3359}
3360
3361/*
3362 * Drain our current requests. Used for barriers and when switching
3363 * io schedulers on-the-fly.
3364 */
3365static int cfq_forced_dispatch(struct cfq_data *cfqd)
3366{
3367        struct cfq_queue *cfqq;
3368        int dispatched = 0;
3369
3370        /* Expire the timeslice of the current active queue first */
3371        cfq_slice_expired(cfqd, 0);
3372        while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3373                __cfq_set_active_queue(cfqd, cfqq);
3374                dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3375        }
3376
3377        BUG_ON(cfqd->busy_queues);
3378
3379        cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3380        return dispatched;
3381}
3382
3383static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3384        struct cfq_queue *cfqq)
3385{
3386        u64 now = ktime_get_ns();
3387
3388        /* the queue hasn't finished any request, can't estimate */
3389        if (cfq_cfqq_slice_new(cfqq))
3390                return true;
3391        if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
3392                return true;
3393
3394        return false;
3395}
3396
3397static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3398{
3399        unsigned int max_dispatch;
3400
3401        if (cfq_cfqq_must_dispatch(cfqq))
3402                return true;
3403
3404        /*
3405         * Drain async requests before we start sync IO
3406         */
3407        if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3408                return false;
3409
3410        /*
3411         * If this is an async queue and we have sync IO in flight, let it wait
3412         */
3413        if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3414                return false;
3415
3416        max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3417        if (cfq_class_idle(cfqq))
3418                max_dispatch = 1;
3419
3420        /*
3421         * Does this cfqq already have too much IO in flight?
3422         */
3423        if (cfqq->dispatched >= max_dispatch) {
3424                bool promote_sync = false;
3425                /*
3426                 * idle queue must always only have a single IO in flight
3427                 */
3428                if (cfq_class_idle(cfqq))
3429                        return false;
3430
3431                /*
3432                 * If there is only one sync queue
3433                 * we can ignore async queue here and give the sync
3434                 * queue no dispatch limit. The reason is a sync queue can
3435                 * preempt async queue, limiting the sync queue doesn't make
3436                 * sense. This is useful for aiostress test.
3437                 */
3438                if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3439                        promote_sync = true;
3440
3441                /*
3442                 * We have other queues, don't allow more IO from this one
3443                 */
3444                if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3445                                !promote_sync)
3446                        return false;
3447
3448                /*
3449                 * Sole queue user, no limit
3450                 */
3451                if (cfqd->busy_queues == 1 || promote_sync)
3452                        max_dispatch = -1;
3453                else
3454                        /*
3455                         * Normally we start throttling cfqq when cfq_quantum/2
3456                         * requests have been dispatched. But we can drive
3457                         * deeper queue depths at the beginning of slice
3458                         * subjected to upper limit of cfq_quantum.
3459                         * */
3460                        max_dispatch = cfqd->cfq_quantum;
3461        }
3462
3463        /*
3464         * Async queues must wait a bit before being allowed dispatch.
3465         * We also ramp up the dispatch depth gradually for async IO,
3466         * based on the last sync IO we serviced
3467         */
3468        if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3469                u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
3470                unsigned int depth;
3471
3472                depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
3473                if (!depth && !cfqq->dispatched)
3474                        depth = 1;
3475                if (depth < max_dispatch)
3476                        max_dispatch = depth;
3477        }
3478
3479        /*
3480         * If we're below the current max, allow a dispatch
3481         */
3482        return cfqq->dispatched < max_dispatch;
3483}
3484
3485/*
3486 * Dispatch a request from cfqq, moving them to the request queue
3487 * dispatch list.
3488 */
3489static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3490{
3491        struct request *rq;
3492
3493        BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3494
3495        rq = cfq_check_fifo(cfqq);
3496        if (rq)
3497                cfq_mark_cfqq_must_dispatch(cfqq);
3498
3499        if (!cfq_may_dispatch(cfqd, cfqq))
3500                return false;
3501
3502        /*
3503         * follow expired path, else get first next available
3504         */
3505        if (!rq)
3506                rq = cfqq->next_rq;
3507        else
3508                cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
3509
3510        /*
3511         * insert request into driver dispatch list
3512         */
3513        cfq_dispatch_insert(cfqd->queue, rq);
3514
3515        if (!cfqd->active_cic) {
3516                struct cfq_io_cq *cic = RQ_CIC(rq);
3517
3518                atomic_long_inc(&cic->icq.ioc->refcount);
3519                cfqd->active_cic = cic;
3520        }
3521
3522        return true;
3523}
3524
3525/*
3526 * Find the cfqq that we need to service and move a request from that to the
3527 * dispatch list
3528 */
3529static int cfq_dispatch_requests(struct request_queue *q, int force)
3530{
3531        struct cfq_data *cfqd = q->elevator->elevator_data;
3532        struct cfq_queue *cfqq;
3533
3534        if (!cfqd->busy_queues)
3535                return 0;
3536
3537        if (unlikely(force))
3538                return cfq_forced_dispatch(cfqd);
3539
3540        cfqq = cfq_select_queue(cfqd);
3541        if (!cfqq)
3542                return 0;
3543
3544        /*
3545         * Dispatch a request from this cfqq, if it is allowed
3546         */
3547        if (!cfq_dispatch_request(cfqd, cfqq))
3548                return 0;
3549
3550        cfqq->slice_dispatch++;
3551        cfq_clear_cfqq_must_dispatch(cfqq);
3552
3553        /*
3554         * expire an async queue immediately if it has used up its slice. idle
3555         * queue always expire after 1 dispatch round.
3556         */
3557        if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3558            cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3559            cfq_class_idle(cfqq))) {
3560                cfqq->slice_end = ktime_get_ns() + 1;
3561                cfq_slice_expired(cfqd, 0);
3562        }
3563
3564        cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3565        return 1;
3566}
3567
3568/*
3569 * task holds one reference to the queue, dropped when task exits. each rq
3570 * in-flight on this queue also holds a reference, dropped when rq is freed.
3571 *
3572 * Each cfq queue took a reference on the parent group. Drop it now.
3573 * queue lock must be held here.
3574 */
3575static void cfq_put_queue(struct cfq_queue *cfqq)
3576{
3577        struct cfq_data *cfqd = cfqq->cfqd;
3578        struct cfq_group *cfqg;
3579
3580        BUG_ON(cfqq->ref <= 0);
3581
3582        cfqq->ref--;
3583        if (cfqq->ref)
3584                return;
3585
3586        cfq_log_cfqq(cfqd, cfqq, "put_queue");
3587        BUG_ON(rb_first(&cfqq->sort_list));
3588        BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3589        cfqg = cfqq->cfqg;
3590
3591        if (unlikely(cfqd->active_queue == cfqq)) {
3592                __cfq_slice_expired(cfqd, cfqq, 0);
3593                cfq_schedule_dispatch(cfqd);
3594        }
3595
3596        BUG_ON(cfq_cfqq_on_rr(cfqq));
3597        kmem_cache_free(cfq_pool, cfqq);
3598        cfqg_put(cfqg);
3599}
3600
3601static void cfq_put_cooperator(struct cfq_queue *cfqq)
3602{
3603        struct cfq_queue *__cfqq, *next;
3604
3605        /*
3606         * If this queue was scheduled to merge with another queue, be
3607         * sure to drop the reference taken on that queue (and others in
3608         * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
3609         */
3610        __cfqq = cfqq->new_cfqq;
3611        while (__cfqq) {
3612                if (__cfqq == cfqq) {
3613                        WARN(1, "cfqq->new_cfqq loop detected\n");
3614                        break;
3615                }
3616                next = __cfqq->new_cfqq;
3617                cfq_put_queue(__cfqq);
3618                __cfqq = next;
3619        }
3620}
3621
3622static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3623{
3624        if (unlikely(cfqq == cfqd->active_queue)) {
3625                __cfq_slice_expired(cfqd, cfqq, 0);
3626                cfq_schedule_dispatch(cfqd);
3627        }
3628
3629        cfq_put_cooperator(cfqq);
3630
3631        cfq_put_queue(cfqq);
3632}
3633
3634static void cfq_init_icq(struct io_cq *icq)
3635{
3636        struct cfq_io_cq *cic = icq_to_cic(icq);
3637
3638        cic->ttime.last_end_request = ktime_get_ns();
3639}
3640
3641static void cfq_exit_icq(struct io_cq *icq)
3642{
3643        struct cfq_io_cq *cic = icq_to_cic(icq);
3644        struct cfq_data *cfqd = cic_to_cfqd(cic);
3645
3646        if (cic_to_cfqq(cic, false)) {
3647                cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
3648                cic_set_cfqq(cic, NULL, false);
3649        }
3650
3651        if (cic_to_cfqq(cic, true)) {
3652                cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
3653                cic_set_cfqq(cic, NULL, true);
3654        }
3655}
3656
3657static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3658{
3659        struct task_struct *tsk = current;
3660        int ioprio_class;
3661
3662        if (!cfq_cfqq_prio_changed(cfqq))
3663                return;
3664
3665        ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3666        switch (ioprio_class) {
3667        default:
3668                printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3669                /* fall through */
3670        case IOPRIO_CLASS_NONE:
3671                /*
3672                 * no prio set, inherit CPU scheduling settings
3673                 */
3674                cfqq->ioprio = task_nice_ioprio(tsk);
3675                cfqq->ioprio_class = task_nice_ioclass(tsk);
3676                break;
3677        case IOPRIO_CLASS_RT:
3678                cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3679                cfqq->ioprio_class = IOPRIO_CLASS_RT;
3680                break;
3681        case IOPRIO_CLASS_BE:
3682                cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3683                cfqq->ioprio_class = IOPRIO_CLASS_BE;
3684                break;
3685        case IOPRIO_CLASS_IDLE:
3686                cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3687                cfqq->ioprio = 7;
3688                cfq_clear_cfqq_idle_window(cfqq);
3689                break;
3690        }
3691
3692        /*
3693         * keep track of original prio settings in case we have to temporarily
3694         * elevate the priority of this queue
3695         */
3696        cfqq->org_ioprio = cfqq->ioprio;
3697        cfqq->org_ioprio_class = cfqq->ioprio_class;
3698        cfq_clear_cfqq_prio_changed(cfqq);
3699}
3700
3701static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3702{
3703        int ioprio = cic->icq.ioc->ioprio;
3704        struct cfq_data *cfqd = cic_to_cfqd(cic);
3705        struct cfq_queue *cfqq;
3706
3707        /*
3708         * Check whether ioprio has changed.  The condition may trigger
3709         * spuriously on a newly created cic but there's no harm.
3710         */
3711        if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3712                return;
3713
3714        cfqq = cic_to_cfqq(cic, false);
3715        if (cfqq) {
3716                cfq_put_queue(cfqq);
3717                cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
3718                cic_set_cfqq(cic, cfqq, false);
3719        }
3720
3721        cfqq = cic_to_cfqq(cic, true);
3722        if (cfqq)
3723                cfq_mark_cfqq_prio_changed(cfqq);
3724
3725        cic->ioprio = ioprio;
3726}
3727
3728static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3729                          pid_t pid, bool is_sync)
3730{
3731        RB_CLEAR_NODE(&cfqq->rb_node);
3732        RB_CLEAR_NODE(&cfqq->p_node);
3733        INIT_LIST_HEAD(&cfqq->fifo);
3734
3735        cfqq->ref = 0;
3736        cfqq->cfqd = cfqd;
3737
3738        cfq_mark_cfqq_prio_changed(cfqq);
3739
3740        if (is_sync) {
3741                if (!cfq_class_idle(cfqq))
3742                        cfq_mark_cfqq_idle_window(cfqq);
3743                cfq_mark_cfqq_sync(cfqq);
3744        }
3745        cfqq->pid = pid;
3746}
3747
3748#ifdef CONFIG_CFQ_GROUP_IOSCHED
3749static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3750{
3751        struct cfq_data *cfqd = cic_to_cfqd(cic);
3752        struct cfq_queue *cfqq;
3753        uint64_t serial_nr;
3754
3755        rcu_read_lock();
3756        serial_nr = bio_blkcg(bio)->css.serial_nr;
3757        rcu_read_unlock();
3758
3759        /*
3760         * Check whether blkcg has changed.  The condition may trigger
3761         * spuriously on a newly created cic but there's no harm.
3762         */
3763        if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3764                return;
3765
3766        /*
3767         * Drop reference to queues.  New queues will be assigned in new
3768         * group upon arrival of fresh requests.
3769         */
3770        cfqq = cic_to_cfqq(cic, false);
3771        if (cfqq) {
3772                cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3773                cic_set_cfqq(cic, NULL, false);
3774                cfq_put_queue(cfqq);
3775        }
3776
3777        cfqq = cic_to_cfqq(cic, true);
3778        if (cfqq) {
3779                cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3780                cic_set_cfqq(cic, NULL, true);
3781                cfq_put_queue(cfqq);
3782        }
3783
3784        cic->blkcg_serial_nr = serial_nr;
3785}
3786#else
3787static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3788{
3789}
3790#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
3791
3792static struct cfq_queue **
3793cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
3794{
3795        switch (ioprio_class) {
3796        case IOPRIO_CLASS_RT:
3797                return &cfqg->async_cfqq[0][ioprio];
3798        case IOPRIO_CLASS_NONE:
3799                ioprio = IOPRIO_NORM;
3800                /* fall through */
3801        case IOPRIO_CLASS_BE:
3802                return &cfqg->async_cfqq[1][ioprio];
3803        case IOPRIO_CLASS_IDLE:
3804                return &cfqg->async_idle_cfqq;
3805        default:
3806                BUG();
3807        }
3808}
3809
3810static struct cfq_queue *
3811cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3812              struct bio *bio)
3813{
3814        int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3815        int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3816        struct cfq_queue **async_cfqq = NULL;
3817        struct cfq_queue *cfqq;
3818        struct cfq_group *cfqg;
3819
3820        rcu_read_lock();
3821        cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
3822        if (!cfqg) {
3823                cfqq = &cfqd->oom_cfqq;
3824                goto out;
3825        }
3826
3827        if (!is_sync) {
3828                if (!ioprio_valid(cic->ioprio)) {
3829                        struct task_struct *tsk = current;
3830                        ioprio = task_nice_ioprio(tsk);
3831                        ioprio_class = task_nice_ioclass(tsk);
3832                }
3833                async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
3834                cfqq = *async_cfqq;
3835                if (cfqq)
3836                        goto out;
3837        }
3838
3839        cfqq = kmem_cache_alloc_node(cfq_pool,
3840                                     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
3841                                     cfqd->queue->node);
3842        if (!cfqq) {
3843                cfqq = &cfqd->oom_cfqq;
3844                goto out;
3845        }
3846
3847        /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
3848        cfqq->ioprio_class = IOPRIO_CLASS_NONE;
3849        cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3850        cfq_init_prio_data(cfqq, cic);
3851        cfq_link_cfqq_cfqg(cfqq, cfqg);
3852        cfq_log_cfqq(cfqd, cfqq, "alloced");
3853
3854        if (async_cfqq) {
3855                /* a new async queue is created, pin and remember */
3856                cfqq->ref++;
3857                *async_cfqq = cfqq;
3858        }
3859out:
3860        cfqq->ref++;
3861        rcu_read_unlock();
3862        return cfqq;
3863}
3864
3865static void
3866__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
3867{
3868        u64 elapsed = ktime_get_ns() - ttime->last_end_request;
3869        elapsed = min(elapsed, 2UL * slice_idle);
3870
3871        ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3872        ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
3873        ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
3874                                     ttime->ttime_samples);
3875}
3876
3877static void
3878cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3879                        struct cfq_io_cq *cic)
3880{
3881        if (cfq_cfqq_sync(cfqq)) {
3882                __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3883                __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3884                        cfqd->cfq_slice_idle);
3885        }
3886#ifdef CONFIG_CFQ_GROUP_IOSCHED
3887        __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3888#endif
3889}
3890
3891static void
3892cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3893                       struct request *rq)
3894{
3895        sector_t sdist = 0;
3896        sector_t n_sec = blk_rq_sectors(rq);
3897        if (cfqq->last_request_pos) {
3898                if (cfqq->last_request_pos < blk_rq_pos(rq))
3899                        sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3900                else
3901                        sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3902        }
3903
3904        cfqq->seek_history <<= 1;
3905        if (blk_queue_nonrot(cfqd->queue))
3906                cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3907        else
3908                cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3909}
3910
3911static inline bool req_noidle(struct request *req)
3912{
3913        return req_op(req) == REQ_OP_WRITE &&
3914                (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
3915}
3916
3917/*
3918 * Disable idle window if the process thinks too long or seeks so much that
3919 * it doesn't matter
3920 */
3921static void
3922cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3923                       struct cfq_io_cq *cic)
3924{
3925        int old_idle, enable_idle;
3926
3927        /*
3928         * Don't idle for async or idle io prio class
3929         */
3930        if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3931                return;
3932
3933        enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3934
3935        if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3936                cfq_mark_cfqq_deep(cfqq);
3937
3938        if (cfqq->next_rq && req_noidle(cfqq->next_rq))
3939                enable_idle = 0;
3940        else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3941                 !cfqd->cfq_slice_idle ||
3942                 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3943                enable_idle = 0;
3944        else if (sample_valid(cic->ttime.ttime_samples)) {
3945                if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3946                        enable_idle = 0;
3947                else
3948                        enable_idle = 1;
3949        }
3950
3951        if (old_idle != enable_idle) {
3952                cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3953                if (enable_idle)
3954                        cfq_mark_cfqq_idle_window(cfqq);
3955                else
3956                        cfq_clear_cfqq_idle_window(cfqq);
3957        }
3958}
3959
3960/*
3961 * Check if new_cfqq should preempt the currently active queue. Return 0 for
3962 * no or if we aren't sure, a 1 will cause a preempt.
3963 */
3964static bool
3965cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3966                   struct request *rq)
3967{
3968        struct cfq_queue *cfqq;
3969
3970        cfqq = cfqd->active_queue;
3971        if (!cfqq)
3972                return false;
3973
3974        if (cfq_class_idle(new_cfqq))
3975                return false;
3976
3977        if (cfq_class_idle(cfqq))
3978                return true;
3979
3980        /*
3981         * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3982         */
3983        if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3984                return false;
3985
3986        /*
3987         * if the new request is sync, but the currently running queue is
3988         * not, let the sync request have priority.
3989         */
3990        if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
3991                return true;
3992
3993        /*
3994         * Treat ancestors of current cgroup the same way as current cgroup.
3995         * For anybody else we disallow preemption to guarantee service
3996         * fairness among cgroups.
3997         */
3998        if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
3999                return false;
4000
4001        if (cfq_slice_used(cfqq))
4002                return true;
4003
4004        /*
4005         * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
4006         */
4007        if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
4008                return true;
4009
4010        WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
4011        /* Allow preemption only if we are idling on sync-noidle tree */
4012        if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
4013            cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
4014            RB_EMPTY_ROOT(&cfqq->sort_list))
4015                return true;
4016
4017        /*
4018         * So both queues are sync. Let the new request get disk time if
4019         * it's a metadata request and the current queue is doing regular IO.
4020         */
4021        if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
4022                return true;
4023
4024        /* An idle queue should not be idle now for some reason */
4025        if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
4026                return true;
4027
4028        if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
4029                return false;
4030
4031        /*
4032         * if this request is as-good as one we would expect from the
4033         * current cfqq, let it preempt
4034         */
4035        if (cfq_rq_close(cfqd, cfqq, rq))
4036                return true;
4037
4038        return false;
4039}
4040
4041/*
4042 * cfqq preempts the active queue. if we allowed preempt with no slice left,
4043 * let it have half of its nominal slice.
4044 */
4045static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4046{
4047        enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
4048
4049        cfq_log_cfqq(cfqd, cfqq, "preempt");
4050        cfq_slice_expired(cfqd, 1);
4051
4052        /*
4053         * workload type is changed, don't save slice, otherwise preempt
4054         * doesn't happen
4055         */
4056        if (old_type != cfqq_type(cfqq))
4057                cfqq->cfqg->saved_wl_slice = 0;
4058
4059        /*
4060         * Put the new queue at the front of the of the current list,
4061         * so we know that it will be selected next.
4062         */
4063        BUG_ON(!cfq_cfqq_on_rr(cfqq));
4064
4065        cfq_service_tree_add(cfqd, cfqq, 1);
4066
4067        cfqq->slice_end = 0;
4068        cfq_mark_cfqq_slice_new(cfqq);
4069}
4070
4071/*
4072 * Called when a new fs request (rq) is added (to cfqq). Check if there's
4073 * something we should do about it
4074 */
4075static void
4076cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
4077                struct request *rq)
4078{
4079        struct cfq_io_cq *cic = RQ_CIC(rq);
4080
4081        cfqd->rq_queued++;
4082        if (rq->cmd_flags & REQ_PRIO)
4083                cfqq->prio_pending++;
4084
4085        cfq_update_io_thinktime(cfqd, cfqq, cic);
4086        cfq_update_io_seektime(cfqd, cfqq, rq);
4087        cfq_update_idle_window(cfqd, cfqq, cic);
4088
4089        cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4090
4091        if (cfqq == cfqd->active_queue) {
4092                /*
4093                 * Remember that we saw a request from this process, but
4094                 * don't start queuing just yet. Otherwise we risk seeing lots
4095                 * of tiny requests, because we disrupt the normal plugging
4096                 * and merging. If the request is already larger than a single
4097                 * page, let it rip immediately. For that case we assume that
4098                 * merging is already done. Ditto for a busy system that
4099                 * has other work pending, don't risk delaying until the
4100                 * idle timer unplug to continue working.
4101                 */
4102                if (cfq_cfqq_wait_request(cfqq)) {
4103                        if (blk_rq_bytes(rq) > PAGE_SIZE ||
4104                            cfqd->busy_queues > 1) {
4105                                cfq_del_timer(cfqd, cfqq);
4106                                cfq_clear_cfqq_wait_request(cfqq);
4107                                __blk_run_queue(cfqd->queue);
4108                        } else {
4109                                cfqg_stats_update_idle_time(cfqq->cfqg);
4110                                cfq_mark_cfqq_must_dispatch(cfqq);
4111                        }
4112                }
4113        } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
4114                /*
4115                 * not the active queue - expire current slice if it is
4116                 * idle and has expired it's mean thinktime or this new queue
4117                 * has some old slice time left and is of higher priority or
4118                 * this new queue is RT and the current one is BE
4119                 */
4120                cfq_preempt_queue(cfqd, cfqq);
4121                __blk_run_queue(cfqd->queue);
4122        }
4123}
4124
4125static void cfq_insert_request(struct request_queue *q, struct request *rq)
4126{
4127        struct cfq_data *cfqd = q->elevator->elevator_data;
4128        struct cfq_queue *cfqq = RQ_CFQQ(rq);
4129
4130        cfq_log_cfqq(cfqd, cfqq, "insert_request");
4131        cfq_init_prio_data(cfqq, RQ_CIC(rq));
4132
4133        rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
4134        list_add_tail(&rq->queuelist, &cfqq->fifo);
4135        cfq_add_rq_rb(rq);
4136        cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
4137                                 rq->cmd_flags);
4138        cfq_rq_enqueued(cfqd, cfqq, rq);
4139}
4140
4141/*
4142 * Update hw_tag based on peak queue depth over 50 samples under
4143 * sufficient load.
4144 */
4145static void cfq_update_hw_tag(struct cfq_data *cfqd)
4146{
4147        struct cfq_queue *cfqq = cfqd->active_queue;
4148
4149        if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
4150                cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
4151
4152        if (cfqd->hw_tag == 1)
4153                return;
4154
4155        if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
4156            cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
4157                return;
4158
4159        /*
4160         * If active queue hasn't enough requests and can idle, cfq might not
4161         * dispatch sufficient requests to hardware. Don't zero hw_tag in this
4162         * case
4163         */
4164        if (cfqq && cfq_cfqq_idle_window(cfqq) &&
4165            cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
4166            CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
4167                return;
4168
4169        if (cfqd->hw_tag_samples++ < 50)
4170                return;
4171
4172        if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
4173                cfqd->hw_tag = 1;
4174        else
4175                cfqd->hw_tag = 0;
4176}
4177
4178static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4179{
4180        struct cfq_io_cq *cic = cfqd->active_cic;
4181        u64 now = ktime_get_ns();
4182
4183        /* If the queue already has requests, don't wait */
4184        if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4185                return false;
4186
4187        /* If there are other queues in the group, don't wait */
4188        if (cfqq->cfqg->nr_cfqq > 1)
4189                return false;
4190
4191        /* the only queue in the group, but think time is big */
4192        if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
4193                return false;
4194
4195        if (cfq_slice_used(cfqq))
4196                return true;
4197
4198        /* if slice left is less than think time, wait busy */
4199        if (cic && sample_valid(cic->ttime.ttime_samples)
4200            && (cfqq->slice_end - now < cic->ttime.ttime_mean))
4201                return true;
4202
4203        /*
4204         * If think times is less than a jiffy than ttime_mean=0 and above
4205         * will not be true. It might happen that slice has not expired yet
4206         * but will expire soon (4-5 ns) during select_queue(). To cover the
4207         * case where think time is less than a jiffy, mark the queue wait
4208         * busy if only 1 jiffy is left in the slice.
4209         */
4210        if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
4211                return true;
4212
4213        return false;
4214}
4215
4216static void cfq_completed_request(struct request_queue *q, struct request *rq)
4217{
4218        struct cfq_queue *cfqq = RQ_CFQQ(rq);
4219        struct cfq_data *cfqd = cfqq->cfqd;
4220        const int sync = rq_is_sync(rq);
4221        u64 now = ktime_get_ns();
4222
4223        cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
4224
4225        cfq_update_hw_tag(cfqd);
4226
4227        WARN_ON(!cfqd->rq_in_driver);
4228        WARN_ON(!cfqq->dispatched);
4229        cfqd->rq_in_driver--;
4230        cfqq->dispatched--;
4231        (RQ_CFQG(rq))->dispatched--;
4232        cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
4233                                     rq->io_start_time_ns, rq->cmd_flags);
4234
4235        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
4236
4237        if (sync) {
4238                struct cfq_rb_root *st;
4239
4240                RQ_CIC(rq)->ttime.last_end_request = now;
4241
4242                if (cfq_cfqq_on_rr(cfqq))
4243                        st = cfqq->service_tree;
4244                else
4245                        st = st_for(cfqq->cfqg, cfqq_class(cfqq),
4246                                        cfqq_type(cfqq));
4247
4248                st->ttime.last_end_request = now;
4249                if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
4250                        cfqd->last_delayed_sync = now;
4251        }
4252
4253#ifdef CONFIG_CFQ_GROUP_IOSCHED
4254        cfqq->cfqg->ttime.last_end_request = now;
4255#endif
4256
4257        /*
4258         * If this is the active queue, check if it needs to be expired,
4259         * or if we want to idle in case it has no pending requests.
4260         */
4261        if (cfqd->active_queue == cfqq) {
4262                const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
4263
4264                if (cfq_cfqq_slice_new(cfqq)) {
4265                        cfq_set_prio_slice(cfqd, cfqq);
4266                        cfq_clear_cfqq_slice_new(cfqq);
4267                }
4268
4269                /*
4270                 * Should we wait for next request to come in before we expire
4271                 * the queue.
4272                 */
4273                if (cfq_should_wait_busy(cfqd, cfqq)) {
4274                        u64 extend_sl = cfqd->cfq_slice_idle;
4275                        if (!cfqd->cfq_slice_idle)
4276                                extend_sl = cfqd->cfq_group_idle;
4277                        cfqq->slice_end = now + extend_sl;
4278                        cfq_mark_cfqq_wait_busy(cfqq);
4279                        cfq_log_cfqq(cfqd, cfqq, "will busy wait");
4280                }
4281
4282                /*
4283                 * Idling is not enabled on:
4284                 * - expired queues
4285                 * - idle-priority queues
4286                 * - async queues
4287                 * - queues with still some requests queued
4288                 * - when there is a close cooperator
4289                 */
4290                if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
4291                        cfq_slice_expired(cfqd, 1);
4292                else if (sync && cfqq_empty &&
4293                         !cfq_close_cooperator(cfqd, cfqq)) {
4294                        cfq_arm_slice_timer(cfqd);
4295                }
4296        }
4297
4298        if (!cfqd->rq_in_driver)
4299                cfq_schedule_dispatch(cfqd);
4300}
4301
4302static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
4303{
4304        /*
4305         * If REQ_PRIO is set, boost class and prio level, if it's below
4306         * BE/NORM. If prio is not set, restore the potentially boosted
4307         * class/prio level.
4308         */
4309        if (!(op & REQ_PRIO)) {
4310                cfqq->ioprio_class = cfqq->org_ioprio_class;
4311                cfqq->ioprio = cfqq->org_ioprio;
4312        } else {
4313                if (cfq_class_idle(cfqq))
4314                        cfqq->ioprio_class = IOPRIO_CLASS_BE;
4315                if (cfqq->ioprio > IOPRIO_NORM)
4316                        cfqq->ioprio = IOPRIO_NORM;
4317        }
4318}
4319
4320static inline int __cfq_may_queue(struct cfq_queue *cfqq)
4321{
4322        if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
4323                cfq_mark_cfqq_must_alloc_slice(cfqq);
4324                return ELV_MQUEUE_MUST;
4325        }
4326
4327        return ELV_MQUEUE_MAY;
4328}
4329
4330static int cfq_may_queue(struct request_queue *q, unsigned int op)
4331{
4332        struct cfq_data *cfqd = q->elevator->elevator_data;
4333        struct task_struct *tsk = current;
4334        struct cfq_io_cq *cic;
4335        struct cfq_queue *cfqq;
4336
4337        /*
4338         * don't force setup of a queue from here, as a call to may_queue
4339         * does not necessarily imply that a request actually will be queued.
4340         * so just lookup a possibly existing queue, or return 'may queue'
4341         * if that fails
4342         */
4343        cic = cfq_cic_lookup(cfqd, tsk->io_context);
4344        if (!cic)
4345                return ELV_MQUEUE_MAY;
4346
4347        cfqq = cic_to_cfqq(cic, op_is_sync(op));
4348        if (cfqq) {
4349                cfq_init_prio_data(cfqq, cic);
4350                cfqq_boost_on_prio(cfqq, op);
4351
4352                return __cfq_may_queue(cfqq);
4353        }
4354
4355        return ELV_MQUEUE_MAY;
4356}
4357
4358/*
4359 * queue lock held here
4360 */
4361static void cfq_put_request(struct request *rq)
4362{
4363        struct cfq_queue *cfqq = RQ_CFQQ(rq);
4364
4365        if (cfqq) {
4366                const int rw = rq_data_dir(rq);
4367
4368                BUG_ON(!cfqq->allocated[rw]);
4369                cfqq->allocated[rw]--;
4370
4371                /* Put down rq reference on cfqg */
4372                cfqg_put(RQ_CFQG(rq));
4373                rq->elv.priv[0] = NULL;
4374                rq->elv.priv[1] = NULL;
4375
4376                cfq_put_queue(cfqq);
4377        }
4378}
4379
4380static struct cfq_queue *
4381cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
4382                struct cfq_queue *cfqq)
4383{
4384        cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4385        cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4386        cfq_mark_cfqq_coop(cfqq->new_cfqq);
4387        cfq_put_queue(cfqq);
4388        return cic_to_cfqq(cic, 1);
4389}
4390
4391/*
4392 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4393 * was the last process referring to said cfqq.
4394 */
4395static struct cfq_queue *
4396split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4397{
4398        if (cfqq_process_refs(cfqq) == 1) {
4399                cfqq->pid = current->pid;
4400                cfq_clear_cfqq_coop(cfqq);
4401                cfq_clear_cfqq_split_coop(cfqq);
4402                return cfqq;
4403        }
4404
4405        cic_set_cfqq(cic, NULL, 1);
4406
4407        cfq_put_cooperator(cfqq);
4408
4409        cfq_put_queue(cfqq);
4410        return NULL;
4411}
4412/*
4413 * Allocate cfq data structures associated with this request.
4414 */
4415static int
4416cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4417                gfp_t gfp_mask)
4418{
4419        struct cfq_data *cfqd = q->elevator->elevator_data;
4420        struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4421        const int rw = rq_data_dir(rq);
4422        const bool is_sync = rq_is_sync(rq);
4423        struct cfq_queue *cfqq;
4424
4425        spin_lock_irq(q->queue_lock);
4426
4427        check_ioprio_changed(cic, bio);
4428        check_blkcg_changed(cic, bio);
4429new_queue:
4430        cfqq = cic_to_cfqq(cic, is_sync);
4431        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4432                if (cfqq)
4433                        cfq_put_queue(cfqq);
4434                cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
4435                cic_set_cfqq(cic, cfqq, is_sync);
4436        } else {
4437                /*
4438                 * If the queue was seeky for too long, break it apart.
4439                 */
4440                if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4441                        cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4442                        cfqq = split_cfqq(cic, cfqq);
4443                        if (!cfqq)
4444                                goto new_queue;
4445                }
4446
4447                /*
4448                 * Check to see if this queue is scheduled to merge with
4449                 * another, closely cooperating queue.  The merging of
4450                 * queues happens here as it must be done in process context.
4451                 * The reference on new_cfqq was taken in merge_cfqqs.
4452                 */
4453                if (cfqq->new_cfqq)
4454                        cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4455        }
4456
4457        cfqq->allocated[rw]++;
4458
4459        cfqq->ref++;
4460        cfqg_get(cfqq->cfqg);
4461        rq->elv.priv[0] = cfqq;
4462        rq->elv.priv[1] = cfqq->cfqg;
4463        spin_unlock_irq(q->queue_lock);
4464
4465        return 0;
4466}
4467
4468static void cfq_kick_queue(struct work_struct *work)
4469{
4470        struct cfq_data *cfqd =
4471                container_of(work, struct cfq_data, unplug_work);
4472        struct request_queue *q = cfqd->queue;
4473
4474        spin_lock_irq(q->queue_lock);
4475        __blk_run_queue(cfqd->queue);
4476        spin_unlock_irq(q->queue_lock);
4477}
4478
4479/*
4480 * Timer running if the active_queue is currently idling inside its time slice
4481 */
4482static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
4483{
4484        struct cfq_data *cfqd = container_of(timer, struct cfq_data,
4485                                             idle_slice_timer);
4486        struct cfq_queue *cfqq;
4487        unsigned long flags;
4488        int timed_out = 1;
4489
4490        cfq_log(cfqd, "idle timer fired");
4491
4492        spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4493
4494        cfqq = cfqd->active_queue;
4495        if (cfqq) {
4496                timed_out = 0;
4497
4498                /*
4499                 * We saw a request before the queue expired, let it through
4500                 */
4501                if (cfq_cfqq_must_dispatch(cfqq))
4502                        goto out_kick;
4503
4504                /*
4505                 * expired
4506                 */
4507                if (cfq_slice_used(cfqq))
4508                        goto expire;
4509
4510                /*
4511                 * only expire and reinvoke request handler, if there are
4512                 * other queues with pending requests
4513                 */
4514                if (!cfqd->busy_queues)
4515                        goto out_cont;
4516
4517                /*
4518                 * not expired and it has a request pending, let it dispatch
4519                 */
4520                if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4521                        goto out_kick;
4522
4523                /*
4524                 * Queue depth flag is reset only when the idle didn't succeed
4525                 */
4526                cfq_clear_cfqq_deep(cfqq);
4527        }
4528expire:
4529        cfq_slice_expired(cfqd, timed_out);
4530out_kick:
4531        cfq_schedule_dispatch(cfqd);
4532out_cont:
4533        spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4534        return HRTIMER_NORESTART;
4535}
4536
4537static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4538{
4539        hrtimer_cancel(&cfqd->idle_slice_timer);
4540        cancel_work_sync(&cfqd->unplug_work);
4541}
4542
4543static void cfq_exit_queue(struct elevator_queue *e)
4544{
4545        struct cfq_data *cfqd = e->elevator_data;
4546        struct request_queue *q = cfqd->queue;
4547
4548        cfq_shutdown_timer_wq(cfqd);
4549
4550        spin_lock_irq(q->queue_lock);
4551
4552        if (cfqd->active_queue)
4553                __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4554
4555        spin_unlock_irq(q->queue_lock);
4556
4557        cfq_shutdown_timer_wq(cfqd);
4558
4559#ifdef CONFIG_CFQ_GROUP_IOSCHED
4560        blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4561#else
4562        kfree(cfqd->root_group);
4563#endif
4564        kfree(cfqd);
4565}
4566
4567static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4568{
4569        struct cfq_data *cfqd;
4570        struct blkcg_gq *blkg __maybe_unused;
4571        int i, ret;
4572        struct elevator_queue *eq;
4573
4574        eq = elevator_alloc(q, e);
4575        if (!eq)
4576                return -ENOMEM;
4577
4578        cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
4579        if (!cfqd) {
4580                kobject_put(&eq->kobj);
4581                return -ENOMEM;
4582        }
4583        eq->elevator_data = cfqd;
4584
4585        cfqd->queue = q;
4586        spin_lock_irq(q->queue_lock);
4587        q->elevator = eq;
4588        spin_unlock_irq(q->queue_lock);
4589
4590        /* Init root service tree */
4591        cfqd->grp_service_tree = CFQ_RB_ROOT;
4592
4593        /* Init root group and prefer root group over other groups by default */
4594#ifdef CONFIG_CFQ_GROUP_IOSCHED
4595        ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4596        if (ret)
4597                goto out_free;
4598
4599        cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4600#else
4601        ret = -ENOMEM;
4602        cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4603                                        GFP_KERNEL, cfqd->queue->node);
4604        if (!cfqd->root_group)
4605                goto out_free;
4606
4607        cfq_init_cfqg_base(cfqd->root_group);
4608        cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4609        cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4610#endif
4611
4612        /*
4613         * Not strictly needed (since RB_ROOT just clears the node and we
4614         * zeroed cfqd on alloc), but better be safe in case someone decides
4615         * to add magic to the rb code
4616         */
4617        for (i = 0; i < CFQ_PRIO_LISTS; i++)
4618                cfqd->prio_trees[i] = RB_ROOT;
4619
4620        /*
4621         * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
4622         * Grab a permanent reference to it, so that the normal code flow
4623         * will not attempt to free it.  oom_cfqq is linked to root_group
4624         * but shouldn't hold a reference as it'll never be unlinked.  Lose
4625         * the reference from linking right away.
4626         */
4627        cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4628        cfqd->oom_cfqq.ref++;
4629
4630        spin_lock_irq(q->queue_lock);
4631        cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4632        cfqg_put(cfqd->root_group);
4633        spin_unlock_irq(q->queue_lock);
4634
4635        hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
4636                     HRTIMER_MODE_REL);
4637        cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4638
4639        INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4640
4641        cfqd->cfq_quantum = cfq_quantum;
4642        cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4643        cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4644        cfqd->cfq_back_max = cfq_back_max;
4645        cfqd->cfq_back_penalty = cfq_back_penalty;
4646        cfqd->cfq_slice[0] = cfq_slice_async;
4647        cfqd->cfq_slice[1] = cfq_slice_sync;
4648        cfqd->cfq_target_latency = cfq_target_latency;
4649        cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4650        cfqd->cfq_slice_idle = cfq_slice_idle;
4651        cfqd->cfq_group_idle = cfq_group_idle;
4652        cfqd->cfq_latency = 1;
4653        cfqd->hw_tag = -1;
4654        /*
4655         * we optimistically start assuming sync ops weren't delayed in last
4656         * second, in order to have larger depth for async operations.
4657         */
4658        cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
4659        return 0;
4660
4661out_free:
4662        kfree(cfqd);
4663        kobject_put(&eq->kobj);
4664        return ret;
4665}
4666
4667static void cfq_registered_queue(struct request_queue *q)
4668{
4669        struct elevator_queue *e = q->elevator;
4670        struct cfq_data *cfqd = e->elevator_data;
4671
4672        /*
4673         * Default to IOPS mode with no idling for SSDs
4674         */
4675        if (blk_queue_nonrot(q))
4676                cfqd->cfq_slice_idle = 0;
4677        wbt_disable_default(q);
4678}
4679
4680/*
4681 * sysfs parts below -->
4682 */
4683static ssize_t
4684cfq_var_show(unsigned int var, char *page)
4685{
4686        return sprintf(page, "%u\n", var);
4687}
4688
4689static void
4690cfq_var_store(unsigned int *var, const char *page)
4691{
4692        char *p = (char *) page;
4693
4694        *var = simple_strtoul(p, &p, 10);
4695}
4696
4697#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
4698static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
4699{                                                                       \
4700        struct cfq_data *cfqd = e->elevator_data;                       \
4701        u64 __data = __VAR;                                             \
4702        if (__CONV)                                                     \
4703                __data = div_u64(__data, NSEC_PER_MSEC);                        \
4704        return cfq_var_show(__data, (page));                            \
4705}
4706SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4707SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4708SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4709SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4710SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4711SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4712SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4713SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4714SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4715SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4716SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4717SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4718#undef SHOW_FUNCTION
4719
4720#define USEC_SHOW_FUNCTION(__FUNC, __VAR)                               \
4721static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
4722{                                                                       \
4723        struct cfq_data *cfqd = e->elevator_data;                       \
4724        u64 __data = __VAR;                                             \
4725        __data = div_u64(__data, NSEC_PER_USEC);                        \
4726        return cfq_var_show(__data, (page));                            \
4727}
4728USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
4729USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
4730USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
4731USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
4732USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
4733#undef USEC_SHOW_FUNCTION
4734
4735#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
4736static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4737{                                                                       \
4738        struct cfq_data *cfqd = e->elevator_data;                       \
4739        unsigned int __data, __min = (MIN), __max = (MAX);              \
4740                                                                        \
4741        cfq_var_store(&__data, (page));                                 \
4742        if (__data < __min)                                             \
4743                __data = __min;                                         \
4744        else if (__data > __max)                                        \
4745                __data = __max;                                         \
4746        if (__CONV)                                                     \
4747                *(__PTR) = (u64)__data * NSEC_PER_MSEC;                 \
4748        else                                                            \
4749                *(__PTR) = __data;                                      \
4750        return count;                                                   \
4751}
4752STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4753STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4754                UINT_MAX, 1);
4755STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4756                UINT_MAX, 1);
4757STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4758STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4759                UINT_MAX, 0);
4760STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4761STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4762STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4763STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4764STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4765                UINT_MAX, 0);
4766STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4767STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4768#undef STORE_FUNCTION
4769
4770#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                    \
4771static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4772{                                                                       \
4773        struct cfq_data *cfqd = e->elevator_data;                       \
4774        unsigned int __data, __min = (MIN), __max = (MAX);              \
4775                                                                        \
4776        cfq_var_store(&__data, (page));                                 \
4777        if (__data < __min)                                             \
4778                __data = __min;                                         \
4779        else if (__data > __max)                                        \
4780                __data = __max;                                         \
4781        *(__PTR) = (u64)__data * NSEC_PER_USEC;                         \
4782        return count;                                                   \
4783}
4784USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
4785USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
4786USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
4787USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
4788USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
4789#undef USEC_STORE_FUNCTION
4790
4791#define CFQ_ATTR(name) \
4792        __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
4793
4794static struct elv_fs_entry cfq_attrs[] = {
4795        CFQ_ATTR(quantum),
4796        CFQ_ATTR(fifo_expire_sync),
4797        CFQ_ATTR(fifo_expire_async),
4798        CFQ_ATTR(back_seek_max),
4799        CFQ_ATTR(back_seek_penalty),
4800        CFQ_ATTR(slice_sync),
4801        CFQ_ATTR(slice_sync_us),
4802        CFQ_ATTR(slice_async),
4803        CFQ_ATTR(slice_async_us),
4804        CFQ_ATTR(slice_async_rq),
4805        CFQ_ATTR(slice_idle),
4806        CFQ_ATTR(slice_idle_us),
4807        CFQ_ATTR(group_idle),
4808        CFQ_ATTR(group_idle_us),
4809        CFQ_ATTR(low_latency),
4810        CFQ_ATTR(target_latency),
4811        CFQ_ATTR(target_latency_us),
4812        __ATTR_NULL
4813};
4814
4815static struct elevator_type iosched_cfq = {
4816        .ops.sq = {
4817                .elevator_merge_fn =            cfq_merge,
4818                .elevator_merged_fn =           cfq_merged_request,
4819                .elevator_merge_req_fn =        cfq_merged_requests,
4820                .elevator_allow_bio_merge_fn =  cfq_allow_bio_merge,
4821                .elevator_allow_rq_merge_fn =   cfq_allow_rq_merge,
4822                .elevator_bio_merged_fn =       cfq_bio_merged,
4823                .elevator_dispatch_fn =         cfq_dispatch_requests,
4824                .elevator_add_req_fn =          cfq_insert_request,
4825                .elevator_activate_req_fn =     cfq_activate_request,
4826                .elevator_deactivate_req_fn =   cfq_deactivate_request,
4827                .elevator_completed_req_fn =    cfq_completed_request,
4828                .elevator_former_req_fn =       elv_rb_former_request,
4829                .elevator_latter_req_fn =       elv_rb_latter_request,
4830                .elevator_init_icq_fn =         cfq_init_icq,
4831                .elevator_exit_icq_fn =         cfq_exit_icq,
4832                .elevator_set_req_fn =          cfq_set_request,
4833                .elevator_put_req_fn =          cfq_put_request,
4834                .elevator_may_queue_fn =        cfq_may_queue,
4835                .elevator_init_fn =             cfq_init_queue,
4836                .elevator_exit_fn =             cfq_exit_queue,
4837                .elevator_registered_fn =       cfq_registered_queue,
4838        },
4839        .icq_size       =       sizeof(struct cfq_io_cq),
4840        .icq_align      =       __alignof__(struct cfq_io_cq),
4841        .elevator_attrs =       cfq_attrs,
4842        .elevator_name  =       "cfq",
4843        .elevator_owner =       THIS_MODULE,
4844};
4845
4846#ifdef CONFIG_CFQ_GROUP_IOSCHED
4847static struct blkcg_policy blkcg_policy_cfq = {
4848        .dfl_cftypes            = cfq_blkcg_files,
4849        .legacy_cftypes         = cfq_blkcg_legacy_files,
4850
4851        .cpd_alloc_fn           = cfq_cpd_alloc,
4852        .cpd_init_fn            = cfq_cpd_init,
4853        .cpd_free_fn            = cfq_cpd_free,
4854        .cpd_bind_fn            = cfq_cpd_bind,
4855
4856        .pd_alloc_fn            = cfq_pd_alloc,
4857        .pd_init_fn             = cfq_pd_init,
4858        .pd_offline_fn          = cfq_pd_offline,
4859        .pd_free_fn             = cfq_pd_free,
4860        .pd_reset_stats_fn      = cfq_pd_reset_stats,
4861};
4862#endif
4863
4864static int __init cfq_init(void)
4865{
4866        int ret;
4867
4868#ifdef CONFIG_CFQ_GROUP_IOSCHED
4869        ret = blkcg_policy_register(&blkcg_policy_cfq);
4870        if (ret)
4871                return ret;
4872#else
4873        cfq_group_idle = 0;
4874#endif
4875
4876        ret = -ENOMEM;
4877        cfq_pool = KMEM_CACHE(cfq_queue, 0);
4878        if (!cfq_pool)
4879                goto err_pol_unreg;
4880
4881        ret = elv_register(&iosched_cfq);
4882        if (ret)
4883                goto err_free_pool;
4884
4885        return 0;
4886
4887err_free_pool:
4888        kmem_cache_destroy(cfq_pool);
4889err_pol_unreg:
4890#ifdef CONFIG_CFQ_GROUP_IOSCHED
4891        blkcg_policy_unregister(&blkcg_policy_cfq);
4892#endif
4893        return ret;
4894}
4895
4896static void __exit cfq_exit(void)
4897{
4898#ifdef CONFIG_CFQ_GROUP_IOSCHED
4899        blkcg_policy_unregister(&blkcg_policy_cfq);
4900#endif
4901        elv_unregister(&iosched_cfq);
4902        kmem_cache_destroy(cfq_pool);
4903}
4904
4905module_init(cfq_init);
4906module_exit(cfq_exit);
4907
4908MODULE_AUTHOR("Jens Axboe");
4909MODULE_LICENSE("GPL");
4910MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
4911