linux/block/kyber-iosched.c
   1/*
   2 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
   3 * scalable techniques.
   4 *
   5 * Copyright (C) 2017 Facebook
   6 *
   7 * This program is free software; you can redistribute it and/or
   8 * modify it under the terms of the GNU General Public
   9 * License v2 as published by the Free Software Foundation.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  18 */
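/*
 * Usage sketch, assuming the standard block-layer sysfs layout: select the
 * scheduler with "echo kyber > /sys/block/<dev>/queue/scheduler", then tune
 * the target latencies through the read_lat_nsec and write_lat_nsec
 * attributes under /sys/block/<dev>/queue/iosched/. The defaults set below
 * are 2 ms for reads and 10 ms for synchronous writes.
 */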
  19
  20#include <linux/kernel.h>
  21#include <linux/blkdev.h>
  22#include <linux/blk-mq.h>
  23#include <linux/elevator.h>
  24#include <linux/module.h>
  25#include <linux/sbitmap.h>
  26
  27#include "blk.h"
  28#include "blk-mq.h"
  29#include "blk-mq-debugfs.h"
  30#include "blk-mq-sched.h"
  31#include "blk-mq-tag.h"
  32#include "blk-stat.h"
  33
  34/* Scheduling domains. */
  35enum {
  36        KYBER_READ,
  37        KYBER_SYNC_WRITE,
  38        KYBER_OTHER, /* Async writes, discard, etc. */
  39        KYBER_NUM_DOMAINS,
  40};
  41
  42enum {
  43        KYBER_MIN_DEPTH = 256,
  44
  45        /*
  46         * In order to prevent starvation of synchronous requests by a flood of
  47         * asynchronous requests, we reserve 25% of requests for synchronous
  48         * operations.
  49         */
  50        KYBER_ASYNC_PERCENT = 75,
  51};
  52
  53/*
  54 * Initial device-wide depths for each scheduling domain.
  55 *
  56 * Even for fast devices with lots of tags like NVMe, you can saturate
  57 * the device with only a fraction of the maximum possible queue depth.
  58 * So, we cap these to a reasonable value.
  59 */
  60static const unsigned int kyber_depth[] = {
  61        [KYBER_READ] = 256,
  62        [KYBER_SYNC_WRITE] = 128,
  63        [KYBER_OTHER] = 64,
  64};
  65
  66/*
  67 * Scheduling domain batch sizes. We favor reads.
  68 */
  69static const unsigned int kyber_batch_size[] = {
  70        [KYBER_READ] = 16,
  71        [KYBER_SYNC_WRITE] = 8,
  72        [KYBER_OTHER] = 8,
  73};
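/*
 * kyber_dispatch_request() dispatches up to a full batch from the current
 * domain before rotating round-robin to the next one, so reads are served
 * in runs of up to 16 requests while writes get runs of up to 8.
 */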
  74
  75struct kyber_queue_data {
  76        struct request_queue *q;
  77
  78        struct blk_stat_callback *cb;
  79
  80        /*
  81         * The device is divided into multiple scheduling domains based on the
  82         * request type. Each domain has a fixed number of in-flight requests of
  83         * that type device-wide, limited by these tokens.
  84         */
  85        struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
  86
  87        /*
  88         * Async request percentage, converted to per-word depth for
  89         * sbitmap_get_shallow().
  90         */
  91        unsigned int async_depth;
  92
  93        /* Target latencies in nanoseconds. */
  94        u64 read_lat_nsec, write_lat_nsec;
  95};
  96
  97struct kyber_hctx_data {
  98        spinlock_t lock;
  99        struct list_head rqs[KYBER_NUM_DOMAINS];
 100        unsigned int cur_domain;
 101        unsigned int batching;
 102        wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
 103        struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
 104        atomic_t wait_index[KYBER_NUM_DOMAINS];
 105};
 106
 107static bool op_is_sync(unsigned int cmd_flags)
 108{
 109        return rw_is_sync(cmd_flags) || (cmd_flags & (REQ_FUA | REQ_FLUSH));
 110}
 111
 112static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
 113                             void *key);
 114
 115static int rq_sched_domain(const struct request *rq)
 116{
 117        unsigned int cmd_flags = (unsigned int)rq->cmd_flags;
 118
 119        if (!(cmd_flags & REQ_WRITE))
 120                return KYBER_READ;
 121        else if ((cmd_flags & REQ_WRITE) && op_is_sync(cmd_flags))
 122                return KYBER_SYNC_WRITE;
 123        else
 124                return KYBER_OTHER;
 125}
 126
 127enum {
 128        NONE = 0,
 129        GOOD = 1,
 130        GREAT = 2,
 131        BAD = -1,
 132        AWFUL = -2,
 133};
 134
 135#define IS_GOOD(status) ((status) > 0)
 136#define IS_BAD(status) ((status) < 0)
 137
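/*
 * Classify the mean latency of a domain against its target. For example,
 * with the default 2 ms read target, a mean read latency of 5 ms is AWFUL,
 * 3 ms is BAD, 1.5 ms is GOOD, and 0.8 ms is GREAT.
 */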
 138static int kyber_lat_status(struct blk_stat_callback *cb,
 139                            unsigned int sched_domain, u64 target)
 140{
 141        u64 latency;
 142
 143        if (!cb->stat[sched_domain].nr_samples)
 144                return NONE;
 145
 146        latency = cb->stat[sched_domain].mean;
 147        if (latency >= 2 * target)
 148                return AWFUL;
 149        else if (latency > target)
 150                return BAD;
 151        else if (latency <= target / 2)
 152                return GREAT;
 153        else /* (latency <= target) */
 154                return GOOD;
 155}
 156
 157/*
 158 * Adjust the read or synchronous write depth given the status of reads and
 159 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 160 * one is good, then the other is good).
 161 */
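/*
 * For example, if reads are GREAT while sync writes are AWFUL, the read
 * depth is halved to free up device capacity; if reads are BAD while
 * writes are doing fine, the read depth is bumped up by one.
 */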
 162static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
 163                                  unsigned int sched_domain, int this_status,
 164                                  int other_status)
 165{
 166        unsigned int orig_depth, depth;
 167
 168        /*
 169         * If this domain had no samples, or reads and writes are both good or
 170         * both bad, don't adjust the depth.
 171         */
 172        if (this_status == NONE ||
 173            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
 174            (IS_BAD(this_status) && IS_BAD(other_status)))
 175                return;
 176
 177        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
 178
 179        if (other_status == NONE) {
 180                depth++;
 181        } else {
 182                switch (this_status) {
 183                case GOOD:
 184                        if (other_status == AWFUL)
 185                                depth -= max(depth / 4, 1U);
 186                        else
 187                                depth -= max(depth / 8, 1U);
 188                        break;
 189                case GREAT:
 190                        if (other_status == AWFUL)
 191                                depth /= 2;
 192                        else
 193                                depth -= max(depth / 4, 1U);
 194                        break;
 195                case BAD:
 196                        depth++;
 197                        break;
 198                case AWFUL:
 199                        if (other_status == GREAT)
 200                                depth += 2;
 201                        else
 202                                depth++;
 203                        break;
 204                }
 205        }
 206
 207        depth = clamp(depth, 1U, kyber_depth[sched_domain]);
 208        if (depth != orig_depth)
 209                sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
 210}
 211
 212/*
 213 * Adjust the depth of other requests given the status of reads and synchronous
 214 * writes. As long as either domain is doing fine, we don't throttle, but if
 215 * both domains are doing badly, we throttle heavily.
 216 */
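/*
 * For example, if reads are GREAT while sync writes are AWFUL, max() picks
 * GREAT and the KYBER_OTHER depth still grows by two; it only shrinks when
 * the better of the two domains is BAD or AWFUL.
 */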
 217static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
 218                                     int read_status, int write_status,
 219                                     bool have_samples)
 220{
 221        unsigned int orig_depth, depth;
 222        int status;
 223
 224        orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
 225
 226        if (read_status == NONE && write_status == NONE) {
 227                depth += 2;
 228        } else if (have_samples) {
 229                if (read_status == NONE)
 230                        status = write_status;
 231                else if (write_status == NONE)
 232                        status = read_status;
 233                else
 234                        status = max(read_status, write_status);
 235                switch (status) {
 236                case GREAT:
 237                        depth += 2;
 238                        break;
 239                case GOOD:
 240                        depth++;
 241                        break;
 242                case BAD:
 243                        depth -= max(depth / 4, 1U);
 244                        break;
 245                case AWFUL:
 246                        depth /= 2;
 247                        break;
 248                }
 249        }
 250
 251        depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
 252        if (depth != orig_depth)
 253                sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
 254}
 255
 256/*
 257 * Apply heuristics for limiting queue depths based on gathered latency
 258 * statistics.
 259 */
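/*
 * This is the timer function of the blk-stat callback allocated in
 * kyber_queue_data_alloc(); it runs once a monitoring window armed with
 * blk_stat_activate_msecs() expires. cb->stat[] holds one latency bucket
 * per scheduling domain, grouped by rq_sched_domain().
 */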
 260static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
 261{
 262        struct kyber_queue_data *kqd = cb->data;
 263        int read_status, write_status;
 264
 265        read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
 266        write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
 267
 268        kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
 269        kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
 270        kyber_adjust_other_depth(kqd, read_status, write_status,
 271                                 cb->stat[KYBER_OTHER].nr_samples != 0);
 272
 273        /*
 274         * Continue monitoring latencies if we aren't hitting the targets or
 275         * we're still throttling other requests.
 276         */
 277        if (!blk_stat_is_active(kqd->cb) &&
 278            ((IS_BAD(read_status) || IS_BAD(write_status) ||
 279              kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
 280                blk_stat_activate_msecs(kqd->cb, 100);
 281}
 282
 283static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
 284{
 285        /*
 286         * All of the hardware queues have the same depth, so we can just grab
 287         * the shift of the first one.
 288         */
 289        return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 290}
 291
 292static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 293{
 294        struct kyber_queue_data *kqd;
 295        unsigned int max_tokens;
 296        unsigned int shift;
 297        int ret = -ENOMEM;
 298        int i;
 299
 300        kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
 301        if (!kqd)
 302                goto err;
 303        kqd->q = q;
 304
 305        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
 306                                          KYBER_NUM_DOMAINS, kqd);
 307        if (!kqd->cb)
 308                goto err_kqd;
 309
 310        /*
 311         * The maximum number of tokens for any scheduling domain is at least
 312         * the queue depth of a single hardware queue. If the hardware doesn't
 313         * have many tags, still provide a reasonable number.
 314         */
 315        max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
 316                           KYBER_MIN_DEPTH);
 317        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 318                WARN_ON(!kyber_depth[i]);
 319                WARN_ON(!kyber_batch_size[i]);
 320                ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
 321                                              max_tokens, -1, false, GFP_KERNEL,
 322                                              q->node);
 323                if (ret) {
 324                        while (--i >= 0)
 325                                sbitmap_queue_free(&kqd->domain_tokens[i]);
 326                        goto err_cb;
 327                }
 328                sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
 329        }
 330
 331        shift = kyber_sched_tags_shift(kqd);
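        /*
         * For example, with a shift of 6 (64 tags per sbitmap word), this
         * works out to 64 * 75 / 100 = 48: asynchronous allocations may use
         * at most 48 bits of each word, reserving the other 25% for
         * synchronous I/O.
         */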
 332        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 333
 334        kqd->read_lat_nsec = 2000000ULL;
 335        kqd->write_lat_nsec = 10000000ULL;
 336
 337        return kqd;
 338
 339err_cb:
 340        blk_stat_free_callback(kqd->cb);
 341err_kqd:
 342        kfree(kqd);
 343err:
 344        return ERR_PTR(ret);
 345}
 346
 347static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
 348{
 349        struct kyber_queue_data *kqd;
 350        struct elevator_queue *eq;
 351
 352        mark_tech_preview("kyber", THIS_MODULE);
 353
 354        eq = elevator_alloc(q, e);
 355        if (!eq)
 356                return -ENOMEM;
 357
 358        kqd = kyber_queue_data_alloc(q);
 359        if (IS_ERR(kqd)) {
 360                kobject_put(&eq->kobj);
 361                return PTR_ERR(kqd);
 362        }
 363
 364        eq->elevator_data = kqd;
 365        q->elevator = eq;
 366
 367        blk_stat_add_callback(q, kqd->cb);
 368
 369        return 0;
 370}
 371
 372static void kyber_exit_sched(struct elevator_queue *e)
 373{
 374        struct kyber_queue_data *kqd = e->elevator_data;
 375        struct request_queue *q = kqd->q;
 376        int i;
 377
 378        blk_stat_remove_callback(q, kqd->cb);
 379
 380        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 381                sbitmap_queue_free(&kqd->domain_tokens[i]);
 382        blk_stat_free_callback(kqd->cb);
 383        kfree(kqd);
 384}
 385
 386static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 387{
 388        struct kyber_hctx_data *khd;
 389        int i;
 390
 391        khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
 392        if (!khd)
 393                return -ENOMEM;
 394
 395        spin_lock_init(&khd->lock);
 396
 397        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 398                INIT_LIST_HEAD(&khd->rqs[i]);
 399                init_waitqueue_func_entry(&khd->domain_wait[i],
 400                                          kyber_domain_wake);
 401                khd->domain_wait[i].private = hctx;
 402                INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
 403                atomic_set(&khd->wait_index[i], 0);
 404        }
 405
 406        khd->cur_domain = 0;
 407        khd->batching = 0;
 408
 409        hctx->sched_data = khd;
 410
 411        return 0;
 412}
 413
 414static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 415{
 416        kfree(hctx->sched_data);
 417}
 418
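/*
 * The domain token index is stashed in rq->elv.priv[0]. A value of -1,
 * set in kyber_get_request(), means the request does not hold a token.
 */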
 419static int rq_get_domain_token(struct request *rq)
 420{
 421        return (long)rq->elv.priv[0];
 422}
 423
 424static void rq_set_domain_token(struct request *rq, int token)
 425{
 426        rq->elv.priv[0] = (void *)(long)token;
 427}
 428
 429static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 430                                  struct request *rq)
 431{
 432        unsigned int sched_domain;
 433        int nr;
 434
 435        nr = rq_get_domain_token(rq);
 436        if (nr != -1) {
 437                sched_domain = rq_sched_domain(rq);
 438                sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
 439                                    rq->mq_ctx->cpu);
 440        }
 441}
 442
 443static struct request *kyber_get_request(struct request_queue *q,
 444                                         unsigned int op,
 445                                         struct blk_mq_alloc_data *data)
 446{
 447        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 448        struct request *rq;
 449
 450        /*
 451         * We use the scheduler tags as per-hardware queue queueing tokens.
 452         * Async requests can be limited at this stage.
 453         */
 454        if (!op_is_sync(op))
 455                data->shallow_depth = kqd->async_depth;
 456
 457        rq = __blk_mq_alloc_request(data, op);
 458        if (rq)
 459                rq_set_domain_token(rq, -1);
 460        return rq;
 461}
 462
 463static void kyber_put_request(struct request *rq)
 464{
 465        struct request_queue *q = rq->q;
 466        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 467
 468        rq_clear_domain_token(kqd, rq);
 469        blk_mq_finish_request(rq);
 470}
 471
 472static void kyber_requeue_request(struct request *rq)
 473{
 474        struct request_queue *q = rq->q;
 475        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 476
 477        rq_clear_domain_token(kqd, rq);
 478}
 479
 480static void kyber_completed_request(struct request *rq)
 481{
 482        struct request_queue *q = rq->q;
 483        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 484        unsigned int sched_domain;
 485        u64 now, latency, target;
 486
 487        /*
 488         * Check if this request met our latency goal. If not, quickly gather
 489         * some statistics and start throttling.
 490         */
 491        sched_domain = rq_sched_domain(rq);
 492        switch (sched_domain) {
 493        case KYBER_READ:
 494                target = kqd->read_lat_nsec;
 495                break;
 496        case KYBER_SYNC_WRITE:
 497                target = kqd->write_lat_nsec;
 498                break;
 499        default:
 500                return;
 501        }
 502
 503        /* If we are already monitoring latencies, don't check again. */
 504        if (blk_stat_is_active(kqd->cb))
 505                return;
 506
 507        now = __blk_stat_time(ktime_to_ns(ktime_get()));
 508        if (now < blk_stat_time(&rq_aux(rq)->issue_stat))
 509                return;
 510
 511        latency = now - blk_stat_time(&rq_aux(rq)->issue_stat);
 512
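        /*
         * If the request missed its target, arm a short 10 ms statistics
         * window; kyber_stat_timer_fn() will then decide whether to start
         * throttling queue depths.
         */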
 513        if (latency > target)
 514                blk_stat_activate_msecs(kqd->cb, 10);
 515}
 516
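/*
 * Pull any requests out of the per-CPU software queues and sort them into
 * this hardware queue's per-domain lists.
 */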
 517static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
 518                                  struct blk_mq_hw_ctx *hctx)
 519{
 520        LIST_HEAD(rq_list);
 521        struct request *rq, *next;
 522
 523        blk_mq_flush_busy_ctxs(hctx, &rq_list);
 524        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 525                unsigned int sched_domain;
 526
 527                sched_domain = rq_sched_domain(rq);
 528                list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
 529        }
 530}
 531
 532static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
 533                             void *key)
 534{
 535        struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
 536
 537        list_del_init(&wait->task_list);
 538        blk_mq_run_hw_queue(hctx, true);
 539        return 1;
 540}
 541
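/*
 * Try to grab a token for the current domain. If none are free, hook onto
 * the sbitmap wait queue so the hardware queue is re-run (via
 * kyber_domain_wake()) when a token is released, then retry once to close
 * the race with a concurrent free.
 */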
 542static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 543                                  struct kyber_hctx_data *khd,
 544                                  struct blk_mq_hw_ctx *hctx)
 545{
 546        unsigned int sched_domain = khd->cur_domain;
 547        struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
 548        wait_queue_t *wait = &khd->domain_wait[sched_domain];
 549        struct sbq_wait_state *ws;
 550        int nr;
 551
 552        nr = __sbitmap_queue_get(domain_tokens);
 553
 554        /*
 555         * If we failed to get a domain token, make sure the hardware queue is
 556         * run when one becomes available. Note that this is serialized on
 557         * khd->lock, but we still need to be careful about the waker.
 558         */
 559        if (nr < 0 && list_empty_careful(&wait->task_list)) {
 560                ws = sbq_wait_ptr(domain_tokens,
 561                                  &khd->wait_index[sched_domain]);
 562                khd->domain_ws[sched_domain] = ws;
 563                add_wait_queue(&ws->wait, wait);
 564
 565                /*
 566                 * Try again in case a token was freed before we got on the wait
 567                 * queue.
 568                 */
 569                nr = __sbitmap_queue_get(domain_tokens);
 570        }
 571
 572        /*
 573         * If we got a token while we were on the wait queue, remove ourselves
 574         * from the wait queue to ensure that all wake ups make forward
 575         * progress. It's possible that the waker already deleted the entry
 576         * between the !list_empty_careful() check and us grabbing the lock, but
 577         * list_del_init() is okay with that.
 578         */
 579        if (nr >= 0 && !list_empty_careful(&wait->task_list)) {
 580                ws = khd->domain_ws[sched_domain];
 581                spin_lock_irq(&ws->wait.lock);
 582                list_del_init(&wait->task_list);
 583                spin_unlock_irq(&ws->wait.lock);
 584        }
 585
 586        return nr;
 587}
 588
 589static struct request *
 590kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 591                          struct kyber_hctx_data *khd,
 592                          struct blk_mq_hw_ctx *hctx,
 593                          bool *flushed)
 594{
 595        struct list_head *rqs;
 596        struct request *rq;
 597        int nr;
 598
 599        rqs = &khd->rqs[khd->cur_domain];
 600        rq = list_first_entry_or_null(rqs, struct request, queuelist);
 601
 602        /*
 603         * If there wasn't already a pending request and we haven't flushed the
 604         * software queues yet, flush the software queues and check again.
 605         */
 606        if (!rq && !*flushed) {
 607                kyber_flush_busy_ctxs(khd, hctx);
 608                *flushed = true;
 609                rq = list_first_entry_or_null(rqs, struct request, queuelist);
 610        }
 611
 612        if (rq) {
 613                nr = kyber_get_domain_token(kqd, khd, hctx);
 614                if (nr >= 0) {
 615                        khd->batching++;
 616                        rq_set_domain_token(rq, nr);
 617                        list_del_init(&rq->queuelist);
 618                        return rq;
 619                }
 620        }
 621
 622        /* There were either no pending requests or no tokens. */
 623        return NULL;
 624}
 625
 626static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 627{
 628        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 629        struct kyber_hctx_data *khd = hctx->sched_data;
 630        bool flushed = false;
 631        struct request *rq;
 632        int i;
 633
 634        spin_lock(&khd->lock);
 635
 636        /*
 637         * First, if we are still entitled to batch, try to dispatch a request
 638         * from the batch.
 639         */
 640        if (khd->batching < kyber_batch_size[khd->cur_domain]) {
 641                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
 642                if (rq)
 643                        goto out;
 644        }
 645
 646        /*
 647         * Either,
 648         * 1. We were no longer entitled to a batch.
 649         * 2. The domain we were batching didn't have any requests.
 650         * 3. The domain we were batching was out of tokens.
 651         *
 652         * Start another batch. Note that this wraps back around to the original
 653         * domain if no other domains have requests or tokens.
 654         */
 655        khd->batching = 0;
 656        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 657                if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
 658                        khd->cur_domain = 0;
 659                else
 660                        khd->cur_domain++;
 661
 662                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
 663                if (rq)
 664                        goto out;
 665        }
 666
 667        rq = NULL;
 668out:
 669        spin_unlock(&khd->lock);
 670        return rq;
 671}
 672
 673static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 674{
 675        struct kyber_hctx_data *khd = hctx->sched_data;
 676        int i;
 677
 678        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 679                if (!list_empty_careful(&khd->rqs[i]))
 680                        return true;
 681        }
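        /*
         * Requests may still be sitting in the software queues, not yet
         * flushed into khd->rqs[], so also check the hardware queue's
         * context map.
         */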
 682        return sbitmap_any_bit_set(&hctx->ctx_map);
 683}
 684
 685#define KYBER_LAT_SHOW_STORE(op)                                        \
 686static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,          \
 687                                     char *page)                        \
 688{                                                                       \
 689        struct kyber_queue_data *kqd = e->elevator_data;                \
 690                                                                        \
 691        return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
 692}                                                                       \
 693                                                                        \
 694static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,         \
 695                                      const char *page, size_t count)   \
 696{                                                                       \
 697        struct kyber_queue_data *kqd = e->elevator_data;                \
 698        unsigned long long nsec;                                        \
 699        int ret;                                                        \
 700                                                                        \
 701        ret = kstrtoull(page, 10, &nsec);                               \
 702        if (ret)                                                        \
 703                return ret;                                             \
 704                                                                        \
 705        kqd->op##_lat_nsec = nsec;                                      \
 706                                                                        \
 707        return count;                                                   \
 708}
 709KYBER_LAT_SHOW_STORE(read);
 710KYBER_LAT_SHOW_STORE(write);
 711#undef KYBER_LAT_SHOW_STORE
 712
 713#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
 714static struct elv_fs_entry kyber_sched_attrs[] = {
 715        KYBER_LAT_ATTR(read),
 716        KYBER_LAT_ATTR(write),
 717        __ATTR_NULL
 718};
 719#undef KYBER_LAT_ATTR
 720
 721#ifdef CONFIG_BLK_DEBUG_FS
 722#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)                        \
 723static int kyber_##name##_tokens_show(void *data, struct seq_file *m)   \
 724{                                                                       \
 725        struct request_queue *q = data;                                 \
 726        struct kyber_queue_data *kqd = q->elevator->elevator_data;      \
 727                                                                        \
 728        sbitmap_queue_show(&kqd->domain_tokens[domain], m);             \
 729        return 0;                                                       \
 730}                                                                       \
 731                                                                        \
 732static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)  \
 733        __acquires(&khd->lock)                                          \
 734{                                                                       \
 735        struct blk_mq_hw_ctx *hctx = m->private;                        \
 736        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 737                                                                        \
 738        spin_lock(&khd->lock);                                          \
 739        return seq_list_start(&khd->rqs[domain], *pos);                 \
 740}                                                                       \
 741                                                                        \
 742static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,       \
 743                                     loff_t *pos)                       \
 744{                                                                       \
 745        struct blk_mq_hw_ctx *hctx = m->private;                        \
 746        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 747                                                                        \
 748        return seq_list_next(v, &khd->rqs[domain], pos);                \
 749}                                                                       \
 750                                                                        \
 751static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)        \
 752        __releases(&khd->lock)                                          \
 753{                                                                       \
 754        struct blk_mq_hw_ctx *hctx = m->private;                        \
 755        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 756                                                                        \
 757        spin_unlock(&khd->lock);                                        \
 758}                                                                       \
 759                                                                        \
 760static const struct seq_operations kyber_##name##_rqs_seq_ops = {       \
 761        .start  = kyber_##name##_rqs_start,                             \
 762        .next   = kyber_##name##_rqs_next,                              \
 763        .stop   = kyber_##name##_rqs_stop,                              \
 764        .show   = blk_mq_debugfs_rq_show,                               \
 765};                                                                      \
 766                                                                        \
 767static int kyber_##name##_waiting_show(void *data, struct seq_file *m)  \
 768{                                                                       \
 769        struct blk_mq_hw_ctx *hctx = data;                              \
 770        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 771        wait_queue_t *wait = &khd->domain_wait[domain];                 \
 772                                                                        \
 773        seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list));   \
 774        return 0;                                                       \
 775}
 776KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
 777KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
 778KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 779#undef KYBER_DEBUGFS_DOMAIN_ATTRS
 780
 781static int kyber_async_depth_show(void *data, struct seq_file *m)
 782{
 783        struct request_queue *q = data;
 784        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 785
 786        seq_printf(m, "%u\n", kqd->async_depth);
 787        return 0;
 788}
 789
 790static int kyber_cur_domain_show(void *data, struct seq_file *m)
 791{
 792        struct blk_mq_hw_ctx *hctx = data;
 793        struct kyber_hctx_data *khd = hctx->sched_data;
 794
 795        switch (khd->cur_domain) {
 796        case KYBER_READ:
 797                seq_puts(m, "READ\n");
 798                break;
 799        case KYBER_SYNC_WRITE:
 800                seq_puts(m, "SYNC_WRITE\n");
 801                break;
 802        case KYBER_OTHER:
 803                seq_puts(m, "OTHER\n");
 804                break;
 805        default:
 806                seq_printf(m, "%u\n", khd->cur_domain);
 807                break;
 808        }
 809        return 0;
 810}
 811
 812static int kyber_batching_show(void *data, struct seq_file *m)
 813{
 814        struct blk_mq_hw_ctx *hctx = data;
 815        struct kyber_hctx_data *khd = hctx->sched_data;
 816
 817        seq_printf(m, "%u\n", khd->batching);
 818        return 0;
 819}
 820
 821#define KYBER_QUEUE_DOMAIN_ATTRS(name)  \
 822        {#name "_tokens", 0400, kyber_##name##_tokens_show}
 823static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 824        KYBER_QUEUE_DOMAIN_ATTRS(read),
 825        KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
 826        KYBER_QUEUE_DOMAIN_ATTRS(other),
 827        {"async_depth", 0400, kyber_async_depth_show},
 828        {},
 829};
 830#undef KYBER_QUEUE_DOMAIN_ATTRS
 831
 832#define KYBER_HCTX_DOMAIN_ATTRS(name)                                   \
 833        {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},   \
 834        {#name "_waiting", 0400, kyber_##name##_waiting_show}
 835static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
 836        KYBER_HCTX_DOMAIN_ATTRS(read),
 837        KYBER_HCTX_DOMAIN_ATTRS(sync_write),
 838        KYBER_HCTX_DOMAIN_ATTRS(other),
 839        {"cur_domain", 0400, kyber_cur_domain_show},
 840        {"batching", 0400, kyber_batching_show},
 841        {},
 842};
 843#undef KYBER_HCTX_DOMAIN_ATTRS
 844#endif
 845
 846static struct elevator_mq_ops kyber_ops = {
 847        .init_sched = kyber_init_sched,
 848        .exit_sched = kyber_exit_sched,
 849        .init_hctx = kyber_init_hctx,
 850        .exit_hctx = kyber_exit_hctx,
 851        .get_request = kyber_get_request,
 852        .put_request = kyber_put_request,
 853        .requeue_request = kyber_requeue_request,
 854        .completed_request = kyber_completed_request,
 855        .dispatch_request = kyber_dispatch_request,
 856        .has_work = kyber_has_work,
 857};
 858
 859static struct elevator_type kyber_sched = {
 860        .elevator_attrs = kyber_sched_attrs,
 861        .elevator_name = "kyber",
 862        .elevator_owner = THIS_MODULE,
 863};
 864
 865static int __init kyber_init(void)
 866{
 867        int ret = elv_register(&kyber_sched);
 868        struct elevator_type_aux *aux;
 869
 870        if (ret)
 871                return ret;
 872        aux = elevator_aux_find(&kyber_sched);
 873        memcpy(&aux->ops.mq, &kyber_ops, sizeof(struct elevator_mq_ops));
 874        aux->uses_mq = true;
 875        aux->queue_debugfs_attrs = kyber_queue_debugfs_attrs;
 876        aux->hctx_debugfs_attrs = kyber_hctx_debugfs_attrs;
 877
 878        return 0;
 879}
 880
 881static void __exit kyber_exit(void)
 882{
 883        elv_unregister(&kyber_sched);
 884}
 885
 886module_init(kyber_init);
 887module_exit(kyber_exit);
 888
 889MODULE_AUTHOR("Omar Sandoval");
 890MODULE_LICENSE("GPL");
 891MODULE_DESCRIPTION("Kyber I/O scheduler");
 892