linux/block/kyber-iosched.c
   1/*
   2 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
   3 * scalable techniques.
   4 *
   5 * Copyright (C) 2017 Facebook
   6 *
   7 * This program is free software; you can redistribute it and/or
   8 * modify it under the terms of the GNU General Public
   9 * License v2 as published by the Free Software Foundation.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  18 */
  19
  20#include <linux/kernel.h>
  21#include <linux/blkdev.h>
  22#include <linux/blk-mq.h>
  23#include <linux/elevator.h>
  24#include <linux/module.h>
  25#include <linux/sbitmap.h>
  26
  27#include "blk.h"
  28#include "blk-mq.h"
  29#include "blk-mq-debugfs.h"
  30#include "blk-mq-sched.h"
  31#include "blk-mq-tag.h"
  32#include "blk-stat.h"
  33
  34/* Scheduling domains. */
  35enum {
  36        KYBER_READ,
  37        KYBER_SYNC_WRITE,
  38        KYBER_OTHER, /* Async writes, discard, etc. */
  39        KYBER_NUM_DOMAINS,
  40};
  41
  42enum {
  43        KYBER_MIN_DEPTH = 256,
  44
  45        /*
   46         * To keep a flood of asynchronous requests from starving synchronous
   47         * requests, we let asynchronous requests consume at most 75% of the
   48         * scheduler tags, reserving the rest for synchronous operations.
  49         */
  50        KYBER_ASYNC_PERCENT = 75,
  51};
  52
  53/*
  54 * Initial device-wide depths for each scheduling domain.
  55 *
  56 * Even for fast devices with lots of tags like NVMe, you can saturate
  57 * the device with only a fraction of the maximum possible queue depth.
  58 * So, we cap these to a reasonable value.
  59 */
  60static const unsigned int kyber_depth[] = {
  61        [KYBER_READ] = 256,
  62        [KYBER_SYNC_WRITE] = 128,
  63        [KYBER_OTHER] = 64,
  64};
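/*
 * Note: these are only starting points. The latency heuristics below resize
 * each domain's token pool between 1 and the value given here, so a domain
 * can be throttled far below its initial depth but never grown above it.
 */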
  65
  66/*
  67 * Scheduling domain batch sizes. We favor reads.
  68 */
  69static const unsigned int kyber_batch_size[] = {
  70        [KYBER_READ] = 16,
  71        [KYBER_SYNC_WRITE] = 8,
  72        [KYBER_OTHER] = 8,
  73};
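/*
 * For illustration: with these values, kyber_dispatch_request() will hand out
 * up to 16 reads from a hardware queue before it considers rotating to the
 * sync write or "other" domains, which each get at most 8 requests per turn.
 */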
  74
  75struct kyber_queue_data {
  76        struct request_queue *q;
  77
  78        struct blk_stat_callback *cb;
  79
  80        /*
  81         * The device is divided into multiple scheduling domains based on the
   82         * request type. Each domain limits the number of in-flight requests of
   83         * that type device-wide, and these tokens enforce that limit.
  84         */
  85        struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
  86
  87        /*
  88         * Async request percentage, converted to per-word depth for
  89         * sbitmap_get_shallow().
  90         */
  91        unsigned int async_depth;
  92
  93        /* Target latencies in nanoseconds. */
  94        u64 read_lat_nsec, write_lat_nsec;
  95};
  96
  97struct kyber_hctx_data {
  98        spinlock_t lock;
  99        struct list_head rqs[KYBER_NUM_DOMAINS];
 100        unsigned int cur_domain;
 101        unsigned int batching;
 102        wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
 103        atomic_t wait_index[KYBER_NUM_DOMAINS];
 104};
 105
 106static int rq_sched_domain(const struct request *rq)
 107{
 108        unsigned int op = rq->cmd_flags;
 109
 110        if ((op & REQ_OP_MASK) == REQ_OP_READ)
 111                return KYBER_READ;
 112        else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
 113                return KYBER_SYNC_WRITE;
 114        else
 115                return KYBER_OTHER;
 116}
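/*
 * For example, a plain read lands in KYBER_READ, a write marked synchronous
 * (REQ_SYNC, e.g. O_DIRECT or fsync-driven writeback) lands in
 * KYBER_SYNC_WRITE, and everything else (async buffered writeback, discards,
 * and so on) falls through to KYBER_OTHER.
 */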
 117
 118enum {
 119        NONE = 0,
 120        GOOD = 1,
 121        GREAT = 2,
 122        BAD = -1,
 123        AWFUL = -2,
 124};
 125
 126#define IS_GOOD(status) ((status) > 0)
 127#define IS_BAD(status) ((status) < 0)
 128
 129static int kyber_lat_status(struct blk_stat_callback *cb,
 130                            unsigned int sched_domain, u64 target)
 131{
 132        u64 latency;
 133
 134        if (!cb->stat[sched_domain].nr_samples)
 135                return NONE;
 136
 137        latency = cb->stat[sched_domain].mean;
 138        if (latency >= 2 * target)
 139                return AWFUL;
 140        else if (latency > target)
 141                return BAD;
 142        else if (latency <= target / 2)
 143                return GREAT;
 144        else /* (latency <= target) */
 145                return GOOD;
 146}
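/*
 * Worked example (illustrative numbers): with the default read target of
 * 2 ms, a mean read latency of 0.9 ms is GREAT (<= target / 2), 1.5 ms is
 * GOOD, 3 ms is BAD (> target), and 5 ms is AWFUL (>= 2 * target).
 */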
 147
 148/*
 149 * Adjust the read or synchronous write depth given the status of reads and
 150 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 151 * one is good, then the other is good).
 152 */
 153static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
 154                                  unsigned int sched_domain, int this_status,
 155                                  int other_status)
 156{
 157        unsigned int orig_depth, depth;
 158
 159        /*
 160         * If this domain had no samples, or reads and writes are both good or
 161         * both bad, don't adjust the depth.
 162         */
 163        if (this_status == NONE ||
 164            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
 165            (IS_BAD(this_status) && IS_BAD(other_status)))
 166                return;
 167
 168        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
 169
 170        if (other_status == NONE) {
 171                depth++;
 172        } else {
 173                switch (this_status) {
 174                case GOOD:
 175                        if (other_status == AWFUL)
 176                                depth -= max(depth / 4, 1U);
 177                        else
 178                                depth -= max(depth / 8, 1U);
 179                        break;
 180                case GREAT:
 181                        if (other_status == AWFUL)
 182                                depth /= 2;
 183                        else
 184                                depth -= max(depth / 4, 1U);
 185                        break;
 186                case BAD:
 187                        depth++;
 188                        break;
 189                case AWFUL:
 190                        if (other_status == GREAT)
 191                                depth += 2;
 192                        else
 193                                depth++;
 194                        break;
 195                }
 196        }
 197
 198        depth = clamp(depth, 1U, kyber_depth[sched_domain]);
 199        if (depth != orig_depth)
 200                sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
 201}
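/*
 * Example of the policy above: if reads are GREAT while sync writes are
 * AWFUL, the read token pool is cut in half and the write pool grows by two,
 * shifting device capacity toward the domain that is missing its target.
 */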
 202
 203/*
 204 * Adjust the depth of other requests given the status of reads and synchronous
 205 * writes. As long as either domain is doing fine, we don't throttle, but if
 206 * both domains are doing badly, we throttle heavily.
 207 */
 208static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
 209                                     int read_status, int write_status,
 210                                     bool have_samples)
 211{
 212        unsigned int orig_depth, depth;
 213        int status;
 214
 215        orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
 216
 217        if (read_status == NONE && write_status == NONE) {
 218                depth += 2;
 219        } else if (have_samples) {
 220                if (read_status == NONE)
 221                        status = write_status;
 222                else if (write_status == NONE)
 223                        status = read_status;
 224                else
 225                        status = max(read_status, write_status);
 226                switch (status) {
 227                case GREAT:
 228                        depth += 2;
 229                        break;
 230                case GOOD:
 231                        depth++;
 232                        break;
 233                case BAD:
 234                        depth -= max(depth / 4, 1U);
 235                        break;
 236                case AWFUL:
 237                        depth /= 2;
 238                        break;
 239                }
 240        }
 241
 242        depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
 243        if (depth != orig_depth)
 244                sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
 245}
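/*
 * In other words, the "other" domain only gives up tokens when both
 * latency-sensitive domains are already missing their targets: the status
 * used here is the max (best) of the read and write statuses, so a single
 * GOOD or GREAT domain is enough to keep growing this pool.
 */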
 246
 247/*
 248 * Apply heuristics for limiting queue depths based on gathered latency
 249 * statistics.
 250 */
 251static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
 252{
 253        struct kyber_queue_data *kqd = cb->data;
 254        int read_status, write_status;
 255
 256        read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
 257        write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
 258
 259        kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
 260        kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
 261        kyber_adjust_other_depth(kqd, read_status, write_status,
 262                                 cb->stat[KYBER_OTHER].nr_samples != 0);
 263
 264        /*
 265         * Continue monitoring latencies if we aren't hitting the targets or
 266         * we're still throttling other requests.
 267         */
 268        if (!blk_stat_is_active(kqd->cb) &&
 269            ((IS_BAD(read_status) || IS_BAD(write_status) ||
 270              kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
 271                blk_stat_activate_msecs(kqd->cb, 100);
 272}
 273
 274static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
 275{
 276        /*
 277         * All of the hardware queues have the same depth, so we can just grab
 278         * the shift of the first one.
 279         */
 280        return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 281}
 282
 283static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 284{
 285        struct kyber_queue_data *kqd;
 286        unsigned int max_tokens;
 287        unsigned int shift;
 288        int ret = -ENOMEM;
 289        int i;
 290
 291        kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
 292        if (!kqd)
 293                goto err;
 294        kqd->q = q;
 295
 296        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
 297                                          KYBER_NUM_DOMAINS, kqd);
 298        if (!kqd->cb)
 299                goto err_kqd;
 300
 301        /*
 302         * The maximum number of tokens for any scheduling domain is at least
 303         * the queue depth of a single hardware queue. If the hardware doesn't
 304         * have many tags, still provide a reasonable number.
 305         */
 306        max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
 307                           KYBER_MIN_DEPTH);
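        /*
         * Each sbitmap is sized for the worst case up front and then
         * immediately resized down to its initial kyber_depth[] value.
         * sbitmap_queue_resize() only changes the visible depth, so the later
         * heuristic adjustments never need to reallocate.
         */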
 308        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 309                WARN_ON(!kyber_depth[i]);
 310                WARN_ON(!kyber_batch_size[i]);
 311                ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
 312                                              max_tokens, -1, false, GFP_KERNEL,
 313                                              q->node);
 314                if (ret) {
 315                        while (--i >= 0)
 316                                sbitmap_queue_free(&kqd->domain_tokens[i]);
 317                        goto err_cb;
 318                }
 319                sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
 320        }
 321
 322        shift = kyber_sched_tags_shift(kqd);
 323        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
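        /*
         * Worked example (assuming 64-bit sbitmap words): a scheduler tag
         * depth of 256 gives a shift of 6, i.e. 64 tags per sbitmap word, so
         * async_depth = 64 * 75 / 100 = 48. sbitmap_get_shallow() then leaves
         * at least 16 tags per word that only synchronous requests can take.
         */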
 324
 325        kqd->read_lat_nsec = 2000000ULL;
 326        kqd->write_lat_nsec = 10000000ULL;
 327
 328        return kqd;
 329
 330err_cb:
 331        blk_stat_free_callback(kqd->cb);
 332err_kqd:
 333        kfree(kqd);
 334err:
 335        return ERR_PTR(ret);
 336}
 337
 338static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
 339{
 340        struct kyber_queue_data *kqd;
 341        struct elevator_queue *eq;
 342
 343        eq = elevator_alloc(q, e);
 344        if (!eq)
 345                return -ENOMEM;
 346
 347        kqd = kyber_queue_data_alloc(q);
 348        if (IS_ERR(kqd)) {
 349                kobject_put(&eq->kobj);
 350                return PTR_ERR(kqd);
 351        }
 352
 353        eq->elevator_data = kqd;
 354        q->elevator = eq;
 355
 356        blk_stat_add_callback(q, kqd->cb);
 357
 358        return 0;
 359}
 360
 361static void kyber_exit_sched(struct elevator_queue *e)
 362{
 363        struct kyber_queue_data *kqd = e->elevator_data;
 364        struct request_queue *q = kqd->q;
 365        int i;
 366
 367        blk_stat_remove_callback(q, kqd->cb);
 368
 369        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 370                sbitmap_queue_free(&kqd->domain_tokens[i]);
 371        blk_stat_free_callback(kqd->cb);
 372        kfree(kqd);
 373}
 374
 375static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 376{
 377        struct kyber_hctx_data *khd;
 378        int i;
 379
 380        khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
 381        if (!khd)
 382                return -ENOMEM;
 383
 384        spin_lock_init(&khd->lock);
 385
 386        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 387                INIT_LIST_HEAD(&khd->rqs[i]);
 388                INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
 389                atomic_set(&khd->wait_index[i], 0);
 390        }
 391
 392        khd->cur_domain = 0;
 393        khd->batching = 0;
 394
 395        hctx->sched_data = khd;
 396
 397        return 0;
 398}
 399
 400static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 401{
 402        kfree(hctx->sched_data);
 403}
 404
 405static int rq_get_domain_token(struct request *rq)
 406{
 407        return (long)rq->elv.priv[0];
 408}
 409
 410static void rq_set_domain_token(struct request *rq, int token)
 411{
 412        rq->elv.priv[0] = (void *)(long)token;
 413}
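/*
 * The domain token is stashed in the elevator-private rq->elv.priv[0] field.
 * A value of -1 (set at allocation time) means "no token held", so
 * rq_clear_domain_token() below knows not to free anything for requests that
 * never made it through the dispatch path.
 */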
 414
 415static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 416                                  struct request *rq)
 417{
 418        unsigned int sched_domain;
 419        int nr;
 420
 421        nr = rq_get_domain_token(rq);
 422        if (nr != -1) {
 423                sched_domain = rq_sched_domain(rq);
 424                sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
 425                                    rq->mq_ctx->cpu);
 426        }
 427}
 428
 429static struct request *kyber_get_request(struct request_queue *q,
 430                                         unsigned int op,
 431                                         struct blk_mq_alloc_data *data)
 432{
 433        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 434        struct request *rq;
 435
 436        /*
 437         * We use the scheduler tags as per-hardware queue queueing tokens.
  438         * Async requests are limited to async_depth of those tags here.
 439         */
 440        if (!op_is_sync(op))
 441                data->shallow_depth = kqd->async_depth;
 442
 443        rq = __blk_mq_alloc_request(data, op);
 444        if (rq)
 445                rq_set_domain_token(rq, -1);
 446        return rq;
 447}
 448
 449static void kyber_put_request(struct request *rq)
 450{
 451        struct request_queue *q = rq->q;
 452        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 453
 454        rq_clear_domain_token(kqd, rq);
 455        blk_mq_finish_request(rq);
 456}
 457
 458static void kyber_completed_request(struct request *rq)
 459{
 460        struct request_queue *q = rq->q;
 461        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 462        unsigned int sched_domain;
 463        u64 now, latency, target;
 464
 465        /*
 466         * Check if this request met our latency goal. If not, quickly gather
 467         * some statistics and start throttling.
 468         */
 469        sched_domain = rq_sched_domain(rq);
 470        switch (sched_domain) {
 471        case KYBER_READ:
 472                target = kqd->read_lat_nsec;
 473                break;
 474        case KYBER_SYNC_WRITE:
 475                target = kqd->write_lat_nsec;
 476                break;
 477        default:
 478                return;
 479        }
 480
 481        /* If we are already monitoring latencies, don't check again. */
 482        if (blk_stat_is_active(kqd->cb))
 483                return;
 484
 485        now = __blk_stat_time(ktime_to_ns(ktime_get()));
 486        if (now < blk_stat_time(&rq->issue_stat))
 487                return;
 488
 489        latency = now - blk_stat_time(&rq->issue_stat);
 490
 491        if (latency > target)
 492                blk_stat_activate_msecs(kqd->cb, 10);
 493}
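/*
 * Note the two different windows: a request that blows past its target kicks
 * off a quick 10 ms statistics window here, while kyber_stat_timer_fn() above
 * re-arms itself every 100 ms for as long as it is still throttling.
 */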
 494
 495static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
 496                                  struct blk_mq_hw_ctx *hctx)
 497{
 498        LIST_HEAD(rq_list);
 499        struct request *rq, *next;
 500
 501        blk_mq_flush_busy_ctxs(hctx, &rq_list);
 502        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 503                unsigned int sched_domain;
 504
 505                sched_domain = rq_sched_domain(rq);
 506                list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
 507        }
 508}
 509
 510static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
 511                             void *key)
 512{
 513        struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
 514
 515        list_del_init(&wait->task_list);
 516        blk_mq_run_hw_queue(hctx, true);
 517        return 1;
 518}
 519
 520static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 521                                  struct kyber_hctx_data *khd,
 522                                  struct blk_mq_hw_ctx *hctx)
 523{
 524        unsigned int sched_domain = khd->cur_domain;
 525        struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
 526        wait_queue_t *wait = &khd->domain_wait[sched_domain];
 527        struct sbq_wait_state *ws;
 528        int nr;
 529
 530        nr = __sbitmap_queue_get(domain_tokens);
 531        if (nr >= 0)
 532                return nr;
 533
 534        /*
 535         * If we failed to get a domain token, make sure the hardware queue is
 536         * run when one becomes available. Note that this is serialized on
 537         * khd->lock, but we still need to be careful about the waker.
 538         */
 539        if (list_empty_careful(&wait->task_list)) {
 540                init_waitqueue_func_entry(wait, kyber_domain_wake);
 541                wait->private = hctx;
 542                ws = sbq_wait_ptr(domain_tokens,
 543                                  &khd->wait_index[sched_domain]);
 544                add_wait_queue(&ws->wait, wait);
 545
 546                /*
 547                 * Try again in case a token was freed before we got on the wait
 548                 * queue.
 549                 */
 550                nr = __sbitmap_queue_get(domain_tokens);
 551        }
 552        return nr;
 553}
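/*
 * The retry after add_wait_queue() closes a lost-wakeup race: if the last
 * token was freed after the first __sbitmap_queue_get() failed but before we
 * were on the wait queue, nobody would ever call kyber_domain_wake() for us,
 * and the hardware queue could stall until some other request completed.
 */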
 554
 555static struct request *
 556kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 557                          struct kyber_hctx_data *khd,
 558                          struct blk_mq_hw_ctx *hctx,
 559                          bool *flushed)
 560{
 561        struct list_head *rqs;
 562        struct request *rq;
 563        int nr;
 564
 565        rqs = &khd->rqs[khd->cur_domain];
 566        rq = list_first_entry_or_null(rqs, struct request, queuelist);
 567
 568        /*
 569         * If there wasn't already a pending request and we haven't flushed the
 570         * software queues yet, flush the software queues and check again.
 571         */
 572        if (!rq && !*flushed) {
 573                kyber_flush_busy_ctxs(khd, hctx);
 574                *flushed = true;
 575                rq = list_first_entry_or_null(rqs, struct request, queuelist);
 576        }
 577
 578        if (rq) {
 579                nr = kyber_get_domain_token(kqd, khd, hctx);
 580                if (nr >= 0) {
 581                        khd->batching++;
 582                        rq_set_domain_token(rq, nr);
 583                        list_del_init(&rq->queuelist);
 584                        return rq;
 585                }
 586        }
 587
 588        /* There were either no pending requests or no tokens. */
 589        return NULL;
 590}
 591
 592static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 593{
 594        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 595        struct kyber_hctx_data *khd = hctx->sched_data;
 596        bool flushed = false;
 597        struct request *rq;
 598        int i;
 599
 600        spin_lock(&khd->lock);
 601
 602        /*
 603         * First, if we are still entitled to batch, try to dispatch a request
 604         * from the batch.
 605         */
 606        if (khd->batching < kyber_batch_size[khd->cur_domain]) {
 607                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
 608                if (rq)
 609                        goto out;
 610        }
 611
 612        /*
  613         * We got here because one of the following is true:
 614         * 1. We were no longer entitled to a batch.
 615         * 2. The domain we were batching didn't have any requests.
 616         * 3. The domain we were batching was out of tokens.
 617         *
 618         * Start another batch. Note that this wraps back around to the original
 619         * domain if no other domains have requests or tokens.
 620         */
 621        khd->batching = 0;
 622        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 623                if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
 624                        khd->cur_domain = 0;
 625                else
 626                        khd->cur_domain++;
 627
 628                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
 629                if (rq)
 630                        goto out;
 631        }
 632
 633        rq = NULL;
 634out:
 635        spin_unlock(&khd->lock);
 636        return rq;
 637}
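/*
 * Dispatch walk-through: with cur_domain == KYBER_READ and batching == 16,
 * the batch gate fails, batching resets to 0, and the loop tries
 * KYBER_SYNC_WRITE, then KYBER_OTHER, then wraps back to KYBER_READ, stopping
 * at the first domain that has both a queued request and a free token.
 */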
 638
 639static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 640{
 641        struct kyber_hctx_data *khd = hctx->sched_data;
 642        int i;
 643
 644        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 645                if (!list_empty_careful(&khd->rqs[i]))
 646                        return true;
 647        }
 648        return false;
 649}
 650
 651#define KYBER_LAT_SHOW_STORE(op)                                        \
 652static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,          \
 653                                     char *page)                        \
 654{                                                                       \
 655        struct kyber_queue_data *kqd = e->elevator_data;                \
 656                                                                        \
 657        return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
 658}                                                                       \
 659                                                                        \
 660static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,         \
 661                                      const char *page, size_t count)   \
 662{                                                                       \
 663        struct kyber_queue_data *kqd = e->elevator_data;                \
 664        unsigned long long nsec;                                        \
 665        int ret;                                                        \
 666                                                                        \
 667        ret = kstrtoull(page, 10, &nsec);                               \
 668        if (ret)                                                        \
 669                return ret;                                             \
 670                                                                        \
 671        kqd->op##_lat_nsec = nsec;                                      \
 672                                                                        \
 673        return count;                                                   \
 674}
 675KYBER_LAT_SHOW_STORE(read);
 676KYBER_LAT_SHOW_STORE(write);
 677#undef KYBER_LAT_SHOW_STORE
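/*
 * The attributes defined above are exposed through the usual elevator sysfs
 * directory, typically /sys/block/<dev>/queue/iosched/ once kyber is the
 * active scheduler. For example (device name shown for illustration only):
 *
 *   echo kyber > /sys/block/nvme0n1/queue/scheduler
 *   echo 1000000 > /sys/block/nvme0n1/queue/iosched/read_lat_nsec
 */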
 678
 679#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
 680static struct elv_fs_entry kyber_sched_attrs[] = {
 681        KYBER_LAT_ATTR(read),
 682        KYBER_LAT_ATTR(write),
 683        __ATTR_NULL
 684};
 685#undef KYBER_LAT_ATTR
 686
 687#ifdef CONFIG_BLK_DEBUG_FS
 688#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)                        \
 689static int kyber_##name##_tokens_show(void *data, struct seq_file *m)   \
 690{                                                                       \
 691        struct request_queue *q = data;                                 \
 692        struct kyber_queue_data *kqd = q->elevator->elevator_data;      \
 693                                                                        \
 694        sbitmap_queue_show(&kqd->domain_tokens[domain], m);             \
 695        return 0;                                                       \
 696}                                                                       \
 697                                                                        \
 698static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)  \
 699        __acquires(&khd->lock)                                          \
 700{                                                                       \
 701        struct blk_mq_hw_ctx *hctx = m->private;                        \
 702        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 703                                                                        \
 704        spin_lock(&khd->lock);                                          \
 705        return seq_list_start(&khd->rqs[domain], *pos);                 \
 706}                                                                       \
 707                                                                        \
 708static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,       \
 709                                     loff_t *pos)                       \
 710{                                                                       \
 711        struct blk_mq_hw_ctx *hctx = m->private;                        \
 712        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 713                                                                        \
 714        return seq_list_next(v, &khd->rqs[domain], pos);                \
 715}                                                                       \
 716                                                                        \
 717static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)        \
 718        __releases(&khd->lock)                                          \
 719{                                                                       \
 720        struct blk_mq_hw_ctx *hctx = m->private;                        \
 721        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 722                                                                        \
 723        spin_unlock(&khd->lock);                                        \
 724}                                                                       \
 725                                                                        \
 726static const struct seq_operations kyber_##name##_rqs_seq_ops = {       \
 727        .start  = kyber_##name##_rqs_start,                             \
 728        .next   = kyber_##name##_rqs_next,                              \
 729        .stop   = kyber_##name##_rqs_stop,                              \
 730        .show   = blk_mq_debugfs_rq_show,                               \
 731};                                                                      \
 732                                                                        \
 733static int kyber_##name##_waiting_show(void *data, struct seq_file *m)  \
 734{                                                                       \
 735        struct blk_mq_hw_ctx *hctx = data;                              \
 736        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 737        wait_queue_t *wait = &khd->domain_wait[domain];                 \
 738                                                                        \
 739        seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list));   \
 740        return 0;                                                       \
 741}
 742KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
 743KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
 744KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 745#undef KYBER_DEBUGFS_DOMAIN_ATTRS
 746
 747static int kyber_async_depth_show(void *data, struct seq_file *m)
 748{
 749        struct request_queue *q = data;
 750        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 751
 752        seq_printf(m, "%u\n", kqd->async_depth);
 753        return 0;
 754}
 755
 756static int kyber_cur_domain_show(void *data, struct seq_file *m)
 757{
 758        struct blk_mq_hw_ctx *hctx = data;
 759        struct kyber_hctx_data *khd = hctx->sched_data;
 760
 761        switch (khd->cur_domain) {
 762        case KYBER_READ:
 763                seq_puts(m, "READ\n");
 764                break;
 765        case KYBER_SYNC_WRITE:
 766                seq_puts(m, "SYNC_WRITE\n");
 767                break;
 768        case KYBER_OTHER:
 769                seq_puts(m, "OTHER\n");
 770                break;
 771        default:
 772                seq_printf(m, "%u\n", khd->cur_domain);
 773                break;
 774        }
 775        return 0;
 776}
 777
 778static int kyber_batching_show(void *data, struct seq_file *m)
 779{
 780        struct blk_mq_hw_ctx *hctx = data;
 781        struct kyber_hctx_data *khd = hctx->sched_data;
 782
 783        seq_printf(m, "%u\n", khd->batching);
 784        return 0;
 785}
 786
 787#define KYBER_QUEUE_DOMAIN_ATTRS(name)  \
 788        {#name "_tokens", 0400, kyber_##name##_tokens_show}
 789static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 790        KYBER_QUEUE_DOMAIN_ATTRS(read),
 791        KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
 792        KYBER_QUEUE_DOMAIN_ATTRS(other),
 793        {"async_depth", 0400, kyber_async_depth_show},
 794        {},
 795};
 796#undef KYBER_QUEUE_DOMAIN_ATTRS
 797
 798#define KYBER_HCTX_DOMAIN_ATTRS(name)                                   \
 799        {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},   \
 800        {#name "_waiting", 0400, kyber_##name##_waiting_show}
 801static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
 802        KYBER_HCTX_DOMAIN_ATTRS(read),
 803        KYBER_HCTX_DOMAIN_ATTRS(sync_write),
 804        KYBER_HCTX_DOMAIN_ATTRS(other),
 805        {"cur_domain", 0400, kyber_cur_domain_show},
 806        {"batching", 0400, kyber_batching_show},
 807        {},
 808};
 809#undef KYBER_HCTX_DOMAIN_ATTRS
 810#endif
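/*
 * When CONFIG_BLK_DEBUG_FS is enabled, these attributes appear in the blk-mq
 * debugfs tree, typically /sys/kernel/debug/block/<dev>/sched/ for the
 * per-queue files and .../hctx<N>/sched/ for the per-hardware-queue files
 * (the exact layout comes from the blk-mq debugfs code, not from kyber).
 */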
 811
 812static struct elevator_type kyber_sched = {
 813        .ops.mq = {
 814                .init_sched = kyber_init_sched,
 815                .exit_sched = kyber_exit_sched,
 816                .init_hctx = kyber_init_hctx,
 817                .exit_hctx = kyber_exit_hctx,
 818                .get_request = kyber_get_request,
 819                .put_request = kyber_put_request,
 820                .completed_request = kyber_completed_request,
 821                .dispatch_request = kyber_dispatch_request,
 822                .has_work = kyber_has_work,
 823        },
 824        .uses_mq = true,
 825#ifdef CONFIG_BLK_DEBUG_FS
 826        .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
 827        .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
 828#endif
 829        .elevator_attrs = kyber_sched_attrs,
 830        .elevator_name = "kyber",
 831        .elevator_owner = THIS_MODULE,
 832};
 833
 834static int __init kyber_init(void)
 835{
 836        return elv_register(&kyber_sched);
 837}
 838
 839static void __exit kyber_exit(void)
 840{
 841        elv_unregister(&kyber_sched);
 842}
 843
 844module_init(kyber_init);
 845module_exit(kyber_exit);
 846
 847MODULE_AUTHOR("Omar Sandoval");
 848MODULE_LICENSE("GPL");
 849MODULE_DESCRIPTION("Kyber I/O scheduler");
 850