linux/block/kyber-iosched.c
   1/*
   2 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
   3 * scalable techniques.
   4 *
   5 * Copyright (C) 2017 Facebook
   6 *
   7 * This program is free software; you can redistribute it and/or
   8 * modify it under the terms of the GNU General Public
   9 * License v2 as published by the Free Software Foundation.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  18 */
  19
  20#include <linux/kernel.h>
  21#include <linux/blkdev.h>
  22#include <linux/blk-mq.h>
  23#include <linux/elevator.h>
  24#include <linux/module.h>
  25#include <linux/sbitmap.h>
  26
  27#include "blk.h"
  28#include "blk-mq.h"
  29#include "blk-mq-debugfs.h"
  30#include "blk-mq-sched.h"
  31#include "blk-mq-tag.h"
  32#include "blk-stat.h"
  33
  34/* Scheduling domains. */
  35enum {
  36        KYBER_READ,
  37        KYBER_SYNC_WRITE,
  38        KYBER_OTHER, /* Async writes, discard, etc. */
  39        KYBER_NUM_DOMAINS,
  40};
  41
  42enum {
  43        KYBER_MIN_DEPTH = 256,
  44
  45        /*
  46         * In order to prevent starvation of synchronous requests by a flood of
  47         * asynchronous requests, we reserve 25% of requests for synchronous
  48         * operations.
  49         */
  50        KYBER_ASYNC_PERCENT = 75,
  51};
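
/*
 * KYBER_ASYNC_PERCENT is converted into a per-sbitmap-word "shallow" depth in
 * kyber_queue_data_alloc() and enforced in kyber_limit_depth() below.
 */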
  52
  53/*
  54 * Initial device-wide depths for each scheduling domain.
  55 *
  56 * Even for fast devices with lots of tags like NVMe, you can saturate
  57 * the device with only a fraction of the maximum possible queue depth.
  58 * So, we cap these to a reasonable value.
  59 */
  60static const unsigned int kyber_depth[] = {
  61        [KYBER_READ] = 256,
  62        [KYBER_SYNC_WRITE] = 128,
  63        [KYBER_OTHER] = 64,
  64};
  65
  66/*
  67 * Scheduling domain batch sizes. We favor reads.
  68 */
  69static const unsigned int kyber_batch_size[] = {
  70        [KYBER_READ] = 16,
  71        [KYBER_SYNC_WRITE] = 8,
  72        [KYBER_OTHER] = 8,
  73};
  74
  75struct kyber_queue_data {
  76        struct request_queue *q;
  77
  78        struct blk_stat_callback *cb;
  79
  80        /*
  81         * The device is divided into multiple scheduling domains based on the
  82         * request type. Each domain has a fixed number of in-flight requests of
  83         * that type device-wide, limited by these tokens.
  84         */
  85        struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
  86
  87        /*
  88         * Async request percentage, converted to per-word depth for
  89         * sbitmap_get_shallow().
  90         */
  91        unsigned int async_depth;
  92
  93        /* Target latencies in nanoseconds. */
  94        u64 read_lat_nsec, write_lat_nsec;
  95};
  96
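/*
 * Per-hardware-queue scheduler state. rqs[] holds the requests staged for each
 * scheduling domain and is protected by lock; cur_domain and batching drive
 * the round-robin batching in kyber_dispatch_request(); domain_wait[],
 * domain_ws[], and wait_index[] are used by kyber_get_domain_token() to get
 * the hardware queue re-run once a domain token frees up.
 */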
  97struct kyber_hctx_data {
  98        spinlock_t lock;
  99        struct list_head rqs[KYBER_NUM_DOMAINS];
 100        unsigned int cur_domain;
 101        unsigned int batching;
 102        wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
 103        struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
 104        atomic_t wait_index[KYBER_NUM_DOMAINS];
 105};
 106
 107static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 108                             void *key);
 109
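/*
 * Map a request to its scheduling domain. Note that only writes marked
 * synchronous (per op_is_sync()) count as KYBER_SYNC_WRITE; asynchronous
 * writeback, discards, etc. all share the KYBER_OTHER domain.
 */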
 110static int rq_sched_domain(const struct request *rq)
 111{
 112        unsigned int op = rq->cmd_flags;
 113
 114        if ((op & REQ_OP_MASK) == REQ_OP_READ)
 115                return KYBER_READ;
 116        else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
 117                return KYBER_SYNC_WRITE;
 118        else
 119                return KYBER_OTHER;
 120}
 121
 122enum {
 123        NONE = 0,
 124        GOOD = 1,
 125        GREAT = 2,
 126        BAD = -1,
 127        AWFUL = -2,
 128};
 129
 130#define IS_GOOD(status) ((status) > 0)
 131#define IS_BAD(status) ((status) < 0)
 132
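/*
 * Bucket a domain's measured mean latency against its target. For example,
 * with the default 2 ms read target: a mean of 4 ms or more is AWFUL,
 * anything above 2 ms (but under 4 ms) is BAD, 1 ms or less is GREAT, and
 * the rest is GOOD.
 */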
 133static int kyber_lat_status(struct blk_stat_callback *cb,
 134                            unsigned int sched_domain, u64 target)
 135{
 136        u64 latency;
 137
 138        if (!cb->stat[sched_domain].nr_samples)
 139                return NONE;
 140
 141        latency = cb->stat[sched_domain].mean;
 142        if (latency >= 2 * target)
 143                return AWFUL;
 144        else if (latency > target)
 145                return BAD;
 146        else if (latency <= target / 2)
 147                return GREAT;
 148        else /* (latency <= target) */
 149                return GOOD;
 150}
 151
 152/*
 153 * Adjust the read or synchronous write depth given the status of reads and
 154 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 155 * one is good, then the other is good).
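 *
 * For example, if reads are GREAT while sync writes are AWFUL, the read depth
 * is halved and the sync write depth grows by two tokens; if both domains are
 * good or both are bad, neither depth is changed.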
 156 */
 157static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
 158                                  unsigned int sched_domain, int this_status,
 159                                  int other_status)
 160{
 161        unsigned int orig_depth, depth;
 162
 163        /*
 164         * If this domain had no samples, or reads and writes are both good or
 165         * both bad, don't adjust the depth.
 166         */
 167        if (this_status == NONE ||
 168            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
 169            (IS_BAD(this_status) && IS_BAD(other_status)))
 170                return;
 171
 172        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
 173
 174        if (other_status == NONE) {
 175                depth++;
 176        } else {
 177                switch (this_status) {
 178                case GOOD:
 179                        if (other_status == AWFUL)
 180                                depth -= max(depth / 4, 1U);
 181                        else
 182                                depth -= max(depth / 8, 1U);
 183                        break;
 184                case GREAT:
 185                        if (other_status == AWFUL)
 186                                depth /= 2;
 187                        else
 188                                depth -= max(depth / 4, 1U);
 189                        break;
 190                case BAD:
 191                        depth++;
 192                        break;
 193                case AWFUL:
 194                        if (other_status == GREAT)
 195                                depth += 2;
 196                        else
 197                                depth++;
 198                        break;
 199                }
 200        }
 201
 202        depth = clamp(depth, 1U, kyber_depth[sched_domain]);
 203        if (depth != orig_depth)
 204                sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
 205}
 206
 207/*
 208 * Adjust the depth of other requests given the status of reads and synchronous
 209 * writes. As long as either domain is doing fine, we don't throttle, but if
 210 * both domains are doing badly, we throttle heavily.
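 *
 * Concretely: if neither reads nor sync writes produced samples, the depth
 * creeps up by two. Otherwise, provided KYBER_OTHER itself had samples, the
 * better of the two statuses (ignoring a domain with no samples) decides:
 * GREAT adds two tokens, GOOD adds one, BAD cuts the depth by roughly a
 * quarter, and AWFUL halves it.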
 211 */
 212static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
 213                                     int read_status, int write_status,
 214                                     bool have_samples)
 215{
 216        unsigned int orig_depth, depth;
 217        int status;
 218
 219        orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
 220
 221        if (read_status == NONE && write_status == NONE) {
 222                depth += 2;
 223        } else if (have_samples) {
 224                if (read_status == NONE)
 225                        status = write_status;
 226                else if (write_status == NONE)
 227                        status = read_status;
 228                else
 229                        status = max(read_status, write_status);
 230                switch (status) {
 231                case GREAT:
 232                        depth += 2;
 233                        break;
 234                case GOOD:
 235                        depth++;
 236                        break;
 237                case BAD:
 238                        depth -= max(depth / 4, 1U);
 239                        break;
 240                case AWFUL:
 241                        depth /= 2;
 242                        break;
 243                }
 244        }
 245
 246        depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
 247        if (depth != orig_depth)
 248                sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
 249}
 250
 251/*
 252 * Apply heuristics for limiting queue depths based on gathered latency
 253 * statistics.
 254 */
 255static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
 256{
 257        struct kyber_queue_data *kqd = cb->data;
 258        int read_status, write_status;
 259
 260        read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
 261        write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
 262
 263        kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
 264        kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
 265        kyber_adjust_other_depth(kqd, read_status, write_status,
 266                                 cb->stat[KYBER_OTHER].nr_samples != 0);
 267
 268        /*
 269         * Continue monitoring latencies if we aren't hitting the targets or
 270         * we're still throttling other requests.
 271         */
 272        if (!blk_stat_is_active(kqd->cb) &&
 273            ((IS_BAD(read_status) || IS_BAD(write_status) ||
 274              kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
 275                blk_stat_activate_msecs(kqd->cb, 100);
 276}
 277
 278static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
 279{
 280        /*
 281         * All of the hardware queues have the same depth, so we can just grab
 282         * the shift of the first one.
 283         */
 284        return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 285}
 286
 287static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 288{
 289        struct kyber_queue_data *kqd;
 290        unsigned int max_tokens;
 291        unsigned int shift;
 292        int ret = -ENOMEM;
 293        int i;
 294
 295        kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
 296        if (!kqd)
 297                goto err;
 298        kqd->q = q;
 299
 300        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
 301                                          KYBER_NUM_DOMAINS, kqd);
 302        if (!kqd->cb)
 303                goto err_kqd;
 304
 305        /*
 306         * The maximum number of tokens for any scheduling domain is at least
 307         * the queue depth of a single hardware queue. If the hardware doesn't
 308         * have many tags, still provide a reasonable number.
 309         */
 310        max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
 311                           KYBER_MIN_DEPTH);
 312        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 313                WARN_ON(!kyber_depth[i]);
 314                WARN_ON(!kyber_batch_size[i]);
 315                ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
 316                                              max_tokens, -1, false, GFP_KERNEL,
 317                                              q->node);
 318                if (ret) {
 319                        while (--i >= 0)
 320                                sbitmap_queue_free(&kqd->domain_tokens[i]);
 321                        goto err_cb;
 322                }
 323                sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
 324        }
 325
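        /*
         * async_depth is a per-word limit for sbitmap_get_shallow(): e.g.,
         * assuming 64-bit sbitmap words (shift == 6), async requests may only
         * use 64 * 75 / 100 == 48 of the 64 bits in each word, leaving the
         * rest for synchronous requests.
         */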
 326        shift = kyber_sched_tags_shift(kqd);
 327        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 328
 329        kqd->read_lat_nsec = 2000000ULL;
 330        kqd->write_lat_nsec = 10000000ULL;
 331
 332        return kqd;
 333
 334err_cb:
 335        blk_stat_free_callback(kqd->cb);
 336err_kqd:
 337        kfree(kqd);
 338err:
 339        return ERR_PTR(ret);
 340}
 341
 342static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
 343{
 344        struct kyber_queue_data *kqd;
 345        struct elevator_queue *eq;
 346
 347        eq = elevator_alloc(q, e);
 348        if (!eq)
 349                return -ENOMEM;
 350
 351        kqd = kyber_queue_data_alloc(q);
 352        if (IS_ERR(kqd)) {
 353                kobject_put(&eq->kobj);
 354                return PTR_ERR(kqd);
 355        }
 356
 357        eq->elevator_data = kqd;
 358        q->elevator = eq;
 359
 360        blk_stat_add_callback(q, kqd->cb);
 361
 362        return 0;
 363}
 364
 365static void kyber_exit_sched(struct elevator_queue *e)
 366{
 367        struct kyber_queue_data *kqd = e->elevator_data;
 368        struct request_queue *q = kqd->q;
 369        int i;
 370
 371        blk_stat_remove_callback(q, kqd->cb);
 372
 373        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 374                sbitmap_queue_free(&kqd->domain_tokens[i]);
 375        blk_stat_free_callback(kqd->cb);
 376        kfree(kqd);
 377}
 378
 379static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 380{
 381        struct kyber_hctx_data *khd;
 382        int i;
 383
 384        khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
 385        if (!khd)
 386                return -ENOMEM;
 387
 388        spin_lock_init(&khd->lock);
 389
 390        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 391                INIT_LIST_HEAD(&khd->rqs[i]);
 392                init_waitqueue_func_entry(&khd->domain_wait[i],
 393                                          kyber_domain_wake);
 394                khd->domain_wait[i].private = hctx;
 395                INIT_LIST_HEAD(&khd->domain_wait[i].entry);
 396                atomic_set(&khd->wait_index[i], 0);
 397        }
 398
 399        khd->cur_domain = 0;
 400        khd->batching = 0;
 401
 402        hctx->sched_data = khd;
 403
 404        return 0;
 405}
 406
 407static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 408{
 409        kfree(hctx->sched_data);
 410}
 411
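/*
 * The domain token for an in-flight request is stashed in rq->elv.priv[0];
 * -1 means no token has been assigned yet (see kyber_prepare_request()).
 */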
 412static int rq_get_domain_token(struct request *rq)
 413{
 414        return (long)rq->elv.priv[0];
 415}
 416
 417static void rq_set_domain_token(struct request *rq, int token)
 418{
 419        rq->elv.priv[0] = (void *)(long)token;
 420}
 421
 422static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 423                                  struct request *rq)
 424{
 425        unsigned int sched_domain;
 426        int nr;
 427
 428        nr = rq_get_domain_token(rq);
 429        if (nr != -1) {
 430                sched_domain = rq_sched_domain(rq);
 431                sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
 432                                    rq->mq_ctx->cpu);
 433        }
 434}
 435
 436static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 437{
 438        /*
 439         * We use the scheduler tags as per-hardware queue queueing tokens.
 440         * Async requests can be limited at this stage.
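         * The shallow_depth set here is applied per sbitmap word when the
         * scheduler tag is allocated, which is what caps async requests at
         * roughly KYBER_ASYNC_PERCENT of the tags.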
 441         */
 442        if (!op_is_sync(op)) {
 443                struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
 444
 445                data->shallow_depth = kqd->async_depth;
 446        }
 447}
 448
 449static void kyber_prepare_request(struct request *rq, struct bio *bio)
 450{
 451        rq_set_domain_token(rq, -1);
 452}
 453
 454static void kyber_finish_request(struct request *rq)
 455{
 456        struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
 457
 458        rq_clear_domain_token(kqd, rq);
 459}
 460
 461static void kyber_completed_request(struct request *rq)
 462{
 463        struct request_queue *q = rq->q;
 464        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 465        unsigned int sched_domain;
 466        u64 now, latency, target;
 467
 468        /*
 469         * Check if this request met our latency goal. If not, quickly gather
 470         * some statistics and start throttling.
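         * (The actual throttling decisions are made in kyber_stat_timer_fn()
         * once the stats callback fires.)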
 471         */
 472        sched_domain = rq_sched_domain(rq);
 473        switch (sched_domain) {
 474        case KYBER_READ:
 475                target = kqd->read_lat_nsec;
 476                break;
 477        case KYBER_SYNC_WRITE:
 478                target = kqd->write_lat_nsec;
 479                break;
 480        default:
 481                return;
 482        }
 483
 484        /* If we are already monitoring latencies, don't check again. */
 485        if (blk_stat_is_active(kqd->cb))
 486                return;
 487
 488        now = __blk_stat_time(ktime_to_ns(ktime_get()));
 489        if (now < blk_stat_time(&rq->issue_stat))
 490                return;
 491
 492        latency = now - blk_stat_time(&rq->issue_stat);
 493
 494        if (latency > target)
 495                blk_stat_activate_msecs(kqd->cb, 10);
 496}
 497
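/*
 * Pull any requests queued on the per-CPU software queues into khd->rqs[],
 * sorted by scheduling domain, so that the dispatch path can pick from them.
 */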
 498static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
 499                                  struct blk_mq_hw_ctx *hctx)
 500{
 501        LIST_HEAD(rq_list);
 502        struct request *rq, *next;
 503
 504        blk_mq_flush_busy_ctxs(hctx, &rq_list);
 505        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 506                unsigned int sched_domain;
 507
 508                sched_domain = rq_sched_domain(rq);
 509                list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
 510        }
 511}
 512
 513static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 514                             void *key)
 515{
 516        struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
 517
 518        list_del_init(&wait->entry);
 519        blk_mq_run_hw_queue(hctx, true);
 520        return 1;
 521}
 522
 523static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 524                                  struct kyber_hctx_data *khd,
 525                                  struct blk_mq_hw_ctx *hctx)
 526{
 527        unsigned int sched_domain = khd->cur_domain;
 528        struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
 529        wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
 530        struct sbq_wait_state *ws;
 531        int nr;
 532
 533        nr = __sbitmap_queue_get(domain_tokens);
 534
 535        /*
 536         * If we failed to get a domain token, make sure the hardware queue is
 537         * run when one becomes available. Note that this is serialized on
 538         * khd->lock, but we still need to be careful about the waker.
 539         */
 540        if (nr < 0 && list_empty_careful(&wait->entry)) {
 541                ws = sbq_wait_ptr(domain_tokens,
 542                                  &khd->wait_index[sched_domain]);
 543                khd->domain_ws[sched_domain] = ws;
 544                add_wait_queue(&ws->wait, wait);
 545
 546                /*
 547                 * Try again in case a token was freed before we got on the wait
 548                 * queue.
 549                 */
 550                nr = __sbitmap_queue_get(domain_tokens);
 551        }
 552
 553        /*
 554         * If we got a token while we were on the wait queue, remove ourselves
 555         * from the wait queue to ensure that all wake ups make forward
 556         * progress. It's possible that the waker already deleted the entry
 557         * between the !list_empty_careful() check and us grabbing the lock, but
 558         * list_del_init() is okay with that.
 559         */
 560        if (nr >= 0 && !list_empty_careful(&wait->entry)) {
 561                ws = khd->domain_ws[sched_domain];
 562                spin_lock_irq(&ws->wait.lock);
 563                list_del_init(&wait->entry);
 564                spin_unlock_irq(&ws->wait.lock);
 565        }
 566
 567        return nr;
 568}
 569
 570static struct request *
 571kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 572                          struct kyber_hctx_data *khd,
 573                          struct blk_mq_hw_ctx *hctx,
 574                          bool *flushed)
 575{
 576        struct list_head *rqs;
 577        struct request *rq;
 578        int nr;
 579
 580        rqs = &khd->rqs[khd->cur_domain];
 581        rq = list_first_entry_or_null(rqs, struct request, queuelist);
 582
 583        /*
 584         * If there wasn't already a pending request and we haven't flushed the
 585         * software queues yet, flush the software queues and check again.
 586         */
 587        if (!rq && !*flushed) {
 588                kyber_flush_busy_ctxs(khd, hctx);
 589                *flushed = true;
 590                rq = list_first_entry_or_null(rqs, struct request, queuelist);
 591        }
 592
 593        if (rq) {
 594                nr = kyber_get_domain_token(kqd, khd, hctx);
 595                if (nr >= 0) {
 596                        khd->batching++;
 597                        rq_set_domain_token(rq, nr);
 598                        list_del_init(&rq->queuelist);
 599                        return rq;
 600                }
 601        }
 602
 603        /* There were either no pending requests or no tokens. */
 604        return NULL;
 605}
 606
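/*
 * Keep dispatching from the current domain while it is entitled to a batch
 * (up to kyber_batch_size[] requests, and only while domain tokens last),
 * then round-robin to the next domain. With the defaults, for example, a
 * stream of reads can dispatch 16 requests before sync writes and other
 * requests get a turn.
 */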
 607static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 608{
 609        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 610        struct kyber_hctx_data *khd = hctx->sched_data;
 611        bool flushed = false;
 612        struct request *rq;
 613        int i;
 614
 615        spin_lock(&khd->lock);
 616
 617        /*
 618         * First, if we are still entitled to batch, try to dispatch a request
 619         * from the batch.
 620         */
 621        if (khd->batching < kyber_batch_size[khd->cur_domain]) {
 622                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
 623                if (rq)
 624                        goto out;
 625        }
 626
 627        /*
 628         * Either,
 629         * 1. We were no longer entitled to a batch.
 630         * 2. The domain we were batching didn't have any requests.
 631         * 3. The domain we were batching was out of tokens.
 632         *
 633         * Start another batch. Note that this wraps back around to the original
 634         * domain if no other domains have requests or tokens.
 635         */
 636        khd->batching = 0;
 637        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 638                if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
 639                        khd->cur_domain = 0;
 640                else
 641                        khd->cur_domain++;
 642
 643                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
 644                if (rq)
 645                        goto out;
 646        }
 647
 648        rq = NULL;
 649out:
 650        spin_unlock(&khd->lock);
 651        return rq;
 652}
 653
 654static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 655{
 656        struct kyber_hctx_data *khd = hctx->sched_data;
 657        int i;
 658
 659        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 660                if (!list_empty_careful(&khd->rqs[i]))
 661                        return true;
 662        }
 663        return sbitmap_any_bit_set(&hctx->ctx_map);
 664}
 665
 666#define KYBER_LAT_SHOW_STORE(op)                                        \
 667static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,          \
 668                                     char *page)                        \
 669{                                                                       \
 670        struct kyber_queue_data *kqd = e->elevator_data;                \
 671                                                                        \
 672        return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
 673}                                                                       \
 674                                                                        \
 675static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,         \
 676                                      const char *page, size_t count)   \
 677{                                                                       \
 678        struct kyber_queue_data *kqd = e->elevator_data;                \
 679        unsigned long long nsec;                                        \
 680        int ret;                                                        \
 681                                                                        \
 682        ret = kstrtoull(page, 10, &nsec);                               \
 683        if (ret)                                                        \
 684                return ret;                                             \
 685                                                                        \
 686        kqd->op##_lat_nsec = nsec;                                      \
 687                                                                        \
 688        return count;                                                   \
 689}
 690KYBER_LAT_SHOW_STORE(read);
 691KYBER_LAT_SHOW_STORE(write);
 692#undef KYBER_LAT_SHOW_STORE
 693
 694#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
 695static struct elv_fs_entry kyber_sched_attrs[] = {
 696        KYBER_LAT_ATTR(read),
 697        KYBER_LAT_ATTR(write),
 698        __ATTR_NULL
 699};
 700#undef KYBER_LAT_ATTR
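
/*
 * The attributes above are exposed through the elevator sysfs directory,
 * e.g. /sys/block/<dev>/queue/iosched/read_lat_nsec; writing a value in
 * nanoseconds adjusts the latency target the heuristics aim for.
 */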
 701
 702#ifdef CONFIG_BLK_DEBUG_FS
 703#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)                        \
 704static int kyber_##name##_tokens_show(void *data, struct seq_file *m)   \
 705{                                                                       \
 706        struct request_queue *q = data;                                 \
 707        struct kyber_queue_data *kqd = q->elevator->elevator_data;      \
 708                                                                        \
 709        sbitmap_queue_show(&kqd->domain_tokens[domain], m);             \
 710        return 0;                                                       \
 711}                                                                       \
 712                                                                        \
 713static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)  \
 714        __acquires(&khd->lock)                                          \
 715{                                                                       \
 716        struct blk_mq_hw_ctx *hctx = m->private;                        \
 717        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 718                                                                        \
 719        spin_lock(&khd->lock);                                          \
 720        return seq_list_start(&khd->rqs[domain], *pos);                 \
 721}                                                                       \
 722                                                                        \
 723static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,       \
 724                                     loff_t *pos)                       \
 725{                                                                       \
 726        struct blk_mq_hw_ctx *hctx = m->private;                        \
 727        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 728                                                                        \
 729        return seq_list_next(v, &khd->rqs[domain], pos);                \
 730}                                                                       \
 731                                                                        \
 732static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)        \
 733        __releases(&khd->lock)                                          \
 734{                                                                       \
 735        struct blk_mq_hw_ctx *hctx = m->private;                        \
 736        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 737                                                                        \
 738        spin_unlock(&khd->lock);                                        \
 739}                                                                       \
 740                                                                        \
 741static const struct seq_operations kyber_##name##_rqs_seq_ops = {       \
 742        .start  = kyber_##name##_rqs_start,                             \
 743        .next   = kyber_##name##_rqs_next,                              \
 744        .stop   = kyber_##name##_rqs_stop,                              \
 745        .show   = blk_mq_debugfs_rq_show,                               \
 746};                                                                      \
 747                                                                        \
 748static int kyber_##name##_waiting_show(void *data, struct seq_file *m)  \
 749{                                                                       \
 750        struct blk_mq_hw_ctx *hctx = data;                              \
 751        struct kyber_hctx_data *khd = hctx->sched_data;                 \
 752        wait_queue_entry_t *wait = &khd->domain_wait[domain];           \
 753                                                                        \
 754        seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));       \
 755        return 0;                                                       \
 756}
 757KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
 758KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
 759KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 760#undef KYBER_DEBUGFS_DOMAIN_ATTRS
 761
 762static int kyber_async_depth_show(void *data, struct seq_file *m)
 763{
 764        struct request_queue *q = data;
 765        struct kyber_queue_data *kqd = q->elevator->elevator_data;
 766
 767        seq_printf(m, "%u\n", kqd->async_depth);
 768        return 0;
 769}
 770
 771static int kyber_cur_domain_show(void *data, struct seq_file *m)
 772{
 773        struct blk_mq_hw_ctx *hctx = data;
 774        struct kyber_hctx_data *khd = hctx->sched_data;
 775
 776        switch (khd->cur_domain) {
 777        case KYBER_READ:
 778                seq_puts(m, "READ\n");
 779                break;
 780        case KYBER_SYNC_WRITE:
 781                seq_puts(m, "SYNC_WRITE\n");
 782                break;
 783        case KYBER_OTHER:
 784                seq_puts(m, "OTHER\n");
 785                break;
 786        default:
 787                seq_printf(m, "%u\n", khd->cur_domain);
 788                break;
 789        }
 790        return 0;
 791}
 792
 793static int kyber_batching_show(void *data, struct seq_file *m)
 794{
 795        struct blk_mq_hw_ctx *hctx = data;
 796        struct kyber_hctx_data *khd = hctx->sched_data;
 797
 798        seq_printf(m, "%u\n", khd->batching);
 799        return 0;
 800}
 801
 802#define KYBER_QUEUE_DOMAIN_ATTRS(name)  \
 803        {#name "_tokens", 0400, kyber_##name##_tokens_show}
 804static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 805        KYBER_QUEUE_DOMAIN_ATTRS(read),
 806        KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
 807        KYBER_QUEUE_DOMAIN_ATTRS(other),
 808        {"async_depth", 0400, kyber_async_depth_show},
 809        {},
 810};
 811#undef KYBER_QUEUE_DOMAIN_ATTRS
 812
 813#define KYBER_HCTX_DOMAIN_ATTRS(name)                                   \
 814        {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},   \
 815        {#name "_waiting", 0400, kyber_##name##_waiting_show}
 816static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
 817        KYBER_HCTX_DOMAIN_ATTRS(read),
 818        KYBER_HCTX_DOMAIN_ATTRS(sync_write),
 819        KYBER_HCTX_DOMAIN_ATTRS(other),
 820        {"cur_domain", 0400, kyber_cur_domain_show},
 821        {"batching", 0400, kyber_batching_show},
 822        {},
 823};
 824#undef KYBER_HCTX_DOMAIN_ATTRS
 825#endif
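
/*
 * When CONFIG_BLK_DEBUG_FS is enabled, the attributes above show up in
 * blk-mq's debugfs tree (typically under /sys/kernel/debug/block/<dev>/),
 * in the per-queue and per-hctx "sched" directories.
 */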
 826
 827static struct elevator_type kyber_sched = {
 828        .ops.mq = {
 829                .init_sched = kyber_init_sched,
 830                .exit_sched = kyber_exit_sched,
 831                .init_hctx = kyber_init_hctx,
 832                .exit_hctx = kyber_exit_hctx,
 833                .limit_depth = kyber_limit_depth,
 834                .prepare_request = kyber_prepare_request,
 835                .finish_request = kyber_finish_request,
 836                .requeue_request = kyber_finish_request,
 837                .completed_request = kyber_completed_request,
 838                .dispatch_request = kyber_dispatch_request,
 839                .has_work = kyber_has_work,
 840        },
 841        .uses_mq = true,
 842#ifdef CONFIG_BLK_DEBUG_FS
 843        .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
 844        .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
 845#endif
 846        .elevator_attrs = kyber_sched_attrs,
 847        .elevator_name = "kyber",
 848        .elevator_owner = THIS_MODULE,
 849};
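
/*
 * Once the scheduler is registered (kyber_init()), it can be selected per
 * queue by writing "kyber" to /sys/block/<dev>/queue/scheduler; it only
 * supports blk-mq queues, as indicated by uses_mq.
 */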
 850
 851static int __init kyber_init(void)
 852{
 853        return elv_register(&kyber_sched);
 854}
 855
 856static void __exit kyber_exit(void)
 857{
 858        elv_unregister(&kyber_sched);
 859}
 860
 861module_init(kyber_init);
 862module_exit(kyber_exit);
 863
 864MODULE_AUTHOR("Omar Sandoval");
 865MODULE_LICENSE("GPL");
 866MODULE_DESCRIPTION("Kyber I/O scheduler");
 867