linux/block/blk-mq-sched.c
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

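/*
 * Free the per-hctx scheduler data on every hardware queue, giving the
 * scheduler a chance to tear it down through the optional exit() callback
 * before the memory is released.
 */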
void blk_mq_sched_free_hctx_data(struct request_queue *q,
                                 void (*exit)(struct blk_mq_hw_ctx *))
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                if (exit && hctx->sched_data)
                        exit(hctx);
                kfree(hctx->sched_data);
                hctx->sched_data = NULL;
        }
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

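/*
 * Allocate @size bytes of scheduler-private data for every hardware queue,
 * on each queue's home NUMA node, and run the optional init() callback on
 * it. On failure, anything allocated so far is released again through
 * blk_mq_sched_free_hctx_data().
 */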
int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
                                int (*init)(struct blk_mq_hw_ctx *),
                                void (*exit)(struct blk_mq_hw_ctx *))
{
        struct blk_mq_hw_ctx *hctx;
        int ret;
        int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
                if (!hctx->sched_data) {
                        ret = -ENOMEM;
                        goto error;
                }

                if (init) {
                        ret = init(hctx);
                        if (ret) {
                                /*
                                 * We don't want to give exit() a partially
                                 * initialized sched_data. init() must clean up
                                 * if it fails.
                                 */
                                kfree(hctx->sched_data);
                                hctx->sched_data = NULL;
                                goto error;
                        }
                }
        }

        return 0;
error:
        blk_mq_sched_free_hctx_data(q, exit);
        return ret;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);

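/*
 * Look up (or create) the io_cq linking @ioc to this queue and attach it
 * to the request. If the scheduler accepts the request via get_rq_priv(),
 * mark it RQF_ELVPRIV and take a reference on the io_context; otherwise
 * detach the icq from the request again.
 */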
static void __blk_mq_sched_assign_ioc(struct request_queue *q,
                                      struct request *rq,
                                      struct bio *bio,
                                      struct io_context *ioc)
{
        struct io_cq *icq;

        spin_lock_irq(q->queue_lock);
        icq = ioc_lookup_icq(ioc, q);
        spin_unlock_irq(q->queue_lock);

        if (!icq) {
                icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
                if (!icq)
                        return;
        }

        rq->elv.icq = icq;
        if (!blk_mq_sched_get_rq_priv(q, rq, bio)) {
                rq->rq_flags |= RQF_ELVPRIV;
                get_io_context(icq->ioc);
                return;
        }

        rq->elv.icq = NULL;
}

static void blk_mq_sched_assign_ioc(struct request_queue *q,
                                    struct request *rq, struct bio *bio)
{
        struct io_context *ioc;

        ioc = rq_ioc(bio);
        if (ioc)
                __blk_mq_sched_assign_ioc(q, rq, bio, ioc);
}

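/*
 * Allocate a request for @bio/@op. If an I/O scheduler is attached, the
 * request comes from the scheduler (internal) tags, either through the
 * scheduler's own get_request hook or the regular allocator; flush
 * requests always bypass the scheduler hook. The queue usage reference
 * taken here is dropped again if the allocation fails.
 */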
struct request *blk_mq_sched_get_request(struct request_queue *q,
                                         struct bio *bio,
                                         unsigned int op,
                                         struct blk_mq_alloc_data *data)
{
        struct elevator_queue *e = q->elevator;
        struct request *rq;

        blk_queue_enter_live(q);
        data->q = q;
        if (likely(!data->ctx))
                data->ctx = blk_mq_get_ctx(q);
        if (likely(!data->hctx))
                data->hctx = blk_mq_map_queue(q, data->ctx->cpu);

        if (e) {
                data->flags |= BLK_MQ_REQ_INTERNAL;

                /*
                 * Flush requests are special and go directly to the
                 * dispatch list.
                 */
                if (!op_is_flush(op) && e->type->ops.mq.get_request) {
                        rq = e->type->ops.mq.get_request(q, op, data);
                        if (rq)
                                rq->rq_flags |= RQF_QUEUED;
                } else
                        rq = __blk_mq_alloc_request(data, op);
        } else {
                rq = __blk_mq_alloc_request(data, op);
        }

        if (rq) {
                if (!op_is_flush(op)) {
                        rq->elv.icq = NULL;
                        if (e && e->type->icq_cache)
                                blk_mq_sched_assign_ioc(q, rq, bio);
                }
                data->hctx->queued++;
                return rq;
        }

        blk_queue_exit(q);
        return NULL;
}

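/*
 * Release a request obtained through blk_mq_sched_get_request(): drop the
 * elevator private state and io_context reference, then give the request
 * back to the scheduler's put_request hook or to blk_mq_finish_request().
 */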
void blk_mq_sched_put_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;

        if (rq->rq_flags & RQF_ELVPRIV) {
                blk_mq_sched_put_rq_priv(rq->q, rq);
                if (rq->elv.icq) {
                        put_io_context(rq->elv.icq->ioc);
                        rq->elv.icq = NULL;
                }
        }

        if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
                e->type->ops.mq.put_request(rq);
        else
                blk_mq_finish_request(rq);
}

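/*
 * Main dispatch path for a hardware queue. Requests left over on the hctx
 * dispatch list are issued first; otherwise requests are pulled from the
 * I/O scheduler one at a time (or flushed from the software queues when
 * the scheduler has no dispatch hook) for as long as the driver accepts
 * them.
 */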
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e = q->elevator;
        const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
        bool did_work = false;
        LIST_HEAD(rq_list);

        if (unlikely(blk_mq_hctx_stopped(hctx)))
                return;

        hctx->run++;

        /*
         * If we have previous entries on our dispatch list, grab them first
         * for fairer dispatch.
         */
        if (!list_empty_careful(&hctx->dispatch)) {
                spin_lock(&hctx->lock);
                if (!list_empty(&hctx->dispatch))
                        list_splice_init(&hctx->dispatch, &rq_list);
                spin_unlock(&hctx->lock);
        }

        /*
         * Only ask the scheduler for requests if we didn't have residual
         * requests from the dispatch list. This is to avoid the case where
         * we only ever dispatch a fraction of the requests available because
         * of low device queue depth. Once we pull requests out of the IO
         * scheduler, we can no longer merge or sort them. So it's best to
         * leave them there for as long as we can. Mark the hw queue as
         * needing a restart in that case.
         */
        if (!list_empty(&rq_list)) {
                blk_mq_sched_mark_restart_hctx(hctx);
                did_work = blk_mq_dispatch_rq_list(q, &rq_list);
        } else if (!has_sched_dispatch) {
                blk_mq_flush_busy_ctxs(hctx, &rq_list);
                blk_mq_dispatch_rq_list(q, &rq_list);
        }

        /*
         * We want to dispatch from the scheduler if we had no work left
         * on the dispatch list, OR if we did have work but weren't able
         * to make progress.
         */
        if (!did_work && has_sched_dispatch) {
                do {
                        struct request *rq;

                        rq = e->type->ops.mq.dispatch_request(hctx);
                        if (!rq)
                                break;
                        list_add(&rq->queuelist, &rq_list);
                } while (blk_mq_dispatch_rq_list(q, &rq_list));
        }
}

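/*
 * Helper for I/O schedulers: repeatedly call @get_rq and collect the
 * returned requests on @rq_list, in order, until it returns NULL.
 */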
void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
                                   struct list_head *rq_list,
                                   struct request *(*get_rq)(struct blk_mq_hw_ctx *))
{
        do {
                struct request *rq;

                rq = get_rq(hctx);
                if (!rq)
                        break;

                list_add_tail(&rq->queuelist, rq_list);
        } while (1);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);

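/*
 * Try to merge @bio into an existing request using elv_merge(). After a
 * successful bio merge, also attempt to merge the request with its
 * neighbour; if that succeeds, the now-redundant request is returned
 * through @merged_request so the caller can free it. A scheduler would
 * typically call this from its ->bio_merge() hook.
 */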
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                            struct request **merged_request)
{
        struct request *rq;

        switch (elv_merge(q, &rq, bio)) {
        case ELEVATOR_BACK_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (!bio_attempt_back_merge(q, rq, bio))
                        return false;
                *merged_request = attempt_back_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
                return true;
        case ELEVATOR_FRONT_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (!bio_attempt_front_merge(q, rq, bio))
                        return false;
                *merged_request = attempt_front_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
                return true;
        default:
                return false;
        }
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

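/*
 * Ask the attached scheduler to merge @bio into one of the requests it is
 * holding. Returns true if the bio was consumed by a merge.
 */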
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
        struct elevator_queue *e = q->elevator;

        if (e->type->ops.mq.bio_merge) {
                struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
                struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

                blk_mq_put_ctx(ctx);
                return e->type->ops.mq.bio_merge(hctx, bio);
        }

        return false;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
        return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
        trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

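/*
 * Decide whether @rq should skip the scheduler. Requests that already own
 * a driver tag (such as those coming from the flush machinery) are added
 * straight to the hctx dispatch list and true is returned; anything else
 * is marked RQF_SORTED and left for the caller to insert.
 */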
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
                                       struct request *rq)
{
        if (rq->tag == -1) {
                rq->rq_flags |= RQF_SORTED;
                return false;
        }

        /*
         * If we already have a real request tag, send directly to
         * the dispatch list.
         */
        spin_lock(&hctx->lock);
        list_add(&rq->queuelist, &hctx->dispatch);
        spin_unlock(&hctx->lock);
        return true;
}

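/*
 * If @hctx was marked as needing a restart, clear the flag and re-run the
 * queue when it still has pending work. Returns true if the queue was
 * re-run.
 */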
static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
        if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
                clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
                if (blk_mq_hctx_has_pending(hctx)) {
                        blk_mq_run_hw_queue(hctx, true);
                        return true;
                }
        }
        return false;
}

/**
 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 * @pos:    loop cursor.
 * @skip:   the list element that will not be examined. Iteration starts at
 *          @skip->next.
 * @head:   head of the list to examine. This list must have at least one
 *          element, namely @skip.
 * @member: name of the list_head structure within typeof(*pos).
 */
#define list_for_each_entry_rcu_rr(pos, skip, head, member)             \
        for ((pos) = (skip);                                            \
             (pos = (pos)->member.next != (head) ? list_entry_rcu(      \
                        (pos)->member.next, typeof(*pos), member) :     \
              list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
             (pos) != (skip); )

/*
 * Called after a driver tag has been freed to check whether a hctx needs to
 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 * queues in a round-robin fashion if the tag set of @hctx is shared with other
 * hardware queues.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
{
        struct blk_mq_tags *const tags = hctx->tags;
        struct blk_mq_tag_set *const set = hctx->queue->tag_set;
        struct request_queue *const queue = hctx->queue, *q;
        struct blk_mq_hw_ctx *hctx2;
        unsigned int i, j;

        if (set->flags & BLK_MQ_F_TAG_SHARED) {
                rcu_read_lock();
                list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
                                           tag_set_list) {
                        queue_for_each_hw_ctx(q, hctx2, i)
                                if (hctx2->tags == tags &&
                                    blk_mq_sched_restart_hctx(hctx2))
                                        goto done;
                }
                j = hctx->queue_num + 1;
                for (i = 0; i < queue->nr_hw_queues; i++, j++) {
                        if (j == queue->nr_hw_queues)
                                j = 0;
                        hctx2 = queue->queue_hw_ctx[j];
                        if (hctx2->tags == tags &&
                            blk_mq_sched_restart_hctx(hctx2))
                                break;
                }
done:
                rcu_read_unlock();
        } else {
                blk_mq_sched_restart_hctx(hctx);
        }
}

/*
 * Add flush/fua to the queue. If we fail getting a driver tag, then
 * punt to the requeue list. Requeue will re-invoke us from a context
 * that's safe to block from.
 */
static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
                                      struct request *rq, bool can_block)
{
        if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
                blk_insert_flush(rq);
                blk_mq_run_hw_queue(hctx, true);
        } else
                blk_mq_add_to_requeue_list(rq, false, true);
}

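/*
 * Insert a single request: flushes without a driver tag are routed to the
 * flush machinery, requests that already hold a driver tag bypass the
 * scheduler, and everything else goes through the scheduler's
 * insert_requests hook (or the software queue when there is no scheduler).
 * The hardware queue is then run if requested.
 */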
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                 bool run_queue, bool async, bool can_block)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

        if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
                blk_mq_sched_insert_flush(hctx, rq, can_block);
                return;
        }

        if (e && blk_mq_sched_bypass_insert(hctx, rq))
                goto run;

        if (e && e->type->ops.mq.insert_requests) {
                LIST_HEAD(list);

                list_add(&rq->queuelist, &list);
                e->type->ops.mq.insert_requests(hctx, &list, at_head);
        } else {
                spin_lock(&ctx->lock);
                __blk_mq_insert_request(hctx, rq, at_head);
                spin_unlock(&ctx->lock);
        }

run:
        if (run_queue)
                blk_mq_run_hw_queue(hctx, async);
}

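/*
 * Insert a batch of requests that all belong to the same software queue,
 * bypassing any request that already holds a driver tag, then run the
 * hardware queue.
 */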
void blk_mq_sched_insert_requests(struct request_queue *q,
                                  struct blk_mq_ctx *ctx,
                                  struct list_head *list, bool run_queue_async)
{
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
        struct elevator_queue *e = hctx->queue->elevator;

        if (e) {
                struct request *rq, *next;

                /*
                 * We bypass requests that already have a driver tag assigned,
                 * which should only be flushes. Flushes are only ever inserted
                 * as single requests, so we shouldn't ever hit the
                 * WARN_ON_ONCE() below (but let's handle it just in case).
                 */
                list_for_each_entry_safe(rq, next, list, queuelist) {
                        if (WARN_ON_ONCE(rq->tag != -1)) {
                                list_del_init(&rq->queuelist);
                                blk_mq_sched_bypass_insert(hctx, rq);
                        }
                }
        }

        if (e && e->type->ops.mq.insert_requests)
                e->type->ops.mq.insert_requests(hctx, list, false);
        else
                blk_mq_insert_requests(hctx, ctx, list);

        blk_mq_run_hw_queue(hctx, run_queue_async);
}

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
        if (hctx->sched_tags) {
                blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
                blk_mq_free_rq_map(hctx->sched_tags);
                hctx->sched_tags = NULL;
        }
}

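/*
 * Allocate the scheduler tag map (and its requests) for one hardware
 * queue, sized to the queue's nr_requests depth.
 */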
static int blk_mq_sched_alloc_tags(struct request_queue *q,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
        struct blk_mq_tag_set *set = q->tag_set;
        int ret;

        hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
                                               set->reserved_tags);
        if (!hctx->sched_tags)
                return -ENOMEM;

        ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
        if (ret)
                blk_mq_sched_free_tags(set, hctx, hctx_idx);

        return ret;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_sched_free_tags(set, hctx, i);
}

int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                           unsigned int hctx_idx)
{
        struct elevator_queue *e = q->elevator;

        if (!e)
                return 0;

        return blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
}

void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                            unsigned int hctx_idx)
{
        struct elevator_queue *e = q->elevator;

        if (!e)
                return;

        blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}

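/*
 * Attach an I/O scheduler to the queue: set the scheduler queue depth,
 * allocate per-hctx scheduler tags, then let the elevator type initialize
 * itself. On failure, the tags are torn down again and the queue is left
 * without an elevator.
 */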
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;
        int ret;

        if (!e) {
                q->elevator = NULL;
                return 0;
        }

        /*
         * Default to 256, since we don't split into sync/async like the
         * old code did. Additionally, this is a per-hw queue depth.
         */
        q->nr_requests = 2 * BLKDEV_MAX_RQ;

        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_sched_alloc_tags(q, hctx, i);
                if (ret)
                        goto err;
        }

        ret = e->ops.mq.init_sched(q, e);
        if (ret)
                goto err;

        return 0;

err:
        blk_mq_sched_tags_teardown(q);
        q->elevator = NULL;
        return ret;
}

void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
        if (e->type->ops.mq.exit_sched)
                e->type->ops.mq.exit_sched(e);
        blk_mq_sched_tags_teardown(q);
        q->elevator = NULL;
}

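/*
 * Set up the default I/O scheduler for @q, serialized against concurrent
 * sysfs changes by q->sysfs_lock.
 */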
int blk_mq_sched_init(struct request_queue *q)
{
        int ret;

        mutex_lock(&q->sysfs_lock);
        ret = elevator_init(q, NULL);
        mutex_unlock(&q->sysfs_lock);

        return ret;
}