linux/block/blk-mq.c
   1/*
   2 * Block multiqueue core code
   3 *
   4 * Copyright (C) 2013-2014 Jens Axboe
   5 * Copyright (C) 2013-2014 Christoph Hellwig
   6 */
   7#include <linux/kernel.h>
   8#include <linux/module.h>
   9#include <linux/backing-dev.h>
  10#include <linux/bio.h>
  11#include <linux/blkdev.h>
  12#include <linux/kmemleak.h>
  13#include <linux/mm.h>
  14#include <linux/init.h>
  15#include <linux/slab.h>
  16#include <linux/workqueue.h>
  17#include <linux/smp.h>
  18#include <linux/llist.h>
  19#include <linux/list_sort.h>
  20#include <linux/cpu.h>
  21#include <linux/cache.h>
  22#include <linux/sched/sysctl.h>
  23#include <linux/delay.h>
  24#include <linux/crash_dump.h>
  25
  26#include <trace/events/block.h>
  27
  28#include <linux/blk-mq.h>
  29#include "blk.h"
  30#include "blk-mq.h"
  31#include "blk-mq-debugfs.h"
  32#include "blk-mq-tag.h"
  33#include "blk-mq-sched.h"
  34#include "blk-stat.h"
  35
  36static DEFINE_MUTEX(all_q_mutex);
  37static LIST_HEAD(all_q_list);
  38
  39static void blk_mq_poll_stats_start(struct request_queue *q);
  40static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
  41
  42/*
  43 * Check if any of the ctx's have pending work in this hardware queue
  44 */
  45bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  46{
  47        return sbitmap_any_bit_set(&hctx->ctx_map) ||
  48                        !list_empty_careful(&hctx->dispatch) ||
  49                        blk_mq_sched_has_work(hctx);
  50}
  51
  52/*
  53 * Mark this ctx as having pending work in this hardware queue
  54 */
  55static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  56                                     struct blk_mq_ctx *ctx)
  57{
  58        if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
  59                sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
  60}
  61
  62static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
  63                                      struct blk_mq_ctx *ctx)
  64{
  65        sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
  66}
  67
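    /*
     * Start freezing a queue: bump the freeze depth and, on the first
     * freeze, kill the q_usage_counter percpu ref so that new
     * blk_queue_enter() callers block (or fail), then run the hardware
     * queues so pending requests are dispatched and the counter can drain.
     */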
  68void blk_freeze_queue_start(struct request_queue *q)
  69{
  70        int freeze_depth;
  71
  72        freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
  73        if (freeze_depth == 1) {
  74                percpu_ref_kill(&q->q_usage_counter);
  75                if (q->mq_ops)
  76                        blk_mq_run_hw_queues(q, false);
  77        }
  78}
  79EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
  80
  81void blk_mq_freeze_queue_wait(struct request_queue *q)
  82{
  83        wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
  84}
  85EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
  86
  87int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
  88                                     unsigned long timeout)
  89{
  90        return wait_event_timeout(q->mq_freeze_wq,
  91                                        percpu_ref_is_zero(&q->q_usage_counter),
  92                                        timeout);
  93}
  94EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
  95
  96/*
  97 * Guarantee no request is in use, so we can change any data structure of
  98 * the queue afterward.
  99 */
 100void blk_freeze_queue(struct request_queue *q)
 101{
 102        /*
 103         * In the !blk_mq case we are only calling this to kill the
 104         * q_usage_counter, otherwise this increases the freeze depth
 105         * and waits for it to return to zero.  For this reason there is
 106         * no blk_unfreeze_queue(), and blk_freeze_queue() is not
 107         * exported to drivers as the only user for unfreeze is blk_mq.
 108         */
 109        blk_freeze_queue_start(q);
 110        if (!q->mq_ops)
 111                blk_drain_queue(q);
 112        blk_mq_freeze_queue_wait(q);
 113}
 114
 115void blk_mq_freeze_queue(struct request_queue *q)
 116{
 117        /*
 118         * ...just an alias to keep freeze and unfreeze actions balanced
 119         * in the blk_mq_* namespace
 120         */
 121        blk_freeze_queue(q);
 122}
 123EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 124
 125void blk_mq_unfreeze_queue(struct request_queue *q)
 126{
 127        int freeze_depth;
 128
 129        freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
 130        WARN_ON_ONCE(freeze_depth < 0);
 131        if (!freeze_depth) {
 132                percpu_ref_reinit(&q->q_usage_counter);
 133                wake_up_all(&q->mq_freeze_wq);
 134        }
 135}
 136EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
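    /*
     * Typical usage of the freeze/unfreeze pair (a sketch, not taken from
     * any specific driver): freeze around an update that must not race
     * with I/O, e.g.
     *
     *	blk_mq_freeze_queue(q);
     *	... update queue or driver state ...
     *	blk_mq_unfreeze_queue(q);
     */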
 137
 138/**
 139 * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
 140 * @q: request queue.
 141 *
 142 * Note: this function does not prevent the struct request end_io()
 143 * callback from being invoked. Additionally, new queue_rq() calls are
 144 * not prevented unless the queue has been stopped first.
 145 */
 146void blk_mq_quiesce_queue(struct request_queue *q)
 147{
 148        struct blk_mq_hw_ctx *hctx;
 149        unsigned int i;
 150        bool rcu = false;
 151
 152        blk_mq_stop_hw_queues(q);
 153
 154        queue_for_each_hw_ctx(q, hctx, i) {
 155                if (hctx->flags & BLK_MQ_F_BLOCKING)
 156                        synchronize_srcu(&hctx->queue_rq_srcu);
 157                else
 158                        rcu = true;
 159        }
 160        if (rcu)
 161                synchronize_rcu();
 162}
 163EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 164
 165void blk_mq_wake_waiters(struct request_queue *q)
 166{
 167        struct blk_mq_hw_ctx *hctx;
 168        unsigned int i;
 169
 170        queue_for_each_hw_ctx(q, hctx, i)
 171                if (blk_mq_hw_queue_mapped(hctx))
 172                        blk_mq_tag_wakeup_all(hctx->tags, true);
 173}
 174
 175bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 176{
 177        return blk_mq_has_free_tags(hctx->tags);
 178}
 179EXPORT_SYMBOL(blk_mq_can_queue);
 180
 181void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 182                        struct request *rq, unsigned int rw_flags)
 183{
 184        if (blk_queue_io_stat(q))
 185                rw_flags |= REQ_IO_STAT;
 186
 187        INIT_LIST_HEAD(&rq->queuelist);
 188        /* csd/requeue_work/fifo_time is initialized before use */
 189        rq->q = q;
 190        rq->mq_ctx = ctx;
 191        rq->cmd_flags |= rw_flags;
 192        /* do not touch atomic flags, it needs atomic ops against the timer */
 193        rq->cpu = -1;
 194        INIT_HLIST_NODE(&rq->hash);
 195        RB_CLEAR_NODE(&rq->rb_node);
 196        rq->rq_disk = NULL;
 197        rq->part = NULL;
 198        rq->start_time = jiffies;
 199#ifdef CONFIG_BLK_CGROUP
 200        rq->rl = NULL;
 201        set_start_time_ns(rq);
 202        rq->io_start_time_ns = 0;
 203#endif
 204        rq->nr_phys_segments = 0;
 205#if defined(CONFIG_BLK_DEV_INTEGRITY)
 206        rq->nr_integrity_segments = 0;
 207#endif
 208        rq->special = NULL;
 209        /* tag was already set */
 210        rq->errors = 0;
 211
 212        rq->cmd = rq->__cmd;
 213
 214        rq->extra_len = 0;
 215        rq->sense_len = 0;
 216        rq->resid_len = 0;
 217        rq->sense = NULL;
 218
 219        INIT_LIST_HEAD(&rq->timeout_list);
 220        rq->timeout = 0;
 221
 222        rq->end_io = NULL;
 223        rq->end_io_data = NULL;
 224        rq->next_rq = NULL;
 225
 226        ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
 227}
 228EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
 229
 230struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
 231{
 232        struct request *rq;
 233        unsigned int tag;
 234
 235        tag = blk_mq_get_tag(data);
 236        if (tag != BLK_MQ_TAG_FAIL) {
 237                struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 238
 239                rq = tags->static_rqs[tag];
 240
 241                if (data->flags & BLK_MQ_REQ_INTERNAL) {
 242                        rq->tag = -1;
 243                        __rq_aux(rq, data->q)->internal_tag = tag;
 244                } else {
 245                        if (blk_mq_tag_busy(data->hctx)) {
 246                                rq->cmd_flags = REQ_MQ_INFLIGHT;
 247                                atomic_inc(&data->hctx->nr_active);
 248                        }
 249                        rq->tag = tag;
 250                        __rq_aux(rq, data->q)->internal_tag = -1;
 251                        data->hctx->tags->rqs[rq->tag] = rq;
 252                }
 253
 254                blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
 255                if (data->flags & BLK_MQ_REQ_PREEMPT)
 256                        rq->cmd_flags |= REQ_PREEMPT;
 257
 258                return rq;
 259        }
 260
 261        return NULL;
 262}
 263EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
 264
 265struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 266                unsigned int flags)
 267{
 268        struct blk_mq_alloc_data alloc_data = { .flags = flags };
 269        struct request *rq;
 270        int ret;
 271
 272        ret = blk_queue_enter(q, flags);
 273        if (ret)
 274                return ERR_PTR(ret);
 275
 276        rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 277
 278        blk_mq_put_ctx(alloc_data.ctx);
 279        blk_queue_exit(q);
 280
 281        if (!rq)
 282                return ERR_PTR(-EWOULDBLOCK);
 283        return rq;
 284}
 285EXPORT_SYMBOL(blk_mq_alloc_request);
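    /*
     * Example caller (a sketch, assuming a non-blocking allocation): the
     * return value is an ERR_PTR() on failure, and the request must be
     * released with blk_mq_free_request().
     *
     *	rq = blk_mq_alloc_request(q, READ, BLK_MQ_REQ_NOWAIT);
     *	if (IS_ERR(rq))
     *		return PTR_ERR(rq);
     *	...
     *	blk_mq_free_request(rq);
     */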
 286
 287struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 288                unsigned int flags, unsigned int hctx_idx)
 289{
 290        struct blk_mq_alloc_data alloc_data = { .flags = flags };
 291        struct request *rq;
 292        unsigned int cpu;
 293        int ret;
 294
 295        /*
 296         * If the tag allocator sleeps we could get an allocation for a
 297         * different hardware context.  No need to complicate the low level
 298         * allocator for this for the rare use case of a command tied to
 299         * a specific queue.
 300         */
 301        if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
 302                return ERR_PTR(-EINVAL);
 303
 304        if (hctx_idx >= q->nr_hw_queues)
 305                return ERR_PTR(-EIO);
 306
 307        ret = blk_queue_enter(q, flags);
 308        if (ret)
 309                return ERR_PTR(ret);
 310
 311        /*
 312         * Check if the hardware context is actually mapped to anything.
 313         * If not tell the caller that it should skip this queue.
 314         */
 315        alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
 316        if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
 317                blk_queue_exit(q);
 318                return ERR_PTR(-EXDEV);
 319        }
 320        cpu = cpumask_first(alloc_data.hctx->cpumask);
 321        alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 322
 323        rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 324
 325        blk_queue_exit(q);
 326
 327        if (!rq)
 328                return ERR_PTR(-EWOULDBLOCK);
 329
 330        return rq;
 331}
 332EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 333
 334static void
 335blk_mq_sched_completed_request(struct request *rq)
 336{
 337        struct elevator_queue *e = rq->q->elevator;
 338
 339        if (e && e->aux->ops.mq.completed_request)
 340                e->aux->ops.mq.completed_request(rq);
 341}
 342
 343void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 344                             struct request *rq)
 345{
 346        const int sched_tag = rq_aux(rq)->internal_tag;
 347        struct request_queue *q = rq->q;
 348
 349        if (rq->cmd_flags & REQ_MQ_INFLIGHT)
 350                atomic_dec(&hctx->nr_active);
 351        rq->cmd_flags = 0;
 352
 353        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 354        if (rq->tag != -1)
 355                blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
 356        if (sched_tag != -1)
 357                blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
 358        blk_mq_sched_restart(hctx);
 359        blk_queue_exit(q);
 360}
 361
 362static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
 363                                       struct request *rq)
 364{
 365        struct blk_mq_ctx *ctx = rq->mq_ctx;
 366
 367        ctx->rq_completed[rq_is_sync(rq)]++;
 368        __blk_mq_finish_request(hctx, ctx, rq);
 369}
 370
 371void blk_mq_finish_request(struct request *rq)
 372{
 373        blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 374}
 375EXPORT_SYMBOL_GPL(blk_mq_finish_request);
 376
 377void blk_mq_free_request(struct request *rq)
 378{
 379        blk_mq_sched_put_request(rq);
 380}
 381EXPORT_SYMBOL_GPL(blk_mq_free_request);
 382
 383inline void __blk_mq_end_request(struct request *rq, int error)
 384{
 385        blk_account_io_done(rq);
 386
 387        if (rq->end_io) {
 388                rq->end_io(rq, error);
 389        } else {
 390                if (unlikely(blk_bidi_rq(rq)))
 391                        blk_mq_free_request(rq->next_rq);
 392                blk_mq_free_request(rq);
 393        }
 394}
 395EXPORT_SYMBOL(__blk_mq_end_request);
 396
 397void blk_mq_end_request(struct request *rq, int error)
 398{
 399        if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 400                BUG();
 401        __blk_mq_end_request(rq, error);
 402}
 403EXPORT_SYMBOL(blk_mq_end_request);
 404
 405static void __blk_mq_complete_request_remote(void *data)
 406{
 407        struct request *rq = data;
 408
 409        rq->q->softirq_done_fn(rq);
 410}
 411
 412static void blk_mq_ipi_complete_request(struct request *rq)
 413{
 414        struct blk_mq_ctx *ctx = rq->mq_ctx;
 415        bool shared = false;
 416        int cpu;
 417
 418        if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
 419                rq->q->softirq_done_fn(rq);
 420                return;
 421        }
 422
 423        cpu = get_cpu();
 424        if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
 425                shared = cpus_share_cache(cpu, ctx->cpu);
 426
 427        if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 428                rq->csd.func = __blk_mq_complete_request_remote;
 429                rq->csd.info = rq;
 430                rq->csd.flags = 0;
 431                smp_call_function_single_async(ctx->cpu, &rq->csd);
 432        } else {
 433                rq->q->softirq_done_fn(rq);
 434        }
 435        put_cpu();
 436}
 437
 438static void blk_mq_stat_add(struct request *rq)
 439{
 440        if (rq->cmd_flags & REQ_STATS) {
 441                blk_mq_poll_stats_start(rq->q);
 442                blk_stat_add(rq);
 443        }
 444}
 445
 446static void __blk_mq_complete_request(struct request *rq)
 447{
 448        struct request_queue *q = rq->q;
 449
 450        if (rq_aux(rq)->internal_tag != -1)
 451                blk_mq_sched_completed_request(rq);
 452
 453        blk_mq_stat_add(rq);
 454
 455        if (!q->softirq_done_fn)
 456                blk_mq_end_request(rq, rq->errors);
 457        else
 458                blk_mq_ipi_complete_request(rq);
 459}
 460
 461/**
 462 * blk_mq_complete_request - end I/O on a request
 463 * @rq:         the request being processed
 464 * @error:      completion error code
 465 * Description:
 466 *      Ends all I/O on a request. It does not handle partial completions.
 467 *      The actual completion happens out-of-order, through an IPI handler.
 468 **/
 469void blk_mq_complete_request(struct request *rq, int error)
 470{
 471        struct request_queue *q = rq->q;
 472
 473        if (unlikely(blk_should_fake_timeout(q)))
 474                return;
 475        if (!blk_mark_rq_complete(rq)) {
 476                rq->errors = error;
 477                __blk_mq_complete_request(rq);
 478        }
 479}
 480EXPORT_SYMBOL(blk_mq_complete_request);
 481
 482int blk_mq_request_started(struct request *rq)
 483{
 484        return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 485}
 486EXPORT_SYMBOL_GPL(blk_mq_request_started);
 487
 488void blk_mq_start_request(struct request *rq)
 489{
 490        struct request_queue *q = rq->q;
 491
 492        blk_mq_sched_started_request(rq);
 493
 494        trace_block_rq_issue(q, rq);
 495
 496        rq->resid_len = blk_rq_bytes(rq);
 497        if (unlikely(blk_bidi_rq(rq)))
 498                rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 499
 500        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
 501                blk_stat_set_issue_time(&rq_aux(rq)->issue_stat);
 502                rq->cmd_flags |= REQ_STATS;
 503        }
 504
 505        blk_add_timer(rq);
 506
 507        /*
 508         * Ensure that ->deadline is visible before we set the started
 509         * flag and clear the completed flag.
 510         */
 511        smp_mb__before_atomic();
 512
 513        /*
 514         * Mark us as started and clear complete. Complete might have been
 515         * set if requeue raced with timeout, which then marked it as
 516         * complete. So be sure to clear complete again when we start
 517         * the request, otherwise we'll ignore the completion event.
 518         */
 519        if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 520                set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 521        if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
 522                clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 523
 524        if (q->dma_drain_size && blk_rq_bytes(rq)) {
 525                /*
 526                 * Make sure space for the drain appears.  We know we can do
 527                 * this because max_hw_segments has been adjusted to be one
 528                 * fewer than the device can handle.
 529                 */
 530                rq->nr_phys_segments++;
 531        }
 532}
 533EXPORT_SYMBOL(blk_mq_start_request);
 534
 535/*
 536 * When we reach here because the queue is busy, the REQ_ATOM_COMPLETE
 537 * flag isn't set yet, so there may be a race with the timeout handler,
 538 * but given that rq->deadline has just been set in .queue_rq() in
 539 * this situation, the race won't happen in practice because
 540 * rq->timeout should be large enough to cover the window
 541 * between blk_mq_start_request() called from .queue_rq() and
 542 * clearing REQ_ATOM_STARTED here.
 543 */
 544static void __blk_mq_requeue_request(struct request *rq)
 545{
 546        struct request_queue *q = rq->q;
 547
 548        blk_mq_put_driver_tag(rq);
 549
 550        trace_block_rq_requeue(q, rq);
 551        blk_mq_sched_requeue_request(rq);
 552
 553        if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 554                if (q->dma_drain_size && blk_rq_bytes(rq))
 555                        rq->nr_phys_segments--;
 556        }
 557}
 558
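    /*
     * Take a started request back from the driver so it can be reissued:
     * the driver tag is released and REQ_ATOM_STARTED is cleared by
     * __blk_mq_requeue_request(), then the request is put on the queue's
     * requeue list, optionally kicking the requeue work.
     */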
 559void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 560{
 561        __blk_mq_requeue_request(rq);
 562
 563        BUG_ON(blk_queued_rq(rq));
 564        blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 565}
 566EXPORT_SYMBOL(blk_mq_requeue_request);
 567
 568static void blk_mq_requeue_work(struct work_struct *work)
 569{
 570        struct request_queue *q =
 571                container_of(work, struct request_queue, requeue_work.work);
 572        LIST_HEAD(rq_list);
 573        struct request *rq, *next;
 574        unsigned long flags;
 575
 576        spin_lock_irqsave(&q->requeue_lock, flags);
 577        list_splice_init(&q->requeue_list, &rq_list);
 578        spin_unlock_irqrestore(&q->requeue_lock, flags);
 579
 580        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 581                if (!(rq->cmd_flags & REQ_SOFTBARRIER))
 582                        continue;
 583
 584                rq->cmd_flags &= ~REQ_SOFTBARRIER;
 585                list_del_init(&rq->queuelist);
 586                blk_mq_sched_insert_request(rq, true, false, false, true);
 587        }
 588
 589        while (!list_empty(&rq_list)) {
 590                rq = list_entry(rq_list.next, struct request, queuelist);
 591                list_del_init(&rq->queuelist);
 592                blk_mq_sched_insert_request(rq, false, false, false, true);
 593        }
 594
 595        blk_mq_run_hw_queues(q, false);
 596}
 597
 598void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 599                                bool kick_requeue_list)
 600{
 601        struct request_queue *q = rq->q;
 602        unsigned long flags;
 603
 604        /*
 605         * We abuse this flag that is otherwise used by the I/O scheduler to
 606         * request head insertion from the workqueue.
 607         */
 608        BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
 609
 610        spin_lock_irqsave(&q->requeue_lock, flags);
 611        if (at_head) {
 612                rq->cmd_flags |= REQ_SOFTBARRIER;
 613                list_add(&rq->queuelist, &q->requeue_list);
 614        } else {
 615                list_add_tail(&rq->queuelist, &q->requeue_list);
 616        }
 617        spin_unlock_irqrestore(&q->requeue_lock, flags);
 618
 619        if (kick_requeue_list)
 620                blk_mq_kick_requeue_list(q);
 621}
 622EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 623
 624void blk_mq_kick_requeue_list(struct request_queue *q)
 625{
 626        kblockd_schedule_delayed_work(&q->requeue_work, 0);
 627}
 628EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 629
 630void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 631                                    unsigned long msecs)
 632{
 633        kblockd_schedule_delayed_work(&q->requeue_work,
 634                                      msecs_to_jiffies(msecs));
 635}
 636EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 637
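    /*
     * Translate a driver tag back into its request, typically from a
     * driver's completion or timeout path. Returns NULL if the tag is
     * out of range.
     */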
 638struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 639{
 640        if (tag < tags->nr_tags)
 641                return tags->rqs[tag];
 642
 643        return NULL;
 644}
 645EXPORT_SYMBOL(blk_mq_tag_to_rq);
 646
 647struct blk_mq_timeout_data {
 648        unsigned long next;
 649        unsigned int next_set;
 650};
 651
 652void blk_mq_rq_timed_out(struct request *req, bool reserved)
 653{
 654        struct blk_mq_ops *ops = req->q->mq_ops;
 655        enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 656
 657        /*
 658         * We know that complete is set at this point. If STARTED isn't set
 659         * anymore, then the request isn't active and the "timeout" should
 660         * just be ignored. This can happen due to the bitflag ordering.
 661         * Timeout first checks if STARTED is set, and if it is, assumes
 662         * the request is active. But if we race with completion, then
 663         * both flags will get cleared. So check here again, and ignore
 664         * a timeout event with a request that isn't active.
 665         */
 666        if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
 667                return;
 668
 669        if (ops->timeout)
 670                ret = ops->timeout(req, reserved);
 671
 672        switch (ret) {
 673        case BLK_EH_HANDLED:
 674                __blk_mq_complete_request(req);
 675                break;
 676        case BLK_EH_RESET_TIMER:
 677                blk_add_timer(req);
 678                blk_clear_rq_complete(req);
 679                break;
 680        case BLK_EH_NOT_HANDLED:
 681                break;
 682        default:
 683                printk(KERN_ERR "block: bad eh return: %d\n", ret);
 684                break;
 685        }
 686}
 687
 688static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 689                struct request *rq, void *priv, bool reserved)
 690{
 691        struct blk_mq_timeout_data *data = priv;
 692
 693        if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 694                return;
 695
 696        /*
 697         * The rq being checked may already have been freed and
 698         * reallocated here; we avoid this race by checking rq->deadline
 699         * and the REQ_ATOM_COMPLETE flag together:
 700         *
 701         * - if rq->deadline is observed as the new value because of
 702         *   reuse, the rq won't be timed out yet.
 703         * - if rq->deadline is observed as the previous value, the
 704         *   REQ_ATOM_COMPLETE flag won't be cleared in the reuse path,
 705         *   because we put a barrier between setting rq->deadline
 706         *   and clearing the flag in blk_mq_start_request(), so
 707         *   this rq won't be timed out either.
 708         */
 709        if (time_after_eq(jiffies, rq->deadline)) {
 710                if (!blk_mark_rq_complete(rq))
 711                        blk_mq_rq_timed_out(rq, reserved);
 712        } else if (!data->next_set || time_after(data->next, rq->deadline)) {
 713                data->next = rq->deadline;
 714                data->next_set = 1;
 715        }
 716}
 717
 718static void blk_mq_timeout_work(struct work_struct *work)
 719{
 720        struct request_queue *q =
 721                container_of(work, struct request_queue, timeout_work);
 722        struct blk_mq_timeout_data data = {
 723                .next           = 0,
 724                .next_set       = 0,
 725        };
 726        int i;
 727
 728        /* A deadlock might occur if a request is stuck requiring a
 729         * timeout at the same time a queue freeze is waiting
 730         * completion, since the timeout code would not be able to
 731         * acquire the queue reference here.
 732         *
 733         * That's why we don't use blk_queue_enter here; instead, we use
 734         * percpu_ref_tryget directly, because we need to be able to
 735         * obtain a reference even in the short window between the queue
 736         * starting to freeze, by dropping the first reference in
 737         * blk_freeze_queue_start, and the moment the last request is
 738         * consumed, marked by the instant q_usage_counter reaches
 739         * zero.
 740         */
 741        if (!percpu_ref_tryget(&q->q_usage_counter))
 742                return;
 743
 744        blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 745
 746        if (data.next_set) {
 747                data.next = blk_rq_timeout(round_jiffies_up(data.next));
 748                mod_timer(&q->timeout, data.next);
 749        } else {
 750                struct blk_mq_hw_ctx *hctx;
 751
 752                queue_for_each_hw_ctx(q, hctx, i) {
 753                        /* the hctx may be unmapped, so check it here */
 754                        if (blk_mq_hw_queue_mapped(hctx))
 755                                blk_mq_tag_idle(hctx);
 756                }
 757        }
 758        blk_queue_exit(q);
 759}
 760
 761/*
 762 * Reverse check our software queue for entries that we could potentially
 763 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 764 * too much time checking for merges.
 765 */
 766static bool blk_mq_attempt_merge(struct request_queue *q,
 767                                 struct blk_mq_ctx *ctx, struct bio *bio)
 768{
 769        struct request *rq;
 770        int checked = 8;
 771
 772        list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
 773                int el_ret;
 774
 775                if (!checked--)
 776                        break;
 777
 778                if (!blk_rq_merge_ok(rq, bio))
 779                        continue;
 780
 781                el_ret = blk_try_merge(rq, bio);
 782                if (el_ret == ELEVATOR_NO_MERGE)
 783                        continue;
 784
 785                if (!blk_mq_sched_allow_merge(q, rq, bio))
 786                        break;
 787
 788                if (el_ret == ELEVATOR_BACK_MERGE) {
 789                        if (bio_attempt_back_merge(q, rq, bio)) {
 790                                ctx->rq_merged++;
 791                                return true;
 792                        }
 793                        break;
 794                } else if (el_ret == ELEVATOR_FRONT_MERGE) {
 795                        if (bio_attempt_front_merge(q, rq, bio)) {
 796                                ctx->rq_merged++;
 797                                return true;
 798                        }
 799                        break;
 800                }
 801        }
 802
 803        return false;
 804}
 805
 806struct flush_busy_ctx_data {
 807        struct blk_mq_hw_ctx *hctx;
 808        struct list_head *list;
 809};
 810
 811static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
 812{
 813        struct flush_busy_ctx_data *flush_data = data;
 814        struct blk_mq_hw_ctx *hctx = flush_data->hctx;
 815        struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 816
 817        sbitmap_clear_bit(sb, bitnr);
 818        spin_lock(&ctx->lock);
 819        list_splice_tail_init(&ctx->rq_list, flush_data->list);
 820        spin_unlock(&ctx->lock);
 821        return true;
 822}
 823
 824/*
 825 * Process software queues that have been marked busy, splicing them
 826 * to the for-dispatch list.
 827 */
 828void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 829{
 830        struct flush_busy_ctx_data data = {
 831                .hctx = hctx,
 832                .list = list,
 833        };
 834
 835        sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
 836}
 837EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 838
 839struct dispatch_rq_data {
 840        struct blk_mq_hw_ctx *hctx;
 841        struct request *rq;
 842};
 843
 844static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 845                void *data)
 846{
 847        struct dispatch_rq_data *dispatch_data = data;
 848        struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
 849        struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 850
 851        spin_lock(&ctx->lock);
 852        if (unlikely(!list_empty(&ctx->rq_list))) {
 853                dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
 854                list_del_init(&dispatch_data->rq->queuelist);
 855                if (list_empty(&ctx->rq_list))
 856                        sbitmap_clear_bit(sb, bitnr);
 857        }
 858        spin_unlock(&ctx->lock);
 859
 860        return !dispatch_data->rq;
 861}
 862
 863struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
 864                                        struct blk_mq_ctx *start)
 865{
 866        unsigned off = start ? start->index_hw : 0;
 867        struct dispatch_rq_data data = {
 868                .hctx = hctx,
 869                .rq   = NULL,
 870        };
 871
 872        __sbitmap_for_each_set(&hctx->ctx_map, off,
 873                               dispatch_rq_from_ctx, &data);
 874
 875        return data.rq;
 876}
 877
 878static inline unsigned int queued_to_index(unsigned int queued)
 879{
 880        if (!queued)
 881                return 0;
 882
 883        return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 884}
 885
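    /*
     * Assign a driver tag to @rq if it does not have one yet. When an
     * I/O scheduler is attached the request initially only carries an
     * internal (scheduler) tag, so the driver tag is allocated here,
     * right before dispatch. Returns true if @rq now owns a driver tag;
     * *hctx, if non-NULL, is set to the hardware queue used.
     */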
 886bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 887                           bool wait)
 888{
 889        struct blk_mq_alloc_data data = {
 890                .q = rq->q,
 891                .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
 892                .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
 893        };
 894
 895        if (rq->tag != -1)
 896                goto done;
 897
 898        if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq_aux(rq)->internal_tag))
 899                data.flags |= BLK_MQ_REQ_RESERVED;
 900
 901        rq->tag = blk_mq_get_tag(&data);
 902        if (rq->tag >= 0) {
 903                if (blk_mq_tag_busy(data.hctx)) {
 904                        rq->cmd_flags |= REQ_MQ_INFLIGHT;
 905                        atomic_inc(&data.hctx->nr_active);
 906                }
 907                data.hctx->tags->rqs[rq->tag] = rq;
 908        }
 909
 910done:
 911        if (hctx)
 912                *hctx = data.hctx;
 913        return rq->tag != -1;
 914}
 915
 916static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode,
 917                                int flags, void *key)
 918{
 919        struct blk_mq_hw_ctx *hctx;
 920
 921        hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
 922
 923        list_del_init(&wait->task_list);
 924        blk_mq_run_hw_queue(hctx, true);
 925        return 1;
 926}
 927
 928/*
 929 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 930 * the tag wakeups. For non-shared tags, we can simply mark us as needing a
 931 * restart. For both cases, take care to check the condition again after
 932 * marking us as waiting.
 933 */
 934static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
 935                                 struct request *rq)
 936{
 937        struct blk_mq_hw_ctx *this_hctx = *hctx;
 938        bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
 939        struct sbq_wait_state *ws;
 940        wait_queue_t *wait;
 941        bool ret;
 942
 943        if (!shared_tags) {
 944                if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
 945                        set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
 946        } else {
 947                wait = &this_hctx->dispatch_wait;
 948                if (!list_empty_careful(&wait->task_list))
 949                        return false;
 950
 951                spin_lock(&this_hctx->lock);
 952                if (!list_empty(&wait->task_list)) {
 953                        spin_unlock(&this_hctx->lock);
 954                        return false;
 955                }
 956
 957                ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
 958                add_wait_queue(&ws->wait, wait);
 959        }
 960
 961        /*
 962         * It's possible that a tag was freed in the window between the
 963         * allocation failure and adding the hardware queue to the wait
 964         * queue.
 965         */
 966        ret = blk_mq_get_driver_tag(rq, hctx, false);
 967
 968        if (!shared_tags) {
 969                /*
 970                 * Don't clear RESTART here, someone else could have set it.
 971                 * At most this will cost an extra queue run.
 972                 */
 973                return ret;
 974        } else {
 975                if (!ret) {
 976                        spin_unlock(&this_hctx->lock);
 977                        return false;
 978                }
 979
 980                /*
 981                 * We got a tag, remove ourselves from the wait queue to ensure
 982                 * someone else gets the wakeup.
 983                 */
 984                spin_lock_irq(&ws->wait.lock);
 985                list_del_init(&wait->task_list);
 986                spin_unlock_irq(&ws->wait.lock);
 987                spin_unlock(&this_hctx->lock);
 988                return true;
 989        }
 990}
 991
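    /*
     * Dispatch the requests on @list to the driver one by one, acquiring
     * a driver tag and (unless the caller already holds one) a dispatch
     * budget for each. On BLK_MQ_RQ_QUEUE_BUSY the remaining requests
     * are parked on hctx->dispatch and the queue is re-run later.
     * Returns true if any request was queued or completed with an error.
     */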
 992bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 993                             bool got_budget)
 994{
 995        struct blk_mq_hw_ctx *hctx;
 996        bool no_tag = false;
 997        struct request *rq, *nxt;
 998        LIST_HEAD(driver_list);
 999        struct list_head *dptr;
1000        int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
1001
1002        if (list_empty(list))
1003                return false;
1004
1005        WARN_ON(!list_is_singular(list) && got_budget);
1006
1007        /*
1008         * Start off with dptr being NULL, so we start the first request
1009         * immediately, even if we have more pending.
1010         */
1011        dptr = NULL;
1012
1013        /*
1014         * Now process all the entries, sending them to the driver.
1015         */
1016        errors = queued = 0;
1017        do {
1018                struct blk_mq_queue_data bd;
1019
1020                rq = list_first_entry(list, struct request, queuelist);
1021                if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
1022                        /*
1023                         * The initial allocation attempt failed, so we need to
1024                         * rerun the hardware queue when a tag is freed. The
1025                         * waitqueue takes care of that. If the queue is run
1026                         * before we add this entry back on the dispatch list,
1027                         * we'll re-run it below.
1028                         */
1029                        if (!blk_mq_mark_tag_wait(&hctx, rq)) {
1030                                if (got_budget)
1031                                        blk_mq_put_dispatch_budget(hctx);
1032                                /*
1033                                 * For non-shared tags, the RESTART check
1034                                 * will suffice.
1035                                 */
1036                                if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1037                                        no_tag = true;
1038                                break;
1039                        }
1040                }
1041
1042                if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
1043                        blk_mq_put_driver_tag(rq);
1044                        break;
1045                }
1046
1047                list_del_init(&rq->queuelist);
1048
1049                bd.rq = rq;
1050                bd.list = dptr;
1051
1052                /*
1053                 * Flag last if we have no more requests, or if we have more
1054                 * but can't assign a driver tag to it.
1055                 */
1056                if (list_empty(list))
1057                        bd.last = true;
1058                else {
1059                        nxt = list_first_entry(list, struct request, queuelist);
1060                        bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
1061                }
1062
1063                ret = q->mq_ops->queue_rq(hctx, &bd);
1064                switch (ret) {
1065                case BLK_MQ_RQ_QUEUE_OK:
1066                        queued++;
1067                        break;
1068                case BLK_MQ_RQ_QUEUE_BUSY:
1069                        /*
1070                         * If an I/O scheduler has been configured and we got a
1071                         * driver tag for the next request already, free it again.
1072                         */
1073                        if (!list_empty(list)) {
1074                                nxt = list_first_entry(list, struct request, queuelist);
1075                                blk_mq_put_driver_tag(nxt);
1076                        }
1077                        list_add(&rq->queuelist, list);
1078                        __blk_mq_requeue_request(rq);
1079                        break;
1080                default:
1081                        pr_err("blk-mq: bad return on queue: %d\n", ret);
1082                case BLK_MQ_RQ_QUEUE_ERROR:
1083                        errors++;
1084                        rq->errors = -EIO;
1085                        blk_mq_end_request(rq, rq->errors);
1086                        break;
1087                }
1088
1089                if (ret == BLK_MQ_RQ_QUEUE_BUSY)
1090                        break;
1091
1092                /*
1093                 * We've done the first request. If we have more than 1
1094                 * left in the list, set dptr to defer issue.
1095                 */
1096                if (!dptr && list->next != list->prev)
1097                        dptr = &driver_list;
1098        } while (!list_empty(list));
1099
1100        hctx->dispatched[queued_to_index(queued)]++;
1101
1102        /*
1103         * Any items that need requeuing? Stuff them into hctx->dispatch;
1104         * that is where we will continue on the next queue run.
1105         */
1106        if (!list_empty(list)) {
1107                spin_lock(&hctx->lock);
1108                list_splice_init(list, &hctx->dispatch);
1109                spin_unlock(&hctx->lock);
1110
1111                /*
1112                 * the queue is expected to be stopped with BLK_MQ_RQ_QUEUE_BUSY, but
1113                 * it's possible the queue is stopped and restarted again
1114                 * before this. Queue restart will dispatch requests. And since
1115                 * requests in rq_list aren't added into hctx->dispatch yet,
1116                 * the requests in rq_list might get lost.
1117                 *
1118                 * blk_mq_run_hw_queue() already checks the STOPPED bit
1119                 *
1120                 * If RESTART or TAG_WAITING is set, then let completion restart
1121                 * the queue instead of potentially looping here.
1122                 *
1123                 * If 'no_tag' is set, that means that we failed getting
1124                 * a driver tag with an I/O scheduler attached. If our dispatch
1125                 * waitqueue is no longer active, ensure that we run the queue
1126                 * AFTER adding our entries back to the list.
1127                 */
1128                if (!blk_mq_sched_needs_restart(hctx) ||
1129                    (no_tag && list_empty_careful(&hctx->dispatch_wait.task_list)))
1130                        blk_mq_run_hw_queue(hctx, true);
1131        }
1132
1133        return (queued + errors) != 0;
1134}
1135
1136static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1137{
1138        int srcu_idx;
1139
1140        WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1141                cpu_online(hctx->next_cpu));
1142
1143        if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
1144                rcu_read_lock();
1145                blk_mq_sched_dispatch_requests(hctx);
1146                rcu_read_unlock();
1147        } else {
1148                might_sleep();
1149
1150                srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
1151                blk_mq_sched_dispatch_requests(hctx);
1152                srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
1153        }
1154}
1155
1156/*
1157 * It'd be great if the workqueue API had a way to pass
1158 * in a mask and had some smarts for more clever placement.
1159 * For now we just round-robin here, switching for every
1160 * BLK_MQ_CPU_WORK_BATCH queued items.
1161 */
1162static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1163{
1164        if (hctx->queue->nr_hw_queues == 1)
1165                return WORK_CPU_UNBOUND;
1166
1167        if (--hctx->next_cpu_batch <= 0) {
1168                int next_cpu;
1169
1170                next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
1171                if (next_cpu >= nr_cpu_ids)
1172                        next_cpu = cpumask_first(hctx->cpumask);
1173
1174                hctx->next_cpu = next_cpu;
1175                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1176        }
1177
1178        return hctx->next_cpu;
1179}
1180
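    /*
     * Run a hardware queue either inline on the current CPU, when the
     * call is synchronous, the queue does not need a sleepable ->queue_rq
     * (no BLK_MQ_F_BLOCKING), and the current CPU is in hctx->cpumask, or
     * asynchronously via kblockd on one of the mapped CPUs after an
     * optional delay.
     */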
1181static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1182                                        unsigned long msecs)
1183{
1184        if (unlikely(blk_mq_hctx_stopped(hctx) ||
1185                     !blk_mq_hw_queue_mapped(hctx)))
1186                return;
1187
1188        if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1189                int cpu = get_cpu();
1190                if (cpumask_test_cpu(cpu, hctx->cpumask)) {
1191                        __blk_mq_run_hw_queue(hctx);
1192                        put_cpu();
1193                        return;
1194                }
1195
1196                put_cpu();
1197        }
1198
1199        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1200                        &hctx->run_work, msecs);
1201}
1202
1203void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1204{
1205        __blk_mq_delay_run_hw_queue(hctx, true, msecs);
1206}
1207EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1208
1209void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1210{
1211        __blk_mq_delay_run_hw_queue(hctx, async, 0);
1212}
1213EXPORT_SYMBOL(blk_mq_run_hw_queue);
1214
1215void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1216{
1217        struct blk_mq_hw_ctx *hctx;
1218        int i;
1219
1220        queue_for_each_hw_ctx(q, hctx, i) {
1221                if (!blk_mq_hctx_has_pending(hctx) ||
1222                    blk_mq_hctx_stopped(hctx))
1223                        continue;
1224
1225                blk_mq_run_hw_queue(hctx, async);
1226        }
1227}
1228EXPORT_SYMBOL(blk_mq_run_hw_queues);
1229
1230/**
1231 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
1232 * @q: request queue.
1233 *
1234 * The caller is responsible for serializing this function against
1235 * blk_mq_{start,stop}_hw_queue().
1236 */
1237bool blk_mq_queue_stopped(struct request_queue *q)
1238{
1239        struct blk_mq_hw_ctx *hctx;
1240        int i;
1241
1242        queue_for_each_hw_ctx(q, hctx, i)
1243                if (blk_mq_hctx_stopped(hctx))
1244                        return true;
1245
1246        return false;
1247}
1248EXPORT_SYMBOL(blk_mq_queue_stopped);
1249
1250void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1251{
1252        cancel_delayed_work(&hctx->run_work);
1253        cancel_delayed_work(&hctx->delay_work);
1254        set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1255}
1256EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1257
1258void blk_mq_stop_hw_queues(struct request_queue *q)
1259{
1260        struct blk_mq_hw_ctx *hctx;
1261        int i;
1262
1263        queue_for_each_hw_ctx(q, hctx, i)
1264                blk_mq_stop_hw_queue(hctx);
1265}
1266EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1267
1268void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1269{
1270        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1271
1272        blk_mq_run_hw_queue(hctx, false);
1273}
1274EXPORT_SYMBOL(blk_mq_start_hw_queue);
1275
1276void blk_mq_start_hw_queues(struct request_queue *q)
1277{
1278        struct blk_mq_hw_ctx *hctx;
1279        int i;
1280
1281        queue_for_each_hw_ctx(q, hctx, i)
1282                blk_mq_start_hw_queue(hctx);
1283}
1284EXPORT_SYMBOL(blk_mq_start_hw_queues);
1285
1286void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1287{
1288        struct blk_mq_hw_ctx *hctx;
1289        int i;
1290
1291        queue_for_each_hw_ctx(q, hctx, i) {
1292                if (!blk_mq_hctx_stopped(hctx))
1293                        continue;
1294
1295                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1296                blk_mq_run_hw_queue(hctx, async);
1297        }
1298}
1299EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1300
1301static void blk_mq_run_work_fn(struct work_struct *work)
1302{
1303        struct blk_mq_hw_ctx *hctx;
1304
1305        hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1306
1307        __blk_mq_run_hw_queue(hctx);
1308}
1309
1310static void blk_mq_delay_work_fn(struct work_struct *work)
1311{
1312        struct blk_mq_hw_ctx *hctx;
1313
1314        hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
1315
1316        if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
1317                __blk_mq_run_hw_queue(hctx);
1318}
1319
1320void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1321{
1322        if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
1323                return;
1324
1325        blk_mq_stop_hw_queue(hctx);
1326        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1327                        &hctx->delay_work, msecs_to_jiffies(msecs));
1328}
1329EXPORT_SYMBOL(blk_mq_delay_queue);
1330
1331static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1332                                            struct request *rq,
1333                                            bool at_head)
1334{
1335        struct blk_mq_ctx *ctx = rq->mq_ctx;
1336
1337        trace_block_rq_insert(hctx->queue, rq);
1338
1339        if (at_head)
1340                list_add(&rq->queuelist, &ctx->rq_list);
1341        else
1342                list_add_tail(&rq->queuelist, &ctx->rq_list);
1343}
1344
1345void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1346                             bool at_head)
1347{
1348        struct blk_mq_ctx *ctx = rq->mq_ctx;
1349
1350        __blk_mq_insert_req_list(hctx, rq, at_head);
1351        blk_mq_hctx_mark_pending(hctx, ctx);
1352}
1353
1354/*
1355 * Should only be used carefully, when the caller knows we want to
1356 * bypass a potential IO scheduler on the target device.
1357 */
1358void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1359{
1360        struct blk_mq_ctx *ctx = rq->mq_ctx;
1361        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1362
1363        spin_lock(&hctx->lock);
1364        list_add_tail(&rq->queuelist, &hctx->dispatch);
1365        spin_unlock(&hctx->lock);
1366
1367        if (run_queue)
1368                blk_mq_run_hw_queue(hctx, false);
1369}
1370
1371void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1372                            struct list_head *list)
1373
1374{
1375        /*
1376         * Preemption doesn't flush the plug list, so it's possible that
1377         * ctx->cpu is offline now.
1378         */
1379        spin_lock(&ctx->lock);
1380        while (!list_empty(list)) {
1381                struct request *rq;
1382
1383                rq = list_first_entry(list, struct request, queuelist);
1384                BUG_ON(rq->mq_ctx != ctx);
1385                list_del_init(&rq->queuelist);
1386                __blk_mq_insert_req_list(hctx, rq, false);
1387        }
1388        blk_mq_hctx_mark_pending(hctx, ctx);
1389        spin_unlock(&ctx->lock);
1390}
1391
1392static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1393{
1394        struct request *rqa = container_of(a, struct request, queuelist);
1395        struct request *rqb = container_of(b, struct request, queuelist);
1396
1397        return !(rqa->mq_ctx < rqb->mq_ctx ||
1398                 (rqa->mq_ctx == rqb->mq_ctx &&
1399                  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1400}
1401
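    /*
     * Flush the requests plugged in @plug: sort them by software queue
     * (and sector offset within a queue), then insert them in per-ctx
     * batches through the I/O scheduler, so each software queue sees one
     * ordered insertion instead of many small ones.
     */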
1402void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1403{
1404        struct blk_mq_ctx *this_ctx;
1405        struct request_queue *this_q;
1406        struct request *rq;
1407        LIST_HEAD(list);
1408        LIST_HEAD(ctx_list);
1409        unsigned int depth;
1410
1411        list_splice_init(&plug->mq_list, &list);
1412
1413        list_sort(NULL, &list, plug_ctx_cmp);
1414
1415        this_q = NULL;
1416        this_ctx = NULL;
1417        depth = 0;
1418
1419        while (!list_empty(&list)) {
1420                rq = list_entry_rq(list.next);
1421                list_del_init(&rq->queuelist);
1422                BUG_ON(!rq->q);
1423                if (rq->mq_ctx != this_ctx) {
1424                        if (this_ctx) {
1425                                trace_block_unplug(this_q, depth, from_schedule);
1426                                blk_mq_sched_insert_requests(this_q, this_ctx,
1427                                                                &ctx_list,
1428                                                                from_schedule);
1429                        }
1430
1431                        this_ctx = rq->mq_ctx;
1432                        this_q = rq->q;
1433                        depth = 0;
1434                }
1435
1436                depth++;
1437                list_add_tail(&rq->queuelist, &ctx_list);
1438        }
1439
1440        /*
1441         * If 'this_ctx' is set, we know we have entries to complete
1442         * on 'ctx_list'. Do those.
1443         */
1444        if (this_ctx) {
1445                trace_block_unplug(this_q, depth, from_schedule);
1446                blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1447                                                from_schedule);
1448        }
1449}
1450
1451static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1452{
1453        init_request_from_bio(rq, bio);
1454
1455        if (blk_do_io_stat(rq))
1456                blk_account_io_start(rq, true);
1457}
1458
1459static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1460{
1461        return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1462                !blk_queue_nomerges(hctx->queue);
1463}
1464
1465/* attempt to merge bio into current sw queue */
1466static inline bool blk_mq_merge_bio(struct request_queue *q, struct bio *bio)
1467{
1468        bool ret = false;
1469        struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1470        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
1471
1472        if (hctx_allow_merges(hctx) && bio_mergeable(bio)) {
1473                spin_lock(&ctx->lock);
1474                ret = blk_mq_attempt_merge(q, ctx, bio);
1475                spin_unlock(&ctx->lock);
1476        }
1477
1478        blk_mq_put_ctx(ctx);
1479        return ret;
1480}
1481
1482static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
1483                                   struct blk_mq_ctx *ctx,
1484                                   struct request *rq)
1485{
1486        spin_lock(&ctx->lock);
1487        __blk_mq_insert_request(hctx, rq, false);
1488        spin_unlock(&ctx->lock);
1489}
1490
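    /*
     * Try to issue @rq directly to the driver, bypassing the software
     * queues. Fall back to normal insertion if the hardware queue is
     * stopped, an elevator is attached, or no driver tag or dispatch
     * budget can be obtained, and also when the driver returns busy.
     */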
1491static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1492                                        struct request *rq, bool may_sleep)
1493{
1494        struct request_queue *q = rq->q;
1495        struct blk_mq_queue_data bd = {
1496                .rq = rq,
1497                .list = NULL,
1498                .last = true,
1499        };
1500        int ret;
1501        bool run_queue = true;
1502
1503        if (blk_mq_hctx_stopped(hctx)) {
1504                run_queue = false;
1505                goto insert;
1506        }
1507
1508        if (q->elevator)
1509                goto insert;
1510
1511        if (!blk_mq_get_driver_tag(rq, NULL, false))
1512                goto insert;
1513
1514        if (!blk_mq_get_dispatch_budget(hctx)) {
1515                blk_mq_put_driver_tag(rq);
1516                goto insert;
1517        }
1518
1519        /*
1520         * If queueing succeeds, we are done. On error, kill the request.
1521         * For any other result (busy), just add it to our list as we
1522         * previously would have done.
1523         */
1524        ret = q->mq_ops->queue_rq(hctx, &bd);
1525        if (ret == BLK_MQ_RQ_QUEUE_OK)
1526                return;
1527
1528        if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1529                rq->errors = -EIO;
1530                blk_mq_end_request(rq, rq->errors);
1531                return;
1532        }
1533
1534        __blk_mq_requeue_request(rq);
1535insert:
1536        blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
1537}
1538
1539static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1540                                      struct request *rq)
1541{
1542        if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
1543                rcu_read_lock();
1544                __blk_mq_try_issue_directly(hctx, rq, false);
1545                rcu_read_unlock();
1546        } else {
1547                unsigned int srcu_idx;
1548
1549                might_sleep();
1550
1551                srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
1552                __blk_mq_try_issue_directly(hctx, rq, true);
1553                srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
1554        }
1555}
1556
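    /*
     * Multiqueue make_request entry point: try plug, scheduler and
     * software-queue merging first; otherwise allocate a request for the
     * bio and either issue it directly, add it to the current plug, or
     * insert it via the I/O scheduler / software queue and kick the
     * hardware queue.
     */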
1557static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1558{
1559        const int is_sync = rw_is_sync(bio->bi_rw);
1560        const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1561        struct blk_mq_alloc_data data = { .flags = 0 };
1562        struct request *rq;
1563        unsigned int request_count = 0;
1564        struct blk_plug *plug;
1565        struct request *same_queue_rq = NULL;
1566
1567        blk_queue_bounce(q, &bio);
1568
1569        if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1570                bio_endio(bio, -EIO);
1571                return;
1572        }
1573
1574        if (!is_flush_fua && !blk_queue_nomerges(q) &&
1575            blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1576                return;
1577
1578        if (blk_mq_sched_bio_merge(q, bio))
1579                return;
1580
1581        if (blk_mq_merge_bio(q, bio))
1582                return;
1583
1584        trace_block_getrq(q, bio, bio->bi_rw);
1585
1586        rq = blk_mq_sched_get_request(q, bio, bio->bi_rw, &data);
1587        if (unlikely(!rq))
1588                return;
1589
1590        plug = current->plug;
1591        if (unlikely(is_flush_fua)) {
1592                blk_mq_put_ctx(data.ctx);
1593                blk_mq_bio_to_request(rq, bio);
1594
1595                /* bypass scheduler for flush rq */
1596                blk_insert_flush(rq);
1597                blk_mq_run_hw_queue(data.hctx, true);
1598        } else if (plug && q->nr_hw_queues == 1) {
1599                struct request *last = NULL;
1600
1601                blk_mq_put_ctx(data.ctx);
1602                blk_mq_bio_to_request(rq, bio);
1603
1604                /*
1605                 * @request_count may have become stale because we may
1606                 * have been scheduled out, so check the list again.
1607                 */
1608                if (list_empty(&plug->mq_list))
1609                        request_count = 0;
1610                else if (blk_queue_nomerges(q))
1611                        request_count = blk_plug_queued_count(q);
1612
1613                if (!request_count)
1614                        trace_block_plug(q);
1615                else
1616                        last = list_entry_rq(plug->mq_list.prev);
1617
1618                if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
1619                    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1620                        blk_flush_plug_list(plug, false);
1621                        trace_block_plug(q);
1622                }
1623
1624                list_add_tail(&rq->queuelist, &plug->mq_list);
1625        } else if (plug && !blk_queue_nomerges(q)) {
1626                blk_mq_bio_to_request(rq, bio);
1627
1628                /*
1629                 * We do limited plugging. If the bio can be merged, do that.
1630                 * Otherwise the existing request in the plug list will be
1631                 * issued, so the plug list will hold at most one request.
1632                 * The plug list might get flushed before this; if that happens,
1633                 * the plug list is empty and same_queue_rq is invalid.
1634                 */
1635                if (list_empty(&plug->mq_list))
1636                        same_queue_rq = NULL;
1637                if (same_queue_rq)
1638                        list_del_init(&same_queue_rq->queuelist);
1639                list_add_tail(&rq->queuelist, &plug->mq_list);
1640
1641                blk_mq_put_ctx(data.ctx);
1642
1643                if (same_queue_rq) {
1644                        data.hctx = blk_mq_map_queue(q,
1645                                        same_queue_rq->mq_ctx->cpu);
1646                        blk_mq_try_issue_directly(data.hctx, same_queue_rq);
1647                }
1648        } else if (q->nr_hw_queues > 1 && is_sync) {
1649                blk_mq_put_ctx(data.ctx);
1650                blk_mq_bio_to_request(rq, bio);
1651                blk_mq_try_issue_directly(data.hctx, rq);
1652        } else if (q->elevator) {
1653                blk_mq_put_ctx(data.ctx);
1654                blk_mq_bio_to_request(rq, bio);
1655                blk_mq_sched_insert_request(rq, false, true, true, true);
1656        } else {
1657                blk_mq_put_ctx(data.ctx);
1658                blk_mq_bio_to_request(rq, bio);
1659                blk_mq_queue_io(data.hctx, data.ctx, rq);
1660                blk_mq_run_hw_queue(data.hctx, true);
1661        }
1662}
1663
1664void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1665                     unsigned int hctx_idx)
1666{
1667        struct page *page;
1668
1669        if (tags->rqs && set->ops->exit_request) {
1670                int i;
1671
1672                for (i = 0; i < tags->nr_tags; i++) {
1673                        struct request *rq = tags->static_rqs[i];
1674
1675                        if (!rq)
1676                                continue;
1677                        set->ops->exit_request(set->driver_data, rq,
1678                                                hctx_idx, i);
1679                        tags->static_rqs[i] = NULL;
1680                }
1681        }
1682
1683        while (!list_empty(&tags->page_list)) {
1684                page = list_first_entry(&tags->page_list, struct page, lru);
1685                list_del_init(&page->lru);
1686                /*
1687                 * Remove the kmemleak object previously allocated in
1688                 * blk_mq_alloc_rqs().
1689                 */
1690                kmemleak_free(page_address(page));
1691                __free_pages(page, page->private);
1692        }
1693}
1694
1695void blk_mq_free_rq_map(struct blk_mq_tags *tags)
1696{
1697        kfree(tags->rqs);
1698        tags->rqs = NULL;
1699        kfree(tags->static_rqs);
1700        tags->static_rqs = NULL;
1701
1702        blk_mq_free_tags(tags);
1703}
1704
1705struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
1706                                        unsigned int hctx_idx,
1707                                        unsigned int nr_tags,
1708                                        unsigned int reserved_tags)
1709{
1710        struct blk_mq_tags *tags;
1711
1712        tags = blk_mq_init_tags(nr_tags, reserved_tags,
1713                                set->numa_node,
1714                                BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
1715        if (!tags)
1716                return NULL;
1717
1718        tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1719                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1720                                 set->numa_node);
1721        if (!tags->rqs) {
1722                blk_mq_free_tags(tags);
1723                return NULL;
1724        }
1725
1726        tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1727                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1728                                 set->numa_node);
1729        if (!tags->static_rqs) {
1730                kfree(tags->rqs);
1731                blk_mq_free_tags(tags);
1732                return NULL;
1733        }
1734
1735        return tags;
1736}
1737
1738static size_t order_to_size(unsigned int order)
1739{
1740        return (size_t)PAGE_SIZE << order;
1741}
1742
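/*
 * Requests are carved out of higher-order page allocations: a chunk of
 * (PAGE_SIZE << order) bytes holds order_to_size(order) / rq_size requests,
 * where rq_size is sizeof(struct request) + sizeof(struct request_aux) +
 * set->cmd_size rounded up to a cacheline. As an illustrative example (not
 * taken from any real driver), with 4K pages an order-4 chunk is 64K; if
 * rq_size rounds up to 512 bytes, that chunk holds 128 requests. The
 * allocation order is lowered progressively when large chunks are not
 * available.
 */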
1743int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1744                     unsigned int hctx_idx, unsigned int depth)
1745{
1746        unsigned int i, j, entries_per_page, max_order = 4;
1747        size_t rq_size, left;
1748
1749        INIT_LIST_HEAD(&tags->page_list);
1750
1751        /*
1752         * rq_size is the size of the request plus driver payload, rounded
1753         * up to the cacheline size.
1754         */
1755        rq_size = round_up(sizeof(struct request) + set->cmd_size +
1756                           sizeof(struct request_aux), cache_line_size());
1757        left = rq_size * depth;
1758
1759        for (i = 0; i < depth; ) {
1760                int this_order = max_order;
1761                struct page *page;
1762                int to_do;
1763                void *p;
1764
1765                while (this_order && left < order_to_size(this_order - 1))
1766                        this_order--;
1767
1768                do {
1769                        page = alloc_pages_node(set->numa_node,
1770                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
1771                                this_order);
1772                        if (page)
1773                                break;
1774                        if (!this_order--)
1775                                break;
1776                        if (order_to_size(this_order) < rq_size)
1777                                break;
1778                } while (1);
1779
1780                if (!page)
1781                        goto fail;
1782
1783                page->private = this_order;
1784                list_add_tail(&page->lru, &tags->page_list);
1785
1786                p = page_address(page);
1787                /*
1788                 * Allow kmemleak to scan these pages as they contain pointers
1789                 * to additional allocations made via ops->init_request().
1790                 */
1791                kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
1792                entries_per_page = order_to_size(this_order) / rq_size;
1793                to_do = min(entries_per_page, depth - i);
1794                left -= to_do * rq_size;
1795                for (j = 0; j < to_do; j++) {
1796                        struct request *rq = p;
1797
1798                        tags->static_rqs[i] = rq;
1799                        if (set->ops->init_request) {
1800                                if (set->ops->init_request(set->driver_data,
1801                                                rq, hctx_idx, i,
1802                                                set->numa_node)) {
1803                                        tags->static_rqs[i] = NULL;
1804                                        goto fail;
1805                                }
1806                        }
1807
1808                        p += rq_size;
1809                        i++;
1810                }
1811        }
1812        return 0;
1813
1814fail:
1815        blk_mq_free_rqs(set, tags, hctx_idx);
1816        return -ENOMEM;
1817}
1818
1819/*
1820 * 'cpu' is going away. Splice any existing rq_list entries from this
1821 * software queue to the hw queue dispatch list, and ensure that it
1822 * gets run.
1823 */
1824static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1825{
1826        struct blk_mq_ctx *ctx;
1827        LIST_HEAD(tmp);
1828
1829        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1830
1831        spin_lock(&ctx->lock);
1832        if (!list_empty(&ctx->rq_list)) {
1833                list_splice_init(&ctx->rq_list, &tmp);
1834                blk_mq_hctx_clear_pending(hctx, ctx);
1835        }
1836        spin_unlock(&ctx->lock);
1837
1838        if (list_empty(&tmp))
1839                return NOTIFY_OK;
1840
1841        spin_lock(&hctx->lock);
1842        list_splice_tail_init(&tmp, &hctx->dispatch);
1843        spin_unlock(&hctx->lock);
1844
1845        blk_mq_run_hw_queue(hctx, true);
1846        return NOTIFY_OK;
1847}
1848
1849static int blk_mq_hctx_notify(void *data, unsigned long action,
1850                              unsigned int cpu)
1851{
1852        struct blk_mq_hw_ctx *hctx = data;
1853
1854        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1855                return blk_mq_hctx_cpu_offline(hctx, cpu);
1856
1857        /*
1858         * In case of CPU online, tags may be reallocated
1859         * in blk_mq_map_swqueue() after mapping is updated.
1860         */
1861
1862        return NOTIFY_OK;
1863}
1864
1865/* hctx->ctxs will be freed in queue's release handler */
1866static void blk_mq_exit_hctx(struct request_queue *q,
1867                struct blk_mq_tag_set *set,
1868                struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1869{
1870        unsigned flush_start_tag = set->queue_depth;
1871
1872        blk_mq_debugfs_unregister_hctx(hctx);
1873
1874        if (blk_mq_hw_queue_mapped(hctx))
1875                blk_mq_tag_idle(hctx);
1876
1877        if (set->ops->exit_request)
1878                set->ops->exit_request(set->driver_data,
1879                                       hctx->fq->flush_rq, hctx_idx,
1880                                       flush_start_tag + hctx_idx);
1881
1882        blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
1883
1884        if (set->ops->exit_hctx)
1885                set->ops->exit_hctx(hctx, hctx_idx);
1886
1887        if (hctx->flags & BLK_MQ_F_BLOCKING)
1888                cleanup_srcu_struct(&hctx->queue_rq_srcu);
1889
1890        blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1891        blk_free_flush_queue(hctx->fq);
1892        sbitmap_free(&hctx->ctx_map);
1893}
1894
1895static void blk_mq_exit_hw_queues(struct request_queue *q,
1896                struct blk_mq_tag_set *set, int nr_queue)
1897{
1898        struct blk_mq_hw_ctx *hctx;
1899        unsigned int i;
1900
1901        queue_for_each_hw_ctx(q, hctx, i) {
1902                if (i == nr_queue)
1903                        break;
1904                blk_mq_exit_hctx(q, set, hctx, i);
1905        }
1906}
1907
1908static void blk_mq_free_hw_queues(struct request_queue *q,
1909                struct blk_mq_tag_set *set)
1910{
1911        struct blk_mq_hw_ctx *hctx;
1912        unsigned int i;
1913
1914        queue_for_each_hw_ctx(q, hctx, i)
1915                free_cpumask_var(hctx->cpumask);
1916}
1917
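/*
 * Set up a single hardware queue context: register the CPU hotplug
 * notifier, allocate the ctx map and ctx pointer array, call the driver's
 * ->init_hctx() and the scheduler's per-hctx init, and allocate the flush
 * queue (including the driver's ->init_request() for the flush request).
 * The error labels below unwind these steps in reverse order.
 */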
1918static int blk_mq_init_hctx(struct request_queue *q,
1919                struct blk_mq_tag_set *set,
1920                struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1921{
1922        int node;
1923        unsigned flush_start_tag = set->queue_depth;
1924
1925        node = hctx->numa_node;
1926        if (node == NUMA_NO_NODE)
1927                node = hctx->numa_node = set->numa_node;
1928
1929        INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1930        INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1931        spin_lock_init(&hctx->lock);
1932        INIT_LIST_HEAD(&hctx->dispatch);
1933        hctx->queue = q;
1934        hctx->queue_num = hctx_idx;
1935        hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
1936
1937        blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1938                                        blk_mq_hctx_notify, hctx);
1939        blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1940
1941        hctx->tags = set->tags[hctx_idx];
1942
1943        /*
1944         * Allocate space for all possible cpus to avoid allocation at
1945         * runtime
1946         */
1947        hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1948                                        GFP_KERNEL, node);
1949        if (!hctx->ctxs)
1950                goto unregister_cpu_notifier;
1951
1952        if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
1953                              node))
1954                goto free_ctxs;
1955
1956        hctx->nr_ctx = 0;
1957
1958        init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
1959        INIT_LIST_HEAD(&hctx->dispatch_wait.task_list);
1960
1961        if (set->ops->init_hctx &&
1962            set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1963                goto free_bitmap;
1964
1965        if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
1966                goto exit_hctx;
1967
1968        hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size +
1969                        sizeof(struct request_aux));
1970        if (!hctx->fq)
1971                goto sched_exit_hctx;
1972
1973        if (set->ops->init_request &&
1974            set->ops->init_request(set->driver_data,
1975                                   hctx->fq->flush_rq, hctx_idx,
1976                                   flush_start_tag + hctx_idx, node))
1977                goto free_fq;
1978
1979        if (hctx->flags & BLK_MQ_F_BLOCKING)
1980                init_srcu_struct(&hctx->queue_rq_srcu);
1981
1982        blk_mq_debugfs_register_hctx(q, hctx);
1983
1984        return 0;
1985
1986 free_fq:
1987        blk_free_flush_queue(hctx->fq);
1988 sched_exit_hctx:
1989        blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
1990 exit_hctx:
1991        if (set->ops->exit_hctx)
1992                set->ops->exit_hctx(hctx, hctx_idx);
1993 free_bitmap:
1994        sbitmap_free(&hctx->ctx_map);
1995 free_ctxs:
1996        kfree(hctx->ctxs);
1997 unregister_cpu_notifier:
1998        blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1999
2000        return -1;
2001}
2002
2003static void blk_mq_init_cpu_queues(struct request_queue *q,
2004                                   unsigned int nr_hw_queues)
2005{
2006        unsigned int i;
2007
2008        for_each_possible_cpu(i) {
2009                struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2010                struct blk_mq_hw_ctx *hctx;
2011
2012                memset(__ctx, 0, sizeof(*__ctx));
2013                __ctx->cpu = i;
2014                spin_lock_init(&__ctx->lock);
2015                INIT_LIST_HEAD(&__ctx->rq_list);
2016                __ctx->queue = q;
2017
2018                /* If the cpu isn't online, the cpu is mapped to the first hctx */
2019                if (!cpu_online(i))
2020                        continue;
2021
2022                hctx = blk_mq_map_queue(q, i);
2023
2024                /*
2025                 * Set local node, IFF we have more than one hw queue. If
2026                 * not, we remain on the home node of the device
2027                 */
2028                if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2029                        hctx->numa_node = local_memory_node(cpu_to_node(i));
2030        }
2031}
2032
2033static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2034{
2035        int ret = 0;
2036
2037        set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2038                                        set->queue_depth, set->reserved_tags);
2039        if (!set->tags[hctx_idx])
2040                return false;
2041
2042        ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
2043                                set->queue_depth);
2044        if (!ret)
2045                return true;
2046
2047        blk_mq_free_rq_map(set->tags[hctx_idx]);
2048        set->tags[hctx_idx] = NULL;
2049        return false;
2050}
2051
2052static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2053                                         unsigned int hctx_idx)
2054{
2055        if (set->tags[hctx_idx]) {
2056                blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2057                blk_mq_free_rq_map(set->tags[hctx_idx]);
2058                set->tags[hctx_idx] = NULL;
2059        }
2060}
2061
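/*
 * Rebuild the software-queue to hardware-queue mapping for the CPUs in
 * @online_mask: each online ctx is attached to its hctx (allocating that
 * hctx's tags on demand), hctxs that end up with no mapped ctx have their
 * tags freed (except hctx 0, which is kept as a fallback), and each hctx's
 * ctx_map is resized to the number of mapped software queues.
 */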
2062static void blk_mq_map_swqueue(struct request_queue *q,
2063                               const struct cpumask *online_mask)
2064{
2065        unsigned int i, hctx_idx;
2066        struct blk_mq_hw_ctx *hctx;
2067        struct blk_mq_ctx *ctx;
2068        struct blk_mq_tag_set *set = q->tag_set;
2069
2070        /*
2071         * Avoid others reading an incomplete hctx->cpumask through sysfs
2072         */
2073        mutex_lock(&q->sysfs_lock);
2074
2075        queue_for_each_hw_ctx(q, hctx, i) {
2076                cpumask_clear(hctx->cpumask);
2077                hctx->nr_ctx = 0;
2078        }
2079
2080        /*
2081         * Map software to hardware queues
2082         */
2083        for_each_possible_cpu(i) {
2084                /* If the cpu isn't online, the cpu is mapped to the first hctx */
2085                if (!cpumask_test_cpu(i, online_mask))
2086                        continue;
2087
2088                hctx_idx = q->mq_map[i];
2089                /* unmapped hw queue can be remapped after CPU topo changed */
2090                if (!set->tags[hctx_idx] &&
2091                    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2092                        /*
2093                         * If tags initialization fails for some hctx,
2094                         * that hctx won't be brought online.  In this
2095                         * case, remap the current ctx to hctx[0], which
2096                         * is guaranteed to always have tags allocated.
2097                         */
2098                        q->mq_map[i] = 0;
2099                }
2100
2101                ctx = per_cpu_ptr(q->queue_ctx, i);
2102                hctx = blk_mq_map_queue(q, i);
2103
2104                cpumask_set_cpu(i, hctx->cpumask);
2105                ctx->index_hw = hctx->nr_ctx;
2106                hctx->ctxs[hctx->nr_ctx++] = ctx;
2107        }
2108
2109        mutex_unlock(&q->sysfs_lock);
2110
2111        queue_for_each_hw_ctx(q, hctx, i) {
2112                /*
2113                 * If no software queues are mapped to this hardware queue,
2114                 * disable it and free the request entries.
2115                 */
2116                if (!hctx->nr_ctx) {
2117                        /* Never unmap queue 0.  We need it as a
2118                         * fallback in case tag allocation fails
2119                         * during a later remap.
2120                         */
2121                        if (i && set->tags[i])
2122                                blk_mq_free_map_and_requests(set, i);
2123
2124                        hctx->tags = NULL;
2125                        continue;
2126                }
2127
2128                hctx->tags = set->tags[i];
2129                WARN_ON(!hctx->tags);
2130
2131                /*
2132                 * Set the map size to the number of mapped software queues.
2133                 * This is more accurate and more efficient than looping
2134                 * over all possibly mapped software queues.
2135                 */
2136                sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
2137
2138                /*
2139                 * Initialize batch round-robin counts.
2140                 */
2141                hctx->next_cpu = cpumask_first(hctx->cpumask);
2142                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2143        }
2144}
2145
2146/*
2147 * Caller needs to ensure that we're either frozen/quiesced, or that
2148 * the queue isn't live yet.
2149 */
2150static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2151{
2152        struct blk_mq_hw_ctx *hctx;
2153        int i;
2154
2155        queue_for_each_hw_ctx(q, hctx, i) {
2156                if (shared) {
2157                        if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2158                                atomic_inc(&q->shared_hctx_restart);
2159                        hctx->flags |= BLK_MQ_F_TAG_SHARED;
2160                } else {
2161                        if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2162                                atomic_dec(&q->shared_hctx_restart);
2163                        hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
2164                }
2165        }
2166}
2167
2168static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2169                                        bool shared)
2170{
2171        struct request_queue *q;
2172
2173        lockdep_assert_held(&set->tag_list_lock);
2174
2175        list_for_each_entry(q, &set->tag_list, tag_set_list) {
2176                blk_mq_freeze_queue(q);
2177                queue_set_hctx_shared(q, shared);
2178                blk_mq_unfreeze_queue(q);
2179        }
2180}
2181
2182static void blk_mq_del_queue_tag_set(struct request_queue *q)
2183{
2184        struct blk_mq_tag_set *set = q->tag_set;
2185
2186        mutex_lock(&set->tag_list_lock);
2187        list_del_rcu(&q->tag_set_list);
2188        INIT_LIST_HEAD(&q->tag_set_list);
2189        if (list_is_singular(&set->tag_list)) {
2190                /* just transitioned to unshared */
2191                set->flags &= ~BLK_MQ_F_TAG_SHARED;
2192                /* update existing queue */
2193                blk_mq_update_tag_set_depth(set, false);
2194        }
2195        mutex_unlock(&set->tag_list_lock);
2196
2197        synchronize_rcu();
2198}
2199
2200static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2201                                     struct request_queue *q)
2202{
2203        q->tag_set = set;
2204
2205        mutex_lock(&set->tag_list_lock);
2206
2207        /* Check to see if we're transitioning to shared (from 1 to 2 queues). */
2208        if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2209                set->flags |= BLK_MQ_F_TAG_SHARED;
2210                /* update existing queue */
2211                blk_mq_update_tag_set_depth(set, true);
2212        }
2213        if (set->flags & BLK_MQ_F_TAG_SHARED)
2214                queue_set_hctx_shared(q, true);
2215        list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
2216
2217        mutex_unlock(&set->tag_list_lock);
2218}
2219
2220/*
2221 * This is the actual release handler for blk-mq, but we do it from the
2222 * request queue's release handler to avoid use-after-free headaches:
2223 * q->mq_kobj shouldn't have been introduced, but we can't group the
2224 * ctx/hctx kobjects without it.
2225 */
2226void blk_mq_release(struct request_queue *q)
2227{
2228        struct blk_mq_hw_ctx *hctx;
2229        unsigned int i;
2230
2231        /* hctx kobj stays in hctx */
2232        queue_for_each_hw_ctx(q, hctx, i) {
2233                if (!hctx)
2234                        continue;
2235                kfree(hctx->ctxs);
2236                kfree(hctx);
2237        }
2238
2239        q->mq_map = NULL;
2240
2241        kfree(q->queue_hw_ctx);
2242
2243        /* ctx kobj stays in queue_ctx */
2244        free_percpu(q->queue_ctx);
2245}
2246
2247struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2248{
2249        struct request_queue *uninit_q, *q;
2250
2251        uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
2252        if (!uninit_q)
2253                return ERR_PTR(-ENOMEM);
2254
2255        q = blk_mq_init_allocated_queue(set, uninit_q);
2256        if (IS_ERR(q))
2257                blk_cleanup_queue(uninit_q);
2258
2259        return q;
2260}
2261EXPORT_SYMBOL(blk_mq_init_queue);
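
/*
 * Illustrative sketch of how a driver typically uses the API above. The
 * ops table and all my_* names are hypothetical, a real driver would also
 * set queue limits, flags and a gendisk, and error handling is trimmed to
 * the two calls shown.
 *
 *	static struct blk_mq_ops my_mq_ops = {
 *		.queue_rq	= my_queue_rq,	(returns BLK_MQ_RQ_QUEUE_*)
 *	};
 *
 *	static int my_driver_setup(struct my_dev *dev)
 *	{
 *		struct blk_mq_tag_set *set = &dev->tag_set;
 *		int ret;
 *
 *		memset(set, 0, sizeof(*set));
 *		set->ops = &my_mq_ops;
 *		set->nr_hw_queues = 1;
 *		set->queue_depth = 64;
 *		set->numa_node = NUMA_NO_NODE;
 *		set->cmd_size = sizeof(struct my_cmd);	(per-request payload)
 *		set->flags = BLK_MQ_F_SHOULD_MERGE;
 *
 *		ret = blk_mq_alloc_tag_set(set);
 *		if (ret)
 *			return ret;
 *
 *		dev->queue = blk_mq_init_queue(set);
 *		if (IS_ERR(dev->queue)) {
 *			blk_mq_free_tag_set(set);
 *			return PTR_ERR(dev->queue);
 *		}
 *		return 0;
 *	}
 */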
2262
2263static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2264                                                struct request_queue *q)
2265{
2266        int i, j;
2267        struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2268
2269        blk_mq_sysfs_unregister(q);
2270        for (i = 0; i < set->nr_hw_queues; i++) {
2271                int node;
2272
2273                if (hctxs[i])
2274                        continue;
2275
2276                node = blk_mq_hw_queue_to_node(q->mq_map, i);
2277                hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
2278                                        GFP_KERNEL, node);
2279                if (!hctxs[i])
2280                        break;
2281
2282                if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
2283                                                node)) {
2284                        kfree(hctxs[i]);
2285                        hctxs[i] = NULL;
2286                        break;
2287                }
2288
2289                atomic_set(&hctxs[i]->nr_active, 0);
2290                hctxs[i]->numa_node = node;
2291                hctxs[i]->queue_num = i;
2292
2293                if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2294                        free_cpumask_var(hctxs[i]->cpumask);
2295                        kfree(hctxs[i]);
2296                        hctxs[i] = NULL;
2297                        break;
2298                }
2299                blk_mq_hctx_kobj_init(hctxs[i]);
2300        }
2301        for (j = i; j < q->nr_hw_queues; j++) {
2302                struct blk_mq_hw_ctx *hctx = hctxs[j];
2303
2304                if (hctx) {
2305                        if (hctx->tags)
2306                                blk_mq_free_map_and_requests(set, j);
2307                        blk_mq_exit_hctx(q, set, hctx, j);
2308                        free_cpumask_var(hctx->cpumask);
2309                        kobject_put(&hctx->kobj);
2310                        kfree(hctx->ctxs);
2311                        kfree(hctx);
2312                        hctxs[j] = NULL;
2313
2314                }
2315        }
2316        q->nr_hw_queues = i;
2317        blk_mq_sysfs_register(q);
2318}
2319
2320struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2321                                                  struct request_queue *q)
2322{
2323        /* mark the queue as mq asap */
2324        q->mq_ops = set->ops;
2325
2326        q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
2327                                             blk_stat_rq_ddir, 2, q);
2328        if (!q->poll_cb)
2329                goto err_exit;
2330
2331        q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2332        if (!q->queue_ctx)
2333                goto err_exit;
2334
2335        q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
2336                                                GFP_KERNEL, set->numa_node);
2337        if (!q->queue_hw_ctx)
2338                goto err_percpu;
2339
2340        q->mq_map = set->mq_map;
2341
2342        blk_mq_realloc_hw_ctxs(set, q);
2343        if (!q->nr_hw_queues)
2344                goto err_hctxs;
2345
2346        INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2347        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2348
2349        q->nr_queues = nr_cpu_ids;
2350
2351        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2352
2353        if (!(set->flags & BLK_MQ_F_SG_MERGE))
2354                q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
2355
2356        q->sg_reserved_size = INT_MAX;
2357
2358        INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
2359        INIT_LIST_HEAD(&q->requeue_list);
2360        spin_lock_init(&q->requeue_lock);
2361
2362        blk_queue_make_request(q, blk_mq_make_request);
2363
2364        /*
2365         * Do this after blk_queue_make_request() overrides it...
2366         */
2367        q->nr_requests = set->queue_depth;
2368
2369        if (set->ops->complete)
2370                blk_queue_softirq_done(q, set->ops->complete);
2371
2372        blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2373
2374        get_online_cpus();
2375        mutex_lock(&all_q_mutex);
2376
2377        list_add_tail(&q->all_q_node, &all_q_list);
2378        blk_mq_add_queue_tag_set(set, q);
2379        blk_mq_map_swqueue(q, cpu_online_mask);
2380
2381        mutex_unlock(&all_q_mutex);
2382        put_online_cpus();
2383
2384        if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2385                int ret;
2386
2387                ret = blk_mq_sched_init(q);
2388                if (ret)
2389                        return ERR_PTR(ret);
2390        }
2391
2392        return q;
2393
2394err_hctxs:
2395        kfree(q->queue_hw_ctx);
2396err_percpu:
2397        free_percpu(q->queue_ctx);
2398err_exit:
2399        q->mq_ops = NULL;
2400        return ERR_PTR(-ENOMEM);
2401}
2402EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2403
2404void blk_mq_free_queue(struct request_queue *q)
2405{
2406        struct blk_mq_tag_set   *set = q->tag_set;
2407
2408        mutex_lock(&all_q_mutex);
2409        list_del_init(&q->all_q_node);
2410        mutex_unlock(&all_q_mutex);
2411
2412        blk_mq_del_queue_tag_set(q);
2413
2414        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2415        blk_mq_free_hw_queues(q, set);
2416}
2417
2418/* Basically redo blk_mq_init_queue with queue frozen */
2419static void blk_mq_queue_reinit(struct request_queue *q,
2420                                const struct cpumask *online_mask)
2421{
2422        WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2423
2424        blk_mq_debugfs_unregister_hctxs(q);
2425        blk_mq_sysfs_unregister(q);
2426
2427        /*
2428         * Redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2429         * we should change hctx numa_node according to the new topology (this
2430         * involves freeing and re-allocating memory; is it worth doing?)
2431         */
2432
2433        blk_mq_map_swqueue(q, online_mask);
2434
2435        blk_mq_sysfs_register(q);
2436        blk_mq_debugfs_register_hctxs(q);
2437}
2438
2439static void blk_mq_freeze_queue_list(struct list_head *list)
2440{
2441        struct request_queue *q;
2442
2443        /*
2444         * We need to freeze and reinit all existing queues.  Freezing
2445         * involves a synchronous wait for an RCU grace period, and doing it
2446         * one by one may take a long time.  Start freezing all queues in
2447         * one swoop and then wait for the completions so that freezing can
2448         * take place in parallel.
2449         */
2450        list_for_each_entry(q, list, all_q_node)
2451                blk_freeze_queue_start(q);
2452        list_for_each_entry(q, list, all_q_node) {
2453                blk_mq_freeze_queue_wait(q);
2454
2455                /*
2456                 * timeout handler can't touch hw queue during the
2457                 * reinitialization
2458                 */
2459                del_timer_sync(&q->timeout);
2460        }
2461}
2462
2463/*
2464 * When freezing queues in blk_mq_queue_reinit_notify(), we have to freeze
2465 * the queues on 'all_q_list' in order, to avoid IO deadlock:
2466 *
2467 * 1) A DM queue, or any other queue stacked on top of ordinary queues,
2468 * has to be frozen before its underlying queues; otherwise, once an
2469 * underlying queue is frozen, IO from the upper-layer queue can't be
2470 * drained, and blk_mq_freeze_queue_wait() will wait forever on that
2471 * kind of queue.
2472 *
2473 * 2) The NVMe admin queue is used in NVMe's reset handler, and the IO
2474 * queues are frozen and quiesced before resetting the controller. If any
2475 * IO is still pending before requests are sent to the admin queue, an IO
2476 * hang results because the admin queue may already have been frozen, so
2477 * the reset can't make progress, and blk_mq_freeze_queue_wait() waits
2478 * forever on the NVMe IO queue in blk_mq_queue_reinit_notify(). Avoid this
2479 * by freezing the admin queue after the NVMe namespace queues are frozen.
2480 */
2481static void __blk_mq_freeze_all_queue_list(void)
2482{
2483        struct request_queue *q, *next;
2484        LIST_HEAD(front);
2485        LIST_HEAD(tail);
2486
2487        list_for_each_entry_safe(q, next, &all_q_list, all_q_node) {
2488                if (q->front_queue)
2489                        list_move(&q->all_q_node, &front);
2490                else if (q->tail_queue)
2491                        list_move(&q->all_q_node, &tail);
2492        }
2493
2494        blk_mq_freeze_queue_list(&front);
2495        blk_mq_freeze_queue_list(&all_q_list);
2496        blk_mq_freeze_queue_list(&tail);
2497
2498        list_splice(&front, &all_q_list);
2499        list_splice_tail(&tail, &all_q_list);
2500}
2501
2502static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
2503                                      unsigned long action, void *hcpu)
2504{
2505        struct request_queue *q;
2506        int cpu = (unsigned long)hcpu;
2507        /*
2508         * New online cpumask which is going to be set in this hotplug event.
2509         * Declare this cpumask as static: cpu-hotplug operations are invoked
2510         * one by one, and dynamically allocating it could result in a failure.
2511         */
2512        static struct cpumask online_new;
2513
2514        /*
2515         * Before the hotadded cpu starts handling requests, new mappings must
2516         * be established.  Otherwise, requests in its hw queue might never
2517         * be dispatched.
2518         *
2519         * For example, suppose there is a single hw queue (hctx) and two CPU
2520         * queues (ctx0 for CPU0, and ctx1 for CPU1).
2521         *
2522         * Now CPU1 is just onlined and a request is inserted into
2523         * ctx1->rq_list, setting bit0 in the pending bitmap because
2524         * ctx1->index_hw is still zero.
2525         *
2526         * Then, while running the hw queue, blk_mq_flush_busy_ctxs() finds
2527         * bit0 set in the pending bitmap and tries to retrieve requests from
2528         * hctx->ctxs[0]->rq_list. But hctx->ctxs[0] is a pointer to ctx0, so
2529         * the request in ctx1->rq_list is ignored.
2530         */
2531        switch (action & ~CPU_TASKS_FROZEN) {
2532        case CPU_DEAD:
2533        case CPU_UP_CANCELED:
2534                cpumask_copy(&online_new, cpu_online_mask);
2535                break;
2536        case CPU_UP_PREPARE:
2537                cpumask_copy(&online_new, cpu_online_mask);
2538                cpumask_set_cpu(cpu, &online_new);
2539                break;
2540        default:
2541                return NOTIFY_OK;
2542        }
2543
2544        mutex_lock(&all_q_mutex);
2545
2546        __blk_mq_freeze_all_queue_list();
2547
2548        list_for_each_entry(q, &all_q_list, all_q_node)
2549                blk_mq_queue_reinit(q, &online_new);
2550
2551        list_for_each_entry(q, &all_q_list, all_q_node)
2552                blk_mq_unfreeze_queue(q);
2553
2554        mutex_unlock(&all_q_mutex);
2555        return NOTIFY_OK;
2556}
2557
2558static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2559{
2560        int i;
2561
2562        for (i = 0; i < set->nr_hw_queues; i++)
2563                if (!__blk_mq_alloc_rq_map(set, i))
2564                        goto out_unwind;
2565
2566        return 0;
2567
2568out_unwind:
2569        while (--i >= 0)
2570                blk_mq_free_rq_map(set->tags[i]);
2571
2572        return -ENOMEM;
2573}
2574
2575/*
2576 * Allocate the request maps associated with this tag_set. Note that this
2577 * may reduce the depth asked for, if memory is tight. set->queue_depth
2578 * will be updated to reflect the allocated depth.
2579 */
2580static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2581{
2582        unsigned int depth;
2583        int err;
2584
2585        depth = set->queue_depth;
2586        do {
2587                err = __blk_mq_alloc_rq_maps(set);
2588                if (!err)
2589                        break;
2590
2591                set->queue_depth >>= 1;
2592                if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2593                        err = -ENOMEM;
2594                        break;
2595                }
2596        } while (set->queue_depth);
2597
2598        if (!set->queue_depth || err) {
2599                pr_err("blk-mq: failed to allocate request map\n");
2600                return -ENOMEM;
2601        }
2602
2603        if (depth != set->queue_depth)
2604                pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2605                                                depth, set->queue_depth);
2606
2607        return 0;
2608}
2609
2610static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2611{
2612        if (set->ops->aux_ops && set->ops->aux_ops->map_queues)
2613                return set->ops->aux_ops->map_queues(set);
2614        else
2615                return blk_mq_map_queues(set);
2616}
2617
2618/*
2619 * Alloc a tag set to be associated with one or more request queues.
2620 * May fail with EINVAL for various error conditions. May adjust the
2621 * requested depth down, if it is too large. In that case, the actual
2622 * depth used will be stored in set->queue_depth.
2623 */
2624int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2625{
2626        int ret;
2627
2628        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2629
2630        if (!set->nr_hw_queues)
2631                return -EINVAL;
2632        if (!set->queue_depth)
2633                return -EINVAL;
2634        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2635                return -EINVAL;
2636
2637        if (!set->ops->queue_rq)
2638                return -EINVAL;
2639
2640        if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2641                pr_info("blk-mq: reduced tag depth to %u\n",
2642                        BLK_MQ_MAX_DEPTH);
2643                set->queue_depth = BLK_MQ_MAX_DEPTH;
2644        }
2645
2646        /*
2647         * If a crashdump is active, then we are potentially in a very
2648         * memory constrained environment. Limit us to 1 queue and
2649         * 64 tags to prevent using too much memory.
2650         */
2651        if (is_kdump_kernel()) {
2652                set->nr_hw_queues = 1;
2653                set->queue_depth = min(64U, set->queue_depth);
2654        }
2655        /*
2656         * There is no use for more h/w queues than cpus.
2657         */
2658        if (set->nr_hw_queues > nr_cpu_ids)
2659                set->nr_hw_queues = nr_cpu_ids;
2660
2661        set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
2662                                 GFP_KERNEL, set->numa_node);
2663        if (!set->tags)
2664                return -ENOMEM;
2665
2666        ret = -ENOMEM;
2667        set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
2668                        GFP_KERNEL, set->numa_node);
2669        if (!set->mq_map)
2670                goto out_free_tags;
2671
2672        ret = blk_mq_update_queue_map(set);
2673        if (ret)
2674                goto out_free_mq_map;
2675
2676        ret = blk_mq_alloc_rq_maps(set);
2677        if (ret)
2678                goto out_free_mq_map;
2679
2680        mutex_init(&set->tag_list_lock);
2681        INIT_LIST_HEAD(&set->tag_list);
2682
2683        return 0;
2684
2685out_free_mq_map:
2686        kfree(set->mq_map);
2687        set->mq_map = NULL;
2688out_free_tags:
2689        kfree(set->tags);
2690        set->tags = NULL;
2691        return ret;
2692}
2693EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2694
2695void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2696{
2697        int i;
2698
2699        for (i = 0; i < nr_cpu_ids; i++)
2700                blk_mq_free_map_and_requests(set, i);
2701
2702        kfree(set->mq_map);
2703        set->mq_map = NULL;
2704
2705        kfree(set->tags);
2706        set->tags = NULL;
2707}
2708EXPORT_SYMBOL(blk_mq_free_tag_set);
2709
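/*
 * Adjust the effective queue depth of a live queue. The queue is frozen
 * and quiesced around the update; depending on whether a scheduler is
 * attached, either the driver tags or the scheduler tags are resized.
 */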
2710int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2711{
2712        struct blk_mq_tag_set *set = q->tag_set;
2713        struct blk_mq_hw_ctx *hctx;
2714        int i, ret;
2715
2716        if (!set)
2717                return -EINVAL;
2718
2719        blk_mq_freeze_queue(q);
2720        blk_mq_quiesce_queue(q);
2721
2722        ret = 0;
2723        queue_for_each_hw_ctx(q, hctx, i) {
2724                if (!hctx->tags)
2725                        continue;
2726                /*
2727                 * If we're using an MQ scheduler, just update the scheduler
2728                 * queue depth. This is similar to what the old code would do.
2729                 */
2730                if (!hctx->sched_tags) {
2731                        ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
2732                                                        min(nr, set->queue_depth),
2733                                                        false);
2734                } else {
2735                        ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
2736                                                        nr, true);
2737                }
2738                if (ret)
2739                        break;
2740        }
2741
2742        if (!ret)
2743                q->nr_requests = nr;
2744
2745        blk_mq_unfreeze_queue(q);
2746        blk_mq_start_stopped_hw_queues(q, true);
2747
2748        return ret;
2749}
2750
2751static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
2752                                                        int nr_hw_queues)
2753{
2754        struct request_queue *q;
2755
2756        lockdep_assert_held(&set->tag_list_lock);
2757
2758        if (nr_hw_queues > nr_cpu_ids)
2759                nr_hw_queues = nr_cpu_ids;
2760        if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
2761                return;
2762
2763        list_for_each_entry(q, &set->tag_list, tag_set_list)
2764                blk_mq_freeze_queue(q);
2765
2766        set->nr_hw_queues = nr_hw_queues;
2767        blk_mq_update_queue_map(set);
2768        list_for_each_entry(q, &set->tag_list, tag_set_list) {
2769                blk_mq_realloc_hw_ctxs(set, q);
2770                blk_mq_queue_reinit(q, cpu_online_mask);
2771        }
2772
2773        list_for_each_entry(q, &set->tag_list, tag_set_list)
2774                blk_mq_unfreeze_queue(q);
2775}
2776
2777void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2778{
2779        mutex_lock(&set->tag_list_lock);
2780        __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
2781        mutex_unlock(&set->tag_list_lock);
2782}
2783EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
2784
2785static void blk_mq_poll_stats_start(struct request_queue *q)
2786{
2787        /*
2788         * We don't arm the callback if polling stats are not enabled or the
2789         * callback is already active.
2790         */
2791        if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
2792            blk_stat_is_active(q->poll_cb))
2793                return;
2794
2795        blk_stat_activate_msecs(q->poll_cb, 100);
2796}
2797
2798static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
2799{
2800        struct request_queue *q = cb->data;
2801
2802        if (cb->stat[READ].nr_samples)
2803                q->poll_stat[READ] = cb->stat[READ];
2804        if (cb->stat[WRITE].nr_samples)
2805                q->poll_stat[WRITE] = cb->stat[WRITE];
2806}
2807
2808void blk_mq_disable_hotplug(void)
2809{
2810        mutex_lock(&all_q_mutex);
2811}
2812
2813void blk_mq_enable_hotplug(void)
2814{
2815        mutex_unlock(&all_q_mutex);
2816}
2817
2818static int __init blk_mq_init(void)
2819{
2820        blk_mq_cpu_init();
2821
2822        hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
2823
2824        return 0;
2825}
2826subsys_initcall(blk_mq_init);
2827