linux/block/blk-mq.c
   1/*
   2 * Block multiqueue core code
   3 *
   4 * Copyright (C) 2013-2014 Jens Axboe
   5 * Copyright (C) 2013-2014 Christoph Hellwig
   6 */
   7#include <linux/kernel.h>
   8#include <linux/module.h>
   9#include <linux/backing-dev.h>
  10#include <linux/bio.h>
  11#include <linux/blkdev.h>
  12#include <linux/mm.h>
  13#include <linux/init.h>
  14#include <linux/slab.h>
  15#include <linux/workqueue.h>
  16#include <linux/smp.h>
  17#include <linux/llist.h>
  18#include <linux/list_sort.h>
  19#include <linux/cpu.h>
  20#include <linux/cache.h>
  21#include <linux/sched/sysctl.h>
  22#include <linux/delay.h>
  23#include <linux/crash_dump.h>
  24
  25#include <trace/events/block.h>
  26
  27#include <linux/blk-mq.h>
  28#include "blk.h"
  29#include "blk-mq.h"
  30#include "blk-mq-tag.h"
  31
  32static DEFINE_MUTEX(all_q_mutex);
  33static LIST_HEAD(all_q_list);
  34
  35static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
  36
  37/*
  38 * Check if any of the ctx's have pending work in this hardware queue
  39 */
  40static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  41{
  42        unsigned int i;
  43
  44        for (i = 0; i < hctx->ctx_map.size; i++)
  45                if (hctx->ctx_map.map[i].word)
  46                        return true;
  47
  48        return false;
  49}
  50
  51static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
  52                                              struct blk_mq_ctx *ctx)
  53{
  54        return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
  55}
  56
  57#define CTX_TO_BIT(hctx, ctx)   \
  58        ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
  59
  60/*
  61 * Mark this ctx as having pending work in this hardware queue
  62 */
  63static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  64                                     struct blk_mq_ctx *ctx)
  65{
  66        struct blk_align_bitmap *bm = get_bm(hctx, ctx);
  67
  68        if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
  69                set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
  70}
  71
  72static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
  73                                      struct blk_mq_ctx *ctx)
  74{
  75        struct blk_align_bitmap *bm = get_bm(hctx, ctx);
  76
  77        clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
  78}
  79
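     /*
      * Grab a reference on the queue's usage counter. Without __GFP_WAIT we
      * return -EBUSY if the queue is frozen; otherwise we sleep until the
      * freeze is lifted or the queue is marked dying.
      */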
  80static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
  81{
  82        while (true) {
  83                int ret;
  84
  85                if (percpu_ref_tryget_live(&q->mq_usage_counter))
  86                        return 0;
  87
  88                if (!(gfp & __GFP_WAIT))
  89                        return -EBUSY;
  90
  91                ret = wait_event_interruptible(q->mq_freeze_wq,
  92                                !atomic_read(&q->mq_freeze_depth) ||
  93                                blk_queue_dying(q));
  94                if (blk_queue_dying(q))
  95                        return -ENODEV;
  96                if (ret)
  97                        return ret;
  98        }
  99}
 100
 101static void blk_mq_queue_exit(struct request_queue *q)
 102{
 103        percpu_ref_put(&q->mq_usage_counter);
 104}
 105
 106static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 107{
 108        struct request_queue *q =
 109                container_of(ref, struct request_queue, mq_usage_counter);
 110
 111        wake_up_all(&q->mq_freeze_wq);
 112}
 113
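     /*
      * Start freezing the queue: on the first freeze, kill the usage counter
      * so new blk_mq_queue_enter() callers block, then run the hardware
      * queues to flush out pending work.
      */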
 114void blk_mq_freeze_queue_start(struct request_queue *q)
 115{
 116        int freeze_depth;
 117
 118        freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
 119        if (freeze_depth == 1) {
 120                percpu_ref_kill(&q->mq_usage_counter);
 121                blk_mq_run_hw_queues(q, false);
 122        }
 123}
 124EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
 125
 126static void blk_mq_freeze_queue_wait(struct request_queue *q)
 127{
 128        wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 129}
 130
 131/*
 132 * Guarantee no request is in use, so we can change any data structure of
 133 * the queue afterward.
 134 */
 135void blk_mq_freeze_queue(struct request_queue *q)
 136{
 137        blk_mq_freeze_queue_start(q);
 138        blk_mq_freeze_queue_wait(q);
 139}
 140EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 141
 142void blk_mq_unfreeze_queue(struct request_queue *q)
 143{
 144        int freeze_depth;
 145
 146        freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
 147        WARN_ON_ONCE(freeze_depth < 0);
 148        if (!freeze_depth) {
 149                percpu_ref_reinit(&q->mq_usage_counter);
 150                wake_up_all(&q->mq_freeze_wq);
 151        }
 152}
 153EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 154
 155void blk_mq_wake_waiters(struct request_queue *q)
 156{
 157        struct blk_mq_hw_ctx *hctx;
 158        unsigned int i;
 159
 160        queue_for_each_hw_ctx(q, hctx, i)
 161                if (blk_mq_hw_queue_mapped(hctx))
 162                        blk_mq_tag_wakeup_all(hctx->tags, true);
 163
 164        /*
 165         * If we are called because the queue has now been marked as
 166         * dying, we need to ensure that processes currently waiting on
 167         * the queue are notified as well.
 168         */
 169        wake_up_all(&q->mq_freeze_wq);
 170}
 171
 172bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 173{
 174        return blk_mq_has_free_tags(hctx->tags);
 175}
 176EXPORT_SYMBOL(blk_mq_can_queue);
 177
 178static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 179                               struct request *rq, unsigned int rw_flags)
 180{
 181        if (blk_queue_io_stat(q))
 182                rw_flags |= REQ_IO_STAT;
 183
 184        INIT_LIST_HEAD(&rq->queuelist);
 185        /* csd/requeue_work/fifo_time is initialized before use */
 186        rq->q = q;
 187        rq->mq_ctx = ctx;
 188        rq->cmd_flags |= rw_flags;
 189        /* do not touch atomic flags, it needs atomic ops against the timer */
 190        rq->cpu = -1;
 191        INIT_HLIST_NODE(&rq->hash);
 192        RB_CLEAR_NODE(&rq->rb_node);
 193        rq->rq_disk = NULL;
 194        rq->part = NULL;
 195        rq->start_time = jiffies;
 196#ifdef CONFIG_BLK_CGROUP
 197        rq->rl = NULL;
 198        set_start_time_ns(rq);
 199        rq->io_start_time_ns = 0;
 200#endif
 201        rq->nr_phys_segments = 0;
 202#if defined(CONFIG_BLK_DEV_INTEGRITY)
 203        rq->nr_integrity_segments = 0;
 204#endif
 205        rq->special = NULL;
 206        /* tag was already set */
 207        rq->errors = 0;
 208
 209        rq->cmd = rq->__cmd;
 210
 211        rq->extra_len = 0;
 212        rq->sense_len = 0;
 213        rq->resid_len = 0;
 214        rq->sense = NULL;
 215
 216        INIT_LIST_HEAD(&rq->timeout_list);
 217        rq->timeout = 0;
 218
 219        rq->end_io = NULL;
 220        rq->end_io_data = NULL;
 221        rq->next_rq = NULL;
 222
 223        ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
 224}
 225
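     /*
      * Try to grab a tag for this allocation. On success, initialize the
      * request that backs the tag and return it; otherwise return NULL.
      */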
 226static struct request *
 227__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
 228{
 229        struct request *rq;
 230        unsigned int tag;
 231
 232        tag = blk_mq_get_tag(data);
 233        if (tag != BLK_MQ_TAG_FAIL) {
 234                rq = data->hctx->tags->rqs[tag];
 235
 236                if (blk_mq_tag_busy(data->hctx)) {
 237                        rq->cmd_flags = REQ_MQ_INFLIGHT;
 238                        atomic_inc(&data->hctx->nr_active);
 239                }
 240
 241                rq->tag = tag;
 242                blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
 243                return rq;
 244        }
 245
 246        return NULL;
 247}
 248
 249struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 250                bool reserved)
 251{
 252        struct blk_mq_ctx *ctx;
 253        struct blk_mq_hw_ctx *hctx;
 254        struct request *rq;
 255        struct blk_mq_alloc_data alloc_data;
 256        int ret;
 257
 258        ret = blk_mq_queue_enter(q, gfp);
 259        if (ret)
 260                return ERR_PTR(ret);
 261
 262        ctx = blk_mq_get_ctx(q);
 263        hctx = q->mq_ops->map_queue(q, ctx->cpu);
 264        blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
 265                        reserved, ctx, hctx);
 266
 267        rq = __blk_mq_alloc_request(&alloc_data, rw);
 268        if (!rq && (gfp & __GFP_WAIT)) {
 269                __blk_mq_run_hw_queue(hctx);
 270                blk_mq_put_ctx(ctx);
 271
 272                ctx = blk_mq_get_ctx(q);
 273                hctx = q->mq_ops->map_queue(q, ctx->cpu);
 274                blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
 275                                hctx);
  276                rq = __blk_mq_alloc_request(&alloc_data, rw);
 277                ctx = alloc_data.ctx;
 278        }
 279        blk_mq_put_ctx(ctx);
 280        if (!rq) {
 281                blk_mq_queue_exit(q);
 282                return ERR_PTR(-EWOULDBLOCK);
 283        }
 284        return rq;
 285}
 286EXPORT_SYMBOL(blk_mq_alloc_request);
 287
 288static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 289                                  struct blk_mq_ctx *ctx, struct request *rq)
 290{
 291        const int tag = rq->tag;
 292        struct request_queue *q = rq->q;
 293
 294        if (rq->cmd_flags & REQ_MQ_INFLIGHT)
 295                atomic_dec(&hctx->nr_active);
 296        rq->cmd_flags = 0;
 297
 298        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 299        blk_mq_put_tag(hctx, tag, &ctx->last_tag);
 300        blk_mq_queue_exit(q);
 301}
 302
 303void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 304{
 305        struct blk_mq_ctx *ctx = rq->mq_ctx;
 306
 307        ctx->rq_completed[rq_is_sync(rq)]++;
 308        __blk_mq_free_request(hctx, ctx, rq);
 309
 310}
 311EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
 312
 313void blk_mq_free_request(struct request *rq)
 314{
 315        struct blk_mq_hw_ctx *hctx;
 316        struct request_queue *q = rq->q;
 317
 318        hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
 319        blk_mq_free_hctx_request(hctx, rq);
 320}
 321EXPORT_SYMBOL_GPL(blk_mq_free_request);
 322
 323inline void __blk_mq_end_request(struct request *rq, int error)
 324{
 325        blk_account_io_done(rq);
 326
 327        if (rq->end_io) {
 328                rq->end_io(rq, error);
 329        } else {
 330                if (unlikely(blk_bidi_rq(rq)))
 331                        blk_mq_free_request(rq->next_rq);
 332                blk_mq_free_request(rq);
 333        }
 334}
 335EXPORT_SYMBOL(__blk_mq_end_request);
 336
 337void blk_mq_end_request(struct request *rq, int error)
 338{
 339        if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 340                BUG();
 341        __blk_mq_end_request(rq, error);
 342}
 343EXPORT_SYMBOL(blk_mq_end_request);
 344
 345static void __blk_mq_complete_request_remote(void *data)
 346{
 347        struct request *rq = data;
 348
 349        rq->q->softirq_done_fn(rq);
 350}
 351
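     /*
      * Complete the request on the CPU that submitted it. If that CPU does
      * not share a cache with the current one (and QUEUE_FLAG_SAME_COMP is
      * set), punt the completion over via an IPI; otherwise run the
      * softirq_done_fn locally.
      */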
 352static void blk_mq_ipi_complete_request(struct request *rq)
 353{
 354        struct blk_mq_ctx *ctx = rq->mq_ctx;
 355        bool shared = false;
 356        int cpu;
 357
 358        if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
 359                rq->q->softirq_done_fn(rq);
 360                return;
 361        }
 362
 363        cpu = get_cpu();
 364        if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
 365                shared = cpus_share_cache(cpu, ctx->cpu);
 366
 367        if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 368                rq->csd.func = __blk_mq_complete_request_remote;
 369                rq->csd.info = rq;
 370                rq->csd.flags = 0;
 371                smp_call_function_single_async(ctx->cpu, &rq->csd);
 372        } else {
 373                rq->q->softirq_done_fn(rq);
 374        }
 375        put_cpu();
 376}
 377
 378void __blk_mq_complete_request(struct request *rq)
 379{
 380        struct request_queue *q = rq->q;
 381
 382        if (!q->softirq_done_fn)
 383                blk_mq_end_request(rq, rq->errors);
 384        else
 385                blk_mq_ipi_complete_request(rq);
 386}
 387
 388/**
 389 * blk_mq_complete_request - end I/O on a request
  390 * @rq:         the request being processed
      * @error:      completion status, %0 for success
  391 *
  392 * Description:
  393 *      Ends all I/O on a request. It does not handle partial completions.
  394 *      The actual completion happens out-of-order, through an IPI handler.
  395 **/
 396void blk_mq_complete_request(struct request *rq, int error)
 397{
 398        struct request_queue *q = rq->q;
 399
 400        if (unlikely(blk_should_fake_timeout(q)))
 401                return;
 402        if (!blk_mark_rq_complete(rq)) {
 403                rq->errors = error;
 404                __blk_mq_complete_request(rq);
 405        }
 406}
 407EXPORT_SYMBOL(blk_mq_complete_request);
 408
 409int blk_mq_request_started(struct request *rq)
 410{
 411        return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 412}
 413EXPORT_SYMBOL_GPL(blk_mq_request_started);
 414
 415void blk_mq_start_request(struct request *rq)
 416{
 417        struct request_queue *q = rq->q;
 418
 419        trace_block_rq_issue(q, rq);
 420
 421        rq->resid_len = blk_rq_bytes(rq);
 422        if (unlikely(blk_bidi_rq(rq)))
 423                rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 424
 425        blk_add_timer(rq);
 426
 427        /*
  428         * Ensure that ->deadline is visible before we set the started
  429         * flag and clear the completed flag.
 430         */
 431        smp_mb__before_atomic();
 432
 433        /*
 434         * Mark us as started and clear complete. Complete might have been
 435         * set if requeue raced with timeout, which then marked it as
 436         * complete. So be sure to clear complete again when we start
 437         * the request, otherwise we'll ignore the completion event.
 438         */
 439        if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 440                set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 441        if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
 442                clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 443
 444        if (q->dma_drain_size && blk_rq_bytes(rq)) {
 445                /*
 446                 * Make sure space for the drain appears.  We know we can do
 447                 * this because max_hw_segments has been adjusted to be one
 448                 * fewer than the device can handle.
 449                 */
 450                rq->nr_phys_segments++;
 451        }
 452}
 453EXPORT_SYMBOL(blk_mq_start_request);
 454
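     /*
      * Undo blk_mq_start_request(): clear the STARTED flag and drop the
      * drain segment we may have added, so the request can be issued again.
      */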
 455static void __blk_mq_requeue_request(struct request *rq)
 456{
 457        struct request_queue *q = rq->q;
 458
 459        trace_block_rq_requeue(q, rq);
 460
 461        if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 462                if (q->dma_drain_size && blk_rq_bytes(rq))
 463                        rq->nr_phys_segments--;
 464        }
 465}
 466
 467void blk_mq_requeue_request(struct request *rq)
 468{
 469        __blk_mq_requeue_request(rq);
 470
 471        BUG_ON(blk_queued_rq(rq));
 472        blk_mq_add_to_requeue_list(rq, true);
 473}
 474EXPORT_SYMBOL(blk_mq_requeue_request);
 475
 476static void blk_mq_requeue_work(struct work_struct *work)
 477{
 478        struct request_queue *q =
 479                container_of(work, struct request_queue, requeue_work);
 480        LIST_HEAD(rq_list);
 481        struct request *rq, *next;
 482        unsigned long flags;
 483
 484        spin_lock_irqsave(&q->requeue_lock, flags);
 485        list_splice_init(&q->requeue_list, &rq_list);
 486        spin_unlock_irqrestore(&q->requeue_lock, flags);
 487
 488        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 489                if (!(rq->cmd_flags & REQ_SOFTBARRIER))
 490                        continue;
 491
 492                rq->cmd_flags &= ~REQ_SOFTBARRIER;
 493                list_del_init(&rq->queuelist);
 494                blk_mq_insert_request(rq, true, false, false);
 495        }
 496
 497        while (!list_empty(&rq_list)) {
 498                rq = list_entry(rq_list.next, struct request, queuelist);
 499                list_del_init(&rq->queuelist);
 500                blk_mq_insert_request(rq, false, false, false);
 501        }
 502
 503        /*
 504         * Use the start variant of queue running here, so that running
 505         * the requeue work will kick stopped queues.
 506         */
 507        blk_mq_start_hw_queues(q);
 508}
 509
 510void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
 511{
 512        struct request_queue *q = rq->q;
 513        unsigned long flags;
 514
 515        /*
  516         * We abuse this flag, which is otherwise used by the I/O scheduler,
  517         * to request head insertion from the workqueue.
 518         */
 519        BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
 520
 521        spin_lock_irqsave(&q->requeue_lock, flags);
 522        if (at_head) {
 523                rq->cmd_flags |= REQ_SOFTBARRIER;
 524                list_add(&rq->queuelist, &q->requeue_list);
 525        } else {
 526                list_add_tail(&rq->queuelist, &q->requeue_list);
 527        }
 528        spin_unlock_irqrestore(&q->requeue_lock, flags);
 529}
 530EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 531
 532void blk_mq_cancel_requeue_work(struct request_queue *q)
 533{
 534        cancel_work_sync(&q->requeue_work);
 535}
 536EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
 537
 538void blk_mq_kick_requeue_list(struct request_queue *q)
 539{
 540        kblockd_schedule_work(&q->requeue_work);
 541}
 542EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 543
 544void blk_mq_abort_requeue_list(struct request_queue *q)
 545{
 546        unsigned long flags;
 547        LIST_HEAD(rq_list);
 548
 549        spin_lock_irqsave(&q->requeue_lock, flags);
 550        list_splice_init(&q->requeue_list, &rq_list);
 551        spin_unlock_irqrestore(&q->requeue_lock, flags);
 552
 553        while (!list_empty(&rq_list)) {
 554                struct request *rq;
 555
 556                rq = list_first_entry(&rq_list, struct request, queuelist);
 557                list_del_init(&rq->queuelist);
 558                rq->errors = -EIO;
 559                blk_mq_end_request(rq, rq->errors);
 560        }
 561}
 562EXPORT_SYMBOL(blk_mq_abort_requeue_list);
 563
 564struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 565{
 566        return tags->rqs[tag];
 567}
 568EXPORT_SYMBOL(blk_mq_tag_to_rq);
 569
 570struct blk_mq_timeout_data {
 571        unsigned long next;
 572        unsigned int next_set;
 573};
 574
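     /*
      * A request on this queue has timed out. Ask the driver's ->timeout
      * handler what to do: complete it, rearm the timer, or leave it alone.
      */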
 575void blk_mq_rq_timed_out(struct request *req, bool reserved)
 576{
 577        struct blk_mq_ops *ops = req->q->mq_ops;
 578        enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 579
 580        /*
 581         * We know that complete is set at this point. If STARTED isn't set
 582         * anymore, then the request isn't active and the "timeout" should
 583         * just be ignored. This can happen due to the bitflag ordering.
 584         * Timeout first checks if STARTED is set, and if it is, assumes
 585         * the request is active. But if we race with completion, then
  586         * both flags will get cleared. So check here again, and ignore
 587         * a timeout event with a request that isn't active.
 588         */
 589        if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
 590                return;
 591
 592        if (ops->timeout)
 593                ret = ops->timeout(req, reserved);
 594
 595        switch (ret) {
 596        case BLK_EH_HANDLED:
 597                __blk_mq_complete_request(req);
 598                break;
 599        case BLK_EH_RESET_TIMER:
 600                blk_add_timer(req);
 601                blk_clear_rq_complete(req);
 602                break;
 603        case BLK_EH_NOT_HANDLED:
 604                break;
 605        default:
 606                printk(KERN_ERR "block: bad eh return: %d\n", ret);
 607                break;
 608        }
 609}
 610
 611static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 612                struct request *rq, void *priv, bool reserved)
 613{
 614        struct blk_mq_timeout_data *data = priv;
 615
 616        if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 617                /*
 618                 * If a request wasn't started before the queue was
 619                 * marked dying, kill it here or it'll go unnoticed.
 620                 */
 621                if (unlikely(blk_queue_dying(rq->q)))
 622                        blk_mq_complete_request(rq, -EIO);
 623                return;
 624        }
 625        if (rq->cmd_flags & REQ_NO_TIMEOUT)
 626                return;
 627
 628        if (time_after_eq(jiffies, rq->deadline)) {
 629                if (!blk_mark_rq_complete(rq))
 630                        blk_mq_rq_timed_out(rq, reserved);
 631        } else if (!data->next_set || time_after(data->next, rq->deadline)) {
 632                data->next = rq->deadline;
 633                data->next_set = 1;
 634        }
 635}
 636
 637static void blk_mq_rq_timer(unsigned long priv)
 638{
 639        struct request_queue *q = (struct request_queue *)priv;
 640        struct blk_mq_timeout_data data = {
 641                .next           = 0,
 642                .next_set       = 0,
 643        };
 644        int i;
 645
 646        blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 647
 648        if (data.next_set) {
 649                data.next = blk_rq_timeout(round_jiffies_up(data.next));
 650                mod_timer(&q->timeout, data.next);
 651        } else {
 652                struct blk_mq_hw_ctx *hctx;
 653
 654                queue_for_each_hw_ctx(q, hctx, i) {
 655                        /* the hctx may be unmapped, so check it here */
 656                        if (blk_mq_hw_queue_mapped(hctx))
 657                                blk_mq_tag_idle(hctx);
 658                }
 659        }
 660}
 661
 662/*
 663 * Reverse check our software queue for entries that we could potentially
 664 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 665 * too much time checking for merges.
 666 */
 667static bool blk_mq_attempt_merge(struct request_queue *q,
 668                                 struct blk_mq_ctx *ctx, struct bio *bio)
 669{
 670        struct request *rq;
 671        int checked = 8;
 672
 673        list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
 674                int el_ret;
 675
 676                if (!checked--)
 677                        break;
 678
 679                if (!blk_rq_merge_ok(rq, bio))
 680                        continue;
 681
 682                el_ret = blk_try_merge(rq, bio);
 683                if (el_ret == ELEVATOR_BACK_MERGE) {
 684                        if (bio_attempt_back_merge(q, rq, bio)) {
 685                                ctx->rq_merged++;
 686                                return true;
 687                        }
 688                        break;
 689                } else if (el_ret == ELEVATOR_FRONT_MERGE) {
 690                        if (bio_attempt_front_merge(q, rq, bio)) {
 691                                ctx->rq_merged++;
 692                                return true;
 693                        }
 694                        break;
 695                }
 696        }
 697
 698        return false;
 699}
 700
 701/*
  702 * Process software queues that have been marked busy, splicing them
  703 * onto the for-dispatch list.
 704 */
 705static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 706{
 707        struct blk_mq_ctx *ctx;
 708        int i;
 709
 710        for (i = 0; i < hctx->ctx_map.size; i++) {
 711                struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
 712                unsigned int off, bit;
 713
 714                if (!bm->word)
 715                        continue;
 716
 717                bit = 0;
 718                off = i * hctx->ctx_map.bits_per_word;
 719                do {
 720                        bit = find_next_bit(&bm->word, bm->depth, bit);
 721                        if (bit >= bm->depth)
 722                                break;
 723
 724                        ctx = hctx->ctxs[bit + off];
 725                        clear_bit(bit, &bm->word);
 726                        spin_lock(&ctx->lock);
 727                        list_splice_tail_init(&ctx->rq_list, list);
 728                        spin_unlock(&ctx->lock);
 729
 730                        bit++;
 731                } while (1);
 732        }
 733}
 734
 735/*
 736 * Run this hardware queue, pulling any software queues mapped to it in.
 737 * Note that this function currently has various problems around ordering
 738 * of IO. In particular, we'd like FIFO behaviour on handling existing
 739 * items on the hctx->dispatch list. Ignore that for now.
 740 */
 741static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 742{
 743        struct request_queue *q = hctx->queue;
 744        struct request *rq;
 745        LIST_HEAD(rq_list);
 746        LIST_HEAD(driver_list);
 747        struct list_head *dptr;
 748        int queued;
 749
 750        WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
 751
 752        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 753                return;
 754
 755        hctx->run++;
 756
 757        /*
 758         * Touch any software queue that has pending entries.
 759         */
 760        flush_busy_ctxs(hctx, &rq_list);
 761
 762        /*
 763         * If we have previous entries on our dispatch list, grab them
 764         * and stuff them at the front for more fair dispatch.
 765         */
 766        if (!list_empty_careful(&hctx->dispatch)) {
 767                spin_lock(&hctx->lock);
 768                if (!list_empty(&hctx->dispatch))
 769                        list_splice_init(&hctx->dispatch, &rq_list);
 770                spin_unlock(&hctx->lock);
 771        }
 772
 773        /*
 774         * Start off with dptr being NULL, so we start the first request
 775         * immediately, even if we have more pending.
 776         */
 777        dptr = NULL;
 778
 779        /*
 780         * Now process all the entries, sending them to the driver.
 781         */
 782        queued = 0;
 783        while (!list_empty(&rq_list)) {
 784                struct blk_mq_queue_data bd;
 785                int ret;
 786
 787                rq = list_first_entry(&rq_list, struct request, queuelist);
 788                list_del_init(&rq->queuelist);
 789
 790                bd.rq = rq;
 791                bd.list = dptr;
 792                bd.last = list_empty(&rq_list);
 793
 794                ret = q->mq_ops->queue_rq(hctx, &bd);
 795                switch (ret) {
 796                case BLK_MQ_RQ_QUEUE_OK:
 797                        queued++;
 798                        continue;
 799                case BLK_MQ_RQ_QUEUE_BUSY:
 800                        list_add(&rq->queuelist, &rq_list);
 801                        __blk_mq_requeue_request(rq);
 802                        break;
 803                default:
 804                        pr_err("blk-mq: bad return on queue: %d\n", ret);
 805                case BLK_MQ_RQ_QUEUE_ERROR:
 806                        rq->errors = -EIO;
 807                        blk_mq_end_request(rq, rq->errors);
 808                        break;
 809                }
 810
 811                if (ret == BLK_MQ_RQ_QUEUE_BUSY)
 812                        break;
 813
 814                /*
 815                 * We've done the first request. If we have more than 1
 816                 * left in the list, set dptr to defer issue.
 817                 */
 818                if (!dptr && rq_list.next != rq_list.prev)
 819                        dptr = &driver_list;
 820        }
 821
 822        if (!queued)
 823                hctx->dispatched[0]++;
 824        else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
 825                hctx->dispatched[ilog2(queued) + 1]++;
 826
 827        /*
 828         * Any items that need requeuing? Stuff them into hctx->dispatch,
 829         * that is where we will continue on next queue run.
 830         */
 831        if (!list_empty(&rq_list)) {
 832                spin_lock(&hctx->lock);
 833                list_splice(&rq_list, &hctx->dispatch);
 834                spin_unlock(&hctx->lock);
 835                /*
  836                 * The queue is expected to be stopped when BLK_MQ_RQ_QUEUE_BUSY
  837                 * is returned, but it's possible the queue was stopped and
  838                 * restarted again before we get here. A queue restart will
  839                 * dispatch requests, and since the requests in rq_list haven't
  840                 * been added to hctx->dispatch yet, they could get lost.
  841                 *
  842                 * blk_mq_run_hw_queue() already checks the STOPPED bit.
  843                 */
 844                blk_mq_run_hw_queue(hctx, true);
 845        }
 846}
 847
 848/*
 849 * It'd be great if the workqueue API had a way to pass
 850 * in a mask and had some smarts for more clever placement.
 851 * For now we just round-robin here, switching for every
 852 * BLK_MQ_CPU_WORK_BATCH queued items.
 853 */
 854static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 855{
 856        if (hctx->queue->nr_hw_queues == 1)
 857                return WORK_CPU_UNBOUND;
 858
 859        if (--hctx->next_cpu_batch <= 0) {
 860                int cpu = hctx->next_cpu, next_cpu;
 861
 862                next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
 863                if (next_cpu >= nr_cpu_ids)
 864                        next_cpu = cpumask_first(hctx->cpumask);
 865
 866                hctx->next_cpu = next_cpu;
 867                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 868
 869                return cpu;
 870        }
 871
 872        return hctx->next_cpu;
 873}
 874
 875void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 876{
 877        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
 878            !blk_mq_hw_queue_mapped(hctx)))
 879                return;
 880
 881        if (!async) {
 882                int cpu = get_cpu();
 883                if (cpumask_test_cpu(cpu, hctx->cpumask)) {
 884                        __blk_mq_run_hw_queue(hctx);
 885                        put_cpu();
 886                        return;
 887                }
 888
 889                put_cpu();
 890        }
 891
 892        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
 893                        &hctx->run_work, 0);
 894}
 895
 896void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 897{
 898        struct blk_mq_hw_ctx *hctx;
 899        int i;
 900
 901        queue_for_each_hw_ctx(q, hctx, i) {
 902                if ((!blk_mq_hctx_has_pending(hctx) &&
 903                    list_empty_careful(&hctx->dispatch)) ||
 904                    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 905                        continue;
 906
 907                blk_mq_run_hw_queue(hctx, async);
 908        }
 909}
 910EXPORT_SYMBOL(blk_mq_run_hw_queues);
 911
 912void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 913{
 914        cancel_delayed_work(&hctx->run_work);
 915        cancel_delayed_work(&hctx->delay_work);
 916        set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 917}
 918EXPORT_SYMBOL(blk_mq_stop_hw_queue);
 919
 920void blk_mq_stop_hw_queues(struct request_queue *q)
 921{
 922        struct blk_mq_hw_ctx *hctx;
 923        int i;
 924
 925        queue_for_each_hw_ctx(q, hctx, i)
 926                blk_mq_stop_hw_queue(hctx);
 927}
 928EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 929
 930void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 931{
 932        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 933
 934        blk_mq_run_hw_queue(hctx, false);
 935}
 936EXPORT_SYMBOL(blk_mq_start_hw_queue);
 937
 938void blk_mq_start_hw_queues(struct request_queue *q)
 939{
 940        struct blk_mq_hw_ctx *hctx;
 941        int i;
 942
 943        queue_for_each_hw_ctx(q, hctx, i)
 944                blk_mq_start_hw_queue(hctx);
 945}
 946EXPORT_SYMBOL(blk_mq_start_hw_queues);
 947
 948void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 949{
 950        struct blk_mq_hw_ctx *hctx;
 951        int i;
 952
 953        queue_for_each_hw_ctx(q, hctx, i) {
 954                if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 955                        continue;
 956
 957                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 958                blk_mq_run_hw_queue(hctx, async);
 959        }
 960}
 961EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
 962
 963static void blk_mq_run_work_fn(struct work_struct *work)
 964{
 965        struct blk_mq_hw_ctx *hctx;
 966
 967        hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
 968
 969        __blk_mq_run_hw_queue(hctx);
 970}
 971
 972static void blk_mq_delay_work_fn(struct work_struct *work)
 973{
 974        struct blk_mq_hw_ctx *hctx;
 975
 976        hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
 977
 978        if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
 979                __blk_mq_run_hw_queue(hctx);
 980}
 981
 982void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 983{
 984        if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
 985                return;
 986
 987        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
 988                        &hctx->delay_work, msecs_to_jiffies(msecs));
 989}
 990EXPORT_SYMBOL(blk_mq_delay_queue);
 991
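     /*
      * Queue the request on its software context (ctx->rq_list) and mark the
      * hardware context as having pending work. Caller holds ctx->lock.
      */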
 992static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 993                                    struct request *rq, bool at_head)
 994{
 995        struct blk_mq_ctx *ctx = rq->mq_ctx;
 996
 997        trace_block_rq_insert(hctx->queue, rq);
 998
 999        if (at_head)
1000                list_add(&rq->queuelist, &ctx->rq_list);
1001        else
1002                list_add_tail(&rq->queuelist, &ctx->rq_list);
1003
1004        blk_mq_hctx_mark_pending(hctx, ctx);
1005}
1006
1007void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
1008                bool async)
1009{
1010        struct request_queue *q = rq->q;
1011        struct blk_mq_hw_ctx *hctx;
1012        struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
1013
1014        current_ctx = blk_mq_get_ctx(q);
1015        if (!cpu_online(ctx->cpu))
1016                rq->mq_ctx = ctx = current_ctx;
1017
1018        hctx = q->mq_ops->map_queue(q, ctx->cpu);
1019
1020        spin_lock(&ctx->lock);
1021        __blk_mq_insert_request(hctx, rq, at_head);
1022        spin_unlock(&ctx->lock);
1023
1024        if (run_queue)
1025                blk_mq_run_hw_queue(hctx, async);
1026
1027        blk_mq_put_ctx(current_ctx);
1028}
1029
1030static void blk_mq_insert_requests(struct request_queue *q,
1031                                     struct blk_mq_ctx *ctx,
1032                                     struct list_head *list,
1033                                     int depth,
1034                                     bool from_schedule)
1035
1036{
1037        struct blk_mq_hw_ctx *hctx;
1038        struct blk_mq_ctx *current_ctx;
1039
1040        trace_block_unplug(q, depth, !from_schedule);
1041
1042        current_ctx = blk_mq_get_ctx(q);
1043
1044        if (!cpu_online(ctx->cpu))
1045                ctx = current_ctx;
1046        hctx = q->mq_ops->map_queue(q, ctx->cpu);
1047
1048        /*
 1049         * Preemption doesn't flush the plug list, so it's possible that
 1050         * ctx->cpu is offline by now.
1051         */
1052        spin_lock(&ctx->lock);
1053        while (!list_empty(list)) {
1054                struct request *rq;
1055
1056                rq = list_first_entry(list, struct request, queuelist);
1057                list_del_init(&rq->queuelist);
1058                rq->mq_ctx = ctx;
1059                __blk_mq_insert_request(hctx, rq, false);
1060        }
1061        spin_unlock(&ctx->lock);
1062
1063        blk_mq_run_hw_queue(hctx, from_schedule);
1064        blk_mq_put_ctx(current_ctx);
1065}
1066
1067static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1068{
1069        struct request *rqa = container_of(a, struct request, queuelist);
1070        struct request *rqb = container_of(b, struct request, queuelist);
1071
1072        return !(rqa->mq_ctx < rqb->mq_ctx ||
1073                 (rqa->mq_ctx == rqb->mq_ctx &&
1074                  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1075}
1076
1077void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1078{
1079        struct blk_mq_ctx *this_ctx;
1080        struct request_queue *this_q;
1081        struct request *rq;
1082        LIST_HEAD(list);
1083        LIST_HEAD(ctx_list);
1084        unsigned int depth;
1085
1086        list_splice_init(&plug->mq_list, &list);
1087
1088        list_sort(NULL, &list, plug_ctx_cmp);
1089
1090        this_q = NULL;
1091        this_ctx = NULL;
1092        depth = 0;
1093
1094        while (!list_empty(&list)) {
1095                rq = list_entry_rq(list.next);
1096                list_del_init(&rq->queuelist);
1097                BUG_ON(!rq->q);
1098                if (rq->mq_ctx != this_ctx) {
1099                        if (this_ctx) {
1100                                blk_mq_insert_requests(this_q, this_ctx,
1101                                                        &ctx_list, depth,
1102                                                        from_schedule);
1103                        }
1104
1105                        this_ctx = rq->mq_ctx;
1106                        this_q = rq->q;
1107                        depth = 0;
1108                }
1109
1110                depth++;
1111                list_add_tail(&rq->queuelist, &ctx_list);
1112        }
1113
1114        /*
 1115         * If 'this_ctx' is set, we know we have entries to insert
 1116         * from 'ctx_list'. Do those.
1117         */
1118        if (this_ctx) {
1119                blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1120                                       from_schedule);
1121        }
1122}
1123
1124static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1125{
1126        init_request_from_bio(rq, bio);
1127
1128        if (blk_do_io_stat(rq))
1129                blk_account_io_start(rq, 1);
1130}
1131
1132static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1133{
1134        return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1135                !blk_queue_nomerges(hctx->queue);
1136}
1137
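     /*
      * Either merge the bio into an existing request on the software queue
      * (freeing the pre-allocated rq and returning true), or turn the bio
      * into a request and insert it, returning false.
      */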
1138static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1139                                         struct blk_mq_ctx *ctx,
1140                                         struct request *rq, struct bio *bio)
1141{
1142        if (!hctx_allow_merges(hctx)) {
1143                blk_mq_bio_to_request(rq, bio);
1144                spin_lock(&ctx->lock);
1145insert_rq:
1146                __blk_mq_insert_request(hctx, rq, false);
1147                spin_unlock(&ctx->lock);
1148                return false;
1149        } else {
1150                struct request_queue *q = hctx->queue;
1151
1152                spin_lock(&ctx->lock);
1153                if (!blk_mq_attempt_merge(q, ctx, bio)) {
1154                        blk_mq_bio_to_request(rq, bio);
1155                        goto insert_rq;
1156                }
1157
1158                spin_unlock(&ctx->lock);
1159                __blk_mq_free_request(hctx, ctx, rq);
1160                return true;
1161        }
1162}
1163
1164struct blk_map_ctx {
1165        struct blk_mq_hw_ctx *hctx;
1166        struct blk_mq_ctx *ctx;
1167};
1168
1169static struct request *blk_mq_map_request(struct request_queue *q,
1170                                          struct bio *bio,
1171                                          struct blk_map_ctx *data)
1172{
1173        struct blk_mq_hw_ctx *hctx;
1174        struct blk_mq_ctx *ctx;
1175        struct request *rq;
1176        int rw = bio_data_dir(bio);
1177        struct blk_mq_alloc_data alloc_data;
1178
1179        if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
1180                bio_io_error(bio);
1181                return NULL;
1182        }
1183
1184        ctx = blk_mq_get_ctx(q);
1185        hctx = q->mq_ops->map_queue(q, ctx->cpu);
1186
1187        if (rw_is_sync(bio->bi_rw))
1188                rw |= REQ_SYNC;
1189
1190        trace_block_getrq(q, bio, rw);
1191        blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
1192                        hctx);
1193        rq = __blk_mq_alloc_request(&alloc_data, rw);
1194        if (unlikely(!rq)) {
1195                __blk_mq_run_hw_queue(hctx);
1196                blk_mq_put_ctx(ctx);
1197                trace_block_sleeprq(q, bio, rw);
1198
1199                ctx = blk_mq_get_ctx(q);
1200                hctx = q->mq_ops->map_queue(q, ctx->cpu);
1201                blk_mq_set_alloc_data(&alloc_data, q,
1202                                __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
1203                rq = __blk_mq_alloc_request(&alloc_data, rw);
1204                ctx = alloc_data.ctx;
1205                hctx = alloc_data.hctx;
1206        }
1207
1208        hctx->queued++;
1209        data->hctx = hctx;
1210        data->ctx = ctx;
1211        return rq;
1212}
1213
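     /*
      * Bypass the software queues and hand the request straight to the
      * driver. Returns 0 if the request was queued or ended with an error,
      * -1 if the caller should fall back to inserting it normally.
      */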
1214static int blk_mq_direct_issue_request(struct request *rq)
1215{
1216        int ret;
1217        struct request_queue *q = rq->q;
1218        struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
1219                        rq->mq_ctx->cpu);
1220        struct blk_mq_queue_data bd = {
1221                .rq = rq,
1222                .list = NULL,
1223                .last = 1
1224        };
1225
1226        /*
 1227         * If the driver queues the request (OK), we are done. On error,
 1228         * end the request. For anything else (busy), fall back to adding
 1229         * it to our list, as we previously would have done.
1230         */
1231        ret = q->mq_ops->queue_rq(hctx, &bd);
1232        if (ret == BLK_MQ_RQ_QUEUE_OK)
1233                return 0;
1234        else {
1235                __blk_mq_requeue_request(rq);
1236
1237                if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1238                        rq->errors = -EIO;
1239                        blk_mq_end_request(rq, rq->errors);
1240                        return 0;
1241                }
1242                return -1;
1243        }
1244}
1245
1246/*
1247 * Multiple hardware queue variant. This will not use per-process plugs,
1248 * but will attempt to bypass the hctx queueing if we can go straight to
1249 * hardware for SYNC IO.
1250 */
1251static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1252{
1253        const int is_sync = rw_is_sync(bio->bi_rw);
1254        const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1255        struct blk_map_ctx data;
1256        struct request *rq;
1257        unsigned int request_count = 0;
1258        struct blk_plug *plug;
1259        struct request *same_queue_rq = NULL;
1260
1261        blk_queue_bounce(q, &bio);
1262
1263        if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1264                bio_io_error(bio);
1265                return;
1266        }
1267
1268        blk_queue_split(q, &bio, q->bio_split);
1269
1270        if (!is_flush_fua && !blk_queue_nomerges(q) &&
1271            blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1272                return;
1273
1274        rq = blk_mq_map_request(q, bio, &data);
1275        if (unlikely(!rq))
1276                return;
1277
1278        if (unlikely(is_flush_fua)) {
1279                blk_mq_bio_to_request(rq, bio);
1280                blk_insert_flush(rq);
1281                goto run_queue;
1282        }
1283
1284        plug = current->plug;
1285        /*
 1286         * If the driver supports deferred issue based on 'last', then
 1287         * queue it up like normal, since we can potentially save some
1288         * CPU this way.
1289         */
1290        if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
1291            !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1292                struct request *old_rq = NULL;
1293
1294                blk_mq_bio_to_request(rq, bio);
1295
1296                /*
 1297                 * We do limited plugging. If the bio can be merged, do the
 1298                 * merge. Otherwise the existing request in the plug list will
 1299                 * be issued, so the plug list holds at most one request.
1300                 */
1301                if (plug) {
1302                        /*
1303                         * The plug list might get flushed before this. If that
 1304                         * happens, same_queue_rq is invalid and the plug list is empty.
 1305                         */
1306                        if (same_queue_rq && !list_empty(&plug->mq_list)) {
1307                                old_rq = same_queue_rq;
1308                                list_del_init(&old_rq->queuelist);
1309                        }
1310                        list_add_tail(&rq->queuelist, &plug->mq_list);
1311                } else /* is_sync */
1312                        old_rq = rq;
1313                blk_mq_put_ctx(data.ctx);
1314                if (!old_rq)
1315                        return;
1316                if (!blk_mq_direct_issue_request(old_rq))
1317                        return;
1318                blk_mq_insert_request(old_rq, false, true, true);
1319                return;
1320        }
1321
1322        if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1323                /*
1324                 * For a SYNC request, send it to the hardware immediately. For
1325                 * an ASYNC request, just ensure that we run it later on. The
1326                 * latter allows for merging opportunities and more efficient
1327                 * dispatching.
1328                 */
1329run_queue:
1330                blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1331        }
1332        blk_mq_put_ctx(data.ctx);
1333}
1334
1335/*
1336 * Single hardware queue variant. This will attempt to use any per-process
1337 * plug for merging and IO deferral.
1338 */
1339static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1340{
1341        const int is_sync = rw_is_sync(bio->bi_rw);
1342        const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1343        struct blk_plug *plug;
1344        unsigned int request_count = 0;
1345        struct blk_map_ctx data;
1346        struct request *rq;
1347
1348        blk_queue_bounce(q, &bio);
1349
1350        if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1351                bio_io_error(bio);
1352                return;
1353        }
1354
1355        blk_queue_split(q, &bio, q->bio_split);
1356
1357        if (!is_flush_fua && !blk_queue_nomerges(q) &&
1358            blk_attempt_plug_merge(q, bio, &request_count, NULL))
1359                return;
1360
1361        rq = blk_mq_map_request(q, bio, &data);
1362        if (unlikely(!rq))
1363                return;
1364
1365        if (unlikely(is_flush_fua)) {
1366                blk_mq_bio_to_request(rq, bio);
1367                blk_insert_flush(rq);
1368                goto run_queue;
1369        }
1370
1371        /*
 1372         * If a task plug exists, use it: since it is completely lockless,
 1373         * we can temporarily store requests there until the task is either
 1374         * done or scheduled away.
1375         */
1376        plug = current->plug;
1377        if (plug) {
1378                blk_mq_bio_to_request(rq, bio);
1379                if (list_empty(&plug->mq_list))
1380                        trace_block_plug(q);
1381                else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1382                        blk_flush_plug_list(plug, false);
1383                        trace_block_plug(q);
1384                }
1385                list_add_tail(&rq->queuelist, &plug->mq_list);
1386                blk_mq_put_ctx(data.ctx);
1387                return;
1388        }
1389
1390        if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1391                /*
1392                 * For a SYNC request, send it to the hardware immediately. For
1393                 * an ASYNC request, just ensure that we run it later on. The
1394                 * latter allows for merging opportunities and more efficient
1395                 * dispatching.
1396                 */
1397run_queue:
1398                blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1399        }
1400
1401        blk_mq_put_ctx(data.ctx);
1402}
1403
1404/*
 1405 * Default mapping from a software queue (one per CPU) to its hardware queue.
1406 */
1407struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
1408{
1409        return q->queue_hw_ctx[q->mq_map[cpu]];
1410}
1411EXPORT_SYMBOL(blk_mq_map_queue);
1412
1413static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1414                struct blk_mq_tags *tags, unsigned int hctx_idx)
1415{
1416        struct page *page;
1417
1418        if (tags->rqs && set->ops->exit_request) {
1419                int i;
1420
1421                for (i = 0; i < tags->nr_tags; i++) {
1422                        if (!tags->rqs[i])
1423                                continue;
1424                        set->ops->exit_request(set->driver_data, tags->rqs[i],
1425                                                hctx_idx, i);
1426                        tags->rqs[i] = NULL;
1427                }
1428        }
1429
1430        while (!list_empty(&tags->page_list)) {
1431                page = list_first_entry(&tags->page_list, struct page, lru);
1432                list_del_init(&page->lru);
1433                __free_pages(page, page->private);
1434        }
1435
1436        kfree(tags->rqs);
1437
1438        blk_mq_free_tags(tags);
1439}
1440
1441static size_t order_to_size(unsigned int order)
1442{
1443        return (size_t)PAGE_SIZE << order;
1444}
1445
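     /*
      * Build the tag map for one hardware queue: allocate the tags structure,
      * then carve queue_depth requests (plus driver payload) out of bulk page
      * allocations, calling ->init_request on each one.
      */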
1446static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1447                unsigned int hctx_idx)
1448{
1449        struct blk_mq_tags *tags;
1450        unsigned int i, j, entries_per_page, max_order = 4;
1451        size_t rq_size, left;
1452
1453        tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1454                                set->numa_node,
1455                                BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
1456        if (!tags)
1457                return NULL;
1458
1459        INIT_LIST_HEAD(&tags->page_list);
1460
1461        tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1462                                 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1463                                 set->numa_node);
1464        if (!tags->rqs) {
1465                blk_mq_free_tags(tags);
1466                return NULL;
1467        }
1468
1469        /*
1470         * rq_size is the size of the request plus driver payload, rounded
1471         * to the cacheline size
1472         */
1473        rq_size = round_up(sizeof(struct request) + set->cmd_size,
1474                                cache_line_size());
1475        left = rq_size * set->queue_depth;
1476
1477        for (i = 0; i < set->queue_depth; ) {
1478                int this_order = max_order;
1479                struct page *page;
1480                int to_do;
1481                void *p;
1482
1483                while (left < order_to_size(this_order - 1) && this_order)
1484                        this_order--;
1485
1486                do {
1487                        page = alloc_pages_node(set->numa_node,
1488                                GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
1489                                this_order);
1490                        if (page)
1491                                break;
1492                        if (!this_order--)
1493                                break;
1494                        if (order_to_size(this_order) < rq_size)
1495                                break;
1496                } while (1);
1497
1498                if (!page)
1499                        goto fail;
1500
1501                page->private = this_order;
1502                list_add_tail(&page->lru, &tags->page_list);
1503
1504                p = page_address(page);
1505                entries_per_page = order_to_size(this_order) / rq_size;
1506                to_do = min(entries_per_page, set->queue_depth - i);
1507                left -= to_do * rq_size;
1508                for (j = 0; j < to_do; j++) {
1509                        tags->rqs[i] = p;
1510                        if (set->ops->init_request) {
1511                                if (set->ops->init_request(set->driver_data,
1512                                                tags->rqs[i], hctx_idx, i,
1513                                                set->numa_node)) {
1514                                        tags->rqs[i] = NULL;
1515                                        goto fail;
1516                                }
1517                        }
1518
1519                        p += rq_size;
1520                        i++;
1521                }
1522        }
1523        return tags;
1524
1525fail:
1526        blk_mq_free_rq_map(set, tags, hctx_idx);
1527        return NULL;
1528}
1529
1530static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
1531{
1532        kfree(bitmap->map);
1533}
1534
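     /*
      * Allocate the per-hctx software context bitmap: one bit per possible
      * CPU, packed 8 bits to a word, with each word in its own
      * blk_align_bitmap.
      */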
1535static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
1536{
1537        unsigned int bpw = 8, total, num_maps, i;
1538
1539        bitmap->bits_per_word = bpw;
1540
1541        num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
1542        bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
1543                                        GFP_KERNEL, node);
1544        if (!bitmap->map)
1545                return -ENOMEM;
1546
1547        total = nr_cpu_ids;
1548        for (i = 0; i < num_maps; i++) {
1549                bitmap->map[i].depth = min(total, bitmap->bits_per_word);
1550                total -= bitmap->map[i].depth;
1551        }
1552
1553        return 0;
1554}
1555
1556static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1557{
1558        struct request_queue *q = hctx->queue;
1559        struct blk_mq_ctx *ctx;
1560        LIST_HEAD(tmp);
1561
1562        /*
1563         * Move ctx entries to new CPU, if this one is going away.
1564         */
1565        ctx = __blk_mq_get_ctx(q, cpu);
1566
1567        spin_lock(&ctx->lock);
1568        if (!list_empty(&ctx->rq_list)) {
1569                list_splice_init(&ctx->rq_list, &tmp);
1570                blk_mq_hctx_clear_pending(hctx, ctx);
1571        }
1572        spin_unlock(&ctx->lock);
1573
1574        if (list_empty(&tmp))
1575                return NOTIFY_OK;
1576
1577        ctx = blk_mq_get_ctx(q);
1578        spin_lock(&ctx->lock);
1579
1580        while (!list_empty(&tmp)) {
1581                struct request *rq;
1582
1583                rq = list_first_entry(&tmp, struct request, queuelist);
1584                rq->mq_ctx = ctx;
1585                list_move_tail(&rq->queuelist, &ctx->rq_list);
1586        }
1587
1588        hctx = q->mq_ops->map_queue(q, ctx->cpu);
1589        blk_mq_hctx_mark_pending(hctx, ctx);
1590
1591        spin_unlock(&ctx->lock);
1592
1593        blk_mq_run_hw_queue(hctx, true);
1594        blk_mq_put_ctx(ctx);
1595        return NOTIFY_OK;
1596}
1597
1598static int blk_mq_hctx_notify(void *data, unsigned long action,
1599                              unsigned int cpu)
1600{
1601        struct blk_mq_hw_ctx *hctx = data;
1602
1603        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1604                return blk_mq_hctx_cpu_offline(hctx, cpu);
1605
1606        /*
1607         * In case of CPU online, tags may be reallocated
1608         * in blk_mq_map_swqueue() after mapping is updated.
1609         */
1610
1611        return NOTIFY_OK;
1612}
1613
1614/* hctx->ctxs will be freed in queue's release handler */
1615static void blk_mq_exit_hctx(struct request_queue *q,
1616                struct blk_mq_tag_set *set,
1617                struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1618{
1619        unsigned flush_start_tag = set->queue_depth;
1620
1621        blk_mq_tag_idle(hctx);
1622
1623        if (set->ops->exit_request)
1624                set->ops->exit_request(set->driver_data,
1625                                       hctx->fq->flush_rq, hctx_idx,
1626                                       flush_start_tag + hctx_idx);
1627
1628        if (set->ops->exit_hctx)
1629                set->ops->exit_hctx(hctx, hctx_idx);
1630
1631        blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1632        blk_free_flush_queue(hctx->fq);
1633        blk_mq_free_bitmap(&hctx->ctx_map);
1634}
1635
1636static void blk_mq_exit_hw_queues(struct request_queue *q,
1637                struct blk_mq_tag_set *set, int nr_queue)
1638{
1639        struct blk_mq_hw_ctx *hctx;
1640        unsigned int i;
1641
1642        queue_for_each_hw_ctx(q, hctx, i) {
1643                if (i == nr_queue)
1644                        break;
1645                blk_mq_exit_hctx(q, set, hctx, i);
1646        }
1647}
1648
1649static void blk_mq_free_hw_queues(struct request_queue *q,
1650                struct blk_mq_tag_set *set)
1651{
1652        struct blk_mq_hw_ctx *hctx;
1653        unsigned int i;
1654
1655        queue_for_each_hw_ctx(q, hctx, i)
1656                free_cpumask_var(hctx->cpumask);
1657}
1658
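/*
 * Set up a single hardware context: work items, lock, dispatch list, the
 * per-CPU notifier, the ctx pointer array (sized for all possible CPUs),
 * the pending-work bitmap, the driver's per-hctx state and the flush queue
 * with its pre-initialized flush request. The error labels below unwind
 * these steps in reverse order.
 */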
1659static int blk_mq_init_hctx(struct request_queue *q,
1660                struct blk_mq_tag_set *set,
1661                struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1662{
1663        int node;
1664        unsigned flush_start_tag = set->queue_depth;
1665
1666        node = hctx->numa_node;
1667        if (node == NUMA_NO_NODE)
1668                node = hctx->numa_node = set->numa_node;
1669
1670        INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1671        INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1672        spin_lock_init(&hctx->lock);
1673        INIT_LIST_HEAD(&hctx->dispatch);
1674        hctx->queue = q;
1675        hctx->queue_num = hctx_idx;
1676        hctx->flags = set->flags;
1677
1678        blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1679                                        blk_mq_hctx_notify, hctx);
1680        blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1681
1682        hctx->tags = set->tags[hctx_idx];
1683
1684        /*
1685         * Allocate space for all possible cpus to avoid allocation at
1686         * runtime
1687         */
1688        hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1689                                        GFP_KERNEL, node);
1690        if (!hctx->ctxs)
1691                goto unregister_cpu_notifier;
1692
1693        if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
1694                goto free_ctxs;
1695
1696        hctx->nr_ctx = 0;
1697
1698        if (set->ops->init_hctx &&
1699            set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1700                goto free_bitmap;
1701
1702        hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1703        if (!hctx->fq)
1704                goto exit_hctx;
1705
1706        if (set->ops->init_request &&
1707            set->ops->init_request(set->driver_data,
1708                                   hctx->fq->flush_rq, hctx_idx,
1709                                   flush_start_tag + hctx_idx, node))
1710                goto free_fq;
1711
1712        return 0;
1713
1714 free_fq:
1715        blk_free_flush_queue(hctx->fq);
1716 exit_hctx:
1717        if (set->ops->exit_hctx)
1718                set->ops->exit_hctx(hctx, hctx_idx);
1719 free_bitmap:
1720        blk_mq_free_bitmap(&hctx->ctx_map);
1721 free_ctxs:
1722        kfree(hctx->ctxs);
1723 unregister_cpu_notifier:
1724        blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1725
1726        return -1;
1727}
1728
1729static int blk_mq_init_hw_queues(struct request_queue *q,
1730                struct blk_mq_tag_set *set)
1731{
1732        struct blk_mq_hw_ctx *hctx;
1733        unsigned int i;
1734
1735        /*
1736         * Initialize hardware queues
1737         */
1738        queue_for_each_hw_ctx(q, hctx, i) {
1739                if (blk_mq_init_hctx(q, set, hctx, i))
1740                        break;
1741        }
1742
1743        if (i == q->nr_hw_queues)
1744                return 0;
1745
1746        /*
1747         * Init failed
1748         */
1749        blk_mq_exit_hw_queues(q, set, i);
1750
1751        return 1;
1752}
1753
1754static void blk_mq_init_cpu_queues(struct request_queue *q,
1755                                   unsigned int nr_hw_queues)
1756{
1757        unsigned int i;
1758
1759        for_each_possible_cpu(i) {
1760                struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1761                struct blk_mq_hw_ctx *hctx;
1762
1763                memset(__ctx, 0, sizeof(*__ctx));
1764                __ctx->cpu = i;
1765                spin_lock_init(&__ctx->lock);
1766                INIT_LIST_HEAD(&__ctx->rq_list);
1767                __ctx->queue = q;
1768
1769                /* If the CPU isn't online, it is mapped to the first hctx */
1770                if (!cpu_online(i))
1771                        continue;
1772
1773                hctx = q->mq_ops->map_queue(q, i);
1774
1775                /*
1776                 * Set local node, IFF we have more than one hw queue. If
1777                 * not, we remain on the home node of the device
1778                 */
1779                if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1780                        hctx->numa_node = cpu_to_node(i);
1781        }
1782}
1783
1784static void blk_mq_map_swqueue(struct request_queue *q,
1785                               const struct cpumask *online_mask)
1786{
1787        unsigned int i;
1788        struct blk_mq_hw_ctx *hctx;
1789        struct blk_mq_ctx *ctx;
1790        struct blk_mq_tag_set *set = q->tag_set;
1791
1792        /*
1793         * Avoid others reading incomplete hctx->cpumask through sysfs
1794         */
1795        mutex_lock(&q->sysfs_lock);
1796
1797        queue_for_each_hw_ctx(q, hctx, i) {
1798                cpumask_clear(hctx->cpumask);
1799                hctx->nr_ctx = 0;
1800        }
1801
1802        /*
1803         * Map software to hardware queues
1804         */
1805        queue_for_each_ctx(q, ctx, i) {
1806                /* If the CPU isn't online, it is mapped to the first hctx */
1807                if (!cpumask_test_cpu(i, online_mask))
1808                        continue;
1809
1810                hctx = q->mq_ops->map_queue(q, i);
1811                cpumask_set_cpu(i, hctx->cpumask);
1812                ctx->index_hw = hctx->nr_ctx;
1813                hctx->ctxs[hctx->nr_ctx++] = ctx;
1814        }
1815
1816        mutex_unlock(&q->sysfs_lock);
1817
1818        queue_for_each_hw_ctx(q, hctx, i) {
1819                struct blk_mq_ctxmap *map = &hctx->ctx_map;
1820
1821                /*
1822                 * If no software queues are mapped to this hardware queue,
1823                 * disable it and free the request entries.
1824                 */
1825                if (!hctx->nr_ctx) {
1826                        if (set->tags[i]) {
1827                                blk_mq_free_rq_map(set, set->tags[i], i);
1828                                set->tags[i] = NULL;
1829                        }
1830                        hctx->tags = NULL;
1831                        continue;
1832                }
1833
1834                /* an unmapped hw queue can be remapped after the CPU topology changes */
1835                if (!set->tags[i])
1836                        set->tags[i] = blk_mq_init_rq_map(set, i);
1837                hctx->tags = set->tags[i];
1838                WARN_ON(!hctx->tags);
1839
1840                /*
1841                 * Set the map size to the number of mapped software queues.
1842                 * This is more accurate and more efficient than looping
1843                 * over all possibly mapped software queues.
1844                 */
1845                map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word);
1846
1847                /*
1848                 * Initialize batch roundrobin counts
1849                 */
1850                hctx->next_cpu = cpumask_first(hctx->cpumask);
1851                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1852        }
1853
1854        queue_for_each_ctx(q, ctx, i) {
1855                if (!cpumask_test_cpu(i, online_mask))
1856                        continue;
1857
1858                hctx = q->mq_ops->map_queue(q, i);
1859                cpumask_set_cpu(i, hctx->tags->cpumask);
1860        }
1861}
1862
1863static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
1864{
1865        struct blk_mq_hw_ctx *hctx;
1866        struct request_queue *q;
1867        bool shared;
1868        int i;
1869
1870        if (set->tag_list.next == set->tag_list.prev)
1871                shared = false;
1872        else
1873                shared = true;
1874
1875        list_for_each_entry(q, &set->tag_list, tag_set_list) {
1876                blk_mq_freeze_queue(q);
1877
1878                queue_for_each_hw_ctx(q, hctx, i) {
1879                        if (shared)
1880                                hctx->flags |= BLK_MQ_F_TAG_SHARED;
1881                        else
1882                                hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1883                }
1884                blk_mq_unfreeze_queue(q);
1885        }
1886}
1887
1888static void blk_mq_del_queue_tag_set(struct request_queue *q)
1889{
1890        struct blk_mq_tag_set *set = q->tag_set;
1891
1892        mutex_lock(&set->tag_list_lock);
1893        list_del_init(&q->tag_set_list);
1894        blk_mq_update_tag_set_depth(set);
1895        mutex_unlock(&set->tag_list_lock);
1896}
1897
1898static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1899                                     struct request_queue *q)
1900{
1901        q->tag_set = set;
1902
1903        mutex_lock(&set->tag_list_lock);
1904        list_add_tail(&q->tag_set_list, &set->tag_list);
1905        blk_mq_update_tag_set_depth(set);
1906        mutex_unlock(&set->tag_list_lock);
1907}
1908
1909/*
1910 * This is the actual release handler for mq, but we invoke it from the
1911 * request queue's release handler to avoid use-after-free headaches:
1912 * q->mq_kobj shouldn't have been introduced, but we can't group the
1913 * ctx/hctx kobjects without it.
1914 */
1915void blk_mq_release(struct request_queue *q)
1916{
1917        struct blk_mq_hw_ctx *hctx;
1918        unsigned int i;
1919
1920        /* hctx kobj stays in hctx */
1921        queue_for_each_hw_ctx(q, hctx, i) {
1922                if (!hctx)
1923                        continue;
1924                kfree(hctx->ctxs);
1925                kfree(hctx);
1926        }
1927
1928        kfree(q->mq_map);
1929        q->mq_map = NULL;
1930
1931        kfree(q->queue_hw_ctx);
1932
1933        /* ctx kobj stays in queue_ctx */
1934        free_percpu(q->queue_ctx);
1935}
1936
1937struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1938{
1939        struct request_queue *uninit_q, *q;
1940
1941        uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1942        if (!uninit_q)
1943                return ERR_PTR(-ENOMEM);
1944
1945        q = blk_mq_init_allocated_queue(set, uninit_q);
1946        if (IS_ERR(q))
1947                blk_cleanup_queue(uninit_q);
1948
1949        return q;
1950}
1951EXPORT_SYMBOL(blk_mq_init_queue);
1952
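/*
 * Build an mq request_queue on top of a pre-allocated (but not yet
 * initialized) queue: allocate the per-CPU software queues, the hardware
 * context array and the CPU-to-hctx map, initialize the usage counter,
 * timeout timer and requeue machinery, pick the make_request function
 * based on the number of hardware queues, and finally map software to
 * hardware queues under all_q_mutex with CPU hotplug disabled.
 */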
1953struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
1954                                                  struct request_queue *q)
1955{
1956        struct blk_mq_hw_ctx **hctxs;
1957        struct blk_mq_ctx __percpu *ctx;
1958        unsigned int *map;
1959        int i;
1960
1961        ctx = alloc_percpu(struct blk_mq_ctx);
1962        if (!ctx)
1963                return ERR_PTR(-ENOMEM);
1964
1965        hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1966                        set->numa_node);
1967
1968        if (!hctxs)
1969                goto err_percpu;
1970
1971        map = blk_mq_make_queue_map(set);
1972        if (!map)
1973                goto err_map;
1974
1975        for (i = 0; i < set->nr_hw_queues; i++) {
1976                int node = blk_mq_hw_queue_to_node(map, i);
1977
1978                hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1979                                        GFP_KERNEL, node);
1980                if (!hctxs[i])
1981                        goto err_hctxs;
1982
1983                if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
1984                                                node))
1985                        goto err_hctxs;
1986
1987                atomic_set(&hctxs[i]->nr_active, 0);
1988                hctxs[i]->numa_node = node;
1989                hctxs[i]->queue_num = i;
1990        }
1991
1992        /*
1993         * Init percpu_ref in atomic mode so that it's faster to shutdown.
1994         * See blk_register_queue() for details.
1995         */
1996        if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
1997                            PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
1998                goto err_hctxs;
1999
2000        setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
2001        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2002
2003        q->nr_queues = nr_cpu_ids;
2004        q->nr_hw_queues = set->nr_hw_queues;
2005        q->mq_map = map;
2006
2007        q->queue_ctx = ctx;
2008        q->queue_hw_ctx = hctxs;
2009
2010        q->mq_ops = set->ops;
2011        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2012
2013        if (!(set->flags & BLK_MQ_F_SG_MERGE))
2014                q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
2015
2016        q->sg_reserved_size = INT_MAX;
2017
2018        INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
2019        INIT_LIST_HEAD(&q->requeue_list);
2020        spin_lock_init(&q->requeue_lock);
2021
2022        if (q->nr_hw_queues > 1)
2023                blk_queue_make_request(q, blk_mq_make_request);
2024        else
2025                blk_queue_make_request(q, blk_sq_make_request);
2026
2027        /*
2028         * Do this after blk_queue_make_request() overrides it...
2029         */
2030        q->nr_requests = set->queue_depth;
2031
2032        if (set->ops->complete)
2033                blk_queue_softirq_done(q, set->ops->complete);
2034
2035        blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2036
2037        if (blk_mq_init_hw_queues(q, set))
2038                goto err_hctxs;
2039
2040        get_online_cpus();
2041        mutex_lock(&all_q_mutex);
2042
2043        list_add_tail(&q->all_q_node, &all_q_list);
2044        blk_mq_add_queue_tag_set(set, q);
2045        blk_mq_map_swqueue(q, cpu_online_mask);
2046
2047        mutex_unlock(&all_q_mutex);
2048        put_online_cpus();
2049
2050        return q;
2051
2052err_hctxs:
2053        kfree(map);
2054        for (i = 0; i < set->nr_hw_queues; i++) {
2055                if (!hctxs[i])
2056                        break;
2057                free_cpumask_var(hctxs[i]->cpumask);
2058                kfree(hctxs[i]);
2059        }
2060err_map:
2061        kfree(hctxs);
2062err_percpu:
2063        free_percpu(ctx);
2064        return ERR_PTR(-ENOMEM);
2065}
2066EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2067
2068void blk_mq_free_queue(struct request_queue *q)
2069{
2070        struct blk_mq_tag_set   *set = q->tag_set;
2071
2072        mutex_lock(&all_q_mutex);
2073        list_del_init(&q->all_q_node);
2074        mutex_unlock(&all_q_mutex);
2075
2076        blk_mq_del_queue_tag_set(q);
2077
2078        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2079        blk_mq_free_hw_queues(q, set);
2080
2081        percpu_ref_exit(&q->mq_usage_counter);
2082}
2083
2084/* Basically redo blk_mq_init_queue with queue frozen */
2085static void blk_mq_queue_reinit(struct request_queue *q,
2086                                const struct cpumask *online_mask)
2087{
2088        WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2089
2090        blk_mq_sysfs_unregister(q);
2091
2092        blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
2093
2094        /*
2095         * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2096         * we should change hctx numa_node according to the new topology (this
2097         * involves freeing and re-allocating memory; is it worth doing?)
2098         */
2099
2100        blk_mq_map_swqueue(q, online_mask);
2101
2102        blk_mq_sysfs_register(q);
2103}
2104
2105static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
2106                                      unsigned long action, void *hcpu)
2107{
2108        struct request_queue *q;
2109        int cpu = (unsigned long)hcpu;
2110        /*
2111         * New online cpumask which is going to be set in this hotplug event.
2112         * Declare this cpumask static, as cpu-hotplug operations are invoked
2113         * one by one and a dynamic allocation here could fail.
2114         */
2115        static struct cpumask online_new;
2116
2117        /*
2118         * Before a hot-added CPU starts handling requests, new mappings must
2119         * be established.  Otherwise, these requests in hw queue might
2120         * never be dispatched.
2121         *
2122         * For example, there is a single hw queue (hctx) and two CPU queues
2123         * (ctx0 for CPU0, and ctx1 for CPU1).
2124         *
2125         * Now CPU1 is just onlined and a request is inserted into
2126         * ctx1->rq_list, and bit0 is set in the pending bitmap because
2127         * ctx1->index_hw is still zero.
2128         *
2129         * And then while running hw queue, flush_busy_ctxs() finds bit0 is
2130         * set in pending bitmap and tries to retrieve requests in
2131         * hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] is a pointer to ctx0,
2132         * so the request in ctx1->rq_list is ignored.
2133         */
2134        switch (action & ~CPU_TASKS_FROZEN) {
2135        case CPU_DEAD:
2136        case CPU_UP_CANCELED:
2137                cpumask_copy(&online_new, cpu_online_mask);
2138                break;
2139        case CPU_UP_PREPARE:
2140                cpumask_copy(&online_new, cpu_online_mask);
2141                cpumask_set_cpu(cpu, &online_new);
2142                break;
2143        default:
2144                return NOTIFY_OK;
2145        }
2146
2147        mutex_lock(&all_q_mutex);
2148
2149        /*
2150         * We need to freeze and reinit all existing queues.  Freezing
2151         * involves synchronous wait for an RCU grace period and doing it
2152         * one by one may take a long time.  Start freezing all queues in
2153         * one swoop and then wait for the completions so that freezing can
2154         * take place in parallel.
2155         */
2156        list_for_each_entry(q, &all_q_list, all_q_node)
2157                blk_mq_freeze_queue_start(q);
2158        list_for_each_entry(q, &all_q_list, all_q_node) {
2159                blk_mq_freeze_queue_wait(q);
2160
2161                /*
2162                 * timeout handler can't touch hw queue during the
2163                 * reinitialization
2164                 */
2165                del_timer_sync(&q->timeout);
2166        }
2167
2168        list_for_each_entry(q, &all_q_list, all_q_node)
2169                blk_mq_queue_reinit(q, &online_new);
2170
2171        list_for_each_entry(q, &all_q_list, all_q_node)
2172                blk_mq_unfreeze_queue(q);
2173
2174        mutex_unlock(&all_q_mutex);
2175        return NOTIFY_OK;
2176}
2177
2178static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2179{
2180        int i;
2181
2182        for (i = 0; i < set->nr_hw_queues; i++) {
2183                set->tags[i] = blk_mq_init_rq_map(set, i);
2184                if (!set->tags[i])
2185                        goto out_unwind;
2186        }
2187
2188        return 0;
2189
2190out_unwind:
2191        while (--i >= 0)
2192                blk_mq_free_rq_map(set, set->tags[i], i);
2193
2194        return -ENOMEM;
2195}
2196
2197/*
2198 * Allocate the request maps associated with this tag_set. Note that this
2199 * may reduce the depth asked for, if memory is tight. set->queue_depth
2200 * will be updated to reflect the allocated depth.
2201 */
2202static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2203{
2204        unsigned int depth;
2205        int err;
2206
2207        depth = set->queue_depth;
2208        do {
2209                err = __blk_mq_alloc_rq_maps(set);
2210                if (!err)
2211                        break;
2212
2213                set->queue_depth >>= 1;
2214                if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2215                        err = -ENOMEM;
2216                        break;
2217                }
2218        } while (set->queue_depth);
2219
2220        if (!set->queue_depth || err) {
2221                pr_err("blk-mq: failed to allocate request map\n");
2222                return -ENOMEM;
2223        }
2224
2225        if (depth != set->queue_depth)
2226                pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2227                                                depth, set->queue_depth);
2228
2229        return 0;
2230}
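/*
 * Illustration (hypothetical numbers): if a driver asks for a depth of 256
 * and the per-queue request allocations keep failing, the loop above retries
 * at 128, 64, 32, ... and only gives up once the depth would drop below
 * set->reserved_tags + BLK_MQ_TAG_MIN; a successful but reduced allocation
 * is reported via the "reduced tag depth" message.
 */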
2231
2232struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
2233{
2234        return tags->cpumask;
2235}
2236EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
2237
2238/*
2239 * Alloc a tag set to be associated with one or more request queues.
2240 * May fail with EINVAL for various error conditions. May adjust the
2241 * requested depth down if it is too large; in that case, the adjusted
2242 * value will be stored in set->queue_depth.
2243 */
2244int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2245{
2246        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2247
2248        if (!set->nr_hw_queues)
2249                return -EINVAL;
2250        if (!set->queue_depth)
2251                return -EINVAL;
2252        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2253                return -EINVAL;
2254
2255        if (!set->ops->queue_rq || !set->ops->map_queue)
2256                return -EINVAL;
2257
2258        if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2259                pr_info("blk-mq: reduced tag depth to %u\n",
2260                        BLK_MQ_MAX_DEPTH);
2261                set->queue_depth = BLK_MQ_MAX_DEPTH;
2262        }
2263
2264        /*
2265         * If a crashdump is active, then we are potentially in a very
2266         * memory constrained environment. Limit us to 1 queue and
2267         * 64 tags to prevent using too much memory.
2268         */
2269        if (is_kdump_kernel()) {
2270                set->nr_hw_queues = 1;
2271                set->queue_depth = min(64U, set->queue_depth);
2272        }
2273
2274        set->tags = kmalloc_node(set->nr_hw_queues *
2275                                 sizeof(struct blk_mq_tags *),
2276                                 GFP_KERNEL, set->numa_node);
2277        if (!set->tags)
2278                return -ENOMEM;
2279
2280        if (blk_mq_alloc_rq_maps(set))
2281                goto enomem;
2282
2283        mutex_init(&set->tag_list_lock);
2284        INIT_LIST_HEAD(&set->tag_list);
2285
2286        return 0;
2287enomem:
2288        kfree(set->tags);
2289        set->tags = NULL;
2290        return -ENOMEM;
2291}
2292EXPORT_SYMBOL(blk_mq_alloc_tag_set);
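/*
 * Illustrative only, not used by this file: a minimal sketch of how a driver
 * might pair blk_mq_alloc_tag_set() with blk_mq_init_queue(). The my_sketch_*
 * names are hypothetical; a real driver would also size cmd_size for its
 * per-request payload and tear down with blk_cleanup_queue() plus
 * blk_mq_free_tag_set().
 */
static int my_sketch_queue_rq(struct blk_mq_hw_ctx *hctx,
                              const struct blk_mq_queue_data *bd)
{
        struct request *rq = bd->rq;

        blk_mq_start_request(rq);
        /* A real driver would issue rq to hardware here. */
        blk_mq_end_request(rq, 0);
        return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops my_sketch_mq_ops = {
        .queue_rq       = my_sketch_queue_rq,
        .map_queue      = blk_mq_map_queue,
};

static struct blk_mq_tag_set my_sketch_tag_set;

static struct request_queue *my_sketch_create_queue(void)
{
        struct request_queue *q;

        my_sketch_tag_set.ops           = &my_sketch_mq_ops;
        my_sketch_tag_set.nr_hw_queues  = 1;
        my_sketch_tag_set.queue_depth   = 64;
        my_sketch_tag_set.numa_node     = NUMA_NO_NODE;
        my_sketch_tag_set.cmd_size      = 0;
        my_sketch_tag_set.flags         = BLK_MQ_F_SHOULD_MERGE;

        if (blk_mq_alloc_tag_set(&my_sketch_tag_set))
                return NULL;

        q = blk_mq_init_queue(&my_sketch_tag_set);
        if (IS_ERR(q)) {
                blk_mq_free_tag_set(&my_sketch_tag_set);
                return NULL;
        }
        return q;
}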
2293
2294void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2295{
2296        int i;
2297
2298        for (i = 0; i < set->nr_hw_queues; i++) {
2299                if (set->tags[i])
2300                        blk_mq_free_rq_map(set, set->tags[i], i);
2301        }
2302
2303        kfree(set->tags);
2304        set->tags = NULL;
2305}
2306EXPORT_SYMBOL(blk_mq_free_tag_set);
2307
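/*
 * Update the depth of every hardware queue to @nr. Fails with -EINVAL if
 * the queue has no tag set or if @nr exceeds the depth the tag set was
 * allocated with; on success q->nr_requests is updated to match.
 */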
2308int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2309{
2310        struct blk_mq_tag_set *set = q->tag_set;
2311        struct blk_mq_hw_ctx *hctx;
2312        int i, ret;
2313
2314        if (!set || nr > set->queue_depth)
2315                return -EINVAL;
2316
2317        ret = 0;
2318        queue_for_each_hw_ctx(q, hctx, i) {
2319                ret = blk_mq_tag_update_depth(hctx->tags, nr);
2320                if (ret)
2321                        break;
2322        }
2323
2324        if (!ret)
2325                q->nr_requests = nr;
2326
2327        return ret;
2328}
2329
2330void blk_mq_disable_hotplug(void)
2331{
2332        mutex_lock(&all_q_mutex);
2333}
2334
2335void blk_mq_enable_hotplug(void)
2336{
2337        mutex_unlock(&all_q_mutex);
2338}
2339
2340static int __init blk_mq_init(void)
2341{
2342        blk_mq_cpu_init();
2343
2344        hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
2345
2346        return 0;
2347}
2348subsys_initcall(blk_mq_init);
2349