// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <linux/backing-dev.h>
10#include <linux/bio.h>
11#include <linux/blkdev.h>
12#include <linux/kmemleak.h>
13#include <linux/mm.h>
14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/workqueue.h>
17#include <linux/smp.h>
18#include <linux/llist.h>
19#include <linux/list_sort.h>
20#include <linux/cpu.h>
21#include <linux/cache.h>
22#include <linux/sched/sysctl.h>
23#include <linux/sched/topology.h>
24#include <linux/sched/signal.h>
25#include <linux/delay.h>
26#include <linux/crash_dump.h>
27#include <linux/prefetch.h>
28
29#include <trace/events/block.h>
30
31#include <linux/blk-mq.h>
32#include "blk.h"
33#include "blk-mq.h"
34#include "blk-mq-debugfs.h"
35#include "blk-mq-tag.h"
36#include "blk-pm.h"
37#include "blk-stat.h"
38#include "blk-mq-sched.h"
39#include "blk-rq-qos.h"
40
41static void blk_mq_poll_stats_start(struct request_queue *q);
42static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
43
44static int blk_mq_poll_stats_bkt(const struct request *rq)
45{
46 int ddir, bytes, bucket;
47
48 ddir = rq_data_dir(rq);
49 bytes = blk_rq_bytes(rq);
50
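	/*
	 * Poll stats are kept per data direction and per request size,
	 * with one size bucket per power of two starting at 512 bytes:
	 * even bucket indexes are reads, odd indexes are writes.
	 */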
51 bucket = ddir + 2*(ilog2(bytes) - 9);
52
53 if (bucket < 0)
54 return -1;
55 else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
56 return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
57
58 return bucket;
59}
60
/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
65static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
66{
67 return !list_empty_careful(&hctx->dispatch) ||
68 sbitmap_any_bit_set(&hctx->ctx_map) ||
69 blk_mq_sched_has_work(hctx);
70}
71
/*
 * Mark this ctx as having pending work in this hardware queue
 */
75static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
76 struct blk_mq_ctx *ctx)
77{
78 const int bit = ctx->index_hw[hctx->type];
79
80 if (!sbitmap_test_bit(&hctx->ctx_map, bit))
81 sbitmap_set_bit(&hctx->ctx_map, bit);
82}
83
84static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
85 struct blk_mq_ctx *ctx)
86{
87 const int bit = ctx->index_hw[hctx->type];
88
89 sbitmap_clear_bit(&hctx->ctx_map, bit);
90}
91
92struct mq_inflight {
93 struct hd_struct *part;
94 unsigned int *inflight;
95};
96
97static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
98 struct request *rq, void *priv,
99 bool reserved)
100{
101 struct mq_inflight *mi = priv;
102
	/*
	 * index[0] counts the specific partition that was asked for.
	 */
106 if (rq->part == mi->part)
107 mi->inflight[0]++;
108
109 return true;
110}
111
112unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
113{
114 unsigned inflight[2];
115 struct mq_inflight mi = { .part = part, .inflight = inflight, };
116
117 inflight[0] = inflight[1] = 0;
118 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
119
120 return inflight[0];
121}
122
123static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
124 struct request *rq, void *priv,
125 bool reserved)
126{
127 struct mq_inflight *mi = priv;
128
129 if (rq->part == mi->part)
130 mi->inflight[rq_data_dir(rq)]++;
131
132 return true;
133}
134
135void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
136 unsigned int inflight[2])
137{
138 struct mq_inflight mi = { .part = part, .inflight = inflight, };
139
140 inflight[0] = inflight[1] = 0;
141 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
142}
143
144void blk_freeze_queue_start(struct request_queue *q)
145{
146 mutex_lock(&q->mq_freeze_lock);
147 if (++q->mq_freeze_depth == 1) {
148 percpu_ref_kill(&q->q_usage_counter);
149 mutex_unlock(&q->mq_freeze_lock);
150 if (queue_is_mq(q))
151 blk_mq_run_hw_queues(q, false);
152 } else {
153 mutex_unlock(&q->mq_freeze_lock);
154 }
155}
156EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
157
158void blk_mq_freeze_queue_wait(struct request_queue *q)
159{
160 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
161}
162EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
163
164int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
165 unsigned long timeout)
166{
167 return wait_event_timeout(q->mq_freeze_wq,
168 percpu_ref_is_zero(&q->q_usage_counter),
169 timeout);
170}
171EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
172
/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
177void blk_freeze_queue(struct request_queue *q)
178{
	/*
	 * In the !blk_mq case we are only calling this to kill the
	 * q_usage_counter, otherwise this increases the freeze depth
	 * and waits for it to return to zero.  For this reason there is
	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
186 blk_freeze_queue_start(q);
187 blk_mq_freeze_queue_wait(q);
188}
189
190void blk_mq_freeze_queue(struct request_queue *q)
191{
	/*
	 * ...just an alias to keep freeze and unfreeze actions balanced
	 * in the blk_mq_* namespace
	 */
196 blk_freeze_queue(q);
197}
198EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
199
200void blk_mq_unfreeze_queue(struct request_queue *q)
201{
202 mutex_lock(&q->mq_freeze_lock);
203 q->mq_freeze_depth--;
204 WARN_ON_ONCE(q->mq_freeze_depth < 0);
205 if (!q->mq_freeze_depth) {
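		/* last freeze reference dropped: restore percpu mode and wake waiters */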
206 percpu_ref_resurrect(&q->q_usage_counter);
207 wake_up_all(&q->mq_freeze_wq);
208 }
209 mutex_unlock(&q->mq_freeze_lock);
210}
211EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
212
/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
217void blk_mq_quiesce_queue_nowait(struct request_queue *q)
218{
219 blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
220}
221EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
222
/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent that the struct request end_io()
 * callback function is invoked. Once this function is returned, we make
 * sure no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
232void blk_mq_quiesce_queue(struct request_queue *q)
233{
234 struct blk_mq_hw_ctx *hctx;
235 unsigned int i;
236 bool rcu = false;
237
238 blk_mq_quiesce_queue_nowait(q);
239
240 queue_for_each_hw_ctx(q, hctx, i) {
241 if (hctx->flags & BLK_MQ_F_BLOCKING)
242 synchronize_srcu(hctx->srcu);
243 else
244 rcu = true;
245 }
246 if (rcu)
247 synchronize_rcu();
248}
249EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
250
/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function recovers queue into the state before quiescing
 * which is done by blk_mq_quiesce_queue.
 */
258void blk_mq_unquiesce_queue(struct request_queue *q)
259{
260 blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
261
	/* dispatch requests which are inserted during quiescing */
263 blk_mq_run_hw_queues(q, true);
264}
265EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
266
267void blk_mq_wake_waiters(struct request_queue *q)
268{
269 struct blk_mq_hw_ctx *hctx;
270 unsigned int i;
271
272 queue_for_each_hw_ctx(q, hctx, i)
273 if (blk_mq_hw_queue_mapped(hctx))
274 blk_mq_tag_wakeup_all(hctx->tags, true);
275}
276
277bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
278{
279 return blk_mq_has_free_tags(hctx->tags);
280}
281EXPORT_SYMBOL(blk_mq_can_queue);
282
/*
 * Only need start/end time stamping if we have stats enabled, or using
 * an IO scheduler.
 */
287static inline bool blk_mq_need_time_stamp(struct request *rq)
288{
289 return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
290}
291
292static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
293 unsigned int tag, unsigned int op)
294{
295 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
296 struct request *rq = tags->static_rqs[tag];
297 req_flags_t rq_flags = 0;
298
299 if (data->flags & BLK_MQ_REQ_INTERNAL) {
300 rq->tag = -1;
301 rq->internal_tag = tag;
302 } else {
303 if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
304 rq_flags = RQF_MQ_INFLIGHT;
305 atomic_inc(&data->hctx->nr_active);
306 }
307 rq->tag = tag;
308 rq->internal_tag = -1;
309 data->hctx->tags->rqs[rq->tag] = rq;
310 }
311
	/* csd/requeue_work/fifo_time is initialized before use */
313 rq->q = data->q;
314 rq->mq_ctx = data->ctx;
315 rq->mq_hctx = data->hctx;
316 rq->rq_flags = rq_flags;
317 rq->cmd_flags = op;
318 if (data->flags & BLK_MQ_REQ_PREEMPT)
319 rq->rq_flags |= RQF_PREEMPT;
320 if (blk_queue_io_stat(data->q))
321 rq->rq_flags |= RQF_IO_STAT;
322 INIT_LIST_HEAD(&rq->queuelist);
323 INIT_HLIST_NODE(&rq->hash);
324 RB_CLEAR_NODE(&rq->rb_node);
325 rq->rq_disk = NULL;
326 rq->part = NULL;
327 if (blk_mq_need_time_stamp(rq))
328 rq->start_time_ns = ktime_get_ns();
329 else
330 rq->start_time_ns = 0;
331 rq->io_start_time_ns = 0;
332 rq->nr_phys_segments = 0;
333#if defined(CONFIG_BLK_DEV_INTEGRITY)
334 rq->nr_integrity_segments = 0;
335#endif
336 rq->special = NULL;
	/* tag was already set */
338 rq->extra_len = 0;
339 WRITE_ONCE(rq->deadline, 0);
340
341 rq->timeout = 0;
342
343 rq->end_io = NULL;
344 rq->end_io_data = NULL;
345 rq->next_rq = NULL;
346
347 data->ctx->rq_dispatched[op_is_sync(op)]++;
348 refcount_set(&rq->ref, 1);
349 return rq;
350}
351
352static struct request *blk_mq_get_request(struct request_queue *q,
353 struct bio *bio,
354 struct blk_mq_alloc_data *data)
355{
356 struct elevator_queue *e = q->elevator;
357 struct request *rq;
358 unsigned int tag;
359 bool clear_ctx_on_error = false;
360
361 blk_queue_enter_live(q);
362 data->q = q;
363 if (likely(!data->ctx)) {
364 data->ctx = blk_mq_get_ctx(q);
365 clear_ctx_on_error = true;
366 }
367 if (likely(!data->hctx))
368 data->hctx = blk_mq_map_queue(q, data->cmd_flags,
369 data->ctx);
370 if (data->cmd_flags & REQ_NOWAIT)
371 data->flags |= BLK_MQ_REQ_NOWAIT;
372
373 if (e) {
374 data->flags |= BLK_MQ_REQ_INTERNAL;
375
		/*
		 * Flush requests are special and go directly to the
		 * dispatch list. Don't include reserved tags in the
		 * limiting, as it isn't useful.
		 */
381 if (!op_is_flush(data->cmd_flags) &&
382 e->type->ops.limit_depth &&
383 !(data->flags & BLK_MQ_REQ_RESERVED))
384 e->type->ops.limit_depth(data->cmd_flags, data);
385 } else {
386 blk_mq_tag_busy(data->hctx);
387 }
388
389 tag = blk_mq_get_tag(data);
390 if (tag == BLK_MQ_TAG_FAIL) {
391 if (clear_ctx_on_error)
392 data->ctx = NULL;
393 blk_queue_exit(q);
394 return NULL;
395 }
396
397 rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
398 if (!op_is_flush(data->cmd_flags)) {
399 rq->elv.icq = NULL;
400 if (e && e->type->ops.prepare_request) {
401 if (e->type->icq_cache)
402 blk_mq_sched_assign_ioc(rq);
403
404 e->type->ops.prepare_request(rq, bio);
405 rq->rq_flags |= RQF_ELVPRIV;
406 }
407 }
408 data->hctx->queued++;
409 return rq;
410}
411
412struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
413 blk_mq_req_flags_t flags)
414{
415 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
416 struct request *rq;
417 int ret;
418
419 ret = blk_queue_enter(q, flags);
420 if (ret)
421 return ERR_PTR(ret);
422
423 rq = blk_mq_get_request(q, NULL, &alloc_data);
424 blk_queue_exit(q);
425
426 if (!rq)
427 return ERR_PTR(-EWOULDBLOCK);
428
429 rq->__data_len = 0;
430 rq->__sector = (sector_t) -1;
431 rq->bio = rq->biotail = NULL;
432 return rq;
433}
434EXPORT_SYMBOL(blk_mq_alloc_request);
435
436struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
437 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
438{
439 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
440 struct request *rq;
441 unsigned int cpu;
442 int ret;
443
	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context.  No need to complicate the low level
	 * allocator for this for the rare use case of a command tied to
	 * a specific queue.
	 */
450 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
451 return ERR_PTR(-EINVAL);
452
453 if (hctx_idx >= q->nr_hw_queues)
454 return ERR_PTR(-EIO);
455
456 ret = blk_queue_enter(q, flags);
457 if (ret)
458 return ERR_PTR(ret);
459
	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip trying other
	 * mapped hw queues.
	 */
464 alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
465 if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
466 blk_queue_exit(q);
467 return ERR_PTR(-EXDEV);
468 }
469 cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
470 alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
471
472 rq = blk_mq_get_request(q, NULL, &alloc_data);
473 blk_queue_exit(q);
474
475 if (!rq)
476 return ERR_PTR(-EWOULDBLOCK);
477
478 return rq;
479}
480EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
481
482static void __blk_mq_free_request(struct request *rq)
483{
484 struct request_queue *q = rq->q;
485 struct blk_mq_ctx *ctx = rq->mq_ctx;
486 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
487 const int sched_tag = rq->internal_tag;
488
489 blk_pm_mark_last_busy(rq);
490 rq->mq_hctx = NULL;
491 if (rq->tag != -1)
492 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
493 if (sched_tag != -1)
494 blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
495 blk_mq_sched_restart(hctx);
496 blk_queue_exit(q);
497}
498
499void blk_mq_free_request(struct request *rq)
500{
501 struct request_queue *q = rq->q;
502 struct elevator_queue *e = q->elevator;
503 struct blk_mq_ctx *ctx = rq->mq_ctx;
504 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
505
506 if (rq->rq_flags & RQF_ELVPRIV) {
507 if (e && e->type->ops.finish_request)
508 e->type->ops.finish_request(rq);
509 if (rq->elv.icq) {
510 put_io_context(rq->elv.icq->ioc);
511 rq->elv.icq = NULL;
512 }
513 }
514
515 ctx->rq_completed[rq_is_sync(rq)]++;
516 if (rq->rq_flags & RQF_MQ_INFLIGHT)
517 atomic_dec(&hctx->nr_active);
518
519 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
520 laptop_io_completion(q->backing_dev_info);
521
522 rq_qos_done(q, rq);
523
524 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
525 if (refcount_dec_and_test(&rq->ref))
526 __blk_mq_free_request(rq);
527}
528EXPORT_SYMBOL_GPL(blk_mq_free_request);
529
530inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
531{
532 u64 now = 0;
533
534 if (blk_mq_need_time_stamp(rq))
535 now = ktime_get_ns();
536
537 if (rq->rq_flags & RQF_STATS) {
538 blk_mq_poll_stats_start(rq->q);
539 blk_stat_add(rq, now);
540 }
541
542 if (rq->internal_tag != -1)
543 blk_mq_sched_completed_request(rq, now);
544
545 blk_account_io_done(rq, now);
546
547 if (rq->end_io) {
548 rq_qos_done(rq->q, rq);
549 rq->end_io(rq, error);
550 } else {
551 if (unlikely(blk_bidi_rq(rq)))
552 blk_mq_free_request(rq->next_rq);
553 blk_mq_free_request(rq);
554 }
555}
556EXPORT_SYMBOL(__blk_mq_end_request);
557
558void blk_mq_end_request(struct request *rq, blk_status_t error)
559{
560 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
561 BUG();
562 __blk_mq_end_request(rq, error);
563}
564EXPORT_SYMBOL(blk_mq_end_request);
565
566static void __blk_mq_complete_request_remote(void *data)
567{
568 struct request *rq = data;
569 struct request_queue *q = rq->q;
570
571 q->mq_ops->complete(rq);
572}
573
574static void __blk_mq_complete_request(struct request *rq)
575{
576 struct blk_mq_ctx *ctx = rq->mq_ctx;
577 struct request_queue *q = rq->q;
578 bool shared = false;
579 int cpu;
580
581 WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
582
	/*
	 * Most of single queue controllers, there is only one irq vector
	 * for handling IO completion, and the only irq's affinity is set
	 * as all possible CPUs. On most of ARCHs, this affinity means the
	 * irq is handled on one specific CPU.
	 *
	 * So complete IO reqs in softirq context in case of single queue
	 * for not degrading IO performance by irqsoff latency.
	 */
591 if (q->nr_hw_queues == 1) {
592 __blk_complete_request(rq);
593 return;
594 }
595
	/*
	 * For a polled request, always complete locally, it's pointless
	 * to redirect the completion.
	 */
600 if ((rq->cmd_flags & REQ_HIPRI) ||
601 !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
602 q->mq_ops->complete(rq);
603 return;
604 }
605
606 cpu = get_cpu();
607 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
608 shared = cpus_share_cache(cpu, ctx->cpu);
609
610 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
611 rq->csd.func = __blk_mq_complete_request_remote;
612 rq->csd.info = rq;
613 rq->csd.flags = 0;
614 smp_call_function_single_async(ctx->cpu, &rq->csd);
615 } else {
616 q->mq_ops->complete(rq);
617 }
618 put_cpu();
619}
620
621static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
622 __releases(hctx->srcu)
623{
624 if (!(hctx->flags & BLK_MQ_F_BLOCKING))
625 rcu_read_unlock();
626 else
627 srcu_read_unlock(hctx->srcu, srcu_idx);
628}
629
630static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
631 __acquires(hctx->srcu)
632{
633 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		/* shut up gcc false positive */
635 *srcu_idx = 0;
636 rcu_read_lock();
637 } else
638 *srcu_idx = srcu_read_lock(hctx->srcu);
639}
640
/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through a IPI handler.
 **/
649bool blk_mq_complete_request(struct request *rq)
650{
651 if (unlikely(blk_should_fake_timeout(rq->q)))
652 return false;
653 __blk_mq_complete_request(rq);
654 return true;
655}
656EXPORT_SYMBOL(blk_mq_complete_request);
657
658int blk_mq_request_started(struct request *rq)
659{
660 return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
661}
662EXPORT_SYMBOL_GPL(blk_mq_request_started);
663
664int blk_mq_request_completed(struct request *rq)
665{
666 return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
667}
668EXPORT_SYMBOL_GPL(blk_mq_request_completed);
669
670void blk_mq_start_request(struct request *rq)
671{
672 struct request_queue *q = rq->q;
673
674 blk_mq_sched_started_request(rq);
675
676 trace_block_rq_issue(q, rq);
677
678 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
679 rq->io_start_time_ns = ktime_get_ns();
680#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
681 rq->throtl_size = blk_rq_sectors(rq);
682#endif
683 rq->rq_flags |= RQF_STATS;
684 rq_qos_issue(q, rq);
685 }
686
687 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
688
689 blk_add_timer(rq);
690 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
691
692 if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears.  We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
698 rq->nr_phys_segments++;
699 }
700}
701EXPORT_SYMBOL(blk_mq_start_request);
702
703static void __blk_mq_requeue_request(struct request *rq)
704{
705 struct request_queue *q = rq->q;
706
707 blk_mq_put_driver_tag(rq);
708
709 trace_block_rq_requeue(q, rq);
710 rq_qos_requeue(q, rq);
711
712 if (blk_mq_request_started(rq)) {
713 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
714 rq->rq_flags &= ~RQF_TIMED_OUT;
715 if (q->dma_drain_size && blk_rq_bytes(rq))
716 rq->nr_phys_segments--;
717 }
718}
719
720void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
721{
722 __blk_mq_requeue_request(rq);
723
	/* this request will be re-inserted to io scheduler queue */
725 blk_mq_sched_requeue_request(rq);
726
727 BUG_ON(!list_empty(&rq->queuelist));
728 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
729}
730EXPORT_SYMBOL(blk_mq_requeue_request);
731
732static void blk_mq_requeue_work(struct work_struct *work)
733{
734 struct request_queue *q =
735 container_of(work, struct request_queue, requeue_work.work);
736 LIST_HEAD(rq_list);
737 struct request *rq, *next;
738
739 spin_lock_irq(&q->requeue_lock);
740 list_splice_init(&q->requeue_list, &rq_list);
741 spin_unlock_irq(&q->requeue_lock);
742
743 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
744 if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
745 continue;
746
747 rq->rq_flags &= ~RQF_SOFTBARRIER;
748 list_del_init(&rq->queuelist);
		/*
		 * If RQF_DONTPREP, rq has contained some driver specific
		 * data, so insert it to hctx dispatch list to avoid any
		 * merge.
		 */
754 if (rq->rq_flags & RQF_DONTPREP)
755 blk_mq_request_bypass_insert(rq, false, false);
756 else
757 blk_mq_sched_insert_request(rq, true, false, false);
758 }
759
760 while (!list_empty(&rq_list)) {
761 rq = list_entry(rq_list.next, struct request, queuelist);
762 list_del_init(&rq->queuelist);
763 blk_mq_sched_insert_request(rq, false, false, false);
764 }
765
766 blk_mq_run_hw_queues(q, false);
767}
768
769void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
770 bool kick_requeue_list)
771{
772 struct request_queue *q = rq->q;
773 unsigned long flags;
774
	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
	 */
779 BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
780
781 spin_lock_irqsave(&q->requeue_lock, flags);
782 if (at_head) {
783 rq->rq_flags |= RQF_SOFTBARRIER;
784 list_add(&rq->queuelist, &q->requeue_list);
785 } else {
786 list_add_tail(&rq->queuelist, &q->requeue_list);
787 }
788 spin_unlock_irqrestore(&q->requeue_lock, flags);
789
790 if (kick_requeue_list)
791 blk_mq_kick_requeue_list(q);
792}
793
794void blk_mq_kick_requeue_list(struct request_queue *q)
795{
796 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
797}
798EXPORT_SYMBOL(blk_mq_kick_requeue_list);
799
800void blk_mq_delay_kick_requeue_list(struct request_queue *q,
801 unsigned long msecs)
802{
803 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
804 msecs_to_jiffies(msecs));
805}
806EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
807
808struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
809{
810 if (tag < tags->nr_tags) {
811 prefetch(tags->rqs[tag]);
812 return tags->rqs[tag];
813 }
814
815 return NULL;
816}
817EXPORT_SYMBOL(blk_mq_tag_to_rq);
818
819static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
820 void *priv, bool reserved)
821{
	/*
	 * If we find a request that is inflight and the queue matches,
	 * we know the queue is busy. Return false to stop the iteration.
	 */
826 if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
827 bool *busy = priv;
828
829 *busy = true;
830 return false;
831 }
832
833 return true;
834}
835
836bool blk_mq_queue_inflight(struct request_queue *q)
837{
838 bool busy = false;
839
840 blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
841 return busy;
842}
843EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
844
845static void blk_mq_rq_timed_out(struct request *req, bool reserved)
846{
847 req->rq_flags |= RQF_TIMED_OUT;
848 if (req->q->mq_ops->timeout) {
849 enum blk_eh_timer_return ret;
850
851 ret = req->q->mq_ops->timeout(req, reserved);
852 if (ret == BLK_EH_DONE)
853 return;
854 WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
855 }
856
857 blk_add_timer(req);
858}
859
860static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
861{
862 unsigned long deadline;
863
864 if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
865 return false;
866 if (rq->rq_flags & RQF_TIMED_OUT)
867 return false;
868
869 deadline = READ_ONCE(rq->deadline);
870 if (time_after_eq(jiffies, deadline))
871 return true;
872
873 if (*next == 0)
874 *next = deadline;
875 else if (time_after(*next, deadline))
876 *next = deadline;
877 return false;
878}
879
880static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
881 struct request *rq, void *priv, bool reserved)
882{
883 unsigned long *next = priv;
884
	/*
	 * Just do a quick check if it is expired before locking the request in
	 * so we're not unnecessarily synchronizing across CPUs.
	 */
889 if (!blk_mq_req_expired(rq, next))
890 return true;
891
	/*
	 * We have reason to believe the request may be expired. Take a
	 * reference on the request to lock this request lifetime into its
	 * currently allocated context to prevent it from being reallocated in
	 * the event the completion by-passes this timeout handler.
	 *
	 * If the reference was already released, then the driver beat the
	 * timeout handler to posting a natural completion.
	 */
901 if (!refcount_inc_not_zero(&rq->ref))
902 return true;
903
	/*
	 * The request is now locked and cannot be reallocated underneath the
	 * timeout handler's processing. Re-verify this exact request is truly
	 * expired; if it is not expired, then the request was completed and
	 * reallocated as a new request.
	 */
910 if (blk_mq_req_expired(rq, next))
911 blk_mq_rq_timed_out(rq, reserved);
912 if (refcount_dec_and_test(&rq->ref))
913 __blk_mq_free_request(rq);
914
915 return true;
916}
917
918static void blk_mq_timeout_work(struct work_struct *work)
919{
920 struct request_queue *q =
921 container_of(work, struct request_queue, timeout_work);
922 unsigned long next = 0;
923 struct blk_mq_hw_ctx *hctx;
924 int i;
925
	/* A deadlock might occur if a request is stuck requiring a
	 * timeout at the same time a queue freeze is waiting
	 * completion, since the timeout code would not be able to
	 * acquire the queue reference here.
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
939 if (!percpu_ref_tryget(&q->q_usage_counter))
940 return;
941
942 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
943
944 if (next != 0) {
945 mod_timer(&q->timeout, next);
946 } else {
		/*
		 * Request timeouts are handled as a forward rolling timer. If
		 * we end up here it means that no requests are pending and
		 * also that no request has been pending for a while. Mark
		 * each hctx as idle.
		 */
953 queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
955 if (blk_mq_hw_queue_mapped(hctx))
956 blk_mq_tag_idle(hctx);
957 }
958 }
959 blk_queue_exit(q);
960}
961
962struct flush_busy_ctx_data {
963 struct blk_mq_hw_ctx *hctx;
964 struct list_head *list;
965};
966
967static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
968{
969 struct flush_busy_ctx_data *flush_data = data;
970 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
971 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
972 enum hctx_type type = hctx->type;
973
974 spin_lock(&ctx->lock);
975 list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
976 sbitmap_clear_bit(sb, bitnr);
977 spin_unlock(&ctx->lock);
978 return true;
979}
980
/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch
 */
985void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
986{
987 struct flush_busy_ctx_data data = {
988 .hctx = hctx,
989 .list = list,
990 };
991
992 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
993}
994EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
995
996struct dispatch_rq_data {
997 struct blk_mq_hw_ctx *hctx;
998 struct request *rq;
999};
1000
1001static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
1002 void *data)
1003{
1004 struct dispatch_rq_data *dispatch_data = data;
1005 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
1006 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1007 enum hctx_type type = hctx->type;
1008
1009 spin_lock(&ctx->lock);
1010 if (!list_empty(&ctx->rq_lists[type])) {
1011 dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
1012 list_del_init(&dispatch_data->rq->queuelist);
1013 if (list_empty(&ctx->rq_lists[type]))
1014 sbitmap_clear_bit(sb, bitnr);
1015 }
1016 spin_unlock(&ctx->lock);
1017
1018 return !dispatch_data->rq;
1019}
1020
1021struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
1022 struct blk_mq_ctx *start)
1023{
1024 unsigned off = start ? start->index_hw[hctx->type] : 0;
1025 struct dispatch_rq_data data = {
1026 .hctx = hctx,
1027 .rq = NULL,
1028 };
1029
1030 __sbitmap_for_each_set(&hctx->ctx_map, off,
1031 dispatch_rq_from_ctx, &data);
1032
1033 return data.rq;
1034}
1035
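/*
 * Map the number of requests dispatched in one run to a log2-scaled
 * index into hctx->dispatched[] for the dispatch histogram.
 */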
1036static inline unsigned int queued_to_index(unsigned int queued)
1037{
1038 if (!queued)
1039 return 0;
1040
1041 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
1042}
1043
1044bool blk_mq_get_driver_tag(struct request *rq)
1045{
1046 struct blk_mq_alloc_data data = {
1047 .q = rq->q,
1048 .hctx = rq->mq_hctx,
1049 .flags = BLK_MQ_REQ_NOWAIT,
1050 .cmd_flags = rq->cmd_flags,
1051 };
1052 bool shared;
1053
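	/* nothing to do if a driver tag is already assigned */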
1054 if (rq->tag != -1)
1055 goto done;
1056
1057 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
1058 data.flags |= BLK_MQ_REQ_RESERVED;
1059
1060 shared = blk_mq_tag_busy(data.hctx);
1061 rq->tag = blk_mq_get_tag(&data);
1062 if (rq->tag >= 0) {
1063 if (shared) {
1064 rq->rq_flags |= RQF_MQ_INFLIGHT;
1065 atomic_inc(&data.hctx->nr_active);
1066 }
1067 data.hctx->tags->rqs[rq->tag] = rq;
1068 }
1069
1070done:
1071 return rq->tag != -1;
1072}
1073
1074static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1075 int flags, void *key)
1076{
1077 struct blk_mq_hw_ctx *hctx;
1078
1079 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1080
1081 spin_lock(&hctx->dispatch_wait_lock);
1082 if (!list_empty(&wait->entry)) {
1083 struct sbitmap_queue *sbq;
1084
1085 list_del_init(&wait->entry);
1086 sbq = &hctx->tags->bitmap_tags;
1087 atomic_dec(&sbq->ws_active);
1088 }
1089 spin_unlock(&hctx->dispatch_wait_lock);
1090
1091 blk_mq_run_hw_queue(hctx, true);
1092 return 1;
1093}
1094
/*
 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 * the tag wakeups. For non-shared tags, we can simply mark us needing a
 * restart. For both cases, take care to check the condition again after
 * marking us as waiting.
 */
1101static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
1102 struct request *rq)
1103{
1104 struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
1105 struct wait_queue_head *wq;
1106 wait_queue_entry_t *wait;
1107 bool ret;
1108
1109 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1110 blk_mq_sched_mark_restart_hctx(hctx);
1111
		/*
		 * It's possible that a tag was freed in the window between the
		 * allocation failure and adding the hardware queue to the wait
		 * queue.
		 *
		 * Don't clear RESTART here, someone else could have set it.
		 * At most this will cost an extra queue run.
		 */
1120 return blk_mq_get_driver_tag(rq);
1121 }
1122
1123 wait = &hctx->dispatch_wait;
1124 if (!list_empty_careful(&wait->entry))
1125 return false;
1126
1127 wq = &bt_wait_ptr(sbq, hctx)->wait;
1128
1129 spin_lock_irq(&wq->lock);
1130 spin_lock(&hctx->dispatch_wait_lock);
1131 if (!list_empty(&wait->entry)) {
1132 spin_unlock(&hctx->dispatch_wait_lock);
1133 spin_unlock_irq(&wq->lock);
1134 return false;
1135 }
1136
1137 atomic_inc(&sbq->ws_active);
1138 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
1139 __add_wait_queue(wq, wait);
1140
	/*
	 * It's possible that a tag was freed in the window between the
	 * allocation failure and adding the hardware queue to the wait
	 * queue.
	 */
1146 ret = blk_mq_get_driver_tag(rq);
1147 if (!ret) {
1148 spin_unlock(&hctx->dispatch_wait_lock);
1149 spin_unlock_irq(&wq->lock);
1150 return false;
1151 }
1152
	/*
	 * We got a tag, remove ourselves from the wait queue to ensure
	 * someone else gets the wakeup.
	 */
1157 list_del_init(&wait->entry);
1158 atomic_dec(&sbq->ws_active);
1159 spin_unlock(&hctx->dispatch_wait_lock);
1160 spin_unlock_irq(&wq->lock);
1161
1162 return true;
1163}
1164
1165#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
1166#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
1167
/*
 * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
 * - EWMA is one simple way to compute running average value
 * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
 * - take 4 * dispatch_busy as new dispatch_busy value exactly matching
 *   factor doesn't matter because EWMA increases exponentially
 */
1174static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
1175{
1176 unsigned int ewma;
1177
1178 if (hctx->queue->elevator)
1179 return;
1180
1181 ewma = hctx->dispatch_busy;
1182
1183 if (!ewma && !busy)
1184 return;
1185
1186 ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
1187 if (busy)
1188 ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
1189 ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
1190
1191 hctx->dispatch_busy = ewma;
1192}
1193
1194#define BLK_MQ_RESOURCE_DELAY 3
1195
/*
 * Returns true if we did some work AND can potentially do more.
 */
1199bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1200 bool got_budget)
1201{
1202 struct blk_mq_hw_ctx *hctx;
1203 struct request *rq, *nxt;
1204 bool no_tag = false;
1205 int errors, queued;
1206 blk_status_t ret = BLK_STS_OK;
1207
1208 if (list_empty(list))
1209 return false;
1210
1211 WARN_ON(!list_is_singular(list) && got_budget);
1212
	/*
	 * Now process all the entries, sending them to the driver.
	 */
1216 errors = queued = 0;
1217 do {
1218 struct blk_mq_queue_data bd;
1219
1220 rq = list_first_entry(list, struct request, queuelist);
1221
1222 hctx = rq->mq_hctx;
1223 if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1224 break;
1225
1226 if (!blk_mq_get_driver_tag(rq)) {
			/*
			 * The initial allocation attempt failed, so we need to
			 * rerun the hardware queue when a tag is freed. The
			 * waitqueue takes care of that. If the queue is run
			 * before we add this entry back on the dispatch list,
			 * we'll re-run it below.
			 */
1234 if (!blk_mq_mark_tag_wait(hctx, rq)) {
1235 blk_mq_put_dispatch_budget(hctx);
				/*
				 * For non-shared tags, the RESTART check
				 * will suffice.
				 */
1240 if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1241 no_tag = true;
1242 break;
1243 }
1244 }
1245
1246 list_del_init(&rq->queuelist);
1247
1248 bd.rq = rq;
1249
		/*
		 * Flag last if we have no more requests, or if we have more
		 * but can't assign a driver tag to it.
		 */
1254 if (list_empty(list))
1255 bd.last = true;
1256 else {
1257 nxt = list_first_entry(list, struct request, queuelist);
1258 bd.last = !blk_mq_get_driver_tag(nxt);
1259 }
1260
1261 ret = q->mq_ops->queue_rq(hctx, &bd);
1262 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
			/*
			 * If an I/O scheduler has been configured and we got a
			 * driver tag for the next request already, free it
			 * again.
			 */
1268 if (!list_empty(list)) {
1269 nxt = list_first_entry(list, struct request, queuelist);
1270 blk_mq_put_driver_tag(nxt);
1271 }
1272 list_add(&rq->queuelist, list);
1273 __blk_mq_requeue_request(rq);
1274 break;
1275 }
1276
1277 if (unlikely(ret != BLK_STS_OK)) {
1278 errors++;
1279 blk_mq_end_request(rq, BLK_STS_IOERR);
1280 continue;
1281 }
1282
1283 queued++;
1284 } while (!list_empty(list));
1285
1286 hctx->dispatched[queued_to_index(queued)]++;
1287
	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
1292 if (!list_empty(list)) {
1293 bool needs_restart;
1294
		/*
		 * If we didn't flush the entire list, we could have told
		 * the driver there was more coming, but that turned out to
		 * be a lie.
		 */
1300 if (q->mq_ops->commit_rqs)
1301 q->mq_ops->commit_rqs(hctx);
1302
1303 spin_lock(&hctx->lock);
1304 list_splice_tail_init(list, &hctx->dispatch);
1305 spin_unlock(&hctx->lock);
1306
		/*
		 * If SCHED_RESTART was set by the caller of this function and
		 * it is no longer set that means that it was cleared by another
		 * thread and hence that a queue rerun is needed.
		 *
		 * If 'no_tag' is set, that means that we failed getting
		 * a driver tag with an I/O scheduler attached. If our dispatch
		 * waitqueue is no longer active, ensure that we run the queue
		 * AFTER adding our entries back to the list.
		 *
		 * If no I/O scheduler has been configured it is possible that
		 * the hardware queue got stopped and restarted before requests
		 * were pushed back onto the dispatch list. Rerun the queue to
		 * avoid starvation. Notes:
		 * - blk_mq_run_hw_queue() checks whether or not a queue has
		 *   been stopped before rerunning a queue.
		 * - Some but not all block drivers stop a queue before
		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
		 *   and dm-rq.
		 *
		 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
		 * bit is set, run queue after a delay to avoid IO stalls
		 * that could otherwise occur if the queue is idle.
		 */
1331 needs_restart = blk_mq_sched_needs_restart(hctx);
1332 if (!needs_restart ||
1333 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
1334 blk_mq_run_hw_queue(hctx, true);
1335 else if (needs_restart && (ret == BLK_STS_RESOURCE))
1336 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
1337
1338 blk_mq_update_dispatch_busy(hctx, true);
1339 return false;
1340 } else
1341 blk_mq_update_dispatch_busy(hctx, false);
1342
	/*
	 * If the host/device is unable to accept more work, inform the
	 * caller of that.
	 */
1347 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1348 return false;
1349
1350 return (queued + errors) != 0;
1351}
1352
1353static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1354{
1355 int srcu_idx;
1356
	/*
	 * We should be running this queue from one of the CPUs that
	 * are mapped to it.
	 *
	 * There are at least two related races now between setting
	 * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
	 * __blk_mq_run_hw_queue():
	 *
	 * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
	 *   but later it becomes online, then this warning is harmless
	 *   at all
	 *
	 * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
	 *   but later it becomes offline, then the warning can't be
	 *   triggered, and we depend on blk-mq timeout handler to
	 *   handle dispatched requests to this hctx
	 */
1374 if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1375 cpu_online(hctx->next_cpu)) {
1376 printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
1377 raw_smp_processor_id(),
1378 cpumask_empty(hctx->cpumask) ? "inactive": "active");
1379 dump_stack();
1380 }
1381
	/*
	 * We can't run the queue inline with ints disabled. Ensure that
	 * we catch bad users of this early.
	 */
1386 WARN_ON_ONCE(in_interrupt());
1387
1388 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1389
1390 hctx_lock(hctx, &srcu_idx);
1391 blk_mq_sched_dispatch_requests(hctx);
1392 hctx_unlock(hctx, srcu_idx);
1393}
1394
1395static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
1396{
1397 int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
1398
1399 if (cpu >= nr_cpu_ids)
1400 cpu = cpumask_first(hctx->cpumask);
1401 return cpu;
1402}
1403
/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 */
1410static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1411{
1412 bool tried = false;
1413 int next_cpu = hctx->next_cpu;
1414
1415 if (hctx->queue->nr_hw_queues == 1)
1416 return WORK_CPU_UNBOUND;
1417
1418 if (--hctx->next_cpu_batch <= 0) {
1419select_cpu:
1420 next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
1421 cpu_online_mask);
1422 if (next_cpu >= nr_cpu_ids)
1423 next_cpu = blk_mq_first_mapped_cpu(hctx);
1424 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1425 }
1426
	/*
	 * Do unbound schedule if we can't find a online CPU for this hctx,
	 * and it should only happen in the path of handling CPU DEAD.
	 */
1431 if (!cpu_online(next_cpu)) {
1432 if (!tried) {
1433 tried = true;
1434 goto select_cpu;
1435 }
1436
		/*
		 * Make sure to re-select CPU next time once after CPUs
		 * in hctx->cpumask become online again.
		 */
1441 hctx->next_cpu = next_cpu;
1442 hctx->next_cpu_batch = 1;
1443 return WORK_CPU_UNBOUND;
1444 }
1445
1446 hctx->next_cpu = next_cpu;
1447 return next_cpu;
1448}
1449
1450static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1451 unsigned long msecs)
1452{
1453 if (unlikely(blk_mq_hctx_stopped(hctx)))
1454 return;
1455
1456 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1457 int cpu = get_cpu();
1458 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
1459 __blk_mq_run_hw_queue(hctx);
1460 put_cpu();
1461 return;
1462 }
1463
1464 put_cpu();
1465 }
1466
1467 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
1468 msecs_to_jiffies(msecs));
1469}
1470
1471void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1472{
1473 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
1474}
1475EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1476
1477bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1478{
1479 int srcu_idx;
1480 bool need_run;
1481
	/*
	 * When queue is quiesced, we may be switching io scheduler, or
	 * updating nr_hw_queues, or other things, and we can't run queue
	 * any more, even __blk_mq_hctx_has_pending() can't be called safely.
	 *
	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
	 * quiesced.
	 */
1490 hctx_lock(hctx, &srcu_idx);
1491 need_run = !blk_queue_quiesced(hctx->queue) &&
1492 blk_mq_hctx_has_pending(hctx);
1493 hctx_unlock(hctx, srcu_idx);
1494
1495 if (need_run) {
1496 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1497 return true;
1498 }
1499
1500 return false;
1501}
1502EXPORT_SYMBOL(blk_mq_run_hw_queue);
1503
1504void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1505{
1506 struct blk_mq_hw_ctx *hctx;
1507 int i;
1508
1509 queue_for_each_hw_ctx(q, hctx, i) {
1510 if (blk_mq_hctx_stopped(hctx))
1511 continue;
1512
1513 blk_mq_run_hw_queue(hctx, async);
1514 }
1515}
1516EXPORT_SYMBOL(blk_mq_run_hw_queues);
1517
/**
 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
 * @q: request queue.
 *
 * The caller is responsible for serializing this function against
 * blk_mq_{start,stop}_hw_queue().
 */
1525bool blk_mq_queue_stopped(struct request_queue *q)
1526{
1527 struct blk_mq_hw_ctx *hctx;
1528 int i;
1529
1530 queue_for_each_hw_ctx(q, hctx, i)
1531 if (blk_mq_hctx_stopped(hctx))
1532 return true;
1533
1534 return false;
1535}
1536EXPORT_SYMBOL(blk_mq_queue_stopped);
1537
/*
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
1547void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1548{
1549 cancel_delayed_work(&hctx->run_work);
1550
1551 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1552}
1553EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1554
/*
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queues() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
1564void blk_mq_stop_hw_queues(struct request_queue *q)
1565{
1566 struct blk_mq_hw_ctx *hctx;
1567 int i;
1568
1569 queue_for_each_hw_ctx(q, hctx, i)
1570 blk_mq_stop_hw_queue(hctx);
1571}
1572EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1573
1574void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1575{
1576 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1577
1578 blk_mq_run_hw_queue(hctx, false);
1579}
1580EXPORT_SYMBOL(blk_mq_start_hw_queue);
1581
1582void blk_mq_start_hw_queues(struct request_queue *q)
1583{
1584 struct blk_mq_hw_ctx *hctx;
1585 int i;
1586
1587 queue_for_each_hw_ctx(q, hctx, i)
1588 blk_mq_start_hw_queue(hctx);
1589}
1590EXPORT_SYMBOL(blk_mq_start_hw_queues);
1591
1592void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1593{
1594 if (!blk_mq_hctx_stopped(hctx))
1595 return;
1596
1597 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1598 blk_mq_run_hw_queue(hctx, async);
1599}
1600EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
1601
1602void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1603{
1604 struct blk_mq_hw_ctx *hctx;
1605 int i;
1606
1607 queue_for_each_hw_ctx(q, hctx, i)
1608 blk_mq_start_stopped_hw_queue(hctx, async);
1609}
1610EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1611
1612static void blk_mq_run_work_fn(struct work_struct *work)
1613{
1614 struct blk_mq_hw_ctx *hctx;
1615
1616 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1617
	/*
	 * If we are stopped, don't run the queue.
	 */
1621 if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1622 return;
1623
1624 __blk_mq_run_hw_queue(hctx);
1625}
1626
1627static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1628 struct request *rq,
1629 bool at_head)
1630{
1631 struct blk_mq_ctx *ctx = rq->mq_ctx;
1632 enum hctx_type type = hctx->type;
1633
1634 lockdep_assert_held(&ctx->lock);
1635
1636 trace_block_rq_insert(hctx->queue, rq);
1637
1638 if (at_head)
1639 list_add(&rq->queuelist, &ctx->rq_lists[type]);
1640 else
1641 list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
1642}
1643
1644void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1645 bool at_head)
1646{
1647 struct blk_mq_ctx *ctx = rq->mq_ctx;
1648
1649 lockdep_assert_held(&ctx->lock);
1650
1651 __blk_mq_insert_req_list(hctx, rq, at_head);
1652 blk_mq_hctx_mark_pending(hctx, ctx);
1653}
1654
/*
 * Should only be used carefully, when the caller knows we want to
 * bypass a potential IO scheduler on the target device.
 */
1659void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1660 bool run_queue)
1661{
1662 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1663
1664 spin_lock(&hctx->lock);
1665 if (at_head)
1666 list_add(&rq->queuelist, &hctx->dispatch);
1667 else
1668 list_add_tail(&rq->queuelist, &hctx->dispatch);
1669 spin_unlock(&hctx->lock);
1670
1671 if (run_queue)
1672 blk_mq_run_hw_queue(hctx, false);
1673}
1674
1675void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1676 struct list_head *list)
1677
1678{
1679 struct request *rq;
1680 enum hctx_type type = hctx->type;
1681
	/*
	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
	 * offline now
	 */
1686 list_for_each_entry(rq, list, queuelist) {
1687 BUG_ON(rq->mq_ctx != ctx);
1688 trace_block_rq_insert(hctx->queue, rq);
1689 }
1690
1691 spin_lock(&ctx->lock);
1692 list_splice_tail_init(list, &ctx->rq_lists[type]);
1693 blk_mq_hctx_mark_pending(hctx, ctx);
1694 spin_unlock(&ctx->lock);
1695}
1696
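/*
 * Order plugged requests by software queue, then hardware queue, then
 * sector, so that requests for the same queue end up next to each other.
 */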
1697static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
1698{
1699 struct request *rqa = container_of(a, struct request, queuelist);
1700 struct request *rqb = container_of(b, struct request, queuelist);
1701
1702 if (rqa->mq_ctx < rqb->mq_ctx)
1703 return -1;
1704 else if (rqa->mq_ctx > rqb->mq_ctx)
1705 return 1;
1706 else if (rqa->mq_hctx < rqb->mq_hctx)
1707 return -1;
1708 else if (rqa->mq_hctx > rqb->mq_hctx)
1709 return 1;
1710
1711 return blk_rq_pos(rqa) > blk_rq_pos(rqb);
1712}
1713
1714void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1715{
1716 struct blk_mq_hw_ctx *this_hctx;
1717 struct blk_mq_ctx *this_ctx;
1718 struct request_queue *this_q;
1719 struct request *rq;
1720 LIST_HEAD(list);
1721 LIST_HEAD(rq_list);
1722 unsigned int depth;
1723
1724 list_splice_init(&plug->mq_list, &list);
1725
1726 if (plug->rq_count > 2 && plug->multiple_queues)
1727 list_sort(NULL, &list, plug_rq_cmp);
1728
1729 plug->rq_count = 0;
1730
1731 this_q = NULL;
1732 this_hctx = NULL;
1733 this_ctx = NULL;
1734 depth = 0;
1735
1736 while (!list_empty(&list)) {
1737 rq = list_entry_rq(list.next);
1738 list_del_init(&rq->queuelist);
1739 BUG_ON(!rq->q);
1740 if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) {
1741 if (this_hctx) {
1742 trace_block_unplug(this_q, depth, !from_schedule);
1743 blk_mq_sched_insert_requests(this_hctx, this_ctx,
1744 &rq_list,
1745 from_schedule);
1746 }
1747
1748 this_q = rq->q;
1749 this_ctx = rq->mq_ctx;
1750 this_hctx = rq->mq_hctx;
1751 depth = 0;
1752 }
1753
1754 depth++;
1755 list_add_tail(&rq->queuelist, &rq_list);
1756 }
1757
	/*
	 * If 'this_hctx' is set, we know we have entries to complete
	 * on 'rq_list'. Do those.
	 */
1762 if (this_hctx) {
1763 trace_block_unplug(this_q, depth, !from_schedule);
1764 blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
1765 from_schedule);
1766 }
1767}
1768
1769static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1770{
1771 if (bio->bi_opf & REQ_RAHEAD)
1772 rq->cmd_flags |= REQ_FAILFAST_MASK;
1773
1774 rq->__sector = bio->bi_iter.bi_sector;
1775 rq->write_hint = bio->bi_write_hint;
1776 blk_rq_bio_prep(rq->q, rq, bio);
1777
1778 blk_account_io_start(rq, true);
1779}
1780
1781static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1782 struct request *rq,
1783 blk_qc_t *cookie, bool last)
1784{
1785 struct request_queue *q = rq->q;
1786 struct blk_mq_queue_data bd = {
1787 .rq = rq,
1788 .last = last,
1789 };
1790 blk_qc_t new_cookie;
1791 blk_status_t ret;
1792
1793 new_cookie = request_to_qc_t(hctx, rq);
1794
	/*
	 * For OK queue, we are done. For error, caller may kill it.
	 * Any other error (busy), just add it to our list as we
	 * previously would have done.
	 */
1800 ret = q->mq_ops->queue_rq(hctx, &bd);
1801 switch (ret) {
1802 case BLK_STS_OK:
1803 blk_mq_update_dispatch_busy(hctx, false);
1804 *cookie = new_cookie;
1805 break;
1806 case BLK_STS_RESOURCE:
1807 case BLK_STS_DEV_RESOURCE:
1808 blk_mq_update_dispatch_busy(hctx, true);
1809 __blk_mq_requeue_request(rq);
1810 break;
1811 default:
1812 blk_mq_update_dispatch_busy(hctx, false);
1813 *cookie = BLK_QC_T_NONE;
1814 break;
1815 }
1816
1817 return ret;
1818}
1819
1820static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1821 struct request *rq,
1822 blk_qc_t *cookie,
1823 bool bypass_insert, bool last)
1824{
1825 struct request_queue *q = rq->q;
1826 bool run_queue = true;
1827
	/*
	 * RCU or SRCU read lock is needed before checking quiesced flag.
	 *
	 * When queue is stopped or quiesced, ignore 'bypass_insert' from
	 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
	 * and avoid driver to try to dispatch again.
	 */
1835 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
1836 run_queue = false;
1837 bypass_insert = false;
1838 goto insert;
1839 }
1840
1841 if (q->elevator && !bypass_insert)
1842 goto insert;
1843
1844 if (!blk_mq_get_dispatch_budget(hctx))
1845 goto insert;
1846
1847 if (!blk_mq_get_driver_tag(rq)) {
1848 blk_mq_put_dispatch_budget(hctx);
1849 goto insert;
1850 }
1851
1852 return __blk_mq_issue_directly(hctx, rq, cookie, last);
1853insert:
1854 if (bypass_insert)
1855 return BLK_STS_RESOURCE;
1856
1857 blk_mq_request_bypass_insert(rq, false, run_queue);
1858 return BLK_STS_OK;
1859}
1860
1861static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1862 struct request *rq, blk_qc_t *cookie)
1863{
1864 blk_status_t ret;
1865 int srcu_idx;
1866
1867 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1868
1869 hctx_lock(hctx, &srcu_idx);
1870
1871 ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
1872 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1873 blk_mq_request_bypass_insert(rq, false, true);
1874 else if (ret != BLK_STS_OK)
1875 blk_mq_end_request(rq, ret);
1876
1877 hctx_unlock(hctx, srcu_idx);
1878}
1879
1880blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
1881{
1882 blk_status_t ret;
1883 int srcu_idx;
1884 blk_qc_t unused_cookie;
1885 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1886
1887 hctx_lock(hctx, &srcu_idx);
1888 ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
1889 hctx_unlock(hctx, srcu_idx);
1890
1891 return ret;
1892}
1893
1894void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
1895 struct list_head *list)
1896{
1897 while (!list_empty(list)) {
1898 blk_status_t ret;
1899 struct request *rq = list_first_entry(list, struct request,
1900 queuelist);
1901
1902 list_del_init(&rq->queuelist);
1903 ret = blk_mq_request_issue_directly(rq, list_empty(list));
1904 if (ret != BLK_STS_OK) {
1905 if (ret == BLK_STS_RESOURCE ||
1906 ret == BLK_STS_DEV_RESOURCE) {
1907 blk_mq_request_bypass_insert(rq, false,
1908 list_empty(list));
1909 break;
1910 }
1911 blk_mq_end_request(rq, ret);
1912 }
1913 }
1914
	/*
	 * If we didn't flush the entire list, we could have told
	 * the driver there was more coming, but that turned out to
	 * be a lie.
	 */
1920 if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs)
1921 hctx->queue->mq_ops->commit_rqs(hctx);
1922}
1923
1924static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
1925{
1926 list_add_tail(&rq->queuelist, &plug->mq_list);
1927 plug->rq_count++;
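	/*
	 * Flag the plug as spanning multiple queues once a request for a
	 * different request_queue than the first plugged one is added.
	 */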
1928 if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
1929 struct request *tmp;
1930
1931 tmp = list_first_entry(&plug->mq_list, struct request,
1932 queuelist);
1933 if (tmp->q != rq->q)
1934 plug->multiple_queues = true;
1935 }
1936}
1937
1938static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1939{
1940 const int is_sync = op_is_sync(bio->bi_opf);
1941 const int is_flush_fua = op_is_flush(bio->bi_opf);
1942 struct blk_mq_alloc_data data = { .flags = 0};
1943 struct request *rq;
1944 struct blk_plug *plug;
1945 struct request *same_queue_rq = NULL;
1946 blk_qc_t cookie;
1947
1948 blk_queue_bounce(q, &bio);
1949
1950 blk_queue_split(q, &bio);
1951
1952 if (!bio_integrity_prep(bio))
1953 return BLK_QC_T_NONE;
1954
1955 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1956 blk_attempt_plug_merge(q, bio, &same_queue_rq))
1957 return BLK_QC_T_NONE;
1958
1959 if (blk_mq_sched_bio_merge(q, bio))
1960 return BLK_QC_T_NONE;
1961
1962 rq_qos_throttle(q, bio);
1963
1964 data.cmd_flags = bio->bi_opf;
1965 rq = blk_mq_get_request(q, bio, &data);
1966 if (unlikely(!rq)) {
1967 rq_qos_cleanup(q, bio);
1968 if (bio->bi_opf & REQ_NOWAIT)
1969 bio_wouldblock_error(bio);
1970 return BLK_QC_T_NONE;
1971 }
1972
1973 trace_block_getrq(q, bio, bio->bi_opf);
1974
1975 rq_qos_track(q, rq, bio);
1976
1977 cookie = request_to_qc_t(data.hctx, rq);
1978
1979 blk_mq_bio_to_request(rq, bio);
1980
1981 plug = blk_mq_plug(q, bio);
1982 if (unlikely(is_flush_fua)) {
		/* bypass scheduler for flush rq */
1984 blk_insert_flush(rq);
1985 blk_mq_run_hw_queue(data.hctx, true);
1986 } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
1987 !blk_queue_nonrot(q))) {
		/*
		 * Use plugging if we have a ->commit_rqs() hook as well, as
		 * we know the driver uses bd->last in a smart fashion.
		 *
		 * Use normal plugging if this disk is slow HDD, as sequential
		 * IO may benefit a lot from plug merging.
		 */
1995 unsigned int request_count = plug->rq_count;
1996 struct request *last = NULL;
1997
1998 if (!request_count)
1999 trace_block_plug(q);
2000 else
2001 last = list_entry_rq(plug->mq_list.prev);
2002
2003 if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2004 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
2005 blk_flush_plug_list(plug, false);
2006 trace_block_plug(q);
2007 }
2008
2009 blk_add_rq_to_plug(plug, rq);
2010 } else if (q->elevator) {
2011 blk_mq_sched_insert_request(rq, false, true, true);
2012 } else if (plug && !blk_queue_nomerges(q)) {
		/*
		 * We do limited plugging. If the bio can be merged, do that.
		 * Otherwise the existing request in the plug list will be
		 * issued. So the plug list will have one request at most
		 * The plug list might get flushed before this. If that happens,
		 * the plug list is empty, and same_queue_rq is invalid.
		 */
2020 if (list_empty(&plug->mq_list))
2021 same_queue_rq = NULL;
2022 if (same_queue_rq) {
2023 list_del_init(&same_queue_rq->queuelist);
2024 plug->rq_count--;
2025 }
2026 blk_add_rq_to_plug(plug, rq);
2027 trace_block_plug(q);
2028
2029 if (same_queue_rq) {
2030 data.hctx = same_queue_rq->mq_hctx;
2031 trace_block_unplug(q, 1, true);
2032 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
2033 &cookie);
2034 }
2035 } else if ((q->nr_hw_queues > 1 && is_sync) ||
2036 !data.hctx->dispatch_busy) {
2037 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
2038 } else {
2039 blk_mq_sched_insert_request(rq, false, true, true);
2040 }
2041
2042 return cookie;
2043}
2044
2045void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2046 unsigned int hctx_idx)
2047{
2048 struct page *page;
2049
2050 if (tags->rqs && set->ops->exit_request) {
2051 int i;
2052
2053 for (i = 0; i < tags->nr_tags; i++) {
2054 struct request *rq = tags->static_rqs[i];
2055
2056 if (!rq)
2057 continue;
2058 set->ops->exit_request(set, rq, hctx_idx);
2059 tags->static_rqs[i] = NULL;
2060 }
2061 }
2062
2063 while (!list_empty(&tags->page_list)) {
2064 page = list_first_entry(&tags->page_list, struct page, lru);
2065 list_del_init(&page->lru);
2066
		/*
		 * Remove kmemleak object previously allocated in
		 * blk_mq_alloc_rqs().
		 */
2070 kmemleak_free(page_address(page));
2071 __free_pages(page, page->private);
2072 }
2073}
2074
2075void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2076{
2077 kfree(tags->rqs);
2078 tags->rqs = NULL;
2079 kfree(tags->static_rqs);
2080 tags->static_rqs = NULL;
2081
2082 blk_mq_free_tags(tags);
2083}
2084
2085struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
2086 unsigned int hctx_idx,
2087 unsigned int nr_tags,
2088 unsigned int reserved_tags)
2089{
2090 struct blk_mq_tags *tags;
2091 int node;
2092
2093 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
2094 if (node == NUMA_NO_NODE)
2095 node = set->numa_node;
2096
2097 tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2098 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2099 if (!tags)
2100 return NULL;
2101
2102 tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
2103 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2104 node);
2105 if (!tags->rqs) {
2106 blk_mq_free_tags(tags);
2107 return NULL;
2108 }
2109
2110 tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
2111 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2112 node);
2113 if (!tags->static_rqs) {
2114 kfree(tags->rqs);
2115 blk_mq_free_tags(tags);
2116 return NULL;
2117 }
2118
2119 return tags;
2120}
2121
2122static size_t order_to_size(unsigned int order)
2123{
2124 return (size_t)PAGE_SIZE << order;
2125}
2126
2127static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
2128 unsigned int hctx_idx, int node)
2129{
2130 int ret;
2131
2132 if (set->ops->init_request) {
2133 ret = set->ops->init_request(set, rq, hctx_idx, node);
2134 if (ret)
2135 return ret;
2136 }
2137
2138 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
2139 return 0;
2140}
2141
2142int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2143 unsigned int hctx_idx, unsigned int depth)
2144{
2145 unsigned int i, j, entries_per_page, max_order = 4;
2146 size_t rq_size, left;
2147 int node;
2148
2149 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
2150 if (node == NUMA_NO_NODE)
2151 node = set->numa_node;
2152
2153 INIT_LIST_HEAD(&tags->page_list);
2154
	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
2159 rq_size = round_up(sizeof(struct request) + set->cmd_size,
2160 cache_line_size());
2161 left = rq_size * depth;
2162
2163 for (i = 0; i < depth; ) {
2164 int this_order = max_order;
2165 struct page *page;
2166 int to_do;
2167 void *p;
2168
2169 while (this_order && left < order_to_size(this_order - 1))
2170 this_order--;
2171
2172 do {
2173 page = alloc_pages_node(node,
2174 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
2175 this_order);
2176 if (page)
2177 break;
2178 if (!this_order--)
2179 break;
2180 if (order_to_size(this_order) < rq_size)
2181 break;
2182 } while (1);
2183
2184 if (!page)
2185 goto fail;
2186
2187 page->private = this_order;
2188 list_add_tail(&page->lru, &tags->page_list);
2189
2190 p = page_address(page);
2191
		/*
		 * Allow kmemleak to scan these pages as they contain pointers
		 * to additional allocations like via ops->init_request().
		 */
2195 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
2196 entries_per_page = order_to_size(this_order) / rq_size;
2197 to_do = min(entries_per_page, depth - i);
2198 left -= to_do * rq_size;
2199 for (j = 0; j < to_do; j++) {
2200 struct request *rq = p;
2201
2202 tags->static_rqs[i] = rq;
2203 if (blk_mq_init_request(set, rq, hctx_idx, node)) {
2204 tags->static_rqs[i] = NULL;
2205 goto fail;
2206 }
2207
2208 p += rq_size;
2209 i++;
2210 }
2211 }
2212 return 0;
2213
2214fail:
2215 blk_mq_free_rqs(set, tags, hctx_idx);
2216 return -ENOMEM;
2217}
2218
/*
 * 'cpu' is going away. splice any existing rq_list entries from this
 * software queue to the hw queue dispatch list, and ensure that it
 * gets run.
 */
2224static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
2225{
2226 struct blk_mq_hw_ctx *hctx;
2227 struct blk_mq_ctx *ctx;
2228 LIST_HEAD(tmp);
2229 enum hctx_type type;
2230
2231 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2232 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2233 type = hctx->type;
2234
2235 spin_lock(&ctx->lock);
2236 if (!list_empty(&ctx->rq_lists[type])) {
2237 list_splice_init(&ctx->rq_lists[type], &tmp);
2238 blk_mq_hctx_clear_pending(hctx, ctx);
2239 }
2240 spin_unlock(&ctx->lock);
2241
2242 if (list_empty(&tmp))
2243 return 0;
2244
2245 spin_lock(&hctx->lock);
2246 list_splice_tail_init(&tmp, &hctx->dispatch);
2247 spin_unlock(&hctx->lock);
2248
2249 blk_mq_run_hw_queue(hctx, true);
2250 return 0;
2251}
2252
2253static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
2254{
2255 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
2256 &hctx->cpuhp_dead);
2257}
2258
/* hctx->ctxs will be freed in queue's release handler */
2260static void blk_mq_exit_hctx(struct request_queue *q,
2261 struct blk_mq_tag_set *set,
2262 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
2263{
2264 if (blk_mq_hw_queue_mapped(hctx))
2265 blk_mq_tag_idle(hctx);
2266
2267 if (set->ops->exit_request)
2268 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2269
2270 if (set->ops->exit_hctx)
2271 set->ops->exit_hctx(hctx, hctx_idx);
2272
2273 blk_mq_remove_cpuhp(hctx);
2274
2275 spin_lock(&q->unused_hctx_lock);
2276 list_add(&hctx->hctx_list, &q->unused_hctx_list);
2277 spin_unlock(&q->unused_hctx_lock);
2278}
2279
2280static void blk_mq_exit_hw_queues(struct request_queue *q,
2281 struct blk_mq_tag_set *set, int nr_queue)
2282{
2283 struct blk_mq_hw_ctx *hctx;
2284 unsigned int i;
2285
2286 queue_for_each_hw_ctx(q, hctx, i) {
2287 if (i == nr_queue)
2288 break;
2289 blk_mq_debugfs_unregister_hctx(hctx);
2290 blk_mq_exit_hctx(q, set, hctx, i);
2291 }
2292}
2293
2294static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2295{
2296 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2297
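	/*
	 * srcu is a variable-length trailing member, so it must remain the
	 * last field of struct blk_mq_hw_ctx; space for it is only added
	 * when BLK_MQ_F_BLOCKING is set.
	 */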
2298 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2299 __alignof__(struct blk_mq_hw_ctx)) !=
2300 sizeof(struct blk_mq_hw_ctx));
2301
2302 if (tag_set->flags & BLK_MQ_F_BLOCKING)
2303 hw_ctx_size += sizeof(struct srcu_struct);
2304
2305 return hw_ctx_size;
2306}
2307
2308static int blk_mq_init_hctx(struct request_queue *q,
2309 struct blk_mq_tag_set *set,
2310 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
2311{
2312 hctx->queue_num = hctx_idx;
2313
2314 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2315
2316 hctx->tags = set->tags[hctx_idx];
2317
2318 if (set->ops->init_hctx &&
2319 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2320 goto unregister_cpu_notifier;
2321
2322 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2323 hctx->numa_node))
2324 goto exit_hctx;
2325 return 0;
2326
2327 exit_hctx:
2328 if (set->ops->exit_hctx)
2329 set->ops->exit_hctx(hctx, hctx_idx);
2330 unregister_cpu_notifier:
2331 blk_mq_remove_cpuhp(hctx);
2332 return -1;
2333}
2334
2335static struct blk_mq_hw_ctx *
2336blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2337 int node)
2338{
2339 struct blk_mq_hw_ctx *hctx;
2340 gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2341
2342 hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2343 if (!hctx)
2344 goto fail_alloc_hctx;
2345
2346 if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2347 goto free_hctx;
2348
2349 atomic_set(&hctx->nr_active, 0);
2350 if (node == NUMA_NO_NODE)
2351 node = set->numa_node;
2352 hctx->numa_node = node;
2353
2354 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
2355 spin_lock_init(&hctx->lock);
2356 INIT_LIST_HEAD(&hctx->dispatch);
2357 hctx->queue = q;
2358 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2359
2360 INIT_LIST_HEAD(&hctx->hctx_list);
2361
	/*
	 * Allocate space for all possible cpus to avoid allocation at
	 * runtime
	 */
2366 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2367 gfp, node);
2368 if (!hctx->ctxs)
2369 goto free_cpumask;
2370
2371 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2372 gfp, node))
2373 goto free_ctxs;
2374 hctx->nr_ctx = 0;
2375
2376 spin_lock_init(&hctx->dispatch_wait_lock);
2377 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2378 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2379
2380 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2381 gfp);
2382 if (!hctx->fq)
2383 goto free_bitmap;
2384
2385 if (hctx->flags & BLK_MQ_F_BLOCKING)
2386 init_srcu_struct(hctx->srcu);
2387 blk_mq_hctx_kobj_init(hctx);
2388
2389 return hctx;
2390
2391 free_bitmap:
2392 sbitmap_free(&hctx->ctx_map);
2393 free_ctxs:
2394 kfree(hctx->ctxs);
2395 free_cpumask:
2396 free_cpumask_var(hctx->cpumask);
2397 free_hctx:
2398 kfree(hctx);
2399 fail_alloc_hctx:
2400 return NULL;
2401}
2402
2403static void blk_mq_init_cpu_queues(struct request_queue *q,
2404 unsigned int nr_hw_queues)
2405{
2406 struct blk_mq_tag_set *set = q->tag_set;
2407 unsigned int i, j;
2408
2409 for_each_possible_cpu(i) {
2410 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2411 struct blk_mq_hw_ctx *hctx;
2412 int k;
2413
2414 __ctx->cpu = i;
2415 spin_lock_init(&__ctx->lock);
2416 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2417 INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2418
2419 __ctx->queue = q;
2420
		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
2425 for (j = 0; j < set->nr_maps; j++) {
2426 hctx = blk_mq_map_queue_type(q, j, i);
2427 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2428 hctx->numa_node = local_memory_node(cpu_to_node(i));
2429 }
2430 }
2431}
2432
2433static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2434{
2435 int ret = 0;
2436
2437 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2438 set->queue_depth, set->reserved_tags);
2439 if (!set->tags[hctx_idx])
2440 return false;
2441
2442 ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
2443 set->queue_depth);
2444 if (!ret)
2445 return true;
2446
2447 blk_mq_free_rq_map(set->tags[hctx_idx]);
2448 set->tags[hctx_idx] = NULL;
2449 return false;
2450}
2451
2452static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2453 unsigned int hctx_idx)
2454{
2455 if (set->tags && set->tags[hctx_idx]) {
2456 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2457 blk_mq_free_rq_map(set->tags[hctx_idx]);
2458 set->tags[hctx_idx] = NULL;
2459 }
2460}
2461
2462static void blk_mq_map_swqueue(struct request_queue *q)
2463{
2464 unsigned int i, j, hctx_idx;
2465 struct blk_mq_hw_ctx *hctx;
2466 struct blk_mq_ctx *ctx;
2467 struct blk_mq_tag_set *set = q->tag_set;
2468
2469 queue_for_each_hw_ctx(q, hctx, i) {
2470 cpumask_clear(hctx->cpumask);
2471 hctx->nr_ctx = 0;
2472 hctx->dispatch_from = NULL;
2473 }
2474
	/*
	 * Map software to hardware queues.
	 *
	 * If the cpu isn't present, the cpu is mapped to first hctx.
	 */
2480 for_each_possible_cpu(i) {
2481 hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
2482
2483 if (!set->tags[hctx_idx] &&
2484 !__blk_mq_alloc_rq_map(set, hctx_idx)) {
			/*
			 * If tags initialization fail for some hctx,
			 * that hctx won't be brought online.  In this
			 * case, remap the current ctx to hctx[0] which
			 * is guaranteed to always have tags allocated
			 */
2491 set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
2492 }
2493
2494 ctx = per_cpu_ptr(q->queue_ctx, i);
2495 for (j = 0; j < set->nr_maps; j++) {
2496 if (!set->map[j].nr_queues) {
2497 ctx->hctxs[j] = blk_mq_map_queue_type(q,
2498 HCTX_TYPE_DEFAULT, i);
2499 continue;
2500 }
2501
2502 hctx = blk_mq_map_queue_type(q, j, i);
2503 ctx->hctxs[j] = hctx;
2504
2505 /*
2506 * If the CPU is already set in the mask, then we've mapped this one
2507 * already. This can happen if devices share queues across queue maps.
2508 */
2509 if (cpumask_test_cpu(i, hctx->cpumask))
2510 continue;
2511
2512 cpumask_set_cpu(i, hctx->cpumask);
2513 hctx->type = j;
2514 ctx->index_hw[hctx->type] = hctx->nr_ctx;
2515 hctx->ctxs[hctx->nr_ctx++] = ctx;
2516
2517 /*
2518 * If the nr_ctx type overflows, we have exceeded the
2519 * amount of sw queues we can support.
2520 */
2521 BUG_ON(!hctx->nr_ctx);
2522 }
2523
2524 for (; j < HCTX_MAX_TYPES; j++)
2525 ctx->hctxs[j] = blk_mq_map_queue_type(q,
2526 HCTX_TYPE_DEFAULT, i);
2527 }
2528
2529 queue_for_each_hw_ctx(q, hctx, i) {
2530 /*
2531 * If no software queues are mapped to this hardware queue,
2532 * disable it and free the request entries.
2533 */
2534 if (!hctx->nr_ctx) {
2535 /*
2536 * Never unmap queue 0; we need it as a fallback in case
2537 * a new remap fails allocation.
2538 */
2539 if (i && set->tags[i])
2540 blk_mq_free_map_and_requests(set, i);
2541
2542 hctx->tags = NULL;
2543 continue;
2544 }
2545
2546 hctx->tags = set->tags[i];
2547 WARN_ON(!hctx->tags);
2548
2549 /*
2550 * Set the map size to the number of mapped software queues.
2551 * This is more accurate and more efficient than looping
2552 * over all possibly mapped software queues.
2553 */
2554 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
2555
2556 /*
2557 * Initialize batch roundrobin counts.
2558 */
2559 hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
2560 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2561 }
2562}
2563
2564/*
2565 * Caller needs to ensure that we're either frozen/quiesced, or that
2566 * the queue isn't live yet.
2567 */
2568static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2569{
2570 struct blk_mq_hw_ctx *hctx;
2571 int i;
2572
2573 queue_for_each_hw_ctx(q, hctx, i) {
2574 if (shared)
2575 hctx->flags |= BLK_MQ_F_TAG_SHARED;
2576 else
2577 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
2578 }
2579}
2580
2581static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2582 bool shared)
2583{
2584 struct request_queue *q;
2585
2586 lockdep_assert_held(&set->tag_list_lock);
2587
2588 list_for_each_entry(q, &set->tag_list, tag_set_list) {
2589 blk_mq_freeze_queue(q);
2590 queue_set_hctx_shared(q, shared);
2591 blk_mq_unfreeze_queue(q);
2592 }
2593}
2594
2595static void blk_mq_del_queue_tag_set(struct request_queue *q)
2596{
2597 struct blk_mq_tag_set *set = q->tag_set;
2598
2599 mutex_lock(&set->tag_list_lock);
2600 list_del_rcu(&q->tag_set_list);
2601 if (list_is_singular(&set->tag_list)) {
2602 /* just transitioned to unshared */
2603 set->flags &= ~BLK_MQ_F_TAG_SHARED;
2604 /* update the remaining queue */
2605 blk_mq_update_tag_set_depth(set, false);
2606 }
2607 mutex_unlock(&set->tag_list_lock);
2608 INIT_LIST_HEAD(&q->tag_set_list);
2609}
2610
2611static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2612 struct request_queue *q)
2613{
2614 mutex_lock(&set->tag_list_lock);
2615
2616 /*
2617 * Check to see if we're transitioning to shared (from 1 to 2 queues).
2618 */
2619 if (!list_empty(&set->tag_list) &&
2620 !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2621 set->flags |= BLK_MQ_F_TAG_SHARED;
2622 /* update the existing queue(s) to use shared tags */
2623 blk_mq_update_tag_set_depth(set, true);
2624 }
2625 if (set->flags & BLK_MQ_F_TAG_SHARED)
2626 queue_set_hctx_shared(q, true);
2627 list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
2628
2629 mutex_unlock(&set->tag_list_lock);
2630}
2631
2632
2633static int blk_mq_alloc_ctxs(struct request_queue *q)
2634{
2635 struct blk_mq_ctxs *ctxs;
2636 int cpu;
2637
2638 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
2639 if (!ctxs)
2640 return -ENOMEM;
2641
2642 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2643 if (!ctxs->queue_ctx)
2644 goto fail;
2645
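 /* Point each per-cpu software queue back at the shared ctxs container. */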
2646 for_each_possible_cpu(cpu) {
2647 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
2648 ctx->ctxs = ctxs;
2649 }
2650
2651 q->mq_kobj = &ctxs->kobj;
2652 q->queue_ctx = ctxs->queue_ctx;
2653
2654 return 0;
2655 fail:
2656 kfree(ctxs);
2657 return -ENOMEM;
2658}
2659
2660/*
2661 * blk_mq_release - free the hardware-context resources of a queue.
2662 *
2663 * Called from the request queue's release path; by the time this runs,
2664 * every hctx must already sit on q->unused_hctx_list (see the WARN below).
2665 */
2666void blk_mq_release(struct request_queue *q)
2667{
2668 struct blk_mq_hw_ctx *hctx, *next;
2669 int i;
2670
2671 cancel_delayed_work_sync(&q->requeue_work);
2672
2673 queue_for_each_hw_ctx(q, hctx, i)
2674 WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
2675
2676 /* all hctx are in .unused_hctx_list now */
2677 list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
2678 list_del_init(&hctx->hctx_list);
2679 kobject_put(&hctx->kobj);
2680 }
2681
2682 kfree(q->queue_hw_ctx);
2683
2684 /*
2685 * Release .mq_kobj and the per-ctx kobjects now, because both
2686 * share their lifetime with the request queue.
2687 */
2688 blk_mq_sysfs_deinit(q);
2689}
2690
2691struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2692{
2693 struct request_queue *uninit_q, *q;
2694
2695 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
2696 if (!uninit_q)
2697 return ERR_PTR(-ENOMEM);
2698
2699 q = blk_mq_init_allocated_queue(set, uninit_q);
2700 if (IS_ERR(q))
2701 blk_cleanup_queue(uninit_q);
2702
2703 return q;
2704}
2705EXPORT_SYMBOL(blk_mq_init_queue);
2706
2707/*
2708 * Helper for setting up a queue with mq ops, given queue depth, and
2709 * the passed in mq ops flags.
2710 */
2711struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
2712 const struct blk_mq_ops *ops,
2713 unsigned int queue_depth,
2714 unsigned int set_flags)
2715{
2716 struct request_queue *q;
2717 int ret;
2718
2719 memset(set, 0, sizeof(*set));
2720 set->ops = ops;
2721 set->nr_hw_queues = 1;
2722 set->nr_maps = 1;
2723 set->queue_depth = queue_depth;
2724 set->numa_node = NUMA_NO_NODE;
2725 set->flags = set_flags;
2726
2727 ret = blk_mq_alloc_tag_set(set);
2728 if (ret)
2729 return ERR_PTR(ret);
2730
2731 q = blk_mq_init_queue(set);
2732 if (IS_ERR(q)) {
2733 blk_mq_free_tag_set(set);
2734 return q;
2735 }
2736
2737 return q;
2738}
2739EXPORT_SYMBOL(blk_mq_init_sq_queue);
2740
2741static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
2742 struct blk_mq_tag_set *set, struct request_queue *q,
2743 int hctx_idx, int node)
2744{
2745 struct blk_mq_hw_ctx *hctx = NULL, *tmp;
2746
2747 /* reuse a dead hctx on the same node first, if one is available */
2748 spin_lock(&q->unused_hctx_lock);
2749 list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
2750 if (tmp->numa_node == node) {
2751 hctx = tmp;
2752 break;
2753 }
2754 }
2755 if (hctx)
2756 list_del_init(&hctx->hctx_list);
2757 spin_unlock(&q->unused_hctx_lock);
2758
2759 if (!hctx)
2760 hctx = blk_mq_alloc_hctx(q, set, node);
2761 if (!hctx)
2762 goto fail;
2763
2764 if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
2765 goto free_hctx;
2766
2767 return hctx;
2768
2769 free_hctx:
2770 kobject_put(&hctx->kobj);
2771 fail:
2772 return NULL;
2773}
2774
2775static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2776 struct request_queue *q)
2777{
2778 int i, j, end;
2779 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2780
2781 /* protect against switching io scheduler */
2782 mutex_lock(&q->sysfs_lock);
2783 for (i = 0; i < set->nr_hw_queues; i++) {
2784 int node;
2785 struct blk_mq_hw_ctx *hctx;
2786
2787 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
2788
2789 /*
2790 * If the hw queue has been mapped to another numa node, we need to
2791 * realloc the hctx. If allocation fails, fall back to the previous one.
2792 */
2793 if (hctxs[i] && (hctxs[i]->numa_node == node))
2794 continue;
2795
2796 hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
2797 if (hctx) {
2798 if (hctxs[i])
2799 blk_mq_exit_hctx(q, set, hctxs[i], i);
2800 hctxs[i] = hctx;
2801 } else {
2802 if (hctxs[i])
2803 pr_warn("Allocating new hctx on node %d failed, falling back to previous one on node %d\n",
2804 node,
2805 hctxs[i]->numa_node);
2806 else
2807 break;
2808 }
2809 }
2810
2811 /* Free the excess hctxs below: the newly allocated ones if we stopped
2812 * early, or those beyond the new count if nr_hw_queues shrank.
2813 */
2814 if (i != set->nr_hw_queues) {
2815 j = q->nr_hw_queues;
2816 end = i;
2817 } else {
2818 j = i;
2819 end = q->nr_hw_queues;
2820 q->nr_hw_queues = set->nr_hw_queues;
2821 }
2822
2823 for (; j < end; j++) {
2824 struct blk_mq_hw_ctx *hctx = hctxs[j];
2825
2826 if (hctx) {
2827 if (hctx->tags)
2828 blk_mq_free_map_and_requests(set, j);
2829 blk_mq_exit_hctx(q, set, hctx, j);
2830 hctxs[j] = NULL;
2831 }
2832 }
2833 mutex_unlock(&q->sysfs_lock);
2834}
2835
2836/*
2837 * Maximum number of hardware queues we support. For single sets, we'll never
2838 * have more than the CPUs (software queues). For multiple sets, the tag_set
2839 * user may have set ->nr_hw_queues larger.
2840 */
2841static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
2842{
2843 if (set->nr_maps == 1)
2844 return nr_cpu_ids;
2845
2846 return max(set->nr_hw_queues, nr_cpu_ids);
2847}
2848
2849struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2850 struct request_queue *q)
2851{
2852 /* mark the queue as mq asap */
2853 q->mq_ops = set->ops;
2854
2855 q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
2856 blk_mq_poll_stats_bkt,
2857 BLK_MQ_POLL_STATS_BKTS, q);
2858 if (!q->poll_cb)
2859 goto err_exit;
2860
2861 if (blk_mq_alloc_ctxs(q))
2862 goto err_exit;
2863
2864 /* init q->mq_kobj and sw queues' kobjects */
2865 blk_mq_sysfs_init(q);
2866
2867 q->nr_queues = nr_hw_queues(set);
2868 q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
2869 GFP_KERNEL, set->numa_node);
2870 if (!q->queue_hw_ctx)
2871 goto err_sys_init;
2872
2873 INIT_LIST_HEAD(&q->unused_hctx_list);
2874 spin_lock_init(&q->unused_hctx_lock);
2875
2876 blk_mq_realloc_hw_ctxs(set, q);
2877 if (!q->nr_hw_queues)
2878 goto err_hctxs;
2879
2880 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
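 /* Default to a 30 second request timeout if the driver didn't set one. */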
2881 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2882
2883 q->tag_set = set;
2884
2885 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2886 if (set->nr_maps > HCTX_TYPE_POLL &&
2887 set->map[HCTX_TYPE_POLL].nr_queues)
2888 blk_queue_flag_set(QUEUE_FLAG_POLL, q);
2889
2890 q->sg_reserved_size = INT_MAX;
2891
2892 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
2893 INIT_LIST_HEAD(&q->requeue_list);
2894 spin_lock_init(&q->requeue_lock);
2895
2896 blk_queue_make_request(q, blk_mq_make_request);
2897
2898 /*
2899 * Do this after blk_queue_make_request() has set the queue defaults.
2900 */
2901 q->nr_requests = set->queue_depth;
2902
2903 /*
2904 * Default to classic polling.
2905 */
2906 q->poll_nsec = BLK_MQ_POLL_CLASSIC;
2907
2908 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2909 blk_mq_add_queue_tag_set(set, q);
2910 blk_mq_map_swqueue(q);
2911
2912 if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2913 int ret;
2914
2915 ret = elevator_init_mq(q);
2916 if (ret)
2917 return ERR_PTR(ret);
2918 }
2919
2920 return q;
2921
2922err_hctxs:
2923 kfree(q->queue_hw_ctx);
2924err_sys_init:
2925 blk_mq_sysfs_deinit(q);
2926err_exit:
2927 q->mq_ops = NULL;
2928 return ERR_PTR(-ENOMEM);
2929}
2930EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2931
2932/* tags can _not_ be used after returning from blk_mq_exit_queue */
2933void blk_mq_exit_queue(struct request_queue *q)
2934{
2935 struct blk_mq_tag_set *set = q->tag_set;
2936
2937 blk_mq_del_queue_tag_set(q);
2938 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2939}
2940
2941static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2942{
2943 int i;
2944
2945 for (i = 0; i < set->nr_hw_queues; i++)
2946 if (!__blk_mq_alloc_rq_map(set, i))
2947 goto out_unwind;
2948
2949 return 0;
2950
2951out_unwind:
2952 while (--i >= 0)
2953 blk_mq_free_rq_map(set->tags[i]);
2954
2955 return -ENOMEM;
2956}
2957
2958/*
2959 * Allocate the request maps associated with this tag_set. Note that this
2960 * may reduce the depth asked for, if memory is tight. set->queue_depth
2961 * will be updated to reflect the allocated depth.
2962 */
2963static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2964{
2965 unsigned int depth;
2966 int err;
2967
2968 depth = set->queue_depth;
2969 do {
2970 err = __blk_mq_alloc_rq_maps(set);
2971 if (!err)
2972 break;
2973
2974 set->queue_depth >>= 1;
2975 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2976 err = -ENOMEM;
2977 break;
2978 }
2979 } while (set->queue_depth);
2980
2981 if (!set->queue_depth || err) {
2982 pr_err("blk-mq: failed to allocate request map\n");
2983 return -ENOMEM;
2984 }
2985
2986 if (depth != set->queue_depth)
2987 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2988 depth, set->queue_depth);
2989
2990 return 0;
2991}
2992
2993static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2994{
2995 /*
2996 * blk_mq_map_queues() and multiple .map_queues() implementations
2997 * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
2998 * number of hardware queues.
2999 */
3000 if (set->nr_maps == 1)
3001 set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3002
3003 if (set->ops->map_queues && !is_kdump_kernel()) {
3004 int i;
3005
3006 /*
3007 * transport .map_queues is usually done in the following
3008 * way:
3009 *
3010 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
3011 * 	mask = get_cpu_mask(queue)
3012 * 	for_each_cpu(cpu, mask)
3013 * 		set->map[x].mq_map[cpu] = queue;
3014 * }
3015 *
3016 * When we need to remap, the table has to be cleared for
3017 * killing stale mappings, since one CPU may not be mapped
3018 * to any hw queue.
3019 */
3020 for (i = 0; i < set->nr_maps; i++)
3021 blk_mq_clear_mq_map(&set->map[i]);
3022
3023 return set->ops->map_queues(set);
3024 } else {
3025 BUG_ON(set->nr_maps > 1);
3026 return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3027 }
3028}
3029
3030/*
3031 * Alloc a tag set to be associated with one or more request queues.
3032 * May fail with EINVAL for various error conditions. May adjust the
3033 * requested depth down, if it's too large. In that case, the set
3034 * value will be stored in set->queue_depth.
3035 */
3036int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
3037{
3038 int i, ret;
3039
3040 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
3041
3042 if (!set->nr_hw_queues)
3043 return -EINVAL;
3044 if (!set->queue_depth)
3045 return -EINVAL;
3046 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
3047 return -EINVAL;
3048
3049 if (!set->ops->queue_rq)
3050 return -EINVAL;
3051
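 /* .get_budget and .put_budget must be provided together, or not at all. */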
3052 if (!set->ops->get_budget ^ !set->ops->put_budget)
3053 return -EINVAL;
3054
3055 if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
3056 pr_info("blk-mq: reduced tag depth to %u\n",
3057 BLK_MQ_MAX_DEPTH);
3058 set->queue_depth = BLK_MQ_MAX_DEPTH;
3059 }
3060
3061 if (!set->nr_maps)
3062 set->nr_maps = 1;
3063 else if (set->nr_maps > HCTX_MAX_TYPES)
3064 return -EINVAL;
3065
3066 /*
3067 * If a crashdump is active, then we are potentially in a very
3068 * memory constrained environment. Limit us to 1 queue and
3069 * 64 tags to prevent using too much memory.
3070 */
3071 if (is_kdump_kernel()) {
3072 set->nr_hw_queues = 1;
3073 set->nr_maps = 1;
3074 set->queue_depth = min(64U, set->queue_depth);
3075 }
3076
3077 /*
3078 * There is no use for more h/w queues than cpus if we just have a single map.
3079 */
3080 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
3081 set->nr_hw_queues = nr_cpu_ids;
3082
3083 set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *),
3084 GFP_KERNEL, set->numa_node);
3085 if (!set->tags)
3086 return -ENOMEM;
3087
3088 ret = -ENOMEM;
3089 for (i = 0; i < set->nr_maps; i++) {
3090 set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3091 sizeof(set->map[i].mq_map[0]),
3092 GFP_KERNEL, set->numa_node);
3093 if (!set->map[i].mq_map)
3094 goto out_free_mq_map;
3095 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3096 }
3097
3098 ret = blk_mq_update_queue_map(set);
3099 if (ret)
3100 goto out_free_mq_map;
3101
3102 ret = blk_mq_alloc_rq_maps(set);
3103 if (ret)
3104 goto out_free_mq_map;
3105
3106 mutex_init(&set->tag_list_lock);
3107 INIT_LIST_HEAD(&set->tag_list);
3108
3109 return 0;
3110
3111out_free_mq_map:
3112 for (i = 0; i < set->nr_maps; i++) {
3113 kfree(set->map[i].mq_map);
3114 set->map[i].mq_map = NULL;
3115 }
3116 kfree(set->tags);
3117 set->tags = NULL;
3118 return ret;
3119}
3120EXPORT_SYMBOL(blk_mq_alloc_tag_set);
3121
3122void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
3123{
3124 int i, j;
3125
3126 for (i = 0; i < nr_hw_queues(set); i++)
3127 blk_mq_free_map_and_requests(set, i);
3128
3129 for (j = 0; j < set->nr_maps; j++) {
3130 kfree(set->map[j].mq_map);
3131 set->map[j].mq_map = NULL;
3132 }
3133
3134 kfree(set->tags);
3135 set->tags = NULL;
3136}
3137EXPORT_SYMBOL(blk_mq_free_tag_set);
3138
3139int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
3140{
3141 struct blk_mq_tag_set *set = q->tag_set;
3142 struct blk_mq_hw_ctx *hctx;
3143 int i, ret;
3144
3145 if (!set)
3146 return -EINVAL;
3147
3148 if (q->nr_requests == nr)
3149 return 0;
3150
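 /*
 * Freeze the queue to drain in-flight requests and quiesce it so no
 * dispatch runs while the tag depths are being changed.
 */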
3151 blk_mq_freeze_queue(q);
3152 blk_mq_quiesce_queue(q);
3153
3154 ret = 0;
3155 queue_for_each_hw_ctx(q, hctx, i) {
3156 if (!hctx->tags)
3157 continue;
3158 /*
3159 * If we're using an MQ scheduler, just update the scheduler
3160 * queue depth. This is similar to what the old code would do.
3161 */
3162 if (!hctx->sched_tags) {
3163 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
3164 false);
3165 } else {
3166 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
3167 nr, true);
3168 }
3169 if (ret)
3170 break;
3171 if (q->elevator && q->elevator->type->ops.depth_updated)
3172 q->elevator->type->ops.depth_updated(hctx);
3173 }
3174
3175 if (!ret)
3176 q->nr_requests = nr;
3177
3178 blk_mq_unquiesce_queue(q);
3179 blk_mq_unfreeze_queue(q);
3180
3181 return ret;
3182}
3183
3184/*
3185 * request_queue and elevator_type pair, used by
3186 * __blk_mq_update_nr_hw_queues() to remember which elevator each queue
3187 * had while the hardware contexts are being reallocated.
3188 */
3189struct blk_mq_qe_pair {
3190 struct list_head node;
3191 struct request_queue *q;
3192 struct elevator_type *type;
3193};
3194
3195/*
3196 * Cache the elevator_type in the qe pair list and switch the
3197 * io scheduler to 'none'.
3198 */
3199static bool blk_mq_elv_switch_none(struct list_head *head,
3200 struct request_queue *q)
3201{
3202 struct blk_mq_qe_pair *qe;
3203
3204 if (!q->elevator)
3205 return true;
3206
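 /*
 * GFP_NOIO: the queues are (or are about to be) frozen, so the allocation
 * must not recurse into the block layer. Failure is tolerated; the caller
 * unwinds via blk_mq_elv_switch_back().
 */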
3207 qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
3208 if (!qe)
3209 return false;
3210
3211 INIT_LIST_HEAD(&qe->node);
3212 qe->q = q;
3213 qe->type = q->elevator->type;
3214 list_add(&qe->node, head);
3215
3216 mutex_lock(&q->sysfs_lock);
3217 /*
3218 * After elevator_switch_mq, the previous elevator_queue will be
3219 * released by elevator_release. The reference on the io scheduler
3220 * module taken by elevator_get will also be dropped, so take a
3221 * reference on the module here to prevent it from being
3222 * unloaded while the queue has no elevator.
3223 */
3224 __module_get(qe->type->elevator_owner);
3225 elevator_switch_mq(q, NULL);
3226 mutex_unlock(&q->sysfs_lock);
3227
3228 return true;
3229}
3230
3231static void blk_mq_elv_switch_back(struct list_head *head,
3232 struct request_queue *q)
3233{
3234 struct blk_mq_qe_pair *qe;
3235 struct elevator_type *t = NULL;
3236
3237 list_for_each_entry(qe, head, node)
3238 if (qe->q == q) {
3239 t = qe->type;
3240 break;
3241 }
3242
3243 if (!t)
3244 return;
3245
3246 list_del(&qe->node);
3247 kfree(qe);
3248
3249 mutex_lock(&q->sysfs_lock);
3250 elevator_switch_mq(q, t);
3251 mutex_unlock(&q->sysfs_lock);
3252}
3253
3254static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
3255 int nr_hw_queues)
3256{
3257 struct request_queue *q;
3258 LIST_HEAD(head);
3259 int prev_nr_hw_queues;
3260
3261 lockdep_assert_held(&set->tag_list_lock);
3262
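 /*
 * With a single queue map there is no point in more hardware queues than
 * CPUs, mirroring the clamp done in blk_mq_alloc_tag_set().
 */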
3263 if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
3264 nr_hw_queues = nr_cpu_ids;
3265 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3266 return;
3267
3268 list_for_each_entry(q, &set->tag_list, tag_set_list)
3269 blk_mq_freeze_queue(q);
3270 /*
3271 * Sync with blk_mq_queue_tag_busy_iter.
3272 */
3273 synchronize_rcu();
3274
3275 /*
3276 * Switch the IO scheduler to 'none', cleaning up the data associated with
3277 * the previous scheduler; we switch back once the new mappings are in place.
3278 */
3279 list_for_each_entry(q, &set->tag_list, tag_set_list)
3280 if (!blk_mq_elv_switch_none(&head, q))
3281 goto switch_back;
3282
3283 list_for_each_entry(q, &set->tag_list, tag_set_list) {
3284 blk_mq_debugfs_unregister_hctxs(q);
3285 blk_mq_sysfs_unregister(q);
3286 }
3287
3288 prev_nr_hw_queues = set->nr_hw_queues;
3289 set->nr_hw_queues = nr_hw_queues;
3290 blk_mq_update_queue_map(set);
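 /*
 * Reallocate the hardware contexts for every queue in the set. If any
 * queue ends up with fewer hctxs than requested, revert to the previous
 * nr_hw_queues and redo the whole list with the old mapping.
 */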
3291fallback:
3292 list_for_each_entry(q, &set->tag_list, tag_set_list) {
3293 blk_mq_realloc_hw_ctxs(set, q);
3294 if (q->nr_hw_queues != set->nr_hw_queues) {
3295 pr_warn("Increasing nr_hw_queues to %d failed, falling back to %d\n",
3296 nr_hw_queues, prev_nr_hw_queues);
3297 set->nr_hw_queues = prev_nr_hw_queues;
3298 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3299 goto fallback;
3300 }
3301 blk_mq_map_swqueue(q);
3302 }
3303
3304 list_for_each_entry(q, &set->tag_list, tag_set_list) {
3305 blk_mq_sysfs_register(q);
3306 blk_mq_debugfs_register_hctxs(q);
3307 }
3308
3309switch_back:
3310 list_for_each_entry(q, &set->tag_list, tag_set_list)
3311 blk_mq_elv_switch_back(&head, q);
3312
3313 list_for_each_entry(q, &set->tag_list, tag_set_list)
3314 blk_mq_unfreeze_queue(q);
3315}
3316
3317void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
3318{
3319 mutex_lock(&set->tag_list_lock);
3320 __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
3321 mutex_unlock(&set->tag_list_lock);
3322}
3323EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
3324
3325/* Enable polling stats and return whether they were already enabled. */
3326static bool blk_poll_stats_enable(struct request_queue *q)
3327{
3328 if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
3329 blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
3330 return true;
3331 blk_stat_add_callback(q, q->poll_cb);
3332 return false;
3333}
3334
3335static void blk_mq_poll_stats_start(struct request_queue *q)
3336{
3337 /*
3338 * We don't arm the callback if polling stats are not enabled or the
3339 * callback is already active.
3340 */
3341 if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
3342 blk_stat_is_active(q->poll_cb))
3343 return;
3344
3345 blk_stat_activate_msecs(q->poll_cb, 100);
3346}
3347
3348static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
3349{
3350 struct request_queue *q = cb->data;
3351 int bucket;
3352
3353 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
3354 if (cb->stat[bucket].nr_samples)
3355 q->poll_stat[bucket] = cb->stat[bucket];
3356 }
3357}
3358
3359static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3360 struct blk_mq_hw_ctx *hctx,
3361 struct request *rq)
3362{
3363 unsigned long ret = 0;
3364 int bucket;
3365
3366 /*
3367 * If stats collection isn't on, don't sleep but turn it on for
3368 * future users.
3369 */
3370 if (!blk_poll_stats_enable(q))
3371 return 0;
3372
3373 /*
3374 * As an optimistic guess, use half of the mean service time
3375 * for this type of request. We can (and should) make this smarter.
3376 * For instance, if the completion latencies are tight, we can
3377 * get closer than just half the mean. This is especially
3378 * important on devices where the completion latencies are longer
3379 * than ~10 usec. We do use the stats for the relevant IO size
3380 * if available, which does lead to better estimates.
3381 */
3382 bucket = blk_mq_poll_stats_bkt(rq);
3383 if (bucket < 0)
3384 return ret;
3385
3386 if (q->poll_stat[bucket].nr_samples)
3387 ret = (q->poll_stat[bucket].mean + 1) / 2;
3388
3389 return ret;
3390}
3391
3392static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3393 struct blk_mq_hw_ctx *hctx,
3394 struct request *rq)
3395{
3396 struct hrtimer_sleeper hs;
3397 enum hrtimer_mode mode;
3398 unsigned int nsecs;
3399 ktime_t kt;
3400
3401 if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
3402 return false;
3403
3404 /*
3405 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
3406 *
3407 *  0:	use half of the previous average
3408 * >0:	use this specific value
3409 */
3410 if (q->poll_nsec > 0)
3411 nsecs = q->poll_nsec;
3412 else
3413 nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3414
3415 if (!nsecs)
3416 return false;
3417
3418 rq->rq_flags |= RQF_MQ_POLL_SLEPT;
3419
3420 /*
3421 * Use the estimated (or configured) completion time as the pre-sleep
3422 * target before we fall back to busy polling.
3423 */
3424 kt = nsecs;
3425
3426 mode = HRTIMER_MODE_REL;
3427 hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3428 hrtimer_set_expires(&hs.timer, kt);
3429
3430 hrtimer_init_sleeper(&hs, current);
3431 do {
3432 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
3433 break;
3434 set_current_state(TASK_UNINTERRUPTIBLE);
3435 hrtimer_start_expires(&hs.timer, mode);
3436 if (hs.task)
3437 io_schedule();
3438 hrtimer_cancel(&hs.timer);
3439 mode = HRTIMER_MODE_ABS;
3440 } while (hs.task && !signal_pending(current));
3441
3442 __set_current_state(TASK_RUNNING);
3443 destroy_hrtimer_on_stack(&hs.timer);
3444 return true;
3445}
3446
3447static bool blk_mq_poll_hybrid(struct request_queue *q,
3448 struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
3449{
3450 struct request *rq;
3451
3452 if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
3453 return false;
3454
3455 if (!blk_qc_t_is_internal(cookie))
3456 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
3457 else {
3458 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
3459 /*
3460 * With scheduling, if the request has completed we'll get a
3461 * NULL return here, as we clear the sched tags when that
3462 * happens. The request still remains valid, like always, so
3463 * we should be safe with just the NULL check.
3464 */
3465 if (!rq)
3466 return false;
3467 }
3468
3469 return blk_mq_poll_hybrid_sleep(q, hctx, rq);
3470}
3471
3472/**
3473 * blk_poll - poll for IO completions
3474 * @q:  the queue
3475 * @cookie: cookie passed back at IO submission time
3476 * @spin: whether to spin for completions
3477 *
3478 * Description:
3479 *    Poll for completions on the passed in queue. Returns number of
3480 *    entries found. If @spin is true, then blk_poll will continue looping
3481 *    trying to poll until one or more completions are found. If @spin is
3482 *    false, it will return 0 even if no completions were found.
3483 */
3484int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3485{
3486 struct blk_mq_hw_ctx *hctx;
3487 long state;
3488
3489 if (!blk_qc_t_valid(cookie) ||
3490 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3491 return 0;
3492
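 /*
 * If the caller has plugged IO, flush it first so the request being
 * polled for has actually been issued to the driver.
 */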
3493 if (current->plug)
3494 blk_flush_plug_list(current->plug, false);
3495
3496 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3497
3498 /*
3499 * If we sleep, have the caller restart the poll loop to reset
3500 * the state. Like for the other success return cases, the
3501 * caller is responsible for checking if the IO completed. If
3502 * the IO isn't complete, we'll get called again and will go
3503 * straight to the busy poll loop.
3504 */
3505 if (blk_mq_poll_hybrid(q, hctx, cookie))
3506 return 1;
3507
3508 hctx->poll_considered++;
3509
3510 state = current->state;
3511 do {
3512 int ret;
3513
3514 hctx->poll_invoked++;
3515
3516 ret = q->mq_ops->poll(hctx);
3517 if (ret > 0) {
3518 hctx->poll_success++;
3519 __set_current_state(TASK_RUNNING);
3520 return ret;
3521 }
3522
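 /*
 * A pending signal, or a wakeup that set us back to TASK_RUNNING, ends
 * the busy poll; the caller re-checks whether the IO completed.
 */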
3523 if (signal_pending_state(state, current))
3524 __set_current_state(TASK_RUNNING);
3525
3526 if (current->state == TASK_RUNNING)
3527 return 1;
3528 if (ret < 0 || !spin)
3529 break;
3530 cpu_relax();
3531 } while (!need_resched());
3532
3533 __set_current_state(TASK_RUNNING);
3534 return 0;
3535}
3536EXPORT_SYMBOL_GPL(blk_poll);
3537
3538unsigned int blk_mq_rq_cpu(struct request *rq)
3539{
3540 return rq->mq_ctx->cpu;
3541}
3542EXPORT_SYMBOL(blk_mq_rq_cpu);
3543
3544static int __init blk_mq_init(void)
3545{
3546 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
3547 blk_mq_hctx_notify_dead);
3548 return 0;
3549}
3550subsys_initcall(blk_mq_init);
3551