LXR linux/block/blk-core.c

   1/*
   2 * Copyright (C) 1991, 1992 Linus Torvalds
   3 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
   4 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
   5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
   6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
   7 *      -  July2000
   8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
   9 */
  10
  11/*
  12 * This handles all read/write requests to block devices
  13 */
  14#include <linux/kernel.h>
  15#include <linux/module.h>
  16#include <linux/backing-dev.h>
  17#include <linux/bio.h>
  18#include <linux/blkdev.h>
  19#include <linux/blk-mq.h>
  20#include <linux/highmem.h>
  21#include <linux/mm.h>
  22#include <linux/kernel_stat.h>
  23#include <linux/string.h>
  24#include <linux/init.h>
  25#include <linux/completion.h>
  26#include <linux/slab.h>
  27#include <linux/swap.h>
  28#include <linux/writeback.h>
  29#include <linux/task_io_accounting_ops.h>
  30#include <linux/fault-inject.h>
  31#include <linux/list_sort.h>
  32#include <linux/delay.h>
  33#include <linux/ratelimit.h>
  34#include <linux/pm_runtime.h>
  35#include <linux/blk-cgroup.h>
  36#include <linux/debugfs.h>
  37#include <linux/bpf.h>
  38
  39#define CREATE_TRACE_POINTS
  40#include <trace/events/block.h>
  41
  42#include "blk.h"
  43#include "blk-mq.h"
  44#include "blk-mq-sched.h"
  45#include "blk-rq-qos.h"
  46
  47#ifdef CONFIG_DEBUG_FS
  48struct dentry *blk_debugfs_root;
  49#endif
  50
  51EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
  52EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
  53EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
  54EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
  55EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
  56
  57DEFINE_IDA(blk_queue_ida);
  58
  59/*
  60 * For the allocated request tables
  61 */
  62struct kmem_cache *request_cachep;
  63
  64/*
  65 * For queue allocation
  66 */
  67struct kmem_cache *blk_requestq_cachep;
  68
  69/*
  70 * Controlling structure to kblockd
  71 */
  72static struct workqueue_struct *kblockd_workqueue;
  73
  74/**
  75 * blk_queue_flag_set - atomically set a queue flag
  76 * @flag: flag to be set
  77 * @q: request queue
  78 */
  79void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
  80{
  81        unsigned long flags;
  82
  83        spin_lock_irqsave(q->queue_lock, flags);
  84        queue_flag_set(flag, q);
  85        spin_unlock_irqrestore(q->queue_lock, flags);
  86}
  87EXPORT_SYMBOL(blk_queue_flag_set);
  88
  89/**
  90 * blk_queue_flag_clear - atomically clear a queue flag
  91 * @flag: flag to be cleared
  92 * @q: request queue
  93 */
  94void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
  95{
  96        unsigned long flags;
  97
  98        spin_lock_irqsave(q->queue_lock, flags);
  99        queue_flag_clear(flag, q);
 100        spin_unlock_irqrestore(q->queue_lock, flags);
 101}
 102EXPORT_SYMBOL(blk_queue_flag_clear);
 103
 104/**
 105 * blk_queue_flag_test_and_set - atomically test and set a queue flag
 106 * @flag: flag to be set
 107 * @q: request queue
 108 *
 109 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
 110 * the flag was already set.
 111 */
 112bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
 113{
 114        unsigned long flags;
 115        bool res;
 116
 117        spin_lock_irqsave(q->queue_lock, flags);
 118        res = queue_flag_test_and_set(flag, q);
 119        spin_unlock_irqrestore(q->queue_lock, flags);
 120
 121        return res;
 122}
 123EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
 124
 125/**
 126 * blk_queue_flag_test_and_clear - atomically test and clear a queue flag
 127 * @flag: flag to be cleared
 128 * @q: request queue
 129 *
 130 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
 131 * the flag was set.
 132 */
 133bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q)
 134{
 135        unsigned long flags;
 136        bool res;
 137
 138        spin_lock_irqsave(q->queue_lock, flags);
 139        res = queue_flag_test_and_clear(flag, q);
 140        spin_unlock_irqrestore(q->queue_lock, flags);
 141
 142        return res;
 143}
 144EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear);
 145
 146static void blk_clear_congested(struct request_list *rl, int sync)
 147{
 148#ifdef CONFIG_CGROUP_WRITEBACK
 149        clear_wb_congested(rl->blkg->wb_congested, sync);
 150#else
 151        /*
 152         * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
 153         * flip its congestion state for events on other blkcgs.
 154         */
 155        if (rl == &rl->q->root_rl)
 156                clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
 157#endif
 158}
 159
 160static void blk_set_congested(struct request_list *rl, int sync)
 161{
 162#ifdef CONFIG_CGROUP_WRITEBACK
 163        set_wb_congested(rl->blkg->wb_congested, sync);
 164#else
 165        /* see blk_clear_congested() */
 166        if (rl == &rl->q->root_rl)
 167                set_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
 168#endif
 169}
 170
 171void blk_queue_congestion_threshold(struct request_queue *q)
 172{
 173        int nr;
 174
 175        nr = q->nr_requests - (q->nr_requests / 8) + 1;
 176        if (nr > q->nr_requests)
 177                nr = q->nr_requests;
 178        q->nr_congestion_on = nr;
 179
 180        nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
 181        if (nr < 1)
 182                nr = 1;
 183        q->nr_congestion_off = nr;
 184}
 185
 186void blk_rq_init(struct request_queue *q, struct request *rq)
 187{
 188        memset(rq, 0, sizeof(*rq));
 189
 190        INIT_LIST_HEAD(&rq->queuelist);
 191        INIT_LIST_HEAD(&rq->timeout_list);
 192        rq->cpu = -1;
 193        rq->q = q;
 194        rq->__sector = (sector_t) -1;
 195        INIT_HLIST_NODE(&rq->hash);
 196        RB_CLEAR_NODE(&rq->rb_node);
 197        rq->tag = -1;
 198        rq->internal_tag = -1;
 199        rq->start_time_ns = ktime_get_ns();
 200        rq->part = NULL;
 201}
 202EXPORT_SYMBOL(blk_rq_init);
 203
 204static const struct {
 205        int             errno;
 206        const char      *name;
 207} blk_errors[] = {
 208        [BLK_STS_OK]            = { 0,          "" },
 209        [BLK_STS_NOTSUPP]       = { -EOPNOTSUPP, "operation not supported" },
 210        [BLK_STS_TIMEOUT]       = { -ETIMEDOUT, "timeout" },
 211        [BLK_STS_NOSPC]         = { -ENOSPC,    "critical space allocation" },
 212        [BLK_STS_TRANSPORT]     = { -ENOLINK,   "recoverable transport" },
 213        [BLK_STS_TARGET]        = { -EREMOTEIO, "critical target" },
 214        [BLK_STS_NEXUS]         = { -EBADE,     "critical nexus" },
 215        [BLK_STS_MEDIUM]        = { -ENODATA,   "critical medium" },
 216        [BLK_STS_PROTECTION]    = { -EILSEQ,    "protection" },
 217        [BLK_STS_RESOURCE]      = { -ENOMEM,    "kernel resource" },
 218        [BLK_STS_DEV_RESOURCE]  = { -EBUSY,     "device resource" },
 219        [BLK_STS_AGAIN]         = { -EAGAIN,    "nonblocking retry" },
 220
 221        /* device mapper special case, should not leak out: */
 222        [BLK_STS_DM_REQUEUE]    = { -EREMCHG, "dm internal retry" },
 223
 224        /* everything else not covered above: */
 225        [BLK_STS_IOERR]         = { -EIO,       "I/O" },
 226};
 227
 228blk_status_t errno_to_blk_status(int errno)
 229{
 230        int i;
 231
 232        for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
 233                if (blk_errors[i].errno == errno)
 234                        return (__force blk_status_t)i;
 235        }
 236
 237        return BLK_STS_IOERR;
 238}
 239EXPORT_SYMBOL_GPL(errno_to_blk_status);
 240
 241int blk_status_to_errno(blk_status_t status)
 242{
 243        int idx = (__force int)status;
 244
 245        if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
 246                return -EIO;
 247        return blk_errors[idx].errno;
 248}
 249EXPORT_SYMBOL_GPL(blk_status_to_errno);
 250
 251static void print_req_error(struct request *req, blk_status_t status)
 252{
 253        int idx = (__force int)status;
 254
 255        if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
 256                return;
 257
 258        printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
 259                           __func__, blk_errors[idx].name, req->rq_disk ?
 260                           req->rq_disk->disk_name : "?",
 261                           (unsigned long long)blk_rq_pos(req));
 262}
 263
 264static void req_bio_endio(struct request *rq, struct bio *bio,
 265                          unsigned int nbytes, blk_status_t error)
 266{
 267        if (error)
 268                bio->bi_status = error;
 269
 270        if (unlikely(rq->rq_flags & RQF_QUIET))
 271                bio_set_flag(bio, BIO_QUIET);
 272
 273        bio_advance(bio, nbytes);
 274
 275        /* don't actually finish bio if it's part of flush sequence */
 276        if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
 277                bio_endio(bio);
 278}
 279
 280void blk_dump_rq_flags(struct request *rq, char *msg)
 281{
 282        printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
 283                rq->rq_disk ? rq->rq_disk->disk_name : "?",
 284                (unsigned long long) rq->cmd_flags);
 285
 286        printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
 287               (unsigned long long)blk_rq_pos(rq),
 288               blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
 289        printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
 290               rq->bio, rq->biotail, blk_rq_bytes(rq));
 291}
 292EXPORT_SYMBOL(blk_dump_rq_flags);
 293
 294static void blk_delay_work(struct work_struct *work)
 295{
 296        struct request_queue *q;
 297
 298        q = container_of(work, struct request_queue, delay_work.work);
 299        spin_lock_irq(q->queue_lock);
 300        __blk_run_queue(q);
 301        spin_unlock_irq(q->queue_lock);
 302}
 303
 304/**
 305 * blk_delay_queue - restart queueing after defined interval
 306 * @q:          The &struct request_queue in question
 307 * @msecs:      Delay in msecs
 308 *
 309 * Description:
 310 *   Sometimes queueing needs to be postponed for a little while, to allow
 311 *   resources to come back. This function will make sure that queueing is
 312 *   restarted around the specified time.
 313 */
 314void blk_delay_queue(struct request_queue *q, unsigned long msecs)
 315{
 316        lockdep_assert_held(q->queue_lock);
 317        WARN_ON_ONCE(q->mq_ops);
 318
 319        if (likely(!blk_queue_dead(q)))
 320                queue_delayed_work(kblockd_workqueue, &q->delay_work,
 321                                   msecs_to_jiffies(msecs));
 322}
 323EXPORT_SYMBOL(blk_delay_queue);
 324
 325/**
 326 * blk_start_queue_async - asynchronously restart a previously stopped queue
 327 * @q:    The &struct request_queue in question
 328 *
 329 * Description:
 330 *   blk_start_queue_async() will clear the stop flag on the queue, and
 331 *   ensure that the request_fn for the queue is run from an async
 332 *   context.
 333 **/
 334void blk_start_queue_async(struct request_queue *q)
 335{
 336        lockdep_assert_held(q->queue_lock);
 337        WARN_ON_ONCE(q->mq_ops);
 338
 339        queue_flag_clear(QUEUE_FLAG_STOPPED, q);
 340        blk_run_queue_async(q);
 341}
 342EXPORT_SYMBOL(blk_start_queue_async);
 343
 344/**
 345 * blk_start_queue - restart a previously stopped queue
 346 * @q:    The &struct request_queue in question
 347 *
 348 * Description:
 349 *   blk_start_queue() will clear the stop flag on the queue, and call
 350 *   the request_fn for the queue if it was in a stopped state when
 351 *   entered. Also see blk_stop_queue().
 352 **/
 353void blk_start_queue(struct request_queue *q)
 354{
 355        lockdep_assert_held(q->queue_lock);
 356        WARN_ON_ONCE(q->mq_ops);
 357
 358        queue_flag_clear(QUEUE_FLAG_STOPPED, q);
 359        __blk_run_queue(q);
 360}
 361EXPORT_SYMBOL(blk_start_queue);
 362
 363/**
 364 * blk_stop_queue - stop a queue
 365 * @q:    The &struct request_queue in question
 366 *
 367 * Description:
 368 *   The Linux block layer assumes that a block driver will consume all
 369 *   entries on the request queue when the request_fn strategy is called.
 370 *   Often this will not happen, because of hardware limitations (queue
 371 *   depth settings). If a device driver gets a 'queue full' response,
 372 *   or if it simply chooses not to queue more I/O at one point, it can
 373 *   call this function to prevent the request_fn from being called until
 374 *   the driver has signalled it's ready to go again. This happens by calling
 375 *   blk_start_queue() to restart queue operations.
 376 **/
 377void blk_stop_queue(struct request_queue *q)
 378{
 379        lockdep_assert_held(q->queue_lock);
 380        WARN_ON_ONCE(q->mq_ops);
 381
 382        cancel_delayed_work(&q->delay_work);
 383        queue_flag_set(QUEUE_FLAG_STOPPED, q);
 384}
 385EXPORT_SYMBOL(blk_stop_queue);
 386
 387/**
 388 * blk_sync_queue - cancel any pending callbacks on a queue
 389 * @q: the queue
 390 *
 391 * Description:
 392 *     The block layer may perform asynchronous callback activity
 393 *     on a queue, such as calling the unplug function after a timeout.
 394 *     A block device may call blk_sync_queue to ensure that any
 395 *     such activity is cancelled, thus allowing it to release resources
 396 *     that the callbacks might use. The caller must already have made sure
 397 *     that its ->make_request_fn will not re-add plugging prior to calling
 398 *     this function.
 399 *
 400 *     This function does not cancel any asynchronous activity arising
 401 *     out of elevator or throttling code. That would require elevator_exit()
 402 *     and blkcg_exit_queue() to be called with queue lock initialized.
 403 *
 404 */
 405void blk_sync_queue(struct request_queue *q)
 406{
 407        del_timer_sync(&q->timeout);
 408        cancel_work_sync(&q->timeout_work);
 409
 410        if (q->mq_ops) {
 411                struct blk_mq_hw_ctx *hctx;
 412                int i;
 413
 414                cancel_delayed_work_sync(&q->requeue_work);
 415                queue_for_each_hw_ctx(q, hctx, i)
 416                        cancel_delayed_work_sync(&hctx->run_work);
 417        } else {
 418                cancel_delayed_work_sync(&q->delay_work);
 419        }
 420}
 421EXPORT_SYMBOL(blk_sync_queue);
 422
 423/**
 424 * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
 425 * @q: request queue pointer
 426 *
 427 * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
 428 * set and 1 if the flag was already set.
 429 */
 430int blk_set_preempt_only(struct request_queue *q)
 431{
 432        return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
 433}
 434EXPORT_SYMBOL_GPL(blk_set_preempt_only);
 435
 436void blk_clear_preempt_only(struct request_queue *q)
 437{
 438        blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
 439        wake_up_all(&q->mq_freeze_wq);
 440}
 441EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
 442
 443/**
 444 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
 445 * @q:  The queue to run
 446 *
 447 * Description:
 448 *    Invoke request handling on a queue if there are any pending requests.
 449 *    May be used to restart request handling after a request has completed.
 450 *    This variant runs the queue whether or not the queue has been
 451 *    stopped. Must be called with the queue lock held and interrupts
 452 *    disabled. See also @blk_run_queue.
 453 */
 454inline void __blk_run_queue_uncond(struct request_queue *q)
 455{
 456        lockdep_assert_held(q->queue_lock);
 457        WARN_ON_ONCE(q->mq_ops);
 458
 459        if (unlikely(blk_queue_dead(q)))
 460                return;
 461
 462        /*
 463         * Some request_fn implementations, e.g. scsi_request_fn(), unlock
 464         * the queue lock internally. As a result multiple threads may be
 465         * running such a request function concurrently. Keep track of the
 466         * number of active request_fn invocations such that blk_drain_queue()
 467         * can wait until all these request_fn calls have finished.
 468         */
 469        q->request_fn_active++;
 470        q->request_fn(q);
 471        q->request_fn_active--;
 472}
 473EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
 474
 475/**
 476 * __blk_run_queue - run a single device queue
 477 * @q:  The queue to run
 478 *
 479 * Description:
 480 *    See @blk_run_queue.
 481 */
 482void __blk_run_queue(struct request_queue *q)
 483{
 484        lockdep_assert_held(q->queue_lock);
 485        WARN_ON_ONCE(q->mq_ops);
 486
 487        if (unlikely(blk_queue_stopped(q)))
 488                return;
 489
 490        __blk_run_queue_uncond(q);
 491}
 492EXPORT_SYMBOL(__blk_run_queue);
 493
 494/**
 495 * blk_run_queue_async - run a single device queue in workqueue context
 496 * @q:  The queue to run
 497 *
 498 * Description:
 499 *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
 500 *    of us.
 501 *
 502 * Note:
 503 *    Since it is not allowed to run q->delay_work after blk_cleanup_queue()
 504 *    has canceled q->delay_work, callers must hold the queue lock to avoid
 505 *    race conditions between blk_cleanup_queue() and blk_run_queue_async().
 506 */
 507void blk_run_queue_async(struct request_queue *q)
 508{
 509        lockdep_assert_held(q->queue_lock);
 510        WARN_ON_ONCE(q->mq_ops);
 511
 512        if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
 513                mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
 514}
 515EXPORT_SYMBOL(blk_run_queue_async);
 516
 517/**
 518 * blk_run_queue - run a single device queue
 519 * @q: The queue to run
 520 *
 521 * Description:
 522 *    Invoke request handling on this queue, if it has pending work to do.
 523 *    May be used to restart queueing when a request has completed.
 524 */
 525void blk_run_queue(struct request_queue *q)
 526{
 527        unsigned long flags;
 528
 529        WARN_ON_ONCE(q->mq_ops);
 530
 531        spin_lock_irqsave(q->queue_lock, flags);
 532        __blk_run_queue(q);
 533        spin_unlock_irqrestore(q->queue_lock, flags);
 534}
 535EXPORT_SYMBOL(blk_run_queue);
 536
 537void blk_put_queue(struct request_queue *q)
 538{
 539        kobject_put(&q->kobj);
 540}
 541EXPORT_SYMBOL(blk_put_queue);
 542
 543/**
 544 * __blk_drain_queue - drain requests from request_queue
 545 * @q: queue to drain
 546 * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
 547 *
 548 * Drain requests from @q.  If @drain_all is set, all requests are drained.
 549 * If not, only ELVPRIV requests are drained.  The caller is responsible
 550 * for ensuring that no new requests which need to be drained are queued.
 551 */
 552static void __blk_drain_queue(struct request_queue *q, bool drain_all)
 553        __releases(q->queue_lock)
 554        __acquires(q->queue_lock)
 555{
 556        int i;
 557
 558        lockdep_assert_held(q->queue_lock);
 559        WARN_ON_ONCE(q->mq_ops);
 560
 561        while (true) {
 562                bool drain = false;
 563
 564                /*
 565                 * The caller might be trying to drain @q before its
 566                 * elevator is initialized.
 567                 */
 568                if (q->elevator)
 569                        elv_drain_elevator(q);
 570
 571                blkcg_drain_queue(q);
 572
 573                /*
 574                 * This function might be called on a queue which failed
 575                 * driver init after queue creation or is not yet fully
 576                 * active yet.  Some drivers (e.g. fd and loop) get unhappy
 577                 * in such cases.  Kick queue iff dispatch queue has
 578                 * something on it and @q has request_fn set.
 579                 */
 580                if (!list_empty(&q->queue_head) && q->request_fn)
 581                        __blk_run_queue(q);
 582
 583                drain |= q->nr_rqs_elvpriv;
 584                drain |= q->request_fn_active;
 585
 586                /*
 587                 * Unfortunately, requests are queued at and tracked from
 588                 * multiple places and there's no single counter which can
 589                 * be drained.  Check all the queues and counters.
 590                 */
 591                if (drain_all) {
 592                        struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 593                        drain |= !list_empty(&q->queue_head);
 594                        for (i = 0; i < 2; i++) {
 595                                drain |= q->nr_rqs[i];
 596                                drain |= q->in_flight[i];
 597                                if (fq)
 598                                    drain |= !list_empty(&fq->flush_queue[i]);
 599                        }
 600                }
 601
 602                if (!drain)
 603                        break;
 604
 605                spin_unlock_irq(q->queue_lock);
 606
 607                msleep(10);
 608
 609                spin_lock_irq(q->queue_lock);
 610        }
 611
 612        /*
 613         * With queue marked dead, any woken up waiter will fail the
 614         * allocation path, so the wakeup chaining is lost and we're
 615         * left with hung waiters. We need to wake up those waiters.
 616         */
 617        if (q->request_fn) {
 618                struct request_list *rl;
 619
 620                blk_queue_for_each_rl(rl, q)
 621                        for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
 622                                wake_up_all(&rl->wait[i]);
 623        }
 624}
 625
 626void blk_drain_queue(struct request_queue *q)
 627{
 628        spin_lock_irq(q->queue_lock);
 629        __blk_drain_queue(q, true);
 630        spin_unlock_irq(q->queue_lock);
 631}
 632
 633/**
 634 * blk_queue_bypass_start - enter queue bypass mode
 635 * @q: queue of interest
 636 *
 637 * In bypass mode, only the dispatch FIFO queue of @q is used.  This
 638 * function makes @q enter bypass mode and drains all requests which were
 639 * throttled or issued before.  On return, it's guaranteed that no request
 640 * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
 641 * inside queue or RCU read lock.
 642 */
 643void blk_queue_bypass_start(struct request_queue *q)
 644{
 645        WARN_ON_ONCE(q->mq_ops);
 646
 647        spin_lock_irq(q->queue_lock);
 648        q->bypass_depth++;
 649        queue_flag_set(QUEUE_FLAG_BYPASS, q);
 650        spin_unlock_irq(q->queue_lock);
 651
 652        /*
 653         * Queues start drained.  Skip actual draining till init is
 654         * complete.  This avoids lenghty delays during queue init which
 655         * can happen many times during boot.
 656         */
 657        if (blk_queue_init_done(q)) {
 658                spin_lock_irq(q->queue_lock);
 659                __blk_drain_queue(q, false);
 660                spin_unlock_irq(q->queue_lock);
 661
 662                /* ensure blk_queue_bypass() is %true inside RCU read lock */
 663                synchronize_rcu();
 664        }
 665}
 666EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
 667
 668/**
 669 * blk_queue_bypass_end - leave queue bypass mode
 670 * @q: queue of interest
 671 *
 672 * Leave bypass mode and restore the normal queueing behavior.
 673 *
 674 * Note: although blk_queue_bypass_start() is only called for blk-sq queues,
 675 * this function is called for both blk-sq and blk-mq queues.
 676 */
 677void blk_queue_bypass_end(struct request_queue *q)
 678{
 679        spin_lock_irq(q->queue_lock);
 680        if (!--q->bypass_depth)
 681                queue_flag_clear(QUEUE_FLAG_BYPASS, q);
 682        WARN_ON_ONCE(q->bypass_depth < 0);
 683        spin_unlock_irq(q->queue_lock);
 684}
 685EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
 686
 687void blk_set_queue_dying(struct request_queue *q)
 688{
 689        blk_queue_flag_set(QUEUE_FLAG_DYING, q);
 690
 691        /*
 692         * When queue DYING flag is set, we need to block new req
 693         * entering queue, so we call blk_freeze_queue_start() to
 694         * prevent I/O from crossing blk_queue_enter().
 695         */
 696        blk_freeze_queue_start(q);
 697
 698        if (q->mq_ops)
 699                blk_mq_wake_waiters(q);
 700        else {
 701                struct request_list *rl;
 702
 703                spin_lock_irq(q->queue_lock);
 704                blk_queue_for_each_rl(rl, q) {
 705                        if (rl->rq_pool) {
 706                                wake_up_all(&rl->wait[BLK_RW_SYNC]);
 707                                wake_up_all(&rl->wait[BLK_RW_ASYNC]);
 708                        }
 709                }
 710                spin_unlock_irq(q->queue_lock);
 711        }
 712
 713        /* Make blk_queue_enter() reexamine the DYING flag. */
 714        wake_up_all(&q->mq_freeze_wq);
 715}
 716EXPORT_SYMBOL_GPL(blk_set_queue_dying);
 717
 718/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
 719void blk_exit_queue(struct request_queue *q)
 720{
 721        /*
 722         * Since the I/O scheduler exit code may access cgroup information,
 723         * perform I/O scheduler exit before disassociating from the block
 724         * cgroup controller.
 725         */
 726        if (q->elevator) {
 727                ioc_clear_queue(q);
 728                elevator_exit(q, q->elevator);
 729                q->elevator = NULL;
 730        }
 731
 732        /*
 733         * Remove all references to @q from the block cgroup controller before
 734         * restoring @q->queue_lock to avoid that restoring this pointer causes
 735         * e.g. blkcg_print_blkgs() to crash.
 736         */
 737        blkcg_exit_queue(q);
 738
 739        /*
 740         * Since the cgroup code may dereference the @q->backing_dev_info
 741         * pointer, only decrease its reference count after having removed the
 742         * association with the block cgroup controller.
 743         */
 744        bdi_put(q->backing_dev_info);
 745}
 746
 747/**
 748 * blk_cleanup_queue - shutdown a request queue
 749 * @q: request queue to shutdown
 750 *
 751 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
 752 * put it.  All future requests will be failed immediately with -ENODEV.
 753 */
 754void blk_cleanup_queue(struct request_queue *q)
 755{
 756        spinlock_t *lock = q->queue_lock;
 757
 758        /* mark @q DYING, no new request or merges will be allowed afterwards */
 759        mutex_lock(&q->sysfs_lock);
 760        blk_set_queue_dying(q);
 761        spin_lock_irq(lock);
 762
 763        /*
 764         * A dying queue is permanently in bypass mode till released.  Note
 765         * that, unlike blk_queue_bypass_start(), we aren't performing
 766         * synchronize_rcu() after entering bypass mode to avoid the delay
 767         * as some drivers create and destroy a lot of queues while
 768         * probing.  This is still safe because blk_release_queue() will be
 769         * called only after the queue refcnt drops to zero and nothing,
 770         * RCU or not, would be traversing the queue by then.
 771         */
 772        q->bypass_depth++;
 773        queue_flag_set(QUEUE_FLAG_BYPASS, q);
 774
 775        queue_flag_set(QUEUE_FLAG_NOMERGES, q);
 776        queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
 777        queue_flag_set(QUEUE_FLAG_DYING, q);
 778        spin_unlock_irq(lock);
 779        mutex_unlock(&q->sysfs_lock);
 780
 781        /*
 782         * Drain all requests queued before DYING marking. Set DEAD flag to
 783         * prevent that q->request_fn() gets invoked after draining finished.
 784         */
 785        blk_freeze_queue(q);
 786        spin_lock_irq(lock);
 787        queue_flag_set(QUEUE_FLAG_DEAD, q);
 788        spin_unlock_irq(lock);
 789
 790        /*
 791         * make sure all in-progress dispatch are completed because
 792         * blk_freeze_queue() can only complete all requests, and
 793         * dispatch may still be in-progress since we dispatch requests
 794         * from more than one contexts.
 795         *
 796         * No need to quiesce queue if it isn't initialized yet since
 797         * blk_freeze_queue() should be enough for cases of passthrough
 798         * request.
 799         */
 800        if (q->mq_ops && blk_queue_init_done(q))
 801                blk_mq_quiesce_queue(q);
 802
 803        /* for synchronous bio-based driver finish in-flight integrity i/o */
 804        blk_flush_integrity();
 805
 806        /* @q won't process any more request, flush async actions */
 807        del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
 808        blk_sync_queue(q);
 809
 810        /*
 811         * I/O scheduler exit is only safe after the sysfs scheduler attribute
 812         * has been removed.
 813         */
 814        WARN_ON_ONCE(q->kobj.state_in_sysfs);
 815
 816        blk_exit_queue(q);
 817
 818        if (q->mq_ops)
 819                blk_mq_free_queue(q);
 820        percpu_ref_exit(&q->q_usage_counter);
 821
 822        spin_lock_irq(lock);
 823        if (q->queue_lock != &q->__queue_lock)
 824                q->queue_lock = &q->__queue_lock;
 825        spin_unlock_irq(lock);
 826
 827        /* @q is and will stay empty, shutdown and put */
 828        blk_put_queue(q);
 829}
 830EXPORT_SYMBOL(blk_cleanup_queue);
 831
 832/* Allocate memory local to the request queue */
 833static void *alloc_request_simple(gfp_t gfp_mask, void *data)
 834{
 835        struct request_queue *q = data;
 836
 837        return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node);
 838}
 839
 840static void free_request_simple(void *element, void *data)
 841{
 842        kmem_cache_free(request_cachep, element);
 843}
 844
 845static void *alloc_request_size(gfp_t gfp_mask, void *data)
 846{
 847        struct request_queue *q = data;
 848        struct request *rq;
 849
 850        rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask,
 851                        q->node);
 852        if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) {
 853                kfree(rq);
 854                rq = NULL;
 855        }
 856        return rq;
 857}
 858
 859static void free_request_size(void *element, void *data)
 860{
 861        struct request_queue *q = data;
 862
 863        if (q->exit_rq_fn)
 864                q->exit_rq_fn(q, element);
 865        kfree(element);
 866}
 867
 868int blk_init_rl(struct request_list *rl, struct request_queue *q,
 869                gfp_t gfp_mask)
 870{
 871        if (unlikely(rl->rq_pool) || q->mq_ops)
 872                return 0;
 873
 874        rl->q = q;
 875        rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
 876        rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
 877        init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
 878        init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
 879
 880        if (q->cmd_size) {
 881                rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
 882                                alloc_request_size, free_request_size,
 883                                q, gfp_mask, q->node);
 884        } else {
 885                rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
 886                                alloc_request_simple, free_request_simple,
 887                                q, gfp_mask, q->node);
 888        }
 889        if (!rl->rq_pool)
 890                return -ENOMEM;
 891
 892        if (rl != &q->root_rl)
 893                WARN_ON_ONCE(!blk_get_queue(q));
 894
 895        return 0;
 896}
 897
 898void blk_exit_rl(struct request_queue *q, struct request_list *rl)
 899{
 900        if (rl->rq_pool) {
 901                mempool_destroy(rl->rq_pool);
 902                if (rl != &q->root_rl)
 903                        blk_put_queue(q);
 904        }
 905}
 906
 907struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 908{
 909        return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL);
 910}
 911EXPORT_SYMBOL(blk_alloc_queue);
 912
 913/**
 914 * blk_queue_enter() - try to increase q->q_usage_counter
 915 * @q: request queue pointer
 916 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
     *
     * Loops until a live reference on q->q_usage_counter has been taken.
     * Callers that did not pass BLK_MQ_REQ_PREEMPT are additionally refused
     * while the queue is in preempt-only mode.
     *
     * Return: 0 with the reference held; -EBUSY if the attempt failed and
     * BLK_MQ_REQ_NOWAIT is set; -ENODEV if the queue is dying after the wait.
 917 */
 918int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 919{
 920        const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
 921
 922        while (true) {
 923                bool success = false;
 924
 925                rcu_read_lock();
 926                if (percpu_ref_tryget_live(&q->q_usage_counter)) {
 927                        /*
 928                         * The code that sets the PREEMPT_ONLY flag is
 929                         * responsible for ensuring that that flag is globally
 930                         * visible before the queue is unfrozen.
 931                         */
 932                        if (preempt || !blk_queue_preempt_only(q)) {
 933                                success = true;
 934                        } else {
                                    /* Not allowed in: give the reference back. */
 935                                percpu_ref_put(&q->q_usage_counter);
 936                        }
 937                }
 938                rcu_read_unlock();
 939
 940                if (success)
 941                        return 0;
 942
 943                if (flags & BLK_MQ_REQ_NOWAIT)
 944                        return -EBUSY;
 945
 946                /*
 947                 * read pair of barrier in blk_freeze_queue_start(),
 948                 * we need to order reading __PERCPU_REF_DEAD flag of
 949                 * .q_usage_counter and reading .mq_freeze_depth or
 950                 * queue dying flag, otherwise the following wait may
 951                 * never return if the two reads are reordered.
 952                 */
 953                smp_rmb();
 954
                    /*
                     * Sleep until mq_freeze_depth is zero and this caller is
                     * allowed in (preempt caller, or queue not preempt-only),
                     * or until the queue is dying; then bail out or retry.
                     */
 955                wait_event(q->mq_freeze_wq,
 956                           (atomic_read(&q->mq_freeze_depth) == 0 &&
 957                            (preempt || !blk_queue_preempt_only(q))) ||
 958                           blk_queue_dying(q));
 959                if (blk_queue_dying(q))
 960                        return -ENODEV;
 961        }
 962}
 963
 964void blk_queue_exit(struct request_queue *q)
 965{
 966        percpu_ref_put(&q->q_usage_counter);
 967}
 968
 969static void blk_queue_usage_counter_release(struct percpu_ref *ref)
 970{
 971        struct request_queue *q =
 972                container_of(ref, struct request_queue, q_usage_counter);
 973
 974        wake_up_all(&q->mq_freeze_wq);
 975}
 976
 977static void blk_rq_timed_out_timer(struct timer_list *t)
 978{
 979        struct request_queue *q = from_timer(q, t, timeout);
 980
 981        kblockd_schedule_work(&q->timeout_work);
 982}
 983
 984/**
 985 * blk_alloc_queue_node - allocate a request queue
 986 * @gfp_mask: memory allocation flags
 987 * @node_id: NUMA node to allocate memory from
 988 * @lock: For legacy queues, pointer to a spinlock that will be used to e.g.
 989 *        serialize calls to the legacy .request_fn() callback. Ignored for
 990 *        blk-mq request queues.
 991 *
 992 * Note: pass the queue lock as the third argument to this function instead of
 993 * setting the queue lock pointer explicitly to avoid triggering a sporadic
 994 * crash in the blkcg code. This function namely calls blkcg_init_queue() and
 995 * the queue lock pointer must be set before blkcg_init_queue() is called.
     *
     * Return: the new, zero-initialised request queue, or %NULL if any
     * allocation or initialisation step fails; in that case everything that
     * was set up before the failure is released again.
 996 */
 997struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
 998                                           spinlock_t *lock)
 999{
1000        struct request_queue *q;

1001        int ret;
1002
1003        q = kmem_cache_alloc_node(blk_requestq_cachep,
1004                                gfp_mask | __GFP_ZERO, node_id);
1005        if (!q)
1006                return NULL;
1007
1008        INIT_LIST_HEAD(&q->queue_head);
1009        q->last_merge = NULL;
1010        q->end_sector = 0;
1011        q->boundary_rq = NULL;
1012
1013        q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
1014        if (q->id < 0)
1015                goto fail_q;
1016
1017        ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
1018        if (ret)
1019                goto fail_id;
1020
1021        q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
1022        if (!q->backing_dev_info)
1023                goto fail_split;
1024
1025        q->stats = blk_alloc_queue_stats();
1026        if (!q->stats)
1027                goto fail_stats;
1028
1029        q->backing_dev_info->ra_pages =
1030                        (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
1031        q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
1032        q->backing_dev_info->name = "block";
1033        q->node = node_id;
1034
1035        timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
1036                    laptop_mode_timer_fn, 0);
1037        timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
            /*
             * No handler yet: blk_init_allocated_queue() re-initialises this
             * work item with blk_timeout_work.
             */
1038        INIT_WORK(&q->timeout_work, NULL);
1039        INIT_LIST_HEAD(&q->timeout_list);
1040        INIT_LIST_HEAD(&q->icq_list);
1041#ifdef CONFIG_BLK_CGROUP
1042        INIT_LIST_HEAD(&q->blkg_list);
1043#endif
1044        INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
1045
1046        kobject_init(&q->kobj, &blk_queue_ktype);
1047
1048#ifdef CONFIG_BLK_DEV_IO_TRACE
1049        mutex_init(&q->blk_trace_mutex);
1050#endif
1051        mutex_init(&q->sysfs_lock);
1052        spin_lock_init(&q->__queue_lock);
1053
            /* Use the caller's lock if one was given, else the embedded one. */
1054        if (!q->mq_ops)
1055                q->queue_lock = lock ? : &q->__queue_lock;
1056
1057        /*
1058         * A queue starts its life with bypass turned on to avoid
1059         * unnecessary bypass on/off overhead and nasty surprises during
1060         * init.  The initial bypass will be finished when the queue is
1061         * registered by blk_register_queue().
1062         */
1063        q->bypass_depth = 1;
1064        queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q);
1065
1066        init_waitqueue_head(&q->mq_freeze_wq);
1067
1068        /*
1069         * Init percpu_ref in atomic mode so that it's faster to shutdown.
1070         * See blk_register_queue() for details.
             *
             * NOTE(review): this allocation passes GFP_KERNEL rather than
             * @gfp_mask.
1071         */
1072        if (percpu_ref_init(&q->q_usage_counter,
1073                                blk_queue_usage_counter_release,
1074                                PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
1075                goto fail_bdi;
1076
1077        if (blkcg_init_queue(q))
1078                goto fail_ref;
1079
1080        return q;
1081
            /* Error unwinding: release resources in reverse order of setup. */
1082fail_ref:
1083        percpu_ref_exit(&q->q_usage_counter);
1084fail_bdi:
1085        blk_free_queue_stats(q->stats);
1086fail_stats:
1087        bdi_put(q->backing_dev_info);
1088fail_split:
1089        bioset_exit(&q->bio_split);
1090fail_id:
1091        ida_simple_remove(&blk_queue_ida, q->id);
1092fail_q:
1093        kmem_cache_free(blk_requestq_cachep, q);
1094        return NULL;
1095}
1096EXPORT_SYMBOL(blk_alloc_queue_node);
1097
1098/**
1099 * blk_init_queue  - prepare a request queue for use with a block device
1100 * @rfn:  The function to be called to process requests that have been
1101 *        placed on the queue.
1102 * @lock: Request queue spin lock
1103 *
1104 * Description:
1105 *    If a block device wishes to use the standard request handling procedures,
1106 *    which sorts requests and coalesces adjacent requests, then it must
1107 *    call blk_init_queue().  The function @rfn will be called when there
1108 *    are requests on the queue that need to be processed.  If the device
1109 *    supports plugging, then @rfn may not be called immediately when requests
1110 *    are available on the queue, but may be called at some time later instead.
1111 *    Plugged queues are generally unplugged when a buffer belonging to one
1112 *    of the requests on the queue is needed, or due to memory pressure.
1113 *
1114 *    @rfn is not required, or even expected, to remove all requests off the
1115 *    queue, but only as many as it can handle at a time.  If it does leave
1116 *    requests on the queue, it is responsible for arranging that the requests
1117 *    get dealt with eventually.
1118 *
1119 *    The queue spin lock must be held while manipulating the requests on the
1120 *    request queue; this lock will be taken also from interrupt context, so irq
1121 *    disabling is needed for it.
1122 *
1123 *    Function returns a pointer to the initialized request queue, or %NULL if
1124 *    it didn't succeed.
1125 *
1126 * Note:
1127 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
1128 *    when the block device is deactivated (such as at module unload).
1129 **/
1130
1131struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1132{
1133        return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
1134}
1135EXPORT_SYMBOL(blk_init_queue);
1136
1137struct request_queue *
1138blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1139{
1140        struct request_queue *q;
1141
1142        q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock);
1143        if (!q)
1144                return NULL;
1145
1146        q->request_fn = rfn;
1147        if (blk_init_allocated_queue(q) < 0) {
1148                blk_cleanup_queue(q);
1149                return NULL;
1150        }
1151
1152        return q;
1153}
1154EXPORT_SYMBOL(blk_init_queue_node);
1155
1156static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
1157
1158
1159int blk_init_allocated_queue(struct request_queue *q)
1160{
1161        WARN_ON_ONCE(q->mq_ops);
1162
1163        q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
1164        if (!q->fq)
1165                return -ENOMEM;
1166
1167        if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL))
1168                goto out_free_flush_queue;
1169
1170        if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
1171                goto out_exit_flush_rq;
1172
1173        INIT_WORK(&q->timeout_work, blk_timeout_work);
1174        q->queue_flags          |= QUEUE_FLAG_DEFAULT;
1175
1176        /*
1177         * This also sets hw/phys segments, boundary and size
1178         */
1179        blk_queue_make_request(q, blk_queue_bio);
1180
1181        q->sg_reserved_size = INT_MAX;
1182
1183        if (elevator_init(q))
1184                goto out_exit_flush_rq;
1185        return 0;
1186
1187out_exit_flush_rq:
1188        if (q->exit_rq_fn)
1189                q->exit_rq_fn(q, q->fq->flush_rq);
1190out_free_flush_queue:
1191        blk_free_flush_queue(q->fq);
1192        q->fq = NULL;
1193        return -ENOMEM;
1194}
1195EXPORT_SYMBOL(blk_init_allocated_queue);
1196
1197bool blk_get_queue(struct request_queue *q)
1198{
1199        if (likely(!blk_queue_dying(q))) {
1200                __blk_get_queue(q);
1201                return true;
1202        }
1203
1204        return false;
1205}
1206EXPORT_SYMBOL(blk_get_queue);
1207
1208static inline void blk_free_request(struct request_list *rl, struct request *rq)
1209{
1210        if (rq->rq_flags & RQF_ELVPRIV) {
1211                elv_put_request(rl->q, rq);
1212                if (rq->elv.icq)
1213                        put_io_context(rq->elv.icq->ioc);
1214        }
1215
1216        mempool_free(rq, rl->rq_pool);
1217}
1218
1219/*
1220 * ioc_batching returns true if the ioc is a valid batching request and
1221 * should be given priority access to a request.
1222 */
1223static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
1224{
1225        if (!ioc)
1226                return 0;
1227
1228        /*
1229         * Make sure the process is able to allocate at least 1 request
1230         * even if the batch times out, otherwise we could theoretically
1231         * lose wakeups.
1232         */
1233        return ioc->nr_batch_requests == q->nr_batching ||
1234                (ioc->nr_batch_requests > 0
1235                && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
1236}
1237
1238/*
1239 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
1240 * will cause the process to be a "batcher" on all queues in the system. This
1241 * is the behaviour we want though - once it gets a wakeup it should be given
1242 * a nice run.
1243 */
1244static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
1245{
1246        if (!ioc || ioc_batching(q, ioc))
1247                return;
1248
1249        ioc->nr_batch_requests = q->nr_batching;
1250        ioc->last_waited = jiffies;
1251}
1252
1253static void __freed_request(struct request_list *rl, int sync)
1254{
1255        struct request_queue *q = rl->q;
1256
1257        if (rl->count[sync] < queue_congestion_off_threshold(q))
1258                blk_clear_congested(rl, sync);
1259
1260        if (rl->count[sync] + 1 <= q->nr_requests) {
1261                if (waitqueue_active(&rl->wait[sync]))
1262                        wake_up(&rl->wait[sync]);
1263
1264                blk_clear_rl_full(rl, sync);
1265        }
1266}
1267
1268/*
1269 * A request has just been released.  Account for it, update the full and
1270 * congestion status, wake up any waiters.   Called under q->queue_lock.
1271 */
1272static void freed_request(struct request_list *rl, bool sync,
1273                req_flags_t rq_flags)
1274{
1275        struct request_queue *q = rl->q;
1276
1277        q->nr_rqs[sync]--;
1278        rl->count[sync]--;
1279        if (rq_flags & RQF_ELVPRIV)
1280                q->nr_rqs_elvpriv--;
1281
1282        __freed_request(rl, sync);
1283
1284        if (unlikely(rl->starved[sync ^ 1]))
1285                __freed_request(rl, sync ^ 1);
1286}
1287
1288int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
1289{
1290        struct request_list *rl;
1291        int on_thresh, off_thresh;
1292
1293        WARN_ON_ONCE(q->mq_ops);
1294
1295        spin_lock_irq(q->queue_lock);
1296        q->nr_requests = nr;
1297        blk_queue_congestion_threshold(q);
1298        on_thresh = queue_congestion_on_threshold(q);
1299        off_thresh = queue_congestion_off_threshold(q);
1300
1301        blk_queue_for_each_rl(rl, q) {
1302                if (rl->count[BLK_RW_SYNC] >= on_thresh)
1303                        blk_set_congested(rl, BLK_RW_SYNC);
1304                else if (rl->count[BLK_RW_SYNC] < off_thresh)
1305                        blk_clear_congested(rl, BLK_RW_SYNC);
1306
1307                if (rl->count[BLK_RW_ASYNC] >= on_thresh)
1308                        blk_set_congested(rl, BLK_RW_ASYNC);
1309                else if (rl->count[BLK_RW_ASYNC] < off_thresh)
1310                        blk_clear_congested(rl, BLK_RW_ASYNC);
1311
1312                if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
1313                        blk_set_rl_full(rl, BLK_RW_SYNC);
1314                } else {
1315                        blk_clear_rl_full(rl, BLK_RW_SYNC);
1316                        wake_up(&rl->wait[BLK_RW_SYNC]);
1317                }
1318
1319                if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
1320                        blk_set_rl_full(rl, BLK_RW_ASYNC);
1321                } else {
1322                        blk_clear_rl_full(rl, BLK_RW_ASYNC);
1323                        wake_up(&rl->wait[BLK_RW_ASYNC]);
1324                }
1325        }
1326
1327        spin_unlock_irq(q->queue_lock);
1328        return 0;
1329}
1330
1331/**
1332 * __get_request - get a free request
1333 * @rl: request list to allocate from
1334 * @op: operation and flags
1335 * @bio: bio to allocate request for (can be %NULL)
1336 * @flags: BLK_MQ_REQ_* flags; only BLK_MQ_REQ_PREEMPT is examined here
1337 * @gfp_mask: allocator flags
1338 *
1339 * Get a free request from @q.  This function may fail under memory
1340 * pressure or if @q is dead.
1341 *
1342 * Must be called with @q->queue_lock held and,
1343 * Returns ERR_PTR on failure, with @q->queue_lock held.
1344 * Returns request pointer on success, with @q->queue_lock *not held*.
1345 */
1346static struct request *__get_request(struct request_list *rl, unsigned int op,
1347                struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask)
1348{
1349        struct request_queue *q = rl->q;
1350        struct request *rq;
1351        struct elevator_type *et = q->elevator->type;
1352        struct io_context *ioc = rq_ioc(bio);
1353        struct io_cq *icq = NULL;
1354        const bool is_sync = op_is_sync(op);
1355        int may_queue;
1356        req_flags_t rq_flags = RQF_ALLOCED;
1357
1358        lockdep_assert_held(q->queue_lock);
1359
1360        if (unlikely(blk_queue_dying(q)))
1361                return ERR_PTR(-ENODEV);
1362
1363        may_queue = elv_may_queue(q, op);
1364        if (may_queue == ELV_MQUEUE_NO)
1365                goto rq_starved;
1366
1367        if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
1368                if (rl->count[is_sync]+1 >= q->nr_requests) {
1369                        /*
1370                         * The queue will fill after this allocation, so set
1371                         * it as full, and mark this process as "batching".
1372                         * This process will be allowed to complete a batch of
1373                         * requests, others will be blocked.
1374                         */
1375                        if (!blk_rl_full(rl, is_sync)) {
1376                                ioc_set_batching(q, ioc);
1377                                blk_set_rl_full(rl, is_sync);
1378                        } else {
1379                                if (may_queue != ELV_MQUEUE_MUST
1380                                                && !ioc_batching(q, ioc)) {
1381                                        /*
1382                                         * The queue is full and the allocating
1383                                         * process is not a "batcher", and not
1384                                         * exempted by the IO scheduler
1385                                         */
1386                                        return ERR_PTR(-ENOMEM);
1387                                }
1388                        }
1389                }
1390                blk_set_congested(rl, is_sync);
1391        }
1392
1393        /*
1394         * Only allow batching queuers to allocate up to 50% over the defined
1395         * limit of requests, otherwise we could have thousands of requests
1396         * allocated with any setting of ->nr_requests
1397         */
1398        if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
1399                return ERR_PTR(-ENOMEM);
1400
            /* Account for the request before the lock is dropped below. */
1401        q->nr_rqs[is_sync]++;
1402        rl->count[is_sync]++;
1403        rl->starved[is_sync] = 0;
1404
1405        /*
1406         * Decide whether the new request will be managed by elevator.  If
1407         * so, mark @rq_flags and increment elvpriv.  Non-zero elvpriv will
1408         * prevent the current elevator from being destroyed until the new
1409         * request is freed.  This guarantees icq's won't be destroyed and
1410         * makes creating new ones safe.
1411         *
1412         * Flush requests do not use the elevator so skip initialization.
1413         * This allows a request to share the flush and elevator data.
1414         *
1415         * Also, lookup icq while holding queue_lock.  If it doesn't exist,
1416         * it will be created after releasing queue_lock.
1417         */
1418        if (!op_is_flush(op) && !blk_queue_bypass(q)) {
1419                rq_flags |= RQF_ELVPRIV;
1420                q->nr_rqs_elvpriv++;
1421                if (et->icq_cache && ioc)
1422                        icq = ioc_lookup_icq(ioc, q);
1423        }
1424
1425        if (blk_queue_io_stat(q))
1426                rq_flags |= RQF_IO_STAT;
            /*
             * From here on the queue lock is not held; the success path
             * returns without it, the fail_alloc path re-takes it.
             */
1427        spin_unlock_irq(q->queue_lock);
1428
1429        /* allocate and init request */
1430        rq = mempool_alloc(rl->rq_pool, gfp_mask);
1431        if (!rq)
1432                goto fail_alloc;
1433
1434        blk_rq_init(q, rq);
1435        blk_rq_set_rl(rq, rl);
1436        rq->cmd_flags = op;
1437        rq->rq_flags = rq_flags;
1438        if (flags & BLK_MQ_REQ_PREEMPT)
1439                rq->rq_flags |= RQF_PREEMPT;
1440
1441        /* init elvpriv */
1442        if (rq_flags & RQF_ELVPRIV) {
1443                if (unlikely(et->icq_cache && !icq)) {
1444                        if (ioc)
1445                                icq = ioc_create_icq(ioc, q, gfp_mask);
1446                        if (!icq)
1447                                goto fail_elvpriv;
1448                }
1449
1450                rq->elv.icq = icq;
1451                if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
1452                        goto fail_elvpriv;
1453
1454                /* @rq->elv.icq holds io_context until @rq is freed */
1455                if (icq)
1456                        get_io_context(icq->ioc);
1457        }
1458out:
1459        /*
1460         * ioc may be NULL here, and ioc_batching will be false. That's
1461         * OK, if the queue is under the request limit then requests need
1462         * not count toward the nr_batch_requests limit. There will always
1463         * be some limit enforced by BLK_BATCH_TIME.
1464         */
1465        if (ioc_batching(q, ioc))
1466                ioc->nr_batch_requests--;
1467
1468        trace_block_getrq(q, bio, op);
1469        return rq;
1470
1471fail_elvpriv:
1472        /*
1473         * elvpriv init failed.  ioc, icq and elvpriv aren't mempool backed
1474         * and may fail indefinitely under memory pressure and thus
1475         * shouldn't stall IO.  Treat this request as !elvpriv.  This will
1476         * disturb iosched and blkcg but weird is better than dead.
1477         */
1478        printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
1479                           __func__, dev_name(q->backing_dev_info->dev));
1480
1481        rq->rq_flags &= ~RQF_ELVPRIV;
1482        rq->elv.icq = NULL;
1483
            /* Undo the elvpriv accounting done before the lock was dropped. */
1484        spin_lock_irq(q->queue_lock);
1485        q->nr_rqs_elvpriv--;
1486        spin_unlock_irq(q->queue_lock);
1487        goto out;
1488
1489fail_alloc:
1490        /*
1491         * Allocation failed presumably due to memory. Undo anything we
1492         * might have messed up.
1493         *
1494         * Allocating task should really be put onto the front of the wait
1495         * queue, but this is pretty rare.
1496         */
1497        spin_lock_irq(q->queue_lock);
1498        freed_request(rl, is_sync, rq_flags);
1499
1500        /*
1501         * in the very unlikely event that allocation failed and no
1502         * requests for this direction was pending, mark us starved so that
1503         * freeing of a request in the other direction will notice
1504         * us. another possible fix would be to split the rq mempool into
1505         * READ and WRITE
1506         */
1507rq_starved:
1508        if (unlikely(rl->count[is_sync] == 0))
1509                rl->starved[is_sync] = 1;
1510        return ERR_PTR(-ENOMEM);
1511}
1512
1513/**
1514 * get_request - get a free request
1515 * @q: request_queue to allocate request from
1516 * @op: operation and flags
1517 * @bio: bio to allocate request for (can be %NULL)
1518 * @flags: BLK_MQ_REQ_* flags.
1519 * @gfp: allocator flags
1520 *
1521 * Get a free request from @q.  Unless %BLK_MQ_REQ_NOWAIT is set in @flags
     * or %REQ_NOWAIT is set in @op, this function sleeps and keeps retrying
     * when no request is available and fails iff @q is dead.  With
     * %REQ_NOWAIT a failed attempt returns ERR_PTR(-EAGAIN); with
     * %BLK_MQ_REQ_NOWAIT the error from __get_request() is returned as is.
1523 *
1524 * Must be called with @q->queue_lock held and,
1525 * Returns ERR_PTR on failure, with @q->queue_lock held.
1526 * Returns request pointer on success, with @q->queue_lock *not held*.
1527 */
1528static struct request *get_request(struct request_queue *q, unsigned int op,
1529                struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp)
1530{
1531        const bool is_sync = op_is_sync(op);
1532        DEFINE_WAIT(wait);
1533        struct request_list *rl;
1534        struct request *rq;
1535
1536        lockdep_assert_held(q->queue_lock);
1537        WARN_ON_ONCE(q->mq_ops);
1538
1539        rl = blk_get_rl(q, bio);        /* transferred to @rq on success */
1540retry:
1541        rq = __get_request(rl, op, bio, flags, gfp);
1542        if (!IS_ERR(rq))
1543                return rq;
1544
            /* Every failure exit below gives the request list back first. */
1545        if (op & REQ_NOWAIT) {
1546                blk_put_rl(rl);
1547                return ERR_PTR(-EAGAIN);
1548        }
1549
1550        if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
1551                blk_put_rl(rl);
1552                return rq;
1553        }
1554
1555        /* wait on @rl and retry */
1556        prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1557                                  TASK_UNINTERRUPTIBLE);
1558
1559        trace_block_sleeprq(q, bio, op);
1560
            /* The queue lock is dropped only for the duration of the sleep. */
1561        spin_unlock_irq(q->queue_lock);
1562        io_schedule();
1563
1564        /*
1565         * After sleeping, we become a "batching" process and will be able
1566         * to allocate at least one request, and up to a big batch of them
1567         * for a small period time.  See ioc_batching, ioc_set_batching
1568         */
1569        ioc_set_batching(q, current->io_context);
1570
1571        spin_lock_irq(q->queue_lock);
1572        finish_wait(&rl->wait[is_sync], &wait);
1573
1574        goto retry;
1575}
1576
1577/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
1578static struct request *blk_old_get_request(struct request_queue *q,
1579                                unsigned int op, blk_mq_req_flags_t flags)
1580{
1581        struct request *rq;
1582        gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO;
1583        int ret = 0;
1584
1585        WARN_ON_ONCE(q->mq_ops);
1586
1587        /* create ioc upfront */
1588        create_io_context(gfp_mask, q->node);
1589
1590        ret = blk_queue_enter(q, flags);
1591        if (ret)
1592                return ERR_PTR(ret);
1593        spin_lock_irq(q->queue_lock);
1594        rq = get_request(q, op, NULL, flags, gfp_mask);
1595        if (IS_ERR(rq)) {
1596                spin_unlock_irq(q->queue_lock);
1597                blk_queue_exit(q);
1598                return rq;
1599        }
1600
1601        /* q->queue_lock is unlocked at this point */
1602        rq->__data_len = 0;
1603        rq->__sector = (sector_t) -1;
1604        rq->bio = rq->biotail = NULL;
1605        return rq;
1606}
1607
1608/**
1609 * blk_get_request - allocate a request
1610 * @q: request queue to allocate a request for
1611 * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
1612 * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
1613 */
1614struct request *blk_get_request(struct request_queue *q, unsigned int op,
1615                                blk_mq_req_flags_t flags)
1616{
1617        struct request *req;
1618
1619        WARN_ON_ONCE(op & REQ_NOWAIT);
1620        WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
1621
1622        if (q->mq_ops) {
1623                req = blk_mq_alloc_request(q, op, flags);
1624                if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
1625                        q->mq_ops->initialize_rq_fn(req);
1626        } else {
1627                req = blk_old_get_request(q, op, flags);
1628                if (!IS_ERR(req) && q->initialize_rq_fn)
1629                        q->initialize_rq_fn(req);
1630        }
1631
1632        return req;
1633}
1634EXPORT_SYMBOL(blk_get_request);
1635
1636/**
1637 * blk_requeue_request - put a request back on queue
1638 * @q:          request queue where request should be inserted
1639 * @rq:         request to be inserted
1640 *
1641 * Description:
1642 *    Drivers often keep queueing requests until the hardware cannot accept
1643 *    more, when that condition happens we need to put the request back
1644 *    on the queue. Must be called with queue lock held.
1645 */
1646void blk_requeue_request(struct request_queue *q, struct request *rq)
1647{
1648        lockdep_assert_held(q->queue_lock);
1649        WARN_ON_ONCE(q->mq_ops);
1650
1651        blk_delete_timer(rq);
1652        blk_clear_rq_complete(rq);
1653        trace_block_rq_requeue(q, rq);
1654        rq_qos_requeue(q, rq);
1655
1656        if (rq->rq_flags & RQF_QUEUED)
1657                blk_queue_end_tag(q, rq);
1658
1659        BUG_ON(blk_queued_rq(rq));
1660
1661        elv_requeue_request(q, rq);
1662}
1663EXPORT_SYMBOL(blk_requeue_request);
1664
1665static void add_acct_request(struct request_queue *q, struct request *rq,
1666                             int where)
1667{
1668        blk_account_io_start(rq, true);
1669        __elv_add_request(q, rq, where);
1670}
1671
1672static void part_round_stats_single(struct request_queue *q, int cpu,
1673                                    struct hd_struct *part, unsigned long now,
1674                                    unsigned int inflight)
1675{
1676        if (inflight) {
1677                __part_stat_add(cpu, part, time_in_queue,
1678                                inflight * (now - part->stamp));
1679                __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1680        }
1681        part->stamp = now;
1682}
1683
1684/**
1685 * part_round_stats() - Round off the performance stats on a struct disk_stats.
1686 * @q: target block queue
1687 * @cpu: cpu number for stats access
1688 * @part: target partition
1689 *
1690 * The average IO queue length and utilisation statistics are maintained
1691 * by observing the current state of the queue length and the amount of
1692 * time it has been in this state for.
1693 *
1694 * Normally, that accounting is done on IO completion, but that can result
1695 * in more than a second's worth of IO being accounted for within any one
1696 * second, leading to >100% utilisation.  To deal with that, we call this
1697 * function to do a round-off before returning the results when reading
1698 * /proc/diskstats.  This accounts immediately for all queue usage up to
1699 * the current jiffies and restarts the counters again.
1700 */
1701void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
1702{
1703        struct hd_struct *part2 = NULL;
1704        unsigned long now = jiffies;
1705        unsigned int inflight[2];
1706        int stats = 0;
1707
1708        if (part->stamp != now)
1709                stats |= 1;
1710
1711        if (part->partno) {
1712                part2 = &part_to_disk(part)->part0;
1713                if (part2->stamp != now)
1714                        stats |= 2;
1715        }
1716
1717        if (!stats)
1718                return;
1719
1720        part_in_flight(q, part, inflight);
1721
1722        if (stats & 2)
1723                part_round_stats_single(q, cpu, part2, now, inflight[1]);
1724        if (stats & 1)
1725                part_round_stats_single(q, cpu, part, now, inflight[0]);
1726}
1727EXPORT_SYMBOL_GPL(part_round_stats);
1728
#ifdef CONFIG_PM
/*
 * Runtime-PM accounting when a request is released: for a queue with a
 * device attached and a request without RQF_PM, decrement nr_pending and,
 * when it reaches zero, mark the device as last busy.
 */
static void blk_pm_put_request(struct request *rq)
{
        struct request_queue *queue = rq->q;

        if (!queue->dev)
                return;

        if (rq->rq_flags & RQF_PM)
                return;

        if (--queue->nr_pending == 0)
                pm_runtime_mark_last_busy(queue->dev);
}
#else
/* Without CONFIG_PM there is nothing to account. */
static inline void blk_pm_put_request(struct request *rq) {}
#endif
1738
1739void __blk_put_request(struct request_queue *q, struct request *req)
1740{
1741        req_flags_t rq_flags = req->rq_flags;
1742
1743        if (unlikely(!q))
1744                return;
1745
1746        if (q->mq_ops) {
1747                blk_mq_free_request(req);
1748                return;
1749        }
1750
1751        lockdep_assert_held(q->queue_lock);
1752
1753        blk_req_zone_write_unlock(req);
1754        blk_pm_put_request(req);
1755
1756        elv_completed_request(q, req);
1757
1758        /* this is a bio leak */
1759        WARN_ON(req->bio != NULL);
1760
1761        rq_qos_done(q, req);
1762
1763        /*
1764         * Request may not have originated from ll_rw_blk. if not,
1765         * it didn't come out of our reserved rq pools
1766         */
1767        if (rq_flags & RQF_ALLOCED) {
1768                struct request_list *rl = blk_rq_rl(req);
1769                bool sync = op_is_sync(req->cmd_flags);
1770
1771                BUG_ON(!list_empty(&req->queuelist));
1772                BUG_ON(ELV_ON_HASH(req));
1773
1774                blk_free_request(rl, req);
1775                freed_request(rl, sync, rq_flags);
1776                blk_put_rl(rl);
1777                blk_queue_exit(q);
1778        }
1779}
1780EXPORT_SYMBOL_GPL(__blk_put_request);
1781
1782void blk_put_request(struct request *req)
1783{
1784        struct request_queue *q = req->q;
1785
1786        if (q->mq_ops)
1787                blk_mq_free_request(req);
1788        else {
1789                unsigned long flags;
1790
1791                spin_lock_irqsave(q->queue_lock, flags);
1792                __blk_put_request(q, req);
1793                spin_unlock_irqrestore(q->queue_lock, flags);
1794        }
1795}
1796EXPORT_SYMBOL(blk_put_request);
1797
1798bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1799                            struct bio *bio)
1800{
1801        const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
1802
1803        if (!ll_back_merge_fn(q, req, bio))
1804                return false;
1805
1806        trace_block_bio_backmerge(q, req, bio);
1807
1808        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1809                blk_rq_set_mixed_merge(req);
1810
1811        req->biotail->bi_next = bio;
1812        req->biotail = bio;
1813        req->__data_len += bio->bi_iter.bi_size;
1814        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1815
1816        blk_account_io_start(req, false);
1817        return true;
1818}
1819
1820bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1821                             struct bio *bio)
1822{
1823        const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
1824
1825        if (!ll_front_merge_fn(q, req, bio))
1826                return false;
1827
1828        trace_block_bio_frontmerge(q, req, bio);
1829
1830        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1831                blk_rq_set_mixed_merge(req);
1832
1833        bio->bi_next = req->bio;
1834        req->bio = bio;
1835
1836        req->__sector = bio->bi_iter.bi_sector;
1837        req->__data_len += bio->bi_iter.bi_size;
1838        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1839
1840        blk_account_io_start(req, false);
1841        return true;
1842}
1843
1844bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
1845                struct bio *bio)
1846{
1847        unsigned short segments = blk_rq_nr_discard_segments(req);
1848
1849        if (segments >= queue_max_discard_segments(q))
1850                goto no_merge;
1851        if (blk_rq_sectors(req) + bio_sectors(bio) >
1852            blk_rq_get_max_sectors(req, blk_rq_pos(req)))
1853                goto no_merge;
1854
1855        req->biotail->bi_next = bio;
1856        req->biotail = bio;
1857        req->__data_len += bio->bi_iter.bi_size;
1858        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1859        req->nr_phys_segments = segments + 1;
1860
1861        blk_account_io_start(req, false);
1862        return true;
1863no_merge:
1864        req_set_nomerge(q, req);
1865        return false;
1866}
1867
1868/**
1869 * blk_attempt_plug_merge - try to merge with %current's plugged list
1870 * @q: request_queue new bio is being queued at
1871 * @bio: new bio being queued
1872 * @request_count: out parameter for number of traversed plugged requests
1873 * @same_queue_rq: pointer to &struct request that gets filled in when
1874 * another request associated with @q is found on the plug list
1875 * (optional, may be %NULL)
1876 *
1877 * Determine whether @bio being queued on @q can be merged with a request
1878 * on %current's plugged list.  Returns %true if merge was successful,
1879 * otherwise %false.
1880 *
1881 * Plugging coalesces IOs from the same issuer for the same purpose without
1882 * going through @q->queue_lock.  As such it's more of an issuing mechanism
1883 * than scheduling, and the request, while may have elvpriv data, is not
1884 * added on the elevator at this point.  In addition, we don't have
1885 * reliable access to the elevator outside queue lock.  Only check basic
1886 * merging parameters without querying the elevator.
1887 *
1888 * Caller must ensure !blk_queue_nomerges(q) beforehand.
1889 */
1890bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1891                            unsigned int *request_count,
1892                            struct request **same_queue_rq)
1893{
1894        struct blk_plug *plug;
1895        struct request *rq;
1896        struct list_head *plug_list;
1897
1898        plug = current->plug;
1899        if (!plug)
1900                return false;
1901        *request_count = 0;
1902
1903        if (q->mq_ops)
1904                plug_list = &plug->mq_list;
1905        else
1906                plug_list = &plug->list;
1907
1908        list_for_each_entry_reverse(rq, plug_list, queuelist) {
1909                bool merged = false;
1910
1911                if (rq->q == q) {
1912                        (*request_count)++;
1913                        /*
1914                         * Only blk-mq multiple hardware queues case checks the
1915                         * rq in the same queue, there should be only one such
1916                         * rq in a queue
1917                         **/
1918                        if (same_queue_rq)
1919                                *same_queue_rq = rq;
1920                }
1921
1922                if (rq->q != q || !blk_rq_merge_ok(rq, bio))
1923                        continue;
1924
1925                switch (blk_try_merge(rq, bio)) {
1926                case ELEVATOR_BACK_MERGE:
1927                        merged = bio_attempt_back_merge(q, rq, bio);
1928                        break;
1929                case ELEVATOR_FRONT_MERGE:
1930                        merged = bio_attempt_front_merge(q, rq, bio);
1931                        break;
1932                case ELEVATOR_DISCARD_MERGE:
1933                        merged = bio_attempt_discard_merge(q, rq, bio);
1934                        break;
1935                default:
1936                        break;
1937                }
1938
1939                if (merged)
1940                        return true;
1941        }
1942
1943        return false;
1944}
1945
1946unsigned int blk_plug_queued_count(struct request_queue *q)
1947{
1948        struct blk_plug *plug;
1949        struct request *rq;
1950        struct list_head *plug_list;
1951        unsigned int ret = 0;
1952
1953        plug = current->plug;
1954        if (!plug)
1955                goto out;
1956
1957        if (q->mq_ops)
1958                plug_list = &plug->mq_list;
1959        else
1960                plug_list = &plug->list;
1961
1962        list_for_each_entry(rq, plug_list, queuelist) {
1963                if (rq->q == q)
1964                        ret++;
1965        }
1966out:
1967        return ret;
1968}
1969
1970void blk_init_request_from_bio(struct request *req, struct bio *bio)
1971{
1972        struct io_context *ioc = rq_ioc(bio);
1973
1974        if (bio->bi_opf & REQ_RAHEAD)
1975                req->cmd_flags |= REQ_FAILFAST_MASK;
1976
1977        req->__sector = bio->bi_iter.bi_sector;
1978        if (ioprio_valid(bio_prio(bio)))
1979                req->ioprio = bio_prio(bio);
1980        else if (ioc)
1981                req->ioprio = ioc->ioprio;
1982        else
1983                req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
1984        req->write_hint = bio->bi_write_hint;
1985        blk_rq_bio_prep(req->q, req, bio);
1986}
1987EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
1988
1989static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
1990{
1991        struct blk_plug *plug;
1992        int where = ELEVATOR_INSERT_SORT;
1993        struct request *req, *free;
1994        unsigned int request_count = 0;
1995
1996        /*
1997         * low level driver can indicate that it wants pages above a
1998         * certain limit bounced to low memory (ie for highmem, or even
1999         * ISA dma in theory)
2000         */

2001        blk_queue_bounce(q, &bio);
2002
2003        blk_queue_split(q, &bio);
2004
2005        if (!bio_integrity_prep(bio))
2006                return BLK_QC_T_NONE;
2007
2008        if (op_is_flush(bio->bi_opf)) {
2009                spin_lock_irq(q->queue_lock);
2010                where = ELEVATOR_INSERT_FLUSH;
2011                goto get_rq;
2012        }
2013
2014        /*
2015         * Check if we can merge with the plugged list before grabbing
2016         * any locks.
2017         */
2018        if (!blk_queue_nomerges(q)) {
2019                if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
2020                        return BLK_QC_T_NONE;
2021        } else
2022                request_count = blk_plug_queued_count(q);
2023
2024        spin_lock_irq(q->queue_lock);
2025
2026        switch (elv_merge(q, &req, bio)) {
2027        case ELEVATOR_BACK_MERGE:
2028                if (!bio_attempt_back_merge(q, req, bio))
2029                        break;
2030                elv_bio_merged(q, req, bio);
2031                free = attempt_back_merge(q, req);
2032                if (free)
2033                        __blk_put_request(q, free);
2034                else
2035                        elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
2036                goto out_unlock;
2037        case ELEVATOR_FRONT_MERGE:
2038                if (!bio_attempt_front_merge(q, req, bio))
2039                        break;
2040                elv_bio_merged(q, req, bio);
2041                free = attempt_front_merge(q, req);
2042                if (free)
2043                        __blk_put_request(q, free);
2044                else
2045                        elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
2046                goto out_unlock;
2047        default:
2048                break;
2049        }
2050
2051get_rq:
2052        rq_qos_throttle(q, bio, q->queue_lock);
2053
2054        /*
2055         * Grab a free request. This is might sleep but can not fail.
2056         * Returns with the queue unlocked.
2057         */
2058        blk_queue_enter_live(q);
2059        req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
2060        if (IS_ERR(req)) {
2061                blk_queue_exit(q);
2062                rq_qos_cleanup(q, bio);
2063                if (PTR_ERR(req) == -ENOMEM)
2064                        bio->bi_status = BLK_STS_RESOURCE;
2065                else
2066                        bio->bi_status = BLK_STS_IOERR;
2067                bio_endio(bio);
2068                goto out_unlock;
2069        }
2070
2071        rq_qos_track(q, req, bio);
2072
2073        /*
2074         * After dropping the lock and possibly sleeping here, our request
2075         * may now be mergeable after it had proven unmergeable (above).
2076         * We don't worry about that case for efficiency. It won't happen
2077         * often, and the elevators are able to handle it.
2078         */
2079        blk_init_request_from_bio(req, bio);
2080
2081        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
2082                req->cpu = raw_smp_processor_id();
2083
2084        plug = current->plug;
2085        if (plug) {
2086                /*
2087                 * If this is the first request added after a plug, fire
2088                 * of a plug trace.
2089                 *
2090                 * @request_count may become stale because of schedule
2091                 * out, so check plug list again.
2092                 */
2093                if (!request_count || list_empty(&plug->list))
2094                        trace_block_plug(q);
2095                else {
2096                        struct request *last = list_entry_rq(plug->list.prev);
2097                        if (request_count >= BLK_MAX_REQUEST_COUNT ||
2098                            blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) {
2099                                blk_flush_plug_list(plug, false);
2100                                trace_block_plug(q);
2101                        }
2102                }
2103                list_add_tail(&req->queuelist, &plug->list);
2104                blk_account_io_start(req, true);
2105        } else {
2106                spin_lock_irq(q->queue_lock);
2107                add_acct_request(q, req, where);
2108                __blk_run_queue(q);
2109out_unlock:
2110                spin_unlock_irq(q->queue_lock);
2111        }
2112
2113        return BLK_QC_T_NONE;
2114}
2115
2116static void handle_bad_sector(struct bio *bio, sector_t maxsector)
2117{
2118        char b[BDEVNAME_SIZE];
2119
2120        printk(KERN_INFO "attempt to access beyond end of device\n");
2121        printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
2122                        bio_devname(bio, b), bio->bi_opf,
2123                        (unsigned long long)bio_end_sector(bio),
2124                        (long long)maxsector);
2125}
2126
#ifdef CONFIG_FAIL_MAKE_REQUEST

static DECLARE_FAULT_ATTR(fail_make_request);

/* Feed the "fail_make_request=" boot parameter to the fault attribute. */
static int __init setup_fail_make_request(char *str)
{
	return setup_fault_attr(&fail_make_request, str);
}
__setup("fail_make_request=", setup_fail_make_request);

/*
 * Decide whether an I/O of @bytes to @part should be failed artificially.
 * Only partitions with make_it_fail set are ever handed to should_fail().
 */
static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
{
	if (!part->make_it_fail)
		return false;

	return should_fail(&fail_make_request, bytes);
}

/* Create the "fail_make_request" fault-injection debugfs attribute. */
static int __init fail_make_request_debugfs(void)
{
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_make_request", NULL,
					&fail_make_request);

	return PTR_ERR_OR_ZERO(dir);
}

late_initcall(fail_make_request_debugfs);

#else /* CONFIG_FAIL_MAKE_REQUEST */

/* Fault injection compiled out: never fail a request artificially. */
static inline bool should_fail_request(struct hd_struct *part,
					unsigned int bytes)
{
	return false;
}

#endif /* CONFIG_FAIL_MAKE_REQUEST */
2161
2162static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
2163{
2164        const int op = bio_op(bio);
2165
2166        if (part->policy && op_is_write(op)) {
2167                char b[BDEVNAME_SIZE];
2168
2169                if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
2170                        return false;
2171
2172                WARN_ONCE(1,
2173                       "generic_make_request: Trying to write "
2174                        "to read-only block-device %s (partno %d)\n",
2175                        bio_devname(bio, b), part->partno);
2176                /* Older lvm-tools actually trigger this */
2177                return false;
2178        }
2179
2180        return false;
2181}
2182
2183static noinline int should_fail_bio(struct bio *bio)
2184{
2185        if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
2186                return -EIO;
2187        return 0;
2188}
2189ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
2190
2191/*
2192 * Check whether this bio extends beyond the end of the device or partition.
2193 * This may well happen - the kernel calls bread() without checking the size of
2194 * the device, e.g., when mounting a file system.
2195 */
2196static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
2197{
2198        unsigned int nr_sectors = bio_sectors(bio);
2199
2200        if (nr_sectors && maxsector &&
2201            (nr_sectors > maxsector ||
2202             bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
2203                handle_bad_sector(bio, maxsector);
2204                return -EIO;
2205        }
2206        return 0;
2207}
2208
2209/*
2210 * Remap block n of partition p to block n+start(p) of the disk.
2211 */
2212static inline int blk_partition_remap(struct bio *bio)
2213{
2214        struct hd_struct *p;
2215        int ret = -EIO;
2216
2217        rcu_read_lock();
2218        p = __disk_get_part(bio->bi_disk, bio->bi_partno);
2219        if (unlikely(!p))
2220                goto out;
2221        if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
2222                goto out;
2223        if (unlikely(bio_check_ro(bio, p)))
2224                goto out;
2225
2226        /*
2227         * Zone reset does not include bi_size so bio_sectors() is always 0.
2228         * Include a test for the reset op code and perform the remap if needed.
2229         */
2230        if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) {
2231                if (bio_check_eod(bio, part_nr_sects_read(p)))
2232                        goto out;
2233                bio->bi_iter.bi_sector += p->start_sect;
2234                trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
2235                                      bio->bi_iter.bi_sector - p->start_sect);
2236        }
2237        bio->bi_partno = 0;
2238        ret = 0;
2239out:
2240        rcu_read_unlock();
2241        return ret;
2242}
2243
2244static noinline_for_stack bool
2245generic_make_request_checks(struct bio *bio)
2246{
2247        struct request_queue *q;
2248        int nr_sectors = bio_sectors(bio);
2249        blk_status_t status = BLK_STS_IOERR;
2250        char b[BDEVNAME_SIZE];
2251
2252        might_sleep();
2253
2254        q = bio->bi_disk->queue;
2255        if (unlikely(!q)) {
2256                printk(KERN_ERR
2257                       "generic_make_request: Trying to access "
2258                        "nonexistent block-device %s (%Lu)\n",
2259                        bio_devname(bio, b), (long long)bio->bi_iter.bi_sector);
2260                goto end_io;
2261        }
2262
2263        /*
2264         * For a REQ_NOWAIT based request, return -EOPNOTSUPP
2265         * if queue is not a request based queue.
2266         */
2267        if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
2268                goto not_supported;
2269
2270        if (should_fail_bio(bio))
2271                goto end_io;
2272
2273        if (bio->bi_partno) {
2274                if (unlikely(blk_partition_remap(bio)))
2275                        goto end_io;
2276        } else {
2277                if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
2278                        goto end_io;
2279                if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
2280                        goto end_io;
2281        }
2282
2283        /*
2284         * Filter flush bio's early so that make_request based
2285         * drivers without flush support don't have to worry
2286         * about them.
2287         */
2288        if (op_is_flush(bio->bi_opf) &&
2289            !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
2290                bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
2291                if (!nr_sectors) {
2292                        status = BLK_STS_OK;
2293                        goto end_io;
2294                }
2295        }
2296
2297        switch (bio_op(bio)) {
2298        case REQ_OP_DISCARD:
2299                if (!blk_queue_discard(q))
2300                        goto not_supported;
2301                break;
2302        case REQ_OP_SECURE_ERASE:
2303                if (!blk_queue_secure_erase(q))
2304                        goto not_supported;
2305                break;
2306        case REQ_OP_WRITE_SAME:
2307                if (!q->limits.max_write_same_sectors)
2308                        goto not_supported;
2309                break;
2310        case REQ_OP_ZONE_REPORT:
2311        case REQ_OP_ZONE_RESET:
2312                if (!blk_queue_is_zoned(q))
2313                        goto not_supported;
2314                break;
2315        case REQ_OP_WRITE_ZEROES:
2316                if (!q->limits.max_write_zeroes_sectors)
2317                        goto not_supported;
2318                break;
2319        default:
2320                break;
2321        }
2322
2323        /*
2324         * Various block parts want %current->io_context and lazy ioc
2325         * allocation ends up trading a lot of pain for a small amount of
2326         * memory.  Just allocate it upfront.  This may fail and block
2327         * layer knows how to live with it.
2328         */
2329        create_io_context(GFP_ATOMIC, q->node);
2330
2331        if (!blkcg_bio_issue_check(q, bio))
2332                return false;
2333
2334        if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
2335                trace_block_bio_queue(q, bio);
2336                /* Now that enqueuing has been traced, we need to trace
2337                 * completion as well.
2338                 */
2339                bio_set_flag(bio, BIO_TRACE_COMPLETION);
2340        }
2341        return true;
2342
2343not_supported:
2344        status = BLK_STS_NOTSUPP;
2345end_io:
2346        bio->bi_status = status;
2347        bio_endio(bio);
2348        return false;
2349}
2350
2351/**
2352 * generic_make_request - hand a buffer to its device driver for I/O
2353 * @bio:  The bio describing the location in memory and on the device.
2354 *
2355 * generic_make_request() is used to make I/O requests of block
2356 * devices. It is passed a &struct bio, which describes the I/O that needs
2357 * to be done.
2358 *
2359 * generic_make_request() does not return any status.  The
2360 * success/failure status of the request, along with notification of
2361 * completion, is delivered asynchronously through the bio->bi_end_io
2362 * function described (one day) else where.
2363 *
2364 * The caller of generic_make_request must make sure that bi_io_vec
2365 * are set to describe the memory buffer, and that bi_dev and bi_sector are
2366 * set to describe the device address, and the
2367 * bi_end_io and optionally bi_private are set to describe how
2368 * completion notification should be signaled.
2369 *
2370 * generic_make_request and the drivers it calls may use bi_next if this
2371 * bio happens to be merged with someone else, and may resubmit the bio to
2372 * a lower device by calling into generic_make_request recursively, which
2373 * means the bio should NOT be touched after the call to ->make_request_fn.
2374 */
2375blk_qc_t generic_make_request(struct bio *bio)
2376{
2377        /*
2378         * bio_list_on_stack[0] contains bios submitted by the current
2379         * make_request_fn.
2380         * bio_list_on_stack[1] contains bios that were submitted before
2381         * the current make_request_fn, but that haven't been processed
2382         * yet.
2383         */
2384        struct bio_list bio_list_on_stack[2];
2385        blk_mq_req_flags_t flags = 0;
2386        struct request_queue *q = bio->bi_disk->queue;
2387        blk_qc_t ret = BLK_QC_T_NONE;
2388
2389        if (bio->bi_opf & REQ_NOWAIT)
2390                flags = BLK_MQ_REQ_NOWAIT;
2391        if (bio_flagged(bio, BIO_QUEUE_ENTERED))
2392                blk_queue_enter_live(q);
2393        else if (blk_queue_enter(q, flags) < 0) {
2394                if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
2395                        bio_wouldblock_error(bio);
2396                else
2397                        bio_io_error(bio);
2398                return ret;
2399        }
2400
2401        if (!generic_make_request_checks(bio))
2402                goto out;
2403
2404        /*
2405         * We only want one ->make_request_fn to be active at a time, else
2406         * stack usage with stacked devices could be a problem.  So use
2407         * current->bio_list to keep a list of requests submited by a
2408         * make_request_fn function.  current->bio_list is also used as a
2409         * flag to say if generic_make_request is currently active in this
2410         * task or not.  If it is NULL, then no make_request is active.  If
2411         * it is non-NULL, then a make_request is active, and new requests
2412         * should be added at the tail
2413         */
2414        if (current->bio_list) {
2415                bio_list_add(&current->bio_list[0], bio);
2416                goto out;
2417        }
2418
2419        /* following loop may be a bit non-obvious, and so deserves some
2420         * explanation.
2421         * Before entering the loop, bio->bi_next is NULL (as all callers
2422         * ensure that) so we have a list with a single bio.
2423         * We pretend that we have just taken it off a longer list, so
2424         * we assign bio_list to a pointer to the bio_list_on_stack,
2425         * thus initialising the bio_list of new bios to be
2426         * added.  ->make_request() may indeed add some more bios
2427         * through a recursive call to generic_make_request.  If it
2428         * did, we find a non-NULL value in bio_list and re-enter the loop
2429         * from the top.  In this case we really did just take the bio
2430         * of the top of the list (no pretending) and so remove it from
2431         * bio_list, and call into ->make_request() again.
2432         */
2433        BUG_ON(bio->bi_next);
2434        bio_list_init(&bio_list_on_stack[0]);
2435        current->bio_list = bio_list_on_stack;
2436        do {
2437                bool enter_succeeded = true;
2438
2439                if (unlikely(q != bio->bi_disk->queue)) {
2440                        if (q)
2441                                blk_queue_exit(q);
2442                        q = bio->bi_disk->queue;
2443                        flags = 0;
2444                        if (bio->bi_opf & REQ_NOWAIT)
2445                                flags = BLK_MQ_REQ_NOWAIT;
2446                        if (blk_queue_enter(q, flags) < 0) {
2447                                enter_succeeded = false;
2448                                q = NULL;
2449                        }
2450                }
2451
2452                if (enter_succeeded) {
2453                        struct bio_list lower, same;
2454
2455                        /* Create a fresh bio_list for all subordinate requests */
2456                        bio_list_on_stack[1] = bio_list_on_stack[0];
2457                        bio_list_init(&bio_list_on_stack[0]);
2458                        ret = q->make_request_fn(q, bio);
2459
2460                        /* sort new bios into those for a lower level
2461                         * and those for the same level
2462                         */
2463                        bio_list_init(&lower);
2464                        bio_list_init(&same);
2465                        while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
2466                                if (q == bio->bi_disk->queue)
2467                                        bio_list_add(&same, bio);
2468                                else
2469                                        bio_list_add(&lower, bio);
2470                        /* now assemble so we handle the lowest level first */
2471                        bio_list_merge(&bio_list_on_stack[0], &lower);
2472                        bio_list_merge(&bio_list_on_stack[0], &same);
2473                        bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
2474                } else {
2475                        if (unlikely(!blk_queue_dying(q) &&
2476                                        (bio->bi_opf & REQ_NOWAIT)))
2477                                bio_wouldblock_error(bio);
2478                        else
2479                                bio_io_error(bio);
2480                }
2481                bio = bio_list_pop(&bio_list_on_stack[0]);
2482        } while (bio);
2483        current->bio_list = NULL; /* deactivate */
2484
2485out:
2486        if (q)
2487                blk_queue_exit(q);
2488        return ret;
2489}
2490EXPORT_SYMBOL(generic_make_request);
2491
2492/**
2493 * direct_make_request - hand a buffer directly to its device driver for I/O
2494 * @bio:  The bio describing the location in memory and on the device.
2495 *
2496 * This function behaves like generic_make_request(), but does not protect
2497 * against recursion.  Must only be used if the called driver is known
2498 * to not call generic_make_request (or direct_make_request) again from
2499 * its make_request function.  (Calling direct_make_request again from
2500 * a workqueue is perfectly fine as that doesn't recurse).
2501 */
2502blk_qc_t direct_make_request(struct bio *bio)
2503{
2504        struct request_queue *q = bio->bi_disk->queue;
2505        bool nowait = bio->bi_opf & REQ_NOWAIT;
2506        blk_qc_t ret;
2507
2508        if (!generic_make_request_checks(bio))
2509                return BLK_QC_T_NONE;
2510
2511        if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
2512                if (nowait && !blk_queue_dying(q))
2513                        bio->bi_status = BLK_STS_AGAIN;
2514                else
2515                        bio->bi_status = BLK_STS_IOERR;
2516                bio_endio(bio);
2517                return BLK_QC_T_NONE;
2518        }
2519
2520        ret = q->make_request_fn(q, bio);
2521        blk_queue_exit(q);
2522        return ret;
2523}
2524EXPORT_SYMBOL_GPL(direct_make_request);
2525
2526/**
2527 * submit_bio - submit a bio to the block device layer for I/O
2528 * @bio: The &struct bio which describes the I/O
2529 *
2530 * submit_bio() is very similar in purpose to generic_make_request(), and
2531 * uses that function to do most of the work. Both are fairly rough
2532 * interfaces; @bio must be presetup and ready for I/O.
2533 *
2534 */
2535blk_qc_t submit_bio(struct bio *bio)
2536{
2537        /*
2538         * If it's a regular read/write or a barrier with data attached,
2539         * go through the normal accounting stuff before submission.
2540         */
2541        if (bio_has_data(bio)) {
2542                unsigned int count;
2543
2544                if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
2545                        count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
2546                else
2547                        count = bio_sectors(bio);
2548
2549                if (op_is_write(bio_op(bio))) {
2550                        count_vm_events(PGPGOUT, count);
2551                } else {
2552                        task_io_account_read(bio->bi_iter.bi_size);
2553                        count_vm_events(PGPGIN, count);
2554                }
2555
2556                if (unlikely(block_dump)) {
2557                        char b[BDEVNAME_SIZE];
2558                        printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
2559                        current->comm, task_pid_nr(current),
2560                                op_is_write(bio_op(bio)) ? "WRITE" : "READ",
2561                                (unsigned long long)bio->bi_iter.bi_sector,
2562                                bio_devname(bio, b), count);
2563                }
2564        }
2565
2566        return generic_make_request(bio);
2567}
2568EXPORT_SYMBOL(submit_bio);
2569
2570bool blk_poll(struct request_queue *q, blk_qc_t cookie)
2571{
2572        if (!q->poll_fn || !blk_qc_t_valid(cookie))
2573                return false;
2574
2575        if (current->plug)
2576                blk_flush_plug_list(current->plug, false);
2577        return q->poll_fn(q, cookie);
2578}
2579EXPORT_SYMBOL_GPL(blk_poll);
2580
2581/**
2582 * blk_cloned_rq_check_limits - Helper function to check a cloned request
2583 *                              for new the queue limits
2584 * @q:  the queue
2585 * @rq: the request being checked
2586 *
2587 * Description:
2588 *    @rq may have been made based on weaker limitations of upper-level queues
2589 *    in request stacking drivers, and it may violate the limitation of @q.
2590 *    Since the block layer and the underlying device driver trust @rq
2591 *    after it is inserted to @q, it should be checked against @q before
2592 *    the insertion using this generic function.
2593 *
2594 *    Request stacking drivers like request-based dm may change the queue
2595 *    limits when retrying requests on other queues. Those requests need
2596 *    to be checked against the new queue limits again during dispatch.
2597 */
2598static int blk_cloned_rq_check_limits(struct request_queue *q,
2599                                      struct request *rq)
2600{
2601        if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) {
2602                printk(KERN_ERR "%s: over max size limit.\n", __func__);
2603                return -EIO;
2604        }
2605
2606        /*
2607         * queue's settings related to segment counting like q->bounce_pfn
2608         * may differ from that of other stacking queues.
2609         * Recalculate it to check the request correctly on this queue's
2610         * limitation.
2611         */
2612        blk_recalc_rq_segments(rq);
2613        if (rq->nr_phys_segments > queue_max_segments(q)) {
2614                printk(KERN_ERR "%s: over max segments limit.\n", __func__);
2615                return -EIO;
2616        }
2617
2618        return 0;
2619}
2620
2621/**
2622 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
2623 * @q:  the queue to submit the request
2624 * @rq: the request being queued
2625 */
2626blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
2627{
2628        unsigned long flags;
2629        int where = ELEVATOR_INSERT_BACK;
2630
2631        if (blk_cloned_rq_check_limits(q, rq))
2632                return BLK_STS_IOERR;
2633
2634        if (rq->rq_disk &&
2635            should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
2636                return BLK_STS_IOERR;
2637
2638        if (q->mq_ops) {
2639                if (blk_queue_io_stat(q))
2640                        blk_account_io_start(rq, true);
2641                /*
2642                 * Since we have a scheduler attached on the top device,
2643                 * bypass a potential scheduler on the bottom device for
2644                 * insert.
2645                 */
2646                return blk_mq_request_issue_directly(rq);
2647        }
2648
2649        spin_lock_irqsave(q->queue_lock, flags);
2650        if (unlikely(blk_queue_dying(q))) {
2651                spin_unlock_irqrestore(q->queue_lock, flags);
2652                return BLK_STS_IOERR;
2653        }
2654
2655        /*
2656         * Submitting request must be dequeued before calling this function
2657         * because it will be linked to another request_queue
2658         */
2659        BUG_ON(blk_queued_rq(rq));
2660
2661        if (op_is_flush(rq->cmd_flags))
2662                where = ELEVATOR_INSERT_FLUSH;
2663
2664        add_acct_request(q, rq, where);
2665        if (where == ELEVATOR_INSERT_FLUSH)
2666                __blk_run_queue(q);
2667        spin_unlock_irqrestore(q->queue_lock, flags);
2668
2669        return BLK_STS_OK;
2670}
2671EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
2672
2673/**
2674 * blk_rq_err_bytes - determine number of bytes till the next failure boundary
2675 * @rq: request to examine
2676 *
2677 * Description:
2678 *     A request could be merge of IOs which require different failure
2679 *     handling.  This function determines the number of bytes which
2680 *     can be failed from the beginning of the request without
2681 *     crossing into area which need to be retried further.
2682 *
2683 * Return:
2684 *     The number of bytes to fail.
2685 */
2686unsigned int blk_rq_err_bytes(const struct request *rq)
2687{
2688        unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
2689        unsigned int bytes = 0;
2690        struct bio *bio;
2691
2692        if (!(rq->rq_flags & RQF_MIXED_MERGE))
2693                return blk_rq_bytes(rq);
2694
2695        /*
2696         * Currently the only 'mixing' which can happen is between
2697         * different fastfail types.  We can safely fail portions
2698         * which have all the failfast bits that the first one has -
2699         * the ones which are at least as eager to fail as the first
2700         * one.
2701         */
2702        for (bio = rq->bio; bio; bio = bio->bi_next) {
2703                if ((bio->bi_opf & ff) != ff)
2704                        break;
2705                bytes += bio->bi_iter.bi_size;
2706        }
2707
2708        /* this could lead to infinite loop */
2709        BUG_ON(blk_rq_bytes(rq) && !bytes);
2710        return bytes;
2711}
2712EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
2713
2714void blk_account_io_completion(struct request *req, unsigned int bytes)
2715{
2716        if (blk_do_io_stat(req)) {
2717                const int sgrp = op_stat_group(req_op(req));
2718                struct hd_struct *part;
2719                int cpu;
2720
2721                cpu = part_stat_lock();
2722                part = req->part;
2723                part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
2724                part_stat_unlock();
2725        }
2726}
2727
2728void blk_account_io_done(struct request *req, u64 now)
2729{
2730        /*
2731         * Account IO completion.  flush_rq isn't accounted as a
2732         * normal IO on queueing nor completion.  Accounting the
2733         * containing request is enough.
2734         */
2735        if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
2736                const int sgrp = op_stat_group(req_op(req));
2737                struct hd_struct *part;
2738                int cpu;
2739
2740                cpu = part_stat_lock();
2741                part = req->part;
2742
2743                part_stat_inc(cpu, part, ios[sgrp]);
2744                part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns);
2745                part_round_stats(req->q, cpu, part);
2746                part_dec_in_flight(req->q, part, rq_data_dir(req));
2747
2748                hd_struct_put(part);
2749                part_stat_unlock();
2750        }
2751}
2752
2753#ifdef CONFIG_PM
2754/*
2755 * Don't process normal requests when queue is suspended
2756 * or in the process of suspending/resuming
2757 */
2758static bool blk_pm_allow_request(struct request *rq)
2759{
2760        switch (rq->q->rpm_status) {
2761        case RPM_RESUMING:
2762        case RPM_SUSPENDING:
2763                return rq->rq_flags & RQF_PM;
2764        case RPM_SUSPENDED:
2765                return false;
2766        default:
2767                return true;
2768        }
2769}
2770#else
2771static bool blk_pm_allow_request(struct request *rq)
2772{
2773        return true;
2774}
2775#endif
2776
2777void blk_account_io_start(struct request *rq, bool new_io)
2778{
2779        struct hd_struct *part;
2780        int rw = rq_data_dir(rq);
2781        int cpu;
2782
2783        if (!blk_do_io_stat(rq))
2784                return;
2785
2786        cpu = part_stat_lock();
2787
2788        if (!new_io) {
2789                part = rq->part;
2790                part_stat_inc(cpu, part, merges[rw]);
2791        } else {
2792                part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
2793                if (!hd_struct_try_get(part)) {
2794                        /*
2795                         * The partition is already being removed,
2796                         * the request will be accounted on the disk only
2797                         *
2798                         * We take a reference on disk->part0 although that
2799                         * partition will never be deleted, so we can treat
2800                         * it as any other partition.
2801                         */
2802                        part = &rq->rq_disk->part0;
2803                        hd_struct_get(part);
2804                }
2805                part_round_stats(rq->q, cpu, part);
2806                part_inc_in_flight(rq->q, part, rw);
2807                rq->part = part;
2808        }
2809
2810        part_stat_unlock();
2811}
2812
2813static struct request *elv_next_request(struct request_queue *q)
2814{
2815        struct request *rq;
2816        struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
2817
2818        WARN_ON_ONCE(q->mq_ops);
2819
2820        while (1) {
2821                list_for_each_entry(rq, &q->queue_head, queuelist) {
2822                        if (blk_pm_allow_request(rq))
2823                                return rq;
2824
2825                        if (rq->rq_flags & RQF_SOFTBARRIER)
2826                                break;
2827                }
2828
2829                /*
2830                 * Flush request is running and flush request isn't queueable
2831                 * in the drive, we can hold the queue till flush request is
2832                 * finished. Even we don't do this, driver can't dispatch next
2833                 * requests and will requeue them. And this can improve
2834                 * throughput too. For example, we have request flush1, write1,
2835                 * flush 2. flush1 is dispatched, then queue is hold, write1
2836                 * isn't inserted to queue. After flush1 is finished, flush2
2837                 * will be dispatched. Since disk cache is already clean,
2838                 * flush2 will be finished very soon, so looks like flush2 is
2839                 * folded to flush1.
2840                 * Since the queue is hold, a flag is set to indicate the queue
2841                 * should be restarted later. Please see flush_end_io() for
2842                 * details.
2843                 */
2844                if (fq->flush_pending_idx != fq->flush_running_idx &&
2845                                !queue_flush_queueable(q)) {
2846                        fq->flush_queue_delayed = 1;
2847                        return NULL;
2848                }
2849                if (unlikely(blk_queue_bypass(q)) ||
2850                    !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
2851                        return NULL;
2852        }
2853}
2854
2855/**
2856 * blk_peek_request - peek at the top of a request queue
2857 * @q: request queue to peek at
2858 *
2859 * Description:
2860 *     Return the request at the top of @q.  The returned request
2861 *     should be started using blk_start_request() before LLD starts
2862 *     processing it.
2863 *
2864 * Return:
2865 *     Pointer to the request at the top of @q if available.  Null
2866 *     otherwise.
2867 */
2868struct request *blk_peek_request(struct request_queue *q)
2869{
2870        struct request *rq;
2871        int ret;
2872
2873        lockdep_assert_held(q->queue_lock);
2874        WARN_ON_ONCE(q->mq_ops);
2875
2876        while ((rq = elv_next_request(q)) != NULL) {
2877                if (!(rq->rq_flags & RQF_STARTED)) {
2878                        /*
2879                         * This is the first time the device driver
2880                         * sees this request (possibly after
2881                         * requeueing).  Notify IO scheduler.
2882                         */
2883                        if (rq->rq_flags & RQF_SORTED)
2884                                elv_activate_rq(q, rq);
2885
2886                        /*
2887                         * just mark as started even if we don't start
2888                         * it, a request that has been delayed should
2889                         * not be passed by new incoming requests
2890                         */
2891                        rq->rq_flags |= RQF_STARTED;
2892                        trace_block_rq_issue(q, rq);
2893                }
2894
2895                if (!q->boundary_rq || q->boundary_rq == rq) {
2896                        q->end_sector = rq_end_sector(rq);
2897                        q->boundary_rq = NULL;
2898                }
2899
2900                if (rq->rq_flags & RQF_DONTPREP)
2901                        break;
2902
2903                if (q->dma_drain_size && blk_rq_bytes(rq)) {
2904                        /*
2905                         * make sure space for the drain appears we
2906                         * know we can do this because max_hw_segments
2907                         * has been adjusted to be one fewer than the
2908                         * device can handle
2909                         */
2910                        rq->nr_phys_segments++;
2911                }
2912
2913                if (!q->prep_rq_fn)
2914                        break;
2915
2916                ret = q->prep_rq_fn(q, rq);
2917                if (ret == BLKPREP_OK) {
2918                        break;
2919                } else if (ret == BLKPREP_DEFER) {
2920                        /*
2921                         * the request may have been (partially) prepped.
2922                         * we need to keep this request in the front to
2923                         * avoid resource deadlock.  RQF_STARTED will
2924                         * prevent other fs requests from passing this one.
2925                         */
2926                        if (q->dma_drain_size && blk_rq_bytes(rq) &&
2927                            !(rq->rq_flags & RQF_DONTPREP)) {
2928                                /*
2929                                 * remove the space for the drain we added
2930                                 * so that we don't add it again
2931                                 */
2932                                --rq->nr_phys_segments;
2933                        }
2934
2935                        rq = NULL;
2936                        break;
2937                } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
2938                        rq->rq_flags |= RQF_QUIET;
2939                        /*
2940                         * Mark this request as started so we don't trigger
2941                         * any debug logic in the end I/O path.
2942                         */
2943                        blk_start_request(rq);
2944                        __blk_end_request_all(rq, ret == BLKPREP_INVALID ?
2945                                        BLK_STS_TARGET : BLK_STS_IOERR);
2946                } else {
2947                        printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
2948                        break;
2949                }
2950        }
2951
2952        return rq;
2953}
2954EXPORT_SYMBOL(blk_peek_request);
2955
2956static void blk_dequeue_request(struct request *rq)
2957{
2958        struct request_queue *q = rq->q;
2959
2960        BUG_ON(list_empty(&rq->queuelist));
2961        BUG_ON(ELV_ON_HASH(rq));
2962
2963        list_del_init(&rq->queuelist);
2964
2965        /*
2966         * the time frame between a request being removed from the lists
2967         * and to it is freed is accounted as io that is in progress at
2968         * the driver side.
2969         */
2970        if (blk_account_rq(rq))
2971                q->in_flight[rq_is_sync(rq)]++;
2972}
2973
2974/**
2975 * blk_start_request - start request processing on the driver
2976 * @req: request to dequeue
2977 *
2978 * Description:
2979 *     Dequeue @req and start timeout timer on it.  This hands off the
2980 *     request to the driver.
2981 */
2982void blk_start_request(struct request *req)
2983{
2984        lockdep_assert_held(req->q->queue_lock);
2985        WARN_ON_ONCE(req->q->mq_ops);
2986
2987        blk_dequeue_request(req);
2988
2989        if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
2990                req->io_start_time_ns = ktime_get_ns();
2991#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2992                req->throtl_size = blk_rq_sectors(req);
2993#endif
2994                req->rq_flags |= RQF_STATS;
2995                rq_qos_issue(req->q, req);
2996        }
2997
2998        BUG_ON(blk_rq_is_complete(req));
2999        blk_add_timer(req);
3000}

3001EXPORT_SYMBOL(blk_start_request);
3002
3003/**
3004 * blk_fetch_request - fetch a request from a request queue
3005 * @q: request queue to fetch a request from
3006 *
3007 * Description:
3008 *     Return the request at the top of @q.  The request is started on
3009 *     return and LLD can start processing it immediately.
3010 *
3011 * Return:
3012 *     Pointer to the request at the top of @q if available.  Null
3013 *     otherwise.
3014 */
3015struct request *blk_fetch_request(struct request_queue *q)
3016{
3017        struct request *rq;
3018
3019        lockdep_assert_held(q->queue_lock);
3020        WARN_ON_ONCE(q->mq_ops);
3021
3022        rq = blk_peek_request(q);
3023        if (rq)
3024                blk_start_request(rq);
3025        return rq;
3026}
3027EXPORT_SYMBOL(blk_fetch_request);
3028
3029/*
3030 * Steal bios from a request and add them to a bio list.
3031 * The request must not have been partially completed before.
3032 */
3033void blk_steal_bios(struct bio_list *list, struct request *rq)
3034{
3035        if (rq->bio) {
3036                if (list->tail)
3037                        list->tail->bi_next = rq->bio;
3038                else
3039                        list->head = rq->bio;
3040                list->tail = rq->biotail;
3041
3042                rq->bio = NULL;
3043                rq->biotail = NULL;
3044        }
3045
3046        rq->__data_len = 0;
3047}
3048EXPORT_SYMBOL_GPL(blk_steal_bios);
3049
3050/**
3051 * blk_update_request - Special helper function for request stacking drivers
3052 * @req:      the request being processed
3053 * @error:    block status code
3054 * @nr_bytes: number of bytes to complete @req
3055 *
3056 * Description:
3057 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
3058 *     the request structure even if @req doesn't have leftover.
3059 *     If @req has leftover, sets it up for the next range of segments.
3060 *
3061 *     This special helper function is only for request stacking drivers
3062 *     (e.g. request-based dm) so that they can handle partial completion.
3063 *     Actual device drivers should use blk_end_request instead.
3064 *
3065 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
3066 *     %false return from this function.
3067 *
3068 * Note:
3069 *      The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
3070 *      blk_rq_bytes() and in blk_update_request().
3071 *
3072 * Return:
3073 *     %false - this request doesn't have any more data
3074 *     %true  - this request has more data
3075 **/
3076bool blk_update_request(struct request *req, blk_status_t error,
3077                unsigned int nr_bytes)
3078{
3079        int total_bytes;
3080
3081        trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
3082
3083        if (!req->bio)
3084                return false;
3085
3086        if (unlikely(error && !blk_rq_is_passthrough(req) &&
3087                     !(req->rq_flags & RQF_QUIET)))
3088                print_req_error(req, error);
3089
3090        blk_account_io_completion(req, nr_bytes);
3091
3092        total_bytes = 0;
3093        while (req->bio) {
3094                struct bio *bio = req->bio;
3095                unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
3096
3097                if (bio_bytes == bio->bi_iter.bi_size)
3098                        req->bio = bio->bi_next;
3099
3100                /* Completion has already been traced */
3101                bio_clear_flag(bio, BIO_TRACE_COMPLETION);
3102                req_bio_endio(req, bio, bio_bytes, error);
3103
3104                total_bytes += bio_bytes;
3105                nr_bytes -= bio_bytes;
3106
3107                if (!nr_bytes)
3108                        break;
3109        }
3110
3111        /*
3112         * completely done
3113         */
3114        if (!req->bio) {
3115                /*
3116                 * Reset counters so that the request stacking driver
3117                 * can find how many bytes remain in the request
3118                 * later.
3119                 */
3120                req->__data_len = 0;
3121                return false;
3122        }
3123
3124        req->__data_len -= total_bytes;
3125
3126        /* update sector only for requests with clear definition of sector */
3127        if (!blk_rq_is_passthrough(req))
3128                req->__sector += total_bytes >> 9;
3129
3130        /* mixed attributes always follow the first bio */
3131        if (req->rq_flags & RQF_MIXED_MERGE) {
3132                req->cmd_flags &= ~REQ_FAILFAST_MASK;
3133                req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
3134        }
3135
3136        if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
3137                /*
3138                 * If total number of sectors is less than the first segment
3139                 * size, something has gone terribly wrong.
3140                 */
3141                if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
3142                        blk_dump_rq_flags(req, "request botched");
3143                        req->__data_len = blk_rq_cur_bytes(req);
3144                }
3145
3146                /* recalculate the number of segments */
3147                blk_recalc_rq_segments(req);
3148        }
3149
3150        return true;
3151}
3152EXPORT_SYMBOL_GPL(blk_update_request);
3153
3154static bool blk_update_bidi_request(struct request *rq, blk_status_t error,
3155                                    unsigned int nr_bytes,
3156                                    unsigned int bidi_bytes)
3157{
3158        if (blk_update_request(rq, error, nr_bytes))
3159                return true;
3160
3161        /* Bidi request must be completed as a whole */
3162        if (unlikely(blk_bidi_rq(rq)) &&
3163            blk_update_request(rq->next_rq, error, bidi_bytes))
3164                return true;
3165
3166        if (blk_queue_add_random(rq->q))
3167                add_disk_randomness(rq->rq_disk);
3168
3169        return false;
3170}
3171
3172/**
3173 * blk_unprep_request - unprepare a request
3174 * @req:        the request
3175 *
3176 * This function makes a request ready for complete resubmission (or
3177 * completion).  It happens only after all error handling is complete,
3178 * so represents the appropriate moment to deallocate any resources
3179 * that were allocated to the request in the prep_rq_fn.  The queue
3180 * lock is held when calling this.
3181 */
3182void blk_unprep_request(struct request *req)
3183{
3184        struct request_queue *q = req->q;
3185
3186        req->rq_flags &= ~RQF_DONTPREP;
3187        if (q->unprep_rq_fn)
3188                q->unprep_rq_fn(q, req);
3189}
3190EXPORT_SYMBOL_GPL(blk_unprep_request);
3191
3192void blk_finish_request(struct request *req, blk_status_t error)
3193{
3194        struct request_queue *q = req->q;
3195        u64 now = ktime_get_ns();
3196
3197        lockdep_assert_held(req->q->queue_lock);
3198        WARN_ON_ONCE(q->mq_ops);
3199
3200        if (req->rq_flags & RQF_STATS)
3201                blk_stat_add(req, now);
3202
3203        if (req->rq_flags & RQF_QUEUED)
3204                blk_queue_end_tag(q, req);
3205
3206        BUG_ON(blk_queued_rq(req));
3207
3208        if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req))
3209                laptop_io_completion(req->q->backing_dev_info);
3210
3211        blk_delete_timer(req);
3212
3213        if (req->rq_flags & RQF_DONTPREP)
3214                blk_unprep_request(req);
3215
3216        blk_account_io_done(req, now);
3217
3218        if (req->end_io) {
3219                rq_qos_done(q, req);
3220                req->end_io(req, error);
3221        } else {
3222                if (blk_bidi_rq(req))
3223                        __blk_put_request(req->next_rq->q, req->next_rq);
3224
3225                __blk_put_request(q, req);
3226        }
3227}
3228EXPORT_SYMBOL(blk_finish_request);
3229
3230/**
3231 * blk_end_bidi_request - Complete a bidi request
3232 * @rq:         the request to complete
3233 * @error:      block status code
3234 * @nr_bytes:   number of bytes to complete @rq
3235 * @bidi_bytes: number of bytes to complete @rq->next_rq
3236 *
3237 * Description:
3238 *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
3239 *     Drivers that supports bidi can safely call this member for any
3240 *     type of request, bidi or uni.  In the later case @bidi_bytes is
3241 *     just ignored.
3242 *
3243 * Return:
3244 *     %false - we are done with this request
3245 *     %true  - still buffers pending for this request
3246 **/
3247static bool blk_end_bidi_request(struct request *rq, blk_status_t error,
3248                                 unsigned int nr_bytes, unsigned int bidi_bytes)
3249{
3250        struct request_queue *q = rq->q;
3251        unsigned long flags;
3252
3253        WARN_ON_ONCE(q->mq_ops);
3254
3255        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
3256                return true;
3257
3258        spin_lock_irqsave(q->queue_lock, flags);
3259        blk_finish_request(rq, error);
3260        spin_unlock_irqrestore(q->queue_lock, flags);
3261
3262        return false;
3263}
3264
3265/**
3266 * __blk_end_bidi_request - Complete a bidi request with queue lock held
3267 * @rq:         the request to complete
3268 * @error:      block status code
3269 * @nr_bytes:   number of bytes to complete @rq
3270 * @bidi_bytes: number of bytes to complete @rq->next_rq
3271 *
3272 * Description:
3273 *     Identical to blk_end_bidi_request() except that queue lock is
3274 *     assumed to be locked on entry and remains so on return.
3275 *
3276 * Return:
3277 *     %false - we are done with this request
3278 *     %true  - still buffers pending for this request
3279 **/
3280static bool __blk_end_bidi_request(struct request *rq, blk_status_t error,
3281                                   unsigned int nr_bytes, unsigned int bidi_bytes)
3282{
3283        lockdep_assert_held(rq->q->queue_lock);
3284        WARN_ON_ONCE(rq->q->mq_ops);
3285
3286        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
3287                return true;
3288
3289        blk_finish_request(rq, error);
3290
3291        return false;
3292}
3293
3294/**
3295 * blk_end_request - Helper function for drivers to complete the request.
3296 * @rq:       the request being processed
3297 * @error:    block status code
3298 * @nr_bytes: number of bytes to complete
3299 *
3300 * Description:
3301 *     Ends I/O on a number of bytes attached to @rq.
3302 *     If @rq has leftover, sets it up for the next range of segments.
3303 *
3304 * Return:
3305 *     %false - we are done with this request
3306 *     %true  - still buffers pending for this request
3307 **/
3308bool blk_end_request(struct request *rq, blk_status_t error,
3309                unsigned int nr_bytes)
3310{
3311        WARN_ON_ONCE(rq->q->mq_ops);
3312        return blk_end_bidi_request(rq, error, nr_bytes, 0);
3313}
3314EXPORT_SYMBOL(blk_end_request);
3315
3316/**
3317 * blk_end_request_all - Helper function for drives to finish the request.
3318 * @rq: the request to finish
3319 * @error: block status code
3320 *
3321 * Description:
3322 *     Completely finish @rq.
3323 */
3324void blk_end_request_all(struct request *rq, blk_status_t error)
3325{
3326        bool pending;
3327        unsigned int bidi_bytes = 0;
3328
3329        if (unlikely(blk_bidi_rq(rq)))
3330                bidi_bytes = blk_rq_bytes(rq->next_rq);
3331
3332        pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
3333        BUG_ON(pending);
3334}
3335EXPORT_SYMBOL(blk_end_request_all);
3336
3337/**
3338 * __blk_end_request - Helper function for drivers to complete the request.
3339 * @rq:       the request being processed
3340 * @error:    block status code
3341 * @nr_bytes: number of bytes to complete
3342 *
3343 * Description:
3344 *     Must be called with queue lock held unlike blk_end_request().
3345 *
3346 * Return:
3347 *     %false - we are done with this request
3348 *     %true  - still buffers pending for this request
3349 **/
3350bool __blk_end_request(struct request *rq, blk_status_t error,
3351                unsigned int nr_bytes)
3352{
3353        lockdep_assert_held(rq->q->queue_lock);
3354        WARN_ON_ONCE(rq->q->mq_ops);
3355
3356        return __blk_end_bidi_request(rq, error, nr_bytes, 0);
3357}
3358EXPORT_SYMBOL(__blk_end_request);
3359
3360/**
3361 * __blk_end_request_all - Helper function for drives to finish the request.
3362 * @rq: the request to finish
3363 * @error:    block status code
3364 *
3365 * Description:
3366 *     Completely finish @rq.  Must be called with queue lock held.
3367 */
3368void __blk_end_request_all(struct request *rq, blk_status_t error)
3369{
3370        bool pending;
3371        unsigned int bidi_bytes = 0;
3372
3373        lockdep_assert_held(rq->q->queue_lock);
3374        WARN_ON_ONCE(rq->q->mq_ops);
3375
3376        if (unlikely(blk_bidi_rq(rq)))
3377                bidi_bytes = blk_rq_bytes(rq->next_rq);
3378
3379        pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
3380        BUG_ON(pending);
3381}
3382EXPORT_SYMBOL(__blk_end_request_all);
3383
3384/**
3385 * __blk_end_request_cur - Helper function to finish the current request chunk.
3386 * @rq: the request to finish the current chunk for
3387 * @error:    block status code
3388 *
3389 * Description:
3390 *     Complete the current consecutively mapped chunk from @rq.  Must
3391 *     be called with queue lock held.
3392 *
3393 * Return:
3394 *     %false - we are done with this request
3395 *     %true  - still buffers pending for this request
3396 */
3397bool __blk_end_request_cur(struct request *rq, blk_status_t error)
3398{
3399        return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
3400}
3401EXPORT_SYMBOL(__blk_end_request_cur);
3402
3403void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
3404                     struct bio *bio)
3405{
3406        if (bio_has_data(bio))
3407                rq->nr_phys_segments = bio_phys_segments(q, bio);
3408        else if (bio_op(bio) == REQ_OP_DISCARD)
3409                rq->nr_phys_segments = 1;
3410
3411        rq->__data_len = bio->bi_iter.bi_size;
3412        rq->bio = rq->biotail = bio;
3413
3414        if (bio->bi_disk)
3415                rq->rq_disk = bio->bi_disk;
3416}
3417
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
 * rq_flush_dcache_pages - Helper function to flush all pages in a request
 * @rq: the request to be flushed
 *
 * Description:
 *     Calls flush_dcache_page() on the page of every segment in @rq.
 */
void rq_flush_dcache_pages(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bvec;

	rq_for_each_segment(bvec, rq, iter) {
		flush_dcache_page(bvec.bv_page);
	}
}
EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
#endif
3436
3437/**
3438 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
3439 * @q : the queue of the device being checked
3440 *
3441 * Description:
3442 *    Check if underlying low-level drivers of a device are busy.
3443 *    If the drivers want to export their busy state, they must set own
3444 *    exporting function using blk_queue_lld_busy() first.
3445 *
3446 *    Basically, this function is used only by request stacking drivers
3447 *    to stop dispatching requests to underlying devices when underlying
3448 *    devices are busy.  This behavior helps more I/O merging on the queue
3449 *    of the request stacking driver and prevents I/O throughput regression
3450 *    on burst I/O load.
3451 *
3452 * Return:
3453 *    0 - Not busy (The request stacking driver should dispatch request)
3454 *    1 - Busy (The request stacking driver should stop dispatching request)
3455 */
3456int blk_lld_busy(struct request_queue *q)
3457{
3458        if (q->lld_busy_fn)
3459                return q->lld_busy_fn(q);
3460
3461        return 0;
3462}
3463EXPORT_SYMBOL_GPL(blk_lld_busy);
3464
3465/**
3466 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
3467 * @rq: the clone request to be cleaned up
3468 *
3469 * Description:
3470 *     Free all bios in @rq for a cloned request.
3471 */
3472void blk_rq_unprep_clone(struct request *rq)
3473{
3474        struct bio *bio;
3475
3476        while ((bio = rq->bio) != NULL) {
3477                rq->bio = bio->bi_next;
3478
3479                bio_put(bio);
3480        }
3481}
3482EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
3483
3484/*
3485 * Copy attributes of the original request to the clone request.
3486 * The actual data parts (e.g. ->cmd, ->sense) are not copied.
3487 */
3488static void __blk_rq_prep_clone(struct request *dst, struct request *src)
3489{
3490        dst->cpu = src->cpu;
3491        dst->__sector = blk_rq_pos(src);
3492        dst->__data_len = blk_rq_bytes(src);
3493        if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
3494                dst->rq_flags |= RQF_SPECIAL_PAYLOAD;
3495                dst->special_vec = src->special_vec;
3496        }
3497        dst->nr_phys_segments = src->nr_phys_segments;
3498        dst->ioprio = src->ioprio;
3499        dst->extra_len = src->extra_len;
3500}
3501
3502/**
3503 * blk_rq_prep_clone - Helper function to setup clone request
3504 * @rq: the request to be setup
3505 * @rq_src: original request to be cloned
3506 * @bs: bio_set that bios for clone are allocated from
3507 * @gfp_mask: memory allocation mask for bio
3508 * @bio_ctr: setup function to be called for each clone bio.
3509 *           Returns %0 for success, non %0 for failure.
3510 * @data: private data to be passed to @bio_ctr
3511 *
3512 * Description:
3513 *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
3514 *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
3515 *     are not copied, and copying such parts is the caller's responsibility.
3516 *     Also, pages which the original bios are pointing to are not copied
3517 *     and the cloned bios just point same pages.
3518 *     So cloned bios must be completed before original bios, which means
3519 *     the caller must complete @rq before @rq_src.
3520 */
3521int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
3522                      struct bio_set *bs, gfp_t gfp_mask,
3523                      int (*bio_ctr)(struct bio *, struct bio *, void *),
3524                      void *data)
3525{
3526        struct bio *bio, *bio_src;
3527
3528        if (!bs)
3529                bs = &fs_bio_set;
3530
3531        __rq_for_each_bio(bio_src, rq_src) {
3532                bio = bio_clone_fast(bio_src, gfp_mask, bs);
3533                if (!bio)
3534                        goto free_and_out;
3535
3536                if (bio_ctr && bio_ctr(bio, bio_src, data))
3537                        goto free_and_out;
3538
3539                if (rq->bio) {
3540                        rq->biotail->bi_next = bio;
3541                        rq->biotail = bio;
3542                } else
3543                        rq->bio = rq->biotail = bio;
3544        }
3545
3546        __blk_rq_prep_clone(rq, rq_src);
3547
3548        return 0;
3549
3550free_and_out:
3551        if (bio)
3552                bio_put(bio);
3553        blk_rq_unprep_clone(rq);
3554
3555        return -ENOMEM;
3556}
3557EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
3558
3559int kblockd_schedule_work(struct work_struct *work)
3560{
3561        return queue_work(kblockd_workqueue, work);
3562}
3563EXPORT_SYMBOL(kblockd_schedule_work);
3564
3565int kblockd_schedule_work_on(int cpu, struct work_struct *work)
3566{
3567        return queue_work_on(cpu, kblockd_workqueue, work);
3568}
3569EXPORT_SYMBOL(kblockd_schedule_work_on);
3570
3571int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
3572                                unsigned long delay)
3573{
3574        return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
3575}
3576EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
3577
3578/**
3579 * blk_start_plug - initialize blk_plug and track it inside the task_struct
3580 * @plug:       The &struct blk_plug that needs to be initialized
3581 *
3582 * Description:
3583 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
3584 *   pending I/O should the task end up blocking between blk_start_plug() and
3585 *   blk_finish_plug(). This is important from a performance perspective, but
3586 *   also ensures that we don't deadlock. For instance, if the task is blocking
3587 *   for a memory allocation, memory reclaim could end up wanting to free a
3588 *   page belonging to that request that is currently residing in our private
3589 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
3590 *   this kind of deadlock.
3591 */
3592void blk_start_plug(struct blk_plug *plug)
3593{
3594        struct task_struct *tsk = current;
3595
3596        /*
3597         * If this is a nested plug, don't actually assign it.
3598         */
3599        if (tsk->plug)
3600                return;
3601
3602        INIT_LIST_HEAD(&plug->list);
3603        INIT_LIST_HEAD(&plug->mq_list);
3604        INIT_LIST_HEAD(&plug->cb_list);
3605        /*
3606         * Store ordering should not be needed here, since a potential
3607         * preempt will imply a full memory barrier
3608         */
3609        tsk->plug = plug;
3610}
3611EXPORT_SYMBOL(blk_start_plug);
3612
3613static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
3614{
3615        struct request *rqa = container_of(a, struct request, queuelist);
3616        struct request *rqb = container_of(b, struct request, queuelist);
3617
3618        return !(rqa->q < rqb->q ||
3619                (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
3620}
3621
3622/*
3623 * If 'from_schedule' is true, then postpone the dispatch of requests
3624 * until a safe kblockd context. We due this to avoid accidental big
3625 * additional stack usage in driver dispatch, in places where the originally
3626 * plugger did not intend it.
3627 */
3628static void queue_unplugged(struct request_queue *q, unsigned int depth,
3629                            bool from_schedule)
3630        __releases(q->queue_lock)
3631{
3632        lockdep_assert_held(q->queue_lock);
3633
3634        trace_block_unplug(q, depth, !from_schedule);
3635
3636        if (from_schedule)
3637                blk_run_queue_async(q);
3638        else
3639                __blk_run_queue(q);
3640        spin_unlock_irq(q->queue_lock);
3641}
3642
3643static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3644{
3645        LIST_HEAD(callbacks);
3646
3647        while (!list_empty(&plug->cb_list)) {
3648                list_splice_init(&plug->cb_list, &callbacks);
3649
3650                while (!list_empty(&callbacks)) {
3651                        struct blk_plug_cb *cb = list_first_entry(&callbacks,
3652                                                          struct blk_plug_cb,
3653                                                          list);
3654                        list_del(&cb->list);
3655                        cb->callback(cb, from_schedule);
3656                }
3657        }
3658}
3659
3660struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
3661                                      int size)
3662{
3663        struct blk_plug *plug = current->plug;
3664        struct blk_plug_cb *cb;
3665
3666        if (!plug)
3667                return NULL;
3668
3669        list_for_each_entry(cb, &plug->cb_list, list)
3670                if (cb->callback == unplug && cb->data == data)
3671                        return cb;
3672
3673        /* Not currently on the callback list */
3674        BUG_ON(size < sizeof(*cb));
3675        cb = kzalloc(size, GFP_ATOMIC);
3676        if (cb) {
3677                cb->data = data;
3678                cb->callback = unplug;
3679                list_add(&cb->list, &plug->cb_list);
3680        }
3681        return cb;
3682}
3683EXPORT_SYMBOL(blk_check_plugged);
3684
3685void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3686{
3687        struct request_queue *q;
3688        struct request *rq;
3689        LIST_HEAD(list);
3690        unsigned int depth;
3691
3692        flush_plug_callbacks(plug, from_schedule);
3693
3694        if (!list_empty(&plug->mq_list))
3695                blk_mq_flush_plug_list(plug, from_schedule);
3696
3697        if (list_empty(&plug->list))
3698                return;
3699
3700        list_splice_init(&plug->list, &list);
3701
3702        list_sort(NULL, &list, plug_rq_cmp);
3703
3704        q = NULL;
3705        depth = 0;
3706
3707        while (!list_empty(&list)) {
3708                rq = list_entry_rq(list.next);
3709                list_del_init(&rq->queuelist);
3710                BUG_ON(!rq->q);
3711                if (rq->q != q) {
3712                        /*
3713                         * This drops the queue lock
3714                         */
3715                        if (q)
3716                                queue_unplugged(q, depth, from_schedule);
3717                        q = rq->q;
3718                        depth = 0;
3719                        spin_lock_irq(q->queue_lock);
3720                }
3721
3722                /*
3723                 * Short-circuit if @q is dead
3724                 */
3725                if (unlikely(blk_queue_dying(q))) {
3726                        __blk_end_request_all(rq, BLK_STS_IOERR);
3727                        continue;
3728                }
3729
3730                /*
3731                 * rq is already accounted, so use raw insert
3732                 */
3733                if (op_is_flush(rq->cmd_flags))
3734                        __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
3735                else
3736                        __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
3737
3738                depth++;
3739        }
3740
3741        /*
3742         * This drops the queue lock
3743         */
3744        if (q)
3745                queue_unplugged(q, depth, from_schedule);
3746}
3747
3748void blk_finish_plug(struct blk_plug *plug)
3749{
3750        if (plug != current->plug)
3751                return;
3752        blk_flush_plug_list(plug, false);
3753
3754        current->plug = NULL;
3755}
3756EXPORT_SYMBOL(blk_finish_plug);
3757
3758#ifdef CONFIG_PM
3759/**
3760 * blk_pm_runtime_init - Block layer runtime PM initialization routine
3761 * @q: the queue of the device
3762 * @dev: the device the queue belongs to
3763 *
3764 * Description:
3765 *    Initialize runtime-PM-related fields for @q and start auto suspend for
3766 *    @dev. Drivers that want to take advantage of request-based runtime PM
3767 *    should call this function after @dev has been initialized, and its
3768 *    request queue @q has been allocated, and runtime PM for it can not happen
3769 *    yet(either due to disabled/forbidden or its usage_count > 0). In most
3770 *    cases, driver should call this function before any I/O has taken place.
3771 *
3772 *    This function takes care of setting up using auto suspend for the device,
3773 *    the autosuspend delay is set to -1 to make runtime suspend impossible
3774 *    until an updated value is either set by user or by driver. Drivers do
3775 *    not need to touch other autosuspend settings.
3776 *
3777 *    The block layer runtime PM is request based, so only works for drivers
3778 *    that use request as their IO unit instead of those directly use bio's.
3779 */
3780void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
3781{
3782        /* Don't enable runtime PM for blk-mq until it is ready */
3783        if (q->mq_ops) {
3784                pm_runtime_disable(dev);
3785                return;
3786        }
3787
3788        q->dev = dev;
3789        q->rpm_status = RPM_ACTIVE;
3790        pm_runtime_set_autosuspend_delay(q->dev, -1);
3791        pm_runtime_use_autosuspend(q->dev);
3792}
3793EXPORT_SYMBOL(blk_pm_runtime_init);
3794
3795/**
3796 * blk_pre_runtime_suspend - Pre runtime suspend check
3797 * @q: the queue of the device
3798 *
3799 * Description:
3800 *    This function will check if runtime suspend is allowed for the device
3801 *    by examining if there are any requests pending in the queue. If there
3802 *    are requests pending, the device can not be runtime suspended; otherwise,
3803 *    the queue's status will be updated to SUSPENDING and the driver can
3804 *    proceed to suspend the device.
3805 *
3806 *    For the not allowed case, we mark last busy for the device so that
3807 *    runtime PM core will try to autosuspend it some time later.
3808 *
3809 *    This function should be called near the start of the device's
3810 *    runtime_suspend callback.
3811 *
3812 * Return:
3813 *    0         - OK to runtime suspend the device
3814 *    -EBUSY    - Device should not be runtime suspended
3815 */
3816int blk_pre_runtime_suspend(struct request_queue *q)
3817{
3818        int ret = 0;
3819
3820        if (!q->dev)
3821                return ret;
3822
3823        spin_lock_irq(q->queue_lock);
3824        if (q->nr_pending) {
3825                ret = -EBUSY;
3826                pm_runtime_mark_last_busy(q->dev);
3827        } else {
3828                q->rpm_status = RPM_SUSPENDING;
3829        }
3830        spin_unlock_irq(q->queue_lock);
3831        return ret;
3832}
3833EXPORT_SYMBOL(blk_pre_runtime_suspend);
3834
3835/**
3836 * blk_post_runtime_suspend - Post runtime suspend processing
3837 * @q: the queue of the device
3838 * @err: return value of the device's runtime_suspend function
3839 *
3840 * Description:
3841 *    Update the queue's runtime status according to the return value of the
3842 *    device's runtime suspend function and mark last busy for the device so
3843 *    that PM core will try to auto suspend the device at a later time.
3844 *
3845 *    This function should be called near the end of the device's
3846 *    runtime_suspend callback.
3847 */
3848void blk_post_runtime_suspend(struct request_queue *q, int err)
3849{
3850        if (!q->dev)
3851                return;
3852
3853        spin_lock_irq(q->queue_lock);
3854        if (!err) {
3855                q->rpm_status = RPM_SUSPENDED;
3856        } else {
3857                q->rpm_status = RPM_ACTIVE;
3858                pm_runtime_mark_last_busy(q->dev);
3859        }
3860        spin_unlock_irq(q->queue_lock);
3861}
3862EXPORT_SYMBOL(blk_post_runtime_suspend);
3863
3864/**
3865 * blk_pre_runtime_resume - Pre runtime resume processing
3866 * @q: the queue of the device
3867 *
3868 * Description:
3869 *    Update the queue's runtime status to RESUMING in preparation for the
3870 *    runtime resume of the device.
3871 *
3872 *    This function should be called near the start of the device's
3873 *    runtime_resume callback.
3874 */
3875void blk_pre_runtime_resume(struct request_queue *q)
3876{
3877        if (!q->dev)
3878                return;
3879
3880        spin_lock_irq(q->queue_lock);
3881        q->rpm_status = RPM_RESUMING;
3882        spin_unlock_irq(q->queue_lock);
3883}
3884EXPORT_SYMBOL(blk_pre_runtime_resume);
3885
3886/**
3887 * blk_post_runtime_resume - Post runtime resume processing
3888 * @q: the queue of the device
3889 * @err: return value of the device's runtime_resume function
3890 *
3891 * Description:
3892 *    Update the queue's runtime status according to the return value of the
3893 *    device's runtime_resume function. If it is successfully resumed, process
3894 *    the requests that are queued into the device's queue when it is resuming
3895 *    and then mark last busy and initiate autosuspend for it.
3896 *
3897 *    This function should be called near the end of the device's
3898 *    runtime_resume callback.
3899 */
3900void blk_post_runtime_resume(struct request_queue *q, int err)
3901{
3902        if (!q->dev)
3903                return;
3904
3905        spin_lock_irq(q->queue_lock);
3906        if (!err) {
3907                q->rpm_status = RPM_ACTIVE;
3908                __blk_run_queue(q);
3909                pm_runtime_mark_last_busy(q->dev);
3910                pm_request_autosuspend(q->dev);
3911        } else {
3912                q->rpm_status = RPM_SUSPENDED;
3913        }
3914        spin_unlock_irq(q->queue_lock);
3915}
3916EXPORT_SYMBOL(blk_post_runtime_resume);
3917
3918/**
3919 * blk_set_runtime_active - Force runtime status of the queue to be active
3920 * @q: the queue of the device
3921 *
3922 * If the device is left runtime suspended during system suspend the resume
3923 * hook typically resumes the device and corrects runtime status
3924 * accordingly. However, that does not affect the queue runtime PM status
3925 * which is still "suspended". This prevents processing requests from the
3926 * queue.
3927 *
3928 * This function can be used in driver's resume hook to correct queue
3929 * runtime PM status and re-enable peeking requests from the queue. It
3930 * should be called before first request is added to the queue.
3931 */
3932void blk_set_runtime_active(struct request_queue *q)
3933{
3934        spin_lock_irq(q->queue_lock);
3935        q->rpm_status = RPM_ACTIVE;
3936        pm_runtime_mark_last_busy(q->dev);
3937        pm_request_autosuspend(q->dev);
3938        spin_unlock_irq(q->queue_lock);
3939}
3940EXPORT_SYMBOL(blk_set_runtime_active);
3941#endif
3942
3943int __init blk_dev_init(void)
3944{
3945        BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
3946        BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
3947                        FIELD_SIZEOF(struct request, cmd_flags));
3948        BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
3949                        FIELD_SIZEOF(struct bio, bi_opf));
3950
3951        /* used for unplugging and affects IO latency/throughput - HIGHPRI */
3952        kblockd_workqueue = alloc_workqueue("kblockd",
3953                                            WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
3954        if (!kblockd_workqueue)
3955                panic("Failed to create kblockd\n");
3956
3957        request_cachep = kmem_cache_create("blkdev_requests",
3958                        sizeof(struct request), 0, SLAB_PANIC, NULL);
3959
3960        blk_requestq_cachep = kmem_cache_create("request_queue",
3961                        sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
3962
3963#ifdef CONFIG_DEBUG_FS
3964        blk_debugfs_root = debugfs_create_dir("block", NULL);
3965#endif
3966
3967        return 0;
3968}
3969