linux/block/blk-flush.c
   1/*
   2 * Functions to sequence PREFLUSH and FUA writes.
   3 *
   4 * Copyright (C) 2011           Max Planck Institute for Gravitational Physics
   5 * Copyright (C) 2011           Tejun Heo <tj@kernel.org>
   6 *
   7 * This file is released under the GPLv2.
   8 *
   9 * REQ_{PREFLUSH|FUA} requests are decomposed into sequences consisting of three
  10 * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
  11 * properties and hardware capability.
  12 *
  13 * If a request doesn't have data, only REQ_PREFLUSH makes sense, which
  14 * indicates a simple flush request.  If there is data, REQ_PREFLUSH indicates
  15 * that the device cache should be flushed before the data is written, and
  16 * REQ_FUA means that the data must be on non-volatile media on request
  17 * completion.
  18 *
  19 * If the device doesn't have a writeback cache, PREFLUSH and FUA don't make any
  20 * difference.  The requests are either completed immediately if there's no data
  21 * or executed as normal requests otherwise.
  22 *
  23 * If the device has a writeback cache and supports FUA, REQ_PREFLUSH is
  24 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
  25 *
  26 * If the device has a writeback cache and doesn't support FUA, REQ_PREFLUSH
  27 * is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
  28 *
  29 * The actual execution of a flush is double buffered.  Whenever a request
  30 * needs to execute PRE or POSTFLUSH, it queues at
  31 * fq->flush_queue[fq->flush_pending_idx].  Once certain criteria are met, a
  32 * REQ_OP_FLUSH is issued and the pending_idx is toggled.  When the flush
  33 * completes, all the requests which were pending proceed to the next
  34 * step.  This allows arbitrary merging of different types of PREFLUSH/FUA
  35 * requests.
  36 *
  37 * Currently, the following conditions are used to determine when to issue a
  38 * flush.
  39 *
  40 * C1. At any given time, only one flush shall be in progress.  This makes
  41 *     double buffering sufficient.
  42 *
  43 * C2. Flush is deferred if any request is executing DATA of its sequence.
  44 *     This avoids issuing separate POSTFLUSHes for requests which shared
  45 *     PREFLUSH.
  46 *
  47 * C3. The second condition is ignored if there is a request which has
  48 *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
  49 *     starvation in the unlikely case where there is a continuous stream of
  50 *     FUA (without PREFLUSH) requests.
  51 *
  52 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
  53 * is beneficial.
  54 *
  55 * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice:
  56 * once while executing DATA and again after the whole sequence is
  57 * complete.  The first completion updates the contained bio but doesn't
  58 * finish it so that the bio submitter is notified only after the whole
  59 * sequence is complete.  This is implemented by testing RQF_FLUSH_SEQ in
  60 * req_bio_endio().
  61 *
  62 * The above peculiarity requires that each PREFLUSH/FUA request has only one
  63 * bio attached to it, which is guaranteed as they aren't allowed to be
  64 * merged in the usual way.
  65 */
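/*
 * [Editor's illustration, not part of blk-flush.c] The decomposition table
 * described above can be exercised with a small stand-alone user-space
 * sketch that mirrors blk_flush_policy() below.  The X_ prefixed flag
 * values are stand-ins chosen for the example, not the kernel's REQ_ or
 * QUEUE_FLAG_ definitions.
 */

#include <stdbool.h>
#include <stdio.h>

#define X_REQ_PREFLUSH   (1u << 0)   /* stand-in for REQ_PREFLUSH */
#define X_REQ_FUA        (1u << 1)   /* stand-in for REQ_FUA */

#define X_FSEQ_PREFLUSH  (1u << 0)
#define X_FSEQ_DATA      (1u << 1)
#define X_FSEQ_POSTFLUSH (1u << 2)

/* which of the three optional steps does a request need? */
static unsigned int flush_policy(bool dev_wb_cache, bool dev_fua,
				 bool has_data, unsigned int req_flags)
{
	unsigned int policy = 0;

	if (has_data)
		policy |= X_FSEQ_DATA;

	if (dev_wb_cache) {
		if (req_flags & X_REQ_PREFLUSH)
			policy |= X_FSEQ_PREFLUSH;
		/* no FUA support: emulate it with a post-flush */
		if (!dev_fua && (req_flags & X_REQ_FUA))
			policy |= X_FSEQ_POSTFLUSH;
	}
	return policy;
}

int main(void)
{
	/*
	 * A data write carrying X_REQ_PREFLUSH|X_REQ_FUA on a writeback
	 * device without FUA decomposes into PREFLUSH + DATA + POSTFLUSH;
	 * with FUA support the POSTFLUSH disappears; without a writeback
	 * cache the request is just a plain data write.
	 */
	unsigned int p = flush_policy(true, false, true,
				      X_REQ_PREFLUSH | X_REQ_FUA);

	printf("preflush=%d data=%d postflush=%d\n",
	       !!(p & X_FSEQ_PREFLUSH), !!(p & X_FSEQ_DATA),
	       !!(p & X_FSEQ_POSTFLUSH));
	return 0;
}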
  66
  67#include <linux/kernel.h>
  68#include <linux/module.h>
  69#include <linux/bio.h>
  70#include <linux/blkdev.h>
  71#include <linux/gfp.h>
  72#include <linux/blk-mq.h>
  73
  74#include "blk.h"
  75#include "blk-mq.h"
  76#include "blk-mq-tag.h"
  77#include "blk-mq-sched.h"
  78
  79/* PREFLUSH/FUA sequences */
  80enum {
  81        REQ_FSEQ_PREFLUSH       = (1 << 0), /* pre-flushing in progress */
  82        REQ_FSEQ_DATA           = (1 << 1), /* data write in progress */
  83        REQ_FSEQ_POSTFLUSH      = (1 << 2), /* post-flushing in progress */
  84        REQ_FSEQ_DONE           = (1 << 3),
  85
  86        REQ_FSEQ_ACTIONS        = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
  87                                  REQ_FSEQ_POSTFLUSH,
  88
  89        /*
   90         * If a flush has been pending longer than the following timeout,
  91         * it's issued even if flush_data requests are still in flight.
  92         */
  93        FLUSH_PENDING_TIMEOUT   = 5 * HZ,
  94};
  95
  96static bool blk_kick_flush(struct request_queue *q,
  97                           struct blk_flush_queue *fq);
  98
  99static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
 100{
 101        unsigned int policy = 0;
 102
 103        if (blk_rq_sectors(rq))
 104                policy |= REQ_FSEQ_DATA;
 105
 106        if (fflags & (1UL << QUEUE_FLAG_WC)) {
 107                if (rq->cmd_flags & REQ_PREFLUSH)
 108                        policy |= REQ_FSEQ_PREFLUSH;
 109                if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
 110                    (rq->cmd_flags & REQ_FUA))
 111                        policy |= REQ_FSEQ_POSTFLUSH;
 112        }
 113        return policy;
 114}
 115
 116static unsigned int blk_flush_cur_seq(struct request *rq)
 117{
 118        return 1 << ffz(rq->flush.seq);
 119}
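/*
 * [Editor's note] rq->flush.seq accumulates the steps already finished, and
 * ffz() (find first zero bit) names the next one, so the sequence walks
 * PREFLUSH -> DATA -> POSTFLUSH -> DONE.  Steps a request does not need are
 * pre-marked as done in blk_insert_flush() (the REQ_FSEQ_ACTIONS & ~policy
 * seed), so ffz() skips them naturally.  A stand-alone sketch with a local
 * ffz helper (not the kernel's):
 */

#include <stdio.h>

static unsigned int ffz_demo(unsigned long x)	/* index of first zero bit */
{
	unsigned int i = 0;

	while (x & 1) {
		x >>= 1;
		i++;
	}
	return i;
}

int main(void)
{
	static const char *name[] = { "PREFLUSH", "DATA", "POSTFLUSH", "DONE" };
	unsigned long seq = 0;			/* nothing done yet */
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int cur = 1u << ffz_demo(seq);	/* blk_flush_cur_seq() */

		printf("next step: %s\n", name[ffz_demo(seq)]);
		seq |= cur;			/* that step completes */
	}
	return 0;
}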
 120
 121static void blk_flush_restore_request(struct request *rq)
 122{
 123        /*
 124         * After flush data completion, @rq->bio is %NULL but we need to
 125         * complete the bio again.  @rq->biotail is guaranteed to equal the
 126         * original @rq->bio.  Restore it.
 127         */
 128        rq->bio = rq->biotail;
 129
 130        /* make @rq a normal request */
 131        rq->rq_flags &= ~RQF_FLUSH_SEQ;
 132        rq->end_io = rq->flush.saved_end_io;
 133}
 134
 135static bool blk_flush_queue_rq(struct request *rq, bool add_front)
 136{
 137        if (rq->q->mq_ops) {
 138                blk_mq_add_to_requeue_list(rq, add_front, true);
 139                return false;
 140        } else {
 141                if (add_front)
 142                        list_add(&rq->queuelist, &rq->q->queue_head);
 143                else
 144                        list_add_tail(&rq->queuelist, &rq->q->queue_head);
 145                return true;
 146        }
 147}
 148
 149/**
 150 * blk_flush_complete_seq - complete flush sequence
 151 * @rq: PREFLUSH/FUA request being sequenced
 152 * @fq: flush queue
 153 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
 154 * @error: whether an error occurred
 155 *
 156 * @rq just completed @seq part of its flush sequence, record the
 157 * completion and trigger the next step.
 158 *
 159 * CONTEXT:
 160 * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
 161 *
 162 * RETURNS:
 163 * %true if requests were added to the dispatch queue, %false otherwise.
 164 */
 165static bool blk_flush_complete_seq(struct request *rq,
 166                                   struct blk_flush_queue *fq,
 167                                   unsigned int seq, blk_status_t error)
 168{
 169        struct request_queue *q = rq->q;
 170        struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 171        bool queued = false, kicked;
 172
 173        BUG_ON(rq->flush.seq & seq);
 174        rq->flush.seq |= seq;
 175
 176        if (likely(!error))
 177                seq = blk_flush_cur_seq(rq);
 178        else
 179                seq = REQ_FSEQ_DONE;
 180
 181        switch (seq) {
 182        case REQ_FSEQ_PREFLUSH:
 183        case REQ_FSEQ_POSTFLUSH:
 184                /* queue for flush */
 185                if (list_empty(pending))
 186                        fq->flush_pending_since = jiffies;
 187                list_move_tail(&rq->flush.list, pending);
 188                break;
 189
 190        case REQ_FSEQ_DATA:
 191                list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
 192                queued = blk_flush_queue_rq(rq, true);
 193                break;
 194
 195        case REQ_FSEQ_DONE:
 196                /*
  197                 * @rq was previously adjusted by blk_insert_flush() for
 198                 * flush sequencing and may already have gone through the
 199                 * flush data request completion path.  Restore @rq for
 200                 * normal completion and end it.
 201                 */
 202                BUG_ON(!list_empty(&rq->queuelist));
 203                list_del_init(&rq->flush.list);
 204                blk_flush_restore_request(rq);
 205                if (q->mq_ops)
 206                        blk_mq_end_request(rq, error);
 207                else
 208                        __blk_end_request_all(rq, error);
 209                break;
 210
 211        default:
 212                BUG();
 213        }
 214
 215        kicked = blk_kick_flush(q, fq);
 216        return kicked | queued;
 217}
 218
 219static void flush_end_io(struct request *flush_rq, blk_status_t error)
 220{
 221        struct request_queue *q = flush_rq->q;
 222        struct list_head *running;
 223        bool queued = false;
 224        struct request *rq, *n;
 225        unsigned long flags = 0;
 226        struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
 227
 228        if (q->mq_ops) {
 229                struct blk_mq_hw_ctx *hctx;
 230
  231                /* release the tag's ownership to the request it was cloned from */
 232                spin_lock_irqsave(&fq->mq_flush_lock, flags);
 233                hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
 234                if (!q->elevator) {
 235                        blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
 236                        flush_rq->tag = -1;
 237                } else {
 238                        blk_mq_put_driver_tag_hctx(hctx, flush_rq);
 239                        flush_rq->internal_tag = -1;
 240                }
 241        }
 242
 243        running = &fq->flush_queue[fq->flush_running_idx];
 244        BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
 245
 246        /* account completion of the flush request */
 247        fq->flush_running_idx ^= 1;
 248
 249        if (!q->mq_ops)
 250                elv_completed_request(q, flush_rq);
 251
 252        /* and push the waiting requests to the next stage */
 253        list_for_each_entry_safe(rq, n, running, flush.list) {
 254                unsigned int seq = blk_flush_cur_seq(rq);
 255
 256                BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
 257                queued |= blk_flush_complete_seq(rq, fq, seq, error);
 258        }
 259
 260        /*
  261         * Kick the queue to avoid a stall in two cases:
  262         * 1. Moving a request silently to an empty queue_head may stall
  263         * the queue.
  264         * 2. When a flush request is running in a non-queueable queue, the
  265         * queue is held.  Restart the queue after the flush request finishes
  266         * to avoid a stall.
  267         * This function is called from the request completion path and
  268         * calling directly into request_fn may confuse the driver.  Always use
 269         * kblockd.
 270         */
 271        if (queued || fq->flush_queue_delayed) {
 272                WARN_ON(q->mq_ops);
 273                blk_run_queue_async(q);
 274        }
 275        fq->flush_queue_delayed = 0;
 276        if (q->mq_ops)
 277                spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 278}
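/*
 * [Editor's note] The double buffering described at the top of the file
 * reduces to two lists and two one-bit indices: blk_kick_flush() toggles
 * flush_pending_idx when it issues a flush, flush_end_io() toggles
 * flush_running_idx when that flush completes, and C1 ("only one flush in
 * flight") is simply "the two indices are equal".  A stand-alone sketch of
 * that state machine, with requests reduced to per-buffer counters and no
 * locking:
 */

#include <stdbool.h>
#include <stdio.h>

struct demo_flush_queue {
	int pending[2];			/* queued requests per buffer */
	unsigned int pending_idx;
	unsigned int running_idx;
};

/* issue a flush if none is in flight and something is pending (C1) */
static bool demo_kick_flush(struct demo_flush_queue *fq)
{
	if (fq->pending_idx != fq->running_idx ||	/* flush in flight */
	    !fq->pending[fq->pending_idx])		/* nothing queued */
		return false;

	fq->pending_idx ^= 1;		/* new arrivals use the other buffer */
	printf("flush issued, covers %d request(s)\n",
	       fq->pending[fq->running_idx]);
	return true;
}

/* the flush finished: drain the buffer it covered */
static void demo_flush_done(struct demo_flush_queue *fq)
{
	fq->pending[fq->running_idx] = 0;
	fq->running_idx ^= 1;		/* indices equal again: idle */
}

int main(void)
{
	struct demo_flush_queue fq = { { 0, 0 }, 0, 0 };

	fq.pending[fq.pending_idx] += 2;	/* two pre-flushes queued */
	demo_kick_flush(&fq);			/* issued, covers both */
	fq.pending[fq.pending_idx] += 1;	/* arrives while flushing */
	demo_kick_flush(&fq);			/* refused by C1 */
	demo_flush_done(&fq);
	demo_kick_flush(&fq);			/* now covers the third one */
	return 0;
}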
 279
 280/**
 281 * blk_kick_flush - consider issuing flush request
 282 * @q: request_queue being kicked
 283 * @fq: flush queue
 284 *
 285 * Flush related states of @q have changed, consider issuing flush request.
 286 * Please read the comment at the top of this file for more info.
 287 *
 288 * CONTEXT:
 289 * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
 290 *
 291 * RETURNS:
 292 * %true if flush was issued, %false otherwise.
 293 */
 294static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 295{
 296        struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 297        struct request *first_rq =
 298                list_first_entry(pending, struct request, flush.list);
 299        struct request *flush_rq = fq->flush_rq;
 300
 301        /* C1 described at the top of this file */
 302        if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
 303                return false;
 304
 305        /* C2 and C3
 306         *
 307         * For blk-mq + scheduling, we can risk having all driver tags
 308         * assigned to empty flushes, and we deadlock if we are expecting
 309         * other requests to make progress. Don't defer for that case.
 310         */
 311        if (!list_empty(&fq->flush_data_in_flight) &&
 312            !(q->mq_ops && q->elevator) &&
 313            time_before(jiffies,
 314                        fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
 315                return false;
 316
 317        /*
 318         * Issue flush and toggle pending_idx.  This makes pending_idx
 319         * different from running_idx, which means flush is in flight.
 320         */
 321        fq->flush_pending_idx ^= 1;
 322
 323        blk_rq_init(q, flush_rq);
 324
 325        /*
  326         * With no I/O scheduler ("none"), borrow the tag from the first
  327         * request since the two can't be in flight at the same time, and
  328         * acquire the tag's ownership for the flush req.
  329         *
  330         * With an I/O scheduler, the flush rq only needs to borrow the
  331         * scheduler tag to make the driver tag put/get work.
 332         */
 333        if (q->mq_ops) {
 334                struct blk_mq_hw_ctx *hctx;
 335
 336                flush_rq->mq_ctx = first_rq->mq_ctx;
 337
 338                if (!q->elevator) {
 339                        fq->orig_rq = first_rq;
 340                        flush_rq->tag = first_rq->tag;
 341                        hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
 342                        blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
 343                } else {
 344                        flush_rq->internal_tag = first_rq->internal_tag;
 345                }
 346        }
 347
 348        flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
 349        flush_rq->rq_flags |= RQF_FLUSH_SEQ;
 350        flush_rq->rq_disk = first_rq->rq_disk;
 351        flush_rq->end_io = flush_end_io;
 352
 353        return blk_flush_queue_rq(flush_rq, false);
 354}
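/*
 * [Editor's note] The C2/C3 deferral above, in isolation: a flush is held
 * back while data writes from already-started sequences are in flight,
 * unless some request has waited longer than FLUSH_PENDING_TIMEOUT.  A
 * stand-alone sketch with jiffies reduced to a plain tick counter (the
 * blk-mq + I/O scheduler exception in blk_kick_flush() is omitted here):
 */

#include <stdbool.h>
#include <stdio.h>

#define DEMO_PENDING_TIMEOUT 5		/* ticks; stands in for 5 * HZ */

static bool demo_defer_flush(int data_in_flight, long now, long pending_since)
{
	return data_in_flight > 0 &&				/* C2 */
	       now < pending_since + DEMO_PENDING_TIMEOUT;	/* C3 */
}

int main(void)
{
	printf("%d\n", demo_defer_flush(3, 2, 0));	/* 1: wait for data  */
	printf("%d\n", demo_defer_flush(3, 7, 0));	/* 0: timed out (C3) */
	printf("%d\n", demo_defer_flush(0, 2, 0));	/* 0: nothing blocks */
	return 0;
}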
 355
 356static void flush_data_end_io(struct request *rq, blk_status_t error)
 357{
 358        struct request_queue *q = rq->q;
 359        struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 360
 361        lockdep_assert_held(q->queue_lock);
 362
 363        /*
  364         * Update q->in_flight[] here to make this tag usable
  365         * early, because in blk_queue_start_tag(),
  366         * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and
  367         * to reserve tags for sync I/O.
  368         *
  369         * More importantly, this can avoid the following I/O
  370         * deadlock:
  371         *
  372         * - suppose there are 40 FUA requests coming to the flush
  373         *   queue and the queue depth is 31
  374         * - 30 rqs are scheduled, so blk_queue_start_tag() can't
  375         *   allocate a tag for async I/O any more
  376         * - all 30 rqs are completed before FLUSH_PENDING_TIMEOUT
  377         *   and flush_data_end_io() is called
  378         * - without updating q->in_flight[BLK_RW_ASYNC] here, the
  379         *   other rqs still can't go ahead; meanwhile these rqs are
  380         *   held in the flush data queue and make no progress toward
  381         *   handling the post-flush rq
  382         * - only after the post-flush rq is handled can all these
  383         *   rqs be completed
 384         */
 385
 386        elv_completed_request(q, rq);
 387
 388        /* for avoiding double accounting */
 389        rq->rq_flags &= ~RQF_STARTED;
 390
 391        /*
 392         * After populating an empty queue, kick it to avoid stall.  Read
 393         * the comment in flush_end_io().
 394         */
 395        if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
 396                blk_run_queue_async(q);
 397}
 398
 399static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
 400{
 401        struct request_queue *q = rq->q;
 402        struct blk_mq_hw_ctx *hctx;
 403        struct blk_mq_ctx *ctx = rq->mq_ctx;
 404        unsigned long flags;
 405        struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
 406
 407        hctx = blk_mq_map_queue(q, ctx->cpu);
 408
 409        if (q->elevator) {
 410                WARN_ON(rq->tag < 0);
 411                blk_mq_put_driver_tag_hctx(hctx, rq);
 412        }
 413
 414        /*
 415         * After populating an empty queue, kick it to avoid stall.  Read
 416         * the comment in flush_end_io().
 417         */
 418        spin_lock_irqsave(&fq->mq_flush_lock, flags);
 419        blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
 420        spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 421
 422        blk_mq_run_hw_queue(hctx, true);
 423}
 424
 425/**
 426 * blk_insert_flush - insert a new PREFLUSH/FUA request
 427 * @rq: request to insert
 428 *
  429 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions
  430 * or from __blk_mq_run_hw_queue() when dispatching a request.
 431 * @rq is being submitted.  Analyze what needs to be done and put it on the
 432 * right queue.
 433 */
 434void blk_insert_flush(struct request *rq)
 435{
 436        struct request_queue *q = rq->q;
 437        unsigned long fflags = q->queue_flags;  /* may change, cache */
 438        unsigned int policy = blk_flush_policy(fflags, rq);
 439        struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
 440
 441        if (!q->mq_ops)
 442                lockdep_assert_held(q->queue_lock);
 443
 444        /*
 445         * @policy now records what operations need to be done.  Adjust
 446         * REQ_PREFLUSH and FUA for the driver.
 447         */
 448        rq->cmd_flags &= ~REQ_PREFLUSH;
 449        if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
 450                rq->cmd_flags &= ~REQ_FUA;
 451
 452        /*
 453         * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any
 454         * of those flags, we have to set REQ_SYNC to avoid skewing
 455         * the request accounting.
 456         */
 457        rq->cmd_flags |= REQ_SYNC;
 458
 459        /*
 460         * An empty flush handed down from a stacking driver may
 461         * translate into nothing if the underlying device does not
 462         * advertise a write-back cache.  In this case, simply
 463         * complete the request.
 464         */
 465        if (!policy) {
 466                if (q->mq_ops)
 467                        blk_mq_end_request(rq, 0);
 468                else
 469                        __blk_end_request(rq, 0, 0);
 470                return;
 471        }
 472
  473        BUG_ON(rq->bio != rq->biotail); /* assumes zero or single bio rq */
 474
 475        /*
 476         * If there's data but flush is not necessary, the request can be
 477         * processed directly without going through flush machinery.  Queue
 478         * for normal execution.
 479         */
 480        if ((policy & REQ_FSEQ_DATA) &&
 481            !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
 482                if (q->mq_ops)
 483                        blk_mq_request_bypass_insert(rq, false);
 484                else
 485                        list_add_tail(&rq->queuelist, &q->queue_head);
 486                return;
 487        }
 488
 489        /*
 490         * @rq should go through flush machinery.  Mark it part of flush
 491         * sequence and submit for further processing.
 492         */
 493        memset(&rq->flush, 0, sizeof(rq->flush));
 494        INIT_LIST_HEAD(&rq->flush.list);
 495        rq->rq_flags |= RQF_FLUSH_SEQ;
 496        rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
 497        if (q->mq_ops) {
 498                rq->end_io = mq_flush_data_end_io;
 499
 500                spin_lock_irq(&fq->mq_flush_lock);
 501                blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
 502                spin_unlock_irq(&fq->mq_flush_lock);
 503                return;
 504        }
 505        rq->end_io = flush_data_end_io;
 506
 507        blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
 508}
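/*
 * [Editor's note] The REQ_FSEQ_ACTIONS & ~policy seed above marks the steps
 * a request does NOT need as already complete, so blk_flush_cur_seq() lands
 * directly on the first step it does need.  Worked example with the enum
 * values from this file: a REQ_FUA data write on a writeback device without
 * FUA support has
 *
 *	policy            = REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH = 0x6
 *	ACTIONS & ~policy = REQ_FSEQ_PREFLUSH                   = 0x1
 *
 * so flush.seq starts at 0x1 and ffz() selects DATA as the first real step.
 * A tiny stand-alone check (D_ names are local stand-ins):
 */

#include <stdio.h>

enum { D_PREFLUSH = 1, D_DATA = 2, D_POSTFLUSH = 4, D_ACTIONS = 7 };

int main(void)
{
	unsigned int policy = D_DATA | D_POSTFLUSH;	/* FUA write, no FUA support */

	printf("initial seq = 0x%x\n", D_ACTIONS & ~policy);	/* prints 0x1 */
	return 0;
}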
 509
 510/**
 511 * blkdev_issue_flush - queue a flush
 512 * @bdev:       blockdev to issue flush for
 513 * @gfp_mask:   memory allocation flags (for bio_alloc)
 514 * @error_sector:       error sector
 515 *
 516 * Description:
 517 *    Issue a flush for the block device in question. Caller can supply
 518 *    room for storing the error offset in case of a flush error, if they
 519 *    wish to.
 520 */
 521int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 522                sector_t *error_sector)
 523{
 524        struct request_queue *q;
 525        struct bio *bio;
 526        int ret = 0;
 527
 528        if (bdev->bd_disk == NULL)
 529                return -ENXIO;
 530
 531        q = bdev_get_queue(bdev);
 532        if (!q)
 533                return -ENXIO;
 534
 535        /*
 536         * some block devices may not have their queue correctly set up here
  537         * (e.g. a loop device without a backing file) and so issuing a flush
 538         * here will panic. Ensure there is a request function before issuing
 539         * the flush.
 540         */
 541        if (!q->make_request_fn)
 542                return -ENXIO;
 543
 544        bio = bio_alloc(gfp_mask, 0);
 545        bio_set_dev(bio, bdev);
 546        bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 547
 548        ret = submit_bio_wait(bio);
 549
 550        /*
 551         * The driver must store the error location in ->bi_sector, if
 552         * it supports it. For non-stacked drivers, this should be
 553         * copied from blk_rq_pos(rq).
 554         */
 555        if (error_sector)
 556                *error_sector = bio->bi_iter.bi_sector;
 557
 558        bio_put(bio);
 559        return ret;
 560}
 561EXPORT_SYMBOL(blkdev_issue_flush);
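/*
 * [Editor's note] A hedged sketch of how a caller typically uses the export
 * above, e.g. at the end of an fsync-style path once the data has already
 * been written.  The wrapper function is hypothetical; only the
 * blkdev_issue_flush() call reflects the interface defined in this file.
 */

#include <linux/blkdev.h>
#include <linux/gfp.h>

/* hypothetical helper: make previously written data durable on @bdev */
static int example_make_durable(struct block_device *bdev)
{
	/*
	 * Most callers pass a NULL error_sector; they only care whether
	 * the cache flush as a whole succeeded.
	 */
	return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
}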
 562
 563struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 564                int node, int cmd_size)
 565{
 566        struct blk_flush_queue *fq;
 567        int rq_sz = sizeof(struct request);
 568
 569        fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node);
 570        if (!fq)
 571                goto fail;
 572
 573        if (q->mq_ops)
 574                spin_lock_init(&fq->mq_flush_lock);
 575
 576        rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
 577        fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
 578        if (!fq->flush_rq)
 579                goto fail_rq;
 580
 581        INIT_LIST_HEAD(&fq->flush_queue[0]);
 582        INIT_LIST_HEAD(&fq->flush_queue[1]);
 583        INIT_LIST_HEAD(&fq->flush_data_in_flight);
 584
 585        return fq;
 586
 587 fail_rq:
 588        kfree(fq);
 589 fail:
 590        return NULL;
 591}
 592
 593void blk_free_flush_queue(struct blk_flush_queue *fq)
 594{
  595        /* a bio-based request queue has no flush queue */
 596        if (!fq)
 597                return;
 598
 599        kfree(fq->flush_rq);
 600        kfree(fq);
 601}
 602