linux/block/blk-flush.c
/*
 * Functions to sequence FLUSH and FUA writes.
 *
 * Copyright (C) 2011		Max Planck Institute for Gravitational Physics
 * Copyright (C) 2011		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * REQ_{FLUSH|FUA} requests are decomposed into sequences consisting of three
 * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
 * properties and hardware capability.
 *
 * If a request doesn't have data, only REQ_PREFLUSH makes sense, which
 * indicates a simple flush request.  If there is data, REQ_PREFLUSH indicates
 * that the device cache should be flushed before the data is executed, and
 * REQ_FUA means that the data must be on non-volatile media on request
 * completion.
 *
 * If the device doesn't have a writeback cache, FLUSH and FUA don't make any
 * difference.  The requests are either completed immediately if there's no
 * data or executed as normal requests otherwise.
 *
 * If the device has a writeback cache and supports FUA, REQ_PREFLUSH is
 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
 *
 * If the device has a writeback cache and doesn't support FUA, REQ_PREFLUSH
 * is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
 *
 * The actual execution of a flush is double buffered.  Whenever a request
 * needs to execute a PRE or POSTFLUSH, it queues at
 * fq->flush_queue[fq->flush_pending_idx].  Once certain criteria are met, a
 * REQ_OP_FLUSH is issued and the pending_idx is toggled.  When the flush
 * completes, all the requests which were pending proceed to the next
 * step.  This allows arbitrary merging of different types of FLUSH/FUA
 * requests.
 *
 * Currently, the following conditions are used to determine when to issue
 * a flush.
 *
 * C1. At any given time, only one flush shall be in progress.  This makes
 *     double buffering sufficient.
 *
 * C2. A flush is deferred if any request is executing the DATA step of its
 *     sequence.  This avoids issuing separate POSTFLUSHes for requests which
 *     shared a PREFLUSH.
 *
 * C3. The second condition is ignored if there is a request which has
 *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
 *     starvation in the unlikely case where there is a continuous stream of
 *     FUA (without FLUSH) requests.
 *
 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
 * is beneficial.
 *
 * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
 * Once while executing DATA and again after the whole sequence is
 * complete.  The first completion updates the contained bio but doesn't
 * finish it so that the bio submitter is notified only after the whole
 * sequence is complete.  This is implemented by testing REQ_FLUSH_SEQ in
 * req_bio_endio().
 *
 * The above peculiarity requires that each FLUSH/FUA request has only one
 * bio attached to it, which is guaranteed as they aren't allowed to be
 * merged in the usual way.
 */
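
/*
 * Example: a write with REQ_PREFLUSH|REQ_FUA and data, submitted to a device
 * which advertises a writeback cache but no FUA support, gets the policy
 * REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH from
 * blk_flush_policy() and is stepped through PREFLUSH -> DATA -> POSTFLUSH
 * -> DONE by successive calls to blk_flush_complete_seq().
 */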

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

/* FLUSH/FUA sequences */
enum {
        REQ_FSEQ_PREFLUSH       = (1 << 0), /* pre-flushing in progress */
        REQ_FSEQ_DATA           = (1 << 1), /* data write in progress */
        REQ_FSEQ_POSTFLUSH      = (1 << 2), /* post-flushing in progress */
        REQ_FSEQ_DONE           = (1 << 3),

        REQ_FSEQ_ACTIONS        = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
                                  REQ_FSEQ_POSTFLUSH,

        /*
         * If flush has been pending longer than the following timeout,
         * it's issued even if flush_data requests are still in flight.
         */
        FLUSH_PENDING_TIMEOUT   = 5 * HZ,
};

static bool blk_kick_flush(struct request_queue *q,
                           struct blk_flush_queue *fq);

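/*
 * Decompose the REQ_PREFLUSH/REQ_FUA flags of @rq into the REQ_FSEQ_* steps
 * it actually needs, given the queue's writeback-cache and FUA capabilities
 * in @fflags.  A device without a writeback cache needs no flush steps.
 */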
static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
        unsigned int policy = 0;

        if (blk_rq_sectors(rq))
                policy |= REQ_FSEQ_DATA;

        if (fflags & (1UL << QUEUE_FLAG_WC)) {
                if (rq->cmd_flags & REQ_PREFLUSH)
                        policy |= REQ_FSEQ_PREFLUSH;
                if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
                    (rq->cmd_flags & REQ_FUA))
                        policy |= REQ_FSEQ_POSTFLUSH;
        }
        return policy;
}

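/*
 * Return the next REQ_FSEQ_* step @rq has to go through: the lowest bit not
 * yet set in rq->flush.seq (REQ_FSEQ_DONE once all actions are complete).
 */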
static unsigned int blk_flush_cur_seq(struct request *rq)
{
        return 1 << ffz(rq->flush.seq);
}

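/*
 * Restore @rq to the state it had before entering the flush machinery so
 * that it can be completed as a normal request.
 */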
static void blk_flush_restore_request(struct request *rq)
{
        /*
         * After flush data completion, @rq->bio is %NULL but we need to
         * complete the bio again.  @rq->biotail is guaranteed to equal the
         * original @rq->bio.  Restore it.
         */
        rq->bio = rq->biotail;

        /* make @rq a normal request */
        rq->cmd_flags &= ~REQ_FLUSH_SEQ;
        rq->end_io = rq->flush.saved_end_io;
}

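/*
 * (Re)insert @rq for dispatch, at the front or the back of the queue.  On
 * blk-mq the request goes through the requeue list, which is kicked here,
 * so %false is returned; on the legacy path the request is put on
 * q->queue_head directly and %true tells the caller to run the queue.
 */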
static bool blk_flush_queue_rq(struct request *rq, bool add_front)
{
        if (rq->q->mq_ops) {
                struct request_queue *q = rq->q;

                blk_mq_add_to_requeue_list(rq, add_front);
                blk_mq_kick_requeue_list(q);
                return false;
        } else {
                if (add_front)
                        list_add(&rq->queuelist, &rq->q->queue_head);
                else
                        list_add_tail(&rq->queuelist, &rq->q->queue_head);
                return true;
        }
}

/**
 * blk_flush_complete_seq - complete flush sequence
 * @rq: FLUSH/FUA request being sequenced
 * @fq: flush queue
 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
 * @error: %0 for success, error code otherwise
 *
 * @rq just completed @seq part of its flush sequence, record the
 * completion and trigger the next step.
 *
 * CONTEXT:
 * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
 *
 * RETURNS:
 * %true if requests were added to the dispatch queue, %false otherwise.
 */
static bool blk_flush_complete_seq(struct request *rq,
                                   struct blk_flush_queue *fq,
                                   unsigned int seq, int error)
{
        struct request_queue *q = rq->q;
        struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
        bool queued = false, kicked;

        BUG_ON(rq->flush.seq & seq);
        rq->flush.seq |= seq;

        if (likely(!error))
                seq = blk_flush_cur_seq(rq);
        else
                seq = REQ_FSEQ_DONE;

        switch (seq) {
        case REQ_FSEQ_PREFLUSH:
        case REQ_FSEQ_POSTFLUSH:
                /* queue for flush */
                if (list_empty(pending))
                        fq->flush_pending_since = jiffies;
                list_move_tail(&rq->flush.list, pending);
                break;

        case REQ_FSEQ_DATA:
                list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
                queued = blk_flush_queue_rq(rq, true);
                break;

        case REQ_FSEQ_DONE:
                /*
                 * @rq was previously adjusted by blk_insert_flush() for
                 * flush sequencing and may already have gone through the
                 * flush data request completion path.  Restore @rq for
                 * normal completion and end it.
                 */
                BUG_ON(!list_empty(&rq->queuelist));
                list_del_init(&rq->flush.list);
                blk_flush_restore_request(rq);
                if (q->mq_ops)
                        blk_mq_end_request(rq, error);
                else
                        __blk_end_request_all(rq, error);
                break;

        default:
                BUG();
        }

        kicked = blk_kick_flush(q, fq);
        return kicked | queued;
}

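/*
 * Completion handler for the flush request itself.  Toggles the running
 * index, hands the flush tag back to the original request on blk-mq, and
 * advances every request which was waiting on this flush to the next step
 * of its sequence.
 */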
static void flush_end_io(struct request *flush_rq, int error)
{
        struct request_queue *q = flush_rq->q;
        struct list_head *running;
        bool queued = false;
        struct request *rq, *n;
        unsigned long flags = 0;
        struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);

        if (q->mq_ops) {
                struct blk_mq_hw_ctx *hctx;

                /* release the tag's ownership back to the request it was borrowed from */
                spin_lock_irqsave(&fq->mq_flush_lock, flags);
                hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
                blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
                flush_rq->tag = -1;
        }

        running = &fq->flush_queue[fq->flush_running_idx];
        BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);

        /* account completion of the flush request */
        fq->flush_running_idx ^= 1;

        if (!q->mq_ops)
                elv_completed_request(q, flush_rq);

        /* and push the waiting requests to the next stage */
        list_for_each_entry_safe(rq, n, running, flush.list) {
                unsigned int seq = blk_flush_cur_seq(rq);

                BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
                queued |= blk_flush_complete_seq(rq, fq, seq, error);
        }

        /*
         * Kick the queue to avoid a stall in two cases:
         * 1. Moving a request silently to an empty queue_head may stall
         *    the queue.
         * 2. When a flush request is running on a non-queueable queue, the
         *    queue is held.  Restart the queue after the flush request is
         *    finished to avoid a stall.
         * This function is called from the request completion path and
         * calling directly into request_fn may confuse the driver.  Always
         * use kblockd.
         */
        if (queued || fq->flush_queue_delayed) {
                WARN_ON(q->mq_ops);
                blk_run_queue_async(q);
        }
        fq->flush_queue_delayed = 0;
        if (q->mq_ops)
                spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}

/**
 * blk_kick_flush - consider issuing flush request
 * @q: request_queue being kicked
 * @fq: flush queue
 *
 * Flush related states of @q have changed; consider issuing a flush request.
 * Please read the comment at the top of this file for more info.
 *
 * CONTEXT:
 * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
 *
 * RETURNS:
 * %true if flush was issued, %false otherwise.
 */
static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
{
        struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
        struct request *first_rq =
                list_first_entry(pending, struct request, flush.list);
        struct request *flush_rq = fq->flush_rq;

        /* C1 described at the top of this file */
        if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
                return false;

        /* C2 and C3 */
        if (!list_empty(&fq->flush_data_in_flight) &&
            time_before(jiffies,
                        fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
                return false;

        /*
         * Issue flush and toggle pending_idx.  This makes pending_idx
         * different from running_idx, which means flush is in flight.
         */
        fq->flush_pending_idx ^= 1;

        blk_rq_init(q, flush_rq);

        /*
         * Borrow the tag from the first pending request since the two
         * can't be in flight at the same time, and take over the tag's
         * ownership for the flush request.
         */
        if (q->mq_ops) {
                struct blk_mq_hw_ctx *hctx;

                flush_rq->mq_ctx = first_rq->mq_ctx;
                flush_rq->tag = first_rq->tag;
                fq->orig_rq = first_rq;

                hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
                blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
        }

        flush_rq->cmd_type = REQ_TYPE_FS;
        req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH | REQ_FLUSH_SEQ);
        flush_rq->rq_disk = first_rq->rq_disk;
        flush_rq->end_io = flush_end_io;

        return blk_flush_queue_rq(flush_rq, false);
}

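/*
 * Completion handler for the DATA step of a sequenced FLUSH/FUA request on
 * the legacy (!mq) path.
 */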
static void flush_data_end_io(struct request *rq, int error)
{
        struct request_queue *q = rq->q;
        struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);

        /*
         * Update q->in_flight[] here to make this tag usable early,
         * because blk_queue_start_tag() uses q->in_flight[BLK_RW_ASYNC]
         * to limit async I/O and reserve tags for sync I/O.
         *
         * More importantly, this avoids the following I/O deadlock:
         *
         * - suppose 40 FUA requests are coming to the flush queue and the
         *   queue depth is 31
         * - 30 rqs are scheduled, then blk_queue_start_tag() can't
         *   allocate a tag for async I/O any more
         * - all 30 rqs are completed before FLUSH_PENDING_TIMEOUT and
         *   flush_data_end_io() is called
         * - without updating q->in_flight[BLK_RW_ASYNC] here, the
         *   remaining rqs can't go ahead; meanwhile the completed rqs are
         *   held in the flush data queue and no progress is made on the
         *   post flush rq
         * - only after the post flush rq is handled can all these rqs be
         *   completed
         */

        elv_completed_request(q, rq);

        /* avoid double accounting */
        rq->cmd_flags &= ~REQ_STARTED;

        /*
         * After populating an empty queue, kick it to avoid stall.  Read
         * the comment in flush_end_io().
         */
        if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
                blk_run_queue_async(q);
}

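/*
 * blk-mq counterpart of flush_data_end_io(): advances the flush sequence
 * under fq->mq_flush_lock and runs the hardware queue if a request became
 * dispatchable.
 */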
static void mq_flush_data_end_io(struct request *rq, int error)
{
        struct request_queue *q = rq->q;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        unsigned long flags;
        struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);

        hctx = blk_mq_map_queue(q, ctx->cpu);

        /*
         * After populating an empty queue, kick it to avoid stall.  Read
         * the comment in flush_end_io().
         */
        spin_lock_irqsave(&fq->mq_flush_lock, flags);
        if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
                blk_mq_run_hw_queue(hctx, true);
        spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}

/**
 * blk_insert_flush - insert a new FLUSH/FUA request
 * @rq: request to insert
 *
 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH
 * insertions, or from __blk_mq_run_hw_queue() to dispatch a request.
 * @rq is being submitted.  Analyze what needs to be done and put it on the
 * right queue.
 *
 * CONTEXT:
 * spin_lock_irq(q->queue_lock) in !mq case
 */
void blk_insert_flush(struct request *rq)
{
        struct request_queue *q = rq->q;
        unsigned long fflags = q->queue_flags;  /* may change, cache */
        unsigned int policy = blk_flush_policy(fflags, rq);
        struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);

        /*
         * @policy now records what operations need to be done.  Adjust
         * REQ_PREFLUSH and FUA for the driver.
         */
        rq->cmd_flags &= ~REQ_PREFLUSH;
        if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
                rq->cmd_flags &= ~REQ_FUA;

        /*
         * An empty flush handed down from a stacking driver may
         * translate into nothing if the underlying device does not
         * advertise a write-back cache.  In this case, simply
         * complete the request.
         */
        if (!policy) {
                if (q->mq_ops)
                        blk_mq_end_request(rq, 0);
                else
                        __blk_end_bidi_request(rq, 0, 0, 0);
                return;
        }

        BUG_ON(rq->bio != rq->biotail); /* assumes zero or single bio rq */

        /*
         * If there's data but flush is not necessary, the request can be
         * processed directly without going through flush machinery.  Queue
         * for normal execution.
         */
        if ((policy & REQ_FSEQ_DATA) &&
            !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
                if (q->mq_ops) {
                        blk_mq_insert_request(rq, false, false, true);
                } else
                        list_add_tail(&rq->queuelist, &q->queue_head);
                return;
        }

        /*
         * @rq should go through flush machinery.  Mark it part of flush
         * sequence and submit for further processing.
         */
        memset(&rq->flush, 0, sizeof(rq->flush));
        INIT_LIST_HEAD(&rq->flush.list);
        rq->cmd_flags |= REQ_FLUSH_SEQ;
        rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
        if (q->mq_ops) {
                rq->end_io = mq_flush_data_end_io;

                spin_lock_irq(&fq->mq_flush_lock);
                blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
                spin_unlock_irq(&fq->mq_flush_lock);
                return;
        }
        rq->end_io = flush_data_end_io;

        blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
}

/**
 * blkdev_issue_flush - queue a flush
 * @bdev:	blockdev to issue flush for
 * @gfp_mask:	memory allocation flags (for bio_alloc)
 * @error_sector:	error sector
 *
 * Description:
 *    Issue a flush for the block device in question and wait for it to
 *    complete.  The caller can supply room for storing the error offset in
 *    case of a flush error, if they wish to.
 */
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
                sector_t *error_sector)
{
        struct request_queue *q;
        struct bio *bio;
        int ret = 0;

        if (bdev->bd_disk == NULL)
                return -ENXIO;

        q = bdev_get_queue(bdev);
        if (!q)
                return -ENXIO;

        /*
         * Some block devices may not have their queue correctly set up here
         * (e.g. a loop device without a backing file), and so issuing a
         * flush here would panic.  Ensure there is a request function
         * before issuing the flush.
         */
        if (!q->make_request_fn)
                return -ENXIO;

        bio = bio_alloc(gfp_mask, 0);
        bio->bi_bdev = bdev;
        bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);

        ret = submit_bio_wait(bio);

        /*
         * The driver must store the error location in ->bi_sector, if
         * it supports it.  For non-stacked drivers, this should be
         * copied from blk_rq_pos(rq).
         */
        if (error_sector)
                *error_sector = bio->bi_iter.bi_sector;

        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);

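/*
 * Allocate and initialize a blk_flush_queue.  For blk-mq the preallocated
 * flush request is sized to also hold the driver's per-command payload
 * (@cmd_size), rounded up to a cache line.
 */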
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
                int node, int cmd_size)
{
        struct blk_flush_queue *fq;
        int rq_sz = sizeof(struct request);

        fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node);
        if (!fq)
                goto fail;

        if (q->mq_ops) {
                spin_lock_init(&fq->mq_flush_lock);
                rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
        }

        fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
        if (!fq->flush_rq)
                goto fail_rq;

        INIT_LIST_HEAD(&fq->flush_queue[0]);
        INIT_LIST_HEAD(&fq->flush_queue[1]);
        INIT_LIST_HEAD(&fq->flush_data_in_flight);

        return fq;

 fail_rq:
        kfree(fq);
 fail:
        return NULL;
}

void blk_free_flush_queue(struct blk_flush_queue *fq)
{
        /* bio based request queues don't have a flush queue */
        if (!fq)
                return;

        kfree(fq->flush_rq);
        kfree(fq);
}