linux/drivers/md/raid5-cache.c
   1/*
   2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
   3 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
   4 *
   5 * This program is free software; you can redistribute it and/or modify it
   6 * under the terms and conditions of the GNU General Public License,
   7 * version 2, as published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope it will be useful, but WITHOUT
  10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  12 * more details.
  13 *
  14 */
  15#include <linux/kernel.h>
  16#include <linux/wait.h>
  17#include <linux/blkdev.h>
  18#include <linux/slab.h>
  19#include <linux/raid/md_p.h>
  20#include <linux/crc32c.h>
  21#include <linux/random.h>
  22#include <linux/kthread.h>
  23#include "md.h"
  24#include "raid5.h"
  25#include "bitmap.h"
  26
  27/*
   28 * metadata/data are stored on disk in 4k-sized units (blocks) regardless of
   29 * the underlying hardware sector size. This only works with PAGE_SIZE == 4096
  30 */
  31#define BLOCK_SECTORS (8)
  32
  33/*
  34 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
  35 *
   36 * In write-through mode, reclaim runs once every log->max_free_space worth
   37 * of log space has been used. This keeps the recovery scan from taking too long
  38 */
  39#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
  40#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
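
/*
 * Illustrative sketch (not part of the driver): per the comment above,
 * log->max_free_space is derived from these two limits roughly as
 *
 *	max_free_space = device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
 *	if (max_free_space > RECLAIM_MAX_FREE_SPACE)
 *		max_free_space = RECLAIM_MAX_FREE_SPACE;
 *
 * e.g. a 16GiB journal (33554432 sectors) gets min(8388608, 20971520) =
 * 8388608 sectors (4GiB), while anything above 40GiB is capped at 10GiB.
 */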
  41
  42/* wake up reclaim thread periodically */
  43#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
  44/* start flush with these full stripes */
  45#define R5C_FULL_STRIPE_FLUSH_BATCH 256
  46/* reclaim stripes in groups */
  47#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
  48
  49/*
  50 * We only need 2 bios per I/O unit to make progress, but ensure we
  51 * have a few more available to not get too tight.
  52 */
  53#define R5L_POOL_SIZE   4
  54
  55/*
  56 * r5c journal modes of the array: write-back or write-through.
   57 * write-through mode behaves identically to the existing log-only
   58 * implementation.
  59 */
  60enum r5c_journal_mode {
  61        R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
  62        R5C_JOURNAL_MODE_WRITE_BACK = 1,
  63};
  64
  65static char *r5c_journal_mode_str[] = {"write-through",
  66                                       "write-back"};
  67/*
  68 * raid5 cache state machine
  69 *
  70 * With the RAID cache, each stripe works in two phases:
  71 *      - caching phase
  72 *      - writing-out phase
  73 *
  74 * These two phases are controlled by bit STRIPE_R5C_CACHING:
  75 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
  76 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
  77 *
  78 * When there is no journal, or the journal is in write-through mode,
  79 * the stripe is always in writing-out phase.
  80 *
  81 * For write-back journal, the stripe is sent to caching phase on write
  82 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
  83 * the write-out phase by clearing STRIPE_R5C_CACHING.
  84 *
   85 * Stripes in caching phase do not write to the raid disks. Instead, all
   86 * writes are committed only to the log device. Therefore, a stripe in
  87 * caching phase handles writes as:
  88 *      - write to log device
  89 *      - return IO
  90 *
  91 * Stripes in writing-out phase handle writes as:
  92 *      - calculate parity
  93 *      - write pending data and parity to journal
  94 *      - write data and parity to raid disks
  95 *      - return IO for pending writes
  96 */
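
/*
 * Illustrative sketch (not part of the driver): a stripe's phase is simply
 * the STRIPE_R5C_CACHING bit described above, e.g.
 *
 *	if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 *		caching_phase();	// write to log device, return IO
 *	else
 *		write_out_phase();	// compute parity, write to log + raid
 *
 * (caching_phase()/write_out_phase() are placeholders, not real helpers.)
 * A write sends the stripe to the caching phase via r5c_try_caching_write(),
 * and r5c_make_stripe_write_out() clears the bit to start write-out.
 */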
  97
  98struct r5l_log {
  99        struct md_rdev *rdev;
 100
 101        u32 uuid_checksum;
 102
  103        sector_t device_size;           /* log device size, rounded to
  104                                         * BLOCK_SECTORS */
  105        sector_t max_free_space;        /* reclaim runs if free space reaches
  106                                         * this size */
 107
 108        sector_t last_checkpoint;       /* log tail. where recovery scan
 109                                         * starts from */
 110        u64 last_cp_seq;                /* log tail sequence */
 111
 112        sector_t log_start;             /* log head. where new data appends */
 113        u64 seq;                        /* log head sequence */
 114
 115        sector_t next_checkpoint;
 116
 117        struct mutex io_mutex;
 118        struct r5l_io_unit *current_io; /* current io_unit accepting new data */
 119
 120        spinlock_t io_list_lock;
 121        struct list_head running_ios;   /* io_units which are still running,
 122                                         * and have not yet been completely
 123                                         * written to the log */
 124        struct list_head io_end_ios;    /* io_units which have been completely
 125                                         * written to the log but not yet written
 126                                         * to the RAID */
 127        struct list_head flushing_ios;  /* io_units which are waiting for log
 128                                         * cache flush */
 129        struct list_head finished_ios;  /* io_units which settle down in log disk */
 130        struct bio flush_bio;
 131
 132        struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
 133
 134        struct kmem_cache *io_kc;
 135        mempool_t *io_pool;
 136        struct bio_set *bs;
 137        mempool_t *meta_pool;
 138
 139        struct md_thread *reclaim_thread;
  140        unsigned long reclaim_target;   /* amount of space (in sectors) that
  141                                         * needs to be reclaimed.  if it's 0,
  142                                         * reclaim the space used by io_units
  143                                         * which are in IO_UNIT_STRIPE_END state
  144                                         * (i.e., reclaim doesn't wait for a
  145                                         * specific io_unit to switch to
  146                                         * IO_UNIT_STRIPE_END state) */
 147        wait_queue_head_t iounit_wait;
 148
 149        struct list_head no_space_stripes; /* pending stripes, log has no space */
 150        spinlock_t no_space_stripes_lock;
 151
 152        bool need_cache_flush;
 153
 154        /* for r5c_cache */
 155        enum r5c_journal_mode r5c_journal_mode;
 156
 157        /* all stripes in r5cache, in the order of seq at sh->log_start */
 158        struct list_head stripe_in_journal_list;
 159
 160        spinlock_t stripe_in_journal_lock;
 161        atomic_t stripe_in_journal_count;
 162
 163        /* to submit async io_units, to fulfill ordering of flush */
 164        struct work_struct deferred_io_work;
  165        /* to disable write-back while the array is degraded */
 166        struct work_struct disable_writeback_work;
 167};
 168
 169/*
  170 * an IO range starts at a meta data block and ends at the next meta data
  171 * block. The io_unit's meta data block tracks the data/parity that follows
  172 * it. The io_unit is written to the log disk with a normal write; since we
  173 * always flush the log disk before we start moving data to the raid disks,
  174 * there is no need to write the io_unit with FLUSH/FUA
 175 */
 176struct r5l_io_unit {
 177        struct r5l_log *log;
 178
 179        struct page *meta_page; /* store meta block */
 180        int meta_offset;        /* current offset in meta_page */
 181
 182        struct bio *current_bio;/* current_bio accepting new data */
 183
 184        atomic_t pending_stripe;/* how many stripes not flushed to raid */
 185        u64 seq;                /* seq number of the metablock */
 186        sector_t log_start;     /* where the io_unit starts */
 187        sector_t log_end;       /* where the io_unit ends */
 188        struct list_head log_sibling; /* log->running_ios */
 189        struct list_head stripe_list; /* stripes added to the io_unit */
 190
 191        int state;
 192        bool need_split_bio;
 193        struct bio *split_bio;
 194
 195        unsigned int has_flush:1;      /* include flush request */
 196        unsigned int has_fua:1;        /* include fua request */
 197        unsigned int has_null_flush:1; /* include empty flush request */
 198        /*
  199         * the io isn't submitted yet; a flush/fua request can only be submitted
  200         * once this io is the first IO in the running_ios list
 201         */
 202        unsigned int io_deferred:1;
 203
 204        struct bio_list flush_barriers;   /* size == 0 flush bios */
 205};
 206
 207/* r5l_io_unit state */
 208enum r5l_io_unit_state {
 209        IO_UNIT_RUNNING = 0,    /* accepting new IO */
  210        IO_UNIT_IO_START = 1,   /* io_unit bio started writing to the log,
  211                                 * no longer accepting new bios */
  212        IO_UNIT_IO_END = 2,     /* io_unit bio finished writing to the log */
  213        IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to raid */
 214};
 215
 216bool r5c_is_writeback(struct r5l_log *log)
 217{
 218        return (log != NULL &&
 219                log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
 220}
 221
 222static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
 223{
 224        start += inc;
 225        if (start >= log->device_size)
 226                start = start - log->device_size;
 227        return start;
 228}
 229
 230static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
 231                                  sector_t end)
 232{
 233        if (end >= start)
 234                return end - start;
 235        else
 236                return end + log->device_size - start;
 237}
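
/*
 * Illustrative example (not part of the driver): the log is used as a ring.
 * With log->device_size = 8192 sectors, r5l_ring_add(log, 8184, 16) wraps
 * around to sector 8, and r5l_ring_distance(log, 8184, 8) is
 * 8 + 8192 - 8184 = 16 sectors, i.e. the distance measured forward around
 * the ring from start to end.
 */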
 238
 239static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
 240{
 241        sector_t used_size;
 242
 243        used_size = r5l_ring_distance(log, log->last_checkpoint,
 244                                        log->log_start);
 245
 246        return log->device_size > used_size + size;
 247}
 248
 249static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 250                                    enum r5l_io_unit_state state)
 251{
 252        if (WARN_ON(io->state >= state))
 253                return;
 254        io->state = state;
 255}
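
/*
 * Note: an io_unit only ever moves forward through the r5l_io_unit_state
 * values (IO_UNIT_RUNNING -> IO_UNIT_IO_START -> IO_UNIT_IO_END ->
 * IO_UNIT_STRIPE_END); __r5l_set_io_unit_state() warns and bails out if
 * asked to move to the same or an earlier state.
 */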
 256
 257static void
 258r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
 259                              struct bio_list *return_bi)
 260{
 261        struct bio *wbi, *wbi2;
 262
 263        wbi = dev->written;
 264        dev->written = NULL;
 265        while (wbi && wbi->bi_iter.bi_sector <
 266               dev->sector + STRIPE_SECTORS) {
 267                wbi2 = r5_next_bio(wbi, dev->sector);
 268                if (!raid5_dec_bi_active_stripes(wbi)) {
 269                        md_write_end(conf->mddev);
 270                        bio_list_add(return_bi, wbi);
 271                }
 272                wbi = wbi2;
 273        }
 274}
 275
 276void r5c_handle_cached_data_endio(struct r5conf *conf,
 277          struct stripe_head *sh, int disks, struct bio_list *return_bi)
 278{
 279        int i;
 280
 281        for (i = sh->disks; i--; ) {
 282                if (sh->dev[i].written) {
 283                        set_bit(R5_UPTODATE, &sh->dev[i].flags);
 284                        r5c_return_dev_pending_writes(conf, &sh->dev[i],
 285                                                      return_bi);
 286                        bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 287                                        STRIPE_SECTORS,
 288                                        !test_bit(STRIPE_DEGRADED, &sh->state),
 289                                        0);
 290                }
 291        }
 292}
 293
 294/* Check whether we should flush some stripes to free up stripe cache */
 295void r5c_check_stripe_cache_usage(struct r5conf *conf)
 296{
 297        int total_cached;
 298
 299        if (!r5c_is_writeback(conf->log))
 300                return;
 301
 302        total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
 303                atomic_read(&conf->r5c_cached_full_stripes);
 304
 305        /*
 306         * The following condition is true for either of the following:
 307         *   - stripe cache pressure high:
 308         *          total_cached > 3/4 min_nr_stripes ||
 309         *          empty_inactive_list_nr > 0
 310         *   - stripe cache pressure moderate:
 311         *          total_cached > 1/2 min_nr_stripes
 312         */
 313        if (total_cached > conf->min_nr_stripes * 1 / 2 ||
 314            atomic_read(&conf->empty_inactive_list_nr) > 0)
 315                r5l_wake_reclaim(conf->log, 0);
 316}
 317
 318/*
 319 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
 320 * stripes in the cache
 321 */
 322void r5c_check_cached_full_stripe(struct r5conf *conf)
 323{
 324        if (!r5c_is_writeback(conf->log))
 325                return;
 326
 327        /*
  328         * wake up reclaim once cached full stripes reach the smaller of
  329         * R5C_FULL_STRIPE_FLUSH_BATCH and one chunk's worth (chunk size / 4k).
 330         */
 331        if (atomic_read(&conf->r5c_cached_full_stripes) >=
 332            min(R5C_FULL_STRIPE_FLUSH_BATCH,
 333                conf->chunk_sectors >> STRIPE_SHIFT))
 334                r5l_wake_reclaim(conf->log, 0);
 335}
 336
 337/*
 338 * Total log space (in sectors) needed to flush all data in cache
 339 *
 340 * Currently, writing-out phase automatically includes all pending writes
 341 * to the same sector. So the reclaim of each stripe takes up to
 342 * (conf->raid_disks + 1) pages of log space.
 343 *
 344 * To totally avoid deadlock due to log space, the code reserves
 345 * (conf->raid_disks + 1) pages for each stripe in cache, which is not
 346 * necessary in most cases.
 347 *
 348 * To improve this, we will need writing-out phase to be able to NOT include
 349 * pending writes, which will reduce the requirement to
 350 * (conf->max_degraded + 1) pages per stripe in cache.
 351 */
 352static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
 353{
 354        struct r5l_log *log = conf->log;
 355
 356        if (!r5c_is_writeback(log))
 357                return 0;
 358
 359        return BLOCK_SECTORS * (conf->raid_disks + 1) *
 360                atomic_read(&log->stripe_in_journal_count);
 361}
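
/*
 * Illustrative example (not part of the driver): with conf->raid_disks = 4
 * and 100 stripes in the journal, the reservation computed above is
 * 8 * (4 + 1) * 100 = 4000 sectors (about 2MiB) of log space.
 */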
 362
 363/*
 364 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
 365 *
 366 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
 367 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
 368 * device is less than 2x of reclaim_required_space.
 369 */
 370static inline void r5c_update_log_state(struct r5l_log *log)
 371{
 372        struct r5conf *conf = log->rdev->mddev->private;
 373        sector_t free_space;
 374        sector_t reclaim_space;
 375        bool wake_reclaim = false;
 376
 377        if (!r5c_is_writeback(log))
 378                return;
 379
 380        free_space = r5l_ring_distance(log, log->log_start,
 381                                       log->last_checkpoint);
 382        reclaim_space = r5c_log_required_to_flush_cache(conf);
 383        if (free_space < 2 * reclaim_space)
 384                set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
 385        else {
 386                if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
 387                        wake_reclaim = true;
 388                clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
 389        }
 390        if (free_space < 3 * reclaim_space)
 391                set_bit(R5C_LOG_TIGHT, &conf->cache_state);
 392        else
 393                clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
 394
 395        if (wake_reclaim)
 396                r5l_wake_reclaim(log, 0);
 397}
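
/*
 * Illustrative example (not part of the driver): continuing the numbers
 * above, if r5c_log_required_to_flush_cache() returns 4000 sectors, then
 * R5C_LOG_CRITICAL is set while free_space < 8000 sectors and R5C_LOG_TIGHT
 * is set while free_space < 12000 sectors.
 */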
 398
 399/*
 400 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 401 * This function should only be called in write-back mode.
 402 */
 403void r5c_make_stripe_write_out(struct stripe_head *sh)
 404{
 405        struct r5conf *conf = sh->raid_conf;
 406        struct r5l_log *log = conf->log;
 407
 408        BUG_ON(!r5c_is_writeback(log));
 409
 410        WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
 411        clear_bit(STRIPE_R5C_CACHING, &sh->state);
 412
 413        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 414                atomic_inc(&conf->preread_active_stripes);
 415
 416        if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
 417                BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
 418                atomic_dec(&conf->r5c_cached_partial_stripes);
 419        }
 420
 421        if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
 422                BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
 423                atomic_dec(&conf->r5c_cached_full_stripes);
 424        }
 425}
 426
 427static void r5c_handle_data_cached(struct stripe_head *sh)
 428{
 429        int i;
 430
 431        for (i = sh->disks; i--; )
 432                if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 433                        set_bit(R5_InJournal, &sh->dev[i].flags);
 434                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
 435                }
 436        clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
 437}
 438
 439/*
 440 * this journal write must contain full parity,
 441 * it may also contain some data pages
 442 */
 443static void r5c_handle_parity_cached(struct stripe_head *sh)
 444{
 445        int i;
 446
 447        for (i = sh->disks; i--; )
 448                if (test_bit(R5_InJournal, &sh->dev[i].flags))
 449                        set_bit(R5_Wantwrite, &sh->dev[i].flags);
 450}
 451
 452/*
 453 * Setting proper flags after writing (or flushing) data and/or parity to the
 454 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 455 */
 456static void r5c_finish_cache_stripe(struct stripe_head *sh)
 457{
 458        struct r5l_log *log = sh->raid_conf->log;
 459
 460        if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
 461                BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
 462                /*
 463                 * Set R5_InJournal for parity dev[pd_idx]. This means
  464                 * all data AND parity are in the journal. For RAID 6, it is
 465                 * NOT necessary to set the flag for dev[qd_idx], as the
 466                 * two parities are written out together.
 467                 */
 468                set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
 469        } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
 470                r5c_handle_data_cached(sh);
 471        } else {
 472                r5c_handle_parity_cached(sh);
 473                set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
 474        }
 475}
 476
 477static void r5l_io_run_stripes(struct r5l_io_unit *io)
 478{
 479        struct stripe_head *sh, *next;
 480
 481        list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
 482                list_del_init(&sh->log_list);
 483
 484                r5c_finish_cache_stripe(sh);
 485
 486                set_bit(STRIPE_HANDLE, &sh->state);
 487                raid5_release_stripe(sh);
 488        }
 489}
 490
 491static void r5l_log_run_stripes(struct r5l_log *log)
 492{
 493        struct r5l_io_unit *io, *next;
 494
 495        assert_spin_locked(&log->io_list_lock);
 496
 497        list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
 498                /* don't change list order */
 499                if (io->state < IO_UNIT_IO_END)
 500                        break;
 501
 502                list_move_tail(&io->log_sibling, &log->finished_ios);
 503                r5l_io_run_stripes(io);
 504        }
 505}
 506
 507static void r5l_move_to_end_ios(struct r5l_log *log)
 508{
 509        struct r5l_io_unit *io, *next;
 510
 511        assert_spin_locked(&log->io_list_lock);
 512
 513        list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
 514                /* don't change list order */
 515                if (io->state < IO_UNIT_IO_END)
 516                        break;
 517                list_move_tail(&io->log_sibling, &log->io_end_ios);
 518        }
 519}
 520
 521static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
 522static void r5l_log_endio(struct bio *bio)
 523{
 524        struct r5l_io_unit *io = bio->bi_private;
 525        struct r5l_io_unit *io_deferred;
 526        struct r5l_log *log = io->log;
 527        unsigned long flags;
 528
 529        if (bio->bi_error)
 530                md_error(log->rdev->mddev, log->rdev);
 531
 532        bio_put(bio);
 533        mempool_free(io->meta_page, log->meta_pool);
 534
 535        spin_lock_irqsave(&log->io_list_lock, flags);
 536        __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
 537        if (log->need_cache_flush)
 538                r5l_move_to_end_ios(log);
 539        else
 540                r5l_log_run_stripes(log);
 541        if (!list_empty(&log->running_ios)) {
 542                /*
 543                 * FLUSH/FUA io_unit is deferred because of ordering, now we
 544                 * can dispatch it
 545                 */
 546                io_deferred = list_first_entry(&log->running_ios,
 547                                               struct r5l_io_unit, log_sibling);
 548                if (io_deferred->io_deferred)
 549                        schedule_work(&log->deferred_io_work);
 550        }
 551
 552        spin_unlock_irqrestore(&log->io_list_lock, flags);
 553
 554        if (log->need_cache_flush)
 555                md_wakeup_thread(log->rdev->mddev->thread);
 556
 557        if (io->has_null_flush) {
 558                struct bio *bi;
 559
 560                WARN_ON(bio_list_empty(&io->flush_barriers));
 561                while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
 562                        bio_endio(bi);
 563                        atomic_dec(&io->pending_stripe);
 564                }
 565                if (atomic_read(&io->pending_stripe) == 0)
 566                        __r5l_stripe_write_finished(io);
 567        }
 568}
 569
 570static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
 571{
 572        unsigned long flags;
 573
 574        spin_lock_irqsave(&log->io_list_lock, flags);
 575        __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
 576        spin_unlock_irqrestore(&log->io_list_lock, flags);
 577
 578        if (io->has_flush)
 579                io->current_bio->bi_opf |= REQ_PREFLUSH;
 580        if (io->has_fua)
 581                io->current_bio->bi_opf |= REQ_FUA;
 582        submit_bio(io->current_bio);
 583
 584        if (!io->split_bio)
 585                return;
 586
 587        if (io->has_flush)
 588                io->split_bio->bi_opf |= REQ_PREFLUSH;
 589        if (io->has_fua)
 590                io->split_bio->bi_opf |= REQ_FUA;
 591        submit_bio(io->split_bio);
 592}
 593
 594/* deferred io_unit will be dispatched here */
 595static void r5l_submit_io_async(struct work_struct *work)
 596{
 597        struct r5l_log *log = container_of(work, struct r5l_log,
 598                                           deferred_io_work);
 599        struct r5l_io_unit *io = NULL;
 600        unsigned long flags;
 601
 602        spin_lock_irqsave(&log->io_list_lock, flags);
 603        if (!list_empty(&log->running_ios)) {
 604                io = list_first_entry(&log->running_ios, struct r5l_io_unit,
 605                                      log_sibling);
 606                if (!io->io_deferred)
 607                        io = NULL;
 608                else
 609                        io->io_deferred = 0;
 610        }
 611        spin_unlock_irqrestore(&log->io_list_lock, flags);
 612        if (io)
 613                r5l_do_submit_io(log, io);
 614}
 615
 616static void r5c_disable_writeback_async(struct work_struct *work)
 617{
 618        struct r5l_log *log = container_of(work, struct r5l_log,
 619                                           disable_writeback_work);
 620        struct mddev *mddev = log->rdev->mddev;
 621
 622        if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
 623                return;
 624        pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
 625                mdname(mddev));
 626        mddev_suspend(mddev);
 627        log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
 628        mddev_resume(mddev);
 629}
 630
 631static void r5l_submit_current_io(struct r5l_log *log)
 632{
 633        struct r5l_io_unit *io = log->current_io;
 634        struct bio *bio;
 635        struct r5l_meta_block *block;
 636        unsigned long flags;
 637        u32 crc;
 638        bool do_submit = true;
 639
 640        if (!io)
 641                return;
 642
 643        block = page_address(io->meta_page);
 644        block->meta_size = cpu_to_le32(io->meta_offset);
 645        crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
 646        block->checksum = cpu_to_le32(crc);
 647        bio = io->current_bio;
 648
 649        log->current_io = NULL;
 650        spin_lock_irqsave(&log->io_list_lock, flags);
 651        if (io->has_flush || io->has_fua) {
 652                if (io != list_first_entry(&log->running_ios,
 653                                           struct r5l_io_unit, log_sibling)) {
 654                        io->io_deferred = 1;
 655                        do_submit = false;
 656                }
 657        }
 658        spin_unlock_irqrestore(&log->io_list_lock, flags);
 659        if (do_submit)
 660                r5l_do_submit_io(log, io);
 661}
 662
 663static struct bio *r5l_bio_alloc(struct r5l_log *log)
 664{
 665        struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
 666
 667        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 668        bio->bi_bdev = log->rdev->bdev;
 669        bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
 670
 671        return bio;
 672}
 673
 674static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
 675{
 676        log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
 677
 678        r5c_update_log_state(log);
 679        /*
  680         * If we filled up the log device, start from the beginning again,
 681         * which will require a new bio.
 682         *
  683         * Note: for this to work properly the log size needs to be a multiple
 684         * of BLOCK_SECTORS.
 685         */
 686        if (log->log_start == 0)
 687                io->need_split_bio = true;
 688
 689        io->log_end = log->log_start;
 690}
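
/*
 * Illustrative example (not part of the driver): if log->log_start was at
 * the last block of the device, the r5l_ring_add() above wraps it to 0 and
 * need_split_bio is set, so the next payload page is written through a new
 * bio chained at the beginning of the device (see r5l_append_payload_page()).
 */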
 691
 692static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 693{
 694        struct r5l_io_unit *io;
 695        struct r5l_meta_block *block;
 696
 697        io = mempool_alloc(log->io_pool, GFP_ATOMIC);
 698        if (!io)
 699                return NULL;
 700        memset(io, 0, sizeof(*io));
 701
 702        io->log = log;
 703        INIT_LIST_HEAD(&io->log_sibling);
 704        INIT_LIST_HEAD(&io->stripe_list);
 705        bio_list_init(&io->flush_barriers);
 706        io->state = IO_UNIT_RUNNING;
 707
 708        io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
 709        block = page_address(io->meta_page);
 710        clear_page(block);
 711        block->magic = cpu_to_le32(R5LOG_MAGIC);
 712        block->version = R5LOG_VERSION;
 713        block->seq = cpu_to_le64(log->seq);
 714        block->position = cpu_to_le64(log->log_start);
 715
 716        io->log_start = log->log_start;
 717        io->meta_offset = sizeof(struct r5l_meta_block);
 718        io->seq = log->seq++;
 719
 720        io->current_bio = r5l_bio_alloc(log);
 721        io->current_bio->bi_end_io = r5l_log_endio;
 722        io->current_bio->bi_private = io;
 723        bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
 724
 725        r5_reserve_log_entry(log, io);
 726
 727        spin_lock_irq(&log->io_list_lock);
 728        list_add_tail(&io->log_sibling, &log->running_ios);
 729        spin_unlock_irq(&log->io_list_lock);
 730
 731        return io;
 732}
 733
 734static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
 735{
 736        if (log->current_io &&
 737            log->current_io->meta_offset + payload_size > PAGE_SIZE)
 738                r5l_submit_current_io(log);
 739
 740        if (!log->current_io) {
 741                log->current_io = r5l_new_meta(log);
 742                if (!log->current_io)
 743                        return -ENOMEM;
 744        }
 745
 746        return 0;
 747}
 748
 749static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
 750                                    sector_t location,
 751                                    u32 checksum1, u32 checksum2,
 752                                    bool checksum2_valid)
 753{
 754        struct r5l_io_unit *io = log->current_io;
 755        struct r5l_payload_data_parity *payload;
 756
 757        payload = page_address(io->meta_page) + io->meta_offset;
 758        payload->header.type = cpu_to_le16(type);
 759        payload->header.flags = cpu_to_le16(0);
 760        payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
 761                                    (PAGE_SHIFT - 9));
 762        payload->location = cpu_to_le64(location);
 763        payload->checksum[0] = cpu_to_le32(checksum1);
 764        if (checksum2_valid)
 765                payload->checksum[1] = cpu_to_le32(checksum2);
 766
 767        io->meta_offset += sizeof(struct r5l_payload_data_parity) +
 768                sizeof(__le32) * (1 + !!checksum2_valid);
 769}
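
/*
 * Illustrative example (not part of the driver): a data payload carries one
 * checksum, so meta_offset above grows by
 * sizeof(struct r5l_payload_data_parity) + 4 bytes and payload->size is
 * 1 << (PAGE_SHIFT - 9) = 8 sectors; a RAID6 parity payload carries two
 * checksums, so meta_offset grows by 4 more bytes and payload->size is
 * 16 sectors (two 4k pages).
 */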
 770
 771static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 772{
 773        struct r5l_io_unit *io = log->current_io;
 774
 775        if (io->need_split_bio) {
 776                BUG_ON(io->split_bio);
 777                io->split_bio = io->current_bio;
 778                io->current_bio = r5l_bio_alloc(log);
 779                bio_chain(io->current_bio, io->split_bio);
 780                io->need_split_bio = false;
 781        }
 782
 783        if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
 784                BUG();
 785
 786        r5_reserve_log_entry(log, io);
 787}
 788
 789static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 790                           int data_pages, int parity_pages)
 791{
 792        int i;
 793        int meta_size;
 794        int ret;
 795        struct r5l_io_unit *io;
 796
 797        meta_size =
 798                ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
 799                 * data_pages) +
 800                sizeof(struct r5l_payload_data_parity) +
 801                sizeof(__le32) * parity_pages;
 802
 803        ret = r5l_get_meta(log, meta_size);
 804        if (ret)
 805                return ret;
 806
 807        io = log->current_io;
 808
 809        if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
 810                io->has_flush = 1;
 811
 812        for (i = 0; i < sh->disks; i++) {
 813                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
 814                    test_bit(R5_InJournal, &sh->dev[i].flags))
 815                        continue;
 816                if (i == sh->pd_idx || i == sh->qd_idx)
 817                        continue;
 818                if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
 819                    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
 820                        io->has_fua = 1;
 821                        /*
 822                         * we need to flush journal to make sure recovery can
 823                         * reach the data with fua flag
 824                         */
 825                        io->has_flush = 1;
 826                }
 827                r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
 828                                        raid5_compute_blocknr(sh, i, 0),
 829                                        sh->dev[i].log_checksum, 0, false);
 830                r5l_append_payload_page(log, sh->dev[i].page);
 831        }
 832
 833        if (parity_pages == 2) {
 834                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 835                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
 836                                        sh->dev[sh->qd_idx].log_checksum, true);
 837                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
 838                r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
 839        } else if (parity_pages == 1) {
 840                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 841                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
 842                                        0, false);
 843                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
 844        } else  /* Just writing data, not parity, in caching phase */
 845                BUG_ON(parity_pages != 0);
 846
 847        list_add_tail(&sh->log_list, &io->stripe_list);
 848        atomic_inc(&io->pending_stripe);
 849        sh->log_io = io;
 850
 851        if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
 852                return 0;
 853
 854        if (sh->log_start == MaxSector) {
 855                BUG_ON(!list_empty(&sh->r5c));
 856                sh->log_start = io->log_start;
 857                spin_lock_irq(&log->stripe_in_journal_lock);
 858                list_add_tail(&sh->r5c,
 859                              &log->stripe_in_journal_list);
 860                spin_unlock_irq(&log->stripe_in_journal_lock);
 861                atomic_inc(&log->stripe_in_journal_count);
 862        }
 863        return 0;
 864}
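
/*
 * Illustrative example (not part of the driver): for a RAID6 full-stripe
 * write with data_pages = 4 and parity_pages = 2, meta_size above is
 * 4 * (sizeof(struct r5l_payload_data_parity) + 4) +
 * sizeof(struct r5l_payload_data_parity) + 8 bytes: five payload headers
 * (one per data page plus one shared by both parity pages) and six
 * checksums, which r5l_get_meta() guarantees fit in the one-page meta block.
 */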
 865
 866/* add stripe to no_space_stripes, and then wake up reclaim */
 867static inline void r5l_add_no_space_stripe(struct r5l_log *log,
 868                                           struct stripe_head *sh)
 869{
 870        spin_lock(&log->no_space_stripes_lock);
 871        list_add_tail(&sh->log_list, &log->no_space_stripes);
 872        spin_unlock(&log->no_space_stripes_lock);
 873}
 874
 875/*
 876 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 877 * data from log to raid disks), so we shouldn't wait for reclaim here
 878 */
 879int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 880{
 881        struct r5conf *conf = sh->raid_conf;
 882        int write_disks = 0;
 883        int data_pages, parity_pages;
 884        int reserve;
 885        int i;
 886        int ret = 0;
 887        bool wake_reclaim = false;
 888
 889        if (!log)
 890                return -EAGAIN;
 891        /* Don't support stripe batch */
 892        if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
 893            test_bit(STRIPE_SYNCING, &sh->state)) {
 894                /* the stripe is written to log, we start writing it to raid */
 895                clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
 896                return -EAGAIN;
 897        }
 898
 899        WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
 900
 901        for (i = 0; i < sh->disks; i++) {
 902                void *addr;
 903
 904                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
 905                    test_bit(R5_InJournal, &sh->dev[i].flags))
 906                        continue;
 907
 908                write_disks++;
 909                /* checksum is already calculated in last run */
 910                if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
 911                        continue;
 912                addr = kmap_atomic(sh->dev[i].page);
 913                sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
 914                                                    addr, PAGE_SIZE);
 915                kunmap_atomic(addr);
 916        }
 917        parity_pages = 1 + !!(sh->qd_idx >= 0);
 918        data_pages = write_disks - parity_pages;
 919
 920        set_bit(STRIPE_LOG_TRAPPED, &sh->state);
 921        /*
 922         * The stripe must enter state machine again to finish the write, so
 923         * don't delay.
 924         */
 925        clear_bit(STRIPE_DELAYED, &sh->state);
 926        atomic_inc(&sh->count);
 927
 928        mutex_lock(&log->io_mutex);
 929        /* meta + data */
 930        reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
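        /*
         * e.g. a full-stripe write on a 6-disk array has write_disks = 6, so
         * reserve = (1 + 6) << (PAGE_SHIFT - 9) = 56 sectors: one meta block
         * plus six 4k data/parity pages.
         */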
 931
 932        if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
 933                if (!r5l_has_free_space(log, reserve)) {
 934                        r5l_add_no_space_stripe(log, sh);
 935                        wake_reclaim = true;
 936                } else {
 937                        ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
 938                        if (ret) {
 939                                spin_lock_irq(&log->io_list_lock);
 940                                list_add_tail(&sh->log_list,
 941                                              &log->no_mem_stripes);
 942                                spin_unlock_irq(&log->io_list_lock);
 943                        }
 944                }
 945        } else {  /* R5C_JOURNAL_MODE_WRITE_BACK */
 946                /*
 947                 * log space critical, do not process stripes that are
 948                 * not in cache yet (sh->log_start == MaxSector).
 949                 */
 950                if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
 951                    sh->log_start == MaxSector) {
 952                        r5l_add_no_space_stripe(log, sh);
 953                        wake_reclaim = true;
 954                        reserve = 0;
 955                } else if (!r5l_has_free_space(log, reserve)) {
 956                        if (sh->log_start == log->last_checkpoint)
 957                                BUG();
 958                        else
 959                                r5l_add_no_space_stripe(log, sh);
 960                } else {
 961                        ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
 962                        if (ret) {
 963                                spin_lock_irq(&log->io_list_lock);
 964                                list_add_tail(&sh->log_list,
 965                                              &log->no_mem_stripes);
 966                                spin_unlock_irq(&log->io_list_lock);
 967                        }
 968                }
 969        }
 970
 971        mutex_unlock(&log->io_mutex);
 972        if (wake_reclaim)
 973                r5l_wake_reclaim(log, reserve);
 974        return 0;
 975}
 976
 977void r5l_write_stripe_run(struct r5l_log *log)
 978{
 979        if (!log)
 980                return;
 981        mutex_lock(&log->io_mutex);
 982        r5l_submit_current_io(log);
 983        mutex_unlock(&log->io_mutex);
 984}
 985
 986int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 987{
 988        if (!log)
 989                return -ENODEV;
 990
 991        if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
 992                /*
 993                 * in write through (journal only)
 994                 * we flush log disk cache first, then write stripe data to
 995                 * raid disks. So if bio is finished, the log disk cache is
  996                 * flushed already. Recovery guarantees we can recover
  997                 * the bio from the log disk, so we don't need to flush again
 998                 */
 999                if (bio->bi_iter.bi_size == 0) {
1000                        bio_endio(bio);
1001                        return 0;
1002                }
1003                bio->bi_opf &= ~REQ_PREFLUSH;
1004        } else {
1005                /* write back (with cache) */
1006                if (bio->bi_iter.bi_size == 0) {
1007                        mutex_lock(&log->io_mutex);
1008                        r5l_get_meta(log, 0);
1009                        bio_list_add(&log->current_io->flush_barriers, bio);
1010                        log->current_io->has_flush = 1;
1011                        log->current_io->has_null_flush = 1;
1012                        atomic_inc(&log->current_io->pending_stripe);
1013                        r5l_submit_current_io(log);
1014                        mutex_unlock(&log->io_mutex);
1015                        return 0;
1016                }
1017        }
1018        return -EAGAIN;
1019}
1020
1021/* This will run after log space is reclaimed */
1022static void r5l_run_no_space_stripes(struct r5l_log *log)
1023{
1024        struct stripe_head *sh;
1025
1026        spin_lock(&log->no_space_stripes_lock);
1027        while (!list_empty(&log->no_space_stripes)) {
1028                sh = list_first_entry(&log->no_space_stripes,
1029                                      struct stripe_head, log_list);
1030                list_del_init(&sh->log_list);
1031                set_bit(STRIPE_HANDLE, &sh->state);
1032                raid5_release_stripe(sh);
1033        }
1034        spin_unlock(&log->no_space_stripes_lock);
1035}
1036
1037/*
1038 * calculate new last_checkpoint
1039 * for write through mode, returns log->next_checkpoint
1040 * for write back, returns log_start of first sh in stripe_in_journal_list
1041 */
1042static sector_t r5c_calculate_new_cp(struct r5conf *conf)
1043{
1044        struct stripe_head *sh;
1045        struct r5l_log *log = conf->log;
1046        sector_t new_cp;
1047        unsigned long flags;
1048
1049        if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1050                return log->next_checkpoint;
1051
1052        spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1053        if (list_empty(&conf->log->stripe_in_journal_list)) {
1054                /* all stripes flushed */
1055                spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1056                return log->next_checkpoint;
1057        }
1058        sh = list_first_entry(&conf->log->stripe_in_journal_list,
1059                              struct stripe_head, r5c);
1060        new_cp = sh->log_start;
1061        spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1062        return new_cp;
1063}
1064
1065static sector_t r5l_reclaimable_space(struct r5l_log *log)
1066{
1067        struct r5conf *conf = log->rdev->mddev->private;
1068
1069        return r5l_ring_distance(log, log->last_checkpoint,
1070                                 r5c_calculate_new_cp(conf));
1071}
1072
1073static void r5l_run_no_mem_stripe(struct r5l_log *log)
1074{
1075        struct stripe_head *sh;
1076
1077        assert_spin_locked(&log->io_list_lock);
1078
1079        if (!list_empty(&log->no_mem_stripes)) {
1080                sh = list_first_entry(&log->no_mem_stripes,
1081                                      struct stripe_head, log_list);
1082                list_del_init(&sh->log_list);
1083                set_bit(STRIPE_HANDLE, &sh->state);
1084                raid5_release_stripe(sh);
1085        }
1086}
1087
1088static bool r5l_complete_finished_ios(struct r5l_log *log)
1089{
1090        struct r5l_io_unit *io, *next;
1091        bool found = false;
1092
1093        assert_spin_locked(&log->io_list_lock);
1094
1095        list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
1096                /* don't change list order */
1097                if (io->state < IO_UNIT_STRIPE_END)
1098                        break;
1099
1100                log->next_checkpoint = io->log_start;
1101
1102                list_del(&io->log_sibling);
1103                mempool_free(io, log->io_pool);
1104                r5l_run_no_mem_stripe(log);
1105
1106                found = true;
1107        }
1108
1109        return found;
1110}
1111
1112static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
1113{
1114        struct r5l_log *log = io->log;
1115        struct r5conf *conf = log->rdev->mddev->private;
1116        unsigned long flags;
1117
1118        spin_lock_irqsave(&log->io_list_lock, flags);
1119        __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
1120
1121        if (!r5l_complete_finished_ios(log)) {
1122                spin_unlock_irqrestore(&log->io_list_lock, flags);
1123                return;
1124        }
1125
1126        if (r5l_reclaimable_space(log) > log->max_free_space ||
1127            test_bit(R5C_LOG_TIGHT, &conf->cache_state))
1128                r5l_wake_reclaim(log, 0);
1129
1130        spin_unlock_irqrestore(&log->io_list_lock, flags);
1131        wake_up(&log->iounit_wait);
1132}
1133
1134void r5l_stripe_write_finished(struct stripe_head *sh)
1135{
1136        struct r5l_io_unit *io;
1137
1138        io = sh->log_io;
1139        sh->log_io = NULL;
1140
1141        if (io && atomic_dec_and_test(&io->pending_stripe))
1142                __r5l_stripe_write_finished(io);
1143}
1144
1145static void r5l_log_flush_endio(struct bio *bio)
1146{
1147        struct r5l_log *log = container_of(bio, struct r5l_log,
1148                flush_bio);
1149        unsigned long flags;
1150        struct r5l_io_unit *io;
1151
1152        if (bio->bi_error)
1153                md_error(log->rdev->mddev, log->rdev);
1154
1155        spin_lock_irqsave(&log->io_list_lock, flags);
1156        list_for_each_entry(io, &log->flushing_ios, log_sibling)
1157                r5l_io_run_stripes(io);
1158        list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1159        spin_unlock_irqrestore(&log->io_list_lock, flags);
1160}
1161
1162/*
 1163 * Start dispatching IO to the raid disks.
 1164 * The log consists of io_units (each with a meta block). One situation we
 1165 * want to avoid: a broken meta block in the middle of the log keeps recovery
 1166 * from finding the meta blocks after it. So if an operation requires a meta
 1167 * block to be persistent in the log, the meta blocks before it must be
 1168 * persistent in the log too. A case is:
 1169 *
 1170 * stripe data/parity is in the log and we start writing the stripe to the
 1171 * raid disks; the stripe data/parity must be persistent in the log before we
 1172 * do the write to the raid disks.
 1173 *
 1174 * The solution is to strictly maintain io_unit list order: we only write the
 1175 * stripes of an io_unit to raid when it is the first one whose data/parity is in the log.
1175 */
1176void r5l_flush_stripe_to_raid(struct r5l_log *log)
1177{
1178        bool do_flush;
1179
1180        if (!log || !log->need_cache_flush)
1181                return;
1182
1183        spin_lock_irq(&log->io_list_lock);
1184        /* flush bio is running */
1185        if (!list_empty(&log->flushing_ios)) {
1186                spin_unlock_irq(&log->io_list_lock);
1187                return;
1188        }
1189        list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1190        do_flush = !list_empty(&log->flushing_ios);
1191        spin_unlock_irq(&log->io_list_lock);
1192
1193        if (!do_flush)
1194                return;
1195        bio_reset(&log->flush_bio);
1196        log->flush_bio.bi_bdev = log->rdev->bdev;
1197        log->flush_bio.bi_end_io = r5l_log_flush_endio;
1198        log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
1199        submit_bio(&log->flush_bio);
1200}
1201
1202static void r5l_write_super(struct r5l_log *log, sector_t cp);
1203static void r5l_write_super_and_discard_space(struct r5l_log *log,
1204        sector_t end)
1205{
1206        struct block_device *bdev = log->rdev->bdev;
1207        struct mddev *mddev;
1208
1209        r5l_write_super(log, end);
1210
1211        if (!blk_queue_discard(bdev_get_queue(bdev)))
1212                return;
1213
1214        mddev = log->rdev->mddev;
1215        /*
 1216         * Discard could zero data, so before discard we must make sure the
 1217         * superblock is updated to the new log tail. Updating the superblock
 1218         * (either by calling md_update_sb() directly or via the md thread)
 1219         * requires the reconfig mutex. On the other hand, raid5_quiesce() is
 1220         * called with the reconfig_mutex held. The first step of raid5_quiesce()
 1221         * is waiting for all IO to finish, hence waiting for the reclaim thread,
 1222         * while the reclaim thread is calling this function and waiting for the
 1223         * reconfig mutex. So there is a deadlock. We work around this with a
 1224         * trylock. FIXME: we could miss a discard if we can't take the reconfig mutex
1225         */
1226        set_mask_bits(&mddev->sb_flags, 0,
1227                BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1228        if (!mddev_trylock(mddev))
1229                return;
1230        md_update_sb(mddev, 1);
1231        mddev_unlock(mddev);
1232
1233        /* discard IO error really doesn't matter, ignore it */
1234        if (log->last_checkpoint < end) {
1235                blkdev_issue_discard(bdev,
1236                                log->last_checkpoint + log->rdev->data_offset,
1237                                end - log->last_checkpoint, GFP_NOIO, 0);
1238        } else {
1239                blkdev_issue_discard(bdev,
1240                                log->last_checkpoint + log->rdev->data_offset,
1241                                log->device_size - log->last_checkpoint,
1242                                GFP_NOIO, 0);
1243                blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1244                                GFP_NOIO, 0);
1245        }
1246}
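
/*
 * Illustrative example (not part of the driver): with device_size = 8192
 * sectors, last_checkpoint = 8000 and a new tail (end) of 100, the log has
 * wrapped, so the function above issues two discards: sectors 8000..8191 of
 * the log area and then sectors 0..99, both offset by rdev->data_offset.
 */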
1247
1248/*
1249 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
1250 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
1251 *
1252 * must hold conf->device_lock
1253 */
1254static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1255{
1256        BUG_ON(list_empty(&sh->lru));
1257        BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1258        BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1259
1260        /*
1261         * The stripe is not ON_RELEASE_LIST, so it is safe to call
1262         * raid5_release_stripe() while holding conf->device_lock
1263         */
1264        BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1265        assert_spin_locked(&conf->device_lock);
1266
1267        list_del_init(&sh->lru);
1268        atomic_inc(&sh->count);
1269
1270        set_bit(STRIPE_HANDLE, &sh->state);
1271        atomic_inc(&conf->active_stripes);
1272        r5c_make_stripe_write_out(sh);
1273
1274        raid5_release_stripe(sh);
1275}
1276
1277/*
1278 * if num == 0, flush all full stripes
 1279 * if num > 0, flush all full stripes. If fewer than num full stripes are
 1280 *             flushed, flush some partial stripes until a total of num stripes
 1281 *             are flushed or there are no more cached stripes.
1282 */
1283void r5c_flush_cache(struct r5conf *conf, int num)
1284{
1285        int count;
1286        struct stripe_head *sh, *next;
1287
1288        assert_spin_locked(&conf->device_lock);
1289        if (!conf->log)
1290                return;
1291
1292        count = 0;
1293        list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1294                r5c_flush_stripe(conf, sh);
1295                count++;
1296        }
1297
1298        if (count >= num)
1299                return;
1300        list_for_each_entry_safe(sh, next,
1301                                 &conf->r5c_partial_stripe_list, lru) {
1302                r5c_flush_stripe(conf, sh);
1303                if (++count >= num)
1304                        break;
1305        }
1306}
1307
1308static void r5c_do_reclaim(struct r5conf *conf)
1309{
1310        struct r5l_log *log = conf->log;
1311        struct stripe_head *sh;
1312        int count = 0;
1313        unsigned long flags;
1314        int total_cached;
1315        int stripes_to_flush;
1316
1317        if (!r5c_is_writeback(log))
1318                return;
1319
1320        total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1321                atomic_read(&conf->r5c_cached_full_stripes);
1322
1323        if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1324            atomic_read(&conf->empty_inactive_list_nr) > 0)
1325                /*
 1326                 * if stripe cache pressure is high, flush all full stripes and
1327                 * some partial stripes
1328                 */
1329                stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1330        else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1331                 atomic_read(&conf->r5c_cached_full_stripes) >
1332                 R5C_FULL_STRIPE_FLUSH_BATCH)
1333                /*
 1334                 * if stripe cache pressure is moderate, or if there are many
 1335                 * full stripes, flush all full stripes
1336                 */
1337                stripes_to_flush = 0;
1338        else
1339                /* no need to flush */
1340                stripes_to_flush = -1;
1341
1342        if (stripes_to_flush >= 0) {
1343                spin_lock_irqsave(&conf->device_lock, flags);
1344                r5c_flush_cache(conf, stripes_to_flush);
1345                spin_unlock_irqrestore(&conf->device_lock, flags);
1346        }
1347
1348        /* if log space is tight, flush stripes on stripe_in_journal_list */
1349        if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1350                spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1351                spin_lock(&conf->device_lock);
1352                list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1353                        /*
1354                         * stripes on stripe_in_journal_list could be in any
1355                         * state of the stripe_cache state machine. In this
 1356                         * case, we only want to flush stripes on
1357                         * r5c_cached_full/partial_stripes. The following
1358                         * condition makes sure the stripe is on one of the
1359                         * two lists.
1360                         */
1361                        if (!list_empty(&sh->lru) &&
1362                            !test_bit(STRIPE_HANDLE, &sh->state) &&
1363                            atomic_read(&sh->count) == 0) {
1364                                r5c_flush_stripe(conf, sh);
1365                        }
1366                        if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1367                                break;
1368                }
1369                spin_unlock(&conf->device_lock);
1370                spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1371        }
1372
1373        if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
1374                r5l_run_no_space_stripes(log);
1375
1376        md_wakeup_thread(conf->mddev->thread);
1377}
1378
1379static void r5l_do_reclaim(struct r5l_log *log)
1380{
1381        struct r5conf *conf = log->rdev->mddev->private;
1382        sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1383        sector_t reclaimable;
1384        sector_t next_checkpoint;
1385        bool write_super;
1386
1387        spin_lock_irq(&log->io_list_lock);
1388        write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1389                reclaim_target != 0 || !list_empty(&log->no_space_stripes);
1390        /*
 1391         * move proper io_units to the reclaim list. We should not change the
 1392         * order: reclaimable and unreclaimable io_units can be mixed in the list,
 1393         * and we shouldn't reuse the space of an unreclaimable io_unit
1394         */
1395        while (1) {
1396                reclaimable = r5l_reclaimable_space(log);
1397                if (reclaimable >= reclaim_target ||
1398                    (list_empty(&log->running_ios) &&
1399                     list_empty(&log->io_end_ios) &&
1400                     list_empty(&log->flushing_ios) &&
1401                     list_empty(&log->finished_ios)))
1402                        break;
1403
1404                md_wakeup_thread(log->rdev->mddev->thread);
1405                wait_event_lock_irq(log->iounit_wait,
1406                                    r5l_reclaimable_space(log) > reclaimable,
1407                                    log->io_list_lock);
1408        }
1409
1410        next_checkpoint = r5c_calculate_new_cp(conf);
1411        spin_unlock_irq(&log->io_list_lock);
1412
1413        if (reclaimable == 0 || !write_super)
1414                return;
1415
1416        /*
 1417         * write_super will flush the cache of each raid disk. We must write super
1418         * here, because the log area might be reused soon and we don't want to
1419         * confuse recovery
1420         */
1421        r5l_write_super_and_discard_space(log, next_checkpoint);
1422
1423        mutex_lock(&log->io_mutex);
1424        log->last_checkpoint = next_checkpoint;
1425        r5c_update_log_state(log);
1426        mutex_unlock(&log->io_mutex);
1427
1428        r5l_run_no_space_stripes(log);
1429}
1430
1431static void r5l_reclaim_thread(struct md_thread *thread)
1432{
1433        struct mddev *mddev = thread->mddev;
1434        struct r5conf *conf = mddev->private;
1435        struct r5l_log *log = conf->log;
1436
1437        if (!log)
1438                return;
1439        r5c_do_reclaim(conf);
1440        r5l_do_reclaim(log);
1441}
1442
1443void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1444{
1445        unsigned long target;
1446        unsigned long new = (unsigned long)space; /* overflow in theory */
1447
1448        if (!log)
1449                return;
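            /*
             * Lock-free, monotonic update: only ever raise the pending reclaim
             * target; a request for less space than an already pending target
             * simply returns.
             */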
1450        do {
1451                target = log->reclaim_target;
1452                if (new < target)
1453                        return;
1454        } while (cmpxchg(&log->reclaim_target, target, new) != target);
1455        md_wakeup_thread(log->reclaim_thread);
1456}
1457
1458void r5l_quiesce(struct r5l_log *log, int state)
1459{
1460        struct mddev *mddev;
1461        if (!log || state == 2)
1462                return;
1463        if (state == 0)
1464                kthread_unpark(log->reclaim_thread->tsk);
1465        else if (state == 1) {
1466                /* make sure r5l_write_super_and_discard_space exits */
1467                mddev = log->rdev->mddev;
1468                wake_up(&mddev->sb_wait);
1469                kthread_park(log->reclaim_thread->tsk);
1470                r5l_wake_reclaim(log, MaxSector);
1471                r5l_do_reclaim(log);
1472        }
1473}
1474
1475bool r5l_log_disk_error(struct r5conf *conf)
1476{
1477        struct r5l_log *log;
1478        bool ret;
1479        /* don't allow write if journal disk is missing */
1480        rcu_read_lock();
1481        log = rcu_dereference(conf->log);
1482
1483        if (!log)
1484                ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1485        else
1486                ret = test_bit(Faulty, &log->rdev->flags);
1487        rcu_read_unlock();
1488        return ret;
1489}
1490
1491struct r5l_recovery_ctx {
1492        struct page *meta_page;         /* current meta */
1493        sector_t meta_total_blocks;     /* total size of current meta and data */
1494        sector_t pos;                   /* recovery position */
1495        u64 seq;                        /* recovery position seq */
1496        int data_parity_stripes;        /* number of data_parity stripes */
1497        int data_only_stripes;          /* number of data_only stripes */
1498        struct list_head cached_list;
1499};
1500
1501static int r5l_recovery_read_meta_block(struct r5l_log *log,
1502                                        struct r5l_recovery_ctx *ctx)
1503{
1504        struct page *page = ctx->meta_page;
1505        struct r5l_meta_block *mb;
1506        u32 crc, stored_crc;
1507
1508        if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1509                          false))
1510                return -EIO;
1511
1512        mb = page_address(page);
1513        stored_crc = le32_to_cpu(mb->checksum);
1514        mb->checksum = 0;
1515
1516        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1517            le64_to_cpu(mb->seq) != ctx->seq ||
1518            mb->version != R5LOG_VERSION ||
1519            le64_to_cpu(mb->position) != ctx->pos)
1520                return -EINVAL;
1521
1522        crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1523        if (stored_crc != crc)
1524                return -EINVAL;
1525
1526        if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1527                return -EINVAL;
1528
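            /*
             * Account for the meta block itself; r5l_recovery_load_data() and
             * r5l_recovery_load_parity() add the data/parity blocks on top.
             */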
1529        ctx->meta_total_blocks = BLOCK_SECTORS;
1530
1531        return 0;
1532}
1533
1534static void
1535r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1536                                     struct page *page,
1537                                     sector_t pos, u64 seq)
1538{
1539        struct r5l_meta_block *mb;
1540
1541        mb = page_address(page);
1542        clear_page(mb);
1543        mb->magic = cpu_to_le32(R5LOG_MAGIC);
1544        mb->version = R5LOG_VERSION;
1545        mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1546        mb->seq = cpu_to_le64(seq);
1547        mb->position = cpu_to_le64(pos);
1548}
1549
1550static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1551                                          u64 seq)
1552{
1553        struct page *page;
1554        struct r5l_meta_block *mb;
1555
1556        page = alloc_page(GFP_KERNEL);
1557        if (!page)
1558                return -ENOMEM;
1559        r5l_recovery_create_empty_meta_block(log, page, pos, seq);
1560        mb = page_address(page);
1561        mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
1562                                             mb, PAGE_SIZE));
1563        if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1564                          REQ_FUA, false)) {
1565                __free_page(page);
1566                return -EIO;
1567        }
1568        __free_page(page);
1569        return 0;
1570}
1571
1572/*
1573 * r5l_recovery_load_data and r5l_recovery_load_parity use the flag R5_Wantwrite
1574 * to mark valid (potentially not flushed) data in the journal.
1575 *
1576 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
1577 * so there should not be any mismatch here.
1578 */
1579static void r5l_recovery_load_data(struct r5l_log *log,
1580                                   struct stripe_head *sh,
1581                                   struct r5l_recovery_ctx *ctx,
1582                                   struct r5l_payload_data_parity *payload,
1583                                   sector_t log_offset)
1584{
1585        struct mddev *mddev = log->rdev->mddev;
1586        struct r5conf *conf = mddev->private;
1587        int dd_idx;
1588
1589        raid5_compute_sector(conf,
1590                             le64_to_cpu(payload->location), 0,
1591                             &dd_idx, sh);
1592        sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1593                     sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
1594        sh->dev[dd_idx].log_checksum =
1595                le32_to_cpu(payload->checksum[0]);
1596        ctx->meta_total_blocks += BLOCK_SECTORS;
1597
1598        set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1599        set_bit(STRIPE_R5C_CACHING, &sh->state);
1600}
1601
1602static void r5l_recovery_load_parity(struct r5l_log *log,
1603                                     struct stripe_head *sh,
1604                                     struct r5l_recovery_ctx *ctx,
1605                                     struct r5l_payload_data_parity *payload,
1606                                     sector_t log_offset)
1607{
1608        struct mddev *mddev = log->rdev->mddev;
1609        struct r5conf *conf = mddev->private;
1610
1611        ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1612        sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1613                     sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
1614        sh->dev[sh->pd_idx].log_checksum =
1615                le32_to_cpu(payload->checksum[0]);
1616        set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1617
1618        if (sh->qd_idx >= 0) {
1619                sync_page_io(log->rdev,
1620                             r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1621                             PAGE_SIZE, sh->dev[sh->qd_idx].page,
1622                             REQ_OP_READ, 0, false);
1623                sh->dev[sh->qd_idx].log_checksum =
1624                        le32_to_cpu(payload->checksum[1]);
1625                set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
1626        }
1627        clear_bit(STRIPE_R5C_CACHING, &sh->state);
1628}
1629
1630static void r5l_recovery_reset_stripe(struct stripe_head *sh)
1631{
1632        int i;
1633
1634        sh->state = 0;
1635        sh->log_start = MaxSector;
1636        for (i = sh->disks; i--; )
1637                sh->dev[i].flags = 0;
1638}
1639
1640static void
1641r5l_recovery_replay_one_stripe(struct r5conf *conf,
1642                               struct stripe_head *sh,
1643                               struct r5l_recovery_ctx *ctx)
1644{
1645        struct md_rdev *rdev, *rrdev;
1646        int disk_index;
1647        int data_count = 0;
1648
1649        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1650                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1651                        continue;
1652                if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1653                        continue;
1654                data_count++;
1655        }
1656
1657        /*
1658         * stripes that only have parity must have been flushed
1659         * before the crash that we are now recovering from, so
1660         * there is nothing more to recover.
1661         */
1662        if (data_count == 0)
1663                goto out;
1664
1665        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1666                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1667                        continue;
1668
1669                /* in case device is broken */
1670                rcu_read_lock();
1671                rdev = rcu_dereference(conf->disks[disk_index].rdev);
1672                if (rdev) {
1673                        atomic_inc(&rdev->nr_pending);
1674                        rcu_read_unlock();
1675                        sync_page_io(rdev, sh->sector, PAGE_SIZE,
1676                                     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1677                                     false);
1678                        rdev_dec_pending(rdev, rdev->mddev);
1679                        rcu_read_lock();
1680                }
1681                rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1682                if (rrdev) {
1683                        atomic_inc(&rrdev->nr_pending);
1684                        rcu_read_unlock();
1685                        sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1686                                     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1687                                     false);
1688                        rdev_dec_pending(rrdev, rrdev->mddev);
1689                        rcu_read_lock();
1690                }
1691                rcu_read_unlock();
1692        }
1693        ctx->data_parity_stripes++;
1694out:
1695        r5l_recovery_reset_stripe(sh);
1696}
1697
1698static struct stripe_head *
1699r5c_recovery_alloc_stripe(struct r5conf *conf,
1700                          sector_t stripe_sect)
1701{
1702        struct stripe_head *sh;
1703
1704        sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
1705        if (!sh)
1706                return NULL;  /* no more stripe available */
1707
1708        r5l_recovery_reset_stripe(sh);
1709
1710        return sh;
1711}
1712
1713static struct stripe_head *
1714r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1715{
1716        struct stripe_head *sh;
1717
1718        list_for_each_entry(sh, list, lru)
1719                if (sh->sector == sect)
1720                        return sh;
1721        return NULL;
1722}
1723
1724static void
1725r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1726                          struct r5l_recovery_ctx *ctx)
1727{
1728        struct stripe_head *sh, *next;
1729
1730        list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1731                r5l_recovery_reset_stripe(sh);
1732                list_del_init(&sh->lru);
1733                raid5_release_stripe(sh);
1734        }
1735}
1736
1737static void
1738r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1739                            struct r5l_recovery_ctx *ctx)
1740{
1741        struct stripe_head *sh, *next;
1742
1743        list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1744                if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1745                        r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1746                        list_del_init(&sh->lru);
1747                        raid5_release_stripe(sh);
1748                }
1749}
1750
1751/* if matches return 0; otherwise return -EINVAL */
1752static int
1753r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
1754                                  sector_t log_offset, __le32 log_checksum)
1755{
1756        void *addr;
1757        u32 checksum;
1758
1759        sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1760                     page, REQ_OP_READ, 0, false);
1761        addr = kmap_atomic(page);
1762        checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1763        kunmap_atomic(addr);
1764        return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
1765}
1766
1767/*
1768 * Before loading data into the stripe cache, we need to verify the checksum
1769 * of all data; if any data page mismatches, we drop all data in the meta block.
1770 */
1771static int
1772r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
1773                                         struct r5l_recovery_ctx *ctx)
1774{
1775        struct mddev *mddev = log->rdev->mddev;
1776        struct r5conf *conf = mddev->private;
1777        struct r5l_meta_block *mb = page_address(ctx->meta_page);
1778        sector_t mb_offset = sizeof(struct r5l_meta_block);
1779        sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1780        struct page *page;
1781        struct r5l_payload_data_parity *payload;
1782
1783        page = alloc_page(GFP_KERNEL);
1784        if (!page)
1785                return -ENOMEM;
1786
1787        while (mb_offset < le32_to_cpu(mb->meta_size)) {
1788                payload = (void *)mb + mb_offset;
1789
1790                if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1791                        if (r5l_recovery_verify_data_checksum(
1792                                    log, page, log_offset,
1793                                    payload->checksum[0]) < 0)
1794                                goto mismatch;
1795                } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
1796                        if (r5l_recovery_verify_data_checksum(
1797                                    log, page, log_offset,
1798                                    payload->checksum[0]) < 0)
1799                                goto mismatch;
1800                        if (conf->max_degraded == 2 && /* q for RAID 6 */
1801                            r5l_recovery_verify_data_checksum(
1802                                    log, page,
1803                                    r5l_ring_add(log, log_offset,
1804                                                 BLOCK_SECTORS),
1805                                    payload->checksum[1]) < 0)
1806                                goto mismatch;
1807                } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
1808                        goto mismatch;
1809
1810                log_offset = r5l_ring_add(log, log_offset,
1811                                          le32_to_cpu(payload->size));
1812
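                    /*
                     * payload->size is in sectors; every 4KB page of payload
                     * contributes one __le32 checksum to the meta block, so
                     * skip size >> (PAGE_SHIFT - 9) checksum entries here.
                     */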
1813                mb_offset += sizeof(struct r5l_payload_data_parity) +
1814                        sizeof(__le32) *
1815                        (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1816        }
1817
1818        put_page(page);
1819        return 0;
1820
1821mismatch:
1822        put_page(page);
1823        return -EINVAL;
1824}
1825
1826/*
1827 * Analyze all data/parity pages in one meta block
1828 * Returns:
1829 * 0 for success
1830 * -EINVAL for unknown payload type
1831 * -EAGAIN for checksum mismatch of data page
1832 * -ENOMEM when out of memory (alloc_page failed or no stripes available)
1833 */
1834static int
1835r5c_recovery_analyze_meta_block(struct r5l_log *log,
1836                                struct r5l_recovery_ctx *ctx,
1837                                struct list_head *cached_stripe_list)
1838{
1839        struct mddev *mddev = log->rdev->mddev;
1840        struct r5conf *conf = mddev->private;
1841        struct r5l_meta_block *mb;
1842        struct r5l_payload_data_parity *payload;
1843        int mb_offset;
1844        sector_t log_offset;
1845        sector_t stripe_sect;
1846        struct stripe_head *sh;
1847        int ret;
1848
1849        /*
1850         * On a mismatch in the data blocks, we drop all data in this mb, but
1851         * we still read the next mb for other data written with the FLUSH
1852         * flag, as io_units could finish out of order.
1853         */
1854        ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
1855        if (ret == -EINVAL)
1856                return -EAGAIN;
1857        else if (ret)
1858                return ret;   /* -ENOMEM due to alloc_page() failure */
1859
1860        mb = page_address(ctx->meta_page);
1861        mb_offset = sizeof(struct r5l_meta_block);
1862        log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1863
1864        while (mb_offset < le32_to_cpu(mb->meta_size)) {
1865                int dd;
1866
1867                payload = (void *)mb + mb_offset;
1868                stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
1869                        raid5_compute_sector(
1870                                conf, le64_to_cpu(payload->location), 0, &dd,
1871                                NULL)
1872                        : le64_to_cpu(payload->location);
1873
1874                sh = r5c_recovery_lookup_stripe(cached_stripe_list,
1875                                                stripe_sect);
1876
1877                if (!sh) {
1878                        sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
1879                        /*
1880                         * cannot get a stripe from raid5_get_active_stripe;
1881                         * try replaying some cached stripes first
1882                         */
1883                        if (!sh) {
1884                                r5c_recovery_replay_stripes(
1885                                        cached_stripe_list, ctx);
1886                                sh = r5c_recovery_alloc_stripe(
1887                                        conf, stripe_sect);
1888                        }
1889                        if (!sh) {
1890                                pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n",
1891                                        mdname(mddev),
1892                                        conf->min_nr_stripes * 2);
1893                                raid5_set_cache_size(mddev,
1894                                                     conf->min_nr_stripes * 2);
1895                                sh = r5c_recovery_alloc_stripe(conf,
1896                                                               stripe_sect);
1897                        }
1898                        if (!sh) {
1899                                pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
1900                                       mdname(mddev));
1901                                return -ENOMEM;
1902                        }
1903                        list_add_tail(&sh->lru, cached_stripe_list);
1904                }
1905
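                    /*
                     * Seeing new data for a stripe that already has parity
                     * loaded means the earlier content was flushed to the
                     * RAID disks; replay it and start the stripe over.
                     */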
1906                if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1907                        if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
1908                            test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
1909                                r5l_recovery_replay_one_stripe(conf, sh, ctx);
1910                                list_move_tail(&sh->lru, cached_stripe_list);
1911                        }
1912                        r5l_recovery_load_data(log, sh, ctx, payload,
1913                                               log_offset);
1914                } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
1915                        r5l_recovery_load_parity(log, sh, ctx, payload,
1916                                                 log_offset);
1917                else
1918                        return -EINVAL;
1919
1920                log_offset = r5l_ring_add(log, log_offset,
1921                                          le32_to_cpu(payload->size));
1922
1923                mb_offset += sizeof(struct r5l_payload_data_parity) +
1924                        sizeof(__le32) *
1925                        (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1926        }
1927
1928        return 0;
1929}
1930
1931/*
1932 * Load the stripe into cache. The stripe will be written out later by
1933 * the stripe cache state machine.
1934 */
1935static void r5c_recovery_load_one_stripe(struct r5l_log *log,
1936                                         struct stripe_head *sh)
1937{
1938        struct r5dev *dev;
1939        int i;
1940
1941        for (i = sh->disks; i--; ) {
1942                dev = sh->dev + i;
1943                if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
1944                        set_bit(R5_InJournal, &dev->flags);
1945                        set_bit(R5_UPTODATE, &dev->flags);
1946                }
1947        }
1948}
1949
1950/*
1951 * Scan through the log for all to-be-flushed data
1952 *
1953 * For stripes with data and parity, namely Data-Parity stripe
1954 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
1955 *
1956 * For stripes with only data, namely Data-Only stripe
1957 * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine.
1958 *
1959 * For a stripe, if we see data after parity, we should discard all previous
1960 * data and parity for this stripe, as that data has already been flushed to
1961 * the array.
1962 *
1963 * At the end of the scan, we return the new journal_tail, which points to the
1964 * first data-only stripe on the journal device, or the next invalid meta block.
1965 */
1966static int r5c_recovery_flush_log(struct r5l_log *log,
1967                                  struct r5l_recovery_ctx *ctx)
1968{
1969        struct stripe_head *sh;
1970        int ret = 0;
1971
1972        /* scan through the log */
1973        while (1) {
1974                if (r5l_recovery_read_meta_block(log, ctx))
1975                        break;
1976
1977                ret = r5c_recovery_analyze_meta_block(log, ctx,
1978                                                      &ctx->cached_list);
1979                /*
1980                 * -EAGAIN means a mismatch in a data block; in this case, we
1981                 * still try to scan the next meta block.
1982                 */
1983                if (ret && ret != -EAGAIN)
1984                        break;   /* ret == -EINVAL or -ENOMEM */
1985                ctx->seq++;
1986                ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1987        }
1988
1989        if (ret == -ENOMEM) {
1990                r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
1991                return ret;
1992        }
1993
1994        /* replay data-parity stripes */
1995        r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
1996
1997        /* load data-only stripes to stripe cache */
1998        list_for_each_entry(sh, &ctx->cached_list, lru) {
1999                WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2000                r5c_recovery_load_one_stripe(log, sh);
2001                ctx->data_only_stripes++;
2002        }
2003
2004        return 0;
2005}
2006
2007/*
2008 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
2009 * log will start here, but we can't let the superblock point to the last
2010 * valid meta block. The log might look like:
2011 * | meta 1| meta 2| meta 3|
2012 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
2013 * superblock points to meta 1, we write a new valid meta 2n. If a crash
2014 * happens again, the new recovery will start from meta 1. Since meta 2n is
2015 * now valid, recovery will think meta 3 is valid, which is wrong.
2016 * The solution is to create a new meta in meta 2 with its seq == meta
2017 * 1's seq + 10000 and let the superblock point to meta 2. The same recovery
2018 * will not treat meta 3 as a valid meta, because its seq doesn't match.
2019 */
2020
2021/*
2022 * Before recovery, the log looks like the following
2023 *
2024 *   ---------------------------------------------
2025 *   |           valid log        | invalid log  |
2026 *   ---------------------------------------------
2027 *   ^
2028 *   |- log->last_checkpoint
2029 *   |- log->last_cp_seq
2030 *
2031 * Now we scan through the log until we see invalid entry
2032 *
2033 *   ---------------------------------------------
2034 *   |           valid log        | invalid log  |
2035 *   ---------------------------------------------
2036 *   ^                            ^
2037 *   |- log->last_checkpoint      |- ctx->pos
2038 *   |- log->last_cp_seq          |- ctx->seq
2039 *
2040 * From this point, we need to increase the seq number by 10000 to avoid
2041 * confusing the next recovery.
2042 *
2043 *   ---------------------------------------------
2044 *   |           valid log        | invalid log  |
2045 *   ---------------------------------------------
2046 *   ^                              ^
2047 *   |- log->last_checkpoint        |- ctx->pos+1
2048 *   |- log->last_cp_seq            |- ctx->seq+10001
2049 *
2050 * However, it is not safe to start the state machine yet, because the data of
2051 * the data-only stripes is not yet secured in RAID. To secure this data, we
2052 * rewrite it to the journal starting from seq+10001.
2053 *
2054 *   -----------------------------------------------------------------
2055 *   |           valid log        | data only stripes | invalid log  |
2056 *   -----------------------------------------------------------------
2057 *   ^                                                ^
2058 *   |- log->last_checkpoint                          |- ctx->pos+n
2059 *   |- log->last_cp_seq                              |- ctx->seq+10000+n
2060 *
2061 * If a failure happens again during this process, the recovery can safely
2062 * start again from log->last_checkpoint.
2063 *
2064 * Once data only stripes are rewritten to journal, we move log_tail
2065 *
2066 *   -----------------------------------------------------------------
2067 *   |     old log        |    data only stripes    | invalid log  |
2068 *   -----------------------------------------------------------------
2069 *                        ^                         ^
2070 *                        |- log->last_checkpoint   |- ctx->pos+n
2071 *                        |- log->last_cp_seq       |- ctx->seq+10000+n
2072 *
2073 * Then we can safely start the state machine. If failure happens from this
2074 * point on, the recovery will start from new log->last_checkpoint.
2075 */
2076static int
2077r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2078                                       struct r5l_recovery_ctx *ctx)
2079{
2080        struct stripe_head *sh;
2081        struct mddev *mddev = log->rdev->mddev;
2082        struct page *page;
2083        sector_t next_checkpoint = MaxSector;
2084
2085        page = alloc_page(GFP_KERNEL);
2086        if (!page) {
2087                pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2088                       mdname(mddev));
2089                return -ENOMEM;
2090        }
2091
2092        WARN_ON(list_empty(&ctx->cached_list));
2093
2094        list_for_each_entry(sh, &ctx->cached_list, lru) {
2095                struct r5l_meta_block *mb;
2096                int i;
2097                int offset;
2098                sector_t write_pos;
2099
2100                WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2101                r5l_recovery_create_empty_meta_block(log, page,
2102                                                     ctx->pos, ctx->seq);
2103                mb = page_address(page);
2104                offset = le32_to_cpu(mb->meta_size);
2105                write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2106
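                    /* the stripe's cached data pages go right after its meta block */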
2107                for (i = sh->disks; i--; ) {
2108                        struct r5dev *dev = &sh->dev[i];
2109                        struct r5l_payload_data_parity *payload;
2110                        void *addr;
2111
2112                        if (test_bit(R5_InJournal, &dev->flags)) {
2113                                payload = (void *)mb + offset;
2114                                payload->header.type = cpu_to_le16(
2115                                        R5LOG_PAYLOAD_DATA);
2116                                payload->size = cpu_to_le32(BLOCK_SECTORS);
2117                                payload->location = cpu_to_le64(
2118                                        raid5_compute_blocknr(sh, i, 0));
2119                                addr = kmap_atomic(dev->page);
2120                                payload->checksum[0] = cpu_to_le32(
2121                                        crc32c_le(log->uuid_checksum, addr,
2122                                                  PAGE_SIZE));
2123                                kunmap_atomic(addr);
2124                                sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2125                                             dev->page, REQ_OP_WRITE, 0, false);
2126                                write_pos = r5l_ring_add(log, write_pos,
2127                                                         BLOCK_SECTORS);
2128                                offset += sizeof(__le32) +
2129                                        sizeof(struct r5l_payload_data_parity);
2130
2131                        }
2132                }
2133                mb->meta_size = cpu_to_le32(offset);
2134                mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
2135                                                     mb, PAGE_SIZE));
2136                sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2137                             REQ_OP_WRITE, REQ_FUA, false);
2138                sh->log_start = ctx->pos;
2139                list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
2140                atomic_inc(&log->stripe_in_journal_count);
2141                ctx->pos = write_pos;
2142                ctx->seq += 1;
2143                next_checkpoint = sh->log_start;
2144        }
2145        log->next_checkpoint = next_checkpoint;
2146        __free_page(page);
2147        return 0;
2148}
2149
2150static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
2151                                                 struct r5l_recovery_ctx *ctx)
2152{
2153        struct mddev *mddev = log->rdev->mddev;
2154        struct r5conf *conf = mddev->private;
2155        struct stripe_head *sh, *next;
2156
2157        if (ctx->data_only_stripes == 0)
2158                return;
2159
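            /*
             * Temporarily switch to write-back so the cached data-only stripes
             * can drain through the normal state machine; the mode is restored
             * to write-through below once they are all written out.
             */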
2160        log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
2161
2162        list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2163                r5c_make_stripe_write_out(sh);
2164                set_bit(STRIPE_HANDLE, &sh->state);
2165                list_del_init(&sh->lru);
2166                raid5_release_stripe(sh);
2167        }
2168
2169        md_wakeup_thread(conf->mddev->thread);
2170        /* reuse conf->wait_for_quiescent in recovery */
2171        wait_event(conf->wait_for_quiescent,
2172                   atomic_read(&conf->active_stripes) == 0);
2173
2174        log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2175}
2176
2177static int r5l_recovery_log(struct r5l_log *log)
2178{
2179        struct mddev *mddev = log->rdev->mddev;
2180        struct r5l_recovery_ctx ctx;
2181        int ret;
2182        sector_t pos;
2183
2184        ctx.pos = log->last_checkpoint;
2185        ctx.seq = log->last_cp_seq;
2186        ctx.meta_page = alloc_page(GFP_KERNEL);
2187        ctx.data_only_stripes = 0;
2188        ctx.data_parity_stripes = 0;
2189        INIT_LIST_HEAD(&ctx.cached_list);
2190
2191        if (!ctx.meta_page)
2192                return -ENOMEM;
2193
2194        ret = r5c_recovery_flush_log(log, &ctx);
2195        __free_page(ctx.meta_page);
2196
2197        if (ret)
2198                return ret;
2199
2200        pos = ctx.pos;
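            /* bump seq far ahead so stale meta blocks beyond ctx.pos can never look valid again */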
2201        ctx.seq += 10000;
2202
2203
2204        if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
2205                pr_debug("md/raid:%s: starting from clean shutdown\n",
2206                         mdname(mddev));
2207        else
2208                pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
2209                         mdname(mddev), ctx.data_only_stripes,
2210                         ctx.data_parity_stripes);
2211
2212        if (ctx.data_only_stripes == 0) {
2213                log->next_checkpoint = ctx.pos;
2214                r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
2215                ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
2216        } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
2217                pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2218                       mdname(mddev));
2219                return -EIO;
2220        }
2221
2222        log->log_start = ctx.pos;
2223        log->seq = ctx.seq;
2224        log->last_checkpoint = pos;
2225        r5l_write_super(log, pos);
2226
2227        r5c_recovery_flush_data_only_stripes(log, &ctx);
2228        return 0;
2229}
2230
2231static void r5l_write_super(struct r5l_log *log, sector_t cp)
2232{
2233        struct mddev *mddev = log->rdev->mddev;
2234
2235        log->rdev->journal_tail = cp;
2236        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2237}
2238
2239static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2240{
2241        struct r5conf *conf = mddev->private;
2242        int ret;
2243
2244        if (!conf->log)
2245                return 0;
2246
2247        switch (conf->log->r5c_journal_mode) {
2248        case R5C_JOURNAL_MODE_WRITE_THROUGH:
2249                ret = snprintf(
2250                        page, PAGE_SIZE, "[%s] %s\n",
2251                        r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2252                        r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2253                break;
2254        case R5C_JOURNAL_MODE_WRITE_BACK:
2255                ret = snprintf(
2256                        page, PAGE_SIZE, "%s [%s]\n",
2257                        r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2258                        r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2259                break;
2260        default:
2261                ret = 0;
2262        }
2263        return ret;
2264}
2265
2266static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2267                                      const char *page, size_t length)
2268{
2269        struct r5conf *conf = mddev->private;
2270        struct r5l_log *log = conf->log;
2271        int val = -1, i;
2272        int len = length;
2273
2274        if (!log)
2275                return -ENODEV;
2276
2277        if (len && page[len - 1] == '\n')
2278                len -= 1;
2279        for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
2280                if (strlen(r5c_journal_mode_str[i]) == len &&
2281                    strncmp(page, r5c_journal_mode_str[i], len) == 0) {
2282                        val = i;
2283                        break;
2284                }
2285        if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2286            val > R5C_JOURNAL_MODE_WRITE_BACK)
2287                return -EINVAL;
2288
2289        if (raid5_calc_degraded(conf) > 0 &&
2290            val == R5C_JOURNAL_MODE_WRITE_BACK)
2291                return -EINVAL;
2292
2293        mddev_suspend(mddev);
2294        conf->log->r5c_journal_mode = val;
2295        mddev_resume(mddev);
2296
2297        pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2298                 mdname(mddev), val, r5c_journal_mode_str[val]);
2299        return length;
2300}
2301
2302struct md_sysfs_entry
2303r5c_journal_mode = __ATTR(journal_mode, 0644,
2304                          r5c_journal_mode_show, r5c_journal_mode_store);
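    /*
     * Example usage via sysfs (a sketch; assumes an array md0 with a journal
     * device and the usual /sys/block/<dev>/md/ layout):
     *
     *   # cat /sys/block/md0/md/journal_mode
     *   [write-through] write-back
     *   # echo write-back > /sys/block/md0/md/journal_mode
     */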
2305
2306/*
2307 * Try to handle a write operation in the caching phase. This function should
2308 * only be called in write-back mode.
2309 *
2310 * If all outstanding writes can be handled in the caching phase, returns 0.
2311 * If the writes require the write-out phase, calls r5c_make_stripe_write_out()
2312 * and returns -EAGAIN.
2313 */
2314int r5c_try_caching_write(struct r5conf *conf,
2315                          struct stripe_head *sh,
2316                          struct stripe_head_state *s,
2317                          int disks)
2318{
2319        struct r5l_log *log = conf->log;
2320        int i;
2321        struct r5dev *dev;
2322        int to_cache = 0;
2323
2324        BUG_ON(!r5c_is_writeback(log));
2325
2326        if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
2327                /*
2328                 * There are two different scenarios here:
2329                 *  1. The stripe has some data cached, and it is sent to
2330                 *     write-out phase for reclaim
2331                 *  2. The stripe is clean, and this is the first write
2332                 *
2333                 * For 1, return -EAGAIN, so we continue with
2334                 * handle_stripe_dirtying().
2335                 *
2336                 * For 2, set STRIPE_R5C_CACHING and continue with caching
2337                 * write.
2338                 */
2339
2340                /* case 1: anything in journal (s->injournal) or anything written (s->written) */
2341                if (s->injournal > 0 || s->written > 0)
2342                        return -EAGAIN;
2343                /* case 2 */
2344                set_bit(STRIPE_R5C_CACHING, &sh->state);
2345        }
2346
2347        /*
2348         * When run in degraded mode, the array is set to write-through mode.
2349         * This check helps drain pending writes safely during the transition
2350         * to write-through mode.
2351         */
2352        if (s->failed) {
2353                r5c_make_stripe_write_out(sh);
2354                return -EAGAIN;
2355        }
2356
2357        for (i = disks; i--; ) {
2358                dev = &sh->dev[i];
2359                /* if non-overwrite, use writing-out phase */
2360                if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2361                    !test_bit(R5_InJournal, &dev->flags)) {
2362                        r5c_make_stripe_write_out(sh);
2363                        return -EAGAIN;
2364                }
2365        }
2366
2367        for (i = disks; i--; ) {
2368                dev = &sh->dev[i];
2369                if (dev->towrite) {
2370                        set_bit(R5_Wantwrite, &dev->flags);
2371                        set_bit(R5_Wantdrain, &dev->flags);
2372                        set_bit(R5_LOCKED, &dev->flags);
2373                        to_cache++;
2374                }
2375        }
2376
2377        if (to_cache) {
2378                set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2379                /*
2380                 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
2381                 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
2382                 * r5c_handle_data_cached()
2383                 */
2384                set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2385        }
2386
2387        return 0;
2388}
2389
2390/*
2391 * free extra pages (orig_page) we allocated for prexor
2392 */
2393void r5c_release_extra_page(struct stripe_head *sh)
2394{
2395        struct r5conf *conf = sh->raid_conf;
2396        int i;
2397        bool using_disk_info_extra_page;
2398
2399        using_disk_info_extra_page =
2400                sh->dev[0].orig_page == conf->disks[0].extra_page;
2401
2402        for (i = sh->disks; i--; )
2403                if (sh->dev[i].page != sh->dev[i].orig_page) {
2404                        struct page *p = sh->dev[i].orig_page;
2405
2406                        sh->dev[i].orig_page = sh->dev[i].page;
2407                        clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2408
2409                        if (!using_disk_info_extra_page)
2410                                put_page(p);
2411                }
2412
2413        if (using_disk_info_extra_page) {
2414                clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
2415                md_wakeup_thread(conf->mddev->thread);
2416        }
2417}
2418
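    /*
     * Switch every dev->orig_page to the shared conf->disks[i].extra_page;
     * the pages are handed back and R5C_EXTRA_PAGE_IN_USE is cleared in
     * r5c_release_extra_page() above.
     */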
2419void r5c_use_extra_page(struct stripe_head *sh)
2420{
2421        struct r5conf *conf = sh->raid_conf;
2422        int i;
2423        struct r5dev *dev;
2424
2425        for (i = sh->disks; i--; ) {
2426                dev = &sh->dev[i];
2427                if (dev->orig_page != dev->page)
2428                        put_page(dev->orig_page);
2429                dev->orig_page = conf->disks[i].extra_page;
2430        }
2431}
2432
2433/*
2434 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
2435 * stripe is committed to RAID disks.
2436 */
2437void r5c_finish_stripe_write_out(struct r5conf *conf,
2438                                 struct stripe_head *sh,
2439                                 struct stripe_head_state *s)
2440{
2441        int i;
2442        int do_wakeup = 0;
2443
2444        if (!conf->log ||
2445            !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
2446                return;
2447
2448        WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2449        clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
2450
2451        if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2452                return;
2453
2454        for (i = sh->disks; i--; ) {
2455                clear_bit(R5_InJournal, &sh->dev[i].flags);
2456                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2457                        do_wakeup = 1;
2458        }
2459
2460        /*
2461         * analyse_stripe() runs before r5c_finish_stripe_write_out(), and
2462         * we just cleared R5_InJournal, so also bring s->injournal up to date.
2463         */
2464        s->injournal = 0;
2465
2466        if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2467                if (atomic_dec_and_test(&conf->pending_full_writes))
2468                        md_wakeup_thread(conf->mddev->thread);
2469
2470        if (do_wakeup)
2471                wake_up(&conf->wait_for_overlap);
2472
2473        spin_lock_irq(&conf->log->stripe_in_journal_lock);
2474        list_del_init(&sh->r5c);
2475        spin_unlock_irq(&conf->log->stripe_in_journal_lock);
2476        sh->log_start = MaxSector;
2477        atomic_dec(&conf->log->stripe_in_journal_count);
2478        r5c_update_log_state(conf->log);
2479}
2480
2481int
2482r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
2483               struct stripe_head_state *s)
2484{
2485        struct r5conf *conf = sh->raid_conf;
2486        int pages = 0;
2487        int reserve;
2488        int i;
2489        int ret = 0;
2490
2491        BUG_ON(!log);
2492
2493        for (i = 0; i < sh->disks; i++) {
2494                void *addr;
2495
2496                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
2497                        continue;
2498                addr = kmap_atomic(sh->dev[i].page);
2499                sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
2500                                                    addr, PAGE_SIZE);
2501                kunmap_atomic(addr);
2502                pages++;
2503        }
2504        WARN_ON(pages == 0);
2505
2506        /*
2507         * The stripe must enter the state machine again to call endio, so
2508         * don't delay.
2509         */
2510        clear_bit(STRIPE_DELAYED, &sh->state);
2511        atomic_inc(&sh->count);
2512
2513        mutex_lock(&log->io_mutex);
2514        /* meta + data */
2515        reserve = (1 + pages) << (PAGE_SHIFT - 9);
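            /*
             * Illustrative numbers: caching a full stripe on a 4-data-disk
             * array logs 4 pages, so reserve = (1 + 4) << 3 = 40 sectors.
             */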
2516
2517        if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
2518            sh->log_start == MaxSector)
2519                r5l_add_no_space_stripe(log, sh);
2520        else if (!r5l_has_free_space(log, reserve)) {
2521                if (sh->log_start == log->last_checkpoint)
2522                        BUG();
2523                else
2524                        r5l_add_no_space_stripe(log, sh);
2525        } else {
2526                ret = r5l_log_stripe(log, sh, pages, 0);
2527                if (ret) {
2528                        spin_lock_irq(&log->io_list_lock);
2529                        list_add_tail(&sh->log_list, &log->no_mem_stripes);
2530                        spin_unlock_irq(&log->io_list_lock);
2531                }
2532        }
2533
2534        mutex_unlock(&log->io_mutex);
2535        return 0;
2536}
2537
2538static int r5l_load_log(struct r5l_log *log)
2539{
2540        struct md_rdev *rdev = log->rdev;
2541        struct page *page;
2542        struct r5l_meta_block *mb;
2543        sector_t cp = log->rdev->journal_tail;
2544        u32 stored_crc, expected_crc;
2545        bool create_super = false;
2546        int ret = 0;
2547
2548        /* Make sure it's valid */
2549        if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
2550                cp = 0;
2551        page = alloc_page(GFP_KERNEL);
2552        if (!page)
2553                return -ENOMEM;
2554
2555        if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
2556                ret = -EIO;
2557                goto ioerr;
2558        }
2559        mb = page_address(page);
2560
2561        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
2562            mb->version != R5LOG_VERSION) {
2563                create_super = true;
2564                goto create;
2565        }
2566        stored_crc = le32_to_cpu(mb->checksum);
2567        mb->checksum = 0;
2568        expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
2569        if (stored_crc != expected_crc) {
2570                create_super = true;
2571                goto create;
2572        }
2573        if (le64_to_cpu(mb->position) != cp) {
2574                create_super = true;
2575                goto create;
2576        }
2577create:
2578        if (create_super) {
2579                log->last_cp_seq = prandom_u32();
2580                cp = 0;
2581                r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
2582                 * Make sure the super points to the correct address. The log
2583                 * might have data very soon. If the super doesn't have the
2584                 * correct log tail address, recovery can't find the log.
2585                 * recovery can't find the log
2586                 */
2587                r5l_write_super(log, cp);
2588        } else
2589                log->last_cp_seq = le64_to_cpu(mb->seq);
2590
2591        log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
2592        log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
2593        if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
2594                log->max_free_space = RECLAIM_MAX_FREE_SPACE;
2595        log->last_checkpoint = cp;
2596
2597        __free_page(page);
2598
2599        if (create_super) {
2600                log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
2601                log->seq = log->last_cp_seq + 1;
2602                log->next_checkpoint = cp;
2603        } else
2604                ret = r5l_recovery_log(log);
2605
2606        r5c_update_log_state(log);
2607        return ret;
2608ioerr:
2609        __free_page(page);
2610        return ret;
2611}
2612
2613void r5c_update_on_rdev_error(struct mddev *mddev)
2614{
2615        struct r5conf *conf = mddev->private;
2616        struct r5l_log *log = conf->log;
2617
2618        if (!log)
2619                return;
2620
2621        if (raid5_calc_degraded(conf) > 0 &&
2622            conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
2623                schedule_work(&log->disable_writeback_work);
2624}
2625
2626int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
2627{
2628        struct request_queue *q = bdev_get_queue(rdev->bdev);
2629        struct r5l_log *log;
2630
2631        if (PAGE_SIZE != 4096)
2632                return -EINVAL;
2633
2634        /*
2635         * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
2636         * raid_disks r5l_payload_data_parity entries.
2637         *
2638         * Write journal and cache do not work for very big arrays
2639         * (raid_disks > 203).
2640         */
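            /*
             * Rough arithmetic behind the 203-disk limit (assuming the current
             * on-disk layout: a 32-byte r5l_meta_block header plus 16 bytes of
             * r5l_payload_data_parity and a 4-byte checksum per disk):
             * (4096 - 32) / 20 = 203.
             */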
2641        if (sizeof(struct r5l_meta_block) +
2642            ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
2643             conf->raid_disks) > PAGE_SIZE) {
2644                pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
2645                       mdname(conf->mddev), conf->raid_disks);
2646                return -EINVAL;
2647        }
2648
2649        log = kzalloc(sizeof(*log), GFP_KERNEL);
2650        if (!log)
2651                return -ENOMEM;
2652        log->rdev = rdev;
2653
2654        log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
2655
2656        log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
2657                                       sizeof(rdev->mddev->uuid));
2658
2659        mutex_init(&log->io_mutex);
2660
2661        spin_lock_init(&log->io_list_lock);
2662        INIT_LIST_HEAD(&log->running_ios);
2663        INIT_LIST_HEAD(&log->io_end_ios);
2664        INIT_LIST_HEAD(&log->flushing_ios);
2665        INIT_LIST_HEAD(&log->finished_ios);
2666        bio_init(&log->flush_bio, NULL, 0);
2667
2668        log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
2669        if (!log->io_kc)
2670                goto io_kc;
2671
2672        log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
2673        if (!log->io_pool)
2674                goto io_pool;
2675
2676        log->bs = bioset_create(R5L_POOL_SIZE, 0);
2677        if (!log->bs)
2678                goto io_bs;
2679
2680        log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
2681        if (!log->meta_pool)
2682                goto out_mempool;
2683
2684        log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
2685                                                 log->rdev->mddev, "reclaim");
2686        if (!log->reclaim_thread)
2687                goto reclaim_thread;
2688        log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
2689
2690        init_waitqueue_head(&log->iounit_wait);
2691
2692        INIT_LIST_HEAD(&log->no_mem_stripes);
2693
2694        INIT_LIST_HEAD(&log->no_space_stripes);
2695        spin_lock_init(&log->no_space_stripes_lock);
2696
2697        INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
2698        INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
2699
2700        log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2701        INIT_LIST_HEAD(&log->stripe_in_journal_list);
2702        spin_lock_init(&log->stripe_in_journal_lock);
2703        atomic_set(&log->stripe_in_journal_count, 0);
2704
2705        rcu_assign_pointer(conf->log, log);
2706
2707        if (r5l_load_log(log))
2708                goto error;
2709
2710        set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
2711        return 0;
2712
2713error:
2714        rcu_assign_pointer(conf->log, NULL);
2715        md_unregister_thread(&log->reclaim_thread);
2716reclaim_thread:
2717        mempool_destroy(log->meta_pool);
2718out_mempool:
2719        bioset_free(log->bs);
2720io_bs:
2721        mempool_destroy(log->io_pool);
2722io_pool:
2723        kmem_cache_destroy(log->io_kc);
2724io_kc:
2725        kfree(log);
2726        return -EINVAL;
2727}
2728
2729void r5l_exit_log(struct r5l_log *log)
2730{
2731        flush_work(&log->disable_writeback_work);
2732        md_unregister_thread(&log->reclaim_thread);
2733        mempool_destroy(log->meta_pool);
2734        bioset_free(log->bs);
2735        mempool_destroy(log->io_pool);
2736        kmem_cache_destroy(log->io_kc);
2737        kfree(log);
2738}
2739