linux/drivers/md/raid5-cache.c
   1/*
   2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 */
  14#include <linux/kernel.h>
  15#include <linux/wait.h>
  16#include <linux/blkdev.h>
  17#include <linux/slab.h>
  18#include <linux/raid/md_p.h>
  19#include <linux/crc32c.h>
  20#include <linux/random.h>
  21#include "md.h"
  22#include "raid5.h"
  23
  24/*
   25 * metadata/data are stored on disk in 4k units (blocks) regardless of the
   26 * underlying hardware sector size. Only works with PAGE_SIZE == 4096
  27 */
  28#define BLOCK_SECTORS (8)
  29
  30/*
   31 * reclaim runs once reclaimable space reaches 1/4 of the disk size or 10G,
   32 * whichever is smaller. This prevents recovery from scanning a very long log
  33 */
  34#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
  35#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
  36
  37/*
  38 * We only need 2 bios per I/O unit to make progress, but ensure we
  39 * have a few more available to not get too tight.
  40 */
  41#define R5L_POOL_SIZE   4
  42
  43struct r5l_log {
  44        struct md_rdev *rdev;
  45
  46        u32 uuid_checksum;
  47
   48        sector_t device_size;           /* log device size, rounded down to
   49                                         * BLOCK_SECTORS */
   50        sector_t max_free_space;        /* reclaim runs once reclaimable
   51                                         * space reaches this size */
  52
  53        sector_t last_checkpoint;       /* log tail. where recovery scan
  54                                         * starts from */
  55        u64 last_cp_seq;                /* log tail sequence */
  56
  57        sector_t log_start;             /* log head. where new data appends */
  58        u64 seq;                        /* log head sequence */
  59
  60        sector_t next_checkpoint;
  61        u64 next_cp_seq;
  62
  63        struct mutex io_mutex;
  64        struct r5l_io_unit *current_io; /* current io_unit accepting new data */
  65
  66        spinlock_t io_list_lock;
  67        struct list_head running_ios;   /* io_units which are still running,
  68                                         * and have not yet been completely
  69                                         * written to the log */
  70        struct list_head io_end_ios;    /* io_units which have been completely
  71                                         * written to the log but not yet written
  72                                         * to the RAID */
  73        struct list_head flushing_ios;  /* io_units which are waiting for log
  74                                         * cache flush */
  75        struct list_head finished_ios;  /* io_units which settle down in log disk */
  76        struct bio flush_bio;
  77
  78        struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
  79
  80        struct kmem_cache *io_kc;
  81        mempool_t *io_pool;
  82        struct bio_set *bs;
  83        mempool_t *meta_pool;
  84
  85        struct md_thread *reclaim_thread;
   86        unsigned long reclaim_target;   /* amount of space that needs to be
   87                                         * reclaimed. if it's 0, reclaim spaces
   88                                         * used by io_units which are in
   89                                         * IO_UNIT_STRIPE_END state (i.e. reclaim
   90                                         * doesn't wait for a specific io_unit
   91                                         * to switch to IO_UNIT_STRIPE_END
   92                                         * state) */
  93        wait_queue_head_t iounit_wait;
  94
  95        struct list_head no_space_stripes; /* pending stripes, log has no space */
  96        spinlock_t no_space_stripes_lock;
  97
  98        bool need_cache_flush;
  99};
 100
 101/*
  102 * an IO range starts at a meta data block and ends at the next meta data
  103 * block. The io unit's meta data block tracks the data/parity that follows it.
  104 * An io unit is written to the log disk with normal writes; as we always flush
  105 * the log disk first and only then start moving data to the raid disks, there
  106 * is no need to write the io unit with FLUSH/FUA
 107 */
 108struct r5l_io_unit {
 109        struct r5l_log *log;
 110
 111        struct page *meta_page; /* store meta block */
 112        int meta_offset;        /* current offset in meta_page */
 113
 114        struct bio *current_bio;/* current_bio accepting new data */
 115
 116        atomic_t pending_stripe;/* how many stripes not flushed to raid */
 117        u64 seq;                /* seq number of the metablock */
 118        sector_t log_start;     /* where the io_unit starts */
 119        sector_t log_end;       /* where the io_unit ends */
 120        struct list_head log_sibling; /* log->running_ios */
 121        struct list_head stripe_list; /* stripes added to the io_unit */
 122
 123        int state;
 124        bool need_split_bio;
 125};
 126
 127/* r5l_io_unit state */
 128enum r5l_io_unit_state {
 129        IO_UNIT_RUNNING = 0,    /* accepting new IO */
  130        IO_UNIT_IO_START = 1,   /* io_unit bio started writing to log,
  131                                 * no longer accepting new bios */
  132        IO_UNIT_IO_END = 2,     /* io_unit bio finished writing to log */
  133        IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to raid */
 134};
 135
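/*
 * The log space is used as a ring buffer of BLOCK_SECTORS-sized blocks in the
 * range [0, log->device_size). The two helpers below do the wrapping
 * arithmetic, e.g. with device_size == 1000 sectors, r5l_ring_add(log, 996, 8)
 * yields 4.
 */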
 136static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
 137{
 138        start += inc;
 139        if (start >= log->device_size)
 140                start = start - log->device_size;
 141        return start;
 142}
 143
 144static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
 145                                  sector_t end)
 146{
 147        if (end >= start)
 148                return end - start;
 149        else
 150                return end + log->device_size - start;
 151}
 152
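/*
 * Check whether 'size' more sectors fit between the log head (log->log_start)
 * and the log tail (log->last_checkpoint) without the head catching up with
 * the tail.
 */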
 153static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
 154{
 155        sector_t used_size;
 156
 157        used_size = r5l_ring_distance(log, log->last_checkpoint,
 158                                        log->log_start);
 159
 160        return log->device_size > used_size + size;
 161}
 162
 163static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 164                                    enum r5l_io_unit_state state)
 165{
 166        if (WARN_ON(io->state >= state))
 167                return;
 168        io->state = state;
 169}
 170
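/*
 * Hand the stripes of an io_unit back to the raid5 state machine once their
 * data/parity is safe in the log, so raid5d can write them to the raid disks.
 */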
 171static void r5l_io_run_stripes(struct r5l_io_unit *io)
 172{
 173        struct stripe_head *sh, *next;
 174
 175        list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
 176                list_del_init(&sh->log_list);
 177                set_bit(STRIPE_HANDLE, &sh->state);
 178                raid5_release_stripe(sh);
 179        }
 180}
 181
 182static void r5l_log_run_stripes(struct r5l_log *log)
 183{
 184        struct r5l_io_unit *io, *next;
 185
 186        assert_spin_locked(&log->io_list_lock);
 187
 188        list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
 189                /* don't change list order */
 190                if (io->state < IO_UNIT_IO_END)
 191                        break;
 192
 193                list_move_tail(&io->log_sibling, &log->finished_ios);
 194                r5l_io_run_stripes(io);
 195        }
 196}
 197
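/*
 * Used when the log device has a volatile write cache (need_cache_flush):
 * io_units that finished writing to the log are parked on io_end_ios until
 * r5l_flush_stripe_to_raid() has flushed the log device cache.
 */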
 198static void r5l_move_to_end_ios(struct r5l_log *log)
 199{
 200        struct r5l_io_unit *io, *next;
 201
 202        assert_spin_locked(&log->io_list_lock);
 203
 204        list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
 205                /* don't change list order */
 206                if (io->state < IO_UNIT_IO_END)
 207                        break;
 208                list_move_tail(&io->log_sibling, &log->io_end_ios);
 209        }
 210}
 211
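/*
 * Completion handler for an io_unit's log write: free the meta page, mark the
 * io_unit IO_UNIT_IO_END, and either queue it for a log cache flush or run its
 * stripes right away, depending on need_cache_flush.
 */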
 212static void r5l_log_endio(struct bio *bio)
 213{
 214        struct r5l_io_unit *io = bio->bi_private;
 215        struct r5l_log *log = io->log;
 216        unsigned long flags;
 217
 218        if (bio->bi_error)
 219                md_error(log->rdev->mddev, log->rdev);
 220
 221        bio_put(bio);
 222        mempool_free(io->meta_page, log->meta_pool);
 223
 224        spin_lock_irqsave(&log->io_list_lock, flags);
 225        __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
 226        if (log->need_cache_flush)
 227                r5l_move_to_end_ios(log);
 228        else
 229                r5l_log_run_stripes(log);
 230        spin_unlock_irqrestore(&log->io_list_lock, flags);
 231
 232        if (log->need_cache_flush)
 233                md_wakeup_thread(log->rdev->mddev->thread);
 234}
 235
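/*
 * Finalize the current io_unit: record the meta block size, checksum the meta
 * page with crc32c seeded by the array uuid checksum, and submit its bio.
 */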
 236static void r5l_submit_current_io(struct r5l_log *log)
 237{
 238        struct r5l_io_unit *io = log->current_io;
 239        struct r5l_meta_block *block;
 240        unsigned long flags;
 241        u32 crc;
 242
 243        if (!io)
 244                return;
 245
 246        block = page_address(io->meta_page);
 247        block->meta_size = cpu_to_le32(io->meta_offset);
 248        crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
 249        block->checksum = cpu_to_le32(crc);
 250
 251        log->current_io = NULL;
 252        spin_lock_irqsave(&log->io_list_lock, flags);
 253        __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
 254        spin_unlock_irqrestore(&log->io_list_lock, flags);
 255
 256        submit_bio(io->current_bio);
 257}
 258
 259static struct bio *r5l_bio_alloc(struct r5l_log *log)
 260{
 261        struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
 262
 263        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 264        bio->bi_bdev = log->rdev->bdev;
 265        bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
 266
 267        return bio;
 268}
 269
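/*
 * Account for one BLOCK_SECTORS-sized block appended at the log head: advance
 * log->log_start (wrapping at the end of the device) and remember that the
 * next page needs a fresh bio if we wrapped.
 */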
 270static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
 271{
 272        log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
 273
 274        /*
  275         * If we filled up the log device, start from the beginning again,
  276         * which will require a new bio.
  277         *
  278         * Note: for this to work properly the log size needs to be a multiple
  279         * of BLOCK_SECTORS.
 280         */
 281        if (log->log_start == 0)
 282                io->need_split_bio = true;
 283
 284        io->log_end = log->log_start;
 285}
 286
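/*
 * Start a new io_unit: allocate it from the mempool (GFP_ATOMIC, may fail),
 * initialize its meta block (magic, version, seq, position), allocate the
 * first bio and add the meta page as its first page.
 */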
 287static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 288{
 289        struct r5l_io_unit *io;
 290        struct r5l_meta_block *block;
 291
 292        io = mempool_alloc(log->io_pool, GFP_ATOMIC);
 293        if (!io)
 294                return NULL;
 295        memset(io, 0, sizeof(*io));
 296
 297        io->log = log;
 298        INIT_LIST_HEAD(&io->log_sibling);
 299        INIT_LIST_HEAD(&io->stripe_list);
 300        io->state = IO_UNIT_RUNNING;
 301
 302        io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
 303        block = page_address(io->meta_page);
 304        clear_page(block);
 305        block->magic = cpu_to_le32(R5LOG_MAGIC);
 306        block->version = R5LOG_VERSION;
 307        block->seq = cpu_to_le64(log->seq);
 308        block->position = cpu_to_le64(log->log_start);
 309
 310        io->log_start = log->log_start;
 311        io->meta_offset = sizeof(struct r5l_meta_block);
 312        io->seq = log->seq++;
 313
 314        io->current_bio = r5l_bio_alloc(log);
 315        io->current_bio->bi_end_io = r5l_log_endio;
 316        io->current_bio->bi_private = io;
 317        bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
 318
 319        r5_reserve_log_entry(log, io);
 320
 321        spin_lock_irq(&log->io_list_lock);
 322        list_add_tail(&io->log_sibling, &log->running_ios);
 323        spin_unlock_irq(&log->io_list_lock);
 324
 325        return io;
 326}
 327
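/*
 * Make sure log->current_io has room for payload_size more bytes of metadata
 * in its meta page; otherwise submit it and start a new io_unit.
 */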
 328static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
 329{
 330        if (log->current_io &&
 331            log->current_io->meta_offset + payload_size > PAGE_SIZE)
 332                r5l_submit_current_io(log);
 333
 334        if (!log->current_io) {
 335                log->current_io = r5l_new_meta(log);
 336                if (!log->current_io)
 337                        return -ENOMEM;
 338        }
 339
 340        return 0;
 341}
 342
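/*
 * Append a payload descriptor (type, location, size and per-page checksums) to
 * the current io_unit's meta page. A second checksum is only present for the Q
 * parity page of a raid6 parity payload.
 */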
 343static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
 344                                    sector_t location,
 345                                    u32 checksum1, u32 checksum2,
 346                                    bool checksum2_valid)
 347{
 348        struct r5l_io_unit *io = log->current_io;
 349        struct r5l_payload_data_parity *payload;
 350
 351        payload = page_address(io->meta_page) + io->meta_offset;
 352        payload->header.type = cpu_to_le16(type);
 353        payload->header.flags = cpu_to_le16(0);
 354        payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
 355                                    (PAGE_SHIFT - 9));
 356        payload->location = cpu_to_le64(location);
 357        payload->checksum[0] = cpu_to_le32(checksum1);
 358        if (checksum2_valid)
 359                payload->checksum[1] = cpu_to_le32(checksum2);
 360
 361        io->meta_offset += sizeof(struct r5l_payload_data_parity) +
 362                sizeof(__le32) * (1 + !!checksum2_valid);
 363}
 364
 365static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 366{
 367        struct r5l_io_unit *io = log->current_io;
 368
 369        if (io->need_split_bio) {
 370                struct bio *prev = io->current_bio;
 371
 372                io->current_bio = r5l_bio_alloc(log);
 373                bio_chain(io->current_bio, prev);
 374
 375                submit_bio(prev);
 376        }
 377
 378        if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
 379                BUG();
 380
 381        r5_reserve_log_entry(log, io);
 382}
 383
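/*
 * Append one stripe to the current io_unit: its data pages first, then its
 * parity page(s), each described by a payload entry in the io_unit's meta
 * block.
 */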
 384static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 385                           int data_pages, int parity_pages)
 386{
 387        int i;
 388        int meta_size;
 389        int ret;
 390        struct r5l_io_unit *io;
 391
 392        meta_size =
 393                ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
 394                 * data_pages) +
 395                sizeof(struct r5l_payload_data_parity) +
 396                sizeof(__le32) * parity_pages;
 397
 398        ret = r5l_get_meta(log, meta_size);
 399        if (ret)
 400                return ret;
 401
 402        io = log->current_io;
 403
 404        for (i = 0; i < sh->disks; i++) {
 405                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
 406                        continue;
 407                if (i == sh->pd_idx || i == sh->qd_idx)
 408                        continue;
 409                r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
 410                                        raid5_compute_blocknr(sh, i, 0),
 411                                        sh->dev[i].log_checksum, 0, false);
 412                r5l_append_payload_page(log, sh->dev[i].page);
 413        }
 414
 415        if (sh->qd_idx >= 0) {
 416                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 417                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
 418                                        sh->dev[sh->qd_idx].log_checksum, true);
 419                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
 420                r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
 421        } else {
 422                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 423                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
 424                                        0, false);
 425                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
 426        }
 427
 428        list_add_tail(&sh->log_list, &io->stripe_list);
 429        atomic_inc(&io->pending_stripe);
 430        sh->log_io = io;
 431
 432        return 0;
 433}
 434
 435static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
 436/*
  437 * this runs in raid5d, and reclaim could in turn wait for raid5d (when it
  438 * flushes data from the log to the raid disks), so we shouldn't wait for reclaim here
 439 */
 440int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 441{
 442        int write_disks = 0;
 443        int data_pages, parity_pages;
 444        int meta_size;
 445        int reserve;
 446        int i;
 447        int ret = 0;
 448
 449        if (!log)
 450                return -EAGAIN;
 451        /* Don't support stripe batch */
 452        if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
 453            test_bit(STRIPE_SYNCING, &sh->state)) {
 454                /* the stripe is written to log, we start writing it to raid */
 455                clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
 456                return -EAGAIN;
 457        }
 458
 459        for (i = 0; i < sh->disks; i++) {
 460                void *addr;
 461
 462                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
 463                        continue;
 464                write_disks++;
 465                /* checksum is already calculated in last run */
 466                if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
 467                        continue;
 468                addr = kmap_atomic(sh->dev[i].page);
 469                sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
 470                                                    addr, PAGE_SIZE);
 471                kunmap_atomic(addr);
 472        }
 473        parity_pages = 1 + !!(sh->qd_idx >= 0);
 474        data_pages = write_disks - parity_pages;
 475
 476        meta_size =
 477                ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
 478                 * data_pages) +
 479                sizeof(struct r5l_payload_data_parity) +
 480                sizeof(__le32) * parity_pages;
 481        /* Doesn't work with very big raid array */
 482        if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
 483                return -EINVAL;
 484
 485        set_bit(STRIPE_LOG_TRAPPED, &sh->state);
 486        /*
  487         * The stripe must enter the state machine again to finish the write, so
 488         * don't delay.
 489         */
 490        clear_bit(STRIPE_DELAYED, &sh->state);
 491        atomic_inc(&sh->count);
 492
 493        mutex_lock(&log->io_mutex);
 494        /* meta + data */
 495        reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
 496        if (!r5l_has_free_space(log, reserve)) {
 497                spin_lock(&log->no_space_stripes_lock);
 498                list_add_tail(&sh->log_list, &log->no_space_stripes);
 499                spin_unlock(&log->no_space_stripes_lock);
 500
 501                r5l_wake_reclaim(log, reserve);
 502        } else {
 503                ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
 504                if (ret) {
 505                        spin_lock_irq(&log->io_list_lock);
 506                        list_add_tail(&sh->log_list, &log->no_mem_stripes);
 507                        spin_unlock_irq(&log->io_list_lock);
 508                }
 509        }
 510
 511        mutex_unlock(&log->io_mutex);
 512        return 0;
 513}
 514
 515void r5l_write_stripe_run(struct r5l_log *log)
 516{
 517        if (!log)
 518                return;
 519        mutex_lock(&log->io_mutex);
 520        r5l_submit_current_io(log);
 521        mutex_unlock(&log->io_mutex);
 522}
 523
 524int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 525{
 526        if (!log)
 527                return -ENODEV;
 528        /*
  529         * we flush the log disk cache first, then write stripe data to raid disks.
  530         * So if the bio has finished, the log disk cache is flushed already. Recovery
  531         * guarantees we can recover the bio from the log disk, so we
  532         * don't need to flush again
 533         */
 534        if (bio->bi_iter.bi_size == 0) {
 535                bio_endio(bio);
 536                return 0;
 537        }
 538        bio->bi_opf &= ~REQ_PREFLUSH;
 539        return -EAGAIN;
 540}
 541
 542/* This will run after log space is reclaimed */
 543static void r5l_run_no_space_stripes(struct r5l_log *log)
 544{
 545        struct stripe_head *sh;
 546
 547        spin_lock(&log->no_space_stripes_lock);
 548        while (!list_empty(&log->no_space_stripes)) {
 549                sh = list_first_entry(&log->no_space_stripes,
 550                                      struct stripe_head, log_list);
 551                list_del_init(&sh->log_list);
 552                set_bit(STRIPE_HANDLE, &sh->state);
 553                raid5_release_stripe(sh);
 554        }
 555        spin_unlock(&log->no_space_stripes_lock);
 556}
 557
 558static sector_t r5l_reclaimable_space(struct r5l_log *log)
 559{
 560        return r5l_ring_distance(log, log->last_checkpoint,
 561                                 log->next_checkpoint);
 562}
 563
 564static void r5l_run_no_mem_stripe(struct r5l_log *log)
 565{
 566        struct stripe_head *sh;
 567
 568        assert_spin_locked(&log->io_list_lock);
 569
 570        if (!list_empty(&log->no_mem_stripes)) {
 571                sh = list_first_entry(&log->no_mem_stripes,
 572                                      struct stripe_head, log_list);
 573                list_del_init(&sh->log_list);
 574                set_bit(STRIPE_HANDLE, &sh->state);
 575                raid5_release_stripe(sh);
 576        }
 577}
 578
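/*
 * Free io_units whose stripes have all reached the raid disks and advance
 * next_checkpoint/next_cp_seq past them, in list order. Each freed io_unit
 * lets one stripe that previously failed with -ENOMEM retry. Returns true if
 * at least one io_unit was completed.
 */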
 579static bool r5l_complete_finished_ios(struct r5l_log *log)
 580{
 581        struct r5l_io_unit *io, *next;
 582        bool found = false;
 583
 584        assert_spin_locked(&log->io_list_lock);
 585
 586        list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
 587                /* don't change list order */
 588                if (io->state < IO_UNIT_STRIPE_END)
 589                        break;
 590
 591                log->next_checkpoint = io->log_start;
 592                log->next_cp_seq = io->seq;
 593
 594                list_del(&io->log_sibling);
 595                mempool_free(io, log->io_pool);
 596                r5l_run_no_mem_stripe(log);
 597
 598                found = true;
 599        }
 600
 601        return found;
 602}
 603
 604static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
 605{
 606        struct r5l_log *log = io->log;
 607        unsigned long flags;
 608
 609        spin_lock_irqsave(&log->io_list_lock, flags);
 610        __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
 611
 612        if (!r5l_complete_finished_ios(log)) {
 613                spin_unlock_irqrestore(&log->io_list_lock, flags);
 614                return;
 615        }
 616
 617        if (r5l_reclaimable_space(log) > log->max_free_space)
 618                r5l_wake_reclaim(log, 0);
 619
 620        spin_unlock_irqrestore(&log->io_list_lock, flags);
 621        wake_up(&log->iounit_wait);
 622}
 623
 624void r5l_stripe_write_finished(struct stripe_head *sh)
 625{
 626        struct r5l_io_unit *io;
 627
 628        io = sh->log_io;
 629        sh->log_io = NULL;
 630
 631        if (io && atomic_dec_and_test(&io->pending_stripe))
 632                __r5l_stripe_write_finished(io);
 633}
 634
 635static void r5l_log_flush_endio(struct bio *bio)
 636{
 637        struct r5l_log *log = container_of(bio, struct r5l_log,
 638                flush_bio);
 639        unsigned long flags;
 640        struct r5l_io_unit *io;
 641
 642        if (bio->bi_error)
 643                md_error(log->rdev->mddev, log->rdev);
 644
 645        spin_lock_irqsave(&log->io_list_lock, flags);
 646        list_for_each_entry(io, &log->flushing_ios, log_sibling)
 647                r5l_io_run_stripes(io);
 648        list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
 649        spin_unlock_irqrestore(&log->io_list_lock, flags);
 650}
 651
  652/*
  653 * Start dispatching IO to the raid disks.
  654 * The log consists of io_units (each headed by a meta block). One situation we
  655 * want to avoid: a broken meta block in the middle of the log keeps recovery
  656 * from finding the meta blocks at the head of the log. So if an operation needs
  657 * the meta block at the head to be persistent in the log, every meta block
  658 * before it must be persistent in the log too. A case is:
  659 *
  660 * stripe data/parity is in the log and we start writing the stripe to the raid
  661 * disks; the data/parity must be persistent in the log before we do that write.
  662 *
  663 * The solution is to strictly maintain io_unit list order: stripes go to the
  664 * raid disks only once their io_unit and all earlier io_units are in the log.
  665 */
 666void r5l_flush_stripe_to_raid(struct r5l_log *log)
 667{
 668        bool do_flush;
 669
 670        if (!log || !log->need_cache_flush)
 671                return;
 672
 673        spin_lock_irq(&log->io_list_lock);
 674        /* flush bio is running */
 675        if (!list_empty(&log->flushing_ios)) {
 676                spin_unlock_irq(&log->io_list_lock);
 677                return;
 678        }
 679        list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
 680        do_flush = !list_empty(&log->flushing_ios);
 681        spin_unlock_irq(&log->io_list_lock);
 682
 683        if (!do_flush)
 684                return;
 685        bio_reset(&log->flush_bio);
 686        log->flush_bio.bi_bdev = log->rdev->bdev;
 687        log->flush_bio.bi_end_io = r5l_log_flush_endio;
 688        bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
 689        submit_bio(&log->flush_bio);
 690}
 691
 692static void r5l_write_super(struct r5l_log *log, sector_t cp);
 693static void r5l_write_super_and_discard_space(struct r5l_log *log,
 694        sector_t end)
 695{
 696        struct block_device *bdev = log->rdev->bdev;
 697        struct mddev *mddev;
 698
 699        r5l_write_super(log, end);
 700
 701        if (!blk_queue_discard(bdev_get_queue(bdev)))
 702                return;
 703
 704        mddev = log->rdev->mddev;
 705        /*
  706         * Discard could zero data, so before discard we must make sure the
  707         * superblock is updated to the new log tail. Updating the superblock
  708         * (either by calling md_update_sb() directly or relying on the md thread)
  709         * must hold the reconfig mutex. On the other hand, raid5_quiesce is called
  710         * with reconfig_mutex held. The first step of raid5_quiesce() is waiting
  711         * for all IO to finish, hence waiting for the reclaim thread, while the
  712         * reclaim thread is calling this function and waiting for the reconfig
  713         * mutex. So there is a deadlock. We work around this issue with a trylock.
  714         * FIXME: we could miss a discard if we can't take the reconfig mutex
 715         */
 716        set_mask_bits(&mddev->flags, 0,
 717                BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 718        if (!mddev_trylock(mddev))
 719                return;
 720        md_update_sb(mddev, 1);
 721        mddev_unlock(mddev);
 722
 723        /* discard IO error really doesn't matter, ignore it */
 724        if (log->last_checkpoint < end) {
 725                blkdev_issue_discard(bdev,
 726                                log->last_checkpoint + log->rdev->data_offset,
 727                                end - log->last_checkpoint, GFP_NOIO, 0);
 728        } else {
 729                blkdev_issue_discard(bdev,
 730                                log->last_checkpoint + log->rdev->data_offset,
 731                                log->device_size - log->last_checkpoint,
 732                                GFP_NOIO, 0);
 733                blkdev_issue_discard(bdev, log->rdev->data_offset, end,
 734                                GFP_NOIO, 0);
 735        }
 736}
 737
 738
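/*
 * Reclaim log space up to the current reclaim target: wait until enough
 * io_units have reached IO_UNIT_STRIPE_END, write the superblock (and discard
 * the freed range if the log device supports it), then move the log tail
 * forward and retry stripes that were waiting for space.
 */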
 739static void r5l_do_reclaim(struct r5l_log *log)
 740{
 741        sector_t reclaim_target = xchg(&log->reclaim_target, 0);
 742        sector_t reclaimable;
 743        sector_t next_checkpoint;
 744        u64 next_cp_seq;
 745
 746        spin_lock_irq(&log->io_list_lock);
 747        /*
  748         * move the proper io_units to the reclaim list. We must not change the
  749         * order: reclaimable and unreclaimable io_units can be mixed in the list,
  750         * and we must not reuse the space of an unreclaimable io_unit
 751         */
 752        while (1) {
 753                reclaimable = r5l_reclaimable_space(log);
 754                if (reclaimable >= reclaim_target ||
 755                    (list_empty(&log->running_ios) &&
 756                     list_empty(&log->io_end_ios) &&
 757                     list_empty(&log->flushing_ios) &&
 758                     list_empty(&log->finished_ios)))
 759                        break;
 760
 761                md_wakeup_thread(log->rdev->mddev->thread);
 762                wait_event_lock_irq(log->iounit_wait,
 763                                    r5l_reclaimable_space(log) > reclaimable,
 764                                    log->io_list_lock);
 765        }
 766
 767        next_checkpoint = log->next_checkpoint;
 768        next_cp_seq = log->next_cp_seq;
 769        spin_unlock_irq(&log->io_list_lock);
 770
 771        BUG_ON(reclaimable < 0);
 772        if (reclaimable == 0)
 773                return;
 774
 775        /*
 776         * write_super will flush cache of each raid disk. We must write super
 777         * here, because the log area might be reused soon and we don't want to
 778         * confuse recovery
 779         */
 780        r5l_write_super_and_discard_space(log, next_checkpoint);
 781
 782        mutex_lock(&log->io_mutex);
 783        log->last_checkpoint = next_checkpoint;
 784        log->last_cp_seq = next_cp_seq;
 785        mutex_unlock(&log->io_mutex);
 786
 787        r5l_run_no_space_stripes(log);
 788}
 789
 790static void r5l_reclaim_thread(struct md_thread *thread)
 791{
 792        struct mddev *mddev = thread->mddev;
 793        struct r5conf *conf = mddev->private;
 794        struct r5l_log *log = conf->log;
 795
 796        if (!log)
 797                return;
 798        r5l_do_reclaim(log);
 799}
 800
 801static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 802{
 803        unsigned long target;
 804        unsigned long new = (unsigned long)space; /* overflow in theory */
 805
 806        do {
 807                target = log->reclaim_target;
 808                if (new < target)
 809                        return;
 810        } while (cmpxchg(&log->reclaim_target, target, new) != target);
 811        md_wakeup_thread(log->reclaim_thread);
 812}
 813
 814void r5l_quiesce(struct r5l_log *log, int state)
 815{
 816        struct mddev *mddev;
 817        if (!log || state == 2)
 818                return;
 819        if (state == 0) {
 820                /*
  821                 * This is a special case for hotadd. During suspend, the array
  822                 * has no journal. On resume, the journal is initialized along
  823                 * with the reclaim thread.
 824                 */
 825                if (log->reclaim_thread)
 826                        return;
 827                log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 828                                        log->rdev->mddev, "reclaim");
 829        } else if (state == 1) {
 830                /* make sure r5l_write_super_and_discard_space exits */
 831                mddev = log->rdev->mddev;
 832                wake_up(&mddev->sb_wait);
 833                r5l_wake_reclaim(log, -1L);
 834                md_unregister_thread(&log->reclaim_thread);
 835                r5l_do_reclaim(log);
 836        }
 837}
 838
 839bool r5l_log_disk_error(struct r5conf *conf)
 840{
 841        struct r5l_log *log;
 842        bool ret;
 843        /* don't allow write if journal disk is missing */
 844        rcu_read_lock();
 845        log = rcu_dereference(conf->log);
 846
 847        if (!log)
 848                ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 849        else
 850                ret = test_bit(Faulty, &log->rdev->flags);
 851        rcu_read_unlock();
 852        return ret;
 853}
 854
 855struct r5l_recovery_ctx {
 856        struct page *meta_page;         /* current meta */
 857        sector_t meta_total_blocks;     /* total size of current meta and data */
 858        sector_t pos;                   /* recovery position */
 859        u64 seq;                        /* recovery position seq */
 860};
 861
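/*
 * Read the meta block at ctx->pos and validate its magic, version, sequence
 * number, position and crc32c checksum before recovery trusts it.
 */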
 862static int r5l_read_meta_block(struct r5l_log *log,
 863                               struct r5l_recovery_ctx *ctx)
 864{
 865        struct page *page = ctx->meta_page;
 866        struct r5l_meta_block *mb;
 867        u32 crc, stored_crc;
 868
 869        if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
 870                          false))
 871                return -EIO;
 872
 873        mb = page_address(page);
 874        stored_crc = le32_to_cpu(mb->checksum);
 875        mb->checksum = 0;
 876
 877        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
 878            le64_to_cpu(mb->seq) != ctx->seq ||
 879            mb->version != R5LOG_VERSION ||
 880            le64_to_cpu(mb->position) != ctx->pos)
 881                return -EINVAL;
 882
 883        crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
 884        if (stored_crc != crc)
 885                return -EINVAL;
 886
 887        if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
 888                return -EINVAL;
 889
 890        ctx->meta_total_blocks = BLOCK_SECTORS;
 891
 892        return 0;
 893}
 894
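/*
 * Replay one stripe from the log: read its data and parity pages back from the
 * log, verify their checksums, and write them directly to the raid disks (and
 * to any replacement devices).
 */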
 895static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
 896                                         struct r5l_recovery_ctx *ctx,
 897                                         sector_t stripe_sect,
 898                                         int *offset, sector_t *log_offset)
 899{
 900        struct r5conf *conf = log->rdev->mddev->private;
 901        struct stripe_head *sh;
 902        struct r5l_payload_data_parity *payload;
 903        int disk_index;
 904
 905        sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
 906        while (1) {
 907                payload = page_address(ctx->meta_page) + *offset;
 908
 909                if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
 910                        raid5_compute_sector(conf,
 911                                             le64_to_cpu(payload->location), 0,
 912                                             &disk_index, sh);
 913
 914                        sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
 915                                     sh->dev[disk_index].page, REQ_OP_READ, 0,
 916                                     false);
 917                        sh->dev[disk_index].log_checksum =
 918                                le32_to_cpu(payload->checksum[0]);
 919                        set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
 920                        ctx->meta_total_blocks += BLOCK_SECTORS;
 921                } else {
 922                        disk_index = sh->pd_idx;
 923                        sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
 924                                     sh->dev[disk_index].page, REQ_OP_READ, 0,
 925                                     false);
 926                        sh->dev[disk_index].log_checksum =
 927                                le32_to_cpu(payload->checksum[0]);
 928                        set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
 929
 930                        if (sh->qd_idx >= 0) {
 931                                disk_index = sh->qd_idx;
 932                                sync_page_io(log->rdev,
 933                                             r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
 934                                             PAGE_SIZE, sh->dev[disk_index].page,
 935                                             REQ_OP_READ, 0, false);
 936                                sh->dev[disk_index].log_checksum =
 937                                        le32_to_cpu(payload->checksum[1]);
 938                                set_bit(R5_Wantwrite,
 939                                        &sh->dev[disk_index].flags);
 940                        }
 941                        ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
 942                }
 943
 944                *log_offset = r5l_ring_add(log, *log_offset,
 945                                           le32_to_cpu(payload->size));
 946                *offset += sizeof(struct r5l_payload_data_parity) +
 947                        sizeof(__le32) *
 948                        (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
 949                if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
 950                        break;
 951        }
 952
 953        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
 954                void *addr;
 955                u32 checksum;
 956
 957                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
 958                        continue;
 959                addr = kmap_atomic(sh->dev[disk_index].page);
 960                checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
 961                kunmap_atomic(addr);
 962                if (checksum != sh->dev[disk_index].log_checksum)
 963                        goto error;
 964        }
 965
 966        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
 967                struct md_rdev *rdev, *rrdev;
 968
 969                if (!test_and_clear_bit(R5_Wantwrite,
 970                                        &sh->dev[disk_index].flags))
 971                        continue;
 972
 973                /* in case device is broken */
 974                rdev = rcu_dereference(conf->disks[disk_index].rdev);
 975                if (rdev)
 976                        sync_page_io(rdev, stripe_sect, PAGE_SIZE,
 977                                     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
 978                                     false);
 979                rrdev = rcu_dereference(conf->disks[disk_index].replacement);
 980                if (rrdev)
 981                        sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
 982                                     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
 983                                     false);
 984        }
 985        raid5_release_stripe(sh);
 986        return 0;
 987
 988error:
 989        for (disk_index = 0; disk_index < sh->disks; disk_index++)
 990                sh->dev[disk_index].flags = 0;
 991        raid5_release_stripe(sh);
 992        return -EINVAL;
 993}
 994
 995static int r5l_recovery_flush_one_meta(struct r5l_log *log,
 996                                       struct r5l_recovery_ctx *ctx)
 997{
 998        struct r5conf *conf = log->rdev->mddev->private;
 999        struct r5l_payload_data_parity *payload;
1000        struct r5l_meta_block *mb;
1001        int offset;
1002        sector_t log_offset;
1003        sector_t stripe_sector;
1004
1005        mb = page_address(ctx->meta_page);
1006        offset = sizeof(struct r5l_meta_block);
1007        log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1008
1009        while (offset < le32_to_cpu(mb->meta_size)) {
1010                int dd;
1011
1012                payload = (void *)mb + offset;
1013                stripe_sector = raid5_compute_sector(conf,
1014                                                     le64_to_cpu(payload->location), 0, &dd, NULL);
1015                if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
1016                                                  &offset, &log_offset))
1017                        return -EINVAL;
1018        }
1019        return 0;
1020}
1021
1022/* copy data/parity from log to raid disks */
1023static void r5l_recovery_flush_log(struct r5l_log *log,
1024                                   struct r5l_recovery_ctx *ctx)
1025{
1026        while (1) {
1027                if (r5l_read_meta_block(log, ctx))
1028                        return;
1029                if (r5l_recovery_flush_one_meta(log, ctx))
1030                        return;
1031                ctx->seq++;
1032                ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1033        }
1034}
1035
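/*
 * Write an empty meta block (header only, no payloads) at 'pos' with FUA.
 * Recovery uses this to re-seed the log head with a known-valid meta block.
 */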
1036static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1037                                          u64 seq)
1038{
1039        struct page *page;
1040        struct r5l_meta_block *mb;
1041        u32 crc;
1042
1043        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1044        if (!page)
1045                return -ENOMEM;
1046        mb = page_address(page);
1047        mb->magic = cpu_to_le32(R5LOG_MAGIC);
1048        mb->version = R5LOG_VERSION;
1049        mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1050        mb->seq = cpu_to_le64(seq);
1051        mb->position = cpu_to_le64(pos);
1052        crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1053        mb->checksum = cpu_to_le32(crc);
1054
1055        if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1056                          WRITE_FUA, false)) {
1057                __free_page(page);
1058                return -EIO;
1059        }
1060        __free_page(page);
1061        return 0;
1062}
1063
1064static int r5l_recovery_log(struct r5l_log *log)
1065{
1066        struct r5l_recovery_ctx ctx;
1067
1068        ctx.pos = log->last_checkpoint;
1069        ctx.seq = log->last_cp_seq;
1070        ctx.meta_page = alloc_page(GFP_KERNEL);
1071        if (!ctx.meta_page)
1072                return -ENOMEM;
1073
1074        r5l_recovery_flush_log(log, &ctx);
1075        __free_page(ctx.meta_page);
1076
1077        /*
 1078         * we did a recovery. Now ctx.pos points to an invalid meta block. The new
 1079         * log will start here, but we can't let the superblock point to the last
 1080         * valid meta block. The log might look like:
 1081         * | meta 1| meta 2| meta 3|
 1082         * meta 1 is valid, meta 2 is invalid and meta 3 could be valid. If the
 1083         * superblock points to meta 1 and we write a new valid meta 2n, then after
 1084         * another crash the new recovery will start from meta 1. Since meta 2n is
 1085         * valid now, recovery will think meta 3 is valid too, which is wrong.
 1086         * The solution is to create a new meta block at meta 2's position with seq
 1087         * == meta 1's seq + 10 and let the superblock point to it. That recovery
 1088         * will then not treat meta 3 as valid, because its seq doesn't match
1089         */
1090        if (ctx.seq > log->last_cp_seq + 1) {
1091                int ret;
1092
1093                ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1094                if (ret)
1095                        return ret;
1096                log->seq = ctx.seq + 11;
1097                log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1098                r5l_write_super(log, ctx.pos);
1099        } else {
1100                log->log_start = ctx.pos;
1101                log->seq = ctx.seq;
1102        }
1103        return 0;
1104}
1105
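/*
 * Record the new log tail in the rdev's journal_tail and mark the superblock
 * as changed; the superblock itself is written out later by md.
 */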
1106static void r5l_write_super(struct r5l_log *log, sector_t cp)
1107{
1108        struct mddev *mddev = log->rdev->mddev;
1109
1110        log->rdev->journal_tail = cp;
1111        set_bit(MD_CHANGE_DEVS, &mddev->flags);
1112}
1113
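/*
 * Load the log during array setup: read the meta block at journal_tail and, if
 * it is missing or invalid, start a fresh log (random seq, position 0);
 * otherwise pick up last_cp_seq from it. Then set the reclaim threshold and
 * run recovery from the checkpoint.
 */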
1114static int r5l_load_log(struct r5l_log *log)
1115{
1116        struct md_rdev *rdev = log->rdev;
1117        struct page *page;
1118        struct r5l_meta_block *mb;
1119        sector_t cp = log->rdev->journal_tail;
1120        u32 stored_crc, expected_crc;
1121        bool create_super = false;
1122        int ret;
1123
1124        /* Make sure it's valid */
1125        if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1126                cp = 0;
1127        page = alloc_page(GFP_KERNEL);
1128        if (!page)
1129                return -ENOMEM;
1130
1131        if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
1132                ret = -EIO;
1133                goto ioerr;
1134        }
1135        mb = page_address(page);
1136
1137        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1138            mb->version != R5LOG_VERSION) {
1139                create_super = true;
1140                goto create;
1141        }
1142        stored_crc = le32_to_cpu(mb->checksum);
1143        mb->checksum = 0;
1144        expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1145        if (stored_crc != expected_crc) {
1146                create_super = true;
1147                goto create;
1148        }
1149        if (le64_to_cpu(mb->position) != cp) {
1150                create_super = true;
1151                goto create;
1152        }
1153create:
1154        if (create_super) {
1155                log->last_cp_seq = prandom_u32();
1156                cp = 0;
1157                /*
 1158                 * Make sure the super points to the correct address. The log
 1159                 * might receive data very soon. If the super doesn't have the
 1160                 * correct log tail address, recovery can't find the log
1161                 */
1162                r5l_write_super(log, cp);
1163        } else
1164                log->last_cp_seq = le64_to_cpu(mb->seq);
1165
1166        log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1167        log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1168        if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1169                log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1170        log->last_checkpoint = cp;
1171
1172        __free_page(page);
1173
1174        return r5l_recovery_log(log);
1175ioerr:
1176        __free_page(page);
1177        return ret;
1178}
1179
1180int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1181{
1182        struct request_queue *q = bdev_get_queue(rdev->bdev);
1183        struct r5l_log *log;
1184
1185        if (PAGE_SIZE != 4096)
1186                return -EINVAL;
1187        log = kzalloc(sizeof(*log), GFP_KERNEL);
1188        if (!log)
1189                return -ENOMEM;
1190        log->rdev = rdev;
1191
1192        log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
1193
1194        log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1195                                       sizeof(rdev->mddev->uuid));
1196
1197        mutex_init(&log->io_mutex);
1198
1199        spin_lock_init(&log->io_list_lock);
1200        INIT_LIST_HEAD(&log->running_ios);
1201        INIT_LIST_HEAD(&log->io_end_ios);
1202        INIT_LIST_HEAD(&log->flushing_ios);
1203        INIT_LIST_HEAD(&log->finished_ios);
1204        bio_init(&log->flush_bio);
1205
1206        log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1207        if (!log->io_kc)
1208                goto io_kc;
1209
1210        log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
1211        if (!log->io_pool)
1212                goto io_pool;
1213
1214        log->bs = bioset_create(R5L_POOL_SIZE, 0);
1215        if (!log->bs)
1216                goto io_bs;
1217
1218        log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
1219        if (!log->meta_pool)
1220                goto out_mempool;
1221
1222        log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1223                                                 log->rdev->mddev, "reclaim");
1224        if (!log->reclaim_thread)
1225                goto reclaim_thread;
1226        init_waitqueue_head(&log->iounit_wait);
1227
1228        INIT_LIST_HEAD(&log->no_mem_stripes);
1229
1230        INIT_LIST_HEAD(&log->no_space_stripes);
1231        spin_lock_init(&log->no_space_stripes_lock);
1232
1233        if (r5l_load_log(log))
1234                goto error;
1235
1236        rcu_assign_pointer(conf->log, log);
1237        set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1238        return 0;
1239
1240error:
1241        md_unregister_thread(&log->reclaim_thread);
1242reclaim_thread:
1243        mempool_destroy(log->meta_pool);
1244out_mempool:
1245        bioset_free(log->bs);
1246io_bs:
1247        mempool_destroy(log->io_pool);
1248io_pool:
1249        kmem_cache_destroy(log->io_kc);
1250io_kc:
1251        kfree(log);
1252        return -EINVAL;
1253}
1254
1255void r5l_exit_log(struct r5l_log *log)
1256{
1257        md_unregister_thread(&log->reclaim_thread);
1258        mempool_destroy(log->meta_pool);
1259        bioset_free(log->bs);
1260        mempool_destroy(log->io_pool);
1261        kmem_cache_destroy(log->io_kc);
1262        kfree(log);
1263}
1264