linux/drivers/md/md.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3   md.c : Multiple Devices driver for Linux
   4     Copyright (C) 1998, 1999, 2000 Ingo Molnar
   5
   6     completely rewritten, based on the MD driver code from Marc Zyngier
   7
   8   Changes:
   9
  10   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
  11   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
  12   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
  13   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
  14   - kmod support by: Cyrus Durgin
  15   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
  16   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
  17
  18   - lots of fixes and improvements to the RAID1/RAID5 and generic
  19     RAID code (such as request based resynchronization):
  20
  21     Neil Brown <neilb@cse.unsw.edu.au>.
  22
  23   - persistent bitmap code
  24     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
  25
  26
  27   Errors, Warnings, etc.
  28   Please use:
  29     pr_crit() for error conditions that risk data loss
  30     pr_err() for error conditions that are unexpected, like an IO error
  31         or internal inconsistency
  32     pr_warn() for error conditions that could have been predicated, like
  33         adding a device to an array when it has incompatible metadata
  34     pr_info() for every interesting, very rare events, like an array starting
  35         or stopping, or resync starting or stopping
  36     pr_debug() for everything else.
  37
  38*/
  39
  40#include <linux/sched/mm.h>
  41#include <linux/sched/signal.h>
  42#include <linux/kthread.h>
  43#include <linux/blkdev.h>
  44#include <linux/badblocks.h>
  45#include <linux/sysctl.h>
  46#include <linux/seq_file.h>
  47#include <linux/fs.h>
  48#include <linux/poll.h>
  49#include <linux/ctype.h>
  50#include <linux/string.h>
  51#include <linux/hdreg.h>
  52#include <linux/proc_fs.h>
  53#include <linux/random.h>
  54#include <linux/module.h>
  55#include <linux/reboot.h>
  56#include <linux/file.h>
  57#include <linux/compat.h>
  58#include <linux/delay.h>
  59#include <linux/raid/md_p.h>
  60#include <linux/raid/md_u.h>
  61#include <linux/slab.h>
  62#include <linux/percpu-refcount.h>
  63
  64#include <trace/events/block.h>
  65#include "md.h"
  66#include "md-bitmap.h"
  67#include "md-cluster.h"
  68
  69#ifndef MODULE
  70static void autostart_arrays(int part);
  71#endif
  72
  73/* pers_list is a list of registered personalities protected
  74 * by pers_lock.
  75 * pers_lock does extra service to protect accesses to
  76 * mddev->thread when the mutex cannot be held.
  77 */
  78static LIST_HEAD(pers_list);
  79static DEFINE_SPINLOCK(pers_lock);
  80
  81static struct kobj_type md_ktype;
  82
  83struct md_cluster_operations *md_cluster_ops;
  84EXPORT_SYMBOL(md_cluster_ops);
  85static struct module *md_cluster_mod;
  86
  87static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
  88static struct workqueue_struct *md_wq;
  89static struct workqueue_struct *md_misc_wq;
  90
  91static int remove_and_add_spares(struct mddev *mddev,
  92                                 struct md_rdev *this);
  93static void mddev_detach(struct mddev *mddev);
  94
  95/*
  96 * Default number of read corrections we'll attempt on an rdev
  97 * before ejecting it from the array. We divide the read error
  98 * count by 2 for every hour elapsed between read errors.
  99 */
 100#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
 101/*
 102 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 103 * is 1000 KB/sec, so the extra system load does not show up that much.
 104 * Increase it if you want to have more _guaranteed_ speed. Note that
 105 * the RAID driver will use the maximum available bandwidth if the IO
 106 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 107 * speed limit - in case reconstruction slows down your system despite
 108 * idle IO detection.
 109 *
 110 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 111 * or /sys/block/mdX/md/sync_speed_{min,max}
 112 */
 113
 114static int sysctl_speed_limit_min = 1000;
 115static int sysctl_speed_limit_max = 200000;
 116static inline int speed_min(struct mddev *mddev)
 117{
 118        return mddev->sync_speed_min ?
 119                mddev->sync_speed_min : sysctl_speed_limit_min;
 120}
 121
 122static inline int speed_max(struct mddev *mddev)
 123{
 124        return mddev->sync_speed_max ?
 125                mddev->sync_speed_max : sysctl_speed_limit_max;
 126}
 127
 128static int rdev_init_wb(struct md_rdev *rdev)
 129{
 130        if (rdev->bdev->bd_queue->nr_hw_queues == 1)
 131                return 0;
 132
 133        spin_lock_init(&rdev->wb_list_lock);
 134        INIT_LIST_HEAD(&rdev->wb_list);
 135        init_waitqueue_head(&rdev->wb_io_wait);
 136        set_bit(WBCollisionCheck, &rdev->flags);
 137
 138        return 1;
 139}
 140
 141/*
 142 * Create wb_info_pool if rdev is the first multi-queue device flaged
 143 * with writemostly, also write-behind mode is enabled.
 144 */
 145void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
 146                          bool is_suspend)
 147{
 148        if (mddev->bitmap_info.max_write_behind == 0)
 149                return;
 150
 151        if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
 152                return;
 153
 154        if (mddev->wb_info_pool == NULL) {
 155                unsigned int noio_flag;
 156
 157                if (!is_suspend)
 158                        mddev_suspend(mddev);
 159                noio_flag = memalloc_noio_save();
 160                mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
 161                                                        sizeof(struct wb_info));
 162                memalloc_noio_restore(noio_flag);
 163                if (!mddev->wb_info_pool)
 164                        pr_err("can't alloc memory pool for writemostly\n");
 165                if (!is_suspend)
 166                        mddev_resume(mddev);
 167        }
 168}
 169EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
 170
 171/*
 172 * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck.
 173 */
 174static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
 175{
 176        if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
 177                return;
 178
 179        if (mddev->wb_info_pool) {
 180                struct md_rdev *temp;
 181                int num = 0;
 182
 183                /*
 184                 * Check if other rdevs need wb_info_pool.
 185                 */
 186                rdev_for_each(temp, mddev)
 187                        if (temp != rdev &&
 188                            test_bit(WBCollisionCheck, &temp->flags))
 189                                num++;
 190                if (!num) {
 191                        mddev_suspend(rdev->mddev);
 192                        mempool_destroy(mddev->wb_info_pool);
 193                        mddev->wb_info_pool = NULL;
 194                        mddev_resume(rdev->mddev);
 195                }
 196        }
 197}
 198
 199static struct ctl_table_header *raid_table_header;
 200
 201static struct ctl_table raid_table[] = {
 202        {
 203                .procname       = "speed_limit_min",
 204                .data           = &sysctl_speed_limit_min,
 205                .maxlen         = sizeof(int),
 206                .mode           = S_IRUGO|S_IWUSR,
 207                .proc_handler   = proc_dointvec,
 208        },
 209        {
 210                .procname       = "speed_limit_max",
 211                .data           = &sysctl_speed_limit_max,
 212                .maxlen         = sizeof(int),
 213                .mode           = S_IRUGO|S_IWUSR,
 214                .proc_handler   = proc_dointvec,
 215        },
 216        { }
 217};
 218
 219static struct ctl_table raid_dir_table[] = {
 220        {
 221                .procname       = "raid",
 222                .maxlen         = 0,
 223                .mode           = S_IRUGO|S_IXUGO,
 224                .child          = raid_table,
 225        },
 226        { }
 227};
 228
 229static struct ctl_table raid_root_table[] = {
 230        {
 231                .procname       = "dev",
 232                .maxlen         = 0,
 233                .mode           = 0555,
 234                .child          = raid_dir_table,
 235        },
 236        {  }
 237};
 238
 239static const struct block_device_operations md_fops;
 240
 241static int start_readonly;
 242
 243/*
 244 * The original mechanism for creating an md device is to create
 245 * a device node in /dev and to open it.  This causes races with device-close.
 246 * The preferred method is to write to the "new_array" module parameter.
 247 * This can avoid races.
 248 * Setting create_on_open to false disables the original mechanism
 249 * so all the races disappear.
 250 */
 251static bool create_on_open = true;
 252
 253struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 254                            struct mddev *mddev)
 255{
 256        if (!mddev || !bioset_initialized(&mddev->bio_set))
 257                return bio_alloc(gfp_mask, nr_iovecs);
 258
 259        return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
 260}
 261EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 262
 263static struct bio *md_bio_alloc_sync(struct mddev *mddev)
 264{
 265        if (!mddev || !bioset_initialized(&mddev->sync_set))
 266                return bio_alloc(GFP_NOIO, 1);
 267
 268        return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
 269}
 270
 271/*
 272 * We have a system wide 'event count' that is incremented
 273 * on any 'interesting' event, and readers of /proc/mdstat
 274 * can use 'poll' or 'select' to find out when the event
 275 * count increases.
 276 *
 277 * Events are:
 278 *  start array, stop array, error, add device, remove device,
 279 *  start build, activate spare
 280 */
 281static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 282static atomic_t md_event_count;
 283void md_new_event(struct mddev *mddev)
 284{
 285        atomic_inc(&md_event_count);
 286        wake_up(&md_event_waiters);
 287}
 288EXPORT_SYMBOL_GPL(md_new_event);
 289
 290/*
 291 * Enables to iterate over all existing md arrays
 292 * all_mddevs_lock protects this list.
 293 */
 294static LIST_HEAD(all_mddevs);
 295static DEFINE_SPINLOCK(all_mddevs_lock);
 296
 297/*
 298 * iterates through all used mddevs in the system.
 299 * We take care to grab the all_mddevs_lock whenever navigating
 300 * the list, and to always hold a refcount when unlocked.
 301 * Any code which breaks out of this loop while own
 302 * a reference to the current mddev and must mddev_put it.
 303 */
 304#define for_each_mddev(_mddev,_tmp)                                     \
 305                                                                        \
 306        for (({ spin_lock(&all_mddevs_lock);                            \
 307                _tmp = all_mddevs.next;                                 \
 308                _mddev = NULL;});                                       \
 309             ({ if (_tmp != &all_mddevs)                                \
 310                        mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
 311                spin_unlock(&all_mddevs_lock);                          \
 312                if (_mddev) mddev_put(_mddev);                          \
 313                _mddev = list_entry(_tmp, struct mddev, all_mddevs);    \
 314                _tmp != &all_mddevs;});                                 \
 315             ({ spin_lock(&all_mddevs_lock);                            \
 316                _tmp = _tmp->next;})                                    \
 317                )
 318
 319/* Rather than calling directly into the personality make_request function,
 320 * IO requests come here first so that we can check if the device is
 321 * being suspended pending a reconfiguration.
 322 * We hold a refcount over the call to ->make_request.  By the time that
 323 * call has finished, the bio has been linked into some internal structure
 324 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 325 */
 326static bool is_suspended(struct mddev *mddev, struct bio *bio)
 327{
 328        if (mddev->suspended)
 329                return true;
 330        if (bio_data_dir(bio) != WRITE)
 331                return false;
 332        if (mddev->suspend_lo >= mddev->suspend_hi)
 333                return false;
 334        if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
 335                return false;
 336        if (bio_end_sector(bio) < mddev->suspend_lo)
 337                return false;
 338        return true;
 339}
 340
 341void md_handle_request(struct mddev *mddev, struct bio *bio)
 342{
 343check_suspended:
 344        rcu_read_lock();
 345        if (is_suspended(mddev, bio)) {
 346                DEFINE_WAIT(__wait);
 347                for (;;) {
 348                        prepare_to_wait(&mddev->sb_wait, &__wait,
 349                                        TASK_UNINTERRUPTIBLE);
 350                        if (!is_suspended(mddev, bio))
 351                                break;
 352                        rcu_read_unlock();
 353                        schedule();
 354                        rcu_read_lock();
 355                }
 356                finish_wait(&mddev->sb_wait, &__wait);
 357        }
 358        atomic_inc(&mddev->active_io);
 359        rcu_read_unlock();
 360
 361        if (!mddev->pers->make_request(mddev, bio)) {
 362                atomic_dec(&mddev->active_io);
 363                wake_up(&mddev->sb_wait);
 364                goto check_suspended;
 365        }
 366
 367        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
 368                wake_up(&mddev->sb_wait);
 369}
 370EXPORT_SYMBOL(md_handle_request);
 371
 372static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 373{
 374        const int rw = bio_data_dir(bio);
 375        const int sgrp = op_stat_group(bio_op(bio));
 376        struct mddev *mddev = q->queuedata;
 377        unsigned int sectors;
 378
 379        blk_queue_split(q, &bio);
 380
 381        if (mddev == NULL || mddev->pers == NULL) {
 382                bio_io_error(bio);
 383                return BLK_QC_T_NONE;
 384        }
 385        if (mddev->ro == 1 && unlikely(rw == WRITE)) {
 386                if (bio_sectors(bio) != 0)
 387                        bio->bi_status = BLK_STS_IOERR;
 388                bio_endio(bio);
 389                return BLK_QC_T_NONE;
 390        }
 391
 392        /*
 393         * save the sectors now since our bio can
 394         * go away inside make_request
 395         */
 396        sectors = bio_sectors(bio);
 397        /* bio could be mergeable after passing to underlayer */
 398        bio->bi_opf &= ~REQ_NOMERGE;
 399
 400        md_handle_request(mddev, bio);
 401
 402        part_stat_lock();
 403        part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
 404        part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
 405        part_stat_unlock();
 406
 407        return BLK_QC_T_NONE;
 408}
 409
 410/* mddev_suspend makes sure no new requests are submitted
 411 * to the device, and that any requests that have been submitted
 412 * are completely handled.
 413 * Once mddev_detach() is called and completes, the module will be
 414 * completely unused.
 415 */
 416void mddev_suspend(struct mddev *mddev)
 417{
 418        WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
 419        lockdep_assert_held(&mddev->reconfig_mutex);
 420        if (mddev->suspended++)
 421                return;
 422        synchronize_rcu();
 423        wake_up(&mddev->sb_wait);
 424        set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
 425        smp_mb__after_atomic();
 426        wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
 427        mddev->pers->quiesce(mddev, 1);
 428        clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
 429        wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
 430
 431        del_timer_sync(&mddev->safemode_timer);
 432}
 433EXPORT_SYMBOL_GPL(mddev_suspend);
 434
 435void mddev_resume(struct mddev *mddev)
 436{
 437        lockdep_assert_held(&mddev->reconfig_mutex);
 438        if (--mddev->suspended)
 439                return;
 440        wake_up(&mddev->sb_wait);
 441        mddev->pers->quiesce(mddev, 0);
 442
 443        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 444        md_wakeup_thread(mddev->thread);
 445        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 446}
 447EXPORT_SYMBOL_GPL(mddev_resume);
 448
 449int mddev_congested(struct mddev *mddev, int bits)
 450{
 451        struct md_personality *pers = mddev->pers;
 452        int ret = 0;
 453
 454        rcu_read_lock();
 455        if (mddev->suspended)
 456                ret = 1;
 457        else if (pers && pers->congested)
 458                ret = pers->congested(mddev, bits);
 459        rcu_read_unlock();
 460        return ret;
 461}
 462EXPORT_SYMBOL_GPL(mddev_congested);
 463static int md_congested(void *data, int bits)
 464{
 465        struct mddev *mddev = data;
 466        return mddev_congested(mddev, bits);
 467}
 468
 469/*
 470 * Generic flush handling for md
 471 */
 472
 473static void md_end_flush(struct bio *bio)
 474{
 475        struct md_rdev *rdev = bio->bi_private;
 476        struct mddev *mddev = rdev->mddev;
 477
 478        rdev_dec_pending(rdev, mddev);
 479
 480        if (atomic_dec_and_test(&mddev->flush_pending)) {
 481                /* The pre-request flush has finished */
 482                queue_work(md_wq, &mddev->flush_work);
 483        }
 484        bio_put(bio);
 485}
 486
 487static void md_submit_flush_data(struct work_struct *ws);
 488
 489static void submit_flushes(struct work_struct *ws)
 490{
 491        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 492        struct md_rdev *rdev;
 493
 494        mddev->start_flush = ktime_get_boottime();
 495        INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 496        atomic_set(&mddev->flush_pending, 1);
 497        rcu_read_lock();
 498        rdev_for_each_rcu(rdev, mddev)
 499                if (rdev->raid_disk >= 0 &&
 500                    !test_bit(Faulty, &rdev->flags)) {
 501                        /* Take two references, one is dropped
 502                         * when request finishes, one after
 503                         * we reclaim rcu_read_lock
 504                         */
 505                        struct bio *bi;
 506                        atomic_inc(&rdev->nr_pending);
 507                        atomic_inc(&rdev->nr_pending);
 508                        rcu_read_unlock();
 509                        bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
 510                        bi->bi_end_io = md_end_flush;
 511                        bi->bi_private = rdev;
 512                        bio_set_dev(bi, rdev->bdev);
 513                        bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 514                        atomic_inc(&mddev->flush_pending);
 515                        submit_bio(bi);
 516                        rcu_read_lock();
 517                        rdev_dec_pending(rdev, mddev);
 518                }
 519        rcu_read_unlock();
 520        if (atomic_dec_and_test(&mddev->flush_pending))
 521                queue_work(md_wq, &mddev->flush_work);
 522}
 523
 524static void md_submit_flush_data(struct work_struct *ws)
 525{
 526        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 527        struct bio *bio = mddev->flush_bio;
 528
 529        /*
 530         * must reset flush_bio before calling into md_handle_request to avoid a
 531         * deadlock, because other bios passed md_handle_request suspend check
 532         * could wait for this and below md_handle_request could wait for those
 533         * bios because of suspend check
 534         */
 535        mddev->last_flush = mddev->start_flush;
 536        mddev->flush_bio = NULL;
 537        wake_up(&mddev->sb_wait);
 538
 539        if (bio->bi_iter.bi_size == 0) {
 540                /* an empty barrier - all done */
 541                bio_endio(bio);
 542        } else {
 543                bio->bi_opf &= ~REQ_PREFLUSH;
 544                md_handle_request(mddev, bio);
 545        }
 546}
 547
 548void md_flush_request(struct mddev *mddev, struct bio *bio)
 549{
 550        ktime_t start = ktime_get_boottime();
 551        spin_lock_irq(&mddev->lock);
 552        wait_event_lock_irq(mddev->sb_wait,
 553                            !mddev->flush_bio ||
 554                            ktime_after(mddev->last_flush, start),
 555                            mddev->lock);
 556        if (!ktime_after(mddev->last_flush, start)) {
 557                WARN_ON(mddev->flush_bio);
 558                mddev->flush_bio = bio;
 559                bio = NULL;
 560        }
 561        spin_unlock_irq(&mddev->lock);
 562
 563        if (!bio) {
 564                INIT_WORK(&mddev->flush_work, submit_flushes);
 565                queue_work(md_wq, &mddev->flush_work);
 566        } else {
 567                /* flush was performed for some other bio while we waited. */
 568                if (bio->bi_iter.bi_size == 0)
 569                        /* an empty barrier - all done */
 570                        bio_endio(bio);
 571                else {
 572                        bio->bi_opf &= ~REQ_PREFLUSH;
 573                        mddev->pers->make_request(mddev, bio);
 574                }
 575        }
 576}
 577EXPORT_SYMBOL(md_flush_request);
 578
 579static inline struct mddev *mddev_get(struct mddev *mddev)
 580{
 581        atomic_inc(&mddev->active);
 582        return mddev;
 583}
 584
 585static void mddev_delayed_delete(struct work_struct *ws);
 586
 587static void mddev_put(struct mddev *mddev)
 588{
 589        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 590                return;
 591        if (!mddev->raid_disks && list_empty(&mddev->disks) &&
 592            mddev->ctime == 0 && !mddev->hold_active) {
 593                /* Array is not configured at all, and not held active,
 594                 * so destroy it */
 595                list_del_init(&mddev->all_mddevs);
 596
 597                /*
 598                 * Call queue_work inside the spinlock so that
 599                 * flush_workqueue() after mddev_find will succeed in waiting
 600                 * for the work to be done.
 601                 */
 602                INIT_WORK(&mddev->del_work, mddev_delayed_delete);
 603                queue_work(md_misc_wq, &mddev->del_work);
 604        }
 605        spin_unlock(&all_mddevs_lock);
 606}
 607
 608static void md_safemode_timeout(struct timer_list *t);
 609
 610void mddev_init(struct mddev *mddev)
 611{
 612        kobject_init(&mddev->kobj, &md_ktype);
 613        mutex_init(&mddev->open_mutex);
 614        mutex_init(&mddev->reconfig_mutex);
 615        mutex_init(&mddev->bitmap_info.mutex);
 616        INIT_LIST_HEAD(&mddev->disks);
 617        INIT_LIST_HEAD(&mddev->all_mddevs);
 618        timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
 619        atomic_set(&mddev->active, 1);
 620        atomic_set(&mddev->openers, 0);
 621        atomic_set(&mddev->active_io, 0);
 622        spin_lock_init(&mddev->lock);
 623        atomic_set(&mddev->flush_pending, 0);
 624        init_waitqueue_head(&mddev->sb_wait);
 625        init_waitqueue_head(&mddev->recovery_wait);
 626        mddev->reshape_position = MaxSector;
 627        mddev->reshape_backwards = 0;
 628        mddev->last_sync_action = "none";
 629        mddev->resync_min = 0;
 630        mddev->resync_max = MaxSector;
 631        mddev->level = LEVEL_NONE;
 632}
 633EXPORT_SYMBOL_GPL(mddev_init);
 634
 635static struct mddev *mddev_find(dev_t unit)
 636{
 637        struct mddev *mddev, *new = NULL;
 638
 639        if (unit && MAJOR(unit) != MD_MAJOR)
 640                unit &= ~((1<<MdpMinorShift)-1);
 641
 642 retry:
 643        spin_lock(&all_mddevs_lock);
 644
 645        if (unit) {
 646                list_for_each_entry(mddev, &all_mddevs, all_mddevs)
 647                        if (mddev->unit == unit) {
 648                                mddev_get(mddev);
 649                                spin_unlock(&all_mddevs_lock);
 650                                kfree(new);
 651                                return mddev;
 652                        }
 653
 654                if (new) {
 655                        list_add(&new->all_mddevs, &all_mddevs);
 656                        spin_unlock(&all_mddevs_lock);
 657                        new->hold_active = UNTIL_IOCTL;
 658                        return new;
 659                }
 660        } else if (new) {
 661                /* find an unused unit number */
 662                static int next_minor = 512;
 663                int start = next_minor;
 664                int is_free = 0;
 665                int dev = 0;
 666                while (!is_free) {
 667                        dev = MKDEV(MD_MAJOR, next_minor);
 668                        next_minor++;
 669                        if (next_minor > MINORMASK)
 670                                next_minor = 0;
 671                        if (next_minor == start) {
 672                                /* Oh dear, all in use. */
 673                                spin_unlock(&all_mddevs_lock);
 674                                kfree(new);
 675                                return NULL;
 676                        }
 677
 678                        is_free = 1;
 679                        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
 680                                if (mddev->unit == dev) {
 681                                        is_free = 0;
 682                                        break;
 683                                }
 684                }
 685                new->unit = dev;
 686                new->md_minor = MINOR(dev);
 687                new->hold_active = UNTIL_STOP;
 688                list_add(&new->all_mddevs, &all_mddevs);
 689                spin_unlock(&all_mddevs_lock);
 690                return new;
 691        }
 692        spin_unlock(&all_mddevs_lock);
 693
 694        new = kzalloc(sizeof(*new), GFP_KERNEL);
 695        if (!new)
 696                return NULL;
 697
 698        new->unit = unit;
 699        if (MAJOR(unit) == MD_MAJOR)
 700                new->md_minor = MINOR(unit);
 701        else
 702                new->md_minor = MINOR(unit) >> MdpMinorShift;
 703
 704        mddev_init(new);
 705
 706        goto retry;
 707}
 708
 709static struct attribute_group md_redundancy_group;
 710
 711void mddev_unlock(struct mddev *mddev)
 712{
 713        if (mddev->to_remove) {
 714                /* These cannot be removed under reconfig_mutex as
 715                 * an access to the files will try to take reconfig_mutex
 716                 * while holding the file unremovable, which leads to
 717                 * a deadlock.
 718                 * So hold set sysfs_active while the remove in happeing,
 719                 * and anything else which might set ->to_remove or my
 720                 * otherwise change the sysfs namespace will fail with
 721                 * -EBUSY if sysfs_active is still set.
 722                 * We set sysfs_active under reconfig_mutex and elsewhere
 723                 * test it under the same mutex to ensure its correct value
 724                 * is seen.
 725                 */
 726                struct attribute_group *to_remove = mddev->to_remove;
 727                mddev->to_remove = NULL;
 728                mddev->sysfs_active = 1;
 729                mutex_unlock(&mddev->reconfig_mutex);
 730
 731                if (mddev->kobj.sd) {
 732                        if (to_remove != &md_redundancy_group)
 733                                sysfs_remove_group(&mddev->kobj, to_remove);
 734                        if (mddev->pers == NULL ||
 735                            mddev->pers->sync_request == NULL) {
 736                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
 737                                if (mddev->sysfs_action)
 738                                        sysfs_put(mddev->sysfs_action);
 739                                mddev->sysfs_action = NULL;
 740                        }
 741                }
 742                mddev->sysfs_active = 0;
 743        } else
 744                mutex_unlock(&mddev->reconfig_mutex);
 745
 746        /* As we've dropped the mutex we need a spinlock to
 747         * make sure the thread doesn't disappear
 748         */
 749        spin_lock(&pers_lock);
 750        md_wakeup_thread(mddev->thread);
 751        wake_up(&mddev->sb_wait);
 752        spin_unlock(&pers_lock);
 753}
 754EXPORT_SYMBOL_GPL(mddev_unlock);
 755
 756struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
 757{
 758        struct md_rdev *rdev;
 759
 760        rdev_for_each_rcu(rdev, mddev)
 761                if (rdev->desc_nr == nr)
 762                        return rdev;
 763
 764        return NULL;
 765}
 766EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
 767
 768static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
 769{
 770        struct md_rdev *rdev;
 771
 772        rdev_for_each(rdev, mddev)
 773                if (rdev->bdev->bd_dev == dev)
 774                        return rdev;
 775
 776        return NULL;
 777}
 778
 779struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
 780{
 781        struct md_rdev *rdev;
 782
 783        rdev_for_each_rcu(rdev, mddev)
 784                if (rdev->bdev->bd_dev == dev)
 785                        return rdev;
 786
 787        return NULL;
 788}
 789EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
 790
 791static struct md_personality *find_pers(int level, char *clevel)
 792{
 793        struct md_personality *pers;
 794        list_for_each_entry(pers, &pers_list, list) {
 795                if (level != LEVEL_NONE && pers->level == level)
 796                        return pers;
 797                if (strcmp(pers->name, clevel)==0)
 798                        return pers;
 799        }
 800        return NULL;
 801}
 802
 803/* return the offset of the super block in 512byte sectors */
 804static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
 805{
 806        sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
 807        return MD_NEW_SIZE_SECTORS(num_sectors);
 808}
 809
 810static int alloc_disk_sb(struct md_rdev *rdev)
 811{
 812        rdev->sb_page = alloc_page(GFP_KERNEL);
 813        if (!rdev->sb_page)
 814                return -ENOMEM;
 815        return 0;
 816}
 817
 818void md_rdev_clear(struct md_rdev *rdev)
 819{
 820        if (rdev->sb_page) {
 821                put_page(rdev->sb_page);
 822                rdev->sb_loaded = 0;
 823                rdev->sb_page = NULL;
 824                rdev->sb_start = 0;
 825                rdev->sectors = 0;
 826        }
 827        if (rdev->bb_page) {
 828                put_page(rdev->bb_page);
 829                rdev->bb_page = NULL;
 830        }
 831        badblocks_exit(&rdev->badblocks);
 832}
 833EXPORT_SYMBOL_GPL(md_rdev_clear);
 834
 835static void super_written(struct bio *bio)
 836{
 837        struct md_rdev *rdev = bio->bi_private;
 838        struct mddev *mddev = rdev->mddev;
 839
 840        if (bio->bi_status) {
 841                pr_err("md: super_written gets error=%d\n", bio->bi_status);
 842                md_error(mddev, rdev);
 843                if (!test_bit(Faulty, &rdev->flags)
 844                    && (bio->bi_opf & MD_FAILFAST)) {
 845                        set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
 846                        set_bit(LastDev, &rdev->flags);
 847                }
 848        } else
 849                clear_bit(LastDev, &rdev->flags);
 850
 851        if (atomic_dec_and_test(&mddev->pending_writes))
 852                wake_up(&mddev->sb_wait);
 853        rdev_dec_pending(rdev, mddev);
 854        bio_put(bio);
 855}
 856
 857void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 858                   sector_t sector, int size, struct page *page)
 859{
 860        /* write first size bytes of page to sector of rdev
 861         * Increment mddev->pending_writes before returning
 862         * and decrement it on completion, waking up sb_wait
 863         * if zero is reached.
 864         * If an error occurred, call md_error
 865         */
 866        struct bio *bio;
 867        int ff = 0;
 868
 869        if (!page)
 870                return;
 871
 872        if (test_bit(Faulty, &rdev->flags))
 873                return;
 874
 875        bio = md_bio_alloc_sync(mddev);
 876
 877        atomic_inc(&rdev->nr_pending);
 878
 879        bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
 880        bio->bi_iter.bi_sector = sector;
 881        bio_add_page(bio, page, size, 0);
 882        bio->bi_private = rdev;
 883        bio->bi_end_io = super_written;
 884
 885        if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
 886            test_bit(FailFast, &rdev->flags) &&
 887            !test_bit(LastDev, &rdev->flags))
 888                ff = MD_FAILFAST;
 889        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
 890
 891        atomic_inc(&mddev->pending_writes);
 892        submit_bio(bio);
 893}
 894
 895int md_super_wait(struct mddev *mddev)
 896{
 897        /* wait for all superblock writes that were scheduled to complete */
 898        wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
 899        if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
 900                return -EAGAIN;
 901        return 0;
 902}
 903
 904int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 905                 struct page *page, int op, int op_flags, bool metadata_op)
 906{
 907        struct bio *bio = md_bio_alloc_sync(rdev->mddev);
 908        int ret;
 909
 910        if (metadata_op && rdev->meta_bdev)
 911                bio_set_dev(bio, rdev->meta_bdev);
 912        else
 913                bio_set_dev(bio, rdev->bdev);
 914        bio_set_op_attrs(bio, op, op_flags);
 915        if (metadata_op)
 916                bio->bi_iter.bi_sector = sector + rdev->sb_start;
 917        else if (rdev->mddev->reshape_position != MaxSector &&
 918                 (rdev->mddev->reshape_backwards ==
 919                  (sector >= rdev->mddev->reshape_position)))
 920                bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
 921        else
 922                bio->bi_iter.bi_sector = sector + rdev->data_offset;
 923        bio_add_page(bio, page, size, 0);
 924
 925        submit_bio_wait(bio);
 926
 927        ret = !bio->bi_status;
 928        bio_put(bio);
 929        return ret;
 930}
 931EXPORT_SYMBOL_GPL(sync_page_io);
 932
 933static int read_disk_sb(struct md_rdev *rdev, int size)
 934{
 935        char b[BDEVNAME_SIZE];
 936
 937        if (rdev->sb_loaded)
 938                return 0;
 939
 940        if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
 941                goto fail;
 942        rdev->sb_loaded = 1;
 943        return 0;
 944
 945fail:
 946        pr_err("md: disabled device %s, could not read superblock.\n",
 947               bdevname(rdev->bdev,b));
 948        return -EINVAL;
 949}
 950
 951static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 952{
 953        return  sb1->set_uuid0 == sb2->set_uuid0 &&
 954                sb1->set_uuid1 == sb2->set_uuid1 &&
 955                sb1->set_uuid2 == sb2->set_uuid2 &&
 956                sb1->set_uuid3 == sb2->set_uuid3;
 957}
 958
 959static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 960{
 961        int ret;
 962        mdp_super_t *tmp1, *tmp2;
 963
 964        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
 965        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
 966
 967        if (!tmp1 || !tmp2) {
 968                ret = 0;
 969                goto abort;
 970        }
 971
 972        *tmp1 = *sb1;
 973        *tmp2 = *sb2;
 974
 975        /*
 976         * nr_disks is not constant
 977         */
 978        tmp1->nr_disks = 0;
 979        tmp2->nr_disks = 0;
 980
 981        ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
 982abort:
 983        kfree(tmp1);
 984        kfree(tmp2);
 985        return ret;
 986}
 987
 988static u32 md_csum_fold(u32 csum)
 989{
 990        csum = (csum & 0xffff) + (csum >> 16);
 991        return (csum & 0xffff) + (csum >> 16);
 992}
 993
 994static unsigned int calc_sb_csum(mdp_super_t *sb)
 995{
 996        u64 newcsum = 0;
 997        u32 *sb32 = (u32*)sb;
 998        int i;
 999        unsigned int disk_csum, csum;
1000
1001        disk_csum = sb->sb_csum;
1002        sb->sb_csum = 0;
1003
1004        for (i = 0; i < MD_SB_BYTES/4 ; i++)
1005                newcsum += sb32[i];
1006        csum = (newcsum & 0xffffffff) + (newcsum>>32);
1007
1008#ifdef CONFIG_ALPHA
1009        /* This used to use csum_partial, which was wrong for several
1010         * reasons including that different results are returned on
1011         * different architectures.  It isn't critical that we get exactly
1012         * the same return value as before (we always csum_fold before
1013         * testing, and that removes any differences).  However as we
1014         * know that csum_partial always returned a 16bit value on
1015         * alphas, do a fold to maximise conformity to previous behaviour.
1016         */
1017        sb->sb_csum = md_csum_fold(disk_csum);
1018#else
1019        sb->sb_csum = disk_csum;
1020#endif
1021        return csum;
1022}
1023
1024/*
1025 * Handle superblock details.
1026 * We want to be able to handle multiple superblock formats
1027 * so we have a common interface to them all, and an array of
1028 * different handlers.
1029 * We rely on user-space to write the initial superblock, and support
1030 * reading and updating of superblocks.
1031 * Interface methods are:
1032 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
1033 *      loads and validates a superblock on dev.
1034 *      if refdev != NULL, compare superblocks on both devices
1035 *    Return:
1036 *      0 - dev has a superblock that is compatible with refdev
1037 *      1 - dev has a superblock that is compatible and newer than refdev
1038 *          so dev should be used as the refdev in future
1039 *     -EINVAL superblock incompatible or invalid
1040 *     -othererror e.g. -EIO
1041 *
1042 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
1043 *      Verify that dev is acceptable into mddev.
1044 *       The first time, mddev->raid_disks will be 0, and data from
1045 *       dev should be merged in.  Subsequent calls check that dev
1046 *       is new enough.  Return 0 or -EINVAL
1047 *
1048 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
1049 *     Update the superblock for rdev with data in mddev
1050 *     This does not write to disc.
1051 *
1052 */
1053
1054struct super_type  {
1055        char                *name;
1056        struct module       *owner;
1057        int                 (*load_super)(struct md_rdev *rdev,
1058                                          struct md_rdev *refdev,
1059                                          int minor_version);
1060        int                 (*validate_super)(struct mddev *mddev,
1061                                              struct md_rdev *rdev);
1062        void                (*sync_super)(struct mddev *mddev,
1063                                          struct md_rdev *rdev);
1064        unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
1065                                                sector_t num_sectors);
1066        int                 (*allow_new_offset)(struct md_rdev *rdev,
1067                                                unsigned long long new_offset);
1068};
1069
1070/*
1071 * Check that the given mddev has no bitmap.
1072 *
1073 * This function is called from the run method of all personalities that do not
1074 * support bitmaps. It prints an error message and returns non-zero if mddev
1075 * has a bitmap. Otherwise, it returns 0.
1076 *
1077 */
1078int md_check_no_bitmap(struct mddev *mddev)
1079{
1080        if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1081                return 0;
1082        pr_warn("%s: bitmaps are not supported for %s\n",
1083                mdname(mddev), mddev->pers->name);
1084        return 1;
1085}
1086EXPORT_SYMBOL(md_check_no_bitmap);
1087
1088/*
1089 * load_super for 0.90.0
1090 */
1091static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1092{
1093        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1094        mdp_super_t *sb;
1095        int ret;
1096
1097        /*
1098         * Calculate the position of the superblock (512byte sectors),
1099         * it's at the end of the disk.
1100         *
1101         * It also happens to be a multiple of 4Kb.
1102         */
1103        rdev->sb_start = calc_dev_sboffset(rdev);
1104
1105        ret = read_disk_sb(rdev, MD_SB_BYTES);
1106        if (ret)
1107                return ret;
1108
1109        ret = -EINVAL;
1110
1111        bdevname(rdev->bdev, b);
1112        sb = page_address(rdev->sb_page);
1113
1114        if (sb->md_magic != MD_SB_MAGIC) {
1115                pr_warn("md: invalid raid superblock magic on %s\n", b);
1116                goto abort;
1117        }
1118
1119        if (sb->major_version != 0 ||
1120            sb->minor_version < 90 ||
1121            sb->minor_version > 91) {
1122                pr_warn("Bad version number %d.%d on %s\n",
1123                        sb->major_version, sb->minor_version, b);
1124                goto abort;
1125        }
1126
1127        if (sb->raid_disks <= 0)
1128                goto abort;
1129
1130        if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1131                pr_warn("md: invalid superblock checksum on %s\n", b);
1132                goto abort;
1133        }
1134
1135        rdev->preferred_minor = sb->md_minor;
1136        rdev->data_offset = 0;
1137        rdev->new_data_offset = 0;
1138        rdev->sb_size = MD_SB_BYTES;
1139        rdev->badblocks.shift = -1;
1140
1141        if (sb->level == LEVEL_MULTIPATH)
1142                rdev->desc_nr = -1;
1143        else
1144                rdev->desc_nr = sb->this_disk.number;
1145
1146        if (!refdev) {
1147                ret = 1;
1148        } else {
1149                __u64 ev1, ev2;
1150                mdp_super_t *refsb = page_address(refdev->sb_page);
1151                if (!md_uuid_equal(refsb, sb)) {
1152                        pr_warn("md: %s has different UUID to %s\n",
1153                                b, bdevname(refdev->bdev,b2));
1154                        goto abort;
1155                }
1156                if (!md_sb_equal(refsb, sb)) {
1157                        pr_warn("md: %s has same UUID but different superblock to %s\n",
1158                                b, bdevname(refdev->bdev, b2));
1159                        goto abort;
1160                }
1161                ev1 = md_event(sb);
1162                ev2 = md_event(refsb);
1163                if (ev1 > ev2)
1164                        ret = 1;
1165                else
1166                        ret = 0;
1167        }
1168        rdev->sectors = rdev->sb_start;
1169        /* Limit to 4TB as metadata cannot record more than that.
1170         * (not needed for Linear and RAID0 as metadata doesn't
1171         * record this size)
1172         */
1173        if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1174                rdev->sectors = (sector_t)(2ULL << 32) - 2;
1175
1176        if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1177                /* "this cannot possibly happen" ... */
1178                ret = -EINVAL;
1179
1180 abort:
1181        return ret;
1182}
1183
1184/*
1185 * validate_super for 0.90.0
1186 */
1187static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1188{
1189        mdp_disk_t *desc;
1190        mdp_super_t *sb = page_address(rdev->sb_page);
1191        __u64 ev1 = md_event(sb);
1192
1193        rdev->raid_disk = -1;
1194        clear_bit(Faulty, &rdev->flags);
1195        clear_bit(In_sync, &rdev->flags);
1196        clear_bit(Bitmap_sync, &rdev->flags);
1197        clear_bit(WriteMostly, &rdev->flags);
1198
1199        if (mddev->raid_disks == 0) {
1200                mddev->major_version = 0;
1201                mddev->minor_version = sb->minor_version;
1202                mddev->patch_version = sb->patch_version;
1203                mddev->external = 0;
1204                mddev->chunk_sectors = sb->chunk_size >> 9;
1205                mddev->ctime = sb->ctime;
1206                mddev->utime = sb->utime;
1207                mddev->level = sb->level;
1208                mddev->clevel[0] = 0;
1209                mddev->layout = sb->layout;
1210                mddev->raid_disks = sb->raid_disks;
1211                mddev->dev_sectors = ((sector_t)sb->size) * 2;
1212                mddev->events = ev1;
1213                mddev->bitmap_info.offset = 0;
1214                mddev->bitmap_info.space = 0;
1215                /* bitmap can use 60 K after the 4K superblocks */
1216                mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1217                mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1218                mddev->reshape_backwards = 0;
1219
1220                if (mddev->minor_version >= 91) {
1221                        mddev->reshape_position = sb->reshape_position;
1222                        mddev->delta_disks = sb->delta_disks;
1223                        mddev->new_level = sb->new_level;
1224                        mddev->new_layout = sb->new_layout;
1225                        mddev->new_chunk_sectors = sb->new_chunk >> 9;
1226                        if (mddev->delta_disks < 0)
1227                                mddev->reshape_backwards = 1;
1228                } else {
1229                        mddev->reshape_position = MaxSector;
1230                        mddev->delta_disks = 0;
1231                        mddev->new_level = mddev->level;
1232                        mddev->new_layout = mddev->layout;
1233                        mddev->new_chunk_sectors = mddev->chunk_sectors;
1234                }
1235
1236                if (sb->state & (1<<MD_SB_CLEAN))
1237                        mddev->recovery_cp = MaxSector;
1238                else {
1239                        if (sb->events_hi == sb->cp_events_hi &&
1240                                sb->events_lo == sb->cp_events_lo) {
1241                                mddev->recovery_cp = sb->recovery_cp;
1242                        } else
1243                                mddev->recovery_cp = 0;
1244                }
1245
1246                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1247                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1248                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1249                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1250
1251                mddev->max_disks = MD_SB_DISKS;
1252
1253                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1254                    mddev->bitmap_info.file == NULL) {
1255                        mddev->bitmap_info.offset =
1256                                mddev->bitmap_info.default_offset;
1257                        mddev->bitmap_info.space =
1258                                mddev->bitmap_info.default_space;
1259                }
1260
1261        } else if (mddev->pers == NULL) {
1262                /* Insist on good event counter while assembling, except
1263                 * for spares (which don't need an event count) */
1264                ++ev1;
1265                if (sb->disks[rdev->desc_nr].state & (
1266                            (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1267                        if (ev1 < mddev->events)
1268                                return -EINVAL;
1269        } else if (mddev->bitmap) {
1270                /* if adding to array with a bitmap, then we can accept an
1271                 * older device ... but not too old.
1272                 */
1273                if (ev1 < mddev->bitmap->events_cleared)
1274                        return 0;
1275                if (ev1 < mddev->events)
1276                        set_bit(Bitmap_sync, &rdev->flags);
1277        } else {
1278                if (ev1 < mddev->events)
1279                        /* just a hot-add of a new device, leave raid_disk at -1 */
1280                        return 0;
1281        }
1282
1283        if (mddev->level != LEVEL_MULTIPATH) {
1284                desc = sb->disks + rdev->desc_nr;
1285
1286                if (desc->state & (1<<MD_DISK_FAULTY))
1287                        set_bit(Faulty, &rdev->flags);
1288                else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1289                            desc->raid_disk < mddev->raid_disks */) {
1290                        set_bit(In_sync, &rdev->flags);
1291                        rdev->raid_disk = desc->raid_disk;
1292                        rdev->saved_raid_disk = desc->raid_disk;
1293                } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1294                        /* active but not in sync implies recovery up to
1295                         * reshape position.  We don't know exactly where
1296                         * that is, so set to zero for now */
1297                        if (mddev->minor_version >= 91) {
1298                                rdev->recovery_offset = 0;
1299                                rdev->raid_disk = desc->raid_disk;
1300                        }
1301                }
1302                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1303                        set_bit(WriteMostly, &rdev->flags);
1304                if (desc->state & (1<<MD_DISK_FAILFAST))
1305                        set_bit(FailFast, &rdev->flags);
1306        } else /* MULTIPATH are always insync */
1307                set_bit(In_sync, &rdev->flags);
1308        return 0;
1309}
1310
1311/*
1312 * sync_super for 0.90.0
1313 */
1314static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1315{
1316        mdp_super_t *sb;
1317        struct md_rdev *rdev2;
1318        int next_spare = mddev->raid_disks;
1319
1320        /* make rdev->sb match mddev data..
1321         *
1322         * 1/ zero out disks
1323         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1324         * 3/ any empty disks < next_spare become removed
1325         *
1326         * disks[0] gets initialised to REMOVED because
1327         * we cannot be sure from other fields if it has
1328         * been initialised or not.
1329         */
1330        int i;
1331        int active=0, working=0,failed=0,spare=0,nr_disks=0;
1332
1333        rdev->sb_size = MD_SB_BYTES;
1334
1335        sb = page_address(rdev->sb_page);
1336
1337        memset(sb, 0, sizeof(*sb));
1338
1339        sb->md_magic = MD_SB_MAGIC;
1340        sb->major_version = mddev->major_version;
1341        sb->patch_version = mddev->patch_version;
1342        sb->gvalid_words  = 0; /* ignored */
1343        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1344        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1345        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1346        memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1347
1348        sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1349        sb->level = mddev->level;
1350        sb->size = mddev->dev_sectors / 2;
1351        sb->raid_disks = mddev->raid_disks;
1352        sb->md_minor = mddev->md_minor;
1353        sb->not_persistent = 0;
1354        sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1355        sb->state = 0;
1356        sb->events_hi = (mddev->events>>32);
1357        sb->events_lo = (u32)mddev->events;
1358
1359        if (mddev->reshape_position == MaxSector)
1360                sb->minor_version = 90;
1361        else {
1362                sb->minor_version = 91;
1363                sb->reshape_position = mddev->reshape_position;
1364                sb->new_level = mddev->new_level;
1365                sb->delta_disks = mddev->delta_disks;
1366                sb->new_layout = mddev->new_layout;
1367                sb->new_chunk = mddev->new_chunk_sectors << 9;
1368        }
1369        mddev->minor_version = sb->minor_version;
1370        if (mddev->in_sync)
1371        {
1372                sb->recovery_cp = mddev->recovery_cp;
1373                sb->cp_events_hi = (mddev->events>>32);
1374                sb->cp_events_lo = (u32)mddev->events;
1375                if (mddev->recovery_cp == MaxSector)
1376                        sb->state = (1<< MD_SB_CLEAN);
1377        } else
1378                sb->recovery_cp = 0;
1379
1380        sb->layout = mddev->layout;
1381        sb->chunk_size = mddev->chunk_sectors << 9;
1382
1383        if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1384                sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1385
1386        sb->disks[0].state = (1<<MD_DISK_REMOVED);
1387        rdev_for_each(rdev2, mddev) {
1388                mdp_disk_t *d;
1389                int desc_nr;
1390                int is_active = test_bit(In_sync, &rdev2->flags);
1391
1392                if (rdev2->raid_disk >= 0 &&
1393                    sb->minor_version >= 91)
1394                        /* we have nowhere to store the recovery_offset,
1395                         * but if it is not below the reshape_position,
1396                         * we can piggy-back on that.
1397                         */
1398                        is_active = 1;
1399                if (rdev2->raid_disk < 0 ||
1400                    test_bit(Faulty, &rdev2->flags))
1401                        is_active = 0;
1402                if (is_active)
1403                        desc_nr = rdev2->raid_disk;
1404                else
1405                        desc_nr = next_spare++;
1406                rdev2->desc_nr = desc_nr;
1407                d = &sb->disks[rdev2->desc_nr];
1408                nr_disks++;
1409                d->number = rdev2->desc_nr;
1410                d->major = MAJOR(rdev2->bdev->bd_dev);
1411                d->minor = MINOR(rdev2->bdev->bd_dev);
1412                if (is_active)
1413                        d->raid_disk = rdev2->raid_disk;
1414                else
1415                        d->raid_disk = rdev2->desc_nr; /* compatibility */
1416                if (test_bit(Faulty, &rdev2->flags))
1417                        d->state = (1<<MD_DISK_FAULTY);
1418                else if (is_active) {
1419                        d->state = (1<<MD_DISK_ACTIVE);
1420                        if (test_bit(In_sync, &rdev2->flags))
1421                                d->state |= (1<<MD_DISK_SYNC);
1422                        active++;
1423                        working++;
1424                } else {
1425                        d->state = 0;
1426                        spare++;
1427                        working++;
1428                }
1429                if (test_bit(WriteMostly, &rdev2->flags))
1430                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
1431                if (test_bit(FailFast, &rdev2->flags))
1432                        d->state |= (1<<MD_DISK_FAILFAST);
1433        }
1434        /* now set the "removed" and "faulty" bits on any missing devices */
1435        for (i=0 ; i < mddev->raid_disks ; i++) {
1436                mdp_disk_t *d = &sb->disks[i];
1437                if (d->state == 0 && d->number == 0) {
1438                        d->number = i;
1439                        d->raid_disk = i;
1440                        d->state = (1<<MD_DISK_REMOVED);
1441                        d->state |= (1<<MD_DISK_FAULTY);
1442                        failed++;
1443                }
1444        }
1445        sb->nr_disks = nr_disks;
1446        sb->active_disks = active;
1447        sb->working_disks = working;
1448        sb->failed_disks = failed;
1449        sb->spare_disks = spare;
1450
1451        sb->this_disk = sb->disks[rdev->desc_nr];
1452        sb->sb_csum = calc_sb_csum(sb);
1453}
1454
1455/*
1456 * rdev_size_change for 0.90.0
1457 */
1458static unsigned long long
1459super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1460{
1461        if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1462                return 0; /* component must fit device */
1463        if (rdev->mddev->bitmap_info.offset)
1464                return 0; /* can't move bitmap */
1465        rdev->sb_start = calc_dev_sboffset(rdev);
1466        if (!num_sectors || num_sectors > rdev->sb_start)
1467                num_sectors = rdev->sb_start;
1468        /* Limit to 4TB as metadata cannot record more than that.
1469         * 4TB == 2^32 KB, or 2*2^32 sectors.
1470         */
1471        if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1472                num_sectors = (sector_t)(2ULL << 32) - 2;
1473        do {
1474                md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1475                       rdev->sb_page);
1476        } while (md_super_wait(rdev->mddev) < 0);
1477        return num_sectors;
1478}
1479
1480static int
1481super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1482{
1483        /* non-zero offset changes not possible with v0.90 */
1484        return new_offset == 0;
1485}
1486
1487/*
1488 * version 1 superblock
1489 */
1490
1491static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1492{
1493        __le32 disk_csum;
1494        u32 csum;
1495        unsigned long long newcsum;
1496        int size = 256 + le32_to_cpu(sb->max_dev)*2;
1497        __le32 *isuper = (__le32*)sb;
1498
1499        disk_csum = sb->sb_csum;
1500        sb->sb_csum = 0;
1501        newcsum = 0;
1502        for (; size >= 4; size -= 4)
1503                newcsum += le32_to_cpu(*isuper++);
1504
1505        if (size == 2)
1506                newcsum += le16_to_cpu(*(__le16*) isuper);
1507
1508        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1509        sb->sb_csum = disk_csum;
1510        return cpu_to_le32(csum);
1511}
1512
1513static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1514{
1515        struct mdp_superblock_1 *sb;
1516        int ret;
1517        sector_t sb_start;
1518        sector_t sectors;
1519        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1520        int bmask;
1521
1522        /*
1523         * Calculate the position of the superblock in 512byte sectors.
1524         * It is always aligned to a 4K boundary and
1525         * depeding on minor_version, it can be:
1526         * 0: At least 8K, but less than 12K, from end of device
1527         * 1: At start of device
1528         * 2: 4K from start of device.
1529         */
1530        switch(minor_version) {
1531        case 0:
1532                sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1533                sb_start -= 8*2;
1534                sb_start &= ~(sector_t)(4*2-1);
1535                break;
1536        case 1:
1537                sb_start = 0;
1538                break;
1539        case 2:
1540                sb_start = 8;
1541                break;
1542        default:
1543                return -EINVAL;
1544        }
1545        rdev->sb_start = sb_start;
1546
1547        /* superblock is rarely larger than 1K, but it can be larger,
1548         * and it is safe to read 4k, so we do that
1549         */
1550        ret = read_disk_sb(rdev, 4096);
1551        if (ret) return ret;
1552
1553        sb = page_address(rdev->sb_page);
1554
1555        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1556            sb->major_version != cpu_to_le32(1) ||
1557            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1558            le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1559            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1560                return -EINVAL;
1561
1562        if (calc_sb_1_csum(sb) != sb->sb_csum) {
1563                pr_warn("md: invalid superblock checksum on %s\n",
1564                        bdevname(rdev->bdev,b));
1565                return -EINVAL;
1566        }
1567        if (le64_to_cpu(sb->data_size) < 10) {
1568                pr_warn("md: data_size too small on %s\n",
1569                        bdevname(rdev->bdev,b));
1570                return -EINVAL;
1571        }
1572        if (sb->pad0 ||
1573            sb->pad3[0] ||
1574            memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1575                /* Some padding is non-zero, might be a new feature */
1576                return -EINVAL;
1577
1578        rdev->preferred_minor = 0xffff;
1579        rdev->data_offset = le64_to_cpu(sb->data_offset);
1580        rdev->new_data_offset = rdev->data_offset;
1581        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1582            (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1583                rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1584        atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1585
1586        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1587        bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1588        if (rdev->sb_size & bmask)
1589                rdev->sb_size = (rdev->sb_size | bmask) + 1;
1590
1591        if (minor_version
1592            && rdev->data_offset < sb_start + (rdev->sb_size/512))
1593                return -EINVAL;
1594        if (minor_version
1595            && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1596                return -EINVAL;
1597
1598        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1599                rdev->desc_nr = -1;
1600        else
1601                rdev->desc_nr = le32_to_cpu(sb->dev_number);
1602
1603        if (!rdev->bb_page) {
1604                rdev->bb_page = alloc_page(GFP_KERNEL);
1605                if (!rdev->bb_page)
1606                        return -ENOMEM;
1607        }
1608        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1609            rdev->badblocks.count == 0) {
1610                /* need to load the bad block list.
1611                 * Currently we limit it to one page.
1612                 */
1613                s32 offset;
1614                sector_t bb_sector;
1615                __le64 *bbp;
1616                int i;
1617                int sectors = le16_to_cpu(sb->bblog_size);
1618                if (sectors > (PAGE_SIZE / 512))
1619                        return -EINVAL;
1620                offset = le32_to_cpu(sb->bblog_offset);
1621                if (offset == 0)
1622                        return -EINVAL;
1623                bb_sector = (long long)offset;
1624                if (!sync_page_io(rdev, bb_sector, sectors << 9,
1625                                  rdev->bb_page, REQ_OP_READ, 0, true))
1626                        return -EIO;
1627                bbp = (__le64 *)page_address(rdev->bb_page);
1628                rdev->badblocks.shift = sb->bblog_shift;
1629                for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1630                        u64 bb = le64_to_cpu(*bbp);
1631                        int count = bb & (0x3ff);
1632                        u64 sector = bb >> 10;
1633                        sector <<= sb->bblog_shift;
1634                        count <<= sb->bblog_shift;
1635                        if (bb + 1 == 0)
1636                                break;
1637                        if (badblocks_set(&rdev->badblocks, sector, count, 1))
1638                                return -EINVAL;
1639                }
1640        } else if (sb->bblog_offset != 0)
1641                rdev->badblocks.shift = 0;
1642
1643        if ((le32_to_cpu(sb->feature_map) &
1644            (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1645                rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1646                rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1647                rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1648        }
1649
1650        if (!refdev) {
1651                ret = 1;
1652        } else {
1653                __u64 ev1, ev2;
1654                struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1655
1656                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1657                    sb->level != refsb->level ||
1658                    sb->layout != refsb->layout ||
1659                    sb->chunksize != refsb->chunksize) {
1660                        pr_warn("md: %s has strangely different superblock to %s\n",
1661                                bdevname(rdev->bdev,b),
1662                                bdevname(refdev->bdev,b2));
1663                        return -EINVAL;
1664                }
1665                ev1 = le64_to_cpu(sb->events);
1666                ev2 = le64_to_cpu(refsb->events);
1667
1668                if (ev1 > ev2)
1669                        ret = 1;
1670                else
1671                        ret = 0;
1672        }
1673        if (minor_version) {
1674                sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1675                sectors -= rdev->data_offset;
1676        } else
1677                sectors = rdev->sb_start;
1678        if (sectors < le64_to_cpu(sb->data_size))
1679                return -EINVAL;
1680        rdev->sectors = le64_to_cpu(sb->data_size);
1681        return ret;
1682}
1683
1684static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1685{
1686        struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1687        __u64 ev1 = le64_to_cpu(sb->events);
1688
1689        rdev->raid_disk = -1;
1690        clear_bit(Faulty, &rdev->flags);
1691        clear_bit(In_sync, &rdev->flags);
1692        clear_bit(Bitmap_sync, &rdev->flags);
1693        clear_bit(WriteMostly, &rdev->flags);
1694
1695        if (mddev->raid_disks == 0) {
1696                mddev->major_version = 1;
1697                mddev->patch_version = 0;
1698                mddev->external = 0;
1699                mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1700                mddev->ctime = le64_to_cpu(sb->ctime);
1701                mddev->utime = le64_to_cpu(sb->utime);
1702                mddev->level = le32_to_cpu(sb->level);
1703                mddev->clevel[0] = 0;
1704                mddev->layout = le32_to_cpu(sb->layout);
1705                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1706                mddev->dev_sectors = le64_to_cpu(sb->size);
1707                mddev->events = ev1;
1708                mddev->bitmap_info.offset = 0;
1709                mddev->bitmap_info.space = 0;
1710                /* Default location for bitmap is 1K after superblock
1711                 * using 3K - total of 4K
1712                 */
1713                mddev->bitmap_info.default_offset = 1024 >> 9;
1714                mddev->bitmap_info.default_space = (4096-1024) >> 9;
1715                mddev->reshape_backwards = 0;
1716
1717                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1718                memcpy(mddev->uuid, sb->set_uuid, 16);
1719
1720                mddev->max_disks =  (4096-256)/2;
1721
1722                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1723                    mddev->bitmap_info.file == NULL) {
1724                        mddev->bitmap_info.offset =
1725                                (__s32)le32_to_cpu(sb->bitmap_offset);
1726                        /* Metadata doesn't record how much space is available.
1727                         * For 1.0, we assume we can use up to the superblock
1728                         * if before, else to 4K beyond superblock.
1729                         * For others, assume no change is possible.
1730                         */
1731                        if (mddev->minor_version > 0)
1732                                mddev->bitmap_info.space = 0;
1733                        else if (mddev->bitmap_info.offset > 0)
1734                                mddev->bitmap_info.space =
1735                                        8 - mddev->bitmap_info.offset;
1736                        else
1737                                mddev->bitmap_info.space =
1738                                        -mddev->bitmap_info.offset;
1739                }
1740
1741                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1742                        mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1743                        mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1744                        mddev->new_level = le32_to_cpu(sb->new_level);
1745                        mddev->new_layout = le32_to_cpu(sb->new_layout);
1746                        mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1747                        if (mddev->delta_disks < 0 ||
1748                            (mddev->delta_disks == 0 &&
1749                             (le32_to_cpu(sb->feature_map)
1750                              & MD_FEATURE_RESHAPE_BACKWARDS)))
1751                                mddev->reshape_backwards = 1;
1752                } else {
1753                        mddev->reshape_position = MaxSector;
1754                        mddev->delta_disks = 0;
1755                        mddev->new_level = mddev->level;
1756                        mddev->new_layout = mddev->layout;
1757                        mddev->new_chunk_sectors = mddev->chunk_sectors;
1758                }
1759
1760                if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1761                        set_bit(MD_HAS_JOURNAL, &mddev->flags);
1762
1763                if (le32_to_cpu(sb->feature_map) &
1764                    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1765                        if (le32_to_cpu(sb->feature_map) &
1766                            (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1767                                return -EINVAL;
1768                        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1769                            (le32_to_cpu(sb->feature_map) &
1770                                            MD_FEATURE_MULTIPLE_PPLS))
1771                                return -EINVAL;
1772                        set_bit(MD_HAS_PPL, &mddev->flags);
1773                }
1774        } else if (mddev->pers == NULL) {
1775                /* Insist of good event counter while assembling, except for
1776                 * spares (which don't need an event count) */
1777                ++ev1;
1778                if (rdev->desc_nr >= 0 &&
1779                    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1780                    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1781                     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1782                        if (ev1 < mddev->events)
1783                                return -EINVAL;
1784        } else if (mddev->bitmap) {
1785                /* If adding to array with a bitmap, then we can accept an
1786                 * older device, but not too old.
1787                 */
1788                if (ev1 < mddev->bitmap->events_cleared)
1789                        return 0;
1790                if (ev1 < mddev->events)
1791                        set_bit(Bitmap_sync, &rdev->flags);
1792        } else {
1793                if (ev1 < mddev->events)
1794                        /* just a hot-add of a new device, leave raid_disk at -1 */
1795                        return 0;
1796        }
1797        if (mddev->level != LEVEL_MULTIPATH) {
1798                int role;
1799                if (rdev->desc_nr < 0 ||
1800                    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1801                        role = MD_DISK_ROLE_SPARE;
1802                        rdev->desc_nr = -1;
1803                } else
1804                        role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1805                switch(role) {
1806                case MD_DISK_ROLE_SPARE: /* spare */
1807                        break;
1808                case MD_DISK_ROLE_FAULTY: /* faulty */
1809                        set_bit(Faulty, &rdev->flags);
1810                        break;
1811                case MD_DISK_ROLE_JOURNAL: /* journal device */
1812                        if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1813                                /* journal device without journal feature */
1814                                pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1815                                return -EINVAL;
1816                        }
1817                        set_bit(Journal, &rdev->flags);
1818                        rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1819                        rdev->raid_disk = 0;
1820                        break;
1821                default:
1822                        rdev->saved_raid_disk = role;
1823                        if ((le32_to_cpu(sb->feature_map) &
1824                             MD_FEATURE_RECOVERY_OFFSET)) {
1825                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1826                                if (!(le32_to_cpu(sb->feature_map) &
1827                                      MD_FEATURE_RECOVERY_BITMAP))
1828                                        rdev->saved_raid_disk = -1;
1829                        } else
1830                                set_bit(In_sync, &rdev->flags);
1831                        rdev->raid_disk = role;
1832                        break;
1833                }
1834                if (sb->devflags & WriteMostly1)
1835                        set_bit(WriteMostly, &rdev->flags);
1836                if (sb->devflags & FailFast1)
1837                        set_bit(FailFast, &rdev->flags);
1838                if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1839                        set_bit(Replacement, &rdev->flags);
1840        } else /* MULTIPATH are always insync */
1841                set_bit(In_sync, &rdev->flags);
1842
1843        return 0;
1844}
1845
1846static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1847{
1848        struct mdp_superblock_1 *sb;
1849        struct md_rdev *rdev2;
1850        int max_dev, i;
1851        /* make rdev->sb match mddev and rdev data. */
1852
1853        sb = page_address(rdev->sb_page);
1854
1855        sb->feature_map = 0;
1856        sb->pad0 = 0;
1857        sb->recovery_offset = cpu_to_le64(0);
1858        memset(sb->pad3, 0, sizeof(sb->pad3));
1859
1860        sb->utime = cpu_to_le64((__u64)mddev->utime);
1861        sb->events = cpu_to_le64(mddev->events);
1862        if (mddev->in_sync)
1863                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1864        else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1865                sb->resync_offset = cpu_to_le64(MaxSector);
1866        else
1867                sb->resync_offset = cpu_to_le64(0);
1868
1869        sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1870
1871        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1872        sb->size = cpu_to_le64(mddev->dev_sectors);
1873        sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1874        sb->level = cpu_to_le32(mddev->level);
1875        sb->layout = cpu_to_le32(mddev->layout);
1876        if (test_bit(FailFast, &rdev->flags))
1877                sb->devflags |= FailFast1;
1878        else
1879                sb->devflags &= ~FailFast1;
1880
1881        if (test_bit(WriteMostly, &rdev->flags))
1882                sb->devflags |= WriteMostly1;
1883        else
1884                sb->devflags &= ~WriteMostly1;
1885        sb->data_offset = cpu_to_le64(rdev->data_offset);
1886        sb->data_size = cpu_to_le64(rdev->sectors);
1887
1888        if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1889                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1890                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1891        }
1892
1893        if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1894            !test_bit(In_sync, &rdev->flags)) {
1895                sb->feature_map |=
1896                        cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1897                sb->recovery_offset =
1898                        cpu_to_le64(rdev->recovery_offset);
1899                if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1900                        sb->feature_map |=
1901                                cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1902        }
1903        /* Note: recovery_offset and journal_tail share space  */
1904        if (test_bit(Journal, &rdev->flags))
1905                sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1906        if (test_bit(Replacement, &rdev->flags))
1907                sb->feature_map |=
1908                        cpu_to_le32(MD_FEATURE_REPLACEMENT);
1909
1910        if (mddev->reshape_position != MaxSector) {
1911                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1912                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1913                sb->new_layout = cpu_to_le32(mddev->new_layout);
1914                sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1915                sb->new_level = cpu_to_le32(mddev->new_level);
1916                sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1917                if (mddev->delta_disks == 0 &&
1918                    mddev->reshape_backwards)
1919                        sb->feature_map
1920                                |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1921                if (rdev->new_data_offset != rdev->data_offset) {
1922                        sb->feature_map
1923                                |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1924                        sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1925                                                             - rdev->data_offset));
1926                }
1927        }
1928
1929        if (mddev_is_clustered(mddev))
1930                sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1931
1932        if (rdev->badblocks.count == 0)
1933                /* Nothing to do for bad blocks*/ ;
1934        else if (sb->bblog_offset == 0)
1935                /* Cannot record bad blocks on this device */
1936                md_error(mddev, rdev);
1937        else {
1938                struct badblocks *bb = &rdev->badblocks;
1939                __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
1940                u64 *p = bb->page;
1941                sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1942                if (bb->changed) {
1943                        unsigned seq;
1944
1945retry:
1946                        seq = read_seqbegin(&bb->lock);
1947
1948                        memset(bbp, 0xff, PAGE_SIZE);
1949
1950                        for (i = 0 ; i < bb->count ; i++) {
1951                                u64 internal_bb = p[i];
1952                                u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1953                                                | BB_LEN(internal_bb));
1954                                bbp[i] = cpu_to_le64(store_bb);
1955                        }
1956                        bb->changed = 0;
1957                        if (read_seqretry(&bb->lock, seq))
1958                                goto retry;
1959
1960                        bb->sector = (rdev->sb_start +
1961                                      (int)le32_to_cpu(sb->bblog_offset));
1962                        bb->size = le16_to_cpu(sb->bblog_size);
1963                }
1964        }
1965
1966        max_dev = 0;
1967        rdev_for_each(rdev2, mddev)
1968                if (rdev2->desc_nr+1 > max_dev)
1969                        max_dev = rdev2->desc_nr+1;
1970
1971        if (max_dev > le32_to_cpu(sb->max_dev)) {
1972                int bmask;
1973                sb->max_dev = cpu_to_le32(max_dev);
1974                rdev->sb_size = max_dev * 2 + 256;
1975                bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1976                if (rdev->sb_size & bmask)
1977                        rdev->sb_size = (rdev->sb_size | bmask) + 1;
1978        } else
1979                max_dev = le32_to_cpu(sb->max_dev);
1980
1981        for (i=0; i<max_dev;i++)
1982                sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1983
1984        if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
1985                sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
1986
1987        if (test_bit(MD_HAS_PPL, &mddev->flags)) {
1988                if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
1989                        sb->feature_map |=
1990                            cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
1991                else
1992                        sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
1993                sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
1994                sb->ppl.size = cpu_to_le16(rdev->ppl.size);
1995        }
1996
1997        rdev_for_each(rdev2, mddev) {
1998                i = rdev2->desc_nr;
1999                if (test_bit(Faulty, &rdev2->flags))
2000                        sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2001                else if (test_bit(In_sync, &rdev2->flags))
2002                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2003                else if (test_bit(Journal, &rdev2->flags))
2004                        sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2005                else if (rdev2->raid_disk >= 0)
2006                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2007                else
2008                        sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2009        }
2010
2011        sb->sb_csum = calc_sb_1_csum(sb);
2012}
2013
2014static unsigned long long
2015super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2016{
2017        struct mdp_superblock_1 *sb;
2018        sector_t max_sectors;
2019        if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2020                return 0; /* component must fit device */
2021        if (rdev->data_offset != rdev->new_data_offset)
2022                return 0; /* too confusing */
2023        if (rdev->sb_start < rdev->data_offset) {
2024                /* minor versions 1 and 2; superblock before data */
2025                max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
2026                max_sectors -= rdev->data_offset;
2027                if (!num_sectors || num_sectors > max_sectors)
2028                        num_sectors = max_sectors;
2029        } else if (rdev->mddev->bitmap_info.offset) {
2030                /* minor version 0 with bitmap we can't move */
2031                return 0;
2032        } else {
2033                /* minor version 0; superblock after data */
2034                sector_t sb_start;
2035                sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
2036                sb_start &= ~(sector_t)(4*2 - 1);
2037                max_sectors = rdev->sectors + sb_start - rdev->sb_start;
2038                if (!num_sectors || num_sectors > max_sectors)
2039                        num_sectors = max_sectors;
2040                rdev->sb_start = sb_start;
2041        }
2042        sb = page_address(rdev->sb_page);
2043        sb->data_size = cpu_to_le64(num_sectors);
2044        sb->super_offset = cpu_to_le64(rdev->sb_start);
2045        sb->sb_csum = calc_sb_1_csum(sb);
2046        do {
2047                md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2048                               rdev->sb_page);
2049        } while (md_super_wait(rdev->mddev) < 0);
2050        return num_sectors;
2051
2052}
2053
2054static int
2055super_1_allow_new_offset(struct md_rdev *rdev,
2056                         unsigned long long new_offset)
2057{
2058        /* All necessary checks on new >= old have been done */
2059        struct bitmap *bitmap;
2060        if (new_offset >= rdev->data_offset)
2061                return 1;
2062
2063        /* with 1.0 metadata, there is no metadata to tread on
2064         * so we can always move back */
2065        if (rdev->mddev->minor_version == 0)
2066                return 1;
2067
2068        /* otherwise we must be sure not to step on
2069         * any metadata, so stay:
2070         * 36K beyond start of superblock
2071         * beyond end of badblocks
2072         * beyond write-intent bitmap
2073         */
2074        if (rdev->sb_start + (32+4)*2 > new_offset)
2075                return 0;
2076        bitmap = rdev->mddev->bitmap;
2077        if (bitmap && !rdev->mddev->bitmap_info.file &&
2078            rdev->sb_start + rdev->mddev->bitmap_info.offset +
2079            bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2080                return 0;
2081        if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2082                return 0;
2083
2084        return 1;
2085}
2086
2087static struct super_type super_types[] = {
2088        [0] = {
2089                .name   = "0.90.0",
2090                .owner  = THIS_MODULE,
2091                .load_super         = super_90_load,
2092                .validate_super     = super_90_validate,
2093                .sync_super         = super_90_sync,
2094                .rdev_size_change   = super_90_rdev_size_change,
2095                .allow_new_offset   = super_90_allow_new_offset,
2096        },
2097        [1] = {
2098                .name   = "md-1",
2099                .owner  = THIS_MODULE,
2100                .load_super         = super_1_load,
2101                .validate_super     = super_1_validate,
2102                .sync_super         = super_1_sync,
2103                .rdev_size_change   = super_1_rdev_size_change,
2104                .allow_new_offset   = super_1_allow_new_offset,
2105        },
2106};
2107
2108static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2109{
2110        if (mddev->sync_super) {
2111                mddev->sync_super(mddev, rdev);
2112                return;
2113        }
2114
2115        BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2116
2117        super_types[mddev->major_version].sync_super(mddev, rdev);
2118}
2119
2120static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2121{
2122        struct md_rdev *rdev, *rdev2;
2123
2124        rcu_read_lock();
2125        rdev_for_each_rcu(rdev, mddev1) {
2126                if (test_bit(Faulty, &rdev->flags) ||
2127                    test_bit(Journal, &rdev->flags) ||
2128                    rdev->raid_disk == -1)
2129                        continue;
2130                rdev_for_each_rcu(rdev2, mddev2) {
2131                        if (test_bit(Faulty, &rdev2->flags) ||
2132                            test_bit(Journal, &rdev2->flags) ||
2133                            rdev2->raid_disk == -1)
2134                                continue;
2135                        if (rdev->bdev->bd_contains ==
2136                            rdev2->bdev->bd_contains) {
2137                                rcu_read_unlock();
2138                                return 1;
2139                        }
2140                }
2141        }
2142        rcu_read_unlock();
2143        return 0;
2144}
2145
2146static LIST_HEAD(pending_raid_disks);
2147
2148/*
2149 * Try to register data integrity profile for an mddev
2150 *
2151 * This is called when an array is started and after a disk has been kicked
2152 * from the array. It only succeeds if all working and active component devices
2153 * are integrity capable with matching profiles.
2154 */
2155int md_integrity_register(struct mddev *mddev)
2156{
2157        struct md_rdev *rdev, *reference = NULL;
2158
2159        if (list_empty(&mddev->disks))
2160                return 0; /* nothing to do */
2161        if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2162                return 0; /* shouldn't register, or already is */
2163        rdev_for_each(rdev, mddev) {
2164                /* skip spares and non-functional disks */
2165                if (test_bit(Faulty, &rdev->flags))
2166                        continue;
2167                if (rdev->raid_disk < 0)
2168                        continue;
2169                if (!reference) {
2170                        /* Use the first rdev as the reference */
2171                        reference = rdev;
2172                        continue;
2173                }
2174                /* does this rdev's profile match the reference profile? */
2175                if (blk_integrity_compare(reference->bdev->bd_disk,
2176                                rdev->bdev->bd_disk) < 0)
2177                        return -EINVAL;
2178        }
2179        if (!reference || !bdev_get_integrity(reference->bdev))
2180                return 0;
2181        /*
2182         * All component devices are integrity capable and have matching
2183         * profiles, register the common profile for the md device.
2184         */
2185        blk_integrity_register(mddev->gendisk,
2186                               bdev_get_integrity(reference->bdev));
2187
2188        pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2189        if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
2190                pr_err("md: failed to create integrity pool for %s\n",
2191                       mdname(mddev));
2192                return -EINVAL;
2193        }
2194        return 0;
2195}
2196EXPORT_SYMBOL(md_integrity_register);
2197
2198/*
2199 * Attempt to add an rdev, but only if it is consistent with the current
2200 * integrity profile
2201 */
2202int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2203{
2204        struct blk_integrity *bi_mddev;
2205        char name[BDEVNAME_SIZE];
2206
2207        if (!mddev->gendisk)
2208                return 0;
2209
2210        bi_mddev = blk_get_integrity(mddev->gendisk);
2211
2212        if (!bi_mddev) /* nothing to do */
2213                return 0;
2214
2215        if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2216                pr_err("%s: incompatible integrity profile for %s\n",
2217                       mdname(mddev), bdevname(rdev->bdev, name));
2218                return -ENXIO;
2219        }
2220
2221        return 0;
2222}
2223EXPORT_SYMBOL(md_integrity_add_rdev);
2224
2225static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2226{
2227        char b[BDEVNAME_SIZE];
2228        struct kobject *ko;
2229        int err;
2230
2231        /* prevent duplicates */
2232        if (find_rdev(mddev, rdev->bdev->bd_dev))
2233                return -EEXIST;
2234
2235        if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
2236            mddev->pers)
2237                return -EROFS;
2238
2239        /* make sure rdev->sectors exceeds mddev->dev_sectors */
2240        if (!test_bit(Journal, &rdev->flags) &&
2241            rdev->sectors &&
2242            (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2243                if (mddev->pers) {
2244                        /* Cannot change size, so fail
2245                         * If mddev->level <= 0, then we don't care
2246                         * about aligning sizes (e.g. linear)
2247                         */
2248                        if (mddev->level > 0)
2249                                return -ENOSPC;
2250                } else
2251                        mddev->dev_sectors = rdev->sectors;
2252        }
2253
2254        /* Verify rdev->desc_nr is unique.
2255         * If it is -1, assign a free number, else
2256         * check number is not in use
2257         */
2258        rcu_read_lock();
2259        if (rdev->desc_nr < 0) {
2260                int choice = 0;
2261                if (mddev->pers)
2262                        choice = mddev->raid_disks;
2263                while (md_find_rdev_nr_rcu(mddev, choice))
2264                        choice++;
2265                rdev->desc_nr = choice;
2266        } else {
2267                if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2268                        rcu_read_unlock();
2269                        return -EBUSY;
2270                }
2271        }
2272        rcu_read_unlock();
2273        if (!test_bit(Journal, &rdev->flags) &&
2274            mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2275                pr_warn("md: %s: array is limited to %d devices\n",
2276                        mdname(mddev), mddev->max_disks);
2277                return -EBUSY;
2278        }
2279        bdevname(rdev->bdev,b);
2280        strreplace(b, '/', '!');
2281
2282        rdev->mddev = mddev;
2283        pr_debug("md: bind<%s>\n", b);
2284
2285        if (mddev->raid_disks)
2286                mddev_create_wb_pool(mddev, rdev, false);
2287
2288        if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2289                goto fail;
2290
2291        ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2292        if (sysfs_create_link(&rdev->kobj, ko, "block"))
2293                /* failure here is OK */;
2294        rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2295
2296        list_add_rcu(&rdev->same_set, &mddev->disks);
2297        bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2298
2299        /* May as well allow recovery to be retried once */
2300        mddev->recovery_disabled++;
2301
2302        return 0;
2303
2304 fail:
2305        pr_warn("md: failed to register dev-%s for %s\n",
2306                b, mdname(mddev));
2307        return err;
2308}
2309
2310static void md_delayed_delete(struct work_struct *ws)
2311{
2312        struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2313        kobject_del(&rdev->kobj);
2314        kobject_put(&rdev->kobj);
2315}
2316
2317static void unbind_rdev_from_array(struct md_rdev *rdev)
2318{
2319        char b[BDEVNAME_SIZE];
2320
2321        bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2322        list_del_rcu(&rdev->same_set);
2323        pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
2324        mddev_destroy_wb_pool(rdev->mddev, rdev);
2325        rdev->mddev = NULL;
2326        sysfs_remove_link(&rdev->kobj, "block");
2327        sysfs_put(rdev->sysfs_state);
2328        rdev->sysfs_state = NULL;
2329        rdev->badblocks.count = 0;
2330        /* We need to delay this, otherwise we can deadlock when
2331         * writing to 'remove' to "dev/state".  We also need
2332         * to delay it due to rcu usage.
2333         */
2334        synchronize_rcu();
2335        INIT_WORK(&rdev->del_work, md_delayed_delete);
2336        kobject_get(&rdev->kobj);
2337        queue_work(md_misc_wq, &rdev->del_work);
2338}
2339
2340/*
2341 * prevent the device from being mounted, repartitioned or
2342 * otherwise reused by a RAID array (or any other kernel
2343 * subsystem), by bd_claiming the device.
2344 */
2345static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2346{
2347        int err = 0;
2348        struct block_device *bdev;
2349        char b[BDEVNAME_SIZE];
2350
2351        bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2352                                 shared ? (struct md_rdev *)lock_rdev : rdev);
2353        if (IS_ERR(bdev)) {
2354                pr_warn("md: could not open %s.\n", __bdevname(dev, b));
2355                return PTR_ERR(bdev);
2356        }
2357        rdev->bdev = bdev;
2358        return err;
2359}
2360
2361static void unlock_rdev(struct md_rdev *rdev)
2362{
2363        struct block_device *bdev = rdev->bdev;
2364        rdev->bdev = NULL;
2365        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2366}
2367
2368void md_autodetect_dev(dev_t dev);
2369
2370static void export_rdev(struct md_rdev *rdev)
2371{
2372        char b[BDEVNAME_SIZE];
2373
2374        pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
2375        md_rdev_clear(rdev);
2376#ifndef MODULE
2377        if (test_bit(AutoDetected, &rdev->flags))
2378                md_autodetect_dev(rdev->bdev->bd_dev);
2379#endif
2380        unlock_rdev(rdev);
2381        kobject_put(&rdev->kobj);
2382}
2383
2384void md_kick_rdev_from_array(struct md_rdev *rdev)
2385{
2386        unbind_rdev_from_array(rdev);
2387        export_rdev(rdev);
2388}
2389EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2390
2391static void export_array(struct mddev *mddev)
2392{
2393        struct md_rdev *rdev;
2394
2395        while (!list_empty(&mddev->disks)) {
2396                rdev = list_first_entry(&mddev->disks, struct md_rdev,
2397                                        same_set);
2398                md_kick_rdev_from_array(rdev);
2399        }
2400        mddev->raid_disks = 0;
2401        mddev->major_version = 0;
2402}
2403
2404static bool set_in_sync(struct mddev *mddev)
2405{
2406        lockdep_assert_held(&mddev->lock);
2407        if (!mddev->in_sync) {
2408                mddev->sync_checkers++;
2409                spin_unlock(&mddev->lock);
2410                percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2411                spin_lock(&mddev->lock);
2412                if (!mddev->in_sync &&
2413                    percpu_ref_is_zero(&mddev->writes_pending)) {
2414                        mddev->in_sync = 1;
2415                        /*
2416                         * Ensure ->in_sync is visible before we clear
2417                         * ->sync_checkers.
2418                         */
2419                        smp_mb();
2420                        set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2421                        sysfs_notify_dirent_safe(mddev->sysfs_state);
2422                }
2423                if (--mddev->sync_checkers == 0)
2424                        percpu_ref_switch_to_percpu(&mddev->writes_pending);
2425        }
2426        if (mddev->safemode == 1)
2427                mddev->safemode = 0;
2428        return mddev->in_sync;
2429}
2430
2431static void sync_sbs(struct mddev *mddev, int nospares)
2432{
2433        /* Update each superblock (in-memory image), but
2434         * if we are allowed to, skip spares which already
2435         * have the right event counter, or have one earlier
2436         * (which would mean they aren't being marked as dirty
2437         * with the rest of the array)
2438         */
2439        struct md_rdev *rdev;
2440        rdev_for_each(rdev, mddev) {
2441                if (rdev->sb_events == mddev->events ||
2442                    (nospares &&
2443                     rdev->raid_disk < 0 &&
2444                     rdev->sb_events+1 == mddev->events)) {
2445                        /* Don't update this superblock */
2446                        rdev->sb_loaded = 2;
2447                } else {
2448                        sync_super(mddev, rdev);
2449                        rdev->sb_loaded = 1;
2450                }
2451        }
2452}
2453
2454static bool does_sb_need_changing(struct mddev *mddev)
2455{
2456        struct md_rdev *rdev;
2457        struct mdp_superblock_1 *sb;
2458        int role;
2459
2460        /* Find a good rdev */
2461        rdev_for_each(rdev, mddev)
2462                if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2463                        break;
2464
2465        /* No good device found. */
2466        if (!rdev)
2467                return false;
2468
2469        sb = page_address(rdev->sb_page);
2470        /* Check if a device has become faulty or a spare become active */
2471        rdev_for_each(rdev, mddev) {
2472                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2473                /* Device activated? */
2474                if (role == 0xffff && rdev->raid_disk >=0 &&
2475                    !test_bit(Faulty, &rdev->flags))
2476                        return true;
2477                /* Device turned faulty? */
2478                if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2479                        return true;
2480        }
2481
2482        /* Check if any mddev parameters have changed */
2483        if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2484            (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2485            (mddev->layout != le32_to_cpu(sb->layout)) ||
2486            (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2487            (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2488                return true;
2489
2490        return false;
2491}
2492
2493void md_update_sb(struct mddev *mddev, int force_change)
2494{
2495        struct md_rdev *rdev;
2496        int sync_req;
2497        int nospares = 0;
2498        int any_badblocks_changed = 0;
2499        int ret = -1;
2500
2501        if (mddev->ro) {
2502                if (force_change)
2503                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2504                return;
2505        }
2506
2507repeat:
2508        if (mddev_is_clustered(mddev)) {
2509                if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2510                        force_change = 1;
2511                if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2512                        nospares = 1;
2513                ret = md_cluster_ops->metadata_update_start(mddev);
2514                /* Has someone else has updated the sb */
2515                if (!does_sb_need_changing(mddev)) {
2516                        if (ret == 0)
2517                                md_cluster_ops->metadata_update_cancel(mddev);
2518                        bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2519                                                         BIT(MD_SB_CHANGE_DEVS) |
2520                                                         BIT(MD_SB_CHANGE_CLEAN));
2521                        return;
2522                }
2523        }
2524
2525        /*
2526         * First make sure individual recovery_offsets are correct
2527         * curr_resync_completed can only be used during recovery.
2528         * During reshape/resync it might use array-addresses rather
2529         * that device addresses.
2530         */
2531        rdev_for_each(rdev, mddev) {
2532                if (rdev->raid_disk >= 0 &&
2533                    mddev->delta_disks >= 0 &&
2534                    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2535                    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2536                    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2537                    !test_bit(Journal, &rdev->flags) &&
2538                    !test_bit(In_sync, &rdev->flags) &&
2539                    mddev->curr_resync_completed > rdev->recovery_offset)
2540                                rdev->recovery_offset = mddev->curr_resync_completed;
2541
2542        }
2543        if (!mddev->persistent) {
2544                clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2545                clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2546                if (!mddev->external) {
2547                        clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2548                        rdev_for_each(rdev, mddev) {
2549                                if (rdev->badblocks.changed) {
2550                                        rdev->badblocks.changed = 0;
2551                                        ack_all_badblocks(&rdev->badblocks);
2552                                        md_error(mddev, rdev);
2553                                }
2554                                clear_bit(Blocked, &rdev->flags);
2555                                clear_bit(BlockedBadBlocks, &rdev->flags);
2556                                wake_up(&rdev->blocked_wait);
2557                        }
2558                }
2559                wake_up(&mddev->sb_wait);
2560                return;
2561        }
2562
2563        spin_lock(&mddev->lock);
2564
2565        mddev->utime = ktime_get_real_seconds();
2566
2567        if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2568                force_change = 1;
2569        if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2570                /* just a clean<-> dirty transition, possibly leave spares alone,
2571                 * though if events isn't the right even/odd, we will have to do
2572                 * spares after all
2573                 */
2574                nospares = 1;
2575        if (force_change)
2576                nospares = 0;
2577        if (mddev->degraded)
2578                /* If the array is degraded, then skipping spares is both
2579                 * dangerous and fairly pointless.
2580                 * Dangerous because a device that was removed from the array
2581                 * might have a event_count that still looks up-to-date,
2582                 * so it can be re-added without a resync.
2583                 * Pointless because if there are any spares to skip,
2584                 * then a recovery will happen and soon that array won't
2585                 * be degraded any more and the spare can go back to sleep then.
2586                 */
2587                nospares = 0;
2588
2589        sync_req = mddev->in_sync;
2590
2591        /* If this is just a dirty<->clean transition, and the array is clean
2592         * and 'events' is odd, we can roll back to the previous clean state */
2593        if (nospares
2594            && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2595            && mddev->can_decrease_events
2596            && mddev->events != 1) {
2597                mddev->events--;
2598                mddev->can_decrease_events = 0;
2599        } else {
2600                /* otherwise we have to go forward and ... */
2601                mddev->events ++;
2602                mddev->can_decrease_events = nospares;
2603        }
2604
2605        /*
2606         * This 64-bit counter should never wrap.
2607         * Either we are in around ~1 trillion A.C., assuming
2608         * 1 reboot per second, or we have a bug...
2609         */
2610        WARN_ON(mddev->events == 0);
2611
2612        rdev_for_each(rdev, mddev) {
2613                if (rdev->badblocks.changed)
2614                        any_badblocks_changed++;
2615                if (test_bit(Faulty, &rdev->flags))
2616                        set_bit(FaultRecorded, &rdev->flags);
2617        }
2618
2619        sync_sbs(mddev, nospares);
2620        spin_unlock(&mddev->lock);
2621
2622        pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2623                 mdname(mddev), mddev->in_sync);
2624
2625        if (mddev->queue)
2626                blk_add_trace_msg(mddev->queue, "md md_update_sb");
2627rewrite:
2628        md_bitmap_update_sb(mddev->bitmap);
2629        rdev_for_each(rdev, mddev) {
2630                char b[BDEVNAME_SIZE];
2631
2632                if (rdev->sb_loaded != 1)
2633                        continue; /* no noise on spare devices */
2634
2635                if (!test_bit(Faulty, &rdev->flags)) {
2636                        md_super_write(mddev,rdev,
2637                                       rdev->sb_start, rdev->sb_size,
2638                                       rdev->sb_page);
2639                        pr_debug("md: (write) %s's sb offset: %llu\n",
2640                                 bdevname(rdev->bdev, b),
2641                                 (unsigned long long)rdev->sb_start);
2642                        rdev->sb_events = mddev->events;
2643                        if (rdev->badblocks.size) {
2644                                md_super_write(mddev, rdev,
2645                                               rdev->badblocks.sector,
2646                                               rdev->badblocks.size << 9,
2647                                               rdev->bb_page);
2648                                rdev->badblocks.size = 0;
2649                        }
2650
2651                } else
2652                        pr_debug("md: %s (skipping faulty)\n",
2653                                 bdevname(rdev->bdev, b));
2654
2655                if (mddev->level == LEVEL_MULTIPATH)
2656                        /* only need to write one superblock... */
2657                        break;
2658        }
2659        if (md_super_wait(mddev) < 0)
2660                goto rewrite;
2661        /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2662
2663        if (mddev_is_clustered(mddev) && ret == 0)
2664                md_cluster_ops->metadata_update_finish(mddev);
2665
2666        if (mddev->in_sync != sync_req ||
2667            !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2668                               BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2669                /* have to write it out again */
2670                goto repeat;
2671        wake_up(&mddev->sb_wait);
2672        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2673                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2674
2675        rdev_for_each(rdev, mddev) {
2676                if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2677                        clear_bit(Blocked, &rdev->flags);
2678
2679                if (any_badblocks_changed)
2680                        ack_all_badblocks(&rdev->badblocks);
2681                clear_bit(BlockedBadBlocks, &rdev->flags);
2682                wake_up(&rdev->blocked_wait);
2683        }
2684}
2685EXPORT_SYMBOL(md_update_sb);
2686
2687static int add_bound_rdev(struct md_rdev *rdev)
2688{
2689        struct mddev *mddev = rdev->mddev;
2690        int err = 0;
2691        bool add_journal = test_bit(Journal, &rdev->flags);
2692
2693        if (!mddev->pers->hot_remove_disk || add_journal) {
2694                /* If there is hot_add_disk but no hot_remove_disk
2695                 * then added disks for geometry changes,
2696                 * and should be added immediately.
2697                 */
2698                super_types[mddev->major_version].
2699                        validate_super(mddev, rdev);
2700                if (add_journal)
2701                        mddev_suspend(mddev);
2702                err = mddev->pers->hot_add_disk(mddev, rdev);
2703                if (add_journal)
2704                        mddev_resume(mddev);
2705                if (err) {
2706                        md_kick_rdev_from_array(rdev);
2707                        return err;
2708                }
2709        }
2710        sysfs_notify_dirent_safe(rdev->sysfs_state);
2711
2712        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2713        if (mddev->degraded)
2714                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2715        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2716        md_new_event(mddev);
2717        md_wakeup_thread(mddev->thread);
2718        return 0;
2719}
2720
2721/* words written to sysfs files may, or may not, be \n terminated.
2722 * We want to accept with case. For this we use cmd_match.
2723 */
2724static int cmd_match(const char *cmd, const char *str)
2725{
2726        /* See if cmd, written into a sysfs file, matches
2727         * str.  They must either be the same, or cmd can
2728         * have a trailing newline
2729         */
2730        while (*cmd && *str && *cmd == *str) {
2731                cmd++;
2732                str++;
2733        }
2734        if (*cmd == '\n')
2735                cmd++;
2736        if (*str || *cmd)
2737                return 0;
2738        return 1;
2739}
2740
2741struct rdev_sysfs_entry {
2742        struct attribute attr;
2743        ssize_t (*show)(struct md_rdev *, char *);
2744        ssize_t (*store)(struct md_rdev *, const char *, size_t);
2745};
2746
2747static ssize_t
2748state_show(struct md_rdev *rdev, char *page)
2749{
2750        char *sep = ",";
2751        size_t len = 0;
2752        unsigned long flags = READ_ONCE(rdev->flags);
2753
2754        if (test_bit(Faulty, &flags) ||
2755            (!test_bit(ExternalBbl, &flags) &&
2756            rdev->badblocks.unacked_exist))
2757                len += sprintf(page+len, "faulty%s", sep);
2758        if (test_bit(In_sync, &flags))
2759                len += sprintf(page+len, "in_sync%s", sep);
2760        if (test_bit(Journal, &flags))
2761                len += sprintf(page+len, "journal%s", sep);
2762        if (test_bit(WriteMostly, &flags))
2763                len += sprintf(page+len, "write_mostly%s", sep);
2764        if (test_bit(Blocked, &flags) ||
2765            (rdev->badblocks.unacked_exist
2766             && !test_bit(Faulty, &flags)))
2767                len += sprintf(page+len, "blocked%s", sep);
2768        if (!test_bit(Faulty, &flags) &&
2769            !test_bit(Journal, &flags) &&
2770            !test_bit(In_sync, &flags))
2771                len += sprintf(page+len, "spare%s", sep);
2772        if (test_bit(WriteErrorSeen, &flags))
2773                len += sprintf(page+len, "write_error%s", sep);
2774        if (test_bit(WantReplacement, &flags))
2775                len += sprintf(page+len, "want_replacement%s", sep);
2776        if (test_bit(Replacement, &flags))
2777                len += sprintf(page+len, "replacement%s", sep);
2778        if (test_bit(ExternalBbl, &flags))
2779                len += sprintf(page+len, "external_bbl%s", sep);
2780        if (test_bit(FailFast, &flags))
2781                len += sprintf(page+len, "failfast%s", sep);
2782
2783        if (len)
2784                len -= strlen(sep);
2785
2786        return len+sprintf(page+len, "\n");
2787}
2788
2789static ssize_t
2790state_store(struct md_rdev *rdev, const char *buf, size_t len)
2791{
2792        /* can write
2793         *  faulty  - simulates an error
2794         *  remove  - disconnects the device
2795         *  writemostly - sets write_mostly
2796         *  -writemostly - clears write_mostly
2797         *  blocked - sets the Blocked flags
2798         *  -blocked - clears the Blocked and possibly simulates an error
2799         *  insync - sets Insync providing device isn't active
2800         *  -insync - clear Insync for a device with a slot assigned,
2801         *            so that it gets rebuilt based on bitmap
2802         *  write_error - sets WriteErrorSeen
2803         *  -write_error - clears WriteErrorSeen
2804         *  {,-}failfast - set/clear FailFast
2805         */
2806        int err = -EINVAL;
2807        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2808                md_error(rdev->mddev, rdev);
2809                if (test_bit(Faulty, &rdev->flags))
2810                        err = 0;
2811                else
2812                        err = -EBUSY;
2813        } else if (cmd_match(buf, "remove")) {
2814                if (rdev->mddev->pers) {
2815                        clear_bit(Blocked, &rdev->flags);
2816                        remove_and_add_spares(rdev->mddev, rdev);
2817                }
2818                if (rdev->raid_disk >= 0)
2819                        err = -EBUSY;
2820                else {
2821                        struct mddev *mddev = rdev->mddev;
2822                        err = 0;
2823                        if (mddev_is_clustered(mddev))
2824                                err = md_cluster_ops->remove_disk(mddev, rdev);
2825
2826                        if (err == 0) {
2827                                md_kick_rdev_from_array(rdev);
2828                                if (mddev->pers) {
2829                                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2830                                        md_wakeup_thread(mddev->thread);
2831                                }
2832                                md_new_event(mddev);
2833                        }
2834                }
2835        } else if (cmd_match(buf, "writemostly")) {
2836                set_bit(WriteMostly, &rdev->flags);
2837                mddev_create_wb_pool(rdev->mddev, rdev, false);
2838                err = 0;
2839        } else if (cmd_match(buf, "-writemostly")) {
2840                mddev_destroy_wb_pool(rdev->mddev, rdev);
2841                clear_bit(WriteMostly, &rdev->flags);
2842                err = 0;
2843        } else if (cmd_match(buf, "blocked")) {
2844                set_bit(Blocked, &rdev->flags);
2845                err = 0;
2846        } else if (cmd_match(buf, "-blocked")) {
2847                if (!test_bit(Faulty, &rdev->flags) &&
2848                    !test_bit(ExternalBbl, &rdev->flags) &&
2849                    rdev->badblocks.unacked_exist) {
2850                        /* metadata handler doesn't understand badblocks,
2851                         * so we need to fail the device
2852                         */
2853                        md_error(rdev->mddev, rdev);
2854                }
2855                clear_bit(Blocked, &rdev->flags);
2856                clear_bit(BlockedBadBlocks, &rdev->flags);
2857                wake_up(&rdev->blocked_wait);
2858                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2859                md_wakeup_thread(rdev->mddev->thread);
2860
2861                err = 0;
2862        } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2863                set_bit(In_sync, &rdev->flags);
2864                err = 0;
2865        } else if (cmd_match(buf, "failfast")) {
2866                set_bit(FailFast, &rdev->flags);
2867                err = 0;
2868        } else if (cmd_match(buf, "-failfast")) {
2869                clear_bit(FailFast, &rdev->flags);
2870                err = 0;
2871        } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
2872                   !test_bit(Journal, &rdev->flags)) {
2873                if (rdev->mddev->pers == NULL) {
2874                        clear_bit(In_sync, &rdev->flags);
2875                        rdev->saved_raid_disk = rdev->raid_disk;
2876                        rdev->raid_disk = -1;
2877                        err = 0;
2878                }
2879        } else if (cmd_match(buf, "write_error")) {
2880                set_bit(WriteErrorSeen, &rdev->flags);
2881                err = 0;
2882        } else if (cmd_match(buf, "-write_error")) {
2883                clear_bit(WriteErrorSeen, &rdev->flags);
2884                err = 0;
2885        } else if (cmd_match(buf, "want_replacement")) {
2886                /* Any non-spare device that is not a replacement can
2887                 * become want_replacement at any time, but we then need to
2888                 * check if recovery is needed.
2889                 */
2890                if (rdev->raid_disk >= 0 &&
2891                    !test_bit(Journal, &rdev->flags) &&
2892                    !test_bit(Replacement, &rdev->flags))
2893                        set_bit(WantReplacement, &rdev->flags);
2894                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2895                md_wakeup_thread(rdev->mddev->thread);
2896                err = 0;
2897        } else if (cmd_match(buf, "-want_replacement")) {
2898                /* Clearing 'want_replacement' is always allowed.
2899                 * Once replacements starts it is too late though.
2900                 */
2901                err = 0;
2902                clear_bit(WantReplacement, &rdev->flags);
2903        } else if (cmd_match(buf, "replacement")) {
2904                /* Can only set a device as a replacement when array has not
2905                 * yet been started.  Once running, replacement is automatic
2906                 * from spares, or by assigning 'slot'.
2907                 */
2908                if (rdev->mddev->pers)
2909                        err = -EBUSY;
2910                else {
2911                        set_bit(Replacement, &rdev->flags);
2912                        err = 0;
2913                }
2914        } else if (cmd_match(buf, "-replacement")) {
2915                /* Similarly, can only clear Replacement before start */
2916                if (rdev->mddev->pers)
2917                        err = -EBUSY;
2918                else {
2919                        clear_bit(Replacement, &rdev->flags);
2920                        err = 0;
2921                }
2922        } else if (cmd_match(buf, "re-add")) {
2923                if (!rdev->mddev->pers)
2924                        err = -EINVAL;
2925                else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
2926                                rdev->saved_raid_disk >= 0) {
2927                        /* clear_bit is performed _after_ all the devices
2928                         * have their local Faulty bit cleared. If any writes
2929                         * happen in the meantime in the local node, they
2930                         * will land in the local bitmap, which will be synced
2931                         * by this node eventually
2932                         */
2933                        if (!mddev_is_clustered(rdev->mddev) ||
2934                            (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
2935                                clear_bit(Faulty, &rdev->flags);
2936                                err = add_bound_rdev(rdev);
2937                        }
2938                } else
2939                        err = -EBUSY;
2940        } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
2941                set_bit(ExternalBbl, &rdev->flags);
2942                rdev->badblocks.shift = 0;
2943                err = 0;
2944        } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
2945                clear_bit(ExternalBbl, &rdev->flags);
2946                err = 0;
2947        }
2948        if (!err)
2949                sysfs_notify_dirent_safe(rdev->sysfs_state);
2950        return err ? err : len;
2951}
2952static struct rdev_sysfs_entry rdev_state =
2953__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2954
2955static ssize_t
2956errors_show(struct md_rdev *rdev, char *page)
2957{
2958        return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2959}
2960
2961static ssize_t
2962errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2963{
2964        unsigned int n;
2965        int rv;
2966
2967        rv = kstrtouint(buf, 10, &n);
2968        if (rv < 0)
2969                return rv;
2970        atomic_set(&rdev->corrected_errors, n);
2971        return len;
2972}
2973static struct rdev_sysfs_entry rdev_errors =
2974__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2975
2976static ssize_t
2977slot_show(struct md_rdev *rdev, char *page)
2978{
2979        if (test_bit(Journal, &rdev->flags))
2980                return sprintf(page, "journal\n");
2981        else if (rdev->raid_disk < 0)
2982                return sprintf(page, "none\n");
2983        else
2984                return sprintf(page, "%d\n", rdev->raid_disk);
2985}
2986
2987static ssize_t
2988slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2989{
2990        int slot;
2991        int err;
2992
2993        if (test_bit(Journal, &rdev->flags))
2994                return -EBUSY;
2995        if (strncmp(buf, "none", 4)==0)
2996                slot = -1;
2997        else {
2998                err = kstrtouint(buf, 10, (unsigned int *)&slot);
2999                if (err < 0)
3000                        return err;
3001        }
3002        if (rdev->mddev->pers && slot == -1) {
3003                /* Setting 'slot' on an active array requires also
3004                 * updating the 'rd%d' link, and communicating
3005                 * with the personality with ->hot_*_disk.
3006                 * For now we only support removing
3007                 * failed/spare devices.  This normally happens automatically,
3008                 * but not when the metadata is externally managed.
3009                 */
3010                if (rdev->raid_disk == -1)
3011                        return -EEXIST;
3012                /* personality does all needed checks */
3013                if (rdev->mddev->pers->hot_remove_disk == NULL)
3014                        return -EINVAL;
3015                clear_bit(Blocked, &rdev->flags);
3016                remove_and_add_spares(rdev->mddev, rdev);
3017                if (rdev->raid_disk >= 0)
3018                        return -EBUSY;
3019                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3020                md_wakeup_thread(rdev->mddev->thread);
3021        } else if (rdev->mddev->pers) {
3022                /* Activating a spare .. or possibly reactivating
3023                 * if we ever get bitmaps working here.
3024                 */
3025                int err;
3026
3027                if (rdev->raid_disk != -1)
3028                        return -EBUSY;
3029
3030                if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3031                        return -EBUSY;
3032
3033                if (rdev->mddev->pers->hot_add_disk == NULL)
3034                        return -EINVAL;
3035
3036                if (slot >= rdev->mddev->raid_disks &&
3037                    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3038                        return -ENOSPC;
3039
3040                rdev->raid_disk = slot;
3041                if (test_bit(In_sync, &rdev->flags))
3042                        rdev->saved_raid_disk = slot;
3043                else
3044                        rdev->saved_raid_disk = -1;
3045                clear_bit(In_sync, &rdev->flags);
3046                clear_bit(Bitmap_sync, &rdev->flags);
3047                err = rdev->mddev->pers->
3048                        hot_add_disk(rdev->mddev, rdev);
3049                if (err) {
3050                        rdev->raid_disk = -1;
3051                        return err;
3052                } else
3053                        sysfs_notify_dirent_safe(rdev->sysfs_state);
3054                if (sysfs_link_rdev(rdev->mddev, rdev))
3055                        /* failure here is OK */;
3056                /* don't wakeup anyone, leave that to userspace. */
3057        } else {
3058                if (slot >= rdev->mddev->raid_disks &&
3059                    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3060                        return -ENOSPC;
3061                rdev->raid_disk = slot;
3062                /* assume it is working */
3063                clear_bit(Faulty, &rdev->flags);
3064                clear_bit(WriteMostly, &rdev->flags);
3065                set_bit(In_sync, &rdev->flags);
3066                sysfs_notify_dirent_safe(rdev->sysfs_state);
3067        }
3068        return len;
3069}
3070
3071static struct rdev_sysfs_entry rdev_slot =
3072__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3073
3074static ssize_t
3075offset_show(struct md_rdev *rdev, char *page)
3076{
3077        return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3078}
3079
3080static ssize_t
3081offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3082{
3083        unsigned long long offset;
3084        if (kstrtoull(buf, 10, &offset) < 0)
3085                return -EINVAL;
3086        if (rdev->mddev->pers && rdev->raid_disk >= 0)
3087                return -EBUSY;
3088        if (rdev->sectors && rdev->mddev->external)
3089                /* Must set offset before size, so overlap checks
3090                 * can be sane */
3091                return -EBUSY;
3092        rdev->data_offset = offset;
3093        rdev->new_data_offset = offset;
3094        return len;
3095}
3096
3097static struct rdev_sysfs_entry rdev_offset =
3098__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3099
3100static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3101{
3102        return sprintf(page, "%llu\n",
3103                       (unsigned long long)rdev->new_data_offset);
3104}
3105
3106static ssize_t new_offset_store(struct md_rdev *rdev,
3107                                const char *buf, size_t len)
3108{
3109        unsigned long long new_offset;
3110        struct mddev *mddev = rdev->mddev;
3111
3112        if (kstrtoull(buf, 10, &new_offset) < 0)
3113                return -EINVAL;
3114
3115        if (mddev->sync_thread ||
3116            test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3117                return -EBUSY;
3118        if (new_offset == rdev->data_offset)
3119                /* reset is always permitted */
3120                ;
3121        else if (new_offset > rdev->data_offset) {
3122                /* must not push array size beyond rdev_sectors */
3123                if (new_offset - rdev->data_offset
3124                    + mddev->dev_sectors > rdev->sectors)
3125                                return -E2BIG;
3126        }
3127        /* Metadata worries about other space details. */
3128
3129        /* decreasing the offset is inconsistent with a backwards
3130         * reshape.
3131         */
3132        if (new_offset < rdev->data_offset &&
3133            mddev->reshape_backwards)
3134                return -EINVAL;
3135        /* Increasing offset is inconsistent with forwards
3136         * reshape.  reshape_direction should be set to
3137         * 'backwards' first.
3138         */
3139        if (new_offset > rdev->data_offset &&
3140            !mddev->reshape_backwards)
3141                return -EINVAL;
3142
3143        if (mddev->pers && mddev->persistent &&
3144            !super_types[mddev->major_version]
3145            .allow_new_offset(rdev, new_offset))
3146                return -E2BIG;
3147        rdev->new_data_offset = new_offset;
3148        if (new_offset > rdev->data_offset)
3149                mddev->reshape_backwards = 1;
3150        else if (new_offset < rdev->data_offset)
3151                mddev->reshape_backwards = 0;
3152
3153        return len;
3154}
3155static struct rdev_sysfs_entry rdev_new_offset =
3156__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3157
3158static ssize_t
3159rdev_size_show(struct md_rdev *rdev, char *page)
3160{
3161        return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3162}
3163
3164static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
3165{
3166        /* check if two start/length pairs overlap */
3167        if (s1+l1 <= s2)
3168                return 0;
3169        if (s2+l2 <= s1)
3170                return 0;
3171        return 1;
3172}
3173
3174static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3175{
3176        unsigned long long blocks;
3177        sector_t new;
3178
3179        if (kstrtoull(buf, 10, &blocks) < 0)
3180                return -EINVAL;
3181
3182        if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3183                return -EINVAL; /* sector conversion overflow */
3184
3185        new = blocks * 2;
3186        if (new != blocks * 2)
3187                return -EINVAL; /* unsigned long long to sector_t overflow */
3188
3189        *sectors = new;
3190        return 0;
3191}
3192
3193static ssize_t
3194rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3195{
3196        struct mddev *my_mddev = rdev->mddev;
3197        sector_t oldsectors = rdev->sectors;
3198        sector_t sectors;
3199
3200        if (test_bit(Journal, &rdev->flags))
3201                return -EBUSY;
3202        if (strict_blocks_to_sectors(buf, &sectors) < 0)
3203                return -EINVAL;
3204        if (rdev->data_offset != rdev->new_data_offset)
3205                return -EINVAL; /* too confusing */
3206        if (my_mddev->pers && rdev->raid_disk >= 0) {
3207                if (my_mddev->persistent) {
3208                        sectors = super_types[my_mddev->major_version].
3209                                rdev_size_change(rdev, sectors);
3210                        if (!sectors)
3211                                return -EBUSY;
3212                } else if (!sectors)
3213                        sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3214                                rdev->data_offset;
3215                if (!my_mddev->pers->resize)
3216                        /* Cannot change size for RAID0 or Linear etc */
3217                        return -EINVAL;
3218        }
3219        if (sectors < my_mddev->dev_sectors)
3220                return -EINVAL; /* component must fit device */
3221
3222        rdev->sectors = sectors;
3223        if (sectors > oldsectors && my_mddev->external) {
3224                /* Need to check that all other rdevs with the same
3225                 * ->bdev do not overlap.  'rcu' is sufficient to walk
3226                 * the rdev lists safely.
3227                 * This check does not provide a hard guarantee, it
3228                 * just helps avoid dangerous mistakes.
3229                 */
3230                struct mddev *mddev;
3231                int overlap = 0;
3232                struct list_head *tmp;
3233
3234                rcu_read_lock();
3235                for_each_mddev(mddev, tmp) {
3236                        struct md_rdev *rdev2;
3237
3238                        rdev_for_each(rdev2, mddev)
3239                                if (rdev->bdev == rdev2->bdev &&
3240                                    rdev != rdev2 &&
3241                                    overlaps(rdev->data_offset, rdev->sectors,
3242                                             rdev2->data_offset,
3243                                             rdev2->sectors)) {
3244                                        overlap = 1;
3245                                        break;
3246                                }
3247                        if (overlap) {
3248                                mddev_put(mddev);
3249                                break;
3250                        }
3251                }
3252                rcu_read_unlock();
3253                if (overlap) {
3254                        /* Someone else could have slipped in a size
3255                         * change here, but doing so is just silly.
3256                         * We put oldsectors back because we *know* it is
3257                         * safe, and trust userspace not to race with
3258                         * itself
3259                         */
3260                        rdev->sectors = oldsectors;
3261                        return -EBUSY;
3262                }
3263        }
3264        return len;
3265}
3266
3267static struct rdev_sysfs_entry rdev_size =
3268__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3269
3270static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3271{
3272        unsigned long long recovery_start = rdev->recovery_offset;
3273
3274        if (test_bit(In_sync, &rdev->flags) ||
3275            recovery_start == MaxSector)
3276                return sprintf(page, "none\n");
3277
3278        return sprintf(page, "%llu\n", recovery_start);
3279}
3280
3281static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3282{
3283        unsigned long long recovery_start;
3284
3285        if (cmd_match(buf, "none"))
3286                recovery_start = MaxSector;
3287        else if (kstrtoull(buf, 10, &recovery_start))
3288                return -EINVAL;
3289
3290        if (rdev->mddev->pers &&
3291            rdev->raid_disk >= 0)
3292                return -EBUSY;
3293
3294        rdev->recovery_offset = recovery_start;
3295        if (recovery_start == MaxSector)
3296                set_bit(In_sync, &rdev->flags);
3297        else
3298                clear_bit(In_sync, &rdev->flags);
3299        return len;
3300}
3301
3302static struct rdev_sysfs_entry rdev_recovery_start =
3303__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3304
3305/* sysfs access to bad-blocks list.
3306 * We present two files.
3307 * 'bad-blocks' lists sector numbers and lengths of ranges that
3308 *    are recorded as bad.  The list is truncated to fit within
3309 *    the one-page limit of sysfs.
3310 *    Writing "sector length" to this file adds an acknowledged
3311 *    bad block list.
3312 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
3313 *    been acknowledged.  Writing to this file adds bad blocks
3314 *    without acknowledging them.  This is largely for testing.
3315 */
3316static ssize_t bb_show(struct md_rdev *rdev, char *page)
3317{
3318        return badblocks_show(&rdev->badblocks, page, 0);
3319}
3320static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3321{
3322        int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3323        /* Maybe that ack was all we needed */
3324        if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3325                wake_up(&rdev->blocked_wait);
3326        return rv;
3327}
3328static struct rdev_sysfs_entry rdev_bad_blocks =
3329__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3330
3331static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3332{
3333        return badblocks_show(&rdev->badblocks, page, 1);
3334}
3335static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3336{
3337        return badblocks_store(&rdev->badblocks, page, len, 1);
3338}
3339static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3340__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3341
3342static ssize_t
3343ppl_sector_show(struct md_rdev *rdev, char *page)
3344{
3345        return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3346}
3347
3348static ssize_t
3349ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3350{
3351        unsigned long long sector;
3352
3353        if (kstrtoull(buf, 10, &sector) < 0)
3354                return -EINVAL;
3355        if (sector != (sector_t)sector)
3356                return -EINVAL;
3357
3358        if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3359            rdev->raid_disk >= 0)
3360                return -EBUSY;
3361
3362        if (rdev->mddev->persistent) {
3363                if (rdev->mddev->major_version == 0)
3364                        return -EINVAL;
3365                if ((sector > rdev->sb_start &&
3366                     sector - rdev->sb_start > S16_MAX) ||
3367                    (sector < rdev->sb_start &&
3368                     rdev->sb_start - sector > -S16_MIN))
3369                        return -EINVAL;
3370                rdev->ppl.offset = sector - rdev->sb_start;
3371        } else if (!rdev->mddev->external) {
3372                return -EBUSY;
3373        }
3374        rdev->ppl.sector = sector;
3375        return len;
3376}
3377
3378static struct rdev_sysfs_entry rdev_ppl_sector =
3379__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3380
3381static ssize_t
3382ppl_size_show(struct md_rdev *rdev, char *page)
3383{
3384        return sprintf(page, "%u\n", rdev->ppl.size);
3385}
3386
3387static ssize_t
3388ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3389{
3390        unsigned int size;
3391
3392        if (kstrtouint(buf, 10, &size) < 0)
3393                return -EINVAL;
3394
3395        if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3396            rdev->raid_disk >= 0)
3397                return -EBUSY;
3398
3399        if (rdev->mddev->persistent) {
3400                if (rdev->mddev->major_version == 0)
3401                        return -EINVAL;
3402                if (size > U16_MAX)
3403                        return -EINVAL;
3404        } else if (!rdev->mddev->external) {
3405                return -EBUSY;
3406        }
3407        rdev->ppl.size = size;
3408        return len;
3409}
3410
3411static struct rdev_sysfs_entry rdev_ppl_size =
3412__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3413
3414static struct attribute *rdev_default_attrs[] = {
3415        &rdev_state.attr,
3416        &rdev_errors.attr,
3417        &rdev_slot.attr,
3418        &rdev_offset.attr,
3419        &rdev_new_offset.attr,
3420        &rdev_size.attr,
3421        &rdev_recovery_start.attr,
3422        &rdev_bad_blocks.attr,
3423        &rdev_unack_bad_blocks.attr,
3424        &rdev_ppl_sector.attr,
3425        &rdev_ppl_size.attr,
3426        NULL,
3427};
3428static ssize_t
3429rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3430{
3431        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3432        struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3433
3434        if (!entry->show)
3435                return -EIO;
3436        if (!rdev->mddev)
3437                return -ENODEV;
3438        return entry->show(rdev, page);
3439}
3440
3441static ssize_t
3442rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3443              const char *page, size_t length)
3444{
3445        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3446        struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3447        ssize_t rv;
3448        struct mddev *mddev = rdev->mddev;
3449
3450        if (!entry->store)
3451                return -EIO;
3452        if (!capable(CAP_SYS_ADMIN))
3453                return -EACCES;
3454        rv = mddev ? mddev_lock(mddev) : -ENODEV;
3455        if (!rv) {
3456                if (rdev->mddev == NULL)
3457                        rv = -ENODEV;
3458                else
3459                        rv = entry->store(rdev, page, length);
3460                mddev_unlock(mddev);
3461        }
3462        return rv;
3463}
3464
3465static void rdev_free(struct kobject *ko)
3466{
3467        struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3468        kfree(rdev);
3469}
3470static const struct sysfs_ops rdev_sysfs_ops = {
3471        .show           = rdev_attr_show,
3472        .store          = rdev_attr_store,
3473};
3474static struct kobj_type rdev_ktype = {
3475        .release        = rdev_free,
3476        .sysfs_ops      = &rdev_sysfs_ops,
3477        .default_attrs  = rdev_default_attrs,
3478};
3479
3480int md_rdev_init(struct md_rdev *rdev)
3481{
3482        rdev->desc_nr = -1;
3483        rdev->saved_raid_disk = -1;
3484        rdev->raid_disk = -1;
3485        rdev->flags = 0;
3486        rdev->data_offset = 0;
3487        rdev->new_data_offset = 0;
3488        rdev->sb_events = 0;
3489        rdev->last_read_error = 0;
3490        rdev->sb_loaded = 0;
3491        rdev->bb_page = NULL;
3492        atomic_set(&rdev->nr_pending, 0);
3493        atomic_set(&rdev->read_errors, 0);
3494        atomic_set(&rdev->corrected_errors, 0);
3495
3496        INIT_LIST_HEAD(&rdev->same_set);
3497        init_waitqueue_head(&rdev->blocked_wait);
3498
3499        /* Add space to store bad block list.
3500         * This reserves the space even on arrays where it cannot
3501         * be used - I wonder if that matters
3502         */
3503        return badblocks_init(&rdev->badblocks, 0);
3504}
3505EXPORT_SYMBOL_GPL(md_rdev_init);
3506/*
3507 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3508 *
3509 * mark the device faulty if:
3510 *
3511 *   - the device is nonexistent (zero size)
3512 *   - the device has no valid superblock
3513 *
3514 * a faulty rdev _never_ has rdev->sb set.
3515 */
3516static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3517{
3518        char b[BDEVNAME_SIZE];
3519        int err;
3520        struct md_rdev *rdev;
3521        sector_t size;
3522
3523        rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3524        if (!rdev)
3525                return ERR_PTR(-ENOMEM);
3526
3527        err = md_rdev_init(rdev);
3528        if (err)
3529                goto abort_free;
3530        err = alloc_disk_sb(rdev);
3531        if (err)
3532                goto abort_free;
3533
3534        err = lock_rdev(rdev, newdev, super_format == -2);
3535        if (err)
3536                goto abort_free;
3537
3538        kobject_init(&rdev->kobj, &rdev_ktype);
3539
3540        size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3541        if (!size) {
3542                pr_warn("md: %s has zero or unknown size, marking faulty!\n",
3543                        bdevname(rdev->bdev,b));
3544                err = -EINVAL;
3545                goto abort_free;
3546        }
3547
3548        if (super_format >= 0) {
3549                err = super_types[super_format].
3550                        load_super(rdev, NULL, super_minor);
3551                if (err == -EINVAL) {
3552                        pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
3553                                bdevname(rdev->bdev,b),
3554                                super_format, super_minor);
3555                        goto abort_free;
3556                }
3557                if (err < 0) {
3558                        pr_warn("md: could not read %s's sb, not importing!\n",
3559                                bdevname(rdev->bdev,b));
3560                        goto abort_free;
3561                }
3562        }
3563
3564        return rdev;
3565
3566abort_free:
3567        if (rdev->bdev)
3568                unlock_rdev(rdev);
3569        md_rdev_clear(rdev);
3570        kfree(rdev);
3571        return ERR_PTR(err);
3572}
3573
3574/*
3575 * Check a full RAID array for plausibility
3576 */
3577
3578static void analyze_sbs(struct mddev *mddev)
3579{
3580        int i;
3581        struct md_rdev *rdev, *freshest, *tmp;
3582        char b[BDEVNAME_SIZE];
3583
3584        freshest = NULL;
3585        rdev_for_each_safe(rdev, tmp, mddev)
3586                switch (super_types[mddev->major_version].
3587                        load_super(rdev, freshest, mddev->minor_version)) {
3588                case 1:
3589                        freshest = rdev;
3590                        break;
3591                case 0:
3592                        break;
3593                default:
3594                        pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
3595                                bdevname(rdev->bdev,b));
3596                        md_kick_rdev_from_array(rdev);
3597                }
3598
3599        super_types[mddev->major_version].
3600                validate_super(mddev, freshest);
3601
3602        i = 0;
3603        rdev_for_each_safe(rdev, tmp, mddev) {
3604                if (mddev->max_disks &&
3605                    (rdev->desc_nr >= mddev->max_disks ||
3606                     i > mddev->max_disks)) {
3607                        pr_warn("md: %s: %s: only %d devices permitted\n",
3608                                mdname(mddev), bdevname(rdev->bdev, b),
3609                                mddev->max_disks);
3610                        md_kick_rdev_from_array(rdev);
3611                        continue;
3612                }
3613                if (rdev != freshest) {
3614                        if (super_types[mddev->major_version].
3615                            validate_super(mddev, rdev)) {
3616                                pr_warn("md: kicking non-fresh %s from array!\n",
3617                                        bdevname(rdev->bdev,b));
3618                                md_kick_rdev_from_array(rdev);
3619                                continue;
3620                        }
3621                }
3622                if (mddev->level == LEVEL_MULTIPATH) {
3623                        rdev->desc_nr = i++;
3624                        rdev->raid_disk = rdev->desc_nr;
3625                        set_bit(In_sync, &rdev->flags);
3626                } else if (rdev->raid_disk >=
3627                            (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3628                           !test_bit(Journal, &rdev->flags)) {
3629                        rdev->raid_disk = -1;
3630                        clear_bit(In_sync, &rdev->flags);
3631                }
3632        }
3633}
3634
3635/* Read a fixed-point number.
3636 * Numbers in sysfs attributes should be in "standard" units where
3637 * possible, so time should be in seconds.
3638 * However we internally use a a much smaller unit such as
3639 * milliseconds or jiffies.
3640 * This function takes a decimal number with a possible fractional
3641 * component, and produces an integer which is the result of
3642 * multiplying that number by 10^'scale'.
3643 * all without any floating-point arithmetic.
3644 */
3645int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3646{
3647        unsigned long result = 0;
3648        long decimals = -1;
3649        while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3650                if (*cp == '.')
3651                        decimals = 0;
3652                else if (decimals < scale) {
3653                        unsigned int value;
3654                        value = *cp - '0';
3655                        result = result * 10 + value;
3656                        if (decimals >= 0)
3657                                decimals++;
3658                }
3659                cp++;
3660        }
3661        if (*cp == '\n')
3662                cp++;
3663        if (*cp)
3664                return -EINVAL;
3665        if (decimals < 0)
3666                decimals = 0;
3667        while (decimals < scale) {
3668                result *= 10;
3669                decimals ++;
3670        }
3671        *res = result;
3672        return 0;
3673}
3674
3675static ssize_t
3676safe_delay_show(struct mddev *mddev, char *page)
3677{
3678        int msec = (mddev->safemode_delay*1000)/HZ;
3679        return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3680}
3681static ssize_t
3682safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3683{
3684        unsigned long msec;
3685
3686        if (mddev_is_clustered(mddev)) {
3687                pr_warn("md: Safemode is disabled for clustered mode\n");
3688                return -EINVAL;
3689        }
3690
3691        if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3692                return -EINVAL;
3693        if (msec == 0)
3694                mddev->safemode_delay = 0;
3695        else {
3696                unsigned long old_delay = mddev->safemode_delay;
3697                unsigned long new_delay = (msec*HZ)/1000;
3698
3699                if (new_delay == 0)
3700                        new_delay = 1;
3701                mddev->safemode_delay = new_delay;
3702                if (new_delay < old_delay || old_delay == 0)
3703                        mod_timer(&mddev->safemode_timer, jiffies+1);
3704        }
3705        return len;
3706}
3707static struct md_sysfs_entry md_safe_delay =
3708__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3709
3710static ssize_t
3711level_show(struct mddev *mddev, char *page)
3712{
3713        struct md_personality *p;
3714        int ret;
3715        spin_lock(&mddev->lock);
3716        p = mddev->pers;
3717        if (p)
3718                ret = sprintf(page, "%s\n", p->name);
3719        else if (mddev->clevel[0])
3720                ret = sprintf(page, "%s\n", mddev->clevel);
3721        else if (mddev->level != LEVEL_NONE)
3722                ret = sprintf(page, "%d\n", mddev->level);
3723        else
3724                ret = 0;
3725        spin_unlock(&mddev->lock);
3726        return ret;
3727}
3728
3729static ssize_t
3730level_store(struct mddev *mddev, const char *buf, size_t len)
3731{
3732        char clevel[16];
3733        ssize_t rv;
3734        size_t slen = len;
3735        struct md_personality *pers, *oldpers;
3736        long level;
3737        void *priv, *oldpriv;
3738        struct md_rdev *rdev;
3739
3740        if (slen == 0 || slen >= sizeof(clevel))
3741                return -EINVAL;
3742
3743        rv = mddev_lock(mddev);
3744        if (rv)
3745                return rv;
3746
3747        if (mddev->pers == NULL) {
3748                strncpy(mddev->clevel, buf, slen);
3749                if (mddev->clevel[slen-1] == '\n')
3750                        slen--;
3751                mddev->clevel[slen] = 0;
3752                mddev->level = LEVEL_NONE;
3753                rv = len;
3754                goto out_unlock;
3755        }
3756        rv = -EROFS;
3757        if (mddev->ro)
3758                goto out_unlock;
3759
3760        /* request to change the personality.  Need to ensure:
3761         *  - array is not engaged in resync/recovery/reshape
3762         *  - old personality can be suspended
3763         *  - new personality will access other array.
3764         */
3765
3766        rv = -EBUSY;
3767        if (mddev->sync_thread ||
3768            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3769            mddev->reshape_position != MaxSector ||
3770            mddev->sysfs_active)
3771                goto out_unlock;
3772
3773        rv = -EINVAL;
3774        if (!mddev->pers->quiesce) {
3775                pr_warn("md: %s: %s does not support online personality change\n",
3776                        mdname(mddev), mddev->pers->name);
3777                goto out_unlock;
3778        }
3779
3780        /* Now find the new personality */
3781        strncpy(clevel, buf, slen);
3782        if (clevel[slen-1] == '\n')
3783                slen--;
3784        clevel[slen] = 0;
3785        if (kstrtol(clevel, 10, &level))
3786                level = LEVEL_NONE;
3787
3788        if (request_module("md-%s", clevel) != 0)
3789                request_module("md-level-%s", clevel);
3790        spin_lock(&pers_lock);
3791        pers = find_pers(level, clevel);
3792        if (!pers || !try_module_get(pers->owner)) {
3793                spin_unlock(&pers_lock);
3794                pr_warn("md: personality %s not loaded\n", clevel);
3795                rv = -EINVAL;
3796                goto out_unlock;
3797        }
3798        spin_unlock(&pers_lock);
3799
3800        if (pers == mddev->pers) {
3801                /* Nothing to do! */
3802                module_put(pers->owner);
3803                rv = len;
3804                goto out_unlock;
3805        }
3806        if (!pers->takeover) {
3807                module_put(pers->owner);
3808                pr_warn("md: %s: %s does not support personality takeover\n",
3809                        mdname(mddev), clevel);
3810                rv = -EINVAL;
3811                goto out_unlock;
3812        }
3813
3814        rdev_for_each(rdev, mddev)
3815                rdev->new_raid_disk = rdev->raid_disk;
3816
3817        /* ->takeover must set new_* and/or delta_disks
3818         * if it succeeds, and may set them when it fails.
3819         */
3820        priv = pers->takeover(mddev);
3821        if (IS_ERR(priv)) {
3822                mddev->new_level = mddev->level;
3823                mddev->new_layout = mddev->layout;
3824                mddev->new_chunk_sectors = mddev->chunk_sectors;
3825                mddev->raid_disks -= mddev->delta_disks;
3826                mddev->delta_disks = 0;
3827                mddev->reshape_backwards = 0;
3828                module_put(pers->owner);
3829                pr_warn("md: %s: %s would not accept array\n",
3830                        mdname(mddev), clevel);
3831                rv = PTR_ERR(priv);
3832                goto out_unlock;
3833        }
3834
3835        /* Looks like we have a winner */
3836        mddev_suspend(mddev);
3837        mddev_detach(mddev);
3838
3839        spin_lock(&mddev->lock);
3840        oldpers = mddev->pers;
3841        oldpriv = mddev->private;
3842        mddev->pers = pers;
3843        mddev->private = priv;
3844        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3845        mddev->level = mddev->new_level;
3846        mddev->layout = mddev->new_layout;
3847        mddev->chunk_sectors = mddev->new_chunk_sectors;
3848        mddev->delta_disks = 0;
3849        mddev->reshape_backwards = 0;
3850        mddev->degraded = 0;
3851        spin_unlock(&mddev->lock);
3852
3853        if (oldpers->sync_request == NULL &&
3854            mddev->external) {
3855                /* We are converting from a no-redundancy array
3856                 * to a redundancy array and metadata is managed
3857                 * externally so we need to be sure that writes
3858                 * won't block due to a need to transition
3859                 *      clean->dirty
3860                 * until external management is started.
3861                 */
3862                mddev->in_sync = 0;
3863                mddev->safemode_delay = 0;
3864                mddev->safemode = 0;
3865        }
3866
3867        oldpers->free(mddev, oldpriv);
3868
3869        if (oldpers->sync_request == NULL &&
3870            pers->sync_request != NULL) {
3871                /* need to add the md_redundancy_group */
3872                if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3873                        pr_warn("md: cannot register extra attributes for %s\n",
3874                                mdname(mddev));
3875                mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3876        }
3877        if (oldpers->sync_request != NULL &&
3878            pers->sync_request == NULL) {
3879                /* need to remove the md_redundancy_group */
3880                if (mddev->to_remove == NULL)
3881                        mddev->to_remove = &md_redundancy_group;
3882        }
3883
3884        module_put(oldpers->owner);
3885
3886        rdev_for_each(rdev, mddev) {
3887                if (rdev->raid_disk < 0)
3888                        continue;
3889                if (rdev->new_raid_disk >= mddev->raid_disks)
3890                        rdev->new_raid_disk = -1;
3891                if (rdev->new_raid_disk == rdev->raid_disk)
3892                        continue;
3893                sysfs_unlink_rdev(mddev, rdev);
3894        }
3895        rdev_for_each(rdev, mddev) {
3896                if (rdev->raid_disk < 0)
3897                        continue;
3898                if (rdev->new_raid_disk == rdev->raid_disk)
3899                        continue;
3900                rdev->raid_disk = rdev->new_raid_disk;
3901                if (rdev->raid_disk < 0)
3902                        clear_bit(In_sync, &rdev->flags);
3903                else {
3904                        if (sysfs_link_rdev(mddev, rdev))
3905                                pr_warn("md: cannot register rd%d for %s after level change\n",
3906                                        rdev->raid_disk, mdname(mddev));
3907                }
3908        }
3909
3910        if (pers->sync_request == NULL) {
3911                /* this is now an array without redundancy, so
3912                 * it must always be in_sync
3913                 */
3914                mddev->in_sync = 1;
3915                del_timer_sync(&mddev->safemode_timer);
3916        }
3917        blk_set_stacking_limits(&mddev->queue->limits);
3918        pers->run(mddev);
3919        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3920        mddev_resume(mddev);
3921        if (!mddev->thread)
3922                md_update_sb(mddev, 1);
3923        sysfs_notify(&mddev->kobj, NULL, "level");
3924        md_new_event(mddev);
3925        rv = len;
3926out_unlock:
3927        mddev_unlock(mddev);
3928        return rv;
3929}
3930
3931static struct md_sysfs_entry md_level =
3932__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3933
3934static ssize_t
3935layout_show(struct mddev *mddev, char *page)
3936{
3937        /* just a number, not meaningful for all levels */
3938        if (mddev->reshape_position != MaxSector &&
3939            mddev->layout != mddev->new_layout)
3940                return sprintf(page, "%d (%d)\n",
3941                               mddev->new_layout, mddev->layout);
3942        return sprintf(page, "%d\n", mddev->layout);
3943}
3944
3945static ssize_t
3946layout_store(struct mddev *mddev, const char *buf, size_t len)
3947{
3948        unsigned int n;
3949        int err;
3950
3951        err = kstrtouint(buf, 10, &n);
3952        if (err < 0)
3953                return err;
3954        err = mddev_lock(mddev);
3955        if (err)
3956                return err;
3957
3958        if (mddev->pers) {
3959                if (mddev->pers->check_reshape == NULL)
3960                        err = -EBUSY;
3961                else if (mddev->ro)
3962                        err = -EROFS;
3963                else {
3964                        mddev->new_layout = n;
3965                        err = mddev->pers->check_reshape(mddev);
3966                        if (err)
3967                                mddev->new_layout = mddev->layout;
3968                }
3969        } else {
3970                mddev->new_layout = n;
3971                if (mddev->reshape_position == MaxSector)
3972                        mddev->layout = n;
3973        }
3974        mddev_unlock(mddev);
3975        return err ?: len;
3976}
3977static struct md_sysfs_entry md_layout =
3978__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3979
3980static ssize_t
3981raid_disks_show(struct mddev *mddev, char *page)
3982{
3983        if (mddev->raid_disks == 0)
3984                return 0;
3985        if (mddev->reshape_position != MaxSector &&
3986            mddev->delta_disks != 0)
3987                return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3988                               mddev->raid_disks - mddev->delta_disks);
3989        return sprintf(page, "%d\n", mddev->raid_disks);
3990}
3991
3992static int update_raid_disks(struct mddev *mddev, int raid_disks);
3993
3994static ssize_t
3995raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3996{
3997        unsigned int n;
3998        int err;
3999
4000        err = kstrtouint(buf, 10, &n);
4001        if (err < 0)
4002                return err;
4003
4004        err = mddev_lock(mddev);
4005        if (err)
4006                return err;
4007        if (mddev->pers)
4008                err = update_raid_disks(mddev, n);
4009        else if (mddev->reshape_position != MaxSector) {
4010                struct md_rdev *rdev;
4011                int olddisks = mddev->raid_disks - mddev->delta_disks;
4012
4013                err = -EINVAL;
4014                rdev_for_each(rdev, mddev) {
4015                        if (olddisks < n &&
4016                            rdev->data_offset < rdev->new_data_offset)
4017                                goto out_unlock;
4018                        if (olddisks > n &&
4019                            rdev->data_offset > rdev->new_data_offset)
4020                                goto out_unlock;
4021                }
4022                err = 0;
4023                mddev->delta_disks = n - olddisks;
4024                mddev->raid_disks = n;
4025                mddev->reshape_backwards = (mddev->delta_disks < 0);
4026        } else
4027                mddev->raid_disks = n;
4028out_unlock:
4029        mddev_unlock(mddev);
4030        return err ? err : len;
4031}
4032static struct md_sysfs_entry md_raid_disks =
4033__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4034
4035static ssize_t
4036chunk_size_show(struct mddev *mddev, char *page)
4037{
4038        if (mddev->reshape_position != MaxSector &&
4039            mddev->chunk_sectors != mddev->new_chunk_sectors)
4040                return sprintf(page, "%d (%d)\n",
4041                               mddev->new_chunk_sectors << 9,
4042                               mddev->chunk_sectors << 9);
4043        return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4044}
4045
4046static ssize_t
4047chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4048{
4049        unsigned long n;
4050        int err;
4051
4052        err = kstrtoul(buf, 10, &n);
4053        if (err < 0)
4054                return err;
4055
4056        err = mddev_lock(mddev);
4057        if (err)
4058                return err;
4059        if (mddev->pers) {
4060                if (mddev->pers->check_reshape == NULL)
4061                        err = -EBUSY;
4062                else if (mddev->ro)
4063                        err = -EROFS;
4064                else {
4065                        mddev->new_chunk_sectors = n >> 9;
4066                        err = mddev->pers->check_reshape(mddev);
4067                        if (err)
4068                                mddev->new_chunk_sectors = mddev->chunk_sectors;
4069                }
4070        } else {
4071                mddev->new_chunk_sectors = n >> 9;
4072                if (mddev->reshape_position == MaxSector)
4073                        mddev->chunk_sectors = n >> 9;
4074        }
4075        mddev_unlock(mddev);
4076        return err ?: len;
4077}
4078static struct md_sysfs_entry md_chunk_size =
4079__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4080
4081static ssize_t
4082resync_start_show(struct mddev *mddev, char *page)
4083{
4084        if (mddev->recovery_cp == MaxSector)
4085                return sprintf(page, "none\n");
4086        return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4087}
4088
4089static ssize_t
4090resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4091{
4092        unsigned long long n;
4093        int err;
4094
4095        if (cmd_match(buf, "none"))
4096                n = MaxSector;
4097        else {
4098                err = kstrtoull(buf, 10, &n);
4099                if (err < 0)
4100                        return err;
4101                if (n != (sector_t)n)
4102                        return -EINVAL;
4103        }
4104
4105        err = mddev_lock(mddev);
4106        if (err)
4107                return err;
4108        if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4109                err = -EBUSY;
4110
4111        if (!err) {
4112                mddev->recovery_cp = n;
4113                if (mddev->pers)
4114                        set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4115        }
4116        mddev_unlock(mddev);
4117        return err ?: len;
4118}
4119static struct md_sysfs_entry md_resync_start =
4120__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4121                resync_start_show, resync_start_store);
4122
4123/*
4124 * The array state can be:
4125 *
4126 * clear
4127 *     No devices, no size, no level
4128 *     Equivalent to STOP_ARRAY ioctl
4129 * inactive
4130 *     May have some settings, but array is not active
4131 *        all IO results in error
4132 *     When written, doesn't tear down array, but just stops it
4133 * suspended (not supported yet)
4134 *     All IO requests will block. The array can be reconfigured.
4135 *     Writing this, if accepted, will block until array is quiescent
4136 * readonly
4137 *     no resync can happen.  no superblocks get written.
4138 *     write requests fail
4139 * read-auto
4140 *     like readonly, but behaves like 'clean' on a write request.
4141 *
4142 * clean - no pending writes, but otherwise active.
4143 *     When written to inactive array, starts without resync
4144 *     If a write request arrives then
4145 *       if metadata is known, mark 'dirty' and switch to 'active'.
4146 *       if not known, block and switch to write-pending
4147 *     If written to an active array that has pending writes, then fails.
4148 * active
4149 *     fully active: IO and resync can be happening.
4150 *     When written to inactive array, starts with resync
4151 *
4152 * write-pending
4153 *     clean, but writes are blocked waiting for 'active' to be written.
4154 *
4155 * active-idle
4156 *     like active, but no writes have been seen for a while (100msec).
4157 *
4158 */
4159enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4160                   write_pending, active_idle, bad_word};
4161static char *array_states[] = {
4162        "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4163        "write-pending", "active-idle", NULL };
4164
4165static int match_word(const char *word, char **list)
4166{
4167        int n;
4168        for (n=0; list[n]; n++)
4169                if (cmd_match(word, list[n]))
4170                        break;
4171        return n;
4172}
4173
4174static ssize_t
4175array_state_show(struct mddev *mddev, char *page)
4176{
4177        enum array_state st = inactive;
4178
4179        if (mddev->pers)
4180                switch(mddev->ro) {
4181                case 1:
4182                        st = readonly;
4183                        break;
4184                case 2:
4185                        st = read_auto;
4186                        break;
4187                case 0:
4188                        spin_lock(&mddev->lock);
4189                        if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4190                                st = write_pending;
4191                        else if (mddev->in_sync)
4192                                st = clean;
4193                        else if (mddev->safemode)
4194                                st = active_idle;
4195                        else
4196                                st = active;
4197                        spin_unlock(&mddev->lock);
4198                }
4199        else {
4200                if (list_empty(&mddev->disks) &&
4201                    mddev->raid_disks == 0 &&
4202                    mddev->dev_sectors == 0)
4203                        st = clear;
4204                else
4205                        st = inactive;
4206        }
4207        return sprintf(page, "%s\n", array_states[st]);
4208}
4209
4210static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4211static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4212static int do_md_run(struct mddev *mddev);
4213static int restart_array(struct mddev *mddev);
4214
4215static ssize_t
4216array_state_store(struct mddev *mddev, const char *buf, size_t len)
4217{
4218        int err = 0;
4219        enum array_state st = match_word(buf, array_states);
4220
4221        if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
4222                /* don't take reconfig_mutex when toggling between
4223                 * clean and active
4224                 */
4225                spin_lock(&mddev->lock);
4226                if (st == active) {
4227                        restart_array(mddev);
4228                        clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4229                        md_wakeup_thread(mddev->thread);
4230                        wake_up(&mddev->sb_wait);
4231                } else /* st == clean */ {
4232                        restart_array(mddev);
4233                        if (!set_in_sync(mddev))
4234                                err = -EBUSY;
4235                }
4236                if (!err)
4237                        sysfs_notify_dirent_safe(mddev->sysfs_state);
4238                spin_unlock(&mddev->lock);
4239                return err ?: len;
4240        }
4241        err = mddev_lock(mddev);
4242        if (err)
4243                return err;
4244        err = -EINVAL;
4245        switch(st) {
4246        case bad_word:
4247                break;
4248        case clear:
4249                /* stopping an active array */
4250                err = do_md_stop(mddev, 0, NULL);
4251                break;
4252        case inactive:
4253                /* stopping an active array */
4254                if (mddev->pers)
4255                        err = do_md_stop(mddev, 2, NULL);
4256                else
4257                        err = 0; /* already inactive */
4258                break;
4259        case suspended:
4260                break; /* not supported yet */
4261        case readonly:
4262                if (mddev->pers)
4263                        err = md_set_readonly(mddev, NULL);
4264                else {
4265                        mddev->ro = 1;
4266                        set_disk_ro(mddev->gendisk, 1);
4267                        err = do_md_run(mddev);
4268                }
4269                break;
4270        case read_auto:
4271                if (mddev->pers) {
4272                        if (mddev->ro == 0)
4273                                err = md_set_readonly(mddev, NULL);
4274                        else if (mddev->ro == 1)
4275                                err = restart_array(mddev);
4276                        if (err == 0) {
4277                                mddev->ro = 2;
4278                                set_disk_ro(mddev->gendisk, 0);
4279                        }
4280                } else {
4281                        mddev->ro = 2;
4282                        err = do_md_run(mddev);
4283                }
4284                break;
4285        case clean:
4286                if (mddev->pers) {
4287                        err = restart_array(mddev);
4288                        if (err)
4289                                break;
4290                        spin_lock(&mddev->lock);
4291                        if (!set_in_sync(mddev))
4292                                err = -EBUSY;
4293                        spin_unlock(&mddev->lock);
4294                } else
4295                        err = -EINVAL;
4296                break;
4297        case active:
4298                if (mddev->pers) {
4299                        err = restart_array(mddev);
4300                        if (err)
4301                                break;
4302                        clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4303                        wake_up(&mddev->sb_wait);
4304                        err = 0;
4305                } else {
4306                        mddev->ro = 0;
4307                        set_disk_ro(mddev->gendisk, 0);
4308                        err = do_md_run(mddev);
4309                }
4310                break;
4311        case write_pending:
4312        case active_idle:
4313                /* these cannot be set */
4314                break;
4315        }
4316
4317        if (!err) {
4318                if (mddev->hold_active == UNTIL_IOCTL)
4319                        mddev->hold_active = 0;
4320                sysfs_notify_dirent_safe(mddev->sysfs_state);
4321        }
4322        mddev_unlock(mddev);
4323        return err ?: len;
4324}
4325static struct md_sysfs_entry md_array_state =
4326__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4327
4328static ssize_t
4329max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4330        return sprintf(page, "%d\n",
4331                       atomic_read(&mddev->max_corr_read_errors));
4332}
4333
4334static ssize_t
4335max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4336{
4337        unsigned int n;
4338        int rv;
4339
4340        rv = kstrtouint(buf, 10, &n);
4341        if (rv < 0)
4342                return rv;
4343        atomic_set(&mddev->max_corr_read_errors, n);
4344        return len;
4345}
4346
4347static struct md_sysfs_entry max_corr_read_errors =
4348__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4349        max_corrected_read_errors_store);
4350
4351static ssize_t
4352null_show(struct mddev *mddev, char *page)
4353{
4354        return -EINVAL;
4355}
4356
4357static ssize_t
4358new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4359{
4360        /* buf must be %d:%d\n? giving major and minor numbers */
4361        /* The new device is added to the array.
4362         * If the array has a persistent superblock, we read the
4363         * superblock to initialise info and check validity.
4364         * Otherwise, only checking done is that in bind_rdev_to_array,
4365         * which mainly checks size.
4366         */
4367        char *e;
4368        int major = simple_strtoul(buf, &e, 10);
4369        int minor;
4370        dev_t dev;
4371        struct md_rdev *rdev;
4372        int err;
4373
4374        if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4375                return -EINVAL;
4376        minor = simple_strtoul(e+1, &e, 10);
4377        if (*e && *e != '\n')
4378                return -EINVAL;
4379        dev = MKDEV(major, minor);
4380        if (major != MAJOR(dev) ||
4381            minor != MINOR(dev))
4382                return -EOVERFLOW;
4383
4384        flush_workqueue(md_misc_wq);
4385
4386        err = mddev_lock(mddev);
4387        if (err)
4388                return err;
4389        if (mddev->persistent) {
4390                rdev = md_import_device(dev, mddev->major_version,
4391                                        mddev->minor_version);
4392                if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4393                        struct md_rdev *rdev0
4394                                = list_entry(mddev->disks.next,
4395                                             struct md_rdev, same_set);
4396                        err = super_types[mddev->major_version]
4397                                .load_super(rdev, rdev0, mddev->minor_version);
4398                        if (err < 0)
4399                                goto out;
4400                }
4401        } else if (mddev->external)
4402                rdev = md_import_device(dev, -2, -1);
4403        else
4404                rdev = md_import_device(dev, -1, -1);
4405
4406        if (IS_ERR(rdev)) {
4407                mddev_unlock(mddev);
4408                return PTR_ERR(rdev);
4409        }
4410        err = bind_rdev_to_array(rdev, mddev);
4411 out:
4412        if (err)
4413                export_rdev(rdev);
4414        mddev_unlock(mddev);
4415        if (!err)
4416                md_new_event(mddev);
4417        return err ? err : len;
4418}
4419
4420static struct md_sysfs_entry md_new_device =
4421__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4422
4423static ssize_t
4424bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4425{
4426        char *end;
4427        unsigned long chunk, end_chunk;
4428        int err;
4429
4430        err = mddev_lock(mddev);
4431        if (err)
4432                return err;
4433        if (!mddev->bitmap)
4434                goto out;
4435        /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4436        while (*buf) {
4437                chunk = end_chunk = simple_strtoul(buf, &end, 0);
4438                if (buf == end) break;
4439                if (*end == '-') { /* range */
4440                        buf = end + 1;
4441                        end_chunk = simple_strtoul(buf, &end, 0);
4442                        if (buf == end) break;
4443                }
4444                if (*end && !isspace(*end)) break;
4445                md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4446                buf = skip_spaces(end);
4447        }
4448        md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4449out:
4450        mddev_unlock(mddev);
4451        return len;
4452}
4453
4454static struct md_sysfs_entry md_bitmap =
4455__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4456
4457static ssize_t
4458size_show(struct mddev *mddev, char *page)
4459{
4460        return sprintf(page, "%llu\n",
4461                (unsigned long long)mddev->dev_sectors / 2);
4462}
4463
4464static int update_size(struct mddev *mddev, sector_t num_sectors);
4465
4466static ssize_t
4467size_store(struct mddev *mddev, const char *buf, size_t len)
4468{
4469        /* If array is inactive, we can reduce the component size, but
4470         * not increase it (except from 0).
4471         * If array is active, we can try an on-line resize
4472         */
4473        sector_t sectors;
4474        int err = strict_blocks_to_sectors(buf, &sectors);
4475
4476        if (err < 0)
4477                return err;
4478        err = mddev_lock(mddev);
4479        if (err)
4480                return err;
4481        if (mddev->pers) {
4482                err = update_size(mddev, sectors);
4483                if (err == 0)
4484                        md_update_sb(mddev, 1);
4485        } else {
4486                if (mddev->dev_sectors == 0 ||
4487                    mddev->dev_sectors > sectors)
4488                        mddev->dev_sectors = sectors;
4489                else
4490                        err = -ENOSPC;
4491        }
4492        mddev_unlock(mddev);
4493        return err ? err : len;
4494}
4495
4496static struct md_sysfs_entry md_size =
4497__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4498
4499/* Metadata version.
4500 * This is one of
4501 *   'none' for arrays with no metadata (good luck...)
4502 *   'external' for arrays with externally managed metadata,
4503 * or N.M for internally known formats
4504 */
4505static ssize_t
4506metadata_show(struct mddev *mddev, char *page)
4507{
4508        if (mddev->persistent)
4509                return sprintf(page, "%d.%d\n",
4510                               mddev->major_version, mddev->minor_version);
4511        else if (mddev->external)
4512                return sprintf(page, "external:%s\n", mddev->metadata_type);
4513        else
4514                return sprintf(page, "none\n");
4515}
4516
4517static ssize_t
4518metadata_store(struct mddev *mddev, const char *buf, size_t len)
4519{
4520        int major, minor;
4521        char *e;
4522        int err;
4523        /* Changing the details of 'external' metadata is
4524         * always permitted.  Otherwise there must be
4525         * no devices attached to the array.
4526         */
4527
4528        err = mddev_lock(mddev);
4529        if (err)
4530                return err;
4531        err = -EBUSY;
4532        if (mddev->external && strncmp(buf, "external:", 9) == 0)
4533                ;
4534        else if (!list_empty(&mddev->disks))
4535                goto out_unlock;
4536
4537        err = 0;
4538        if (cmd_match(buf, "none")) {
4539                mddev->persistent = 0;
4540                mddev->external = 0;
4541                mddev->major_version = 0;
4542                mddev->minor_version = 90;
4543                goto out_unlock;
4544        }
4545        if (strncmp(buf, "external:", 9) == 0) {
4546                size_t namelen = len-9;
4547                if (namelen >= sizeof(mddev->metadata_type))
4548                        namelen = sizeof(mddev->metadata_type)-1;
4549                strncpy(mddev->metadata_type, buf+9, namelen);
4550                mddev->metadata_type[namelen] = 0;
4551                if (namelen && mddev->metadata_type[namelen-1] == '\n')
4552                        mddev->metadata_type[--namelen] = 0;
4553                mddev->persistent = 0;
4554                mddev->external = 1;
4555                mddev->major_version = 0;
4556                mddev->minor_version = 90;
4557                goto out_unlock;
4558        }
4559        major = simple_strtoul(buf, &e, 10);
4560        err = -EINVAL;
4561        if (e==buf || *e != '.')
4562                goto out_unlock;
4563        buf = e+1;
4564        minor = simple_strtoul(buf, &e, 10);
4565        if (e==buf || (*e && *e != '\n') )
4566                goto out_unlock;
4567        err = -ENOENT;
4568        if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4569                goto out_unlock;
4570        mddev->major_version = major;
4571        mddev->minor_version = minor;
4572        mddev->persistent = 1;
4573        mddev->external = 0;
4574        err = 0;
4575out_unlock:
4576        mddev_unlock(mddev);
4577        return err ?: len;
4578}
4579
4580static struct md_sysfs_entry md_metadata =
4581__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4582
4583static ssize_t
4584action_show(struct mddev *mddev, char *page)
4585{
4586        char *type = "idle";
4587        unsigned long recovery = mddev->recovery;
4588        if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4589                type = "frozen";
4590        else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4591            (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4592                if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4593                        type = "reshape";
4594                else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4595                        if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4596                                type = "resync";
4597                        else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4598                                type = "check";
4599                        else
4600                                type = "repair";
4601                } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4602                        type = "recover";
4603                else if (mddev->reshape_position != MaxSector)
4604                        type = "reshape";
4605        }
4606        return sprintf(page, "%s\n", type);
4607}
4608
4609static ssize_t
4610action_store(struct mddev *mddev, const char *page, size_t len)
4611{
4612        if (!mddev->pers || !mddev->pers->sync_request)
4613                return -EINVAL;
4614
4615
4616        if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4617                if (cmd_match(page, "frozen"))
4618                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4619                else
4620                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4621                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4622                    mddev_lock(mddev) == 0) {
4623                        flush_workqueue(md_misc_wq);
4624                        if (mddev->sync_thread) {
4625                                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4626                                md_reap_sync_thread(mddev);
4627                        }
4628                        mddev_unlock(mddev);
4629                }
4630        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4631                return -EBUSY;
4632        else if (cmd_match(page, "resync"))
4633                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4634        else if (cmd_match(page, "recover")) {
4635                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4636                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4637        } else if (cmd_match(page, "reshape")) {
4638                int err;
4639                if (mddev->pers->start_reshape == NULL)
4640                        return -EINVAL;
4641                err = mddev_lock(mddev);
4642                if (!err) {
4643                        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4644                                err =  -EBUSY;
4645                        else {
4646                                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4647                                err = mddev->pers->start_reshape(mddev);
4648                        }
4649                        mddev_unlock(mddev);
4650                }
4651                if (err)
4652                        return err;
4653                sysfs_notify(&mddev->kobj, NULL, "degraded");
4654        } else {
4655                if (cmd_match(page, "check"))
4656                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4657                else if (!cmd_match(page, "repair"))
4658                        return -EINVAL;
4659                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4660                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4661                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4662        }
4663        if (mddev->ro == 2) {
4664                /* A write to sync_action is enough to justify
4665                 * canceling read-auto mode
4666                 */
4667                mddev->ro = 0;
4668                md_wakeup_thread(mddev->sync_thread);
4669        }
4670        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4671        md_wakeup_thread(mddev->thread);
4672        sysfs_notify_dirent_safe(mddev->sysfs_action);
4673        return len;
4674}
4675
4676static struct md_sysfs_entry md_scan_mode =
4677__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4678
4679static ssize_t
4680last_sync_action_show(struct mddev *mddev, char *page)
4681{
4682        return sprintf(page, "%s\n", mddev->last_sync_action);
4683}
4684
4685static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4686
4687static ssize_t
4688mismatch_cnt_show(struct mddev *mddev, char *page)
4689{
4690        return sprintf(page, "%llu\n",
4691                       (unsigned long long)
4692                       atomic64_read(&mddev->resync_mismatches));
4693}
4694
4695static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4696
4697static ssize_t
4698sync_min_show(struct mddev *mddev, char *page)
4699{
4700        return sprintf(page, "%d (%s)\n", speed_min(mddev),
4701                       mddev->sync_speed_min ? "local": "system");
4702}
4703
4704static ssize_t
4705sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4706{
4707        unsigned int min;
4708        int rv;
4709
4710        if (strncmp(buf, "system", 6)==0) {
4711                min = 0;
4712        } else {
4713                rv = kstrtouint(buf, 10, &min);
4714                if (rv < 0)
4715                        return rv;
4716                if (min == 0)
4717                        return -EINVAL;
4718        }
4719        mddev->sync_speed_min = min;
4720        return len;
4721}
4722
4723static struct md_sysfs_entry md_sync_min =
4724__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4725
4726static ssize_t
4727sync_max_show(struct mddev *mddev, char *page)
4728{
4729        return sprintf(page, "%d (%s)\n", speed_max(mddev),
4730                       mddev->sync_speed_max ? "local": "system");
4731}
4732
4733static ssize_t
4734sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4735{
4736        unsigned int max;
4737        int rv;
4738
4739        if (strncmp(buf, "system", 6)==0) {
4740                max = 0;
4741        } else {
4742                rv = kstrtouint(buf, 10, &max);
4743                if (rv < 0)
4744                        return rv;
4745                if (max == 0)
4746                        return -EINVAL;
4747        }
4748        mddev->sync_speed_max = max;
4749        return len;
4750}
4751
4752static struct md_sysfs_entry md_sync_max =
4753__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4754
4755static ssize_t
4756degraded_show(struct mddev *mddev, char *page)
4757{
4758        return sprintf(page, "%d\n", mddev->degraded);
4759}
4760static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4761
4762static ssize_t
4763sync_force_parallel_show(struct mddev *mddev, char *page)
4764{
4765        return sprintf(page, "%d\n", mddev->parallel_resync);
4766}
4767
4768static ssize_t
4769sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4770{
4771        long n;
4772
4773        if (kstrtol(buf, 10, &n))
4774                return -EINVAL;
4775
4776        if (n != 0 && n != 1)
4777                return -EINVAL;
4778
4779        mddev->parallel_resync = n;
4780
4781        if (mddev->sync_thread)
4782                wake_up(&resync_wait);
4783
4784        return len;
4785}
4786
4787/* force parallel resync, even with shared block devices */
4788static struct md_sysfs_entry md_sync_force_parallel =
4789__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4790       sync_force_parallel_show, sync_force_parallel_store);
4791
4792static ssize_t
4793sync_speed_show(struct mddev *mddev, char *page)
4794{
4795        unsigned long resync, dt, db;
4796        if (mddev->curr_resync == 0)
4797                return sprintf(page, "none\n");
4798        resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4799        dt = (jiffies - mddev->resync_mark) / HZ;
4800        if (!dt) dt++;
4801        db = resync - mddev->resync_mark_cnt;
4802        return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4803}
4804
4805static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4806
4807static ssize_t
4808sync_completed_show(struct mddev *mddev, char *page)
4809{
4810        unsigned long long max_sectors, resync;
4811
4812        if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4813                return sprintf(page, "none\n");
4814
4815        if (mddev->curr_resync == 1 ||
4816            mddev->curr_resync == 2)
4817                return sprintf(page, "delayed\n");
4818
4819        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4820            test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4821                max_sectors = mddev->resync_max_sectors;
4822        else
4823                max_sectors = mddev->dev_sectors;
4824
4825        resync = mddev->curr_resync_completed;
4826        return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4827}
4828
4829static struct md_sysfs_entry md_sync_completed =
4830        __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
4831
4832static ssize_t
4833min_sync_show(struct mddev *mddev, char *page)
4834{
4835        return sprintf(page, "%llu\n",
4836                       (unsigned long long)mddev->resync_min);
4837}
4838static ssize_t
4839min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4840{
4841        unsigned long long min;
4842        int err;
4843
4844        if (kstrtoull(buf, 10, &min))
4845                return -EINVAL;
4846
4847        spin_lock(&mddev->lock);
4848        err = -EINVAL;
4849        if (min > mddev->resync_max)
4850                goto out_unlock;
4851
4852        err = -EBUSY;
4853        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4854                goto out_unlock;
4855
4856        /* Round down to multiple of 4K for safety */
4857        mddev->resync_min = round_down(min, 8);
4858        err = 0;
4859
4860out_unlock:
4861        spin_unlock(&mddev->lock);
4862        return err ?: len;
4863}
4864
4865static struct md_sysfs_entry md_min_sync =
4866__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4867
4868static ssize_t
4869max_sync_show(struct mddev *mddev, char *page)
4870{
4871        if (mddev->resync_max == MaxSector)
4872                return sprintf(page, "max\n");
4873        else
4874                return sprintf(page, "%llu\n",
4875                               (unsigned long long)mddev->resync_max);
4876}
4877static ssize_t
4878max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4879{
4880        int err;
4881        spin_lock(&mddev->lock);
4882        if (strncmp(buf, "max", 3) == 0)
4883                mddev->resync_max = MaxSector;
4884        else {
4885                unsigned long long max;
4886                int chunk;
4887
4888                err = -EINVAL;
4889                if (kstrtoull(buf, 10, &max))
4890                        goto out_unlock;
4891                if (max < mddev->resync_min)
4892                        goto out_unlock;
4893
4894                err = -EBUSY;
4895                if (max < mddev->resync_max &&
4896                    mddev->ro == 0 &&
4897                    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4898                        goto out_unlock;
4899
4900                /* Must be a multiple of chunk_size */
4901                chunk = mddev->chunk_sectors;
4902                if (chunk) {
4903                        sector_t temp = max;
4904
4905                        err = -EINVAL;
4906                        if (sector_div(temp, chunk))
4907                                goto out_unlock;
4908                }
4909                mddev->resync_max = max;
4910        }
4911        wake_up(&mddev->recovery_wait);
4912        err = 0;
4913out_unlock:
4914        spin_unlock(&mddev->lock);
4915        return err ?: len;
4916}
4917
4918static struct md_sysfs_entry md_max_sync =
4919__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4920
4921static ssize_t
4922suspend_lo_show(struct mddev *mddev, char *page)
4923{
4924        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4925}
4926
4927static ssize_t
4928suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4929{
4930        unsigned long long new;
4931        int err;
4932
4933        err = kstrtoull(buf, 10, &new);
4934        if (err < 0)
4935                return err;
4936        if (new != (sector_t)new)
4937                return -EINVAL;
4938
4939        err = mddev_lock(mddev);
4940        if (err)
4941                return err;
4942        err = -EINVAL;
4943        if (mddev->pers == NULL ||
4944            mddev->pers->quiesce == NULL)
4945                goto unlock;
4946        mddev_suspend(mddev);
4947        mddev->suspend_lo = new;
4948        mddev_resume(mddev);
4949
4950        err = 0;
4951unlock:
4952        mddev_unlock(mddev);
4953        return err ?: len;
4954}
4955static struct md_sysfs_entry md_suspend_lo =
4956__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4957
4958static ssize_t
4959suspend_hi_show(struct mddev *mddev, char *page)
4960{
4961        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4962}
4963
4964static ssize_t
4965suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4966{
4967        unsigned long long new;
4968        int err;
4969
4970        err = kstrtoull(buf, 10, &new);
4971        if (err < 0)
4972                return err;
4973        if (new != (sector_t)new)
4974                return -EINVAL;
4975
4976        err = mddev_lock(mddev);
4977        if (err)
4978                return err;
4979        err = -EINVAL;
4980        if (mddev->pers == NULL)
4981                goto unlock;
4982
4983        mddev_suspend(mddev);
4984        mddev->suspend_hi = new;
4985        mddev_resume(mddev);
4986
4987        err = 0;
4988unlock:
4989        mddev_unlock(mddev);
4990        return err ?: len;
4991}
4992static struct md_sysfs_entry md_suspend_hi =
4993__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4994
4995static ssize_t
4996reshape_position_show(struct mddev *mddev, char *page)
4997{
4998        if (mddev->reshape_position != MaxSector)
4999                return sprintf(page, "%llu\n",
5000                               (unsigned long long)mddev->reshape_position);
5001        strcpy(page, "none\n");
5002        return 5;
5003}
5004
5005static ssize_t
5006reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5007{
5008        struct md_rdev *rdev;
5009        unsigned long long new;
5010        int err;
5011
5012        err = kstrtoull(buf, 10, &new);
5013        if (err < 0)
5014                return err;
5015        if (new != (sector_t)new)
5016                return -EINVAL;
5017        err = mddev_lock(mddev);
5018        if (err)
5019                return err;
5020        err = -EBUSY;
5021        if (mddev->pers)
5022                goto unlock;
5023        mddev->reshape_position = new;
5024        mddev->delta_disks = 0;
5025        mddev->reshape_backwards = 0;
5026        mddev->new_level = mddev->level;
5027        mddev->new_layout = mddev->layout;
5028        mddev->new_chunk_sectors = mddev->chunk_sectors;
5029        rdev_for_each(rdev, mddev)
5030                rdev->new_data_offset = rdev->data_offset;
5031        err = 0;
5032unlock:
5033        mddev_unlock(mddev);
5034        return err ?: len;
5035}
5036
5037static struct md_sysfs_entry md_reshape_position =
5038__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5039       reshape_position_store);
5040
5041static ssize_t
5042reshape_direction_show(struct mddev *mddev, char *page)
5043{
5044        return sprintf(page, "%s\n",
5045                       mddev->reshape_backwards ? "backwards" : "forwards");
5046}
5047
5048static ssize_t
5049reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5050{
5051        int backwards = 0;
5052        int err;
5053
5054        if (cmd_match(buf, "forwards"))
5055                backwards = 0;
5056        else if (cmd_match(buf, "backwards"))
5057                backwards = 1;
5058        else
5059                return -EINVAL;
5060        if (mddev->reshape_backwards == backwards)
5061                return len;
5062
5063        err = mddev_lock(mddev);
5064        if (err)
5065                return err;
5066        /* check if we are allowed to change */
5067        if (mddev->delta_disks)
5068                err = -EBUSY;
5069        else if (mddev->persistent &&
5070            mddev->major_version == 0)
5071                err =  -EINVAL;
5072        else
5073                mddev->reshape_backwards = backwards;
5074        mddev_unlock(mddev);
5075        return err ?: len;
5076}
5077
5078static struct md_sysfs_entry md_reshape_direction =
5079__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5080       reshape_direction_store);
5081
5082static ssize_t
5083array_size_show(struct mddev *mddev, char *page)
5084{
5085        if (mddev->external_size)
5086                return sprintf(page, "%llu\n",
5087                               (unsigned long long)mddev->array_sectors/2);
5088        else
5089                return sprintf(page, "default\n");
5090}
5091
5092static ssize_t
5093array_size_store(struct mddev *mddev, const char *buf, size_t len)
5094{
5095        sector_t sectors;
5096        int err;
5097
5098        err = mddev_lock(mddev);
5099        if (err)
5100                return err;
5101
5102        /* cluster raid doesn't support change array_sectors */
5103        if (mddev_is_clustered(mddev)) {
5104                mddev_unlock(mddev);
5105                return -EINVAL;
5106        }
5107
5108        if (strncmp(buf, "default", 7) == 0) {
5109                if (mddev->pers)
5110                        sectors = mddev->pers->size(mddev, 0, 0);
5111                else
5112                        sectors = mddev->array_sectors;
5113
5114                mddev->external_size = 0;
5115        } else {
5116                if (strict_blocks_to_sectors(buf, &sectors) < 0)
5117                        err = -EINVAL;
5118                else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5119                        err = -E2BIG;
5120                else
5121                        mddev->external_size = 1;
5122        }
5123
5124        if (!err) {
5125                mddev->array_sectors = sectors;
5126                if (mddev->pers) {
5127                        set_capacity(mddev->gendisk, mddev->array_sectors);
5128                        revalidate_disk(mddev->gendisk);
5129                }
5130        }
5131        mddev_unlock(mddev);
5132        return err ?: len;
5133}
5134
5135static struct md_sysfs_entry md_array_size =
5136__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5137       array_size_store);
5138
5139static ssize_t
5140consistency_policy_show(struct mddev *mddev, char *page)
5141{
5142        int ret;
5143
5144        if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5145                ret = sprintf(page, "journal\n");
5146        } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5147                ret = sprintf(page, "ppl\n");
5148        } else if (mddev->bitmap) {
5149                ret = sprintf(page, "bitmap\n");
5150        } else if (mddev->pers) {
5151                if (mddev->pers->sync_request)
5152                        ret = sprintf(page, "resync\n");
5153                else
5154                        ret = sprintf(page, "none\n");
5155        } else {
5156                ret = sprintf(page, "unknown\n");
5157        }
5158
5159        return ret;
5160}
5161
5162static ssize_t
5163consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5164{
5165        int err = 0;
5166
5167        if (mddev->pers) {
5168                if (mddev->pers->change_consistency_policy)
5169                        err = mddev->pers->change_consistency_policy(mddev, buf);
5170                else
5171                        err = -EBUSY;
5172        } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5173                set_bit(MD_HAS_PPL, &mddev->flags);
5174        } else {
5175                err = -EINVAL;
5176        }
5177
5178        return err ? err : len;
5179}
5180
5181static struct md_sysfs_entry md_consistency_policy =
5182__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5183       consistency_policy_store);
5184
5185static struct attribute *md_default_attrs[] = {
5186        &md_level.attr,
5187        &md_layout.attr,
5188        &md_raid_disks.attr,
5189        &md_chunk_size.attr,
5190        &md_size.attr,
5191        &md_resync_start.attr,
5192        &md_metadata.attr,
5193        &md_new_device.attr,
5194        &md_safe_delay.attr,
5195        &md_array_state.attr,
5196        &md_reshape_position.attr,
5197        &md_reshape_direction.attr,
5198        &md_array_size.attr,
5199        &max_corr_read_errors.attr,
5200        &md_consistency_policy.attr,
5201        NULL,
5202};
5203
5204static struct attribute *md_redundancy_attrs[] = {
5205        &md_scan_mode.attr,
5206        &md_last_scan_mode.attr,
5207        &md_mismatches.attr,
5208        &md_sync_min.attr,
5209        &md_sync_max.attr,
5210        &md_sync_speed.attr,
5211        &md_sync_force_parallel.attr,
5212        &md_sync_completed.attr,
5213        &md_min_sync.attr,
5214        &md_max_sync.attr,
5215        &md_suspend_lo.attr,
5216        &md_suspend_hi.attr,
5217        &md_bitmap.attr,
5218        &md_degraded.attr,
5219        NULL,
5220};
5221static struct attribute_group md_redundancy_group = {
5222        .name = NULL,
5223        .attrs = md_redundancy_attrs,
5224};
5225
5226static ssize_t
5227md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5228{
5229        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5230        struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5231        ssize_t rv;
5232
5233        if (!entry->show)
5234                return -EIO;
5235        spin_lock(&all_mddevs_lock);
5236        if (list_empty(&mddev->all_mddevs)) {
5237                spin_unlock(&all_mddevs_lock);
5238                return -EBUSY;
5239        }
5240        mddev_get(mddev);
5241        spin_unlock(&all_mddevs_lock);
5242
5243        rv = entry->show(mddev, page);
5244        mddev_put(mddev);
5245        return rv;
5246}
5247
5248static ssize_t
5249md_attr_store(struct kobject *kobj, struct attribute *attr,
5250              const char *page, size_t length)
5251{
5252        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5253        struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5254        ssize_t rv;
5255
5256        if (!entry->store)
5257                return -EIO;
5258        if (!capable(CAP_SYS_ADMIN))
5259                return -EACCES;
5260        spin_lock(&all_mddevs_lock);
5261        if (list_empty(&mddev->all_mddevs)) {
5262                spin_unlock(&all_mddevs_lock);
5263                return -EBUSY;
5264        }
5265        mddev_get(mddev);
5266        spin_unlock(&all_mddevs_lock);
5267        rv = entry->store(mddev, page, length);
5268        mddev_put(mddev);
5269        return rv;
5270}
5271
5272static void md_free(struct kobject *ko)
5273{
5274        struct mddev *mddev = container_of(ko, struct mddev, kobj);
5275
5276        if (mddev->sysfs_state)
5277                sysfs_put(mddev->sysfs_state);
5278
5279        if (mddev->gendisk)
5280                del_gendisk(mddev->gendisk);
5281        if (mddev->queue)
5282                blk_cleanup_queue(mddev->queue);
5283        if (mddev->gendisk)
5284                put_disk(mddev->gendisk);
5285        percpu_ref_exit(&mddev->writes_pending);
5286
5287        bioset_exit(&mddev->bio_set);
5288        bioset_exit(&mddev->sync_set);
5289        kfree(mddev);
5290}
5291
5292static const struct sysfs_ops md_sysfs_ops = {
5293        .show   = md_attr_show,
5294        .store  = md_attr_store,
5295};
5296static struct kobj_type md_ktype = {
5297        .release        = md_free,
5298        .sysfs_ops      = &md_sysfs_ops,
5299        .default_attrs  = md_default_attrs,
5300};
5301
5302int mdp_major = 0;
5303
5304static void mddev_delayed_delete(struct work_struct *ws)
5305{
5306        struct mddev *mddev = container_of(ws, struct mddev, del_work);
5307
5308        sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5309        kobject_del(&mddev->kobj);
5310        kobject_put(&mddev->kobj);
5311}
5312
5313static void no_op(struct percpu_ref *r) {}
5314
5315int mddev_init_writes_pending(struct mddev *mddev)
5316{
5317        if (mddev->writes_pending.percpu_count_ptr)
5318                return 0;
5319        if (percpu_ref_init(&mddev->writes_pending, no_op,
5320                            PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
5321                return -ENOMEM;
5322        /* We want to start with the refcount at zero */
5323        percpu_ref_put(&mddev->writes_pending);
5324        return 0;
5325}
5326EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
5327
5328static int md_alloc(dev_t dev, char *name)
5329{
5330        /*
5331         * If dev is zero, name is the name of a device to allocate with
5332         * an arbitrary minor number.  It will be "md_???"
5333         * If dev is non-zero it must be a device number with a MAJOR of
5334         * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
5335         * the device is being created by opening a node in /dev.
5336         * If "name" is not NULL, the device is being created by
5337         * writing to /sys/module/md_mod/parameters/new_array.
5338         */
5339        static DEFINE_MUTEX(disks_mutex);
5340        struct mddev *mddev = mddev_find(dev);
5341        struct gendisk *disk;
5342        int partitioned;
5343        int shift;
5344        int unit;
5345        int error;
5346
5347        if (!mddev)
5348                return -ENODEV;
5349
5350        partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5351        shift = partitioned ? MdpMinorShift : 0;
5352        unit = MINOR(mddev->unit) >> shift;
5353
5354        /* wait for any previous instance of this device to be
5355         * completely removed (mddev_delayed_delete).
5356         */
5357        flush_workqueue(md_misc_wq);
5358
5359        mutex_lock(&disks_mutex);
5360        error = -EEXIST;
5361        if (mddev->gendisk)
5362                goto abort;
5363
5364        if (name && !dev) {
5365                /* Need to ensure that 'name' is not a duplicate.
5366                 */
5367                struct mddev *mddev2;
5368                spin_lock(&all_mddevs_lock);
5369
5370                list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5371                        if (mddev2->gendisk &&
5372                            strcmp(mddev2->gendisk->disk_name, name) == 0) {
5373                                spin_unlock(&all_mddevs_lock);
5374                                goto abort;
5375                        }
5376                spin_unlock(&all_mddevs_lock);
5377        }
5378        if (name && dev)
5379                /*
5380                 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
5381                 */
5382                mddev->hold_active = UNTIL_STOP;
5383
5384        error = -ENOMEM;
5385        mddev->queue = blk_alloc_queue(GFP_KERNEL);
5386        if (!mddev->queue)
5387                goto abort;
5388        mddev->queue->queuedata = mddev;
5389
5390        blk_queue_make_request(mddev->queue, md_make_request);
5391        blk_set_stacking_limits(&mddev->queue->limits);
5392
5393        disk = alloc_disk(1 << shift);
5394        if (!disk) {
5395                blk_cleanup_queue(mddev->queue);
5396                mddev->queue = NULL;
5397                goto abort;
5398        }
5399        disk->major = MAJOR(mddev->unit);
5400        disk->first_minor = unit << shift;
5401        if (name)
5402                strcpy(disk->disk_name, name);
5403        else if (partitioned)
5404                sprintf(disk->disk_name, "md_d%d", unit);
5405        else
5406                sprintf(disk->disk_name, "md%d", unit);
5407        disk->fops = &md_fops;
5408        disk->private_data = mddev;
5409        disk->queue = mddev->queue;
5410        blk_queue_write_cache(mddev->queue, true, true);
5411        /* Allow extended partitions.  This makes the
5412         * 'mdp' device redundant, but we can't really
5413         * remove it now.
5414         */
5415        disk->flags |= GENHD_FL_EXT_DEVT;
5416        mddev->gendisk = disk;
5417        /* As soon as we call add_disk(), another thread could get
5418         * through to md_open, so make sure it doesn't get too far
5419         */
5420        mutex_lock(&mddev->open_mutex);
5421        add_disk(disk);
5422
5423        error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
5424        if (error) {
5425                /* This isn't possible, but as kobject_init_and_add is marked
5426                 * __must_check, we must do something with the result
5427                 */
5428                pr_debug("md: cannot register %s/md - name in use\n",
5429                         disk->disk_name);
5430                error = 0;
5431        }
5432        if (mddev->kobj.sd &&
5433            sysfs_create_group(&mddev->kobj, &md_bitmap_group))
5434                pr_debug("pointless warning\n");
5435        mutex_unlock(&mddev->open_mutex);
5436 abort:
5437        mutex_unlock(&disks_mutex);
5438        if (!error && mddev->kobj.sd) {
5439                kobject_uevent(&mddev->kobj, KOBJ_ADD);
5440                mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5441        }
5442        mddev_put(mddev);
5443        return error;
5444}
5445
5446static struct kobject *md_probe(dev_t dev, int *part, void *data)
5447{
5448        if (create_on_open)
5449                md_alloc(dev, NULL);
5450        return NULL;
5451}
5452
5453static int add_named_array(const char *val, const struct kernel_param *kp)
5454{
5455        /*
5456         * val must be "md_*" or "mdNNN".
5457         * For "md_*" we allocate an array with a large free minor number, and
5458         * set the name to val.  val must not already be an active name.
5459         * For "mdNNN" we allocate an array with the minor number NNN
5460         * which must not already be in use.
5461         */
5462        int len = strlen(val);
5463        char buf[DISK_NAME_LEN];
5464        unsigned long devnum;
5465
5466        while (len && val[len-1] == '\n')
5467                len--;
5468        if (len >= DISK_NAME_LEN)
5469                return -E2BIG;
5470        strlcpy(buf, val, len+1);
5471        if (strncmp(buf, "md_", 3) == 0)
5472                return md_alloc(0, buf);
5473        if (strncmp(buf, "md", 2) == 0 &&
5474            isdigit(buf[2]) &&
5475            kstrtoul(buf+2, 10, &devnum) == 0 &&
5476            devnum <= MINORMASK)
5477                return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
5478
5479        return -EINVAL;
5480}
5481
5482static void md_safemode_timeout(struct timer_list *t)
5483{
5484        struct mddev *mddev = from_timer(mddev, t, safemode_timer);
5485
5486        mddev->safemode = 1;
5487        if (mddev->external)
5488                sysfs_notify_dirent_safe(mddev->sysfs_state);
5489
5490        md_wakeup_thread(mddev->thread);
5491}
5492
5493static int start_dirty_degraded;
5494
5495int md_run(struct mddev *mddev)
5496{
5497        int err;
5498        struct md_rdev *rdev;
5499        struct md_personality *pers;
5500
5501        if (list_empty(&mddev->disks))
5502                /* cannot run an array with no devices.. */
5503                return -EINVAL;
5504
5505        if (mddev->pers)
5506                return -EBUSY;
5507        /* Cannot run until previous stop completes properly */
5508        if (mddev->sysfs_active)
5509                return -EBUSY;
5510
5511        /*
5512         * Analyze all RAID superblock(s)
5513         */
5514        if (!mddev->raid_disks) {
5515                if (!mddev->persistent)
5516                        return -EINVAL;
5517                analyze_sbs(mddev);
5518        }
5519
5520        if (mddev->level != LEVEL_NONE)
5521                request_module("md-level-%d", mddev->level);
5522        else if (mddev->clevel[0])
5523                request_module("md-%s", mddev->clevel);
5524
5525        /*
5526         * Drop all container device buffers, from now on
5527         * the only valid external interface is through the md
5528         * device.
5529         */
5530        mddev->has_superblocks = false;
5531        rdev_for_each(rdev, mddev) {
5532                if (test_bit(Faulty, &rdev->flags))
5533                        continue;
5534                sync_blockdev(rdev->bdev);
5535                invalidate_bdev(rdev->bdev);
5536                if (mddev->ro != 1 &&
5537                    (bdev_read_only(rdev->bdev) ||
5538                     bdev_read_only(rdev->meta_bdev))) {
5539                        mddev->ro = 1;
5540                        if (mddev->gendisk)
5541                                set_disk_ro(mddev->gendisk, 1);
5542                }
5543
5544                if (rdev->sb_page)
5545                        mddev->has_superblocks = true;
5546
5547                /* perform some consistency tests on the device.
5548                 * We don't want the data to overlap the metadata,
5549                 * Internal Bitmap issues have been handled elsewhere.
5550                 */
5551                if (rdev->meta_bdev) {
5552                        /* Nothing to check */;
5553                } else if (rdev->data_offset < rdev->sb_start) {
5554                        if (mddev->dev_sectors &&
5555                            rdev->data_offset + mddev->dev_sectors
5556                            > rdev->sb_start) {
5557                                pr_warn("md: %s: data overlaps metadata\n",
5558                                        mdname(mddev));
5559                                return -EINVAL;
5560                        }
5561                } else {
5562                        if (rdev->sb_start + rdev->sb_size/512
5563                            > rdev->data_offset) {
5564                                pr_warn("md: %s: metadata overlaps data\n",
5565                                        mdname(mddev));
5566                                return -EINVAL;
5567                        }
5568                }
5569                sysfs_notify_dirent_safe(rdev->sysfs_state);
5570        }
5571
5572        if (!bioset_initialized(&mddev->bio_set)) {
5573                err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5574                if (err)
5575                        return err;
5576        }
5577        if (!bioset_initialized(&mddev->sync_set)) {
5578                err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5579                if (err)
5580                        return err;
5581        }
5582
5583        spin_lock(&pers_lock);
5584        pers = find_pers(mddev->level, mddev->clevel);
5585        if (!pers || !try_module_get(pers->owner)) {
5586                spin_unlock(&pers_lock);
5587                if (mddev->level != LEVEL_NONE)
5588                        pr_warn("md: personality for level %d is not loaded!\n",
5589                                mddev->level);
5590                else
5591                        pr_warn("md: personality for level %s is not loaded!\n",
5592                                mddev->clevel);
5593                err = -EINVAL;
5594                goto abort;
5595        }
5596        spin_unlock(&pers_lock);
5597        if (mddev->level != pers->level) {
5598                mddev->level = pers->level;
5599                mddev->new_level = pers->level;
5600        }
5601        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5602
5603        if (mddev->reshape_position != MaxSector &&
5604            pers->start_reshape == NULL) {
5605                /* This personality cannot handle reshaping... */
5606                module_put(pers->owner);
5607                err = -EINVAL;
5608                goto abort;
5609        }
5610
5611        if (pers->sync_request) {
5612                /* Warn if this is a potentially silly
5613                 * configuration.
5614                 */
5615                char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5616                struct md_rdev *rdev2;
5617                int warned = 0;
5618
5619                rdev_for_each(rdev, mddev)
5620                        rdev_for_each(rdev2, mddev) {
5621                                if (rdev < rdev2 &&
5622                                    rdev->bdev->bd_contains ==
5623                                    rdev2->bdev->bd_contains) {
5624                                        pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
5625                                                mdname(mddev),
5626                                                bdevname(rdev->bdev,b),
5627                                                bdevname(rdev2->bdev,b2));
5628                                        warned = 1;
5629                                }
5630                        }
5631
5632                if (warned)
5633                        pr_warn("True protection against single-disk failure might be compromised.\n");
5634        }
5635
5636        mddev->recovery = 0;
5637        /* may be over-ridden by personality */
5638        mddev->resync_max_sectors = mddev->dev_sectors;
5639
5640        mddev->ok_start_degraded = start_dirty_degraded;
5641
5642        if (start_readonly && mddev->ro == 0)
5643                mddev->ro = 2; /* read-only, but switch on first write */
5644
5645        err = pers->run(mddev);
5646        if (err)
5647                pr_warn("md: pers->run() failed ...\n");
5648        else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5649                WARN_ONCE(!mddev->external_size,
5650                          "%s: default size too small, but 'external_size' not in effect?\n",
5651                          __func__);
5652                pr_warn("md: invalid array_size %llu > default size %llu\n",
5653                        (unsigned long long)mddev->array_sectors / 2,
5654                        (unsigned long long)pers->size(mddev, 0, 0) / 2);
5655                err = -EINVAL;
5656        }
5657        if (err == 0 && pers->sync_request &&
5658            (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5659                struct bitmap *bitmap;
5660
5661                bitmap = md_bitmap_create(mddev, -1);
5662                if (IS_ERR(bitmap)) {
5663                        err = PTR_ERR(bitmap);
5664                        pr_warn("%s: failed to create bitmap (%d)\n",
5665                                mdname(mddev), err);
5666                } else
5667                        mddev->bitmap = bitmap;
5668
5669        }
5670        if (err)
5671                goto bitmap_abort;
5672
5673        if (mddev->bitmap_info.max_write_behind > 0) {
5674                bool creat_pool = false;
5675
5676                rdev_for_each(rdev, mddev) {
5677                        if (test_bit(WriteMostly, &rdev->flags) &&
5678                            rdev_init_wb(rdev))
5679                                creat_pool = true;
5680                }
5681                if (creat_pool && mddev->wb_info_pool == NULL) {
5682                        mddev->wb_info_pool =
5683                                mempool_create_kmalloc_pool(NR_WB_INFOS,
5684                                                    sizeof(struct wb_info));
5685                        if (!mddev->wb_info_pool) {
5686                                err = -ENOMEM;
5687                                goto bitmap_abort;
5688                        }
5689                }
5690        }
5691
5692        if (mddev->queue) {
5693                bool nonrot = true;
5694
5695                rdev_for_each(rdev, mddev) {
5696                        if (rdev->raid_disk >= 0 &&
5697                            !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
5698                                nonrot = false;
5699                                break;
5700                        }
5701                }
5702                if (mddev->degraded)
5703                        nonrot = false;
5704                if (nonrot)
5705                        blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
5706                else
5707                        blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
5708                mddev->queue->backing_dev_info->congested_data = mddev;
5709                mddev->queue->backing_dev_info->congested_fn = md_congested;
5710        }
5711        if (pers->sync_request) {
5712                if (mddev->kobj.sd &&
5713                    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5714                        pr_warn("md: cannot register extra attributes for %s\n",
5715                                mdname(mddev));
5716                mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5717        } else if (mddev->ro == 2) /* auto-readonly not meaningful */
5718                mddev->ro = 0;
5719
5720        atomic_set(&mddev->max_corr_read_errors,
5721                   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5722        mddev->safemode = 0;
5723        if (mddev_is_clustered(mddev))
5724                mddev->safemode_delay = 0;
5725        else
5726                mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
5727        mddev->in_sync = 1;
5728        smp_wmb();
5729        spin_lock(&mddev->lock);
5730        mddev->pers = pers;
5731        spin_unlock(&mddev->lock);
5732        rdev_for_each(rdev, mddev)
5733                if (rdev->raid_disk >= 0)
5734                        sysfs_link_rdev(mddev, rdev); /* failure here is OK */
5735
5736        if (mddev->degraded && !mddev->ro)
5737                /* This ensures that recovering status is reported immediately
5738                 * via sysfs - until a lack of spares is confirmed.
5739                 */
5740                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5741        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5742
5743        if (mddev->sb_flags)
5744                md_update_sb(mddev, 0);
5745
5746        md_new_event(mddev);
5747        sysfs_notify_dirent_safe(mddev->sysfs_state);
5748        sysfs_notify_dirent_safe(mddev->sysfs_action);
5749        sysfs_notify(&mddev->kobj, NULL, "degraded");
5750        return 0;
5751
5752bitmap_abort:
5753        mddev_detach(mddev);
5754        if (mddev->private)
5755                pers->free(mddev, mddev->private);
5756        mddev->private = NULL;
5757        module_put(pers->owner);
5758        md_bitmap_destroy(mddev);
5759abort:
5760        bioset_exit(&mddev->bio_set);
5761        bioset_exit(&mddev->sync_set);
5762        return err;
5763}
5764EXPORT_SYMBOL_GPL(md_run);
5765
5766static int do_md_run(struct mddev *mddev)
5767{
5768        int err;
5769
5770        err = md_run(mddev);
5771        if (err)
5772                goto out;
5773        err = md_bitmap_load(mddev);
5774        if (err) {
5775                md_bitmap_destroy(mddev);
5776                goto out;
5777        }
5778
5779        if (mddev_is_clustered(mddev))
5780                md_allow_write(mddev);
5781
5782        /* run start up tasks that require md_thread */
5783        md_start(mddev);
5784
5785        md_wakeup_thread(mddev->thread);
5786        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5787
5788        set_capacity(mddev->gendisk, mddev->array_sectors);
5789        revalidate_disk(mddev->gendisk);
5790        mddev->changed = 1;
5791        kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5792out:
5793        return err;
5794}
5795
5796int md_start(struct mddev *mddev)
5797{
5798        int ret = 0;
5799
5800        if (mddev->pers->start) {
5801                set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
5802                md_wakeup_thread(mddev->thread);
5803                ret = mddev->pers->start(mddev);
5804                clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
5805                md_wakeup_thread(mddev->sync_thread);
5806        }
5807        return ret;
5808}
5809EXPORT_SYMBOL_GPL(md_start);
5810
5811static int restart_array(struct mddev *mddev)
5812{
5813        struct gendisk *disk = mddev->gendisk;
5814        struct md_rdev *rdev;
5815        bool has_journal = false;
5816        bool has_readonly = false;
5817
5818        /* Complain if it has no devices */
5819        if (list_empty(&mddev->disks))
5820                return -ENXIO;
5821        if (!mddev->pers)
5822                return -EINVAL;
5823        if (!mddev->ro)
5824                return -EBUSY;
5825
5826        rcu_read_lock();
5827        rdev_for_each_rcu(rdev, mddev) {
5828                if (test_bit(Journal, &rdev->flags) &&
5829                    !test_bit(Faulty, &rdev->flags))
5830                        has_journal = true;
5831                if (bdev_read_only(rdev->bdev))
5832                        has_readonly = true;
5833        }
5834        rcu_read_unlock();
5835        if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
5836                /* Don't restart rw with journal missing/faulty */
5837                        return -EINVAL;
5838        if (has_readonly)
5839                return -EROFS;
5840
5841        mddev->safemode = 0;
5842        mddev->ro = 0;
5843        set_disk_ro(disk, 0);
5844        pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
5845        /* Kick recovery or resync if necessary */
5846        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5847        md_wakeup_thread(mddev->thread);
5848        md_wakeup_thread(mddev->sync_thread);
5849        sysfs_notify_dirent_safe(mddev->sysfs_state);
5850        return 0;
5851}
5852
5853static void md_clean(struct mddev *mddev)
5854{
5855        mddev->array_sectors = 0;
5856        mddev->external_size = 0;
5857        mddev->dev_sectors = 0;
5858        mddev->raid_disks = 0;
5859        mddev->recovery_cp = 0;
5860        mddev->resync_min = 0;
5861        mddev->resync_max = MaxSector;
5862        mddev->reshape_position = MaxSector;
5863        mddev->external = 0;
5864        mddev->persistent = 0;
5865        mddev->level = LEVEL_NONE;
5866        mddev->clevel[0] = 0;
5867        mddev->flags = 0;
5868        mddev->sb_flags = 0;
5869        mddev->ro = 0;
5870        mddev->metadata_type[0] = 0;
5871        mddev->chunk_sectors = 0;
5872        mddev->ctime = mddev->utime = 0;
5873        mddev->layout = 0;
5874        mddev->max_disks = 0;
5875        mddev->events = 0;
5876        mddev->can_decrease_events = 0;
5877        mddev->delta_disks = 0;
5878        mddev->reshape_backwards = 0;
5879        mddev->new_level = LEVEL_NONE;
5880        mddev->new_layout = 0;
5881        mddev->new_chunk_sectors = 0;
5882        mddev->curr_resync = 0;
5883        atomic64_set(&mddev->resync_mismatches, 0);
5884        mddev->suspend_lo = mddev->suspend_hi = 0;
5885        mddev->sync_speed_min = mddev->sync_speed_max = 0;
5886        mddev->recovery = 0;
5887        mddev->in_sync = 0;
5888        mddev->changed = 0;
5889        mddev->degraded = 0;
5890        mddev->safemode = 0;
5891        mddev->private = NULL;
5892        mddev->cluster_info = NULL;
5893        mddev->bitmap_info.offset = 0;
5894        mddev->bitmap_info.default_offset = 0;
5895        mddev->bitmap_info.default_space = 0;
5896        mddev->bitmap_info.chunksize = 0;
5897        mddev->bitmap_info.daemon_sleep = 0;
5898        mddev->bitmap_info.max_write_behind = 0;
5899        mddev->bitmap_info.nodes = 0;
5900}
5901
5902static void __md_stop_writes(struct mddev *mddev)
5903{
5904        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5905        flush_workqueue(md_misc_wq);
5906        if (mddev->sync_thread) {
5907                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5908                md_reap_sync_thread(mddev);
5909        }
5910
5911        del_timer_sync(&mddev->safemode_timer);
5912
5913        if (mddev->pers && mddev->pers->quiesce) {
5914                mddev->pers->quiesce(mddev, 1);
5915                mddev->pers->quiesce(mddev, 0);
5916        }
5917        md_bitmap_flush(mddev);
5918
5919        if (mddev->ro == 0 &&
5920            ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
5921             mddev->sb_flags)) {
5922                /* mark array as shutdown cleanly */
5923                if (!mddev_is_clustered(mddev))
5924                        mddev->in_sync = 1;
5925                md_update_sb(mddev, 1);
5926        }
5927        mempool_destroy(mddev->wb_info_pool);
5928        mddev->wb_info_pool = NULL;
5929}
5930
5931void md_stop_writes(struct mddev *mddev)
5932{
5933        mddev_lock_nointr(mddev);
5934        __md_stop_writes(mddev);
5935        mddev_unlock(mddev);
5936}
5937EXPORT_SYMBOL_GPL(md_stop_writes);
5938
5939static void mddev_detach(struct mddev *mddev)
5940{
5941        md_bitmap_wait_behind_writes(mddev);
5942        if (mddev->pers && mddev->pers->quiesce) {
5943                mddev->pers->quiesce(mddev, 1);
5944                mddev->pers->quiesce(mddev, 0);
5945        }
5946        md_unregister_thread(&mddev->thread);
5947        if (mddev->queue)
5948                blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5949}
5950
5951static void __md_stop(struct mddev *mddev)
5952{
5953        struct md_personality *pers = mddev->pers;
5954        md_bitmap_destroy(mddev);
5955        mddev_detach(mddev);
5956        /* Ensure ->event_work is done */
5957        flush_workqueue(md_misc_wq);
5958        spin_lock(&mddev->lock);
5959        mddev->pers = NULL;
5960        spin_unlock(&mddev->lock);
5961        pers->free(mddev, mddev->private);
5962        mddev->private = NULL;
5963        if (pers->sync_request && mddev->to_remove == NULL)
5964                mddev->to_remove = &md_redundancy_group;
5965        module_put(pers->owner);
5966        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5967}
5968
5969void md_stop(struct mddev *mddev)
5970{
5971        /* stop the array and free an attached data structures.
5972         * This is called from dm-raid
5973         */
5974        __md_stop(mddev);
5975        bioset_exit(&mddev->bio_set);
5976        bioset_exit(&mddev->sync_set);
5977}
5978
5979EXPORT_SYMBOL_GPL(md_stop);
5980
5981static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5982{
5983        int err = 0;
5984        int did_freeze = 0;
5985
5986        if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5987                did_freeze = 1;
5988                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5989                md_wakeup_thread(mddev->thread);
5990        }
5991        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5992                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5993        if (mddev->sync_thread)
5994                /* Thread might be blocked waiting for metadata update
5995                 * which will now never happen */
5996                wake_up_process(mddev->sync_thread->tsk);
5997
5998        if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
5999                return -EBUSY;
6000        mddev_unlock(mddev);
6001        wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
6002                                          &mddev->recovery));
6003        wait_event(mddev->sb_wait,
6004                   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6005        mddev_lock_nointr(mddev);
6006
6007        mutex_lock(&mddev->open_mutex);
6008        if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6009            mddev->sync_thread ||
6010            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6011                pr_warn("md: %s still in use.\n",mdname(mddev));
6012                if (did_freeze) {
6013                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6014                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6015                        md_wakeup_thread(mddev->thread);
6016                }
6017                err = -EBUSY;
6018                goto out;
6019        }
6020        if (mddev->pers) {
6021                __md_stop_writes(mddev);
6022
6023                err  = -ENXIO;
6024                if (mddev->ro==1)
6025                        goto out;
6026                mddev->ro = 1;
6027                set_disk_ro(mddev->gendisk, 1);
6028                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6029                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6030                md_wakeup_thread(mddev->thread);
6031                sysfs_notify_dirent_safe(mddev->sysfs_state);
6032                err = 0;
6033        }
6034out:
6035        mutex_unlock(&mddev->open_mutex);
6036        return err;
6037}
6038
6039/* mode:
6040 *   0 - completely stop and dis-assemble array
6041 *   2 - stop but do not disassemble array
6042 */
6043static int do_md_stop(struct mddev *mddev, int mode,
6044                      struct block_device *bdev)
6045{
6046        struct gendisk *disk = mddev->gendisk;
6047        struct md_rdev *rdev;
6048        int did_freeze = 0;
6049
6050        if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6051                did_freeze = 1;
6052                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6053                md_wakeup_thread(mddev->thread);
6054        }
6055        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6056                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6057        if (mddev->sync_thread)
6058                /* Thread might be blocked waiting for metadata update
6059                 * which will now never happen */
6060                wake_up_process(mddev->sync_thread->tsk);
6061
6062        mddev_unlock(mddev);
6063        wait_event(resync_wait, (mddev->sync_thread == NULL &&
6064                                 !test_bit(MD_RECOVERY_RUNNING,
6065                                           &mddev->recovery)));
6066        mddev_lock_nointr(mddev);
6067
6068        mutex_lock(&mddev->open_mutex);
6069        if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6070            mddev->sysfs_active ||
6071            mddev->sync_thread ||
6072            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6073                pr_warn("md: %s still in use.\n",mdname(mddev));
6074                mutex_unlock(&mddev->open_mutex);
6075                if (did_freeze) {
6076                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6077                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6078                        md_wakeup_thread(mddev->thread);
6079                }
6080                return -EBUSY;
6081        }
6082        if (mddev->pers) {
6083                if (mddev->ro)
6084                        set_disk_ro(disk, 0);
6085
6086                __md_stop_writes(mddev);
6087                __md_stop(mddev);
6088                mddev->queue->backing_dev_info->congested_fn = NULL;
6089
6090                /* tell userspace to handle 'inactive' */
6091                sysfs_notify_dirent_safe(mddev->sysfs_state);
6092
6093                rdev_for_each(rdev, mddev)
6094                        if (rdev->raid_disk >= 0)
6095                                sysfs_unlink_rdev(mddev, rdev);
6096
6097                set_capacity(disk, 0);
6098                mutex_unlock(&mddev->open_mutex);
6099                mddev->changed = 1;
6100                revalidate_disk(disk);
6101
6102                if (mddev->ro)
6103                        mddev->ro = 0;
6104        } else
6105                mutex_unlock(&mddev->open_mutex);
6106        /*
6107         * Free resources if final stop
6108         */
6109        if (mode == 0) {
6110                pr_info("md: %s stopped.\n", mdname(mddev));
6111
6112                if (mddev->bitmap_info.file) {
6113                        struct file *f = mddev->bitmap_info.file;
6114                        spin_lock(&mddev->lock);
6115                        mddev->bitmap_info.file = NULL;
6116                        spin_unlock(&mddev->lock);
6117                        fput(f);
6118                }
6119                mddev->bitmap_info.offset = 0;
6120
6121                export_array(mddev);
6122
6123                md_clean(mddev);
6124                if (mddev->hold_active == UNTIL_STOP)
6125                        mddev->hold_active = 0;
6126        }
6127        md_new_event(mddev);
6128        sysfs_notify_dirent_safe(mddev->sysfs_state);
6129        return 0;
6130}
6131
6132#ifndef MODULE
6133static void autorun_array(struct mddev *mddev)
6134{
6135        struct md_rdev *rdev;
6136        int err;
6137
6138        if (list_empty(&mddev->disks))
6139                return;
6140
6141        pr_info("md: running: ");
6142
6143        rdev_for_each(rdev, mddev) {
6144                char b[BDEVNAME_SIZE];
6145                pr_cont("<%s>", bdevname(rdev->bdev,b));
6146        }
6147        pr_cont("\n");
6148
6149        err = do_md_run(mddev);
6150        if (err) {
6151                pr_warn("md: do_md_run() returned %d\n", err);
6152                do_md_stop(mddev, 0, NULL);
6153        }
6154}
6155
6156/*
6157 * lets try to run arrays based on all disks that have arrived
6158 * until now. (those are in pending_raid_disks)
6159 *
6160 * the method: pick the first pending disk, collect all disks with
6161 * the same UUID, remove all from the pending list and put them into
6162 * the 'same_array' list. Then order this list based on superblock
6163 * update time (freshest comes first), kick out 'old' disks and
6164 * compare superblocks. If everything's fine then run it.
6165 *
6166 * If "unit" is allocated, then bump its reference count
6167 */
6168static void autorun_devices(int part)
6169{
6170        struct md_rdev *rdev0, *rdev, *tmp;
6171        struct mddev *mddev;
6172        char b[BDEVNAME_SIZE];
6173
6174        pr_info("md: autorun ...\n");
6175        while (!list_empty(&pending_raid_disks)) {
6176                int unit;
6177                dev_t dev;
6178                LIST_HEAD(candidates);
6179                rdev0 = list_entry(pending_raid_disks.next,
6180                                         struct md_rdev, same_set);
6181
6182                pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
6183                INIT_LIST_HEAD(&candidates);
6184                rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6185                        if (super_90_load(rdev, rdev0, 0) >= 0) {
6186                                pr_debug("md:  adding %s ...\n",
6187                                         bdevname(rdev->bdev,b));
6188                                list_move(&rdev->same_set, &candidates);
6189                        }
6190                /*
6191                 * now we have a set of devices, with all of them having
6192                 * mostly sane superblocks. It's time to allocate the
6193                 * mddev.
6194                 */
6195                if (part) {
6196                        dev = MKDEV(mdp_major,
6197                                    rdev0->preferred_minor << MdpMinorShift);
6198                        unit = MINOR(dev) >> MdpMinorShift;
6199                } else {
6200                        dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6201                        unit = MINOR(dev);
6202                }
6203                if (rdev0->preferred_minor != unit) {
6204                        pr_warn("md: unit number in %s is bad: %d\n",
6205                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
6206                        break;
6207                }
6208
6209                md_probe(dev, NULL, NULL);
6210                mddev = mddev_find(dev);
6211                if (!mddev || !mddev->gendisk) {
6212                        if (mddev)
6213                                mddev_put(mddev);
6214                        break;
6215                }
6216                if (mddev_lock(mddev))
6217                        pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6218                else if (mddev->raid_disks || mddev->major_version
6219                         || !list_empty(&mddev->disks)) {
6220                        pr_warn("md: %s already running, cannot run %s\n",
6221                                mdname(mddev), bdevname(rdev0->bdev,b));
6222                        mddev_unlock(mddev);
6223                } else {
6224                        pr_debug("md: created %s\n", mdname(mddev));
6225                        mddev->persistent = 1;
6226                        rdev_for_each_list(rdev, tmp, &candidates) {
6227                                list_del_init(&rdev->same_set);
6228                                if (bind_rdev_to_array(rdev, mddev))
6229                                        export_rdev(rdev);
6230                        }
6231                        autorun_array(mddev);
6232                        mddev_unlock(mddev);
6233                }
6234                /* on success, candidates will be empty, on error
6235                 * it won't...
6236                 */
6237                rdev_for_each_list(rdev, tmp, &candidates) {
6238                        list_del_init(&rdev->same_set);
6239                        export_rdev(rdev);
6240                }
6241                mddev_put(mddev);
6242        }
6243        pr_info("md: ... autorun DONE.\n");
6244}
6245#endif /* !MODULE */
6246
6247static int get_version(void __user *arg)
6248{
6249        mdu_version_t ver;
6250
6251        ver.major = MD_MAJOR_VERSION;
6252        ver.minor = MD_MINOR_VERSION;
6253        ver.patchlevel = MD_PATCHLEVEL_VERSION;
6254
6255        if (copy_to_user(arg, &ver, sizeof(ver)))
6256                return -EFAULT;
6257
6258        return 0;
6259}
6260
6261static int get_array_info(struct mddev *mddev, void __user *arg)
6262{
6263        mdu_array_info_t info;
6264        int nr,working,insync,failed,spare;
6265        struct md_rdev *rdev;
6266
6267        nr = working = insync = failed = spare = 0;
6268        rcu_read_lock();
6269        rdev_for_each_rcu(rdev, mddev) {
6270                nr++;
6271                if (test_bit(Faulty, &rdev->flags))
6272                        failed++;
6273                else {
6274                        working++;
6275                        if (test_bit(In_sync, &rdev->flags))
6276                                insync++;
6277                        else if (test_bit(Journal, &rdev->flags))
6278                                /* TODO: add journal count to md_u.h */
6279                                ;
6280                        else
6281                                spare++;
6282                }
6283        }
6284        rcu_read_unlock();
6285
6286        info.major_version = mddev->major_version;
6287        info.minor_version = mddev->minor_version;
6288        info.patch_version = MD_PATCHLEVEL_VERSION;
6289        info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6290        info.level         = mddev->level;
6291        info.size          = mddev->dev_sectors / 2;
6292        if (info.size != mddev->dev_sectors / 2) /* overflow */
6293                info.size = -1;
6294        info.nr_disks      = nr;
6295        info.raid_disks    = mddev->raid_disks;
6296        info.md_minor      = mddev->md_minor;
6297        info.not_persistent= !mddev->persistent;
6298
6299        info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6300        info.state         = 0;
6301        if (mddev->in_sync)
6302                info.state = (1<<MD_SB_CLEAN);
6303        if (mddev->bitmap && mddev->bitmap_info.offset)
6304                info.state |= (1<<MD_SB_BITMAP_PRESENT);
6305        if (mddev_is_clustered(mddev))
6306                info.state |= (1<<MD_SB_CLUSTERED);
6307        info.active_disks  = insync;
6308        info.working_disks = working;
6309        info.failed_disks  = failed;
6310        info.spare_disks   = spare;
6311
6312        info.layout        = mddev->layout;
6313        info.chunk_size    = mddev->chunk_sectors << 9;
6314
6315        if (copy_to_user(arg, &info, sizeof(info)))
6316                return -EFAULT;
6317
6318        return 0;
6319}
6320
6321static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6322{
6323        mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6324        char *ptr;
6325        int err;
6326
6327        file = kzalloc(sizeof(*file), GFP_NOIO);
6328        if (!file)
6329                return -ENOMEM;
6330
6331        err = 0;
6332        spin_lock(&mddev->lock);
6333        /* bitmap enabled */
6334        if (mddev->bitmap_info.file) {
6335                ptr = file_path(mddev->bitmap_info.file, file->pathname,
6336                                sizeof(file->pathname));
6337                if (IS_ERR(ptr))
6338                        err = PTR_ERR(ptr);
6339                else
6340                        memmove(file->pathname, ptr,
6341                                sizeof(file->pathname)-(ptr-file->pathname));
6342        }
6343        spin_unlock(&mddev->lock);
6344
6345        if (err == 0 &&
6346            copy_to_user(arg, file, sizeof(*file)))
6347                err = -EFAULT;
6348
6349        kfree(file);
6350        return err;
6351}
6352
6353static int get_disk_info(struct mddev *mddev, void __user * arg)
6354{
6355        mdu_disk_info_t info;
6356        struct md_rdev *rdev;
6357
6358        if (copy_from_user(&info, arg, sizeof(info)))
6359                return -EFAULT;
6360
6361        rcu_read_lock();
6362        rdev = md_find_rdev_nr_rcu(mddev, info.number);
6363        if (rdev) {
6364                info.major = MAJOR(rdev->bdev->bd_dev);
6365                info.minor = MINOR(rdev->bdev->bd_dev);
6366                info.raid_disk = rdev->raid_disk;
6367                info.state = 0;
6368                if (test_bit(Faulty, &rdev->flags))
6369                        info.state |= (1<<MD_DISK_FAULTY);
6370                else if (test_bit(In_sync, &rdev->flags)) {
6371                        info.state |= (1<<MD_DISK_ACTIVE);
6372                        info.state |= (1<<MD_DISK_SYNC);
6373                }
6374                if (test_bit(Journal, &rdev->flags))
6375                        info.state |= (1<<MD_DISK_JOURNAL);
6376                if (test_bit(WriteMostly, &rdev->flags))
6377                        info.state |= (1<<MD_DISK_WRITEMOSTLY);
6378                if (test_bit(FailFast, &rdev->flags))
6379                        info.state |= (1<<MD_DISK_FAILFAST);
6380        } else {
6381                info.major = info.minor = 0;
6382                info.raid_disk = -1;
6383                info.state = (1<<MD_DISK_REMOVED);
6384        }
6385        rcu_read_unlock();
6386
6387        if (copy_to_user(arg, &info, sizeof(info)))
6388                return -EFAULT;
6389
6390        return 0;
6391}
6392
6393static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6394{
6395        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
6396        struct md_rdev *rdev;
6397        dev_t dev = MKDEV(info->major,info->minor);
6398
6399        if (mddev_is_clustered(mddev) &&
6400                !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6401                pr_warn("%s: Cannot add to clustered mddev.\n",
6402                        mdname(mddev));
6403                return -EINVAL;
6404        }
6405
6406        if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6407                return -EOVERFLOW;
6408
6409        if (!mddev->raid_disks) {
6410                int err;
6411                /* expecting a device which has a superblock */
6412                rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6413                if (IS_ERR(rdev)) {
6414                        pr_warn("md: md_import_device returned %ld\n",
6415                                PTR_ERR(rdev));
6416                        return PTR_ERR(rdev);
6417                }
6418                if (!list_empty(&mddev->disks)) {
6419                        struct md_rdev *rdev0
6420                                = list_entry(mddev->disks.next,
6421                                             struct md_rdev, same_set);
6422                        err = super_types[mddev->major_version]
6423                                .load_super(rdev, rdev0, mddev->minor_version);
6424                        if (err < 0) {
6425                                pr_warn("md: %s has different UUID to %s\n",
6426                                        bdevname(rdev->bdev,b),
6427                                        bdevname(rdev0->bdev,b2));
6428                                export_rdev(rdev);
6429                                return -EINVAL;
6430                        }
6431                }
6432                err = bind_rdev_to_array(rdev, mddev);
6433                if (err)
6434                        export_rdev(rdev);
6435                return err;
6436        }
6437
6438        /*
6439         * add_new_disk can be used once the array is assembled
6440         * to add "hot spares".  They must already have a superblock
6441         * written
6442         */
6443        if (mddev->pers) {
6444                int err;
6445                if (!mddev->pers->hot_add_disk) {
6446                        pr_warn("%s: personality does not support diskops!\n",
6447                                mdname(mddev));
6448                        return -EINVAL;
6449                }
6450                if (mddev->persistent)
6451                        rdev = md_import_device(dev, mddev->major_version,
6452                                                mddev->minor_version);
6453                else
6454                        rdev = md_import_device(dev, -1, -1);
6455                if (IS_ERR(rdev)) {
6456                        pr_warn("md: md_import_device returned %ld\n",
6457                                PTR_ERR(rdev));
6458                        return PTR_ERR(rdev);
6459                }
6460                /* set saved_raid_disk if appropriate */
6461                if (!mddev->persistent) {
6462                        if (info->state & (1<<MD_DISK_SYNC)  &&
6463                            info->raid_disk < mddev->raid_disks) {
6464                                rdev->raid_disk = info->raid_disk;
6465                                set_bit(In_sync, &rdev->flags);
6466                                clear_bit(Bitmap_sync, &rdev->flags);
6467                        } else
6468                                rdev->raid_disk = -1;
6469                        rdev->saved_raid_disk = rdev->raid_disk;
6470                } else
6471                        super_types[mddev->major_version].
6472                                validate_super(mddev, rdev);
6473                if ((info->state & (1<<MD_DISK_SYNC)) &&
6474                     rdev->raid_disk != info->raid_disk) {
6475                        /* This was a hot-add request, but events doesn't
6476                         * match, so reject it.
6477                         */
6478                        export_rdev(rdev);
6479                        return -EINVAL;
6480                }
6481
6482                clear_bit(In_sync, &rdev->flags); /* just to be sure */
6483                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6484                        set_bit(WriteMostly, &rdev->flags);
6485                else
6486                        clear_bit(WriteMostly, &rdev->flags);
6487                if (info->state & (1<<MD_DISK_FAILFAST))
6488                        set_bit(FailFast, &rdev->flags);
6489                else
6490                        clear_bit(FailFast, &rdev->flags);
6491
6492                if (info->state & (1<<MD_DISK_JOURNAL)) {
6493                        struct md_rdev *rdev2;
6494                        bool has_journal = false;
6495
6496                        /* make sure no existing journal disk */
6497                        rdev_for_each(rdev2, mddev) {
6498                                if (test_bit(Journal, &rdev2->flags)) {
6499                                        has_journal = true;
6500                                        break;
6501                                }
6502                        }
6503                        if (has_journal || mddev->bitmap) {
6504                                export_rdev(rdev);
6505                                return -EBUSY;
6506                        }
6507                        set_bit(Journal, &rdev->flags);
6508                }
6509                /*
6510                 * check whether the device shows up in other nodes
6511                 */
6512                if (mddev_is_clustered(mddev)) {
6513                        if (info->state & (1 << MD_DISK_CANDIDATE))
6514                                set_bit(Candidate, &rdev->flags);
6515                        else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6516                                /* --add initiated by this node */
6517                                err = md_cluster_ops->add_new_disk(mddev, rdev);
6518                                if (err) {
6519                                        export_rdev(rdev);
6520                                        return err;
6521                                }
6522                        }
6523                }
6524
6525                rdev->raid_disk = -1;
6526                err = bind_rdev_to_array(rdev, mddev);
6527
6528                if (err)
6529                        export_rdev(rdev);
6530
6531                if (mddev_is_clustered(mddev)) {
6532                        if (info->state & (1 << MD_DISK_CANDIDATE)) {
6533                                if (!err) {
6534                                        err = md_cluster_ops->new_disk_ack(mddev,
6535                                                err == 0);
6536                                        if (err)
6537                                                md_kick_rdev_from_array(rdev);
6538                                }
6539                        } else {
6540                                if (err)
6541                                        md_cluster_ops->add_new_disk_cancel(mddev);
6542                                else
6543                                        err = add_bound_rdev(rdev);
6544                        }
6545
6546                } else if (!err)
6547                        err = add_bound_rdev(rdev);
6548
6549                return err;
6550        }
6551
6552        /* otherwise, add_new_disk is only allowed
6553         * for major_version==0 superblocks
6554         */
6555        if (mddev->major_version != 0) {
6556                pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
6557                return -EINVAL;
6558        }
6559
6560        if (!(info->state & (1<<MD_DISK_FAULTY))) {
6561                int err;
6562                rdev = md_import_device(dev, -1, 0);
6563                if (IS_ERR(rdev)) {
6564                        pr_warn("md: error, md_import_device() returned %ld\n",
6565                                PTR_ERR(rdev));
6566                        return PTR_ERR(rdev);
6567                }
6568                rdev->desc_nr = info->number;
6569                if (info->raid_disk < mddev->raid_disks)
6570                        rdev->raid_disk = info->raid_disk;
6571                else
6572                        rdev->raid_disk = -1;
6573
6574                if (rdev->raid_disk < mddev->raid_disks)
6575                        if (info->state & (1<<MD_DISK_SYNC))
6576                                set_bit(In_sync, &rdev->flags);
6577
6578                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6579                        set_bit(WriteMostly, &rdev->flags);
6580                if (info->state & (1<<MD_DISK_FAILFAST))
6581                        set_bit(FailFast, &rdev->flags);
6582
6583                if (!mddev->persistent) {
6584                        pr_debug("md: nonpersistent superblock ...\n");
6585                        rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6586                } else
6587                        rdev->sb_start = calc_dev_sboffset(rdev);
6588                rdev->sectors = rdev->sb_start;
6589
6590                err = bind_rdev_to_array(rdev, mddev);
6591                if (err) {
6592                        export_rdev(rdev);
6593                        return err;
6594                }
6595        }
6596
6597        return 0;
6598}
6599
6600static int hot_remove_disk(struct mddev *mddev, dev_t dev)
6601{
6602        char b[BDEVNAME_SIZE];
6603        struct md_rdev *rdev;
6604
6605        if (!mddev->pers)
6606                return -ENODEV;
6607
6608        rdev = find_rdev(mddev, dev);
6609        if (!rdev)
6610                return -ENXIO;
6611
6612        if (rdev->raid_disk < 0)
6613                goto kick_rdev;
6614
6615        clear_bit(Blocked, &rdev->flags);
6616        remove_and_add_spares(mddev, rdev);
6617
6618        if (rdev->raid_disk >= 0)
6619                goto busy;
6620
6621kick_rdev:
6622        if (mddev_is_clustered(mddev))
6623                md_cluster_ops->remove_disk(mddev, rdev);
6624
6625        md_kick_rdev_from_array(rdev);
6626        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6627        if (mddev->thread)
6628                md_wakeup_thread(mddev->thread);
6629        else
6630                md_update_sb(mddev, 1);
6631        md_new_event(mddev);
6632
6633        return 0;
6634busy:
6635        pr_debug("md: cannot remove active disk %s from %s ...\n",
6636                 bdevname(rdev->bdev,b), mdname(mddev));
6637        return -EBUSY;
6638}
6639
6640static int hot_add_disk(struct mddev *mddev, dev_t dev)
6641{
6642        char b[BDEVNAME_SIZE];
6643        int err;
6644        struct md_rdev *rdev;
6645
6646        if (!mddev->pers)
6647                return -ENODEV;
6648
6649        if (mddev->major_version != 0) {
6650                pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
6651                        mdname(mddev));
6652                return -EINVAL;
6653        }
6654        if (!mddev->pers->hot_add_disk) {
6655                pr_warn("%s: personality does not support diskops!\n",
6656                        mdname(mddev));
6657                return -EINVAL;
6658        }
6659
6660        rdev = md_import_device(dev, -1, 0);
6661        if (IS_ERR(rdev)) {
6662                pr_warn("md: error, md_import_device() returned %ld\n",
6663                        PTR_ERR(rdev));
6664                return -EINVAL;
6665        }
6666
6667        if (mddev->persistent)
6668                rdev->sb_start = calc_dev_sboffset(rdev);
6669        else
6670                rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6671
6672        rdev->sectors = rdev->sb_start;
6673
6674        if (test_bit(Faulty, &rdev->flags)) {
6675                pr_warn("md: can not hot-add faulty %s disk to %s!\n",
6676                        bdevname(rdev->bdev,b), mdname(mddev));
6677                err = -EINVAL;
6678                goto abort_export;
6679        }
6680
6681        clear_bit(In_sync, &rdev->flags);
6682        rdev->desc_nr = -1;
6683        rdev->saved_raid_disk = -1;
6684        err = bind_rdev_to_array(rdev, mddev);
6685        if (err)
6686                goto abort_export;
6687
6688        /*
6689         * The rest should better be atomic, we can have disk failures
6690         * noticed in interrupt contexts ...
6691         */
6692
6693        rdev->raid_disk = -1;
6694
6695        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6696        if (!mddev->thread)
6697                md_update_sb(mddev, 1);
6698        /*
6699         * Kick recovery, maybe this spare has to be added to the
6700         * array immediately.
6701         */
6702        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6703        md_wakeup_thread(mddev->thread);
6704        md_new_event(mddev);
6705        return 0;
6706
6707abort_export:
6708        export_rdev(rdev);
6709        return err;
6710}
6711
6712static int set_bitmap_file(struct mddev *mddev, int fd)
6713{
6714        int err = 0;
6715
6716        if (mddev->pers) {
6717                if (!mddev->pers->quiesce || !mddev->thread)
6718                        return -EBUSY;
6719                if (mddev->recovery || mddev->sync_thread)
6720                        return -EBUSY;
6721                /* we should be able to change the bitmap.. */
6722        }
6723
6724        if (fd >= 0) {
6725                struct inode *inode;
6726                struct file *f;
6727
6728                if (mddev->bitmap || mddev->bitmap_info.file)
6729                        return -EEXIST; /* cannot add when bitmap is present */
6730                f = fget(fd);
6731
6732                if (f == NULL) {
6733                        pr_warn("%s: error: failed to get bitmap file\n",
6734                                mdname(mddev));
6735                        return -EBADF;
6736                }
6737
6738                inode = f->f_mapping->host;
6739                if (!S_ISREG(inode->i_mode)) {
6740                        pr_warn("%s: error: bitmap file must be a regular file\n",
6741                                mdname(mddev));
6742                        err = -EBADF;
6743                } else if (!(f->f_mode & FMODE_WRITE)) {
6744                        pr_warn("%s: error: bitmap file must open for write\n",
6745                                mdname(mddev));
6746                        err = -EBADF;
6747                } else if (atomic_read(&inode->i_writecount) != 1) {
6748                        pr_warn("%s: error: bitmap file is already in use\n",
6749                                mdname(mddev));
6750                        err = -EBUSY;
6751                }
6752                if (err) {
6753                        fput(f);
6754                        return err;
6755                }
6756                mddev->bitmap_info.file = f;
6757                mddev->bitmap_info.offset = 0; /* file overrides offset */
6758        } else if (mddev->bitmap == NULL)
6759                return -ENOENT; /* cannot remove what isn't there */
6760        err = 0;
6761        if (mddev->pers) {
6762                if (fd >= 0) {
6763                        struct bitmap *bitmap;
6764
6765                        bitmap = md_bitmap_create(mddev, -1);
6766                        mddev_suspend(mddev);
6767                        if (!IS_ERR(bitmap)) {
6768                                mddev->bitmap = bitmap;
6769                                err = md_bitmap_load(mddev);
6770                        } else
6771                                err = PTR_ERR(bitmap);
6772                        if (err) {
6773                                md_bitmap_destroy(mddev);
6774                                fd = -1;
6775                        }
6776                        mddev_resume(mddev);
6777                } else if (fd < 0) {
6778                        mddev_suspend(mddev);
6779                        md_bitmap_destroy(mddev);
6780                        mddev_resume(mddev);
6781                }
6782        }
6783        if (fd < 0) {
6784                struct file *f = mddev->bitmap_info.file;
6785                if (f) {
6786                        spin_lock(&mddev->lock);
6787                        mddev->bitmap_info.file = NULL;
6788                        spin_unlock(&mddev->lock);
6789                        fput(f);
6790                }
6791        }
6792
6793        return err;
6794}
6795
6796/*
6797 * set_array_info is used two different ways
6798 * The original usage is when creating a new array.
6799 * In this usage, raid_disks is > 0 and it together with
6800 *  level, size, not_persistent,layout,chunksize determine the
6801 *  shape of the array.
6802 *  This will always create an array with a type-0.90.0 superblock.
6803 * The newer usage is when assembling an array.
6804 *  In this case raid_disks will be 0, and the major_version field is
6805 *  use to determine which style super-blocks are to be found on the devices.
6806 *  The minor and patch _version numbers are also kept incase the
6807 *  super_block handler wishes to interpret them.
6808 */
6809static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
6810{
6811
6812        if (info->raid_disks == 0) {
6813                /* just setting version number for superblock loading */
6814                if (info->major_version < 0 ||
6815                    info->major_version >= ARRAY_SIZE(super_types) ||
6816                    super_types[info->major_version].name == NULL) {
6817                        /* maybe try to auto-load a module? */
6818                        pr_warn("md: superblock version %d not known\n",
6819                                info->major_version);
6820                        return -EINVAL;
6821                }
6822                mddev->major_version = info->major_version;
6823                mddev->minor_version = info->minor_version;
6824                mddev->patch_version = info->patch_version;
6825                mddev->persistent = !info->not_persistent;
6826                /* ensure mddev_put doesn't delete this now that there
6827                 * is some minimal configuration.
6828                 */
6829                mddev->ctime         = ktime_get_real_seconds();
6830                return 0;
6831        }
6832        mddev->major_version = MD_MAJOR_VERSION;
6833        mddev->minor_version = MD_MINOR_VERSION;
6834        mddev->patch_version = MD_PATCHLEVEL_VERSION;
6835        mddev->ctime         = ktime_get_real_seconds();
6836
6837        mddev->level         = info->level;
6838        mddev->clevel[0]     = 0;
6839        mddev->dev_sectors   = 2 * (sector_t)info->size;
6840        mddev->raid_disks    = info->raid_disks;
6841        /* don't set md_minor, it is determined by which /dev/md* was
6842         * openned
6843         */
6844        if (info->state & (1<<MD_SB_CLEAN))
6845                mddev->recovery_cp = MaxSector;
6846        else
6847                mddev->recovery_cp = 0;
6848        mddev->persistent    = ! info->not_persistent;
6849        mddev->external      = 0;
6850
6851        mddev->layout        = info->layout;
6852        mddev->chunk_sectors = info->chunk_size >> 9;
6853
6854        if (mddev->persistent) {
6855                mddev->max_disks = MD_SB_DISKS;
6856                mddev->flags = 0;
6857                mddev->sb_flags = 0;
6858        }
6859        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6860
6861        mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6862        mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6863        mddev->bitmap_info.offset = 0;
6864
6865        mddev->reshape_position = MaxSector;
6866
6867        /*
6868         * Generate a 128 bit UUID
6869         */
6870        get_random_bytes(mddev->uuid, 16);
6871
6872        mddev->new_level = mddev->level;
6873        mddev->new_chunk_sectors = mddev->chunk_sectors;
6874        mddev->new_layout = mddev->layout;
6875        mddev->delta_disks = 0;
6876        mddev->reshape_backwards = 0;
6877
6878        return 0;
6879}
6880
6881void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6882{
6883        lockdep_assert_held(&mddev->reconfig_mutex);
6884
6885        if (mddev->external_size)
6886                return;
6887
6888        mddev->array_sectors = array_sectors;
6889}
6890EXPORT_SYMBOL(md_set_array_sectors);
6891
6892static int update_size(struct mddev *mddev, sector_t num_sectors)
6893{
6894        struct md_rdev *rdev;
6895        int rv;
6896        int fit = (num_sectors == 0);
6897        sector_t old_dev_sectors = mddev->dev_sectors;
6898
6899        if (mddev->pers->resize == NULL)
6900                return -EINVAL;
6901        /* The "num_sectors" is the number of sectors of each device that
6902         * is used.  This can only make sense for arrays with redundancy.
6903         * linear and raid0 always use whatever space is available. We can only
6904         * consider changing this number if no resync or reconstruction is
6905         * happening, and if the new size is acceptable. It must fit before the
6906         * sb_start or, if that is <data_offset, it must fit before the size
6907         * of each device.  If num_sectors is zero, we find the largest size
6908         * that fits.
6909         */
6910        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6911            mddev->sync_thread)
6912                return -EBUSY;
6913        if (mddev->ro)
6914                return -EROFS;
6915
6916        rdev_for_each(rdev, mddev) {
6917                sector_t avail = rdev->sectors;
6918
6919                if (fit && (num_sectors == 0 || num_sectors > avail))
6920                        num_sectors = avail;
6921                if (avail < num_sectors)
6922                        return -ENOSPC;
6923        }
6924        rv = mddev->pers->resize(mddev, num_sectors);
6925        if (!rv) {
6926                if (mddev_is_clustered(mddev))
6927                        md_cluster_ops->update_size(mddev, old_dev_sectors);
6928                else if (mddev->queue) {
6929                        set_capacity(mddev->gendisk, mddev->array_sectors);
6930                        revalidate_disk(mddev->gendisk);
6931                }
6932        }
6933        return rv;
6934}
6935
6936static int update_raid_disks(struct mddev *mddev, int raid_disks)
6937{
6938        int rv;
6939        struct md_rdev *rdev;
6940        /* change the number of raid disks */
6941        if (mddev->pers->check_reshape == NULL)
6942                return -EINVAL;
6943        if (mddev->ro)
6944                return -EROFS;
6945        if (raid_disks <= 0 ||
6946            (mddev->max_disks && raid_disks >= mddev->max_disks))
6947                return -EINVAL;
6948        if (mddev->sync_thread ||
6949            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6950            mddev->reshape_position != MaxSector)
6951                return -EBUSY;
6952
6953        rdev_for_each(rdev, mddev) {
6954                if (mddev->raid_disks < raid_disks &&
6955                    rdev->data_offset < rdev->new_data_offset)
6956                        return -EINVAL;
6957                if (mddev->raid_disks > raid_disks &&
6958                    rdev->data_offset > rdev->new_data_offset)
6959                        return -EINVAL;
6960        }
6961
6962        mddev->delta_disks = raid_disks - mddev->raid_disks;
6963        if (mddev->delta_disks < 0)
6964                mddev->reshape_backwards = 1;
6965        else if (mddev->delta_disks > 0)
6966                mddev->reshape_backwards = 0;
6967
6968        rv = mddev->pers->check_reshape(mddev);
6969        if (rv < 0) {
6970                mddev->delta_disks = 0;
6971                mddev->reshape_backwards = 0;
6972        }
6973        return rv;
6974}
6975
6976/*
6977 * update_array_info is used to change the configuration of an
6978 * on-line array.
6979 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
6980 * fields in the info are checked against the array.
6981 * Any differences that cannot be handled will cause an error.
6982 * Normally, only one change can be managed at a time.
6983 */
6984static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6985{
6986        int rv = 0;
6987        int cnt = 0;
6988        int state = 0;
6989
6990        /* calculate expected state,ignoring low bits */
6991        if (mddev->bitmap && mddev->bitmap_info.offset)
6992                state |= (1 << MD_SB_BITMAP_PRESENT);
6993
6994        if (mddev->major_version != info->major_version ||
6995            mddev->minor_version != info->minor_version ||
6996/*          mddev->patch_version != info->patch_version || */
6997            mddev->ctime         != info->ctime         ||
6998            mddev->level         != info->level         ||
6999/*          mddev->layout        != info->layout        || */
7000            mddev->persistent    != !info->not_persistent ||
7001            mddev->chunk_sectors != info->chunk_size >> 9 ||
7002            /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7003            ((state^info->state) & 0xfffffe00)
7004                )
7005                return -EINVAL;
7006        /* Check there is only one change */
7007        if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7008                cnt++;
7009        if (mddev->raid_disks != info->raid_disks)
7010                cnt++;
7011        if (mddev->layout != info->layout)
7012                cnt++;
7013        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7014                cnt++;
7015        if (cnt == 0)
7016                return 0;
7017        if (cnt > 1)
7018                return -EINVAL;
7019
7020        if (mddev->layout != info->layout) {
7021                /* Change layout
7022                 * we don't need to do anything at the md level, the
7023                 * personality will take care of it all.
7024                 */
7025                if (mddev->pers->check_reshape == NULL)
7026                        return -EINVAL;
7027                else {
7028                        mddev->new_layout = info->layout;
7029                        rv = mddev->pers->check_reshape(mddev);
7030                        if (rv)
7031                                mddev->new_layout = mddev->layout;
7032                        return rv;
7033                }
7034        }
7035        if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7036                rv = update_size(mddev, (sector_t)info->size * 2);
7037
7038        if (mddev->raid_disks    != info->raid_disks)
7039                rv = update_raid_disks(mddev, info->raid_disks);
7040
7041        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7042                if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7043                        rv = -EINVAL;
7044                        goto err;
7045                }
7046                if (mddev->recovery || mddev->sync_thread) {
7047                        rv = -EBUSY;
7048                        goto err;
7049                }
7050                if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7051                        struct bitmap *bitmap;
7052                        /* add the bitmap */
7053                        if (mddev->bitmap) {
7054                                rv = -EEXIST;
7055                                goto err;
7056                        }
7057                        if (mddev->bitmap_info.default_offset == 0) {
7058                                rv = -EINVAL;
7059                                goto err;
7060                        }
7061                        mddev->bitmap_info.offset =
7062                                mddev->bitmap_info.default_offset;
7063                        mddev->bitmap_info.space =
7064                                mddev->bitmap_info.default_space;
7065                        bitmap = md_bitmap_create(mddev, -1);
7066                        mddev_suspend(mddev);
7067                        if (!IS_ERR(bitmap)) {
7068                                mddev->bitmap = bitmap;
7069                                rv = md_bitmap_load(mddev);
7070                        } else
7071                                rv = PTR_ERR(bitmap);
7072                        if (rv)
7073                                md_bitmap_destroy(mddev);
7074                        mddev_resume(mddev);
7075                } else {
7076                        /* remove the bitmap */
7077                        if (!mddev->bitmap) {
7078                                rv = -ENOENT;
7079                                goto err;
7080                        }
7081                        if (mddev->bitmap->storage.file) {
7082                                rv = -EINVAL;
7083                                goto err;
7084                        }
7085                        if (mddev->bitmap_info.nodes) {
7086                                /* hold PW on all the bitmap lock */
7087                                if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7088                                        pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7089                                        rv = -EPERM;
7090                                        md_cluster_ops->unlock_all_bitmaps(mddev);
7091                                        goto err;
7092                                }
7093
7094                                mddev->bitmap_info.nodes = 0;
7095                                md_cluster_ops->leave(mddev);
7096                        }
7097                        mddev_suspend(mddev);
7098                        md_bitmap_destroy(mddev);
7099                        mddev_resume(mddev);
7100                        mddev->bitmap_info.offset = 0;
7101                }
7102        }
7103        md_update_sb(mddev, 1);
7104        return rv;
7105err:
7106        return rv;
7107}
7108
7109static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7110{
7111        struct md_rdev *rdev;
7112        int err = 0;
7113
7114        if (mddev->pers == NULL)
7115                return -ENODEV;
7116
7117        rcu_read_lock();
7118        rdev = md_find_rdev_rcu(mddev, dev);
7119        if (!rdev)
7120                err =  -ENODEV;
7121        else {
7122                md_error(mddev, rdev);
7123                if (!test_bit(Faulty, &rdev->flags))
7124                        err = -EBUSY;
7125        }
7126        rcu_read_unlock();
7127        return err;
7128}
7129
7130/*
7131 * We have a problem here : there is no easy way to give a CHS
7132 * virtual geometry. We currently pretend that we have a 2 heads
7133 * 4 sectors (with a BIG number of cylinders...). This drives
7134 * dosfs just mad... ;-)
7135 */
7136static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7137{
7138        struct mddev *mddev = bdev->bd_disk->private_data;
7139
7140        geo->heads = 2;
7141        geo->sectors = 4;
7142        geo->cylinders = mddev->array_sectors / 8;
7143        return 0;
7144}
7145
7146static inline bool md_ioctl_valid(unsigned int cmd)
7147{
7148        switch (cmd) {
7149        case ADD_NEW_DISK:
7150        case BLKROSET:
7151        case GET_ARRAY_INFO:
7152        case GET_BITMAP_FILE:
7153        case GET_DISK_INFO:
7154        case HOT_ADD_DISK:
7155        case HOT_REMOVE_DISK:
7156        case RAID_AUTORUN:
7157        case RAID_VERSION:
7158        case RESTART_ARRAY_RW:
7159        case RUN_ARRAY:
7160        case SET_ARRAY_INFO:
7161        case SET_BITMAP_FILE:
7162        case SET_DISK_FAULTY:
7163        case STOP_ARRAY:
7164        case STOP_ARRAY_RO:
7165        case CLUSTERED_DISK_NACK:
7166                return true;
7167        default:
7168                return false;
7169        }
7170}
7171
7172static int md_ioctl(struct block_device *bdev, fmode_t mode,
7173                        unsigned int cmd, unsigned long arg)
7174{
7175        int err = 0;
7176        void __user *argp = (void __user *)arg;
7177        struct mddev *mddev = NULL;
7178        int ro;
7179        bool did_set_md_closing = false;
7180
7181        if (!md_ioctl_valid(cmd))
7182                return -ENOTTY;
7183
7184        switch (cmd) {
7185        case RAID_VERSION:
7186        case GET_ARRAY_INFO:
7187        case GET_DISK_INFO:
7188                break;
7189        default:
7190                if (!capable(CAP_SYS_ADMIN))
7191                        return -EACCES;
7192        }
7193
7194        /*
7195         * Commands dealing with the RAID driver but not any
7196         * particular array:
7197         */
7198        switch (cmd) {
7199        case RAID_VERSION:
7200                err = get_version(argp);
7201                goto out;
7202
7203#ifndef MODULE
7204        case RAID_AUTORUN:
7205                err = 0;
7206                autostart_arrays(arg);
7207                goto out;
7208#endif
7209        default:;
7210        }
7211
7212        /*
7213         * Commands creating/starting a new array:
7214         */
7215
7216        mddev = bdev->bd_disk->private_data;
7217
7218        if (!mddev) {
7219                BUG();
7220                goto out;
7221        }
7222
7223        /* Some actions do not requires the mutex */
7224        switch (cmd) {
7225        case GET_ARRAY_INFO:
7226                if (!mddev->raid_disks && !mddev->external)
7227                        err = -ENODEV;
7228                else
7229                        err = get_array_info(mddev, argp);
7230                goto out;
7231
7232        case GET_DISK_INFO:
7233                if (!mddev->raid_disks && !mddev->external)
7234                        err = -ENODEV;
7235                else
7236                        err = get_disk_info(mddev, argp);
7237                goto out;
7238
7239        case SET_DISK_FAULTY:
7240                err = set_disk_faulty(mddev, new_decode_dev(arg));
7241                goto out;
7242
7243        case GET_BITMAP_FILE:
7244                err = get_bitmap_file(mddev, argp);
7245                goto out;
7246
7247        }
7248
7249        if (cmd == ADD_NEW_DISK)
7250                /* need to ensure md_delayed_delete() has completed */
7251                flush_workqueue(md_misc_wq);
7252
7253        if (cmd == HOT_REMOVE_DISK)
7254                /* need to ensure recovery thread has run */
7255                wait_event_interruptible_timeout(mddev->sb_wait,
7256                                                 !test_bit(MD_RECOVERY_NEEDED,
7257                                                           &mddev->recovery),
7258                                                 msecs_to_jiffies(5000));
7259        if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7260                /* Need to flush page cache, and ensure no-one else opens
7261                 * and writes
7262                 */
7263                mutex_lock(&mddev->open_mutex);
7264                if (mddev->pers && atomic_read(&mddev->openers) > 1) {
7265                        mutex_unlock(&mddev->open_mutex);
7266                        err = -EBUSY;
7267                        goto out;
7268                }
7269                WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
7270                set_bit(MD_CLOSING, &mddev->flags);
7271                did_set_md_closing = true;
7272                mutex_unlock(&mddev->open_mutex);
7273                sync_blockdev(bdev);
7274        }
7275        err = mddev_lock(mddev);
7276        if (err) {
7277                pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7278                         err, cmd);
7279                goto out;
7280        }
7281
7282        if (cmd == SET_ARRAY_INFO) {
7283                mdu_array_info_t info;
7284                if (!arg)
7285                        memset(&info, 0, sizeof(info));
7286                else if (copy_from_user(&info, argp, sizeof(info))) {
7287                        err = -EFAULT;
7288                        goto unlock;
7289                }
7290                if (mddev->pers) {
7291                        err = update_array_info(mddev, &info);
7292                        if (err) {
7293                                pr_warn("md: couldn't update array info. %d\n", err);
7294                                goto unlock;
7295                        }
7296                        goto unlock;
7297                }
7298                if (!list_empty(&mddev->disks)) {
7299                        pr_warn("md: array %s already has disks!\n", mdname(mddev));
7300                        err = -EBUSY;
7301                        goto unlock;
7302                }
7303                if (mddev->raid_disks) {
7304                        pr_warn("md: array %s already initialised!\n", mdname(mddev));
7305                        err = -EBUSY;
7306                        goto unlock;
7307                }
7308                err = set_array_info(mddev, &info);
7309                if (err) {
7310                        pr_warn("md: couldn't set array info. %d\n", err);
7311                        goto unlock;
7312                }
7313                goto unlock;
7314        }
7315
7316        /*
7317         * Commands querying/configuring an existing array:
7318         */
7319        /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7320         * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7321        if ((!mddev->raid_disks && !mddev->external)
7322            && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7323            && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7324            && cmd != GET_BITMAP_FILE) {
7325                err = -ENODEV;
7326                goto unlock;
7327        }
7328
7329        /*
7330         * Commands even a read-only array can execute:
7331         */
7332        switch (cmd) {
7333        case RESTART_ARRAY_RW:
7334                err = restart_array(mddev);
7335                goto unlock;
7336
7337        case STOP_ARRAY:
7338                err = do_md_stop(mddev, 0, bdev);
7339                goto unlock;
7340
7341        case STOP_ARRAY_RO:
7342                err = md_set_readonly(mddev, bdev);
7343                goto unlock;
7344
7345        case HOT_REMOVE_DISK:
7346                err = hot_remove_disk(mddev, new_decode_dev(arg));
7347                goto unlock;
7348
7349        case ADD_NEW_DISK:
7350                /* We can support ADD_NEW_DISK on read-only arrays
7351                 * only if we are re-adding a preexisting device.
7352                 * So require mddev->pers and MD_DISK_SYNC.
7353                 */
7354                if (mddev->pers) {
7355                        mdu_disk_info_t info;
7356                        if (copy_from_user(&info, argp, sizeof(info)))
7357                                err = -EFAULT;
7358                        else if (!(info.state & (1<<MD_DISK_SYNC)))
7359                                /* Need to clear read-only for this */
7360                                break;
7361                        else
7362                                err = add_new_disk(mddev, &info);
7363                        goto unlock;
7364                }
7365                break;
7366
7367        case BLKROSET:
7368                if (get_user(ro, (int __user *)(arg))) {
7369                        err = -EFAULT;
7370                        goto unlock;
7371                }
7372                err = -EINVAL;
7373
7374                /* if the bdev is going readonly the value of mddev->ro
7375                 * does not matter, no writes are coming
7376                 */
7377                if (ro)
7378                        goto unlock;
7379
7380                /* are we are already prepared for writes? */
7381                if (mddev->ro != 1)
7382                        goto unlock;
7383
7384                /* transitioning to readauto need only happen for
7385                 * arrays that call md_write_start
7386                 */
7387                if (mddev->pers) {
7388                        err = restart_array(mddev);
7389                        if (err == 0) {
7390                                mddev->ro = 2;
7391                                set_disk_ro(mddev->gendisk, 0);
7392                        }
7393                }
7394                goto unlock;
7395        }
7396
7397        /*
7398         * The remaining ioctls are changing the state of the
7399         * superblock, so we do not allow them on read-only arrays.
7400         */
7401        if (mddev->ro && mddev->pers) {
7402                if (mddev->ro == 2) {
7403                        mddev->ro = 0;
7404                        sysfs_notify_dirent_safe(mddev->sysfs_state);
7405                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7406                        /* mddev_unlock will wake thread */
7407                        /* If a device failed while we were read-only, we
7408                         * need to make sure the metadata is updated now.
7409                         */
7410                        if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7411                                mddev_unlock(mddev);
7412                                wait_event(mddev->sb_wait,
7413                                           !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7414                                           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7415                                mddev_lock_nointr(mddev);
7416                        }
7417                } else {
7418                        err = -EROFS;
7419                        goto unlock;
7420                }
7421        }
7422
7423        switch (cmd) {
7424        case ADD_NEW_DISK:
7425        {
7426                mdu_disk_info_t info;
7427                if (copy_from_user(&info, argp, sizeof(info)))
7428                        err = -EFAULT;
7429                else
7430                        err = add_new_disk(mddev, &info);
7431                goto unlock;
7432        }
7433
7434        case CLUSTERED_DISK_NACK:
7435                if (mddev_is_clustered(mddev))
7436                        md_cluster_ops->new_disk_ack(mddev, false);
7437                else
7438                        err = -EINVAL;
7439                goto unlock;
7440
7441        case HOT_ADD_DISK:
7442                err = hot_add_disk(mddev, new_decode_dev(arg));
7443                goto unlock;
7444
7445        case RUN_ARRAY:
7446                err = do_md_run(mddev);
7447                goto unlock;
7448
7449        case SET_BITMAP_FILE:
7450                err = set_bitmap_file(mddev, (int)arg);
7451                goto unlock;
7452
7453        default:
7454                err = -EINVAL;
7455                goto unlock;
7456        }
7457
7458unlock:
7459        if (mddev->hold_active == UNTIL_IOCTL &&
7460            err != -EINVAL)
7461                mddev->hold_active = 0;
7462        mddev_unlock(mddev);
7463out:
7464        if(did_set_md_closing)
7465                clear_bit(MD_CLOSING, &mddev->flags);
7466        return err;
7467}
7468#ifdef CONFIG_COMPAT
7469static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
7470                    unsigned int cmd, unsigned long arg)
7471{
7472        switch (cmd) {
7473        case HOT_REMOVE_DISK:
7474        case HOT_ADD_DISK:
7475        case SET_DISK_FAULTY:
7476        case SET_BITMAP_FILE:
7477                /* These take in integer arg, do not convert */
7478                break;
7479        default:
7480                arg = (unsigned long)compat_ptr(arg);
7481                break;
7482        }
7483
7484        return md_ioctl(bdev, mode, cmd, arg);
7485}
7486#endif /* CONFIG_COMPAT */
7487
7488static int md_open(struct block_device *bdev, fmode_t mode)
7489{
7490        /*
7491         * Succeed if we can lock the mddev, which confirms that
7492         * it isn't being stopped right now.
7493         */
7494        struct mddev *mddev = mddev_find(bdev->bd_dev);
7495        int err;
7496
7497        if (!mddev)
7498                return -ENODEV;
7499
7500        if (mddev->gendisk != bdev->bd_disk) {
7501                /* we are racing with mddev_put which is discarding this
7502                 * bd_disk.
7503                 */
7504                mddev_put(mddev);
7505                /* Wait until bdev->bd_disk is definitely gone */
7506                flush_workqueue(md_misc_wq);
7507                /* Then retry the open from the top */
7508                return -ERESTARTSYS;
7509        }
7510        BUG_ON(mddev != bdev->bd_disk->private_data);
7511
7512        if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
7513                goto out;
7514
7515        if (test_bit(MD_CLOSING, &mddev->flags)) {
7516                mutex_unlock(&mddev->open_mutex);
7517                err = -ENODEV;
7518                goto out;
7519        }
7520
7521        err = 0;
7522        atomic_inc(&mddev->openers);
7523        mutex_unlock(&mddev->open_mutex);
7524
7525        check_disk_change(bdev);
7526 out:
7527        if (err)
7528                mddev_put(mddev);
7529        return err;
7530}
7531
7532static void md_release(struct gendisk *disk, fmode_t mode)
7533{
7534        struct mddev *mddev = disk->private_data;
7535
7536        BUG_ON(!mddev);
7537        atomic_dec(&mddev->openers);
7538        mddev_put(mddev);
7539}
7540
7541static int md_media_changed(struct gendisk *disk)
7542{
7543        struct mddev *mddev = disk->private_data;
7544
7545        return mddev->changed;
7546}
7547
7548static int md_revalidate(struct gendisk *disk)
7549{
7550        struct mddev *mddev = disk->private_data;
7551
7552        mddev->changed = 0;
7553        return 0;
7554}
7555static const struct block_device_operations md_fops =
7556{
7557        .owner          = THIS_MODULE,
7558        .open           = md_open,
7559        .release        = md_release,
7560        .ioctl          = md_ioctl,
7561#ifdef CONFIG_COMPAT
7562        .compat_ioctl   = md_compat_ioctl,
7563#endif
7564        .getgeo         = md_getgeo,
7565        .media_changed  = md_media_changed,
7566        .revalidate_disk= md_revalidate,
7567};
7568
7569static int md_thread(void *arg)
7570{
7571        struct md_thread *thread = arg;
7572
7573        /*
7574         * md_thread is a 'system-thread', it's priority should be very
7575         * high. We avoid resource deadlocks individually in each
7576         * raid personality. (RAID5 does preallocation) We also use RR and
7577         * the very same RT priority as kswapd, thus we will never get
7578         * into a priority inversion deadlock.
7579         *
7580         * we definitely have to have equal or higher priority than
7581         * bdflush, otherwise bdflush will deadlock if there are too
7582         * many dirty RAID5 blocks.
7583         */
7584
7585        allow_signal(SIGKILL);
7586        while (!kthread_should_stop()) {
7587
7588                /* We need to wait INTERRUPTIBLE so that
7589                 * we don't add to the load-average.
7590                 * That means we need to be sure no signals are
7591                 * pending
7592                 */
7593                if (signal_pending(current))
7594                        flush_signals(current);
7595
7596                wait_event_interruptible_timeout
7597                        (thread->wqueue,
7598                         test_bit(THREAD_WAKEUP, &thread->flags)
7599                         || kthread_should_stop() || kthread_should_park(),
7600                         thread->timeout);
7601
7602                clear_bit(THREAD_WAKEUP, &thread->flags);
7603                if (kthread_should_park())
7604                        kthread_parkme();
7605                if (!kthread_should_stop())
7606                        thread->run(thread);
7607        }
7608
7609        return 0;
7610}
7611
7612void md_wakeup_thread(struct md_thread *thread)
7613{
7614        if (thread) {
7615                pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7616                set_bit(THREAD_WAKEUP, &thread->flags);
7617                wake_up(&thread->wqueue);
7618        }
7619}
7620EXPORT_SYMBOL(md_wakeup_thread);
7621
7622struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7623                struct mddev *mddev, const char *name)
7624{
7625        struct md_thread *thread;
7626
7627        thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
7628        if (!thread)
7629                return NULL;
7630
7631        init_waitqueue_head(&thread->wqueue);
7632
7633        thread->run = run;
7634        thread->mddev = mddev;
7635        thread->timeout = MAX_SCHEDULE_TIMEOUT;
7636        thread->tsk = kthread_run(md_thread, thread,
7637                                  "%s_%s",
7638                                  mdname(thread->mddev),
7639                                  name);
7640        if (IS_ERR(thread->tsk)) {
7641                kfree(thread);
7642                return NULL;
7643        }
7644        return thread;
7645}
7646EXPORT_SYMBOL(md_register_thread);
7647
7648void md_unregister_thread(struct md_thread **threadp)
7649{
7650        struct md_thread *thread = *threadp;
7651        if (!thread)
7652                return;
7653        pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7654        /* Locking ensures that mddev_unlock does not wake_up a
7655         * non-existent thread
7656         */
7657        spin_lock(&pers_lock);
7658        *threadp = NULL;
7659        spin_unlock(&pers_lock);
7660
7661        kthread_stop(thread->tsk);
7662        kfree(thread);
7663}
7664EXPORT_SYMBOL(md_unregister_thread);
7665
7666void md_error(struct mddev *mddev, struct md_rdev *rdev)
7667{
7668        if (!rdev || test_bit(Faulty, &rdev->flags))
7669                return;
7670
7671        if (!mddev->pers || !mddev->pers->error_handler)
7672                return;
7673        mddev->pers->error_handler(mddev,rdev);
7674        if (mddev->degraded)
7675                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7676        sysfs_notify_dirent_safe(rdev->sysfs_state);
7677        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7678        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7679        md_wakeup_thread(mddev->thread);
7680        if (mddev->event_work.func)
7681                queue_work(md_misc_wq, &mddev->event_work);
7682        md_new_event(mddev);
7683}
7684EXPORT_SYMBOL(md_error);
7685
7686/* seq_file implementation /proc/mdstat */
7687
7688static void status_unused(struct seq_file *seq)
7689{
7690        int i = 0;
7691        struct md_rdev *rdev;
7692
7693        seq_printf(seq, "unused devices: ");
7694
7695        list_for_each_entry(rdev, &pending_raid_disks, same_set) {
7696                char b[BDEVNAME_SIZE];
7697                i++;
7698                seq_printf(seq, "%s ",
7699                              bdevname(rdev->bdev,b));
7700        }
7701        if (!i)
7702                seq_printf(seq, "<none>");
7703
7704        seq_printf(seq, "\n");
7705}
7706
7707static int status_resync(struct seq_file *seq, struct mddev *mddev)
7708{
7709        sector_t max_sectors, resync, res;
7710        unsigned long dt, db = 0;
7711        sector_t rt, curr_mark_cnt, resync_mark_cnt;
7712        int scale, recovery_active;
7713        unsigned int per_milli;
7714
7715        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
7716            test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7717                max_sectors = mddev->resync_max_sectors;
7718        else
7719                max_sectors = mddev->dev_sectors;
7720
7721        resync = mddev->curr_resync;
7722        if (resync <= 3) {
7723                if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7724                        /* Still cleaning up */
7725                        resync = max_sectors;
7726        } else if (resync > max_sectors)
7727                resync = max_sectors;
7728        else
7729                resync -= atomic_read(&mddev->recovery_active);
7730
7731        if (resync == 0) {
7732                if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
7733                        struct md_rdev *rdev;
7734
7735                        rdev_for_each(rdev, mddev)
7736                                if (rdev->raid_disk >= 0 &&
7737                                    !test_bit(Faulty, &rdev->flags) &&
7738                                    rdev->recovery_offset != MaxSector &&
7739                                    rdev->recovery_offset) {
7740                                        seq_printf(seq, "\trecover=REMOTE");
7741                                        return 1;
7742                                }
7743                        if (mddev->reshape_position != MaxSector)
7744                                seq_printf(seq, "\treshape=REMOTE");
7745                        else
7746                                seq_printf(seq, "\tresync=REMOTE");
7747                        return 1;
7748                }
7749                if (mddev->recovery_cp < MaxSector) {
7750                        seq_printf(seq, "\tresync=PENDING");
7751                        return 1;
7752                }
7753                return 0;
7754        }
7755        if (resync < 3) {
7756                seq_printf(seq, "\tresync=DELAYED");
7757                return 1;
7758        }
7759
7760        WARN_ON(max_sectors == 0);
7761        /* Pick 'scale' such that (resync>>scale)*1000 will fit
7762         * in a sector_t, and (max_sectors>>scale) will fit in a
7763         * u32, as those are the requirements for sector_div.
7764         * Thus 'scale' must be at least 10
7765         */
7766        scale = 10;
7767        if (sizeof(sector_t) > sizeof(unsigned long)) {
7768                while ( max_sectors/2 > (1ULL<<(scale+32)))
7769                        scale++;
7770        }
7771        res = (resync>>scale)*1000;
7772        sector_div(res, (u32)((max_sectors>>scale)+1));
7773
7774        per_milli = res;
7775        {
7776                int i, x = per_milli/50, y = 20-x;
7777                seq_printf(seq, "[");
7778                for (i = 0; i < x; i++)
7779                        seq_printf(seq, "=");
7780                seq_printf(seq, ">");
7781                for (i = 0; i < y; i++)
7782                        seq_printf(seq, ".");
7783                seq_printf(seq, "] ");
7784        }
7785        seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
7786                   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
7787                    "reshape" :
7788                    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
7789                     "check" :
7790                     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
7791                      "resync" : "recovery"))),
7792                   per_milli/10, per_milli % 10,
7793                   (unsigned long long) resync/2,
7794                   (unsigned long long) max_sectors/2);
7795
7796        /*
7797         * dt: time from mark until now
7798         * db: blocks written from mark until now
7799         * rt: remaining time
7800         *
7801         * rt is a sector_t, which is always 64bit now. We are keeping
7802         * the original algorithm, but it is not really necessary.
7803         *
7804         * Original algorithm:
7805         *   So we divide before multiply in case it is 32bit and close
7806         *   to the limit.
7807         *   We scale the divisor (db) by 32 to avoid losing precision
7808         *   near the end of resync when the number of remaining sectors
7809         *   is close to 'db'.
7810         *   We then divide rt by 32 after multiplying by db to compensate.
7811         *   The '+1' avoids division by zero if db is very small.
7812         */
7813        dt = ((jiffies - mddev->resync_mark) / HZ);
7814        if (!dt) dt++;
7815
7816        curr_mark_cnt = mddev->curr_mark_cnt;
7817        recovery_active = atomic_read(&mddev->recovery_active);
7818        resync_mark_cnt = mddev->resync_mark_cnt;
7819
7820        if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
7821                db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
7822
7823        rt = max_sectors - resync;    /* number of remaining sectors */
7824        rt = div64_u64(rt, db/32+1);
7825        rt *= dt;
7826        rt >>= 5;
7827
7828        seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
7829                   ((unsigned long)rt % 60)/6);
7830
7831        seq_printf(seq, " speed=%ldK/sec", db/2/dt);
7832        return 1;
7833}
7834
7835static void *md_seq_start(struct seq_file *seq, loff_t *pos)
7836{
7837        struct list_head *tmp;
7838        loff_t l = *pos;
7839        struct mddev *mddev;
7840
7841        if (l >= 0x10000)
7842                return NULL;
7843        if (!l--)
7844                /* header */
7845                return (void*)1;
7846
7847        spin_lock(&all_mddevs_lock);
7848        list_for_each(tmp,&all_mddevs)
7849                if (!l--) {
7850                        mddev = list_entry(tmp, struct mddev, all_mddevs);
7851                        mddev_get(mddev);
7852                        spin_unlock(&all_mddevs_lock);
7853                        return mddev;
7854                }
7855        spin_unlock(&all_mddevs_lock);
7856        if (!l--)
7857                return (void*)2;/* tail */
7858        return NULL;
7859}
7860
7861static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
7862{
7863        struct list_head *tmp;
7864        struct mddev *next_mddev, *mddev = v;
7865
7866        ++*pos;
7867        if (v == (void*)2)
7868                return NULL;
7869
7870        spin_lock(&all_mddevs_lock);
7871        if (v == (void*)1)
7872                tmp = all_mddevs.next;
7873        else
7874                tmp = mddev->all_mddevs.next;
7875        if (tmp != &all_mddevs)
7876                next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
7877        else {
7878                next_mddev = (void*)2;
7879                *pos = 0x10000;
7880        }
7881        spin_unlock(&all_mddevs_lock);
7882
7883        if (v != (void*)1)
7884                mddev_put(mddev);
7885        return next_mddev;
7886
7887}
7888
7889static void md_seq_stop(struct seq_file *seq, void *v)
7890{
7891        struct mddev *mddev = v;
7892
7893        if (mddev && v != (void*)1 && v != (void*)2)
7894                mddev_put(mddev);
7895}
7896
7897static int md_seq_show(struct seq_file *seq, void *v)
7898{
7899        struct mddev *mddev = v;
7900        sector_t sectors;
7901        struct md_rdev *rdev;
7902
7903        if (v == (void*)1) {
7904                struct md_personality *pers;
7905                seq_printf(seq, "Personalities : ");
7906                spin_lock(&pers_lock);
7907                list_for_each_entry(pers, &pers_list, list)
7908                        seq_printf(seq, "[%s] ", pers->name);
7909
7910                spin_unlock(&pers_lock);
7911                seq_printf(seq, "\n");
7912                seq->poll_event = atomic_read(&md_event_count);
7913                return 0;
7914        }
7915        if (v == (void*)2) {
7916                status_unused(seq);
7917                return 0;
7918        }
7919
7920        spin_lock(&mddev->lock);
7921        if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7922                seq_printf(seq, "%s : %sactive", mdname(mddev),
7923                                                mddev->pers ? "" : "in");
7924                if (mddev->pers) {
7925                        if (mddev->ro==1)
7926                                seq_printf(seq, " (read-only)");
7927                        if (mddev->ro==2)
7928                                seq_printf(seq, " (auto-read-only)");
7929                        seq_printf(seq, " %s", mddev->pers->name);
7930                }
7931
7932                sectors = 0;
7933                rcu_read_lock();
7934                rdev_for_each_rcu(rdev, mddev) {
7935                        char b[BDEVNAME_SIZE];
7936                        seq_printf(seq, " %s[%d]",
7937                                bdevname(rdev->bdev,b), rdev->desc_nr);
7938                        if (test_bit(WriteMostly, &rdev->flags))
7939                                seq_printf(seq, "(W)");
7940                        if (test_bit(Journal, &rdev->flags))
7941                                seq_printf(seq, "(J)");
7942                        if (test_bit(Faulty, &rdev->flags)) {
7943                                seq_printf(seq, "(F)");
7944                                continue;
7945                        }
7946                        if (rdev->raid_disk < 0)
7947                                seq_printf(seq, "(S)"); /* spare */
7948                        if (test_bit(Replacement, &rdev->flags))
7949                                seq_printf(seq, "(R)");
7950                        sectors += rdev->sectors;
7951                }
7952                rcu_read_unlock();
7953
7954                if (!list_empty(&mddev->disks)) {
7955                        if (mddev->pers)
7956                                seq_printf(seq, "\n      %llu blocks",
7957                                           (unsigned long long)
7958                                           mddev->array_sectors / 2);
7959                        else
7960                                seq_printf(seq, "\n      %llu blocks",
7961                                           (unsigned long long)sectors / 2);
7962                }
7963                if (mddev->persistent) {
7964                        if (mddev->major_version != 0 ||
7965                            mddev->minor_version != 90) {
7966                                seq_printf(seq," super %d.%d",
7967                                           mddev->major_version,
7968                                           mddev->minor_version);
7969                        }
7970                } else if (mddev->external)
7971                        seq_printf(seq, " super external:%s",
7972                                   mddev->metadata_type);
7973                else
7974                        seq_printf(seq, " super non-persistent");
7975
7976                if (mddev->pers) {
7977                        mddev->pers->status(seq, mddev);
7978                        seq_printf(seq, "\n      ");
7979                        if (mddev->pers->sync_request) {
7980                                if (status_resync(seq, mddev))
7981                                        seq_printf(seq, "\n      ");
7982                        }
7983                } else
7984                        seq_printf(seq, "\n       ");
7985
7986                md_bitmap_status(seq, mddev->bitmap);
7987
7988                seq_printf(seq, "\n");
7989        }
7990        spin_unlock(&mddev->lock);
7991
7992        return 0;
7993}
7994
7995static const struct seq_operations md_seq_ops = {
7996        .start  = md_seq_start,
7997        .next   = md_seq_next,
7998        .stop   = md_seq_stop,
7999        .show   = md_seq_show,
8000};
8001
8002static int md_seq_open(struct inode *inode, struct file *file)
8003{
8004        struct seq_file *seq;
8005        int error;
8006
8007        error = seq_open(file, &md_seq_ops);
8008        if (error)
8009                return error;
8010
8011        seq = file->private_data;
8012        seq->poll_event = atomic_read(&md_event_count);
8013        return error;
8014}
8015
8016static int md_unloading;
8017static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8018{
8019        struct seq_file *seq = filp->private_data;
8020        __poll_t mask;
8021
8022        if (md_unloading)
8023                return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8024        poll_wait(filp, &md_event_waiters, wait);
8025
8026        /* always allow read */
8027        mask = EPOLLIN | EPOLLRDNORM;
8028
8029        if (seq->poll_event != atomic_read(&md_event_count))
8030                mask |= EPOLLERR | EPOLLPRI;
8031        return mask;
8032}
8033
8034static const struct file_operations md_seq_fops = {
8035        .owner          = THIS_MODULE,
8036        .open           = md_seq_open,
8037        .read           = seq_read,
8038        .llseek         = seq_lseek,
8039        .release        = seq_release,
8040        .poll           = mdstat_poll,
8041};
8042
8043int register_md_personality(struct md_personality *p)
8044{
8045        pr_debug("md: %s personality registered for level %d\n",
8046                 p->name, p->level);
8047        spin_lock(&pers_lock);
8048        list_add_tail(&p->list, &pers_list);
8049        spin_unlock(&pers_lock);
8050        return 0;
8051}
8052EXPORT_SYMBOL(register_md_personality);
8053
8054int unregister_md_personality(struct md_personality *p)
8055{
8056        pr_debug("md: %s personality unregistered\n", p->name);
8057        spin_lock(&pers_lock);
8058        list_del_init(&p->list);
8059        spin_unlock(&pers_lock);
8060        return 0;
8061}
8062EXPORT_SYMBOL(unregister_md_personality);
8063
8064int register_md_cluster_operations(struct md_cluster_operations *ops,
8065                                   struct module *module)
8066{
8067        int ret = 0;
8068        spin_lock(&pers_lock);
8069        if (md_cluster_ops != NULL)
8070                ret = -EALREADY;
8071        else {
8072                md_cluster_ops = ops;
8073                md_cluster_mod = module;
8074        }
8075        spin_unlock(&pers_lock);
8076        return ret;
8077}
8078EXPORT_SYMBOL(register_md_cluster_operations);
8079
8080int unregister_md_cluster_operations(void)
8081{
8082        spin_lock(&pers_lock);
8083        md_cluster_ops = NULL;
8084        spin_unlock(&pers_lock);
8085        return 0;
8086}
8087EXPORT_SYMBOL(unregister_md_cluster_operations);
8088
8089int md_setup_cluster(struct mddev *mddev, int nodes)
8090{
8091        if (!md_cluster_ops)
8092                request_module("md-cluster");
8093        spin_lock(&pers_lock);
8094        /* ensure module won't be unloaded */
8095        if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
8096                pr_warn("can't find md-cluster module or get it's reference.\n");
8097                spin_unlock(&pers_lock);
8098                return -ENOENT;
8099        }
8100        spin_unlock(&pers_lock);
8101
8102        return md_cluster_ops->join(mddev, nodes);
8103}
8104
8105void md_cluster_stop(struct mddev *mddev)
8106{
8107        if (!md_cluster_ops)
8108                return;
8109        md_cluster_ops->leave(mddev);
8110        module_put(md_cluster_mod);
8111}
8112
8113static int is_mddev_idle(struct mddev *mddev, int init)
8114{
8115        struct md_rdev *rdev;
8116        int idle;
8117        int curr_events;
8118
8119        idle = 1;
8120        rcu_read_lock();
8121        rdev_for_each_rcu(rdev, mddev) {
8122                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
8123                curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
8124                              atomic_read(&disk->sync_io);
8125                /* sync IO will cause sync_io to increase before the disk_stats
8126                 * as sync_io is counted when a request starts, and
8127                 * disk_stats is counted when it completes.
8128                 * So resync activity will cause curr_events to be smaller than
8129                 * when there was no such activity.
8130                 * non-sync IO will cause disk_stat to increase without
8131                 * increasing sync_io so curr_events will (eventually)
8132                 * be larger than it was before.  Once it becomes
8133                 * substantially larger, the test below will cause
8134                 * the array to appear non-idle, and resync will slow
8135                 * down.
8136                 * If there is a lot of outstanding resync activity when
8137                 * we set last_event to curr_events, then all that activity
8138                 * completing might cause the array to appear non-idle
8139                 * and resync will be slowed down even though there might
8140                 * not have been non-resync activity.  This will only
8141                 * happen once though.  'last_events' will soon reflect
8142                 * the state where there is little or no outstanding
8143                 * resync requests, and further resync activity will
8144                 * always make curr_events less than last_events.
8145                 *
8146                 */
8147                if (init || curr_events - rdev->last_events > 64) {
8148                        rdev->last_events = curr_events;
8149                        idle = 0;
8150                }
8151        }
8152        rcu_read_unlock();
8153        return idle;
8154}
8155
8156void md_done_sync(struct mddev *mddev, int blocks, int ok)
8157{
8158        /* another "blocks" (512byte) blocks have been synced */
8159        atomic_sub(blocks, &mddev->recovery_active);
8160        wake_up(&mddev->recovery_wait);
8161        if (!ok) {
8162                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8163                set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8164                md_wakeup_thread(mddev->thread);
8165                // stop recovery, signal do_sync ....
8166        }
8167}
8168EXPORT_SYMBOL(md_done_sync);
8169
8170/* md_write_start(mddev, bi)
8171 * If we need to update some array metadata (e.g. 'active' flag
8172 * in superblock) before writing, schedule a superblock update
8173 * and wait for it to complete.
8174 * A return value of 'false' means that the write wasn't recorded
8175 * and cannot proceed as the array is being suspend.
8176 */
8177bool md_write_start(struct mddev *mddev, struct bio *bi)
8178{
8179        int did_change = 0;
8180
8181        if (bio_data_dir(bi) != WRITE)
8182                return true;
8183
8184        BUG_ON(mddev->ro == 1);
8185        if (mddev->ro == 2) {
8186                /* need to switch to read/write */
8187                mddev->ro = 0;
8188                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8189                md_wakeup_thread(mddev->thread);
8190                md_wakeup_thread(mddev->sync_thread);
8191                did_change = 1;
8192        }
8193        rcu_read_lock();
8194        percpu_ref_get(&mddev->writes_pending);
8195        smp_mb(); /* Match smp_mb in set_in_sync() */
8196        if (mddev->safemode == 1)
8197                mddev->safemode = 0;
8198        /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
8199        if (mddev->in_sync || mddev->sync_checkers) {
8200                spin_lock(&mddev->lock);
8201                if (mddev->in_sync) {
8202                        mddev->in_sync = 0;
8203                        set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8204                        set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8205                        md_wakeup_thread(mddev->thread);
8206                        did_change = 1;
8207                }
8208                spin_unlock(&mddev->lock);
8209        }
8210        rcu_read_unlock();
8211        if (did_change)
8212                sysfs_notify_dirent_safe(mddev->sysfs_state);
8213        if (!mddev->has_superblocks)
8214                return true;
8215        wait_event(mddev->sb_wait,
8216                   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
8217                   mddev->suspended);
8218        if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
8219                percpu_ref_put(&mddev->writes_pending);
8220                return false;
8221        }
8222        return true;
8223}
8224EXPORT_SYMBOL(md_write_start);
8225
8226/* md_write_inc can only be called when md_write_start() has
8227 * already been called at least once of the current request.
8228 * It increments the counter and is useful when a single request
8229 * is split into several parts.  Each part causes an increment and
8230 * so needs a matching md_write_end().
8231 * Unlike md_write_start(), it is safe to call md_write_inc() inside
8232 * a spinlocked region.
8233 */
8234void md_write_inc(struct mddev *mddev, struct bio *bi)
8235{
8236        if (bio_data_dir(bi) != WRITE)
8237                return;
8238        WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8239        percpu_ref_get(&mddev->writes_pending);
8240}
8241EXPORT_SYMBOL(md_write_inc);
8242
8243void md_write_end(struct mddev *mddev)
8244{
8245        percpu_ref_put(&mddev->writes_pending);
8246
8247        if (mddev->safemode == 2)
8248                md_wakeup_thread(mddev->thread);
8249        else if (mddev->safemode_delay)
8250                /* The roundup() ensures this only performs locking once
8251                 * every ->safemode_delay jiffies
8252                 */
8253                mod_timer(&mddev->safemode_timer,
8254                          roundup(jiffies, mddev->safemode_delay) +
8255                          mddev->safemode_delay);
8256}
8257
8258EXPORT_SYMBOL(md_write_end);
8259
8260/* md_allow_write(mddev)
8261 * Calling this ensures that the array is marked 'active' so that writes
8262 * may proceed without blocking.  It is important to call this before
8263 * attempting a GFP_KERNEL allocation while holding the mddev lock.
8264 * Must be called with mddev_lock held.
8265 */
8266void md_allow_write(struct mddev *mddev)
8267{
8268        if (!mddev->pers)
8269                return;
8270        if (mddev->ro)
8271                return;
8272        if (!mddev->pers->sync_request)
8273                return;
8274
8275        spin_lock(&mddev->lock);
8276        if (mddev->in_sync) {
8277                mddev->in_sync = 0;
8278                set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8279                set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8280                if (mddev->safemode_delay &&
8281                    mddev->safemode == 0)
8282                        mddev->safemode = 1;
8283                spin_unlock(&mddev->lock);
8284                md_update_sb(mddev, 0);
8285                sysfs_notify_dirent_safe(mddev->sysfs_state);
8286                /* wait for the dirty state to be recorded in the metadata */
8287                wait_event(mddev->sb_wait,
8288                           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8289        } else
8290                spin_unlock(&mddev->lock);
8291}
8292EXPORT_SYMBOL_GPL(md_allow_write);
8293
8294#define SYNC_MARKS      10
8295#define SYNC_MARK_STEP  (3*HZ)
8296#define UPDATE_FREQUENCY (5*60*HZ)
8297void md_do_sync(struct md_thread *thread)
8298{
8299        struct mddev *mddev = thread->mddev;
8300        struct mddev *mddev2;
8301        unsigned int currspeed = 0, window;
8302        sector_t max_sectors,j, io_sectors, recovery_done;
8303        unsigned long mark[SYNC_MARKS];
8304        unsigned long update_time;
8305        sector_t mark_cnt[SYNC_MARKS];
8306        int last_mark,m;
8307        struct list_head *tmp;
8308        sector_t last_check;
8309        int skipped = 0;
8310        struct md_rdev *rdev;
8311        char *desc, *action = NULL;
8312        struct blk_plug plug;
8313        int ret;
8314
8315        /* just incase thread restarts... */
8316        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8317            test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
8318                return;
8319        if (mddev->ro) {/* never try to sync a read-only array */
8320                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8321                return;
8322        }
8323
8324        if (mddev_is_clustered(mddev)) {
8325                ret = md_cluster_ops->resync_start(mddev);
8326                if (ret)
8327                        goto skip;
8328
8329                set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8330                if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8331                        test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8332                        test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8333                     && ((unsigned long long)mddev->curr_resync_completed
8334                         < (unsigned long long)mddev->resync_max_sectors))
8335                        goto skip;
8336        }
8337
8338        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8339                if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8340                        desc = "data-check";
8341                        action = "check";
8342                } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8343                        desc = "requested-resync";
8344                        action = "repair";
8345                } else
8346                        desc = "resync";
8347        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8348                desc = "reshape";
8349        else
8350                desc = "recovery";
8351
8352        mddev->last_sync_action = action ?: desc;
8353
8354        /* we overload curr_resync somewhat here.
8355         * 0 == not engaged in resync at all
8356         * 2 == checking that there is no conflict with another sync
8357         * 1 == like 2, but have yielded to allow conflicting resync to
8358         *              commence
8359         * other == active in resync - this many blocks
8360         *
8361         * Before starting a resync we must have set curr_resync to
8362         * 2, and then checked that every "conflicting" array has curr_resync
8363         * less than ours.  When we find one that is the same or higher
8364         * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
8365         * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
8366         * This will mean we have to start checking from the beginning again.
8367         *
8368         */
8369
8370        do {
8371                int mddev2_minor = -1;
8372                mddev->curr_resync = 2;
8373
8374        try_again:
8375                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8376                        goto skip;
8377                for_each_mddev(mddev2, tmp) {
8378                        if (mddev2 == mddev)
8379                                continue;
8380                        if (!mddev->parallel_resync
8381                        &&  mddev2->curr_resync
8382                        &&  match_mddev_units(mddev, mddev2)) {
8383                                DEFINE_WAIT(wq);
8384                                if (mddev < mddev2 && mddev->curr_resync == 2) {
8385                                        /* arbitrarily yield */
8386                                        mddev->curr_resync = 1;
8387                                        wake_up(&resync_wait);
8388                                }
8389                                if (mddev > mddev2 && mddev->curr_resync == 1)
8390                                        /* no need to wait here, we can wait the next
8391                                         * time 'round when curr_resync == 2
8392                                         */
8393                                        continue;
8394                                /* We need to wait 'interruptible' so as not to
8395                                 * contribute to the load average, and not to
8396                                 * be caught by 'softlockup'
8397                                 */
8398                                prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8399                                if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8400                                    mddev2->curr_resync >= mddev->curr_resync) {
8401                                        if (mddev2_minor != mddev2->md_minor) {
8402                                                mddev2_minor = mddev2->md_minor;
8403                                                pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
8404                                                        desc, mdname(mddev),
8405                                                        mdname(mddev2));
8406                                        }
8407                                        mddev_put(mddev2);
8408                                        if (signal_pending(current))
8409                                                flush_signals(current);
8410                                        schedule();
8411                                        finish_wait(&resync_wait, &wq);
8412                                        goto try_again;
8413                                }
8414                                finish_wait(&resync_wait, &wq);
8415                        }
8416                }
8417        } while (mddev->curr_resync < 2);
8418
8419        j = 0;
8420        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8421                /* resync follows the size requested by the personality,
8422                 * which defaults to physical size, but can be virtual size
8423                 */
8424                max_sectors = mddev->resync_max_sectors;
8425                atomic64_set(&mddev->resync_mismatches, 0);
8426                /* we don't use the checkpoint if there's a bitmap */
8427                if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8428                        j = mddev->resync_min;
8429                else if (!mddev->bitmap)
8430                        j = mddev->recovery_cp;
8431
8432        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
8433                max_sectors = mddev->resync_max_sectors;
8434                /*
8435                 * If the original node aborts reshaping then we continue the
8436                 * reshaping, so set j again to avoid restart reshape from the
8437                 * first beginning
8438                 */
8439                if (mddev_is_clustered(mddev) &&
8440                    mddev->reshape_position != MaxSector)
8441                        j = mddev->reshape_position;
8442        } else {
8443                /* recovery follows the physical size of devices */
8444                max_sectors = mddev->dev_sectors;
8445                j = MaxSector;
8446                rcu_read_lock();
8447                rdev_for_each_rcu(rdev, mddev)
8448                        if (rdev->raid_disk >= 0 &&
8449                            !test_bit(Journal, &rdev->flags) &&
8450                            !test_bit(Faulty, &rdev->flags) &&
8451                            !test_bit(In_sync, &rdev->flags) &&
8452                            rdev->recovery_offset < j)
8453                                j = rdev->recovery_offset;
8454                rcu_read_unlock();
8455
8456                /* If there is a bitmap, we need to make sure all
8457                 * writes that started before we added a spare
8458                 * complete before we start doing a recovery.
8459                 * Otherwise the write might complete and (via
8460                 * bitmap_endwrite) set a bit in the bitmap after the
8461                 * recovery has checked that bit and skipped that
8462                 * region.
8463                 */
8464                if (mddev->bitmap) {
8465                        mddev->pers->quiesce(mddev, 1);
8466                        mddev->pers->quiesce(mddev, 0);
8467                }
8468        }
8469
8470        pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
8471        pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
8472        pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
8473                 speed_max(mddev), desc);
8474
8475        is_mddev_idle(mddev, 1); /* this initializes IO event counters */
8476
8477        io_sectors = 0;
8478        for (m = 0; m < SYNC_MARKS; m++) {
8479                mark[m] = jiffies;
8480                mark_cnt[m] = io_sectors;
8481        }
8482        last_mark = 0;
8483        mddev->resync_mark = mark[last_mark];
8484        mddev->resync_mark_cnt = mark_cnt[last_mark];
8485
8486        /*
8487         * Tune reconstruction:
8488         */
8489        window = 32 * (PAGE_SIZE / 512);
8490        pr_debug("md: using %dk window, over a total of %lluk.\n",
8491                 window/2, (unsigned long long)max_sectors/2);
8492
8493        atomic_set(&mddev->recovery_active, 0);
8494        last_check = 0;
8495
8496        if (j>2) {
8497                pr_debug("md: resuming %s of %s from checkpoint.\n",
8498                         desc, mdname(mddev));
8499                mddev->curr_resync = j;
8500        } else
8501                mddev->curr_resync = 3; /* no longer delayed */
8502        mddev->curr_resync_completed = j;
8503        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8504        md_new_event(mddev);
8505        update_time = jiffies;
8506
8507        blk_start_plug(&plug);
8508        while (j < max_sectors) {
8509                sector_t sectors;
8510
8511                skipped = 0;
8512
8513                if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8514                    ((mddev->curr_resync > mddev->curr_resync_completed &&
8515                      (mddev->curr_resync - mddev->curr_resync_completed)
8516                      > (max_sectors >> 4)) ||
8517                     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
8518                     (j - mddev->curr_resync_completed)*2
8519                     >= mddev->resync_max - mddev->curr_resync_completed ||
8520                     mddev->curr_resync_completed > mddev->resync_max
8521                            )) {
8522                        /* time to update curr_resync_completed */
8523                        wait_event(mddev->recovery_wait,
8524                                   atomic_read(&mddev->recovery_active) == 0);
8525                        mddev->curr_resync_completed = j;
8526                        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
8527                            j > mddev->recovery_cp)
8528                                mddev->recovery_cp = j;
8529                        update_time = jiffies;
8530                        set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8531                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8532                }
8533
8534                while (j >= mddev->resync_max &&
8535                       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8536                        /* As this condition is controlled by user-space,
8537                         * we can block indefinitely, so use '_interruptible'
8538                         * to avoid triggering warnings.
8539                         */
8540                        flush_signals(current); /* just in case */
8541                        wait_event_interruptible(mddev->recovery_wait,
8542                                                 mddev->resync_max > j
8543                                                 || test_bit(MD_RECOVERY_INTR,
8544                                                             &mddev->recovery));
8545                }
8546
8547                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8548                        break;
8549
8550                sectors = mddev->pers->sync_request(mddev, j, &skipped);
8551                if (sectors == 0) {
8552                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8553                        break;
8554                }
8555
8556                if (!skipped) { /* actual IO requested */
8557                        io_sectors += sectors;
8558                        atomic_add(sectors, &mddev->recovery_active);
8559                }
8560
8561                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8562                        break;
8563
8564                j += sectors;
8565                if (j > max_sectors)
8566                        /* when skipping, extra large numbers can be returned. */
8567                        j = max_sectors;
8568                if (j > 2)
8569                        mddev->curr_resync = j;
8570                mddev->curr_mark_cnt = io_sectors;
8571                if (last_check == 0)
8572                        /* this is the earliest that rebuild will be
8573                         * visible in /proc/mdstat
8574                         */
8575                        md_new_event(mddev);
8576
8577                if (last_check + window > io_sectors || j == max_sectors)
8578                        continue;
8579
8580                last_check = io_sectors;
8581        repeat:
8582                if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
8583                        /* step marks */
8584                        int next = (last_mark+1) % SYNC_MARKS;
8585
8586                        mddev->resync_mark = mark[next];
8587                        mddev->resync_mark_cnt = mark_cnt[next];
8588                        mark[next] = jiffies;
8589                        mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
8590                        last_mark = next;
8591                }
8592
8593                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8594                        break;
8595
8596                /*
8597                 * this loop exits only if either when we are slower than
8598                 * the 'hard' speed limit, or the system was IO-idle for
8599                 * a jiffy.
8600                 * the system might be non-idle CPU-wise, but we only care
8601                 * about not overloading the IO subsystem. (things like an
8602                 * e2fsck being done on the RAID array should execute fast)
8603                 */
8604                cond_resched();
8605
8606                recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
8607                currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
8608                        /((jiffies-mddev->resync_mark)/HZ +1) +1;
8609
8610                if (currspeed > speed_min(mddev)) {
8611                        if (currspeed > speed_max(mddev)) {
8612                                msleep(500);
8613                                goto repeat;
8614                        }
8615                        if (!is_mddev_idle(mddev, 0)) {
8616                                /*
8617                                 * Give other IO more of a chance.
8618                                 * The faster the devices, the less we wait.
8619                                 */
8620                                wait_event(mddev->recovery_wait,
8621                                           !atomic_read(&mddev->recovery_active));
8622                        }
8623                }
8624        }
8625        pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
8626                test_bit(MD_RECOVERY_INTR, &mddev->recovery)
8627                ? "interrupted" : "done");
8628        /*
8629         * this also signals 'finished resyncing' to md_stop
8630         */
8631        blk_finish_plug(&plug);
8632        wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
8633
8634        if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8635            !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8636            mddev->curr_resync > 3) {
8637                mddev->curr_resync_completed = mddev->curr_resync;
8638                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8639        }
8640        mddev->pers->sync_request(mddev, max_sectors, &skipped);
8641
8642        if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
8643            mddev->curr_resync > 3) {
8644                if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8645                        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8646                                if (mddev->curr_resync >= mddev->recovery_cp) {
8647                                        pr_debug("md: checkpointing %s of %s.\n",
8648                                                 desc, mdname(mddev));
8649                                        if (test_bit(MD_RECOVERY_ERROR,
8650                                                &mddev->recovery))
8651                                                mddev->recovery_cp =
8652                                                        mddev->curr_resync_completed;
8653                                        else
8654                                                mddev->recovery_cp =
8655                                                        mddev->curr_resync;
8656                                }
8657                        } else
8658                                mddev->recovery_cp = MaxSector;
8659                } else {
8660                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8661                                mddev->curr_resync = MaxSector;
8662                        if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8663                            test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
8664                                rcu_read_lock();
8665                                rdev_for_each_rcu(rdev, mddev)
8666                                        if (rdev->raid_disk >= 0 &&
8667                                            mddev->delta_disks >= 0 &&
8668                                            !test_bit(Journal, &rdev->flags) &&
8669                                            !test_bit(Faulty, &rdev->flags) &&
8670                                            !test_bit(In_sync, &rdev->flags) &&
8671                                            rdev->recovery_offset < mddev->curr_resync)
8672                                                rdev->recovery_offset = mddev->curr_resync;
8673                                rcu_read_unlock();
8674                        }
8675                }
8676        }
8677 skip:
8678        /* set CHANGE_PENDING here since maybe another update is needed,
8679         * so other nodes are informed. It should be harmless for normal
8680         * raid */
8681        set_mask_bits(&mddev->sb_flags, 0,
8682                      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
8683
8684        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8685                        !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8686                        mddev->delta_disks > 0 &&
8687                        mddev->pers->finish_reshape &&
8688                        mddev->pers->size &&
8689                        mddev->queue) {
8690                mddev_lock_nointr(mddev);
8691                md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
8692                mddev_unlock(mddev);
8693                if (!mddev_is_clustered(mddev)) {
8694                        set_capacity(mddev->gendisk, mddev->array_sectors);
8695                        revalidate_disk(mddev->gendisk);
8696                }
8697        }
8698
8699        spin_lock(&mddev->lock);
8700        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8701                /* We completed so min/max setting can be forgotten if used. */
8702                if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8703                        mddev->resync_min = 0;
8704                mddev->resync_max = MaxSector;
8705        } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8706                mddev->resync_min = mddev->curr_resync_completed;
8707        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
8708        mddev->curr_resync = 0;
8709        spin_unlock(&mddev->lock);
8710
8711        wake_up(&resync_wait);
8712        md_wakeup_thread(mddev->thread);
8713        return;
8714}
8715EXPORT_SYMBOL_GPL(md_do_sync);
8716
8717static int remove_and_add_spares(struct mddev *mddev,
8718                                 struct md_rdev *this)
8719{
8720        struct md_rdev *rdev;
8721        int spares = 0;
8722        int removed = 0;
8723        bool remove_some = false;
8724
8725        if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8726                /* Mustn't remove devices when resync thread is running */
8727                return 0;
8728
8729        rdev_for_each(rdev, mddev) {
8730                if ((this == NULL || rdev == this) &&
8731                    rdev->raid_disk >= 0 &&
8732                    !test_bit(Blocked, &rdev->flags) &&
8733                    test_bit(Faulty, &rdev->flags) &&
8734                    atomic_read(&rdev->nr_pending)==0) {
8735                        /* Faulty non-Blocked devices with nr_pending == 0
8736                         * never get nr_pending incremented,
8737                         * never get Faulty cleared, and never get Blocked set.
8738                         * So we can synchronize_rcu now rather than once per device
8739                         */
8740                        remove_some = true;
8741                        set_bit(RemoveSynchronized, &rdev->flags);
8742                }
8743        }
8744
8745        if (remove_some)
8746                synchronize_rcu();
8747        rdev_for_each(rdev, mddev) {
8748                if ((this == NULL || rdev == this) &&
8749                    rdev->raid_disk >= 0 &&
8750                    !test_bit(Blocked, &rdev->flags) &&
8751                    ((test_bit(RemoveSynchronized, &rdev->flags) ||
8752                     (!test_bit(In_sync, &rdev->flags) &&
8753                      !test_bit(Journal, &rdev->flags))) &&
8754                    atomic_read(&rdev->nr_pending)==0)) {
8755                        if (mddev->pers->hot_remove_disk(
8756                                    mddev, rdev) == 0) {
8757                                sysfs_unlink_rdev(mddev, rdev);
8758                                rdev->saved_raid_disk = rdev->raid_disk;
8759                                rdev->raid_disk = -1;
8760                                removed++;
8761                        }
8762                }
8763                if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
8764                        clear_bit(RemoveSynchronized, &rdev->flags);
8765        }
8766
8767        if (removed && mddev->kobj.sd)
8768                sysfs_notify(&mddev->kobj, NULL, "degraded");
8769
8770        if (this && removed)
8771                goto no_add;
8772
8773        rdev_for_each(rdev, mddev) {
8774                if (this && this != rdev)
8775                        continue;
8776                if (test_bit(Candidate, &rdev->flags))
8777                        continue;
8778                if (rdev->raid_disk >= 0 &&
8779                    !test_bit(In_sync, &rdev->flags) &&
8780                    !test_bit(Journal, &rdev->flags) &&
8781                    !test_bit(Faulty, &rdev->flags))
8782                        spares++;
8783                if (rdev->raid_disk >= 0)
8784                        continue;
8785                if (test_bit(Faulty, &rdev->flags))
8786                        continue;
8787                if (!test_bit(Journal, &rdev->flags)) {
8788                        if (mddev->ro &&
8789                            ! (rdev->saved_raid_disk >= 0 &&
8790                               !test_bit(Bitmap_sync, &rdev->flags)))
8791                                continue;
8792
8793                        rdev->recovery_offset = 0;
8794                }
8795                if (mddev->pers->
8796                    hot_add_disk(mddev, rdev) == 0) {
8797                        if (sysfs_link_rdev(mddev, rdev))
8798                                /* failure here is OK */;
8799                        if (!test_bit(Journal, &rdev->flags))
8800                                spares++;
8801                        md_new_event(mddev);
8802                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8803                }
8804        }
8805no_add:
8806        if (removed)
8807                set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8808        return spares;
8809}
8810
8811static void md_start_sync(struct work_struct *ws)
8812{
8813        struct mddev *mddev = container_of(ws, struct mddev, del_work);
8814
8815        mddev->sync_thread = md_register_thread(md_do_sync,
8816                                                mddev,
8817                                                "resync");
8818        if (!mddev->sync_thread) {
8819                pr_warn("%s: could not start resync thread...\n",
8820                        mdname(mddev));
8821                /* leave the spares where they are, it shouldn't hurt */
8822                clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8823                clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8824                clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8825                clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8826                clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8827                wake_up(&resync_wait);
8828                if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8829                                       &mddev->recovery))
8830                        if (mddev->sysfs_action)
8831                                sysfs_notify_dirent_safe(mddev->sysfs_action);
8832        } else
8833                md_wakeup_thread(mddev->sync_thread);
8834        sysfs_notify_dirent_safe(mddev->sysfs_action);
8835        md_new_event(mddev);
8836}
8837
8838/*
8839 * This routine is regularly called by all per-raid-array threads to
8840 * deal with generic issues like resync and super-block update.
8841 * Raid personalities that don't have a thread (linear/raid0) do not
8842 * need this as they never do any recovery or update the superblock.
8843 *
8844 * It does not do any resync itself, but rather "forks" off other threads
8845 * to do that as needed.
8846 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
8847 * "->recovery" and create a thread at ->sync_thread.
8848 * When the thread finishes it sets MD_RECOVERY_DONE
8849 * and wakeups up this thread which will reap the thread and finish up.
8850 * This thread also removes any faulty devices (with nr_pending == 0).
8851 *
8852 * The overall approach is:
8853 *  1/ if the superblock needs updating, update it.
8854 *  2/ If a recovery thread is running, don't do anything else.
8855 *  3/ If recovery has finished, clean up, possibly marking spares active.
8856 *  4/ If there are any faulty devices, remove them.
8857 *  5/ If array is degraded, try to add spares devices
8858 *  6/ If array has spares or is not in-sync, start a resync thread.
8859 */
8860void md_check_recovery(struct mddev *mddev)
8861{
8862        if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
8863                /* Write superblock - thread that called mddev_suspend()
8864                 * holds reconfig_mutex for us.
8865                 */
8866                set_bit(MD_UPDATING_SB, &mddev->flags);
8867                smp_mb__after_atomic();
8868                if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
8869                        md_update_sb(mddev, 0);
8870                clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
8871                wake_up(&mddev->sb_wait);
8872        }
8873
8874        if (mddev->suspended)
8875                return;
8876
8877        if (mddev->bitmap)
8878                md_bitmap_daemon_work(mddev);
8879
8880        if (signal_pending(current)) {
8881                if (mddev->pers->sync_request && !mddev->external) {
8882                        pr_debug("md: %s in immediate safe mode\n",
8883                                 mdname(mddev));
8884                        mddev->safemode = 2;
8885                }
8886                flush_signals(current);
8887        }
8888
8889        if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
8890                return;
8891        if ( ! (
8892                (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
8893                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8894                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8895                (mddev->external == 0 && mddev->safemode == 1) ||
8896                (mddev->safemode == 2
8897                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
8898                ))
8899                return;
8900
8901        if (mddev_trylock(mddev)) {
8902                int spares = 0;
8903
8904                if (!mddev->external && mddev->safemode == 1)
8905                        mddev->safemode = 0;
8906
8907                if (mddev->ro) {
8908                        struct md_rdev *rdev;
8909                        if (!mddev->external && mddev->in_sync)
8910                                /* 'Blocked' flag not needed as failed devices
8911                                 * will be recorded if array switched to read/write.
8912                                 * Leaving it set will prevent the device
8913                                 * from being removed.
8914                                 */
8915                                rdev_for_each(rdev, mddev)
8916                                        clear_bit(Blocked, &rdev->flags);
8917                        /* On a read-only array we can:
8918                         * - remove failed devices
8919                         * - add already-in_sync devices if the array itself
8920                         *   is in-sync.
8921                         * As we only add devices that are already in-sync,
8922                         * we can activate the spares immediately.
8923                         */
8924                        remove_and_add_spares(mddev, NULL);
8925                        /* There is no thread, but we need to call
8926                         * ->spare_active and clear saved_raid_disk
8927                         */
8928                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8929                        md_reap_sync_thread(mddev);
8930                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8931                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8932                        clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8933                        goto unlock;
8934                }
8935
8936                if (mddev_is_clustered(mddev)) {
8937                        struct md_rdev *rdev;
8938                        /* kick the device if another node issued a
8939                         * remove disk.
8940                         */
8941                        rdev_for_each(rdev, mddev) {
8942                                if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
8943                                                rdev->raid_disk < 0)
8944                                        md_kick_rdev_from_array(rdev);
8945                        }
8946                }
8947
8948                if (!mddev->external && !mddev->in_sync) {
8949                        spin_lock(&mddev->lock);
8950                        set_in_sync(mddev);
8951                        spin_unlock(&mddev->lock);
8952                }
8953
8954                if (mddev->sb_flags)
8955                        md_update_sb(mddev, 0);
8956
8957                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
8958                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
8959                        /* resync/recovery still happening */
8960                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8961                        goto unlock;
8962                }
8963                if (mddev->sync_thread) {
8964                        md_reap_sync_thread(mddev);
8965                        goto unlock;
8966                }
8967                /* Set RUNNING before clearing NEEDED to avoid
8968                 * any transients in the value of "sync_action".
8969                 */
8970                mddev->curr_resync_completed = 0;
8971                spin_lock(&mddev->lock);
8972                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8973                spin_unlock(&mddev->lock);
8974                /* Clear some bits that don't mean anything, but
8975                 * might be left set
8976                 */
8977                clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
8978                clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8979
8980                if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8981                    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
8982                        goto not_running;
8983                /* no recovery is running.
8984                 * remove any failed drives, then
8985                 * add spares if possible.
8986                 * Spares are also removed and re-added, to allow
8987                 * the personality to fail the re-add.
8988                 */
8989
8990                if (mddev->reshape_position != MaxSector) {
8991                        if (mddev->pers->check_reshape == NULL ||
8992                            mddev->pers->check_reshape(mddev) != 0)
8993                                /* Cannot proceed */
8994                                goto not_running;
8995                        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8996                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8997                } else if ((spares = remove_and_add_spares(mddev, NULL))) {
8998                        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8999                        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9000                        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9001                        set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9002                } else if (mddev->recovery_cp < MaxSector) {
9003                        set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9004                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9005                } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9006                        /* nothing to be done ... */
9007                        goto not_running;
9008
9009                if (mddev->pers->sync_request) {
9010                        if (spares) {
9011                                /* We are adding a device or devices to an array
9012                                 * which has the bitmap stored on all devices.
9013                                 * So make sure all bitmap pages get written
9014                                 */
9015                                md_bitmap_write_all(mddev->bitmap);
9016                        }
9017                        INIT_WORK(&mddev->del_work, md_start_sync);
9018                        queue_work(md_misc_wq, &mddev->del_work);
9019                        goto unlock;
9020                }
9021        not_running:
9022                if (!mddev->sync_thread) {
9023                        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9024                        wake_up(&resync_wait);
9025                        if (test_and_clear_bit(MD_RECOVERY_RECOVER,
9026                                               &mddev->recovery))
9027                                if (mddev->sysfs_action)
9028                                        sysfs_notify_dirent_safe(mddev->sysfs_action);
9029                }
9030        unlock:
9031                wake_up(&mddev->sb_wait);
9032                mddev_unlock(mddev);
9033        }
9034}
9035EXPORT_SYMBOL(md_check_recovery);
9036
9037void md_reap_sync_thread(struct mddev *mddev)
9038{
9039        struct md_rdev *rdev;
9040        sector_t old_dev_sectors = mddev->dev_sectors;
9041        bool is_reshaped = false;
9042
9043        /* resync has finished, collect result */
9044        md_unregister_thread(&mddev->sync_thread);
9045        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9046            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
9047                /* success...*/
9048                /* activate any spares */
9049                if (mddev->pers->spare_active(mddev)) {
9050                        sysfs_notify(&mddev->kobj, NULL,
9051                                     "degraded");
9052                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9053                }
9054        }
9055        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9056            mddev->pers->finish_reshape) {
9057                mddev->pers->finish_reshape(mddev);
9058                if (mddev_is_clustered(mddev))
9059                        is_reshaped = true;
9060        }
9061
9062        /* If array is no-longer degraded, then any saved_raid_disk
9063         * information must be scrapped.
9064         */
9065        if (!mddev->degraded)
9066                rdev_for_each(rdev, mddev)
9067                        rdev->saved_raid_disk = -1;
9068
9069        md_update_sb(mddev, 1);
9070        /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
9071         * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
9072         * clustered raid */
9073        if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
9074                md_cluster_ops->resync_finish(mddev);
9075        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9076        clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9077        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9078        clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9079        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9080        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9081        /*
9082         * We call md_cluster_ops->update_size here because sync_size could
9083         * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
9084         * so it is time to update size across cluster.
9085         */
9086        if (mddev_is_clustered(mddev) && is_reshaped
9087                                      && !test_bit(MD_CLOSING, &mddev->flags))
9088                md_cluster_ops->update_size(mddev, old_dev_sectors);
9089        wake_up(&resync_wait);
9090        /* flag recovery needed just to double check */
9091        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9092        sysfs_notify_dirent_safe(mddev->sysfs_action);
9093        md_new_event(mddev);
9094        if (mddev->event_work.func)
9095                queue_work(md_misc_wq, &mddev->event_work);
9096}
9097EXPORT_SYMBOL(md_reap_sync_thread);
9098
9099void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9100{
9101        sysfs_notify_dirent_safe(rdev->sysfs_state);
9102        wait_event_timeout(rdev->blocked_wait,
9103                           !test_bit(Blocked, &rdev->flags) &&
9104                           !test_bit(BlockedBadBlocks, &rdev->flags),
9105                           msecs_to_jiffies(5000));
9106        rdev_dec_pending(rdev, mddev);
9107}
9108EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9109
9110void md_finish_reshape(struct mddev *mddev)
9111{
9112        /* called be personality module when reshape completes. */
9113        struct md_rdev *rdev;
9114
9115        rdev_for_each(rdev, mddev) {
9116                if (rdev->data_offset > rdev->new_data_offset)
9117                        rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9118                else
9119                        rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9120                rdev->data_offset = rdev->new_data_offset;
9121        }
9122}
9123EXPORT_SYMBOL(md_finish_reshape);
9124
9125/* Bad block management */
9126
9127/* Returns 1 on success, 0 on failure */
9128int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9129                       int is_new)
9130{
9131        struct mddev *mddev = rdev->mddev;
9132        int rv;
9133        if (is_new)
9134                s += rdev->new_data_offset;
9135        else
9136                s += rdev->data_offset;
9137        rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
9138        if (rv == 0) {
9139                /* Make sure they get written out promptly */
9140                if (test_bit(ExternalBbl, &rdev->flags))
9141                        sysfs_notify(&rdev->kobj, NULL,
9142                                     "unacknowledged_bad_blocks");
9143                sysfs_notify_dirent_safe(rdev->sysfs_state);
9144                set_mask_bits(&mddev->sb_flags, 0,
9145                              BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9146                md_wakeup_thread(rdev->mddev->thread);
9147                return 1;
9148        } else
9149                return 0;
9150}
9151EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9152
9153int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9154                         int is_new)
9155{
9156        int rv;
9157        if (is_new)
9158                s += rdev->new_data_offset;
9159        else
9160                s += rdev->data_offset;
9161        rv = badblocks_clear(&rdev->badblocks, s, sectors);
9162        if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9163                sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
9164        return rv;
9165}
9166EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9167
9168static int md_notify_reboot(struct notifier_block *this,
9169                            unsigned long code, void *x)
9170{
9171        struct list_head *tmp;
9172        struct mddev *mddev;
9173        int need_delay = 0;
9174
9175        for_each_mddev(mddev, tmp) {
9176                if (mddev_trylock(mddev)) {
9177                        if (mddev->pers)
9178                                __md_stop_writes(mddev);
9179                        if (mddev->persistent)
9180                                mddev->safemode = 2;
9181                        mddev_unlock(mddev);
9182                }
9183                need_delay = 1;
9184        }
9185        /*
9186         * certain more exotic SCSI devices are known to be
9187         * volatile wrt too early system reboots. While the
9188         * right place to handle this issue is the given
9189         * driver, we do want to have a safe RAID driver ...
9190         */
9191        if (need_delay)
9192                mdelay(1000*1);
9193
9194        return NOTIFY_DONE;
9195}
9196
9197static struct notifier_block md_notifier = {
9198        .notifier_call  = md_notify_reboot,
9199        .next           = NULL,
9200        .priority       = INT_MAX, /* before any real devices */
9201};
9202
9203static void md_geninit(void)
9204{
9205        pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9206
9207        proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
9208}
9209
9210static int __init md_init(void)
9211{
9212        int ret = -ENOMEM;
9213
9214        md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9215        if (!md_wq)
9216                goto err_wq;
9217
9218        md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9219        if (!md_misc_wq)
9220                goto err_misc_wq;
9221
9222        if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
9223                goto err_md;
9224
9225        if ((ret = register_blkdev(0, "mdp")) < 0)
9226                goto err_mdp;
9227        mdp_major = ret;
9228
9229        blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
9230                            md_probe, NULL, NULL);
9231        blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
9232                            md_probe, NULL, NULL);
9233
9234        register_reboot_notifier(&md_notifier);
9235        raid_table_header = register_sysctl_table(raid_root_table);
9236
9237        md_geninit();
9238        return 0;
9239
9240err_mdp:
9241        unregister_blkdev(MD_MAJOR, "md");
9242err_md:
9243        destroy_workqueue(md_misc_wq);
9244err_misc_wq:
9245        destroy_workqueue(md_wq);
9246err_wq:
9247        return ret;
9248}
9249
9250static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9251{
9252        struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9253        struct md_rdev *rdev2;
9254        int role, ret;
9255        char b[BDEVNAME_SIZE];
9256
9257        /*
9258         * If size is changed in another node then we need to
9259         * do resize as well.
9260         */
9261        if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
9262                ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
9263                if (ret)
9264                        pr_info("md-cluster: resize failed\n");
9265                else
9266                        md_bitmap_update_sb(mddev->bitmap);
9267        }
9268
9269        /* Check for change of roles in the active devices */
9270        rdev_for_each(rdev2, mddev) {
9271                if (test_bit(Faulty, &rdev2->flags))
9272                        continue;
9273
9274                /* Check if the roles changed */
9275                role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9276
9277                if (test_bit(Candidate, &rdev2->flags)) {
9278                        if (role == 0xfffe) {
9279                                pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
9280                                md_kick_rdev_from_array(rdev2);
9281                                continue;
9282                        }
9283                        else
9284                                clear_bit(Candidate, &rdev2->flags);
9285                }
9286
9287                if (role != rdev2->raid_disk) {
9288                        /*
9289                         * got activated except reshape is happening.
9290                         */
9291                        if (rdev2->raid_disk == -1 && role != 0xffff &&
9292                            !(le32_to_cpu(sb->feature_map) &
9293                              MD_FEATURE_RESHAPE_ACTIVE)) {
9294                                rdev2->saved_raid_disk = role;
9295                                ret = remove_and_add_spares(mddev, rdev2);
9296                                pr_info("Activated spare: %s\n",
9297                                        bdevname(rdev2->bdev,b));
9298                                /* wakeup mddev->thread here, so array could
9299                                 * perform resync with the new activated disk */
9300                                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9301                                md_wakeup_thread(mddev->thread);
9302                        }
9303                        /* device faulty
9304                         * We just want to do the minimum to mark the disk
9305                         * as faulty. The recovery is performed by the
9306                         * one who initiated the error.
9307                         */
9308                        if ((role == 0xfffe) || (role == 0xfffd)) {
9309                                md_error(mddev, rdev2);
9310                                clear_bit(Blocked, &rdev2->flags);
9311                        }
9312                }
9313        }
9314
9315        if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
9316                update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9317
9318        /*
9319         * Since mddev->delta_disks has already updated in update_raid_disks,
9320         * so it is time to check reshape.
9321         */
9322        if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9323            (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9324                /*
9325                 * reshape is happening in the remote node, we need to
9326                 * update reshape_position and call start_reshape.
9327                 */
9328                mddev->reshape_position = le64_to_cpu(sb->reshape_position);
9329                if (mddev->pers->update_reshape_pos)
9330                        mddev->pers->update_reshape_pos(mddev);
9331                if (mddev->pers->start_reshape)
9332                        mddev->pers->start_reshape(mddev);
9333        } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9334                   mddev->reshape_position != MaxSector &&
9335                   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9336                /* reshape is just done in another node. */
9337                mddev->reshape_position = MaxSector;
9338                if (mddev->pers->update_reshape_pos)
9339                        mddev->pers->update_reshape_pos(mddev);
9340        }
9341
9342        /* Finally set the event to be up to date */
9343        mddev->events = le64_to_cpu(sb->events);
9344}
9345
9346static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9347{
9348        int err;
9349        struct page *swapout = rdev->sb_page;
9350        struct mdp_superblock_1 *sb;
9351
9352        /* Store the sb page of the rdev in the swapout temporary
9353         * variable in case we err in the future
9354         */
9355        rdev->sb_page = NULL;
9356        err = alloc_disk_sb(rdev);
9357        if (err == 0) {
9358                ClearPageUptodate(rdev->sb_page);
9359                rdev->sb_loaded = 0;
9360                err = super_types[mddev->major_version].
9361                        load_super(rdev, NULL, mddev->minor_version);
9362        }
9363        if (err < 0) {
9364                pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9365                                __func__, __LINE__, rdev->desc_nr, err);
9366                if (rdev->sb_page)
9367                        put_page(rdev->sb_page);
9368                rdev->sb_page = swapout;
9369                rdev->sb_loaded = 1;
9370                return err;
9371        }
9372
9373        sb = page_address(rdev->sb_page);
9374        /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
9375         * is not set
9376         */
9377
9378        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9379                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9380
9381        /* The other node finished recovery, call spare_active to set
9382         * device In_sync and mddev->degraded
9383         */
9384        if (rdev->recovery_offset == MaxSector &&
9385            !test_bit(In_sync, &rdev->flags) &&
9386            mddev->pers->spare_active(mddev))
9387                sysfs_notify(&mddev->kobj, NULL, "degraded");
9388
9389        put_page(swapout);
9390        return 0;
9391}
9392
9393void md_reload_sb(struct mddev *mddev, int nr)
9394{
9395        struct md_rdev *rdev;
9396        int err;
9397
9398        /* Find the rdev */
9399        rdev_for_each_rcu(rdev, mddev) {
9400                if (rdev->desc_nr == nr)
9401                        break;
9402        }
9403
9404        if (!rdev || rdev->desc_nr != nr) {
9405                pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9406                return;
9407        }
9408
9409        err = read_rdev(mddev, rdev);
9410        if (err < 0)
9411                return;
9412
9413        check_sb_changes(mddev, rdev);
9414
9415        /* Read all rdev's to update recovery_offset */
9416        rdev_for_each_rcu(rdev, mddev) {
9417                if (!test_bit(Faulty, &rdev->flags))
9418                        read_rdev(mddev, rdev);
9419        }
9420}
9421EXPORT_SYMBOL(md_reload_sb);
9422
9423#ifndef MODULE
9424
9425/*
9426 * Searches all registered partitions for autorun RAID arrays
9427 * at boot time.
9428 */
9429
9430static DEFINE_MUTEX(detected_devices_mutex);
9431static LIST_HEAD(all_detected_devices);
9432struct detected_devices_node {
9433        struct list_head list;
9434        dev_t dev;
9435};
9436
9437void md_autodetect_dev(dev_t dev)
9438{
9439        struct detected_devices_node *node_detected_dev;
9440
9441        node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
9442        if (node_detected_dev) {
9443                node_detected_dev->dev = dev;
9444                mutex_lock(&detected_devices_mutex);
9445                list_add_tail(&node_detected_dev->list, &all_detected_devices);
9446                mutex_unlock(&detected_devices_mutex);
9447        }
9448}
9449
9450static void autostart_arrays(int part)
9451{
9452        struct md_rdev *rdev;
9453        struct detected_devices_node *node_detected_dev;
9454        dev_t dev;
9455        int i_scanned, i_passed;
9456
9457        i_scanned = 0;
9458        i_passed = 0;
9459
9460        pr_info("md: Autodetecting RAID arrays.\n");
9461
9462        mutex_lock(&detected_devices_mutex);
9463        while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
9464                i_scanned++;
9465                node_detected_dev = list_entry(all_detected_devices.next,
9466                                        struct detected_devices_node, list);
9467                list_del(&node_detected_dev->list);
9468                dev = node_detected_dev->dev;
9469                kfree(node_detected_dev);
9470                mutex_unlock(&detected_devices_mutex);
9471                rdev = md_import_device(dev,0, 90);
9472                mutex_lock(&detected_devices_mutex);
9473                if (IS_ERR(rdev))
9474                        continue;
9475
9476                if (test_bit(Faulty, &rdev->flags))
9477                        continue;
9478
9479                set_bit(AutoDetected, &rdev->flags);
9480                list_add(&rdev->same_set, &pending_raid_disks);
9481                i_passed++;
9482        }
9483        mutex_unlock(&detected_devices_mutex);
9484
9485        pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
9486
9487        autorun_devices(part);
9488}
9489
9490#endif /* !MODULE */
9491
9492static __exit void md_exit(void)
9493{
9494        struct mddev *mddev;
9495        struct list_head *tmp;
9496        int delay = 1;
9497
9498        blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
9499        blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
9500
9501        unregister_blkdev(MD_MAJOR,"md");
9502        unregister_blkdev(mdp_major, "mdp");
9503        unregister_reboot_notifier(&md_notifier);
9504        unregister_sysctl_table(raid_table_header);
9505
9506        /* We cannot unload the modules while some process is
9507         * waiting for us in select() or poll() - wake them up
9508         */
9509        md_unloading = 1;
9510        while (waitqueue_active(&md_event_waiters)) {
9511                /* not safe to leave yet */
9512                wake_up(&md_event_waiters);
9513                msleep(delay);
9514                delay += delay;
9515        }
9516        remove_proc_entry("mdstat", NULL);
9517
9518        for_each_mddev(mddev, tmp) {
9519                export_array(mddev);
9520                mddev->ctime = 0;
9521                mddev->hold_active = 0;
9522                /*
9523                 * for_each_mddev() will call mddev_put() at the end of each
9524                 * iteration.  As the mddev is now fully clear, this will
9525                 * schedule the mddev for destruction by a workqueue, and the
9526                 * destroy_workqueue() below will wait for that to complete.
9527                 */
9528        }
9529        destroy_workqueue(md_misc_wq);
9530        destroy_workqueue(md_wq);
9531}
9532
9533subsys_initcall(md_init);
9534module_exit(md_exit)
9535
9536static int get_ro(char *buffer, const struct kernel_param *kp)
9537{
9538        return sprintf(buffer, "%d", start_readonly);
9539}
9540static int set_ro(const char *val, const struct kernel_param *kp)
9541{
9542        return kstrtouint(val, 10, (unsigned int *)&start_readonly);
9543}
9544
9545module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
9546module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
9547module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
9548module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
9549
9550MODULE_LICENSE("GPL");
9551MODULE_DESCRIPTION("MD RAID framework");
9552MODULE_ALIAS("md");
9553MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
9554