linux/drivers/md/md.c
   1/*
   2   md.c : Multiple Devices driver for Linux
   3     Copyright (C) 1998, 1999, 2000 Ingo Molnar
   4
   5     completely rewritten, based on the MD driver code from Marc Zyngier
   6
   7   Changes:
   8
   9   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
  10   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
  11   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
  12   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
  13   - kmod support by: Cyrus Durgin
  14   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
  15   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
  16
  17   - lots of fixes and improvements to the RAID1/RAID5 and generic
  18     RAID code (such as request based resynchronization):
  19
  20     Neil Brown <neilb@cse.unsw.edu.au>.
  21
  22   - persistent bitmap code
  23     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
  24
  25   This program is free software; you can redistribute it and/or modify
  26   it under the terms of the GNU General Public License as published by
  27   the Free Software Foundation; either version 2, or (at your option)
  28   any later version.
  29
  30   You should have received a copy of the GNU General Public License
  31   (for example /usr/src/linux/COPYING); if not, write to the Free
  32   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  33
  34   Errors, Warnings, etc.
  35   Please use:
  36     pr_crit() for error conditions that risk data loss
  37     pr_err() for error conditions that are unexpected, like an IO error
  38         or internal inconsistency
   39     pr_warn() for error conditions that could have been predicted, like
  40         adding a device to an array when it has incompatible metadata
   41     pr_info() for interesting, very rare events, like an array starting
  42         or stopping, or resync starting or stopping
  43     pr_debug() for everything else.
  44
  45*/
  46
  47#include <linux/sched/mm.h>
  48#include <linux/sched/signal.h>
  49#include <linux/kthread.h>
  50#include <linux/blkdev.h>
  51#include <linux/badblocks.h>
  52#include <linux/sysctl.h>
  53#include <linux/seq_file.h>
  54#include <linux/fs.h>
  55#include <linux/poll.h>
  56#include <linux/ctype.h>
  57#include <linux/string.h>
  58#include <linux/hdreg.h>
  59#include <linux/proc_fs.h>
  60#include <linux/random.h>
  61#include <linux/module.h>
  62#include <linux/reboot.h>
  63#include <linux/file.h>
  64#include <linux/compat.h>
  65#include <linux/delay.h>
  66#include <linux/raid/md_p.h>
  67#include <linux/raid/md_u.h>
  68#include <linux/raid/detect.h>
  69#include <linux/slab.h>
  70#include <linux/percpu-refcount.h>
  71
  72#include <trace/events/block.h>
  73#include "md.h"
  74#include "md-bitmap.h"
  75#include "md-cluster.h"
  76
  77#ifndef MODULE
  78static void autostart_arrays(int part);
  79#endif
  80
  81/* pers_list is a list of registered personalities protected
  82 * by pers_lock.
   83 * pers_lock additionally protects accesses to
  84 * mddev->thread when the mutex cannot be held.
  85 */
  86static LIST_HEAD(pers_list);
  87static DEFINE_SPINLOCK(pers_lock);
  88
  89static struct kobj_type md_ktype;
  90
  91struct md_cluster_operations *md_cluster_ops;
  92EXPORT_SYMBOL(md_cluster_ops);
  93static struct module *md_cluster_mod;
  94
  95static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
  96static struct workqueue_struct *md_wq;
  97static struct workqueue_struct *md_misc_wq;
  98static struct workqueue_struct *md_rdev_misc_wq;
  99
 100static int remove_and_add_spares(struct mddev *mddev,
 101                                 struct md_rdev *this);
 102static void mddev_detach(struct mddev *mddev);
 103
 104/*
 105 * Default number of read corrections we'll attempt on an rdev
 106 * before ejecting it from the array. We divide the read error
 107 * count by 2 for every hour elapsed between read errors.
 108 */
 109#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
 110/* Default safemode delay: 200 msec */
 111#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
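/*
 * ((200 * HZ) / 1000 + 1) converts the 200 msec default into jiffies;
 * the "+ 1" rounds up so the delay never truncates to zero on low-HZ
 * configurations.
 */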
 112/*
 113 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 114 * is 1000 KB/sec, so the extra system load does not show up that much.
 115 * Increase it if you want to have more _guaranteed_ speed. Note that
 116 * the RAID driver will use the maximum available bandwidth if the IO
 117 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 118 * speed limit - in case reconstruction slows down your system despite
 119 * idle IO detection.
 120 *
 121 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 122 * or /sys/block/mdX/md/sync_speed_{min,max}
 123 */
 124
 125static int sysctl_speed_limit_min = 1000;
 126static int sysctl_speed_limit_max = 200000;
 127static inline int speed_min(struct mddev *mddev)
 128{
 129        return mddev->sync_speed_min ?
 130                mddev->sync_speed_min : sysctl_speed_limit_min;
 131}
 132
 133static inline int speed_max(struct mddev *mddev)
 134{
 135        return mddev->sync_speed_max ?
 136                mddev->sync_speed_max : sysctl_speed_limit_max;
 137}
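/*
 * A per-array sync_speed_min/sync_speed_max of 0 means "use the
 * system-wide sysctl default"; that is what the ?: fallbacks above
 * implement.
 */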
 138
 139static void rdev_uninit_serial(struct md_rdev *rdev)
 140{
 141        if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
 142                return;
 143
 144        kvfree(rdev->serial);
 145        rdev->serial = NULL;
 146}
 147
 148static void rdevs_uninit_serial(struct mddev *mddev)
 149{
 150        struct md_rdev *rdev;
 151
 152        rdev_for_each(rdev, mddev)
 153                rdev_uninit_serial(rdev);
 154}
 155
 156static int rdev_init_serial(struct md_rdev *rdev)
 157{
  158        /* serial_nums equals BARRIER_BUCKETS_NR */
 159        int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
 160        struct serial_in_rdev *serial = NULL;
 161
 162        if (test_bit(CollisionCheck, &rdev->flags))
 163                return 0;
 164
 165        serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
 166                          GFP_KERNEL);
 167        if (!serial)
 168                return -ENOMEM;
 169
 170        for (i = 0; i < serial_nums; i++) {
 171                struct serial_in_rdev *serial_tmp = &serial[i];
 172
 173                spin_lock_init(&serial_tmp->serial_lock);
 174                serial_tmp->serial_rb = RB_ROOT_CACHED;
 175                init_waitqueue_head(&serial_tmp->serial_io_wait);
 176        }
 177
 178        rdev->serial = serial;
 179        set_bit(CollisionCheck, &rdev->flags);
 180
 181        return 0;
 182}
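/*
 * Note: 1 << (PAGE_SHIFT - ilog2(sizeof(atomic_t))) is simply
 * PAGE_SIZE / sizeof(atomic_t), i.e. the same bucket count as
 * BARRIER_BUCKETS_NR; each bucket gets its own lock, rbtree and wait
 * queue used by the write serialization code.
 */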
 183
 184static int rdevs_init_serial(struct mddev *mddev)
 185{
 186        struct md_rdev *rdev;
 187        int ret = 0;
 188
 189        rdev_for_each(rdev, mddev) {
 190                ret = rdev_init_serial(rdev);
 191                if (ret)
 192                        break;
 193        }
 194
  195        /* Free all resources if the pool does not exist */
 196        if (ret && !mddev->serial_info_pool)
 197                rdevs_uninit_serial(mddev);
 198
 199        return ret;
 200}
 201
 202/*
  203 * rdev needs serialization enabled if it meets these conditions:
  204 * 1. it is a multi-queue device flagged with writemostly.
 205 * 2. the write-behind mode is enabled.
 206 */
 207static int rdev_need_serial(struct md_rdev *rdev)
 208{
 209        return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
 210                rdev->bdev->bd_queue->nr_hw_queues != 1 &&
 211                test_bit(WriteMostly, &rdev->flags));
 212}
 213
 214/*
 215 * Init resource for rdev(s), then create serial_info_pool if:
  216 * 1. rdev is the first device that returns true from rdev_need_serial().
  217 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
 218 */
 219void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
 220                              bool is_suspend)
 221{
 222        int ret = 0;
 223
 224        if (rdev && !rdev_need_serial(rdev) &&
 225            !test_bit(CollisionCheck, &rdev->flags))
 226                return;
 227
 228        if (!is_suspend)
 229                mddev_suspend(mddev);
 230
 231        if (!rdev)
 232                ret = rdevs_init_serial(mddev);
 233        else
 234                ret = rdev_init_serial(rdev);
 235        if (ret)
 236                goto abort;
 237
 238        if (mddev->serial_info_pool == NULL) {
 239                /*
 240                 * already in memalloc noio context by
 241                 * mddev_suspend()
 242                 */
 243                mddev->serial_info_pool =
 244                        mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
 245                                                sizeof(struct serial_info));
 246                if (!mddev->serial_info_pool) {
 247                        rdevs_uninit_serial(mddev);
 248                        pr_err("can't alloc memory pool for serialization\n");
 249                }
 250        }
 251
 252abort:
 253        if (!is_suspend)
 254                mddev_resume(mddev);
 255}
 256
 257/*
 258 * Free resource from rdev(s), and destroy serial_info_pool under conditions:
  259 * 1. rdev is the last device flagged with CollisionCheck.
  260 * 2. the bitmap is destroyed while the serialize policy is not enabled.
 261 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
 262 */
 263void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
 264                               bool is_suspend)
 265{
 266        if (rdev && !test_bit(CollisionCheck, &rdev->flags))
 267                return;
 268
 269        if (mddev->serial_info_pool) {
 270                struct md_rdev *temp;
 271                int num = 0; /* used to track if other rdevs need the pool */
 272
 273                if (!is_suspend)
 274                        mddev_suspend(mddev);
 275                rdev_for_each(temp, mddev) {
 276                        if (!rdev) {
 277                                if (!mddev->serialize_policy ||
 278                                    !rdev_need_serial(temp))
 279                                        rdev_uninit_serial(temp);
 280                                else
 281                                        num++;
 282                        } else if (temp != rdev &&
 283                                   test_bit(CollisionCheck, &temp->flags))
 284                                num++;
 285                }
 286
 287                if (rdev)
 288                        rdev_uninit_serial(rdev);
 289
 290                if (num)
 291                        pr_info("The mempool could be used by other devices\n");
 292                else {
 293                        mempool_destroy(mddev->serial_info_pool);
 294                        mddev->serial_info_pool = NULL;
 295                }
 296                if (!is_suspend)
 297                        mddev_resume(mddev);
 298        }
 299}
 300
 301static struct ctl_table_header *raid_table_header;
 302
 303static struct ctl_table raid_table[] = {
 304        {
 305                .procname       = "speed_limit_min",
 306                .data           = &sysctl_speed_limit_min,
 307                .maxlen         = sizeof(int),
 308                .mode           = S_IRUGO|S_IWUSR,
 309                .proc_handler   = proc_dointvec,
 310        },
 311        {
 312                .procname       = "speed_limit_max",
 313                .data           = &sysctl_speed_limit_max,
 314                .maxlen         = sizeof(int),
 315                .mode           = S_IRUGO|S_IWUSR,
 316                .proc_handler   = proc_dointvec,
 317        },
 318        { }
 319};
 320
 321static struct ctl_table raid_dir_table[] = {
 322        {
 323                .procname       = "raid",
 324                .maxlen         = 0,
 325                .mode           = S_IRUGO|S_IXUGO,
 326                .child          = raid_table,
 327        },
 328        { }
 329};
 330
 331static struct ctl_table raid_root_table[] = {
 332        {
 333                .procname       = "dev",
 334                .maxlen         = 0,
 335                .mode           = 0555,
 336                .child          = raid_dir_table,
 337        },
 338        {  }
 339};
 340
 341static const struct block_device_operations md_fops;
 342
 343static int start_readonly;
 344
 345/*
 346 * The original mechanism for creating an md device is to create
 347 * a device node in /dev and to open it.  This causes races with device-close.
 348 * The preferred method is to write to the "new_array" module parameter.
 349 * This can avoid races.
 350 * Setting create_on_open to false disables the original mechanism
 351 * so all the races disappear.
 352 */
 353static bool create_on_open = true;
 354
 355struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 356                            struct mddev *mddev)
 357{
 358        if (!mddev || !bioset_initialized(&mddev->bio_set))
 359                return bio_alloc(gfp_mask, nr_iovecs);
 360
 361        return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
 362}
 363EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 364
 365static struct bio *md_bio_alloc_sync(struct mddev *mddev)
 366{
 367        if (!mddev || !bioset_initialized(&mddev->sync_set))
 368                return bio_alloc(GFP_NOIO, 1);
 369
 370        return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
 371}
 372
 373/*
 374 * We have a system wide 'event count' that is incremented
 375 * on any 'interesting' event, and readers of /proc/mdstat
 376 * can use 'poll' or 'select' to find out when the event
 377 * count increases.
 378 *
 379 * Events are:
 380 *  start array, stop array, error, add device, remove device,
 381 *  start build, activate spare
 382 */
 383static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 384static atomic_t md_event_count;
 385void md_new_event(struct mddev *mddev)
 386{
 387        atomic_inc(&md_event_count);
 388        wake_up(&md_event_waiters);
 389}
 390EXPORT_SYMBOL_GPL(md_new_event);
 391
 392/*
  393 * Enables iteration over all existing md arrays.
 394 * all_mddevs_lock protects this list.
 395 */
 396static LIST_HEAD(all_mddevs);
 397static DEFINE_SPINLOCK(all_mddevs_lock);
 398
 399/*
 400 * iterates through all used mddevs in the system.
 401 * We take care to grab the all_mddevs_lock whenever navigating
 402 * the list, and to always hold a refcount when unlocked.
  404 * Any code which breaks out of this loop still owns
  405 * a reference to the current mddev and must mddev_put it.
 405 */
 406#define for_each_mddev(_mddev,_tmp)                                     \
 407                                                                        \
 408        for (({ spin_lock(&all_mddevs_lock);                            \
 409                _tmp = all_mddevs.next;                                 \
 410                _mddev = NULL;});                                       \
 411             ({ if (_tmp != &all_mddevs)                                \
 412                        mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
 413                spin_unlock(&all_mddevs_lock);                          \
 414                if (_mddev) mddev_put(_mddev);                          \
 415                _mddev = list_entry(_tmp, struct mddev, all_mddevs);    \
 416                _tmp != &all_mddevs;});                                 \
 417             ({ spin_lock(&all_mddevs_lock);                            \
 418                _tmp = _tmp->next;})                                    \
 419                )
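/*
 * A minimal usage sketch (the second argument is just the list cursor):
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp)
 *		pr_info("md: found %s\n", mdname(mddev));
 *
 * Breaking out of the loop early leaves a reference held on the current
 * mddev, which the caller must drop with mddev_put().
 */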
 420
 421/* Rather than calling directly into the personality make_request function,
 422 * IO requests come here first so that we can check if the device is
 423 * being suspended pending a reconfiguration.
 424 * We hold a refcount over the call to ->make_request.  By the time that
 425 * call has finished, the bio has been linked into some internal structure
 426 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 427 */
 428static bool is_suspended(struct mddev *mddev, struct bio *bio)
 429{
 430        if (mddev->suspended)
 431                return true;
 432        if (bio_data_dir(bio) != WRITE)
 433                return false;
 434        if (mddev->suspend_lo >= mddev->suspend_hi)
 435                return false;
 436        if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
 437                return false;
 438        if (bio_end_sector(bio) < mddev->suspend_lo)
 439                return false;
 440        return true;
 441}
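/*
 * In other words: while suspend_lo < suspend_hi, write bios that touch
 * the [suspend_lo, suspend_hi) window are held back (as are all bios
 * when ->suspended is set); reads are never blocked here.
 */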
 442
 443void md_handle_request(struct mddev *mddev, struct bio *bio)
 444{
 445check_suspended:
 446        rcu_read_lock();
 447        if (is_suspended(mddev, bio)) {
 448                DEFINE_WAIT(__wait);
 449                for (;;) {
 450                        prepare_to_wait(&mddev->sb_wait, &__wait,
 451                                        TASK_UNINTERRUPTIBLE);
 452                        if (!is_suspended(mddev, bio))
 453                                break;
 454                        rcu_read_unlock();
 455                        schedule();
 456                        rcu_read_lock();
 457                }
 458                finish_wait(&mddev->sb_wait, &__wait);
 459        }
 460        atomic_inc(&mddev->active_io);
 461        rcu_read_unlock();
 462
 463        if (!mddev->pers->make_request(mddev, bio)) {
 464                atomic_dec(&mddev->active_io);
 465                wake_up(&mddev->sb_wait);
 466                goto check_suspended;
 467        }
 468
 469        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
 470                wake_up(&mddev->sb_wait);
 471}
 472EXPORT_SYMBOL(md_handle_request);
 473
 474static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 475{
 476        const int rw = bio_data_dir(bio);
 477        const int sgrp = op_stat_group(bio_op(bio));
 478        struct mddev *mddev = bio->bi_disk->private_data;
 479        unsigned int sectors;
 480
 481        if (mddev == NULL || mddev->pers == NULL) {
 482                bio_io_error(bio);
 483                return BLK_QC_T_NONE;
 484        }
 485
 486        if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
 487                bio_io_error(bio);
 488                return BLK_QC_T_NONE;
 489        }
 490
 491        blk_queue_split(q, &bio);
 492
 493        if (mddev->ro == 1 && unlikely(rw == WRITE)) {
 494                if (bio_sectors(bio) != 0)
 495                        bio->bi_status = BLK_STS_IOERR;
 496                bio_endio(bio);
 497                return BLK_QC_T_NONE;
 498        }
 499
 500        /*
 501         * save the sectors now since our bio can
 502         * go away inside make_request
 503         */
 504        sectors = bio_sectors(bio);
  505        /* bio could be mergeable after passing to the underlying layer */
 506        bio->bi_opf &= ~REQ_NOMERGE;
 507
 508        md_handle_request(mddev, bio);
 509
 510        part_stat_lock();
 511        part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
 512        part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
 513        part_stat_unlock();
 514
 515        return BLK_QC_T_NONE;
 516}
 517
 518/* mddev_suspend makes sure no new requests are submitted
 519 * to the device, and that any requests that have been submitted
 520 * are completely handled.
 521 * Once mddev_detach() is called and completes, the module will be
 522 * completely unused.
 523 */
 524void mddev_suspend(struct mddev *mddev)
 525{
 526        WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
 527        lockdep_assert_held(&mddev->reconfig_mutex);
 528        if (mddev->suspended++)
 529                return;
 530        synchronize_rcu();
 531        wake_up(&mddev->sb_wait);
 532        set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
 533        smp_mb__after_atomic();
 534        wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
 535        mddev->pers->quiesce(mddev, 1);
 536        clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
 537        wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
 538
 539        del_timer_sync(&mddev->safemode_timer);
  540        /* restrict memory-reclaim I/O while the raid array is suspended */
 541        mddev->noio_flag = memalloc_noio_save();
 542}
 543EXPORT_SYMBOL_GPL(mddev_suspend);
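/*
 * ->suspended is a depth counter, so mddev_suspend()/mddev_resume()
 * pairs may nest; only the outermost suspend quiesces the personality
 * and only the matching outermost resume restarts it.
 */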
 544
 545void mddev_resume(struct mddev *mddev)
 546{
  547        /* entered the memalloc scope from mddev_suspend() */
 548        memalloc_noio_restore(mddev->noio_flag);
 549        lockdep_assert_held(&mddev->reconfig_mutex);
 550        if (--mddev->suspended)
 551                return;
 552        wake_up(&mddev->sb_wait);
 553        mddev->pers->quiesce(mddev, 0);
 554
 555        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 556        md_wakeup_thread(mddev->thread);
 557        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 558}
 559EXPORT_SYMBOL_GPL(mddev_resume);
 560
 561int mddev_congested(struct mddev *mddev, int bits)
 562{
 563        struct md_personality *pers = mddev->pers;
 564        int ret = 0;
 565
 566        rcu_read_lock();
 567        if (mddev->suspended)
 568                ret = 1;
 569        else if (pers && pers->congested)
 570                ret = pers->congested(mddev, bits);
 571        rcu_read_unlock();
 572        return ret;
 573}
 574EXPORT_SYMBOL_GPL(mddev_congested);
 575static int md_congested(void *data, int bits)
 576{
 577        struct mddev *mddev = data;
 578        return mddev_congested(mddev, bits);
 579}
 580
 581/*
 582 * Generic flush handling for md
 583 */
 584
 585static void md_end_flush(struct bio *bio)
 586{
 587        struct md_rdev *rdev = bio->bi_private;
 588        struct mddev *mddev = rdev->mddev;
 589
 590        rdev_dec_pending(rdev, mddev);
 591
 592        if (atomic_dec_and_test(&mddev->flush_pending)) {
 593                /* The pre-request flush has finished */
 594                queue_work(md_wq, &mddev->flush_work);
 595        }
 596        bio_put(bio);
 597}
 598
 599static void md_submit_flush_data(struct work_struct *ws);
 600
 601static void submit_flushes(struct work_struct *ws)
 602{
 603        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 604        struct md_rdev *rdev;
 605
 606        mddev->start_flush = ktime_get_boottime();
 607        INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 608        atomic_set(&mddev->flush_pending, 1);
 609        rcu_read_lock();
 610        rdev_for_each_rcu(rdev, mddev)
 611                if (rdev->raid_disk >= 0 &&
 612                    !test_bit(Faulty, &rdev->flags)) {
 613                        /* Take two references, one is dropped
  614                         * when the request finishes, one after
  615                         * we re-take rcu_read_lock
 616                         */
 617                        struct bio *bi;
 618                        atomic_inc(&rdev->nr_pending);
 619                        atomic_inc(&rdev->nr_pending);
 620                        rcu_read_unlock();
 621                        bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
 622                        bi->bi_end_io = md_end_flush;
 623                        bi->bi_private = rdev;
 624                        bio_set_dev(bi, rdev->bdev);
 625                        bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 626                        atomic_inc(&mddev->flush_pending);
 627                        submit_bio(bi);
 628                        rcu_read_lock();
 629                        rdev_dec_pending(rdev, mddev);
 630                }
 631        rcu_read_unlock();
 632        if (atomic_dec_and_test(&mddev->flush_pending))
 633                queue_work(md_wq, &mddev->flush_work);
 634}
 635
 636static void md_submit_flush_data(struct work_struct *ws)
 637{
 638        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 639        struct bio *bio = mddev->flush_bio;
 640
 641        /*
  642         * must reset flush_bio before calling into md_handle_request to avoid
  643         * a deadlock: other bios that have passed the suspend check in
  644         * md_handle_request could wait for this flush_bio, while the call to
  645         * md_handle_request below could wait for those bios in turn
 646         */
 647        spin_lock_irq(&mddev->lock);
 648        mddev->prev_flush_start = mddev->start_flush;
 649        mddev->flush_bio = NULL;
 650        spin_unlock_irq(&mddev->lock);
 651        wake_up(&mddev->sb_wait);
 652
 653        if (bio->bi_iter.bi_size == 0) {
 654                /* an empty barrier - all done */
 655                bio_endio(bio);
 656        } else {
 657                bio->bi_opf &= ~REQ_PREFLUSH;
 658                md_handle_request(mddev, bio);
 659        }
 660}
 661
 662/*
 663 * Manages consolidation of flushes and submitting any flushes needed for
 664 * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
 665 * being finished in another context.  Returns false if the flushing is
 666 * complete but still needs the I/O portion of the bio to be processed.
 667 */
 668bool md_flush_request(struct mddev *mddev, struct bio *bio)
 669{
 670        ktime_t req_start = ktime_get_boottime();
 671        spin_lock_irq(&mddev->lock);
 672        /* flush requests wait until ongoing flush completes,
 673         * hence coalescing all the pending requests.
 674         */
 675        wait_event_lock_irq(mddev->sb_wait,
 676                            !mddev->flush_bio ||
 677                            ktime_before(req_start, mddev->prev_flush_start),
 678                            mddev->lock);
 679        /* new request after previous flush is completed */
 680        if (ktime_after(req_start, mddev->prev_flush_start)) {
 681                WARN_ON(mddev->flush_bio);
 682                mddev->flush_bio = bio;
 683                bio = NULL;
 684        }
 685        spin_unlock_irq(&mddev->lock);
 686
 687        if (!bio) {
 688                INIT_WORK(&mddev->flush_work, submit_flushes);
 689                queue_work(md_wq, &mddev->flush_work);
 690        } else {
 691                /* flush was performed for some other bio while we waited. */
 692                if (bio->bi_iter.bi_size == 0)
 693                        /* an empty barrier - all done */
 694                        bio_endio(bio);
 695                else {
 696                        bio->bi_opf &= ~REQ_PREFLUSH;
 697                        return false;
 698                }
 699        }
 700        return true;
 701}
 702EXPORT_SYMBOL(md_flush_request);
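/*
 * Typical caller pattern in a personality's make_request (a sketch):
 *
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
 *	    && md_flush_request(mddev, bio))
 *		return true;	(flush handled, bio completed elsewhere)
 *
 * If md_flush_request() returns false, the flush itself is already done
 * and the caller must still submit the data portion of the bio.
 */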
 703
 704static inline struct mddev *mddev_get(struct mddev *mddev)
 705{
 706        atomic_inc(&mddev->active);
 707        return mddev;
 708}
 709
 710static void mddev_delayed_delete(struct work_struct *ws);
 711
 712static void mddev_put(struct mddev *mddev)
 713{
 714        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 715                return;
 716        if (!mddev->raid_disks && list_empty(&mddev->disks) &&
 717            mddev->ctime == 0 && !mddev->hold_active) {
 718                /* Array is not configured at all, and not held active,
 719                 * so destroy it */
 720                list_del_init(&mddev->all_mddevs);
 721
 722                /*
 723                 * Call queue_work inside the spinlock so that
 724                 * flush_workqueue() after mddev_find will succeed in waiting
 725                 * for the work to be done.
 726                 */
 727                INIT_WORK(&mddev->del_work, mddev_delayed_delete);
 728                queue_work(md_misc_wq, &mddev->del_work);
 729        }
 730        spin_unlock(&all_mddevs_lock);
 731}
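/*
 * mddev_get()/mddev_put() only manage the 'active' count; when the last
 * reference to a completely unconfigured, not-held-active array is
 * dropped, the array itself is torn down asynchronously via
 * mddev_delayed_delete() on md_misc_wq.
 */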
 732
 733static void md_safemode_timeout(struct timer_list *t);
 734
 735void mddev_init(struct mddev *mddev)
 736{
 737        kobject_init(&mddev->kobj, &md_ktype);
 738        mutex_init(&mddev->open_mutex);
 739        mutex_init(&mddev->reconfig_mutex);
 740        mutex_init(&mddev->bitmap_info.mutex);
 741        INIT_LIST_HEAD(&mddev->disks);
 742        INIT_LIST_HEAD(&mddev->all_mddevs);
 743        timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
 744        atomic_set(&mddev->active, 1);
 745        atomic_set(&mddev->openers, 0);
 746        atomic_set(&mddev->active_io, 0);
 747        spin_lock_init(&mddev->lock);
 748        atomic_set(&mddev->flush_pending, 0);
 749        init_waitqueue_head(&mddev->sb_wait);
 750        init_waitqueue_head(&mddev->recovery_wait);
 751        mddev->reshape_position = MaxSector;
 752        mddev->reshape_backwards = 0;
 753        mddev->last_sync_action = "none";
 754        mddev->resync_min = 0;
 755        mddev->resync_max = MaxSector;
 756        mddev->level = LEVEL_NONE;
 757}
 758EXPORT_SYMBOL_GPL(mddev_init);
 759
 760static struct mddev *mddev_find_locked(dev_t unit)
 761{
 762        struct mddev *mddev;
 763
 764        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
 765                if (mddev->unit == unit)
 766                        return mddev;
 767
 768        return NULL;
 769}
 770
 771/* find an unused unit number */
 772static dev_t mddev_alloc_unit(void)
 773{
 774        static int next_minor = 512;
 775        int start = next_minor;
 776        bool is_free = 0;
 777        dev_t dev = 0;
 778
 779        while (!is_free) {
 780                dev = MKDEV(MD_MAJOR, next_minor);
 781                next_minor++;
 782                if (next_minor > MINORMASK)
 783                        next_minor = 0;
 784                if (next_minor == start)
 785                        return 0;               /* Oh dear, all in use. */
 786                is_free = !mddev_find_locked(dev);
 787        }
 788
 789        return dev;
 790}
 791
 792static struct mddev *mddev_find(dev_t unit)
 793{
 794        struct mddev *mddev;
 795
 796        if (MAJOR(unit) != MD_MAJOR)
 797                unit &= ~((1 << MdpMinorShift) - 1);
 798
 799        spin_lock(&all_mddevs_lock);
 800        mddev = mddev_find_locked(unit);
 801        if (mddev)
 802                mddev_get(mddev);
 803        spin_unlock(&all_mddevs_lock);
 804
 805        return mddev;
 806}
 807
 808static struct mddev *mddev_alloc(dev_t unit)
 809{
 810        struct mddev *new;
 811        int error;
 812
 813        if (unit && MAJOR(unit) != MD_MAJOR)
 814                unit &= ~((1 << MdpMinorShift) - 1);
 815
 816        new = kzalloc(sizeof(*new), GFP_KERNEL);
 817        if (!new)
 818                return ERR_PTR(-ENOMEM);
 819        mddev_init(new);
 820
 821        spin_lock(&all_mddevs_lock);
 822        if (unit) {
 823                error = -EEXIST;
 824                if (mddev_find_locked(unit))
 825                        goto out_free_new;
 826                new->unit = unit;
 827                if (MAJOR(unit) == MD_MAJOR)
 828                        new->md_minor = MINOR(unit);
 829                else
 830                        new->md_minor = MINOR(unit) >> MdpMinorShift;
 831                new->hold_active = UNTIL_IOCTL;
 832        } else {
 833                error = -ENODEV;
 834                new->unit = mddev_alloc_unit();
 835                if (!new->unit)
 836                        goto out_free_new;
 837                new->md_minor = MINOR(new->unit);
 838                new->hold_active = UNTIL_STOP;
 839        }
 840
 841        list_add(&new->all_mddevs, &all_mddevs);
 842        spin_unlock(&all_mddevs_lock);
 843        return new;
 844out_free_new:
 845        spin_unlock(&all_mddevs_lock);
 846        kfree(new);
 847        return ERR_PTR(error);
 848}
 849
 850static struct attribute_group md_redundancy_group;
 851
 852void mddev_unlock(struct mddev *mddev)
 853{
 854        if (mddev->to_remove) {
 855                /* These cannot be removed under reconfig_mutex as
 856                 * an access to the files will try to take reconfig_mutex
 857                 * while holding the file unremovable, which leads to
 858                 * a deadlock.
  859                 * So set sysfs_active while the removal is happening,
  860                 * and anything else which might set ->to_remove or may
 861                 * otherwise change the sysfs namespace will fail with
 862                 * -EBUSY if sysfs_active is still set.
 863                 * We set sysfs_active under reconfig_mutex and elsewhere
 864                 * test it under the same mutex to ensure its correct value
 865                 * is seen.
 866                 */
 867                struct attribute_group *to_remove = mddev->to_remove;
 868                mddev->to_remove = NULL;
 869                mddev->sysfs_active = 1;
 870                mutex_unlock(&mddev->reconfig_mutex);
 871
 872                if (mddev->kobj.sd) {
 873                        if (to_remove != &md_redundancy_group)
 874                                sysfs_remove_group(&mddev->kobj, to_remove);
 875                        if (mddev->pers == NULL ||
 876                            mddev->pers->sync_request == NULL) {
 877                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
 878                                if (mddev->sysfs_action)
 879                                        sysfs_put(mddev->sysfs_action);
 880                                if (mddev->sysfs_completed)
 881                                        sysfs_put(mddev->sysfs_completed);
 882                                if (mddev->sysfs_degraded)
 883                                        sysfs_put(mddev->sysfs_degraded);
 884                                mddev->sysfs_action = NULL;
 885                                mddev->sysfs_completed = NULL;
 886                                mddev->sysfs_degraded = NULL;
 887                        }
 888                }
 889                mddev->sysfs_active = 0;
 890        } else
 891                mutex_unlock(&mddev->reconfig_mutex);
 892
 893        /* As we've dropped the mutex we need a spinlock to
 894         * make sure the thread doesn't disappear
 895         */
 896        spin_lock(&pers_lock);
 897        md_wakeup_thread(mddev->thread);
 898        wake_up(&mddev->sb_wait);
 899        spin_unlock(&pers_lock);
 900}
 901EXPORT_SYMBOL_GPL(mddev_unlock);
 902
 903struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
 904{
 905        struct md_rdev *rdev;
 906
 907        rdev_for_each_rcu(rdev, mddev)
 908                if (rdev->desc_nr == nr)
 909                        return rdev;
 910
 911        return NULL;
 912}
 913EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
 914
 915static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
 916{
 917        struct md_rdev *rdev;
 918
 919        rdev_for_each(rdev, mddev)
 920                if (rdev->bdev->bd_dev == dev)
 921                        return rdev;
 922
 923        return NULL;
 924}
 925
 926struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
 927{
 928        struct md_rdev *rdev;
 929
 930        rdev_for_each_rcu(rdev, mddev)
 931                if (rdev->bdev->bd_dev == dev)
 932                        return rdev;
 933
 934        return NULL;
 935}
 936EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
 937
 938static struct md_personality *find_pers(int level, char *clevel)
 939{
 940        struct md_personality *pers;
 941        list_for_each_entry(pers, &pers_list, list) {
 942                if (level != LEVEL_NONE && pers->level == level)
 943                        return pers;
 944                if (strcmp(pers->name, clevel)==0)
 945                        return pers;
 946        }
 947        return NULL;
 948}
 949
  950/* return the offset of the super block in 512-byte sectors */
 951static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
 952{
 953        sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
 954        return MD_NEW_SIZE_SECTORS(num_sectors);
 955}
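/*
 * MD_NEW_SIZE_SECTORS() rounds the device size down to a 64K boundary
 * and subtracts one 64K chunk, so for 0.90 metadata the superblock sits
 * in the last aligned 64K of the device.
 */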
 956
 957static int alloc_disk_sb(struct md_rdev *rdev)
 958{
 959        rdev->sb_page = alloc_page(GFP_KERNEL);
 960        if (!rdev->sb_page)
 961                return -ENOMEM;
 962        return 0;
 963}
 964
 965void md_rdev_clear(struct md_rdev *rdev)
 966{
 967        if (rdev->sb_page) {
 968                put_page(rdev->sb_page);
 969                rdev->sb_loaded = 0;
 970                rdev->sb_page = NULL;
 971                rdev->sb_start = 0;
 972                rdev->sectors = 0;
 973        }
 974        if (rdev->bb_page) {
 975                put_page(rdev->bb_page);
 976                rdev->bb_page = NULL;
 977        }
 978        badblocks_exit(&rdev->badblocks);
 979}
 980EXPORT_SYMBOL_GPL(md_rdev_clear);
 981
 982static void super_written(struct bio *bio)
 983{
 984        struct md_rdev *rdev = bio->bi_private;
 985        struct mddev *mddev = rdev->mddev;
 986
 987        if (bio->bi_status) {
 988                pr_err("md: %s gets error=%d\n", __func__,
 989                       blk_status_to_errno(bio->bi_status));
 990                md_error(mddev, rdev);
 991                if (!test_bit(Faulty, &rdev->flags)
 992                    && (bio->bi_opf & MD_FAILFAST)) {
 993                        set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
 994                        set_bit(LastDev, &rdev->flags);
 995                }
 996        } else
 997                clear_bit(LastDev, &rdev->flags);
 998
 999        if (atomic_dec_and_test(&mddev->pending_writes))
1000                wake_up(&mddev->sb_wait);
1001        rdev_dec_pending(rdev, mddev);
1002        bio_put(bio);
1003}
1004
1005void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
1006                   sector_t sector, int size, struct page *page)
1007{
1008        /* write first size bytes of page to sector of rdev
1009         * Increment mddev->pending_writes before returning
1010         * and decrement it on completion, waking up sb_wait
1011         * if zero is reached.
1012         * If an error occurred, call md_error
1013         */
1014        struct bio *bio;
1015        int ff = 0;
1016
1017        if (!page)
1018                return;
1019
1020        if (test_bit(Faulty, &rdev->flags))
1021                return;
1022
1023        bio = md_bio_alloc_sync(mddev);
1024
1025        atomic_inc(&rdev->nr_pending);
1026
1027        bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
1028        bio->bi_iter.bi_sector = sector;
1029        bio_add_page(bio, page, size, 0);
1030        bio->bi_private = rdev;
1031        bio->bi_end_io = super_written;
1032
1033        if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
1034            test_bit(FailFast, &rdev->flags) &&
1035            !test_bit(LastDev, &rdev->flags))
1036                ff = MD_FAILFAST;
1037        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
1038
1039        atomic_inc(&mddev->pending_writes);
1040        submit_bio(bio);
1041}
1042
1043int md_super_wait(struct mddev *mddev)
1044{
1045        /* wait for all superblock writes that were scheduled to complete */
1046        wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
1047        if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
1048                return -EAGAIN;
1049        return 0;
1050}
1051
1052int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
1053                 struct page *page, int op, int op_flags, bool metadata_op)
1054{
1055        struct bio *bio = md_bio_alloc_sync(rdev->mddev);
1056        int ret;
1057
1058        if (metadata_op && rdev->meta_bdev)
1059                bio_set_dev(bio, rdev->meta_bdev);
1060        else
1061                bio_set_dev(bio, rdev->bdev);
1062        bio_set_op_attrs(bio, op, op_flags);
1063        if (metadata_op)
1064                bio->bi_iter.bi_sector = sector + rdev->sb_start;
1065        else if (rdev->mddev->reshape_position != MaxSector &&
1066                 (rdev->mddev->reshape_backwards ==
1067                  (sector >= rdev->mddev->reshape_position)))
1068                bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
1069        else
1070                bio->bi_iter.bi_sector = sector + rdev->data_offset;
1071        bio_add_page(bio, page, size, 0);
1072
1073        submit_bio_wait(bio);
1074
1075        ret = !bio->bi_status;
1076        bio_put(bio);
1077        return ret;
1078}
1079EXPORT_SYMBOL_GPL(sync_page_io);
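/*
 * Note the inverted convention: sync_page_io() returns 1 on success and
 * 0 on I/O error, unlike most kernel helpers that return 0 for success.
 */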
1080
1081static int read_disk_sb(struct md_rdev *rdev, int size)
1082{
1083        char b[BDEVNAME_SIZE];
1084
1085        if (rdev->sb_loaded)
1086                return 0;
1087
1088        if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
1089                goto fail;
1090        rdev->sb_loaded = 1;
1091        return 0;
1092
1093fail:
1094        pr_err("md: disabled device %s, could not read superblock.\n",
1095               bdevname(rdev->bdev,b));
1096        return -EINVAL;
1097}
1098
1099static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1100{
1101        return  sb1->set_uuid0 == sb2->set_uuid0 &&
1102                sb1->set_uuid1 == sb2->set_uuid1 &&
1103                sb1->set_uuid2 == sb2->set_uuid2 &&
1104                sb1->set_uuid3 == sb2->set_uuid3;
1105}
1106
1107static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1108{
1109        int ret;
1110        mdp_super_t *tmp1, *tmp2;
1111
1112        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
1113        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
1114
1115        if (!tmp1 || !tmp2) {
1116                ret = 0;
1117                goto abort;
1118        }
1119
1120        *tmp1 = *sb1;
1121        *tmp2 = *sb2;
1122
1123        /*
1124         * nr_disks is not constant
1125         */
1126        tmp1->nr_disks = 0;
1127        tmp2->nr_disks = 0;
1128
1129        ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
1130abort:
1131        kfree(tmp1);
1132        kfree(tmp2);
1133        return ret;
1134}
1135
1136static u32 md_csum_fold(u32 csum)
1137{
1138        csum = (csum & 0xffff) + (csum >> 16);
1139        return (csum & 0xffff) + (csum >> 16);
1140}
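/*
 * md_csum_fold() collapses a 32-bit sum into 16 bits (end-around carry),
 * so superblock checksums can be compared consistently across
 * architectures whose csum_partial() historically returned different
 * results.
 */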
1141
1142static unsigned int calc_sb_csum(mdp_super_t *sb)
1143{
1144        u64 newcsum = 0;
1145        u32 *sb32 = (u32*)sb;
1146        int i;
1147        unsigned int disk_csum, csum;
1148
1149        disk_csum = sb->sb_csum;
1150        sb->sb_csum = 0;
1151
1152        for (i = 0; i < MD_SB_BYTES/4 ; i++)
1153                newcsum += sb32[i];
1154        csum = (newcsum & 0xffffffff) + (newcsum>>32);
1155
1156#ifdef CONFIG_ALPHA
1157        /* This used to use csum_partial, which was wrong for several
1158         * reasons including that different results are returned on
1159         * different architectures.  It isn't critical that we get exactly
1160         * the same return value as before (we always csum_fold before
1161         * testing, and that removes any differences).  However as we
1162         * know that csum_partial always returned a 16bit value on
1163         * alphas, do a fold to maximise conformity to previous behaviour.
1164         */
1165        sb->sb_csum = md_csum_fold(disk_csum);
1166#else
1167        sb->sb_csum = disk_csum;
1168#endif
1169        return csum;
1170}
1171
1172/*
1173 * Handle superblock details.
1174 * We want to be able to handle multiple superblock formats
1175 * so we have a common interface to them all, and an array of
1176 * different handlers.
1177 * We rely on user-space to write the initial superblock, and support
1178 * reading and updating of superblocks.
1179 * Interface methods are:
1180 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
1181 *      loads and validates a superblock on dev.
1182 *      if refdev != NULL, compare superblocks on both devices
1183 *    Return:
1184 *      0 - dev has a superblock that is compatible with refdev
1185 *      1 - dev has a superblock that is compatible and newer than refdev
1186 *          so dev should be used as the refdev in future
1187 *     -EINVAL superblock incompatible or invalid
1188 *     -othererror e.g. -EIO
1189 *
1190 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
1191 *      Verify that dev is acceptable into mddev.
1192 *       The first time, mddev->raid_disks will be 0, and data from
1193 *       dev should be merged in.  Subsequent calls check that dev
1194 *       is new enough.  Return 0 or -EINVAL
1195 *
1196 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
1197 *     Update the superblock for rdev with data in mddev
1198 *     This does not write to disc.
1199 *
1200 */
1201
1202struct super_type  {
1203        char                *name;
1204        struct module       *owner;
1205        int                 (*load_super)(struct md_rdev *rdev,
1206                                          struct md_rdev *refdev,
1207                                          int minor_version);
1208        int                 (*validate_super)(struct mddev *mddev,
1209                                              struct md_rdev *rdev);
1210        void                (*sync_super)(struct mddev *mddev,
1211                                          struct md_rdev *rdev);
1212        unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
1213                                                sector_t num_sectors);
1214        int                 (*allow_new_offset)(struct md_rdev *rdev,
1215                                                unsigned long long new_offset);
1216};
1217
1218/*
1219 * Check that the given mddev has no bitmap.
1220 *
1221 * This function is called from the run method of all personalities that do not
1222 * support bitmaps. It prints an error message and returns non-zero if mddev
1223 * has a bitmap. Otherwise, it returns 0.
1224 *
1225 */
1226int md_check_no_bitmap(struct mddev *mddev)
1227{
1228        if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1229                return 0;
1230        pr_warn("%s: bitmaps are not supported for %s\n",
1231                mdname(mddev), mddev->pers->name);
1232        return 1;
1233}
1234EXPORT_SYMBOL(md_check_no_bitmap);
1235
1236/*
1237 * load_super for 0.90.0
1238 */
1239static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1240{
1241        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1242        mdp_super_t *sb;
1243        int ret;
1244        bool spare_disk = true;
1245
1246        /*
 1247         * Calculate the position of the superblock (in 512-byte sectors);
 1248         * it's at the end of the disk.
1249         *
1250         * It also happens to be a multiple of 4Kb.
1251         */
1252        rdev->sb_start = calc_dev_sboffset(rdev);
1253
1254        ret = read_disk_sb(rdev, MD_SB_BYTES);
1255        if (ret)
1256                return ret;
1257
1258        ret = -EINVAL;
1259
1260        bdevname(rdev->bdev, b);
1261        sb = page_address(rdev->sb_page);
1262
1263        if (sb->md_magic != MD_SB_MAGIC) {
1264                pr_warn("md: invalid raid superblock magic on %s\n", b);
1265                goto abort;
1266        }
1267
1268        if (sb->major_version != 0 ||
1269            sb->minor_version < 90 ||
1270            sb->minor_version > 91) {
1271                pr_warn("Bad version number %d.%d on %s\n",
1272                        sb->major_version, sb->minor_version, b);
1273                goto abort;
1274        }
1275
1276        if (sb->raid_disks <= 0)
1277                goto abort;
1278
1279        if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1280                pr_warn("md: invalid superblock checksum on %s\n", b);
1281                goto abort;
1282        }
1283
1284        rdev->preferred_minor = sb->md_minor;
1285        rdev->data_offset = 0;
1286        rdev->new_data_offset = 0;
1287        rdev->sb_size = MD_SB_BYTES;
1288        rdev->badblocks.shift = -1;
1289
1290        if (sb->level == LEVEL_MULTIPATH)
1291                rdev->desc_nr = -1;
1292        else
1293                rdev->desc_nr = sb->this_disk.number;
1294
1295        /* not spare disk, or LEVEL_MULTIPATH */
1296        if (sb->level == LEVEL_MULTIPATH ||
1297                (rdev->desc_nr >= 0 &&
1298                 rdev->desc_nr < MD_SB_DISKS &&
1299                 sb->disks[rdev->desc_nr].state &
1300                 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
1301                spare_disk = false;
1302
1303        if (!refdev) {
1304                if (!spare_disk)
1305                        ret = 1;
1306                else
1307                        ret = 0;
1308        } else {
1309                __u64 ev1, ev2;
1310                mdp_super_t *refsb = page_address(refdev->sb_page);
1311                if (!md_uuid_equal(refsb, sb)) {
1312                        pr_warn("md: %s has different UUID to %s\n",
1313                                b, bdevname(refdev->bdev,b2));
1314                        goto abort;
1315                }
1316                if (!md_sb_equal(refsb, sb)) {
1317                        pr_warn("md: %s has same UUID but different superblock to %s\n",
1318                                b, bdevname(refdev->bdev, b2));
1319                        goto abort;
1320                }
1321                ev1 = md_event(sb);
1322                ev2 = md_event(refsb);
1323
1324                if (!spare_disk && ev1 > ev2)
1325                        ret = 1;
1326                else
1327                        ret = 0;
1328        }
1329        rdev->sectors = rdev->sb_start;
1330        /* Limit to 4TB as metadata cannot record more than that.
1331         * (not needed for Linear and RAID0 as metadata doesn't
1332         * record this size)
1333         */
1334        if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1335                rdev->sectors = (sector_t)(2ULL << 32) - 2;
1336
1337        if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1338                /* "this cannot possibly happen" ... */
1339                ret = -EINVAL;
1340
1341 abort:
1342        return ret;
1343}
1344
1345/*
1346 * validate_super for 0.90.0
1347 */
1348static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1349{
1350        mdp_disk_t *desc;
1351        mdp_super_t *sb = page_address(rdev->sb_page);
1352        __u64 ev1 = md_event(sb);
1353
1354        rdev->raid_disk = -1;
1355        clear_bit(Faulty, &rdev->flags);
1356        clear_bit(In_sync, &rdev->flags);
1357        clear_bit(Bitmap_sync, &rdev->flags);
1358        clear_bit(WriteMostly, &rdev->flags);
1359
1360        if (mddev->raid_disks == 0) {
1361                mddev->major_version = 0;
1362                mddev->minor_version = sb->minor_version;
1363                mddev->patch_version = sb->patch_version;
1364                mddev->external = 0;
1365                mddev->chunk_sectors = sb->chunk_size >> 9;
1366                mddev->ctime = sb->ctime;
1367                mddev->utime = sb->utime;
1368                mddev->level = sb->level;
1369                mddev->clevel[0] = 0;
1370                mddev->layout = sb->layout;
1371                mddev->raid_disks = sb->raid_disks;
1372                mddev->dev_sectors = ((sector_t)sb->size) * 2;
1373                mddev->events = ev1;
1374                mddev->bitmap_info.offset = 0;
1375                mddev->bitmap_info.space = 0;
1376                /* bitmap can use 60 K after the 4K superblocks */
1377                mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1378                mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1379                mddev->reshape_backwards = 0;
1380
1381                if (mddev->minor_version >= 91) {
1382                        mddev->reshape_position = sb->reshape_position;
1383                        mddev->delta_disks = sb->delta_disks;
1384                        mddev->new_level = sb->new_level;
1385                        mddev->new_layout = sb->new_layout;
1386                        mddev->new_chunk_sectors = sb->new_chunk >> 9;
1387                        if (mddev->delta_disks < 0)
1388                                mddev->reshape_backwards = 1;
1389                } else {
1390                        mddev->reshape_position = MaxSector;
1391                        mddev->delta_disks = 0;
1392                        mddev->new_level = mddev->level;
1393                        mddev->new_layout = mddev->layout;
1394                        mddev->new_chunk_sectors = mddev->chunk_sectors;
1395                }
1396                if (mddev->level == 0)
1397                        mddev->layout = -1;
1398
1399                if (sb->state & (1<<MD_SB_CLEAN))
1400                        mddev->recovery_cp = MaxSector;
1401                else {
1402                        if (sb->events_hi == sb->cp_events_hi &&
1403                                sb->events_lo == sb->cp_events_lo) {
1404                                mddev->recovery_cp = sb->recovery_cp;
1405                        } else
1406                                mddev->recovery_cp = 0;
1407                }
1408
1409                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1410                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1411                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1412                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1413
1414                mddev->max_disks = MD_SB_DISKS;
1415
1416                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1417                    mddev->bitmap_info.file == NULL) {
1418                        mddev->bitmap_info.offset =
1419                                mddev->bitmap_info.default_offset;
1420                        mddev->bitmap_info.space =
1421                                mddev->bitmap_info.default_space;
1422                }
1423
1424        } else if (mddev->pers == NULL) {
1425                /* Insist on good event counter while assembling, except
1426                 * for spares (which don't need an event count) */
1427                ++ev1;
1428                if (sb->disks[rdev->desc_nr].state & (
1429                            (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1430                        if (ev1 < mddev->events)
1431                                return -EINVAL;
1432        } else if (mddev->bitmap) {
1433                /* if adding to array with a bitmap, then we can accept an
1434                 * older device ... but not too old.
1435                 */
1436                if (ev1 < mddev->bitmap->events_cleared)
1437                        return 0;
1438                if (ev1 < mddev->events)
1439                        set_bit(Bitmap_sync, &rdev->flags);
1440        } else {
1441                if (ev1 < mddev->events)
1442                        /* just a hot-add of a new device, leave raid_disk at -1 */
1443                        return 0;
1444        }
1445
1446        if (mddev->level != LEVEL_MULTIPATH) {
1447                desc = sb->disks + rdev->desc_nr;
1448
1449                if (desc->state & (1<<MD_DISK_FAULTY))
1450                        set_bit(Faulty, &rdev->flags);
1451                else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1452                            desc->raid_disk < mddev->raid_disks */) {
1453                        set_bit(In_sync, &rdev->flags);
1454                        rdev->raid_disk = desc->raid_disk;
1455                        rdev->saved_raid_disk = desc->raid_disk;
1456                } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1457                        /* active but not in sync implies recovery up to
1458                         * reshape position.  We don't know exactly where
1459                         * that is, so set to zero for now */
1460                        if (mddev->minor_version >= 91) {
1461                                rdev->recovery_offset = 0;
1462                                rdev->raid_disk = desc->raid_disk;
1463                        }
1464                }
1465                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1466                        set_bit(WriteMostly, &rdev->flags);
1467                if (desc->state & (1<<MD_DISK_FAILFAST))
1468                        set_bit(FailFast, &rdev->flags);
1469        } else /* MULTIPATH are always insync */
1470                set_bit(In_sync, &rdev->flags);
1471        return 0;
1472}
1473
1474/*
1475 * sync_super for 0.90.0
1476 */
1477static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1478{
1479        mdp_super_t *sb;
1480        struct md_rdev *rdev2;
1481        int next_spare = mddev->raid_disks;
1482
1483        /* make rdev->sb match mddev data..
1484         *
1485         * 1/ zero out disks
1486         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1487         * 3/ any empty disks < next_spare become removed
1488         *
1489         * disks[0] gets initialised to REMOVED because
1490         * we cannot be sure from other fields if it has
1491         * been initialised or not.
1492         */
1493        int i;
1494        int active=0, working=0,failed=0,spare=0,nr_disks=0;
1495
1496        rdev->sb_size = MD_SB_BYTES;
1497
1498        sb = page_address(rdev->sb_page);
1499
1500        memset(sb, 0, sizeof(*sb));
1501
1502        sb->md_magic = MD_SB_MAGIC;
1503        sb->major_version = mddev->major_version;
1504        sb->patch_version = mddev->patch_version;
1505        sb->gvalid_words  = 0; /* ignored */
1506        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1507        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1508        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1509        memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1510
1511        sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1512        sb->level = mddev->level;
1513        sb->size = mddev->dev_sectors / 2;
1514        sb->raid_disks = mddev->raid_disks;
1515        sb->md_minor = mddev->md_minor;
1516        sb->not_persistent = 0;
1517        sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1518        sb->state = 0;
1519        sb->events_hi = (mddev->events>>32);
1520        sb->events_lo = (u32)mddev->events;
1521
1522        if (mddev->reshape_position == MaxSector)
1523                sb->minor_version = 90;
1524        else {
1525                sb->minor_version = 91;
1526                sb->reshape_position = mddev->reshape_position;
1527                sb->new_level = mddev->new_level;
1528                sb->delta_disks = mddev->delta_disks;
1529                sb->new_layout = mddev->new_layout;
1530                sb->new_chunk = mddev->new_chunk_sectors << 9;
1531        }
1532        mddev->minor_version = sb->minor_version;
1533        if (mddev->in_sync)
1534        {
1535                sb->recovery_cp = mddev->recovery_cp;
1536                sb->cp_events_hi = (mddev->events>>32);
1537                sb->cp_events_lo = (u32)mddev->events;
1538                if (mddev->recovery_cp == MaxSector)
1539                        sb->state = (1<< MD_SB_CLEAN);
1540        } else
1541                sb->recovery_cp = 0;
1542
1543        sb->layout = mddev->layout;
1544        sb->chunk_size = mddev->chunk_sectors << 9;
1545
1546        if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1547                sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1548
1549        sb->disks[0].state = (1<<MD_DISK_REMOVED);
1550        rdev_for_each(rdev2, mddev) {
1551                mdp_disk_t *d;
1552                int desc_nr;
1553                int is_active = test_bit(In_sync, &rdev2->flags);
1554
1555                if (rdev2->raid_disk >= 0 &&
1556                    sb->minor_version >= 91)
1557                        /* we have nowhere to store the recovery_offset,
1558                         * but if it is not below the reshape_position,
1559                         * we can piggy-back on that.
1560                         */
1561                        is_active = 1;
1562                if (rdev2->raid_disk < 0 ||
1563                    test_bit(Faulty, &rdev2->flags))
1564                        is_active = 0;
1565                if (is_active)
1566                        desc_nr = rdev2->raid_disk;
1567                else
1568                        desc_nr = next_spare++;
1569                rdev2->desc_nr = desc_nr;
1570                d = &sb->disks[rdev2->desc_nr];
1571                nr_disks++;
1572                d->number = rdev2->desc_nr;
1573                d->major = MAJOR(rdev2->bdev->bd_dev);
1574                d->minor = MINOR(rdev2->bdev->bd_dev);
1575                if (is_active)
1576                        d->raid_disk = rdev2->raid_disk;
1577                else
1578                        d->raid_disk = rdev2->desc_nr; /* compatibility */
1579                if (test_bit(Faulty, &rdev2->flags))
1580                        d->state = (1<<MD_DISK_FAULTY);
1581                else if (is_active) {
1582                        d->state = (1<<MD_DISK_ACTIVE);
1583                        if (test_bit(In_sync, &rdev2->flags))
1584                                d->state |= (1<<MD_DISK_SYNC);
1585                        active++;
1586                        working++;
1587                } else {
1588                        d->state = 0;
1589                        spare++;
1590                        working++;
1591                }
1592                if (test_bit(WriteMostly, &rdev2->flags))
1593                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
1594                if (test_bit(FailFast, &rdev2->flags))
1595                        d->state |= (1<<MD_DISK_FAILFAST);
1596        }
1597        /* now set the "removed" and "faulty" bits on any missing devices */
1598        for (i=0 ; i < mddev->raid_disks ; i++) {
1599                mdp_disk_t *d = &sb->disks[i];
1600                if (d->state == 0 && d->number == 0) {
1601                        d->number = i;
1602                        d->raid_disk = i;
1603                        d->state = (1<<MD_DISK_REMOVED);
1604                        d->state |= (1<<MD_DISK_FAULTY);
1605                        failed++;
1606                }
1607        }
1608        sb->nr_disks = nr_disks;
1609        sb->active_disks = active;
1610        sb->working_disks = working;
1611        sb->failed_disks = failed;
1612        sb->spare_disks = spare;
1613
1614        sb->this_disk = sb->disks[rdev->desc_nr];
1615        sb->sb_csum = calc_sb_csum(sb);
1616}
1617
1618/*
1619 * rdev_size_change for 0.90.0
1620 */
1621static unsigned long long
1622super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1623{
1624        if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1625                return 0; /* component must fit device */
1626        if (rdev->mddev->bitmap_info.offset)
1627                return 0; /* can't move bitmap */
1628        rdev->sb_start = calc_dev_sboffset(rdev);
1629        if (!num_sectors || num_sectors > rdev->sb_start)
1630                num_sectors = rdev->sb_start;
1631        /* Limit to 4TB as metadata cannot record more than that.
1632         * 4TB == 2^32 KB, or 2*2^32 sectors.
1633         */
1634        if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1635                num_sectors = (sector_t)(2ULL << 32) - 2;
1636        do {
1637                md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1638                       rdev->sb_page);
1639        } while (md_super_wait(rdev->mddev) < 0);
1640        return num_sectors;
1641}
1642
1643static int
1644super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1645{
1646        /* non-zero offset changes not possible with v0.90 */
1647        return new_offset == 0;
1648}
1649
1650/*
1651 * version 1 superblock
1652 */
1653
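/*
 * Checksum a v1.x superblock: sum every little-endian 32-bit word of the
 * superblock (treating sb_csum itself as zero), including the dev_roles
 * array, then fold the 64-bit sum down to 32 bits.
 */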
1654static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1655{
1656        __le32 disk_csum;
1657        u32 csum;
1658        unsigned long long newcsum;
1659        int size = 256 + le32_to_cpu(sb->max_dev)*2;
1660        __le32 *isuper = (__le32*)sb;
1661
1662        disk_csum = sb->sb_csum;
1663        sb->sb_csum = 0;
1664        newcsum = 0;
1665        for (; size >= 4; size -= 4)
1666                newcsum += le32_to_cpu(*isuper++);
1667
1668        if (size == 2)
1669                newcsum += le16_to_cpu(*(__le16*) isuper);
1670
1671        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1672        sb->sb_csum = disk_csum;
1673        return cpu_to_le32(csum);
1674}
1675
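/*
 * Load and sanity-check a 1.x superblock from @rdev.  The superblock
 * location depends on @minor_version: near the end of the device, at the
 * start, or 4K from the start.  Returns 1 if this device's superblock
 * should be preferred (it is not a bare spare and either there is no
 * @refdev or its event count is newer), 0 otherwise, or a negative errno
 * on failure.
 */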
1676static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1677{
1678        struct mdp_superblock_1 *sb;
1679        int ret;
1680        sector_t sb_start;
1681        sector_t sectors;
1682        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1683        int bmask;
1684        bool spare_disk = true;
1685
1686        /*
1687         * Calculate the position of the superblock in 512-byte sectors.
1688         * It is always aligned to a 4K boundary and,
1689         * depending on minor_version, it can be:
1690         * 0: At least 8K, but less than 12K, from end of device
1691         * 1: At start of device
1692         * 2: 4K from start of device.
1693         */
1694        switch(minor_version) {
1695        case 0:
1696                sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1697                sb_start -= 8*2;
1698                sb_start &= ~(sector_t)(4*2-1);
1699                break;
1700        case 1:
1701                sb_start = 0;
1702                break;
1703        case 2:
1704                sb_start = 8;
1705                break;
1706        default:
1707                return -EINVAL;
1708        }
1709        rdev->sb_start = sb_start;
1710
1711        /* superblock is rarely larger than 1K, but it can be larger,
1712         * and it is safe to read 4k, so we do that
1713         */
1714        ret = read_disk_sb(rdev, 4096);
1715        if (ret) return ret;
1716
1717        sb = page_address(rdev->sb_page);
1718
1719        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1720            sb->major_version != cpu_to_le32(1) ||
1721            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1722            le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1723            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1724                return -EINVAL;
1725
1726        if (calc_sb_1_csum(sb) != sb->sb_csum) {
1727                pr_warn("md: invalid superblock checksum on %s\n",
1728                        bdevname(rdev->bdev,b));
1729                return -EINVAL;
1730        }
1731        if (le64_to_cpu(sb->data_size) < 10) {
1732                pr_warn("md: data_size too small on %s\n",
1733                        bdevname(rdev->bdev,b));
1734                return -EINVAL;
1735        }
1736        if (sb->pad0 ||
1737            sb->pad3[0] ||
1738            memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1739                /* Some padding is non-zero, might be a new feature */
1740                return -EINVAL;
1741
1742        rdev->preferred_minor = 0xffff;
1743        rdev->data_offset = le64_to_cpu(sb->data_offset);
1744        rdev->new_data_offset = rdev->data_offset;
1745        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1746            (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1747                rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1748        atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1749
1750        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1751        bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1752        if (rdev->sb_size & bmask)
1753                rdev->sb_size = (rdev->sb_size | bmask) + 1;
1754
1755        if (minor_version
1756            && rdev->data_offset < sb_start + (rdev->sb_size/512))
1757                return -EINVAL;
1758        if (minor_version
1759            && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1760                return -EINVAL;
1761
1762        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1763                rdev->desc_nr = -1;
1764        else
1765                rdev->desc_nr = le32_to_cpu(sb->dev_number);
1766
1767        if (!rdev->bb_page) {
1768                rdev->bb_page = alloc_page(GFP_KERNEL);
1769                if (!rdev->bb_page)
1770                        return -ENOMEM;
1771        }
1772        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1773            rdev->badblocks.count == 0) {
1774                /* need to load the bad block list.
1775                 * Currently we limit it to one page.
1776                 */
1777                s32 offset;
1778                sector_t bb_sector;
1779                __le64 *bbp;
1780                int i;
1781                int sectors = le16_to_cpu(sb->bblog_size);
1782                if (sectors > (PAGE_SIZE / 512))
1783                        return -EINVAL;
1784                offset = le32_to_cpu(sb->bblog_offset);
1785                if (offset == 0)
1786                        return -EINVAL;
1787                bb_sector = (long long)offset;
1788                if (!sync_page_io(rdev, bb_sector, sectors << 9,
1789                                  rdev->bb_page, REQ_OP_READ, 0, true))
1790                        return -EIO;
1791                bbp = (__le64 *)page_address(rdev->bb_page);
1792                rdev->badblocks.shift = sb->bblog_shift;
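                /*
                 * Each 64-bit entry packs the start sector in the high 54
                 * bits and the length in the low 10 bits, both scaled by
                 * bblog_shift.  An all-ones entry terminates the list.
                 */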
1793                for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1794                        u64 bb = le64_to_cpu(*bbp);
1795                        int count = bb & (0x3ff);
1796                        u64 sector = bb >> 10;
1797                        sector <<= sb->bblog_shift;
1798                        count <<= sb->bblog_shift;
1799                        if (bb + 1 == 0)
1800                                break;
1801                        if (badblocks_set(&rdev->badblocks, sector, count, 1))
1802                                return -EINVAL;
1803                }
1804        } else if (sb->bblog_offset != 0)
1805                rdev->badblocks.shift = 0;
1806
1807        if ((le32_to_cpu(sb->feature_map) &
1808            (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1809                rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1810                rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1811                rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1812        }
1813
1814        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1815            sb->level != 0)
1816                return -EINVAL;
1817
1818        /* not spare disk, or LEVEL_MULTIPATH */
1819        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
1820                (rdev->desc_nr >= 0 &&
1821                rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1822                (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1823                 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
1824                spare_disk = false;
1825
1826        if (!refdev) {
1827                if (!spare_disk)
1828                        ret = 1;
1829                else
1830                        ret = 0;
1831        } else {
1832                __u64 ev1, ev2;
1833                struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1834
1835                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1836                    sb->level != refsb->level ||
1837                    sb->layout != refsb->layout ||
1838                    sb->chunksize != refsb->chunksize) {
1839                        pr_warn("md: %s has strangely different superblock to %s\n",
1840                                bdevname(rdev->bdev,b),
1841                                bdevname(refdev->bdev,b2));
1842                        return -EINVAL;
1843                }
1844                ev1 = le64_to_cpu(sb->events);
1845                ev2 = le64_to_cpu(refsb->events);
1846
1847                if (!spare_disk && ev1 > ev2)
1848                        ret = 1;
1849                else
1850                        ret = 0;
1851        }
1852        if (minor_version) {
1853                sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1854                sectors -= rdev->data_offset;
1855        } else
1856                sectors = rdev->sb_start;
1857        if (sectors < le64_to_cpu(sb->data_size))
1858                return -EINVAL;
1859        rdev->sectors = le64_to_cpu(sb->data_size);
1860        return ret;
1861}
1862
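/*
 * validate_super for 1.x: if the array is still being assembled
 * (mddev->raid_disks == 0), copy the array-wide fields from this
 * superblock into the mddev; otherwise only sanity-check the event
 * count.  Then set this rdev's role and flags from dev_roles[].
 */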
1863static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1864{
1865        struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1866        __u64 ev1 = le64_to_cpu(sb->events);
1867
1868        rdev->raid_disk = -1;
1869        clear_bit(Faulty, &rdev->flags);
1870        clear_bit(In_sync, &rdev->flags);
1871        clear_bit(Bitmap_sync, &rdev->flags);
1872        clear_bit(WriteMostly, &rdev->flags);
1873
1874        if (mddev->raid_disks == 0) {
1875                mddev->major_version = 1;
1876                mddev->patch_version = 0;
1877                mddev->external = 0;
1878                mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1879                mddev->ctime = le64_to_cpu(sb->ctime);
1880                mddev->utime = le64_to_cpu(sb->utime);
1881                mddev->level = le32_to_cpu(sb->level);
1882                mddev->clevel[0] = 0;
1883                mddev->layout = le32_to_cpu(sb->layout);
1884                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1885                mddev->dev_sectors = le64_to_cpu(sb->size);
1886                mddev->events = ev1;
1887                mddev->bitmap_info.offset = 0;
1888                mddev->bitmap_info.space = 0;
1889                /* Default location for bitmap is 1K after superblock
1890                 * using 3K - total of 4K
1891                 */
1892                mddev->bitmap_info.default_offset = 1024 >> 9;
1893                mddev->bitmap_info.default_space = (4096-1024) >> 9;
1894                mddev->reshape_backwards = 0;
1895
1896                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1897                memcpy(mddev->uuid, sb->set_uuid, 16);
1898
1899                mddev->max_disks =  (4096-256)/2;
1900
1901                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1902                    mddev->bitmap_info.file == NULL) {
1903                        mddev->bitmap_info.offset =
1904                                (__s32)le32_to_cpu(sb->bitmap_offset);
1905                        /* Metadata doesn't record how much space is available.
1906                         * For 1.0, we assume we can use up to the superblock
1907                         * if the bitmap is before it, else up to 4K beyond the superblock.
1908                         * For others, assume no change is possible.
1909                         */
1910                        if (mddev->minor_version > 0)
1911                                mddev->bitmap_info.space = 0;
1912                        else if (mddev->bitmap_info.offset > 0)
1913                                mddev->bitmap_info.space =
1914                                        8 - mddev->bitmap_info.offset;
1915                        else
1916                                mddev->bitmap_info.space =
1917                                        -mddev->bitmap_info.offset;
1918                }
1919
1920                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1921                        mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1922                        mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1923                        mddev->new_level = le32_to_cpu(sb->new_level);
1924                        mddev->new_layout = le32_to_cpu(sb->new_layout);
1925                        mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1926                        if (mddev->delta_disks < 0 ||
1927                            (mddev->delta_disks == 0 &&
1928                             (le32_to_cpu(sb->feature_map)
1929                              & MD_FEATURE_RESHAPE_BACKWARDS)))
1930                                mddev->reshape_backwards = 1;
1931                } else {
1932                        mddev->reshape_position = MaxSector;
1933                        mddev->delta_disks = 0;
1934                        mddev->new_level = mddev->level;
1935                        mddev->new_layout = mddev->layout;
1936                        mddev->new_chunk_sectors = mddev->chunk_sectors;
1937                }
1938
1939                if (mddev->level == 0 &&
1940                    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
1941                        mddev->layout = -1;
1942
1943                if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1944                        set_bit(MD_HAS_JOURNAL, &mddev->flags);
1945
1946                if (le32_to_cpu(sb->feature_map) &
1947                    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1948                        if (le32_to_cpu(sb->feature_map) &
1949                            (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1950                                return -EINVAL;
1951                        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1952                            (le32_to_cpu(sb->feature_map) &
1953                                            MD_FEATURE_MULTIPLE_PPLS))
1954                                return -EINVAL;
1955                        set_bit(MD_HAS_PPL, &mddev->flags);
1956                }
1957        } else if (mddev->pers == NULL) {
1958                /* Insist on a good event counter while assembling, except for
1959                 * spares (which don't need an event count) */
1960                ++ev1;
1961                if (rdev->desc_nr >= 0 &&
1962                    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1963                    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1964                     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1965                        if (ev1 < mddev->events)
1966                                return -EINVAL;
1967        } else if (mddev->bitmap) {
1968                /* If adding to array with a bitmap, then we can accept an
1969                 * older device, but not too old.
1970                 */
1971                if (ev1 < mddev->bitmap->events_cleared)
1972                        return 0;
1973                if (ev1 < mddev->events)
1974                        set_bit(Bitmap_sync, &rdev->flags);
1975        } else {
1976                if (ev1 < mddev->events)
1977                        /* just a hot-add of a new device, leave raid_disk at -1 */
1978                        return 0;
1979        }
1980        if (mddev->level != LEVEL_MULTIPATH) {
1981                int role;
1982                if (rdev->desc_nr < 0 ||
1983                    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1984                        role = MD_DISK_ROLE_SPARE;
1985                        rdev->desc_nr = -1;
1986                } else
1987                        role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1988                switch(role) {
1989                case MD_DISK_ROLE_SPARE: /* spare */
1990                        break;
1991                case MD_DISK_ROLE_FAULTY: /* faulty */
1992                        set_bit(Faulty, &rdev->flags);
1993                        break;
1994                case MD_DISK_ROLE_JOURNAL: /* journal device */
1995                        if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1996                                /* journal device without journal feature */
1997                                pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1998                                return -EINVAL;
1999                        }
2000                        set_bit(Journal, &rdev->flags);
2001                        rdev->journal_tail = le64_to_cpu(sb->journal_tail);
2002                        rdev->raid_disk = 0;
2003                        break;
2004                default:
2005                        rdev->saved_raid_disk = role;
2006                        if ((le32_to_cpu(sb->feature_map) &
2007                             MD_FEATURE_RECOVERY_OFFSET)) {
2008                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
2009                                if (!(le32_to_cpu(sb->feature_map) &
2010                                      MD_FEATURE_RECOVERY_BITMAP))
2011                                        rdev->saved_raid_disk = -1;
2012                        } else {
2013                                /*
2014                                 * If the array is FROZEN, then the device can't
2015                                 * be in_sync with the rest of the array.
2016                                 */
2017                                if (!test_bit(MD_RECOVERY_FROZEN,
2018                                              &mddev->recovery))
2019                                        set_bit(In_sync, &rdev->flags);
2020                        }
2021                        rdev->raid_disk = role;
2022                        break;
2023                }
2024                if (sb->devflags & WriteMostly1)
2025                        set_bit(WriteMostly, &rdev->flags);
2026                if (sb->devflags & FailFast1)
2027                        set_bit(FailFast, &rdev->flags);
2028                if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
2029                        set_bit(Replacement, &rdev->flags);
2030        } else /* MULTIPATH are always insync */
2031                set_bit(In_sync, &rdev->flags);
2032
2033        return 0;
2034}
2035
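/*
 * sync_super for 1.x: refresh @rdev's in-memory superblock image from the
 * current mddev and rdev state (feature map, geometry, bitmap and PPL
 * offsets, bad-block log, and the dev_roles[] table) so it is ready to be
 * written out.
 */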
2036static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
2037{
2038        struct mdp_superblock_1 *sb;
2039        struct md_rdev *rdev2;
2040        int max_dev, i;
2041        /* make rdev->sb match mddev and rdev data. */
2042
2043        sb = page_address(rdev->sb_page);
2044
2045        sb->feature_map = 0;
2046        sb->pad0 = 0;
2047        sb->recovery_offset = cpu_to_le64(0);
2048        memset(sb->pad3, 0, sizeof(sb->pad3));
2049
2050        sb->utime = cpu_to_le64((__u64)mddev->utime);
2051        sb->events = cpu_to_le64(mddev->events);
2052        if (mddev->in_sync)
2053                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
2054        else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
2055                sb->resync_offset = cpu_to_le64(MaxSector);
2056        else
2057                sb->resync_offset = cpu_to_le64(0);
2058
2059        sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
2060
2061        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
2062        sb->size = cpu_to_le64(mddev->dev_sectors);
2063        sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
2064        sb->level = cpu_to_le32(mddev->level);
2065        sb->layout = cpu_to_le32(mddev->layout);
2066        if (test_bit(FailFast, &rdev->flags))
2067                sb->devflags |= FailFast1;
2068        else
2069                sb->devflags &= ~FailFast1;
2070
2071        if (test_bit(WriteMostly, &rdev->flags))
2072                sb->devflags |= WriteMostly1;
2073        else
2074                sb->devflags &= ~WriteMostly1;
2075        sb->data_offset = cpu_to_le64(rdev->data_offset);
2076        sb->data_size = cpu_to_le64(rdev->sectors);
2077
2078        if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
2079                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
2080                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
2081        }
2082
2083        if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
2084            !test_bit(In_sync, &rdev->flags)) {
2085                sb->feature_map |=
2086                        cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
2087                sb->recovery_offset =
2088                        cpu_to_le64(rdev->recovery_offset);
2089                if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
2090                        sb->feature_map |=
2091                                cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
2092        }
2093        /* Note: recovery_offset and journal_tail share space  */
2094        if (test_bit(Journal, &rdev->flags))
2095                sb->journal_tail = cpu_to_le64(rdev->journal_tail);
2096        if (test_bit(Replacement, &rdev->flags))
2097                sb->feature_map |=
2098                        cpu_to_le32(MD_FEATURE_REPLACEMENT);
2099
2100        if (mddev->reshape_position != MaxSector) {
2101                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
2102                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
2103                sb->new_layout = cpu_to_le32(mddev->new_layout);
2104                sb->delta_disks = cpu_to_le32(mddev->delta_disks);
2105                sb->new_level = cpu_to_le32(mddev->new_level);
2106                sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
2107                if (mddev->delta_disks == 0 &&
2108                    mddev->reshape_backwards)
2109                        sb->feature_map
2110                                |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
2111                if (rdev->new_data_offset != rdev->data_offset) {
2112                        sb->feature_map
2113                                |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
2114                        sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
2115                                                             - rdev->data_offset));
2116                }
2117        }
2118
2119        if (mddev_is_clustered(mddev))
2120                sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
2121
2122        if (rdev->badblocks.count == 0)
2123                /* Nothing to do for bad blocks */ ;
2124        else if (sb->bblog_offset == 0)
2125                /* Cannot record bad blocks on this device */
2126                md_error(mddev, rdev);
2127        else {
2128                struct badblocks *bb = &rdev->badblocks;
2129                __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
2130                u64 *p = bb->page;
2131                sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
2132                if (bb->changed) {
2133                        unsigned seq;
2134
2135retry:
2136                        seq = read_seqbegin(&bb->lock);
2137
2138                        memset(bbp, 0xff, PAGE_SIZE);
2139
2140                        for (i = 0 ; i < bb->count ; i++) {
2141                                u64 internal_bb = p[i];
2142                                u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2143                                                | BB_LEN(internal_bb));
2144                                bbp[i] = cpu_to_le64(store_bb);
2145                        }
2146                        bb->changed = 0;
2147                        if (read_seqretry(&bb->lock, seq))
2148                                goto retry;
2149
2150                        bb->sector = (rdev->sb_start +
2151                                      (int)le32_to_cpu(sb->bblog_offset));
2152                        bb->size = le16_to_cpu(sb->bblog_size);
2153                }
2154        }
2155
2156        max_dev = 0;
2157        rdev_for_each(rdev2, mddev)
2158                if (rdev2->desc_nr+1 > max_dev)
2159                        max_dev = rdev2->desc_nr+1;
2160
2161        if (max_dev > le32_to_cpu(sb->max_dev)) {
2162                int bmask;
2163                sb->max_dev = cpu_to_le32(max_dev);
2164                rdev->sb_size = max_dev * 2 + 256;
2165                bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2166                if (rdev->sb_size & bmask)
2167                        rdev->sb_size = (rdev->sb_size | bmask) + 1;
2168        } else
2169                max_dev = le32_to_cpu(sb->max_dev);
2170
2171        for (i=0; i<max_dev;i++)
2172                sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2173
2174        if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2175                sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2176
2177        if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2178                if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2179                        sb->feature_map |=
2180                            cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2181                else
2182                        sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2183                sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2184                sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2185        }
2186
2187        rdev_for_each(rdev2, mddev) {
2188                i = rdev2->desc_nr;
2189                if (test_bit(Faulty, &rdev2->flags))
2190                        sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2191                else if (test_bit(In_sync, &rdev2->flags))
2192                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2193                else if (test_bit(Journal, &rdev2->flags))
2194                        sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2195                else if (rdev2->raid_disk >= 0)
2196                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2197                else
2198                        sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2199        }
2200
2201        sb->sb_csum = calc_sb_1_csum(sb);
2202}
2203
2204static sector_t super_1_choose_bm_space(sector_t dev_size)
2205{
2206        sector_t bm_space;
2207
2208        /* if the device is bigger than 8Gig, save 64k for bitmap
2209         * usage; if bigger than 200Gig, save 128k
2210         */
2211        if (dev_size < 64*2)
2212                bm_space = 0;
2213        else if (dev_size - 64*2 >= 200*1024*1024*2)
2214                bm_space = 128*2;
2215        else if (dev_size - 4*2 > 8*1024*1024*2)
2216                bm_space = 64*2;
2217        else
2218                bm_space = 4*2;
2219        return bm_space;
2220}
2221
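/*
 * rdev_size_change for 1.x metadata
 */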
2222static unsigned long long
2223super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2224{
2225        struct mdp_superblock_1 *sb;
2226        sector_t max_sectors;
2227        if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2228                return 0; /* component must fit device */
2229        if (rdev->data_offset != rdev->new_data_offset)
2230                return 0; /* too confusing */
2231        if (rdev->sb_start < rdev->data_offset) {
2232                /* minor versions 1 and 2; superblock before data */
2233                max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
2234                max_sectors -= rdev->data_offset;
2235                if (!num_sectors || num_sectors > max_sectors)
2236                        num_sectors = max_sectors;
2237        } else if (rdev->mddev->bitmap_info.offset) {
2238                /* minor version 0 with bitmap we can't move */
2239                return 0;
2240        } else {
2241                /* minor version 0; superblock after data */
2242                sector_t sb_start, bm_space;
2243                sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9;
2244
2245                /* 8K is for superblock */
2246                sb_start = dev_size - 8*2;
2247                sb_start &= ~(sector_t)(4*2 - 1);
2248
2249                bm_space = super_1_choose_bm_space(dev_size);
2250
2251                /* Space that can be used to store data must leave room for
2252                 * the superblock, the bitmap space and the bad block space (4K)
2253                 */
2254                max_sectors = sb_start - bm_space - 4*2;
2255
2256                if (!num_sectors || num_sectors > max_sectors)
2257                        num_sectors = max_sectors;
2258        }
2259        sb = page_address(rdev->sb_page);
2260        sb->data_size = cpu_to_le64(num_sectors);
2261        sb->super_offset = cpu_to_le64(rdev->sb_start);
2262        sb->sb_csum = calc_sb_1_csum(sb);
2263        do {
2264                md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2265                               rdev->sb_page);
2266        } while (md_super_wait(rdev->mddev) < 0);
2267        return num_sectors;
2268
2269}
2270
2271static int
2272super_1_allow_new_offset(struct md_rdev *rdev,
2273                         unsigned long long new_offset)
2274{
2275        /* All necessary checks on new >= old have been done */
2276        struct bitmap *bitmap;
2277        if (new_offset >= rdev->data_offset)
2278                return 1;
2279
2280        /* with 1.0 metadata, there is no metadata to tread on
2281         * so we can always move back */
2282        if (rdev->mddev->minor_version == 0)
2283                return 1;
2284
2285        /* otherwise we must be sure not to step on
2286         * any metadata, so stay:
2287         * 36K beyond start of superblock
2288         * beyond end of badblocks
2289         * beyond write-intent bitmap
2290         */
2291        if (rdev->sb_start + (32+4)*2 > new_offset)
2292                return 0;
2293        bitmap = rdev->mddev->bitmap;
2294        if (bitmap && !rdev->mddev->bitmap_info.file &&
2295            rdev->sb_start + rdev->mddev->bitmap_info.offset +
2296            bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2297                return 0;
2298        if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2299                return 0;
2300
2301        return 1;
2302}
2303
2304static struct super_type super_types[] = {
2305        [0] = {
2306                .name   = "0.90.0",
2307                .owner  = THIS_MODULE,
2308                .load_super         = super_90_load,
2309                .validate_super     = super_90_validate,
2310                .sync_super         = super_90_sync,
2311                .rdev_size_change   = super_90_rdev_size_change,
2312                .allow_new_offset   = super_90_allow_new_offset,
2313        },
2314        [1] = {
2315                .name   = "md-1",
2316                .owner  = THIS_MODULE,
2317                .load_super         = super_1_load,
2318                .validate_super     = super_1_validate,
2319                .sync_super         = super_1_sync,
2320                .rdev_size_change   = super_1_rdev_size_change,
2321                .allow_new_offset   = super_1_allow_new_offset,
2322        },
2323};
2324
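/*
 * Dispatch superblock sync: use the mddev's own sync_super hook if one is
 * registered, otherwise the handler for its metadata major version.
 */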
2325static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2326{
2327        if (mddev->sync_super) {
2328                mddev->sync_super(mddev, rdev);
2329                return;
2330        }
2331
2332        BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2333
2334        super_types[mddev->major_version].sync_super(mddev, rdev);
2335}
2336
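/*
 * Return 1 if the two arrays share a physical disk.  Faulty, journal and
 * unassigned (raid_disk == -1) members are ignored.
 */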
2337static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2338{
2339        struct md_rdev *rdev, *rdev2;
2340
2341        rcu_read_lock();
2342        rdev_for_each_rcu(rdev, mddev1) {
2343                if (test_bit(Faulty, &rdev->flags) ||
2344                    test_bit(Journal, &rdev->flags) ||
2345                    rdev->raid_disk == -1)
2346                        continue;
2347                rdev_for_each_rcu(rdev2, mddev2) {
2348                        if (test_bit(Faulty, &rdev2->flags) ||
2349                            test_bit(Journal, &rdev2->flags) ||
2350                            rdev2->raid_disk == -1)
2351                                continue;
2352                        if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2353                                rcu_read_unlock();
2354                                return 1;
2355                        }
2356                }
2357        }
2358        rcu_read_unlock();
2359        return 0;
2360}
2361
2362static LIST_HEAD(pending_raid_disks);
2363
2364/*
2365 * Try to register data integrity profile for an mddev
2366 *
2367 * This is called when an array is started and after a disk has been kicked
2368 * from the array. It only succeeds if all working and active component devices
2369 * are integrity capable with matching profiles.
2370 */
2371int md_integrity_register(struct mddev *mddev)
2372{
2373        struct md_rdev *rdev, *reference = NULL;
2374
2375        if (list_empty(&mddev->disks))
2376                return 0; /* nothing to do */
2377        if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2378                return 0; /* shouldn't register, or already is */
2379        rdev_for_each(rdev, mddev) {
2380                /* skip spares and non-functional disks */
2381                if (test_bit(Faulty, &rdev->flags))
2382                        continue;
2383                if (rdev->raid_disk < 0)
2384                        continue;
2385                if (!reference) {
2386                        /* Use the first rdev as the reference */
2387                        reference = rdev;
2388                        continue;
2389                }
2390                /* does this rdev's profile match the reference profile? */
2391                if (blk_integrity_compare(reference->bdev->bd_disk,
2392                                rdev->bdev->bd_disk) < 0)
2393                        return -EINVAL;
2394        }
2395        if (!reference || !bdev_get_integrity(reference->bdev))
2396                return 0;
2397        /*
2398         * All component devices are integrity capable and have matching
2399         * profiles, register the common profile for the md device.
2400         */
2401        blk_integrity_register(mddev->gendisk,
2402                               bdev_get_integrity(reference->bdev));
2403
2404        pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2405        if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
2406                pr_err("md: failed to create integrity pool for %s\n",
2407                       mdname(mddev));
2408                return -EINVAL;
2409        }
2410        return 0;
2411}
2412EXPORT_SYMBOL(md_integrity_register);
2413
2414/*
2415 * Attempt to add an rdev, but only if it is consistent with the current
2416 * integrity profile
2417 */
2418int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2419{
2420        struct blk_integrity *bi_mddev;
2421        char name[BDEVNAME_SIZE];
2422
2423        if (!mddev->gendisk)
2424                return 0;
2425
2426        bi_mddev = blk_get_integrity(mddev->gendisk);
2427
2428        if (!bi_mddev) /* nothing to do */
2429                return 0;
2430
2431        if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2432                pr_err("%s: incompatible integrity profile for %s\n",
2433                       mdname(mddev), bdevname(rdev->bdev, name));
2434                return -ENXIO;
2435        }
2436
2437        return 0;
2438}
2439EXPORT_SYMBOL(md_integrity_add_rdev);
2440
2441static bool rdev_read_only(struct md_rdev *rdev)
2442{
2443        return bdev_read_only(rdev->bdev) ||
2444                (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2445}
2446
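/*
 * Attach @rdev to @mddev: reject duplicates and read-only devices on a
 * running array, choose a unique desc_nr, create the sysfs entries and
 * add the device to the array's disk list.
 */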
2447static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2448{
2449        char b[BDEVNAME_SIZE];
2450        int err;
2451
2452        /* prevent duplicates */
2453        if (find_rdev(mddev, rdev->bdev->bd_dev))
2454                return -EEXIST;
2455
2456        if (rdev_read_only(rdev) && mddev->pers)
2457                return -EROFS;
2458
2459        /* make sure rdev->sectors exceeds mddev->dev_sectors */
2460        if (!test_bit(Journal, &rdev->flags) &&
2461            rdev->sectors &&
2462            (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2463                if (mddev->pers) {
2464                        /* Cannot change size, so fail
2465                         * If mddev->level <= 0, then we don't care
2466                         * about aligning sizes (e.g. linear)
2467                         */
2468                        if (mddev->level > 0)
2469                                return -ENOSPC;
2470                } else
2471                        mddev->dev_sectors = rdev->sectors;
2472        }
2473
2474        /* Verify rdev->desc_nr is unique.
2475         * If it is -1, assign a free number, else
2476         * check that the number is not in use
2477         */
2478        rcu_read_lock();
2479        if (rdev->desc_nr < 0) {
2480                int choice = 0;
2481                if (mddev->pers)
2482                        choice = mddev->raid_disks;
2483                while (md_find_rdev_nr_rcu(mddev, choice))
2484                        choice++;
2485                rdev->desc_nr = choice;
2486        } else {
2487                if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2488                        rcu_read_unlock();
2489                        return -EBUSY;
2490                }
2491        }
2492        rcu_read_unlock();
2493        if (!test_bit(Journal, &rdev->flags) &&
2494            mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2495                pr_warn("md: %s: array is limited to %d devices\n",
2496                        mdname(mddev), mddev->max_disks);
2497                return -EBUSY;
2498        }
2499        bdevname(rdev->bdev,b);
2500        strreplace(b, '/', '!');
2501
2502        rdev->mddev = mddev;
2503        pr_debug("md: bind<%s>\n", b);
2504
2505        if (mddev->raid_disks)
2506                mddev_create_serial_pool(mddev, rdev, false);
2507
2508        if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2509                goto fail;
2510
2511        /* failure here is OK */
2512        err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
2513        rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2514        rdev->sysfs_unack_badblocks =
2515                sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2516        rdev->sysfs_badblocks =
2517                sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2518
2519        list_add_rcu(&rdev->same_set, &mddev->disks);
2520        bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2521
2522        /* May as well allow recovery to be retried once */
2523        mddev->recovery_disabled++;
2524
2525        return 0;
2526
2527 fail:
2528        pr_warn("md: failed to register dev-%s for %s\n",
2529                b, mdname(mddev));
2530        return err;
2531}
2532
2533static void rdev_delayed_delete(struct work_struct *ws)
2534{
2535        struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2536        kobject_del(&rdev->kobj);
2537        kobject_put(&rdev->kobj);
2538}
2539
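/*
 * Detach @rdev from its array: remove it from the disk list, drop the
 * sysfs entries and schedule the final kobject removal from a work item,
 * since deleting it directly could deadlock against sysfs writes.
 */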
2540static void unbind_rdev_from_array(struct md_rdev *rdev)
2541{
2542        char b[BDEVNAME_SIZE];
2543
2544        bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2545        list_del_rcu(&rdev->same_set);
2546        pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
2547        mddev_destroy_serial_pool(rdev->mddev, rdev, false);
2548        rdev->mddev = NULL;
2549        sysfs_remove_link(&rdev->kobj, "block");
2550        sysfs_put(rdev->sysfs_state);
2551        sysfs_put(rdev->sysfs_unack_badblocks);
2552        sysfs_put(rdev->sysfs_badblocks);
2553        rdev->sysfs_state = NULL;
2554        rdev->sysfs_unack_badblocks = NULL;
2555        rdev->sysfs_badblocks = NULL;
2556        rdev->badblocks.count = 0;
2557        /* We need to delay this, otherwise we can deadlock when
2558         * writing 'remove' to "dev/state".  We also need
2559         * to delay it due to rcu usage.
2560         */
2561        synchronize_rcu();
2562        INIT_WORK(&rdev->del_work, rdev_delayed_delete);
2563        kobject_get(&rdev->kobj);
2564        queue_work(md_rdev_misc_wq, &rdev->del_work);
2565}
2566
2567/*
2568 * prevent the device from being mounted, repartitioned or
2569 * otherwise reused by a RAID array (or any other kernel
2570 * subsystem), by bd_claiming the device.
2571 */
2572static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2573{
2574        int err = 0;
2575        struct block_device *bdev;
2576
2577        bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2578                                 shared ? (struct md_rdev *)lock_rdev : rdev);
2579        if (IS_ERR(bdev)) {
2580                pr_warn("md: could not open device unknown-block(%u,%u).\n",
2581                        MAJOR(dev), MINOR(dev));
2582                return PTR_ERR(bdev);
2583        }
2584        rdev->bdev = bdev;
2585        return err;
2586}
2587
2588static void unlock_rdev(struct md_rdev *rdev)
2589{
2590        struct block_device *bdev = rdev->bdev;
2591        rdev->bdev = NULL;
2592        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2593}
2594
2595void md_autodetect_dev(dev_t dev);
2596
2597static void export_rdev(struct md_rdev *rdev)
2598{
2599        char b[BDEVNAME_SIZE];
2600
2601        pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
2602        md_rdev_clear(rdev);
2603#ifndef MODULE
2604        if (test_bit(AutoDetected, &rdev->flags))
2605                md_autodetect_dev(rdev->bdev->bd_dev);
2606#endif
2607        unlock_rdev(rdev);
2608        kobject_put(&rdev->kobj);
2609}
2610
2611void md_kick_rdev_from_array(struct md_rdev *rdev)
2612{
2613        unbind_rdev_from_array(rdev);
2614        export_rdev(rdev);
2615}
2616EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2617
2618static void export_array(struct mddev *mddev)
2619{
2620        struct md_rdev *rdev;
2621
2622        while (!list_empty(&mddev->disks)) {
2623                rdev = list_first_entry(&mddev->disks, struct md_rdev,
2624                                        same_set);
2625                md_kick_rdev_from_array(rdev);
2626        }
2627        mddev->raid_disks = 0;
2628        mddev->major_version = 0;
2629}
2630
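/*
 * Try to mark the array clean.  Called with mddev->lock held (it may be
 * dropped and re-taken); returns the resulting ->in_sync state.
 */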
2631static bool set_in_sync(struct mddev *mddev)
2632{
2633        lockdep_assert_held(&mddev->lock);
2634        if (!mddev->in_sync) {
2635                mddev->sync_checkers++;
2636                spin_unlock(&mddev->lock);
2637                percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2638                spin_lock(&mddev->lock);
2639                if (!mddev->in_sync &&
2640                    percpu_ref_is_zero(&mddev->writes_pending)) {
2641                        mddev->in_sync = 1;
2642                        /*
2643                         * Ensure ->in_sync is visible before we clear
2644                         * ->sync_checkers.
2645                         */
2646                        smp_mb();
2647                        set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2648                        sysfs_notify_dirent_safe(mddev->sysfs_state);
2649                }
2650                if (--mddev->sync_checkers == 0)
2651                        percpu_ref_switch_to_percpu(&mddev->writes_pending);
2652        }
2653        if (mddev->safemode == 1)
2654                mddev->safemode = 0;
2655        return mddev->in_sync;
2656}
2657
2658static void sync_sbs(struct mddev *mddev, int nospares)
2659{
2660        /* Update each superblock (in-memory image), but
2661         * if we are allowed to, skip spares which already
2662         * have the right event counter, or have one earlier
2663         * (which would mean they aren't being marked as dirty
2664         * with the rest of the array)
2665         */
2666        struct md_rdev *rdev;
2667        rdev_for_each(rdev, mddev) {
2668                if (rdev->sb_events == mddev->events ||
2669                    (nospares &&
2670                     rdev->raid_disk < 0 &&
2671                     rdev->sb_events+1 == mddev->events)) {
2672                        /* Don't update this superblock */
2673                        rdev->sb_loaded = 2;
2674                } else {
2675                        sync_super(mddev, rdev);
2676                        rdev->sb_loaded = 1;
2677                }
2678        }
2679}
2680
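/*
 * Decide whether the on-disk superblock is stale: true if any device role
 * has changed (a spare activated or a device failed) or any array-wide
 * parameter differs from what a good member's superblock records.
 */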
2681static bool does_sb_need_changing(struct mddev *mddev)
2682{
2683        struct md_rdev *rdev;
2684        struct mdp_superblock_1 *sb;
2685        int role;
2686
2687        /* Find a good rdev */
2688        rdev_for_each(rdev, mddev)
2689                if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2690                        break;
2691
2692        /* No good device found. */
2693        if (!rdev)
2694                return false;
2695
2696        sb = page_address(rdev->sb_page);
2697        /* Check if a device has become faulty or a spare has become active */
2698        rdev_for_each(rdev, mddev) {
2699                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2700                /* Device activated? */
2701                if (role == 0xffff && rdev->raid_disk >=0 &&
2702                    !test_bit(Faulty, &rdev->flags))
2703                        return true;
2704                /* Device turned faulty? */
2705                if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2706                        return true;
2707        }
2708
2709        /* Check if any mddev parameters have changed */
2710        if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2711            (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2712            (mddev->layout != le32_to_cpu(sb->layout)) ||
2713            (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2714            (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2715                return true;
2716
2717        return false;
2718}
2719
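/*
 * Write the updated metadata (superblocks and bitmap superblock) out to
 * every member device, coordinating with other nodes for clustered
 * arrays; arrays without persistent superblocks only have their change
 * flags cleared.
 */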
2720void md_update_sb(struct mddev *mddev, int force_change)
2721{
2722        struct md_rdev *rdev;
2723        int sync_req;
2724        int nospares = 0;
2725        int any_badblocks_changed = 0;
2726        int ret = -1;
2727
2728        if (mddev->ro) {
2729                if (force_change)
2730                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2731                return;
2732        }
2733
2734repeat:
2735        if (mddev_is_clustered(mddev)) {
2736                if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2737                        force_change = 1;
2738                if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2739                        nospares = 1;
2740                ret = md_cluster_ops->metadata_update_start(mddev);
2741                /* Has someone else updated the sb? */
2742                if (!does_sb_need_changing(mddev)) {
2743                        if (ret == 0)
2744                                md_cluster_ops->metadata_update_cancel(mddev);
2745                        bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2746                                                         BIT(MD_SB_CHANGE_DEVS) |
2747                                                         BIT(MD_SB_CHANGE_CLEAN));
2748                        return;
2749                }
2750        }
2751
2752        /*
2753         * First make sure individual recovery_offsets are correct.
2754         * curr_resync_completed can only be used during recovery;
2755         * during reshape/resync it might use array addresses rather
2756         * than device addresses.
2757         */
2758        rdev_for_each(rdev, mddev) {
2759                if (rdev->raid_disk >= 0 &&
2760                    mddev->delta_disks >= 0 &&
2761                    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2762                    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2763                    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2764                    !test_bit(Journal, &rdev->flags) &&
2765                    !test_bit(In_sync, &rdev->flags) &&
2766                    mddev->curr_resync_completed > rdev->recovery_offset)
2767                                rdev->recovery_offset = mddev->curr_resync_completed;
2768
2769        }
2770        if (!mddev->persistent) {
2771                clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2772                clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2773                if (!mddev->external) {
2774                        clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2775                        rdev_for_each(rdev, mddev) {
2776                                if (rdev->badblocks.changed) {
2777                                        rdev->badblocks.changed = 0;
2778                                        ack_all_badblocks(&rdev->badblocks);
2779                                        md_error(mddev, rdev);
2780                                }
2781                                clear_bit(Blocked, &rdev->flags);
2782                                clear_bit(BlockedBadBlocks, &rdev->flags);
2783                                wake_up(&rdev->blocked_wait);
2784                        }
2785                }
2786                wake_up(&mddev->sb_wait);
2787                return;
2788        }
2789
2790        spin_lock(&mddev->lock);
2791
2792        mddev->utime = ktime_get_real_seconds();
2793
2794        if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2795                force_change = 1;
2796        if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2797                /* just a clean <-> dirty transition, possibly leave spares alone,
2798                 * though if 'events' isn't the right even/odd, we will have to
2799                 * update the spares after all
2800                 */
2801                nospares = 1;
2802        if (force_change)
2803                nospares = 0;
2804        if (mddev->degraded)
2805                /* If the array is degraded, then skipping spares is both
2806                 * dangerous and fairly pointless.
2807                 * Dangerous because a device that was removed from the array
2808                 * might have an event_count that still looks up-to-date,
2809                 * so it can be re-added without a resync.
2810                 * Pointless because if there are any spares to skip,
2811                 * then a recovery will happen and soon that array won't
2812                 * be degraded any more and the spare can go back to sleep then.
2813                 */
2814                nospares = 0;
2815
2816        sync_req = mddev->in_sync;
2817
2818        /* If this is just a dirty<->clean transition, and the array is clean
2819         * and 'events' is odd, we can roll back to the previous clean state */
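        /* Illustrative example (not in the original source): if 'events'
         * went 42 -> 43 when the array was marked dirty and nothing else
         * changed, the clean transition can roll back to 42 rather than
         * advance to 44, so spare superblocks still recording 42 do not
         * need to be rewritten.
         */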
2820        if (nospares
2821            && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2822            && mddev->can_decrease_events
2823            && mddev->events != 1) {
2824                mddev->events--;
2825                mddev->can_decrease_events = 0;
2826        } else {
2827                /* otherwise we have to go forward and ... */
2828                mddev->events ++;
2829                mddev->can_decrease_events = nospares;
2830        }
2831
2832        /*
2833         * This 64-bit counter should never wrap.
2834         * Either we are in around ~1 trillion A.C., assuming
2835         * 1 reboot per second, or we have a bug...
2836         */
2837        WARN_ON(mddev->events == 0);
2838
2839        rdev_for_each(rdev, mddev) {
2840                if (rdev->badblocks.changed)
2841                        any_badblocks_changed++;
2842                if (test_bit(Faulty, &rdev->flags))
2843                        set_bit(FaultRecorded, &rdev->flags);
2844        }
2845
2846        sync_sbs(mddev, nospares);
2847        spin_unlock(&mddev->lock);
2848
2849        pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2850                 mdname(mddev), mddev->in_sync);
2851
2852        if (mddev->queue)
2853                blk_add_trace_msg(mddev->queue, "md md_update_sb");
2854rewrite:
2855        md_bitmap_update_sb(mddev->bitmap);
2856        rdev_for_each(rdev, mddev) {
2857                char b[BDEVNAME_SIZE];
2858
2859                if (rdev->sb_loaded != 1)
2860                        continue; /* no noise on spare devices */
2861
2862                if (!test_bit(Faulty, &rdev->flags)) {
2863                        md_super_write(mddev,rdev,
2864                                       rdev->sb_start, rdev->sb_size,
2865                                       rdev->sb_page);
2866                        pr_debug("md: (write) %s's sb offset: %llu\n",
2867                                 bdevname(rdev->bdev, b),
2868                                 (unsigned long long)rdev->sb_start);
2869                        rdev->sb_events = mddev->events;
2870                        if (rdev->badblocks.size) {
2871                                md_super_write(mddev, rdev,
2872                                               rdev->badblocks.sector,
2873                                               rdev->badblocks.size << 9,
2874                                               rdev->bb_page);
2875                                rdev->badblocks.size = 0;
2876                        }
2877
2878                } else
2879                        pr_debug("md: %s (skipping faulty)\n",
2880                                 bdevname(rdev->bdev, b));
2881
2882                if (mddev->level == LEVEL_MULTIPATH)
2883                        /* only need to write one superblock... */
2884                        break;
2885        }
2886        if (md_super_wait(mddev) < 0)
2887                goto rewrite;
2888        /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2889
2890        if (mddev_is_clustered(mddev) && ret == 0)
2891                md_cluster_ops->metadata_update_finish(mddev);
2892
2893        if (mddev->in_sync != sync_req ||
2894            !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2895                               BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2896                /* have to write it out again */
2897                goto repeat;
2898        wake_up(&mddev->sb_wait);
2899        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2900                sysfs_notify_dirent_safe(mddev->sysfs_completed);
2901
2902        rdev_for_each(rdev, mddev) {
2903                if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2904                        clear_bit(Blocked, &rdev->flags);
2905
2906                if (any_badblocks_changed)
2907                        ack_all_badblocks(&rdev->badblocks);
2908                clear_bit(BlockedBadBlocks, &rdev->flags);
2909                wake_up(&rdev->blocked_wait);
2910        }
2911}
2912EXPORT_SYMBOL(md_update_sb);
2913
2914static int add_bound_rdev(struct md_rdev *rdev)
2915{
2916        struct mddev *mddev = rdev->mddev;
2917        int err = 0;
2918        bool add_journal = test_bit(Journal, &rdev->flags);
2919
2920        if (!mddev->pers->hot_remove_disk || add_journal) {
2921                /* If there is hot_add_disk but no hot_remove_disk
2922                 * then added disks are for geometry changes,
2923                 * and should be added immediately.
2924                 */
2925                super_types[mddev->major_version].
2926                        validate_super(mddev, rdev);
2927                if (add_journal)
2928                        mddev_suspend(mddev);
2929                err = mddev->pers->hot_add_disk(mddev, rdev);
2930                if (add_journal)
2931                        mddev_resume(mddev);
2932                if (err) {
2933                        md_kick_rdev_from_array(rdev);
2934                        return err;
2935                }
2936        }
2937        sysfs_notify_dirent_safe(rdev->sysfs_state);
2938
2939        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2940        if (mddev->degraded)
2941                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2942        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2943        md_new_event(mddev);
2944        md_wakeup_thread(mddev->thread);
2945        return 0;
2946}
2947
2948/* words written to sysfs files may, or may not, be \n terminated.
2949 * We want to accept either case. For this we use cmd_match.
2950 */
2951static int cmd_match(const char *cmd, const char *str)
2952{
2953        /* See if cmd, written into a sysfs file, matches
2954         * str.  They must either be the same, or cmd can
2955         * have a trailing newline
2956         */
2957        while (*cmd && *str && *cmd == *str) {
2958                cmd++;
2959                str++;
2960        }
2961        if (*cmd == '\n')
2962                cmd++;
2963        if (*str || *cmd)
2964                return 0;
2965        return 1;
2966}
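/* Worked example (illustrative): cmd_match("faulty\n", "faulty") and
 * cmd_match("faulty", "faulty") both return 1, while cmd_match("fault",
 * "faulty") and cmd_match("faultier", "faulty") return 0.
 */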
2967
2968struct rdev_sysfs_entry {
2969        struct attribute attr;
2970        ssize_t (*show)(struct md_rdev *, char *);
2971        ssize_t (*store)(struct md_rdev *, const char *, size_t);
2972};
2973
2974static ssize_t
2975state_show(struct md_rdev *rdev, char *page)
2976{
2977        char *sep = ",";
2978        size_t len = 0;
2979        unsigned long flags = READ_ONCE(rdev->flags);
2980
2981        if (test_bit(Faulty, &flags) ||
2982            (!test_bit(ExternalBbl, &flags) &&
2983            rdev->badblocks.unacked_exist))
2984                len += sprintf(page+len, "faulty%s", sep);
2985        if (test_bit(In_sync, &flags))
2986                len += sprintf(page+len, "in_sync%s", sep);
2987        if (test_bit(Journal, &flags))
2988                len += sprintf(page+len, "journal%s", sep);
2989        if (test_bit(WriteMostly, &flags))
2990                len += sprintf(page+len, "write_mostly%s", sep);
2991        if (test_bit(Blocked, &flags) ||
2992            (rdev->badblocks.unacked_exist
2993             && !test_bit(Faulty, &flags)))
2994                len += sprintf(page+len, "blocked%s", sep);
2995        if (!test_bit(Faulty, &flags) &&
2996            !test_bit(Journal, &flags) &&
2997            !test_bit(In_sync, &flags))
2998                len += sprintf(page+len, "spare%s", sep);
2999        if (test_bit(WriteErrorSeen, &flags))
3000                len += sprintf(page+len, "write_error%s", sep);
3001        if (test_bit(WantReplacement, &flags))
3002                len += sprintf(page+len, "want_replacement%s", sep);
3003        if (test_bit(Replacement, &flags))
3004                len += sprintf(page+len, "replacement%s", sep);
3005        if (test_bit(ExternalBbl, &flags))
3006                len += sprintf(page+len, "external_bbl%s", sep);
3007        if (test_bit(FailFast, &flags))
3008                len += sprintf(page+len, "failfast%s", sep);
3009
3010        if (len)
3011                len -= strlen(sep);
3012
3013        return len+sprintf(page+len, "\n");
3014}
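/* Example output (illustrative): a healthy active member reads "in_sync\n",
 * while a blocked write-mostly device with no valid data would read
 * "write_mostly,blocked,spare\n", following the flag order emitted above.
 */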
3015
3016static ssize_t
3017state_store(struct md_rdev *rdev, const char *buf, size_t len)
3018{
3019        /* can write
3020         *  faulty  - simulates an error
3021         *  remove  - disconnects the device
3022         *  writemostly - sets write_mostly
3023         *  -writemostly - clears write_mostly
3024         *  blocked - sets the Blocked flag
3025         *  -blocked - clears the Blocked flag and possibly simulates an error
3026         *  insync - sets In_sync provided the device isn't active
3027         *  -insync - clears In_sync for a device with a slot assigned,
3028         *            so that it gets rebuilt based on bitmap
3029         *  write_error - sets WriteErrorSeen
3030         *  -write_error - clears WriteErrorSeen
3031         *  {,-}failfast - set/clear FailFast
3032         */
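        /* Usage sketch (paths illustrative): these values arrive via the
         * per-device sysfs file, e.g.
         *     echo want_replacement > /sys/block/md0/md/dev-sdb/state
         */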
3033        int err = -EINVAL;
3034        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
3035                md_error(rdev->mddev, rdev);
3036                if (test_bit(Faulty, &rdev->flags))
3037                        err = 0;
3038                else
3039                        err = -EBUSY;
3040        } else if (cmd_match(buf, "remove")) {
3041                if (rdev->mddev->pers) {
3042                        clear_bit(Blocked, &rdev->flags);
3043                        remove_and_add_spares(rdev->mddev, rdev);
3044                }
3045                if (rdev->raid_disk >= 0)
3046                        err = -EBUSY;
3047                else {
3048                        struct mddev *mddev = rdev->mddev;
3049                        err = 0;
3050                        if (mddev_is_clustered(mddev))
3051                                err = md_cluster_ops->remove_disk(mddev, rdev);
3052
3053                        if (err == 0) {
3054                                md_kick_rdev_from_array(rdev);
3055                                if (mddev->pers) {
3056                                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3057                                        md_wakeup_thread(mddev->thread);
3058                                }
3059                                md_new_event(mddev);
3060                        }
3061                }
3062        } else if (cmd_match(buf, "writemostly")) {
3063                set_bit(WriteMostly, &rdev->flags);
3064                mddev_create_serial_pool(rdev->mddev, rdev, false);
3065                err = 0;
3066        } else if (cmd_match(buf, "-writemostly")) {
3067                mddev_destroy_serial_pool(rdev->mddev, rdev, false);
3068                clear_bit(WriteMostly, &rdev->flags);
3069                err = 0;
3070        } else if (cmd_match(buf, "blocked")) {
3071                set_bit(Blocked, &rdev->flags);
3072                err = 0;
3073        } else if (cmd_match(buf, "-blocked")) {
3074                if (!test_bit(Faulty, &rdev->flags) &&
3075                    !test_bit(ExternalBbl, &rdev->flags) &&
3076                    rdev->badblocks.unacked_exist) {
3077                        /* metadata handler doesn't understand badblocks,
3078                         * so we need to fail the device
3079                         */
3080                        md_error(rdev->mddev, rdev);
3081                }
3082                clear_bit(Blocked, &rdev->flags);
3083                clear_bit(BlockedBadBlocks, &rdev->flags);
3084                wake_up(&rdev->blocked_wait);
3085                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3086                md_wakeup_thread(rdev->mddev->thread);
3087
3088                err = 0;
3089        } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
3090                set_bit(In_sync, &rdev->flags);
3091                err = 0;
3092        } else if (cmd_match(buf, "failfast")) {
3093                set_bit(FailFast, &rdev->flags);
3094                err = 0;
3095        } else if (cmd_match(buf, "-failfast")) {
3096                clear_bit(FailFast, &rdev->flags);
3097                err = 0;
3098        } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3099                   !test_bit(Journal, &rdev->flags)) {
3100                if (rdev->mddev->pers == NULL) {
3101                        clear_bit(In_sync, &rdev->flags);
3102                        rdev->saved_raid_disk = rdev->raid_disk;
3103                        rdev->raid_disk = -1;
3104                        err = 0;
3105                }
3106        } else if (cmd_match(buf, "write_error")) {
3107                set_bit(WriteErrorSeen, &rdev->flags);
3108                err = 0;
3109        } else if (cmd_match(buf, "-write_error")) {
3110                clear_bit(WriteErrorSeen, &rdev->flags);
3111                err = 0;
3112        } else if (cmd_match(buf, "want_replacement")) {
3113                /* Any non-spare device that is not a replacement can
3114                 * become want_replacement at any time, but we then need to
3115                 * check if recovery is needed.
3116                 */
3117                if (rdev->raid_disk >= 0 &&
3118                    !test_bit(Journal, &rdev->flags) &&
3119                    !test_bit(Replacement, &rdev->flags))
3120                        set_bit(WantReplacement, &rdev->flags);
3121                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3122                md_wakeup_thread(rdev->mddev->thread);
3123                err = 0;
3124        } else if (cmd_match(buf, "-want_replacement")) {
3125                /* Clearing 'want_replacement' is always allowed.
3126                 * Once replacement starts it is too late though.
3127                 */
3128                err = 0;
3129                clear_bit(WantReplacement, &rdev->flags);
3130        } else if (cmd_match(buf, "replacement")) {
3131                /* Can only set a device as a replacement when array has not
3132                 * yet been started.  Once running, replacement is automatic
3133                 * from spares, or by assigning 'slot'.
3134                 */
3135                if (rdev->mddev->pers)
3136                        err = -EBUSY;
3137                else {
3138                        set_bit(Replacement, &rdev->flags);
3139                        err = 0;
3140                }
3141        } else if (cmd_match(buf, "-replacement")) {
3142                /* Similarly, can only clear Replacement before start */
3143                if (rdev->mddev->pers)
3144                        err = -EBUSY;
3145                else {
3146                        clear_bit(Replacement, &rdev->flags);
3147                        err = 0;
3148                }
3149        } else if (cmd_match(buf, "re-add")) {
3150                if (!rdev->mddev->pers)
3151                        err = -EINVAL;
3152                else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
3153                                rdev->saved_raid_disk >= 0) {
3154                        /* clear_bit is performed _after_ all the devices
3155                         * have their local Faulty bit cleared. If any writes
3156                         * happen in the meantime in the local node, they
3157                         * will land in the local bitmap, which will be synced
3158                         * by this node eventually
3159                         */
3160                        if (!mddev_is_clustered(rdev->mddev) ||
3161                            (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
3162                                clear_bit(Faulty, &rdev->flags);
3163                                err = add_bound_rdev(rdev);
3164                        }
3165                } else
3166                        err = -EBUSY;
3167        } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
3168                set_bit(ExternalBbl, &rdev->flags);
3169                rdev->badblocks.shift = 0;
3170                err = 0;
3171        } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
3172                clear_bit(ExternalBbl, &rdev->flags);
3173                err = 0;
3174        }
3175        if (!err)
3176                sysfs_notify_dirent_safe(rdev->sysfs_state);
3177        return err ? err : len;
3178}
3179static struct rdev_sysfs_entry rdev_state =
3180__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3181
3182static ssize_t
3183errors_show(struct md_rdev *rdev, char *page)
3184{
3185        return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3186}
3187
3188static ssize_t
3189errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3190{
3191        unsigned int n;
3192        int rv;
3193
3194        rv = kstrtouint(buf, 10, &n);
3195        if (rv < 0)
3196                return rv;
3197        atomic_set(&rdev->corrected_errors, n);
3198        return len;
3199}
3200static struct rdev_sysfs_entry rdev_errors =
3201__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3202
3203static ssize_t
3204slot_show(struct md_rdev *rdev, char *page)
3205{
3206        if (test_bit(Journal, &rdev->flags))
3207                return sprintf(page, "journal\n");
3208        else if (rdev->raid_disk < 0)
3209                return sprintf(page, "none\n");
3210        else
3211                return sprintf(page, "%d\n", rdev->raid_disk);
3212}
3213
3214static ssize_t
3215slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3216{
3217        int slot;
3218        int err;
3219
3220        if (test_bit(Journal, &rdev->flags))
3221                return -EBUSY;
3222        if (strncmp(buf, "none", 4)==0)
3223                slot = -1;
3224        else {
3225                err = kstrtouint(buf, 10, (unsigned int *)&slot);
3226                if (err < 0)
3227                        return err;
3228        }
3229        if (rdev->mddev->pers && slot == -1) {
3230                /* Setting 'slot' on an active array requires also
3231                 * updating the 'rd%d' link, and communicating
3232                 * with the personality with ->hot_*_disk.
3233                 * For now we only support removing
3234                 * failed/spare devices.  This normally happens automatically,
3235                 * but not when the metadata is externally managed.
3236                 */
3237                if (rdev->raid_disk == -1)
3238                        return -EEXIST;
3239                /* personality does all needed checks */
3240                if (rdev->mddev->pers->hot_remove_disk == NULL)
3241                        return -EINVAL;
3242                clear_bit(Blocked, &rdev->flags);
3243                remove_and_add_spares(rdev->mddev, rdev);
3244                if (rdev->raid_disk >= 0)
3245                        return -EBUSY;
3246                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3247                md_wakeup_thread(rdev->mddev->thread);
3248        } else if (rdev->mddev->pers) {
3249                /* Activating a spare .. or possibly reactivating
3250                 * if we ever get bitmaps working here.
3251                 */
3252                int err;
3253
3254                if (rdev->raid_disk != -1)
3255                        return -EBUSY;
3256
3257                if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3258                        return -EBUSY;
3259
3260                if (rdev->mddev->pers->hot_add_disk == NULL)
3261                        return -EINVAL;
3262
3263                if (slot >= rdev->mddev->raid_disks &&
3264                    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3265                        return -ENOSPC;
3266
3267                rdev->raid_disk = slot;
3268                if (test_bit(In_sync, &rdev->flags))
3269                        rdev->saved_raid_disk = slot;
3270                else
3271                        rdev->saved_raid_disk = -1;
3272                clear_bit(In_sync, &rdev->flags);
3273                clear_bit(Bitmap_sync, &rdev->flags);
3274                err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
3275                if (err) {
3276                        rdev->raid_disk = -1;
3277                        return err;
3278                } else
3279                        sysfs_notify_dirent_safe(rdev->sysfs_state);
3280                /* failure here is OK */;
3281                sysfs_link_rdev(rdev->mddev, rdev);
3282                /* don't wakeup anyone, leave that to userspace. */
3283        } else {
3284                if (slot >= rdev->mddev->raid_disks &&
3285                    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3286                        return -ENOSPC;
3287                rdev->raid_disk = slot;
3288                /* assume it is working */
3289                clear_bit(Faulty, &rdev->flags);
3290                clear_bit(WriteMostly, &rdev->flags);
3291                set_bit(In_sync, &rdev->flags);
3292                sysfs_notify_dirent_safe(rdev->sysfs_state);
3293        }
3294        return len;
3295}
3296
3297static struct rdev_sysfs_entry rdev_slot =
3298__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
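/* Usage sketch (names illustrative): "echo none > /sys/block/md0/md/dev-sdb/slot"
 * detaches a failed/spare device from a running array, while writing a number
 * such as "2" assigns that slot, hot-adding the device if a personality is active.
 */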
3299
3300static ssize_t
3301offset_show(struct md_rdev *rdev, char *page)
3302{
3303        return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3304}
3305
3306static ssize_t
3307offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3308{
3309        unsigned long long offset;
3310        if (kstrtoull(buf, 10, &offset) < 0)
3311                return -EINVAL;
3312        if (rdev->mddev->pers && rdev->raid_disk >= 0)
3313                return -EBUSY;
3314        if (rdev->sectors && rdev->mddev->external)
3315                /* Must set offset before size, so overlap checks
3316                 * can be sane */
3317                return -EBUSY;
3318        rdev->data_offset = offset;
3319        rdev->new_data_offset = offset;
3320        return len;
3321}
3322
3323static struct rdev_sysfs_entry rdev_offset =
3324__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3325
3326static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3327{
3328        return sprintf(page, "%llu\n",
3329                       (unsigned long long)rdev->new_data_offset);
3330}
3331
3332static ssize_t new_offset_store(struct md_rdev *rdev,
3333                                const char *buf, size_t len)
3334{
3335        unsigned long long new_offset;
3336        struct mddev *mddev = rdev->mddev;
3337
3338        if (kstrtoull(buf, 10, &new_offset) < 0)
3339                return -EINVAL;
3340
3341        if (mddev->sync_thread ||
3342            test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3343                return -EBUSY;
3344        if (new_offset == rdev->data_offset)
3345                /* reset is always permitted */
3346                ;
3347        else if (new_offset > rdev->data_offset) {
3348                /* must not push array size beyond rdev_sectors */
3349                if (new_offset - rdev->data_offset
3350                    + mddev->dev_sectors > rdev->sectors)
3351                                return -E2BIG;
3352        }
3353        /* Metadata worries about other space details. */
3354
3355        /* decreasing the offset is inconsistent with a backwards
3356         * reshape.
3357         */
3358        if (new_offset < rdev->data_offset &&
3359            mddev->reshape_backwards)
3360                return -EINVAL;
3361        /* Increasing offset is inconsistent with forwards
3362         * reshape.  reshape_direction should be set to
3363         * 'backwards' first.
3364         */
3365        if (new_offset > rdev->data_offset &&
3366            !mddev->reshape_backwards)
3367                return -EINVAL;
3368
3369        if (mddev->pers && mddev->persistent &&
3370            !super_types[mddev->major_version]
3371            .allow_new_offset(rdev, new_offset))
3372                return -E2BIG;
3373        rdev->new_data_offset = new_offset;
3374        if (new_offset > rdev->data_offset)
3375                mddev->reshape_backwards = 1;
3376        else if (new_offset < rdev->data_offset)
3377                mddev->reshape_backwards = 0;
3378
3379        return len;
3380}
3381static struct rdev_sysfs_entry rdev_new_offset =
3382__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3383
3384static ssize_t
3385rdev_size_show(struct md_rdev *rdev, char *page)
3386{
3387        return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3388}
3389
3390static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
3391{
3392        /* check if two start/length pairs overlap */
3393        if (s1+l1 <= s2)
3394                return 0;
3395        if (s2+l2 <= s1)
3396                return 0;
3397        return 1;
3398}
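/* For example, overlaps(0, 100, 50, 10) returns 1 (the ranges intersect) while
 * overlaps(0, 100, 100, 10) returns 0 (they only touch end to start).
 */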
3399
3400static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3401{
3402        unsigned long long blocks;
3403        sector_t new;
3404
3405        if (kstrtoull(buf, 10, &blocks) < 0)
3406                return -EINVAL;
3407
3408        if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3409                return -EINVAL; /* sector conversion overflow */
3410
3411        new = blocks * 2;
3412        if (new != blocks * 2)
3413                return -EINVAL; /* unsigned long long to sector_t overflow */
3414
3415        *sectors = new;
3416        return 0;
3417}
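/* Example (illustrative): a sysfs write of "1024" (1K blocks) yields
 * 2048 512-byte sectors; values with the top bit set are rejected because
 * the conversion to sectors would overflow.
 */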
3418
3419static ssize_t
3420rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3421{
3422        struct mddev *my_mddev = rdev->mddev;
3423        sector_t oldsectors = rdev->sectors;
3424        sector_t sectors;
3425
3426        if (test_bit(Journal, &rdev->flags))
3427                return -EBUSY;
3428        if (strict_blocks_to_sectors(buf, &sectors) < 0)
3429                return -EINVAL;
3430        if (rdev->data_offset != rdev->new_data_offset)
3431                return -EINVAL; /* too confusing */
3432        if (my_mddev->pers && rdev->raid_disk >= 0) {
3433                if (my_mddev->persistent) {
3434                        sectors = super_types[my_mddev->major_version].
3435                                rdev_size_change(rdev, sectors);
3436                        if (!sectors)
3437                                return -EBUSY;
3438                } else if (!sectors)
3439                        sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3440                                rdev->data_offset;
3441                if (!my_mddev->pers->resize)
3442                        /* Cannot change size for RAID0 or Linear etc */
3443                        return -EINVAL;
3444        }
3445        if (sectors < my_mddev->dev_sectors)
3446                return -EINVAL; /* component must fit device */
3447
3448        rdev->sectors = sectors;
3449        if (sectors > oldsectors && my_mddev->external) {
3450                /* Need to check that all other rdevs with the same
3451                 * ->bdev do not overlap.  'rcu' is sufficient to walk
3452                 * the rdev lists safely.
3453                 * This check does not provide a hard guarantee, it
3454                 * just helps avoid dangerous mistakes.
3455                 */
3456                struct mddev *mddev;
3457                int overlap = 0;
3458                struct list_head *tmp;
3459
3460                rcu_read_lock();
3461                for_each_mddev(mddev, tmp) {
3462                        struct md_rdev *rdev2;
3463
3464                        rdev_for_each(rdev2, mddev)
3465                                if (rdev->bdev == rdev2->bdev &&
3466                                    rdev != rdev2 &&
3467                                    overlaps(rdev->data_offset, rdev->sectors,
3468                                             rdev2->data_offset,
3469                                             rdev2->sectors)) {
3470                                        overlap = 1;
3471                                        break;
3472                                }
3473                        if (overlap) {
3474                                mddev_put(mddev);
3475                                break;
3476                        }
3477                }
3478                rcu_read_unlock();
3479                if (overlap) {
3480                        /* Someone else could have slipped in a size
3481                         * change here, but doing so is just silly.
3482                         * We put oldsectors back because we *know* it is
3483                         * safe, and trust userspace not to race with
3484                         * itself
3485                         */
3486                        rdev->sectors = oldsectors;
3487                        return -EBUSY;
3488                }
3489        }
3490        return len;
3491}
3492
3493static struct rdev_sysfs_entry rdev_size =
3494__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3495
3496static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3497{
3498        unsigned long long recovery_start = rdev->recovery_offset;
3499
3500        if (test_bit(In_sync, &rdev->flags) ||
3501            recovery_start == MaxSector)
3502                return sprintf(page, "none\n");
3503
3504        return sprintf(page, "%llu\n", recovery_start);
3505}
3506
3507static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3508{
3509        unsigned long long recovery_start;
3510
3511        if (cmd_match(buf, "none"))
3512                recovery_start = MaxSector;
3513        else if (kstrtoull(buf, 10, &recovery_start))
3514                return -EINVAL;
3515
3516        if (rdev->mddev->pers &&
3517            rdev->raid_disk >= 0)
3518                return -EBUSY;
3519
3520        rdev->recovery_offset = recovery_start;
3521        if (recovery_start == MaxSector)
3522                set_bit(In_sync, &rdev->flags);
3523        else
3524                clear_bit(In_sync, &rdev->flags);
3525        return len;
3526}
3527
3528static struct rdev_sysfs_entry rdev_recovery_start =
3529__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3530
3531/* sysfs access to bad-blocks list.
3532 * We present two files.
3533 * 'bad_blocks' lists sector numbers and lengths of ranges that
3534 *    are recorded as bad.  The list is truncated to fit within
3535 *    the one-page limit of sysfs.
3536 *    Writing "sector length" to this file adds an acknowledged
3537 *    bad block range.
3538 * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
3539 *    been acknowledged.  Writing to this file adds bad blocks
3540 *    without acknowledging them.  This is largely for testing.
3541 */
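/* Usage sketch (paths illustrative): writing "2048 16" to
 * /sys/block/md0/md/dev-sdb/bad_blocks records an acknowledged 16-sector
 * bad range starting at sector 2048; the same write to
 * unacknowledged_bad_blocks records it without acknowledging it.
 */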
3542static ssize_t bb_show(struct md_rdev *rdev, char *page)
3543{
3544        return badblocks_show(&rdev->badblocks, page, 0);
3545}
3546static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3547{
3548        int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3549        /* Maybe that ack was all we needed */
3550        if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3551                wake_up(&rdev->blocked_wait);
3552        return rv;
3553}
3554static struct rdev_sysfs_entry rdev_bad_blocks =
3555__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3556
3557static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3558{
3559        return badblocks_show(&rdev->badblocks, page, 1);
3560}
3561static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3562{
3563        return badblocks_store(&rdev->badblocks, page, len, 1);
3564}
3565static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3566__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3567
3568static ssize_t
3569ppl_sector_show(struct md_rdev *rdev, char *page)
3570{
3571        return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3572}
3573
3574static ssize_t
3575ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3576{
3577        unsigned long long sector;
3578
3579        if (kstrtoull(buf, 10, &sector) < 0)
3580                return -EINVAL;
3581        if (sector != (sector_t)sector)
3582                return -EINVAL;
3583
3584        if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3585            rdev->raid_disk >= 0)
3586                return -EBUSY;
3587
3588        if (rdev->mddev->persistent) {
3589                if (rdev->mddev->major_version == 0)
3590                        return -EINVAL;
3591                if ((sector > rdev->sb_start &&
3592                     sector - rdev->sb_start > S16_MAX) ||
3593                    (sector < rdev->sb_start &&
3594                     rdev->sb_start - sector > -S16_MIN))
3595                        return -EINVAL;
3596                rdev->ppl.offset = sector - rdev->sb_start;
3597        } else if (!rdev->mddev->external) {
3598                return -EBUSY;
3599        }
3600        rdev->ppl.sector = sector;
3601        return len;
3602}
3603
3604static struct rdev_sysfs_entry rdev_ppl_sector =
3605__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3606
3607static ssize_t
3608ppl_size_show(struct md_rdev *rdev, char *page)
3609{
3610        return sprintf(page, "%u\n", rdev->ppl.size);
3611}
3612
3613static ssize_t
3614ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3615{
3616        unsigned int size;
3617
3618        if (kstrtouint(buf, 10, &size) < 0)
3619                return -EINVAL;
3620
3621        if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3622            rdev->raid_disk >= 0)
3623                return -EBUSY;
3624
3625        if (rdev->mddev->persistent) {
3626                if (rdev->mddev->major_version == 0)
3627                        return -EINVAL;
3628                if (size > U16_MAX)
3629                        return -EINVAL;
3630        } else if (!rdev->mddev->external) {
3631                return -EBUSY;
3632        }
3633        rdev->ppl.size = size;
3634        return len;
3635}
3636
3637static struct rdev_sysfs_entry rdev_ppl_size =
3638__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3639
3640static struct attribute *rdev_default_attrs[] = {
3641        &rdev_state.attr,
3642        &rdev_errors.attr,
3643        &rdev_slot.attr,
3644        &rdev_offset.attr,
3645        &rdev_new_offset.attr,
3646        &rdev_size.attr,
3647        &rdev_recovery_start.attr,
3648        &rdev_bad_blocks.attr,
3649        &rdev_unack_bad_blocks.attr,
3650        &rdev_ppl_sector.attr,
3651        &rdev_ppl_size.attr,
3652        NULL,
3653};
3654static ssize_t
3655rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3656{
3657        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3658        struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3659
3660        if (!entry->show)
3661                return -EIO;
3662        if (!rdev->mddev)
3663                return -ENODEV;
3664        return entry->show(rdev, page);
3665}
3666
3667static ssize_t
3668rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3669              const char *page, size_t length)
3670{
3671        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3672        struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3673        ssize_t rv;
3674        struct mddev *mddev = rdev->mddev;
3675
3676        if (!entry->store)
3677                return -EIO;
3678        if (!capable(CAP_SYS_ADMIN))
3679                return -EACCES;
3680        rv = mddev ? mddev_lock(mddev) : -ENODEV;
3681        if (!rv) {
3682                if (rdev->mddev == NULL)
3683                        rv = -ENODEV;
3684                else
3685                        rv = entry->store(rdev, page, length);
3686                mddev_unlock(mddev);
3687        }
3688        return rv;
3689}
3690
3691static void rdev_free(struct kobject *ko)
3692{
3693        struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3694        kfree(rdev);
3695}
3696static const struct sysfs_ops rdev_sysfs_ops = {
3697        .show           = rdev_attr_show,
3698        .store          = rdev_attr_store,
3699};
3700static struct kobj_type rdev_ktype = {
3701        .release        = rdev_free,
3702        .sysfs_ops      = &rdev_sysfs_ops,
3703        .default_attrs  = rdev_default_attrs,
3704};
3705
3706int md_rdev_init(struct md_rdev *rdev)
3707{
3708        rdev->desc_nr = -1;
3709        rdev->saved_raid_disk = -1;
3710        rdev->raid_disk = -1;
3711        rdev->flags = 0;
3712        rdev->data_offset = 0;
3713        rdev->new_data_offset = 0;
3714        rdev->sb_events = 0;
3715        rdev->last_read_error = 0;
3716        rdev->sb_loaded = 0;
3717        rdev->bb_page = NULL;
3718        atomic_set(&rdev->nr_pending, 0);
3719        atomic_set(&rdev->read_errors, 0);
3720        atomic_set(&rdev->corrected_errors, 0);
3721
3722        INIT_LIST_HEAD(&rdev->same_set);
3723        init_waitqueue_head(&rdev->blocked_wait);
3724
3725        /* Add space to store bad block list.
3726         * This reserves the space even on arrays where it cannot
3727         * be used - I wonder if that matters
3728         */
3729        return badblocks_init(&rdev->badblocks, 0);
3730}
3731EXPORT_SYMBOL_GPL(md_rdev_init);
3732/*
3733 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3734 *
3735 * mark the device faulty if:
3736 *
3737 *   - the device is nonexistent (zero size)
3738 *   - the device has no valid superblock
3739 *
3740 * a faulty rdev _never_ has rdev->sb set.
3741 */
3742static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3743{
3744        char b[BDEVNAME_SIZE];
3745        int err;
3746        struct md_rdev *rdev;
3747        sector_t size;
3748
3749        rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3750        if (!rdev)
3751                return ERR_PTR(-ENOMEM);
3752
3753        err = md_rdev_init(rdev);
3754        if (err)
3755                goto abort_free;
3756        err = alloc_disk_sb(rdev);
3757        if (err)
3758                goto abort_free;
3759
3760        err = lock_rdev(rdev, newdev, super_format == -2);
3761        if (err)
3762                goto abort_free;
3763
3764        kobject_init(&rdev->kobj, &rdev_ktype);
3765
3766        size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3767        if (!size) {
3768                pr_warn("md: %s has zero or unknown size, marking faulty!\n",
3769                        bdevname(rdev->bdev,b));
3770                err = -EINVAL;
3771                goto abort_free;
3772        }
3773
3774        if (super_format >= 0) {
3775                err = super_types[super_format].
3776                        load_super(rdev, NULL, super_minor);
3777                if (err == -EINVAL) {
3778                        pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
3779                                bdevname(rdev->bdev,b),
3780                                super_format, super_minor);
3781                        goto abort_free;
3782                }
3783                if (err < 0) {
3784                        pr_warn("md: could not read %s's sb, not importing!\n",
3785                                bdevname(rdev->bdev,b));
3786                        goto abort_free;
3787                }
3788        }
3789
3790        return rdev;
3791
3792abort_free:
3793        if (rdev->bdev)
3794                unlock_rdev(rdev);
3795        md_rdev_clear(rdev);
3796        kfree(rdev);
3797        return ERR_PTR(err);
3798}
3799
3800/*
3801 * Check a full RAID array for plausibility
3802 */
3803
3804static int analyze_sbs(struct mddev *mddev)
3805{
3806        int i;
3807        struct md_rdev *rdev, *freshest, *tmp;
3808        char b[BDEVNAME_SIZE];
3809
3810        freshest = NULL;
3811        rdev_for_each_safe(rdev, tmp, mddev)
3812                switch (super_types[mddev->major_version].
3813                        load_super(rdev, freshest, mddev->minor_version)) {
3814                case 1:
3815                        freshest = rdev;
3816                        break;
3817                case 0:
3818                        break;
3819                default:
3820                        pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
3821                                bdevname(rdev->bdev,b));
3822                        md_kick_rdev_from_array(rdev);
3823                }
3824
3825        /* Cannot find a valid fresh disk */
3826        if (!freshest) {
3827                pr_warn("md: cannot find a valid disk\n");
3828                return -EINVAL;
3829        }
3830
3831        super_types[mddev->major_version].
3832                validate_super(mddev, freshest);
3833
3834        i = 0;
3835        rdev_for_each_safe(rdev, tmp, mddev) {
3836                if (mddev->max_disks &&
3837                    (rdev->desc_nr >= mddev->max_disks ||
3838                     i > mddev->max_disks)) {
3839                        pr_warn("md: %s: %s: only %d devices permitted\n",
3840                                mdname(mddev), bdevname(rdev->bdev, b),
3841                                mddev->max_disks);
3842                        md_kick_rdev_from_array(rdev);
3843                        continue;
3844                }
3845                if (rdev != freshest) {
3846                        if (super_types[mddev->major_version].
3847                            validate_super(mddev, rdev)) {
3848                                pr_warn("md: kicking non-fresh %s from array!\n",
3849                                        bdevname(rdev->bdev,b));
3850                                md_kick_rdev_from_array(rdev);
3851                                continue;
3852                        }
3853                }
3854                if (mddev->level == LEVEL_MULTIPATH) {
3855                        rdev->desc_nr = i++;
3856                        rdev->raid_disk = rdev->desc_nr;
3857                        set_bit(In_sync, &rdev->flags);
3858                } else if (rdev->raid_disk >=
3859                            (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3860                           !test_bit(Journal, &rdev->flags)) {
3861                        rdev->raid_disk = -1;
3862                        clear_bit(In_sync, &rdev->flags);
3863                }
3864        }
3865
3866        return 0;
3867}
3868
3869/* Read a fixed-point number.
3870 * Numbers in sysfs attributes should be in "standard" units where
3871 * possible, so time should be in seconds.
3872 * However we internally use a much smaller unit such as
3873 * milliseconds or jiffies.
3874 * This function takes a decimal number with a possible fractional
3875 * component, and produces an integer which is the result of
3876 * multiplying that number by 10^'scale',
3877 * all without any floating-point arithmetic.
3878 */
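/* Worked example (illustrative): strict_strtoul_scaled("1.25", &res, 3)
 * stores 1250 in res, i.e. 1.25 scaled by 10^3, using only integer math.
 */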
3879int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3880{
3881        unsigned long result = 0;
3882        long decimals = -1;
3883        while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3884                if (*cp == '.')
3885                        decimals = 0;
3886                else if (decimals < scale) {
3887                        unsigned int value;
3888                        value = *cp - '0';
3889                        result = result * 10 + value;
3890                        if (decimals >= 0)
3891                                decimals++;
3892                }
3893                cp++;
3894        }
3895        if (*cp == '\n')
3896                cp++;
3897        if (*cp)
3898                return -EINVAL;
3899        if (decimals < 0)
3900                decimals = 0;
3901        while (decimals < scale) {
3902                result *= 10;
3903                decimals ++;
3904        }
3905        *res = result;
3906        return 0;
3907}
3908
3909static ssize_t
3910safe_delay_show(struct mddev *mddev, char *page)
3911{
3912        int msec = (mddev->safemode_delay*1000)/HZ;
3913        return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3914}
3915static ssize_t
3916safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3917{
3918        unsigned long msec;
3919
3920        if (mddev_is_clustered(mddev)) {
3921                pr_warn("md: Safemode is disabled for clustered mode\n");
3922                return -EINVAL;
3923        }
3924
3925        if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3926                return -EINVAL;
3927        if (msec == 0)
3928                mddev->safemode_delay = 0;
3929        else {
3930                unsigned long old_delay = mddev->safemode_delay;
3931                unsigned long new_delay = (msec*HZ)/1000;
3932
3933                if (new_delay == 0)
3934                        new_delay = 1;
3935                mddev->safemode_delay = new_delay;
3936                if (new_delay < old_delay || old_delay == 0)
3937                        mod_timer(&mddev->safemode_timer, jiffies+1);
3938        }
3939        return len;
3940}
3941static struct md_sysfs_entry md_safe_delay =
3942__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
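/* Usage sketch (device name illustrative):
 *     echo 0.200 > /sys/block/md0/md/safe_mode_delay
 * requests a 200ms safemode delay; strict_strtoul_scaled() parses "0.200"
 * into 200 before it is converted to jiffies above.
 */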
3943
3944static ssize_t
3945level_show(struct mddev *mddev, char *page)
3946{
3947        struct md_personality *p;
3948        int ret;
3949        spin_lock(&mddev->lock);
3950        p = mddev->pers;
3951        if (p)
3952                ret = sprintf(page, "%s\n", p->name);
3953        else if (mddev->clevel[0])
3954                ret = sprintf(page, "%s\n", mddev->clevel);
3955        else if (mddev->level != LEVEL_NONE)
3956                ret = sprintf(page, "%d\n", mddev->level);
3957        else
3958                ret = 0;
3959        spin_unlock(&mddev->lock);
3960        return ret;
3961}
3962
3963static ssize_t
3964level_store(struct mddev *mddev, const char *buf, size_t len)
3965{
3966        char clevel[16];
3967        ssize_t rv;
3968        size_t slen = len;
3969        struct md_personality *pers, *oldpers;
3970        long level;
3971        void *priv, *oldpriv;
3972        struct md_rdev *rdev;
3973
3974        if (slen == 0 || slen >= sizeof(clevel))
3975                return -EINVAL;
3976
3977        rv = mddev_lock(mddev);
3978        if (rv)
3979                return rv;
3980
3981        if (mddev->pers == NULL) {
3982                strncpy(mddev->clevel, buf, slen);
3983                if (mddev->clevel[slen-1] == '\n')
3984                        slen--;
3985                mddev->clevel[slen] = 0;
3986                mddev->level = LEVEL_NONE;
3987                rv = len;
3988                goto out_unlock;
3989        }
3990        rv = -EROFS;
3991        if (mddev->ro)
3992                goto out_unlock;
3993
3994        /* request to change the personality.  Need to ensure:
3995         *  - array is not engaged in resync/recovery/reshape
3996         *  - old personality can be suspended
3997         *  - new personality will access other array.
3998         */
3999
4000        rv = -EBUSY;
4001        if (mddev->sync_thread ||
4002            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4003            mddev->reshape_position != MaxSector ||
4004            mddev->sysfs_active)
4005                goto out_unlock;
4006
4007        rv = -EINVAL;
4008        if (!mddev->pers->quiesce) {
4009                pr_warn("md: %s: %s does not support online personality change\n",
4010                        mdname(mddev), mddev->pers->name);
4011                goto out_unlock;
4012        }
4013
4014        /* Now find the new personality */
4015        strncpy(clevel, buf, slen);
4016        if (clevel[slen-1] == '\n')
4017                slen--;
4018        clevel[slen] = 0;
4019        if (kstrtol(clevel, 10, &level))
4020                level = LEVEL_NONE;
4021
4022        if (request_module("md-%s", clevel) != 0)
4023                request_module("md-level-%s", clevel);
4024        spin_lock(&pers_lock);
4025        pers = find_pers(level, clevel);
4026        if (!pers || !try_module_get(pers->owner)) {
4027                spin_unlock(&pers_lock);
4028                pr_warn("md: personality %s not loaded\n", clevel);
4029                rv = -EINVAL;
4030                goto out_unlock;
4031        }
4032        spin_unlock(&pers_lock);
4033
4034        if (pers == mddev->pers) {
4035                /* Nothing to do! */
4036                module_put(pers->owner);
4037                rv = len;
4038                goto out_unlock;
4039        }
4040        if (!pers->takeover) {
4041                module_put(pers->owner);
4042                pr_warn("md: %s: %s does not support personality takeover\n",
4043                        mdname(mddev), clevel);
4044                rv = -EINVAL;
4045                goto out_unlock;
4046        }
4047
4048        rdev_for_each(rdev, mddev)
4049                rdev->new_raid_disk = rdev->raid_disk;
4050
4051        /* ->takeover must set new_* and/or delta_disks
4052         * if it succeeds, and may set them when it fails.
4053         */
4054        priv = pers->takeover(mddev);
4055        if (IS_ERR(priv)) {
4056                mddev->new_level = mddev->level;
4057                mddev->new_layout = mddev->layout;
4058                mddev->new_chunk_sectors = mddev->chunk_sectors;
4059                mddev->raid_disks -= mddev->delta_disks;
4060                mddev->delta_disks = 0;
4061                mddev->reshape_backwards = 0;
4062                module_put(pers->owner);
4063                pr_warn("md: %s: %s would not accept array\n",
4064                        mdname(mddev), clevel);
4065                rv = PTR_ERR(priv);
4066                goto out_unlock;
4067        }
4068
4069        /* Looks like we have a winner */
4070        mddev_suspend(mddev);
4071        mddev_detach(mddev);
4072
4073        spin_lock(&mddev->lock);
4074        oldpers = mddev->pers;
4075        oldpriv = mddev->private;
4076        mddev->pers = pers;
4077        mddev->private = priv;
4078        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4079        mddev->level = mddev->new_level;
4080        mddev->layout = mddev->new_layout;
4081        mddev->chunk_sectors = mddev->new_chunk_sectors;
4082        mddev->delta_disks = 0;
4083        mddev->reshape_backwards = 0;
4084        mddev->degraded = 0;
4085        spin_unlock(&mddev->lock);
4086
4087        if (oldpers->sync_request == NULL &&
4088            mddev->external) {
4089                /* We are converting from a no-redundancy array
4090                 * to a redundancy array and metadata is managed
4091                 * externally so we need to be sure that writes
4092                 * won't block due to a need to transition
4093                 *      clean->dirty
4094                 * until external management is started.
4095                 */
4096                mddev->in_sync = 0;
4097                mddev->safemode_delay = 0;
4098                mddev->safemode = 0;
4099        }
4100
4101        oldpers->free(mddev, oldpriv);
4102
4103        if (oldpers->sync_request == NULL &&
4104            pers->sync_request != NULL) {
4105                /* need to add the md_redundancy_group */
4106                if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4107                        pr_warn("md: cannot register extra attributes for %s\n",
4108                                mdname(mddev));
4109                mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4110                mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
4111                mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
4112        }
4113        if (oldpers->sync_request != NULL &&
4114            pers->sync_request == NULL) {
4115                /* need to remove the md_redundancy_group */
4116                if (mddev->to_remove == NULL)
4117                        mddev->to_remove = &md_redundancy_group;
4118        }
4119
4120        module_put(oldpers->owner);
4121
4122        rdev_for_each(rdev, mddev) {
4123                if (rdev->raid_disk < 0)
4124                        continue;
4125                if (rdev->new_raid_disk >= mddev->raid_disks)
4126                        rdev->new_raid_disk = -1;
4127                if (rdev->new_raid_disk == rdev->raid_disk)
4128                        continue;
4129                sysfs_unlink_rdev(mddev, rdev);
4130        }
4131        rdev_for_each(rdev, mddev) {
4132                if (rdev->raid_disk < 0)
4133                        continue;
4134                if (rdev->new_raid_disk == rdev->raid_disk)
4135                        continue;
4136                rdev->raid_disk = rdev->new_raid_disk;
4137                if (rdev->raid_disk < 0)
4138                        clear_bit(In_sync, &rdev->flags);
4139                else {
4140                        if (sysfs_link_rdev(mddev, rdev))
4141                                pr_warn("md: cannot register rd%d for %s after level change\n",
4142                                        rdev->raid_disk, mdname(mddev));
4143                }
4144        }
4145
4146        if (pers->sync_request == NULL) {
4147                /* this is now an array without redundancy, so
4148                 * it must always be in_sync
4149                 */
4150                mddev->in_sync = 1;
4151                del_timer_sync(&mddev->safemode_timer);
4152        }
4153        blk_set_stacking_limits(&mddev->queue->limits);
4154        pers->run(mddev);
4155        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4156        mddev_resume(mddev);
4157        if (!mddev->thread)
4158                md_update_sb(mddev, 1);
4159        sysfs_notify_dirent_safe(mddev->sysfs_level);
4160        md_new_event(mddev);
4161        rv = len;
4162out_unlock:
4163        mddev_unlock(mddev);
4164        return rv;
4165}
4166
4167static struct md_sysfs_entry md_level =
4168__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
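/* Usage sketch (illustrative): a takeover such as
 *     echo raid5 > /sys/block/md0/md/level
 * is what tools like mdadm ultimately perform for a level change,
 * provided the target personality's ->takeover() accepts the array.
 */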
4169
4170static ssize_t
4171layout_show(struct mddev *mddev, char *page)
4172{
4173        /* just a number, not meaningful for all levels */
4174        if (mddev->reshape_position != MaxSector &&
4175            mddev->layout != mddev->new_layout)
4176                return sprintf(page, "%d (%d)\n",
4177                               mddev->new_layout, mddev->layout);
4178        return sprintf(page, "%d\n", mddev->layout);
4179}
4180
4181static ssize_t
4182layout_store(struct mddev *mddev, const char *buf, size_t len)
4183{
4184        unsigned int n;
4185        int err;
4186
4187        err = kstrtouint(buf, 10, &n);
4188        if (err < 0)
4189                return err;
4190        err = mddev_lock(mddev);
4191        if (err)
4192                return err;
4193
4194        if (mddev->pers) {
4195                if (mddev->pers->check_reshape == NULL)
4196                        err = -EBUSY;
4197                else if (mddev->ro)
4198                        err = -EROFS;
4199                else {
4200                        mddev->new_layout = n;
4201                        err = mddev->pers->check_reshape(mddev);
4202                        if (err)
4203                                mddev->new_layout = mddev->layout;
4204                }
4205        } else {
4206                mddev->new_layout = n;
4207                if (mddev->reshape_position == MaxSector)
4208                        mddev->layout = n;
4209        }
4210        mddev_unlock(mddev);
4211        return err ?: len;
4212}
4213static struct md_sysfs_entry md_layout =
4214__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4215
4216static ssize_t
4217raid_disks_show(struct mddev *mddev, char *page)
4218{
4219        if (mddev->raid_disks == 0)
4220                return 0;
4221        if (mddev->reshape_position != MaxSector &&
4222            mddev->delta_disks != 0)
4223                return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4224                               mddev->raid_disks - mddev->delta_disks);
4225        return sprintf(page, "%d\n", mddev->raid_disks);
4226}
4227
4228static int update_raid_disks(struct mddev *mddev, int raid_disks);
4229
4230static ssize_t
4231raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4232{
4233        unsigned int n;
4234        int err;
4235
4236        err = kstrtouint(buf, 10, &n);
4237        if (err < 0)
4238                return err;
4239
4240        err = mddev_lock(mddev);
4241        if (err)
4242                return err;
4243        if (mddev->pers)
4244                err = update_raid_disks(mddev, n);
4245        else if (mddev->reshape_position != MaxSector) {
4246                struct md_rdev *rdev;
4247                int olddisks = mddev->raid_disks - mddev->delta_disks;
4248
4249                err = -EINVAL;
4250                rdev_for_each(rdev, mddev) {
4251                        if (olddisks < n &&
4252                            rdev->data_offset < rdev->new_data_offset)
4253                                goto out_unlock;
4254                        if (olddisks > n &&
4255                            rdev->data_offset > rdev->new_data_offset)
4256                                goto out_unlock;
4257                }
4258                err = 0;
4259                mddev->delta_disks = n - olddisks;
4260                mddev->raid_disks = n;
4261                mddev->reshape_backwards = (mddev->delta_disks < 0);
4262        } else
4263                mddev->raid_disks = n;
4264out_unlock:
4265        mddev_unlock(mddev);
4266        return err ? err : len;
4267}
4268static struct md_sysfs_entry md_raid_disks =
4269__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4270
4271static ssize_t
4272uuid_show(struct mddev *mddev, char *page)
4273{
4274        return sprintf(page, "%pU\n", mddev->uuid);
4275}
4276static struct md_sysfs_entry md_uuid =
4277__ATTR(uuid, S_IRUGO, uuid_show, NULL);
4278
4279static ssize_t
4280chunk_size_show(struct mddev *mddev, char *page)
4281{
4282        if (mddev->reshape_position != MaxSector &&
4283            mddev->chunk_sectors != mddev->new_chunk_sectors)
4284                return sprintf(page, "%d (%d)\n",
4285                               mddev->new_chunk_sectors << 9,
4286                               mddev->chunk_sectors << 9);
4287        return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4288}
4289
4290static ssize_t
4291chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4292{
4293        unsigned long n;
4294        int err;
4295
4296        err = kstrtoul(buf, 10, &n);
4297        if (err < 0)
4298                return err;
4299
4300        err = mddev_lock(mddev);
4301        if (err)
4302                return err;
4303        if (mddev->pers) {
4304                if (mddev->pers->check_reshape == NULL)
4305                        err = -EBUSY;
4306                else if (mddev->ro)
4307                        err = -EROFS;
4308                else {
4309                        mddev->new_chunk_sectors = n >> 9;
4310                        err = mddev->pers->check_reshape(mddev);
4311                        if (err)
4312                                mddev->new_chunk_sectors = mddev->chunk_sectors;
4313                }
4314        } else {
4315                mddev->new_chunk_sectors = n >> 9;
4316                if (mddev->reshape_position == MaxSector)
4317                        mddev->chunk_sectors = n >> 9;
4318        }
4319        mddev_unlock(mddev);
4320        return err ?: len;
4321}
4322static struct md_sysfs_entry md_chunk_size =
4323__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4324
4325static ssize_t
4326resync_start_show(struct mddev *mddev, char *page)
4327{
4328        if (mddev->recovery_cp == MaxSector)
4329                return sprintf(page, "none\n");
4330        return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4331}
4332
4333static ssize_t
4334resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4335{
4336        unsigned long long n;
4337        int err;
4338
4339        if (cmd_match(buf, "none"))
4340                n = MaxSector;
4341        else {
4342                err = kstrtoull(buf, 10, &n);
4343                if (err < 0)
4344                        return err;
4345                if (n != (sector_t)n)
4346                        return -EINVAL;
4347        }
4348
4349        err = mddev_lock(mddev);
4350        if (err)
4351                return err;
4352        if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4353                err = -EBUSY;
4354
4355        if (!err) {
4356                mddev->recovery_cp = n;
4357                if (mddev->pers)
4358                        set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4359        }
4360        mddev_unlock(mddev);
4361        return err ?: len;
4362}
4363static struct md_sysfs_entry md_resync_start =
4364__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4365                resync_start_show, resync_start_store);
4366
4367/*
4368 * The array state can be:
4369 *
4370 * clear
4371 *     No devices, no size, no level
4372 *     Equivalent to STOP_ARRAY ioctl
4373 * inactive
4374 *     May have some settings, but array is not active
4375 *        all IO results in error
4376 *     When written, doesn't tear down array, but just stops it
4377 * suspended (not supported yet)
4378 *     All IO requests will block. The array can be reconfigured.
4379 *     Writing this, if accepted, will block until array is quiescent
4380 * readonly
4381 *     no resync can happen.  no superblocks get written.
4382 *     write requests fail
4383 * read-auto
4384 *     like readonly, but behaves like 'clean' on a write request.
4385 *
4386 * clean - no pending writes, but otherwise active.
4387 *     When written to inactive array, starts without resync
4388 *     If a write request arrives then
4389 *       if metadata is known, mark 'dirty' and switch to 'active'.
4390 *       if not known, block and switch to write-pending
4391 *     If written to an active array that has pending writes, then fails.
4392 * active
4393 *     fully active: IO and resync can be happening.
4394 *     When written to inactive array, starts with resync
4395 *
4396 * write-pending
4397 *     clean, but writes are blocked waiting for 'active' to be written.
4398 *
4399 * active-idle
4400 *     like active, but no writes have been seen for a while (100msec).
4401 *
4402 * broken
4403 *     RAID0/LINEAR-only: same as clean, but array is missing a member.
4404 *     It's useful because mounted RAID0/LINEAR arrays aren't stopped
4405 *     when a member is gone, so this state will at least alert the
4406 *     user that something is wrong.
4407 */
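/*
 * Usage sketch (assuming the array is md0; not taken from this file):
 * the states above are exposed through sysfs, so from user space they
 * can be read and, where allowed, driven with e.g.
 *     cat /sys/block/md0/md/array_state
 *     echo readonly > /sys/block/md0/md/array_state
 * Only the states handled by array_state_store() below can be written;
 * "write-pending", "active-idle" and "broken" are report-only.
 */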
4408enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4409                   write_pending, active_idle, broken, bad_word};
4410static char *array_states[] = {
4411        "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4412        "write-pending", "active-idle", "broken", NULL };
4413
4414static int match_word(const char *word, char **list)
4415{
4416        int n;
4417        for (n=0; list[n]; n++)
4418                if (cmd_match(word, list[n]))
4419                        break;
4420        return n;
4421}
4422
4423static ssize_t
4424array_state_show(struct mddev *mddev, char *page)
4425{
4426        enum array_state st = inactive;
4427
4428        if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4429                switch(mddev->ro) {
4430                case 1:
4431                        st = readonly;
4432                        break;
4433                case 2:
4434                        st = read_auto;
4435                        break;
4436                case 0:
4437                        spin_lock(&mddev->lock);
4438                        if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4439                                st = write_pending;
4440                        else if (mddev->in_sync)
4441                                st = clean;
4442                        else if (mddev->safemode)
4443                                st = active_idle;
4444                        else
4445                                st = active;
4446                        spin_unlock(&mddev->lock);
4447                }
4448
4449                if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4450                        st = broken;
4451        } else {
4452                if (list_empty(&mddev->disks) &&
4453                    mddev->raid_disks == 0 &&
4454                    mddev->dev_sectors == 0)
4455                        st = clear;
4456                else
4457                        st = inactive;
4458        }
4459        return sprintf(page, "%s\n", array_states[st]);
4460}
4461
4462static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4463static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4464static int do_md_run(struct mddev *mddev);
4465static int restart_array(struct mddev *mddev);
4466
4467static ssize_t
4468array_state_store(struct mddev *mddev, const char *buf, size_t len)
4469{
4470        int err = 0;
4471        enum array_state st = match_word(buf, array_states);
4472
4473        if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
4474                /* don't take reconfig_mutex when toggling between
4475                 * clean and active
4476                 */
4477                spin_lock(&mddev->lock);
4478                if (st == active) {
4479                        restart_array(mddev);
4480                        clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4481                        md_wakeup_thread(mddev->thread);
4482                        wake_up(&mddev->sb_wait);
4483                } else /* st == clean */ {
4484                        restart_array(mddev);
4485                        if (!set_in_sync(mddev))
4486                                err = -EBUSY;
4487                }
4488                if (!err)
4489                        sysfs_notify_dirent_safe(mddev->sysfs_state);
4490                spin_unlock(&mddev->lock);
4491                return err ?: len;
4492        }
4493        err = mddev_lock(mddev);
4494        if (err)
4495                return err;
4496        err = -EINVAL;
4497        switch(st) {
4498        case bad_word:
4499                break;
4500        case clear:
4501                /* stopping an active array */
4502                err = do_md_stop(mddev, 0, NULL);
4503                break;
4504        case inactive:
4505                /* stopping an active array */
4506                if (mddev->pers)
4507                        err = do_md_stop(mddev, 2, NULL);
4508                else
4509                        err = 0; /* already inactive */
4510                break;
4511        case suspended:
4512                break; /* not supported yet */
4513        case readonly:
4514                if (mddev->pers)
4515                        err = md_set_readonly(mddev, NULL);
4516                else {
4517                        mddev->ro = 1;
4518                        set_disk_ro(mddev->gendisk, 1);
4519                        err = do_md_run(mddev);
4520                }
4521                break;
4522        case read_auto:
4523                if (mddev->pers) {
4524                        if (mddev->ro == 0)
4525                                err = md_set_readonly(mddev, NULL);
4526                        else if (mddev->ro == 1)
4527                                err = restart_array(mddev);
4528                        if (err == 0) {
4529                                mddev->ro = 2;
4530                                set_disk_ro(mddev->gendisk, 0);
4531                        }
4532                } else {
4533                        mddev->ro = 2;
4534                        err = do_md_run(mddev);
4535                }
4536                break;
4537        case clean:
4538                if (mddev->pers) {
4539                        err = restart_array(mddev);
4540                        if (err)
4541                                break;
4542                        spin_lock(&mddev->lock);
4543                        if (!set_in_sync(mddev))
4544                                err = -EBUSY;
4545                        spin_unlock(&mddev->lock);
4546                } else
4547                        err = -EINVAL;
4548                break;
4549        case active:
4550                if (mddev->pers) {
4551                        err = restart_array(mddev);
4552                        if (err)
4553                                break;
4554                        clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4555                        wake_up(&mddev->sb_wait);
4556                        err = 0;
4557                } else {
4558                        mddev->ro = 0;
4559                        set_disk_ro(mddev->gendisk, 0);
4560                        err = do_md_run(mddev);
4561                }
4562                break;
4563        case write_pending:
4564        case active_idle:
4565        case broken:
4566                /* these cannot be set */
4567                break;
4568        }
4569
4570        if (!err) {
4571                if (mddev->hold_active == UNTIL_IOCTL)
4572                        mddev->hold_active = 0;
4573                sysfs_notify_dirent_safe(mddev->sysfs_state);
4574        }
4575        mddev_unlock(mddev);
4576        return err ?: len;
4577}
4578static struct md_sysfs_entry md_array_state =
4579__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4580
4581static ssize_t
4582max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4583        return sprintf(page, "%d\n",
4584                       atomic_read(&mddev->max_corr_read_errors));
4585}
4586
4587static ssize_t
4588max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4589{
4590        unsigned int n;
4591        int rv;
4592
4593        rv = kstrtouint(buf, 10, &n);
4594        if (rv < 0)
4595                return rv;
4596        atomic_set(&mddev->max_corr_read_errors, n);
4597        return len;
4598}
4599
4600static struct md_sysfs_entry max_corr_read_errors =
4601__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4602        max_corrected_read_errors_store);
4603
4604static ssize_t
4605null_show(struct mddev *mddev, char *page)
4606{
4607        return -EINVAL;
4608}
4609
4610/* need to ensure rdev_delayed_delete() has completed */
4611static void flush_rdev_wq(struct mddev *mddev)
4612{
4613        struct md_rdev *rdev;
4614
4615        rcu_read_lock();
4616        rdev_for_each_rcu(rdev, mddev)
4617                if (work_pending(&rdev->del_work)) {
4618                        flush_workqueue(md_rdev_misc_wq);
4619                        break;
4620                }
4621        rcu_read_unlock();
4622}
4623
4624static ssize_t
4625new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4626{
4627        /* buf must be "%d:%d" (optionally followed by a newline), giving major and minor numbers */
4628        /* The new device is added to the array.
4629         * If the array has a persistent superblock, we read the
4630         * superblock to initialise info and check validity.
4631         * Otherwise, only checking done is that in bind_rdev_to_array,
4632         * which mainly checks size.
4633         */
4634        char *e;
4635        int major = simple_strtoul(buf, &e, 10);
4636        int minor;
4637        dev_t dev;
4638        struct md_rdev *rdev;
4639        int err;
4640
4641        if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4642                return -EINVAL;
4643        minor = simple_strtoul(e+1, &e, 10);
4644        if (*e && *e != '\n')
4645                return -EINVAL;
4646        dev = MKDEV(major, minor);
4647        if (major != MAJOR(dev) ||
4648            minor != MINOR(dev))
4649                return -EOVERFLOW;
4650
4651        flush_rdev_wq(mddev);
4652        err = mddev_lock(mddev);
4653        if (err)
4654                return err;
4655        if (mddev->persistent) {
4656                rdev = md_import_device(dev, mddev->major_version,
4657                                        mddev->minor_version);
4658                if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4659                        struct md_rdev *rdev0
4660                                = list_entry(mddev->disks.next,
4661                                             struct md_rdev, same_set);
4662                        err = super_types[mddev->major_version]
4663                                .load_super(rdev, rdev0, mddev->minor_version);
4664                        if (err < 0)
4665                                goto out;
4666                }
4667        } else if (mddev->external)
4668                rdev = md_import_device(dev, -2, -1);
4669        else
4670                rdev = md_import_device(dev, -1, -1);
4671
4672        if (IS_ERR(rdev)) {
4673                mddev_unlock(mddev);
4674                return PTR_ERR(rdev);
4675        }
4676        err = bind_rdev_to_array(rdev, mddev);
4677 out:
4678        if (err)
4679                export_rdev(rdev);
4680        mddev_unlock(mddev);
4681        if (!err)
4682                md_new_event(mddev);
4683        return err ? err : len;
4684}
4685
4686static struct md_sysfs_entry md_new_device =
4687__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
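/*
 * Usage sketch (device names are examples): new_dev takes the
 * "major:minor" of an existing block device, so adding /dev/sdb (8:16)
 * to md0 would look like
 *     echo 8:16 > /sys/block/md0/md/new_dev
 */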
4688
4689static ssize_t
4690bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4691{
4692        char *end;
4693        unsigned long chunk, end_chunk;
4694        int err;
4695
4696        err = mddev_lock(mddev);
4697        if (err)
4698                return err;
4699        if (!mddev->bitmap)
4700                goto out;
4701        /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4702        while (*buf) {
4703                chunk = end_chunk = simple_strtoul(buf, &end, 0);
4704                if (buf == end) break;
4705                if (*end == '-') { /* range */
4706                        buf = end + 1;
4707                        end_chunk = simple_strtoul(buf, &end, 0);
4708                        if (buf == end) break;
4709                }
4710                if (*end && !isspace(*end)) break;
4711                md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4712                buf = skip_spaces(end);
4713        }
4714        md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4715out:
4716        mddev_unlock(mddev);
4717        return len;
4718}
4719
4720static struct md_sysfs_entry md_bitmap =
4721__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
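/*
 * Usage sketch (chunk numbers are examples): bitmap_set_bits accepts
 * space-separated chunk numbers or ranges and marks them dirty, e.g.
 *     echo "0-15 64" > /sys/block/md0/md/bitmap_set_bits
 */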
4722
4723static ssize_t
4724size_show(struct mddev *mddev, char *page)
4725{
4726        return sprintf(page, "%llu\n",
4727                (unsigned long long)mddev->dev_sectors / 2);
4728}
4729
4730static int update_size(struct mddev *mddev, sector_t num_sectors);
4731
4732static ssize_t
4733size_store(struct mddev *mddev, const char *buf, size_t len)
4734{
4735        /* If array is inactive, we can reduce the component size, but
4736         * not increase it (except from 0).
4737         * If array is active, we can try an on-line resize
4738         */
4739        sector_t sectors;
4740        int err = strict_blocks_to_sectors(buf, &sectors);
4741
4742        if (err < 0)
4743                return err;
4744        err = mddev_lock(mddev);
4745        if (err)
4746                return err;
4747        if (mddev->pers) {
4748                err = update_size(mddev, sectors);
4749                if (err == 0)
4750                        md_update_sb(mddev, 1);
4751        } else {
4752                if (mddev->dev_sectors == 0 ||
4753                    mddev->dev_sectors > sectors)
4754                        mddev->dev_sectors = sectors;
4755                else
4756                        err = -ENOSPC;
4757        }
4758        mddev_unlock(mddev);
4759        return err ? err : len;
4760}
4761
4762static struct md_sysfs_entry md_size =
4763__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
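/*
 * Usage sketch (sizes are examples): component_size is reported and set
 * in 1K blocks, so shrinking each component of an inactive md0 to 1 GiB
 * would be
 *     echo 1048576 > /sys/block/md0/md/component_size
 * On an active array the write instead attempts an online resize via
 * update_size().
 */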
4764
4765/* Metadata version.
4766 * This is one of
4767 *   'none' for arrays with no metadata (good luck...)
4768 *   'external' for arrays with externally managed metadata,
4769 * or N.M for internally known formats
4770 */
4771static ssize_t
4772metadata_show(struct mddev *mddev, char *page)
4773{
4774        if (mddev->persistent)
4775                return sprintf(page, "%d.%d\n",
4776                               mddev->major_version, mddev->minor_version);
4777        else if (mddev->external)
4778                return sprintf(page, "external:%s\n", mddev->metadata_type);
4779        else
4780                return sprintf(page, "none\n");
4781}
4782
4783static ssize_t
4784metadata_store(struct mddev *mddev, const char *buf, size_t len)
4785{
4786        int major, minor;
4787        char *e;
4788        int err;
4789        /* Changing the details of 'external' metadata is
4790         * always permitted.  Otherwise there must be
4791         * no devices attached to the array.
4792         */
4793
4794        err = mddev_lock(mddev);
4795        if (err)
4796                return err;
4797        err = -EBUSY;
4798        if (mddev->external && strncmp(buf, "external:", 9) == 0)
4799                ;
4800        else if (!list_empty(&mddev->disks))
4801                goto out_unlock;
4802
4803        err = 0;
4804        if (cmd_match(buf, "none")) {
4805                mddev->persistent = 0;
4806                mddev->external = 0;
4807                mddev->major_version = 0;
4808                mddev->minor_version = 90;
4809                goto out_unlock;
4810        }
4811        if (strncmp(buf, "external:", 9) == 0) {
4812                size_t namelen = len-9;
4813                if (namelen >= sizeof(mddev->metadata_type))
4814                        namelen = sizeof(mddev->metadata_type)-1;
4815                strncpy(mddev->metadata_type, buf+9, namelen);
4816                mddev->metadata_type[namelen] = 0;
4817                if (namelen && mddev->metadata_type[namelen-1] == '\n')
4818                        mddev->metadata_type[--namelen] = 0;
4819                mddev->persistent = 0;
4820                mddev->external = 1;
4821                mddev->major_version = 0;
4822                mddev->minor_version = 90;
4823                goto out_unlock;
4824        }
4825        major = simple_strtoul(buf, &e, 10);
4826        err = -EINVAL;
4827        if (e==buf || *e != '.')
4828                goto out_unlock;
4829        buf = e+1;
4830        minor = simple_strtoul(buf, &e, 10);
4831        if (e==buf || (*e && *e != '\n') )
4832                goto out_unlock;
4833        err = -ENOENT;
4834        if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4835                goto out_unlock;
4836        mddev->major_version = major;
4837        mddev->minor_version = minor;
4838        mddev->persistent = 1;
4839        mddev->external = 0;
4840        err = 0;
4841out_unlock:
4842        mddev_unlock(mddev);
4843        return err ?: len;
4844}
4845
4846static struct md_sysfs_entry md_metadata =
4847__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
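/*
 * Usage sketch (names are examples): metadata_version accepts "none",
 * "external:<name>", or a known "major.minor" format, e.g.
 *     echo 1.2 > /sys/block/md0/md/metadata_version
 *     echo external:imsm > /sys/block/md0/md/metadata_version
 * and, as noted above, is only writable while no devices are attached
 * unless the metadata is external.
 */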
4848
4849static ssize_t
4850action_show(struct mddev *mddev, char *page)
4851{
4852        char *type = "idle";
4853        unsigned long recovery = mddev->recovery;
4854        if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4855                type = "frozen";
4856        else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4857            (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4858                if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4859                        type = "reshape";
4860                else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4861                        if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4862                                type = "resync";
4863                        else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4864                                type = "check";
4865                        else
4866                                type = "repair";
4867                } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4868                        type = "recover";
4869                else if (mddev->reshape_position != MaxSector)
4870                        type = "reshape";
4871        }
4872        return sprintf(page, "%s\n", type);
4873}
4874
4875static ssize_t
4876action_store(struct mddev *mddev, const char *page, size_t len)
4877{
4878        if (!mddev->pers || !mddev->pers->sync_request)
4879                return -EINVAL;
4880
4881
4882        if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4883                if (cmd_match(page, "frozen"))
4884                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4885                else
4886                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4887                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4888                    mddev_lock(mddev) == 0) {
4889                        if (work_pending(&mddev->del_work))
4890                                flush_workqueue(md_misc_wq);
4891                        if (mddev->sync_thread) {
4892                                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4893                                md_reap_sync_thread(mddev);
4894                        }
4895                        mddev_unlock(mddev);
4896                }
4897        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4898                return -EBUSY;
4899        else if (cmd_match(page, "resync"))
4900                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4901        else if (cmd_match(page, "recover")) {
4902                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4903                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4904        } else if (cmd_match(page, "reshape")) {
4905                int err;
4906                if (mddev->pers->start_reshape == NULL)
4907                        return -EINVAL;
4908                err = mddev_lock(mddev);
4909                if (!err) {
4910                        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4911                                err =  -EBUSY;
4912                        else {
4913                                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4914                                err = mddev->pers->start_reshape(mddev);
4915                        }
4916                        mddev_unlock(mddev);
4917                }
4918                if (err)
4919                        return err;
4920                sysfs_notify_dirent_safe(mddev->sysfs_degraded);
4921        } else {
4922                if (cmd_match(page, "check"))
4923                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4924                else if (!cmd_match(page, "repair"))
4925                        return -EINVAL;
4926                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4927                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4928                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4929        }
4930        if (mddev->ro == 2) {
4931                /* A write to sync_action is enough to justify
4932                 * canceling read-auto mode
4933                 */
4934                mddev->ro = 0;
4935                md_wakeup_thread(mddev->sync_thread);
4936        }
4937        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4938        md_wakeup_thread(mddev->thread);
4939        sysfs_notify_dirent_safe(mddev->sysfs_action);
4940        return len;
4941}
4942
4943static struct md_sysfs_entry md_scan_mode =
4944__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
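/*
 * Usage sketch (assuming the array is md0): a scrub or repair is
 * requested by writing to sync_action, e.g.
 *     echo check > /sys/block/md0/md/sync_action
 *     echo idle > /sys/block/md0/md/sync_action
 * where "idle" interrupts whatever resync/check/repair is running.
 * Reading the file returns the current action as formatted by
 * action_show() above.
 */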
4945
4946static ssize_t
4947last_sync_action_show(struct mddev *mddev, char *page)
4948{
4949        return sprintf(page, "%s\n", mddev->last_sync_action);
4950}
4951
4952static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4953
4954static ssize_t
4955mismatch_cnt_show(struct mddev *mddev, char *page)
4956{
4957        return sprintf(page, "%llu\n",
4958                       (unsigned long long)
4959                       atomic64_read(&mddev->resync_mismatches));
4960}
4961
4962static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4963
4964static ssize_t
4965sync_min_show(struct mddev *mddev, char *page)
4966{
4967        return sprintf(page, "%d (%s)\n", speed_min(mddev),
4968                       mddev->sync_speed_min ? "local": "system");
4969}
4970
4971static ssize_t
4972sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4973{
4974        unsigned int min;
4975        int rv;
4976
4977        if (strncmp(buf, "system", 6)==0) {
4978                min = 0;
4979        } else {
4980                rv = kstrtouint(buf, 10, &min);
4981                if (rv < 0)
4982                        return rv;
4983                if (min == 0)
4984                        return -EINVAL;
4985        }
4986        mddev->sync_speed_min = min;
4987        return len;
4988}
4989
4990static struct md_sysfs_entry md_sync_min =
4991__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4992
4993static ssize_t
4994sync_max_show(struct mddev *mddev, char *page)
4995{
4996        return sprintf(page, "%d (%s)\n", speed_max(mddev),
4997                       mddev->sync_speed_max ? "local": "system");
4998}
4999
5000static ssize_t
5001sync_max_store(struct mddev *mddev, const char *buf, size_t len)
5002{
5003        unsigned int max;
5004        int rv;
5005
5006        if (strncmp(buf, "system", 6)==0) {
5007                max = 0;
5008        } else {
5009                rv = kstrtouint(buf, 10, &max);
5010                if (rv < 0)
5011                        return rv;
5012                if (max == 0)
5013                        return -EINVAL;
5014        }
5015        mddev->sync_speed_max = max;
5016        return len;
5017}
5018
5019static struct md_sysfs_entry md_sync_max =
5020__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
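/*
 * Usage sketch (values are examples): the per-array limits take a rate
 * in KiB/s, or "system" to fall back to the system-wide speed limits,
 * e.g.
 *     echo 50000 > /sys/block/md0/md/sync_speed_min
 *     echo system > /sys/block/md0/md/sync_speed_max
 */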
5021
5022static ssize_t
5023degraded_show(struct mddev *mddev, char *page)
5024{
5025        return sprintf(page, "%d\n", mddev->degraded);
5026}
5027static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
5028
5029static ssize_t
5030sync_force_parallel_show(struct mddev *mddev, char *page)
5031{
5032        return sprintf(page, "%d\n", mddev->parallel_resync);
5033}
5034
5035static ssize_t
5036sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
5037{
5038        long n;
5039
5040        if (kstrtol(buf, 10, &n))
5041                return -EINVAL;
5042
5043        if (n != 0 && n != 1)
5044                return -EINVAL;
5045
5046        mddev->parallel_resync = n;
5047
5048        if (mddev->sync_thread)
5049                wake_up(&resync_wait);
5050
5051        return len;
5052}
5053
5054/* force parallel resync, even with shared block devices */
5055static struct md_sysfs_entry md_sync_force_parallel =
5056__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
5057       sync_force_parallel_show, sync_force_parallel_store);
5058
5059static ssize_t
5060sync_speed_show(struct mddev *mddev, char *page)
5061{
5062        unsigned long resync, dt, db;
5063        if (mddev->curr_resync == 0)
5064                return sprintf(page, "none\n");
5065        resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
5066        dt = (jiffies - mddev->resync_mark) / HZ;
5067        if (!dt) dt++;
5068        db = resync - mddev->resync_mark_cnt;
5069        return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
5070}
5071
5072static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
5073
5074static ssize_t
5075sync_completed_show(struct mddev *mddev, char *page)
5076{
5077        unsigned long long max_sectors, resync;
5078
5079        if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5080                return sprintf(page, "none\n");
5081
5082        if (mddev->curr_resync == 1 ||
5083            mddev->curr_resync == 2)
5084                return sprintf(page, "delayed\n");
5085
5086        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
5087            test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5088                max_sectors = mddev->resync_max_sectors;
5089        else
5090                max_sectors = mddev->dev_sectors;
5091
5092        resync = mddev->curr_resync_completed;
5093        return sprintf(page, "%llu / %llu\n", resync, max_sectors);
5094}
5095
5096static struct md_sysfs_entry md_sync_completed =
5097        __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
5098
5099static ssize_t
5100min_sync_show(struct mddev *mddev, char *page)
5101{
5102        return sprintf(page, "%llu\n",
5103                       (unsigned long long)mddev->resync_min);
5104}
5105static ssize_t
5106min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5107{
5108        unsigned long long min;
5109        int err;
5110
5111        if (kstrtoull(buf, 10, &min))
5112                return -EINVAL;
5113
5114        spin_lock(&mddev->lock);
5115        err = -EINVAL;
5116        if (min > mddev->resync_max)
5117                goto out_unlock;
5118
5119        err = -EBUSY;
5120        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5121                goto out_unlock;
5122
5123        /* Round down to multiple of 4K for safety */
5124        mddev->resync_min = round_down(min, 8);
5125        err = 0;
5126
5127out_unlock:
5128        spin_unlock(&mddev->lock);
5129        return err ?: len;
5130}
5131
5132static struct md_sysfs_entry md_min_sync =
5133__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5134
5135static ssize_t
5136max_sync_show(struct mddev *mddev, char *page)
5137{
5138        if (mddev->resync_max == MaxSector)
5139                return sprintf(page, "max\n");
5140        else
5141                return sprintf(page, "%llu\n",
5142                               (unsigned long long)mddev->resync_max);
5143}
5144static ssize_t
5145max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5146{
5147        int err;
5148        spin_lock(&mddev->lock);
5149        if (strncmp(buf, "max", 3) == 0)
5150                mddev->resync_max = MaxSector;
5151        else {
5152                unsigned long long max;
5153                int chunk;
5154
5155                err = -EINVAL;
5156                if (kstrtoull(buf, 10, &max))
5157                        goto out_unlock;
5158                if (max < mddev->resync_min)
5159                        goto out_unlock;
5160
5161                err = -EBUSY;
5162                if (max < mddev->resync_max &&
5163                    mddev->ro == 0 &&
5164                    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5165                        goto out_unlock;
5166
5167                /* Must be a multiple of chunk_size */
5168                chunk = mddev->chunk_sectors;
5169                if (chunk) {
5170                        sector_t temp = max;
5171
5172                        err = -EINVAL;
5173                        if (sector_div(temp, chunk))
5174                                goto out_unlock;
5175                }
5176                mddev->resync_max = max;
5177        }
5178        wake_up(&mddev->recovery_wait);
5179        err = 0;
5180out_unlock:
5181        spin_unlock(&mddev->lock);
5182        return err ?: len;
5183}
5184
5185static struct md_sysfs_entry md_max_sync =
5186__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
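/*
 * Usage sketch (values are examples): sync_min and sync_max bound the
 * region to be resynced, in 512-byte sectors, e.g. limiting a check to
 * roughly the first 1 GiB:
 *     echo 0 > /sys/block/md0/md/sync_min
 *     echo 2097152 > /sys/block/md0/md/sync_max
 * Writing "max" to sync_max removes the upper bound again; as the code
 * above enforces, the value must be a multiple of the chunk size.
 */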
5187
5188static ssize_t
5189suspend_lo_show(struct mddev *mddev, char *page)
5190{
5191        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
5192}
5193
5194static ssize_t
5195suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
5196{
5197        unsigned long long new;
5198        int err;
5199
5200        err = kstrtoull(buf, 10, &new);
5201        if (err < 0)
5202                return err;
5203        if (new != (sector_t)new)
5204                return -EINVAL;
5205
5206        err = mddev_lock(mddev);
5207        if (err)
5208                return err;
5209        err = -EINVAL;
5210        if (mddev->pers == NULL ||
5211            mddev->pers->quiesce == NULL)
5212                goto unlock;
5213        mddev_suspend(mddev);
5214        mddev->suspend_lo = new;
5215        mddev_resume(mddev);
5216
5217        err = 0;
5218unlock:
5219        mddev_unlock(mddev);
5220        return err ?: len;
5221}
5222static struct md_sysfs_entry md_suspend_lo =
5223__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5224
5225static ssize_t
5226suspend_hi_show(struct mddev *mddev, char *page)
5227{
5228        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
5229}
5230
5231static ssize_t
5232suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5233{
5234        unsigned long long new;
5235        int err;
5236
5237        err = kstrtoull(buf, 10, &new);
5238        if (err < 0)
5239                return err;
5240        if (new != (sector_t)new)
5241                return -EINVAL;
5242
5243        err = mddev_lock(mddev);
5244        if (err)
5245                return err;
5246        err = -EINVAL;
5247        if (mddev->pers == NULL)
5248                goto unlock;
5249
5250        mddev_suspend(mddev);
5251        mddev->suspend_hi = new;
5252        mddev_resume(mddev);
5253
5254        err = 0;
5255unlock:
5256        mddev_unlock(mddev);
5257        return err ?: len;
5258}
5259static struct md_sysfs_entry md_suspend_hi =
5260__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5261
5262static ssize_t
5263reshape_position_show(struct mddev *mddev, char *page)
5264{
5265        if (mddev->reshape_position != MaxSector)
5266                return sprintf(page, "%llu\n",
5267                               (unsigned long long)mddev->reshape_position);
5268        strcpy(page, "none\n");
5269        return 5;
5270}
5271
5272static ssize_t
5273reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5274{
5275        struct md_rdev *rdev;
5276        unsigned long long new;
5277        int err;
5278
5279        err = kstrtoull(buf, 10, &new);
5280        if (err < 0)
5281                return err;
5282        if (new != (sector_t)new)
5283                return -EINVAL;
5284        err = mddev_lock(mddev);
5285        if (err)
5286                return err;
5287        err = -EBUSY;
5288        if (mddev->pers)
5289                goto unlock;
5290        mddev->reshape_position = new;
5291        mddev->delta_disks = 0;
5292        mddev->reshape_backwards = 0;
5293        mddev->new_level = mddev->level;
5294        mddev->new_layout = mddev->layout;
5295        mddev->new_chunk_sectors = mddev->chunk_sectors;
5296        rdev_for_each(rdev, mddev)
5297                rdev->new_data_offset = rdev->data_offset;
5298        err = 0;
5299unlock:
5300        mddev_unlock(mddev);
5301        return err ?: len;
5302}
5303
5304static struct md_sysfs_entry md_reshape_position =
5305__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5306       reshape_position_store);
5307
5308static ssize_t
5309reshape_direction_show(struct mddev *mddev, char *page)
5310{
5311        return sprintf(page, "%s\n",
5312                       mddev->reshape_backwards ? "backwards" : "forwards");
5313}
5314
5315static ssize_t
5316reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5317{
5318        int backwards = 0;
5319        int err;
5320
5321        if (cmd_match(buf, "forwards"))
5322                backwards = 0;
5323        else if (cmd_match(buf, "backwards"))
5324                backwards = 1;
5325        else
5326                return -EINVAL;
5327        if (mddev->reshape_backwards == backwards)
5328                return len;
5329
5330        err = mddev_lock(mddev);
5331        if (err)
5332                return err;
5333        /* check if we are allowed to change */
5334        if (mddev->delta_disks)
5335                err = -EBUSY;
5336        else if (mddev->persistent &&
5337            mddev->major_version == 0)
5338                err =  -EINVAL;
5339        else
5340                mddev->reshape_backwards = backwards;
5341        mddev_unlock(mddev);
5342        return err ?: len;
5343}
5344
5345static struct md_sysfs_entry md_reshape_direction =
5346__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5347       reshape_direction_store);
5348
5349static ssize_t
5350array_size_show(struct mddev *mddev, char *page)
5351{
5352        if (mddev->external_size)
5353                return sprintf(page, "%llu\n",
5354                               (unsigned long long)mddev->array_sectors/2);
5355        else
5356                return sprintf(page, "default\n");
5357}
5358
5359static ssize_t
5360array_size_store(struct mddev *mddev, const char *buf, size_t len)
5361{
5362        sector_t sectors;
5363        int err;
5364
5365        err = mddev_lock(mddev);
5366        if (err)
5367                return err;
5368
5369        /* cluster raid doesn't support changing array_sectors */
5370        if (mddev_is_clustered(mddev)) {
5371                mddev_unlock(mddev);
5372                return -EINVAL;
5373        }
5374
5375        if (strncmp(buf, "default", 7) == 0) {
5376                if (mddev->pers)
5377                        sectors = mddev->pers->size(mddev, 0, 0);
5378                else
5379                        sectors = mddev->array_sectors;
5380
5381                mddev->external_size = 0;
5382        } else {
5383                if (strict_blocks_to_sectors(buf, &sectors) < 0)
5384                        err = -EINVAL;
5385                else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5386                        err = -E2BIG;
5387                else
5388                        mddev->external_size = 1;
5389        }
5390
5391        if (!err) {
5392                mddev->array_sectors = sectors;
5393                if (mddev->pers) {
5394                        set_capacity(mddev->gendisk, mddev->array_sectors);
5395                        revalidate_disk_size(mddev->gendisk, true);
5396                }
5397        }
5398        mddev_unlock(mddev);
5399        return err ?: len;
5400}
5401
5402static struct md_sysfs_entry md_array_size =
5403__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5404       array_size_store);
5405
5406static ssize_t
5407consistency_policy_show(struct mddev *mddev, char *page)
5408{
5409        int ret;
5410
5411        if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5412                ret = sprintf(page, "journal\n");
5413        } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5414                ret = sprintf(page, "ppl\n");
5415        } else if (mddev->bitmap) {
5416                ret = sprintf(page, "bitmap\n");
5417        } else if (mddev->pers) {
5418                if (mddev->pers->sync_request)
5419                        ret = sprintf(page, "resync\n");
5420                else
5421                        ret = sprintf(page, "none\n");
5422        } else {
5423                ret = sprintf(page, "unknown\n");
5424        }
5425
5426        return ret;
5427}
5428
5429static ssize_t
5430consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5431{
5432        int err = 0;
5433
5434        if (mddev->pers) {
5435                if (mddev->pers->change_consistency_policy)
5436                        err = mddev->pers->change_consistency_policy(mddev, buf);
5437                else
5438                        err = -EBUSY;
5439        } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5440                set_bit(MD_HAS_PPL, &mddev->flags);
5441        } else {
5442                err = -EINVAL;
5443        }
5444
5445        return err ? err : len;
5446}
5447
5448static struct md_sysfs_entry md_consistency_policy =
5449__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5450       consistency_policy_store);
5451
5452static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5453{
5454        return sprintf(page, "%d\n", mddev->fail_last_dev);
5455}
5456
5457/*
5458 * Setting fail_last_dev to true allows the last device to be forcibly
5459 * removed from RAID1/RAID10.
5460 */
5461static ssize_t
5462fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5463{
5464        int ret;
5465        bool value;
5466
5467        ret = kstrtobool(buf, &value);
5468        if (ret)
5469                return ret;
5470
5471        if (value != mddev->fail_last_dev)
5472                mddev->fail_last_dev = value;
5473
5474        return len;
5475}
5476static struct md_sysfs_entry md_fail_last_dev =
5477__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5478       fail_last_dev_store);
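/*
 * Usage sketch (assuming the array is md0): fail_last_dev is a boolean,
 * so
 *     echo 1 > /sys/block/md0/md/fail_last_dev
 * allows md to mark even the last working RAID1/RAID10 member Faulty.
 */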
5479
5480static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5481{
5482        if (mddev->pers == NULL || (mddev->pers->level != 1))
5483                return sprintf(page, "n/a\n");
5484        else
5485                return sprintf(page, "%d\n", mddev->serialize_policy);
5486}
5487
5488/*
5489 * Setting serialize_policy to true enforces that write IO is not
5490 * reordered for raid1.
5491 */
5492static ssize_t
5493serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5494{
5495        int err;
5496        bool value;
5497
5498        err = kstrtobool(buf, &value);
5499        if (err)
5500                return err;
5501
5502        if (value == mddev->serialize_policy)
5503                return len;
5504
5505        err = mddev_lock(mddev);
5506        if (err)
5507                return err;
5508        if (mddev->pers == NULL || (mddev->pers->level != 1)) {
5509                pr_err("md: serialize_policy is only effective for raid1\n");
5510                err = -EINVAL;
5511                goto unlock;
5512        }
5513
5514        mddev_suspend(mddev);
5515        if (value)
5516                mddev_create_serial_pool(mddev, NULL, true);
5517        else
5518                mddev_destroy_serial_pool(mddev, NULL, true);
5519        mddev->serialize_policy = value;
5520        mddev_resume(mddev);
5521unlock:
5522        mddev_unlock(mddev);
5523        return err ?: len;
5524}
5525
5526static struct md_sysfs_entry md_serialize_policy =
5527__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5528       serialize_policy_store);
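/*
 * Usage sketch (assuming the array is md0): serialize_policy is a
 * boolean and only meaningful for raid1, e.g.
 *     echo 1 > /sys/block/md0/md/serialize_policy
 * creates the serial pools so that write IO to the same region is not
 * reordered.
 */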
5529
5530
5531static struct attribute *md_default_attrs[] = {
5532        &md_level.attr,
5533        &md_layout.attr,
5534        &md_raid_disks.attr,
5535        &md_uuid.attr,
5536        &md_chunk_size.attr,
5537        &md_size.attr,
5538        &md_resync_start.attr,
5539        &md_metadata.attr,
5540        &md_new_device.attr,
5541        &md_safe_delay.attr,
5542        &md_array_state.attr,
5543        &md_reshape_position.attr,
5544        &md_reshape_direction.attr,
5545        &md_array_size.attr,
5546        &max_corr_read_errors.attr,
5547        &md_consistency_policy.attr,
5548        &md_fail_last_dev.attr,
5549        &md_serialize_policy.attr,
5550        NULL,
5551};
5552
5553static struct attribute *md_redundancy_attrs[] = {
5554        &md_scan_mode.attr,
5555        &md_last_scan_mode.attr,
5556        &md_mismatches.attr,
5557        &md_sync_min.attr,
5558        &md_sync_max.attr,
5559        &md_sync_speed.attr,
5560        &md_sync_force_parallel.attr,
5561        &md_sync_completed.attr,
5562        &md_min_sync.attr,
5563        &md_max_sync.attr,
5564        &md_suspend_lo.attr,
5565        &md_suspend_hi.attr,
5566        &md_bitmap.attr,
5567        &md_degraded.attr,
5568        NULL,
5569};
5570static struct attribute_group md_redundancy_group = {
5571        .name = NULL,
5572        .attrs = md_redundancy_attrs,
5573};
5574
5575static ssize_t
5576md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5577{
5578        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5579        struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5580        ssize_t rv;
5581
5582        if (!entry->show)
5583                return -EIO;
5584        spin_lock(&all_mddevs_lock);
5585        if (list_empty(&mddev->all_mddevs)) {
5586                spin_unlock(&all_mddevs_lock);
5587                return -EBUSY;
5588        }
5589        mddev_get(mddev);
5590        spin_unlock(&all_mddevs_lock);
5591
5592        rv = entry->show(mddev, page);
5593        mddev_put(mddev);
5594        return rv;
5595}
5596
5597static ssize_t
5598md_attr_store(struct kobject *kobj, struct attribute *attr,
5599              const char *page, size_t length)
5600{
5601        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5602        struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5603        ssize_t rv;
5604
5605        if (!entry->store)
5606                return -EIO;
5607        if (!capable(CAP_SYS_ADMIN))
5608                return -EACCES;
5609        spin_lock(&all_mddevs_lock);
5610        if (list_empty(&mddev->all_mddevs)) {
5611                spin_unlock(&all_mddevs_lock);
5612                return -EBUSY;
5613        }
5614        mddev_get(mddev);
5615        spin_unlock(&all_mddevs_lock);
5616        rv = entry->store(mddev, page, length);
5617        mddev_put(mddev);
5618        return rv;
5619}
5620
5621static void md_free(struct kobject *ko)
5622{
5623        struct mddev *mddev = container_of(ko, struct mddev, kobj);
5624
5625        if (mddev->sysfs_state)
5626                sysfs_put(mddev->sysfs_state);
5627        if (mddev->sysfs_level)
5628                sysfs_put(mddev->sysfs_level);
5629
5630        if (mddev->gendisk)
5631                del_gendisk(mddev->gendisk);
5632        if (mddev->queue)
5633                blk_cleanup_queue(mddev->queue);
5634        if (mddev->gendisk)
5635                put_disk(mddev->gendisk);
5636        percpu_ref_exit(&mddev->writes_pending);
5637
5638        bioset_exit(&mddev->bio_set);
5639        bioset_exit(&mddev->sync_set);
5640        kfree(mddev);
5641}
5642
5643static const struct sysfs_ops md_sysfs_ops = {
5644        .show   = md_attr_show,
5645        .store  = md_attr_store,
5646};
5647static struct kobj_type md_ktype = {
5648        .release        = md_free,
5649        .sysfs_ops      = &md_sysfs_ops,
5650        .default_attrs  = md_default_attrs,
5651};
5652
5653int mdp_major = 0;
5654
5655static void mddev_delayed_delete(struct work_struct *ws)
5656{
5657        struct mddev *mddev = container_of(ws, struct mddev, del_work);
5658
5659        sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5660        kobject_del(&mddev->kobj);
5661        kobject_put(&mddev->kobj);
5662}
5663
5664static void no_op(struct percpu_ref *r) {}
5665
5666int mddev_init_writes_pending(struct mddev *mddev)
5667{
5668        if (mddev->writes_pending.percpu_count_ptr)
5669                return 0;
5670        if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
5671                return -ENOMEM;
5672        /* We want to start with the refcount at zero */
5673        percpu_ref_put(&mddev->writes_pending);
5674        return 0;
5675}
5676EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
5677
5678static int md_alloc(dev_t dev, char *name)
5679{
5680        /*
5681         * If dev is zero, name is the name of a device to allocate with
5682         * an arbitrary minor number.  It will be "md_???"
5683         * If dev is non-zero it must be a device number with a MAJOR of
5684         * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
5685         * the device is being created by opening a node in /dev.
5686         * If "name" is not NULL, the device is being created by
5687         * writing to /sys/module/md_mod/parameters/new_array.
5688         */
5689        static DEFINE_MUTEX(disks_mutex);
5690        struct mddev *mddev;
5691        struct gendisk *disk;
5692        int partitioned;
5693        int shift;
5694        int unit;
5695        int error;
5696
5697        /*
5698         * Wait for any previous instance of this device to be completely
5699         * removed (mddev_delayed_delete).
5700         */
5701        flush_workqueue(md_misc_wq);
5702
5703        mutex_lock(&disks_mutex);
5704        mddev = mddev_alloc(dev);
5705        if (IS_ERR(mddev)) {
5706                mutex_unlock(&disks_mutex);
5707                return PTR_ERR(mddev);
5708        }
5709
5710        partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5711        shift = partitioned ? MdpMinorShift : 0;
5712        unit = MINOR(mddev->unit) >> shift;
5713
5714        if (name && !dev) {
5715                /* Need to ensure that 'name' is not a duplicate.
5716                 */
5717                struct mddev *mddev2;
5718                spin_lock(&all_mddevs_lock);
5719
5720                list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5721                        if (mddev2->gendisk &&
5722                            strcmp(mddev2->gendisk->disk_name, name) == 0) {
5723                                spin_unlock(&all_mddevs_lock);
5724                                error = -EEXIST;
5725                                goto abort;
5726                        }
5727                spin_unlock(&all_mddevs_lock);
5728        }
5729        if (name && dev)
5730                /*
5731                 * Creating /dev/mdNNN via "new_array", so adjust hold_active.
5732                 */
5733                mddev->hold_active = UNTIL_STOP;
5734
5735        error = -ENOMEM;
5736        mddev->queue = blk_alloc_queue_rh(md_make_request, NUMA_NO_NODE);
5737        if (!mddev->queue)
5738                goto abort;
5739
5740        blk_set_stacking_limits(&mddev->queue->limits);
5741
5742        disk = alloc_disk(1 << shift);
5743        if (!disk) {
5744                blk_cleanup_queue(mddev->queue);
5745                mddev->queue = NULL;
5746                goto abort;
5747        }
5748        disk->major = MAJOR(mddev->unit);
5749        disk->first_minor = unit << shift;
5750        if (name)
5751                strcpy(disk->disk_name, name);
5752        else if (partitioned)
5753                sprintf(disk->disk_name, "md_d%d", unit);
5754        else
5755                sprintf(disk->disk_name, "md%d", unit);
5756        disk->fops = &md_fops;
5757        disk->private_data = mddev;
5758        disk->queue = mddev->queue;
5759        blk_queue_write_cache(mddev->queue, true, true);
5760        /* Allow extended partitions.  This makes the
5761         * 'mdp' device redundant, but we can't really
5762         * remove it now.
5763         */
5764        disk->flags |= GENHD_FL_EXT_DEVT;
5765        mddev->gendisk = disk;
5766        /* As soon as we call add_disk(), another thread could get
5767         * through to md_open, so make sure it doesn't get too far
5768         */
5769        mutex_lock(&mddev->open_mutex);
5770        add_disk(disk);
5771
5772        error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
5773        if (error) {
5774                /* This isn't possible, but as kobject_add is marked
5775                 * __must_check, we must do something with the result
5776                 */
5777                pr_debug("md: cannot register %s/md - name in use\n",
5778                         disk->disk_name);
5779                error = 0;
5780        }
5781        if (mddev->kobj.sd &&
5782            sysfs_create_group(&mddev->kobj, &md_bitmap_group))
5783                pr_debug("pointless warning\n");
5784        mutex_unlock(&mddev->open_mutex);
5785 abort:
5786        mutex_unlock(&disks_mutex);
5787        if (!error && mddev->kobj.sd) {
5788                kobject_uevent(&mddev->kobj, KOBJ_ADD);
5789                mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5790                mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
5791        }
5792        mddev_put(mddev);
5793        return error;
5794}
5795
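/*
 * Block-layer probe hook: called when a /dev/mdX device number is opened
 * before the corresponding gendisk exists.  Arrays are only auto-created
 * when the "create_on_open" module parameter is set, and never for MD_MAJOR
 * minors of 512 or above.
 */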
5796static void md_probe(dev_t dev)
5797{
5798        if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
5799                return;
5800        if (create_on_open)
5801                md_alloc(dev, NULL);
5802}
5803
5804static int add_named_array(const char *val, const struct kernel_param *kp)
5805{
5806        /*
5807         * val must be "md_*" or "mdNNN".
5808         * For "md_*" we allocate an array with a large free minor number, and
5809         * set the name to val.  val must not already be an active name.
5810         * For "mdNNN" we allocate an array with the minor number NNN
5811         * which must not already be in use.
5812         */
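        /*
         * Example usage (array names are illustrative):
         *   echo md_home > /sys/module/md_mod/parameters/new_array
         *   echo md127   > /sys/module/md_mod/parameters/new_array
         */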
5813        int len = strlen(val);
5814        char buf[DISK_NAME_LEN];
5815        unsigned long devnum;
5816
5817        while (len && val[len-1] == '\n')
5818                len--;
5819        if (len >= DISK_NAME_LEN)
5820                return -E2BIG;
5821        strlcpy(buf, val, len+1);
5822        if (strncmp(buf, "md_", 3) == 0)
5823                return md_alloc(0, buf);
5824        if (strncmp(buf, "md", 2) == 0 &&
5825            isdigit(buf[2]) &&
5826            kstrtoul(buf+2, 10, &devnum) == 0 &&
5827            devnum <= MINORMASK)
5828                return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
5829
5830        return -EINVAL;
5831}
5832
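/*
 * Safe-mode timer: fires once writes have been idle for safemode_delay and
 * asks the md thread to mark the array clean again, keeping the window in
 * which a crash would leave the array dirty as small as possible.
 */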
5833static void md_safemode_timeout(struct timer_list *t)
5834{
5835        struct mddev *mddev = from_timer(mddev, t, safemode_timer);
5836
5837        mddev->safemode = 1;
5838        if (mddev->external)
5839                sysfs_notify_dirent_safe(mddev->sysfs_state);
5840
5841        md_wakeup_thread(mddev->thread);
5842}
5843
5844static int start_dirty_degraded;
5845
5846int md_run(struct mddev *mddev)
5847{
5848        int err;
5849        struct md_rdev *rdev;
5850        struct md_personality *pers;
5851
5852        if (list_empty(&mddev->disks))
5853                /* cannot run an array with no devices. */
5854                return -EINVAL;
5855
5856        if (mddev->pers)
5857                return -EBUSY;
5858        /* Cannot run until previous stop completes properly */
5859        if (mddev->sysfs_active)
5860                return -EBUSY;
5861
5862        /*
5863         * Analyze all RAID superblock(s)
5864         */
5865        if (!mddev->raid_disks) {
5866                if (!mddev->persistent)
5867                        return -EINVAL;
5868                err = analyze_sbs(mddev);
5869                if (err)
5870                        return -EINVAL;
5871        }
5872
5873        if (mddev->level != LEVEL_NONE)
5874                request_module("md-level-%d", mddev->level);
5875        else if (mddev->clevel[0])
5876                request_module("md-%s", mddev->clevel);
5877
5878        /*
5879         * Drop all container device buffers; from now on
5880         * the only valid external interface is through the md
5881         * device.
5882         */
5883        mddev->has_superblocks = false;
5884        rdev_for_each(rdev, mddev) {
5885                if (test_bit(Faulty, &rdev->flags))
5886                        continue;
5887                sync_blockdev(rdev->bdev);
5888                invalidate_bdev(rdev->bdev);
5889                if (mddev->ro != 1 && rdev_read_only(rdev)) {
5890                        mddev->ro = 1;
5891                        if (mddev->gendisk)
5892                                set_disk_ro(mddev->gendisk, 1);
5893                }
5894
5895                if (rdev->sb_page)
5896                        mddev->has_superblocks = true;
5897
5898                /* perform some consistency tests on the device.
5899                 * We don't want the data to overlap the metadata.
5900                 * Internal bitmap issues have been handled elsewhere.
5901                 */
5902                if (rdev->meta_bdev) {
5903                        /* Nothing to check */;
5904                } else if (rdev->data_offset < rdev->sb_start) {
5905                        if (mddev->dev_sectors &&
5906                            rdev->data_offset + mddev->dev_sectors
5907                            > rdev->sb_start) {
5908                                pr_warn("md: %s: data overlaps metadata\n",
5909                                        mdname(mddev));
5910                                return -EINVAL;
5911                        }
5912                } else {
5913                        if (rdev->sb_start + rdev->sb_size/512
5914                            > rdev->data_offset) {
5915                                pr_warn("md: %s: metadata overlaps data\n",
5916                                        mdname(mddev));
5917                                return -EINVAL;
5918                        }
5919                }
5920                sysfs_notify_dirent_safe(rdev->sysfs_state);
5921        }
5922
5923        if (!bioset_initialized(&mddev->bio_set)) {
5924                err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5925                if (err)
5926                        return err;
5927        }
5928        if (!bioset_initialized(&mddev->sync_set)) {
5929                err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5930                if (err)
5931                        return err;
5932        }
5933
5934        spin_lock(&pers_lock);
5935        pers = find_pers(mddev->level, mddev->clevel);
5936        if (!pers || !try_module_get(pers->owner)) {
5937                spin_unlock(&pers_lock);
5938                if (mddev->level != LEVEL_NONE)
5939                        pr_warn("md: personality for level %d is not loaded!\n",
5940                                mddev->level);
5941                else
5942                        pr_warn("md: personality for level %s is not loaded!\n",
5943                                mddev->clevel);
5944                err = -EINVAL;
5945                goto abort;
5946        }
5947        spin_unlock(&pers_lock);
5948        if (mddev->level != pers->level) {
5949                mddev->level = pers->level;
5950                mddev->new_level = pers->level;
5951        }
5952        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5953
5954        if (mddev->reshape_position != MaxSector &&
5955            pers->start_reshape == NULL) {
5956                /* This personality cannot handle reshaping... */
5957                module_put(pers->owner);
5958                err = -EINVAL;
5959                goto abort;
5960        }
5961
5962        if (pers->sync_request) {
5963                /* Warn if this is a potentially silly
5964                 * configuration.
5965                 */
5966                char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5967                struct md_rdev *rdev2;
5968                int warned = 0;
5969
5970                rdev_for_each(rdev, mddev)
5971                        rdev_for_each(rdev2, mddev) {
5972                                if (rdev < rdev2 &&
5973                                    rdev->bdev->bd_disk ==
5974                                    rdev2->bdev->bd_disk) {
5975                                        pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
5976                                                mdname(mddev),
5977                                                bdevname(rdev->bdev,b),
5978                                                bdevname(rdev2->bdev,b2));
5979                                        warned = 1;
5980                                }
5981                        }
5982
5983                if (warned)
5984                        pr_warn("True protection against single-disk failure might be compromised.\n");
5985        }
5986
5987        mddev->recovery = 0;
5988        /* may be overridden by the personality */
5989        mddev->resync_max_sectors = mddev->dev_sectors;
5990
5991        mddev->ok_start_degraded = start_dirty_degraded;
5992
5993        if (start_readonly && mddev->ro == 0)
5994                mddev->ro = 2; /* read-only, but switch on first write */
5995
5996        err = pers->run(mddev);
5997        if (err)
5998                pr_warn("md: pers->run() failed ...\n");
5999        else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
6000                WARN_ONCE(!mddev->external_size,
6001                          "%s: default size too small, but 'external_size' not in effect?\n",
6002                          __func__);
6003                pr_warn("md: invalid array_size %llu > default size %llu\n",
6004                        (unsigned long long)mddev->array_sectors / 2,
6005                        (unsigned long long)pers->size(mddev, 0, 0) / 2);
6006                err = -EINVAL;
6007        }
6008        if (err == 0 && pers->sync_request &&
6009            (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
6010                struct bitmap *bitmap;
6011
6012                bitmap = md_bitmap_create(mddev, -1);
6013                if (IS_ERR(bitmap)) {
6014                        err = PTR_ERR(bitmap);
6015                        pr_warn("%s: failed to create bitmap (%d)\n",
6016                                mdname(mddev), err);
6017                } else
6018                        mddev->bitmap = bitmap;
6019
6020        }
6021        if (err)
6022                goto bitmap_abort;
6023
6024        if (mddev->bitmap_info.max_write_behind > 0) {
6025                bool create_pool = false;
6026
6027                rdev_for_each(rdev, mddev) {
6028                        if (test_bit(WriteMostly, &rdev->flags) &&
6029                            rdev_init_serial(rdev))
6030                                create_pool = true;
6031                }
6032                if (create_pool && mddev->serial_info_pool == NULL) {
6033                        mddev->serial_info_pool =
6034                                mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
6035                                                    sizeof(struct serial_info));
6036                        if (!mddev->serial_info_pool) {
6037                                err = -ENOMEM;
6038                                goto bitmap_abort;
6039                        }
6040                }
6041        }
6042
6043        if (mddev->queue) {
6044                bool nonrot = true;
6045
6046                rdev_for_each(rdev, mddev) {
6047                        if (rdev->raid_disk >= 0 &&
6048                            !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
6049                                nonrot = false;
6050                                break;
6051                        }
6052                }
6053                if (mddev->degraded)
6054                        nonrot = false;
6055                if (nonrot)
6056                        blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
6057                else
6058                        blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
6059                mddev->queue->backing_dev_info->congested_data = mddev;
6060                mddev->queue->backing_dev_info->congested_fn = md_congested;
6061        }
6062        if (pers->sync_request) {
6063                if (mddev->kobj.sd &&
6064                    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
6065                        pr_warn("md: cannot register extra attributes for %s\n",
6066                                mdname(mddev));
6067                mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
6068                mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
6069                mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
6070        } else if (mddev->ro == 2) /* auto-readonly not meaningful */
6071                mddev->ro = 0;
6072
6073        atomic_set(&mddev->max_corr_read_errors,
6074                   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6075        mddev->safemode = 0;
6076        if (mddev_is_clustered(mddev))
6077                mddev->safemode_delay = 0;
6078        else
6079                mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6080        mddev->in_sync = 1;
6081        smp_wmb();
6082        spin_lock(&mddev->lock);
6083        mddev->pers = pers;
6084        spin_unlock(&mddev->lock);
6085        rdev_for_each(rdev, mddev)
6086                if (rdev->raid_disk >= 0)
6087                        sysfs_link_rdev(mddev, rdev); /* failure here is OK */
6088
6089        if (mddev->degraded && !mddev->ro)
6090                /* This ensures that recovering status is reported immediately
6091                 * via sysfs - until a lack of spares is confirmed.
6092                 */
6093                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6094        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6095
6096        if (mddev->sb_flags)
6097                md_update_sb(mddev, 0);
6098
6099        md_new_event(mddev);
6100        return 0;
6101
6102bitmap_abort:
6103        mddev_detach(mddev);
6104        if (mddev->private)
6105                pers->free(mddev, mddev->private);
6106        mddev->private = NULL;
6107        module_put(pers->owner);
6108        md_bitmap_destroy(mddev);
6109abort:
6110        bioset_exit(&mddev->bio_set);
6111        bioset_exit(&mddev->sync_set);
6112        return err;
6113}
6114EXPORT_SYMBOL_GPL(md_run);
6115
6116static int do_md_run(struct mddev *mddev)
6117{
6118        int err;
6119
6120        set_bit(MD_NOT_READY, &mddev->flags);
6121        err = md_run(mddev);
6122        if (err)
6123                goto out;
6124        err = md_bitmap_load(mddev);
6125        if (err) {
6126                md_bitmap_destroy(mddev);
6127                goto out;
6128        }
6129
6130        if (mddev_is_clustered(mddev))
6131                md_allow_write(mddev);
6132
6133        /* run start up tasks that require md_thread */
6134        md_start(mddev);
6135
6136        md_wakeup_thread(mddev->thread);
6137        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
6138
6139        set_capacity(mddev->gendisk, mddev->array_sectors);
6140        revalidate_disk_size(mddev->gendisk, true);
6141        clear_bit(MD_NOT_READY, &mddev->flags);
6142        mddev->changed = 1;
6143        kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6144        sysfs_notify_dirent_safe(mddev->sysfs_state);
6145        sysfs_notify_dirent_safe(mddev->sysfs_action);
6146        sysfs_notify_dirent_safe(mddev->sysfs_degraded);
6147out:
6148        clear_bit(MD_NOT_READY, &mddev->flags);
6149        return err;
6150}
6151
6152int md_start(struct mddev *mddev)
6153{
6154        int ret = 0;
6155
6156        if (mddev->pers->start) {
6157                set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6158                md_wakeup_thread(mddev->thread);
6159                ret = mddev->pers->start(mddev);
6160                clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6161                md_wakeup_thread(mddev->sync_thread);
6162        }
6163        return ret;
6164}
6165EXPORT_SYMBOL_GPL(md_start);
6166
6167static int restart_array(struct mddev *mddev)
6168{
6169        struct gendisk *disk = mddev->gendisk;
6170        struct md_rdev *rdev;
6171        bool has_journal = false;
6172        bool has_readonly = false;
6173
6174        /* Complain if it has no devices */
6175        if (list_empty(&mddev->disks))
6176                return -ENXIO;
6177        if (!mddev->pers)
6178                return -EINVAL;
6179        if (!mddev->ro)
6180                return -EBUSY;
6181
6182        rcu_read_lock();
6183        rdev_for_each_rcu(rdev, mddev) {
6184                if (test_bit(Journal, &rdev->flags) &&
6185                    !test_bit(Faulty, &rdev->flags))
6186                        has_journal = true;
6187                if (bdev_read_only(rdev->bdev))
6188                        has_readonly = true;
6189        }
6190        rcu_read_unlock();
6191        if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6192                /* Don't restart rw with journal missing/faulty */
6193                return -EINVAL;
6194        if (has_readonly)
6195                return -EROFS;
6196
6197        mddev->safemode = 0;
6198        mddev->ro = 0;
6199        set_disk_ro(disk, 0);
6200        pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
6201        /* Kick recovery or resync if necessary */
6202        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6203        md_wakeup_thread(mddev->thread);
6204        md_wakeup_thread(mddev->sync_thread);
6205        sysfs_notify_dirent_safe(mddev->sysfs_state);
6206        return 0;
6207}
6208
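/* Reset an mddev to the state of a freshly allocated, unconfigured array. */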
6209static void md_clean(struct mddev *mddev)
6210{
6211        mddev->array_sectors = 0;
6212        mddev->external_size = 0;
6213        mddev->dev_sectors = 0;
6214        mddev->raid_disks = 0;
6215        mddev->recovery_cp = 0;
6216        mddev->resync_min = 0;
6217        mddev->resync_max = MaxSector;
6218        mddev->reshape_position = MaxSector;
6219        mddev->external = 0;
6220        mddev->persistent = 0;
6221        mddev->level = LEVEL_NONE;
6222        mddev->clevel[0] = 0;
6223        mddev->flags = 0;
6224        mddev->sb_flags = 0;
6225        mddev->ro = 0;
6226        mddev->metadata_type[0] = 0;
6227        mddev->chunk_sectors = 0;
6228        mddev->ctime = mddev->utime = 0;
6229        mddev->layout = 0;
6230        mddev->max_disks = 0;
6231        mddev->events = 0;
6232        mddev->can_decrease_events = 0;
6233        mddev->delta_disks = 0;
6234        mddev->reshape_backwards = 0;
6235        mddev->new_level = LEVEL_NONE;
6236        mddev->new_layout = 0;
6237        mddev->new_chunk_sectors = 0;
6238        mddev->curr_resync = 0;
6239        atomic64_set(&mddev->resync_mismatches, 0);
6240        mddev->suspend_lo = mddev->suspend_hi = 0;
6241        mddev->sync_speed_min = mddev->sync_speed_max = 0;
6242        mddev->recovery = 0;
6243        mddev->in_sync = 0;
6244        mddev->changed = 0;
6245        mddev->degraded = 0;
6246        mddev->safemode = 0;
6247        mddev->private = NULL;
6248        mddev->cluster_info = NULL;
6249        mddev->bitmap_info.offset = 0;
6250        mddev->bitmap_info.default_offset = 0;
6251        mddev->bitmap_info.default_space = 0;
6252        mddev->bitmap_info.chunksize = 0;
6253        mddev->bitmap_info.daemon_sleep = 0;
6254        mddev->bitmap_info.max_write_behind = 0;
6255        mddev->bitmap_info.nodes = 0;
6256}
6257
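/*
 * Stop resync/recovery, flush the bitmap and, if the array was writable,
 * write out a clean superblock.  Called with the reconfig mutex held.
 */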
6258static void __md_stop_writes(struct mddev *mddev)
6259{
6260        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6261        if (work_pending(&mddev->del_work))
6262                flush_workqueue(md_misc_wq);
6263        if (mddev->sync_thread) {
6264                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6265                md_reap_sync_thread(mddev);
6266        }
6267
6268        del_timer_sync(&mddev->safemode_timer);
6269
6270        if (mddev->pers && mddev->pers->quiesce) {
6271                mddev->pers->quiesce(mddev, 1);
6272                mddev->pers->quiesce(mddev, 0);
6273        }
6274        md_bitmap_flush(mddev);
6275
6276        if (mddev->ro == 0 &&
6277            ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6278             mddev->sb_flags)) {
6279                /* mark the array as cleanly shut down */
6280                if (!mddev_is_clustered(mddev))
6281                        mddev->in_sync = 1;
6282                md_update_sb(mddev, 1);
6283        }
6284        /* disable policy to guarantee rdevs free resources for serialization */
6285        mddev->serialize_policy = 0;
6286        mddev_destroy_serial_pool(mddev, NULL, true);
6287}
6288
6289void md_stop_writes(struct mddev *mddev)
6290{
6291        mddev_lock_nointr(mddev);
6292        __md_stop_writes(mddev);
6293        mddev_unlock(mddev);
6294}
6295EXPORT_SYMBOL_GPL(md_stop_writes);
6296
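/*
 * Detach the running machinery from the array: wait for behind writes,
 * quiesce the personality, stop the md thread and drain the request queue.
 * The personality itself is released later, in __md_stop().
 */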
6297static void mddev_detach(struct mddev *mddev)
6298{
6299        md_bitmap_wait_behind_writes(mddev);
6300        if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) {
6301                mddev->pers->quiesce(mddev, 1);
6302                mddev->pers->quiesce(mddev, 0);
6303        }
6304        md_unregister_thread(&mddev->thread);
6305        if (mddev->queue)
6306                blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
6307}
6308
6309static void __md_stop(struct mddev *mddev)
6310{
6311        struct md_personality *pers = mddev->pers;
6312        md_bitmap_destroy(mddev);
6313        mddev_detach(mddev);
6314        /* Ensure ->event_work is done */
6315        if (mddev->event_work.func)
6316                flush_workqueue(md_misc_wq);
6317        spin_lock(&mddev->lock);
6318        mddev->pers = NULL;
6319        spin_unlock(&mddev->lock);
6320        pers->free(mddev, mddev->private);
6321        mddev->private = NULL;
6322        if (pers->sync_request && mddev->to_remove == NULL)
6323                mddev->to_remove = &md_redundancy_group;
6324        module_put(pers->owner);
6325        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6326}
6327
6328void md_stop(struct mddev *mddev)
6329{
6330        /* Stop the array and free any attached data structures.
6331         * This is called from dm-raid.
6332         */
6333        __md_stop(mddev);
6334        bioset_exit(&mddev->bio_set);
6335        bioset_exit(&mddev->sync_set);
6336}
6337
6338EXPORT_SYMBOL_GPL(md_stop);
6339
6340static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
6341{
6342        int err = 0;
6343        int did_freeze = 0;
6344
6345        if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6346                did_freeze = 1;
6347                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6348                md_wakeup_thread(mddev->thread);
6349        }
6350        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6351                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6352        if (mddev->sync_thread)
6353                /* Thread might be blocked waiting for metadata update
6354                 * which will now never happen */
6355                wake_up_process(mddev->sync_thread->tsk);
6356
6357        if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6358                return -EBUSY;
6359        mddev_unlock(mddev);
6360        wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
6361                                          &mddev->recovery));
6362        wait_event(mddev->sb_wait,
6363                   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6364        mddev_lock_nointr(mddev);
6365
6366        mutex_lock(&mddev->open_mutex);
6367        if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6368            mddev->sync_thread ||
6369            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6370                pr_warn("md: %s still in use.\n", mdname(mddev));
6371                if (did_freeze) {
6372                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6373                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6374                        md_wakeup_thread(mddev->thread);
6375                }
6376                err = -EBUSY;
6377                goto out;
6378        }
6379        if (mddev->pers) {
6380                __md_stop_writes(mddev);
6381
6382                err = -ENXIO;
6383                if (mddev->ro == 1)
6384                        goto out;
6385                mddev->ro = 1;
6386                set_disk_ro(mddev->gendisk, 1);
6387                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6388                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6389                md_wakeup_thread(mddev->thread);
6390                sysfs_notify_dirent_safe(mddev->sysfs_state);
6391                err = 0;
6392        }
6393out:
6394        mutex_unlock(&mddev->open_mutex);
6395        return err;
6396}
6397
6398/* mode:
6399 *   0 - completely stop and dis-assemble array
6400 *   2 - stop but do not disassemble array
6401 */
6402static int do_md_stop(struct mddev *mddev, int mode,
6403                      struct block_device *bdev)
6404{
6405        struct gendisk *disk = mddev->gendisk;
6406        struct md_rdev *rdev;
6407        int did_freeze = 0;
6408
6409        if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6410                did_freeze = 1;
6411                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6412                md_wakeup_thread(mddev->thread);
6413        }
6414        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6415                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6416        if (mddev->sync_thread)
6417                /* Thread might be blocked waiting for metadata update
6418                 * which will now never happen */
6419                wake_up_process(mddev->sync_thread->tsk);
6420
6421        mddev_unlock(mddev);
6422        wait_event(resync_wait, (mddev->sync_thread == NULL &&
6423                                 !test_bit(MD_RECOVERY_RUNNING,
6424                                           &mddev->recovery)));
6425        mddev_lock_nointr(mddev);
6426
6427        mutex_lock(&mddev->open_mutex);
6428        if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6429            mddev->sysfs_active ||
6430            mddev->sync_thread ||
6431            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6432                pr_warn("md: %s still in use.\n", mdname(mddev));
6433                mutex_unlock(&mddev->open_mutex);
6434                if (did_freeze) {
6435                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6436                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6437                        md_wakeup_thread(mddev->thread);
6438                }
6439                return -EBUSY;
6440        }
6441        if (mddev->pers) {
6442                if (mddev->ro)
6443                        set_disk_ro(disk, 0);
6444
6445                __md_stop_writes(mddev);
6446                __md_stop(mddev);
6447                mddev->queue->backing_dev_info->congested_fn = NULL;
6448
6449                /* tell userspace to handle 'inactive' */
6450                sysfs_notify_dirent_safe(mddev->sysfs_state);
6451
6452                rdev_for_each(rdev, mddev)
6453                        if (rdev->raid_disk >= 0)
6454                                sysfs_unlink_rdev(mddev, rdev);
6455
6456                set_capacity(disk, 0);
6457                mutex_unlock(&mddev->open_mutex);
6458                mddev->changed = 1;
6459                revalidate_disk_size(disk, true);
6460
6461                if (mddev->ro)
6462                        mddev->ro = 0;
6463        } else
6464                mutex_unlock(&mddev->open_mutex);
6465        /*
6466         * Free resources if final stop
6467         */
6468        if (mode == 0) {
6469                pr_info("md: %s stopped.\n", mdname(mddev));
6470
6471                if (mddev->bitmap_info.file) {
6472                        struct file *f = mddev->bitmap_info.file;
6473                        spin_lock(&mddev->lock);
6474                        mddev->bitmap_info.file = NULL;
6475                        spin_unlock(&mddev->lock);
6476                        fput(f);
6477                }
6478                mddev->bitmap_info.offset = 0;
6479
6480                export_array(mddev);
6481
6482                md_clean(mddev);
6483                if (mddev->hold_active == UNTIL_STOP)
6484                        mddev->hold_active = 0;
6485        }
6486        md_new_event(mddev);
6487        sysfs_notify_dirent_safe(mddev->sysfs_state);
6488        return 0;
6489}
6490
6491#ifndef MODULE
6492static void autorun_array(struct mddev *mddev)
6493{
6494        struct md_rdev *rdev;
6495        int err;
6496
6497        if (list_empty(&mddev->disks))
6498                return;
6499
6500        pr_info("md: running: ");
6501
6502        rdev_for_each(rdev, mddev) {
6503                char b[BDEVNAME_SIZE];
6504                pr_cont("<%s>", bdevname(rdev->bdev,b));
6505        }
6506        pr_cont("\n");
6507
6508        err = do_md_run(mddev);
6509        if (err) {
6510                pr_warn("md: do_md_run() returned %d\n", err);
6511                do_md_stop(mddev, 0, NULL);
6512        }
6513}
6514
6515/*
6516 * let's try to run arrays based on all disks that have arrived
6517 * until now. (those are in pending_raid_disks)
6518 *
6519 * the method: pick the first pending disk, collect all disks with
6520 * the same UUID, remove all from the pending list and put them into
6521 * the 'same_array' list. Then order this list based on superblock
6522 * update time (freshest comes first), kick out 'old' disks and
6523 * compare superblocks. If everything's fine then run it.
6524 *
6525 * If "unit" is allocated, then bump its reference count
6526 */
6527static void autorun_devices(int part)
6528{
6529        struct md_rdev *rdev0, *rdev, *tmp;
6530        struct mddev *mddev;
6531        char b[BDEVNAME_SIZE];
6532
6533        pr_info("md: autorun ...\n");
6534        while (!list_empty(&pending_raid_disks)) {
6535                int unit;
6536                dev_t dev;
6537                LIST_HEAD(candidates);
6538                rdev0 = list_entry(pending_raid_disks.next,
6539                                         struct md_rdev, same_set);
6540
6541                pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
6542                INIT_LIST_HEAD(&candidates);
6543                rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6544                        if (super_90_load(rdev, rdev0, 0) >= 0) {
6545                                pr_debug("md:  adding %s ...\n",
6546                                         bdevname(rdev->bdev,b));
6547                                list_move(&rdev->same_set, &candidates);
6548                        }
6549                /*
6550                 * now we have a set of devices, with all of them having
6551                 * mostly sane superblocks. It's time to allocate the
6552                 * mddev.
6553                 */
6554                if (part) {
6555                        dev = MKDEV(mdp_major,
6556                                    rdev0->preferred_minor << MdpMinorShift);
6557                        unit = MINOR(dev) >> MdpMinorShift;
6558                } else {
6559                        dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6560                        unit = MINOR(dev);
6561                }
6562                if (rdev0->preferred_minor != unit) {
6563                        pr_warn("md: unit number in %s is bad: %d\n",
6564                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
6565                        break;
6566                }
6567
6568                md_probe(dev);
6569                mddev = mddev_find(dev);
6570                if (!mddev)
6571                        break;
6572
6573                if (mddev_lock(mddev))
6574                        pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6575                else if (mddev->raid_disks || mddev->major_version
6576                         || !list_empty(&mddev->disks)) {
6577                        pr_warn("md: %s already running, cannot run %s\n",
6578                                mdname(mddev), bdevname(rdev0->bdev,b));
6579                        mddev_unlock(mddev);
6580                } else {
6581                        pr_debug("md: created %s\n", mdname(mddev));
6582                        mddev->persistent = 1;
6583                        rdev_for_each_list(rdev, tmp, &candidates) {
6584                                list_del_init(&rdev->same_set);
6585                                if (bind_rdev_to_array(rdev, mddev))
6586                                        export_rdev(rdev);
6587                        }
6588                        autorun_array(mddev);
6589                        mddev_unlock(mddev);
6590                }
6591                /* on success, candidates will be empty, on error
6592                 * it won't...
6593                 */
6594                rdev_for_each_list(rdev, tmp, &candidates) {
6595                        list_del_init(&rdev->same_set);
6596                        export_rdev(rdev);
6597                }
6598                mddev_put(mddev);
6599        }
6600        pr_info("md: ... autorun DONE.\n");
6601}
6602#endif /* !MODULE */
6603
6604static int get_version(void __user *arg)
6605{
6606        mdu_version_t ver;
6607
6608        ver.major = MD_MAJOR_VERSION;
6609        ver.minor = MD_MINOR_VERSION;
6610        ver.patchlevel = MD_PATCHLEVEL_VERSION;
6611
6612        if (copy_to_user(arg, &ver, sizeof(ver)))
6613                return -EFAULT;
6614
6615        return 0;
6616}
6617
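/*
 * GET_ARRAY_INFO ioctl: fill in a mdu_array_info_t.  The component size is
 * reported in 1K blocks (sectors / 2) and set to -1 if it does not fit in
 * 32 bits.
 */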
6618static int get_array_info(struct mddev *mddev, void __user *arg)
6619{
6620        mdu_array_info_t info;
6621        int nr, working, insync, failed, spare;
6622        struct md_rdev *rdev;
6623
6624        nr = working = insync = failed = spare = 0;
6625        rcu_read_lock();
6626        rdev_for_each_rcu(rdev, mddev) {
6627                nr++;
6628                if (test_bit(Faulty, &rdev->flags))
6629                        failed++;
6630                else {
6631                        working++;
6632                        if (test_bit(In_sync, &rdev->flags))
6633                                insync++;
6634                        else if (test_bit(Journal, &rdev->flags))
6635                                /* TODO: add journal count to md_u.h */
6636                                ;
6637                        else
6638                                spare++;
6639                }
6640        }
6641        rcu_read_unlock();
6642
6643        info.major_version = mddev->major_version;
6644        info.minor_version = mddev->minor_version;
6645        info.patch_version = MD_PATCHLEVEL_VERSION;
6646        info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6647        info.level         = mddev->level;
6648        info.size          = mddev->dev_sectors / 2;
6649        if (info.size != mddev->dev_sectors / 2) /* overflow */
6650                info.size = -1;
6651        info.nr_disks      = nr;
6652        info.raid_disks    = mddev->raid_disks;
6653        info.md_minor      = mddev->md_minor;
6654        info.not_persistent= !mddev->persistent;
6655
6656        info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6657        info.state         = 0;
6658        if (mddev->in_sync)
6659                info.state = (1<<MD_SB_CLEAN);
6660        if (mddev->bitmap && mddev->bitmap_info.offset)
6661                info.state |= (1<<MD_SB_BITMAP_PRESENT);
6662        if (mddev_is_clustered(mddev))
6663                info.state |= (1<<MD_SB_CLUSTERED);
6664        info.active_disks  = insync;
6665        info.working_disks = working;
6666        info.failed_disks  = failed;
6667        info.spare_disks   = spare;
6668
6669        info.layout        = mddev->layout;
6670        info.chunk_size    = mddev->chunk_sectors << 9;
6671
6672        if (copy_to_user(arg, &info, sizeof(info)))
6673                return -EFAULT;
6674
6675        return 0;
6676}
6677
6678static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6679{
6680        mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6681        char *ptr;
6682        int err;
6683
6684        file = kzalloc(sizeof(*file), GFP_NOIO);
6685        if (!file)
6686                return -ENOMEM;
6687
6688        err = 0;
6689        spin_lock(&mddev->lock);
6690        /* bitmap enabled */
6691        if (mddev->bitmap_info.file) {
6692                ptr = file_path(mddev->bitmap_info.file, file->pathname,
6693                                sizeof(file->pathname));
6694                if (IS_ERR(ptr))
6695                        err = PTR_ERR(ptr);
6696                else
6697                        memmove(file->pathname, ptr,
6698                                sizeof(file->pathname)-(ptr-file->pathname));
6699        }
6700        spin_unlock(&mddev->lock);
6701
6702        if (err == 0 &&
6703            copy_to_user(arg, file, sizeof(*file)))
6704                err = -EFAULT;
6705
6706        kfree(file);
6707        return err;
6708}
6709
6710static int get_disk_info(struct mddev *mddev, void __user * arg)
6711{
6712        mdu_disk_info_t info;
6713        struct md_rdev *rdev;
6714
6715        if (copy_from_user(&info, arg, sizeof(info)))
6716                return -EFAULT;
6717
6718        rcu_read_lock();
6719        rdev = md_find_rdev_nr_rcu(mddev, info.number);
6720        if (rdev) {
6721                info.major = MAJOR(rdev->bdev->bd_dev);
6722                info.minor = MINOR(rdev->bdev->bd_dev);
6723                info.raid_disk = rdev->raid_disk;
6724                info.state = 0;
6725                if (test_bit(Faulty, &rdev->flags))
6726                        info.state |= (1<<MD_DISK_FAULTY);
6727                else if (test_bit(In_sync, &rdev->flags)) {
6728                        info.state |= (1<<MD_DISK_ACTIVE);
6729                        info.state |= (1<<MD_DISK_SYNC);
6730                }
6731                if (test_bit(Journal, &rdev->flags))
6732                        info.state |= (1<<MD_DISK_JOURNAL);
6733                if (test_bit(WriteMostly, &rdev->flags))
6734                        info.state |= (1<<MD_DISK_WRITEMOSTLY);
6735                if (test_bit(FailFast, &rdev->flags))
6736                        info.state |= (1<<MD_DISK_FAILFAST);
6737        } else {
6738                info.major = info.minor = 0;
6739                info.raid_disk = -1;
6740                info.state = (1<<MD_DISK_REMOVED);
6741        }
6742        rcu_read_unlock();
6743
6744        if (copy_to_user(arg, &info, sizeof(info)))
6745                return -EFAULT;
6746
6747        return 0;
6748}
6749
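/*
 * ADD_NEW_DISK ioctl.  Three cases: adding a device carrying a superblock to
 * an array that is still being assembled, hot-adding a device to a running
 * array, or adding a raw device to a 0.90 array that was fully described by
 * SET_ARRAY_INFO.
 */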
6750static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6751{
6752        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
6753        struct md_rdev *rdev;
6754        dev_t dev = MKDEV(info->major, info->minor);
6755
6756        if (mddev_is_clustered(mddev) &&
6757                !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6758                pr_warn("%s: Cannot add to clustered mddev.\n",
6759                        mdname(mddev));
6760                return -EINVAL;
6761        }
6762
6763        if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6764                return -EOVERFLOW;
6765
6766        if (!mddev->raid_disks) {
6767                int err;
6768                /* expecting a device which has a superblock */
6769                rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6770                if (IS_ERR(rdev)) {
6771                        pr_warn("md: md_import_device returned %ld\n",
6772                                PTR_ERR(rdev));
6773                        return PTR_ERR(rdev);
6774                }
6775                if (!list_empty(&mddev->disks)) {
6776                        struct md_rdev *rdev0
6777                                = list_entry(mddev->disks.next,
6778                                             struct md_rdev, same_set);
6779                        err = super_types[mddev->major_version]
6780                                .load_super(rdev, rdev0, mddev->minor_version);
6781                        if (err < 0) {
6782                                pr_warn("md: %s has different UUID to %s\n",
6783                                        bdevname(rdev->bdev,b),
6784                                        bdevname(rdev0->bdev,b2));
6785                                export_rdev(rdev);
6786                                return -EINVAL;
6787                        }
6788                }
6789                err = bind_rdev_to_array(rdev, mddev);
6790                if (err)
6791                        export_rdev(rdev);
6792                return err;
6793        }
6794
6795        /*
6796         * add_new_disk can be used once the array is assembled
6797         * to add "hot spares".  They must already have a superblock
6798         * written
6799         */
6800        if (mddev->pers) {
6801                int err;
6802                if (!mddev->pers->hot_add_disk) {
6803                        pr_warn("%s: personality does not support diskops!\n",
6804                                mdname(mddev));
6805                        return -EINVAL;
6806                }
6807                if (mddev->persistent)
6808                        rdev = md_import_device(dev, mddev->major_version,
6809                                                mddev->minor_version);
6810                else
6811                        rdev = md_import_device(dev, -1, -1);
6812                if (IS_ERR(rdev)) {
6813                        pr_warn("md: md_import_device returned %ld\n",
6814                                PTR_ERR(rdev));
6815                        return PTR_ERR(rdev);
6816                }
6817                /* set saved_raid_disk if appropriate */
6818                if (!mddev->persistent) {
6819                        if (info->state & (1<<MD_DISK_SYNC)  &&
6820                            info->raid_disk < mddev->raid_disks) {
6821                                rdev->raid_disk = info->raid_disk;
6822                                set_bit(In_sync, &rdev->flags);
6823                                clear_bit(Bitmap_sync, &rdev->flags);
6824                        } else
6825                                rdev->raid_disk = -1;
6826                        rdev->saved_raid_disk = rdev->raid_disk;
6827                } else
6828                        super_types[mddev->major_version].
6829                                validate_super(mddev, rdev);
6830                if ((info->state & (1<<MD_DISK_SYNC)) &&
6831                     rdev->raid_disk != info->raid_disk) {
6832                        /* This was a hot-add request, but the events don't
6833                         * match, so reject it.
6834                         */
6835                        export_rdev(rdev);
6836                        return -EINVAL;
6837                }
6838
6839                clear_bit(In_sync, &rdev->flags); /* just to be sure */
6840                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6841                        set_bit(WriteMostly, &rdev->flags);
6842                else
6843                        clear_bit(WriteMostly, &rdev->flags);
6844                if (info->state & (1<<MD_DISK_FAILFAST))
6845                        set_bit(FailFast, &rdev->flags);
6846                else
6847                        clear_bit(FailFast, &rdev->flags);
6848
6849                if (info->state & (1<<MD_DISK_JOURNAL)) {
6850                        struct md_rdev *rdev2;
6851                        bool has_journal = false;
6852
6853                        /* make sure no existing journal disk */
6854                        rdev_for_each(rdev2, mddev) {
6855                                if (test_bit(Journal, &rdev2->flags)) {
6856                                        has_journal = true;
6857                                        break;
6858                                }
6859                        }
6860                        if (has_journal || mddev->bitmap) {
6861                                export_rdev(rdev);
6862                                return -EBUSY;
6863                        }
6864                        set_bit(Journal, &rdev->flags);
6865                }
6866                /*
6867                 * check whether the device shows up in other nodes
6868                 */
6869                if (mddev_is_clustered(mddev)) {
6870                        if (info->state & (1 << MD_DISK_CANDIDATE))
6871                                set_bit(Candidate, &rdev->flags);
6872                        else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6873                                /* --add initiated by this node */
6874                                err = md_cluster_ops->add_new_disk(mddev, rdev);
6875                                if (err) {
6876                                        export_rdev(rdev);
6877                                        return err;
6878                                }
6879                        }
6880                }
6881
6882                rdev->raid_disk = -1;
6883                err = bind_rdev_to_array(rdev, mddev);
6884
6885                if (err)
6886                        export_rdev(rdev);
6887
6888                if (mddev_is_clustered(mddev)) {
6889                        if (info->state & (1 << MD_DISK_CANDIDATE)) {
6890                                if (!err) {
6891                                        err = md_cluster_ops->new_disk_ack(mddev,
6892                                                err == 0);
6893                                        if (err)
6894                                                md_kick_rdev_from_array(rdev);
6895                                }
6896                        } else {
6897                                if (err)
6898                                        md_cluster_ops->add_new_disk_cancel(mddev);
6899                                else
6900                                        err = add_bound_rdev(rdev);
6901                        }
6902
6903                } else if (!err)
6904                        err = add_bound_rdev(rdev);
6905
6906                return err;
6907        }
6908
6909        /* otherwise, add_new_disk is only allowed
6910         * for major_version==0 superblocks
6911         */
6912        if (mddev->major_version != 0) {
6913                pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
6914                return -EINVAL;
6915        }
6916
6917        if (!(info->state & (1<<MD_DISK_FAULTY))) {
6918                int err;
6919                rdev = md_import_device(dev, -1, 0);
6920                if (IS_ERR(rdev)) {
6921                        pr_warn("md: error, md_import_device() returned %ld\n",
6922                                PTR_ERR(rdev));
6923                        return PTR_ERR(rdev);
6924                }
6925                rdev->desc_nr = info->number;
6926                if (info->raid_disk < mddev->raid_disks)
6927                        rdev->raid_disk = info->raid_disk;
6928                else
6929                        rdev->raid_disk = -1;
6930
6931                if (rdev->raid_disk < mddev->raid_disks)
6932                        if (info->state & (1<<MD_DISK_SYNC))
6933                                set_bit(In_sync, &rdev->flags);
6934
6935                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6936                        set_bit(WriteMostly, &rdev->flags);
6937                if (info->state & (1<<MD_DISK_FAILFAST))
6938                        set_bit(FailFast, &rdev->flags);
6939
6940                if (!mddev->persistent) {
6941                        pr_debug("md: nonpersistent superblock ...\n");
6942                        rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6943                } else
6944                        rdev->sb_start = calc_dev_sboffset(rdev);
6945                rdev->sectors = rdev->sb_start;
6946
6947                err = bind_rdev_to_array(rdev, mddev);
6948                if (err) {
6949                        export_rdev(rdev);
6950                        return err;
6951                }
6952        }
6953
6954        return 0;
6955}
6956
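/*
 * HOT_REMOVE_DISK ioctl: detach a device from the array.  Fails with -EBUSY
 * if the device is still an active member that cannot be removed.
 */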
6957static int hot_remove_disk(struct mddev *mddev, dev_t dev)
6958{
6959        char b[BDEVNAME_SIZE];
6960        struct md_rdev *rdev;
6961
6962        if (!mddev->pers)
6963                return -ENODEV;
6964
6965        rdev = find_rdev(mddev, dev);
6966        if (!rdev)
6967                return -ENXIO;
6968
6969        if (rdev->raid_disk < 0)
6970                goto kick_rdev;
6971
6972        clear_bit(Blocked, &rdev->flags);
6973        remove_and_add_spares(mddev, rdev);
6974
6975        if (rdev->raid_disk >= 0)
6976                goto busy;
6977
6978kick_rdev:
6979        if (mddev_is_clustered(mddev)) {
6980                if (md_cluster_ops->remove_disk(mddev, rdev))
6981                        goto busy;
6982        }
6983
6984        md_kick_rdev_from_array(rdev);
6985        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6986        if (mddev->thread)
6987                md_wakeup_thread(mddev->thread);
6988        else
6989                md_update_sb(mddev, 1);
6990        md_new_event(mddev);
6991
6992        return 0;
6993busy:
6994        pr_debug("md: cannot remove active disk %s from %s ...\n",
6995                 bdevname(rdev->bdev,b), mdname(mddev));
6996        return -EBUSY;
6997}
6998
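/*
 * HOT_ADD_DISK ioctl (0.90 superblocks only): import the device, give it a
 * superblock location and add it as a spare so recovery can pull it into
 * the array.
 */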
6999static int hot_add_disk(struct mddev *mddev, dev_t dev)
7000{
7001        char b[BDEVNAME_SIZE];
7002        int err;
7003        struct md_rdev *rdev;
7004
7005        if (!mddev->pers)
7006                return -ENODEV;
7007
7008        if (mddev->major_version != 0) {
7009                pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
7010                        mdname(mddev));
7011                return -EINVAL;
7012        }
7013        if (!mddev->pers->hot_add_disk) {
7014                pr_warn("%s: personality does not support diskops!\n",
7015                        mdname(mddev));
7016                return -EINVAL;
7017        }
7018
7019        rdev = md_import_device(dev, -1, 0);
7020        if (IS_ERR(rdev)) {
7021                pr_warn("md: error, md_import_device() returned %ld\n",
7022                        PTR_ERR(rdev));
7023                return -EINVAL;
7024        }
7025
7026        if (mddev->persistent)
7027                rdev->sb_start = calc_dev_sboffset(rdev);
7028        else
7029                rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
7030
7031        rdev->sectors = rdev->sb_start;
7032
7033        if (test_bit(Faulty, &rdev->flags)) {
7034                pr_warn("md: can not hot-add faulty %s disk to %s!\n",
7035                        bdevname(rdev->bdev,b), mdname(mddev));
7036                err = -EINVAL;
7037                goto abort_export;
7038        }
7039
7040        clear_bit(In_sync, &rdev->flags);
7041        rdev->desc_nr = -1;
7042        rdev->saved_raid_disk = -1;
7043        err = bind_rdev_to_array(rdev, mddev);
7044        if (err)
7045                goto abort_export;
7046
7047        /*
7048         * The rest had better be atomic; disk failures can be noticed
7049         * in interrupt context ...
7050         */
7051
7052        rdev->raid_disk = -1;
7053
7054        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7055        if (!mddev->thread)
7056                md_update_sb(mddev, 1);
7057        /*
7058         * Kick recovery, maybe this spare has to be added to the
7059         * array immediately.
7060         */
7061        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7062        md_wakeup_thread(mddev->thread);
7063        md_new_event(mddev);
7064        return 0;
7065
7066abort_export:
7067        export_rdev(rdev);
7068        return err;
7069}
7070
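/*
 * SET_BITMAP_FILE ioctl: fd >= 0 attaches a file-backed write-intent bitmap
 * (the file must be a regular, writable file not already in use elsewhere);
 * fd < 0 removes the current bitmap.
 */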
7071static int set_bitmap_file(struct mddev *mddev, int fd)
7072{
7073        int err = 0;
7074
7075        if (mddev->pers) {
7076                if (!mddev->pers->quiesce || !mddev->thread)
7077                        return -EBUSY;
7078                if (mddev->recovery || mddev->sync_thread)
7079                        return -EBUSY;
7080                /* we should be able to change the bitmap. */
7081        }
7082
7083        if (fd >= 0) {
7084                struct inode *inode;
7085                struct file *f;
7086
7087                if (mddev->bitmap || mddev->bitmap_info.file)
7088                        return -EEXIST; /* cannot add when bitmap is present */
7089                f = fget(fd);
7090
7091                if (f == NULL) {
7092                        pr_warn("%s: error: failed to get bitmap file\n",
7093                                mdname(mddev));
7094                        return -EBADF;
7095                }
7096
7097                inode = f->f_mapping->host;
7098                if (!S_ISREG(inode->i_mode)) {
7099                        pr_warn("%s: error: bitmap file must be a regular file\n",
7100                                mdname(mddev));
7101                        err = -EBADF;
7102                } else if (!(f->f_mode & FMODE_WRITE)) {
7103                        pr_warn("%s: error: bitmap file must be opened for write\n",
7104                                mdname(mddev));
7105                        err = -EBADF;
7106                } else if (atomic_read(&inode->i_writecount) != 1) {
7107                        pr_warn("%s: error: bitmap file is already in use\n",
7108                                mdname(mddev));
7109                        err = -EBUSY;
7110                }
7111                if (err) {
7112                        fput(f);
7113                        return err;
7114                }
7115                mddev->bitmap_info.file = f;
7116                mddev->bitmap_info.offset = 0; /* file overrides offset */
7117        } else if (mddev->bitmap == NULL)
7118                return -ENOENT; /* cannot remove what isn't there */
7119        err = 0;
7120        if (mddev->pers) {
7121                if (fd >= 0) {
7122                        struct bitmap *bitmap;
7123
7124                        bitmap = md_bitmap_create(mddev, -1);
7125                        mddev_suspend(mddev);
7126                        if (!IS_ERR(bitmap)) {
7127                                mddev->bitmap = bitmap;
7128                                err = md_bitmap_load(mddev);
7129                        } else
7130                                err = PTR_ERR(bitmap);
7131                        if (err) {
7132                                md_bitmap_destroy(mddev);
7133                                fd = -1;
7134                        }
7135                        mddev_resume(mddev);
7136                } else if (fd < 0) {
7137                        mddev_suspend(mddev);
7138                        md_bitmap_destroy(mddev);
7139                        mddev_resume(mddev);
7140                }
7141        }
7142        if (fd < 0) {
7143                struct file *f = mddev->bitmap_info.file;
7144                if (f) {
7145                        spin_lock(&mddev->lock);
7146                        mddev->bitmap_info.file = NULL;
7147                        spin_unlock(&mddev->lock);
7148                        fput(f);
7149                }
7150        }
7151
7152        return err;
7153}
7154
7155/*
7156 * set_array_info is used in two different ways.
7157 * The original usage is when creating a new array.
7158 * In this usage, raid_disks is > 0 and, together with
7159 *  level, size, not_persistent, layout and chunksize, it determines the
7160 *  shape of the array.
7161 *  This will always create an array with a type-0.90.0 superblock.
7162 * The newer usage is when assembling an array.
7163 *  In this case raid_disks will be 0, and the major_version field is
7164 *  used to determine which style of superblock is to be found on the devices.
7165 *  The minor and patch _version numbers are also kept in case the
7166 *  super_block handler wishes to interpret them.
7167 */
7168static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
7169{
7170
7171        if (info->raid_disks == 0) {
7172                /* just setting version number for superblock loading */
7173                if (info->major_version < 0 ||
7174                    info->major_version >= ARRAY_SIZE(super_types) ||
7175                    super_types[info->major_version].name == NULL) {
7176                        /* maybe try to auto-load a module? */
7177                        pr_warn("md: superblock version %d not known\n",
7178                                info->major_version);
7179                        return -EINVAL;
7180                }
7181                mddev->major_version = info->major_version;
7182                mddev->minor_version = info->minor_version;
7183                mddev->patch_version = info->patch_version;
7184                mddev->persistent = !info->not_persistent;
7185                /* ensure mddev_put doesn't delete this now that there
7186                 * is some minimal configuration.
7187                 */
7188                mddev->ctime         = ktime_get_real_seconds();
7189                return 0;
7190        }
7191        mddev->major_version = MD_MAJOR_VERSION;
7192        mddev->minor_version = MD_MINOR_VERSION;
7193        mddev->patch_version = MD_PATCHLEVEL_VERSION;
7194        mddev->ctime         = ktime_get_real_seconds();
7195
7196        mddev->level         = info->level;
7197        mddev->clevel[0]     = 0;
7198        mddev->dev_sectors   = 2 * (sector_t)info->size;
7199        mddev->raid_disks    = info->raid_disks;
7200        /* don't set md_minor, it is determined by which /dev/md* was
7201         * opened
7202         */
7203        if (info->state & (1<<MD_SB_CLEAN))
7204                mddev->recovery_cp = MaxSector;
7205        else
7206                mddev->recovery_cp = 0;
7207        mddev->persistent    = ! info->not_persistent;
7208        mddev->external      = 0;
7209
7210        mddev->layout        = info->layout;
7211        if (mddev->level == 0)
7212                /* Cannot trust RAID0 layout info here */
7213                mddev->layout = -1;
7214        mddev->chunk_sectors = info->chunk_size >> 9;
7215
7216        if (mddev->persistent) {
7217                mddev->max_disks = MD_SB_DISKS;
7218                mddev->flags = 0;
7219                mddev->sb_flags = 0;
7220        }
7221        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7222
7223        mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7224        mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7225        mddev->bitmap_info.offset = 0;
7226
7227        mddev->reshape_position = MaxSector;
7228
7229        /*
7230         * Generate a 128 bit UUID
7231         */
7232        get_random_bytes(mddev->uuid, 16);
7233
7234        mddev->new_level = mddev->level;
7235        mddev->new_chunk_sectors = mddev->chunk_sectors;
7236        mddev->new_layout = mddev->layout;
7237        mddev->delta_disks = 0;
7238        mddev->reshape_backwards = 0;
7239
7240        return 0;
7241}
7242
7243void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
7244{
7245        lockdep_assert_held(&mddev->reconfig_mutex);
7246
7247        if (mddev->external_size)
7248                return;
7249
7250        mddev->array_sectors = array_sectors;
7251}
7252EXPORT_SYMBOL(md_set_array_sectors);
7253
7254static int update_size(struct mddev *mddev, sector_t num_sectors)
7255{
7256        struct md_rdev *rdev;
7257        int rv;
7258        int fit = (num_sectors == 0);
7259        sector_t old_dev_sectors = mddev->dev_sectors;
7260
7261        if (mddev->pers->resize == NULL)
7262                return -EINVAL;
7263        /* The "num_sectors" is the number of sectors of each device that
7264         * is used.  This can only make sense for arrays with redundancy.
7265         * linear and raid0 always use whatever space is available. We can only
7266         * consider changing this number if no resync or reconstruction is
7267         * happening, and if the new size is acceptable. It must fit before the
7268         * sb_start or, if that is <data_offset, it must fit before the size
7269         * of each device.  If num_sectors is zero, we find the largest size
7270         * that fits.
7271         */
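            /*
             * Worked example (editorial, hypothetical sizes): with members
             * offering 2000000, 2000000 and 1999872 usable sectors,
             * update_size(mddev, 0) resolves num_sectors to 1999872 (the
             * smallest), while an explicit request for 2000000 sectors
             * fails with -ENOSPC on the smaller member.
             */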
7272        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7273            mddev->sync_thread)
7274                return -EBUSY;
7275        if (mddev->ro)
7276                return -EROFS;
7277
7278        rdev_for_each(rdev, mddev) {
7279                sector_t avail = rdev->sectors;
7280
7281                if (fit && (num_sectors == 0 || num_sectors > avail))
7282                        num_sectors = avail;
7283                if (avail < num_sectors)
7284                        return -ENOSPC;
7285        }
7286        rv = mddev->pers->resize(mddev, num_sectors);
7287        if (!rv) {
7288                if (mddev_is_clustered(mddev))
7289                        md_cluster_ops->update_size(mddev, old_dev_sectors);
7290                else if (mddev->queue) {
7291                        set_capacity(mddev->gendisk, mddev->array_sectors);
7292                        revalidate_disk_size(mddev->gendisk, true);
7293                }
7294        }
7295        return rv;
7296}
7297
7298static int update_raid_disks(struct mddev *mddev, int raid_disks)
7299{
7300        int rv;
7301        struct md_rdev *rdev;
7302        /* change the number of raid disks */
7303        if (mddev->pers->check_reshape == NULL)
7304                return -EINVAL;
7305        if (mddev->ro)
7306                return -EROFS;
7307        if (raid_disks <= 0 ||
7308            (mddev->max_disks && raid_disks >= mddev->max_disks))
7309                return -EINVAL;
7310        if (mddev->sync_thread ||
7311            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7312            test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
7313            mddev->reshape_position != MaxSector)
7314                return -EBUSY;
7315
7316        rdev_for_each(rdev, mddev) {
7317                if (mddev->raid_disks < raid_disks &&
7318                    rdev->data_offset < rdev->new_data_offset)
7319                        return -EINVAL;
7320                if (mddev->raid_disks > raid_disks &&
7321                    rdev->data_offset > rdev->new_data_offset)
7322                        return -EINVAL;
7323        }
7324
7325        mddev->delta_disks = raid_disks - mddev->raid_disks;
7326        if (mddev->delta_disks < 0)
7327                mddev->reshape_backwards = 1;
7328        else if (mddev->delta_disks > 0)
7329                mddev->reshape_backwards = 0;
7330
7331        rv = mddev->pers->check_reshape(mddev);
7332        if (rv < 0) {
7333                mddev->delta_disks = 0;
7334                mddev->reshape_backwards = 0;
7335        }
7336        return rv;
7337}
7338
7339/*
7340 * update_array_info is used to change the configuration of an
7341 * on-line array.
7342 * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
7343 * fields in the info are checked against the array.
7344 * Any differences that cannot be handled will cause an error.
7345 * Normally, only one change can be managed at a time.
7346 */
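    /*
     * Editorial illustration of the "one change at a time" rule: an info
     * that differs from the live array only in raid_disks triggers
     * update_raid_disks(); one that differs only in the MD_SB_BITMAP_PRESENT
     * state bit adds or removes the internal bitmap; an info differing in
     * both at once is rejected with -EINVAL.
     */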
7347static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
7348{
7349        int rv = 0;
7350        int cnt = 0;
7351        int state = 0;
7352
7353        /* calculate expected state, ignoring low bits */
7354        if (mddev->bitmap && mddev->bitmap_info.offset)
7355                state |= (1 << MD_SB_BITMAP_PRESENT);
7356
7357        if (mddev->major_version != info->major_version ||
7358            mddev->minor_version != info->minor_version ||
7359/*          mddev->patch_version != info->patch_version || */
7360            mddev->ctime         != info->ctime         ||
7361            mddev->level         != info->level         ||
7362/*          mddev->layout        != info->layout        || */
7363            mddev->persistent    != !info->not_persistent ||
7364            mddev->chunk_sectors != info->chunk_size >> 9 ||
7365            /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7366            ((state^info->state) & 0xfffffe00)
7367                )
7368                return -EINVAL;
7369        /* Check there is only one change */
7370        if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7371                cnt++;
7372        if (mddev->raid_disks != info->raid_disks)
7373                cnt++;
7374        if (mddev->layout != info->layout)
7375                cnt++;
7376        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7377                cnt++;
7378        if (cnt == 0)
7379                return 0;
7380        if (cnt > 1)
7381                return -EINVAL;
7382
7383        if (mddev->layout != info->layout) {
7384                /* Change layout
7385                 * we don't need to do anything at the md level, the
7386                 * personality will take care of it all.
7387                 */
7388                if (mddev->pers->check_reshape == NULL)
7389                        return -EINVAL;
7390                else {
7391                        mddev->new_layout = info->layout;
7392                        rv = mddev->pers->check_reshape(mddev);
7393                        if (rv)
7394                                mddev->new_layout = mddev->layout;
7395                        return rv;
7396                }
7397        }
7398        if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7399                rv = update_size(mddev, (sector_t)info->size * 2);
7400
7401        if (mddev->raid_disks    != info->raid_disks)
7402                rv = update_raid_disks(mddev, info->raid_disks);
7403
7404        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7405                if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7406                        rv = -EINVAL;
7407                        goto err;
7408                }
7409                if (mddev->recovery || mddev->sync_thread) {
7410                        rv = -EBUSY;
7411                        goto err;
7412                }
7413                if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7414                        struct bitmap *bitmap;
7415                        /* add the bitmap */
7416                        if (mddev->bitmap) {
7417                                rv = -EEXIST;
7418                                goto err;
7419                        }
7420                        if (mddev->bitmap_info.default_offset == 0) {
7421                                rv = -EINVAL;
7422                                goto err;
7423                        }
7424                        mddev->bitmap_info.offset =
7425                                mddev->bitmap_info.default_offset;
7426                        mddev->bitmap_info.space =
7427                                mddev->bitmap_info.default_space;
7428                        bitmap = md_bitmap_create(mddev, -1);
7429                        mddev_suspend(mddev);
7430                        if (!IS_ERR(bitmap)) {
7431                                mddev->bitmap = bitmap;
7432                                rv = md_bitmap_load(mddev);
7433                        } else
7434                                rv = PTR_ERR(bitmap);
7435                        if (rv)
7436                                md_bitmap_destroy(mddev);
7437                        mddev_resume(mddev);
7438                } else {
7439                        /* remove the bitmap */
7440                        if (!mddev->bitmap) {
7441                                rv = -ENOENT;
7442                                goto err;
7443                        }
7444                        if (mddev->bitmap->storage.file) {
7445                                rv = -EINVAL;
7446                                goto err;
7447                        }
7448                        if (mddev->bitmap_info.nodes) {
7449                                /* hold PW on all the bitmap locks */
7450                                if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7451                                        pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7452                                        rv = -EPERM;
7453                                        md_cluster_ops->unlock_all_bitmaps(mddev);
7454                                        goto err;
7455                                }
7456
7457                                mddev->bitmap_info.nodes = 0;
7458                                md_cluster_ops->leave(mddev);
7459                                module_put(md_cluster_mod);
7460                                mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
7461                        }
7462                        mddev_suspend(mddev);
7463                        md_bitmap_destroy(mddev);
7464                        mddev_resume(mddev);
7465                        mddev->bitmap_info.offset = 0;
7466                }
7467        }
7468        md_update_sb(mddev, 1);
7469        return rv;
7470err:
7471        return rv;
7472}
7473
7474static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7475{
7476        struct md_rdev *rdev;
7477        int err = 0;
7478
7479        if (mddev->pers == NULL)
7480                return -ENODEV;
7481
7482        rcu_read_lock();
7483        rdev = md_find_rdev_rcu(mddev, dev);
7484        if (!rdev)
7485                err =  -ENODEV;
7486        else {
7487                md_error(mddev, rdev);
7488                if (!test_bit(Faulty, &rdev->flags))
7489                        err = -EBUSY;
7490        }
7491        rcu_read_unlock();
7492        return err;
7493}
7494
7495/*
7496 * We have a problem here: there is no easy way to give a CHS
7497 * virtual geometry. We currently pretend that we have 2 heads and
7498 * 4 sectors (with a BIG number of cylinders...). This drives
7499 * dosfs just mad... ;-)
7500 */
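    /*
     * Worked example (editorial): a 1 GiB array has array_sectors = 2097152,
     * so md_getgeo() reports 262144 cylinders, and
     * 262144 cylinders * 2 heads * 4 sectors = 2097152 sectors again.
     */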
7501static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7502{
7503        struct mddev *mddev = bdev->bd_disk->private_data;
7504
7505        geo->heads = 2;
7506        geo->sectors = 4;
7507        geo->cylinders = mddev->array_sectors / 8;
7508        return 0;
7509}
7510
7511static inline bool md_ioctl_valid(unsigned int cmd)
7512{
7513        switch (cmd) {
7514        case ADD_NEW_DISK:
7515        case GET_ARRAY_INFO:
7516        case GET_BITMAP_FILE:
7517        case GET_DISK_INFO:
7518        case HOT_ADD_DISK:
7519        case HOT_REMOVE_DISK:
7520        case RAID_AUTORUN:
7521        case RAID_VERSION:
7522        case RESTART_ARRAY_RW:
7523        case RUN_ARRAY:
7524        case SET_ARRAY_INFO:
7525        case SET_BITMAP_FILE:
7526        case SET_DISK_FAULTY:
7527        case STOP_ARRAY:
7528        case STOP_ARRAY_RO:
7529        case CLUSTERED_DISK_NACK:
7530                return true;
7531        default:
7532                return false;
7533        }
7534}
7535
7536static int md_ioctl(struct block_device *bdev, fmode_t mode,
7537                        unsigned int cmd, unsigned long arg)
7538{
7539        int err = 0;
7540        void __user *argp = (void __user *)arg;
7541        struct mddev *mddev = NULL;
7542        bool did_set_md_closing = false;
7543
7544        if (!md_ioctl_valid(cmd))
7545                return -ENOTTY;
7546
7547        switch (cmd) {
7548        case RAID_VERSION:
7549        case GET_ARRAY_INFO:
7550        case GET_DISK_INFO:
7551                break;
7552        default:
7553                if (!capable(CAP_SYS_ADMIN))
7554                        return -EACCES;
7555        }
7556
7557        /*
7558         * Commands dealing with the RAID driver but not any
7559         * particular array:
7560         */
7561        switch (cmd) {
7562        case RAID_VERSION:
7563                err = get_version(argp);
7564                goto out;
7565
7566#ifndef MODULE
7567        case RAID_AUTORUN:
7568                err = 0;
7569                autostart_arrays(arg);
7570                goto out;
7571#endif
7572        default:;
7573        }
7574
7575        /*
7576         * Commands creating/starting a new array:
7577         */
7578
7579        mddev = bdev->bd_disk->private_data;
7580
7581        if (!mddev) {
7582                BUG();
7583                goto out;
7584        }
7585
7586        /* Some actions do not require the mutex */
7587        switch (cmd) {
7588        case GET_ARRAY_INFO:
7589                if (!mddev->raid_disks && !mddev->external)
7590                        err = -ENODEV;
7591                else
7592                        err = get_array_info(mddev, argp);
7593                goto out;
7594
7595        case GET_DISK_INFO:
7596                if (!mddev->raid_disks && !mddev->external)
7597                        err = -ENODEV;
7598                else
7599                        err = get_disk_info(mddev, argp);
7600                goto out;
7601
7602        case SET_DISK_FAULTY:
7603                err = set_disk_faulty(mddev, new_decode_dev(arg));
7604                goto out;
7605
7606        case GET_BITMAP_FILE:
7607                err = get_bitmap_file(mddev, argp);
7608                goto out;
7609
7610        }
7611
7612        if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
7613                flush_rdev_wq(mddev);
7614
7615        if (cmd == HOT_REMOVE_DISK)
7616                /* need to ensure recovery thread has run */
7617                wait_event_interruptible_timeout(mddev->sb_wait,
7618                                                 !test_bit(MD_RECOVERY_NEEDED,
7619                                                           &mddev->recovery),
7620                                                 msecs_to_jiffies(5000));
7621        if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7622                /* Need to flush page cache, and ensure no-one else opens
7623                 * and writes
7624                 */
7625                mutex_lock(&mddev->open_mutex);
7626                if (mddev->pers && atomic_read(&mddev->openers) > 1) {
7627                        mutex_unlock(&mddev->open_mutex);
7628                        err = -EBUSY;
7629                        goto out;
7630                }
7631                if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
7632                        mutex_unlock(&mddev->open_mutex);
7633                        err = -EBUSY;
7634                        goto out;
7635                }
7636                did_set_md_closing = true;
7637                mutex_unlock(&mddev->open_mutex);
7638                sync_blockdev(bdev);
7639        }
7640        err = mddev_lock(mddev);
7641        if (err) {
7642                pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7643                         err, cmd);
7644                goto out;
7645        }
7646
7647        if (cmd == SET_ARRAY_INFO) {
7648                mdu_array_info_t info;
7649                if (!arg)
7650                        memset(&info, 0, sizeof(info));
7651                else if (copy_from_user(&info, argp, sizeof(info))) {
7652                        err = -EFAULT;
7653                        goto unlock;
7654                }
7655                if (mddev->pers) {
7656                        err = update_array_info(mddev, &info);
7657                        if (err) {
7658                                pr_warn("md: couldn't update array info. %d\n", err);
7659                                goto unlock;
7660                        }
7661                        goto unlock;
7662                }
7663                if (!list_empty(&mddev->disks)) {
7664                        pr_warn("md: array %s already has disks!\n", mdname(mddev));
7665                        err = -EBUSY;
7666                        goto unlock;
7667                }
7668                if (mddev->raid_disks) {
7669                        pr_warn("md: array %s already initialised!\n", mdname(mddev));
7670                        err = -EBUSY;
7671                        goto unlock;
7672                }
7673                err = set_array_info(mddev, &info);
7674                if (err) {
7675                        pr_warn("md: couldn't set array info. %d\n", err);
7676                        goto unlock;
7677                }
7678                goto unlock;
7679        }
7680
7681        /*
7682         * Commands querying/configuring an existing array:
7683         */
7684        /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7685         * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7686        if ((!mddev->raid_disks && !mddev->external)
7687            && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7688            && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7689            && cmd != GET_BITMAP_FILE) {
7690                err = -ENODEV;
7691                goto unlock;
7692        }
7693
7694        /*
7695         * Commands even a read-only array can execute:
7696         */
7697        switch (cmd) {
7698        case RESTART_ARRAY_RW:
7699                err = restart_array(mddev);
7700                goto unlock;
7701
7702        case STOP_ARRAY:
7703                err = do_md_stop(mddev, 0, bdev);
7704                goto unlock;
7705
7706        case STOP_ARRAY_RO:
7707                err = md_set_readonly(mddev, bdev);
7708                goto unlock;
7709
7710        case HOT_REMOVE_DISK:
7711                err = hot_remove_disk(mddev, new_decode_dev(arg));
7712                goto unlock;
7713
7714        case ADD_NEW_DISK:
7715                /* We can support ADD_NEW_DISK on read-only arrays
7716                 * only if we are re-adding a preexisting device.
7717                 * So require mddev->pers and MD_DISK_SYNC.
7718                 */
7719                if (mddev->pers) {
7720                        mdu_disk_info_t info;
7721                        if (copy_from_user(&info, argp, sizeof(info)))
7722                                err = -EFAULT;
7723                        else if (!(info.state & (1<<MD_DISK_SYNC)))
7724                                /* Need to clear read-only for this */
7725                                break;
7726                        else
7727                                err = add_new_disk(mddev, &info);
7728                        goto unlock;
7729                }
7730                break;
7731        }
7732
7733        /*
7734         * The remaining ioctls are changing the state of the
7735         * superblock, so we do not allow them on read-only arrays.
7736         */
7737        if (mddev->ro && mddev->pers) {
7738                if (mddev->ro == 2) {
7739                        mddev->ro = 0;
7740                        sysfs_notify_dirent_safe(mddev->sysfs_state);
7741                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7742                        /* mddev_unlock will wake thread */
7743                        /* If a device failed while we were read-only, we
7744                         * need to make sure the metadata is updated now.
7745                         */
7746                        if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7747                                mddev_unlock(mddev);
7748                                wait_event(mddev->sb_wait,
7749                                           !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7750                                           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7751                                mddev_lock_nointr(mddev);
7752                        }
7753                } else {
7754                        err = -EROFS;
7755                        goto unlock;
7756                }
7757        }
7758
7759        switch (cmd) {
7760        case ADD_NEW_DISK:
7761        {
7762                mdu_disk_info_t info;
7763                if (copy_from_user(&info, argp, sizeof(info)))
7764                        err = -EFAULT;
7765                else
7766                        err = add_new_disk(mddev, &info);
7767                goto unlock;
7768        }
7769
7770        case CLUSTERED_DISK_NACK:
7771                if (mddev_is_clustered(mddev))
7772                        md_cluster_ops->new_disk_ack(mddev, false);
7773                else
7774                        err = -EINVAL;
7775                goto unlock;
7776
7777        case HOT_ADD_DISK:
7778                err = hot_add_disk(mddev, new_decode_dev(arg));
7779                goto unlock;
7780
7781        case RUN_ARRAY:
7782                err = do_md_run(mddev);
7783                goto unlock;
7784
7785        case SET_BITMAP_FILE:
7786                err = set_bitmap_file(mddev, (int)arg);
7787                goto unlock;
7788
7789        default:
7790                err = -EINVAL;
7791                goto unlock;
7792        }
7793
7794unlock:
7795        if (mddev->hold_active == UNTIL_IOCTL &&
7796            err != -EINVAL)
7797                mddev->hold_active = 0;
7798        mddev_unlock(mddev);
7799out:
7800        if (did_set_md_closing)
7801                clear_bit(MD_CLOSING, &mddev->flags);
7802        return err;
7803}
7804#ifdef CONFIG_COMPAT
7805static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
7806                    unsigned int cmd, unsigned long arg)
7807{
7808        switch (cmd) {
7809        case HOT_REMOVE_DISK:
7810        case HOT_ADD_DISK:
7811        case SET_DISK_FAULTY:
7812        case SET_BITMAP_FILE:
7813                /* These take an integer arg; do not convert */
7814                break;
7815        default:
7816                arg = (unsigned long)compat_ptr(arg);
7817                break;
7818        }
7819
7820        return md_ioctl(bdev, mode, cmd, arg);
7821}
7822#endif /* CONFIG_COMPAT */
7823
7824static int md_set_read_only(struct block_device *bdev, bool ro)
7825{
7826        struct mddev *mddev = bdev->bd_disk->private_data;
7827        int err;
7828
7829        err = mddev_lock(mddev);
7830        if (err)
7831                return err;
7832
7833        if (!mddev->raid_disks && !mddev->external) {
7834                err = -ENODEV;
7835                goto out_unlock;
7836        }
7837
7838        /*
7839         * Transitioning to read-auto need only happen for arrays that call
7840         * md_write_start and which are not ready for writes yet.
7841         */
7842        if (!ro && mddev->ro == 1 && mddev->pers) {
7843                err = restart_array(mddev);
7844                if (err)
7845                        goto out_unlock;
7846                mddev->ro = 2;
7847        }
7848
7849out_unlock:
7850        mddev_unlock(mddev);
7851        return err;
7852}
7853
7854static int md_open(struct block_device *bdev, fmode_t mode)
7855{
7856        /*
7857         * Succeed if we can lock the mddev, which confirms that
7858         * it isn't being stopped right now.
7859         */
7860        struct mddev *mddev = mddev_find(bdev->bd_dev);
7861        int err;
7862
7863        if (!mddev)
7864                return -ENODEV;
7865
7866        if (mddev->gendisk != bdev->bd_disk) {
7867                /* we are racing with mddev_put which is discarding this
7868                 * bd_disk.
7869                 */
7870                mddev_put(mddev);
7871                /* Wait until bdev->bd_disk is definitely gone */
7872                if (work_pending(&mddev->del_work))
7873                        flush_workqueue(md_misc_wq);
7874                return -EBUSY;
7875        }
7876        BUG_ON(mddev != bdev->bd_disk->private_data);
7877
7878        if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
7879                goto out;
7880
7881        if (test_bit(MD_CLOSING, &mddev->flags)) {
7882                mutex_unlock(&mddev->open_mutex);
7883                err = -ENODEV;
7884                goto out;
7885        }
7886
7887        err = 0;
7888        atomic_inc(&mddev->openers);
7889        mutex_unlock(&mddev->open_mutex);
7890
7891        bdev_check_media_change(bdev);
7892 out:
7893        if (err)
7894                mddev_put(mddev);
7895        return err;
7896}
7897
7898static void md_release(struct gendisk *disk, fmode_t mode)
7899{
7900        struct mddev *mddev = disk->private_data;
7901
7902        BUG_ON(!mddev);
7903        atomic_dec(&mddev->openers);
7904        mddev_put(mddev);
7905}
7906
7907static int md_media_changed(struct gendisk *disk)
7908{
7909        struct mddev *mddev = disk->private_data;
7910
7911        return mddev->changed;
7912}
7913
7914static int md_revalidate(struct gendisk *disk)
7915{
7916        struct mddev *mddev = disk->private_data;
7917
7918        mddev->changed = 0;
7919        return 0;
7920}
7921static const struct block_device_operations md_fops =
7922{
7923        .owner          = THIS_MODULE,
7924        .open           = md_open,
7925        .release        = md_release,
7926        .ioctl          = md_ioctl,
7927#ifdef CONFIG_COMPAT
7928        .compat_ioctl   = md_compat_ioctl,
7929#endif
7930        .getgeo         = md_getgeo,
7931        .media_changed  = md_media_changed,
7932        .revalidate_disk= md_revalidate,
7933        .set_read_only  = md_set_read_only,
7934};
7935
7936static int md_thread(void *arg)
7937{
7938        struct md_thread *thread = arg;
7939
7940        /*
7941         * md_thread is a 'system-thread', its priority should be very
7942         * high. We avoid resource deadlocks individually in each
7943         * raid personality. (RAID5 does preallocation) We also use RR and
7944         * the very same RT priority as kswapd, thus we will never get
7945         * into a priority inversion deadlock.
7946         *
7947         * We definitely have to have equal or higher priority than
7948         * bdflush, otherwise bdflush will deadlock if there are too
7949         * many dirty RAID5 blocks.
7950         */
7951
7952        allow_signal(SIGKILL);
7953        while (!kthread_should_stop()) {
7954
7955                /* We need to wait INTERRUPTIBLE so that
7956                 * we don't add to the load-average.
7957                 * That means we need to be sure no signals are
7958                 * pending
7959                 */
7960                if (signal_pending(current))
7961                        flush_signals(current);
7962
7963                wait_event_interruptible_timeout
7964                        (thread->wqueue,
7965                         test_bit(THREAD_WAKEUP, &thread->flags)
7966                         || kthread_should_stop() || kthread_should_park(),
7967                         thread->timeout);
7968
7969                clear_bit(THREAD_WAKEUP, &thread->flags);
7970                if (kthread_should_park())
7971                        kthread_parkme();
7972                if (!kthread_should_stop())
7973                        thread->run(thread);
7974        }
7975
7976        return 0;
7977}
7978
7979void md_wakeup_thread(struct md_thread *thread)
7980{
7981        if (thread) {
7982                pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7983                set_bit(THREAD_WAKEUP, &thread->flags);
7984                wake_up(&thread->wqueue);
7985        }
7986}
7987EXPORT_SYMBOL(md_wakeup_thread);
7988
7989struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7990                struct mddev *mddev, const char *name)
7991{
7992        struct md_thread *thread;
7993
7994        thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
7995        if (!thread)
7996                return NULL;
7997
7998        init_waitqueue_head(&thread->wqueue);
7999
8000        thread->run = run;
8001        thread->mddev = mddev;
8002        thread->timeout = MAX_SCHEDULE_TIMEOUT;
8003        thread->tsk = kthread_run(md_thread, thread,
8004                                  "%s_%s",
8005                                  mdname(thread->mddev),
8006                                  name);
8007        if (IS_ERR(thread->tsk)) {
8008                kfree(thread);
8009                return NULL;
8010        }
8011        return thread;
8012}
8013EXPORT_SYMBOL(md_register_thread);
8014
8015void md_unregister_thread(struct md_thread **threadp)
8016{
8017        struct md_thread *thread = *threadp;
8018        if (!thread)
8019                return;
8020        pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
8021        /* Locking ensures that mddev_unlock does not wake_up a
8022         * non-existent thread
8023         */
8024        spin_lock(&pers_lock);
8025        *threadp = NULL;
8026        spin_unlock(&pers_lock);
8027
8028        kthread_stop(thread->tsk);
8029        kfree(thread);
8030}
8031EXPORT_SYMBOL(md_unregister_thread);
8032
8033void md_error(struct mddev *mddev, struct md_rdev *rdev)
8034{
8035        if (!rdev || test_bit(Faulty, &rdev->flags))
8036                return;
8037
8038        if (!mddev->pers || !mddev->pers->error_handler)
8039                return;
8040        mddev->pers->error_handler(mddev,rdev);
8041        if (mddev->degraded)
8042                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8043        sysfs_notify_dirent_safe(rdev->sysfs_state);
8044        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8045        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8046        md_wakeup_thread(mddev->thread);
8047        if (mddev->event_work.func)
8048                queue_work(md_misc_wq, &mddev->event_work);
8049        md_new_event(mddev);
8050}
8051EXPORT_SYMBOL(md_error);
8052
8053/* seq_file implementation /proc/mdstat */
8054
8055static void status_unused(struct seq_file *seq)
8056{
8057        int i = 0;
8058        struct md_rdev *rdev;
8059
8060        seq_printf(seq, "unused devices: ");
8061
8062        list_for_each_entry(rdev, &pending_raid_disks, same_set) {
8063                char b[BDEVNAME_SIZE];
8064                i++;
8065                seq_printf(seq, "%s ",
8066                              bdevname(rdev->bdev,b));
8067        }
8068        if (!i)
8069                seq_printf(seq, "<none>");
8070
8071        seq_printf(seq, "\n");
8072}
8073
8074static int status_resync(struct seq_file *seq, struct mddev *mddev)
8075{
8076        sector_t max_sectors, resync, res;
8077        unsigned long dt, db = 0;
8078        sector_t rt, curr_mark_cnt, resync_mark_cnt;
8079        int scale, recovery_active;
8080        unsigned int per_milli;
8081
8082        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8083            test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8084                max_sectors = mddev->resync_max_sectors;
8085        else
8086                max_sectors = mddev->dev_sectors;
8087
8088        resync = mddev->curr_resync;
8089        if (resync <= 3) {
8090                if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8091                        /* Still cleaning up */
8092                        resync = max_sectors;
8093        } else if (resync > max_sectors)
8094                resync = max_sectors;
8095        else
8096                resync -= atomic_read(&mddev->recovery_active);
8097
8098        if (resync == 0) {
8099                if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
8100                        struct md_rdev *rdev;
8101
8102                        rdev_for_each(rdev, mddev)
8103                                if (rdev->raid_disk >= 0 &&
8104                                    !test_bit(Faulty, &rdev->flags) &&
8105                                    rdev->recovery_offset != MaxSector &&
8106                                    rdev->recovery_offset) {
8107                                        seq_printf(seq, "\trecover=REMOTE");
8108                                        return 1;
8109                                }
8110                        if (mddev->reshape_position != MaxSector)
8111                                seq_printf(seq, "\treshape=REMOTE");
8112                        else
8113                                seq_printf(seq, "\tresync=REMOTE");
8114                        return 1;
8115                }
8116                if (mddev->recovery_cp < MaxSector) {
8117                        seq_printf(seq, "\tresync=PENDING");
8118                        return 1;
8119                }
8120                return 0;
8121        }
8122        if (resync < 3) {
8123                seq_printf(seq, "\tresync=DELAYED");
8124                return 1;
8125        }
8126
8127        WARN_ON(max_sectors == 0);
8128        /* Pick 'scale' such that (resync>>scale)*1000 will fit
8129         * in a sector_t, and (max_sectors>>scale) will fit in a
8130         * u32, as those are the requirements for sector_div.
8131         * Thus 'scale' must be at least 10
8132         */
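            /*
             * Worked example (editorial): resyncing 512 GiB of a 1 TiB
             * device gives resync = 2^30 and max_sectors = 2^31 sectors.
             * scale stays 10, so res = (2^30 >> 10) * 1000 = 1048576000,
             * divided by (2^31 >> 10) + 1 = 2097153 gives per_milli = 499,
             * printed below as 49.9%.
             */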
8133        scale = 10;
8134        if (sizeof(sector_t) > sizeof(unsigned long)) {
8135                while ( max_sectors/2 > (1ULL<<(scale+32)))
8136                        scale++;
8137        }
8138        res = (resync>>scale)*1000;
8139        sector_div(res, (u32)((max_sectors>>scale)+1));
8140
8141        per_milli = res;
8142        {
8143                int i, x = per_milli/50, y = 20-x;
8144                seq_printf(seq, "[");
8145                for (i = 0; i < x; i++)
8146                        seq_printf(seq, "=");
8147                seq_printf(seq, ">");
8148                for (i = 0; i < y; i++)
8149                        seq_printf(seq, ".");
8150                seq_printf(seq, "] ");
8151        }
8152        seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8153                   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8154                    "reshape" :
8155                    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8156                     "check" :
8157                     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8158                      "resync" : "recovery"))),
8159                   per_milli/10, per_milli % 10,
8160                   (unsigned long long) resync/2,
8161                   (unsigned long long) max_sectors/2);
8162
8163        /*
8164         * dt: time from mark until now
8165         * db: blocks written from mark until now
8166         * rt: remaining time
8167         *
8168         * rt is a sector_t, which is always 64bit now. We are keeping
8169         * the original algorithm, but it is not really necessary.
8170         *
8171         * Original algorithm:
8172         *   So we divide before multiply in case it is 32bit and close
8173         *   to the limit.
8174         *   We scale the divisor (db) by 32 to avoid losing precision
8175         *   near the end of resync when the number of remaining sectors
8176         *   is close to 'db'.
8177         *   We then divide rt by 32 after multiplying by db to compensate.
8178         *   The '+1' avoids division by zero if db is very small.
8179         */
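            /*
             * Worked example (editorial): 100000 sectors done in dt = 10s
             * with 1000000 sectors still to go gives
             * rt = 1000000 / (100000/32 + 1) = 319, then * 10 = 3190,
             * then >> 5 = 99, i.e. roughly the expected 100 seconds.
             */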
8180        dt = ((jiffies - mddev->resync_mark) / HZ);
8181        if (!dt) dt++;
8182
8183        curr_mark_cnt = mddev->curr_mark_cnt;
8184        recovery_active = atomic_read(&mddev->recovery_active);
8185        resync_mark_cnt = mddev->resync_mark_cnt;
8186
8187        if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8188                db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
8189
8190        rt = max_sectors - resync;    /* number of remaining sectors */
8191        rt = div64_u64(rt, db/32+1);
8192        rt *= dt;
8193        rt >>= 5;
8194
8195        seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8196                   ((unsigned long)rt % 60)/6);
8197
8198        seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8199        return 1;
8200}
8201
8202static void *md_seq_start(struct seq_file *seq, loff_t *pos)
8203{
8204        struct list_head *tmp;
8205        loff_t l = *pos;
8206        struct mddev *mddev;
8207
8208        if (l == 0x10000) {
8209                ++*pos;
8210                return (void *)2;
8211        }
8212        if (l > 0x10000)
8213                return NULL;
8214        if (!l--)
8215                /* header */
8216                return (void*)1;
8217
8218        spin_lock(&all_mddevs_lock);
8219        list_for_each(tmp,&all_mddevs)
8220                if (!l--) {
8221                        mddev = list_entry(tmp, struct mddev, all_mddevs);
8222                        mddev_get(mddev);
8223                        spin_unlock(&all_mddevs_lock);
8224                        return mddev;
8225                }
8226        spin_unlock(&all_mddevs_lock);
8227        if (!l--)
8228                return (void*)2;/* tail */
8229        return NULL;
8230}
8231
8232static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
8233{
8234        struct list_head *tmp;
8235        struct mddev *next_mddev, *mddev = v;
8236
8237        ++*pos;
8238        if (v == (void*)2)
8239                return NULL;
8240
8241        spin_lock(&all_mddevs_lock);
8242        if (v == (void*)1)
8243                tmp = all_mddevs.next;
8244        else
8245                tmp = mddev->all_mddevs.next;
8246        if (tmp != &all_mddevs)
8247                next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
8248        else {
8249                next_mddev = (void*)2;
8250                *pos = 0x10000;
8251        }
8252        spin_unlock(&all_mddevs_lock);
8253
8254        if (v != (void*)1)
8255                mddev_put(mddev);
8256        return next_mddev;
8257
8258}
8259
8260static void md_seq_stop(struct seq_file *seq, void *v)
8261{
8262        struct mddev *mddev = v;
8263
8264        if (mddev && v != (void*)1 && v != (void*)2)
8265                mddev_put(mddev);
8266}
8267
8268static int md_seq_show(struct seq_file *seq, void *v)
8269{
8270        struct mddev *mddev = v;
8271        sector_t sectors;
8272        struct md_rdev *rdev;
8273
8274        if (v == (void*)1) {
8275                struct md_personality *pers;
8276                seq_printf(seq, "Personalities : ");
8277                spin_lock(&pers_lock);
8278                list_for_each_entry(pers, &pers_list, list)
8279                        seq_printf(seq, "[%s] ", pers->name);
8280
8281                spin_unlock(&pers_lock);
8282                seq_printf(seq, "\n");
8283                seq->poll_event = atomic_read(&md_event_count);
8284                return 0;
8285        }
8286        if (v == (void*)2) {
8287                status_unused(seq);
8288                return 0;
8289        }
8290
8291        spin_lock(&mddev->lock);
8292        if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8293                seq_printf(seq, "%s : %sactive", mdname(mddev),
8294                                                mddev->pers ? "" : "in");
8295                if (mddev->pers) {
8296                        if (mddev->ro==1)
8297                                seq_printf(seq, " (read-only)");
8298                        if (mddev->ro==2)
8299                                seq_printf(seq, " (auto-read-only)");
8300                        seq_printf(seq, " %s", mddev->pers->name);
8301                }
8302
8303                sectors = 0;
8304                rcu_read_lock();
8305                rdev_for_each_rcu(rdev, mddev) {
8306                        char b[BDEVNAME_SIZE];
8307                        seq_printf(seq, " %s[%d]",
8308                                bdevname(rdev->bdev,b), rdev->desc_nr);
8309                        if (test_bit(WriteMostly, &rdev->flags))
8310                                seq_printf(seq, "(W)");
8311                        if (test_bit(Journal, &rdev->flags))
8312                                seq_printf(seq, "(J)");
8313                        if (test_bit(Faulty, &rdev->flags)) {
8314                                seq_printf(seq, "(F)");
8315                                continue;
8316                        }
8317                        if (rdev->raid_disk < 0)
8318                                seq_printf(seq, "(S)"); /* spare */
8319                        if (test_bit(Replacement, &rdev->flags))
8320                                seq_printf(seq, "(R)");
8321                        sectors += rdev->sectors;
8322                }
8323                rcu_read_unlock();
8324
8325                if (!list_empty(&mddev->disks)) {
8326                        if (mddev->pers)
8327                                seq_printf(seq, "\n      %llu blocks",
8328                                           (unsigned long long)
8329                                           mddev->array_sectors / 2);
8330                        else
8331                                seq_printf(seq, "\n      %llu blocks",
8332                                           (unsigned long long)sectors / 2);
8333                }
8334                if (mddev->persistent) {
8335                        if (mddev->major_version != 0 ||
8336                            mddev->minor_version != 90) {
8337                                seq_printf(seq," super %d.%d",
8338                                           mddev->major_version,
8339                                           mddev->minor_version);
8340                        }
8341                } else if (mddev->external)
8342                        seq_printf(seq, " super external:%s",
8343                                   mddev->metadata_type);
8344                else
8345                        seq_printf(seq, " super non-persistent");
8346
8347                if (mddev->pers) {
8348                        mddev->pers->status(seq, mddev);
8349                        seq_printf(seq, "\n      ");
8350                        if (mddev->pers->sync_request) {
8351                                if (status_resync(seq, mddev))
8352                                        seq_printf(seq, "\n      ");
8353                        }
8354                } else
8355                        seq_printf(seq, "\n       ");
8356
8357                md_bitmap_status(seq, mddev->bitmap);
8358
8359                seq_printf(seq, "\n");
8360        }
8361        spin_unlock(&mddev->lock);
8362
8363        return 0;
8364}
8365
8366static const struct seq_operations md_seq_ops = {
8367        .start  = md_seq_start,
8368        .next   = md_seq_next,
8369        .stop   = md_seq_stop,
8370        .show   = md_seq_show,
8371};
8372
8373static int md_seq_open(struct inode *inode, struct file *file)
8374{
8375        struct seq_file *seq;
8376        int error;
8377
8378        error = seq_open(file, &md_seq_ops);
8379        if (error)
8380                return error;
8381
8382        seq = file->private_data;
8383        seq->poll_event = atomic_read(&md_event_count);
8384        return error;
8385}
8386
8387static int md_unloading;
8388static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8389{
8390        struct seq_file *seq = filp->private_data;
8391        __poll_t mask;
8392
8393        if (md_unloading)
8394                return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8395        poll_wait(filp, &md_event_waiters, wait);
8396
8397        /* always allow read */
8398        mask = EPOLLIN | EPOLLRDNORM;
8399
8400        if (seq->poll_event != atomic_read(&md_event_count))
8401                mask |= EPOLLERR | EPOLLPRI;
8402        return mask;
8403}
8404
8405static const struct file_operations md_seq_fops = {
8406        .owner          = THIS_MODULE,
8407        .open           = md_seq_open,
8408        .read           = seq_read,
8409        .llseek         = seq_lseek,
8410        .release        = seq_release,
8411        .poll           = mdstat_poll,
8412};
8413
8414int register_md_personality(struct md_personality *p)
8415{
8416        pr_debug("md: %s personality registered for level %d\n",
8417                 p->name, p->level);
8418        spin_lock(&pers_lock);
8419        list_add_tail(&p->list, &pers_list);
8420        spin_unlock(&pers_lock);
8421        return 0;
8422}
8423EXPORT_SYMBOL(register_md_personality);
8424
8425int unregister_md_personality(struct md_personality *p)
8426{
8427        pr_debug("md: %s personality unregistered\n", p->name);
8428        spin_lock(&pers_lock);
8429        list_del_init(&p->list);
8430        spin_unlock(&pers_lock);
8431        return 0;
8432}
8433EXPORT_SYMBOL(unregister_md_personality);
8434
8435int register_md_cluster_operations(struct md_cluster_operations *ops,
8436                                   struct module *module)
8437{
8438        int ret = 0;
8439        spin_lock(&pers_lock);
8440        if (md_cluster_ops != NULL)
8441                ret = -EALREADY;
8442        else {
8443                md_cluster_ops = ops;
8444                md_cluster_mod = module;
8445        }
8446        spin_unlock(&pers_lock);
8447        return ret;
8448}
8449EXPORT_SYMBOL(register_md_cluster_operations);
8450
8451int unregister_md_cluster_operations(void)
8452{
8453        spin_lock(&pers_lock);
8454        md_cluster_ops = NULL;
8455        spin_unlock(&pers_lock);
8456        return 0;
8457}
8458EXPORT_SYMBOL(unregister_md_cluster_operations);
8459
8460int md_setup_cluster(struct mddev *mddev, int nodes)
8461{
8462        int ret;
8463        if (!md_cluster_ops)
8464                request_module("md-cluster");
8465        spin_lock(&pers_lock);
8466        /* ensure module won't be unloaded */
8467        if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
8468                pr_warn("can't find md-cluster module or get its reference.\n");
8469                spin_unlock(&pers_lock);
8470                return -ENOENT;
8471        }
8472        spin_unlock(&pers_lock);
8473
8474        ret = md_cluster_ops->join(mddev, nodes);
8475        if (!ret)
8476                mddev->safemode_delay = 0;
8477        return ret;
8478}
8479
8480void md_cluster_stop(struct mddev *mddev)
8481{
8482        if (!md_cluster_ops)
8483                return;
8484        md_cluster_ops->leave(mddev);
8485        module_put(md_cluster_mod);
8486}
8487
8488static int is_mddev_idle(struct mddev *mddev, int init)
8489{
8490        struct md_rdev *rdev;
8491        int idle;
8492        int curr_events;
8493
8494        idle = 1;
8495        rcu_read_lock();
8496        rdev_for_each_rcu(rdev, mddev) {
8497                struct gendisk *disk = rdev->bdev->bd_disk;
8498                curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
8499                              atomic_read(&disk->sync_io);
8500                /* sync IO will cause sync_io to increase before the disk_stats
8501                 * as sync_io is counted when a request starts, and
8502                 * disk_stats is counted when it completes.
8503                 * So resync activity will cause curr_events to be smaller than
8504                 * when there was no such activity.
8505                 * non-sync IO will cause disk_stat to increase without
8506                 * increasing sync_io so curr_events will (eventually)
8507                 * be larger than it was before.  Once it becomes
8508                 * substantially larger, the test below will cause
8509                 * the array to appear non-idle, and resync will slow
8510                 * down.
8511                 * If there is a lot of outstanding resync activity when
8512                 * we set last_event to curr_events, then all that activity
8513                 * completing might cause the array to appear non-idle
8514                 * and resync will be slowed down even though there might
8515                 * not have been non-resync activity.  This will only
8516                 * happen once though.  'last_events' will soon reflect
8517                 * the state where there is little or no outstanding
8518                 * resync requests, and further resync activity will
8519                 * always make curr_events less than last_events.
8520                 *
8521                 */
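                    /*
                     * Editorial example: 128 sectors of ordinary (non-resync)
                     * I/O completing since the last check raises curr_events
                     * more than 64 above rdev->last_events, so the test below
                     * reports the array as non-idle and resync is slowed down.
                     */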
8522                if (init || curr_events - rdev->last_events > 64) {
8523                        rdev->last_events = curr_events;
8524                        idle = 0;
8525                }
8526        }
8527        rcu_read_unlock();
8528        return idle;
8529}
8530
8531void md_done_sync(struct mddev *mddev, int blocks, int ok)
8532{
8533        /* another "blocks" (512-byte) blocks have been synced */
8534        atomic_sub(blocks, &mddev->recovery_active);
8535        wake_up(&mddev->recovery_wait);
8536        if (!ok) {
8537                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8538                set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8539                md_wakeup_thread(mddev->thread);
8540                // stop recovery, signal do_sync ....
8541        }
8542}
8543EXPORT_SYMBOL(md_done_sync);
8544
8545/* md_write_start(mddev, bi)
8546 * If we need to update some array metadata (e.g. 'active' flag
8547 * in superblock) before writing, schedule a superblock update
8548 * and wait for it to complete.
8549 * A return value of 'false' means that the write wasn't recorded
8550 * and cannot proceed as the array is being suspended.
8551 */
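    /*
     * Editor's sketch of the typical pairing in a personality's write path
     * (illustrative pseudocode, not code from this file):
     *
     *   if (!md_write_start(mddev, bio))
     *           return;               - array suspended; do not submit
     *   ... submit the write ...
     *   md_write_end(mddev);          - from the write-completion path
     */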
8552bool md_write_start(struct mddev *mddev, struct bio *bi)
8553{
8554        int did_change = 0;
8555
8556        if (bio_data_dir(bi) != WRITE)
8557                return true;
8558
8559        BUG_ON(mddev->ro == 1);
8560        if (mddev->ro == 2) {
8561                /* need to switch to read/write */
8562                mddev->ro = 0;
8563                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8564                md_wakeup_thread(mddev->thread);
8565                md_wakeup_thread(mddev->sync_thread);
8566                did_change = 1;
8567        }
8568        rcu_read_lock();
8569        percpu_ref_get(&mddev->writes_pending);
8570        smp_mb(); /* Match smp_mb in set_in_sync() */
8571        if (mddev->safemode == 1)
8572                mddev->safemode = 0;
8573        /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
8574        if (mddev->in_sync || mddev->sync_checkers) {
8575                spin_lock(&mddev->lock);
8576                if (mddev->in_sync) {
8577                        mddev->in_sync = 0;
8578                        set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8579                        set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8580                        md_wakeup_thread(mddev->thread);
8581                        did_change = 1;
8582                }
8583                spin_unlock(&mddev->lock);
8584        }
8585        rcu_read_unlock();
8586        if (did_change)
8587                sysfs_notify_dirent_safe(mddev->sysfs_state);
8588        if (!mddev->has_superblocks)
8589                return true;
8590        wait_event(mddev->sb_wait,
8591                   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
8592                   mddev->suspended);
8593        if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
8594                percpu_ref_put(&mddev->writes_pending);
8595                return false;
8596        }
8597        return true;
8598}
8599EXPORT_SYMBOL(md_write_start);
8600
8601/* md_write_inc can only be called when md_write_start() has
8602 * already been called at least once for the current request.
8603 * It increments the counter and is useful when a single request
8604 * is split into several parts.  Each part causes an increment and
8605 * so needs a matching md_write_end().
8606 * Unlike md_write_start(), it is safe to call md_write_inc() inside
8607 * a spinlocked region.
8608 */
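    /*
     * Editorial illustration: a write split into three parts does one
     * md_write_start() for the first part, md_write_inc() for each of the
     * other two, and a matching md_write_end() for every part, i.e. three
     * md_write_end() calls in total.
     */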
8609void md_write_inc(struct mddev *mddev, struct bio *bi)
8610{
8611        if (bio_data_dir(bi) != WRITE)
8612                return;
8613        WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8614        percpu_ref_get(&mddev->writes_pending);
8615}
8616EXPORT_SYMBOL(md_write_inc);
8617
8618void md_write_end(struct mddev *mddev)
8619{
8620        percpu_ref_put(&mddev->writes_pending);
8621
8622        if (mddev->safemode == 2)
8623                md_wakeup_thread(mddev->thread);
8624        else if (mddev->safemode_delay)
8625                /* The roundup() ensures this only performs locking once
8626                 * every ->safemode_delay jiffies
8627                 */
8628                mod_timer(&mddev->safemode_timer,
8629                          roundup(jiffies, mddev->safemode_delay) +
8630                          mddev->safemode_delay);
8631}
8632
8633EXPORT_SYMBOL(md_write_end);
8634
8635/* This is used by raid0 and raid10 */
8636void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
8637                        struct bio *bio, sector_t start, sector_t size)
8638{
8639        struct bio *discard_bio = NULL;
8640
8641        if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 0,
8642                        &discard_bio) || !discard_bio)
8643                return;
8644
8645        bio_chain(discard_bio, bio);
8646        bio_clone_blkg_association(discard_bio, bio);
8647        if (mddev->gendisk)
8648                trace_block_bio_remap(discard_bio->bi_disk->queue,
8649                                discard_bio, disk_devt(mddev->gendisk),
8650                                bio->bi_iter.bi_sector);
8651        generic_make_request(discard_bio);
8652}
8653EXPORT_SYMBOL_GPL(md_submit_discard_bio);
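    /*
     * Editor's sketch of typical use (dev_start and dev_sectors are
     * hypothetical per-device values): a striped personality issues one
     * chained discard per member device for the slice of the parent bio
     * that maps onto it, then ends the parent, which only completes once
     * all chained children have finished.
     *
     *	rdev_for_each(rdev, mddev)
     *		md_submit_discard_bio(mddev, rdev, bio, dev_start, dev_sectors);
     *	bio_endio(bio);
     */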
8654
8655/* md_allow_write(mddev)
8656 * Calling this ensures that the array is marked 'active' so that writes
8657 * may proceed without blocking.  It is important to call this before
8658 * attempting a GFP_KERNEL allocation while holding the mddev lock.
8659 * Must be called with mddev_lock held.
8660 */
8661void md_allow_write(struct mddev *mddev)
8662{
8663        if (!mddev->pers)
8664                return;
8665        if (mddev->ro)
8666                return;
8667        if (!mddev->pers->sync_request)
8668                return;
8669
8670        spin_lock(&mddev->lock);
8671        if (mddev->in_sync) {
8672                mddev->in_sync = 0;
8673                set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8674                set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8675                if (mddev->safemode_delay &&
8676                    mddev->safemode == 0)
8677                        mddev->safemode = 1;
8678                spin_unlock(&mddev->lock);
8679                md_update_sb(mddev, 0);
8680                sysfs_notify_dirent_safe(mddev->sysfs_state);
8681                /* wait for the dirty state to be recorded in the metadata */
8682                wait_event(mddev->sb_wait,
8683                           !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8684        } else
8685                spin_unlock(&mddev->lock);
8686}
8687EXPORT_SYMBOL_GPL(md_allow_write);
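    /*
     * Editor's sketch of the calling convention above: with the mddev lock
     * held, call md_allow_write() before a GFP_KERNEL allocation so that
     * memory reclaim writing to this array does not have to wait for a
     * superblock update that may itself need the lock we hold.
     *
     *	lockdep_assert_held(&mddev->reconfig_mutex);
     *	md_allow_write(mddev);
     *	buf = kzalloc(size, GFP_KERNEL);
     */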
8688
8689#define SYNC_MARKS      10
8690#define SYNC_MARK_STEP  (3*HZ)
8691#define UPDATE_FREQUENCY (5*60*HZ)
8692void md_do_sync(struct md_thread *thread)
8693{
8694        struct mddev *mddev = thread->mddev;
8695        struct mddev *mddev2;
8696        unsigned int currspeed = 0, window;
8697        sector_t max_sectors, j, io_sectors, recovery_done;
8698        unsigned long mark[SYNC_MARKS];
8699        unsigned long update_time;
8700        sector_t mark_cnt[SYNC_MARKS];
8701        int last_mark, m;
8702        struct list_head *tmp;
8703        sector_t last_check;
8704        int skipped = 0;
8705        struct md_rdev *rdev;
8706        char *desc, *action = NULL;
8707        struct blk_plug plug;
8708        int ret;
8709
8710        /* just in case the thread restarts... */
8711        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8712            test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
8713                return;
8714        if (mddev->ro) { /* never try to sync a read-only array */
8715                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8716                return;
8717        }
8718
8719        if (mddev_is_clustered(mddev)) {
8720                ret = md_cluster_ops->resync_start(mddev);
8721                if (ret)
8722                        goto skip;
8723
8724                set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8725                if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8726                        test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8727                        test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8728                     && ((unsigned long long)mddev->curr_resync_completed
8729                         < (unsigned long long)mddev->resync_max_sectors))
8730                        goto skip;
8731        }
8732
8733        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8734                if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8735                        desc = "data-check";
8736                        action = "check";
8737                } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8738                        desc = "requested-resync";
8739                        action = "repair";
8740                } else
8741                        desc = "resync";
8742        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8743                desc = "reshape";
8744        else
8745                desc = "recovery";
8746
8747        mddev->last_sync_action = action ?: desc;
8748
8749        /* we overload curr_resync somewhat here.
8750         * 0 == not engaged in resync at all
8751         * 2 == checking that there is no conflict with another sync
8752         * 1 == like 2, but have yielded to allow conflicting resync to
8753         *              commence
8754         * other == active in resync - this many blocks
8755         *
8756         * Before starting a resync we must have set curr_resync to
8757         * 2, and then checked that every "conflicting" array has curr_resync
8758         * less than ours.  When we find one that is the same or higher
8759         * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
8760         * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure).
8761         * This will mean we have to start checking from the beginning again.
8762         *
8763         */
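    /*
     * Editor's illustration of the yielding above: if arrays A and B share a
     * device and &A < &B, both start with curr_resync == 2; A (the lower
     * address) drops to 1 and waits on resync_wait, B stays at 2 and starts
     * its resync, and A re-checks from 2 again once B finishes and wakes it.
     */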
8764
8765        do {
8766                int mddev2_minor = -1;
8767                mddev->curr_resync = 2;
8768
8769        try_again:
8770                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8771                        goto skip;
8772                for_each_mddev(mddev2, tmp) {
8773                        if (mddev2 == mddev)
8774                                continue;
8775                        if (!mddev->parallel_resync
8776                        &&  mddev2->curr_resync
8777                        &&  match_mddev_units(mddev, mddev2)) {
8778                                DEFINE_WAIT(wq);
8779                                if (mddev < mddev2 && mddev->curr_resync == 2) {
8780                                        /* arbitrarily yield */
8781                                        mddev->curr_resync = 1;
8782                                        wake_up(&resync_wait);
8783                                }
8784                                if (mddev > mddev2 && mddev->curr_resync == 1)
8785                                        /* no need to wait here, we can wait the next
8786                                         * time 'round when curr_resync == 2
8787                                         */
8788                                        continue;
8789                                /* We need to wait 'interruptible' so as not to
8790                                 * contribute to the load average, and not to
8791                                 * be caught by 'softlockup'
8792                                 */
8793                                prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8794                                if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8795                                    mddev2->curr_resync >= mddev->curr_resync) {
8796                                        if (mddev2_minor != mddev2->md_minor) {
8797                                                mddev2_minor = mddev2->md_minor;
8798                                                pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
8799                                                        desc, mdname(mddev),
8800                                                        mdname(mddev2));
8801                                        }
8802                                        mddev_put(mddev2);
8803                                        if (signal_pending(current))
8804                                                flush_signals(current);
8805                                        schedule();
8806                                        finish_wait(&resync_wait, &wq);
8807                                        goto try_again;
8808                                }
8809                                finish_wait(&resync_wait, &wq);
8810                        }
8811                }
8812        } while (mddev->curr_resync < 2);
8813
8814        j = 0;
8815        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8816                /* resync follows the size requested by the personality,
8817                 * which defaults to physical size, but can be virtual size
8818                 */
8819                max_sectors = mddev->resync_max_sectors;
8820                atomic64_set(&mddev->resync_mismatches, 0);
8821                /* we don't use the checkpoint if there's a bitmap */
8822                if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8823                        j = mddev->resync_min;
8824                else if (!mddev->bitmap)
8825                        j = mddev->recovery_cp;
8826
8827        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
8828                max_sectors = mddev->resync_max_sectors;
8829                /*
8830                 * If the original node aborts reshaping then we continue the
8831                 * reshaping, so set j again to avoid restarting the reshape
8832                 * from the very beginning
8833                 */
8834                if (mddev_is_clustered(mddev) &&
8835                    mddev->reshape_position != MaxSector)
8836                        j = mddev->reshape_position;
8837        } else {
8838                /* recovery follows the physical size of devices */
8839                max_sectors = mddev->dev_sectors;
8840                j = MaxSector;
8841                rcu_read_lock();
8842                rdev_for_each_rcu(rdev, mddev)
8843                        if (rdev->raid_disk >= 0 &&
8844                            !test_bit(Journal, &rdev->flags) &&
8845                            !test_bit(Faulty, &rdev->flags) &&
8846                            !test_bit(In_sync, &rdev->flags) &&
8847                            rdev->recovery_offset < j)
8848                                j = rdev->recovery_offset;
8849                rcu_read_unlock();
8850
8851                /* If there is a bitmap, we need to make sure all
8852                 * writes that started before we added a spare
8853                 * complete before we start doing a recovery.
8854                 * Otherwise the write might complete and (via
8855                 * bitmap_endwrite) set a bit in the bitmap after the
8856                 * recovery has checked that bit and skipped that
8857                 * region.
8858                 */
8859                if (mddev->bitmap) {
8860                        mddev->pers->quiesce(mddev, 1);
8861                        mddev->pers->quiesce(mddev, 0);
8862                }
8863        }
8864
8865        pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
8866        pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
8867        pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
8868                 speed_max(mddev), desc);
8869
8870        is_mddev_idle(mddev, 1); /* this initializes IO event counters */
8871
8872        io_sectors = 0;
8873        for (m = 0; m < SYNC_MARKS; m++) {
8874                mark[m] = jiffies;
8875                mark_cnt[m] = io_sectors;
8876        }
8877        last_mark = 0;
8878        mddev->resync_mark = mark[last_mark];
8879        mddev->resync_mark_cnt = mark_cnt[last_mark];
8880
8881        /*
8882         * Tune reconstruction:
8883         */
8884        window = 32 * (PAGE_SIZE / 512);
8885        pr_debug("md: using %dk window, over a total of %lluk.\n",
8886                 window/2, (unsigned long long)max_sectors/2);
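            /* Editor's note: with 4 KiB pages this is 32 * 8 = 256 sectors,
             * i.e. the "128k window" reported by the pr_debug() above.
             */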
8887
8888        atomic_set(&mddev->recovery_active, 0);
8889        last_check = 0;
8890
8891        if (j > 2) {
8892                pr_debug("md: resuming %s of %s from checkpoint.\n",
8893                         desc, mdname(mddev));
8894                mddev->curr_resync = j;
8895        } else
8896                mddev->curr_resync = 3; /* no longer delayed */
8897        mddev->curr_resync_completed = j;
8898        sysfs_notify_dirent_safe(mddev->sysfs_completed);
8899        md_new_event(mddev);
8900        update_time = jiffies;
8901
8902        blk_start_plug(&plug);
8903        while (j < max_sectors) {
8904                sector_t sectors;
8905
8906                skipped = 0;
8907
8908                if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8909                    ((mddev->curr_resync > mddev->curr_resync_completed &&
8910                      (mddev->curr_resync - mddev->curr_resync_completed)
8911                      > (max_sectors >> 4)) ||
8912                     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
8913                     (j - mddev->curr_resync_completed)*2
8914                     >= mddev->resync_max - mddev->curr_resync_completed ||
8915                     mddev->curr_resync_completed > mddev->resync_max
8916                            )) {
8917                        /* time to update curr_resync_completed */
8918                        wait_event(mddev->recovery_wait,
8919                                   atomic_read(&mddev->recovery_active) == 0);
8920                        mddev->curr_resync_completed = j;
8921                        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
8922                            j > mddev->recovery_cp)
8923                                mddev->recovery_cp = j;
8924                        update_time = jiffies;
8925                        set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8926                        sysfs_notify_dirent_safe(mddev->sysfs_completed);
8927                }
8928
8929                while (j >= mddev->resync_max &&
8930                       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8931                        /* As this condition is controlled by user-space,
8932                         * we can block indefinitely, so use '_interruptible'
8933                         * to avoid triggering warnings.
8934                         */
8935                        flush_signals(current); /* just in case */
8936                        wait_event_interruptible(mddev->recovery_wait,
8937                                                 mddev->resync_max > j
8938                                                 || test_bit(MD_RECOVERY_INTR,
8939                                                             &mddev->recovery));
8940                }
8941
8942                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8943                        break;
8944
8945                sectors = mddev->pers->sync_request(mddev, j, &skipped);
8946                if (sectors == 0) {
8947                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8948                        break;
8949                }
8950
8951                if (!skipped) { /* actual IO requested */
8952                        io_sectors += sectors;
8953                        atomic_add(sectors, &mddev->recovery_active);
8954                }
8955
8956                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8957                        break;
8958
8959                j += sectors;
8960                if (j > max_sectors)
8961                        /* when skipping, extra large numbers can be returned. */
8962                        j = max_sectors;
8963                if (j > 2)
8964                        mddev->curr_resync = j;
8965                mddev->curr_mark_cnt = io_sectors;
8966                if (last_check == 0)
8967                        /* this is the earliest that rebuild will be
8968                         * visible in /proc/mdstat
8969                         */
8970                        md_new_event(mddev);
8971
8972                if (last_check + window > io_sectors || j == max_sectors)
8973                        continue;
8974
8975                last_check = io_sectors;
8976        repeat:
8977                if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP)) {
8978                        /* step marks */
8979                        int next = (last_mark+1) % SYNC_MARKS;
8980
8981                        mddev->resync_mark = mark[next];
8982                        mddev->resync_mark_cnt = mark_cnt[next];
8983                        mark[next] = jiffies;
8984                        mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
8985                        last_mark = next;
8986                }
8987
8988                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8989                        break;
8990
8991                /*
8992                 * this loop exits only when either we are slower than
8993                 * the 'hard' speed limit, or the system was IO-idle for
8994                 * a jiffy.
8995                 * the system might be non-idle CPU-wise, but we only care
8996                 * about not overloading the IO subsystem. (things like an
8997                 * e2fsck being done on the RAID array should execute fast)
8998                 */
8999                cond_resched();
9000
9001                recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
9002                currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
9003                        /((jiffies - mddev->resync_mark)/HZ + 1) + 1;
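                    /* currspeed is in KB/sec: sectors completed since the last
                     * mark, halved to give KiB, divided by the elapsed whole
                     * seconds; the "+1" terms guard against dividing by zero and
                     * against reporting a zero rate.
                     */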
9004
9005                if (currspeed > speed_min(mddev)) {
9006                        if (currspeed > speed_max(mddev)) {
9007                                msleep(500);
9008                                goto repeat;
9009                        }
9010                        if (!is_mddev_idle(mddev, 0)) {
9011                                /*
9012                                 * Give other IO more of a chance.
9013                                 * The faster the devices, the less we wait.
9014                                 */
9015                                wait_event(mddev->recovery_wait,
9016                                           !atomic_read(&mddev->recovery_active));
9017                        }
9018                }
9019        }
9020        pr_info("md: %s: %s %s.\n", mdname(mddev), desc,
9021                test_bit(MD_RECOVERY_INTR, &mddev->recovery)
9022                ? "interrupted" : "done");
9023        /*
9024         * this also signals 'finished resyncing' to md_stop
9025         */
9026        blk_finish_plug(&plug);
9027        wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
9028
9029        if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9030            !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9031            mddev->curr_resync > 3) {
9032                mddev->curr_resync_completed = mddev->curr_resync;
9033                sysfs_notify_dirent_safe(mddev->sysfs_completed);
9034        }
9035        mddev->pers->sync_request(mddev, max_sectors, &skipped);
9036
9037        if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
9038            mddev->curr_resync > 3) {
9039                if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
9040                        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9041                                if (mddev->curr_resync >= mddev->recovery_cp) {
9042                                        pr_debug("md: checkpointing %s of %s.\n",
9043                                                 desc, mdname(mddev));
9044                                        if (test_bit(MD_RECOVERY_ERROR,
9045                                                &mddev->recovery))
9046                                                mddev->recovery_cp =
9047                                                        mddev->curr_resync_completed;
9048                                        else
9049                                                mddev->recovery_cp =
9050                                                        mddev->curr_resync;
9051                                }
9052                        } else
9053                                mddev->recovery_cp = MaxSector;
9054                } else {
9055                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9056                                mddev->curr_resync = MaxSector;
9057                        if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9058                            test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
9059                                rcu_read_lock();
9060                                rdev_for_each_rcu(rdev, mddev)
9061                                        if (rdev->raid_disk >= 0 &&
9062                                            mddev->delta_disks >= 0 &&
9063                                            !test_bit(Journal, &rdev->flags) &&
9064                                            !test_bit(Faulty, &rdev->flags) &&
9065                                            !test_bit(In_sync, &rdev->flags) &&
9066                                            rdev->recovery_offset < mddev->curr_resync)
9067                                                rdev->recovery_offset = mddev->curr_resync;
9068                                rcu_read_unlock();
9069                        }
9070                }
9071        }
9072 skip:
9073        /* set CHANGE_PENDING here in case another update is needed,
9074         * so other nodes are informed. It should be harmless for normal
9075         * (non-clustered) raid */
9076        set_mask_bits(&mddev->sb_flags, 0,
9077                      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
9078
9079        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9080                        !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9081                        mddev->delta_disks > 0 &&
9082                        mddev->pers->finish_reshape &&
9083                        mddev->pers->size &&
9084                        mddev->queue) {
9085                mddev_lock_nointr(mddev);
9086                md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9087                mddev_unlock(mddev);
9088                if (!mddev_is_clustered(mddev)) {
9089                        set_capacity(mddev->gendisk, mddev->array_sectors);
9090                        revalidate_disk_size(mddev->gendisk, true);
9091                }
9092        }
9093
9094        spin_lock(&mddev->lock);
9095        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9096                /* We completed so min/max setting can be forgotten if used. */
9097                if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9098                        mddev->resync_min = 0;
9099                mddev->resync_max = MaxSector;
9100        } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9101                mddev->resync_min = mddev->curr_resync_completed;
9102        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
9103        mddev->curr_resync = 0;
9104        spin_unlock(&mddev->lock);
9105
9106        wake_up(&resync_wait);
9107        md_wakeup_thread(mddev->thread);
9108        return;
9109}
9110EXPORT_SYMBOL_GPL(md_do_sync);
9111
9112static int remove_and_add_spares(struct mddev *mddev,
9113                                 struct md_rdev *this)
9114{
9115        struct md_rdev *rdev;
9116        int spares = 0;
9117        int removed = 0;
9118        bool remove_some = false;
9119
9120        if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
9121                /* Mustn't remove devices when resync thread is running */
9122                return 0;
9123
9124        rdev_for_each(rdev, mddev) {
9125                if ((this == NULL || rdev == this) &&
9126                    rdev->raid_disk >= 0 &&
9127                    !test_bit(Blocked, &rdev->flags) &&
9128                    test_bit(Faulty, &rdev->flags) &&
9129                    atomic_read(&rdev->nr_pending)==0) {
9130                        /* Faulty non-Blocked devices with nr_pending == 0
9131                         * never get nr_pending incremented,
9132                         * never get Faulty cleared, and never get Blocked set.
9133                         * So we can synchronize_rcu now rather than once per device
9134                         */
9135                        remove_some = true;
9136                        set_bit(RemoveSynchronized, &rdev->flags);
9137                }
9138        }
9139
9140        if (remove_some)
9141                synchronize_rcu();
9142        rdev_for_each(rdev, mddev) {
9143                if ((this == NULL || rdev == this) &&
9144                    rdev->raid_disk >= 0 &&
9145                    !test_bit(Blocked, &rdev->flags) &&
9146                    ((test_bit(RemoveSynchronized, &rdev->flags) ||
9147                     (!test_bit(In_sync, &rdev->flags) &&
9148                      !test_bit(Journal, &rdev->flags))) &&
9149                    atomic_read(&rdev->nr_pending)==0)) {
9150                        if (mddev->pers->hot_remove_disk(
9151                                    mddev, rdev) == 0) {
9152                                sysfs_unlink_rdev(mddev, rdev);
9153                                rdev->saved_raid_disk = rdev->raid_disk;
9154                                rdev->raid_disk = -1;
9155                                removed++;
9156                        }
9157                }
9158                if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
9159                        clear_bit(RemoveSynchronized, &rdev->flags);
9160        }
9161
9162        if (removed && mddev->kobj.sd)
9163                sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9164
9165        if (this && removed)
9166                goto no_add;
9167
9168        rdev_for_each(rdev, mddev) {
9169                if (this && this != rdev)
9170                        continue;
9171                if (test_bit(Candidate, &rdev->flags))
9172                        continue;
9173                if (rdev->raid_disk >= 0 &&
9174                    !test_bit(In_sync, &rdev->flags) &&
9175                    !test_bit(Journal, &rdev->flags) &&
9176                    !test_bit(Faulty, &rdev->flags))
9177                        spares++;
9178                if (rdev->raid_disk >= 0)
9179                        continue;
9180                if (test_bit(Faulty, &rdev->flags))
9181                        continue;
9182                if (!test_bit(Journal, &rdev->flags)) {
9183                        if (mddev->ro &&
9184                            ! (rdev->saved_raid_disk >= 0 &&
9185                               !test_bit(Bitmap_sync, &rdev->flags)))
9186                                continue;
9187
9188                        rdev->recovery_offset = 0;
9189                }
9190                if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9191                        /* failure here is OK */
9192                        sysfs_link_rdev(mddev, rdev);
9193                        if (!test_bit(Journal, &rdev->flags))
9194                                spares++;
9195                        md_new_event(mddev);
9196                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9197                }
9198        }
9199no_add:
9200        if (removed)
9201                set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9202        return spares;
9203}
9204
9205static void md_start_sync(struct work_struct *ws)
9206{
9207        struct mddev *mddev = container_of(ws, struct mddev, del_work);
9208
9209        mddev->sync_thread = md_register_thread(md_do_sync,
9210                                                mddev,
9211                                                "resync");
9212        if (!mddev->sync_thread) {
9213                pr_warn("%s: could not start resync thread...\n",
9214                        mdname(mddev));
9215                /* leave the spares where they are, it shouldn't hurt */
9216                clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9217                clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9218                clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9219                clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9220                clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9221                wake_up(&resync_wait);
9222                if (test_and_clear_bit(MD_RECOVERY_RECOVER,
9223                                       &mddev->recovery))
9224                        if (mddev->sysfs_action)
9225                                sysfs_notify_dirent_safe(mddev->sysfs_action);
9226        } else
9227                md_wakeup_thread(mddev->sync_thread);
9228        sysfs_notify_dirent_safe(mddev->sysfs_action);
9229        md_new_event(mddev);
9230}
9231
9232/*
9233 * This routine is regularly called by all per-raid-array threads to
9234 * deal with generic issues like resync and super-block update.
9235 * Raid personalities that don't have a thread (linear/raid0) do not
9236 * need this as they never do any recovery or update the superblock.
9237 *
9238 * It does not do any resync itself, but rather "forks" off other threads
9239 * to do that as needed.
9240 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
9241 * "->recovery" and create a thread at ->sync_thread.
9242 * When the thread finishes it sets MD_RECOVERY_DONE
9243 * and wakes up this thread, which will reap the sync thread and finish up.
9244 * This thread also removes any faulty devices (with nr_pending == 0).
9245 *
9246 * The overall approach is:
9247 *  1/ if the superblock needs updating, update it.
9248 *  2/ If a recovery thread is running, don't do anything else.
9249 *  3/ If recovery has finished, clean up, possibly marking spares active.
9250 *  4/ If there are any faulty devices, remove them.
9251 *  5/ If the array is degraded, try to add spare devices.
9252 *  6/ If the array has spares or is not in-sync, start a resync thread.
9253 */
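    /*
     * Editor's note: the per-array personality threads (e.g. raid1d() and
     * raid5d()) call this on every wakeup, which is what "regularly called
     * by all per-raid-array threads" above refers to.
     */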
9254void md_check_recovery(struct mddev *mddev)
9255{
9256        if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
9257                /* Write superblock - thread that called mddev_suspend()
9258                 * holds reconfig_mutex for us.
9259                 */
9260                set_bit(MD_UPDATING_SB, &mddev->flags);
9261                smp_mb__after_atomic();
9262                if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
9263                        md_update_sb(mddev, 0);
9264                clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
9265                wake_up(&mddev->sb_wait);
9266        }
9267
9268        if (mddev->suspended)
9269                return;
9270
9271        if (mddev->bitmap)
9272                md_bitmap_daemon_work(mddev);
9273
9274        if (signal_pending(current)) {
9275                if (mddev->pers->sync_request && !mddev->external) {
9276                        pr_debug("md: %s in immediate safe mode\n",
9277                                 mdname(mddev));
9278                        mddev->safemode = 2;
9279                }
9280                flush_signals(current);
9281        }
9282
9283        if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
9284                return;
9285        if ( ! (
9286                (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
9287                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9288                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
9289                (mddev->external == 0 && mddev->safemode == 1) ||
9290                (mddev->safemode == 2
9291                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
9292                ))
9293                return;
9294
9295        if (mddev_trylock(mddev)) {
9296                int spares = 0;
9297                bool try_set_sync = mddev->safemode != 0;
9298
9299                if (!mddev->external && mddev->safemode == 1)
9300                        mddev->safemode = 0;
9301
9302                if (mddev->ro) {
9303                        struct md_rdev *rdev;
9304                        if (!mddev->external && mddev->in_sync)
9305                                /* 'Blocked' flag not needed as failed devices
9306                                 * will be recorded if array switched to read/write.
9307                                 * Leaving it set will prevent the device
9308                                 * from being removed.
9309                                 */
9310                                rdev_for_each(rdev, mddev)
9311                                        clear_bit(Blocked, &rdev->flags);
9312                        /* On a read-only array we can:
9313                         * - remove failed devices
9314                         * - add already-in_sync devices if the array itself
9315                         *   is in-sync.
9316                         * As we only add devices that are already in-sync,
9317                         * we can activate the spares immediately.
9318                         */
9319                        remove_and_add_spares(mddev, NULL);
9320                        /* There is no thread, but we need to call
9321                         * ->spare_active and clear saved_raid_disk
9322                         */
9323                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9324                        md_reap_sync_thread(mddev);
9325                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9326                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9327                        clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9328                        goto unlock;
9329                }
9330
9331                if (mddev_is_clustered(mddev)) {
9332                        struct md_rdev *rdev, *tmp;
9333                        /* kick the device if another node issued a
9334                         * remove disk.
9335                         */
9336                        rdev_for_each_safe(rdev, tmp, mddev) {
9337                                if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
9338                                                rdev->raid_disk < 0)
9339                                        md_kick_rdev_from_array(rdev);
9340                        }
9341                }
9342
9343                if (try_set_sync && !mddev->external && !mddev->in_sync) {
9344                        spin_lock(&mddev->lock);
9345                        set_in_sync(mddev);
9346                        spin_unlock(&mddev->lock);
9347                }
9348
9349                if (mddev->sb_flags)
9350                        md_update_sb(mddev, 0);
9351
9352                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
9353                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
9354                        /* resync/recovery still happening */
9355                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9356                        goto unlock;
9357                }
9358                if (mddev->sync_thread) {
9359                        md_reap_sync_thread(mddev);
9360                        goto unlock;
9361                }
9362                /* Set RUNNING before clearing NEEDED to avoid
9363                 * any transients in the value of "sync_action".
9364                 */
9365                mddev->curr_resync_completed = 0;
9366                spin_lock(&mddev->lock);
9367                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9368                spin_unlock(&mddev->lock);
9369                /* Clear some bits that don't mean anything, but
9370                 * might be left set
9371                 */
9372                clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
9373                clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9374
9375                if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9376                    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
9377                        goto not_running;
9378                /* no recovery is running.
9379                 * remove any failed drives, then
9380                 * add spares if possible.
9381                 * Spares are also removed and re-added, to allow
9382                 * the personality to fail the re-add.
9383                 */
9384
9385                if (mddev->reshape_position != MaxSector) {
9386                        if (mddev->pers->check_reshape == NULL ||
9387                            mddev->pers->check_reshape(mddev) != 0)
9388                                /* Cannot proceed */
9389                                goto not_running;
9390                        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9391                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9392                } else if ((spares = remove_and_add_spares(mddev, NULL))) {
9393                        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9394                        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9395                        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9396                        set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9397                } else if (mddev->recovery_cp < MaxSector) {
9398                        set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9399                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9400                } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9401                        /* nothing to be done ... */
9402                        goto not_running;
9403
9404                if (mddev->pers->sync_request) {
9405                        if (spares) {
9406                                /* We are adding a device or devices to an array
9407                                 * which has the bitmap stored on all devices.
9408                                 * So make sure all bitmap pages get written
9409                                 */
9410                                md_bitmap_write_all(mddev->bitmap);
9411                        }
9412                        INIT_WORK(&mddev->del_work, md_start_sync);
9413                        queue_work(md_misc_wq, &mddev->del_work);
9414                        goto unlock;
9415                }
9416        not_running:
9417                if (!mddev->sync_thread) {
9418                        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9419                        wake_up(&resync_wait);
9420                        if (test_and_clear_bit(MD_RECOVERY_RECOVER,
9421                                               &mddev->recovery))
9422                                if (mddev->sysfs_action)
9423                                        sysfs_notify_dirent_safe(mddev->sysfs_action);
9424                }
9425        unlock:
9426                wake_up(&mddev->sb_wait);
9427                mddev_unlock(mddev);
9428        }
9429}
9430EXPORT_SYMBOL(md_check_recovery);
9431
9432void md_reap_sync_thread(struct mddev *mddev)
9433{
9434        struct md_rdev *rdev;
9435        sector_t old_dev_sectors = mddev->dev_sectors;
9436        bool is_reshaped = false;
9437
9438        /* resync has finished, collect result */
9439        md_unregister_thread(&mddev->sync_thread);
9440        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9441            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
9442            mddev->degraded != mddev->raid_disks) {
9443                /* success...*/
9444                /* activate any spares */
9445                if (mddev->pers->spare_active(mddev)) {
9446                        sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9447                        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9448                }
9449        }
9450        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9451            mddev->pers->finish_reshape) {
9452                mddev->pers->finish_reshape(mddev);
9453                if (mddev_is_clustered(mddev))
9454                        is_reshaped = true;
9455        }
9456
9457        /* If the array is no longer degraded, then any saved_raid_disk
9458         * information must be scrapped.
9459         */
9460        if (!mddev->degraded)
9461                rdev_for_each(rdev, mddev)
9462                        rdev->saved_raid_disk = -1;
9463
9464        md_update_sb(mddev, 1);
9465        /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
9466         * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
9467         * clustered raid */
9468        if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
9469                md_cluster_ops->resync_finish(mddev);
9470        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9471        clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9472        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9473        clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9474        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9475        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9476        /*
9477         * We call md_cluster_ops->update_size here because sync_size could
9478         * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
9479         * so it is time to update the size across the cluster.
9480         */
9481        if (mddev_is_clustered(mddev) && is_reshaped
9482                                      && !test_bit(MD_CLOSING, &mddev->flags))
9483                md_cluster_ops->update_size(mddev, old_dev_sectors);
9484        wake_up(&resync_wait);
9485        /* flag recovery needed just to double check */
9486        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9487        sysfs_notify_dirent_safe(mddev->sysfs_action);
9488        md_new_event(mddev);
9489        if (mddev->event_work.func)
9490                queue_work(md_misc_wq, &mddev->event_work);
9491}
9492EXPORT_SYMBOL(md_reap_sync_thread);
9493
9494void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9495{
9496        sysfs_notify_dirent_safe(rdev->sysfs_state);
9497        wait_event_timeout(rdev->blocked_wait,
9498                           !test_bit(Blocked, &rdev->flags) &&
9499                           !test_bit(BlockedBadBlocks, &rdev->flags),
9500                           msecs_to_jiffies(5000));
9501        rdev_dec_pending(rdev, mddev);
9502}
9503EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9504
9505void md_finish_reshape(struct mddev *mddev)
9506{
9507        /* called by the personality module when a reshape completes. */
9508        struct md_rdev *rdev;
9509
9510        rdev_for_each(rdev, mddev) {
9511                if (rdev->data_offset > rdev->new_data_offset)
9512                        rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9513                else
9514                        rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9515                rdev->data_offset = rdev->new_data_offset;
9516        }
9517}
9518EXPORT_SYMBOL(md_finish_reshape);
9519
9520/* Bad block management */
9521
9522/* Returns 1 on success, 0 on failure */
9523int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9524                       int is_new)
9525{
9526        struct mddev *mddev = rdev->mddev;
9527        int rv;
9528        if (is_new)
9529                s += rdev->new_data_offset;
9530        else
9531                s += rdev->data_offset;
9532        rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
9533        if (rv == 0) {
9534                /* Make sure they get written out promptly */
9535                if (test_bit(ExternalBbl, &rdev->flags))
9536                        sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
9537                sysfs_notify_dirent_safe(rdev->sysfs_state);
9538                set_mask_bits(&mddev->sb_flags, 0,
9539                              BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9540                md_wakeup_thread(rdev->mddev->thread);
9541                return 1;
9542        } else
9543                return 0;
9544}
9545EXPORT_SYMBOL_GPL(rdev_set_badblocks);
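    /*
     * Editor's sketch of the usual error-path pattern in the personalities:
     * try to record the failing range as bad blocks, and only fail the
     * whole device when that is not possible.
     *
     *	if (!rdev_set_badblocks(rdev, sector, nr_sectors, 0))
     *		md_error(mddev, rdev);
     */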
9546
9547int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9548                         int is_new)
9549{
9550        int rv;
9551        if (is_new)
9552                s += rdev->new_data_offset;
9553        else
9554                s += rdev->data_offset;
9555        rv = badblocks_clear(&rdev->badblocks, s, sectors);
9556        if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9557                sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
9558        return rv;
9559}
9560EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9561
9562static int md_notify_reboot(struct notifier_block *this,
9563                            unsigned long code, void *x)
9564{
9565        struct list_head *tmp;
9566        struct mddev *mddev;
9567        int need_delay = 0;
9568
9569        for_each_mddev(mddev, tmp) {
9570                if (mddev_trylock(mddev)) {
9571                        if (mddev->pers)
9572                                __md_stop_writes(mddev);
9573                        if (mddev->persistent)
9574                                mddev->safemode = 2;
9575                        mddev_unlock(mddev);
9576                }
9577                need_delay = 1;
9578        }
9579        /*
9580         * certain more exotic SCSI devices are known to be
9581         * volatile wrt too early system reboots. While the
9582         * right place to handle this issue is the given
9583         * driver, we do want to have a safe RAID driver ...
9584         */
9585        if (need_delay)
9586                mdelay(1000*1);
9587
9588        return NOTIFY_DONE;
9589}
9590
9591static struct notifier_block md_notifier = {
9592        .notifier_call  = md_notify_reboot,
9593        .next           = NULL,
9594        .priority       = INT_MAX, /* before any real devices */
9595};
9596
9597static void md_geninit(void)
9598{
9599        pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9600
9601        proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
9602}
9603
9604static int __init md_init(void)
9605{
9606        int ret = -ENOMEM;
9607
9608        md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9609        if (!md_wq)
9610                goto err_wq;
9611
9612        md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9613        if (!md_misc_wq)
9614                goto err_misc_wq;
9615
9616        md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
9617        if (!md_rdev_misc_wq)
9618                goto err_rdev_misc_wq;
9619
9620        ret = __register_blkdev(MD_MAJOR, "md", md_probe);
9621        if (ret < 0)
9622                goto err_md;
9623
9624        ret = __register_blkdev(0, "mdp", md_probe);
9625        if (ret < 0)
9626                goto err_mdp;
9627        mdp_major = ret;
9628
9629        register_reboot_notifier(&md_notifier);
9630        raid_table_header = register_sysctl_table(raid_root_table);
9631
9632        md_geninit();
9633        return 0;
9634
9635err_mdp:
9636        unregister_blkdev(MD_MAJOR, "md");
9637err_md:
9638        destroy_workqueue(md_rdev_misc_wq);
9639err_rdev_misc_wq:
9640        destroy_workqueue(md_misc_wq);
9641err_misc_wq:
9642        destroy_workqueue(md_wq);
9643err_wq:
9644        return ret;
9645}
9646
9647static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9648{
9649        struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9650        struct md_rdev *rdev2, *tmp;
9651        int role, ret;
9652        char b[BDEVNAME_SIZE];
9653
9654        /*
9655         * If the size was changed on another node then we need to
9656         * resize here as well.
9657         */
9658        if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
9659                ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
9660                if (ret)
9661                        pr_info("md-cluster: resize failed\n");
9662                else
9663                        md_bitmap_update_sb(mddev->bitmap);
9664        }
9665
9666        /* Check for change of roles in the active devices */
9667        rdev_for_each_safe(rdev2, tmp, mddev) {
9668                if (test_bit(Faulty, &rdev2->flags))
9669                        continue;
9670
9671                /* Check if the roles changed */
9672                role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9673
9674                if (test_bit(Candidate, &rdev2->flags)) {
9675                        if (role == 0xfffe) {
9676                                pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
9677                                md_kick_rdev_from_array(rdev2);
9678                                continue;
9679                        }
9680                        else
9681                                clear_bit(Candidate, &rdev2->flags);
9682                }
9683
9684                if (role != rdev2->raid_disk) {
9685                        /*
9686                         * The device got activated on another node, except when a reshape is happening.
9687                         */
9688                        if (rdev2->raid_disk == -1 && role != 0xffff &&
9689                            !(le32_to_cpu(sb->feature_map) &
9690                              MD_FEATURE_RESHAPE_ACTIVE)) {
9691                                rdev2->saved_raid_disk = role;
9692                                ret = remove_and_add_spares(mddev, rdev2);
9693                                pr_info("Activated spare: %s\n",
9694                                        bdevname(rdev2->bdev,b));
9695                                /* wake up mddev->thread here, so the array can
9696                                 * perform a resync with the newly activated disk */
9697                                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9698                                md_wakeup_thread(mddev->thread);
9699                        }
9700                        /* device faulty
9701                         * We just want to do the minimum to mark the disk
9702                         * as faulty. The recovery is performed by the
9703                         * one who initiated the error.
9704                         */
9705                        if ((role == 0xfffe) || (role == 0xfffd)) {
9706                                md_error(mddev, rdev2);
9707                                clear_bit(Blocked, &rdev2->flags);
9708                        }
9709                }
9710        }
9711
9712        if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
9713                ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9714                if (ret)
9715                        pr_warn("md: updating array disks failed. %d\n", ret);
9716        }
9717
9718        /*
9719         * mddev->delta_disks has already been updated in update_raid_disks(),
9720         * so it is time to check for a reshape.
9721         */
9722        if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9723            (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9724                /*
9725                 * reshape is happening in the remote node, we need to
9726                 * update reshape_position and call start_reshape.
9727                 */
9728                mddev->reshape_position = le64_to_cpu(sb->reshape_position);
9729                if (mddev->pers->update_reshape_pos)
9730                        mddev->pers->update_reshape_pos(mddev);
9731                if (mddev->pers->start_reshape)
9732                        mddev->pers->start_reshape(mddev);
9733        } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9734                   mddev->reshape_position != MaxSector &&
9735                   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9736                /* reshape is just done in another node. */
9737                mddev->reshape_position = MaxSector;
9738                if (mddev->pers->update_reshape_pos)
9739                        mddev->pers->update_reshape_pos(mddev);
9740        }
9741
9742        /* Finally set the event to be up to date */
9743        mddev->events = le64_to_cpu(sb->events);
9744}
9745
9746static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9747{
9748        int err;
9749        struct page *swapout = rdev->sb_page;
9750        struct mdp_superblock_1 *sb;
9751
9752        /* Store the sb page of the rdev in the swapout temporary
9753         * variable in case we fail later and need to restore it
9754         */
9755        rdev->sb_page = NULL;
9756        err = alloc_disk_sb(rdev);
9757        if (err == 0) {
9758                ClearPageUptodate(rdev->sb_page);
9759                rdev->sb_loaded = 0;
9760                err = super_types[mddev->major_version].
9761                        load_super(rdev, NULL, mddev->minor_version);
9762        }
9763        if (err < 0) {
9764                pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9765                                __func__, __LINE__, rdev->desc_nr, err);
9766                if (rdev->sb_page)
9767                        put_page(rdev->sb_page);
9768                rdev->sb_page = swapout;
9769                rdev->sb_loaded = 1;
9770                return err;
9771        }
9772
9773        sb = page_address(rdev->sb_page);
9774        /* Pick up the recovery offset only when MD_FEATURE_RECOVERY_OFFSET
9775         * is set
9776         */
9777
9778        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9779                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9780
9781        /* The other node finished recovery, call spare_active to set
9782         * device In_sync and mddev->degraded
9783         */
9784        if (rdev->recovery_offset == MaxSector &&
9785            !test_bit(In_sync, &rdev->flags) &&
9786            mddev->pers->spare_active(mddev))
9787                sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9788
9789        put_page(swapout);
9790        return 0;
9791}
9792
9793void md_reload_sb(struct mddev *mddev, int nr)
9794{
9795        struct md_rdev *rdev;
9796        int err;
9797
9798        /* Find the rdev */
9799        rdev_for_each_rcu(rdev, mddev) {
9800                if (rdev->desc_nr == nr)
9801                        break;
9802        }
9803
9804        if (!rdev || rdev->desc_nr != nr) {
9805                pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9806                return;
9807        }
9808
9809        err = read_rdev(mddev, rdev);
9810        if (err < 0)
9811                return;
9812
9813        check_sb_changes(mddev, rdev);
9814
9815        /* Read all rdevs to update recovery_offset */
9816        rdev_for_each_rcu(rdev, mddev) {
9817                if (!test_bit(Faulty, &rdev->flags))
9818                        read_rdev(mddev, rdev);
9819        }
9820}
9821EXPORT_SYMBOL(md_reload_sb);
9822
9823#ifndef MODULE
9824
9825/*
9826 * Searches all registered partitions for autorun RAID arrays
9827 * at boot time.
9828 */
9829
9830static DEFINE_MUTEX(detected_devices_mutex);
9831static LIST_HEAD(all_detected_devices);
9832struct detected_devices_node {
9833        struct list_head list;
9834        dev_t dev;
9835};
9836
9837void md_autodetect_dev(dev_t dev)
9838{
9839        struct detected_devices_node *node_detected_dev;
9840
9841        node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
9842        if (node_detected_dev) {
9843                node_detected_dev->dev = dev;
9844                mutex_lock(&detected_devices_mutex);
9845                list_add_tail(&node_detected_dev->list, &all_detected_devices);
9846                mutex_unlock(&detected_devices_mutex);
9847        }
9848}
9849
9850static void autostart_arrays(int part)
9851{
9852        struct md_rdev *rdev;
9853        struct detected_devices_node *node_detected_dev;
9854        dev_t dev;
9855        int i_scanned, i_passed;
9856
9857        i_scanned = 0;
9858        i_passed = 0;
9859
9860        pr_info("md: Autodetecting RAID arrays.\n");
9861
9862        mutex_lock(&detected_devices_mutex);
9863        while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
9864                i_scanned++;
9865                node_detected_dev = list_entry(all_detected_devices.next,
9866                                        struct detected_devices_node, list);
9867                list_del(&node_detected_dev->list);
9868                dev = node_detected_dev->dev;
9869                kfree(node_detected_dev);
9870                mutex_unlock(&detected_devices_mutex);
9871                rdev = md_import_device(dev, 0, 90);
9872                mutex_lock(&detected_devices_mutex);
9873                if (IS_ERR(rdev))
9874                        continue;
9875
9876                if (test_bit(Faulty, &rdev->flags))
9877                        continue;
9878
9879                set_bit(AutoDetected, &rdev->flags);
9880                list_add(&rdev->same_set, &pending_raid_disks);
9881                i_passed++;
9882        }
9883        mutex_unlock(&detected_devices_mutex);
9884
9885        pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
9886
9887        autorun_devices(part);
9888}
9889
9890#endif /* !MODULE */
9891
9892static __exit void md_exit(void)
9893{
9894        struct mddev *mddev;
9895        struct list_head *tmp;
9896        int delay = 1;
9897
9898        unregister_blkdev(MD_MAJOR,"md");
9899        unregister_blkdev(mdp_major, "mdp");
9900        unregister_reboot_notifier(&md_notifier);
9901        unregister_sysctl_table(raid_table_header);
9902
9903        /* We cannot unload the modules while some process is
9904         * waiting for us in select() or poll() - wake them up
9905         */
9906        md_unloading = 1;
9907        while (waitqueue_active(&md_event_waiters)) {
9908                /* not safe to leave yet */
9909                wake_up(&md_event_waiters);
9910                msleep(delay);
9911                delay += delay;
9912        }
9913        remove_proc_entry("mdstat", NULL);
9914
9915        for_each_mddev(mddev, tmp) {
9916                export_array(mddev);
9917                mddev->ctime = 0;
9918                mddev->hold_active = 0;
9919                /*
9920                 * for_each_mddev() will call mddev_put() at the end of each
9921                 * iteration.  As the mddev is now fully clear, this will
9922                 * schedule the mddev for destruction by a workqueue, and the
9923                 * destroy_workqueue() below will wait for that to complete.
9924                 */
9925        }
9926        destroy_workqueue(md_rdev_misc_wq);
9927        destroy_workqueue(md_misc_wq);
9928        destroy_workqueue(md_wq);
9929}
9930
9931subsys_initcall(md_init);
9932module_exit(md_exit)
9933
9934static int get_ro(char *buffer, const struct kernel_param *kp)
9935{
9936        return sprintf(buffer, "%d\n", start_readonly);
9937}
9938static int set_ro(const char *val, const struct kernel_param *kp)
9939{
9940        return kstrtouint(val, 10, (unsigned int *)&start_readonly);
9941}
9942
9943module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
9944module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
9945module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
9946module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
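    /*
     * Editor's note: with md built in, these can be set on the kernel command
     * line (e.g. "md_mod.start_ro=1" or "md_mod.start_dirty_degraded=1");
     * when built as a module they are passed as ordinary module options.
     */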
9947
9948MODULE_LICENSE("GPL");
9949MODULE_DESCRIPTION("MD RAID framework");
9950MODULE_ALIAS("md");
9951MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
9952