/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/freezer.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6
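
/*
 * Illustrative note (editor's addition, not in the original source):
 * with MdpMinorShift of 6, each mdp unit owns a block of 64 minors,
 * so mddev_find() below maps mdp minors 0-63 to md_minor 0, minors
 * 64-127 to md_minor 1, and so on:
 *
 *	new->md_minor = MINOR(unit) >> MdpMinorShift;
 */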

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(mddev_t *mddev)
{
        return mddev->sync_speed_min ?
                mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(mddev_t *mddev)
{
        return mddev->sync_speed_max ?
                mddev->sync_speed_max : sysctl_speed_limit_max;
}
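
/*
 * Usage sketch (editor's addition, not in the original source): the
 * per-array sysfs knobs take precedence over the global sysctls, as
 * speed_min()/speed_max() above show, so a resync on md0 alone can be
 * given a higher guaranteed floor from userspace with, e.g.:
 *
 *	# echo 50000 > /sys/block/md0/md/sync_speed_min
 *
 * while the system-wide default stays at
 * /proc/sys/dev/raid/speed_limit_min.
 */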

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
        {
                .ctl_name       = DEV_RAID,
                .procname       = "raid",
                .maxlen         = 0,
                .mode           = S_IRUGO|S_IXUGO,
                .child          = raid_table,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
        {
                .ctl_name       = CTL_DEV,
                .procname       = "dev",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_dir_table,
        },
        { .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
        sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}
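
/*
 * Userspace sketch (editor's addition, not in the original source) of
 * how a monitoring tool can consume these events; the event is
 * reported through the exceptional-condition poll bits:
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *	char buf[4096];
 *
 *	read(fd, buf, sizeof(buf));	// consume the current state
 *	poll(&pfd, 1, -1);		// returns when md_event_count moves
 *	lseek(fd, 0, SEEK_SET);		// re-read to see what changed
 */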

/*
 * Enables iteration over all existing md arrays;
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * Iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)                                        \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                tmp = all_mddevs.next;                                  \
                mddev = NULL;});                                        \
             ({ if (tmp != &all_mddevs)                                 \
                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (mddev) mddev_put(mddev);                            \
                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
                tmp != &all_mddevs;});                                  \
             ({ spin_lock(&all_mddevs_lock);                            \
                tmp = tmp->next;})                                      \
                )
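
/*
 * Usage sketch (editor's addition, not in the original source);
 * md_print_devices() later in this file is a real in-file caller:
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	ITERATE_MDDEV(mddev, tmp) {
 *		// mddev is refcounted here; the reference is dropped
 *		// automatically on the next iteration
 *		printk("%s\n", mdname(mddev));
 *	}
 */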


static int md_fail_request (struct request_queue *q, struct bio *bio)
{
        bio_io_error(bio);
        return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
        atomic_inc(&mddev->active);
        return mddev;
}

static void mddev_put(mddev_t *mddev)
{
        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
                list_del(&mddev->all_mddevs);
                spin_unlock(&all_mddevs_lock);
                blk_cleanup_queue(mddev->queue);
                kobject_unregister(&mddev->kobj);
        } else
                spin_unlock(&all_mddevs_lock);
}

static mddev_t * mddev_find(dev_t unit)
{
        mddev_t *mddev, *new = NULL;

 retry:
        spin_lock(&all_mddevs_lock);
        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                if (mddev->unit == unit) {
                        mddev_get(mddev);
                        spin_unlock(&all_mddevs_lock);
                        kfree(new);
                        return mddev;
                }

        if (new) {
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        mutex_init(&new->reconfig_mutex);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        new->reshape_position = MaxSector;

        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
                kfree(new);
                return NULL;
        }
        set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);

        blk_queue_make_request(new->queue, md_fail_request);

        goto retry;
}
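
/*
 * Editor's note on the retry pattern above: the allocation cannot be
 * done while holding all_mddevs_lock (kzalloc may sleep), so the lock
 * is dropped, 'new' is allocated and initialised, and the list is
 * re-scanned from 'retry'.  If another thread registered the same
 * unit in the meantime, the winner is returned and 'new' is kfree()d.
 */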

static inline int mddev_lock(mddev_t * mddev)
{
        return mutex_lock_interruptible(&mddev->reconfig_mutex);
}

static inline int mddev_trylock(mddev_t * mddev)
{
        return mutex_trylock(&mddev->reconfig_mutex);
}

static inline void mddev_unlock(mddev_t * mddev)
{
        mutex_unlock(&mddev->reconfig_mutex);

        md_wakeup_thread(mddev->thread);
}

static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
        mdk_rdev_t * rdev;
        struct list_head *tmp;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->desc_nr == nr)
                        return rdev;
        }
        return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->bdev->bd_dev == dev)
                        return rdev;
        }
        return NULL;
}

static struct mdk_personality *find_pers(int level, char *clevel)
{
        struct mdk_personality *pers;
        list_for_each_entry(pers, &pers_list, list) {
                if (level != LEVEL_NONE && pers->level == level)
                        return pers;
                if (strcmp(pers->name, clevel)==0)
                        return pers;
        }
        return NULL;
}

static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
        sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        return MD_NEW_SIZE_BLOCKS(size);
}
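
/*
 * Worked example (editor's addition): MD_NEW_SIZE_BLOCKS() rounds the
 * device size in 1K blocks down to the reserved-area boundary and
 * steps back one reserved area, which places the 0.90 superblock in
 * the last aligned 64K of the device.  Assuming MD_RESERVED_BLOCKS is
 * 64 (64K expressed in 1K blocks), a 10000K device gives:
 *
 *	(10000 & ~63) - 64  ==  9984 - 64  ==  9920	(offset in KB)
 */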

static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
        sector_t size;

        size = rdev->sb_offset;

        if (chunk_size)
                size &= ~((sector_t)chunk_size/1024 - 1);
        return size;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page)
                MD_BUG();

        rdev->sb_page = alloc_page(GFP_KERNEL);
        if (!rdev->sb_page) {
                printk(KERN_ALERT "md: out of memory.\n");
                return -EINVAL;
        }

        return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page) {
                put_page(rdev->sb_page);
                rdev->sb_loaded = 0;
                rdev->sb_page = NULL;
                rdev->sb_offset = 0;
                rdev->size = 0;
        }
}


static void super_written(struct bio *bio, int error)
{
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                printk("md: super_written gets error=%d, uptodate=%d\n",
                       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
                WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
                md_error(mddev, rdev);
        }

        if (atomic_dec_and_test(&mddev->pending_writes))
                wake_up(&mddev->sb_wait);
        bio_put(bio);
}

static void super_written_barrier(struct bio *bio, int error)
{
        struct bio *bio2 = bio->bi_private;
        mdk_rdev_t *rdev = bio2->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
            error == -EOPNOTSUPP) {
                unsigned long flags;
                /* barriers don't appear to be supported :-( */
                set_bit(BarriersNotsupp, &rdev->flags);
                mddev->barriers_work = 0;
                spin_lock_irqsave(&mddev->write_lock, flags);
                bio2->bi_next = mddev->biolist;
                mddev->biolist = bio2;
                spin_unlock_irqrestore(&mddev->write_lock, flags);
                wake_up(&mddev->sb_wait);
                bio_put(bio);
        } else {
                bio_put(bio2);
                bio->bi_private = rdev;
                super_written(bio, error);
        }
}

void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
{
        /* write first size bytes of page to sector of rdev.
         * Increment mddev->pending_writes before returning
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error.
         *
         * As we might need to resubmit the request if BIO_RW_BARRIER
         * fails with -EOPNOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);

        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
        bio->bi_rw = rw;

        atomic_inc(&mddev->pending_writes);
        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
                struct bio *rbio;
                rw |= (1<<BIO_RW_BARRIER);
                rbio = bio_clone(bio, GFP_NOIO);
                rbio->bi_private = bio;
                rbio->bi_end_io = super_written_barrier;
                submit_bio(rw, rbio);
        } else
                submit_bio(rw, bio);
}

void md_super_wait(mddev_t *mddev)
{
        /* wait for all superblock writes that were scheduled to complete.
         * if any had to be retried (due to BARRIER problems), retry them
         */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
                while (mddev->biolist) {
                        struct bio *bio;
                        spin_lock_irq(&mddev->write_lock);
                        bio = mddev->biolist;
                        mddev->biolist = bio->bi_next;
                        bio->bi_next = NULL;
                        spin_unlock_irq(&mddev->write_lock);
                        submit_bio(bio->bi_rw, bio);
                }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
}
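
/*
 * Call-pattern sketch (editor's addition, not in the original source):
 * a superblock update fans a write out to every member device and
 * then waits for the whole batch, roughly:
 *
 *	ITERATE_RDEV(mddev, rdev, tmp)
 *		md_super_write(mddev, rdev, rdev->sb_offset << 1,
 *			       rdev->sb_size, rdev->sb_page);
 *	md_super_wait(mddev);
 */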

static void bi_complete(struct bio *bio, int error)
{
        complete((struct completion*)bio->bi_private);
}

int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                   struct page *page, int rw)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        struct completion event;
        int ret;

        rw |= (1 << BIO_RW_SYNC);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;
        submit_bio(rw, bio);
        wait_for_completion(&event);

        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(mdk_rdev_t * rdev, int size)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->sb_page) {
                MD_BUG();
                return -EINVAL;
        }
        if (rdev->sb_loaded)
                return 0;


        if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
                goto fail;
        rdev->sb_loaded = 1;
        return 0;

fail:
        printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
                bdevname(rdev->bdev,b));
        return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        if (    (sb1->set_uuid0 == sb2->set_uuid0) &&
                (sb1->set_uuid1 == sb2->set_uuid1) &&
                (sb1->set_uuid2 == sb2->set_uuid2) &&
                (sb1->set_uuid3 == sb2->set_uuid3))

                return 1;

        return 0;
}


static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        int ret;
        mdp_super_t *tmp1, *tmp2;

        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

        if (!tmp1 || !tmp2) {
                ret = 0;
                printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
                goto abort;
        }

        *tmp1 = *sb1;
        *tmp2 = *sb2;

        /*
         * nr_disks is not constant
         */
        tmp1->nr_disks = 0;
        tmp2->nr_disks = 0;

        if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
                ret = 0;
        else
                ret = 1;

abort:
        kfree(tmp1);
        kfree(tmp2);
        return ret;
}


static u32 md_csum_fold(u32 csum)
{
        csum = (csum & 0xffff) + (csum >> 16);
        return (csum & 0xffff) + (csum >> 16);
}
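
/*
 * Worked examples (editor's addition): the fold collapses a 32-bit
 * sum into 16 bits, done twice so a carry out of the first add is
 * absorbed:
 *
 *	md_csum_fold(0x12345678)  ->  0x5678 + 0x1234 == 0x68ac
 *	md_csum_fold(0xffff0001)  ->  0x0001 + 0xffff == 0x10000,
 *	                              then 0x0000 + 0x1 == 0x0001
 */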

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
        u64 newcsum = 0;
        u32 *sb32 = (u32*)sb;
        int i;
        unsigned int disk_csum, csum;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;

        for (i = 0; i < MD_SB_BYTES/4 ; i++)
                newcsum += sb32[i];
        csum = (newcsum & 0xffffffff) + (newcsum>>32);


#ifdef CONFIG_ALPHA
        /* This used to use csum_partial, which was wrong for several
         * reasons including that different results are returned on
         * different architectures.  It isn't critical that we get exactly
         * the same return value as before (we always csum_fold before
         * testing, and that removes any differences).  However as we
         * know that csum_partial always returned a 16bit value on
         * alphas, do a fold to maximise conformity to previous behaviour.
         */
        sb->sb_csum = md_csum_fold(disk_csum);
#else
        sb->sb_csum = disk_csum;
#endif
        return csum;
}


/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
        char            *name;
        struct module   *owner;
        int             (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
        int             (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        void            (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};

/*
 * load_super for 0.90.0
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        mdp_super_t *sb;
        int ret;
        sector_t sb_offset;

        /*
         * Calculate the position of the superblock,
         * it's at the end of the disk.
         *
         * It also happens to be a multiple of 4Kb.
         */
        sb_offset = calc_dev_sboffset(rdev->bdev);
        rdev->sb_offset = sb_offset;

        ret = read_disk_sb(rdev, MD_SB_BYTES);
        if (ret) return ret;

        ret = -EINVAL;

        bdevname(rdev->bdev, b);
        sb = (mdp_super_t*)page_address(rdev->sb_page);

        if (sb->md_magic != MD_SB_MAGIC) {
                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
                       b);
                goto abort;
        }

        if (sb->major_version != 0 ||
            sb->minor_version < 90 ||
            sb->minor_version > 91) {
                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                        sb->major_version, sb->minor_version,
                        b);
                goto abort;
        }

        if (sb->raid_disks <= 0)
                goto abort;

        if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
                        b);
                goto abort;
        }

        rdev->preferred_minor = sb->md_minor;
        rdev->data_offset = 0;
        rdev->sb_size = MD_SB_BYTES;

        if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
                if (sb->level != 1 && sb->level != 4
                    && sb->level != 5 && sb->level != 6
                    && sb->level != 10) {
                        /* FIXME use a better test */
                        printk(KERN_WARNING
                               "md: bitmaps not supported for this level.\n");
                        goto abort;
                }
        }

        if (sb->level == LEVEL_MULTIPATH)
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = sb->this_disk.number;

        if (refdev == 0)
                ret = 1;
        else {
                __u64 ev1, ev2;
                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
                if (!uuid_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
                                b, bdevname(refdev->bdev,b2));
                        goto abort;
                }
                if (!sb_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has same UUID"
                               " but different superblock to %s\n",
                               b, bdevname(refdev->bdev, b2));
                        goto abort;
                }
                ev1 = md_event(sb);
                ev2 = md_event(refsb);
                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        rdev->size = calc_dev_size(rdev, sb->chunk_size);

        if (rdev->size < sb->size && sb->level > 1)
                /* "this cannot possibly happen" ... */
                ret = -EINVAL;

 abort:
        return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_disk_t *desc;
        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
        __u64 ev1 = md_event(sb);

        rdev->raid_disk = -1;
        rdev->flags = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
                mddev->minor_version = sb->minor_version;
                mddev->patch_version = sb->patch_version;
                mddev->persistent = ! sb->not_persistent;
                mddev->chunk_size = sb->chunk_size;
                mddev->ctime = sb->ctime;
                mddev->utime = sb->utime;
                mddev->level = sb->level;
                mddev->clevel[0] = 0;
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
                mddev->size = sb->size;
                mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

                if (mddev->minor_version >= 91) {
                        mddev->reshape_position = sb->reshape_position;
                        mddev->delta_disks = sb->delta_disks;
                        mddev->new_level = sb->new_level;
                        mddev->new_layout = sb->new_layout;
                        mddev->new_chunk = sb->new_chunk;
                } else {
                        mddev->reshape_position = MaxSector;
                        mddev->delta_disks = 0;
                        mddev->new_level = mddev->level;
                        mddev->new_layout = mddev->layout;
                        mddev->new_chunk = mddev->chunk_size;
                }

                if (sb->state & (1<<MD_SB_CLEAN))
                        mddev->recovery_cp = MaxSector;
                else {
                        if (sb->events_hi == sb->cp_events_hi &&
                                sb->events_lo == sb->cp_events_lo) {
                                mddev->recovery_cp = sb->recovery_cp;
                        } else
                                mddev->recovery_cp = 0;
                }

                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

                mddev->max_disks = MD_SB_DISKS;

                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
                    mddev->bitmap_file == NULL)
                        mddev->bitmap_offset = mddev->default_bitmap_offset;

        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* if adding to array with a bitmap, then we can accept an
                 * older device ... but not too old.
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                        return 0;
        }

        if (mddev->level != LEVEL_MULTIPATH) {
                desc = sb->disks + rdev->desc_nr;

                if (desc->state & (1<<MD_DISK_FAULTY))
                        set_bit(Faulty, &rdev->flags);
                else if (desc->state & (1<<MD_DISK_SYNC) /* &&
                            desc->raid_disk < mddev->raid_disks */) {
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = desc->raid_disk;
                }
                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);
        return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_super_t *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int next_spare = mddev->raid_disks;


        /* make rdev->sb match mddev data..
         *
         * 1/ zero out disks
         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
         * 3/ any empty disks < next_spare become removed
         *
         * disks[0] gets initialised to REMOVED because
         * we cannot be sure from other fields if it has
         * been initialised or not.
         */
        int i;
        int active=0, working=0,failed=0,spare=0,nr_disks=0;

        rdev->sb_size = MD_SB_BYTES;

        sb = (mdp_super_t*)page_address(rdev->sb_page);

        memset(sb, 0, sizeof(*sb));

        sb->md_magic = MD_SB_MAGIC;
        sb->major_version = mddev->major_version;
        sb->patch_version = mddev->patch_version;
        sb->gvalid_words  = 0; /* ignored */
        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
        memcpy(&sb->set_uuid3, mddev->uuid+12,4);

        sb->ctime = mddev->ctime;
        sb->level = mddev->level;
        sb->size  = mddev->size;
        sb->raid_disks = mddev->raid_disks;
        sb->md_minor = mddev->md_minor;
        sb->not_persistent = !mddev->persistent;
        sb->utime = mddev->utime;
        sb->state = 0;
        sb->events_hi = (mddev->events>>32);
        sb->events_lo = (u32)mddev->events;

        if (mddev->reshape_position == MaxSector)
                sb->minor_version = 90;
        else {
                sb->minor_version = 91;
                sb->reshape_position = mddev->reshape_position;
                sb->new_level = mddev->new_level;
                sb->delta_disks = mddev->delta_disks;
                sb->new_layout = mddev->new_layout;
                sb->new_chunk = mddev->new_chunk;
        }
        mddev->minor_version = sb->minor_version;
        if (mddev->in_sync)
        {
                sb->recovery_cp = mddev->recovery_cp;
                sb->cp_events_hi = (mddev->events>>32);
                sb->cp_events_lo = (u32)mddev->events;
                if (mddev->recovery_cp == MaxSector)
                        sb->state = (1<< MD_SB_CLEAN);
        } else
                sb->recovery_cp = 0;

        sb->layout = mddev->layout;
        sb->chunk_size = mddev->chunk_size;

        if (mddev->bitmap && mddev->bitmap_file == NULL)
                sb->state |= (1<<MD_SB_BITMAP_PRESENT);

        sb->disks[0].state = (1<<MD_DISK_REMOVED);
        ITERATE_RDEV(mddev,rdev2,tmp) {
                mdp_disk_t *d;
                int desc_nr;
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        desc_nr = rdev2->raid_disk;
                else
                        desc_nr = next_spare++;
                rdev2->desc_nr = desc_nr;
                d = &sb->disks[rdev2->desc_nr];
                nr_disks++;
                d->number = rdev2->desc_nr;
                d->major = MAJOR(rdev2->bdev->bd_dev);
                d->minor = MINOR(rdev2->bdev->bd_dev);
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        d->raid_disk = rdev2->raid_disk;
                else
                        d->raid_disk = rdev2->desc_nr; /* compatibility */
                if (test_bit(Faulty, &rdev2->flags))
                        d->state = (1<<MD_DISK_FAULTY);
                else if (test_bit(In_sync, &rdev2->flags)) {
                        d->state = (1<<MD_DISK_ACTIVE);
                        d->state |= (1<<MD_DISK_SYNC);
                        active++;
                        working++;
                } else {
                        d->state = 0;
                        spare++;
                        working++;
                }
                if (test_bit(WriteMostly, &rdev2->flags))
                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
        }
        /* now set the "removed" and "faulty" bits on any missing devices */
        for (i=0 ; i < mddev->raid_disks ; i++) {
                mdp_disk_t *d = &sb->disks[i];
                if (d->state == 0 && d->number == 0) {
                        d->number = i;
                        d->raid_disk = i;
                        d->state = (1<<MD_DISK_REMOVED);
                        d->state |= (1<<MD_DISK_FAULTY);
                        failed++;
                }
        }
        sb->nr_disks = nr_disks;
        sb->active_disks = active;
        sb->working_disks = working;
        sb->failed_disks = failed;
        sb->spare_disks = spare;

        sb->this_disk = sb->disks[rdev->desc_nr];
        sb->sb_csum = calc_sb_csum(sb);
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
        __le32 disk_csum;
        u32 csum;
        unsigned long long newcsum;
        int size = 256 + le32_to_cpu(sb->max_dev)*2;
        __le32 *isuper = (__le32*)sb;
        int i;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        newcsum = 0;
        for (i=0; size>=4; size -= 4 )
                newcsum += le32_to_cpu(*isuper++);

        if (size == 2)
                newcsum += le16_to_cpu(*(__le16*) isuper);

        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
        sb->sb_csum = disk_csum;
        return cpu_to_le32(csum);
}
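
/*
 * Editor's note on the size above: a version-1 superblock is a fixed
 * 256-byte header followed by a 2-byte role entry per device, hence
 * 256 + max_dev*2.  For example, with max_dev == 384 the checksum
 * covers 256 + 768 == 1024 bytes.
 */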

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        struct mdp_superblock_1 *sb;
        int ret;
        sector_t sb_offset;
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        int bmask;

        /*
         * Calculate the position of the superblock.
         * It is always aligned to a 4K boundary and
         * depending on minor_version, it can be:
         * 0: At least 8K, but less than 12K, from end of device
         * 1: At start of device
         * 2: 4K from start of device.
         */
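        /*
         * Worked example for minor_version 0 (editor's addition): on a
         * device of 8000004 sectors, 8000004 - 16 == 7999988 sectors,
         * rounded down to an 8-sector boundary gives 7999984 sectors,
         * i.e. 3999992K.  That leaves the superblock 20 sectors (10K)
         * from the end of the device, inside the 8K-12K window above.
         */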
        switch(minor_version) {
        case 0:
                sb_offset = rdev->bdev->bd_inode->i_size >> 9;
                sb_offset -= 8*2;
                sb_offset &= ~(sector_t)(4*2-1);
                /* convert from sectors to K */
                sb_offset /= 2;
                break;
        case 1:
                sb_offset = 0;
                break;
        case 2:
                sb_offset = 4;
                break;
        default:
                return -EINVAL;
        }
        rdev->sb_offset = sb_offset;

        /* superblock is rarely larger than 1K, but it can be larger,
         * and it is safe to read 4k, so we do that
         */
        ret = read_disk_sb(rdev, 4096);
        if (ret) return ret;


        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
            sb->major_version != cpu_to_le32(1) ||
            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
            le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
                return -EINVAL;

        if (calc_sb_1_csum(sb) != sb->sb_csum) {
                printk("md: invalid superblock checksum on %s\n",
                        bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        if (le64_to_cpu(sb->data_size) < 10) {
                printk("md: data_size too small on %s\n",
                       bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
                if (sb->level != cpu_to_le32(1) &&
                    sb->level != cpu_to_le32(4) &&
                    sb->level != cpu_to_le32(5) &&
                    sb->level != cpu_to_le32(6) &&
                    sb->level != cpu_to_le32(10)) {
                        printk(KERN_WARNING
                               "md: bitmaps not supported for this level.\n");
                        return -EINVAL;
                }
        }

        rdev->preferred_minor = 0xffff;
        rdev->data_offset = le64_to_cpu(sb->data_offset);
        atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
        bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
        if (rdev->sb_size & bmask)
                rdev->sb_size = (rdev->sb_size | bmask)+1;

        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = le32_to_cpu(sb->dev_number);

        if (refdev == 0)
                ret = 1;
        else {
                __u64 ev1, ev2;
                struct mdp_superblock_1 *refsb =
                        (struct mdp_superblock_1*)page_address(refdev->sb_page);

                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
                    sb->level != refsb->level ||
                    sb->layout != refsb->layout ||
                    sb->chunksize != refsb->chunksize) {
                        printk(KERN_WARNING "md: %s has strangely different"
                                " superblock to %s\n",
                                bdevname(rdev->bdev,b),
                                bdevname(refdev->bdev,b2));
                        return -EINVAL;
                }
                ev1 = le64_to_cpu(sb->events);
                ev2 = le64_to_cpu(refsb->events);

                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        if (minor_version)
                rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
        else
                rdev->size = rdev->sb_offset;
        if (rdev->size < le64_to_cpu(sb->data_size)/2)
                return -EINVAL;
        rdev->size = le64_to_cpu(sb->data_size)/2;
        if (le32_to_cpu(sb->chunksize))
                rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);

        if (le64_to_cpu(sb->size) > rdev->size*2)
                return -EINVAL;
        return ret;
}

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
        __u64 ev1 = le64_to_cpu(sb->events);

        rdev->raid_disk = -1;
        rdev->flags = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
                mddev->patch_version = 0;
                mddev->persistent = 1;
                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
                mddev->level = le32_to_cpu(sb->level);
                mddev->clevel[0] = 0;
                mddev->layout = le32_to_cpu(sb->layout);
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
                mddev->size = le64_to_cpu(sb->size)/2;
                mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = 1024 >> 9;

                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
                memcpy(mddev->uuid, sb->set_uuid, 16);

                mddev->max_disks = (4096-256)/2;

                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
                    mddev->bitmap_file == NULL)
                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);

                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
                        mddev->reshape_position = le64_to_cpu(sb->reshape_position);
                        mddev->delta_disks = le32_to_cpu(sb->delta_disks);
                        mddev->new_level = le32_to_cpu(sb->new_level);
                        mddev->new_layout = le32_to_cpu(sb->new_layout);
                        mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
                } else {
                        mddev->reshape_position = MaxSector;
                        mddev->delta_disks = 0;
                        mddev->new_level = mddev->level;
                        mddev->new_layout = mddev->layout;
                        mddev->new_chunk = mddev->chunk_size;
                }

        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* If adding to array with a bitmap, then we can accept an
                 * older device, but not too old.
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                        return 0;
        }
        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
                switch(role) {
                case 0xffff: /* spare */
                        break;
                case 0xfffe: /* faulty */
                        set_bit(Faulty, &rdev->flags);
                        break;
                default:
                        if ((le32_to_cpu(sb->feature_map) &
                             MD_FEATURE_RECOVERY_OFFSET))
                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
                        else
                                set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = role;
                        break;
                }
                if (sb->devflags & WriteMostly1)
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);

        return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int max_dev, i;
        /* make rdev->sb match mddev and rdev data. */

        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        sb->feature_map = 0;
        sb->pad0 = 0;
        sb->recovery_offset = cpu_to_le64(0);
        memset(sb->pad1, 0, sizeof(sb->pad1));
        memset(sb->pad2, 0, sizeof(sb->pad2));
        memset(sb->pad3, 0, sizeof(sb->pad3));

        sb->utime = cpu_to_le64((__u64)mddev->utime);
        sb->events = cpu_to_le64(mddev->events);
        if (mddev->in_sync)
                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
        else
                sb->resync_offset = cpu_to_le64(0);

        sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
        sb->size = cpu_to_le64(mddev->size<<1);

        if (mddev->bitmap && mddev->bitmap_file == NULL) {
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
        }

        if (rdev->raid_disk >= 0 &&
            !test_bit(In_sync, &rdev->flags) &&
            rdev->recovery_offset > 0) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
                sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
        }

        if (mddev->reshape_position != MaxSector) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
                sb->new_layout = cpu_to_le32(mddev->new_layout);
                sb->delta_disks = cpu_to_le32(mddev->delta_disks);
                sb->new_level = cpu_to_le32(mddev->new_level);
                sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
        }

        max_dev = 0;
        ITERATE_RDEV(mddev,rdev2,tmp)
                if (rdev2->desc_nr+1 > max_dev)
                        max_dev = rdev2->desc_nr+1;

        if (max_dev > le32_to_cpu(sb->max_dev))
                sb->max_dev = cpu_to_le32(max_dev);
        for (i=0; i<max_dev;i++)
                sb->dev_roles[i] = cpu_to_le16(0xfffe);

        ITERATE_RDEV(mddev,rdev2,tmp) {
                i = rdev2->desc_nr;
                if (test_bit(Faulty, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
                else if (test_bit(In_sync, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else
                        sb->dev_roles[i] = cpu_to_le16(0xffff);
        }

        sb->sb_csum = calc_sb_1_csum(sb);
}


static struct super_type super_types[] = {
        [0] = {
                .name   = "0.90.0",
                .owner  = THIS_MODULE,
                .load_super     = super_90_load,
                .validate_super = super_90_validate,
                .sync_super     = super_90_sync,
        },
        [1] = {
                .name   = "md-1",
                .owner  = THIS_MODULE,
                .load_super     = super_1_load,
                .validate_super = super_1_validate,
                .sync_super     = super_1_sync,
        },
};
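
/*
 * Dispatch sketch (editor's addition, not in the original source):
 * callers index this table by the array's major superblock version,
 * so refreshing the on-disk metadata for one member is roughly:
 *
 *	super_types[mddev->major_version].sync_super(mddev, rdev);
 *	md_super_write(mddev, rdev, rdev->sb_offset << 1,
 *		       rdev->sb_size, rdev->sb_page);
 */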

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev, *rdev2;

        ITERATE_RDEV(mddev1,rdev,tmp)
                ITERATE_RDEV(mddev2, rdev2, tmp2)
                        if (rdev->bdev->bd_contains ==
                            rdev2->bdev->bd_contains)
                                return 1;

        return 0;
}

static LIST_HEAD(pending_raid_disks);

static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
        char b[BDEVNAME_SIZE];
        struct kobject *ko;
        char *s;
        int err;

        if (rdev->mddev) {
                MD_BUG();
                return -EINVAL;
        }
        /* make sure rdev->size exceeds mddev->size */
        if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
                if (mddev->pers) {
                        /* Cannot change size, so fail
                         * If mddev->level <= 0, then we don't care
                         * about aligning sizes (e.g. linear)
                         */
                        if (mddev->level > 0)
                                return -ENOSPC;
                } else
                        mddev->size = rdev->size;
        }

        /* Verify rdev->desc_nr is unique.
         * If it is -1, assign a free number, else
         * check number is not in use
         */
        if (rdev->desc_nr < 0) {
                int choice = 0;
                if (mddev->pers) choice = mddev->raid_disks;
                while (find_rdev_nr(mddev, choice))
                        choice++;
                rdev->desc_nr = choice;
        } else {
                if (find_rdev_nr(mddev, rdev->desc_nr))
                        return -EBUSY;
        }
        bdevname(rdev->bdev,b);
        if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
                return -ENOMEM;
        while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
                *s = '!';

        rdev->mddev = mddev;
        printk(KERN_INFO "md: bind<%s>\n", b);

        rdev->kobj.parent = &mddev->kobj;
        if ((err = kobject_add(&rdev->kobj)))
                goto fail;

        if (rdev->bdev->bd_part)
                ko = &rdev->bdev->bd_part->kobj;
        else
                ko = &rdev->bdev->bd_disk->kobj;
        if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
                kobject_del(&rdev->kobj);
                goto fail;
        }
        list_add(&rdev->same_set, &mddev->disks);
        bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
        return 0;

 fail:
        printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
               b, mdname(mddev));
        return err;
}

static void delayed_delete(struct work_struct *ws)
{
        mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
        kobject_del(&rdev->kobj);
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->mddev) {
                MD_BUG();
                return;
        }
        bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
        list_del_init(&rdev->same_set);
        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
        rdev->mddev = NULL;
        sysfs_remove_link(&rdev->kobj, "block");

        /* We need to delay this, otherwise we can deadlock when
         * writing 'remove' to "dev/state"
         */
        INIT_WORK(&rdev->del_work, delayed_delete);
        schedule_work(&rdev->del_work);
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
{
        int err = 0;
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];

        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "md: could not open %s.\n",
                        __bdevname(dev, b));
                return PTR_ERR(bdev);
        }
        err = bd_claim(bdev, rdev);
        if (err) {
                printk(KERN_ERR "md: could not bd_claim %s.\n",
                        bdevname(bdev, b));
                blkdev_put(bdev);
                return err;
        }
        rdev->bdev = bdev;
        return err;
}

static void unlock_rdev(mdk_rdev_t *rdev)
{
        struct block_device *bdev = rdev->bdev;
        rdev->bdev = NULL;
        if (!bdev)
                MD_BUG();
        bd_release(bdev);
        blkdev_put(bdev);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: export_rdev(%s)\n",
                bdevname(rdev->bdev,b));
        if (rdev->mddev)
                MD_BUG();
        free_disk_sb(rdev);
        list_del_init(&rdev->same_set);
#ifndef MODULE
        md_autodetect_dev(rdev->bdev->bd_dev);
#endif
        unlock_rdev(rdev);
        kobject_put(&rdev->kobj);
}

static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
        unbind_rdev_from_array(rdev);
        export_rdev(rdev);
}

static void export_array(mddev_t *mddev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (!rdev->mddev) {
                        MD_BUG();
                        continue;
                }
                kick_rdev_from_array(rdev);
        }
        if (!list_empty(&mddev->disks))
                MD_BUG();
        mddev->raid_disks = 0;
        mddev->major_version = 0;
}

static void print_desc(mdp_disk_t *desc)
{
        printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
                desc->major,desc->minor,desc->raid_disk,desc->state);
}

static void print_sb(mdp_super_t *sb)
{
        int i;

        printk(KERN_INFO
                "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
                sb->major_version, sb->minor_version, sb->patch_version,
                sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
                sb->ctime);
        printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
                sb->level, sb->size, sb->nr_disks, sb->raid_disks,
                sb->md_minor, sb->layout, sb->chunk_size);
        printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
                " FD:%d SD:%d CSUM:%08x E:%08lx\n",
                sb->utime, sb->state, sb->active_disks, sb->working_disks,
                sb->failed_disks, sb->spare_disks,
                sb->sb_csum, (unsigned long)sb->events_lo);

        printk(KERN_INFO);
        for (i = 0; i < MD_SB_DISKS; i++) {
                mdp_disk_t *desc;

                desc = sb->disks + i;
                if (desc->number || desc->major || desc->minor ||
                    desc->raid_disk || (desc->state && (desc->state != 4))) {
                        printk("     D %2d: ", i);
                        print_desc(desc);
                }
        }
        printk(KERN_INFO "md:     THIS: ");
        print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
                bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
                test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
                rdev->desc_nr);
        if (rdev->sb_loaded) {
                printk(KERN_INFO "md: rdev superblock:\n");
                print_sb((mdp_super_t*)page_address(rdev->sb_page));
        } else
                printk(KERN_INFO "md: no rdev superblock!\n");
}

static void md_print_devices(void)
{
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev;
        mddev_t *mddev;
        char b[BDEVNAME_SIZE];

        printk("\n");
        printk("md:     **********************************\n");
        printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
        printk("md:     **********************************\n");
        ITERATE_MDDEV(mddev,tmp) {

                if (mddev->bitmap)
                        bitmap_print_sb(mddev->bitmap);
                else
                        printk("%s: ", mdname(mddev));
                ITERATE_RDEV(mddev,rdev,tmp2)
                        printk("<%s>", bdevname(rdev->bdev,b));
                printk("\n");

                ITERATE_RDEV(mddev,rdev,tmp2)
                        print_rdev(rdev);
        }
        printk("md:     **********************************\n");
        printk("\n");
1602}
1603
1604
1605static void sync_sbs(mddev_t * mddev, int nospares)
1606{
1607        /* Update each superblock (in-memory image), but
1608         * if we are allowed to, skip spares which already
1609         * have the right event counter, or have one earlier
1610         * (which would mean they aren't being marked as dirty
1611         * with the rest of the array)
1612         */
1613        mdk_rdev_t *rdev;
1614        struct list_head *tmp;
1615
1616        ITERATE_RDEV(mddev,rdev,tmp) {
1617                if (rdev->sb_events == mddev->events ||
1618                    (nospares &&
1619                     rdev->raid_disk < 0 &&
1620                     (rdev->sb_events&1)==0 &&
1621                     rdev->sb_events+1 == mddev->events)) {
1622                        /* Don't update this superblock */
1623                        rdev->sb_loaded = 2;
1624                } else {
1625                        super_types[mddev->major_version].
1626                                sync_super(mddev, rdev);
1627                        rdev->sb_loaded = 1;
1628                }
1629        }
1630}
1631
1632static void md_update_sb(mddev_t * mddev, int force_change)
1633{
1634        struct list_head *tmp;
1635        mdk_rdev_t *rdev;
1636        int sync_req;
1637        int nospares = 0;
1638
1639repeat:
1640        spin_lock_irq(&mddev->write_lock);
1641
1642        set_bit(MD_CHANGE_PENDING, &mddev->flags);
1643        if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1644                force_change = 1;
1645        if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1646                /* just a clean <-> dirty transition; possibly leave spares alone,
1647                 * though if 'events' doesn't end up with the right even/odd parity
1648                 * we will have to update the spares after all
1649                 */
1650                nospares = 1;
1651        if (force_change)
1652                nospares = 0;
1653        if (mddev->degraded)
1654                /* If the array is degraded, then skipping spares is both
1655                 * dangerous and fairly pointless.
1656                 * Dangerous because a device that was removed from the array
1657                 * might have an event count that still looks up-to-date,
1658                 * so it can be re-added without a resync.
1659                 * Pointless because if there are any spares to skip,
1660                 * then a recovery will happen and soon that array won't
1661                 * be degraded any more and the spare can go back to sleep then.
1662                 */
1663                nospares = 0;
1664
1665        sync_req = mddev->in_sync;
1666        mddev->utime = get_seconds();
1667
1668        /* If this is just a dirty<->clean transition, and the array is clean
1669         * and 'events' is odd, we can roll back to the previous clean state */
1670        if (nospares
1671            && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1672            && (mddev->events & 1)
1673            && mddev->events != 1)
1674                mddev->events--;
1675        else {
1676                /* otherwise we have to go forward and ... */
1677                mddev->events ++;
1678                if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1679                        /* .. if the array isn't clean, insist on an odd 'events' */
1680                        if ((mddev->events&1)==0) {
1681                                mddev->events++;
1682                                nospares = 0;
1683                        }
1684                } else {
1685                        /* otherwise insist on an even 'events' (for clean states) */
1686                        if ((mddev->events&1)) {
1687                                mddev->events++;
1688                                nospares = 0;
1689                        }
1690                }
1691        }
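        /*
         * Illustrative walk-through: a clean array at events==4 that goes
         * dirty advances to 5 (odd); on returning to clean, the nospares
         * rollback above takes it back to 4 rather than on to 6, so spares
         * whose superblocks already record 4 need not be rewritten.
         */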
1692
1693        if (!mddev->events) {
1694                /*
1695                 * oops, this 64-bit counter should never wrap.
1696                 * Either we are in around ~1 trillion A.C., assuming
1697                 * 1 reboot per second, or we have a bug:
1698                 */
1699                MD_BUG();
1700                mddev->events --;
1701        }
1702        sync_sbs(mddev, nospares);
1703
1704        /*
1705         * do not write anything to disk if using
1706         * nonpersistent superblocks
1707         */
1708        if (!mddev->persistent) {
1709                clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1710                spin_unlock_irq(&mddev->write_lock);
1711                wake_up(&mddev->sb_wait);
1712                return;
1713        }
1714        spin_unlock_irq(&mddev->write_lock);
1715
1716        dprintk(KERN_INFO 
1717                "md: updating %s RAID superblock on device (in sync %d)\n",
1718                mdname(mddev),mddev->in_sync);
1719
1720        bitmap_update_sb(mddev->bitmap);
1721        ITERATE_RDEV(mddev,rdev,tmp) {
1722                char b[BDEVNAME_SIZE];
1723                dprintk(KERN_INFO "md: ");
1724                if (rdev->sb_loaded != 1)
1725                        continue; /* no noise on spare devices */
1726                if (test_bit(Faulty, &rdev->flags))
1727                        dprintk("(skipping faulty ");
1728
1729                dprintk("%s ", bdevname(rdev->bdev,b));
1730                if (!test_bit(Faulty, &rdev->flags)) {
1731                        md_super_write(mddev,rdev,
1732                                       rdev->sb_offset<<1, rdev->sb_size,
1733                                       rdev->sb_page);
1734                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1735                                bdevname(rdev->bdev,b),
1736                                (unsigned long long)rdev->sb_offset);
1737                        rdev->sb_events = mddev->events;
1738
1739                } else
1740                        dprintk(")\n");
1741                if (mddev->level == LEVEL_MULTIPATH)
1742                        /* only need to write one superblock... */
1743                        break;
1744        }
1745        md_super_wait(mddev);
1746        /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
1747
1748        spin_lock_irq(&mddev->write_lock);
1749        if (mddev->in_sync != sync_req ||
1750            test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
1751                /* have to write it out again */
1752                spin_unlock_irq(&mddev->write_lock);
1753                goto repeat;
1754        }
1755        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1756        spin_unlock_irq(&mddev->write_lock);
1757        wake_up(&mddev->sb_wait);
1758
1759}
1760
1761/* Words written to sysfs files may, or may not, be \n terminated.
1762 * We want to accept either case. For this we use cmd_match.
1763 */
1764static int cmd_match(const char *cmd, const char *str)
1765{
1766        /* See if cmd, written into a sysfs file, matches
1767         * str.  They must either be the same, or cmd can
1768         * have a trailing newline
1769         */
1770        while (*cmd && *str && *cmd == *str) {
1771                cmd++;
1772                str++;
1773        }
1774        if (*cmd == '\n')
1775                cmd++;
1776        if (*str || *cmd)
1777                return 0;
1778        return 1;
1779}
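/*
 * Illustrative behaviour of cmd_match():
 *
 *   cmd_match("idle\n", "idle") -> 1   (trailing newline accepted)
 *   cmd_match("idle",   "idle") -> 1
 *   cmd_match("idler",  "idle") -> 0   (no partial matches)
 */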
1780
1781struct rdev_sysfs_entry {
1782        struct attribute attr;
1783        ssize_t (*show)(mdk_rdev_t *, char *);
1784        ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1785};
1786
1787static ssize_t
1788state_show(mdk_rdev_t *rdev, char *page)
1789{
1790        char *sep = "";
1791        int len=0;
1792
1793        if (test_bit(Faulty, &rdev->flags)) {
1794                len+= sprintf(page+len, "%sfaulty",sep);
1795                sep = ",";
1796        }
1797        if (test_bit(In_sync, &rdev->flags)) {
1798                len += sprintf(page+len, "%sin_sync",sep);
1799                sep = ",";
1800        }
1801        if (test_bit(WriteMostly, &rdev->flags)) {
1802                len += sprintf(page+len, "%swrite_mostly",sep);
1803                sep = ",";
1804        }
1805        if (!test_bit(Faulty, &rdev->flags) &&
1806            !test_bit(In_sync, &rdev->flags)) {
1807                len += sprintf(page+len, "%sspare", sep);
1808                sep = ",";
1809        }
1810        return len+sprintf(page+len, "\n");
1811}
1812
1813static ssize_t
1814state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1815{
1816        /* can write
1817         *  faulty  - simulates an error
1818         *  remove  - disconnects the device
1819         *  writemostly - sets write_mostly
1820         *  -writemostly - clears write_mostly
1821         */
1822        int err = -EINVAL;
1823        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1824                md_error(rdev->mddev, rdev);
1825                err = 0;
1826        } else if (cmd_match(buf, "remove")) {
1827                if (rdev->raid_disk >= 0)
1828                        err = -EBUSY;
1829                else {
1830                        mddev_t *mddev = rdev->mddev;
1831                        kick_rdev_from_array(rdev);
1832                        if (mddev->pers)
1833                                md_update_sb(mddev, 1);
1834                        md_new_event(mddev);
1835                        err = 0;
1836                }
1837        } else if (cmd_match(buf, "writemostly")) {
1838                set_bit(WriteMostly, &rdev->flags);
1839                err = 0;
1840        } else if (cmd_match(buf, "-writemostly")) {
1841                clear_bit(WriteMostly, &rdev->flags);
1842                err = 0;
1843        }
1844        return err ? err : len;
1845}
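/*
 * Example (shell; "md0" and member disk "sdb" are hypothetical):
 *
 *   echo writemostly  > /sys/block/md0/md/dev-sdb/state
 *   echo -writemostly > /sys/block/md0/md/dev-sdb/state
 */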
1846static struct rdev_sysfs_entry rdev_state =
1847__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1848
1849static ssize_t
1850super_show(mdk_rdev_t *rdev, char *page)
1851{
1852        if (rdev->sb_loaded && rdev->sb_size) {
1853                memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
1854                return rdev->sb_size;
1855        } else
1856                return 0;
1857}
1858static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1859
1860static ssize_t
1861errors_show(mdk_rdev_t *rdev, char *page)
1862{
1863        return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1864}
1865
1866static ssize_t
1867errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1868{
1869        char *e;
1870        unsigned long n = simple_strtoul(buf, &e, 10);
1871        if (*buf && (*e == 0 || *e == '\n')) {
1872                atomic_set(&rdev->corrected_errors, n);
1873                return len;
1874        }
1875        return -EINVAL;
1876}
1877static struct rdev_sysfs_entry rdev_errors =
1878__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1879
1880static ssize_t
1881slot_show(mdk_rdev_t *rdev, char *page)
1882{
1883        if (rdev->raid_disk < 0)
1884                return sprintf(page, "none\n");
1885        else
1886                return sprintf(page, "%d\n", rdev->raid_disk);
1887}
1888
1889static ssize_t
1890slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1891{
1892        char *e;
1893        int slot = simple_strtoul(buf, &e, 10);
1894        if (strncmp(buf, "none", 4)==0)
1895                slot = -1;
1896        else if (e==buf || (*e && *e!= '\n'))
1897                return -EINVAL;
1898        if (rdev->mddev->pers)
1899                /* Cannot set slot in active array (yet) */
1900                return -EBUSY;
1901        if (slot >= rdev->mddev->raid_disks)
1902                return -ENOSPC;
1903        rdev->raid_disk = slot;
1904        /* assume it is working */
1905        rdev->flags = 0;
1906        set_bit(In_sync, &rdev->flags);
1907        return len;
1908}
1909
1910
1911static struct rdev_sysfs_entry rdev_slot =
1912__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
1913
1914static ssize_t
1915offset_show(mdk_rdev_t *rdev, char *page)
1916{
1917        return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1918}
1919
1920static ssize_t
1921offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1922{
1923        char *e;
1924        unsigned long long offset = simple_strtoull(buf, &e, 10);
1925        if (e==buf || (*e && *e != '\n'))
1926                return -EINVAL;
1927        if (rdev->mddev->pers)
1928                return -EBUSY;
1929        rdev->data_offset = offset;
1930        return len;
1931}
1932
1933static struct rdev_sysfs_entry rdev_offset =
1934__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
1935
1936static ssize_t
1937rdev_size_show(mdk_rdev_t *rdev, char *page)
1938{
1939        return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1940}
1941
1942static ssize_t
1943rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1944{
1945        char *e;
1946        unsigned long long size = simple_strtoull(buf, &e, 10);
1947        if (e==buf || (*e && *e != '\n'))
1948                return -EINVAL;
1949        if (rdev->mddev->pers)
1950                return -EBUSY;
1951        rdev->size = size;
1952        if (size < rdev->mddev->size || rdev->mddev->size == 0)
1953                rdev->mddev->size = size;
1954        return len;
1955}
1956
1957static struct rdev_sysfs_entry rdev_size =
1958__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
1959
1960static struct attribute *rdev_default_attrs[] = {
1961        &rdev_state.attr,
1962        &rdev_super.attr,
1963        &rdev_errors.attr,
1964        &rdev_slot.attr,
1965        &rdev_offset.attr,
1966        &rdev_size.attr,
1967        NULL,
1968};
1969static ssize_t
1970rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1971{
1972        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1973        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1974
1975        if (!entry->show)
1976                return -EIO;
1977        return entry->show(rdev, page);
1978}
1979
1980static ssize_t
1981rdev_attr_store(struct kobject *kobj, struct attribute *attr,
1982              const char *page, size_t length)
1983{
1984        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1985        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1986
1987        if (!entry->store)
1988                return -EIO;
1989        if (!capable(CAP_SYS_ADMIN))
1990                return -EACCES;
1991        return entry->store(rdev, page, length);
1992}
1993
1994static void rdev_free(struct kobject *ko)
1995{
1996        mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
1997        kfree(rdev);
1998}
1999static struct sysfs_ops rdev_sysfs_ops = {
2000        .show           = rdev_attr_show,
2001        .store          = rdev_attr_store,
2002};
2003static struct kobj_type rdev_ktype = {
2004        .release        = rdev_free,
2005        .sysfs_ops      = &rdev_sysfs_ops,
2006        .default_attrs  = rdev_default_attrs,
2007};
2008
2009/*
2010 * Import a device. If 'super_format' >= 0, then sanity check the superblock
2011 *
2012 * mark the device faulty if:
2013 *
2014 *   - the device is nonexistent (zero size)
2015 *   - the device has no valid superblock
2016 *
2017 * a faulty rdev _never_ has rdev->sb set.
2018 */
2019static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2020{
2021        char b[BDEVNAME_SIZE];
2022        int err;
2023        mdk_rdev_t *rdev;
2024        sector_t size;
2025
2026        rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2027        if (!rdev) {
2028                printk(KERN_ERR "md: could not alloc mem for new device!\n");
2029                return ERR_PTR(-ENOMEM);
2030        }
2031
2032        if ((err = alloc_disk_sb(rdev)))
2033                goto abort_free;
2034
2035        err = lock_rdev(rdev, newdev);
2036        if (err)
2037                goto abort_free;
2038
2039        rdev->kobj.parent = NULL;
2040        rdev->kobj.ktype = &rdev_ktype;
2041        kobject_init(&rdev->kobj);
2042
2043        rdev->desc_nr = -1;
2044        rdev->saved_raid_disk = -1;
2045        rdev->raid_disk = -1;
2046        rdev->flags = 0;
2047        rdev->data_offset = 0;
2048        rdev->sb_events = 0;
2049        atomic_set(&rdev->nr_pending, 0);
2050        atomic_set(&rdev->read_errors, 0);
2051        atomic_set(&rdev->corrected_errors, 0);
2052
2053        size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2054        if (!size) {
2055                printk(KERN_WARNING 
2056                        "md: %s has zero or unknown size, marking faulty!\n",
2057                        bdevname(rdev->bdev,b));
2058                err = -EINVAL;
2059                goto abort_free;
2060        }
2061
2062        if (super_format >= 0) {
2063                err = super_types[super_format].
2064                        load_super(rdev, NULL, super_minor);
2065                if (err == -EINVAL) {
2066                        printk(KERN_WARNING
2067                                "md: %s does not have a valid v%d.%d "
2068                               "superblock, not importing!\n",
2069                                bdevname(rdev->bdev,b),
2070                               super_format, super_minor);
2071                        goto abort_free;
2072                }
2073                if (err < 0) {
2074                        printk(KERN_WARNING 
2075                                "md: could not read %s's sb, not importing!\n",
2076                                bdevname(rdev->bdev,b));
2077                        goto abort_free;
2078                }
2079        }
2080        INIT_LIST_HEAD(&rdev->same_set);
2081
2082        return rdev;
2083
2084abort_free:
2085        if (rdev->sb_page) {
2086                if (rdev->bdev)
2087                        unlock_rdev(rdev);
2088                free_disk_sb(rdev);
2089        }
2090        kfree(rdev);
2091        return ERR_PTR(err);
2092}
2093
2094/*
2095 * Check a full RAID array for plausibility
2096 */
2097
2098
2099static void analyze_sbs(mddev_t * mddev)
2100{
2101        int i;
2102        struct list_head *tmp;
2103        mdk_rdev_t *rdev, *freshest;
2104        char b[BDEVNAME_SIZE];
2105
2106        freshest = NULL;
2107        ITERATE_RDEV(mddev,rdev,tmp)
2108                switch (super_types[mddev->major_version].
2109                        load_super(rdev, freshest, mddev->minor_version)) {
2110                case 1:
2111                        freshest = rdev;
2112                        break;
2113                case 0:
2114                        break;
2115                default:
2116                        printk( KERN_ERR \
2117                                "md: fatal superblock inconsistency in %s"
2118                                " -- removing from array\n", 
2119                                bdevname(rdev->bdev,b));
2120                        kick_rdev_from_array(rdev);
2121                }
2122
2123
2124        super_types[mddev->major_version].
2125                validate_super(mddev, freshest);
2126
2127        i = 0;
2128        ITERATE_RDEV(mddev,rdev,tmp) {
2129                if (rdev != freshest)
2130                        if (super_types[mddev->major_version].
2131                            validate_super(mddev, rdev)) {
2132                                printk(KERN_WARNING "md: kicking non-fresh %s"
2133                                        " from array!\n",
2134                                        bdevname(rdev->bdev,b));
2135                                kick_rdev_from_array(rdev);
2136                                continue;
2137                        }
2138                if (mddev->level == LEVEL_MULTIPATH) {
2139                        rdev->desc_nr = i++;
2140                        rdev->raid_disk = rdev->desc_nr;
2141                        set_bit(In_sync, &rdev->flags);
2142                } else if (rdev->raid_disk >= mddev->raid_disks) {
2143                        rdev->raid_disk = -1;
2144                        clear_bit(In_sync, &rdev->flags);
2145                }
2146        }
2147
2148
2149
2150        if (mddev->recovery_cp != MaxSector &&
2151            mddev->level >= 1)
2152                printk(KERN_ERR "md: %s: raid array is not clean"
2153                       " -- starting background reconstruction\n",
2154                       mdname(mddev));
2155
2156}
2157
2158static ssize_t
2159safe_delay_show(mddev_t *mddev, char *page)
2160{
2161        int msec = (mddev->safemode_delay*1000)/HZ;
2162        return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2163}
2164static ssize_t
2165safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2166{
2167        int scale=1;
2168        int dot=0;
2169        int i;
2170        unsigned long msec;
2171        char buf[30];
2172        char *e;
2173        /* remove a period, and count digits after it */
2174        if (len >= sizeof(buf))
2175                return -EINVAL;
2176        strlcpy(buf, cbuf, sizeof(buf)); /* a size of 'len' would drop the last char */
2177        buf[len] = 0;
2178        for (i=0; i<len; i++) {
2179                if (dot) {
2180                        if (isdigit(buf[i])) {
2181                                buf[i-1] = buf[i];
2182                                scale *= 10;
2183                        }
2184                        buf[i] = 0;
2185                } else if (buf[i] == '.') {
2186                        dot=1;
2187                        buf[i] = 0;
2188                }
2189        }
2190        msec = simple_strtoul(buf, &e, 10);
2191        if (e == buf || (*e && *e != '\n'))
2192                return -EINVAL;
2193        msec = (msec * 1000) / scale;
2194        if (msec == 0)
2195                mddev->safemode_delay = 0;
2196        else {
2197                mddev->safemode_delay = (msec*HZ)/1000;
2198                if (mddev->safemode_delay == 0)
2199                        mddev->safemode_delay = 1;
2200        }
2201        return len;
2202}
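/*
 * Worked example: writing "0.200" leaves buf as "0200" with scale == 1000,
 * so msec == 200 and safemode_delay becomes 200*HZ/1000 jiffies.
 */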
2203static struct md_sysfs_entry md_safe_delay =
2204__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2205
2206static ssize_t
2207level_show(mddev_t *mddev, char *page)
2208{
2209        struct mdk_personality *p = mddev->pers;
2210        if (p)
2211                return sprintf(page, "%s\n", p->name);
2212        else if (mddev->clevel[0])
2213                return sprintf(page, "%s\n", mddev->clevel);
2214        else if (mddev->level != LEVEL_NONE)
2215                return sprintf(page, "%d\n", mddev->level);
2216        else
2217                return 0;
2218}
2219
2220static ssize_t
2221level_store(mddev_t *mddev, const char *buf, size_t len)
2222{
2223        int rv = len;
2224        if (mddev->pers)
2225                return -EBUSY;
2226        if (len == 0)
2227                return 0;
2228        if (len >= sizeof(mddev->clevel))
2229                return -ENOSPC;
2230        strncpy(mddev->clevel, buf, len);
2231        if (mddev->clevel[len-1] == '\n')
2232                len--;
2233        mddev->clevel[len] = 0;
2234        mddev->level = LEVEL_NONE;
2235        return rv;
2236}
2237
2238static struct md_sysfs_entry md_level =
2239__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2240
2241
2242static ssize_t
2243layout_show(mddev_t *mddev, char *page)
2244{
2245        /* just a number, not meaningful for all levels */
2246        if (mddev->reshape_position != MaxSector &&
2247            mddev->layout != mddev->new_layout)
2248                return sprintf(page, "%d (%d)\n",
2249                               mddev->new_layout, mddev->layout);
2250        return sprintf(page, "%d\n", mddev->layout);
2251}
2252
2253static ssize_t
2254layout_store(mddev_t *mddev, const char *buf, size_t len)
2255{
2256        char *e;
2257        unsigned long n = simple_strtoul(buf, &e, 10);
2258
2259        if (!*buf || (*e && *e != '\n'))
2260                return -EINVAL;
2261
2262        if (mddev->pers)
2263                return -EBUSY;
2264        if (mddev->reshape_position != MaxSector)
2265                mddev->new_layout = n;
2266        else
2267                mddev->layout = n;
2268        return len;
2269}
2270static struct md_sysfs_entry md_layout =
2271__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2272
2273
2274static ssize_t
2275raid_disks_show(mddev_t *mddev, char *page)
2276{
2277        if (mddev->raid_disks == 0)
2278                return 0;
2279        if (mddev->reshape_position != MaxSector &&
2280            mddev->delta_disks != 0)
2281                return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2282                               mddev->raid_disks - mddev->delta_disks);
2283        return sprintf(page, "%d\n", mddev->raid_disks);
2284}
2285
2286static int update_raid_disks(mddev_t *mddev, int raid_disks);
2287
2288static ssize_t
2289raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2290{
2291        char *e;
2292        int rv = 0;
2293        unsigned long n = simple_strtoul(buf, &e, 10);
2294
2295        if (!*buf || (*e && *e != '\n'))
2296                return -EINVAL;
2297
2298        if (mddev->pers)
2299                rv = update_raid_disks(mddev, n);
2300        else if (mddev->reshape_position != MaxSector) {
2301                int olddisks = mddev->raid_disks - mddev->delta_disks;
2302                mddev->delta_disks = n - olddisks;
2303                mddev->raid_disks = n;
2304        } else
2305                mddev->raid_disks = n;
2306        return rv ? rv : len;
2307}
2308static struct md_sysfs_entry md_raid_disks =
2309__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2310
2311static ssize_t
2312chunk_size_show(mddev_t *mddev, char *page)
2313{
2314        if (mddev->reshape_position != MaxSector &&
2315            mddev->chunk_size != mddev->new_chunk)
2316                return sprintf(page, "%d (%d)\n", mddev->new_chunk,
2317                               mddev->chunk_size);
2318        return sprintf(page, "%d\n", mddev->chunk_size);
2319}
2320
2321static ssize_t
2322chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2323{
2324        /* can only set chunk_size if array is not yet active */
2325        char *e;
2326        unsigned long n = simple_strtoul(buf, &e, 10);
2327
2328        if (!*buf || (*e && *e != '\n'))
2329                return -EINVAL;
2330
2331        if (mddev->pers)
2332                return -EBUSY;
2333        else if (mddev->reshape_position != MaxSector)
2334                mddev->new_chunk = n;
2335        else
2336                mddev->chunk_size = n;
2337        return len;
2338}
2339static struct md_sysfs_entry md_chunk_size =
2340__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2341
2342static ssize_t
2343resync_start_show(mddev_t *mddev, char *page)
2344{
2345        return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2346}
2347
2348static ssize_t
2349resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2350{
2351        /* can only set resync_start if array is not yet active */
2352        char *e;
2353        unsigned long long n = simple_strtoull(buf, &e, 10);
2354
2355        if (mddev->pers)
2356                return -EBUSY;
2357        if (!*buf || (*e && *e != '\n'))
2358                return -EINVAL;
2359
2360        mddev->recovery_cp = n;
2361        return len;
2362}
2363static struct md_sysfs_entry md_resync_start =
2364__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2365
2366/*
2367 * The array state can be:
2368 *
2369 * clear
2370 *     No devices, no size, no level
2371 *     Equivalent to STOP_ARRAY ioctl
2372 * inactive
2373 *     May have some settings, but array is not active
2374 *        all IO results in error
2375 *     When written, doesn't tear down array, but just stops it
2376 * suspended (not supported yet)
2377 *     All IO requests will block. The array can be reconfigured.
2378 *     Writing this, if accepted, will block until array is quiescent
2379 * readonly
2380 *     no resync can happen.  no superblocks get written.
2381 *     write requests fail
2382 * read-auto
2383 *     like readonly, but behaves like 'clean' on a write request.
2384 *
2385 * clean - no pending writes, but otherwise active.
2386 *     When written to inactive array, starts without resync
2387 *     If a write request arrives then
2388 *       if metadata is known, mark 'dirty' and switch to 'active'.
2389 *       if not known, block and switch to write-pending
2390 *     If written to an active array that has pending writes, then fails.
2391 * active
2392 *     fully active: IO and resync can be happening.
2393 *     When written to inactive array, starts with resync
2394 *
2395 * write-pending
2396 *     clean, but writes are blocked waiting for 'active' to be written.
2397 *
2398 * active-idle
2399 *     like active, but no writes have been seen for a while (100msec).
2400 *
2401 */
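/*
 * Example (shell; an assembled "md0" is assumed):
 *
 *   cat /sys/block/md0/md/array_state            # e.g. "clean"
 *   echo read-auto > /sys/block/md0/md/array_state
 */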
2402enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2403                   write_pending, active_idle, bad_word};
2404static char *array_states[] = {
2405        "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2406        "write-pending", "active-idle", NULL };
2407
2408static int match_word(const char *word, char **list)
2409{
2410        int n;
2411        for (n=0; list[n]; n++)
2412                if (cmd_match(word, list[n]))
2413                        break;
2414        return n;
2415}
2416
2417static ssize_t
2418array_state_show(mddev_t *mddev, char *page)
2419{
2420        enum array_state st = inactive;
2421
2422        if (mddev->pers)
2423                switch(mddev->ro) {
2424                case 1:
2425                        st = readonly;
2426                        break;
2427                case 2:
2428                        st = read_auto;
2429                        break;
2430                case 0:
2431                        if (mddev->in_sync)
2432                                st = clean;
2433                        else if (mddev->safemode)
2434                                st = active_idle;
2435                        else
2436                                st = active;
2437                }
2438        else {
2439                if (list_empty(&mddev->disks) &&
2440                    mddev->raid_disks == 0 &&
2441                    mddev->size == 0)
2442                        st = clear;
2443                else
2444                        st = inactive;
2445        }
2446        return sprintf(page, "%s\n", array_states[st]);
2447}
2448
2449static int do_md_stop(mddev_t * mddev, int ro);
2450static int do_md_run(mddev_t * mddev);
2451static int restart_array(mddev_t *mddev);
2452
2453static ssize_t
2454array_state_store(mddev_t *mddev, const char *buf, size_t len)
2455{
2456        int err = -EINVAL;
2457        enum array_state st = match_word(buf, array_states);
2458        switch(st) {
2459        case bad_word:
2460                break;
2461        case clear:
2462                /* stopping an active array */
2463                if (mddev->pers) {
2464                        if (atomic_read(&mddev->active) > 1)
2465                                return -EBUSY;
2466                        err = do_md_stop(mddev, 0);
2467                }
2468                break;
2469        case inactive:
2470                /* stopping an active array */
2471                if (mddev->pers) {
2472                        if (atomic_read(&mddev->active) > 1)
2473                                return -EBUSY;
2474                        err = do_md_stop(mddev, 2);
2475                }
2476                break;
2477        case suspended:
2478                break; /* not supported yet */
2479        case readonly:
2480                if (mddev->pers)
2481                        err = do_md_stop(mddev, 1);
2482                else {
2483                        mddev->ro = 1;
2484                        err = do_md_run(mddev);
2485                }
2486                break;
2487        case read_auto:
2488                /* stopping an active array */
2489                if (mddev->pers) {
2490                        err = do_md_stop(mddev, 1);
2491                        if (err == 0)
2492                                mddev->ro = 2; /* FIXME mark devices writable */
2493                } else {
2494                        mddev->ro = 2;
2495                        err = do_md_run(mddev);
2496                }
2497                break;
2498        case clean:
2499                if (mddev->pers) {
2500                        restart_array(mddev);
2501                        spin_lock_irq(&mddev->write_lock);
2502                        if ((err = atomic_read(&mddev->writes_pending) ? -EBUSY : 0) == 0) {
2503                                mddev->in_sync = 1;
2504                                set_bit(MD_CHANGE_CLEAN, &mddev->flags);
2505                        }
2506                        spin_unlock_irq(&mddev->write_lock);
2507                } else {
2508                        mddev->ro = 0;
2509                        mddev->recovery_cp = MaxSector;
2510                        err = do_md_run(mddev);
2511                }
2512                break;
2513        case active:
2514                if (mddev->pers) {
2515                        restart_array(mddev);
2516                        clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2517                        wake_up(&mddev->sb_wait);
2518                        err = 0;
2519                } else {
2520                        mddev->ro = 0;
2521                        err = do_md_run(mddev);
2522                }
2523                break;
2524        case write_pending:
2525        case active_idle:
2526                /* these cannot be set */
2527                break;
2528        }
2529        if (err)
2530                return err;
2531        else
2532                return len;
2533}
2534static struct md_sysfs_entry md_array_state =
2535__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2536
2537static ssize_t
2538null_show(mddev_t *mddev, char *page)
2539{
2540        return -EINVAL;
2541}
2542
2543static ssize_t
2544new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2545{
2546        /* buf must be %d:%d, with an optional trailing \n, giving major and minor numbers */
2547        /* The new device is added to the array.
2548         * If the array has a persistent superblock, we read the
2549         * superblock to initialise info and check validity.
2550         * Otherwise, only checking done is that in bind_rdev_to_array,
2551         * which mainly checks size.
2552         */
2553        char *e;
2554        int major = simple_strtoul(buf, &e, 10);
2555        int minor;
2556        dev_t dev;
2557        mdk_rdev_t *rdev;
2558        int err;
2559
2560        if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2561                return -EINVAL;
2562        minor = simple_strtoul(e+1, &e, 10);
2563        if (*e && *e != '\n')
2564                return -EINVAL;
2565        dev = MKDEV(major, minor);
2566        if (major != MAJOR(dev) ||
2567            minor != MINOR(dev))
2568                return -EOVERFLOW;
2569
2570
2571        if (mddev->persistent) {
2572                rdev = md_import_device(dev, mddev->major_version,
2573                                        mddev->minor_version);
2574                if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2575                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2576                                                       mdk_rdev_t, same_set);
2577                        err = super_types[mddev->major_version]
2578                                .load_super(rdev, rdev0, mddev->minor_version);
2579                        if (err < 0)
2580                                goto out;
2581                }
2582        } else
2583                rdev = md_import_device(dev, -1, -1);
2584
2585        if (IS_ERR(rdev))
2586                return PTR_ERR(rdev);
2587        err = bind_rdev_to_array(rdev, mddev);
2588 out:
2589        if (err)
2590                export_rdev(rdev);
2591        return err ? err : len;
2592}
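/*
 * Example (shell; 8:16 is /dev/sdb on typical setups):
 *
 *   echo 8:16 > /sys/block/md0/md/new_dev
 */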
2593
2594static struct md_sysfs_entry md_new_device =
2595__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
2596
2597static ssize_t
2598bitmap_store(mddev_t *mddev, const char *buf, size_t len)
2599{
2600        char *end;
2601        unsigned long chunk, end_chunk;
2602
2603        if (!mddev->bitmap)
2604                goto out;
2605        /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
2606        while (*buf) {
2607                chunk = end_chunk = simple_strtoul(buf, &end, 0);
2608                if (buf == end) break;
2609                if (*end == '-') { /* range */
2610                        buf = end + 1;
2611                        end_chunk = simple_strtoul(buf, &end, 0);
2612                        if (buf == end) break;
2613                }
2614                if (*end && !isspace(*end)) break;
2615                bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
2616                buf = end;
2617                while (isspace(*buf)) buf++;
2618        }
2619        bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
2620out:
2621        return len;
2622}
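/*
 * Example (shell): mark chunks 0-63 and chunk 128 dirty so they are
 * resynced on the next recovery:
 *
 *   echo "0-63 128" > /sys/block/md0/md/bitmap_set_bits
 */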
2623
2624static struct md_sysfs_entry md_bitmap =
2625__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
2626
2627static ssize_t
2628size_show(mddev_t *mddev, char *page)
2629{
2630        return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2631}
2632
2633static int update_size(mddev_t *mddev, unsigned long size);
2634
2635static ssize_t
2636size_store(mddev_t *mddev, const char *buf, size_t len)
2637{
2638        /* If array is inactive, we can reduce the component size, but
2639         * not increase it (except from 0).
2640         * If array is active, we can try an on-line resize
2641         */
2642        char *e;
2643        int err = 0;
2644        unsigned long long size = simple_strtoull(buf, &e, 10);
2645        if (!*buf || *buf == '\n' ||
2646            (*e && *e != '\n'))
2647                return -EINVAL;
2648
2649        if (mddev->pers) {
2650                err = update_size(mddev, size);
2651                md_update_sb(mddev, 1);
2652        } else {
2653                if (mddev->size == 0 ||
2654                    mddev->size > size)
2655                        mddev->size = size;
2656                else
2657                        err = -ENOSPC;
2658        }
2659        return err ? err : len;
2660}
2661
2662static struct md_sysfs_entry md_size =
2663__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
2664
2665
2666/* Metadata version.
2667 * This is either 'none' for arrays with externally managed metadata,
2668 * or N.M for internally known formats
2669 */
2670static ssize_t
2671metadata_show(mddev_t *mddev, char *page)
2672{
2673        if (mddev->persistent)
2674                return sprintf(page, "%d.%d\n",
2675                               mddev->major_version, mddev->minor_version);
2676        else
2677                return sprintf(page, "none\n");
2678}
2679
2680static ssize_t
2681metadata_store(mddev_t *mddev, const char *buf, size_t len)
2682{
2683        int major, minor;
2684        char *e;
2685        if (!list_empty(&mddev->disks))
2686                return -EBUSY;
2687
2688        if (cmd_match(buf, "none")) {
2689                mddev->persistent = 0;
2690                mddev->major_version = 0;
2691                mddev->minor_version = 90;
2692                return len;
2693        }
2694        major = simple_strtoul(buf, &e, 10);
2695        if (e==buf || *e != '.')
2696                return -EINVAL;
2697        buf = e+1;
2698        minor = simple_strtoul(buf, &e, 10);
2699        if (e==buf || (*e && *e != '\n') )
2700                return -EINVAL;
2701        if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
2702                return -ENOENT;
2703        mddev->major_version = major;
2704        mddev->minor_version = minor;
2705        mddev->persistent = 1;
2706        return len;
2707}
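/*
 * Example (shell): "echo 0.90 > metadata_version" selects the classic
 * 0.90 superblock format; "echo none" marks the metadata non-persistent.
 */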
2708
2709static struct md_sysfs_entry md_metadata =
2710__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2711
2712static ssize_t
2713action_show(mddev_t *mddev, char *page)
2714{
2715        char *type = "idle";
2716        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2717            (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
2718                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2719                        type = "reshape";
2720                else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2721                        if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2722                                type = "resync";
2723                        else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2724                                type = "check";
2725                        else
2726                                type = "repair";
2727                } else
2728                        type = "recover";
2729        }
2730        return sprintf(page, "%s\n", type);
2731}
2732
2733static ssize_t
2734action_store(mddev_t *mddev, const char *page, size_t len)
2735{
2736        if (!mddev->pers || !mddev->pers->sync_request)
2737                return -EINVAL;
2738
2739        if (cmd_match(page, "idle")) {
2740                if (mddev->sync_thread) {
2741                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2742                        md_unregister_thread(mddev->sync_thread);
2743                        mddev->sync_thread = NULL;
2744                        mddev->recovery = 0;
2745                }
2746        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2747                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2748                return -EBUSY;
2749        else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2750                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2751        else if (cmd_match(page, "reshape")) {
2752                int err;
2753                if (mddev->pers->start_reshape == NULL)
2754                        return -EINVAL;
2755                err = mddev->pers->start_reshape(mddev);
2756                if (err)
2757                        return err;
2758        } else {
2759                if (cmd_match(page, "check"))
2760                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2761                else if (!cmd_match(page, "repair"))
2762                        return -EINVAL;
2763                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2764                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2765        }
2766        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2767        md_wakeup_thread(mddev->thread);
2768        return len;
2769}
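/*
 * Example (shell): start a read-only consistency check; discrepancies
 * found are accumulated in mismatch_cnt:
 *
 *   echo check > /sys/block/md0/md/sync_action
 */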
2770
2771static ssize_t
2772mismatch_cnt_show(mddev_t *mddev, char *page)
2773{
2774        return sprintf(page, "%llu\n",
2775                       (unsigned long long) mddev->resync_mismatches);
2776}
2777
2778static struct md_sysfs_entry md_scan_mode =
2779__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2780
2781
2782static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
2783
2784static ssize_t
2785sync_min_show(mddev_t *mddev, char *page)
2786{
2787        return sprintf(page, "%d (%s)\n", speed_min(mddev),
2788                       mddev->sync_speed_min ? "local": "system");
2789}
2790
2791static ssize_t
2792sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2793{
2794        int min;
2795        char *e;
2796        if (strncmp(buf, "system", 6)==0) {
2797                mddev->sync_speed_min = 0;
2798                return len;
2799        }
2800        min = simple_strtoul(buf, &e, 10);
2801        if (buf == e || (*e && *e != '\n') || min <= 0)
2802                return -EINVAL;
2803        mddev->sync_speed_min = min;
2804        return len;
2805}
2806
2807static struct md_sysfs_entry md_sync_min =
2808__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2809
2810static ssize_t
2811sync_max_show(mddev_t *mddev, char *page)
2812{
2813        return sprintf(page, "%d (%s)\n", speed_max(mddev),
2814                       mddev->sync_speed_max ? "local": "system");
2815}
2816
2817static ssize_t
2818sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2819{
2820        int max;
2821        char *e;
2822        if (strncmp(buf, "system", 6)==0) {
2823                mddev->sync_speed_max = 0;
2824                return len;
2825        }
2826        max = simple_strtoul(buf, &e, 10);
2827        if (buf == e || (*e && *e != '\n') || max <= 0)
2828                return -EINVAL;
2829        mddev->sync_speed_max = max;
2830        return len;
2831}
2832
2833static struct md_sysfs_entry md_sync_max =
2834__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
2835
2836static ssize_t
2837degraded_show(mddev_t *mddev, char *page)
2838{
2839        return sprintf(page, "%d\n", mddev->degraded);
2840}
2841static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
2842
2843static ssize_t
2844sync_speed_show(mddev_t *mddev, char *page)
2845{
2846        unsigned long resync, dt, db;
2847        resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
2848        dt = ((jiffies - mddev->resync_mark) / HZ);
2849        if (!dt) dt++;
2850        db = resync - (mddev->resync_mark_cnt);
2851        return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2852}
2853
2854static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
2855
2856static ssize_t
2857sync_completed_show(mddev_t *mddev, char *page)
2858{
2859        unsigned long max_blocks, resync;
2860
2861        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2862                max_blocks = mddev->resync_max_sectors;
2863        else
2864                max_blocks = mddev->size << 1;
2865
2866        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2867        return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2868}
2869
2870static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
2871
2872static ssize_t
2873suspend_lo_show(mddev_t *mddev, char *page)
2874{
2875        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
2876}
2877
2878static ssize_t
2879suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
2880{
2881        char *e;
2882        unsigned long long new = simple_strtoull(buf, &e, 10);
2883
2884        if (mddev->pers->quiesce == NULL)
2885                return -EINVAL;
2886        if (buf == e || (*e && *e != '\n'))
2887                return -EINVAL;
2888        if (new >= mddev->suspend_hi ||
2889            (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
2890                mddev->suspend_lo = new;
2891                mddev->pers->quiesce(mddev, 2);
2892                return len;
2893        } else
2894                return -EINVAL;
2895}
2896static struct md_sysfs_entry md_suspend_lo =
2897__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
2898
2899
2900static ssize_t
2901suspend_hi_show(mddev_t *mddev, char *page)
2902{
2903        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
2904}
2905
2906static ssize_t
2907suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
2908{
2909        char *e;
2910        unsigned long long new = simple_strtoull(buf, &e, 10);
2911
2912        if (mddev->pers->quiesce == NULL)
2913                return -EINVAL;
2914        if (buf == e || (*e && *e != '\n'))
2915                return -EINVAL;
2916        if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
2917            (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
2918                mddev->suspend_hi = new;
2919                mddev->pers->quiesce(mddev, 1);
2920                mddev->pers->quiesce(mddev, 0);
2921                return len;
2922        } else
2923                return -EINVAL;
2924}
2925static struct md_sysfs_entry md_suspend_hi =
2926__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
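/*
 * Sketch (values are sector addresses): with suspend_lo still at its
 * initial 0, writing 1024 to suspend_hi asks the personality to
 * suspend I/O to sectors [0, 1024).
 */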
2927
2928static ssize_t
2929reshape_position_show(mddev_t *mddev, char *page)
2930{
2931        if (mddev->reshape_position != MaxSector)
2932                return sprintf(page, "%llu\n",
2933                               (unsigned long long)mddev->reshape_position);
2934        strcpy(page, "none\n");
2935        return 5;
2936}
2937
2938static ssize_t
2939reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
2940{
2941        char *e;
2942        unsigned long long new = simple_strtoull(buf, &e, 10);
2943        if (mddev->pers)
2944                return -EBUSY;
2945        if (buf == e || (*e && *e != '\n'))
2946                return -EINVAL;
2947        mddev->reshape_position = new;
2948        mddev->delta_disks = 0;
2949        mddev->new_level = mddev->level;
2950        mddev->new_layout = mddev->layout;
2951        mddev->new_chunk = mddev->chunk_size;
2952        return len;
2953}
2954
2955static struct md_sysfs_entry md_reshape_position =
2956__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
2957       reshape_position_store);
2958
2959
2960static struct attribute *md_default_attrs[] = {
2961        &md_level.attr,
2962        &md_layout.attr,
2963        &md_raid_disks.attr,
2964        &md_chunk_size.attr,
2965        &md_size.attr,
2966        &md_resync_start.attr,
2967        &md_metadata.attr,
2968        &md_new_device.attr,
2969        &md_safe_delay.attr,
2970        &md_array_state.attr,
2971        &md_reshape_position.attr,
2972        NULL,
2973};
2974
2975static struct attribute *md_redundancy_attrs[] = {
2976        &md_scan_mode.attr,
2977        &md_mismatches.attr,
2978        &md_sync_min.attr,
2979        &md_sync_max.attr,
2980        &md_sync_speed.attr,
2981        &md_sync_completed.attr,
2982        &md_suspend_lo.attr,
2983        &md_suspend_hi.attr,
2984        &md_bitmap.attr,
2985        &md_degraded.attr,
2986        NULL,
2987};
2988static struct attribute_group md_redundancy_group = {
2989        .name = NULL,
2990        .attrs = md_redundancy_attrs,
2991};
2992
2993
2994static ssize_t
2995md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2996{
2997        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2998        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2999        ssize_t rv;
3000
3001        if (!entry->show)
3002                return -EIO;
3003        rv = mddev_lock(mddev);
3004        if (!rv) {
3005                rv = entry->show(mddev, page);
3006                mddev_unlock(mddev);
3007        }
3008        return rv;
3009}
3010
3011static ssize_t
3012md_attr_store(struct kobject *kobj, struct attribute *attr,
3013              const char *page, size_t length)
3014{
3015        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3016        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3017        ssize_t rv;
3018
3019        if (!entry->store)
3020                return -EIO;
3021        if (!capable(CAP_SYS_ADMIN))
3022                return -EACCES;
3023        rv = mddev_lock(mddev);
3024        if (!rv) {
3025                rv = entry->store(mddev, page, length);
3026                mddev_unlock(mddev);
3027        }
3028        return rv;
3029}
3030
3031static void md_free(struct kobject *ko)
3032{
3033        mddev_t *mddev = container_of(ko, mddev_t, kobj);
3034        kfree(mddev);
3035}
3036
3037static struct sysfs_ops md_sysfs_ops = {
3038        .show   = md_attr_show,
3039        .store  = md_attr_store,
3040};
3041static struct kobj_type md_ktype = {
3042        .release        = md_free,
3043        .sysfs_ops      = &md_sysfs_ops,
3044        .default_attrs  = md_default_attrs,
3045};
3046
3047int mdp_major = 0;
3048
3049static struct kobject *md_probe(dev_t dev, int *part, void *data)
3050{
3051        static DEFINE_MUTEX(disks_mutex);
3052        mddev_t *mddev = mddev_find(dev);
3053        struct gendisk *disk;
3054        int partitioned = (MAJOR(dev) != MD_MAJOR);
3055        int shift = partitioned ? MdpMinorShift : 0;
3056        int unit = MINOR(dev) >> shift;
3057
3058        if (!mddev)
3059                return NULL;
3060
3061        mutex_lock(&disks_mutex);
3062        if (mddev->gendisk) {
3063                mutex_unlock(&disks_mutex);
3064                mddev_put(mddev);
3065                return NULL;
3066        }
3067        disk = alloc_disk(1 << shift);
3068        if (!disk) {
3069                mutex_unlock(&disks_mutex);
3070                mddev_put(mddev);
3071                return NULL;
3072        }
3073        disk->major = MAJOR(dev);
3074        disk->first_minor = unit << shift;
3075        if (partitioned)
3076                sprintf(disk->disk_name, "md_d%d", unit);
3077        else
3078                sprintf(disk->disk_name, "md%d", unit);
3079        disk->fops = &md_fops;
3080        disk->private_data = mddev;
3081        disk->queue = mddev->queue;
3082        add_disk(disk);
3083        mddev->gendisk = disk;
3084        mutex_unlock(&disks_mutex);
3085        mddev->kobj.parent = &disk->kobj;
3086        kobject_set_name(&mddev->kobj, "%s", "md");
3087        mddev->kobj.ktype = &md_ktype;
3088        if (kobject_register(&mddev->kobj))
3089                printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3090                       disk->disk_name);
3091        return NULL;
3092}
3093
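    /*
     * Safe-mode support: this timer fires when the array has seen no
     * writes for safemode_delay.  It flags safemode and wakes the md
     * thread, which can then mark the array clean in the superblock
     * until the next write arrives.
     */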
3094static void md_safemode_timeout(unsigned long data)
3095{
3096        mddev_t *mddev = (mddev_t *) data;
3097
3098        mddev->safemode = 1;
3099        md_wakeup_thread(mddev->thread);
3100}
3101
3102static int start_dirty_degraded;
3103
3104static int do_md_run(mddev_t * mddev)
3105{
3106        int err;
3107        int chunk_size;
3108        struct list_head *tmp;
3109        mdk_rdev_t *rdev;
3110        struct gendisk *disk;
3111        struct mdk_personality *pers;
3112        char b[BDEVNAME_SIZE];
3113
3114        if (list_empty(&mddev->disks))
3115                /* cannot run an array with no devices.. */
3116                return -EINVAL;
3117
3118        if (mddev->pers)
3119                return -EBUSY;
3120
3121        /*
3122         * Analyze all RAID superblock(s)
3123         */
3124        if (!mddev->raid_disks)
3125                analyze_sbs(mddev);
3126
3127        chunk_size = mddev->chunk_size;
3128
3129        if (chunk_size) {
3130                if (chunk_size > MAX_CHUNK_SIZE) {
3131                        printk(KERN_ERR "too big chunk_size: %d > %d\n",
3132                                chunk_size, MAX_CHUNK_SIZE);
3133                        return -EINVAL;
3134                }
3135                /*
3136                 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE;
                     * ffz(~chunk_size) is the index of the lowest set bit, so the
                     * test below only passes when exactly one bit is set.
3137                 */
3138                if ( (1 << ffz(~chunk_size)) != chunk_size) {
3139                        printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
3140                        return -EINVAL;
3141                }
3142                if (chunk_size < PAGE_SIZE) {
3143                        printk(KERN_ERR "too small chunk_size: %d < %ld\n",
3144                                chunk_size, PAGE_SIZE);
3145                        return -EINVAL;
3146                }
3147
3148                /* devices must have minimum size of one chunk */
3149                ITERATE_RDEV(mddev,rdev,tmp) {
3150                        if (test_bit(Faulty, &rdev->flags))
3151                                continue;
3152                        if (rdev->size < chunk_size / 1024) {
3153                                printk(KERN_WARNING
3154                                        "md: Dev %s smaller than chunk_size:"
3155                                        " %lluk < %dk\n",
3156                                        bdevname(rdev->bdev,b),
3157                                        (unsigned long long)rdev->size,
3158                                        chunk_size / 1024);
3159                                return -EINVAL;
3160                        }
3161                }
3162        }
3163
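        /*
         * Demand-load the personality module if it is not already present;
         * this relies on the personalities advertising matching
         * "md-level-%d" / "md-%s" MODULE_ALIAS entries (an assumption
         * about the module side, not something enforced here).
         */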
3164#ifdef CONFIG_KMOD
3165        if (mddev->level != LEVEL_NONE)
3166                request_module("md-level-%d", mddev->level);
3167        else if (mddev->clevel[0])
3168                request_module("md-%s", mddev->clevel);
3169#endif
3170
3171        /*
3172         * Drop all container device buffers, from now on
3173         * the only valid external interface is through the md
3174         * device.
3175         */
3176        ITERATE_RDEV(mddev,rdev,tmp) {
3177                if (test_bit(Faulty, &rdev->flags))
3178                        continue;
3179                sync_blockdev(rdev->bdev);
3180                invalidate_bdev(rdev->bdev);
3181
3182                /* Perform some consistency tests on the device.
3183                 * We don't want the data to overlap the metadata;
3184                 * internal-bitmap placement is checked elsewhere.
                     * (data_offset is in sectors, size and sb_offset in KiB,
                     * hence the *2 conversions below.)
3185                 */
3186                if (rdev->data_offset < rdev->sb_offset) {
3187                        if (mddev->size &&
3188                            rdev->data_offset + mddev->size*2
3189                            > rdev->sb_offset*2) {
3190                                printk("md: %s: data overlaps metadata\n",
3191                                       mdname(mddev));
3192                                return -EINVAL;
3193                        }
3194                } else {
3195                        if (rdev->sb_offset*2 + rdev->sb_size/512
3196                            > rdev->data_offset) {
3197                                printk("md: %s: metadata overlaps data\n",
3198                                       mdname(mddev));
3199                                return -EINVAL;
3200                        }
3201                }
3202        }
3203
3204        md_probe(mddev->unit, NULL, NULL);
3205        disk = mddev->gendisk;
3206        if (!disk)
3207                return -ENOMEM;
3208
3209        spin_lock(&pers_lock);
3210        pers = find_pers(mddev->level, mddev->clevel);
3211        if (!pers || !try_module_get(pers->owner)) {
3212                spin_unlock(&pers_lock);
3213                if (mddev->level != LEVEL_NONE)
3214                        printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3215                               mddev->level);
3216                else
3217                        printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3218                               mddev->clevel);
3219                return -EINVAL;
3220        }
3221        mddev->pers = pers;
3222        spin_unlock(&pers_lock);
3223        mddev->level = pers->level;
3224        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3225
3226        if (mddev->reshape_position != MaxSector &&
3227            pers->start_reshape == NULL) {
3228                /* This personality cannot handle reshaping... */
3229                mddev->pers = NULL;
3230                module_put(pers->owner);
3231                return -EINVAL;
3232        }
3233
3234        if (pers->sync_request) {
3235                /* Warn if this is a potentially silly
3236                 * configuration.
3237                 */
3238                char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3239                mdk_rdev_t *rdev2;
3240                struct list_head *tmp2;
3241                int warned = 0;
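                    /*
                     * Visit each unordered pair exactly once (the rdev < rdev2
                     * pointer test) and warn when two members live on one
                     * spindle: for a partition, bd_contains points at the
                     * whole-disk block_device.
                     */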
3242                ITERATE_RDEV(mddev, rdev, tmp) {
3243                        ITERATE_RDEV(mddev, rdev2, tmp2) {
3244                                if (rdev < rdev2 &&
3245                                    rdev->bdev->bd_contains ==
3246                                    rdev2->bdev->bd_contains) {
3247                                        printk(KERN_WARNING
3248                                               "%s: WARNING: %s appears to be"
3249                                               " on the same physical disk as"
3250                                               " %s.\n",
3251                                               mdname(mddev),
3252                                               bdevname(rdev->bdev,b),
3253                                               bdevname(rdev2->bdev,b2));
3254                                        warned = 1;
3255                                }
3256                        }
3257                }
3258                if (warned)
3259                        printk(KERN_WARNING
3260                               "True protection against single-disk"
3261                               " failure might be compromised.\n");
3262        }
3263
3264        mddev->recovery = 0;
3265        mddev->resync_max_sectors = mddev->size << 1; /* may be overridden by personality */
3266        mddev->barriers_work = 1;
3267        mddev->ok_start_degraded = start_dirty_degraded;
3268
3269        if (start_readonly)
3270                mddev->ro = 2; /* read-only, but switch on first write */
3271
3272        err = mddev->pers->run(mddev);
3273        if (!err && mddev->pers->sync_request) {
3274                err = bitmap_create(mddev);
3275                if (err) {
3276                        printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3277                               mdname(mddev), err);
3278                        mddev->pers->stop(mddev);
3279                }
3280        }
3281        if (err) {
3282                printk(KERN_ERR "md: pers->run() failed ...\n");
3283                module_put(mddev->pers->owner);
3284                mddev->pers = NULL;
3285                bitmap_destroy(mddev);
3286                return err;
3287        }
3288        if (mddev->pers->sync_request) {
3289                if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3290                        printk(KERN_WARNING
3291                               "md: cannot register extra attributes for %s\n",
3292                               mdname(mddev));
3293        } else if (mddev->ro == 2) /* auto-readonly not meaningful */
3294                mddev->ro = 0;
3295
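        /*
         * Initialise the clean/dirty machinery: writes_pending counts
         * in-flight writes, and once it drains and the safemode timer
         * fires, the md thread marks the array clean again, keeping the
         * window in which a crash would force a full resync small.
         */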
3296        atomic_set(&mddev->writes_pending,0);
3297        mddev->safemode = 0;
3298        mddev->safemode_timer.function = md_safemode_timeout;
3299        mddev->safemode_timer.data = (unsigned long) mddev;
3300        mddev->safemode_delay = (200 * HZ)/1000 + 1; /* 200 msec delay */
3301        mddev->in_sync = 1;
3302
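        /*
         * Expose each active member as an "rd<slot>" symlink in the
         * array's md/ sysfs directory, pointing at the member's kobject.
         */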
3303        ITERATE_RDEV(mddev,rdev,tmp)
3304                if (rdev->raid_disk >= 0) {
3305                        char nm[20];
3306                        sprintf(nm, "rd%d", rdev->raid_disk);
3307                        if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
3308                                printk("md: cannot register %s for %s\n",
3309                                       nm, mdname(mddev));
3310                }
3311        
3312        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3313        
3314        if (mddev->flags)
3315                md_update_sb(mddev, 0);
3316
3317        set_capacity(disk, mddev->array_size<<1);
3318
3319        /* If we call blk_queue_make_request here, it will
3320         * re-initialise max_sectors etc which may have been
3321         * refined inside ->run().  So just set the bits we need to set.
3322         * Most initialisation happened when we called
3323         * blk_queue_make_request(..., md_fail_request)
3324         * earlier.
3325         */
3326        mddev->queue->queuedata = mddev;
3327        mddev->queue->make_request_fn = mddev->pers->make_request;
3328
3329        /* If there is a partially-recovered drive we need to
3330         * start recovery here.  If we leave it to md_check_recovery,
3331         * it will remove the drives and not do the right thing
3332         */
3333        if (mddev->degraded && !mddev->sync_thread) {
3334                struct list_head *rtmp;
3335                int spares = 0;
3336                ITERATE_RDEV(mddev,rdev,rtmp)
3337                        if (rdev->raid_disk >= 0 &&
3338                            !test_bit(In_sync, &rdev->flags) &&
3339                            !test_bit(Faulty, &rdev->flags))
3340                                /* complete an interrupted recovery */
3341                                spares++;
3342                if (spares && mddev->pers->sync_request) {
3343                        mddev->recovery = 0;
3344                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3345                        mddev->sync_thread = md_register_thread(md_do_sync,
3346                                                                mddev,
3347                                                                "%s_resync");
3348                        if (!mddev->sync_thread) {
3349                                printk(KERN_ERR "%s: could not start resync"
3350                                       " thread...\n",
3351                                       mdname(mddev));
3352                                /* leave the spares where they are, it shouldn't hurt */
3353                                mddev->recovery = 0;
3354                        }
3355                }
3356        }
3357        md_wakeup_thread(mddev->thread);
3358        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
3359
3360        mddev->changed = 1;
3361        md_new_event(mddev);
3362        kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
3363        return 0;
3364}
3365
3366static int restart_array(mddev_t *mddev)
3367{
3368        struct gendisk *disk = mddev->gendisk;
3369        int err;
3370
3371        /*
3372         * Complain if it has no devices
3373         */
3374        err = -ENXIO;
3375        if (list_empty(&mddev->disks))
3376                goto out;
3377
3378        if (mddev->pers) {
3379                err = -EBUSY;
3380                if (!mddev->ro)
3381                        goto out;
3382
3383                mddev->safemode = 0;
3384                mddev->ro = 0;
3385                set_disk_ro(disk, 0);
3386
3387                printk(KERN_INFO "md: %s switched to read-write mode.\n",
3388                        mdname(mddev));
3389                /*
3390                 * Kick recovery or resync if necessary
3391                 */
3392                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3393                md_wakeup_thread(mddev->thread);
3394                md_wakeup_thread(mddev->sync_thread);
3395                err = 0;
3396        } else
3397                err = -EINVAL;
3398
3399out:
3400        return err;
3401}
3402
3403/* similar to deny_write_access, but accounts for our holding a reference
3404 * to the file ourselves */
3405static int deny_bitmap_write_access(struct file * file)
3406{
3407        struct inode *inode = file->f_mapping->host;
3408
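        /*
         * The bitmap file reaches us already open for write, so an
         * i_writecount of exactly 1 is our caller's descriptor; anything
         * higher means an independent writer.  Parking the count at -1
         * makes later write opens fail, mirroring deny_write_access().
         */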
3409        spin_lock(&inode->i_lock);
3410        if (atomic_read(&inode->i_writecount) > 1) {
3411                spin_unlock(&inode->i_lock);
3412                return -ETXTBSY;
3413        }
3414        atomic_set(&inode->i_writecount, -1);
3415        spin_unlock(&inode->i_lock);
3416
3417        return 0;
3418}
3419
3420static void restore_bitmap_write_access(struct file *file)
3421{
3422        struct inode *inode = file->f_mapping->host;
3423
3424        spin_lock(&inode->i_lock);
3425        atomic_set(&inode->i_writecount, 1);
3426        spin_unlock(&inode->i_lock);
3427}
3428
3429/* mode:
3430 *   0 - completely stop and disassemble array
3431 *   1 - switch to readonly
3432 *   2 - stop but do not disassemble array
3433 */
3434static int do_md_stop(mddev_t * mddev, int mode)
3435{
3436        int err = 0;
3437        struct gendisk *disk = mddev->gendisk;
3438
3439        if (mddev->pers) {
3440                if (atomic_read(&mddev->active)>2) {
3441                        printk("md: %s still in use.\n",mdname(mddev));
3442                        return -EBUSY;
3443                }
3444
3445                if (mddev->sync_thread) {
3446                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3447                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3448                        md_unregister_thread(mddev->sync_thread);
3449                        mddev->sync_thread = NULL;
3450                }
3451
3452                del_timer_sync(&mddev->safemode_timer);
3453
3454                invalidate_partition(disk, 0);
3455
3456                switch(mode) {
3457                case 1: /* readonly */
3458                        err  = -ENXIO;
3459                        if (mddev->ro==1)
3460                                goto out;
3461                        mddev->ro = 1;
3462                        break;
3463                case 0: /* disassemble */
3464                case 2: /* stop */
3465                        bitmap_flush(mddev);
3466                        md_super_wait(mddev);
3467                        if (mddev->ro)
3468                                set_disk_ro(disk, 0);
3469                        blk_queue_make_request(mddev->queue, md_fail_request);
3470                        mddev->pers->stop(mddev);
3471                        mddev->queue->merge_bvec_fn = NULL;
3472                        mddev->queue->unplug_fn = NULL;
3473                        mddev->queue->backing_dev_info.congested_fn = NULL;
3474                        if (mddev->pers->sync_request)
3475                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3476
3477                        module_put(mddev->pers->owner);
3478                        mddev->pers = NULL;
3479
3480                        set_capacity(disk, 0);
3481                        mddev->changed = 1;
3482
3483                        if (mddev->ro)
3484                                mddev->ro = 0;
3485                }
3486                if (!mddev->in_sync || mddev->flags) {
3487                        /* mark array as shutdown cleanly */
3488                        mddev->in_sync = 1;
3489                        md_update_sb(mddev, 1);
3490                }
3491                if (mode == 1)
3492                        set_disk_ro(disk, 1);
3493                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3494        }
3495
3496        /*
3497         * Free resources if final stop
3498         */
3499        if (mode == 0) {
3500                mdk_rdev_t *rdev;
3501                struct list_head *tmp;
3502
3503                printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3504
3505                bitmap_destroy(mddev);
3506                if (mddev->bitmap_file) {
3507                        restore_bitmap_write_access(mddev->bitmap_file);
3508                        fput(mddev->bitmap_file);
3509                        mddev->bitmap_file = NULL;
3510                }
3511                mddev->bitmap_offset = 0;
3512
3513                ITERATE_RDEV(mddev,rdev,tmp)
3514                        if (rdev->raid_disk >= 0) {
3515                                char nm[20];
3516                                sprintf(nm, "rd%d", rdev->raid_disk);
3517                                sysfs_remove_link(&mddev->kobj, nm);
3518                        }
3519
3520                /* make sure all delayed_delete calls have finished */
3521                flush_scheduled_work();
3522
3523                export_array(mddev);
3524
3525                mddev->array_size = 0;
3526                mddev->size = 0;
3527                mddev->raid_disks = 0;
3528                mddev->recovery_cp = 0;
3529                mddev->reshape_position = MaxSector;
3530
3531        } else if (mddev->pers)
3532                printk(KERN_INFO "md: %s switched to read-only mode.\n",
3533                        mdname(mddev));
3534        err = 0;
3535        md_new_event(mddev);
3536out:
3537        return err;
3538}
3539
3540#ifndef MODULE
3541static void autorun_array(mddev_t *mddev)
3542{
3543        mdk_rdev_t *rdev;
3544        struct list_head *tmp;
3545        int err;
3546
3547        if (list_empty(&mddev->disks))
3548                return;
3549
3550        printk(KERN_INFO "md: running: ");
3551
3552        ITERATE_RDEV(mddev,rdev,tmp) {
3553                char b[BDEVNAME_SIZE];
3554                printk("<%s>", bdevname(rdev->bdev,b));
3555        }
3556        printk("\n");
3557
3558        err = do_md_run(mddev);
3559        if (err) {
3560                printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3561                do_md_stop(mddev, 0);
3562        }
3563}
3564
3565/*
3566 * let's try to run arrays based on all disks that have arrived
3567 * until now. (those are in pending_raid_disks)
3568 *
3569 * the method: pick the first pending disk, collect all disks with
3570 * the same UUID, remove all from the pending list and put them into
3571 * the 'same_array' list. Then order this list based on superblock
3572 * update time (freshest comes first), kick out 'old' disks and
3573 * compare superblocks. If everything's fine then run it.
3574 *
3575 * If "unit" is allocated, then bump its reference count
3576 */
3577static void autorun_devices(int part)
3578{
3579        struct list_head *tmp;
3580        mdk_rdev_t *rdev0, *rdev;
3581        mddev_t *mddev;
3582        char b[BDEVNAME_SIZE];
3583
3584        printk(KERN_INFO "md: autorun ...\n");
3585        while (!list_empty(&pending_raid_disks)) {
3586                int unit;
3587                dev_t dev;
3588                LIST_HEAD(candidates);
3589                rdev0 = list_entry(pending_raid_disks.next,
3590                                         mdk_rdev_t, same_set);
3591
3592                printk(KERN_INFO "md: considering %s ...\n",
3593                        bdevname(rdev0->bdev,b));
3594                INIT_LIST_HEAD(&candidates);
3595                ITERATE_RDEV_PENDING(rdev,tmp)
3596                        if (super_90_load(rdev, rdev0, 0) >= 0) {
3597                                printk(KERN_INFO "md:  adding %s ...\n",
3598                                        bdevname(rdev->bdev,b));
3599                                list_move(&rdev->same_set, &candidates);
3600                        }
3601                /*
3602                 * now we have a set of devices, with all of them having
3603                 * mostly sane superblocks. It's time to allocate the
3604                 * mddev.
3605                 */
3606                if (part) {
3607                        dev = MKDEV(mdp_major,
3608                                    rdev0->preferred_minor << MdpMinorShift);
3609                        unit = MINOR(dev) >> MdpMinorShift;
3610                } else {
3611                        dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
3612                        unit = MINOR(dev);
3613                }
3614                if (rdev0->preferred_minor != unit) {
3615                        printk(KERN_INFO "md: unit number in %s is bad: %d\n",
3616                               bdevname(rdev0->bdev, b), rdev0->preferred_minor);
3617                        break;
3618                }
3619
3620                md_probe(dev, NULL, NULL);
3621                mddev = mddev_find(dev);
3622                if (!mddev) {
3623                        printk(KERN_ERR 
3624                                "md: cannot allocate memory for md drive.\n");
3625                        break;
3626                }
3627                if (mddev_lock(mddev)) 
3628                        printk(KERN_WARNING "md: %s locked, cannot run\n",
3629                               mdname(mddev));
3630                else if (mddev->raid_disks || mddev->major_version
3631                         || !list_empty(&mddev->disks)) {
3632                        printk(KERN_WARNING 
3633                                "md: %s already running, cannot run %s\n",
3634                                mdname(mddev), bdevname(rdev0->bdev,b));
3635                        mddev_unlock(mddev);
3636                } else {
3637                        printk(KERN_INFO "md: created %s\n", mdname(mddev));
3638                        ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
3639                                list_del_init(&rdev->same_set);
3640                                if (bind_rdev_to_array(rdev, mddev))
3641                                        export_rdev(rdev);
3642                        }
3643                        autorun_array(mddev);
3644                        mddev_unlock(mddev);
3645                }
3646                /* on success, candidates will be empty, on error
3647                 * it won't...
3648                 */
3649                ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
3650                        export_rdev(rdev);
3651                mddev_put(mddev);
3652        }
3653        printk(KERN_INFO "md: ... autorun DONE.\n");
3654}
3655#endif /* !MODULE */
3656
3657static int get_version(void __user * arg)
3658{
3659        mdu_version_t ver;
3660
3661        ver.major = MD_MAJOR_VERSION;
3662        ver.minor = MD_MINOR_VERSION;
3663        ver.patchlevel = MD_PATCHLEVEL_VERSION;
3664
3665        if (copy_to_user(arg, &ver, sizeof(ver)))
3666                return -EFAULT;
3667
3668        return 0;
3669}
3670
3671static int get_array_info(mddev_t * mddev, void __user * arg)
3672{
3673        mdu_array_info_t info;
3674        int nr,working,active,failed,spare;
3675        mdk_rdev_t *rdev;
3676        struct list_head *tmp;
3677
3678        nr=working=active=failed=spare=0;
3679        ITERATE_RDEV(mddev,rdev,tmp) {
3680                nr++;
3681                if (test_bit(Faulty, &rdev->flags))
3682                        failed++;
3683                else {
3684                        working++;
3685                        if (test_bit(In_sync, &rdev->flags))
3686                                active++;       
3687                        else
3688                                spare++;
3689                }
3690        }
3691
3692        info.major_version = mddev->major_version;
3693        info.minor_version = mddev->minor_version;
3694        info.patch_version = MD_PATCHLEVEL_VERSION;
3695        info.ctime         = mddev->ctime;
3696        info.level         = mddev->level;
3697        info.size          = mddev->size;
3698        if (info.size != mddev->size) /* overflow */
3699                info.size = -1;
3700        info.nr_disks      = nr;
3701        info.raid_disks    = mddev->raid_disks;
3702        info.md_minor      = mddev->md_minor;
3703        info.not_persistent= !mddev->persistent;
3704
3705        info.utime         = mddev->utime;
3706        info.state         = 0;
3707        if (mddev->in_sync)
3708                info.state = (1<<MD_SB_CLEAN);
3709        if (mddev->bitmap && mddev->bitmap_offset)
3710        info.state |= (1<<MD_SB_BITMAP_PRESENT); /* don't clobber MD_SB_CLEAN */
3711        info.active_disks  = active;
3712        info.working_disks = working;
3713        info.failed_disks  = failed;
3714        info.spare_disks   = spare;
3715
3716        info.layout        = mddev->layout;
3717        info.chunk_size    = mddev->chunk_size;
3718
3719        if (copy_to_user(arg, &info, sizeof(info)))
3720                return -EFAULT;
3721
3722        return 0;
3723}
3724
3725static int get_bitmap_file(mddev_t * mddev, void __user * arg)
3726{
3727        mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
3728        char *ptr, *buf = NULL;
3729        int err = -ENOMEM;
3730
3731        md_allow_write(mddev);
3732
3733        file = kmalloc(sizeof(*file), GFP_KERNEL);
3734        if (!file)
3735                goto out;
3736
3737        /* bitmap disabled, zero the first byte and copy out */
3738        if (!mddev->bitmap || !mddev->bitmap->file) {
3739                file->pathname[0] = '\0';
3740                goto copy_out;
3741        }
3742
3743        buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
3744        if (!buf)
3745                goto out;
3746
3747        ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
3748        if (!ptr)
3749                goto out;
3750
3751        strcpy(file->pathname, ptr);
3752
3753copy_out:
3754        err = 0;
3755        if (copy_to_user(arg, file, sizeof(*file)))
3756                err = -EFAULT;
3757out:
3758        kfree(buf);
3759        kfree(file);
3760        return err;
3761}
3762
3763static int get_disk_info(mddev_t * mddev, void __user * arg)
3764{
3765        mdu_disk_info_t info;
3766        unsigned int nr;
3767        mdk_rdev_t *rdev;
3768
3769        if (copy_from_user(&info, arg, sizeof(info)))
3770                return -EFAULT;
3771
3772        nr = info.number;
3773
3774        rdev = find_rdev_nr(mddev, nr);
3775        if (rdev) {
3776                info.major = MAJOR(rdev->bdev->bd_dev);
3777                info.minor = MINOR(rdev->bdev->bd_dev);
3778                info.raid_disk = rdev->raid_disk;
3779                info.state = 0;
3780                if (test_bit(Faulty, &rdev->flags))
3781                        info.state |= (1<<MD_DISK_FAULTY);
3782                else if (test_bit(In_sync, &rdev->flags)) {
3783                        info.state |= (1<<MD_DISK_ACTIVE);
3784                        info.state |= (1<<MD_DISK_SYNC);
3785                }
3786                if (test_bit(WriteMostly, &rdev->flags))
3787                        info.state |= (1<<MD_DISK_WRITEMOSTLY);
3788        } else {
3789                info.major = info.minor = 0;
3790                info.raid_disk = -1;
3791                info.state = (1<<MD_DISK_REMOVED);
3792        }
3793
3794        if (copy_to_user(arg, &info, sizeof(info)))
3795                return -EFAULT;
3796
3797        return 0;
3798}
3799
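    /*
     * add_new_disk() is reached in three situations: assembling a not yet
     * started array (no raid_disks; the device carries a superblock),
     * hot-adding a spare to a running array (mddev->pers set), or building
     * a brand-new v0.90 array device by device (the final case below).
     */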
3800static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3801{
3802        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3803        mdk_rdev_t *rdev;
3804        dev_t dev = MKDEV(info->major,info->minor);
3805
3806        if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
3807                return -EOVERFLOW;
3808
3809        if (!mddev->raid_disks) {
3810                int err;
3811                /* expecting a device which has a superblock */
3812                rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
3813                if (IS_ERR(rdev)) {
3814                        printk(KERN_WARNING 
3815                                "md: md_import_device returned %ld\n",
3816                                PTR_ERR(rdev));
3817                        return PTR_ERR(rdev);
3818                }
3819                if (!list_empty(&mddev->disks)) {
3820                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3821                                                        mdk_rdev_t, same_set);
3822                        int err = super_types[mddev->major_version]
3823                                .load_super(rdev, rdev0, mddev->minor_version);
3824                        if (err < 0) {
3825                                printk(KERN_WARNING 
3826                                        "md: %s has different UUID to %s\n",
3827                                        bdevname(rdev->bdev,b), 
3828                                        bdevname(rdev0->bdev,b2));
3829                                export_rdev(rdev);
3830                                return -EINVAL;
3831                        }
3832                }
3833                err = bind_rdev_to_array(rdev, mddev);
3834                if (err)
3835                        export_rdev(rdev);
3836                return err;
3837        }
3838
3839        /*
3840         * add_new_disk can be used once the array is assembled
3841         * to add "hot spares".  They must already have a superblock
3842         * written
3843         */
3844        if (mddev->pers) {
3845                int err;
3846                if (!mddev->pers->hot_add_disk) {
3847                        printk(KERN_WARNING 
3848                                "%s: personality does not support diskops!\n",
3849                               mdname(mddev));
3850                        return -EINVAL;
3851                }
3852                if (mddev->persistent)
3853                        rdev = md_import_device(dev, mddev->major_version,
3854                                                mddev->minor_version);
3855                else
3856                        rdev = md_import_device(dev, -1, -1);
3857                if (IS_ERR(rdev)) {
3858                        printk(KERN_WARNING 
3859                                "md: md_import_device returned %ld\n",
3860                                PTR_ERR(rdev));
3861                        return PTR_ERR(rdev);
3862                }
3863                /* set save_raid_disk if appropriate */
3864                if (!mddev->persistent) {
3865                        if (info->state & (1<<MD_DISK_SYNC)  &&
3866                            info->raid_disk < mddev->raid_disks)
3867                                rdev->raid_disk = info->raid_disk;
3868                        else
3869                                rdev->raid_disk = -1;
3870                } else
3871                        super_types[mddev->major_version].
3872                                validate_super(mddev, rdev);
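                    /*
                     * Remember which slot the device last occupied: recovery
                     * can try to put it back there and, when a bitmap is
                     * present, resync only what changed.  raid_disk itself
                     * is cleared below so the personality picks the slot.
                     */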
3873                rdev->saved_raid_disk = rdev->raid_disk;
3874
3875                clear_bit(In_sync, &rdev->flags); /* just to be sure */
3876                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3877                        set_bit(WriteMostly, &rdev->flags);
3878
3879                rdev->raid_disk = -1;
3880                err = bind_rdev_to_array(rdev, mddev);
3881                if (!err && !mddev->pers->hot_remove_disk) {
3882                        /* If there is hot_add_disk but no hot_remove_disk
3883                         * then added disks are for geometry changes,
3884                         * and should be added immediately.
3885                         */
3886                        super_types[mddev->major_version].
3887                                validate_super(mddev, rdev);
3888                        err = mddev->pers->hot_add_disk(mddev, rdev);
3889                        if (err)
3890                                unbind_rdev_from_array(rdev);
3891                }
3892                if (err)
3893                        export_rdev(rdev);
3894
3895                md_update_sb(mddev, 1);
3896                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3897                md_wakeup_thread(mddev->thread);
3898                return err;
3899        }
3900
3901        /* otherwise, add_new_disk is only allowed
3902         * for major_version==0 superblocks
3903         */
3904        if (mddev->major_version != 0) {
3905                printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
3906                       mdname(mddev));
3907                return -EINVAL;
3908        }
3909
3910        if (!(info->state & (1<<MD_DISK_FAULTY))) {
3911                int err;
3912                rdev = md_import_device(dev, -1, 0);
3913                if (IS_ERR(rdev)) {
3914                        printk(KERN_WARNING 
3915                                "md: error, md_import_device() returned %ld\n",
3916                                PTR_ERR(rdev));
3917                        return PTR_ERR(rdev);
3918                }
3919                rdev->desc_nr = info->number;
3920                if (info->raid_disk < mddev->raid_disks)
3921                        rdev->raid_disk = info->raid_disk;
3922                else
3923                        rdev->raid_disk = -1;
3924
3925                rdev->flags = 0;
3926
3927                if (rdev->raid_disk < mddev->raid_disks)
3928                        if (info->state & (1<<MD_DISK_SYNC))
3929                                set_bit(In_sync, &rdev->flags);
3930
3931                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3932                        set_bit(WriteMostly, &rdev->flags);
3933
3934                if (!mddev->persistent) {
3935                        printk(KERN_INFO "md: nonpersistent superblock ...\n");
3936                        rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3937                } else 
3938                        rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3939                rdev->size = calc_dev_size(rdev, mddev->chunk_size);
3940
3941                err = bind_rdev_to_array(rdev, mddev);
3942                if (err) {
3943                        export_rdev(rdev);
3944                        return err;
3945                }
3946        }
3947
3948        return 0;
3949}
3950
3951static int hot_remove_disk(mddev_t * mddev, dev_t dev)
3952{
3953        char b[BDEVNAME_SIZE];
3954        mdk_rdev_t *rdev;
3955
3956        if (!mddev->pers)
3957                return -ENODEV;
3958
3959        rdev = find_rdev(mddev, dev);
3960        if (!rdev)
3961                return -ENXIO;
3962
3963        if (rdev->raid_disk >= 0)
3964                goto busy;
3965
3966        kick_rdev_from_array(rdev);
3967        md_update_sb(mddev, 1);
3968        md_new_event(mddev);
3969
3970        return 0;
3971busy:
3972        printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
3973                bdevname(rdev->bdev,b), mdname(mddev));
3974        return -EBUSY;
3975}
3976
3977static int hot_add_disk(mddev_t * mddev, dev_t dev)
3978{
3979        char b[BDEVNAME_SIZE];
3980        int err;
3981        unsigned int size;
3982        mdk_rdev_t *rdev;
3983
3984        if (!mddev->pers)
3985                return -ENODEV;
3986
3987        if (mddev->major_version != 0) {
3988                printk(KERN_WARNING "%s: HOT_ADD may only be used with"
3989                        " version-0 superblocks.\n",
3990                        mdname(mddev));
3991                return -EINVAL;
3992        }
3993        if (!mddev->pers->hot_add_disk) {
3994                printk(KERN_WARNING 
3995                        "%s: personality does not support diskops!\n",
3996                        mdname(mddev));
3997                return -EINVAL;
3998        }
3999
4000        rdev = md_import_device(dev, -1, 0);
4001        if (IS_ERR(rdev)) {
4002                printk(KERN_WARNING 
4003                        "md: error, md_import_device() returned %ld\n",
4004                        PTR_ERR(rdev));
4005                return -EINVAL;
4006        }
4007
4008        if (mddev->persistent)
4009                rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
4010        else
4011                rdev->sb_offset =
4012                        rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4013
4014        size = calc_dev_size(rdev, mddev->chunk_size);
4015        rdev->size = size;
4016
4017        if (test_bit(Faulty, &rdev->flags)) {
4018                printk(KERN_WARNING 
4019                        "md: can not hot-add faulty %s disk to %s!\n",
4020                        bdevname(rdev->bdev,b), mdname(mddev));
4021                err = -EINVAL;
4022                goto abort_export;
4023        }
4024        clear_bit(In_sync, &rdev->flags);
4025        rdev->desc_nr = -1;
4026        rdev->saved_raid_disk = -1;
4027        err = bind_rdev_to_array(rdev, mddev);
4028        if (err)
4029                goto abort_export;
4030
4031        /*
4032         * The rest should better be atomic, we can have disk failures
4033         * noticed in interrupt contexts ...
4034         */
4035
4036        if (rdev->desc_nr == mddev->max_disks) {
4037                printk(KERN_WARNING "%s: can not hot-add to full array!\n",
4038                        mdname(mddev));
4039                err = -EBUSY;
4040                goto abort_unbind_export;
4041        }
4042
4043        rdev->raid_disk = -1;
4044
4045        md_update_sb(mddev, 1);
4046
4047        /*
4048         * Kick recovery, maybe this spare has to be added to the
4049         * array immediately.
4050         */
4051        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4052        md_wakeup_thread(mddev->thread);
4053        md_new_event(mddev);
4054        return 0;
4055
4056abort_unbind_export:
4057        unbind_rdev_from_array(rdev);
4058
4059abort_export:
4060        export_rdev(rdev);
4061        return err;
4062}
4063
4064static int set_bitmap_file(mddev_t *mddev, int fd)
4065{
4066        int err;
4067
4068        if (mddev->pers) {
4069                if (!mddev->pers->quiesce)
4070                        return -EBUSY;
4071                if (mddev->recovery || mddev->sync_thread)
4072                        return -EBUSY;
4073                /* we should be able to change the bitmap.. */
4074        }
4075
4076
4077        if (fd >= 0) {
4078                if (mddev->bitmap)
4079                        return -EEXIST; /* cannot add when bitmap is present */
4080                mddev->bitmap_file = fget(fd);
4081
4082                if (mddev->bitmap_file == NULL) {
4083                        printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4084                               mdname(mddev));
4085                        return -EBADF;
4086                }
4087
4088                err = deny_bitmap_write_access(mddev->bitmap_file);
4089                if (err) {
4090                        printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4091                               mdname(mddev));
4092                        fput(mddev->bitmap_file);
4093                        mddev->bitmap_file = NULL;
4094                        return err;
4095                }
4096                mddev->bitmap_offset = 0; /* file overrides offset */
4097        } else if (mddev->bitmap == NULL)
4098                return -ENOENT; /* cannot remove what isn't there */
4099        err = 0;
4100        if (mddev->pers) {
4101                mddev->pers->quiesce(mddev, 1);
4102                if (fd >= 0)
4103                        err = bitmap_create(mddev);
4104                if (fd < 0 || err) {
4105                        bitmap_destroy(mddev);
4106                        fd = -1; /* make sure to put the file */
4107                }
4108                mddev->pers->quiesce(mddev, 0);
4109        }
4110        if (fd < 0) {
4111                if (mddev->bitmap_file) {
4112                        restore_bitmap_write_access(mddev->bitmap_file);
4113                        fput(mddev->bitmap_file);
4114                }
4115                mddev->bitmap_file = NULL;
4116        }
4117
4118        return err;
4119}
4120
4121/*
4122 * set_array_info is used two different ways
4123 * The original usage is when creating a new array.
4124 * In this usage, raid_disks is > 0 and it together with
4125 *  level, size, not_persistent,layout,chunksize determine the
4126 *  shape of the array.
4127 *  This will always create an array with a type-0.90.0 superblock.
4128 * The newer usage is when assembling an array.
4129 *  In this case raid_disks will be 0, and the major_version field is
4130 *  used to determine which style super-blocks are to be found on the devices.
4131 *  The minor and patch _version numbers are also kept in case the
4132 *  super_block handler wishes to interpret them.
4133 */
4134static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4135{
4136
4137        if (info->raid_disks == 0) {
4138                /* just setting version number for superblock loading */
4139                if (info->major_version < 0 ||
4140                    info->major_version >= ARRAY_SIZE(super_types) ||
4141                    super_types[info->major_version].name == NULL) {
4142                        /* maybe try to auto-load a module? */
4143                        printk(KERN_INFO 
4144                                "md: superblock version %d not known\n",
4145                                info->major_version);
4146                        return -EINVAL;
4147                }
4148                mddev->major_version = info->major_version;
4149                mddev->minor_version = info->minor_version;
4150                mddev->patch_version = info->patch_version;
4151                mddev->persistent = !info->not_persistent;
4152                return 0;
4153        }
4154        mddev->major_version = MD_MAJOR_VERSION;
4155        mddev->minor_version = MD_MINOR_VERSION;
4156        mddev->patch_version = MD_PATCHLEVEL_VERSION;
4157        mddev->ctime         = get_seconds();
4158
4159        mddev->level         = info->level;
4160        mddev->clevel[0]     = 0;
4161        mddev->size          = info->size;
4162        mddev->raid_disks    = info->raid_disks;
4163        /* don't set md_minor, it is determined by which /dev/md* was
4164         * opened
4165         */
4166        if (info->state & (1<<MD_SB_CLEAN))
4167                mddev->recovery_cp = MaxSector;
4168        else
4169                mddev->recovery_cp = 0;
4170        mddev->persistent    = ! info->not_persistent;
4171
4172        mddev->layout        = info->layout;
4173        mddev->chunk_size    = info->chunk_size;
4174
4175        mddev->max_disks     = MD_SB_DISKS;
4176
4177        mddev->flags         = 0;
4178        set_bit(MD_CHANGE_DEVS, &mddev->flags);
4179
4180        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4181        mddev->bitmap_offset = 0;
4182
4183        mddev->reshape_position = MaxSector;
4184
4185        /*
4186         * Generate a 128 bit UUID
4187         */
4188        get_random_bytes(mddev->uuid, 16);
4189
4190        mddev->new_level = mddev->level;
4191        mddev->new_chunk = mddev->chunk_size;
4192        mddev->new_layout = mddev->layout;
4193        mddev->delta_disks = 0;
4194
4195        return 0;
4196}
4197
4198static int update_size(mddev_t *mddev, unsigned long size)
4199{
4200        mdk_rdev_t * rdev;
4201        int rv;
4202        struct list_head *tmp;
4203        int fit = (size == 0);
4204
4205        if (mddev->pers->resize == NULL)
4206                return -EINVAL;
4207        /* The "size" is the amount of each device that is used.
4208         * This can only make sense for arrays with redundancy.
4209         * linear and raid0 always use whatever space is available
4210         * We can only consider changing the size if no resync
4211         * or reconstruction is happening, and if the new size
4212         * is acceptable. It must fit before the sb_offset or,
4213         * if that is <data_offset, it must fit before the
4214         * size of each device.
4215         * If size is zero, we find the largest size that fits.
4216         */
4217        if (mddev->sync_thread)
4218                return -EBUSY;
4219        ITERATE_RDEV(mddev,rdev,tmp) {
4220                sector_t avail;
4221                avail = rdev->size * 2;
4222
4223                if (fit && (size == 0 || size > avail/2))
4224                        size = avail/2;
4225                if (avail < ((sector_t)size << 1))
4226                        return -ENOSPC;
4227        }
4228        rv = mddev->pers->resize(mddev, (sector_t)size * 2);
4229        if (!rv) {
4230                struct block_device *bdev;
4231
4232                bdev = bdget_disk(mddev->gendisk, 0);
4233                if (bdev) {
4234                        mutex_lock(&bdev->bd_inode->i_mutex);
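                            /* array_size is in KiB; shifting by 10 converts to bytes */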
4235                        i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
4236                        mutex_unlock(&bdev->bd_inode->i_mutex);
4237                        bdput(bdev);
4238                }
4239        }
4240        return rv;
4241}
4242
4243static int update_raid_disks(mddev_t *mddev, int raid_disks)
4244{
4245        int rv;
4246        /* change the number of raid disks */
4247        if (mddev->pers->check_reshape == NULL)
4248                return -EINVAL;
4249        if (raid_disks <= 0 ||
4250            raid_disks >= mddev->max_disks)
4251                return -EINVAL;
4252        if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4253                return -EBUSY;
4254        mddev->delta_disks = raid_disks - mddev->raid_disks;
4255
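        /*
         * delta_disks only records the requested change here; the
         * personality's check_reshape() is responsible for validating it
         * and starting the actual reshape.
         */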
4256        rv = mddev->pers->check_reshape(mddev);
4257        return rv;
4258}
4259
4260
4261/*
4262 * update_array_info is used to change the configuration of an
4263 * on-line array.
4264 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
4265 * fields in the info are checked against the array.
4266 * Any differences that cannot be handled will cause an error.
4267 * Normally, only one change can be managed at a time.
4268 */
4269static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4270{
4271        int rv = 0;
4272        int cnt = 0;
4273        int state = 0;
4274
4275        /* calculate expected state, ignoring low bits */
4276        if (mddev->bitmap && mddev->bitmap_offset)
4277                state |= (1 << MD_SB_BITMAP_PRESENT);
4278
4279        if (mddev->major_version != info->major_version ||
4280            mddev->minor_version != info->minor_version ||
4281/*          mddev->patch_version != info->patch_version || */
4282            mddev->ctime         != info->ctime         ||
4283            mddev->level         != info->level         ||
4284/*          mddev->layout        != info->layout        || */
4285            !mddev->persistent   != info->not_persistent||
4286            mddev->chunk_size    != info->chunk_size    ||
4287            /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
4288            ((state^info->state) & 0xfffffe00)
4289                )
4290                return -EINVAL;
4291        /* Check there is only one change */
4292        if (info->size >= 0 && mddev->size != info->size) cnt++;
4293        if (mddev->raid_disks != info->raid_disks) cnt++;
4294        if (mddev->layout != info->layout) cnt++;
4295        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
4296        if (cnt == 0) return 0;
4297        if (cnt > 1) return -EINVAL;
4298
4299        if (mddev->layout != info->layout) {
4300                /* Change layout
4301                 * we don't need to do anything at the md level, the
4302                 * personality will take care of it all.
4303                 */
4304                if (mddev->pers->reconfig == NULL)
4305                        return -EINVAL;
4306                else
4307                        return mddev->pers->reconfig(mddev, info->layout, -1);
4308        }
4309        if (info->size >= 0 && mddev->size != info->size)
4310                rv = update_size(mddev, info->size);
4311
4312        if (mddev->raid_disks    != info->raid_disks)
4313                rv = update_raid_disks(mddev, info->raid_disks);
4314
4315        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
4316                if (mddev->pers->quiesce == NULL)
4317                        return -EINVAL;
4318                if (mddev->recovery || mddev->sync_thread)
4319                        return -EBUSY;
4320                if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
4321                        /* add the bitmap */
4322                        if (mddev->bitmap)
4323                                return -EEXIST;
4324                        if (mddev->default_bitmap_offset == 0)
4325                                return -EINVAL;
4326                        mddev->bitmap_offset = mddev->default_bitmap_offset;
4327                        mddev->pers->quiesce(mddev, 1);
4328                        rv = bitmap_create(mddev);
4329                        if (rv)
4330                                bitmap_destroy(mddev);
4331                        mddev->pers->quiesce(mddev, 0);
4332                } else {
4333                        /* remove the bitmap */
4334                        if (!mddev->bitmap)
4335                                return -ENOENT;
4336                        if (mddev->bitmap->file)
4337                                return -EINVAL;
4338                        mddev->pers->quiesce(mddev, 1);
4339                        bitmap_destroy(mddev);
4340                        mddev->pers->quiesce(mddev, 0);
4341                        mddev->bitmap_offset = 0;
4342                }
4343        }
4344        md_update_sb(mddev, 1);
4345        return rv;
4346}
4347
4348static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4349{
4350        mdk_rdev_t *rdev;
4351
4352        if (mddev->pers == NULL)
4353                return -ENODEV;
4354
4355        rdev = find_rdev(mddev, dev);
4356        if (!rdev)
4357                return -ENODEV;
4358
4359        md_error(mddev, rdev);
4360        return 0;
4361}
4362
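    /*
     * md has no real geometry, so fabricate 2 heads and 4 sectors per
     * track for tools that insist on CHS; cylinders is then capacity/8
     * (see the longer comment in md_ioctl()).
     */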
4363static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4364{
4365        mddev_t *mddev = bdev->bd_disk->private_data;
4366
4367        geo->heads = 2;
4368        geo->sectors = 4;
4369        geo->cylinders = get_capacity(mddev->gendisk) / 8;
4370        return 0;
4371}
4372
4373static int md_ioctl(struct inode *inode, struct file *file,
4374                        unsigned int cmd, unsigned long arg)
4375{
4376        int err = 0;
4377        void __user *argp = (void __user *)arg;
4378        mddev_t *mddev = NULL;
4379
4380        if (!capable(CAP_SYS_ADMIN))
4381                return -EACCES;
4382
4383        /*
4384         * Commands dealing with the RAID driver but not any
4385         * particular array:
4386         */
4387        switch (cmd)
4388        {
4389                case RAID_VERSION:
4390                        err = get_version(argp);
4391                        goto done;
4392
4393                case PRINT_RAID_DEBUG:
4394                        err = 0;
4395                        md_print_devices();
4396                        goto done;
4397
4398#ifndef MODULE
4399                case RAID_AUTORUN:
4400                        err = 0;
4401                        autostart_arrays(arg);
4402                        goto done;
4403#endif
4404                default:;
4405        }
4406
4407        /*
4408         * Commands creating/starting a new array:
4409         */
4410
4411        mddev = inode->i_bdev->bd_disk->private_data;
4412
4413        if (!mddev) {
4414                BUG();
4415                goto abort;
4416        }
4417
4418        err = mddev_lock(mddev);
4419        if (err) {
4420                printk(KERN_INFO 
4421                        "md: ioctl lock interrupted, reason %d, cmd %d\n",
4422                        err, cmd);
4423                goto abort;
4424        }
4425
4426        switch (cmd)
4427        {
4428                case SET_ARRAY_INFO:
4429                        {
4430                                mdu_array_info_t info;
4431                                if (!arg)
4432                                        memset(&info, 0, sizeof(info));
4433                                else if (copy_from_user(&info, argp, sizeof(info))) {
4434                                        err = -EFAULT;
4435                                        goto abort_unlock;
4436                                }
4437                                if (mddev->pers) {
4438                                        err = update_array_info(mddev, &info);
4439                                        if (err) {
4440                                                printk(KERN_WARNING "md: couldn't update"
4441                                                       " array info. %d\n", err);
4442                                                goto abort_unlock;
4443                                        }
4444                                        goto done_unlock;
4445                                }
4446                                if (!list_empty(&mddev->disks)) {
4447                                        printk(KERN_WARNING
4448                                               "md: array %s already has disks!\n",
4449                                               mdname(mddev));
4450                                        err = -EBUSY;
4451                                        goto abort_unlock;
4452                                }
4453                                if (mddev->raid_disks) {
4454                                        printk(KERN_WARNING
4455                                               "md: array %s already initialised!\n",
4456                                               mdname(mddev));
4457                                        err = -EBUSY;
4458                                        goto abort_unlock;
4459                                }
4460                                err = set_array_info(mddev, &info);
4461                                if (err) {
4462                                        printk(KERN_WARNING "md: couldn't set"
4463                                               " array info. %d\n", err);
4464                                        goto abort_unlock;
4465                                }
4466                        }
4467                        goto done_unlock;
4468
4469                default:;
4470        }
4471
4472        /*
4473         * Commands querying/configuring an existing array:
4474         */
4475        /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
4476         * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
4477        if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
4478                        && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
4479                        && cmd != GET_BITMAP_FILE) {
4480                err = -ENODEV;
4481                goto abort_unlock;
4482        }
4483
4484        /*
4485         * Commands even a read-only array can execute:
4486         */
4487        switch (cmd)
4488        {
4489                case GET_ARRAY_INFO:
4490                        err = get_array_info(mddev, argp);
4491                        goto done_unlock;
4492
4493                case GET_BITMAP_FILE:
4494                        err = get_bitmap_file(mddev, argp);
4495                        goto done_unlock;
4496
4497                case GET_DISK_INFO:
4498                        err = get_disk_info(mddev, argp);
4499                        goto done_unlock;
4500
4501                case RESTART_ARRAY_RW:
4502                        err = restart_array(mddev);
4503                        goto done_unlock;
4504
4505                case STOP_ARRAY:
4506                        err = do_md_stop (mddev, 0);
4507                        goto done_unlock;
4508
4509                case STOP_ARRAY_RO:
4510                        err = do_md_stop (mddev, 1);
4511                        goto done_unlock;
4512
4513        /*
4514         * We have a problem here : there is no easy way to give a CHS
4515         * virtual geometry. We currently pretend that we have a 2 heads
4516         * 4 sectors (with a BIG number of cylinders...). This drives
4517         * dosfs just mad... ;-)
4518         */
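            /*
             * For example: 2 heads x 4 sectors gives 8 sectors per
             * cylinder, so a 1TB array (2^31 sectors) would report
             * roughly 268 million cylinders.  Illustrative arithmetic;
             * the real values come from md_getgeo.
             */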
4519        }
4520
4521        /*
4522         * The remaining ioctls are changing the state of the
4523         * superblock, so we do not allow them on read-only arrays.
4524         * However non-MD ioctls (e.g. get-size) will still come through
4525         * here and hit the 'default' below, so only disallow
4526         * 'md' ioctls, and switch to rw mode if started auto-readonly.
4527         */
4528        if (_IOC_TYPE(cmd) == MD_MAJOR &&
4529            mddev->ro && mddev->pers) {
4530                if (mddev->ro == 2) {
4531                        mddev->ro = 0;
4532                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4533                        md_wakeup_thread(mddev->thread);
4534
4535                } else {
4536                        err = -EROFS;
4537                        goto abort_unlock;
4538                }
4539        }
4540
4541        switch (cmd)
4542        {
4543                case ADD_NEW_DISK:
4544                {
4545                        mdu_disk_info_t info;
4546                        if (copy_from_user(&info, argp, sizeof(info)))
4547                                err = -EFAULT;
4548                        else
4549                                err = add_new_disk(mddev, &info);
4550                        goto done_unlock;
4551                }
4552
4553                case HOT_REMOVE_DISK:
4554                        err = hot_remove_disk(mddev, new_decode_dev(arg));
4555                        goto done_unlock;
4556
4557                case HOT_ADD_DISK:
4558                        err = hot_add_disk(mddev, new_decode_dev(arg));
4559                        goto done_unlock;
4560
4561                case SET_DISK_FAULTY:
4562                        err = set_disk_faulty(mddev, new_decode_dev(arg));
4563                        goto done_unlock;
4564
4565                case RUN_ARRAY:
4566                        err = do_md_run (mddev);
4567                        goto done_unlock;
4568
4569                case SET_BITMAP_FILE:
4570                        err = set_bitmap_file(mddev, (int)arg);
4571                        goto done_unlock;
4572
4573                default:
4574                        err = -EINVAL;
4575                        goto abort_unlock;
4576        }
4577
4578done_unlock:
4579abort_unlock:
4580        mddev_unlock(mddev);
4581
4582        return err;
4583done:
4584        if (err)
4585                MD_BUG();
4586abort:
4587        return err;
4588}
4589
4590static int md_open(struct inode *inode, struct file *file)
4591{
4592        /*
4593         * Succeed if we can lock the mddev, which confirms that
4594         * it isn't being stopped right now.
4595         */
4596        mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4597        int err;
4598
4599        if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
4600                goto out;
4601
4602        err = 0;
4603        mddev_get(mddev);
4604        mddev_unlock(mddev);
4605
4606        check_disk_change(inode->i_bdev);
4607 out:
4608        return err;
4609}
4610
4611static int md_release(struct inode *inode, struct file * file)
4612{
4613        mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4614
4615        BUG_ON(!mddev);
4616        mddev_put(mddev);
4617
4618        return 0;
4619}
4620
4621static int md_media_changed(struct gendisk *disk)
4622{
4623        mddev_t *mddev = disk->private_data;
4624
4625        return mddev->changed;
4626}
4627
4628static int md_revalidate(struct gendisk *disk)
4629{
4630        mddev_t *mddev = disk->private_data;
4631
4632        mddev->changed = 0;
4633        return 0;
4634}
4635static struct block_device_operations md_fops =
4636{
4637        .owner          = THIS_MODULE,
4638        .open           = md_open,
4639        .release        = md_release,
4640        .ioctl          = md_ioctl,
4641        .getgeo         = md_getgeo,
4642        .media_changed  = md_media_changed,
4643        .revalidate_disk= md_revalidate,
4644};
4645
4646static int md_thread(void * arg)
4647{
4648        mdk_thread_t *thread = arg;
4649
4650        /*
4651         * md_thread is a 'system-thread', its priority should be very
4652         * high. We avoid resource deadlocks individually in each
4653         * raid personality. (RAID5 does preallocation) We also use RR and
4654         * the very same RT priority as kswapd, thus we will never get
4655         * into a priority inversion deadlock.
4656         *
4657         * we definitely have to have equal or higher priority than
4658         * bdflush, otherwise bdflush will deadlock if there are too
4659         * many dirty RAID5 blocks.
4660         */
4661
4662        allow_signal(SIGKILL);
4663        while (!kthread_should_stop()) {
4664
4665                /* We need to wait INTERRUPTIBLE so that
4666                 * we don't add to the load-average.
4667                 * That means we need to be sure no signals are
4668                 * pending
4669                 */
4670                if (signal_pending(current))
4671                        flush_signals(current);
4672
4673                wait_event_interruptible_timeout
4674                        (thread->wqueue,
4675                         test_bit(THREAD_WAKEUP, &thread->flags)
4676                         || kthread_should_stop(),
4677                         thread->timeout);
4678
4679                clear_bit(THREAD_WAKEUP, &thread->flags);
4680
4681                thread->run(thread->mddev);
4682        }
4683
4684        return 0;
4685}
4686
4687void md_wakeup_thread(mdk_thread_t *thread)
4688{
4689        if (thread) {
4690                dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
4691                set_bit(THREAD_WAKEUP, &thread->flags);
4692                wake_up(&thread->wqueue);
4693        }
4694}
4695
4696mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
4697                                 const char *name)
4698{
4699        mdk_thread_t *thread;
4700
4701        thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
4702        if (!thread)
4703                return NULL;
4704
4705        init_waitqueue_head(&thread->wqueue);
4706
4707        thread->run = run;
4708        thread->mddev = mddev;
4709        thread->timeout = MAX_SCHEDULE_TIMEOUT;
4710        thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
4711        if (IS_ERR(thread->tsk)) {
4712                kfree(thread);
4713                return NULL;
4714        }
4715        return thread;
4716}
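    /*
     * A minimal usage sketch of the thread API above.  The names
     * example_run and example_setup are hypothetical; the real callers
     * are the raid personalities and the "%s_resync" thread created in
     * md_check_recovery below.
     *
     *        static void example_run(mddev_t *mddev)
     *        {
     *                ;        (do one unit of work per wakeup)
     *        }
     *
     *        static int example_setup(mddev_t *mddev)
     *        {
     *                mdk_thread_t *t;
     *
     *                t = md_register_thread(example_run, mddev, "%s_example");
     *                if (!t)
     *                        return -ENOMEM;
     *                md_wakeup_thread(t);        (runs example_run soon)
     *                md_unregister_thread(t);    (stop and free)
     *                return 0;
     *        }
     */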
4717
4718void md_unregister_thread(mdk_thread_t *thread)
4719{
4720        dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
4721
4722        kthread_stop(thread->tsk);
4723        kfree(thread);
4724}
4725
4726void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4727{
4728        if (!mddev) {
4729                MD_BUG();
4730                return;
4731        }
4732
4733        if (!rdev || test_bit(Faulty, &rdev->flags))
4734                return;
4735/*
4736        dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
4737                mdname(mddev),
4738                MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
4739                __builtin_return_address(0),__builtin_return_address(1),
4740                __builtin_return_address(2),__builtin_return_address(3));
4741*/
4742        if (!mddev->pers)
4743                return;
4744        if (!mddev->pers->error_handler)
4745                return;
4746        mddev->pers->error_handler(mddev,rdev);
4747        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4748        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4749        md_wakeup_thread(mddev->thread);
4750        md_new_event_inintr(mddev);
4751}
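    /*
     * Personalities report a failed member device through md_error().
     * A hedged sketch of the calling shape, not taken from any one
     * personality: in a bio completion handler that saw a write error,
     *
     *        if (error)
     *                md_error(mddev, rdev);
     *
     * after which the device is failed via ->error_handler and the md
     * thread is woken so recovery onto a spare can be considered.
     */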
4752
4753/* seq_file implementation /proc/mdstat */
4754
4755static void status_unused(struct seq_file *seq)
4756{
4757        int i = 0;
4758        mdk_rdev_t *rdev;
4759        struct list_head *tmp;
4760
4761        seq_printf(seq, "unused devices: ");
4762
4763        ITERATE_RDEV_PENDING(rdev,tmp) {
4764                char b[BDEVNAME_SIZE];
4765                i++;
4766                seq_printf(seq, "%s ",
4767                              bdevname(rdev->bdev,b));
4768        }
4769        if (!i)
4770                seq_printf(seq, "<none>");
4771
4772        seq_printf(seq, "\n");
4773}
4774
4775
4776static void status_resync(struct seq_file *seq, mddev_t * mddev)
4777{
4778        sector_t max_blocks, resync, res;
4779        unsigned long dt, db, rt;
4780        int scale;
4781        unsigned int per_milli;
4782
4783        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
4784
4785        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4786                max_blocks = mddev->resync_max_sectors >> 1;
4787        else
4788                max_blocks = mddev->size;
4789
4790        /*
4791         * Should not happen.
4792         */
4793        if (!max_blocks) {
4794                MD_BUG();
4795                return;
4796        }
4797        /* Pick 'scale' such that (resync>>scale)*1000 will fit
4798         * in a sector_t, and (max_blocks>>scale) will fit in a
4799         * u32, as those are the requirements for sector_div.
4800         * Thus 'scale' must be at least 10
4801         */
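            /* Worked example, illustrative numbers: on a 32-bit machine
             * with a 64-bit sector_t and max_blocks = 2^40 (a 1PB array
             * counted in 1K blocks), the loop below leaves scale at 10
             * because 2^39 <= 2^(10+32); then max_blocks>>10 = 2^30
             * fits a u32 and (resync>>10)*1000 stays under 2^40, well
             * inside sector_t.
             */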
4802        scale = 10;
4803        if (sizeof(sector_t) > sizeof(unsigned long)) {
4804                while ( max_blocks/2 > (1ULL<<(scale+32)))
4805                        scale++;
4806        }
4807        res = (resync>>scale)*1000;
4808        sector_div(res, (u32)((max_blocks>>scale)+1));
4809
4810        per_milli = res;
4811        {
4812                int i, x = per_milli/50, y = 20-x;
4813                seq_printf(seq, "[");
4814                for (i = 0; i < x; i++)
4815                        seq_printf(seq, "=");
4816                seq_printf(seq, ">");
4817                for (i = 0; i < y; i++)
4818                        seq_printf(seq, ".");
4819                seq_printf(seq, "] ");
4820        }
4821        seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
4822                   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
4823                    "reshape" :
4824                    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
4825                     "check" :
4826                     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
4827                      "resync" : "recovery"))),
4828                   per_milli/10, per_milli % 10,
4829                   (unsigned long long) resync,
4830                   (unsigned long long) max_blocks);
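            /* Together with the finish= and speed= fields appended
             * below, this renders in /proc/mdstat as, e.g. (values
             * illustrative):
             *   [=========>...........]  resync = 45.6% (4790400/10485760)
             *   finish=19.5min speed=7680K/sec
             */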
4831
4832        /*
4833         * We do not want to overflow, so the order of operands and
4834         * the * 100 / 100 trick are important. We do a +1 to be
4835         * safe against division by zero. We only estimate anyway.
4836         *
4837         * dt: time from mark until now
4838         * db: blocks written from mark until now
4839         * rt: remaining time
4840         */
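            /* Worked example, illustrative values: with dt = 30s,
             * db = 460800 sectors written since the mark and
             * max_blocks-resync = 9000000 blocks remaining, rt below is
             * 30*(9000000/(460800/2/100+1))/100 = 1171s, printed as
             * finish=19.5min, with speed = 460800/2/30 = 7680K/sec.
             */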
4841        dt = ((jiffies - mddev->resync_mark) / HZ);
4842        if (!dt) dt++;
4843        db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
4844                - mddev->resync_mark_cnt;
4845        rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
4846
4847        seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4848
4849        seq_printf(seq, " speed=%ldK/sec", db/2/dt);
4850}
4851
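    /*
     * The seq_file iterator below walks all_mddevs using two sentinel
     * cursors: (void*)1 stands for the "Personalities :" header and
     * (void*)2 for the trailing "unused devices:" line, so one full
     * read of /proc/mdstat yields the header, one record per mddev,
     * then the tail.  md_seq_show dispatches on the same sentinels.
     */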
4852static void *md_seq_start(struct seq_file *seq, loff_t *pos)
4853{
4854        struct list_head *tmp;
4855        loff_t l = *pos;
4856        mddev_t *mddev;
4857
4858        if (l >= 0x10000)
4859                return NULL;
4860        if (!l--)
4861                /* header */
4862                return (void*)1;
4863
4864        spin_lock(&all_mddevs_lock);
4865        list_for_each(tmp,&all_mddevs)
4866                if (!l--) {
4867                        mddev = list_entry(tmp, mddev_t, all_mddevs);
4868                        mddev_get(mddev);
4869                        spin_unlock(&all_mddevs_lock);
4870                        return mddev;
4871                }
4872        spin_unlock(&all_mddevs_lock);
4873        if (!l--)
4874                return (void*)2;/* tail */
4875        return NULL;
4876}
4877
4878static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4879{
4880        struct list_head *tmp;
4881        mddev_t *next_mddev, *mddev = v;
4882
4883        ++*pos;
4884        if (v == (void*)2)
4885                return NULL;
4886
4887        spin_lock(&all_mddevs_lock);
4888        if (v == (void*)1)
4889                tmp = all_mddevs.next;
4890        else
4891                tmp = mddev->all_mddevs.next;
4892        if (tmp != &all_mddevs)
4893                next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
4894        else {
4895                next_mddev = (void*)2;
4896                *pos = 0x10000;
4897                }
4898        spin_unlock(&all_mddevs_lock);
4899
4900        if (v != (void*)1)
4901                mddev_put(mddev);
4902        return next_mddev;
4903
4904}
4905
4906static void md_seq_stop(struct seq_file *seq, void *v)
4907{
4908        mddev_t *mddev = v;
4909
4910        if (mddev && v != (void*)1 && v != (void*)2)
4911                mddev_put(mddev);
4912}
4913
4914struct mdstat_info {
4915        int event;
4916};
4917
4918static int md_seq_show(struct seq_file *seq, void *v)
4919{
4920        mddev_t *mddev = v;
4921        sector_t size;
4922        struct list_head *tmp2;
4923        mdk_rdev_t *rdev;
4924        struct mdstat_info *mi = seq->private;
4925        struct bitmap *bitmap;
4926
4927        if (v == (void*)1) {
4928                struct mdk_personality *pers;
4929                seq_printf(seq, "Personalities : ");
4930                spin_lock(&pers_lock);
4931                list_for_each_entry(pers, &pers_list, list)
4932                        seq_printf(seq, "[%s] ", pers->name);
4933
4934                spin_unlock(&pers_lock);
4935                seq_printf(seq, "\n");
4936                mi->event = atomic_read(&md_event_count);
4937                return 0;
4938        }
4939        if (v == (void*)2) {
4940                status_unused(seq);
4941                return 0;
4942        }
4943
4944        if (mddev_lock(mddev) < 0)
4945                return -EINTR;
4946
4947        if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
4948                seq_printf(seq, "%s : %sactive", mdname(mddev),
4949                                                mddev->pers ? "" : "in");
4950                if (mddev->pers) {
4951                        if (mddev->ro==1)
4952                                seq_printf(seq, " (read-only)");
4953                        if (mddev->ro==2)
4954                                seq_printf(seq, " (auto-read-only)");
4955                        seq_printf(seq, " %s", mddev->pers->name);
4956                }
4957
4958                size = 0;
4959                ITERATE_RDEV(mddev,rdev,tmp2) {
4960                        char b[BDEVNAME_SIZE];
4961                        seq_printf(seq, " %s[%d]",
4962                                bdevname(rdev->bdev,b), rdev->desc_nr);
4963                        if (test_bit(WriteMostly, &rdev->flags))
4964                                seq_printf(seq, "(W)");
4965                        if (test_bit(Faulty, &rdev->flags)) {
4966                                seq_printf(seq, "(F)");
4967                                continue;
4968                        } else if (rdev->raid_disk < 0)
4969                                seq_printf(seq, "(S)"); /* spare */
4970                        size += rdev->size;
4971                }
4972
4973                if (!list_empty(&mddev->disks)) {
4974                        if (mddev->pers)
4975                                seq_printf(seq, "\n      %llu blocks",
4976                                        (unsigned long long)mddev->array_size);
4977                        else
4978                                seq_printf(seq, "\n      %llu blocks",
4979                                        (unsigned long long)size);
4980                }
4981                if (mddev->persistent) {
4982                        if (mddev->major_version != 0 ||
4983                            mddev->minor_version != 90) {
4984                                seq_printf(seq," super %d.%d",
4985                                           mddev->major_version,
4986                                           mddev->minor_version);
4987                        }
4988                } else
4989                        seq_printf(seq, " super non-persistent");
4990
4991                if (mddev->pers) {
4992                        mddev->pers->status (seq, mddev);
4993                        seq_printf(seq, "\n      ");
4994                        if (mddev->pers->sync_request) {
4995                                if (mddev->curr_resync > 2) {
4996                                        status_resync (seq, mddev);
4997                                        seq_printf(seq, "\n      ");
4998                                } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
4999                                        seq_printf(seq, "\tresync=DELAYED\n      ");
5000                                else if (mddev->recovery_cp < MaxSector)
5001                                        seq_printf(seq, "\tresync=PENDING\n      ");
5002                        }
5003                } else
5004                        seq_printf(seq, "\n       ");
5005
5006                if ((bitmap = mddev->bitmap)) {
5007                        unsigned long chunk_kb;
5008                        unsigned long flags;
5009                        spin_lock_irqsave(&bitmap->lock, flags);
5010                        chunk_kb = bitmap->chunksize >> 10;
5011                        seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5012                                "%lu%s chunk",
5013                                bitmap->pages - bitmap->missing_pages,
5014                                bitmap->pages,
5015                                (bitmap->pages - bitmap->missing_pages)
5016                                        << (PAGE_SHIFT - 10),
5017                                chunk_kb ? chunk_kb : bitmap->chunksize,
5018                                chunk_kb ? "KB" : "B");
5019                        if (bitmap->file) {
5020                                seq_printf(seq, ", file: ");
5021                                seq_path(seq, bitmap->file->f_path.mnt,
5022                                         bitmap->file->f_path.dentry," \t\n");
5023                        }
5024
5025                        seq_printf(seq, "\n");
5026                        spin_unlock_irqrestore(&bitmap->lock, flags);
5027                }
5028
5029                seq_printf(seq, "\n");
5030        }
5031        mddev_unlock(mddev);
5032
5033        return 0;
5034}
5035
5036static struct seq_operations md_seq_ops = {
5037        .start  = md_seq_start,
5038        .next   = md_seq_next,
5039        .stop   = md_seq_stop,
5040        .show   = md_seq_show,
5041};
5042
5043static int md_seq_open(struct inode *inode, struct file *file)
5044{
5045        int error;
5046        struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
5047        if (mi == NULL)
5048                return -ENOMEM;
5049
5050        error = seq_open(file, &md_seq_ops);
5051        if (error)
5052                kfree(mi);
5053        else {
5054                struct seq_file *p = file->private_data;
5055                p->private = mi;
5056                mi->event = atomic_read(&md_event_count);
5057        }
5058        return error;
5059}
5060
5061static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
5062{
5063        struct seq_file *m = filp->private_data;
5064        struct mdstat_info *mi = m->private;
5065        int mask;
5066
5067        poll_wait(filp, &md_event_waiters, wait);
5068
5069        /* always allow read */
5070        mask = POLLIN | POLLRDNORM;
5071
5072        if (mi->event != atomic_read(&md_event_count))
5073                mask |= POLLERR | POLLPRI;
5074        return mask;
5075}
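    /*
     * mdstat_poll lets userspace sleep until the array state changes
     * rather than re-reading /proc/mdstat in a loop.  A hedged
     * userspace sketch (mdadm works along these lines; the details
     * here are illustrative):
     *
     *        char buf[4096];
     *        int fd = open("/proc/mdstat", O_RDONLY);
     *        struct pollfd pfd = { .fd = fd, .events = POLLPRI };
     *
     *        read(fd, buf, sizeof(buf));    (latches the event count)
     *        poll(&pfd, 1, -1);             (returns on POLLERR|POLLPRI)
     *        lseek(fd, 0, SEEK_SET);
     *        read(fd, buf, sizeof(buf));    (re-read to see what changed)
     */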
5076
5077static const struct file_operations md_seq_fops = {
5078        .owner          = THIS_MODULE,
5079        .open           = md_seq_open,
5080        .read           = seq_read,
5081        .llseek         = seq_lseek,
5082        .release        = seq_release_private,
5083        .poll           = mdstat_poll,
5084};
5085
5086int register_md_personality(struct mdk_personality *p)
5087{
5088        spin_lock(&pers_lock);
5089        list_add_tail(&p->list, &pers_list);
5090        printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
5091        spin_unlock(&pers_lock);
5092        return 0;
5093}
5094
5095int unregister_md_personality(struct mdk_personality *p)
5096{
5097        printk(KERN_INFO "md: %s personality unregistered\n", p->name);
5098        spin_lock(&pers_lock);
5099        list_del_init(&p->list);
5100        spin_unlock(&pers_lock);
5101        return 0;
5102}
5103
5104static int is_mddev_idle(mddev_t *mddev)
5105{
5106        mdk_rdev_t * rdev;
5107        struct list_head *tmp;
5108        int idle;
5109        long curr_events;
5110
5111        idle = 1;
5112        ITERATE_RDEV(mddev,rdev,tmp) {
5113                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5114                curr_events = disk_stat_read(disk, sectors[0]) + 
5115                                disk_stat_read(disk, sectors[1]) - 
5116                                atomic_read(&disk->sync_io);
5117                /* sync IO will cause sync_io to increase before the disk_stats
5118                 * as sync_io is counted when a request starts, and
5119                 * disk_stats is counted when it completes.
5120                 * So resync activity will cause curr_events to be smaller than
5121                 * when there was no such activity.
5122                 * non-sync IO will cause disk_stat to increase without
5123                 * increasing sync_io so curr_events will (eventually)
5124                 * be larger than it was before.  Once it becomes
5125                 * substantially larger, the test below will cause
5126                 * the array to appear non-idle, and resync will slow
5127                 * down.
5128                 * If there is a lot of outstanding resync activity when
5129                 * we set last_event to curr_events, then all that activity
5130                 * completing might cause the array to appear non-idle
5131                 * and resync will be slowed down even though there might
5132                 * not have been non-resync activity.  This will only
5133                 * happen once though.  'last_events' will soon reflect
5134                 * the state where there is little or no outstanding
5135                 * resync requests, and further resync activity will
5136                 * always make curr_events less than last_events.
5137                 *
5138                 */
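                    /* The 4096-sector slack below means roughly 2MB of
                     * non-resync IO must accumulate since the last check
                     * before the array is considered busy.
                     */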
5139                if (curr_events - rdev->last_events > 4096) {
5140                        rdev->last_events = curr_events;
5141                        idle = 0;
5142                }
5143        }
5144        return idle;
5145}
5146
5147void md_done_sync(mddev_t *mddev, int blocks, int ok)
5148{
5149        /* another "blocks" (512-byte) blocks have been synced */
5150        atomic_sub(blocks, &mddev->recovery_active);
5151        wake_up(&mddev->recovery_wait);
5152        if (!ok) {
5153                set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5154                md_wakeup_thread(mddev->thread);
5155                /* stop recovery, signal do_sync .... */
5156        }
5157}
5158
5159
5160/* md_write_start(mddev, bi)
5161 * If we need to update some array metadata (e.g. 'active' flag
5162 * in superblock) before writing, schedule a superblock update
5163 * and wait for it to complete.
5164 */
5165void md_write_start(mddev_t *mddev, struct bio *bi)
5166{
5167        if (bio_data_dir(bi) != WRITE)
5168                return;
5169
5170        BUG_ON(mddev->ro == 1);
5171        if (mddev->ro == 2) {
5172                /* need to switch to read/write */
5173                mddev->ro = 0;
5174                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5175                md_wakeup_thread(mddev->thread);
5176        }
5177        atomic_inc(&mddev->writes_pending);
5178        if (mddev->in_sync) {
5179                spin_lock_irq(&mddev->write_lock);
5180                if (mddev->in_sync) {
5181                        mddev->in_sync = 0;
5182                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5183                        md_wakeup_thread(mddev->thread);
5184                }
5185                spin_unlock_irq(&mddev->write_lock);
5186        }
5187        wait_event(mddev->sb_wait, mddev->flags==0);
5188}
5189
5190void md_write_end(mddev_t *mddev)
5191{
5192        if (atomic_dec_and_test(&mddev->writes_pending)) {
5193                if (mddev->safemode == 2)
5194                        md_wakeup_thread(mddev->thread);
5195                else if (mddev->safemode_delay)
5196                        mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5197        }
5198}
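    /*
     * Typical write-path bracketing with the two helpers above; the
     * surrounding request-handling shape is illustrative:
     *
     *        md_write_start(mddev, bio);  (may block while the sb is
     *                                      being marked active)
     *        ... submit the write to the member devices ...
     *        md_write_end(mddev);         (last writer arms the
     *                                      safemode timer)
     */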
5199
5200/* md_allow_write(mddev)
5201 * Calling this ensures that the array is marked 'active' so that writes
5202 * may proceed without blocking.  It is important to call this before
5203 * attempting a GFP_KERNEL allocation while holding the mddev lock.
5204 * Must be called with mddev_lock held.
5205 */
5206void md_allow_write(mddev_t *mddev)
5207{
5208        if (!mddev->pers)
5209                return;
5210        if (mddev->ro)
5211                return;
5212
5213        spin_lock_irq(&mddev->write_lock);
5214        if (mddev->in_sync) {
5215                mddev->in_sync = 0;
5216                set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5217                if (mddev->safemode_delay &&
5218                    mddev->safemode == 0)
5219                        mddev->safemode = 1;
5220                spin_unlock_irq(&mddev->write_lock);
5221                md_update_sb(mddev, 0);
5222        } else
5223                spin_unlock_irq(&mddev->write_lock);
5224}
5225EXPORT_SYMBOL_GPL(md_allow_write);
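    /*
     * A hedged sketch of the calling pattern described above; the
     * caller and the allocation are illustrative:
     *
     *        if (mddev_lock(mddev) == 0) {
     *                md_allow_write(mddev);       (mark active first)
     *                p = kmalloc(sz, GFP_KERNEL); (now safe to block)
     *                ...
     *                mddev_unlock(mddev);
     *        }
     */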
5226
5227static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
5228
5229#define SYNC_MARKS      10
5230#define SYNC_MARK_STEP  (3*HZ)
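    /* With SYNC_MARKS = 10 marks taken every 3 seconds, the resync
     * speed reported in /proc/mdstat is averaged over a sliding window
     * of roughly 30 seconds.
     */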
5231void md_do_sync(mddev_t *mddev)
5232{
5233        mddev_t *mddev2;
5234        unsigned int currspeed = 0,
5235                 window;
5236        sector_t max_sectors,j, io_sectors;
5237        unsigned long mark[SYNC_MARKS];
5238        sector_t mark_cnt[SYNC_MARKS];
5239        int last_mark,m;
5240        struct list_head *tmp;
5241        sector_t last_check;
5242        int skipped = 0;
5243        struct list_head *rtmp;
5244        mdk_rdev_t *rdev;
5245        char *desc;
5246
5247        /* just in case the thread restarts... */
5248        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5249                return;
5250        if (mddev->ro) /* never try to sync a read-only array */
5251                return;
5252
5253        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5254                if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
5255                        desc = "data-check";
5256                else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5257                        desc = "requested-resync";
5258                else
5259                        desc = "resync";
5260        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5261                desc = "reshape";
5262        else
5263                desc = "recovery";
5264
5265        /* we overload curr_resync somewhat here.
5266         * 0 == not engaged in resync at all
5267         * 2 == checking that there is no conflict with another sync
5268         * 1 == like 2, but have yielded to allow conflicting resync to
5269         *              commence
5270         * other == active in resync - this many blocks
5271         *
5272         * Before starting a resync we must have set curr_resync to
5273         * 2, and then checked that every "conflicting" array has curr_resync
5274         * less than ours.  When we find one that is the same or higher
5275         * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
5276         * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
5277         * This will mean we have to start checking from the beginning again.
5278         *
5279         */
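            /* For example: if md0 and md1 share a physical disk and
             * both want to resync, the mddev at the lower address
             * yields (curr_resync 2 -> 1) and waits on resync_wait;
             * the tie-break is arbitrary but consistent, so exactly
             * one of the two backs off.
             */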
5280
5281        do {
5282                mddev->curr_resync = 2;
5283
5284        try_again:
5285                if (kthread_should_stop()) {
5286                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5287                        goto skip;
5288                }
5289                ITERATE_MDDEV(mddev2,tmp) {
5290                        if (mddev2 == mddev)
5291                                continue;
5292                        if (mddev2->curr_resync && 
5293                            match_mddev_units(mddev,mddev2)) {
5294                                DEFINE_WAIT(wq);
5295                                if (mddev < mddev2 && mddev->curr_resync == 2) {
5296                                        /* arbitrarily yield */
5297                                        mddev->curr_resync = 1;
5298                                        wake_up(&resync_wait);
5299                                }
5300                                if (mddev > mddev2 && mddev->curr_resync == 1)
5301                                        /* no need to wait here, we can wait the next
5302                                         * time 'round when curr_resync == 2
5303                                         */
5304                                        continue;
5305                                prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
5306                                if (!kthread_should_stop() &&
5307                                    mddev2->curr_resync >= mddev->curr_resync) {
5308                                        printk(KERN_INFO "md: delaying %s of %s"
5309                                               " until %s has finished (they"
5310                                               " share one or more physical units)\n",
5311                                               desc, mdname(mddev), mdname(mddev2));
5312                                        mddev_put(mddev2);
5313                                        schedule();
5314                                        finish_wait(&resync_wait, &wq);
5315                                        goto try_again;
5316                                }
5317                                finish_wait(&resync_wait, &wq);
5318                        }
5319                }
5320        } while (mddev->curr_resync < 2);
5321
5322        j = 0;
5323        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5324                /* resync follows the size requested by the personality,
5325                 * which defaults to physical size, but can be virtual size
5326                 */
5327                max_sectors = mddev->resync_max_sectors;
5328                mddev->resync_mismatches = 0;
5329                /* we don't use the checkpoint if there's a bitmap */
5330                if (!mddev->bitmap &&
5331                    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5332                        j = mddev->recovery_cp;
5333        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5334                max_sectors = mddev->size << 1;
5335        else {
5336                /* recovery follows the physical size of devices */
5337                max_sectors = mddev->size << 1;
5338                j = MaxSector;
5339                ITERATE_RDEV(mddev,rdev,rtmp)
5340                        if (rdev->raid_disk >= 0 &&
5341                            !test_bit(Faulty, &rdev->flags) &&
5342                            !test_bit(In_sync, &rdev->flags) &&
5343                            rdev->recovery_offset < j)
5344                                j = rdev->recovery_offset;
5345        }
5346
5347        printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
5348        printk(KERN_INFO "md: minimum _guaranteed_  speed:"
5349                " %d KB/sec/disk.\n", speed_min(mddev));
5350        printk(KERN_INFO "md: using maximum available idle IO bandwidth "
5351               "(but not more than %d KB/sec) for %s.\n",
5352               speed_max(mddev), desc);
5353
5354        is_mddev_idle(mddev); /* this also initializes IO event counters */
5355
5356        io_sectors = 0;
5357        for (m = 0; m < SYNC_MARKS; m++) {
5358                mark[m] = jiffies;
5359                mark_cnt[m] = io_sectors;
5360        }
5361        last_mark = 0;
5362        mddev->resync_mark = mark[last_mark];
5363        mddev->resync_mark_cnt = mark_cnt[last_mark];
5364
5365        /*
5366         * Tune reconstruction:
5367         */
5368        window = 32*(PAGE_SIZE/512);
5369        printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
5370                window/2,(unsigned long long) max_sectors/2);
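            /* With 4K pages the window is 32*8 = 256 sectors, so the
             * speed and interruption checks further down run at most
             * once per 128K of completed resync IO.
             */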
5371
5372        atomic_set(&mddev->recovery_active, 0);
5373        init_waitqueue_head(&mddev->recovery_wait);
5374        last_check = 0;
5375
5376        if (j>2) {
5377                printk(KERN_INFO 
5378                       "md: resuming %s of %s from checkpoint.\n",
5379                       desc, mdname(mddev));
5380                mddev->curr_resync = j;
5381        }
5382
5383        while (j < max_sectors) {
5384                sector_t sectors;
5385
5386                skipped = 0;
5387                sectors = mddev->pers->sync_request(mddev, j, &skipped,
5388                                            currspeed < speed_min(mddev));
5389                if (sectors == 0) {
5390                        set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5391                        goto out;
5392                }
5393
5394                if (!skipped) { /* actual IO requested */
5395                        io_sectors += sectors;
5396                        atomic_add(sectors, &mddev->recovery_active);
5397                }
5398
5399                j += sectors;
5400                if (j>1) mddev->curr_resync = j;
5401                mddev->curr_mark_cnt = io_sectors;
5402                if (last_check == 0)
5403                        /* this is the earliest that rebuild will be
5404                         * visible in /proc/mdstat
5405                         */
5406                        md_new_event(mddev);
5407
5408                if (last_check + window > io_sectors || j == max_sectors)
5409                        continue;
5410
5411                last_check = io_sectors;
5412
5413                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
5414                    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
5415                        break;
5416
5417        repeat:
5418                if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
5419                        /* step marks */
5420                        int next = (last_mark+1) % SYNC_MARKS;
5421
5422                        mddev->resync_mark = mark[next];
5423                        mddev->resync_mark_cnt = mark_cnt[next];
5424                        mark[next] = jiffies;
5425                        mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
5426                        last_mark = next;
5427                }
5428
5429
5430                if (kthread_should_stop()) {
5431                        /*
5432                         * got a signal, exit.
5433                         */
5434                        printk(KERN_INFO 
5435                                "md: md_do_sync() got signal ... exiting\n");
5436                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5437                        goto out;
5438                }
5439
5440                /*
5441                 * this loop exits only if we are slower than
5442                 * the 'hard' speed limit, or the system was IO-idle for
5443                 * a jiffy.
5444                 * the system might be non-idle CPU-wise, but we only care
5445                 * about not overloading the IO subsystem. (things like an
5446                 * e2fsck being done on the RAID array should execute fast)
5447                 */
5448                blk_unplug(mddev->queue);
5449                cond_resched();
5450
5451                currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
5452                        /((jiffies-mddev->resync_mark)/HZ +1) +1;
5453
5454                if (currspeed > speed_min(mddev)) {
5455                        if ((currspeed > speed_max(mddev)) ||
5456                                        !is_mddev_idle(mddev)) {
5457                                msleep(500);
5458                                goto repeat;
5459                        }
5460                }
5461        }
5462        printk(KERN_INFO "md: %s: %s done.\n", mdname(mddev), desc);
5463        /*
5464         * this also signals 'finished resyncing' to md_stop
5465         */
5466 out:
5467        blk_unplug(mddev->queue);
5468
5469        wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
5470
5471        /* tell personality that we are finished */
5472        mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5473
5474        if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5475            !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5476            mddev->curr_resync > 2) {
5477                if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5478                        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5479                                if (mddev->curr_resync >= mddev->recovery_cp) {
5480                                        printk(KERN_INFO
5481                                               "md: checkpointing %s of %s.\n",
5482                                               desc, mdname(mddev));
5483                                        mddev->recovery_cp = mddev->curr_resync;
5484                                }
5485                        } else
5486                                mddev->recovery_cp = MaxSector;
5487                } else {
5488                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5489                                mddev->curr_resync = MaxSector;
5490                        ITERATE_RDEV(mddev,rdev,rtmp)
5491                                if (rdev->raid_disk >= 0 &&
5492                                    !test_bit(Faulty, &rdev->flags) &&
5493                                    !test_bit(In_sync, &rdev->flags) &&
5494                                    rdev->recovery_offset < mddev->curr_resync)
5495                                        rdev->recovery_offset = mddev->curr_resync;
5496                }
5497        }
5498        set_bit(MD_CHANGE_DEVS, &mddev->flags);
5499
5500 skip:
5501        mddev->curr_resync = 0;
5502        wake_up(&resync_wait);
5503        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5504        md_wakeup_thread(mddev->thread);
5505}
5506EXPORT_SYMBOL_GPL(md_do_sync);
5507
5508
5509static int remove_and_add_spares(mddev_t *mddev)
5510{
5511        mdk_rdev_t *rdev;
5512        struct list_head *rtmp;
5513        int spares = 0;
5514
5515        ITERATE_RDEV(mddev,rdev,rtmp)
5516                if (rdev->raid_disk >= 0 &&
5517                    (test_bit(Faulty, &rdev->flags) ||
5518                     ! test_bit(In_sync, &rdev->flags)) &&
5519                    atomic_read(&rdev->nr_pending)==0) {
5520                        if (mddev->pers->hot_remove_disk(
5521                                    mddev, rdev->raid_disk)==0) {
5522                                char nm[20];
5523                                sprintf(nm,"rd%d", rdev->raid_disk);
5524                                sysfs_remove_link(&mddev->kobj, nm);
5525                                rdev->raid_disk = -1;
5526                        }
5527                }
5528
5529        if (mddev->degraded) {
5530                ITERATE_RDEV(mddev,rdev,rtmp)
5531                        if (rdev->raid_disk < 0
5532                            && !test_bit(Faulty, &rdev->flags)) {
5533                                rdev->recovery_offset = 0;
5534                                if (mddev->pers->hot_add_disk(mddev,rdev)) {
5535                                        char nm[20];
5536                                        sprintf(nm, "rd%d", rdev->raid_disk);
5537                                        if (sysfs_create_link(&mddev->kobj,
5538                                                              &rdev->kobj, nm))
5539                                                printk(KERN_WARNING
5540                                                       "md: cannot register "
5541                                                       "%s for %s\n",
5542                                                       nm, mdname(mddev));
5543                                        spares++;
5544                                        md_new_event(mddev);
5545                                } else
5546                                        break;
5547                        }
5548        }
5549        return spares;
5550}
5551/*
5552 * This routine is regularly called by all per-raid-array threads to
5553 * deal with generic issues like resync and super-block update.
5554 * Raid personalities that don't have a thread (linear/raid0) do not
5555 * need this as they never do any recovery or update the superblock.
5556 *
5557 * It does not do any resync itself, but rather "forks" off other threads
5558 * to do that as needed.
5559 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5560 * "->recovery" and create a thread at ->sync_thread.
5561 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
5562 * and wakes up this thread, which will reap it and finish up.
5563 * This thread also removes any faulty devices (with nr_pending == 0).
5564 *
5565 * The overall approach is:
5566 *  1/ if the superblock needs updating, update it.
5567 *  2/ If a recovery thread is running, don't do anything else.
5568 *  3/ If recovery has finished, clean up, possibly marking spares active.
5569 *  4/ If there are any faulty devices, remove them.
5570 *  5/ If array is degraded, try to add spare devices
5571 *  6/ If array has spares or is not in-sync, start a resync thread.
5572 */
5573void md_check_recovery(mddev_t *mddev)
5574{
5575        mdk_rdev_t *rdev;
5576        struct list_head *rtmp;
5577
5578
5579        if (mddev->bitmap)
5580                bitmap_daemon_work(mddev->bitmap);
5581
5582        if (mddev->ro)
5583                return;
5584
5585        if (signal_pending(current)) {
5586                if (mddev->pers->sync_request) {
5587                        printk(KERN_INFO "md: %s in immediate safe mode\n",
5588                               mdname(mddev));
5589                        mddev->safemode = 2;
5590                }
5591                flush_signals(current);
5592        }
5593
5594        if ( ! (
5595                mddev->flags ||
5596                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5597                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5598                (mddev->safemode == 1) ||
5599                (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5600                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5601                ))
5602                return;
5603
5604        if (mddev_trylock(mddev)) {
5605                int spares = 0;
5606
5607                spin_lock_irq(&mddev->write_lock);
5608                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5609                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5610                        mddev->in_sync = 1;
5611                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5612                }
5613                if (mddev->safemode == 1)
5614                        mddev->safemode = 0;
5615                spin_unlock_irq(&mddev->write_lock);
5616
5617                if (mddev->flags)
5618                        md_update_sb(mddev, 0);
5619
5620
5621                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5622                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5623                        /* resync/recovery still happening */
5624                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5625                        goto unlock;
5626                }
5627                if (mddev->sync_thread) {
5628                        /* resync has finished, collect result */
5629                        md_unregister_thread(mddev->sync_thread);
5630                        mddev->sync_thread = NULL;
5631                        if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5632                            !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5633                                /* success...*/
5634                                /* activate any spares */
5635                                mddev->pers->spare_active(mddev);
5636                        }
5637                        md_update_sb(mddev, 1);
5638
5639                        /* if array is no-longer degraded, then any saved_raid_disk
5640                         * information must be scrapped
5641                         */
5642                        if (!mddev->degraded)
5643                                ITERATE_RDEV(mddev,rdev,rtmp)
5644                                        rdev->saved_raid_disk = -1;
5645
5646                        mddev->recovery = 0;
5647                        /* flag recovery needed just to double check */
5648                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5649                        md_new_event(mddev);
5650                        goto unlock;
5651                }
5652                /* Clear some bits that don't mean anything, but
5653                 * might be left set
5654                 */
5655                clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5656                clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5657                clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5658                clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5659
5660                if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5661                        goto unlock;
5662                /* no recovery is running.
5663                 * remove any failed drives, then
5664                 * add spares if possible.
5665                 * Spares are also removed and re-added, to allow
5666                 * the personality to fail the re-add.
5667                 */
5668
5669                if (mddev->reshape_position != MaxSector) {
5670                        if (mddev->pers->check_reshape(mddev) != 0)
5671                                /* Cannot proceed */
5672                                goto unlock;
5673                        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5674                } else if ((spares = remove_and_add_spares(mddev))) {
5675                        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5676                        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5677                } else if (mddev->recovery_cp < MaxSector) {
5678                        set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5679                } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5680                        /* nothing to be done ... */
5681                        goto unlock;
5682
5683                if (mddev->pers->sync_request) {
5684                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5685                        if (spares && mddev->bitmap && ! mddev->bitmap->file) {
5686                                /* We are adding a device or devices to an array
5687                                 * which has the bitmap stored on all devices.
5688                                 * So make sure all bitmap pages get written
5689                                 */
5690                                bitmap_write_all(mddev->bitmap);
5691                        }
5692                        mddev->sync_thread = md_register_thread(md_do_sync,
5693                                                                mddev,
5694                                                                "%s_resync");
5695                        if (!mddev->sync_thread) {
5696                                printk(KERN_ERR "%s: could not start resync"
5697                                        " thread...\n", 
5698                                        mdname(mddev));
5699                                /* leave the spares where they are, it shouldn't hurt */
5700                                mddev->recovery = 0;
5701                        } else
5702                                md_wakeup_thread(mddev->sync_thread);
5703                        md_new_event(mddev);
5704                }
5705        unlock:
5706                mddev_unlock(mddev);
5707        }
5708}
5709
5710static int md_notify_reboot(struct notifier_block *this,
5711                            unsigned long code, void *x)
5712{
5713        struct list_head *tmp;
5714        mddev_t *mddev;
5715
5716        if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
5717
5718                printk(KERN_INFO "md: stopping all md devices.\n");
5719
5720                ITERATE_MDDEV(mddev,tmp)
5721                        if (mddev_trylock(mddev)) {
5722                                do_md_stop (mddev, 1);
5723                                mddev_unlock(mddev);
5724                        }
5725                /*
5726                 * certain more exotic SCSI devices are known to be
5727                 * volatile wrt too early system reboots. While the
5728                 * right place to handle this issue is the given
5729                 * driver, we do want to have a safe RAID driver ...
5730                 */
5731                mdelay(1000*1);
5732        }
5733        return NOTIFY_DONE;
5734}
5735
5736static struct notifier_block md_notifier = {
5737        .notifier_call  = md_notify_reboot,
5738        .next           = NULL,
5739        .priority       = INT_MAX, /* before any real devices */
5740};
5741
5742static void md_geninit(void)
5743{
5744        struct proc_dir_entry *p;
5745
5746        dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
5747
5748        p = create_proc_entry("mdstat", S_IRUGO, NULL);
5749        if (p)
5750                p->proc_fops = &md_seq_fops;
5751}
5752
5753static int __init md_init(void)
5754{
5755        if (register_blkdev(MAJOR_NR, "md"))
5756                return -1;
5757        if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
5758                unregister_blkdev(MAJOR_NR, "md");
5759                return -1;
5760        }
5761        blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
5762                            md_probe, NULL, NULL);
5763        blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
5764                            md_probe, NULL, NULL);
5765
5766        register_reboot_notifier(&md_notifier);
5767        raid_table_header = register_sysctl_table(raid_root_table);
5768
5769        md_geninit();
5770        return 0;
5771}
5772
5773
5774#ifndef MODULE
5775
5776/*
5777 * Searches all registered partitions for autorun RAID arrays
5778 * at boot time.
5779 */
5780
5781static LIST_HEAD(all_detected_devices);
5782struct detected_devices_node {
5783        struct list_head list;
5784        dev_t dev;
5785};
5786
5787void md_autodetect_dev(dev_t dev)
5788{
5789        struct detected_devices_node *node_detected_dev;
5790
5791        node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
5792        if (node_detected_dev) {
5793                node_detected_dev->dev = dev;
5794                list_add_tail(&node_detected_dev->list, &all_detected_devices);
5795        } else {
5796                printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
5797                        ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
5798        }
5799}
5800
5801
5802static void autostart_arrays(int part)
5803{
5804        mdk_rdev_t *rdev;
5805        struct detected_devices_node *node_detected_dev;
5806        dev_t dev;
5807        int i_scanned, i_passed;
5808
5809        i_scanned = 0;
5810        i_passed = 0;
5811
5812        printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
5813
5814        while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
5815                i_scanned++;
5816                node_detected_dev = list_entry(all_detected_devices.next,
5817                                        struct detected_devices_node, list);
5818                list_del(&node_detected_dev->list);
5819                dev = node_detected_dev->dev;
5820                kfree(node_detected_dev);
5821                rdev = md_import_device(dev,0, 90);
5822                if (IS_ERR(rdev))
5823                        continue;
5824
5825                if (test_bit(Faulty, &rdev->flags)) {
5826                        MD_BUG();
5827                        continue;
5828                }
5829                list_add(&rdev->same_set, &pending_raid_disks);
5830                i_passed++;
5831        }
5832
5833        printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
5834                                                i_scanned, i_passed);
5835
5836        autorun_devices(part);
5837}
5838
5839#endif /* !MODULE */
5840
5841static __exit void md_exit(void)
5842{
5843        mddev_t *mddev;
5844        struct list_head *tmp;
5845
5846        blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
5847        blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
5848
5849        unregister_blkdev(MAJOR_NR,"md");
5850        unregister_blkdev(mdp_major, "mdp");
5851        unregister_reboot_notifier(&md_notifier);
5852        unregister_sysctl_table(raid_table_header);
5853        remove_proc_entry("mdstat", NULL);
5854        ITERATE_MDDEV(mddev,tmp) {
5855                struct gendisk *disk = mddev->gendisk;
5856                if (!disk)
5857                        continue;
5858                export_array(mddev);
5859                del_gendisk(disk);
5860                put_disk(disk);
5861                mddev->gendisk = NULL;
5862                mddev_put(mddev);
5863        }
5864}
5865
5866subsys_initcall(md_init);
5867module_exit(md_exit)
5868
5869static int get_ro(char *buffer, struct kernel_param *kp)
5870{
5871        return sprintf(buffer, "%d", start_readonly);
5872}
5873static int set_ro(const char *val, struct kernel_param *kp)
5874{
5875        char *e;
5876        int num = simple_strtoul(val, &e, 10);
5877        if (*val && (*e == '\0' || *e == '\n')) {
5878                start_readonly = num;
5879                return 0;
5880        }
5881        return -EINVAL;
5882}
5883
5884module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
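    /*
     * start_ro selects whether new arrays start auto-read-only
     * (mddev->ro == 2, shown as "(auto-read-only)" in /proc/mdstat).
     * Assuming the module is loaded as md_mod, the parameter can be
     * flipped at runtime via /sys/module/md_mod/parameters/start_ro.
     */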
5885module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
5886
5887
5888EXPORT_SYMBOL(register_md_personality);
5889EXPORT_SYMBOL(unregister_md_personality);
5890EXPORT_SYMBOL(md_error);
5891EXPORT_SYMBOL(md_done_sync);
5892EXPORT_SYMBOL(md_write_start);
5893EXPORT_SYMBOL(md_write_end);
5894EXPORT_SYMBOL(md_register_thread);
5895EXPORT_SYMBOL(md_unregister_thread);
5896EXPORT_SYMBOL(md_wakeup_thread);
5897EXPORT_SYMBOL(md_check_recovery);
5898MODULE_LICENSE("GPL");
5899MODULE_ALIAS("md");
5900MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
5901