linux/drivers/md/raid1.c
   1/*
   2 * raid1.c : Multiple Devices driver for Linux
   3 *
   4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
   5 *
   6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   7 *
   8 * RAID-1 management functions.
   9 *
  10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
  11 *
   12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
  13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
  14 *
  15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
  16 * bitmapped intelligence in resync:
  17 *
  18 *      - bitmap marked during normal i/o
  19 *      - bitmap used to skip nondirty blocks during sync
  20 *
  21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
  22 * - persistent bitmap code
  23 *
  24 * This program is free software; you can redistribute it and/or modify
  25 * it under the terms of the GNU General Public License as published by
  26 * the Free Software Foundation; either version 2, or (at your option)
  27 * any later version.
  28 *
  29 * You should have received a copy of the GNU General Public License
  30 * (for example /usr/src/linux/COPYING); if not, write to the Free
  31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  32 */
  33
  34#include <linux/slab.h>
  35#include <linux/delay.h>
  36#include <linux/blkdev.h>
  37#include <linux/seq_file.h>
  38#include "md.h"
  39#include "raid1.h"
  40#include "bitmap.h"
  41
  42#define DEBUG 0
  43#if DEBUG
  44#define PRINTK(x...) printk(x)
  45#else
  46#define PRINTK(x...)
  47#endif
  48
  49/*
  50 * Number of guaranteed r1bios in case of extreme VM load:
  51 */
  52#define NR_RAID1_BIOS 256
  53
  54
  55static void allow_barrier(conf_t *conf);
  56static void lower_barrier(conf_t *conf);
  57
  58static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
  59{
  60        struct pool_info *pi = data;
  61        int size = offsetof(r1bio_t, bios[pi->raid_disks]);
  62
  63        /* allocate a r1bio with room for raid_disks entries in the bios array */
  64        return kzalloc(size, gfp_flags);
  65}
  66
  67static void r1bio_pool_free(void *r1_bio, void *data)
  68{
  69        kfree(r1_bio);
  70}
  71
  72#define RESYNC_BLOCK_SIZE (64*1024)
  73//#define RESYNC_BLOCK_SIZE PAGE_SIZE
  74#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
  75#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
  76#define RESYNC_WINDOW (2048*1024)
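/*
 * Sizing sketch (assuming the common 4 KiB PAGE_SIZE): one resync block
 * is 64 KiB, i.e. RESYNC_SECTORS = 128 and RESYNC_PAGES = 16 pages per
 * bio, while the resync window covers 2 MiB of the array.
 */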
  77
  78static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
  79{
  80        struct pool_info *pi = data;
  81        struct page *page;
  82        r1bio_t *r1_bio;
  83        struct bio *bio;
  84        int i, j;
  85
  86        r1_bio = r1bio_pool_alloc(gfp_flags, pi);
  87        if (!r1_bio)
  88                return NULL;
  89
  90        /*
  91         * Allocate bios : 1 for reading, n-1 for writing
  92         */
  93        for (j = pi->raid_disks ; j-- ; ) {
  94                bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
  95                if (!bio)
  96                        goto out_free_bio;
  97                r1_bio->bios[j] = bio;
  98        }
  99        /*
 100         * Allocate RESYNC_PAGES data pages and attach them to
 101         * the first bio.
 102         * If this is a user-requested check/repair, allocate
 103         * RESYNC_PAGES for each bio.
 104         */
 105        if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
 106                j = pi->raid_disks;
 107        else
 108                j = 1;
 109        while(j--) {
 110                bio = r1_bio->bios[j];
 111                for (i = 0; i < RESYNC_PAGES; i++) {
 112                        page = alloc_page(gfp_flags);
 113                        if (unlikely(!page))
 114                                goto out_free_pages;
 115
 116                        bio->bi_io_vec[i].bv_page = page;
 117                        bio->bi_vcnt = i+1;
 118                }
 119        }
  120        /* If not user-requested, copy the page pointers to all bios */
 121        if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
 122                for (i=0; i<RESYNC_PAGES ; i++)
 123                        for (j=1; j<pi->raid_disks; j++)
 124                                r1_bio->bios[j]->bi_io_vec[i].bv_page =
 125                                        r1_bio->bios[0]->bi_io_vec[i].bv_page;
 126        }
 127
 128        r1_bio->master_bio = NULL;
 129
 130        return r1_bio;
 131
 132out_free_pages:
 133        for (j=0 ; j < pi->raid_disks; j++)
 134                for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
 135                        put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
 136        j = -1;
 137out_free_bio:
 138        while ( ++j < pi->raid_disks )
 139                bio_put(r1_bio->bios[j]);
 140        r1bio_pool_free(r1_bio, data);
 141        return NULL;
 142}
 143
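/*
 * Counterpart of r1buf_pool_alloc(): for a plain resync only bios[0]
 * owns real pages and the other bios share the same page pointers, so a
 * page is put only once, or whenever it differs from the one in bios[0]
 * (the user-requested check/repair case, where every bio has its own
 * pages).
 */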
 144static void r1buf_pool_free(void *__r1_bio, void *data)
 145{
 146        struct pool_info *pi = data;
 147        int i,j;
 148        r1bio_t *r1bio = __r1_bio;
 149
 150        for (i = 0; i < RESYNC_PAGES; i++)
 151                for (j = pi->raid_disks; j-- ;) {
 152                        if (j == 0 ||
 153                            r1bio->bios[j]->bi_io_vec[i].bv_page !=
 154                            r1bio->bios[0]->bi_io_vec[i].bv_page)
 155                                safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
 156                }
 157        for (i=0 ; i < pi->raid_disks; i++)
 158                bio_put(r1bio->bios[i]);
 159
 160        r1bio_pool_free(r1bio, data);
 161}
 162
 163static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 164{
 165        int i;
 166
 167        for (i = 0; i < conf->raid_disks; i++) {
 168                struct bio **bio = r1_bio->bios + i;
 169                if (*bio && *bio != IO_BLOCKED)
 170                        bio_put(*bio);
 171                *bio = NULL;
 172        }
 173}
 174
 175static void free_r1bio(r1bio_t *r1_bio)
 176{
 177        conf_t *conf = r1_bio->mddev->private;
 178
 179        /*
 180         * Wake up any possible resync thread that waits for the device
 181         * to go idle.
 182         */
 183        allow_barrier(conf);
 184
 185        put_all_bios(conf, r1_bio);
 186        mempool_free(r1_bio, conf->r1bio_pool);
 187}
 188
 189static void put_buf(r1bio_t *r1_bio)
 190{
 191        conf_t *conf = r1_bio->mddev->private;
 192        int i;
 193
 194        for (i=0; i<conf->raid_disks; i++) {
 195                struct bio *bio = r1_bio->bios[i];
 196                if (bio->bi_end_io)
 197                        rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
 198        }
 199
 200        mempool_free(r1_bio, conf->r1buf_pool);
 201
 202        lower_barrier(conf);
 203}
 204
 205static void reschedule_retry(r1bio_t *r1_bio)
 206{
 207        unsigned long flags;
 208        mddev_t *mddev = r1_bio->mddev;
 209        conf_t *conf = mddev->private;
 210
 211        spin_lock_irqsave(&conf->device_lock, flags);
 212        list_add(&r1_bio->retry_list, &conf->retry_list);
 213        conf->nr_queued ++;
 214        spin_unlock_irqrestore(&conf->device_lock, flags);
 215
 216        wake_up(&conf->wait_barrier);
 217        md_wakeup_thread(mddev->thread);
 218}
 219
 220/*
 221 * raid_end_bio_io() is called when we have finished servicing a mirrored
 222 * operation and are ready to return a success/failure code to the buffer
 223 * cache layer.
 224 */
 225static void raid_end_bio_io(r1bio_t *r1_bio)
 226{
 227        struct bio *bio = r1_bio->master_bio;
 228
 229        /* if nobody has done the final endio yet, do it now */
 230        if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
 231                PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
 232                        (bio_data_dir(bio) == WRITE) ? "write" : "read",
 233                        (unsigned long long) bio->bi_sector,
 234                        (unsigned long long) bio->bi_sector +
 235                                (bio->bi_size >> 9) - 1);
 236
 237                bio_endio(bio,
 238                        test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
 239        }
 240        free_r1bio(r1_bio);
 241}
 242
 243/*
 244 * Update disk head position estimator based on IRQ completion info.
 245 */
 246static inline void update_head_pos(int disk, r1bio_t *r1_bio)
 247{
 248        conf_t *conf = r1_bio->mddev->private;
 249
 250        conf->mirrors[disk].head_position =
 251                r1_bio->sector + (r1_bio->sectors);
 252}
 253
 254static void raid1_end_read_request(struct bio *bio, int error)
 255{
 256        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 257        r1bio_t *r1_bio = bio->bi_private;
 258        int mirror;
 259        conf_t *conf = r1_bio->mddev->private;
 260
 261        mirror = r1_bio->read_disk;
 262        /*
 263         * this branch is our 'one mirror IO has finished' event handler:
 264         */
 265        update_head_pos(mirror, r1_bio);
 266
 267        if (uptodate)
 268                set_bit(R1BIO_Uptodate, &r1_bio->state);
 269        else {
 270                /* If all other devices have failed, we want to return
 271                 * the error upwards rather than fail the last device.
 272                 * Here we redefine "uptodate" to mean "Don't want to retry"
 273                 */
 274                unsigned long flags;
 275                spin_lock_irqsave(&conf->device_lock, flags);
 276                if (r1_bio->mddev->degraded == conf->raid_disks ||
 277                    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
 278                     !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
 279                        uptodate = 1;
 280                spin_unlock_irqrestore(&conf->device_lock, flags);
 281        }
 282
 283        if (uptodate)
 284                raid_end_bio_io(r1_bio);
 285        else {
 286                /*
 287                 * oops, read error:
 288                 */
 289                char b[BDEVNAME_SIZE];
 290                if (printk_ratelimit())
 291                        printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
 292                               mdname(conf->mddev),
 293                               bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
 294                reschedule_retry(r1_bio);
 295        }
 296
 297        rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 298}
 299
 300static void r1_bio_write_done(r1bio_t *r1_bio)
 301{
 302        if (atomic_dec_and_test(&r1_bio->remaining))
 303        {
 304                /* it really is the end of this request */
 305                if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 306                        /* free extra copy of the data pages */
 307                        int i = r1_bio->behind_page_count;
 308                        while (i--)
 309                                safe_put_page(r1_bio->behind_pages[i]);
 310                        kfree(r1_bio->behind_pages);
 311                        r1_bio->behind_pages = NULL;
 312                }
 313                /* clear the bitmap if all writes complete successfully */
 314                bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
 315                                r1_bio->sectors,
 316                                !test_bit(R1BIO_Degraded, &r1_bio->state),
 317                                test_bit(R1BIO_BehindIO, &r1_bio->state));
 318                md_write_end(r1_bio->mddev);
 319                raid_end_bio_io(r1_bio);
 320        }
 321}
 322
 323static void raid1_end_write_request(struct bio *bio, int error)
 324{
 325        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 326        r1bio_t *r1_bio = bio->bi_private;
 327        int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
 328        conf_t *conf = r1_bio->mddev->private;
 329        struct bio *to_put = NULL;
 330
 331
 332        for (mirror = 0; mirror < conf->raid_disks; mirror++)
 333                if (r1_bio->bios[mirror] == bio)
 334                        break;
 335
 336        /*
 337         * 'one mirror IO has finished' event handler:
 338         */
 339        r1_bio->bios[mirror] = NULL;
 340        to_put = bio;
 341        if (!uptodate) {
 342                md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
 343                /* an I/O failed, we can't clear the bitmap */
 344                set_bit(R1BIO_Degraded, &r1_bio->state);
 345        } else
 346                /*
 347                 * Set R1BIO_Uptodate in our master bio, so that we
  348                 * will return a good error code to the higher
 349                 * levels even if IO on some other mirrored buffer
 350                 * fails.
 351                 *
 352                 * The 'master' represents the composite IO operation
 353                 * to user-side. So if something waits for IO, then it
 354                 * will wait for the 'master' bio.
 355                 */
 356                set_bit(R1BIO_Uptodate, &r1_bio->state);
 357
 358        update_head_pos(mirror, r1_bio);
 359
 360        if (behind) {
 361                if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
 362                        atomic_dec(&r1_bio->behind_remaining);
 363
 364                /*
 365                 * In behind mode, we ACK the master bio once the I/O
 366                 * has safely reached all non-writemostly
 367                 * disks. Setting the Returned bit ensures that this
 368                 * gets done only once -- we don't ever want to return
 369                 * -EIO here, instead we'll wait
 370                 */
 371                if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
 372                    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
 373                        /* Maybe we can return now */
 374                        if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
 375                                struct bio *mbio = r1_bio->master_bio;
 376                                PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
 377                                       (unsigned long long) mbio->bi_sector,
 378                                       (unsigned long long) mbio->bi_sector +
 379                                       (mbio->bi_size >> 9) - 1);
 380                                bio_endio(mbio, 0);
 381                        }
 382                }
 383        }
 384        rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 385
 386        /*
 387         * Let's see if all mirrored write operations have finished
 388         * already.
 389         */
 390        r1_bio_write_done(r1_bio);
 391
 392        if (to_put)
 393                bio_put(to_put);
 394}
 395
 396
 397/*
 398 * This routine returns the disk from which the requested read should
 399 * be done. There is a per-array 'next expected sequential IO' sector
 400 * number - if this matches on the next IO then we use the last disk.
  401 * There is also a per-disk 'last known head position' sector that is
 402 * maintained from IRQ contexts, both the normal and the resync IO
 403 * completion handlers update this position correctly. If there is no
 404 * perfect sequential match then we pick the disk whose head is closest.
 405 *
  406 * If two mirrors share the same two devices, performance degrades
  407 * because the head position is tracked per mirror, not per device.
 408 *
 409 * The rdev for the device selected will have nr_pending incremented.
 410 */
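/*
 * Roughly, the selection below: above the resync window the first
 * readable disk is taken; otherwise a disk is picked immediately if the
 * request continues the last sequential read, its head is already on
 * this sector, or it is idle; failing that the smallest head distance
 * wins, with write-mostly devices used only as a last resort.
 */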
 411static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 412{
 413        const sector_t this_sector = r1_bio->sector;
 414        const int sectors = r1_bio->sectors;
 415        int start_disk;
 416        int best_disk;
 417        int i;
 418        sector_t best_dist;
 419        mdk_rdev_t *rdev;
 420        int choose_first;
 421
 422        rcu_read_lock();
 423        /*
 424         * Check if we can balance. We can balance on the whole
 425         * device if no resync is going on, or below the resync window.
 426         * We take the first readable disk when above the resync window.
 427         */
 428 retry:
 429        best_disk = -1;
 430        best_dist = MaxSector;
 431        if (conf->mddev->recovery_cp < MaxSector &&
 432            (this_sector + sectors >= conf->next_resync)) {
 433                choose_first = 1;
 434                start_disk = 0;
 435        } else {
 436                choose_first = 0;
 437                start_disk = conf->last_used;
 438        }
 439
 440        for (i = 0 ; i < conf->raid_disks ; i++) {
 441                sector_t dist;
 442                int disk = start_disk + i;
 443                if (disk >= conf->raid_disks)
 444                        disk -= conf->raid_disks;
 445
 446                rdev = rcu_dereference(conf->mirrors[disk].rdev);
 447                if (r1_bio->bios[disk] == IO_BLOCKED
 448                    || rdev == NULL
 449                    || test_bit(Faulty, &rdev->flags))
 450                        continue;
 451                if (!test_bit(In_sync, &rdev->flags) &&
 452                    rdev->recovery_offset < this_sector + sectors)
 453                        continue;
 454                if (test_bit(WriteMostly, &rdev->flags)) {
 455                        /* Don't balance among write-mostly, just
 456                         * use the first as a last resort */
 457                        if (best_disk < 0)
 458                                best_disk = disk;
 459                        continue;
 460                }
 461                /* This is a reasonable device to use.  It might
 462                 * even be best.
 463                 */
 464                dist = abs(this_sector - conf->mirrors[disk].head_position);
 465                if (choose_first
 466                    /* Don't change to another disk for sequential reads */
 467                    || conf->next_seq_sect == this_sector
 468                    || dist == 0
 469                    /* If device is idle, use it */
 470                    || atomic_read(&rdev->nr_pending) == 0) {
 471                        best_disk = disk;
 472                        break;
 473                }
 474                if (dist < best_dist) {
 475                        best_dist = dist;
 476                        best_disk = disk;
 477                }
 478        }
 479
 480        if (best_disk >= 0) {
 481                rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 482                if (!rdev)
 483                        goto retry;
 484                atomic_inc(&rdev->nr_pending);
 485                if (test_bit(Faulty, &rdev->flags)) {
 486                        /* cannot risk returning a device that failed
 487                         * before we inc'ed nr_pending
 488                         */
 489                        rdev_dec_pending(rdev, conf->mddev);
 490                        goto retry;
 491                }
 492                conf->next_seq_sect = this_sector + sectors;
 493                conf->last_used = best_disk;
 494        }
 495        rcu_read_unlock();
 496
 497        return best_disk;
 498}
 499
 500int md_raid1_congested(mddev_t *mddev, int bits)
 501{
 502        conf_t *conf = mddev->private;
 503        int i, ret = 0;
 504
 505        rcu_read_lock();
 506        for (i = 0; i < mddev->raid_disks; i++) {
 507                mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 508                if (rdev && !test_bit(Faulty, &rdev->flags)) {
 509                        struct request_queue *q = bdev_get_queue(rdev->bdev);
 510
 511                        BUG_ON(!q);
 512
 513                        /* Note the '|| 1' - when read_balance prefers
 514                         * non-congested targets, it can be removed
 515                         */
 516                        if ((bits & (1<<BDI_async_congested)) || 1)
 517                                ret |= bdi_congested(&q->backing_dev_info, bits);
 518                        else
 519                                ret &= bdi_congested(&q->backing_dev_info, bits);
 520                }
 521        }
 522        rcu_read_unlock();
 523        return ret;
 524}
 525EXPORT_SYMBOL_GPL(md_raid1_congested);
 526
 527static int raid1_congested(void *data, int bits)
 528{
 529        mddev_t *mddev = data;
 530
 531        return mddev_congested(mddev, bits) ||
 532                md_raid1_congested(mddev, bits);
 533}
 534
 535static void flush_pending_writes(conf_t *conf)
 536{
 537        /* Any writes that have been queued but are awaiting
 538         * bitmap updates get flushed here.
 539         */
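        /*
         * bitmap_unplug() below pushes the pending dirty-bitmap updates
         * out first, so every data write submitted afterwards is already
         * covered by an on-disk dirty bit.
         */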
 540        spin_lock_irq(&conf->device_lock);
 541
 542        if (conf->pending_bio_list.head) {
 543                struct bio *bio;
 544                bio = bio_list_get(&conf->pending_bio_list);
 545                spin_unlock_irq(&conf->device_lock);
 546                /* flush any pending bitmap writes to
 547                 * disk before proceeding w/ I/O */
 548                bitmap_unplug(conf->mddev->bitmap);
 549
 550                while (bio) { /* submit pending writes */
 551                        struct bio *next = bio->bi_next;
 552                        bio->bi_next = NULL;
 553                        generic_make_request(bio);
 554                        bio = next;
 555                }
 556        } else
 557                spin_unlock_irq(&conf->device_lock);
 558}
 559
 560/* Barriers....
 561 * Sometimes we need to suspend IO while we do something else,
 562 * either some resync/recovery, or reconfigure the array.
 563 * To do this we raise a 'barrier'.
 564 * The 'barrier' is a counter that can be raised multiple times
 565 * to count how many activities are happening which preclude
 566 * normal IO.
 567 * We can only raise the barrier if there is no pending IO.
 568 * i.e. if nr_pending == 0.
 569 * We choose only to raise the barrier if no-one is waiting for the
 570 * barrier to go down.  This means that as soon as an IO request
 571 * is ready, no other operations which require a barrier will start
 572 * until the IO request has had a chance.
 573 *
 574 * So: regular IO calls 'wait_barrier'.  When that returns there
  575 *    is no background IO happening.  It must arrange to call
  576 *    allow_barrier when it has finished its IO.
  577 * background IO calls must call raise_barrier.  Once that returns
  578 *    there is no normal IO happening.  It must arrange to call
 579 *    lower_barrier when the particular background IO completes.
 580 */
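/*
 * A minimal usage sketch of the pairing described above:
 *
 *    regular IO:  wait_barrier(conf);  ... submit bio ...;     allow_barrier(conf);
 *    resync IO:   raise_barrier(conf); ... background IO ...;  lower_barrier(conf);
 */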
 581#define RESYNC_DEPTH 32
 582
 583static void raise_barrier(conf_t *conf)
 584{
 585        spin_lock_irq(&conf->resync_lock);
 586
 587        /* Wait until no block IO is waiting */
 588        wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
 589                            conf->resync_lock, );
 590
 591        /* block any new IO from starting */
 592        conf->barrier++;
 593
 594        /* Now wait for all pending IO to complete */
 595        wait_event_lock_irq(conf->wait_barrier,
 596                            !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
 597                            conf->resync_lock, );
 598
 599        spin_unlock_irq(&conf->resync_lock);
 600}
 601
 602static void lower_barrier(conf_t *conf)
 603{
 604        unsigned long flags;
 605        BUG_ON(conf->barrier <= 0);
 606        spin_lock_irqsave(&conf->resync_lock, flags);
 607        conf->barrier--;
 608        spin_unlock_irqrestore(&conf->resync_lock, flags);
 609        wake_up(&conf->wait_barrier);
 610}
 611
 612static void wait_barrier(conf_t *conf)
 613{
 614        spin_lock_irq(&conf->resync_lock);
 615        if (conf->barrier) {
 616                conf->nr_waiting++;
 617                wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 618                                    conf->resync_lock,
 619                                    );
 620                conf->nr_waiting--;
 621        }
 622        conf->nr_pending++;
 623        spin_unlock_irq(&conf->resync_lock);
 624}
 625
 626static void allow_barrier(conf_t *conf)
 627{
 628        unsigned long flags;
 629        spin_lock_irqsave(&conf->resync_lock, flags);
 630        conf->nr_pending--;
 631        spin_unlock_irqrestore(&conf->resync_lock, flags);
 632        wake_up(&conf->wait_barrier);
 633}
 634
 635static void freeze_array(conf_t *conf)
 636{
 637        /* stop syncio and normal IO and wait for everything to
  638         * go quiet.
  639         * We increment barrier and nr_waiting, and then
  640         * wait until nr_pending matches nr_queued+1
 641         * This is called in the context of one normal IO request
 642         * that has failed. Thus any sync request that might be pending
 643         * will be blocked by nr_pending, and we need to wait for
 644         * pending IO requests to complete or be queued for re-try.
 645         * Thus the number queued (nr_queued) plus this request (1)
 646         * must match the number of pending IOs (nr_pending) before
 647         * we continue.
 648         */
 649        spin_lock_irq(&conf->resync_lock);
 650        conf->barrier++;
 651        conf->nr_waiting++;
 652        wait_event_lock_irq(conf->wait_barrier,
 653                            conf->nr_pending == conf->nr_queued+1,
 654                            conf->resync_lock,
 655                            flush_pending_writes(conf));
 656        spin_unlock_irq(&conf->resync_lock);
 657}
 658static void unfreeze_array(conf_t *conf)
 659{
 660        /* reverse the effect of the freeze */
 661        spin_lock_irq(&conf->resync_lock);
 662        conf->barrier--;
 663        conf->nr_waiting--;
 664        wake_up(&conf->wait_barrier);
 665        spin_unlock_irq(&conf->resync_lock);
 666}
 667
 668
 669/* duplicate the data pages for behind I/O 
 670 */
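/*
 * The copies allow the master bio to be acknowledged (and its pages
 * reused by the caller) while the write-behind to WriteMostly devices is
 * still in flight; see the R1BIO_Returned handling in
 * raid1_end_write_request().
 */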
 671static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
 672{
 673        int i;
 674        struct bio_vec *bvec;
 675        struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
 676                                        GFP_NOIO);
 677        if (unlikely(!pages))
 678                return;
 679
 680        bio_for_each_segment(bvec, bio, i) {
 681                pages[i] = alloc_page(GFP_NOIO);
 682                if (unlikely(!pages[i]))
 683                        goto do_sync_io;
 684                memcpy(kmap(pages[i]) + bvec->bv_offset,
 685                        kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
 686                kunmap(pages[i]);
 687                kunmap(bvec->bv_page);
 688        }
 689        r1_bio->behind_pages = pages;
 690        r1_bio->behind_page_count = bio->bi_vcnt;
 691        set_bit(R1BIO_BehindIO, &r1_bio->state);
 692        return;
 693
 694do_sync_io:
 695        for (i = 0; i < bio->bi_vcnt; i++)
 696                if (pages[i])
 697                        put_page(pages[i]);
 698        kfree(pages);
 699        PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 700}
 701
 702static int make_request(mddev_t *mddev, struct bio * bio)
 703{
 704        conf_t *conf = mddev->private;
 705        mirror_info_t *mirror;
 706        r1bio_t *r1_bio;
 707        struct bio *read_bio;
 708        int i, targets = 0, disks;
 709        struct bitmap *bitmap;
 710        unsigned long flags;
 711        const int rw = bio_data_dir(bio);
 712        const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 713        const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 714        mdk_rdev_t *blocked_rdev;
 715        int plugged;
 716
 717        /*
 718         * Register the new request and wait if the reconstruction
 719         * thread has put up a bar for new requests.
 720         * Continue immediately if no resync is active currently.
 721         */
 722
 723        md_write_start(mddev, bio); /* wait on superblock update early */
 724
 725        if (bio_data_dir(bio) == WRITE &&
 726            bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
 727            bio->bi_sector < mddev->suspend_hi) {
 728                /* As the suspend_* range is controlled by
 729                 * userspace, we want an interruptible
 730                 * wait.
 731                 */
 732                DEFINE_WAIT(w);
 733                for (;;) {
 734                        flush_signals(current);
 735                        prepare_to_wait(&conf->wait_barrier,
 736                                        &w, TASK_INTERRUPTIBLE);
 737                        if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
 738                            bio->bi_sector >= mddev->suspend_hi)
 739                                break;
 740                        schedule();
 741                }
 742                finish_wait(&conf->wait_barrier, &w);
 743        }
 744
 745        wait_barrier(conf);
 746
 747        bitmap = mddev->bitmap;
 748
 749        /*
 750         * make_request() can abort the operation when READA is being
 751         * used and no empty request is available.
 752         *
 753         */
 754        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 755
 756        r1_bio->master_bio = bio;
 757        r1_bio->sectors = bio->bi_size >> 9;
 758        r1_bio->state = 0;
 759        r1_bio->mddev = mddev;
 760        r1_bio->sector = bio->bi_sector;
 761
 762        if (rw == READ) {
 763                /*
 764                 * read balancing logic:
 765                 */
 766                int rdisk = read_balance(conf, r1_bio);
 767
 768                if (rdisk < 0) {
 769                        /* couldn't find anywhere to read from */
 770                        raid_end_bio_io(r1_bio);
 771                        return 0;
 772                }
 773                mirror = conf->mirrors + rdisk;
 774
 775                if (test_bit(WriteMostly, &mirror->rdev->flags) &&
 776                    bitmap) {
 777                        /* Reading from a write-mostly device must
 778                         * take care not to over-take any writes
 779                         * that are 'behind'
 780                         */
 781                        wait_event(bitmap->behind_wait,
 782                                   atomic_read(&bitmap->behind_writes) == 0);
 783                }
 784                r1_bio->read_disk = rdisk;
 785
 786                read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 787
 788                r1_bio->bios[rdisk] = read_bio;
 789
 790                read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
 791                read_bio->bi_bdev = mirror->rdev->bdev;
 792                read_bio->bi_end_io = raid1_end_read_request;
 793                read_bio->bi_rw = READ | do_sync;
 794                read_bio->bi_private = r1_bio;
 795
 796                generic_make_request(read_bio);
 797                return 0;
 798        }
 799
 800        /*
 801         * WRITE:
 802         */
 803        /* first select target devices under spinlock and
 804         * inc refcount on their rdev.  Record them by setting
 805         * bios[x] to bio
 806         */
 807        plugged = mddev_check_plugged(mddev);
 808
 809        disks = conf->raid_disks;
 810 retry_write:
 811        blocked_rdev = NULL;
 812        rcu_read_lock();
 813        for (i = 0;  i < disks; i++) {
 814                mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 815                if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 816                        atomic_inc(&rdev->nr_pending);
 817                        blocked_rdev = rdev;
 818                        break;
 819                }
 820                if (rdev && !test_bit(Faulty, &rdev->flags)) {
 821                        atomic_inc(&rdev->nr_pending);
 822                        if (test_bit(Faulty, &rdev->flags)) {
 823                                rdev_dec_pending(rdev, mddev);
 824                                r1_bio->bios[i] = NULL;
 825                        } else {
 826                                r1_bio->bios[i] = bio;
 827                                targets++;
 828                        }
 829                } else
 830                        r1_bio->bios[i] = NULL;
 831        }
 832        rcu_read_unlock();
 833
 834        if (unlikely(blocked_rdev)) {
 835                /* Wait for this device to become unblocked */
 836                int j;
 837
 838                for (j = 0; j < i; j++)
 839                        if (r1_bio->bios[j])
 840                                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
 841
 842                allow_barrier(conf);
 843                md_wait_for_blocked_rdev(blocked_rdev, mddev);
 844                wait_barrier(conf);
 845                goto retry_write;
 846        }
 847
 848        BUG_ON(targets == 0); /* we never fail the last device */
 849
 850        if (targets < conf->raid_disks) {
 851                /* array is degraded, we will not clear the bitmap
 852                 * on I/O completion (see raid1_end_write_request) */
 853                set_bit(R1BIO_Degraded, &r1_bio->state);
 854        }
 855
 856        /* do behind I/O ?
 857         * Not if there are too many, or cannot allocate memory,
 858         * or a reader on WriteMostly is waiting for behind writes 
 859         * to flush */
 860        if (bitmap &&
 861            (atomic_read(&bitmap->behind_writes)
 862             < mddev->bitmap_info.max_write_behind) &&
 863            !waitqueue_active(&bitmap->behind_wait))
 864                alloc_behind_pages(bio, r1_bio);
 865
 866        atomic_set(&r1_bio->remaining, 1);
 867        atomic_set(&r1_bio->behind_remaining, 0);
 868
 869        bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
 870                                test_bit(R1BIO_BehindIO, &r1_bio->state));
 871        for (i = 0; i < disks; i++) {
 872                struct bio *mbio;
 873                if (!r1_bio->bios[i])
 874                        continue;
 875
 876                mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 877                r1_bio->bios[i] = mbio;
 878
 879                mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 880                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 881                mbio->bi_end_io = raid1_end_write_request;
 882                mbio->bi_rw = WRITE | do_flush_fua | do_sync;
 883                mbio->bi_private = r1_bio;
 884
 885                if (r1_bio->behind_pages) {
 886                        struct bio_vec *bvec;
 887                        int j;
 888
 889                        /* Yes, I really want the '__' version so that
 890                         * we clear any unused pointer in the io_vec, rather
 891                         * than leave them unchanged.  This is important
 892                         * because when we come to free the pages, we won't
 893                         * know the original bi_idx, so we just free
 894                         * them all
 895                         */
 896                        __bio_for_each_segment(bvec, mbio, j, 0)
 897                                bvec->bv_page = r1_bio->behind_pages[j];
 898                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
 899                                atomic_inc(&r1_bio->behind_remaining);
 900                }
 901
 902                atomic_inc(&r1_bio->remaining);
 903                spin_lock_irqsave(&conf->device_lock, flags);
 904                bio_list_add(&conf->pending_bio_list, mbio);
 905                spin_unlock_irqrestore(&conf->device_lock, flags);
 906        }
 907        r1_bio_write_done(r1_bio);
 908
 909        /* In case raid1d snuck in to freeze_array */
 910        wake_up(&conf->wait_barrier);
 911
 912        if (do_sync || !bitmap || !plugged)
 913                md_wakeup_thread(mddev->thread);
 914
 915        return 0;
 916}
 917
 918static void status(struct seq_file *seq, mddev_t *mddev)
 919{
 920        conf_t *conf = mddev->private;
 921        int i;
 922
 923        seq_printf(seq, " [%d/%d] [", conf->raid_disks,
 924                   conf->raid_disks - mddev->degraded);
 925        rcu_read_lock();
 926        for (i = 0; i < conf->raid_disks; i++) {
 927                mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 928                seq_printf(seq, "%s",
 929                           rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
 930        }
 931        rcu_read_unlock();
 932        seq_printf(seq, "]");
 933}
 934
 935
 936static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 937{
 938        char b[BDEVNAME_SIZE];
 939        conf_t *conf = mddev->private;
 940
 941        /*
 942         * If it is not operational, then we have already marked it as dead
  943         * else if it is the last working disk, ignore the error, let the
 944         * next level up know.
 945         * else mark the drive as failed
 946         */
 947        if (test_bit(In_sync, &rdev->flags)
 948            && (conf->raid_disks - mddev->degraded) == 1) {
 949                /*
 950                 * Don't fail the drive, act as though we were just a
 951                 * normal single drive.
 952                 * However don't try a recovery from this drive as
 953                 * it is very likely to fail.
 954                 */
 955                mddev->recovery_disabled = 1;
 956                return;
 957        }
 958        if (test_and_clear_bit(In_sync, &rdev->flags)) {
 959                unsigned long flags;
 960                spin_lock_irqsave(&conf->device_lock, flags);
 961                mddev->degraded++;
 962                set_bit(Faulty, &rdev->flags);
 963                spin_unlock_irqrestore(&conf->device_lock, flags);
 964                /*
 965                 * if recovery is running, make sure it aborts.
 966                 */
 967                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 968        } else
 969                set_bit(Faulty, &rdev->flags);
 970        set_bit(MD_CHANGE_DEVS, &mddev->flags);
 971        printk(KERN_ALERT
 972               "md/raid1:%s: Disk failure on %s, disabling device.\n"
 973               "md/raid1:%s: Operation continuing on %d devices.\n",
 974               mdname(mddev), bdevname(rdev->bdev, b),
 975               mdname(mddev), conf->raid_disks - mddev->degraded);
 976}
 977
 978static void print_conf(conf_t *conf)
 979{
 980        int i;
 981
 982        printk(KERN_DEBUG "RAID1 conf printout:\n");
 983        if (!conf) {
 984                printk(KERN_DEBUG "(!conf)\n");
 985                return;
 986        }
 987        printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
 988                conf->raid_disks);
 989
 990        rcu_read_lock();
 991        for (i = 0; i < conf->raid_disks; i++) {
 992                char b[BDEVNAME_SIZE];
 993                mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 994                if (rdev)
 995                        printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
 996                               i, !test_bit(In_sync, &rdev->flags),
 997                               !test_bit(Faulty, &rdev->flags),
 998                               bdevname(rdev->bdev,b));
 999        }
1000        rcu_read_unlock();
1001}
1002
1003static void close_sync(conf_t *conf)
1004{
1005        wait_barrier(conf);
1006        allow_barrier(conf);
1007
1008        mempool_destroy(conf->r1buf_pool);
1009        conf->r1buf_pool = NULL;
1010}
1011
1012static int raid1_spare_active(mddev_t *mddev)
1013{
1014        int i;
1015        conf_t *conf = mddev->private;
1016        int count = 0;
1017        unsigned long flags;
1018
1019        /*
1020         * Find all failed disks within the RAID1 configuration 
1021         * and mark them readable.
1022         * Called under mddev lock, so rcu protection not needed.
1023         */
1024        for (i = 0; i < conf->raid_disks; i++) {
1025                mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1026                if (rdev
1027                    && !test_bit(Faulty, &rdev->flags)
1028                    && !test_and_set_bit(In_sync, &rdev->flags)) {
1029                        count++;
1030                        sysfs_notify_dirent(rdev->sysfs_state);
1031                }
1032        }
1033        spin_lock_irqsave(&conf->device_lock, flags);
1034        mddev->degraded -= count;
1035        spin_unlock_irqrestore(&conf->device_lock, flags);
1036
1037        print_conf(conf);
1038        return count;
1039}
1040
1041
1042static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1043{
1044        conf_t *conf = mddev->private;
1045        int err = -EEXIST;
1046        int mirror = 0;
1047        mirror_info_t *p;
1048        int first = 0;
1049        int last = mddev->raid_disks - 1;
1050
1051        if (rdev->raid_disk >= 0)
1052                first = last = rdev->raid_disk;
1053
1054        for (mirror = first; mirror <= last; mirror++)
1055                if ( !(p=conf->mirrors+mirror)->rdev) {
1056
1057                        disk_stack_limits(mddev->gendisk, rdev->bdev,
1058                                          rdev->data_offset << 9);
1059                        /* as we don't honour merge_bvec_fn, we must
1060                         * never risk violating it, so limit
 1061                         * ->max_segments to one lying within a single
1062                         * page, as a one page request is never in
1063                         * violation.
1064                         */
1065                        if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1066                                blk_queue_max_segments(mddev->queue, 1);
1067                                blk_queue_segment_boundary(mddev->queue,
1068                                                           PAGE_CACHE_SIZE - 1);
1069                        }
1070
1071                        p->head_position = 0;
1072                        rdev->raid_disk = mirror;
1073                        err = 0;
1074                        /* As all devices are equivalent, we don't need a full recovery
 1075                         * if this device was recently a member of the array
1076                         */
1077                        if (rdev->saved_raid_disk < 0)
1078                                conf->fullsync = 1;
1079                        rcu_assign_pointer(p->rdev, rdev);
1080                        break;
1081                }
1082        md_integrity_add_rdev(rdev, mddev);
1083        print_conf(conf);
1084        return err;
1085}
1086
1087static int raid1_remove_disk(mddev_t *mddev, int number)
1088{
1089        conf_t *conf = mddev->private;
1090        int err = 0;
1091        mdk_rdev_t *rdev;
1092        mirror_info_t *p = conf->mirrors+ number;
1093
1094        print_conf(conf);
1095        rdev = p->rdev;
1096        if (rdev) {
1097                if (test_bit(In_sync, &rdev->flags) ||
1098                    atomic_read(&rdev->nr_pending)) {
1099                        err = -EBUSY;
1100                        goto abort;
1101                }
1102                /* Only remove non-faulty devices if recovery
1103                 * is not possible.
1104                 */
1105                if (!test_bit(Faulty, &rdev->flags) &&
1106                    !mddev->recovery_disabled &&
1107                    mddev->degraded < conf->raid_disks) {
1108                        err = -EBUSY;
1109                        goto abort;
1110                }
1111                p->rdev = NULL;
1112                synchronize_rcu();
1113                if (atomic_read(&rdev->nr_pending)) {
1114                        /* lost the race, try later */
1115                        err = -EBUSY;
1116                        p->rdev = rdev;
1117                        goto abort;
1118                }
1119                err = md_integrity_register(mddev);
1120        }
1121abort:
1122
1123        print_conf(conf);
1124        return err;
1125}
1126
1127
1128static void end_sync_read(struct bio *bio, int error)
1129{
1130        r1bio_t *r1_bio = bio->bi_private;
1131        int i;
1132
1133        for (i=r1_bio->mddev->raid_disks; i--; )
1134                if (r1_bio->bios[i] == bio)
1135                        break;
1136        BUG_ON(i < 0);
1137        update_head_pos(i, r1_bio);
1138        /*
1139         * we have read a block, now it needs to be re-written,
1140         * or re-read if the read failed.
1141         * We don't do much here, just schedule handling by raid1d
1142         */
1143        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1144                set_bit(R1BIO_Uptodate, &r1_bio->state);
1145
1146        if (atomic_dec_and_test(&r1_bio->remaining))
1147                reschedule_retry(r1_bio);
1148}
1149
1150static void end_sync_write(struct bio *bio, int error)
1151{
1152        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1153        r1bio_t *r1_bio = bio->bi_private;
1154        mddev_t *mddev = r1_bio->mddev;
1155        conf_t *conf = mddev->private;
1156        int i;
1157        int mirror=0;
1158
1159        for (i = 0; i < conf->raid_disks; i++)
1160                if (r1_bio->bios[i] == bio) {
1161                        mirror = i;
1162                        break;
1163                }
1164        if (!uptodate) {
1165                sector_t sync_blocks = 0;
1166                sector_t s = r1_bio->sector;
1167                long sectors_to_go = r1_bio->sectors;
 1168                /* make sure these bits don't get cleared. */
1169                do {
1170                        bitmap_end_sync(mddev->bitmap, s,
1171                                        &sync_blocks, 1);
1172                        s += sync_blocks;
1173                        sectors_to_go -= sync_blocks;
1174                } while (sectors_to_go > 0);
1175                md_error(mddev, conf->mirrors[mirror].rdev);
1176        }
1177
1178        update_head_pos(mirror, r1_bio);
1179
1180        if (atomic_dec_and_test(&r1_bio->remaining)) {
1181                sector_t s = r1_bio->sectors;
1182                put_buf(r1_bio);
1183                md_done_sync(mddev, s, uptodate);
1184        }
1185}
1186
1187static int fix_sync_read_error(r1bio_t *r1_bio)
1188{
1189        /* Try some synchronous reads of other devices to get
1190         * good data, much like with normal read errors.  Only
1191         * read into the pages we already have so we don't
1192         * need to re-issue the read request.
1193         * We don't need to freeze the array, because being in an
1194         * active sync request, there is no normal IO, and
1195         * no overlapping syncs.
1196         */
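        /*
         * Roughly: for each PAGE_SIZE chunk, try the other mirrors until
         * one read succeeds, then write the good data back to the other
         * devices taking part in the sync and re-read it to verify.
         */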
1197        mddev_t *mddev = r1_bio->mddev;
1198        conf_t *conf = mddev->private;
1199        struct bio *bio = r1_bio->bios[r1_bio->read_disk];
1200        sector_t sect = r1_bio->sector;
1201        int sectors = r1_bio->sectors;
1202        int idx = 0;
1203
1204        while(sectors) {
1205                int s = sectors;
1206                int d = r1_bio->read_disk;
1207                int success = 0;
1208                mdk_rdev_t *rdev;
1209                int start;
1210
1211                if (s > (PAGE_SIZE>>9))
1212                        s = PAGE_SIZE >> 9;
1213                do {
1214                        if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
 1215                                /* No rcu protection needed here; devices
1216                                 * can only be removed when no resync is
1217                                 * active, and resync is currently active
1218                                 */
1219                                rdev = conf->mirrors[d].rdev;
1220                                if (sync_page_io(rdev,
1221                                                 sect,
1222                                                 s<<9,
1223                                                 bio->bi_io_vec[idx].bv_page,
1224                                                 READ, false)) {
1225                                        success = 1;
1226                                        break;
1227                                }
1228                        }
1229                        d++;
1230                        if (d == conf->raid_disks)
1231                                d = 0;
1232                } while (!success && d != r1_bio->read_disk);
1233
1234                if (!success) {
1235                        char b[BDEVNAME_SIZE];
1236                        /* Cannot read from anywhere, array is toast */
1237                        md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1238                        printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1239                               " for block %llu\n",
1240                               mdname(mddev),
1241                               bdevname(bio->bi_bdev, b),
1242                               (unsigned long long)r1_bio->sector);
1243                        md_done_sync(mddev, r1_bio->sectors, 0);
1244                        put_buf(r1_bio);
1245                        return 0;
1246                }
1247
1248                start = d;
1249                /* write it back and re-read */
1250                while (d != r1_bio->read_disk) {
1251                        if (d == 0)
1252                                d = conf->raid_disks;
1253                        d--;
1254                        if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1255                                continue;
1256                        rdev = conf->mirrors[d].rdev;
1257                        if (sync_page_io(rdev,
1258                                         sect,
1259                                         s<<9,
1260                                         bio->bi_io_vec[idx].bv_page,
1261                                         WRITE, false) == 0) {
1262                                r1_bio->bios[d]->bi_end_io = NULL;
1263                                rdev_dec_pending(rdev, mddev);
1264                                md_error(mddev, rdev);
1265                        } else
1266                                atomic_add(s, &rdev->corrected_errors);
1267                }
1268                d = start;
1269                while (d != r1_bio->read_disk) {
1270                        if (d == 0)
1271                                d = conf->raid_disks;
1272                        d--;
1273                        if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1274                                continue;
1275                        rdev = conf->mirrors[d].rdev;
1276                        if (sync_page_io(rdev,
1277                                         sect,
1278                                         s<<9,
1279                                         bio->bi_io_vec[idx].bv_page,
1280                                         READ, false) == 0)
1281                                md_error(mddev, rdev);
1282                }
1283                sectors -= s;
1284                sect += s;
1285                idx ++;
1286        }
1287        set_bit(R1BIO_Uptodate, &r1_bio->state);
1288        set_bit(BIO_UPTODATE, &bio->bi_flags);
1289        return 1;
1290}
1291
1292static int process_checks(r1bio_t *r1_bio)
1293{
1294        /* We have read all readable devices.  If we haven't
1295         * got the block, then there is no hope left.
1296         * If we have, then we want to do a comparison
1297         * and skip the write if everything is the same.
1298         * If any blocks failed to read, then we need to
1299         * attempt an over-write
1300         */
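        /*
         * The first device that read successfully becomes the reference
         * copy; each other successful read is compared against it page by
         * page, and a device is rewritten when its read failed or, on a
         * 'repair' pass, when its data differs.
         */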
1301        mddev_t *mddev = r1_bio->mddev;
1302        conf_t *conf = mddev->private;
1303        int primary;
1304        int i;
1305
1306        for (primary = 0; primary < conf->raid_disks; primary++)
1307                if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1308                    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1309                        r1_bio->bios[primary]->bi_end_io = NULL;
1310                        rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1311                        break;
1312                }
1313        r1_bio->read_disk = primary;
1314        for (i = 0; i < conf->raid_disks; i++) {
1315                int j;
1316                int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1317                struct bio *pbio = r1_bio->bios[primary];
1318                struct bio *sbio = r1_bio->bios[i];
1319                int size;
1320
1321                if (r1_bio->bios[i]->bi_end_io != end_sync_read)
1322                        continue;
1323
1324                if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1325                        for (j = vcnt; j-- ; ) {
1326                                struct page *p, *s;
1327                                p = pbio->bi_io_vec[j].bv_page;
1328                                s = sbio->bi_io_vec[j].bv_page;
1329                                if (memcmp(page_address(p),
1330                                           page_address(s),
1331                                           PAGE_SIZE))
1332                                        break;
1333                        }
1334                } else
1335                        j = 0;
1336                if (j >= 0)
1337                        mddev->resync_mismatches += r1_bio->sectors;
1338                if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1339                              && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1340                        /* No need to write to this device. */
1341                        sbio->bi_end_io = NULL;
1342                        rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1343                        continue;
1344                }
1345                /* fixup the bio for reuse */
1346                sbio->bi_vcnt = vcnt;
1347                sbio->bi_size = r1_bio->sectors << 9;
1348                sbio->bi_idx = 0;
1349                sbio->bi_phys_segments = 0;
1350                sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1351                sbio->bi_flags |= 1 << BIO_UPTODATE;
1352                sbio->bi_next = NULL;
1353                sbio->bi_sector = r1_bio->sector +
1354                        conf->mirrors[i].rdev->data_offset;
1355                sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1356                size = sbio->bi_size;
1357                for (j = 0; j < vcnt ; j++) {
1358                        struct bio_vec *bi;
1359                        bi = &sbio->bi_io_vec[j];
1360                        bi->bv_offset = 0;
1361                        if (size > PAGE_SIZE)
1362                                bi->bv_len = PAGE_SIZE;
1363                        else
1364                                bi->bv_len = size;
1365                        size -= PAGE_SIZE;
1366                        memcpy(page_address(bi->bv_page),
1367                               page_address(pbio->bi_io_vec[j].bv_page),
1368                               PAGE_SIZE);
1369                }
1370        }
1371        return 0;
1372}
1373
1374static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1375{
1376        conf_t *conf = mddev->private;
1377        int i;
1378        int disks = conf->raid_disks;
1379        struct bio *bio, *wbio;
1380
1381        bio = r1_bio->bios[r1_bio->read_disk];
1382
1383        if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
1384                /* ouch - failed to read all of that. */
1385                if (!fix_sync_read_error(r1_bio))
1386                        return;
1387
1388        if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1389                if (process_checks(r1_bio) < 0)
1390                        return;
1391        /*
1392         * schedule writes
1393         */
1394        atomic_set(&r1_bio->remaining, 1);
1395        for (i = 0; i < disks ; i++) {
1396                wbio = r1_bio->bios[i];
1397                if (wbio->bi_end_io == NULL ||
1398                    (wbio->bi_end_io == end_sync_read &&
1399                     (i == r1_bio->read_disk ||
1400                      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1401                        continue;
1402
1403                wbio->bi_rw = WRITE;
1404                wbio->bi_end_io = end_sync_write;
1405                atomic_inc(&r1_bio->remaining);
1406                md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1407
1408                generic_make_request(wbio);
1409        }
1410
1411        if (atomic_dec_and_test(&r1_bio->remaining)) {
1412                /* if we're here, all write(s) have completed, so clean up */
1413                md_done_sync(mddev, r1_bio->sectors, 1);
1414                put_buf(r1_bio);
1415        }
1416}
1417
1418/*
1419 * This is a kernel thread which:
1420 *
1421 *      1.      Retries failed read operations on working mirrors.
1422 *      2.      Updates the raid superblock when problems are encountered.
1423 *      3.      Performs writes following reads for array synchronising.
1424 */
1425
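    /*
     * Try to repair a read error in place.  Working a page at a time, find
     * any In_sync device that can still read the data, write that data back
     * to the other In_sync devices, then re-read to verify the fix and count
     * the corrected sectors.  If no device can supply the data we give up
     * and fail the original read device via md_error().
     */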
1426static void fix_read_error(conf_t *conf, int read_disk,
1427                           sector_t sect, int sectors)
1428{
1429        mddev_t *mddev = conf->mddev;
1430        while(sectors) {
1431                int s = sectors;
1432                int d = read_disk;
1433                int success = 0;
1434                int start;
1435                mdk_rdev_t *rdev;
1436
1437                if (s > (PAGE_SIZE>>9))
1438                        s = PAGE_SIZE >> 9;
1439
1440                do {
1441                        /* Note: no rcu protection needed here
1442                         * as this is synchronous in the raid1d thread
1443                         * which is the thread that might remove
1444                         * a device.  If raid1d ever becomes multi-threaded....
1445                         */
1446                        rdev = conf->mirrors[d].rdev;
1447                        if (rdev &&
1448                            test_bit(In_sync, &rdev->flags) &&
1449                            sync_page_io(rdev, sect, s<<9,
1450                                         conf->tmppage, READ, false))
1451                                success = 1;
1452                        else {
1453                                d++;
1454                                if (d == conf->raid_disks)
1455                                        d = 0;
1456                        }
1457                } while (!success && d != read_disk);
1458
1459                if (!success) {
1460                        /* Cannot read from anywhere -- bye bye array */
1461                        md_error(mddev, conf->mirrors[read_disk].rdev);
1462                        break;
1463                }
1464                /* write it back and re-read */
1465                start = d;
1466                while (d != read_disk) {
1467                        if (d==0)
1468                                d = conf->raid_disks;
1469                        d--;
1470                        rdev = conf->mirrors[d].rdev;
1471                        if (rdev &&
1472                            test_bit(In_sync, &rdev->flags)) {
1473                                if (sync_page_io(rdev, sect, s<<9,
1474                                                 conf->tmppage, WRITE, false)
1475                                    == 0)
1476                                        /* Well, this device is dead */
1477                                        md_error(mddev, rdev);
1478                        }
1479                }
1480                d = start;
1481                while (d != read_disk) {
1482                        char b[BDEVNAME_SIZE];
1483                        if (d==0)
1484                                d = conf->raid_disks;
1485                        d--;
1486                        rdev = conf->mirrors[d].rdev;
1487                        if (rdev &&
1488                            test_bit(In_sync, &rdev->flags)) {
1489                                if (sync_page_io(rdev, sect, s<<9,
1490                                                 conf->tmppage, READ, false)
1491                                    == 0)
1492                                        /* Well, this device is dead */
1493                                        md_error(mddev, rdev);
1494                                else {
1495                                        atomic_add(s, &rdev->corrected_errors);
1496                                        printk(KERN_INFO
1497                                               "md/raid1:%s: read error corrected "
1498                                               "(%d sectors at %llu on %s)\n",
1499                                               mdname(mddev), s,
1500                                               (unsigned long long)(sect +
1501                                                   rdev->data_offset),
1502                                               bdevname(rdev->bdev, b));
1503                                }
1504                        }
1505                }
1506                sectors -= s;
1507                sect += s;
1508        }
1509}
1510
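    /*
     * Main service loop for the array: flush queued writes (when no plug is
     * held), then take r1_bios off conf->retry_list one at a time.  Sync
     * r1_bios go to sync_request_write(); normal r1_bios that failed a read
     * are repaired with fix_read_error() while the array is frozen (if it is
     * writable) and the read is then retried on a mirror chosen by
     * read_balance().
     */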
1511static void raid1d(mddev_t *mddev)
1512{
1513        r1bio_t *r1_bio;
1514        struct bio *bio;
1515        unsigned long flags;
1516        conf_t *conf = mddev->private;
1517        struct list_head *head = &conf->retry_list;
1518        mdk_rdev_t *rdev;
1519        struct blk_plug plug;
1520
1521        md_check_recovery(mddev);
1522
1523        blk_start_plug(&plug);
1524        for (;;) {
1525                char b[BDEVNAME_SIZE];
1526
1527                if (atomic_read(&mddev->plug_cnt) == 0)
1528                        flush_pending_writes(conf);
1529
1530                spin_lock_irqsave(&conf->device_lock, flags);
1531                if (list_empty(head)) {
1532                        spin_unlock_irqrestore(&conf->device_lock, flags);
1533                        break;
1534                }
1535                r1_bio = list_entry(head->prev, r1bio_t, retry_list);
1536                list_del(head->prev);
1537                conf->nr_queued--;
1538                spin_unlock_irqrestore(&conf->device_lock, flags);
1539
1540                mddev = r1_bio->mddev;
1541                conf = mddev->private;
1542                if (test_bit(R1BIO_IsSync, &r1_bio->state))
1543                        sync_request_write(mddev, r1_bio);
1544                else {
1545                        int disk;
1546
1547                        /* we got a read error. Maybe the drive is bad.  Maybe just
1548                         * the block is bad and we can fix it.
1549                         * We freeze all other IO, and try reading the block from
1550                         * other devices.  When we find one, we re-write
1551                         * and check whether that fixes the read error.
1552                         * This is all done synchronously while the array is
1553                         * frozen
1554                         */
1555                        if (mddev->ro == 0) {
1556                                freeze_array(conf);
1557                                fix_read_error(conf, r1_bio->read_disk,
1558                                               r1_bio->sector,
1559                                               r1_bio->sectors);
1560                                unfreeze_array(conf);
1561                        } else
1562                                md_error(mddev,
1563                                         conf->mirrors[r1_bio->read_disk].rdev);
1564
1565                        bio = r1_bio->bios[r1_bio->read_disk];
1566                        if ((disk=read_balance(conf, r1_bio)) == -1) {
1567                                printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1568                                       " read error for block %llu\n",
1569                                       mdname(mddev),
1570                                       bdevname(bio->bi_bdev,b),
1571                                       (unsigned long long)r1_bio->sector);
1572                                raid_end_bio_io(r1_bio);
1573                        } else {
1574                                const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
1575                                r1_bio->bios[r1_bio->read_disk] =
1576                                        mddev->ro ? IO_BLOCKED : NULL;
1577                                r1_bio->read_disk = disk;
1578                                bio_put(bio);
1579                                bio = bio_clone_mddev(r1_bio->master_bio,
1580                                                      GFP_NOIO, mddev);
1581                                r1_bio->bios[r1_bio->read_disk] = bio;
1582                                rdev = conf->mirrors[disk].rdev;
1583                                if (printk_ratelimit())
1584                                        printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
1585                                               " other mirror: %s\n",
1586                                               mdname(mddev),
1587                                               (unsigned long long)r1_bio->sector,
1588                                               bdevname(rdev->bdev,b));
1589                                bio->bi_sector = r1_bio->sector + rdev->data_offset;
1590                                bio->bi_bdev = rdev->bdev;
1591                                bio->bi_end_io = raid1_end_read_request;
1592                                bio->bi_rw = READ | do_sync;
1593                                bio->bi_private = r1_bio;
1594                                generic_make_request(bio);
1595                        }
1596                }
1597                cond_resched();
1598        }
1599        blk_finish_plug(&plug);
1600}
1601
1602
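    /*
     * Allocate the resync buffer pool used by sync_request(), sized to keep
     * one resync window's worth of r1buf entries in flight.
     */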
1603static int init_resync(conf_t *conf)
1604{
1605        int buffs;
1606
1607        buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1608        BUG_ON(conf->r1buf_pool);
1609        conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
1610                                          conf->poolinfo);
1611        if (!conf->r1buf_pool)
1612                return -ENOMEM;
1613        conf->next_resync = 0;
1614        return 0;
1615}
1616
1617/*
1618 * perform a "sync" on one "block"
1619 *
1620 * We need to make sure that no normal I/O requests - particularly write
1621 * requests - conflict with active sync requests.
1622 *
1623 * This is achieved by tracking pending requests and a 'barrier' concept
1624 * that can be installed to exclude normal IO requests.
1625 */
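    /*
     * In outline (a simplified sketch of the scheme, not the exact code):
     *
     *      raise_barrier(conf);    blocks new normal IO and waits for
     *                              pending IO to drain
     *      ... submit resync reads/writes ...
     *      lower_barrier(conf);    lets normal IO proceed again
     *
     * while make_request() counts each normal request in and out around its
     * submission and sleeps whenever the barrier is raised.
     */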
1626
1627static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1628{
1629        conf_t *conf = mddev->private;
1630        r1bio_t *r1_bio;
1631        struct bio *bio;
1632        sector_t max_sector, nr_sectors;
1633        int disk = -1;
1634        int i;
1635        int wonly = -1;
1636        int write_targets = 0, read_targets = 0;
1637        sector_t sync_blocks;
1638        int still_degraded = 0;
1639
1640        if (!conf->r1buf_pool)
1641                if (init_resync(conf))
1642                        return 0;
1643
1644        max_sector = mddev->dev_sectors;
1645        if (sector_nr >= max_sector) {
1646                /* If we aborted, we need to abort the
1647                 * sync on the 'current' bitmap chunk (there will
1648                 * only be one in raid1 resync).
1649                 * We can find the current address in mddev->curr_resync
1650                 */
1651                if (mddev->curr_resync < max_sector) /* aborted */
1652                        bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1653                                                &sync_blocks, 1);
1654                else /* completed sync */
1655                        conf->fullsync = 0;
1656
1657                bitmap_close_sync(mddev->bitmap);
1658                close_sync(conf);
1659                return 0;
1660        }
1661
1662        if (mddev->bitmap == NULL &&
1663            mddev->recovery_cp == MaxSector &&
1664            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1665            conf->fullsync == 0) {
1666                *skipped = 1;
1667                return max_sector - sector_nr;
1668        }
1669        /* before building a request, check if we can skip these blocks.
1670         * This call to bitmap_start_sync doesn't actually record anything
1671         */
1672        if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1673            !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1674                /* We can skip this block, and probably several more */
1675                *skipped = 1;
1676                return sync_blocks;
1677        }
1678        /*
1679         * If there is non-resync activity waiting for a turn,
1680         * and resync is going fast enough,
1681         * then let it through before starting on this new sync request.
1682         */
1683        if (!go_faster && conf->nr_waiting)
1684                msleep_interruptible(1000);
1685
1686        bitmap_cond_end_sync(mddev->bitmap, sector_nr);
1687        r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1688        raise_barrier(conf);
1689
1690        conf->next_resync = sector_nr;
1691
1692        rcu_read_lock();
1693        /*
1694         * If we get a correctable read error during resync or recovery,
1695         * we might want to read from a different device.  So we
1696         * flag all drives that could conceivably be read from for READ,
1697         * and any others (which will be non-In_sync devices) for WRITE.
1698         * If a read fails, we try reading from something else for which READ
1699         * is OK.
1700         */
1701
1702        r1_bio->mddev = mddev;
1703        r1_bio->sector = sector_nr;
1704        r1_bio->state = 0;
1705        set_bit(R1BIO_IsSync, &r1_bio->state);
1706
1707        for (i=0; i < conf->raid_disks; i++) {
1708                mdk_rdev_t *rdev;
1709                bio = r1_bio->bios[i];
1710
1711                /* take from bio_init */
1712                bio->bi_next = NULL;
1713                bio->bi_flags &= ~(BIO_POOL_MASK-1);
1714                bio->bi_flags |= 1 << BIO_UPTODATE;
1715                bio->bi_comp_cpu = -1;
1716                bio->bi_rw = READ;
1717                bio->bi_vcnt = 0;
1718                bio->bi_idx = 0;
1719                bio->bi_phys_segments = 0;
1720                bio->bi_size = 0;
1721                bio->bi_end_io = NULL;
1722                bio->bi_private = NULL;
1723
1724                rdev = rcu_dereference(conf->mirrors[i].rdev);
1725                if (rdev == NULL ||
1726                           test_bit(Faulty, &rdev->flags)) {
1727                        still_degraded = 1;
1728                        continue;
1729                } else if (!test_bit(In_sync, &rdev->flags)) {
1730                        bio->bi_rw = WRITE;
1731                        bio->bi_end_io = end_sync_write;
1732                        write_targets ++;
1733                } else {
1734                        /* may need to read from here */
1735                        bio->bi_rw = READ;
1736                        bio->bi_end_io = end_sync_read;
1737                        if (test_bit(WriteMostly, &rdev->flags)) {
1738                                if (wonly < 0)
1739                                        wonly = i;
1740                        } else {
1741                                if (disk < 0)
1742                                        disk = i;
1743                        }
1744                        read_targets++;
1745                }
1746                atomic_inc(&rdev->nr_pending);
1747                bio->bi_sector = sector_nr + rdev->data_offset;
1748                bio->bi_bdev = rdev->bdev;
1749                bio->bi_private = r1_bio;
1750        }
1751        rcu_read_unlock();
1752        if (disk < 0)
1753                disk = wonly;
1754        r1_bio->read_disk = disk;
1755
1756        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1757                /* extra read targets are also write targets */
1758                write_targets += read_targets-1;
1759
1760        if (write_targets == 0 || read_targets == 0) {
1761                /* There is nowhere to write, so all non-sync
1762                 * drives must have failed - so we are finished
1763                 */
1764                sector_t rv = max_sector - sector_nr;
1765                *skipped = 1;
1766                put_buf(r1_bio);
1767                return rv;
1768        }
1769
1770        if (max_sector > mddev->resync_max)
1771                max_sector = mddev->resync_max; /* Don't do IO beyond here */
1772        nr_sectors = 0;
1773        sync_blocks = 0;
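            /*
             * Fill the active bios a page at a time, consulting the bitmap
             * so already-in-sync blocks can be skipped.  If bio_add_page()
             * refuses a page, back the last page out of the earlier bios
             * and submit what has been built so far.
             */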
1774        do {
1775                struct page *page;
1776                int len = PAGE_SIZE;
1777                if (sector_nr + (len>>9) > max_sector)
1778                        len = (max_sector - sector_nr) << 9;
1779                if (len == 0)
1780                        break;
1781                if (sync_blocks == 0) {
1782                        if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1783                                               &sync_blocks, still_degraded) &&
1784                            !conf->fullsync &&
1785                            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1786                                break;
1787                        BUG_ON(sync_blocks < (PAGE_SIZE>>9));
1788                        if ((len >> 9) > sync_blocks)
1789                                len = sync_blocks<<9;
1790                }
1791
1792                for (i=0 ; i < conf->raid_disks; i++) {
1793                        bio = r1_bio->bios[i];
1794                        if (bio->bi_end_io) {
1795                                page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1796                                if (bio_add_page(bio, page, len, 0) == 0) {
1797                                        /* stop here */
1798                                        bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1799                                        while (i > 0) {
1800                                                i--;
1801                                                bio = r1_bio->bios[i];
1802                                                if (bio->bi_end_io==NULL)
1803                                                        continue;
1804                                                /* remove last page from this bio */
1805                                                bio->bi_vcnt--;
1806                                                bio->bi_size -= len;
1807                                                bio->bi_flags &= ~(1<< BIO_SEG_VALID);
1808                                        }
1809                                        goto bio_full;
1810                                }
1811                        }
1812                }
1813                nr_sectors += len>>9;
1814                sector_nr += len>>9;
1815                sync_blocks -= (len>>9);
1816        } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1817 bio_full:
1818        r1_bio->sectors = nr_sectors;
1819
1820        /* For a user-requested sync, we read all readable devices and do a
1821         * compare
1822         */
1823        if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1824                atomic_set(&r1_bio->remaining, read_targets);
1825                for (i=0; i<conf->raid_disks; i++) {
1826                        bio = r1_bio->bios[i];
1827                        if (bio->bi_end_io == end_sync_read) {
1828                                md_sync_acct(bio->bi_bdev, nr_sectors);
1829                                generic_make_request(bio);
1830                        }
1831                }
1832        } else {
1833                atomic_set(&r1_bio->remaining, 1);
1834                bio = r1_bio->bios[r1_bio->read_disk];
1835                md_sync_acct(bio->bi_bdev, nr_sectors);
1836                generic_make_request(bio);
1837
1838        }
1839        return nr_sectors;
1840}
1841
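    /*
     * Array capacity for RAID1 is simply the per-device size: mirroring
     * adds redundancy, not space.
     */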
1842static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
1843{
1844        if (sectors)
1845                return sectors;
1846
1847        return mddev->dev_sectors;
1848}
1849
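    /*
     * Build the per-array conf_t: allocate the mirrors array, the spare page
     * used for read-error correction, and the r1bio mempool; record the
     * already-known rdevs; pick the first working device as the initial
     * read-balancing target; and register the raid1d thread.
     */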
1850static conf_t *setup_conf(mddev_t *mddev)
1851{
1852        conf_t *conf;
1853        int i;
1854        mirror_info_t *disk;
1855        mdk_rdev_t *rdev;
1856        int err = -ENOMEM;
1857
1858        conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1859        if (!conf)
1860                goto abort;
1861
1862        conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1863                                 GFP_KERNEL);
1864        if (!conf->mirrors)
1865                goto abort;
1866
1867        conf->tmppage = alloc_page(GFP_KERNEL);
1868        if (!conf->tmppage)
1869                goto abort;
1870
1871        conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1872        if (!conf->poolinfo)
1873                goto abort;
1874        conf->poolinfo->raid_disks = mddev->raid_disks;
1875        conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1876                                          r1bio_pool_free,
1877                                          conf->poolinfo);
1878        if (!conf->r1bio_pool)
1879                goto abort;
1880
1881        conf->poolinfo->mddev = mddev;
1882
1883        spin_lock_init(&conf->device_lock);
1884        list_for_each_entry(rdev, &mddev->disks, same_set) {
1885                int disk_idx = rdev->raid_disk;
1886                if (disk_idx >= mddev->raid_disks
1887                    || disk_idx < 0)
1888                        continue;
1889                disk = conf->mirrors + disk_idx;
1890
1891                disk->rdev = rdev;
1892
1893                disk->head_position = 0;
1894        }
1895        conf->raid_disks = mddev->raid_disks;
1896        conf->mddev = mddev;
1897        INIT_LIST_HEAD(&conf->retry_list);
1898
1899        spin_lock_init(&conf->resync_lock);
1900        init_waitqueue_head(&conf->wait_barrier);
1901
1902        bio_list_init(&conf->pending_bio_list);
1903
1904        conf->last_used = -1;
1905        for (i = 0; i < conf->raid_disks; i++) {
1906
1907                disk = conf->mirrors + i;
1908
1909                if (!disk->rdev ||
1910                    !test_bit(In_sync, &disk->rdev->flags)) {
1911                        disk->head_position = 0;
1912                        if (disk->rdev)
1913                                conf->fullsync = 1;
1914                } else if (conf->last_used < 0)
1915                        /*
1916                         * The first working device is used as a
1917                         * starting point for read balancing.
1918                         */
1919                        conf->last_used = i;
1920        }
1921
1922        err = -EIO;
1923        if (conf->last_used < 0) {
1924                printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
1925                       mdname(mddev));
1926                goto abort;
1927        }
1928        err = -ENOMEM;
1929        conf->thread = md_register_thread(raid1d, mddev, NULL);
1930        if (!conf->thread) {
1931                printk(KERN_ERR
1932                       "md/raid1:%s: couldn't allocate thread\n",
1933                       mdname(mddev));
1934                goto abort;
1935        }
1936
1937        return conf;
1938
1939 abort:
1940        if (conf) {
1941                if (conf->r1bio_pool)
1942                        mempool_destroy(conf->r1bio_pool);
1943                kfree(conf->mirrors);
1944                safe_put_page(conf->tmppage);
1945                kfree(conf->poolinfo);
1946                kfree(conf);
1947        }
1948        return ERR_PTR(err);
1949}
1950
1951static int run(mddev_t *mddev)
1952{
1953        conf_t *conf;
1954        int i;
1955        mdk_rdev_t *rdev;
1956
1957        if (mddev->level != 1) {
1958                printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
1959                       mdname(mddev), mddev->level);
1960                return -EIO;
1961        }
1962        if (mddev->reshape_position != MaxSector) {
1963                printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
1964                       mdname(mddev));
1965                return -EIO;
1966        }
1967        /*
1968         * copy the already verified devices into our private RAID1
1969 * bookkeeping area. [whatever we allocate in run()
1970         * should be freed in stop()]
1971         */
1972        if (mddev->private == NULL)
1973                conf = setup_conf(mddev);
1974        else
1975                conf = mddev->private;
1976
1977        if (IS_ERR(conf))
1978                return PTR_ERR(conf);
1979
1980        list_for_each_entry(rdev, &mddev->disks, same_set) {
1981                if (!mddev->gendisk)
1982                        continue;
1983                disk_stack_limits(mddev->gendisk, rdev->bdev,
1984                                  rdev->data_offset << 9);
1985                /* as we don't honour merge_bvec_fn, we must never risk
1986                 * violating it, so limit ->max_segments to 1, lying within
1987                 * a single page, as a one-page request is never in violation.
1988                 */
1989                if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1990                        blk_queue_max_segments(mddev->queue, 1);
1991                        blk_queue_segment_boundary(mddev->queue,
1992                                                   PAGE_CACHE_SIZE - 1);
1993                }
1994        }
1995
1996        mddev->degraded = 0;
1997        for (i=0; i < conf->raid_disks; i++)
1998                if (conf->mirrors[i].rdev == NULL ||
1999                    !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
2000                    test_bit(Faulty, &conf->mirrors[i].rdev->flags))
2001                        mddev->degraded++;
2002
2003        if (conf->raid_disks - mddev->degraded == 1)
2004                mddev->recovery_cp = MaxSector;
2005
2006        if (mddev->recovery_cp != MaxSector)
2007                printk(KERN_NOTICE "md/raid1:%s: not clean"
2008                       " -- starting background reconstruction\n",
2009                       mdname(mddev));
2010        printk(KERN_INFO
2011                "md/raid1:%s: active with %d out of %d mirrors\n",
2012                mdname(mddev), mddev->raid_disks - mddev->degraded,
2013                mddev->raid_disks);
2014
2015        /*
2016         * Ok, everything is just fine now
2017         */
2018        mddev->thread = conf->thread;
2019        conf->thread = NULL;
2020        mddev->private = conf;
2021
2022        md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2023
2024        if (mddev->queue) {
2025                mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2026                mddev->queue->backing_dev_info.congested_data = mddev;
2027        }
2028        return md_integrity_register(mddev);
2029}
2030
2031static int stop(mddev_t *mddev)
2032{
2033        conf_t *conf = mddev->private;
2034        struct bitmap *bitmap = mddev->bitmap;
2035
2036        /* wait for behind writes to complete */
2037        if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
2038                printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
2039                       mdname(mddev));
2040                /* need to kick something here to make sure I/O goes? */
2041                wait_event(bitmap->behind_wait,
2042                           atomic_read(&bitmap->behind_writes) == 0);
2043        }
2044
2045        raise_barrier(conf);
2046        lower_barrier(conf);
2047
2048        md_unregister_thread(mddev->thread);
2049        mddev->thread = NULL;
2050        if (conf->r1bio_pool)
2051                mempool_destroy(conf->r1bio_pool);
2052        kfree(conf->mirrors);
2053        kfree(conf->poolinfo);
2054        kfree(conf);
2055        mddev->private = NULL;
2056        return 0;
2057}
2058
2059static int raid1_resize(mddev_t *mddev, sector_t sectors)
2060{
2061        /* no resync is happening, and there is enough space
2062         * on all devices, so we can resize.
2063         * We need to make sure resync covers any new space.
2064         * If the array is shrinking we should possibly wait until
2065         * any io in the removed space completes, but it hardly seems
2066         * worth it.
2067         */
2068        md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
2069        if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2070                return -EINVAL;
2071        set_capacity(mddev->gendisk, mddev->array_sectors);
2072        revalidate_disk(mddev->gendisk);
2073        if (sectors > mddev->dev_sectors &&
2074            mddev->recovery_cp > mddev->dev_sectors) {
2075                mddev->recovery_cp = mddev->dev_sectors;
2076                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2077        }
2078        mddev->dev_sectors = sectors;
2079        mddev->resync_max_sectors = sectors;
2080        return 0;
2081}
2082
2083static int raid1_reshape(mddev_t *mddev)
2084{
2085        /* We need to:
2086         * 1/ resize the r1bio_pool
2087         * 2/ resize conf->mirrors
2088         *
2089         * We allocate a new r1bio_pool if we can.
2090         * Then raise a device barrier and wait until all IO stops.
2091         * Then resize conf->mirrors and swap in the new r1bio pool.
2092         *
2093         * At the same time, we "pack" the devices so that all the missing
2094         * devices have the higher raid_disk numbers.
2095         */
2096        mempool_t *newpool, *oldpool;
2097        struct pool_info *newpoolinfo;
2098        mirror_info_t *newmirrors;
2099        conf_t *conf = mddev->private;
2100        int cnt, raid_disks;
2101        unsigned long flags;
2102        int d, d2, err;
2103
2104        /* Cannot change chunk_size, layout, or level */
2105        if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
2106            mddev->layout != mddev->new_layout ||
2107            mddev->level != mddev->new_level) {
2108                mddev->new_chunk_sectors = mddev->chunk_sectors;
2109                mddev->new_layout = mddev->layout;
2110                mddev->new_level = mddev->level;
2111                return -EINVAL;
2112        }
2113
2114        err = md_allow_write(mddev);
2115        if (err)
2116                return err;
2117
2118        raid_disks = mddev->raid_disks + mddev->delta_disks;
2119
2120        if (raid_disks < conf->raid_disks) {
2121                cnt=0;
2122                for (d= 0; d < conf->raid_disks; d++)
2123                        if (conf->mirrors[d].rdev)
2124                                cnt++;
2125                if (cnt > raid_disks)
2126                        return -EBUSY;
2127        }
2128
2129        newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
2130        if (!newpoolinfo)
2131                return -ENOMEM;
2132        newpoolinfo->mddev = mddev;
2133        newpoolinfo->raid_disks = raid_disks;
2134
2135        newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2136                                 r1bio_pool_free, newpoolinfo);
2137        if (!newpool) {
2138                kfree(newpoolinfo);
2139                return -ENOMEM;
2140        }
2141        newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
2142        if (!newmirrors) {
2143                kfree(newpoolinfo);
2144                mempool_destroy(newpool);
2145                return -ENOMEM;
2146        }
2147
2148        raise_barrier(conf);
2149
2150        /* ok, everything is stopped */
2151        oldpool = conf->r1bio_pool;
2152        conf->r1bio_pool = newpool;
2153
2154        for (d = d2 = 0; d < conf->raid_disks; d++) {
2155                mdk_rdev_t *rdev = conf->mirrors[d].rdev;
2156                if (rdev && rdev->raid_disk != d2) {
2157                        char nm[20];
2158                        sprintf(nm, "rd%d", rdev->raid_disk);
2159                        sysfs_remove_link(&mddev->kobj, nm);
2160                        rdev->raid_disk = d2;
2161                        sprintf(nm, "rd%d", rdev->raid_disk);
2162                        sysfs_remove_link(&mddev->kobj, nm);
2163                        if (sysfs_create_link(&mddev->kobj,
2164                                              &rdev->kobj, nm))
2165                                printk(KERN_WARNING
2166                                       "md/raid1:%s: cannot register "
2167                                       "%s\n",
2168                                       mdname(mddev), nm);
2169                }
2170                if (rdev)
2171                        newmirrors[d2++].rdev = rdev;
2172        }
2173        kfree(conf->mirrors);
2174        conf->mirrors = newmirrors;
2175        kfree(conf->poolinfo);
2176        conf->poolinfo = newpoolinfo;
2177
2178        spin_lock_irqsave(&conf->device_lock, flags);
2179        mddev->degraded += (raid_disks - conf->raid_disks);
2180        spin_unlock_irqrestore(&conf->device_lock, flags);
2181        conf->raid_disks = mddev->raid_disks = raid_disks;
2182        mddev->delta_disks = 0;
2183
2184        conf->last_used = 0; /* just make sure it is in-range */
2185        lower_barrier(conf);
2186
2187        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2188        md_wakeup_thread(mddev->thread);
2189
2190        mempool_destroy(oldpool);
2191        return 0;
2192}
2193
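    /*
     * state 1 quiesces the array by raising the resync barrier, state 0
     * resumes it by lowering the barrier again, and state 2 simply wakes
     * anything sleeping on the barrier (the "wake for suspend" case below).
     */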
2194static void raid1_quiesce(mddev_t *mddev, int state)
2195{
2196        conf_t *conf = mddev->private;
2197
2198        switch(state) {
2199        case 2: /* wake for suspend */
2200                wake_up(&conf->wait_barrier);
2201                break;
2202        case 1:
2203                raise_barrier(conf);
2204                break;
2205        case 0:
2206                lower_barrier(conf);
2207                break;
2208        }
2209}
2210
2211static void *raid1_takeover(mddev_t *mddev)
2212{
2213        /* raid1 can take over:
2214         *  raid5 with 2 devices, any layout or chunk size
2215         */
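        /*
         * This path is normally reached through a userspace level change,
         * e.g. something like "mdadm --grow /dev/mdX --level=1" on a
         * two-drive RAID5 (command given as an illustration only).
         */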
2216        if (mddev->level == 5 && mddev->raid_disks == 2) {
2217                conf_t *conf;
2218                mddev->new_level = 1;
2219                mddev->new_layout = 0;
2220                mddev->new_chunk_sectors = 0;
2221                conf = setup_conf(mddev);
2222                if (!IS_ERR(conf))
2223                        conf->barrier = 1;
2224                return conf;
2225        }
2226        return ERR_PTR(-EINVAL);
2227}
2228
2229static struct mdk_personality raid1_personality =
2230{
2231        .name           = "raid1",
2232        .level          = 1,
2233        .owner          = THIS_MODULE,
2234        .make_request   = make_request,
2235        .run            = run,
2236        .stop           = stop,
2237        .status         = status,
2238        .error_handler  = error,
2239        .hot_add_disk   = raid1_add_disk,
2240        .hot_remove_disk= raid1_remove_disk,
2241        .spare_active   = raid1_spare_active,
2242        .sync_request   = sync_request,
2243        .resize         = raid1_resize,
2244        .size           = raid1_size,
2245        .check_reshape  = raid1_reshape,
2246        .quiesce        = raid1_quiesce,
2247        .takeover       = raid1_takeover,
2248};
2249
2250static int __init raid_init(void)
2251{
2252        return register_md_personality(&raid1_personality);
2253}
2254
2255static void raid_exit(void)
2256{
2257        unregister_md_personality(&raid1_personality);
2258}
2259
2260module_init(raid_init);
2261module_exit(raid_exit);
2262MODULE_LICENSE("GPL");
2263MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
2264MODULE_ALIAS("md-personality-3"); /* RAID1 */
2265MODULE_ALIAS("md-raid1");
2266MODULE_ALIAS("md-level-1");
2267