#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"
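/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *
 * The data to be stored is divided into chunks of chunk_size and each
 * device is divided into far_copies sections.  Within each section,
 * chunks are laid out raid0-style with near_copies copies of each chunk
 * placed on adjacent devices, so every chunk exists near_copies*far_copies
 * times, each copy on a different device.  If far_offset is set, the far
 * copies live on the stripes immediately following the original rather
 * than in a distant section of each device.
 */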
#define NR_RAID10_BIOS 256

static void unplug_slaves(mddev_t *mddev);

static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	conf_t *conf = data;
	r10bio_t *r10_bio;
	int size = offsetof(struct r10bio_s, devs[conf->copies]);

	/* allocate an r10bio with room for 'copies' entries in the devs array */
	r10_bio = kzalloc(size, gfp_flags);
	if (!r10_bio && conf->mddev)
		unplug_slaves(conf->mddev);

	return r10_bio;
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
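/*
 * When performing a resync we need to read and compare, so we need as many
 * bios (and their pages) as there are copies.  When performing a recovery
 * only two bios are needed: one to read from a good drive and one to write
 * to the drive being recovered.
 */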
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	conf_t *conf = data;
	struct page *page;
	r10bio_t *r10_bio;
	struct bio *bio;
	int i, j;
	int nalloc;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio) {
		unplug_slaves(conf->mddev);
		return NULL;
	}

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them to each bio.
	 */
	for (j = 0 ; j < nalloc; j++) {
		bio = r10_bio->devs[j].bio;
		for (i = 0; i < RESYNC_PAGES; i++) {
			page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
		}
	}

	return r10_bio;

out_free_pages:
	for ( ; i > 0 ; i--)
		safe_put_page(bio->bi_io_vec[i-1].bv_page);
	while (j--)
		for (i = 0; i < RESYNC_PAGES ; i++)
			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
	j = -1;
out_free_bio:
	while (++j < nalloc)
		bio_put(r10_bio->devs[j].bio);
	r10bio_pool_free(r10_bio, conf);
	return NULL;
}

static void r10buf_pool_free(void *__r10_bio, void *data)
{
	int i;
	conf_t *conf = data;
	r10bio_t *r10bio = __r10_bio;
	int j;

	for (j = 0; j < conf->copies; j++) {
		struct bio *bio = r10bio->devs[j].bio;
		if (bio) {
			for (i = 0; i < RESYNC_PAGES; i++) {
				safe_put_page(bio->bi_io_vec[i].bv_page);
				bio->bi_io_vec[i].bv_page = NULL;
			}
			bio_put(bio);
		}
	}
	r10bio_pool_free(r10bio, conf);
}
static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
{
	int i;

	for (i = 0; i < conf->copies; i++) {
		struct bio **bio = &r10_bio->devs[i].bio;
		if (*bio && *bio != IO_BLOCKED)
			bio_put(*bio);
		*bio = NULL;
	}
}

static void free_r10bio(r10bio_t *r10_bio)
{
	conf_t *conf = r10_bio->mddev->private;

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	allow_barrier(conf);

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, conf->r10bio_pool);
}

static void put_buf(r10bio_t *r10_bio)
{
	conf_t *conf = r10_bio->mddev->private;

	mempool_free(r10_bio, conf->r10buf_pool);

	lower_barrier(conf);
}

static void reschedule_retry(r10bio_t *r10_bio)
{
	unsigned long flags;
	mddev_t *mddev = r10_bio->mddev;
	conf_t *conf = mddev->private;

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	conf->nr_queued++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	/* wake up a possibly frozen array */
	wake_up(&conf->wait_barrier);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the
 * original bio.
 */
static void raid_end_bio_io(r10bio_t *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;

	bio_endio(bio,
		test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, r10bio_t *r10_bio)
{
	conf_t *conf = r10_bio->mddev->private;

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}
static void raid10_end_read_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t *r10_bio = bio->bi_private;
	int slot, dev;
	conf_t *conf = r10_bio->mddev->private;

	slot = r10_bio->read_slot;
	dev = r10_bio->devs[slot].devnum;
	/*
	 * This code assumes the read was dispatched to exactly one device,
	 * identified by read_slot.
	 */
	update_head_pos(slot, r10_bio);

	if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that the
		 * end_request handler calls bio_endio with success.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);
		raid_end_bio_io(r10_bio);
	} else {
		/*
		 * The read failed; don't report the error yet - reschedule
		 * the request so raid10d can retry it on another mirror.
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n",
			       mdname(conf->mddev),
			       bdevname(conf->mirrors[dev].rdev->bdev, b),
			       (unsigned long long)r10_bio->sector);
		reschedule_retry(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}

static void raid10_end_write_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t *r10_bio = bio->bi_private;
	int slot, dev;
	conf_t *conf = r10_bio->mddev->private;

	for (slot = 0; slot < conf->copies; slot++)
		if (r10_bio->devs[slot].bio == bio)
			break;
	dev = r10_bio->devs[slot].devnum;

	/*
	 * If the write failed, mark the device faulty and record the r10bio
	 * as degraded so the bitmap keeps this region dirty.
	 */
	if (!uptodate) {
		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
		/* an I/O failed, we can't clear the bitmap */
		set_bit(R10BIO_Degraded, &r10_bio->state);
	} else
		/*
		 * Set R10BIO_Uptodate in our master bio, so that the
		 * end_request handler calls bio_endio with success.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);

	update_head_pos(slot, r10_bio);

	/*
	 * Let's see if all mirrored write operations have finished already.
	 */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		/* clear the bitmap if all writes complete successfully */
		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
				r10_bio->sectors,
				!test_bit(R10BIO_Degraded, &r10_bio->state),
				0);
		md_write_end(r10_bio->mddev);
		raid_end_bio_io(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}
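/*
 * raid10_find_phys() maps the virtual sector of an r10bio onto the
 * (device, sector) pairs of all of its copies.  Each of the near_copies
 * is placed on the next device in sequence; the far copies of that chunk
 * are then placed one 'stride' further along on devices offset by
 * near_copies.  devs[] is filled with one entry per copy and must
 * therefore hold conf->copies entries.
 */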
static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
{
	int n, f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;
	int slot = 0;

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> conf->chunk_shift;
	sector = r10bio->sector & conf->chunk_mask;

	chunk *= conf->near_copies;
	stripe = chunk;
	dev = sector_div(stripe, conf->raid_disks);
	if (conf->far_offset)
		stripe *= conf->far_copies;

	sector += stripe << conf->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < conf->near_copies; n++) {
		int d = dev;
		sector_t s = sector;
		r10bio->devs[slot].addr = sector;
		r10bio->devs[slot].devnum = d;
		slot++;

		for (f = 1; f < conf->far_copies; f++) {
			d += conf->near_copies;
			if (d >= conf->raid_disks)
				d -= conf->raid_disks;
			s += conf->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		if (dev >= conf->raid_disks) {
			dev = 0;
			sector += (conf->chunk_mask + 1);
		}
	}
	BUG_ON(slot != conf->copies);
}

static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;

	offset = sector & conf->chunk_mask;
	if (conf->far_offset) {
		int fc;
		chunk = sector >> conf->chunk_shift;
		fc = sector_div(chunk, conf->far_copies);
		dev -= fc * conf->near_copies;
		if (dev < 0)
			dev += conf->raid_disks;
	} else {
		while (sector >= conf->stride) {
			sector -= conf->stride;
			if (dev < conf->near_copies)
				dev += conf->raid_disks - conf->near_copies;
			else
				dev -= conf->near_copies;
		}
		chunk = sector >> conf->chunk_shift;
	}
	vchunk = chunk * conf->raid_disks + dev;
	sector_div(vchunk, conf->near_copies);
	return (vchunk << conf->chunk_shift) + offset;
}
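/**
 *	raid10_mergeable_bvec -- tell bio layer if two requests can be merged
 *	@q: request queue
 *	@bvm: properties of new bio
 *	@biovec: the request that could be merged to it.
 *
 *	Return amount of bytes we can accept at this offset.
 *	If near_copies == raid_disks there are no striping issues,
 *	but in that case the function isn't called at all.
 */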
static int raid10_mergeable_bvec(struct request_queue *q,
				 struct bvec_merge_data *bvm,
				 struct bio_vec *biovec)
{
	mddev_t *mddev = q->queuedata;
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	int max;
	unsigned int chunk_sectors = mddev->chunk_sectors;
	unsigned int bio_sectors = bvm->bi_size >> 9;

	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	if (max <= biovec->bv_len && bio_sectors == 0)
		return biovec->bv_len;
	else
		return max;
}
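/*
 * This routine returns the disk from which the requested read should
 * be done.  A per-disk 'last known head position' sector is maintained
 * from IRQ context by the normal and resync IO completion handlers, and
 * the disk whose head is closest to the request is picked; a completely
 * idle disk is preferred when there is more than one near copy.  Reads
 * that reach into the resync window always go to the first working copy.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */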
static int read_balance(conf_t *conf, r10bio_t *r10_bio)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot, nslot;
	const int sectors = r10_bio->sectors;
	sector_t new_distance, current_distance;
	mdk_rdev_t *rdev;

	raid10_find_phys(conf, r10_bio);
	rcu_read_lock();
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
	if (conf->mddev->recovery_cp < MaxSector
	    && (this_sector + sectors >= conf->next_resync)) {
		/* make sure that disk is operational */
		slot = 0;
		disk = r10_bio->devs[slot].devnum;

		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
		       r10_bio->devs[slot].bio == IO_BLOCKED ||
		       !test_bit(In_sync, &rdev->flags)) {
			slot++;
			if (slot == conf->copies) {
				slot = 0;
				disk = -1;
				break;
			}
			disk = r10_bio->devs[slot].devnum;
		}
		goto rb_out;
	}

	/* make sure the disk is operational */
	slot = 0;
	disk = r10_bio->devs[slot].devnum;
	while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
	       r10_bio->devs[slot].bio == IO_BLOCKED ||
	       !test_bit(In_sync, &rdev->flags)) {
		slot++;
		if (slot == conf->copies) {
			disk = -1;
			goto rb_out;
		}
		disk = r10_bio->devs[slot].devnum;
	}

	current_distance = abs(r10_bio->devs[slot].addr -
			       conf->mirrors[disk].head_position);

	/* Find the disk whose head is closest,
	 * starting from the first usable copy found above.
	 */
	for (nslot = slot; nslot < conf->copies; nslot++) {
		int ndisk = r10_bio->devs[nslot].devnum;

		if ((rdev = rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
		    !test_bit(In_sync, &rdev->flags))
			continue;

		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays.
		 */
		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
			disk = ndisk;
			slot = nslot;
			break;
		}

		/* for far > 1 always use the lowest address */
		if (conf->far_copies > 1)
			new_distance = r10_bio->devs[nslot].addr;
		else
			new_distance = abs(r10_bio->devs[nslot].addr -
					   conf->mirrors[ndisk].head_position);
		if (new_distance < current_distance) {
			current_distance = new_distance;
			disk = ndisk;
			slot = nslot;
		}
	}

rb_out:
	r10_bio->read_slot = slot;

	if (disk >= 0 && (rdev = rcu_dereference(conf->mirrors[disk].rdev)) != NULL)
		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
	else
		disk = -1;
	rcu_read_unlock();

	return disk;
}
599
600static void unplug_slaves(mddev_t *mddev)
601{
602 conf_t *conf = mddev->private;
603 int i;
604
605 rcu_read_lock();
606 for (i=0; i < conf->raid_disks; i++) {
607 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
608 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
609 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
610
611 atomic_inc(&rdev->nr_pending);
612 rcu_read_unlock();
613
614 blk_unplug(r_queue);
615
616 rdev_dec_pending(rdev, mddev);
617 rcu_read_lock();
618 }
619 }
620 rcu_read_unlock();
621}
622
623static void raid10_unplug(struct request_queue *q)
624{
625 mddev_t *mddev = q->queuedata;
626
627 unplug_slaves(q->queuedata);
628 md_wakeup_thread(mddev->thread);
629}
630
631static int raid10_congested(void *data, int bits)
632{
633 mddev_t *mddev = data;
634 conf_t *conf = mddev->private;
635 int i, ret = 0;
636
637 if (mddev_congested(mddev, bits))
638 return 1;
639 rcu_read_lock();
640 for (i = 0; i < conf->raid_disks && ret == 0; i++) {
641 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
642 if (rdev && !test_bit(Faulty, &rdev->flags)) {
643 struct request_queue *q = bdev_get_queue(rdev->bdev);
644
645 ret |= bdi_congested(&q->backing_dev_info, bits);
646 }
647 }
648 rcu_read_unlock();
649 return ret;
650}
651
652static int flush_pending_writes(conf_t *conf)
653{
654
655
656
657
658 int rv = 0;
659
660 spin_lock_irq(&conf->device_lock);
661
662 if (conf->pending_bio_list.head) {
663 struct bio *bio;
664 bio = bio_list_get(&conf->pending_bio_list);
665
666 spin_lock(conf->mddev->queue->queue_lock);
667 blk_remove_plug(conf->mddev->queue);
668 spin_unlock(conf->mddev->queue->queue_lock);
669 spin_unlock_irq(&conf->device_lock);
670
671
672 bitmap_unplug(conf->mddev->bitmap);
673
674 while (bio) {
675 struct bio *next = bio->bi_next;
676 bio->bi_next = NULL;
677 generic_make_request(bio);
678 bio = next;
679 }
680 rv = 1;
681 } else
682 spin_unlock_irq(&conf->device_lock);
683 return rv;
684}
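/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery or reconfiguring the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO, i.e.
 * if nr_pending == 0, and we only raise it if no-one is waiting
 * for the barrier to go down.  This means that as soon as an IO
 * request is ready, no other operations which require a barrier
 * will start until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the resync completes.
 */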
707static void raise_barrier(conf_t *conf, int force)
708{
709 BUG_ON(force && !conf->barrier);
710 spin_lock_irq(&conf->resync_lock);
711
712
713 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
714 conf->resync_lock,
715 raid10_unplug(conf->mddev->queue));
716
717
718 conf->barrier++;
719
720
721 wait_event_lock_irq(conf->wait_barrier,
722 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
723 conf->resync_lock,
724 raid10_unplug(conf->mddev->queue));
725
726 spin_unlock_irq(&conf->resync_lock);
727}
728
729static void lower_barrier(conf_t *conf)
730{
731 unsigned long flags;
732 spin_lock_irqsave(&conf->resync_lock, flags);
733 conf->barrier--;
734 spin_unlock_irqrestore(&conf->resync_lock, flags);
735 wake_up(&conf->wait_barrier);
736}
737
738static void wait_barrier(conf_t *conf)
739{
740 spin_lock_irq(&conf->resync_lock);
741 if (conf->barrier) {
742 conf->nr_waiting++;
743 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
744 conf->resync_lock,
745 raid10_unplug(conf->mddev->queue));
746 conf->nr_waiting--;
747 }
748 conf->nr_pending++;
749 spin_unlock_irq(&conf->resync_lock);
750}
751
752static void allow_barrier(conf_t *conf)
753{
754 unsigned long flags;
755 spin_lock_irqsave(&conf->resync_lock, flags);
756 conf->nr_pending--;
757 spin_unlock_irqrestore(&conf->resync_lock, flags);
758 wake_up(&conf->wait_barrier);
759}
760
761static void freeze_array(conf_t *conf)
762{
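	/* Stop sync IO and normal IO and wait for everything to go quiet.
	 * We increment barrier and nr_waiting, and then wait until
	 * nr_pending matches nr_queued+1.
	 * This is called in the context of one normal IO request that has
	 * failed.  Any sync request that might be pending was issued before
	 * that request and so must wait behind it: the number queued
	 * (nr_queued) plus this request (1) must equal the number of
	 * pending IOs (nr_pending) before we continue.
	 */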
775 spin_lock_irq(&conf->resync_lock);
776 conf->barrier++;
777 conf->nr_waiting++;
778 wait_event_lock_irq(conf->wait_barrier,
779 conf->nr_pending == conf->nr_queued+1,
780 conf->resync_lock,
781 ({ flush_pending_writes(conf);
782 raid10_unplug(conf->mddev->queue); }));
783 spin_unlock_irq(&conf->resync_lock);
784}
785
786static void unfreeze_array(conf_t *conf)
787{
788
789 spin_lock_irq(&conf->resync_lock);
790 conf->barrier--;
791 conf->nr_waiting--;
792 wake_up(&conf->wait_barrier);
793 spin_unlock_irq(&conf->resync_lock);
794}
795
796static int make_request(mddev_t *mddev, struct bio * bio)
797{
798 conf_t *conf = mddev->private;
799 mirror_info_t *mirror;
800 r10bio_t *r10_bio;
801 struct bio *read_bio;
802 int i;
803 int chunk_sects = conf->chunk_mask + 1;
804 const int rw = bio_data_dir(bio);
805 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
806 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
807 unsigned long flags;
808 mdk_rdev_t *blocked_rdev;
809
810 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
811 md_flush_request(mddev, bio);
812 return 0;
813 }
814
815
816
817
818 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
819 > chunk_sects &&
820 conf->near_copies < conf->raid_disks)) {
821 struct bio_pair *bp;
822
823 if (bio->bi_vcnt != 1 ||
824 bio->bi_idx != 0)
825 goto bad_map;
826
827
828
829 bp = bio_split(bio,
830 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
831
832
833
834
835
836
837
838
839
840 spin_lock_irq(&conf->resync_lock);
841 conf->nr_waiting++;
842 spin_unlock_irq(&conf->resync_lock);
843
844 if (make_request(mddev, &bp->bio1))
845 generic_make_request(&bp->bio1);
846 if (make_request(mddev, &bp->bio2))
847 generic_make_request(&bp->bio2);
848
849 spin_lock_irq(&conf->resync_lock);
850 conf->nr_waiting--;
851 wake_up(&conf->wait_barrier);
852 spin_unlock_irq(&conf->resync_lock);
853
854 bio_pair_release(bp);
855 return 0;
856 bad_map:
857 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
858 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
859 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
860
861 bio_io_error(bio);
862 return 0;
863 }
864
865 md_write_start(mddev, bio);
866
867
868
869
870
871
872 wait_barrier(conf);
873
874 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
875
876 r10_bio->master_bio = bio;
877 r10_bio->sectors = bio->bi_size >> 9;
878
879 r10_bio->mddev = mddev;
880 r10_bio->sector = bio->bi_sector;
881 r10_bio->state = 0;
882
883 if (rw == READ) {
884
885
886
887 int disk = read_balance(conf, r10_bio);
888 int slot = r10_bio->read_slot;
889 if (disk < 0) {
890 raid_end_bio_io(r10_bio);
891 return 0;
892 }
893 mirror = conf->mirrors + disk;
894
895 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
896
897 r10_bio->devs[slot].bio = read_bio;
898
899 read_bio->bi_sector = r10_bio->devs[slot].addr +
900 mirror->rdev->data_offset;
901 read_bio->bi_bdev = mirror->rdev->bdev;
902 read_bio->bi_end_io = raid10_end_read_request;
903 read_bio->bi_rw = READ | do_sync;
904 read_bio->bi_private = r10_bio;
905
906 generic_make_request(read_bio);
907 return 0;
908 }
909
910
911
912
913
914
915
916
917 raid10_find_phys(conf, r10_bio);
918 retry_write:
919 blocked_rdev = NULL;
920 rcu_read_lock();
921 for (i = 0; i < conf->copies; i++) {
922 int d = r10_bio->devs[i].devnum;
923 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
924 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
925 atomic_inc(&rdev->nr_pending);
926 blocked_rdev = rdev;
927 break;
928 }
929 if (rdev && !test_bit(Faulty, &rdev->flags)) {
930 atomic_inc(&rdev->nr_pending);
931 r10_bio->devs[i].bio = bio;
932 } else {
933 r10_bio->devs[i].bio = NULL;
934 set_bit(R10BIO_Degraded, &r10_bio->state);
935 }
936 }
937 rcu_read_unlock();
938
939 if (unlikely(blocked_rdev)) {
940
941 int j;
942 int d;
943
944 for (j = 0; j < i; j++)
945 if (r10_bio->devs[j].bio) {
946 d = r10_bio->devs[j].devnum;
947 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
948 }
949 allow_barrier(conf);
950 md_wait_for_blocked_rdev(blocked_rdev, mddev);
951 wait_barrier(conf);
952 goto retry_write;
953 }
954
955 atomic_set(&r10_bio->remaining, 1);
956 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
957
958 for (i = 0; i < conf->copies; i++) {
959 struct bio *mbio;
960 int d = r10_bio->devs[i].devnum;
961 if (!r10_bio->devs[i].bio)
962 continue;
963
964 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
965 r10_bio->devs[i].bio = mbio;
966
967 mbio->bi_sector = r10_bio->devs[i].addr+
968 conf->mirrors[d].rdev->data_offset;
969 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
970 mbio->bi_end_io = raid10_end_write_request;
971 mbio->bi_rw = WRITE | do_sync | do_fua;
972 mbio->bi_private = r10_bio;
973
974 atomic_inc(&r10_bio->remaining);
975 spin_lock_irqsave(&conf->device_lock, flags);
976 bio_list_add(&conf->pending_bio_list, mbio);
977 blk_plug_device_unlocked(mddev->queue);
978 spin_unlock_irqrestore(&conf->device_lock, flags);
979 }
980
981 if (atomic_dec_and_test(&r10_bio->remaining)) {
982
983 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
984 r10_bio->sectors,
985 !test_bit(R10BIO_Degraded, &r10_bio->state),
986 0);
987 md_write_end(mddev);
988 raid_end_bio_io(r10_bio);
989 }
990
991
992 wake_up(&conf->wait_barrier);
993
994 if (do_sync)
995 md_wakeup_thread(mddev->thread);
996
997 return 0;
998}
999
1000static void status(struct seq_file *seq, mddev_t *mddev)
1001{
1002 conf_t *conf = mddev->private;
1003 int i;
1004
1005 if (conf->near_copies < conf->raid_disks)
1006 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1007 if (conf->near_copies > 1)
1008 seq_printf(seq, " %d near-copies", conf->near_copies);
1009 if (conf->far_copies > 1) {
1010 if (conf->far_offset)
1011 seq_printf(seq, " %d offset-copies", conf->far_copies);
1012 else
1013 seq_printf(seq, " %d far-copies", conf->far_copies);
1014 }
1015 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1016 conf->raid_disks - mddev->degraded);
1017 for (i = 0; i < conf->raid_disks; i++)
1018 seq_printf(seq, "%s",
1019 conf->mirrors[i].rdev &&
1020 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1021 seq_printf(seq, "]");
1022}
1023
1024static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1025{
1026 char b[BDEVNAME_SIZE];
1027 conf_t *conf = mddev->private;
1028
1029
1030
1031
1032
1033
1034
1035 if (test_bit(In_sync, &rdev->flags)
1036 && conf->raid_disks-mddev->degraded == 1)
1037
1038
1039
1040
1041
1042
1043
1044 return;
1045 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1046 unsigned long flags;
1047 spin_lock_irqsave(&conf->device_lock, flags);
1048 mddev->degraded++;
1049 spin_unlock_irqrestore(&conf->device_lock, flags);
1050
1051
1052
1053 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1054 }
1055 set_bit(Faulty, &rdev->flags);
1056 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1057 printk(KERN_ALERT
1058 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1059 "md/raid10:%s: Operation continuing on %d devices.\n",
1060 mdname(mddev), bdevname(rdev->bdev, b),
1061 mdname(mddev), conf->raid_disks - mddev->degraded);
1062}
1063
1064static void print_conf(conf_t *conf)
1065{
1066 int i;
1067 mirror_info_t *tmp;
1068
1069 printk(KERN_DEBUG "RAID10 conf printout:\n");
1070 if (!conf) {
1071 printk(KERN_DEBUG "(!conf)\n");
1072 return;
1073 }
1074 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1075 conf->raid_disks);
1076
1077 for (i = 0; i < conf->raid_disks; i++) {
1078 char b[BDEVNAME_SIZE];
1079 tmp = conf->mirrors + i;
1080 if (tmp->rdev)
1081 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1082 i, !test_bit(In_sync, &tmp->rdev->flags),
1083 !test_bit(Faulty, &tmp->rdev->flags),
1084 bdevname(tmp->rdev->bdev,b));
1085 }
1086}
1087
1088static void close_sync(conf_t *conf)
1089{
1090 wait_barrier(conf);
1091 allow_barrier(conf);
1092
1093 mempool_destroy(conf->r10buf_pool);
1094 conf->r10buf_pool = NULL;
1095}
1096
1097
1098
1099
1100static int enough(conf_t *conf)
1101{
1102 int first = 0;
1103
1104 do {
1105 int n = conf->copies;
1106 int cnt = 0;
1107 while (n--) {
1108 if (conf->mirrors[first].rdev)
1109 cnt++;
1110 first = (first+1) % conf->raid_disks;
1111 }
1112 if (cnt == 0)
1113 return 0;
1114 } while (first != 0);
1115 return 1;
1116}
1117
1118static int raid10_spare_active(mddev_t *mddev)
1119{
1120 int i;
1121 conf_t *conf = mddev->private;
1122 mirror_info_t *tmp;
1123 int count = 0;
1124 unsigned long flags;
1125
1126
1127
1128
1129
1130 for (i = 0; i < conf->raid_disks; i++) {
1131 tmp = conf->mirrors + i;
1132 if (tmp->rdev
1133 && !test_bit(Faulty, &tmp->rdev->flags)
1134 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1135 count++;
1136 sysfs_notify_dirent(tmp->rdev->sysfs_state);
1137 }
1138 }
1139 spin_lock_irqsave(&conf->device_lock, flags);
1140 mddev->degraded -= count;
1141 spin_unlock_irqrestore(&conf->device_lock, flags);
1142
1143 print_conf(conf);
1144 return count;
1145}
1146
1147
1148static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1149{
1150 conf_t *conf = mddev->private;
1151 int err = -EEXIST;
1152 int mirror;
1153 mirror_info_t *p;
1154 int first = 0;
1155 int last = conf->raid_disks - 1;
1156
1157 if (mddev->recovery_cp < MaxSector)
1158
1159
1160
1161 return -EBUSY;
1162 if (!enough(conf))
1163 return -EINVAL;
1164
1165 if (rdev->raid_disk >= 0)
1166 first = last = rdev->raid_disk;
1167
1168 if (rdev->saved_raid_disk >= 0 &&
1169 rdev->saved_raid_disk >= first &&
1170 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1171 mirror = rdev->saved_raid_disk;
1172 else
1173 mirror = first;
1174 for ( ; mirror <= last ; mirror++)
1175 if ( !(p=conf->mirrors+mirror)->rdev) {
1176
1177 disk_stack_limits(mddev->gendisk, rdev->bdev,
1178 rdev->data_offset << 9);
1179
1180
1181
1182
1183
1184
1185 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1186 blk_queue_max_segments(mddev->queue, 1);
1187 blk_queue_segment_boundary(mddev->queue,
1188 PAGE_CACHE_SIZE - 1);
1189 }
1190
1191 p->head_position = 0;
1192 rdev->raid_disk = mirror;
1193 err = 0;
1194 if (rdev->saved_raid_disk != mirror)
1195 conf->fullsync = 1;
1196 rcu_assign_pointer(p->rdev, rdev);
1197 break;
1198 }
1199
1200 md_integrity_add_rdev(rdev, mddev);
1201 print_conf(conf);
1202 return err;
1203}
1204
1205static int raid10_remove_disk(mddev_t *mddev, int number)
1206{
1207 conf_t *conf = mddev->private;
1208 int err = 0;
1209 mdk_rdev_t *rdev;
1210 mirror_info_t *p = conf->mirrors+ number;
1211
1212 print_conf(conf);
1213 rdev = p->rdev;
1214 if (rdev) {
1215 if (test_bit(In_sync, &rdev->flags) ||
1216 atomic_read(&rdev->nr_pending)) {
1217 err = -EBUSY;
1218 goto abort;
1219 }
1220
1221
1222
1223 if (!test_bit(Faulty, &rdev->flags) &&
1224 enough(conf)) {
1225 err = -EBUSY;
1226 goto abort;
1227 }
1228 p->rdev = NULL;
1229 synchronize_rcu();
1230 if (atomic_read(&rdev->nr_pending)) {
1231
1232 err = -EBUSY;
1233 p->rdev = rdev;
1234 goto abort;
1235 }
1236 md_integrity_register(mddev);
1237 }
1238abort:
1239
1240 print_conf(conf);
1241 return err;
1242}
1243
1244
1245static void end_sync_read(struct bio *bio, int error)
1246{
1247 r10bio_t *r10_bio = bio->bi_private;
1248 conf_t *conf = r10_bio->mddev->private;
1249 int i,d;
1250
1251 for (i=0; i<conf->copies; i++)
1252 if (r10_bio->devs[i].bio == bio)
1253 break;
1254 BUG_ON(i == conf->copies);
1255 update_head_pos(i, r10_bio);
1256 d = r10_bio->devs[i].devnum;
1257
1258 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1259 set_bit(R10BIO_Uptodate, &r10_bio->state);
1260 else {
1261 atomic_add(r10_bio->sectors,
1262 &conf->mirrors[d].rdev->corrected_errors);
1263 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1264 md_error(r10_bio->mddev,
1265 conf->mirrors[d].rdev);
1266 }
1267
1268
1269
1270
1271 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1272 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1273 atomic_dec_and_test(&r10_bio->remaining)) {
1274
1275
1276
1277 reschedule_retry(r10_bio);
1278 }
1279}
1280
1281static void end_sync_write(struct bio *bio, int error)
1282{
1283 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1284 r10bio_t *r10_bio = bio->bi_private;
1285 mddev_t *mddev = r10_bio->mddev;
1286 conf_t *conf = mddev->private;
1287 int i,d;
1288
1289 for (i = 0; i < conf->copies; i++)
1290 if (r10_bio->devs[i].bio == bio)
1291 break;
1292 d = r10_bio->devs[i].devnum;
1293
1294 if (!uptodate)
1295 md_error(mddev, conf->mirrors[d].rdev);
1296
1297 update_head_pos(i, r10_bio);
1298
1299 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1300 while (atomic_dec_and_test(&r10_bio->remaining)) {
1301 if (r10_bio->master_bio == NULL) {
1302
1303 sector_t s = r10_bio->sectors;
1304 put_buf(r10_bio);
1305 md_done_sync(mddev, s, 1);
1306 break;
1307 } else {
1308 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1309 put_buf(r10_bio);
1310 r10_bio = r10_bio2;
1311 }
1312 }
1313}
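/*
 * Resync path: every working copy of this range has been read.  The
 * first up-to-date copy is treated as authoritative; any other copy
 * that failed to read, or whose contents differ, is rewritten from it
 * (unless this is a 'check' pass, which only counts mismatches).
 */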
1331static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1332{
1333 conf_t *conf = mddev->private;
1334 int i, first;
1335 struct bio *tbio, *fbio;
1336
1337 atomic_set(&r10_bio->remaining, 1);
1338
1339
1340 for (i=0; i<conf->copies; i++)
1341 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1342 break;
1343
1344 if (i == conf->copies)
1345 goto done;
1346
1347 first = i;
1348 fbio = r10_bio->devs[i].bio;
1349
1350
1351 for (i=0 ; i < conf->copies ; i++) {
1352 int j, d;
1353 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1354
1355 tbio = r10_bio->devs[i].bio;
1356
1357 if (tbio->bi_end_io != end_sync_read)
1358 continue;
1359 if (i == first)
1360 continue;
1361 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1362
1363
1364
1365
1366 for (j = 0; j < vcnt; j++)
1367 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1368 page_address(tbio->bi_io_vec[j].bv_page),
1369 PAGE_SIZE))
1370 break;
1371 if (j == vcnt)
1372 continue;
1373 mddev->resync_mismatches += r10_bio->sectors;
1374 }
1375 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1376
1377 continue;
1378
1379
1380
1381
1382 tbio->bi_vcnt = vcnt;
1383 tbio->bi_size = r10_bio->sectors << 9;
1384 tbio->bi_idx = 0;
1385 tbio->bi_phys_segments = 0;
1386 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1387 tbio->bi_flags |= 1 << BIO_UPTODATE;
1388 tbio->bi_next = NULL;
1389 tbio->bi_rw = WRITE;
1390 tbio->bi_private = r10_bio;
1391 tbio->bi_sector = r10_bio->devs[i].addr;
1392
1393 for (j=0; j < vcnt ; j++) {
1394 tbio->bi_io_vec[j].bv_offset = 0;
1395 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1396
1397 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1398 page_address(fbio->bi_io_vec[j].bv_page),
1399 PAGE_SIZE);
1400 }
1401 tbio->bi_end_io = end_sync_write;
1402
1403 d = r10_bio->devs[i].devnum;
1404 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1405 atomic_inc(&r10_bio->remaining);
1406 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1407
1408 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1409 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1410 generic_make_request(tbio);
1411 }
1412
1413done:
1414 if (atomic_dec_and_test(&r10_bio->remaining)) {
1415 md_done_sync(mddev, r10_bio->sectors, 1);
1416 put_buf(r10_bio);
1417 }
1418}
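/*
 * Recovery path: devs[0].bio holds the data just read from a working
 * mirror and devs[1].bio is aimed at the device being recovered.  Move
 * the pages across and submit the write, or fail it with -EIO if the
 * read did not complete successfully.
 */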
1431static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1432{
1433 conf_t *conf = mddev->private;
1434 int i, d;
1435 struct bio *bio, *wbio;
1436
1437
1438
1439
1440
1441 bio = r10_bio->devs[0].bio;
1442 wbio = r10_bio->devs[1].bio;
1443 for (i=0; i < wbio->bi_vcnt; i++) {
1444 struct page *p = bio->bi_io_vec[i].bv_page;
1445 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1446 wbio->bi_io_vec[i].bv_page = p;
1447 }
1448 d = r10_bio->devs[1].devnum;
1449
1450 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1451 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1452 if (test_bit(R10BIO_Uptodate, &r10_bio->state))
1453 generic_make_request(wbio);
1454 else
1455 bio_endio(wbio, -EIO);
1456}
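/*
 * Used by fix_read_error() to decay the per-rdev read error count.
 * The count is halved for every hour that has elapsed since the last
 * recorded read error.
 */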
1465static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1466{
1467 struct timespec cur_time_mon;
1468 unsigned long hours_since_last;
1469 unsigned int read_errors = atomic_read(&rdev->read_errors);
1470
1471 ktime_get_ts(&cur_time_mon);
1472
1473 if (rdev->last_read_error.tv_sec == 0 &&
1474 rdev->last_read_error.tv_nsec == 0) {
1475
1476 rdev->last_read_error = cur_time_mon;
1477 return;
1478 }
1479
1480 hours_since_last = (cur_time_mon.tv_sec -
1481 rdev->last_read_error.tv_sec) / 3600;
1482
1483 rdev->last_read_error = cur_time_mon;
1484
1485
1486
1487
1488
1489
1490 if (hours_since_last >= 8 * sizeof(read_errors))
1491 atomic_set(&rdev->read_errors, 0);
1492 else
1493 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1494}
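/*
 * Fix a read error by finding an in_sync mirror that can supply the
 * data, then rewriting and re-reading the affected range on the other
 * mirrors so the drives get a chance to relocate any bad sectors.
 * The per-rdev read error count (decayed by check_decay_read_errors())
 * is checked first; a device that exceeds max_corr_read_errors is
 * failed rather than corrected.
 */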
1504static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1505{
1506 int sect = 0;
1507 int sectors = r10_bio->sectors;
1508 mdk_rdev_t*rdev;
1509 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1510 int d = r10_bio->devs[r10_bio->read_slot].devnum;
1511
1512 rcu_read_lock();
1513 rdev = rcu_dereference(conf->mirrors[d].rdev);
1514 if (rdev) {
1515 char b[BDEVNAME_SIZE];
1516 int cur_read_error_count = 0;
1517
1518 bdevname(rdev->bdev, b);
1519
1520 if (test_bit(Faulty, &rdev->flags)) {
1521 rcu_read_unlock();
1522
1523
1524 return;
1525 }
1526
1527 check_decay_read_errors(mddev, rdev);
1528 atomic_inc(&rdev->read_errors);
1529 cur_read_error_count = atomic_read(&rdev->read_errors);
1530 if (cur_read_error_count > max_read_errors) {
1531 rcu_read_unlock();
1532 printk(KERN_NOTICE
1533 "md/raid10:%s: %s: Raid device exceeded "
1534 "read_error threshold "
1535 "[cur %d:max %d]\n",
1536 mdname(mddev),
1537 b, cur_read_error_count, max_read_errors);
1538 printk(KERN_NOTICE
1539 "md/raid10:%s: %s: Failing raid "
1540 "device\n", mdname(mddev), b);
1541 md_error(mddev, conf->mirrors[d].rdev);
1542 return;
1543 }
1544 }
1545 rcu_read_unlock();
1546
1547 while(sectors) {
1548 int s = sectors;
1549 int sl = r10_bio->read_slot;
1550 int success = 0;
1551 int start;
1552
1553 if (s > (PAGE_SIZE>>9))
1554 s = PAGE_SIZE >> 9;
1555
1556 rcu_read_lock();
1557 do {
1558 d = r10_bio->devs[sl].devnum;
1559 rdev = rcu_dereference(conf->mirrors[d].rdev);
1560 if (rdev &&
1561 test_bit(In_sync, &rdev->flags)) {
1562 atomic_inc(&rdev->nr_pending);
1563 rcu_read_unlock();
1564 success = sync_page_io(rdev,
1565 r10_bio->devs[sl].addr +
1566 sect,
1567 s<<9,
1568 conf->tmppage, READ, false);
1569 rdev_dec_pending(rdev, mddev);
1570 rcu_read_lock();
1571 if (success)
1572 break;
1573 }
1574 sl++;
1575 if (sl == conf->copies)
1576 sl = 0;
1577 } while (!success && sl != r10_bio->read_slot);
1578 rcu_read_unlock();
1579
1580 if (!success) {
1581
1582 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1583 md_error(mddev, conf->mirrors[dn].rdev);
1584 break;
1585 }
1586
1587 start = sl;
1588
1589 rcu_read_lock();
1590 while (sl != r10_bio->read_slot) {
1591 char b[BDEVNAME_SIZE];
1592
1593 if (sl==0)
1594 sl = conf->copies;
1595 sl--;
1596 d = r10_bio->devs[sl].devnum;
1597 rdev = rcu_dereference(conf->mirrors[d].rdev);
1598 if (rdev &&
1599 test_bit(In_sync, &rdev->flags)) {
1600 atomic_inc(&rdev->nr_pending);
1601 rcu_read_unlock();
1602 atomic_add(s, &rdev->corrected_errors);
1603 if (sync_page_io(rdev,
1604 r10_bio->devs[sl].addr +
1605 sect,
1606 s<<9, conf->tmppage, WRITE, false)
1607 == 0) {
1608
1609 printk(KERN_NOTICE
1610 "md/raid10:%s: read correction "
1611 "write failed"
1612 " (%d sectors at %llu on %s)\n",
1613 mdname(mddev), s,
1614 (unsigned long long)(sect+
1615 rdev->data_offset),
1616 bdevname(rdev->bdev, b));
1617 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1618 "drive\n",
1619 mdname(mddev),
1620 bdevname(rdev->bdev, b));
1621 md_error(mddev, rdev);
1622 }
1623 rdev_dec_pending(rdev, mddev);
1624 rcu_read_lock();
1625 }
1626 }
1627 sl = start;
1628 while (sl != r10_bio->read_slot) {
1629
1630 if (sl==0)
1631 sl = conf->copies;
1632 sl--;
1633 d = r10_bio->devs[sl].devnum;
1634 rdev = rcu_dereference(conf->mirrors[d].rdev);
1635 if (rdev &&
1636 test_bit(In_sync, &rdev->flags)) {
1637 char b[BDEVNAME_SIZE];
1638 atomic_inc(&rdev->nr_pending);
1639 rcu_read_unlock();
1640 if (sync_page_io(rdev,
1641 r10_bio->devs[sl].addr +
1642 sect,
1643 s<<9, conf->tmppage,
1644 READ, false) == 0) {
1645
1646 printk(KERN_NOTICE
1647 "md/raid10:%s: unable to read back "
1648 "corrected sectors"
1649 " (%d sectors at %llu on %s)\n",
1650 mdname(mddev), s,
1651 (unsigned long long)(sect+
1652 rdev->data_offset),
1653 bdevname(rdev->bdev, b));
1654 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1655 mdname(mddev),
1656 bdevname(rdev->bdev, b));
1657
1658 md_error(mddev, rdev);
1659 } else {
1660 printk(KERN_INFO
1661 "md/raid10:%s: read error corrected"
1662 " (%d sectors at %llu on %s)\n",
1663 mdname(mddev), s,
1664 (unsigned long long)(sect+
1665 rdev->data_offset),
1666 bdevname(rdev->bdev, b));
1667 }
1668
1669 rdev_dec_pending(rdev, mddev);
1670 rcu_read_lock();
1671 }
1672 }
1673 rcu_read_unlock();
1674
1675 sectors -= s;
1676 sect += s;
1677 }
1678}
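/*
 * This is our raid10 kernel thread.
 *
 * It flushes any queued writes and works through the retry list:
 * failed reads are re-issued to another mirror (after attempting to
 * repair the read error), and completed resync/recovery reads have
 * their corresponding writes submitted.
 */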
1680static void raid10d(mddev_t *mddev)
1681{
1682 r10bio_t *r10_bio;
1683 struct bio *bio;
1684 unsigned long flags;
1685 conf_t *conf = mddev->private;
1686 struct list_head *head = &conf->retry_list;
1687 int unplug=0;
1688 mdk_rdev_t *rdev;
1689
1690 md_check_recovery(mddev);
1691
1692 for (;;) {
1693 char b[BDEVNAME_SIZE];
1694
1695 unplug += flush_pending_writes(conf);
1696
1697 spin_lock_irqsave(&conf->device_lock, flags);
1698 if (list_empty(head)) {
1699 spin_unlock_irqrestore(&conf->device_lock, flags);
1700 break;
1701 }
1702 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1703 list_del(head->prev);
1704 conf->nr_queued--;
1705 spin_unlock_irqrestore(&conf->device_lock, flags);
1706
1707 mddev = r10_bio->mddev;
1708 conf = mddev->private;
1709 if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
1710 sync_request_write(mddev, r10_bio);
1711 unplug = 1;
1712 } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1713 recovery_request_write(mddev, r10_bio);
1714 unplug = 1;
1715 } else {
1716 int mirror;
1717
1718
1719
1720
1721
1722
1723
1724
1725 if (mddev->ro == 0) {
1726 freeze_array(conf);
1727 fix_read_error(conf, mddev, r10_bio);
1728 unfreeze_array(conf);
1729 }
1730
1731 bio = r10_bio->devs[r10_bio->read_slot].bio;
1732 r10_bio->devs[r10_bio->read_slot].bio =
1733 mddev->ro ? IO_BLOCKED : NULL;
1734 mirror = read_balance(conf, r10_bio);
1735 if (mirror == -1) {
1736 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1737 " read error for block %llu\n",
1738 mdname(mddev),
1739 bdevname(bio->bi_bdev,b),
1740 (unsigned long long)r10_bio->sector);
1741 raid_end_bio_io(r10_bio);
1742 bio_put(bio);
1743 } else {
1744 const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
1745 bio_put(bio);
1746 rdev = conf->mirrors[mirror].rdev;
1747 if (printk_ratelimit())
1748 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
1749 " another mirror\n",
1750 mdname(mddev),
1751 bdevname(rdev->bdev,b),
1752 (unsigned long long)r10_bio->sector);
1753 bio = bio_clone_mddev(r10_bio->master_bio,
1754 GFP_NOIO, mddev);
1755 r10_bio->devs[r10_bio->read_slot].bio = bio;
1756 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1757 + rdev->data_offset;
1758 bio->bi_bdev = rdev->bdev;
1759 bio->bi_rw = READ | do_sync;
1760 bio->bi_private = r10_bio;
1761 bio->bi_end_io = raid10_end_read_request;
1762 unplug = 1;
1763 generic_make_request(bio);
1764 }
1765 }
1766 cond_resched();
1767 }
1768 if (unplug)
1769 unplug_slaves(mddev);
1770}
1771
1772
1773static int init_resync(conf_t *conf)
1774{
1775 int buffs;
1776
1777 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1778 BUG_ON(conf->r10buf_pool);
1779 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
1780 if (!conf->r10buf_pool)
1781 return -ENOMEM;
1782 conf->next_resync = 0;
1783 return 0;
1784}
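/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.  This is achieved by
 * tracking pending requests and a 'barrier' concept that can be raised
 * to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently; we distinguish them
 * by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies, and
 * schedule a write of the authoritative copy wherever they differ.
 * For recovery, we iterate over physical addresses, read a good value
 * for each missing copy, and write it to the device being recovered.
 * For recovery there may be several r10_bio structures in flight for
 * one range, chained through master_bio.
 */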
1818static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1819{
1820 conf_t *conf = mddev->private;
1821 r10bio_t *r10_bio;
1822 struct bio *biolist = NULL, *bio;
1823 sector_t max_sector, nr_sectors;
1824 int disk;
1825 int i;
1826 int max_sync;
1827 sector_t sync_blocks;
1828
1829 sector_t sectors_skipped = 0;
1830 int chunks_skipped = 0;
1831
1832 if (!conf->r10buf_pool)
1833 if (init_resync(conf))
1834 return 0;
1835
1836 skipped:
1837 max_sector = mddev->dev_sectors;
1838 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1839 max_sector = mddev->resync_max_sectors;
1840 if (sector_nr >= max_sector) {
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850 if (mddev->curr_resync < max_sector) {
1851 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1852 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1853 &sync_blocks, 1);
1854 else for (i=0; i<conf->raid_disks; i++) {
1855 sector_t sect =
1856 raid10_find_virt(conf, mddev->curr_resync, i);
1857 bitmap_end_sync(mddev->bitmap, sect,
1858 &sync_blocks, 1);
1859 }
1860 } else
1861 conf->fullsync = 0;
1862
1863 bitmap_close_sync(mddev->bitmap);
1864 close_sync(conf);
1865 *skipped = 1;
1866 return sectors_skipped;
1867 }
1868 if (chunks_skipped >= conf->raid_disks) {
1869
1870
1871
1872 *skipped = 1;
1873 return (max_sector - sector_nr) + sectors_skipped;
1874 }
1875
1876 if (max_sector > mddev->resync_max)
1877 max_sector = mddev->resync_max;
1878
1879
1880
1881
1882 if (conf->near_copies < conf->raid_disks &&
1883 max_sector > (sector_nr | conf->chunk_mask))
1884 max_sector = (sector_nr | conf->chunk_mask) + 1;
1885
1886
1887
1888
1889 if (!go_faster && conf->nr_waiting)
1890 msleep_interruptible(1000);
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1908 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1909
1910 int j, k;
1911 r10_bio = NULL;
1912
1913 for (i=0 ; i<conf->raid_disks; i++)
1914 if (conf->mirrors[i].rdev &&
1915 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1916 int still_degraded = 0;
1917
1918 r10bio_t *rb2 = r10_bio;
1919 sector_t sect = raid10_find_virt(conf, sector_nr, i);
1920 int must_sync;
1921
1922
1923
1924 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1925 &sync_blocks, 1);
1926 if (sync_blocks < max_sync)
1927 max_sync = sync_blocks;
1928 if (!must_sync &&
1929 !conf->fullsync) {
1930
1931
1932
1933 chunks_skipped = -1;
1934 continue;
1935 }
1936
1937 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1938 raise_barrier(conf, rb2 != NULL);
1939 atomic_set(&r10_bio->remaining, 0);
1940
1941 r10_bio->master_bio = (struct bio*)rb2;
1942 if (rb2)
1943 atomic_inc(&rb2->remaining);
1944 r10_bio->mddev = mddev;
1945 set_bit(R10BIO_IsRecover, &r10_bio->state);
1946 r10_bio->sector = sect;
1947
1948 raid10_find_phys(conf, r10_bio);
1949
1950
1951
1952
1953 for (j=0; j<conf->raid_disks; j++)
1954 if (conf->mirrors[j].rdev == NULL ||
1955 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
1956 still_degraded = 1;
1957 break;
1958 }
1959
1960 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1961 &sync_blocks, still_degraded);
1962
1963 for (j=0; j<conf->copies;j++) {
1964 int d = r10_bio->devs[j].devnum;
1965 if (conf->mirrors[d].rdev &&
1966 test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1967
1968 bio = r10_bio->devs[0].bio;
1969 bio->bi_next = biolist;
1970 biolist = bio;
1971 bio->bi_private = r10_bio;
1972 bio->bi_end_io = end_sync_read;
1973 bio->bi_rw = READ;
1974 bio->bi_sector = r10_bio->devs[j].addr +
1975 conf->mirrors[d].rdev->data_offset;
1976 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1977 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1978 atomic_inc(&r10_bio->remaining);
1979
1980
1981 for (k=0; k<conf->copies; k++)
1982 if (r10_bio->devs[k].devnum == i)
1983 break;
1984 BUG_ON(k == conf->copies);
1985 bio = r10_bio->devs[1].bio;
1986 bio->bi_next = biolist;
1987 biolist = bio;
1988 bio->bi_private = r10_bio;
1989 bio->bi_end_io = end_sync_write;
1990 bio->bi_rw = WRITE;
1991 bio->bi_sector = r10_bio->devs[k].addr +
1992 conf->mirrors[i].rdev->data_offset;
1993 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1994
1995 r10_bio->devs[0].devnum = d;
1996 r10_bio->devs[1].devnum = i;
1997
1998 break;
1999 }
2000 }
2001 if (j == conf->copies) {
2002
2003 put_buf(r10_bio);
2004 if (rb2)
2005 atomic_dec(&rb2->remaining);
2006 r10_bio = rb2;
2007 if (!test_and_set_bit(MD_RECOVERY_INTR,
2008 &mddev->recovery))
2009 printk(KERN_INFO "md/raid10:%s: insufficient "
2010 "working devices for recovery.\n",
2011 mdname(mddev));
2012 break;
2013 }
2014 }
2015 if (biolist == NULL) {
2016 while (r10_bio) {
2017 r10bio_t *rb2 = r10_bio;
2018 r10_bio = (r10bio_t*) rb2->master_bio;
2019 rb2->master_bio = NULL;
2020 put_buf(rb2);
2021 }
2022 goto giveup;
2023 }
2024 } else {
2025
2026 int count = 0;
2027
2028 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
2029
2030 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
2031 &sync_blocks, mddev->degraded) &&
2032 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2033
2034 *skipped = 1;
2035 return sync_blocks + sectors_skipped;
2036 }
2037 if (sync_blocks < max_sync)
2038 max_sync = sync_blocks;
2039 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2040
2041 r10_bio->mddev = mddev;
2042 atomic_set(&r10_bio->remaining, 0);
2043 raise_barrier(conf, 0);
2044 conf->next_resync = sector_nr;
2045
2046 r10_bio->master_bio = NULL;
2047 r10_bio->sector = sector_nr;
2048 set_bit(R10BIO_IsSync, &r10_bio->state);
2049 raid10_find_phys(conf, r10_bio);
2050 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
2051
2052 for (i=0; i<conf->copies; i++) {
2053 int d = r10_bio->devs[i].devnum;
2054 bio = r10_bio->devs[i].bio;
2055 bio->bi_end_io = NULL;
2056 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2057 if (conf->mirrors[d].rdev == NULL ||
2058 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
2059 continue;
2060 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2061 atomic_inc(&r10_bio->remaining);
2062 bio->bi_next = biolist;
2063 biolist = bio;
2064 bio->bi_private = r10_bio;
2065 bio->bi_end_io = end_sync_read;
2066 bio->bi_rw = READ;
2067 bio->bi_sector = r10_bio->devs[i].addr +
2068 conf->mirrors[d].rdev->data_offset;
2069 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
2070 count++;
2071 }
2072
2073 if (count < 2) {
2074 for (i=0; i<conf->copies; i++) {
2075 int d = r10_bio->devs[i].devnum;
2076 if (r10_bio->devs[i].bio->bi_end_io)
2077 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
2078 }
2079 put_buf(r10_bio);
2080 biolist = NULL;
2081 goto giveup;
2082 }
2083 }
2084
2085 for (bio = biolist; bio ; bio=bio->bi_next) {
2086
2087 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
2088 if (bio->bi_end_io)
2089 bio->bi_flags |= 1 << BIO_UPTODATE;
2090 bio->bi_vcnt = 0;
2091 bio->bi_idx = 0;
2092 bio->bi_phys_segments = 0;
2093 bio->bi_size = 0;
2094 }
2095
2096 nr_sectors = 0;
2097 if (sector_nr + max_sync < max_sector)
2098 max_sector = sector_nr + max_sync;
2099 do {
2100 struct page *page;
2101 int len = PAGE_SIZE;
2102 disk = 0;
2103 if (sector_nr + (len>>9) > max_sector)
2104 len = (max_sector - sector_nr) << 9;
2105 if (len == 0)
2106 break;
2107 for (bio= biolist ; bio ; bio=bio->bi_next) {
2108 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
2109 if (bio_add_page(bio, page, len, 0) == 0) {
2110
2111 struct bio *bio2;
2112 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
2113 for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
2114
2115 bio2->bi_vcnt--;
2116 bio2->bi_size -= len;
2117 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
2118 }
2119 goto bio_full;
2120 }
2121 disk = i;
2122 }
2123 nr_sectors += len>>9;
2124 sector_nr += len>>9;
2125 } while (biolist->bi_vcnt < RESYNC_PAGES);
2126 bio_full:
2127 r10_bio->sectors = nr_sectors;
2128
2129 while (biolist) {
2130 bio = biolist;
2131 biolist = biolist->bi_next;
2132
2133 bio->bi_next = NULL;
2134 r10_bio = bio->bi_private;
2135 r10_bio->sectors = nr_sectors;
2136
2137 if (bio->bi_end_io == end_sync_read) {
2138 md_sync_acct(bio->bi_bdev, nr_sectors);
2139 generic_make_request(bio);
2140 }
2141 }
2142
2143 if (sectors_skipped)
2144
2145
2146
2147 md_done_sync(mddev, sectors_skipped, 1);
2148
2149 return sectors_skipped + nr_sectors;
2150 giveup:
2151
2152
2153
2154 if (sector_nr + max_sync < max_sector)
2155 max_sector = sector_nr + max_sync;
2156
2157 sectors_skipped += (max_sector - sector_nr);
2158 chunks_skipped ++;
2159 sector_nr = max_sector;
2160 goto skipped;
2161}
2162
2163static sector_t
2164raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2165{
2166 sector_t size;
2167 conf_t *conf = mddev->private;
2168
2169 if (!raid_disks)
2170 raid_disks = conf->raid_disks;
2171 if (!sectors)
2172 sectors = conf->dev_sectors;
2173
2174 size = sectors >> conf->chunk_shift;
2175 sector_div(size, conf->far_copies);
2176 size = size * raid_disks;
2177 sector_div(size, conf->near_copies);
2178
2179 return size << conf->chunk_shift;
2180}
2181
2182
2183static conf_t *setup_conf(mddev_t *mddev)
2184{
2185 conf_t *conf = NULL;
2186 int nc, fc, fo;
2187 sector_t stride, size;
2188 int err = -EINVAL;
2189
2190 if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
2191 !is_power_of_2(mddev->new_chunk_sectors)) {
2192 printk(KERN_ERR "md/raid10:%s: chunk size must be "
2193 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
2194 mdname(mddev), PAGE_SIZE);
2195 goto out;
2196 }
2197
2198 nc = mddev->new_layout & 255;
2199 fc = (mddev->new_layout >> 8) & 255;
2200 fo = mddev->new_layout & (1<<16);
2201
2202 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2203 (mddev->new_layout >> 17)) {
2204 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
2205 mdname(mddev), mddev->new_layout);
2206 goto out;
2207 }
2208
2209 err = -ENOMEM;
2210 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2211 if (!conf)
2212 goto out;
2213
2214 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2215 GFP_KERNEL);
2216 if (!conf->mirrors)
2217 goto out;
2218
2219 conf->tmppage = alloc_page(GFP_KERNEL);
2220 if (!conf->tmppage)
2221 goto out;
2222
2223
2224 conf->raid_disks = mddev->raid_disks;
2225 conf->near_copies = nc;
2226 conf->far_copies = fc;
2227 conf->copies = nc*fc;
2228 conf->far_offset = fo;
2229 conf->chunk_mask = mddev->new_chunk_sectors - 1;
2230 conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
2231
2232 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2233 r10bio_pool_free, conf);
2234 if (!conf->r10bio_pool)
2235 goto out;
2236
2237 size = mddev->dev_sectors >> conf->chunk_shift;
2238 sector_div(size, fc);
2239 size = size * conf->raid_disks;
2240 sector_div(size, nc);
2241
2242
2243 stride = size * conf->copies;
2244
2245
2246
2247
2248 stride += conf->raid_disks - 1;
2249 sector_div(stride, conf->raid_disks);
2250
2251 conf->dev_sectors = stride << conf->chunk_shift;
2252
2253 if (fo)
2254 stride = 1;
2255 else
2256 sector_div(stride, fc);
2257 conf->stride = stride << conf->chunk_shift;
2258
2259
2260 spin_lock_init(&conf->device_lock);
2261 INIT_LIST_HEAD(&conf->retry_list);
2262
2263 spin_lock_init(&conf->resync_lock);
2264 init_waitqueue_head(&conf->wait_barrier);
2265
2266 conf->thread = md_register_thread(raid10d, mddev, NULL);
2267 if (!conf->thread)
2268 goto out;
2269
2270 conf->mddev = mddev;
2271 return conf;
2272
2273 out:
2274 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
2275 mdname(mddev));
2276 if (conf) {
2277 if (conf->r10bio_pool)
2278 mempool_destroy(conf->r10bio_pool);
2279 kfree(conf->mirrors);
2280 safe_put_page(conf->tmppage);
2281 kfree(conf);
2282 }
2283 return ERR_PTR(err);
2284}
2285
2286static int run(mddev_t *mddev)
2287{
2288 conf_t *conf;
2289 int i, disk_idx, chunk_size;
2290 mirror_info_t *disk;
2291 mdk_rdev_t *rdev;
2292 sector_t size;
2293
2294
2295
2296
2297
2298
2299
2300 if (mddev->private == NULL) {
2301 conf = setup_conf(mddev);
2302 if (IS_ERR(conf))
2303 return PTR_ERR(conf);
2304 mddev->private = conf;
2305 }
2306 conf = mddev->private;
2307 if (!conf)
2308 goto out;
2309
2310 mddev->thread = conf->thread;
2311 conf->thread = NULL;
2312
2313 chunk_size = mddev->chunk_sectors << 9;
2314 blk_queue_io_min(mddev->queue, chunk_size);
2315 if (conf->raid_disks % conf->near_copies)
2316 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
2317 else
2318 blk_queue_io_opt(mddev->queue, chunk_size *
2319 (conf->raid_disks / conf->near_copies));
2320
2321 list_for_each_entry(rdev, &mddev->disks, same_set) {
2322 disk_idx = rdev->raid_disk;
2323 if (disk_idx >= conf->raid_disks
2324 || disk_idx < 0)
2325 continue;
2326 disk = conf->mirrors + disk_idx;
2327
2328 disk->rdev = rdev;
2329 disk_stack_limits(mddev->gendisk, rdev->bdev,
2330 rdev->data_offset << 9);
2331
2332
2333
2334
2335 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2336 blk_queue_max_segments(mddev->queue, 1);
2337 blk_queue_segment_boundary(mddev->queue,
2338 PAGE_CACHE_SIZE - 1);
2339 }
2340
2341 disk->head_position = 0;
2342 }
2343
2344 if (!enough(conf)) {
2345 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
2346 mdname(mddev));
2347 goto out_free_conf;
2348 }
2349
2350 mddev->degraded = 0;
2351 for (i = 0; i < conf->raid_disks; i++) {
2352
2353 disk = conf->mirrors + i;
2354
2355 if (!disk->rdev ||
2356 !test_bit(In_sync, &disk->rdev->flags)) {
2357 disk->head_position = 0;
2358 mddev->degraded++;
2359 if (disk->rdev)
2360 conf->fullsync = 1;
2361 }
2362 }
2363
2364 if (mddev->recovery_cp != MaxSector)
2365 printk(KERN_NOTICE "md/raid10:%s: not clean"
2366 " -- starting background reconstruction\n",
2367 mdname(mddev));
2368 printk(KERN_INFO
2369 "md/raid10:%s: active with %d out of %d devices\n",
2370 mdname(mddev), conf->raid_disks - mddev->degraded,
2371 conf->raid_disks);
2372
2373
2374
2375 mddev->dev_sectors = conf->dev_sectors;
2376 size = raid10_size(mddev, 0, 0);
2377 md_set_array_sectors(mddev, size);
2378 mddev->resync_max_sectors = size;
2379
2380 mddev->queue->unplug_fn = raid10_unplug;
2381 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
2382 mddev->queue->backing_dev_info.congested_data = mddev;
2383
2384
2385
2386
2387
2388 {
2389 int stripe = conf->raid_disks *
2390 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
2391 stripe /= conf->near_copies;
2392 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2393 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
2394 }
2395
2396 if (conf->near_copies < conf->raid_disks)
2397 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2398 md_integrity_register(mddev);
2399 return 0;
2400
2401out_free_conf:
2402 md_unregister_thread(mddev->thread);
2403 if (conf->r10bio_pool)
2404 mempool_destroy(conf->r10bio_pool);
2405 safe_put_page(conf->tmppage);
2406 kfree(conf->mirrors);
2407 kfree(conf);
2408 mddev->private = NULL;
2409out:
2410 return -EIO;
2411}
2412
2413static int stop(mddev_t *mddev)
2414{
2415 conf_t *conf = mddev->private;
2416
2417 raise_barrier(conf, 0);
2418 lower_barrier(conf);
2419
2420 md_unregister_thread(mddev->thread);
2421 mddev->thread = NULL;
2422 blk_sync_queue(mddev->queue);
2423 if (conf->r10bio_pool)
2424 mempool_destroy(conf->r10bio_pool);
2425 kfree(conf->mirrors);
2426 kfree(conf);
2427 mddev->private = NULL;
2428 return 0;
2429}
2430
2431static void raid10_quiesce(mddev_t *mddev, int state)
2432{
2433 conf_t *conf = mddev->private;
2434
2435 switch(state) {
2436 case 1:
2437 raise_barrier(conf, 0);
2438 break;
2439 case 0:
2440 lower_barrier(conf);
2441 break;
2442 }
2443}
2444
2445static void *raid10_takeover_raid0(mddev_t *mddev)
2446{
2447 mdk_rdev_t *rdev;
2448 conf_t *conf;
2449
2450 if (mddev->degraded > 0) {
2451 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
2452 mdname(mddev));
2453 return ERR_PTR(-EINVAL);
2454 }
2455
2456
2457 mddev->new_level = 10;
2458
2459 mddev->new_layout = (1<<8) + 2;
2460 mddev->new_chunk_sectors = mddev->chunk_sectors;
2461 mddev->delta_disks = mddev->raid_disks;
2462 mddev->raid_disks *= 2;
2463
2464 mddev->recovery_cp = MaxSector;
2465
2466 conf = setup_conf(mddev);
2467 if (!IS_ERR(conf)) {
2468 list_for_each_entry(rdev, &mddev->disks, same_set)
2469 if (rdev->raid_disk >= 0)
2470 rdev->new_raid_disk = rdev->raid_disk * 2;
2471 conf->barrier = 1;
2472 }
2473
2474 return conf;
2475}
2476
2477static void *raid10_takeover(mddev_t *mddev)
2478{
2479 struct raid0_private_data *raid0_priv;
2480
2481
2482
2483
2484 if (mddev->level == 0) {
2485
2486 raid0_priv = mddev->private;
2487 if (raid0_priv->nr_strip_zones > 1) {
2488 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
2489 " with more than one zone.\n",
2490 mdname(mddev));
2491 return ERR_PTR(-EINVAL);
2492 }
2493 return raid10_takeover_raid0(mddev);
2494 }
2495 return ERR_PTR(-EINVAL);
2496}
2497
2498static struct mdk_personality raid10_personality =
2499{
2500 .name = "raid10",
2501 .level = 10,
2502 .owner = THIS_MODULE,
2503 .make_request = make_request,
2504 .run = run,
2505 .stop = stop,
2506 .status = status,
2507 .error_handler = error,
2508 .hot_add_disk = raid10_add_disk,
2509 .hot_remove_disk= raid10_remove_disk,
2510 .spare_active = raid10_spare_active,
2511 .sync_request = sync_request,
2512 .quiesce = raid10_quiesce,
2513 .size = raid10_size,
2514 .takeover = raid10_takeover,
2515};
2516
2517static int __init raid_init(void)
2518{
2519 return register_md_personality(&raid10_personality);
2520}
2521
2522static void raid_exit(void)
2523{
2524 unregister_md_personality(&raid10_personality);
2525}
2526
2527module_init(raid_init);
2528module_exit(raid_exit);
2529MODULE_LICENSE("GPL");
2530MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
2531MODULE_ALIAS("md-personality-9");
2532MODULE_ALIAS("md-raid10");
2533MODULE_ALIAS("md-level-10");
2534