1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <linux/blkdev.h>
24#include <linux/module.h>
25#include <linux/seq_file.h>
26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
28#include "md.h"
29#include "raid10.h"
30#include "raid0.h"
31#include "bitmap.h"
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77#define NR_RAID10_BIOS 256
78
79
80
81
82
83
84#define IO_BLOCKED ((struct bio *)1)
85
86
87
88
89#define IO_MADE_GOOD ((struct bio *)2)
90
91#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
92
93
94
95
96
97static int max_queued_requests = 1024;
98
99static void allow_barrier(struct r10conf *conf);
100static void lower_barrier(struct r10conf *conf);
101static int _enough(struct r10conf *conf, int previous, int ignore);
102static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
103 int *skipped);
104static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
105static void end_reshape_write(struct bio *bio);
106static void end_reshape(struct r10conf *conf);
107
108static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
109{
110 struct r10conf *conf = data;
111 int size = offsetof(struct r10bio, devs[conf->copies]);
112
113
114
115 return kzalloc(size, gfp_flags);
116}
117
118static void r10bio_pool_free(void *r10_bio, void *data)
119{
120 kfree(r10_bio);
121}
122
123
124#define RESYNC_BLOCK_SIZE (64*1024)
125#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
126
127#define RESYNC_WINDOW (1024*1024)
128
129#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
130
131
132
133
134
135
136
137
138static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
139{
140 struct r10conf *conf = data;
141 struct page *page;
142 struct r10bio *r10_bio;
143 struct bio *bio;
144 int i, j;
145 int nalloc;
146
147 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
148 if (!r10_bio)
149 return NULL;
150
151 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
152 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
153 nalloc = conf->copies;
154 else
155 nalloc = 2;
156
157
158
159
160 for (j = nalloc ; j-- ; ) {
161 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
162 if (!bio)
163 goto out_free_bio;
164 r10_bio->devs[j].bio = bio;
165 if (!conf->have_replacement)
166 continue;
167 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
168 if (!bio)
169 goto out_free_bio;
170 r10_bio->devs[j].repl_bio = bio;
171 }
172
173
174
175
176 for (j = 0 ; j < nalloc; j++) {
177 struct bio *rbio = r10_bio->devs[j].repl_bio;
178 bio = r10_bio->devs[j].bio;
179 for (i = 0; i < RESYNC_PAGES; i++) {
180 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
181 &conf->mddev->recovery)) {
182
183
184 struct bio *rbio = r10_bio->devs[0].bio;
185 page = rbio->bi_io_vec[i].bv_page;
186 get_page(page);
187 } else
188 page = alloc_page(gfp_flags);
189 if (unlikely(!page))
190 goto out_free_pages;
191
192 bio->bi_io_vec[i].bv_page = page;
193 if (rbio)
194 rbio->bi_io_vec[i].bv_page = page;
195 }
196 }
197
198 return r10_bio;
199
200out_free_pages:
201 for ( ; i > 0 ; i--)
202 safe_put_page(bio->bi_io_vec[i-1].bv_page);
203 while (j--)
204 for (i = 0; i < RESYNC_PAGES ; i++)
205 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
206 j = 0;
207out_free_bio:
208 for ( ; j < nalloc; j++) {
209 if (r10_bio->devs[j].bio)
210 bio_put(r10_bio->devs[j].bio);
211 if (r10_bio->devs[j].repl_bio)
212 bio_put(r10_bio->devs[j].repl_bio);
213 }
214 r10bio_pool_free(r10_bio, conf);
215 return NULL;
216}
217
218static void r10buf_pool_free(void *__r10_bio, void *data)
219{
220 int i;
221 struct r10conf *conf = data;
222 struct r10bio *r10bio = __r10_bio;
223 int j;
224
225 for (j=0; j < conf->copies; j++) {
226 struct bio *bio = r10bio->devs[j].bio;
227 if (bio) {
228 for (i = 0; i < RESYNC_PAGES; i++) {
229 safe_put_page(bio->bi_io_vec[i].bv_page);
230 bio->bi_io_vec[i].bv_page = NULL;
231 }
232 bio_put(bio);
233 }
234 bio = r10bio->devs[j].repl_bio;
235 if (bio)
236 bio_put(bio);
237 }
238 r10bio_pool_free(r10bio, conf);
239}
240
241static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
242{
243 int i;
244
245 for (i = 0; i < conf->copies; i++) {
246 struct bio **bio = & r10_bio->devs[i].bio;
247 if (!BIO_SPECIAL(*bio))
248 bio_put(*bio);
249 *bio = NULL;
250 bio = &r10_bio->devs[i].repl_bio;
251 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
252 bio_put(*bio);
253 *bio = NULL;
254 }
255}
256
257static void free_r10bio(struct r10bio *r10_bio)
258{
259 struct r10conf *conf = r10_bio->mddev->private;
260
261 put_all_bios(conf, r10_bio);
262 mempool_free(r10_bio, conf->r10bio_pool);
263}
264
265static void put_buf(struct r10bio *r10_bio)
266{
267 struct r10conf *conf = r10_bio->mddev->private;
268
269 mempool_free(r10_bio, conf->r10buf_pool);
270
271 lower_barrier(conf);
272}
273
274static void reschedule_retry(struct r10bio *r10_bio)
275{
276 unsigned long flags;
277 struct mddev *mddev = r10_bio->mddev;
278 struct r10conf *conf = mddev->private;
279
280 spin_lock_irqsave(&conf->device_lock, flags);
281 list_add(&r10_bio->retry_list, &conf->retry_list);
282 conf->nr_queued ++;
283 spin_unlock_irqrestore(&conf->device_lock, flags);
284
285
286 wake_up(&conf->wait_barrier);
287
288 md_wakeup_thread(mddev->thread);
289}
290
291
292
293
294
295
296static void raid_end_bio_io(struct r10bio *r10_bio)
297{
298 struct bio *bio = r10_bio->master_bio;
299 int done;
300 struct r10conf *conf = r10_bio->mddev->private;
301
302 if (bio->bi_phys_segments) {
303 unsigned long flags;
304 spin_lock_irqsave(&conf->device_lock, flags);
305 bio->bi_phys_segments--;
306 done = (bio->bi_phys_segments == 0);
307 spin_unlock_irqrestore(&conf->device_lock, flags);
308 } else
309 done = 1;
310 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
311 bio->bi_error = -EIO;
312 if (done) {
313 bio_endio(bio);
314
315
316
317
318 allow_barrier(conf);
319 }
320 free_r10bio(r10_bio);
321}
322
323
324
325
326static inline void update_head_pos(int slot, struct r10bio *r10_bio)
327{
328 struct r10conf *conf = r10_bio->mddev->private;
329
330 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
331 r10_bio->devs[slot].addr + (r10_bio->sectors);
332}
333
334
335
336
337static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
338 struct bio *bio, int *slotp, int *replp)
339{
340 int slot;
341 int repl = 0;
342
343 for (slot = 0; slot < conf->copies; slot++) {
344 if (r10_bio->devs[slot].bio == bio)
345 break;
346 if (r10_bio->devs[slot].repl_bio == bio) {
347 repl = 1;
348 break;
349 }
350 }
351
352 BUG_ON(slot == conf->copies);
353 update_head_pos(slot, r10_bio);
354
355 if (slotp)
356 *slotp = slot;
357 if (replp)
358 *replp = repl;
359 return r10_bio->devs[slot].devnum;
360}
361
362static void raid10_end_read_request(struct bio *bio)
363{
364 int uptodate = !bio->bi_error;
365 struct r10bio *r10_bio = bio->bi_private;
366 int slot, dev;
367 struct md_rdev *rdev;
368 struct r10conf *conf = r10_bio->mddev->private;
369
370 slot = r10_bio->read_slot;
371 dev = r10_bio->devs[slot].devnum;
372 rdev = r10_bio->devs[slot].rdev;
373
374
375
376 update_head_pos(slot, r10_bio);
377
378 if (uptodate) {
379
380
381
382
383
384
385
386
387
388 set_bit(R10BIO_Uptodate, &r10_bio->state);
389 } else {
390
391
392
393
394
395 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
396 rdev->raid_disk))
397 uptodate = 1;
398 }
399 if (uptodate) {
400 raid_end_bio_io(r10_bio);
401 rdev_dec_pending(rdev, conf->mddev);
402 } else {
403
404
405
406 char b[BDEVNAME_SIZE];
407 printk_ratelimited(KERN_ERR
408 "md/raid10:%s: %s: rescheduling sector %llu\n",
409 mdname(conf->mddev),
410 bdevname(rdev->bdev, b),
411 (unsigned long long)r10_bio->sector);
412 set_bit(R10BIO_ReadError, &r10_bio->state);
413 reschedule_retry(r10_bio);
414 }
415}
416
417static void close_write(struct r10bio *r10_bio)
418{
419
420 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
421 r10_bio->sectors,
422 !test_bit(R10BIO_Degraded, &r10_bio->state),
423 0);
424 md_write_end(r10_bio->mddev);
425}
426
427static void one_write_done(struct r10bio *r10_bio)
428{
429 if (atomic_dec_and_test(&r10_bio->remaining)) {
430 if (test_bit(R10BIO_WriteError, &r10_bio->state))
431 reschedule_retry(r10_bio);
432 else {
433 close_write(r10_bio);
434 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
435 reschedule_retry(r10_bio);
436 else
437 raid_end_bio_io(r10_bio);
438 }
439 }
440}
441
442static void raid10_end_write_request(struct bio *bio)
443{
444 struct r10bio *r10_bio = bio->bi_private;
445 int dev;
446 int dec_rdev = 1;
447 struct r10conf *conf = r10_bio->mddev->private;
448 int slot, repl;
449 struct md_rdev *rdev = NULL;
450
451 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
452
453 if (repl)
454 rdev = conf->mirrors[dev].replacement;
455 if (!rdev) {
456 smp_rmb();
457 repl = 0;
458 rdev = conf->mirrors[dev].rdev;
459 }
460
461
462
463 if (bio->bi_error) {
464 if (repl)
465
466
467
468 md_error(rdev->mddev, rdev);
469 else {
470 set_bit(WriteErrorSeen, &rdev->flags);
471 if (!test_and_set_bit(WantReplacement, &rdev->flags))
472 set_bit(MD_RECOVERY_NEEDED,
473 &rdev->mddev->recovery);
474 set_bit(R10BIO_WriteError, &r10_bio->state);
475 dec_rdev = 0;
476 }
477 } else {
478
479
480
481
482
483
484
485
486
487 sector_t first_bad;
488 int bad_sectors;
489
490
491
492
493
494
495
496
497
498 if (test_bit(In_sync, &rdev->flags) &&
499 !test_bit(Faulty, &rdev->flags))
500 set_bit(R10BIO_Uptodate, &r10_bio->state);
501
502
503 if (is_badblock(rdev,
504 r10_bio->devs[slot].addr,
505 r10_bio->sectors,
506 &first_bad, &bad_sectors)) {
507 bio_put(bio);
508 if (repl)
509 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
510 else
511 r10_bio->devs[slot].bio = IO_MADE_GOOD;
512 dec_rdev = 0;
513 set_bit(R10BIO_MadeGood, &r10_bio->state);
514 }
515 }
516
517
518
519
520
521
522 one_write_done(r10_bio);
523 if (dec_rdev)
524 rdev_dec_pending(rdev, conf->mddev);
525}
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
553{
554 int n,f;
555 sector_t sector;
556 sector_t chunk;
557 sector_t stripe;
558 int dev;
559 int slot = 0;
560 int last_far_set_start, last_far_set_size;
561
562 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
563 last_far_set_start *= geo->far_set_size;
564
565 last_far_set_size = geo->far_set_size;
566 last_far_set_size += (geo->raid_disks % geo->far_set_size);
567
568
569 chunk = r10bio->sector >> geo->chunk_shift;
570 sector = r10bio->sector & geo->chunk_mask;
571
572 chunk *= geo->near_copies;
573 stripe = chunk;
574 dev = sector_div(stripe, geo->raid_disks);
575 if (geo->far_offset)
576 stripe *= geo->far_copies;
577
578 sector += stripe << geo->chunk_shift;
579
580
581 for (n = 0; n < geo->near_copies; n++) {
582 int d = dev;
583 int set;
584 sector_t s = sector;
585 r10bio->devs[slot].devnum = d;
586 r10bio->devs[slot].addr = s;
587 slot++;
588
589 for (f = 1; f < geo->far_copies; f++) {
590 set = d / geo->far_set_size;
591 d += geo->near_copies;
592
593 if ((geo->raid_disks % geo->far_set_size) &&
594 (d > last_far_set_start)) {
595 d -= last_far_set_start;
596 d %= last_far_set_size;
597 d += last_far_set_start;
598 } else {
599 d %= geo->far_set_size;
600 d += geo->far_set_size * set;
601 }
602 s += geo->stride;
603 r10bio->devs[slot].devnum = d;
604 r10bio->devs[slot].addr = s;
605 slot++;
606 }
607 dev++;
608 if (dev >= geo->raid_disks) {
609 dev = 0;
610 sector += (geo->chunk_mask + 1);
611 }
612 }
613}
614
615static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
616{
617 struct geom *geo = &conf->geo;
618
619 if (conf->reshape_progress != MaxSector &&
620 ((r10bio->sector >= conf->reshape_progress) !=
621 conf->mddev->reshape_backwards)) {
622 set_bit(R10BIO_Previous, &r10bio->state);
623 geo = &conf->prev;
624 } else
625 clear_bit(R10BIO_Previous, &r10bio->state);
626
627 __raid10_find_phys(geo, r10bio);
628}
629
630static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
631{
632 sector_t offset, chunk, vchunk;
633
634
635
636 struct geom *geo = &conf->geo;
637 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
638 int far_set_size = geo->far_set_size;
639 int last_far_set_start;
640
641 if (geo->raid_disks % geo->far_set_size) {
642 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
643 last_far_set_start *= geo->far_set_size;
644
645 if (dev >= last_far_set_start) {
646 far_set_size = geo->far_set_size;
647 far_set_size += (geo->raid_disks % geo->far_set_size);
648 far_set_start = last_far_set_start;
649 }
650 }
651
652 offset = sector & geo->chunk_mask;
653 if (geo->far_offset) {
654 int fc;
655 chunk = sector >> geo->chunk_shift;
656 fc = sector_div(chunk, geo->far_copies);
657 dev -= fc * geo->near_copies;
658 if (dev < far_set_start)
659 dev += far_set_size;
660 } else {
661 while (sector >= geo->stride) {
662 sector -= geo->stride;
663 if (dev < (geo->near_copies + far_set_start))
664 dev += far_set_size - geo->near_copies;
665 else
666 dev -= geo->near_copies;
667 }
668 chunk = sector >> geo->chunk_shift;
669 }
670 vchunk = chunk * geo->raid_disks + dev;
671 sector_div(vchunk, geo->near_copies);
672 return (vchunk << geo->chunk_shift) + offset;
673}
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694static struct md_rdev *read_balance(struct r10conf *conf,
695 struct r10bio *r10_bio,
696 int *max_sectors)
697{
698 const sector_t this_sector = r10_bio->sector;
699 int disk, slot;
700 int sectors = r10_bio->sectors;
701 int best_good_sectors;
702 sector_t new_distance, best_dist;
703 struct md_rdev *best_rdev, *rdev = NULL;
704 int do_balance;
705 int best_slot;
706 struct geom *geo = &conf->geo;
707
708 raid10_find_phys(conf, r10_bio);
709 rcu_read_lock();
710retry:
711 sectors = r10_bio->sectors;
712 best_slot = -1;
713 best_rdev = NULL;
714 best_dist = MaxSector;
715 best_good_sectors = 0;
716 do_balance = 1;
717
718
719
720
721
722
723 if (conf->mddev->recovery_cp < MaxSector
724 && (this_sector + sectors >= conf->next_resync))
725 do_balance = 0;
726
727 for (slot = 0; slot < conf->copies ; slot++) {
728 sector_t first_bad;
729 int bad_sectors;
730 sector_t dev_sector;
731
732 if (r10_bio->devs[slot].bio == IO_BLOCKED)
733 continue;
734 disk = r10_bio->devs[slot].devnum;
735 rdev = rcu_dereference(conf->mirrors[disk].replacement);
736 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
737 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
738 rdev = rcu_dereference(conf->mirrors[disk].rdev);
739 if (rdev == NULL ||
740 test_bit(Faulty, &rdev->flags))
741 continue;
742 if (!test_bit(In_sync, &rdev->flags) &&
743 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
744 continue;
745
746 dev_sector = r10_bio->devs[slot].addr;
747 if (is_badblock(rdev, dev_sector, sectors,
748 &first_bad, &bad_sectors)) {
749 if (best_dist < MaxSector)
750
751 continue;
752 if (first_bad <= dev_sector) {
753
754
755
756
757 bad_sectors -= (dev_sector - first_bad);
758 if (!do_balance && sectors > bad_sectors)
759 sectors = bad_sectors;
760 if (best_good_sectors > sectors)
761 best_good_sectors = sectors;
762 } else {
763 sector_t good_sectors =
764 first_bad - dev_sector;
765 if (good_sectors > best_good_sectors) {
766 best_good_sectors = good_sectors;
767 best_slot = slot;
768 best_rdev = rdev;
769 }
770 if (!do_balance)
771
772 break;
773 }
774 continue;
775 } else
776 best_good_sectors = sectors;
777
778 if (!do_balance)
779 break;
780
781
782
783
784
785 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
786 break;
787
788
789 if (geo->far_copies > 1)
790 new_distance = r10_bio->devs[slot].addr;
791 else
792 new_distance = abs(r10_bio->devs[slot].addr -
793 conf->mirrors[disk].head_position);
794 if (new_distance < best_dist) {
795 best_dist = new_distance;
796 best_slot = slot;
797 best_rdev = rdev;
798 }
799 }
800 if (slot >= conf->copies) {
801 slot = best_slot;
802 rdev = best_rdev;
803 }
804
805 if (slot >= 0) {
806 atomic_inc(&rdev->nr_pending);
807 if (test_bit(Faulty, &rdev->flags)) {
808
809
810
811 rdev_dec_pending(rdev, conf->mddev);
812 goto retry;
813 }
814 r10_bio->read_slot = slot;
815 } else
816 rdev = NULL;
817 rcu_read_unlock();
818 *max_sectors = best_good_sectors;
819
820 return rdev;
821}
822
823static int raid10_congested(struct mddev *mddev, int bits)
824{
825 struct r10conf *conf = mddev->private;
826 int i, ret = 0;
827
828 if ((bits & (1 << WB_async_congested)) &&
829 conf->pending_count >= max_queued_requests)
830 return 1;
831
832 rcu_read_lock();
833 for (i = 0;
834 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
835 && ret == 0;
836 i++) {
837 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
838 if (rdev && !test_bit(Faulty, &rdev->flags)) {
839 struct request_queue *q = bdev_get_queue(rdev->bdev);
840
841 ret |= bdi_congested(&q->backing_dev_info, bits);
842 }
843 }
844 rcu_read_unlock();
845 return ret;
846}
847
848static void flush_pending_writes(struct r10conf *conf)
849{
850
851
852
853 spin_lock_irq(&conf->device_lock);
854
855 if (conf->pending_bio_list.head) {
856 struct bio *bio;
857 bio = bio_list_get(&conf->pending_bio_list);
858 conf->pending_count = 0;
859 spin_unlock_irq(&conf->device_lock);
860
861
862 bitmap_unplug(conf->mddev->bitmap);
863 wake_up(&conf->wait_barrier);
864
865 while (bio) {
866 struct bio *next = bio->bi_next;
867 bio->bi_next = NULL;
868 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
869 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
870
871 bio_endio(bio);
872 else
873 generic_make_request(bio);
874 bio = next;
875 }
876 } else
877 spin_unlock_irq(&conf->device_lock);
878}
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902static void raise_barrier(struct r10conf *conf, int force)
903{
904 BUG_ON(force && !conf->barrier);
905 spin_lock_irq(&conf->resync_lock);
906
907
908 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
909 conf->resync_lock);
910
911
912 conf->barrier++;
913
914
915 wait_event_lock_irq(conf->wait_barrier,
916 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
917 conf->resync_lock);
918
919 spin_unlock_irq(&conf->resync_lock);
920}
921
922static void lower_barrier(struct r10conf *conf)
923{
924 unsigned long flags;
925 spin_lock_irqsave(&conf->resync_lock, flags);
926 conf->barrier--;
927 spin_unlock_irqrestore(&conf->resync_lock, flags);
928 wake_up(&conf->wait_barrier);
929}
930
931static void wait_barrier(struct r10conf *conf)
932{
933 spin_lock_irq(&conf->resync_lock);
934 if (conf->barrier) {
935 conf->nr_waiting++;
936
937
938
939
940
941
942
943
944
945 wait_event_lock_irq(conf->wait_barrier,
946 !conf->barrier ||
947 (conf->nr_pending &&
948 current->bio_list &&
949 !bio_list_empty(current->bio_list)),
950 conf->resync_lock);
951 conf->nr_waiting--;
952 }
953 conf->nr_pending++;
954 spin_unlock_irq(&conf->resync_lock);
955}
956
957static void allow_barrier(struct r10conf *conf)
958{
959 unsigned long flags;
960 spin_lock_irqsave(&conf->resync_lock, flags);
961 conf->nr_pending--;
962 spin_unlock_irqrestore(&conf->resync_lock, flags);
963 wake_up(&conf->wait_barrier);
964}
965
966static void freeze_array(struct r10conf *conf, int extra)
967{
968
969
970
971
972
973
974
975
976
977
978
979
980 spin_lock_irq(&conf->resync_lock);
981 conf->barrier++;
982 conf->nr_waiting++;
983 wait_event_lock_irq_cmd(conf->wait_barrier,
984 conf->nr_pending == conf->nr_queued+extra,
985 conf->resync_lock,
986 flush_pending_writes(conf));
987
988 spin_unlock_irq(&conf->resync_lock);
989}
990
991static void unfreeze_array(struct r10conf *conf)
992{
993
994 spin_lock_irq(&conf->resync_lock);
995 conf->barrier--;
996 conf->nr_waiting--;
997 wake_up(&conf->wait_barrier);
998 spin_unlock_irq(&conf->resync_lock);
999}
1000
1001static sector_t choose_data_offset(struct r10bio *r10_bio,
1002 struct md_rdev *rdev)
1003{
1004 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1005 test_bit(R10BIO_Previous, &r10_bio->state))
1006 return rdev->data_offset;
1007 else
1008 return rdev->new_data_offset;
1009}
1010
1011struct raid10_plug_cb {
1012 struct blk_plug_cb cb;
1013 struct bio_list pending;
1014 int pending_cnt;
1015};
1016
1017static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1018{
1019 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1020 cb);
1021 struct mddev *mddev = plug->cb.data;
1022 struct r10conf *conf = mddev->private;
1023 struct bio *bio;
1024
1025 if (from_schedule || current->bio_list) {
1026 spin_lock_irq(&conf->device_lock);
1027 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1028 conf->pending_count += plug->pending_cnt;
1029 spin_unlock_irq(&conf->device_lock);
1030 wake_up(&conf->wait_barrier);
1031 md_wakeup_thread(mddev->thread);
1032 kfree(plug);
1033 return;
1034 }
1035
1036
1037 bio = bio_list_get(&plug->pending);
1038 bitmap_unplug(mddev->bitmap);
1039 wake_up(&conf->wait_barrier);
1040
1041 while (bio) {
1042 struct bio *next = bio->bi_next;
1043 bio->bi_next = NULL;
1044 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
1045 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
1046
1047 bio_endio(bio);
1048 else
1049 generic_make_request(bio);
1050 bio = next;
1051 }
1052 kfree(plug);
1053}
1054
1055static void __make_request(struct mddev *mddev, struct bio *bio)
1056{
1057 struct r10conf *conf = mddev->private;
1058 struct r10bio *r10_bio;
1059 struct bio *read_bio;
1060 int i;
1061 const int rw = bio_data_dir(bio);
1062 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1063 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1064 const unsigned long do_discard = (bio->bi_rw
1065 & (REQ_DISCARD | REQ_SECURE));
1066 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1067 unsigned long flags;
1068 struct md_rdev *blocked_rdev;
1069 struct blk_plug_cb *cb;
1070 struct raid10_plug_cb *plug = NULL;
1071 int sectors_handled;
1072 int max_sectors;
1073 int sectors;
1074
1075
1076
1077
1078
1079
1080 wait_barrier(conf);
1081
1082 sectors = bio_sectors(bio);
1083 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1084 bio->bi_iter.bi_sector < conf->reshape_progress &&
1085 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1086
1087
1088
1089 allow_barrier(conf);
1090 wait_event(conf->wait_barrier,
1091 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1092 conf->reshape_progress >= bio->bi_iter.bi_sector +
1093 sectors);
1094 wait_barrier(conf);
1095 }
1096 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1097 bio_data_dir(bio) == WRITE &&
1098 (mddev->reshape_backwards
1099 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1100 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1101 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1102 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1103
1104 mddev->reshape_position = conf->reshape_progress;
1105 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1106 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1107 md_wakeup_thread(mddev->thread);
1108 wait_event(mddev->sb_wait,
1109 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1110
1111 conf->reshape_safe = mddev->reshape_position;
1112 }
1113
1114 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1115
1116 r10_bio->master_bio = bio;
1117 r10_bio->sectors = sectors;
1118
1119 r10_bio->mddev = mddev;
1120 r10_bio->sector = bio->bi_iter.bi_sector;
1121 r10_bio->state = 0;
1122
1123
1124
1125
1126
1127
1128
1129
1130 bio->bi_phys_segments = 0;
1131 bio_clear_flag(bio, BIO_SEG_VALID);
1132
1133 if (rw == READ) {
1134
1135
1136
1137 struct md_rdev *rdev;
1138 int slot;
1139
1140read_again:
1141 rdev = read_balance(conf, r10_bio, &max_sectors);
1142 if (!rdev) {
1143 raid_end_bio_io(r10_bio);
1144 return;
1145 }
1146 slot = r10_bio->read_slot;
1147
1148 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1149 bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
1150 max_sectors);
1151
1152 r10_bio->devs[slot].bio = read_bio;
1153 r10_bio->devs[slot].rdev = rdev;
1154
1155 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1156 choose_data_offset(r10_bio, rdev);
1157 read_bio->bi_bdev = rdev->bdev;
1158 read_bio->bi_end_io = raid10_end_read_request;
1159 read_bio->bi_rw = READ | do_sync;
1160 read_bio->bi_private = r10_bio;
1161
1162 if (max_sectors < r10_bio->sectors) {
1163
1164
1165
1166 sectors_handled = (r10_bio->sector + max_sectors
1167 - bio->bi_iter.bi_sector);
1168 r10_bio->sectors = max_sectors;
1169 spin_lock_irq(&conf->device_lock);
1170 if (bio->bi_phys_segments == 0)
1171 bio->bi_phys_segments = 2;
1172 else
1173 bio->bi_phys_segments++;
1174 spin_unlock_irq(&conf->device_lock);
1175
1176
1177
1178
1179
1180 reschedule_retry(r10_bio);
1181
1182 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1183
1184 r10_bio->master_bio = bio;
1185 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1186 r10_bio->state = 0;
1187 r10_bio->mddev = mddev;
1188 r10_bio->sector = bio->bi_iter.bi_sector +
1189 sectors_handled;
1190 goto read_again;
1191 } else
1192 generic_make_request(read_bio);
1193 return;
1194 }
1195
1196
1197
1198
1199 if (conf->pending_count >= max_queued_requests) {
1200 md_wakeup_thread(mddev->thread);
1201 wait_event(conf->wait_barrier,
1202 conf->pending_count < max_queued_requests);
1203 }
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216 r10_bio->read_slot = -1;
1217 raid10_find_phys(conf, r10_bio);
1218retry_write:
1219 blocked_rdev = NULL;
1220 rcu_read_lock();
1221 max_sectors = r10_bio->sectors;
1222
1223 for (i = 0; i < conf->copies; i++) {
1224 int d = r10_bio->devs[i].devnum;
1225 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1226 struct md_rdev *rrdev = rcu_dereference(
1227 conf->mirrors[d].replacement);
1228 if (rdev == rrdev)
1229 rrdev = NULL;
1230 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1231 atomic_inc(&rdev->nr_pending);
1232 blocked_rdev = rdev;
1233 break;
1234 }
1235 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1236 atomic_inc(&rrdev->nr_pending);
1237 blocked_rdev = rrdev;
1238 break;
1239 }
1240 if (rdev && (test_bit(Faulty, &rdev->flags)))
1241 rdev = NULL;
1242 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1243 rrdev = NULL;
1244
1245 r10_bio->devs[i].bio = NULL;
1246 r10_bio->devs[i].repl_bio = NULL;
1247
1248 if (!rdev && !rrdev) {
1249 set_bit(R10BIO_Degraded, &r10_bio->state);
1250 continue;
1251 }
1252 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1253 sector_t first_bad;
1254 sector_t dev_sector = r10_bio->devs[i].addr;
1255 int bad_sectors;
1256 int is_bad;
1257
1258 is_bad = is_badblock(rdev, dev_sector,
1259 max_sectors,
1260 &first_bad, &bad_sectors);
1261 if (is_bad < 0) {
1262
1263
1264
1265 atomic_inc(&rdev->nr_pending);
1266 set_bit(BlockedBadBlocks, &rdev->flags);
1267 blocked_rdev = rdev;
1268 break;
1269 }
1270 if (is_bad && first_bad <= dev_sector) {
1271
1272 bad_sectors -= (dev_sector - first_bad);
1273 if (bad_sectors < max_sectors)
1274
1275
1276
1277 max_sectors = bad_sectors;
1278
1279
1280
1281
1282
1283
1284
1285
1286 continue;
1287 }
1288 if (is_bad) {
1289 int good_sectors = first_bad - dev_sector;
1290 if (good_sectors < max_sectors)
1291 max_sectors = good_sectors;
1292 }
1293 }
1294 if (rdev) {
1295 r10_bio->devs[i].bio = bio;
1296 atomic_inc(&rdev->nr_pending);
1297 }
1298 if (rrdev) {
1299 r10_bio->devs[i].repl_bio = bio;
1300 atomic_inc(&rrdev->nr_pending);
1301 }
1302 }
1303 rcu_read_unlock();
1304
1305 if (unlikely(blocked_rdev)) {
1306
1307 int j;
1308 int d;
1309
1310 for (j = 0; j < i; j++) {
1311 if (r10_bio->devs[j].bio) {
1312 d = r10_bio->devs[j].devnum;
1313 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1314 }
1315 if (r10_bio->devs[j].repl_bio) {
1316 struct md_rdev *rdev;
1317 d = r10_bio->devs[j].devnum;
1318 rdev = conf->mirrors[d].replacement;
1319 if (!rdev) {
1320
1321 smp_mb();
1322 rdev = conf->mirrors[d].rdev;
1323 }
1324 rdev_dec_pending(rdev, mddev);
1325 }
1326 }
1327 allow_barrier(conf);
1328 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1329 wait_barrier(conf);
1330 goto retry_write;
1331 }
1332
1333 if (max_sectors < r10_bio->sectors) {
1334
1335
1336
1337 r10_bio->sectors = max_sectors;
1338 spin_lock_irq(&conf->device_lock);
1339 if (bio->bi_phys_segments == 0)
1340 bio->bi_phys_segments = 2;
1341 else
1342 bio->bi_phys_segments++;
1343 spin_unlock_irq(&conf->device_lock);
1344 }
1345 sectors_handled = r10_bio->sector + max_sectors -
1346 bio->bi_iter.bi_sector;
1347
1348 atomic_set(&r10_bio->remaining, 1);
1349 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1350
1351 for (i = 0; i < conf->copies; i++) {
1352 struct bio *mbio;
1353 int d = r10_bio->devs[i].devnum;
1354 if (r10_bio->devs[i].bio) {
1355 struct md_rdev *rdev = conf->mirrors[d].rdev;
1356 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1357 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1358 max_sectors);
1359 r10_bio->devs[i].bio = mbio;
1360
1361 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
1362 choose_data_offset(r10_bio,
1363 rdev));
1364 mbio->bi_bdev = rdev->bdev;
1365 mbio->bi_end_io = raid10_end_write_request;
1366 mbio->bi_rw =
1367 WRITE | do_sync | do_fua | do_discard | do_same;
1368 mbio->bi_private = r10_bio;
1369
1370 atomic_inc(&r10_bio->remaining);
1371
1372 cb = blk_check_plugged(raid10_unplug, mddev,
1373 sizeof(*plug));
1374 if (cb)
1375 plug = container_of(cb, struct raid10_plug_cb,
1376 cb);
1377 else
1378 plug = NULL;
1379 spin_lock_irqsave(&conf->device_lock, flags);
1380 if (plug) {
1381 bio_list_add(&plug->pending, mbio);
1382 plug->pending_cnt++;
1383 } else {
1384 bio_list_add(&conf->pending_bio_list, mbio);
1385 conf->pending_count++;
1386 }
1387 spin_unlock_irqrestore(&conf->device_lock, flags);
1388 if (!plug)
1389 md_wakeup_thread(mddev->thread);
1390 }
1391
1392 if (r10_bio->devs[i].repl_bio) {
1393 struct md_rdev *rdev = conf->mirrors[d].replacement;
1394 if (rdev == NULL) {
1395
1396 smp_mb();
1397 rdev = conf->mirrors[d].rdev;
1398 }
1399 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1400 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1401 max_sectors);
1402 r10_bio->devs[i].repl_bio = mbio;
1403
1404 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
1405 choose_data_offset(
1406 r10_bio, rdev));
1407 mbio->bi_bdev = rdev->bdev;
1408 mbio->bi_end_io = raid10_end_write_request;
1409 mbio->bi_rw =
1410 WRITE | do_sync | do_fua | do_discard | do_same;
1411 mbio->bi_private = r10_bio;
1412
1413 atomic_inc(&r10_bio->remaining);
1414 spin_lock_irqsave(&conf->device_lock, flags);
1415 bio_list_add(&conf->pending_bio_list, mbio);
1416 conf->pending_count++;
1417 spin_unlock_irqrestore(&conf->device_lock, flags);
1418 if (!mddev_check_plugged(mddev))
1419 md_wakeup_thread(mddev->thread);
1420 }
1421 }
1422
1423
1424
1425
1426
1427 if (sectors_handled < bio_sectors(bio)) {
1428 one_write_done(r10_bio);
1429
1430
1431
1432 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1433
1434 r10_bio->master_bio = bio;
1435 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1436
1437 r10_bio->mddev = mddev;
1438 r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1439 r10_bio->state = 0;
1440 goto retry_write;
1441 }
1442 one_write_done(r10_bio);
1443}
1444
1445static void make_request(struct mddev *mddev, struct bio *bio)
1446{
1447 struct r10conf *conf = mddev->private;
1448 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1449 int chunk_sects = chunk_mask + 1;
1450
1451 struct bio *split;
1452
1453 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1454 md_flush_request(mddev, bio);
1455 return;
1456 }
1457
1458 md_write_start(mddev, bio);
1459
1460 do {
1461
1462
1463
1464
1465
1466 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1467 bio_sectors(bio) > chunk_sects
1468 && (conf->geo.near_copies < conf->geo.raid_disks
1469 || conf->prev.near_copies <
1470 conf->prev.raid_disks))) {
1471 split = bio_split(bio, chunk_sects -
1472 (bio->bi_iter.bi_sector &
1473 (chunk_sects - 1)),
1474 GFP_NOIO, fs_bio_set);
1475 bio_chain(split, bio);
1476 } else {
1477 split = bio;
1478 }
1479
1480 __make_request(mddev, split);
1481 } while (split != bio);
1482
1483
1484 wake_up(&conf->wait_barrier);
1485}
1486
1487static void status(struct seq_file *seq, struct mddev *mddev)
1488{
1489 struct r10conf *conf = mddev->private;
1490 int i;
1491
1492 if (conf->geo.near_copies < conf->geo.raid_disks)
1493 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1494 if (conf->geo.near_copies > 1)
1495 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1496 if (conf->geo.far_copies > 1) {
1497 if (conf->geo.far_offset)
1498 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1499 else
1500 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1501 if (conf->geo.far_set_size != conf->geo.raid_disks)
1502 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1503 }
1504 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1505 conf->geo.raid_disks - mddev->degraded);
1506 for (i = 0; i < conf->geo.raid_disks; i++)
1507 seq_printf(seq, "%s",
1508 conf->mirrors[i].rdev &&
1509 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1510 seq_printf(seq, "]");
1511}
1512
1513
1514
1515
1516
1517
1518static int _enough(struct r10conf *conf, int previous, int ignore)
1519{
1520 int first = 0;
1521 int has_enough = 0;
1522 int disks, ncopies;
1523 if (previous) {
1524 disks = conf->prev.raid_disks;
1525 ncopies = conf->prev.near_copies;
1526 } else {
1527 disks = conf->geo.raid_disks;
1528 ncopies = conf->geo.near_copies;
1529 }
1530
1531 rcu_read_lock();
1532 do {
1533 int n = conf->copies;
1534 int cnt = 0;
1535 int this = first;
1536 while (n--) {
1537 struct md_rdev *rdev;
1538 if (this != ignore &&
1539 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1540 test_bit(In_sync, &rdev->flags))
1541 cnt++;
1542 this = (this+1) % disks;
1543 }
1544 if (cnt == 0)
1545 goto out;
1546 first = (first + ncopies) % disks;
1547 } while (first != 0);
1548 has_enough = 1;
1549out:
1550 rcu_read_unlock();
1551 return has_enough;
1552}
1553
1554static int enough(struct r10conf *conf, int ignore)
1555{
1556
1557
1558
1559
1560
1561 return _enough(conf, 0, ignore) &&
1562 _enough(conf, 1, ignore);
1563}
1564
1565static void error(struct mddev *mddev, struct md_rdev *rdev)
1566{
1567 char b[BDEVNAME_SIZE];
1568 struct r10conf *conf = mddev->private;
1569 unsigned long flags;
1570
1571
1572
1573
1574
1575
1576
1577 spin_lock_irqsave(&conf->device_lock, flags);
1578 if (test_bit(In_sync, &rdev->flags)
1579 && !enough(conf, rdev->raid_disk)) {
1580
1581
1582
1583 spin_unlock_irqrestore(&conf->device_lock, flags);
1584 return;
1585 }
1586 if (test_and_clear_bit(In_sync, &rdev->flags))
1587 mddev->degraded++;
1588
1589
1590
1591 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1592 set_bit(Blocked, &rdev->flags);
1593 set_bit(Faulty, &rdev->flags);
1594 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1595 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1596 spin_unlock_irqrestore(&conf->device_lock, flags);
1597 printk(KERN_ALERT
1598 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1599 "md/raid10:%s: Operation continuing on %d devices.\n",
1600 mdname(mddev), bdevname(rdev->bdev, b),
1601 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1602}
1603
1604static void print_conf(struct r10conf *conf)
1605{
1606 int i;
1607 struct raid10_info *tmp;
1608
1609 printk(KERN_DEBUG "RAID10 conf printout:\n");
1610 if (!conf) {
1611 printk(KERN_DEBUG "(!conf)\n");
1612 return;
1613 }
1614 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1615 conf->geo.raid_disks);
1616
1617 for (i = 0; i < conf->geo.raid_disks; i++) {
1618 char b[BDEVNAME_SIZE];
1619 tmp = conf->mirrors + i;
1620 if (tmp->rdev)
1621 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1622 i, !test_bit(In_sync, &tmp->rdev->flags),
1623 !test_bit(Faulty, &tmp->rdev->flags),
1624 bdevname(tmp->rdev->bdev,b));
1625 }
1626}
1627
1628static void close_sync(struct r10conf *conf)
1629{
1630 wait_barrier(conf);
1631 allow_barrier(conf);
1632
1633 mempool_destroy(conf->r10buf_pool);
1634 conf->r10buf_pool = NULL;
1635}
1636
1637static int raid10_spare_active(struct mddev *mddev)
1638{
1639 int i;
1640 struct r10conf *conf = mddev->private;
1641 struct raid10_info *tmp;
1642 int count = 0;
1643 unsigned long flags;
1644
1645
1646
1647
1648
1649 for (i = 0; i < conf->geo.raid_disks; i++) {
1650 tmp = conf->mirrors + i;
1651 if (tmp->replacement
1652 && tmp->replacement->recovery_offset == MaxSector
1653 && !test_bit(Faulty, &tmp->replacement->flags)
1654 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1655
1656 if (!tmp->rdev
1657 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1658 count++;
1659 if (tmp->rdev) {
1660
1661
1662
1663
1664 set_bit(Faulty, &tmp->rdev->flags);
1665 sysfs_notify_dirent_safe(
1666 tmp->rdev->sysfs_state);
1667 }
1668 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1669 } else if (tmp->rdev
1670 && tmp->rdev->recovery_offset == MaxSector
1671 && !test_bit(Faulty, &tmp->rdev->flags)
1672 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1673 count++;
1674 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1675 }
1676 }
1677 spin_lock_irqsave(&conf->device_lock, flags);
1678 mddev->degraded -= count;
1679 spin_unlock_irqrestore(&conf->device_lock, flags);
1680
1681 print_conf(conf);
1682 return count;
1683}
1684
1685static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1686{
1687 struct r10conf *conf = mddev->private;
1688 int err = -EEXIST;
1689 int mirror;
1690 int first = 0;
1691 int last = conf->geo.raid_disks - 1;
1692
1693 if (mddev->recovery_cp < MaxSector)
1694
1695
1696
1697 return -EBUSY;
1698 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1699 return -EINVAL;
1700
1701 if (rdev->raid_disk >= 0)
1702 first = last = rdev->raid_disk;
1703
1704 if (rdev->saved_raid_disk >= first &&
1705 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1706 mirror = rdev->saved_raid_disk;
1707 else
1708 mirror = first;
1709 for ( ; mirror <= last ; mirror++) {
1710 struct raid10_info *p = &conf->mirrors[mirror];
1711 if (p->recovery_disabled == mddev->recovery_disabled)
1712 continue;
1713 if (p->rdev) {
1714 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1715 p->replacement != NULL)
1716 continue;
1717 clear_bit(In_sync, &rdev->flags);
1718 set_bit(Replacement, &rdev->flags);
1719 rdev->raid_disk = mirror;
1720 err = 0;
1721 if (mddev->gendisk)
1722 disk_stack_limits(mddev->gendisk, rdev->bdev,
1723 rdev->data_offset << 9);
1724 conf->fullsync = 1;
1725 rcu_assign_pointer(p->replacement, rdev);
1726 break;
1727 }
1728
1729 if (mddev->gendisk)
1730 disk_stack_limits(mddev->gendisk, rdev->bdev,
1731 rdev->data_offset << 9);
1732
1733 p->head_position = 0;
1734 p->recovery_disabled = mddev->recovery_disabled - 1;
1735 rdev->raid_disk = mirror;
1736 err = 0;
1737 if (rdev->saved_raid_disk != mirror)
1738 conf->fullsync = 1;
1739 rcu_assign_pointer(p->rdev, rdev);
1740 break;
1741 }
1742 md_integrity_add_rdev(rdev, mddev);
1743 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1744 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1745
1746 print_conf(conf);
1747 return err;
1748}
1749
1750static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1751{
1752 struct r10conf *conf = mddev->private;
1753 int err = 0;
1754 int number = rdev->raid_disk;
1755 struct md_rdev **rdevp;
1756 struct raid10_info *p = conf->mirrors + number;
1757
1758 print_conf(conf);
1759 if (rdev == p->rdev)
1760 rdevp = &p->rdev;
1761 else if (rdev == p->replacement)
1762 rdevp = &p->replacement;
1763 else
1764 return 0;
1765
1766 if (test_bit(In_sync, &rdev->flags) ||
1767 atomic_read(&rdev->nr_pending)) {
1768 err = -EBUSY;
1769 goto abort;
1770 }
1771
1772
1773
1774 if (!test_bit(Faulty, &rdev->flags) &&
1775 mddev->recovery_disabled != p->recovery_disabled &&
1776 (!p->replacement || p->replacement == rdev) &&
1777 number < conf->geo.raid_disks &&
1778 enough(conf, -1)) {
1779 err = -EBUSY;
1780 goto abort;
1781 }
1782 *rdevp = NULL;
1783 synchronize_rcu();
1784 if (atomic_read(&rdev->nr_pending)) {
1785
1786 err = -EBUSY;
1787 *rdevp = rdev;
1788 goto abort;
1789 } else if (p->replacement) {
1790
1791 p->rdev = p->replacement;
1792 clear_bit(Replacement, &p->replacement->flags);
1793 smp_mb();
1794
1795
1796 p->replacement = NULL;
1797 clear_bit(WantReplacement, &rdev->flags);
1798 } else
1799
1800
1801
1802 clear_bit(WantReplacement, &rdev->flags);
1803
1804 err = md_integrity_register(mddev);
1805
1806abort:
1807
1808 print_conf(conf);
1809 return err;
1810}
1811
1812static void end_sync_read(struct bio *bio)
1813{
1814 struct r10bio *r10_bio = bio->bi_private;
1815 struct r10conf *conf = r10_bio->mddev->private;
1816 int d;
1817
1818 if (bio == r10_bio->master_bio) {
1819
1820 d = r10_bio->read_slot;
1821 } else
1822 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1823
1824 if (!bio->bi_error)
1825 set_bit(R10BIO_Uptodate, &r10_bio->state);
1826 else
1827
1828
1829
1830 atomic_add(r10_bio->sectors,
1831 &conf->mirrors[d].rdev->corrected_errors);
1832
1833
1834
1835
1836 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1837 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1838 atomic_dec_and_test(&r10_bio->remaining)) {
1839
1840
1841
1842 reschedule_retry(r10_bio);
1843 }
1844}
1845
1846static void end_sync_request(struct r10bio *r10_bio)
1847{
1848 struct mddev *mddev = r10_bio->mddev;
1849
1850 while (atomic_dec_and_test(&r10_bio->remaining)) {
1851 if (r10_bio->master_bio == NULL) {
1852
1853 sector_t s = r10_bio->sectors;
1854 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1855 test_bit(R10BIO_WriteError, &r10_bio->state))
1856 reschedule_retry(r10_bio);
1857 else
1858 put_buf(r10_bio);
1859 md_done_sync(mddev, s, 1);
1860 break;
1861 } else {
1862 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1863 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1864 test_bit(R10BIO_WriteError, &r10_bio->state))
1865 reschedule_retry(r10_bio);
1866 else
1867 put_buf(r10_bio);
1868 r10_bio = r10_bio2;
1869 }
1870 }
1871}
1872
1873static void end_sync_write(struct bio *bio)
1874{
1875 struct r10bio *r10_bio = bio->bi_private;
1876 struct mddev *mddev = r10_bio->mddev;
1877 struct r10conf *conf = mddev->private;
1878 int d;
1879 sector_t first_bad;
1880 int bad_sectors;
1881 int slot;
1882 int repl;
1883 struct md_rdev *rdev = NULL;
1884
1885 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1886 if (repl)
1887 rdev = conf->mirrors[d].replacement;
1888 else
1889 rdev = conf->mirrors[d].rdev;
1890
1891 if (bio->bi_error) {
1892 if (repl)
1893 md_error(mddev, rdev);
1894 else {
1895 set_bit(WriteErrorSeen, &rdev->flags);
1896 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1897 set_bit(MD_RECOVERY_NEEDED,
1898 &rdev->mddev->recovery);
1899 set_bit(R10BIO_WriteError, &r10_bio->state);
1900 }
1901 } else if (is_badblock(rdev,
1902 r10_bio->devs[slot].addr,
1903 r10_bio->sectors,
1904 &first_bad, &bad_sectors))
1905 set_bit(R10BIO_MadeGood, &r10_bio->state);
1906
1907 rdev_dec_pending(rdev, mddev);
1908
1909 end_sync_request(r10_bio);
1910}
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1929{
1930 struct r10conf *conf = mddev->private;
1931 int i, first;
1932 struct bio *tbio, *fbio;
1933 int vcnt;
1934
1935 atomic_set(&r10_bio->remaining, 1);
1936
1937
1938 for (i=0; i<conf->copies; i++)
1939 if (!r10_bio->devs[i].bio->bi_error)
1940 break;
1941
1942 if (i == conf->copies)
1943 goto done;
1944
1945 first = i;
1946 fbio = r10_bio->devs[i].bio;
1947
1948 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
1949
1950 for (i=0 ; i < conf->copies ; i++) {
1951 int j, d;
1952
1953 tbio = r10_bio->devs[i].bio;
1954
1955 if (tbio->bi_end_io != end_sync_read)
1956 continue;
1957 if (i == first)
1958 continue;
1959 if (!r10_bio->devs[i].bio->bi_error) {
1960
1961
1962
1963
1964 int sectors = r10_bio->sectors;
1965 for (j = 0; j < vcnt; j++) {
1966 int len = PAGE_SIZE;
1967 if (sectors < (len / 512))
1968 len = sectors * 512;
1969 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1970 page_address(tbio->bi_io_vec[j].bv_page),
1971 len))
1972 break;
1973 sectors -= len/512;
1974 }
1975 if (j == vcnt)
1976 continue;
1977 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
1978 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1979
1980 continue;
1981 }
1982
1983
1984
1985
1986
1987 bio_reset(tbio);
1988
1989 tbio->bi_vcnt = vcnt;
1990 tbio->bi_iter.bi_size = r10_bio->sectors << 9;
1991 tbio->bi_rw = WRITE;
1992 tbio->bi_private = r10_bio;
1993 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
1994 tbio->bi_end_io = end_sync_write;
1995
1996 bio_copy_data(tbio, fbio);
1997
1998 d = r10_bio->devs[i].devnum;
1999 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2000 atomic_inc(&r10_bio->remaining);
2001 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2002
2003 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2004 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2005 generic_make_request(tbio);
2006 }
2007
2008
2009
2010
2011 for (i = 0; i < conf->copies; i++) {
2012 int d;
2013
2014 tbio = r10_bio->devs[i].repl_bio;
2015 if (!tbio || !tbio->bi_end_io)
2016 continue;
2017 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2018 && r10_bio->devs[i].bio != fbio)
2019 bio_copy_data(tbio, fbio);
2020 d = r10_bio->devs[i].devnum;
2021 atomic_inc(&r10_bio->remaining);
2022 md_sync_acct(conf->mirrors[d].replacement->bdev,
2023 bio_sectors(tbio));
2024 generic_make_request(tbio);
2025 }
2026
2027done:
2028 if (atomic_dec_and_test(&r10_bio->remaining)) {
2029 md_done_sync(mddev, r10_bio->sectors, 1);
2030 put_buf(r10_bio);
2031 }
2032}
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044static void fix_recovery_read_error(struct r10bio *r10_bio)
2045{
2046
2047
2048
2049
2050
2051
2052
2053 struct mddev *mddev = r10_bio->mddev;
2054 struct r10conf *conf = mddev->private;
2055 struct bio *bio = r10_bio->devs[0].bio;
2056 sector_t sect = 0;
2057 int sectors = r10_bio->sectors;
2058 int idx = 0;
2059 int dr = r10_bio->devs[0].devnum;
2060 int dw = r10_bio->devs[1].devnum;
2061
2062 while (sectors) {
2063 int s = sectors;
2064 struct md_rdev *rdev;
2065 sector_t addr;
2066 int ok;
2067
2068 if (s > (PAGE_SIZE>>9))
2069 s = PAGE_SIZE >> 9;
2070
2071 rdev = conf->mirrors[dr].rdev;
2072 addr = r10_bio->devs[0].addr + sect,
2073 ok = sync_page_io(rdev,
2074 addr,
2075 s << 9,
2076 bio->bi_io_vec[idx].bv_page,
2077 READ, false);
2078 if (ok) {
2079 rdev = conf->mirrors[dw].rdev;
2080 addr = r10_bio->devs[1].addr + sect;
2081 ok = sync_page_io(rdev,
2082 addr,
2083 s << 9,
2084 bio->bi_io_vec[idx].bv_page,
2085 WRITE, false);
2086 if (!ok) {
2087 set_bit(WriteErrorSeen, &rdev->flags);
2088 if (!test_and_set_bit(WantReplacement,
2089 &rdev->flags))
2090 set_bit(MD_RECOVERY_NEEDED,
2091 &rdev->mddev->recovery);
2092 }
2093 }
2094 if (!ok) {
2095
2096
2097
2098
2099 rdev_set_badblocks(rdev, addr, s, 0);
2100
2101 if (rdev != conf->mirrors[dw].rdev) {
2102
2103 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2104 addr = r10_bio->devs[1].addr + sect;
2105 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2106 if (!ok) {
2107
2108 printk(KERN_NOTICE
2109 "md/raid10:%s: recovery aborted"
2110 " due to read error\n",
2111 mdname(mddev));
2112
2113 conf->mirrors[dw].recovery_disabled
2114 = mddev->recovery_disabled;
2115 set_bit(MD_RECOVERY_INTR,
2116 &mddev->recovery);
2117 break;
2118 }
2119 }
2120 }
2121
2122 sectors -= s;
2123 sect += s;
2124 idx++;
2125 }
2126}
2127
2128static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2129{
2130 struct r10conf *conf = mddev->private;
2131 int d;
2132 struct bio *wbio, *wbio2;
2133
2134 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2135 fix_recovery_read_error(r10_bio);
2136 end_sync_request(r10_bio);
2137 return;
2138 }
2139
2140
2141
2142
2143
2144 d = r10_bio->devs[1].devnum;
2145 wbio = r10_bio->devs[1].bio;
2146 wbio2 = r10_bio->devs[1].repl_bio;
2147
2148
2149
2150
2151 if (wbio2 && !wbio2->bi_end_io)
2152 wbio2 = NULL;
2153 if (wbio->bi_end_io) {
2154 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2155 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2156 generic_make_request(wbio);
2157 }
2158 if (wbio2) {
2159 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2160 md_sync_acct(conf->mirrors[d].replacement->bdev,
2161 bio_sectors(wbio2));
2162 generic_make_request(wbio2);
2163 }
2164}
2165
2166
2167
2168
2169
2170
2171
2172static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2173{
2174 struct timespec cur_time_mon;
2175 unsigned long hours_since_last;
2176 unsigned int read_errors = atomic_read(&rdev->read_errors);
2177
2178 ktime_get_ts(&cur_time_mon);
2179
2180 if (rdev->last_read_error.tv_sec == 0 &&
2181 rdev->last_read_error.tv_nsec == 0) {
2182
2183 rdev->last_read_error = cur_time_mon;
2184 return;
2185 }
2186
2187 hours_since_last = (cur_time_mon.tv_sec -
2188 rdev->last_read_error.tv_sec) / 3600;
2189
2190 rdev->last_read_error = cur_time_mon;
2191
2192
2193
2194
2195
2196
2197 if (hours_since_last >= 8 * sizeof(read_errors))
2198 atomic_set(&rdev->read_errors, 0);
2199 else
2200 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2201}
2202
2203static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2204 int sectors, struct page *page, int rw)
2205{
2206 sector_t first_bad;
2207 int bad_sectors;
2208
2209 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2210 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2211 return -1;
2212 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2213
2214 return 1;
2215 if (rw == WRITE) {
2216 set_bit(WriteErrorSeen, &rdev->flags);
2217 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2218 set_bit(MD_RECOVERY_NEEDED,
2219 &rdev->mddev->recovery);
2220 }
2221
2222 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2223 md_error(rdev->mddev, rdev);
2224 return 0;
2225}
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2236{
2237 int sect = 0;
2238 int sectors = r10_bio->sectors;
2239 struct md_rdev*rdev;
2240 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2241 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2242
2243
2244
2245
2246 rdev = conf->mirrors[d].rdev;
2247
2248 if (test_bit(Faulty, &rdev->flags))
2249
2250
2251 return;
2252
2253 check_decay_read_errors(mddev, rdev);
2254 atomic_inc(&rdev->read_errors);
2255 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2256 char b[BDEVNAME_SIZE];
2257 bdevname(rdev->bdev, b);
2258
2259 printk(KERN_NOTICE
2260 "md/raid10:%s: %s: Raid device exceeded "
2261 "read_error threshold [cur %d:max %d]\n",
2262 mdname(mddev), b,
2263 atomic_read(&rdev->read_errors), max_read_errors);
2264 printk(KERN_NOTICE
2265 "md/raid10:%s: %s: Failing raid device\n",
2266 mdname(mddev), b);
2267 md_error(mddev, conf->mirrors[d].rdev);
2268 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2269 return;
2270 }
2271
2272 while(sectors) {
2273 int s = sectors;
2274 int sl = r10_bio->read_slot;
2275 int success = 0;
2276 int start;
2277
2278 if (s > (PAGE_SIZE>>9))
2279 s = PAGE_SIZE >> 9;
2280
2281 rcu_read_lock();
2282 do {
2283 sector_t first_bad;
2284 int bad_sectors;
2285
2286 d = r10_bio->devs[sl].devnum;
2287 rdev = rcu_dereference(conf->mirrors[d].rdev);
2288 if (rdev &&
2289 test_bit(In_sync, &rdev->flags) &&
2290 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2291 &first_bad, &bad_sectors) == 0) {
2292 atomic_inc(&rdev->nr_pending);
2293 rcu_read_unlock();
2294 success = sync_page_io(rdev,
2295 r10_bio->devs[sl].addr +
2296 sect,
2297 s<<9,
2298 conf->tmppage, READ, false);
2299 rdev_dec_pending(rdev, mddev);
2300 rcu_read_lock();
2301 if (success)
2302 break;
2303 }
2304 sl++;
2305 if (sl == conf->copies)
2306 sl = 0;
2307 } while (!success && sl != r10_bio->read_slot);
2308 rcu_read_unlock();
2309
2310 if (!success) {
2311
2312
2313
2314
2315 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2316 rdev = conf->mirrors[dn].rdev;
2317
2318 if (!rdev_set_badblocks(
2319 rdev,
2320 r10_bio->devs[r10_bio->read_slot].addr
2321 + sect,
2322 s, 0)) {
2323 md_error(mddev, rdev);
2324 r10_bio->devs[r10_bio->read_slot].bio
2325 = IO_BLOCKED;
2326 }
2327 break;
2328 }
2329
2330 start = sl;
2331
2332 rcu_read_lock();
2333 while (sl != r10_bio->read_slot) {
2334 char b[BDEVNAME_SIZE];
2335
2336 if (sl==0)
2337 sl = conf->copies;
2338 sl--;
2339 d = r10_bio->devs[sl].devnum;
2340 rdev = rcu_dereference(conf->mirrors[d].rdev);
2341 if (!rdev ||
2342 !test_bit(In_sync, &rdev->flags))
2343 continue;
2344
2345 atomic_inc(&rdev->nr_pending);
2346 rcu_read_unlock();
2347 if (r10_sync_page_io(rdev,
2348 r10_bio->devs[sl].addr +
2349 sect,
2350 s, conf->tmppage, WRITE)
2351 == 0) {
2352
2353 printk(KERN_NOTICE
2354 "md/raid10:%s: read correction "
2355 "write failed"
2356 " (%d sectors at %llu on %s)\n",
2357 mdname(mddev), s,
2358 (unsigned long long)(
2359 sect +
2360 choose_data_offset(r10_bio,
2361 rdev)),
2362 bdevname(rdev->bdev, b));
2363 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2364 "drive\n",
2365 mdname(mddev),
2366 bdevname(rdev->bdev, b));
2367 }
2368 rdev_dec_pending(rdev, mddev);
2369 rcu_read_lock();
2370 }
2371 sl = start;
2372 while (sl != r10_bio->read_slot) {
2373 char b[BDEVNAME_SIZE];
2374
2375 if (sl==0)
2376 sl = conf->copies;
2377 sl--;
2378 d = r10_bio->devs[sl].devnum;
2379 rdev = rcu_dereference(conf->mirrors[d].rdev);
2380 if (!rdev ||
2381 !test_bit(In_sync, &rdev->flags))
2382 continue;
2383
2384 atomic_inc(&rdev->nr_pending);
2385 rcu_read_unlock();
2386 switch (r10_sync_page_io(rdev,
2387 r10_bio->devs[sl].addr +
2388 sect,
2389 s, conf->tmppage,
2390 READ)) {
2391 case 0:
2392
2393 printk(KERN_NOTICE
2394 "md/raid10:%s: unable to read back "
2395 "corrected sectors"
2396 " (%d sectors at %llu on %s)\n",
2397 mdname(mddev), s,
2398 (unsigned long long)(
2399 sect +
2400 choose_data_offset(r10_bio, rdev)),
2401 bdevname(rdev->bdev, b));
2402 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2403 "drive\n",
2404 mdname(mddev),
2405 bdevname(rdev->bdev, b));
2406 break;
2407 case 1:
2408 printk(KERN_INFO
2409 "md/raid10:%s: read error corrected"
2410 " (%d sectors at %llu on %s)\n",
2411 mdname(mddev), s,
2412 (unsigned long long)(
2413 sect +
2414 choose_data_offset(r10_bio, rdev)),
2415 bdevname(rdev->bdev, b));
2416 atomic_add(s, &rdev->corrected_errors);
2417 }
2418
2419 rdev_dec_pending(rdev, mddev);
2420 rcu_read_lock();
2421 }
2422 rcu_read_unlock();
2423
2424 sectors -= s;
2425 sect += s;
2426 }
2427}
2428
2429static int narrow_write_error(struct r10bio *r10_bio, int i)
2430{
2431 struct bio *bio = r10_bio->master_bio;
2432 struct mddev *mddev = r10_bio->mddev;
2433 struct r10conf *conf = mddev->private;
2434 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446 int block_sectors;
2447 sector_t sector;
2448 int sectors;
2449 int sect_to_write = r10_bio->sectors;
2450 int ok = 1;
2451
2452 if (rdev->badblocks.shift < 0)
2453 return 0;
2454
2455 block_sectors = roundup(1 << rdev->badblocks.shift,
2456 bdev_logical_block_size(rdev->bdev) >> 9);
2457 sector = r10_bio->sector;
2458 sectors = ((r10_bio->sector + block_sectors)
2459 & ~(sector_t)(block_sectors - 1))
2460 - sector;
2461
2462 while (sect_to_write) {
2463 struct bio *wbio;
2464 if (sectors > sect_to_write)
2465 sectors = sect_to_write;
2466
2467 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2468 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2469 wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
2470 choose_data_offset(r10_bio, rdev) +
2471 (sector - r10_bio->sector));
2472 wbio->bi_bdev = rdev->bdev;
2473 if (submit_bio_wait(WRITE, wbio) < 0)
2474
2475 ok = rdev_set_badblocks(rdev, sector,
2476 sectors, 0)
2477 && ok;
2478
2479 bio_put(wbio);
2480 sect_to_write -= sectors;
2481 sector += sectors;
2482 sectors = block_sectors;
2483 }
2484 return ok;
2485}
2486
2487static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2488{
2489 int slot = r10_bio->read_slot;
2490 struct bio *bio;
2491 struct r10conf *conf = mddev->private;
2492 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2493 char b[BDEVNAME_SIZE];
2494 unsigned long do_sync;
2495 int max_sectors;
2496
2497 /* We got a read error.  Maybe the whole drive is bad, or maybe it
2498  * is just this one block.  Freeze all other IO and try to fix the
2499  * block by reading it from the other copies and rewriting the
2500  * failing device (fix_read_error does that work).  Then retry the
2501  * original request, redirected to a different mirror.
2502  * If the array is read-only we cannot attempt a repair, so the slot
2503  * is simply marked IO_BLOCKED.
2504  */
2505 bio = r10_bio->devs[slot].bio;
2506 bdevname(bio->bi_bdev, b);
2507 bio_put(bio);
2508 r10_bio->devs[slot].bio = NULL;
2509
2510 if (mddev->ro == 0) {
2511 freeze_array(conf, 1);
2512 fix_read_error(conf, mddev, r10_bio);
2513 unfreeze_array(conf);
2514 } else
2515 r10_bio->devs[slot].bio = IO_BLOCKED;
2516
2517 rdev_dec_pending(rdev, mddev);
2518
2519read_more:
2520 rdev = read_balance(conf, r10_bio, &max_sectors);
2521 if (rdev == NULL) {
2522 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2523 " read error for block %llu\n",
2524 mdname(mddev), b,
2525 (unsigned long long)r10_bio->sector);
2526 raid_end_bio_io(r10_bio);
2527 return;
2528 }
2529
2530 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2531 slot = r10_bio->read_slot;
2532 printk_ratelimited(
2533 KERN_ERR
2534 "md/raid10:%s: %s: redirecting "
2535 "sector %llu to another mirror\n",
2536 mdname(mddev),
2537 bdevname(rdev->bdev, b),
2538 (unsigned long long)r10_bio->sector);
2539 bio = bio_clone_mddev(r10_bio->master_bio,
2540 GFP_NOIO, mddev);
2541 bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
2542 r10_bio->devs[slot].bio = bio;
2543 r10_bio->devs[slot].rdev = rdev;
2544 bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
2545 + choose_data_offset(r10_bio, rdev);
2546 bio->bi_bdev = rdev->bdev;
2547 bio->bi_rw = READ | do_sync;
2548 bio->bi_private = r10_bio;
2549 bio->bi_end_io = raid10_end_read_request;
2550 if (max_sectors < r10_bio->sectors) {
2551 /* this rdev can only service part of the request - split it up */
2552 struct bio *mbio = r10_bio->master_bio;
2553 int sectors_handled =
2554 r10_bio->sector + max_sectors
2555 - mbio->bi_iter.bi_sector;
2556 r10_bio->sectors = max_sectors;
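			/* bi_phys_segments is borrowed as a count of the r10_bios still
			 * outstanding against the master bio; the master is only
			 * completed once it drops back to zero. */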
2557 spin_lock_irq(&conf->device_lock);
2558 if (mbio->bi_phys_segments == 0)
2559 mbio->bi_phys_segments = 2;
2560 else
2561 mbio->bi_phys_segments++;
2562 spin_unlock_irq(&conf->device_lock);
2563 generic_make_request(bio);
2564
2565 r10_bio = mempool_alloc(conf->r10bio_pool,
2566 GFP_NOIO);
2567 r10_bio->master_bio = mbio;
2568 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2569 r10_bio->state = 0;
2570 set_bit(R10BIO_ReadError,
2571 &r10_bio->state);
2572 r10_bio->mddev = mddev;
2573 r10_bio->sector = mbio->bi_iter.bi_sector
2574 + sectors_handled;
2575
2576 goto read_more;
2577 } else
2578 generic_make_request(bio);
2579}
2580
2581static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2582{
2583 /* Some sort of write request has finished and it succeeded in
2584  * writing where we thought there was a bad block, so forget the
2585  * bad block; or it failed and we need to record a bad block
2586  * (possibly after retrying one block at a time via
2587  * narrow_write_error).  Handle both the normal-IO and the
2588  * sync/recovery cases. */
2589 int m;
2590 struct md_rdev *rdev;
2591
2592 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2593 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2594 for (m = 0; m < conf->copies; m++) {
2595 int dev = r10_bio->devs[m].devnum;
2596 rdev = conf->mirrors[dev].rdev;
2597 if (r10_bio->devs[m].bio == NULL)
2598 continue;
2599 if (!r10_bio->devs[m].bio->bi_error) {
2600 rdev_clear_badblocks(
2601 rdev,
2602 r10_bio->devs[m].addr,
2603 r10_bio->sectors, 0);
2604 } else {
2605 if (!rdev_set_badblocks(
2606 rdev,
2607 r10_bio->devs[m].addr,
2608 r10_bio->sectors, 0))
2609 md_error(conf->mddev, rdev);
2610 }
2611 rdev = conf->mirrors[dev].replacement;
2612 if (r10_bio->devs[m].repl_bio == NULL)
2613 continue;
2614
2615 if (!r10_bio->devs[m].repl_bio->bi_error) {
2616 rdev_clear_badblocks(
2617 rdev,
2618 r10_bio->devs[m].addr,
2619 r10_bio->sectors, 0);
2620 } else {
2621 if (!rdev_set_badblocks(
2622 rdev,
2623 r10_bio->devs[m].addr,
2624 r10_bio->sectors, 0))
2625 md_error(conf->mddev, rdev);
2626 }
2627 }
2628 put_buf(r10_bio);
2629 } else {
2630 bool fail = false;
2631 for (m = 0; m < conf->copies; m++) {
2632 int dev = r10_bio->devs[m].devnum;
2633 struct bio *bio = r10_bio->devs[m].bio;
2634 rdev = conf->mirrors[dev].rdev;
2635 if (bio == IO_MADE_GOOD) {
2636 rdev_clear_badblocks(
2637 rdev,
2638 r10_bio->devs[m].addr,
2639 r10_bio->sectors, 0);
2640 rdev_dec_pending(rdev, conf->mddev);
2641 } else if (bio != NULL && bio->bi_error) {
2642 fail = true;
2643 if (!narrow_write_error(r10_bio, m)) {
2644 md_error(conf->mddev, rdev);
2645 set_bit(R10BIO_Degraded,
2646 &r10_bio->state);
2647 }
2648 rdev_dec_pending(rdev, conf->mddev);
2649 }
2650 bio = r10_bio->devs[m].repl_bio;
2651 rdev = conf->mirrors[dev].replacement;
2652 if (rdev && bio == IO_MADE_GOOD) {
2653 rdev_clear_badblocks(
2654 rdev,
2655 r10_bio->devs[m].addr,
2656 r10_bio->sectors, 0);
2657 rdev_dec_pending(rdev, conf->mddev);
2658 }
2659 }
2660 if (fail) {
2661 spin_lock_irq(&conf->device_lock);
2662 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2663 spin_unlock_irq(&conf->device_lock);
2664 md_wakeup_thread(conf->mddev->thread);
2665 } else {
2666 if (test_bit(R10BIO_WriteError,
2667 &r10_bio->state))
2668 close_write(r10_bio);
2669 raid_end_bio_io(r10_bio);
2670 }
2671 }
2672}
2673
2674static void raid10d(struct md_thread *thread)
2675{
2676 struct mddev *mddev = thread->mddev;
2677 struct r10bio *r10_bio;
2678 unsigned long flags;
2679 struct r10conf *conf = mddev->private;
2680 struct list_head *head = &conf->retry_list;
2681 struct blk_plug plug;
2682
2683 md_check_recovery(mddev);
2684
2685 if (!list_empty_careful(&conf->bio_end_io_list) &&
2686 !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
2687 LIST_HEAD(tmp);
2688 spin_lock_irqsave(&conf->device_lock, flags);
2689 if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
2690 list_add(&tmp, &conf->bio_end_io_list);
2691 list_del_init(&conf->bio_end_io_list);
2692 }
2693 spin_unlock_irqrestore(&conf->device_lock, flags);
2694 while (!list_empty(&tmp)) {
2695 r10_bio = list_first_entry(&tmp, struct r10bio,
2696 retry_list);
2697 list_del(&r10_bio->retry_list);
2698 if (mddev->degraded)
2699 set_bit(R10BIO_Degraded, &r10_bio->state);
2700
2701 if (test_bit(R10BIO_WriteError,
2702 &r10_bio->state))
2703 close_write(r10_bio);
2704 raid_end_bio_io(r10_bio);
2705 }
2706 }
2707
2708 blk_start_plug(&plug);
2709 for (;;) {
2710
2711 flush_pending_writes(conf);
2712
2713 spin_lock_irqsave(&conf->device_lock, flags);
2714 if (list_empty(head)) {
2715 spin_unlock_irqrestore(&conf->device_lock, flags);
2716 break;
2717 }
2718 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2719 list_del(head->prev);
2720 conf->nr_queued--;
2721 spin_unlock_irqrestore(&conf->device_lock, flags);
2722
2723 mddev = r10_bio->mddev;
2724 conf = mddev->private;
2725 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2726 test_bit(R10BIO_WriteError, &r10_bio->state))
2727 handle_write_completed(conf, r10_bio);
2728 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2729 reshape_request_write(mddev, r10_bio);
2730 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2731 sync_request_write(mddev, r10_bio);
2732 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2733 recovery_request_write(mddev, r10_bio);
2734 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2735 handle_read_error(mddev, r10_bio);
2736 else {
2737 /* just a partial read to be scheduled from a
2738  * separate context
2739  */
2740 int slot = r10_bio->read_slot;
2741 generic_make_request(r10_bio->devs[slot].bio);
2742 }
2743
2744 cond_resched();
2745 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2746 md_check_recovery(mddev);
2747 }
2748 blk_finish_plug(&plug);
2749}
2750
2751static int init_resync(struct r10conf *conf)
2752{
2753 int buffs;
2754 int i;
2755
2756 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2757 BUG_ON(conf->r10buf_pool);
2758 conf->have_replacement = 0;
2759 for (i = 0; i < conf->geo.raid_disks; i++)
2760 if (conf->mirrors[i].replacement)
2761 conf->have_replacement = 1;
2762 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2763 if (!conf->r10buf_pool)
2764 return -ENOMEM;
2765 conf->next_resync = 0;
2766 return 0;
2767}
2768
2769/*
2770 * perform a "sync" on one "block"
2771 *
2772 * We need to make sure that no normal I/O request - particularly write
2773 * requests - conflict with active sync requests.
2774 *
2775 * This is achieved by tracking pending requests and a 'barrier' concept
2776 * that can be installed to exclude normal IO requests.
2777 *
2778 * Resync and recovery are handled very differently.
2779 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
2780 *
2781 * For resync, we iterate over virtual addresses, read all copies
2782 * and update if there are differences.
2783 * For recovery, we iterate over physical addresses, read a good
2784 * value for each non-in_sync drive and over-write.
2785 *
2786 * So, for recovery we may have several outstanding complex requests for a
2787 * given address, one for each out-of-sync device.  We model this by
2788 * allocating a number of r10_bio structures, one for each out-of-sync
2789 * device.  They are linked together through borrowed master_bio pointers
2790 * and counted in ->remaining; when the count on the r10_bio whose
2791 * master_bio is NULL drops to zero, the whole complex operation is
2792 * complete.
2793 */
2794
2795
2796
2797
2798
2799
2800
2801static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2802 int *skipped)
2803{
2804 struct r10conf *conf = mddev->private;
2805 struct r10bio *r10_bio;
2806 struct bio *biolist = NULL, *bio;
2807 sector_t max_sector, nr_sectors;
2808 int i;
2809 int max_sync;
2810 sector_t sync_blocks;
2811 sector_t sectors_skipped = 0;
2812 int chunks_skipped = 0;
2813 sector_t chunk_mask = conf->geo.chunk_mask;
2814
2815 if (!conf->r10buf_pool)
2816 if (init_resync(conf))
2817 return 0;
2818
2819 /* Allow skipping a full resync/recovery for the common case of an
2820  * incremental assembly of a clean array, as long as no resync,
2821  * recovery or reshape has actually been requested.
2822  */
2823 if (mddev->bitmap == NULL &&
2824 mddev->recovery_cp == MaxSector &&
2825 mddev->reshape_position == MaxSector &&
2826 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2827 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2828 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2829 conf->fullsync == 0) {
2830 *skipped = 1;
2831 return mddev->dev_sectors - sector_nr;
2832 }
2833
2834 skipped:
2835 max_sector = mddev->dev_sectors;
2836 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2837 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2838 max_sector = mddev->resync_max_sectors;
2839 if (sector_nr >= max_sector) {
2840 /* We have reached the end, so this pass is finished.  If it was
2841  * aborted we still need to end the bitmap sync on the 'current'
2842  * chunks (there can be several when recovering multiple devices),
2843  * as they may have been started but not finished.
2844  * mddev->curr_resync holds that position; for recovery it is a
2845  * device offset and must be converted to the virtual (array)
2846  * address of each device via raid10_find_virt().
2847  */
2848
2849 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2850 end_reshape(conf);
2851 close_sync(conf);
2852 return 0;
2853 }
2854
2855 if (mddev->curr_resync < max_sector) {
2856 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2857 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2858 &sync_blocks, 1);
2859 else for (i = 0; i < conf->geo.raid_disks; i++) {
2860 sector_t sect =
2861 raid10_find_virt(conf, mddev->curr_resync, i);
2862 bitmap_end_sync(mddev->bitmap, sect,
2863 &sync_blocks, 1);
2864 }
2865 } else {
2866
2867 if ((!mddev->bitmap || conf->fullsync)
2868 && conf->have_replacement
2869 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2870 /* We just completed a full sync, so the replacement
2871  * devices are now fully recovered.
2872  */
2873 for (i = 0; i < conf->geo.raid_disks; i++)
2874 if (conf->mirrors[i].replacement)
2875 conf->mirrors[i].replacement
2876 ->recovery_offset
2877 = MaxSector;
2878 }
2879 conf->fullsync = 0;
2880 }
2881 bitmap_close_sync(mddev->bitmap);
2882 close_sync(conf);
2883 *skipped = 1;
2884 return sectors_skipped;
2885 }
2886
2887 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2888 return reshape_request(mddev, sector_nr, skipped);
2889
2890 if (chunks_skipped >= conf->geo.raid_disks) {
2891 /* We have skipped a whole lap of the array without finding
2892  * anything to do on any drive, so there is nothing left to do.
2893  */
2894 *skipped = 1;
2895 return (max_sector - sector_nr) + sectors_skipped;
2896 }
2897
2898 if (max_sector > mddev->resync_max)
2899 max_sector = mddev->resync_max;
2900
2901 /* make sure the whole request will fit in a single chunk - if
2902  * chunks are meaningful for this layout
2903  */
2904 if (conf->geo.near_copies < conf->geo.raid_disks &&
2905 max_sector > (sector_nr | chunk_mask))
2906 max_sector = (sector_nr | chunk_mask) + 1;
2907
2908 /* Again, very different code for resync and recovery.
2909  * Both must result in an r10bio with a list of bios that
2910  * have bi_end_io, bi_sector and bi_bdev set,
2911  * and bi_private set to the r10bio.
2912  * For recovery, we may actually create several r10bios
2913  * with 2 bios in each, that correspond to the bios in the main one.
2914  * In this case, the subordinate r10bios link back through a
2915  * borrowed master_bio pointer, and the counter in the master
2916  * includes a ref from each subordinate.
2917  *
2918  * First, we decide what to do and set ->bi_end_io:
2919  * end_sync_read if we want to read and
2920  * end_sync_write if we will want to write.
2921  */
2922
2923 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
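	/* max_sync: the most sectors one r10_bio's RESYNC_PAGES page array can carry */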
2924 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2925 /* recovery (or rebuilding a replacement) - the complicated case */
2926 int j;
2927 r10_bio = NULL;
2928
2929 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2930 int still_degraded;
2931 struct r10bio *rb2;
2932 sector_t sect;
2933 int must_sync;
2934 int any_working;
2935 struct raid10_info *mirror = &conf->mirrors[i];
2936
2937 if ((mirror->rdev == NULL ||
2938 test_bit(In_sync, &mirror->rdev->flags))
2939 &&
2940 (mirror->replacement == NULL ||
2941 test_bit(Faulty,
2942 &mirror->replacement->flags)))
2943 continue;
2944
2945 still_degraded = 0;
2946
2947 rb2 = r10_bio;
2948 sect = raid10_find_virt(conf, sector_nr, i);
2949 if (sect >= mddev->resync_max_sectors) {
2950 /* the last stripe is not complete - don't
2951  * try to recover this sector
2952  */
2953 continue;
2954 }
2955
2956 /* Unless we are doing a full sync, or rebuilding a
2957  * replacement, we only need to recover blocks that are
2958  * flagged in the bitmap */
2959 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2960 &sync_blocks, 1);
2961 if (sync_blocks < max_sync)
2962 max_sync = sync_blocks;
2963 if (!must_sync &&
2964 mirror->replacement == NULL &&
2965 !conf->fullsync) {
2966 /* nothing to recover here right now, but reset
2967  * chunks_skipped so the "nothing to do on any drive"
2968  * exit above is not taken prematurely */
2969 chunks_skipped = -1;
2970 continue;
2971 }
2972
2973 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2974 r10_bio->state = 0;
2975 raise_barrier(conf, rb2 != NULL);
2976 atomic_set(&r10_bio->remaining, 0);
2977
2978 r10_bio->master_bio = (struct bio*)rb2;
2979 if (rb2)
2980 atomic_inc(&rb2->remaining);
2981 r10_bio->mddev = mddev;
2982 set_bit(R10BIO_IsRecover, &r10_bio->state);
2983 r10_bio->sector = sect;
2984
2985 raid10_find_phys(conf, r10_bio);
2986
2987 /* Need to check whether the array will still be
2988  * degraded after this recovery completes
2989  */
2990 for (j = 0; j < conf->geo.raid_disks; j++)
2991 if (conf->mirrors[j].rdev == NULL ||
2992 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
2993 still_degraded = 1;
2994 break;
2995 }
2996
2997 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2998 &sync_blocks, still_degraded);
2999
3000 any_working = 0;
3001 for (j=0; j<conf->copies;j++) {
3002 int k;
3003 int d = r10_bio->devs[j].devnum;
3004 sector_t from_addr, to_addr;
3005 struct md_rdev *rdev;
3006 sector_t sector, first_bad;
3007 int bad_sectors;
3008 if (!conf->mirrors[d].rdev ||
3009 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3010 continue;
3011
3012 any_working = 1;
3013 rdev = conf->mirrors[d].rdev;
3014 sector = r10_bio->devs[j].addr;
3015
3016 if (is_badblock(rdev, sector, max_sync,
3017 &first_bad, &bad_sectors)) {
3018 if (first_bad > sector)
3019 max_sync = first_bad - sector;
3020 else {
3021 bad_sectors -= (sector
3022 - first_bad);
3023 if (max_sync > bad_sectors)
3024 max_sync = bad_sectors;
3025 continue;
3026 }
3027 }
3028 bio = r10_bio->devs[0].bio;
3029 bio_reset(bio);
3030 bio->bi_next = biolist;
3031 biolist = bio;
3032 bio->bi_private = r10_bio;
3033 bio->bi_end_io = end_sync_read;
3034 bio->bi_rw = READ;
3035 from_addr = r10_bio->devs[j].addr;
3036 bio->bi_iter.bi_sector = from_addr +
3037 rdev->data_offset;
3038 bio->bi_bdev = rdev->bdev;
3039 atomic_inc(&rdev->nr_pending);
3040
3041
3042 for (k=0; k<conf->copies; k++)
3043 if (r10_bio->devs[k].devnum == i)
3044 break;
3045 BUG_ON(k == conf->copies);
3046 to_addr = r10_bio->devs[k].addr;
3047 r10_bio->devs[0].devnum = d;
3048 r10_bio->devs[0].addr = from_addr;
3049 r10_bio->devs[1].devnum = i;
3050 r10_bio->devs[1].addr = to_addr;
3051
3052 rdev = mirror->rdev;
3053 if (!test_bit(In_sync, &rdev->flags)) {
3054 bio = r10_bio->devs[1].bio;
3055 bio_reset(bio);
3056 bio->bi_next = biolist;
3057 biolist = bio;
3058 bio->bi_private = r10_bio;
3059 bio->bi_end_io = end_sync_write;
3060 bio->bi_rw = WRITE;
3061 bio->bi_iter.bi_sector = to_addr
3062 + rdev->data_offset;
3063 bio->bi_bdev = rdev->bdev;
3064 atomic_inc(&r10_bio->remaining);
3065 } else
3066 r10_bio->devs[1].bio->bi_end_io = NULL;
3067
3068
3069 bio = r10_bio->devs[1].repl_bio;
3070 if (bio)
3071 bio->bi_end_io = NULL;
3072 rdev = mirror->replacement;
3073 /* Normally, when a replacement exists, r10buf_pool_alloc
3074  * will also have allocated repl_bio (have_replacement was
3075  * set), so 'bio' is rarely NULL here; the test below is
3076  * kept so that a replacement added after the buffers were
3077  * allocated is still handled safely.
3078  */
3079
3080
3081 if (rdev == NULL || bio == NULL ||
3082 test_bit(Faulty, &rdev->flags))
3083 break;
3084 bio_reset(bio);
3085 bio->bi_next = biolist;
3086 biolist = bio;
3087 bio->bi_private = r10_bio;
3088 bio->bi_end_io = end_sync_write;
3089 bio->bi_rw = WRITE;
3090 bio->bi_iter.bi_sector = to_addr +
3091 rdev->data_offset;
3092 bio->bi_bdev = rdev->bdev;
3093 atomic_inc(&r10_bio->remaining);
3094 break;
3095 }
3096 if (j == conf->copies) {
3097 /* No source to read from was found, so this block cannot
3098  * be recovered: record bad blocks or give up entirely */
3099 if (any_working) {
3100 /* At least one device is working, so the problem is
3101  * bad blocks on the device(s) being recovered -
3102  * record them as bad there too */
3103 int k;
3104 for (k = 0; k < conf->copies; k++)
3105 if (r10_bio->devs[k].devnum == i)
3106 break;
3107 if (!test_bit(In_sync,
3108 &mirror->rdev->flags)
3109 && !rdev_set_badblocks(
3110 mirror->rdev,
3111 r10_bio->devs[k].addr,
3112 max_sync, 0))
3113 any_working = 0;
3114 if (mirror->replacement &&
3115 !rdev_set_badblocks(
3116 mirror->replacement,
3117 r10_bio->devs[k].addr,
3118 max_sync, 0))
3119 any_working = 0;
3120 }
3121 if (!any_working) {
3122 if (!test_and_set_bit(MD_RECOVERY_INTR,
3123 &mddev->recovery))
3124 printk(KERN_INFO "md/raid10:%s: insufficient "
3125 "working devices for recovery.\n",
3126 mdname(mddev));
3127 mirror->recovery_disabled
3128 = mddev->recovery_disabled;
3129 }
3130 put_buf(r10_bio);
3131 if (rb2)
3132 atomic_dec(&rb2->remaining);
3133 r10_bio = rb2;
3134 break;
3135 }
3136 }
3137 if (biolist == NULL) {
3138 while (r10_bio) {
3139 struct r10bio *rb2 = r10_bio;
3140 r10_bio = (struct r10bio*) rb2->master_bio;
3141 rb2->master_bio = NULL;
3142 put_buf(rb2);
3143 }
3144 goto giveup;
3145 }
3146 } else {
3147 /* resync: schedule a read of this block from every usable copy */
3148 int count = 0;
3149
3150 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3151
3152 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3153 &sync_blocks, mddev->degraded) &&
3154 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3155 &mddev->recovery)) {
3156
3157 *skipped = 1;
3158 return sync_blocks + sectors_skipped;
3159 }
3160 if (sync_blocks < max_sync)
3161 max_sync = sync_blocks;
3162 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3163 r10_bio->state = 0;
3164
3165 r10_bio->mddev = mddev;
3166 atomic_set(&r10_bio->remaining, 0);
3167 raise_barrier(conf, 0);
3168 conf->next_resync = sector_nr;
3169
3170 r10_bio->master_bio = NULL;
3171 r10_bio->sector = sector_nr;
3172 set_bit(R10BIO_IsSync, &r10_bio->state);
3173 raid10_find_phys(conf, r10_bio);
3174 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
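		/* provisionally extend to the end of the chunk; trimmed to nr_sectors
		 * once pages are attached below */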
3175
3176 for (i = 0; i < conf->copies; i++) {
3177 int d = r10_bio->devs[i].devnum;
3178 sector_t first_bad, sector;
3179 int bad_sectors;
3180
3181 if (r10_bio->devs[i].repl_bio)
3182 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3183
3184 bio = r10_bio->devs[i].bio;
3185 bio_reset(bio);
3186 bio->bi_error = -EIO;
3187 if (conf->mirrors[d].rdev == NULL ||
3188 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3189 continue;
3190 sector = r10_bio->devs[i].addr;
3191 if (is_badblock(conf->mirrors[d].rdev,
3192 sector, max_sync,
3193 &first_bad, &bad_sectors)) {
3194 if (first_bad > sector)
3195 max_sync = first_bad - sector;
3196 else {
3197 bad_sectors -= (sector - first_bad);
3198 if (max_sync > bad_sectors)
3199 max_sync = bad_sectors;
3200 continue;
3201 }
3202 }
3203 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3204 atomic_inc(&r10_bio->remaining);
3205 bio->bi_next = biolist;
3206 biolist = bio;
3207 bio->bi_private = r10_bio;
3208 bio->bi_end_io = end_sync_read;
3209 bio->bi_rw = READ;
3210 bio->bi_iter.bi_sector = sector +
3211 conf->mirrors[d].rdev->data_offset;
3212 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3213 count++;
3214
3215 if (conf->mirrors[d].replacement == NULL ||
3216 test_bit(Faulty,
3217 &conf->mirrors[d].replacement->flags))
3218 continue;
3219
3220
3221 bio = r10_bio->devs[i].repl_bio;
3222 bio_reset(bio);
3223 bio->bi_error = -EIO;
3224
3225 sector = r10_bio->devs[i].addr;
3226 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3227 bio->bi_next = biolist;
3228 biolist = bio;
3229 bio->bi_private = r10_bio;
3230 bio->bi_end_io = end_sync_write;
3231 bio->bi_rw = WRITE;
3232 bio->bi_iter.bi_sector = sector +
3233 conf->mirrors[d].replacement->data_offset;
3234 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3235 count++;
3236 }
3237
3238 if (count < 2) {
3239 for (i=0; i<conf->copies; i++) {
3240 int d = r10_bio->devs[i].devnum;
3241 if (r10_bio->devs[i].bio->bi_end_io)
3242 rdev_dec_pending(conf->mirrors[d].rdev,
3243 mddev);
3244 if (r10_bio->devs[i].repl_bio &&
3245 r10_bio->devs[i].repl_bio->bi_end_io)
3246 rdev_dec_pending(
3247 conf->mirrors[d].replacement,
3248 mddev);
3249 }
3250 put_buf(r10_bio);
3251 biolist = NULL;
3252 goto giveup;
3253 }
3254 }
3255
3256 nr_sectors = 0;
3257 if (sector_nr + max_sync < max_sector)
3258 max_sector = sector_nr + max_sync;
3259 do {
3260 struct page *page;
3261 int len = PAGE_SIZE;
3262 if (sector_nr + (len>>9) > max_sector)
3263 len = (max_sector - sector_nr) << 9;
3264 if (len == 0)
3265 break;
3266 for (bio= biolist ; bio ; bio=bio->bi_next) {
3267 struct bio *bio2;
3268 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3269 if (bio_add_page(bio, page, len, 0))
3270 continue;
3271
3272 /* the page didn't fit - stop here and back it out of the earlier bios */
3273 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3274 for (bio2 = biolist;
3275 bio2 && bio2 != bio;
3276 bio2 = bio2->bi_next) {
3277
3278 bio2->bi_vcnt--;
3279 bio2->bi_iter.bi_size -= len;
3280 bio_clear_flag(bio2, BIO_SEG_VALID);
3281 }
3282 goto bio_full;
3283 }
3284 nr_sectors += len>>9;
3285 sector_nr += len>>9;
3286 } while (biolist->bi_vcnt < RESYNC_PAGES);
3287 bio_full:
3288 r10_bio->sectors = nr_sectors;
3289
3290 while (biolist) {
3291 bio = biolist;
3292 biolist = biolist->bi_next;
3293
3294 bio->bi_next = NULL;
3295 r10_bio = bio->bi_private;
3296 r10_bio->sectors = nr_sectors;
3297
3298 if (bio->bi_end_io == end_sync_read) {
3299 md_sync_acct(bio->bi_bdev, nr_sectors);
3300 bio->bi_error = 0;
3301 generic_make_request(bio);
3302 }
3303 }
3304
3305 if (sectors_skipped)
3306 /* pretend the skipped sectors were in fact synced - it makes
3307  * no important difference here and keeps the accounting moving
3308  */
3309 md_done_sync(mddev, sectors_skipped, 1);
3310
3311 return sectors_skipped + nr_sectors;
3312 giveup:
3313 /* There is nowhere to read from or write to in this chunk:
3314  * every drive is either failed, missing or has a bad block here.
3315  * Count the chunk as skipped and move on to the next one.
3316  */
3317 if (sector_nr + max_sync < max_sector)
3318 max_sector = sector_nr + max_sync;
3319
3320 sectors_skipped += (max_sector - sector_nr);
3321 chunks_skipped ++;
3322 sector_nr = max_sector;
3323 goto skipped;
3324}
3325
3326static sector_t
3327raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3328{
3329 sector_t size;
3330 struct r10conf *conf = mddev->private;
3331
3332 if (!raid_disks)
3333 raid_disks = min(conf->geo.raid_disks,
3334 conf->prev.raid_disks);
3335 if (!sectors)
3336 sectors = conf->dev_sectors;
3337
3338 size = sectors >> conf->geo.chunk_shift;
3339 sector_div(size, conf->geo.far_copies);
3340 size = size * raid_disks;
3341 sector_div(size, conf->geo.near_copies);
3342
3343 return size << conf->geo.chunk_shift;
3344}
3345
3346static void calc_sectors(struct r10conf *conf, sector_t size)
3347{
3348 /* Calculate the number of sectors-per-device that will
3349  * actually be used, and set conf->dev_sectors and
3350  * conf->geo.stride accordingly
3351  */
3352
3353 size = size >> conf->geo.chunk_shift;
3354 sector_div(size, conf->geo.far_copies);
3355 size = size * conf->geo.raid_disks;
3356 sector_div(size, conf->geo.near_copies);
3357 /* 'size' is now the number of data chunks in the array; */
3358 /* multiplying by 'copies' gives the total chunks actually stored */
3359 size = size * conf->copies;
3360
3361 /* Round up when dividing by raid_disks to get the
3362  * used-chunks-per-device, which also determines the stride.
3363  */
3364 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3365
3366 conf->dev_sectors = size << conf->geo.chunk_shift;
3367
3368 if (conf->geo.far_offset)
3369 conf->geo.stride = 1 << conf->geo.chunk_shift;
3370 else {
3371 sector_div(size, conf->geo.far_copies);
3372 conf->geo.stride = size << conf->geo.chunk_shift;
3373 }
3374}
3375
3376enum geo_type {geo_new, geo_old, geo_start};
3377static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3378{
3379 int nc, fc, fo;
3380 int layout, chunk, disks;
3381 switch (new) {
3382 case geo_old:
3383 layout = mddev->layout;
3384 chunk = mddev->chunk_sectors;
3385 disks = mddev->raid_disks - mddev->delta_disks;
3386 break;
3387 case geo_new:
3388 layout = mddev->new_layout;
3389 chunk = mddev->new_chunk_sectors;
3390 disks = mddev->raid_disks;
3391 break;
3392 default:
3393 case geo_start:
3394
3395 layout = mddev->new_layout;
3396 chunk = mddev->new_chunk_sectors;
3397 disks = mddev->raid_disks + mddev->delta_disks;
3398 break;
3399 }
3400 if (layout >> 19)
3401 return -1;
3402 if (chunk < (PAGE_SIZE >> 9) ||
3403 !is_power_of_2(chunk))
3404 return -2;
3405 nc = layout & 255;
3406 fc = (layout >> 8) & 255;
3407 fo = layout & (1<<16);
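	/* layout word: bits 0-7 near_copies, bits 8-15 far_copies,
	 * bit 16 far_offset, bits 17-18 select the far-set arrangement */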
3408 geo->raid_disks = disks;
3409 geo->near_copies = nc;
3410 geo->far_copies = fc;
3411 geo->far_offset = fo;
3412 switch (layout >> 17) {
3413 case 0:
3414 geo->far_set_size = disks;
3415 break;
3416 case 1:
3417
3418 geo->far_set_size = disks/fc;
3419 WARN(geo->far_set_size < fc,
3420 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3421 break;
3422 case 2:
3423 geo->far_set_size = fc * nc;
3424 break;
3425 default:
3426 return -1;
3427 }
3428 geo->chunk_mask = chunk - 1;
3429 geo->chunk_shift = ffz(~chunk);
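	/* ffz(~chunk) == log2(chunk), valid because chunk is a power of two */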
3430 return nc*fc;
3431}
3432
3433static struct r10conf *setup_conf(struct mddev *mddev)
3434{
3435 struct r10conf *conf = NULL;
3436 int err = -EINVAL;
3437 struct geom geo;
3438 int copies;
3439
3440 copies = setup_geo(&geo, mddev, geo_new);
3441
3442 if (copies == -2) {
3443 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3444 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3445 mdname(mddev), PAGE_SIZE);
3446 goto out;
3447 }
3448
3449 if (copies < 2 || copies > mddev->raid_disks) {
3450 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3451 mdname(mddev), mddev->new_layout);
3452 goto out;
3453 }
3454
3455 err = -ENOMEM;
3456 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3457 if (!conf)
3458 goto out;
3459
3460 /* allow room for the larger of the old and new number of devices */
3461 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3462 max(0,-mddev->delta_disks)),
3463 GFP_KERNEL);
3464 if (!conf->mirrors)
3465 goto out;
3466
3467 conf->tmppage = alloc_page(GFP_KERNEL);
3468 if (!conf->tmppage)
3469 goto out;
3470
3471 conf->geo = geo;
3472 conf->copies = copies;
3473 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3474 r10bio_pool_free, conf);
3475 if (!conf->r10bio_pool)
3476 goto out;
3477
3478 calc_sectors(conf, mddev->dev_sectors);
3479 if (mddev->reshape_position == MaxSector) {
3480 conf->prev = conf->geo;
3481 conf->reshape_progress = MaxSector;
3482 } else {
3483 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3484 err = -EINVAL;
3485 goto out;
3486 }
3487 conf->reshape_progress = mddev->reshape_position;
3488 if (conf->prev.far_offset)
3489 conf->prev.stride = 1 << conf->prev.chunk_shift;
3490 else
3491 /* far_copies must be 1 in the old geometry */
3492 conf->prev.stride = conf->dev_sectors;
3493 }
3494 conf->reshape_safe = conf->reshape_progress;
3495 spin_lock_init(&conf->device_lock);
3496 INIT_LIST_HEAD(&conf->retry_list);
3497 INIT_LIST_HEAD(&conf->bio_end_io_list);
3498
3499 spin_lock_init(&conf->resync_lock);
3500 init_waitqueue_head(&conf->wait_barrier);
3501
3502 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3503 if (!conf->thread)
3504 goto out;
3505
3506 conf->mddev = mddev;
3507 return conf;
3508
3509 out:
3510 if (err == -ENOMEM)
3511 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3512 mdname(mddev));
3513 if (conf) {
3514 mempool_destroy(conf->r10bio_pool);
3515 kfree(conf->mirrors);
3516 safe_put_page(conf->tmppage);
3517 kfree(conf);
3518 }
3519 return ERR_PTR(err);
3520}
3521
3522static int run(struct mddev *mddev)
3523{
3524 struct r10conf *conf;
3525 int i, disk_idx, chunk_size;
3526 struct raid10_info *disk;
3527 struct md_rdev *rdev;
3528 sector_t size;
3529 sector_t min_offset_diff = 0;
3530 int first = 1;
3531 bool discard_supported = false;
3532
3533 if (mddev->private == NULL) {
3534 conf = setup_conf(mddev);
3535 if (IS_ERR(conf))
3536 return PTR_ERR(conf);
3537 mddev->private = conf;
3538 }
3539 conf = mddev->private;
3540 if (!conf)
3541 goto out;
3542
3543 mddev->thread = conf->thread;
3544 conf->thread = NULL;
3545
3546 chunk_size = mddev->chunk_sectors << 9;
3547 if (mddev->queue) {
3548 blk_queue_max_discard_sectors(mddev->queue,
3549 mddev->chunk_sectors);
3550 blk_queue_max_write_same_sectors(mddev->queue, 0);
3551 blk_queue_io_min(mddev->queue, chunk_size);
3552 if (conf->geo.raid_disks % conf->geo.near_copies)
3553 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3554 else
3555 blk_queue_io_opt(mddev->queue, chunk_size *
3556 (conf->geo.raid_disks / conf->geo.near_copies));
3557 }
3558
3559 rdev_for_each(rdev, mddev) {
3560 long long diff;
3561 struct request_queue *q;
3562
3563 disk_idx = rdev->raid_disk;
3564 if (disk_idx < 0)
3565 continue;
3566 if (disk_idx >= conf->geo.raid_disks &&
3567 disk_idx >= conf->prev.raid_disks)
3568 continue;
3569 disk = conf->mirrors + disk_idx;
3570
3571 if (test_bit(Replacement, &rdev->flags)) {
3572 if (disk->replacement)
3573 goto out_free_conf;
3574 disk->replacement = rdev;
3575 } else {
3576 if (disk->rdev)
3577 goto out_free_conf;
3578 disk->rdev = rdev;
3579 }
3580 q = bdev_get_queue(rdev->bdev);
3581 diff = (rdev->new_data_offset - rdev->data_offset);
3582 if (!mddev->reshape_backwards)
3583 diff = -diff;
3584 if (diff < 0)
3585 diff = 0;
3586 if (first || diff < min_offset_diff)
3587 min_offset_diff = diff;
3588
3589 if (mddev->gendisk)
3590 disk_stack_limits(mddev->gendisk, rdev->bdev,
3591 rdev->data_offset << 9);
3592
3593 disk->head_position = 0;
3594
3595 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3596 discard_supported = true;
3597 }
3598
3599 if (mddev->queue) {
3600 if (discard_supported)
3601 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3602 mddev->queue);
3603 else
3604 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3605 mddev->queue);
3606 }
3607
3608 if (!enough(conf, -1)) {
3609 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3610 mdname(mddev));
3611 goto out_free_conf;
3612 }
3613
3614 if (conf->reshape_progress != MaxSector) {
3615 /* an in-progress reshape was found - make sure the shape change is one we support */
3616 if (conf->geo.far_copies != 1 &&
3617 conf->geo.far_offset == 0)
3618 goto out_free_conf;
3619 if (conf->prev.far_copies != 1 &&
3620 conf->prev.far_offset == 0)
3621 goto out_free_conf;
3622 }
3623
3624 mddev->degraded = 0;
3625 for (i = 0;
3626 i < conf->geo.raid_disks
3627 || i < conf->prev.raid_disks;
3628 i++) {
3629
3630 disk = conf->mirrors + i;
3631
3632 if (!disk->rdev && disk->replacement) {
3633 /* The replacement is all we have - use it */
3634 disk->rdev = disk->replacement;
3635 disk->replacement = NULL;
3636 clear_bit(Replacement, &disk->rdev->flags);
3637 }
3638
3639 if (!disk->rdev ||
3640 !test_bit(In_sync, &disk->rdev->flags)) {
3641 disk->head_position = 0;
3642 mddev->degraded++;
3643 if (disk->rdev &&
3644 disk->rdev->saved_raid_disk < 0)
3645 conf->fullsync = 1;
3646 }
3647 disk->recovery_disabled = mddev->recovery_disabled - 1;
3648 }
3649
3650 if (mddev->recovery_cp != MaxSector)
3651 printk(KERN_NOTICE "md/raid10:%s: not clean"
3652 " -- starting background reconstruction\n",
3653 mdname(mddev));
3654 printk(KERN_INFO
3655 "md/raid10:%s: active with %d out of %d devices\n",
3656 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3657 conf->geo.raid_disks);
3658
3659
3660
3661 mddev->dev_sectors = conf->dev_sectors;
3662 size = raid10_size(mddev, 0, 0);
3663 md_set_array_sectors(mddev, size);
3664 mddev->resync_max_sectors = size;
3665
3666 if (mddev->queue) {
3667 int stripe = conf->geo.raid_disks *
3668 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3669 /* Calculate the read-ahead size: we want to read ahead
3670  * by at least twice a whole stripe, i.e. 2 * chunk size *
3671  * data devices, expressed in pages.
3672  */
3673
3674 stripe /= conf->geo.near_copies;
3675 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3676 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3677 }
3678
3679 if (md_integrity_register(mddev))
3680 goto out_free_conf;
3681
3682 if (conf->reshape_progress != MaxSector) {
3683 unsigned long before_length, after_length;
3684
3685 before_length = ((1 << conf->prev.chunk_shift) *
3686 conf->prev.far_copies);
3687 after_length = ((1 << conf->geo.chunk_shift) *
3688 conf->geo.far_copies);
3689
3690 if (max(before_length, after_length) > min_offset_diff) {
3691 /* not enough data_offset slack to continue the reshape safely */
3692 printk("md/raid10: offset difference not enough to continue reshape\n");
3693 goto out_free_conf;
3694 }
3695 conf->offset_diff = min_offset_diff;
3696
3697 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3698 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3699 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3700 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3701 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3702 "reshape");
3703 }
3704
3705 return 0;
3706
3707out_free_conf:
3708 md_unregister_thread(&mddev->thread);
3709 mempool_destroy(conf->r10bio_pool);
3710 safe_put_page(conf->tmppage);
3711 kfree(conf->mirrors);
3712 kfree(conf);
3713 mddev->private = NULL;
3714out:
3715 return -EIO;
3716}
3717
3718static void raid10_free(struct mddev *mddev, void *priv)
3719{
3720 struct r10conf *conf = priv;
3721
3722 mempool_destroy(conf->r10bio_pool);
3723 safe_put_page(conf->tmppage);
3724 kfree(conf->mirrors);
3725 kfree(conf->mirrors_old);
3726 kfree(conf->mirrors_new);
3727 kfree(conf);
3728}
3729
3730static void raid10_quiesce(struct mddev *mddev, int state)
3731{
3732 struct r10conf *conf = mddev->private;
3733
3734 switch(state) {
3735 case 1:
3736 raise_barrier(conf, 0);
3737 break;
3738 case 0:
3739 lower_barrier(conf);
3740 break;
3741 }
3742}
3743
3744static int raid10_resize(struct mddev *mddev, sector_t sectors)
3745{
3746 /* Resize of 'far' arrays is not supported.
3747  * For 'near' and 'offset' arrays we can set the
3748  * number of sectors used to be an appropriate multiple
3749  * of the chunk size.
3750  * For 'offset' that is far_copies * chunk size.
3751  * For 'near' the multiplier is the LCM of raid_disks and
3752  * near_copies.
3753  * So if far_copies > 1 && !far_offset, fail.
3754  * Else find LCM(raid_disks, near_copies) * far_copies and
3755  * multiply by the chunk size, then round to this number.
3756  * This is mostly done by raid10_size().
3757  */
3758 struct r10conf *conf = mddev->private;
3759 sector_t oldsize, size;
3760
3761 if (mddev->reshape_position != MaxSector)
3762 return -EBUSY;
3763
3764 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3765 return -EINVAL;
3766
3767 oldsize = raid10_size(mddev, 0, 0);
3768 size = raid10_size(mddev, sectors, 0);
3769 if (mddev->external_size &&
3770 mddev->array_sectors > size)
3771 return -EINVAL;
3772 if (mddev->bitmap) {
3773 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3774 if (ret)
3775 return ret;
3776 }
3777 md_set_array_sectors(mddev, size);
3778 set_capacity(mddev->gendisk, mddev->array_sectors);
3779 revalidate_disk(mddev->gendisk);
3780 if (sectors > mddev->dev_sectors &&
3781 mddev->recovery_cp > oldsize) {
3782 mddev->recovery_cp = oldsize;
3783 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3784 }
3785 calc_sectors(conf, sectors);
3786 mddev->dev_sectors = conf->dev_sectors;
3787 mddev->resync_max_sectors = size;
3788 return 0;
3789}
3790
3791static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
3792{
3793 struct md_rdev *rdev;
3794 struct r10conf *conf;
3795
3796 if (mddev->degraded > 0) {
3797 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3798 mdname(mddev));
3799 return ERR_PTR(-EINVAL);
3800 }
3801 sector_div(size, devs);
3802
3803
3804 mddev->new_level = 10;
3805
3806 mddev->new_layout = (1<<8) + 2;
3807 mddev->new_chunk_sectors = mddev->chunk_sectors;
3808 mddev->delta_disks = mddev->raid_disks;
3809 mddev->raid_disks *= 2;
3810
3811 mddev->recovery_cp = MaxSector;
3812 mddev->dev_sectors = size;
3813
3814 conf = setup_conf(mddev);
3815 if (!IS_ERR(conf)) {
3816 rdev_for_each(rdev, mddev)
3817 if (rdev->raid_disk >= 0) {
3818 rdev->new_raid_disk = rdev->raid_disk * 2;
3819 rdev->sectors = size;
3820 }
3821 conf->barrier = 1;
3822 }
3823
3824 return conf;
3825}
3826
3827static void *raid10_takeover(struct mddev *mddev)
3828{
3829 struct r0conf *raid0_conf;
3830
3831 /* raid10 can take over:
3832  *  raid0 - providing it has only one zone, i.e. all
3833  *  member devices are the same size */
3834 if (mddev->level == 0) {
3835 /* for a raid0 takeover only a single zone is supported */
3836 raid0_conf = mddev->private;
3837 if (raid0_conf->nr_strip_zones > 1) {
3838 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3839 " with more than one zone.\n",
3840 mdname(mddev));
3841 return ERR_PTR(-EINVAL);
3842 }
3843 return raid10_takeover_raid0(mddev,
3844 raid0_conf->strip_zone->zone_end,
3845 raid0_conf->strip_zone->nb_dev);
3846 }
3847 return ERR_PTR(-EINVAL);
3848}
3849
3850static int raid10_check_reshape(struct mddev *mddev)
3851{
3852 /* Called when there is a request to change
3853  * - layout (to ->new_layout)
3854  * - chunk size (to ->new_chunk_sectors)
3855  * - raid_disks (by delta_disks)
3856  * or when trying to restart a reshape that was ongoing.
3857  *
3858  * We need to validate the request and possibly allocate
3859  * memory now if that might be needed later.
3860  *
3861  * We reject any reshape of a 'far' mode array (without far_offset),
3862  * require the number of copies to stay the same, and require the
3863  * array size to remain a multiple of the new chunk size.
3864  */
3865
3866 struct r10conf *conf = mddev->private;
3867 struct geom geo;
3868
3869 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3870 return -EINVAL;
3871
3872 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3873 /* cannot change the number of copies */
3874 return -EINVAL;
3875 if (geo.far_copies > 1 && !geo.far_offset)
3876 /* cannot switch to a 'far' layout */
3877 return -EINVAL;
3878
3879 if (mddev->array_sectors & geo.chunk_mask)
3880 /* the new chunk size is not a factor of the array size */
3881 return -EINVAL;
3882
3883 if (!enough(conf, -1))
3884 return -EINVAL;
3885
3886 kfree(conf->mirrors_new);
3887 conf->mirrors_new = NULL;
3888 if (mddev->delta_disks > 0) {
3889 /* pre-allocate the enlarged 'mirrors' array so the reshape cannot fail on memory later */
3890 conf->mirrors_new = kzalloc(
3891 sizeof(struct raid10_info)
3892 *(mddev->raid_disks +
3893 mddev->delta_disks),
3894 GFP_KERNEL);
3895 if (!conf->mirrors_new)
3896 return -ENOMEM;
3897 }
3898 return 0;
3899}
3900
3901/*
3902 * Need to check if the array has failed when deciding whether to:
3903 *  - start an array
3904 *  - remove non-faulty devices
3905 *  - add a spare
3906 *  - allow a reshape
3907 * This determination is simple when no reshape is happening.
3908 * However when a reshape is in progress we need to check both the
3909 * 'old' and 'new' sections of the array carefully: some failed
3910 * devices may only affect one of the two geometries, and some
3911 * not-in_sync devices may already be in sync in the section most
3912 * affected by the failed devices.
3913 */
3914static int calc_degraded(struct r10conf *conf)
3915{
3916 int degraded, degraded2;
3917 int i;
3918
3919 rcu_read_lock();
3920 degraded = 0;
3921
3922 for (i = 0; i < conf->prev.raid_disks; i++) {
3923 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3924 if (!rdev || test_bit(Faulty, &rdev->flags))
3925 degraded++;
3926 else if (!test_bit(In_sync, &rdev->flags))
3927 /* When the reshape is reducing the number of devices,
3928  * this might not really make the array 'degraded', but
3929  * it is simplest (and safe) to count it as such.
3930  */
3931 degraded++;
3932 }
3933 rcu_read_unlock();
3934 if (conf->geo.raid_disks == conf->prev.raid_disks)
3935 return degraded;
3936 rcu_read_lock();
3937 degraded2 = 0;
3938 for (i = 0; i < conf->geo.raid_disks; i++) {
3939 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3940 if (!rdev || test_bit(Faulty, &rdev->flags))
3941 degraded2++;
3942 else if (!test_bit(In_sync, &rdev->flags)) {
3943 /* If the reshape increases the number of devices, this
3944  * section of the array has already been recovered, so it
3945  * does not count towards 'degraded'; when the device
3946  * count is not increasing, it does. */
3947
3948 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3949 degraded2++;
3950 }
3951 }
3952 rcu_read_unlock();
3953 if (degraded2 > degraded)
3954 return degraded2;
3955 return degraded;
3956}
3957
3958static int raid10_start_reshape(struct mddev *mddev)
3959{
3960 /* A 'reshape' has been requested.  This commits the various
3961  * 'new' fields and sets MD_RECOVERY_RESHAPE.
3962  * It also checks that there are enough spares and adds them
3963  * to the array as needed.
3964  * We currently require enough spares for the final array to be
3965  * non-degraded, and require that the per-device difference
3966  * between the old and new data_offset is large enough that we
3967  * never risk over-writing live data.
3968  */
3969
3970 unsigned long before_length, after_length;
3971 sector_t min_offset_diff = 0;
3972 int first = 1;
3973 struct geom new;
3974 struct r10conf *conf = mddev->private;
3975 struct md_rdev *rdev;
3976 int spares = 0;
3977 int ret;
3978
3979 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3980 return -EBUSY;
3981
3982 if (setup_geo(&new, mddev, geo_start) != conf->copies)
3983 return -EINVAL;
3984
3985 before_length = ((1 << conf->prev.chunk_shift) *
3986 conf->prev.far_copies);
3987 after_length = ((1 << conf->geo.chunk_shift) *
3988 conf->geo.far_copies);
3989
3990 rdev_for_each(rdev, mddev) {
3991 if (!test_bit(In_sync, &rdev->flags)
3992 && !test_bit(Faulty, &rdev->flags))
3993 spares++;
3994 if (rdev->raid_disk >= 0) {
3995 long long diff = (rdev->new_data_offset
3996 - rdev->data_offset);
3997 if (!mddev->reshape_backwards)
3998 diff = -diff;
3999 if (diff < 0)
4000 diff = 0;
4001 if (first || diff < min_offset_diff)
4002 min_offset_diff = diff;
4003 }
4004 }
4005
4006 if (max(before_length, after_length) > min_offset_diff)
4007 return -EINVAL;
4008
4009 if (spares < mddev->delta_disks)
4010 return -EINVAL;
4011
4012 conf->offset_diff = min_offset_diff;
4013 spin_lock_irq(&conf->device_lock);
4014 if (conf->mirrors_new) {
4015 memcpy(conf->mirrors_new, conf->mirrors,
4016 sizeof(struct raid10_info)*conf->prev.raid_disks);
4017 smp_mb();
4018 kfree(conf->mirrors_old);
4019 conf->mirrors_old = conf->mirrors;
4020 conf->mirrors = conf->mirrors_new;
4021 conf->mirrors_new = NULL;
4022 }
4023 setup_geo(&conf->geo, mddev, geo_start);
4024 smp_mb();
4025 if (mddev->reshape_backwards) {
4026 sector_t size = raid10_size(mddev, 0, 0);
4027 if (size < mddev->array_sectors) {
4028 spin_unlock_irq(&conf->device_lock);
4029 printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n",
4030 mdname(mddev));
4031 return -EINVAL;
4032 }
4033 mddev->resync_max_sectors = size;
4034 conf->reshape_progress = size;
4035 } else
4036 conf->reshape_progress = 0;
4037 conf->reshape_safe = conf->reshape_progress;
4038 spin_unlock_irq(&conf->device_lock);
4039
4040 if (mddev->delta_disks && mddev->bitmap) {
4041 ret = bitmap_resize(mddev->bitmap,
4042 raid10_size(mddev, 0,
4043 conf->geo.raid_disks),
4044 0, 0);
4045 if (ret)
4046 goto abort;
4047 }
4048 if (mddev->delta_disks > 0) {
4049 rdev_for_each(rdev, mddev)
4050 if (rdev->raid_disk < 0 &&
4051 !test_bit(Faulty, &rdev->flags)) {
4052 if (raid10_add_disk(mddev, rdev) == 0) {
4053 if (rdev->raid_disk >=
4054 conf->prev.raid_disks)
4055 set_bit(In_sync, &rdev->flags);
4056 else
4057 rdev->recovery_offset = 0;
4058
4059 if (sysfs_link_rdev(mddev, rdev))
4060 ;
4061 }
4062 } else if (rdev->raid_disk >= conf->prev.raid_disks
4063 && !test_bit(Faulty, &rdev->flags)) {
4064
4065 set_bit(In_sync, &rdev->flags);
4066 }
4067 }
4068
4069 /* When a reshape changes the number of devices,
4070  * ->degraded is measured against the larger of the
4071  * pre- and post-reshape device counts. */
4072 spin_lock_irq(&conf->device_lock);
4073 mddev->degraded = calc_degraded(conf);
4074 spin_unlock_irq(&conf->device_lock);
4075 mddev->raid_disks = conf->geo.raid_disks;
4076 mddev->reshape_position = conf->reshape_progress;
4077 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4078
4079 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4080 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4081 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4082 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4083 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4084
4085 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4086 "reshape");
4087 if (!mddev->sync_thread) {
4088 ret = -EAGAIN;
4089 goto abort;
4090 }
4091 conf->reshape_checkpoint = jiffies;
4092 md_wakeup_thread(mddev->sync_thread);
4093 md_new_event(mddev);
4094 return 0;
4095
4096abort:
4097 mddev->recovery = 0;
4098 spin_lock_irq(&conf->device_lock);
4099 conf->geo = conf->prev;
4100 mddev->raid_disks = conf->geo.raid_disks;
4101 rdev_for_each(rdev, mddev)
4102 rdev->new_data_offset = rdev->data_offset;
4103 smp_wmb();
4104 conf->reshape_progress = MaxSector;
4105 conf->reshape_safe = MaxSector;
4106 mddev->reshape_position = MaxSector;
4107 spin_unlock_irq(&conf->device_lock);
4108 return ret;
4109}
4110
4111/*
4112 * Calculate the last device-address that could contain any block from
4113 * the array chunk that includes array-address 's', and return the next
4114 * (chunk-aligned) device address after that.  Together with
4115 * first_dev_address() this bounds how far a chunk extends on each device.
4116 */
4117static sector_t last_dev_address(sector_t s, struct geom *geo)
4118{
4119 s = (s | geo->chunk_mask) + 1;
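	/* round the array address up to the start of the next chunk */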
4120 s >>= geo->chunk_shift;
4121 s *= geo->near_copies;
4122 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4123 s *= geo->far_copies;
4124 s <<= geo->chunk_shift;
4125 return s;
4126}
4127
4128/*
4129 * Calculate the first device-address that could contain any block from
4130 * the array chunk that includes array-address 's'; this too is chunk-aligned.
4131 */
4132static sector_t first_dev_address(sector_t s, struct geom *geo)
4133{
4134 s >>= geo->chunk_shift;
4135 s *= geo->near_copies;
4136 sector_div(s, geo->raid_disks);
4137 s *= geo->far_copies;
4138 s <<= geo->chunk_shift;
4139 return s;
4140}
4141
4142static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4143 int *skipped)
4144{
4145 /* We simply copy at most one chunk (smallest of the old and new
4146  * chunk sizes) at a time, possibly less if that exceeds RESYNC_PAGES,
4147  * or possibly more if it crosses a device boundary.
4148  *
4149  * For each range we read the data once, using the old ('prev')
4150  * layout, and then write it to every device that holds a copy of
4151  * it in the new layout.
4152  *
4153  * conf->reshape_progress records how far relocation has gone;
4154  * conf->reshape_safe records how far the on-disk metadata says it
4155  * has gone.  Before issuing any write that might land on data which
4156  * has not yet been relocated (and roughly every 10 seconds as a
4157  * checkpoint) we must update mddev->reshape_position in the metadata
4158  * and wait for it to be written, otherwise a crash could lose data.
4159  *
4160  * The copy can run forwards from the start of the array or backwards
4161  * from the end (mddev->reshape_backwards), whichever ensures that
4162  * the region being written never overlaps data that still has to be
4163  * read from the old layout.
4164  */
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182 struct r10conf *conf = mddev->private;
4183 struct r10bio *r10_bio;
4184 sector_t next, safe, last;
4185 int max_sectors;
4186 int nr_sectors;
4187 int s;
4188 struct md_rdev *rdev;
4189 int need_flush = 0;
4190 struct bio *blist;
4191 struct bio *bio, *read_bio;
4192 int sectors_done = 0;
4193
4194 if (sector_nr == 0) {
4195 /* If restarting in the middle, skip the initial sectors */
4196 if (mddev->reshape_backwards &&
4197 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4198 sector_nr = (raid10_size(mddev, 0, 0)
4199 - conf->reshape_progress);
4200 } else if (!mddev->reshape_backwards &&
4201 conf->reshape_progress > 0)
4202 sector_nr = conf->reshape_progress;
4203 if (sector_nr) {
4204 mddev->curr_resync_completed = sector_nr;
4205 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4206 *skipped = 1;
4207 return sector_nr;
4208 }
4209 }
4210
4211 /* We don't use sector_nr to track where we are up to, as that
4212  * does not work well for a backwards reshape; we use
4213  * conf->reshape_progress instead.
4214  */
4215 if (mddev->reshape_backwards) {
4216 /* 'next' is the lowest device address that we might
4217  * write to for this chunk in the new layout
4218  */
4219 next = first_dev_address(conf->reshape_progress - 1,
4220 &conf->geo);
4221
4222 /* 'safe' is the highest device address that might still be
4223  * read from the old layout after a restart
4224  */
4225 safe = last_dev_address(conf->reshape_safe - 1,
4226 &conf->prev);
4227
4228 if (next + conf->offset_diff < safe)
4229 need_flush = 1;
4230
4231 last = conf->reshape_progress - 1;
4232 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4233 & conf->prev.chunk_mask);
4234 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4235 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4236 } else {
4237 /* 'next' is one past the last device address that we might
4238  * write to for this chunk in the new layout
4239  */
4240 next = last_dev_address(conf->reshape_progress, &conf->geo);
4241
4242 /* 'safe' is the earliest device address that we might read
4243  * from in the old layout after a restart
4244  */
4245 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4246
4247 /* Need to update the metadata if 'next' might go beyond 'safe',
4248  * as that could otherwise corrupt data after a crash
4249  */
4250 if (next > safe + conf->offset_diff)
4251 need_flush = 1;
4252
4253 sector_nr = conf->reshape_progress;
4254 last = sector_nr | (conf->geo.chunk_mask
4255 & conf->prev.chunk_mask);
4256
4257 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4258 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4259 }
4260
4261 if (need_flush ||
4262 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4263 /* Need to update reshape_position in the metadata */
4264 wait_barrier(conf);
4265 mddev->reshape_position = conf->reshape_progress;
4266 if (mddev->reshape_backwards)
4267 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4268 - conf->reshape_progress;
4269 else
4270 mddev->curr_resync_completed = conf->reshape_progress;
4271 conf->reshape_checkpoint = jiffies;
4272 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4273 md_wakeup_thread(mddev->thread);
4274 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4275 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4276 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4277 allow_barrier(conf);
4278 return sectors_done;
4279 }
4280 conf->reshape_safe = mddev->reshape_position;
4281 allow_barrier(conf);
4282 }
4283
4284read_more:
4285 /* Now schedule reads for blocks from sector_nr to last */
4286 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4287 r10_bio->state = 0;
4288 raise_barrier(conf, sectors_done != 0);
4289 atomic_set(&r10_bio->remaining, 0);
4290 r10_bio->mddev = mddev;
4291 r10_bio->sector = sector_nr;
4292 set_bit(R10BIO_IsReshape, &r10_bio->state);
4293 r10_bio->sectors = last - sector_nr + 1;
4294 rdev = read_balance(conf, r10_bio, &max_sectors);
4295 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4296
4297 if (!rdev) {
4298 /* Cannot read from anywhere, so this range cannot be relocated.
4299  * FIXME: we should record bad blocks on all the target devices
4300  * instead; for now just interrupt the reshape. */
4301
4302 mempool_free(r10_bio, conf->r10buf_pool);
4303 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4304 return sectors_done;
4305 }
4306
4307 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4308
4309 read_bio->bi_bdev = rdev->bdev;
4310 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4311 + rdev->data_offset);
4312 read_bio->bi_private = r10_bio;
4313 read_bio->bi_end_io = end_sync_read;
4314 read_bio->bi_rw = READ;
4315 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4316 read_bio->bi_error = 0;
4317 read_bio->bi_vcnt = 0;
4318 read_bio->bi_iter.bi_size = 0;
4319 r10_bio->master_bio = read_bio;
4320 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4321
4322 /* Now find the locations of this block in the new layout */
4323 __raid10_find_phys(&conf->geo, r10_bio);
4324
4325 blist = read_bio;
4326 read_bio->bi_next = NULL;
4327
4328 for (s = 0; s < conf->copies*2; s++) {
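	/* even values of s address a slot's primary device, odd values its replacement */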
4329 struct bio *b;
4330 int d = r10_bio->devs[s/2].devnum;
4331 struct md_rdev *rdev2;
4332 if (s&1) {
4333 rdev2 = conf->mirrors[d].replacement;
4334 b = r10_bio->devs[s/2].repl_bio;
4335 } else {
4336 rdev2 = conf->mirrors[d].rdev;
4337 b = r10_bio->devs[s/2].bio;
4338 }
4339 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4340 continue;
4341
4342 bio_reset(b);
4343 b->bi_bdev = rdev2->bdev;
4344 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4345 rdev2->new_data_offset;
4346 b->bi_private = r10_bio;
4347 b->bi_end_io = end_reshape_write;
4348 b->bi_rw = WRITE;
4349 b->bi_next = blist;
4350 blist = b;
4351 }
4352
4353 /* Now add as many pages as possible to all of these bios */
4354
4355 nr_sectors = 0;
4356 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4357 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4358 int len = (max_sectors - s) << 9;
4359 if (len > PAGE_SIZE)
4360 len = PAGE_SIZE;
4361 for (bio = blist; bio ; bio = bio->bi_next) {
4362 struct bio *bio2;
4363 if (bio_add_page(bio, page, len, 0))
4364 continue;
4365
4366
4367 for (bio2 = blist;
4368 bio2 && bio2 != bio;
4369 bio2 = bio2->bi_next) {
4370
4371 bio2->bi_vcnt--;
4372 bio2->bi_iter.bi_size -= len;
4373 bio_clear_flag(bio2, BIO_SEG_VALID);
4374 }
4375 goto bio_full;
4376 }
4377 sector_nr += len >> 9;
4378 nr_sectors += len >> 9;
4379 }
4380bio_full:
4381 r10_bio->sectors = nr_sectors;
4382
4383
4384 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4385 atomic_inc(&r10_bio->remaining);
4386 read_bio->bi_next = NULL;
4387 generic_make_request(read_bio);
4388 sector_nr += nr_sectors;
4389 sectors_done += nr_sectors;
4390 if (sector_nr <= last)
4391 goto read_more;
4392
4393 /* Now that we have done the whole section we can
4394  * update reshape_progress
4395  */
4396 if (mddev->reshape_backwards)
4397 conf->reshape_progress -= sectors_done;
4398 else
4399 conf->reshape_progress += sectors_done;
4400
4401 return sectors_done;
4402}
4403
4404static void end_reshape_request(struct r10bio *r10_bio);
4405static int handle_reshape_read_error(struct mddev *mddev,
4406 struct r10bio *r10_bio);
4407static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4408{
4409 /* A reshape read has completed - hopefully we have a block of data
4410  * to write out to the new layout.
4411  * If the read failed, first try to reconstruct the data with
4412  * synchronous page reads from the other copies in the old layout.
4413  */
4414 struct r10conf *conf = mddev->private;
4415 int s;
4416
4417 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4418 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4419 /* the data could not be reconstructed - the reshape has been aborted */
4420 md_done_sync(mddev, r10_bio->sectors, 0);
4421 return;
4422 }
4423
4424 /* We definitely have the data in the pages; schedule the
4425  * writes to every device that needs it in the new layout.
4426  */
4427 atomic_set(&r10_bio->remaining, 1);
4428 for (s = 0; s < conf->copies*2; s++) {
4429 struct bio *b;
4430 int d = r10_bio->devs[s/2].devnum;
4431 struct md_rdev *rdev;
4432 if (s&1) {
4433 rdev = conf->mirrors[d].replacement;
4434 b = r10_bio->devs[s/2].repl_bio;
4435 } else {
4436 rdev = conf->mirrors[d].rdev;
4437 b = r10_bio->devs[s/2].bio;
4438 }
4439 if (!rdev || test_bit(Faulty, &rdev->flags))
4440 continue;
4441 atomic_inc(&rdev->nr_pending);
4442 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4443 atomic_inc(&r10_bio->remaining);
4444 b->bi_next = NULL;
4445 generic_make_request(b);
4446 }
4447 end_reshape_request(r10_bio);
4448}
4449
4450static void end_reshape(struct r10conf *conf)
4451{
4452 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4453 return;
4454
4455 spin_lock_irq(&conf->device_lock);
4456 conf->prev = conf->geo;
4457 md_finish_reshape(conf->mddev);
4458 smp_wmb();
4459 conf->reshape_progress = MaxSector;
4460 conf->reshape_safe = MaxSector;
4461 spin_unlock_irq(&conf->device_lock);
4462
4463 /* read-ahead size must cover two whole stripes, which is
4464  * 2 * (data devices) * chunk size, expressed in pages
4465  */
4466 if (conf->mddev->queue) {
4467 int stripe = conf->geo.raid_disks *
4468 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4469 stripe /= conf->geo.near_copies;
4470 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4471 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4472 }
4473 conf->fullsync = 0;
4474}
4475
4476static int handle_reshape_read_error(struct mddev *mddev,
4477 struct r10bio *r10_bio)
4478{
4479 /* Use synchronous page reads from the old layout to get the data from somewhere else */
4480 int sectors = r10_bio->sectors;
4481 struct r10conf *conf = mddev->private;
4482 struct {
4483 struct r10bio r10_bio;
4484 struct r10dev devs[conf->copies];
4485 } on_stack;
4486 struct r10bio *r10b = &on_stack.r10_bio;
4487 int slot = 0;
4488 int idx = 0;
4489 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4490
4491 r10b->sector = r10_bio->sector;
4492 __raid10_find_phys(&conf->prev, r10b);
4493
4494 while (sectors) {
4495 int s = sectors;
4496 int success = 0;
4497 int first_slot = slot;
4498
4499 if (s > (PAGE_SIZE >> 9))
4500 s = PAGE_SIZE >> 9;
4501
4502 while (!success) {
4503 int d = r10b->devs[slot].devnum;
4504 struct md_rdev *rdev = conf->mirrors[d].rdev;
4505 sector_t addr;
4506 if (rdev == NULL ||
4507 test_bit(Faulty, &rdev->flags) ||
4508 !test_bit(In_sync, &rdev->flags))
4509 goto failed;
4510
4511 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4512 success = sync_page_io(rdev,
4513 addr,
4514 s << 9,
4515 bvec[idx].bv_page,
4516 READ, false);
4517 if (success)
4518 break;
4519 failed:
4520 slot++;
4521 if (slot >= conf->copies)
4522 slot = 0;
4523 if (slot == first_slot)
4524 break;
4525 }
4526 if (!success) {
4527 /* couldn't read this block from any device - give up */
4528 set_bit(MD_RECOVERY_INTR,
4529 &mddev->recovery);
4530 return -EIO;
4531 }
4532 sectors -= s;
4533 idx++;
4534 }
4535 return 0;
4536}
4537
4538static void end_reshape_write(struct bio *bio)
4539{
4540 struct r10bio *r10_bio = bio->bi_private;
4541 struct mddev *mddev = r10_bio->mddev;
4542 struct r10conf *conf = mddev->private;
4543 int d;
4544 int slot;
4545 int repl;
4546 struct md_rdev *rdev = NULL;
4547
4548 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4549 if (repl)
4550 rdev = conf->mirrors[d].replacement;
4551 if (!rdev) {
4552 smp_mb();
4553 rdev = conf->mirrors[d].rdev;
4554 }
4555
4556 if (bio->bi_error) {
4557 /* FIXME: recording a bad block would be better than failing the whole device */
4558 md_error(mddev, rdev);
4559 }
4560
4561 rdev_dec_pending(rdev, mddev);
4562 end_reshape_request(r10_bio);
4563}
4564
4565static void end_reshape_request(struct r10bio *r10_bio)
4566{
4567 if (!atomic_dec_and_test(&r10_bio->remaining))
4568 return;
4569 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4570 bio_put(r10_bio->master_bio);
4571 put_buf(r10_bio);
4572}
4573
4574static void raid10_finish_reshape(struct mddev *mddev)
4575{
4576 struct r10conf *conf = mddev->private;
4577
4578 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4579 return;
4580
4581 if (mddev->delta_disks > 0) {
4582 sector_t size = raid10_size(mddev, 0, 0);
4583 md_set_array_sectors(mddev, size);
4584 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4585 mddev->recovery_cp = mddev->resync_max_sectors;
4586 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4587 }
4588 mddev->resync_max_sectors = size;
4589 set_capacity(mddev->gendisk, mddev->array_sectors);
4590 revalidate_disk(mddev->gendisk);
4591 } else {
4592 int d;
4593 for (d = conf->geo.raid_disks ;
4594 d < conf->geo.raid_disks - mddev->delta_disks;
4595 d++) {
4596 struct md_rdev *rdev = conf->mirrors[d].rdev;
4597 if (rdev)
4598 clear_bit(In_sync, &rdev->flags);
4599 rdev = conf->mirrors[d].replacement;
4600 if (rdev)
4601 clear_bit(In_sync, &rdev->flags);
4602 }
4603 }
4604 mddev->layout = mddev->new_layout;
4605 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4606 mddev->reshape_position = MaxSector;
4607 mddev->delta_disks = 0;
4608 mddev->reshape_backwards = 0;
4609}
4610
4611static struct md_personality raid10_personality =
4612{
4613 .name = "raid10",
4614 .level = 10,
4615 .owner = THIS_MODULE,
4616 .make_request = make_request,
4617 .run = run,
4618 .free = raid10_free,
4619 .status = status,
4620 .error_handler = error,
4621 .hot_add_disk = raid10_add_disk,
4622 .hot_remove_disk= raid10_remove_disk,
4623 .spare_active = raid10_spare_active,
4624 .sync_request = sync_request,
4625 .quiesce = raid10_quiesce,
4626 .size = raid10_size,
4627 .resize = raid10_resize,
4628 .takeover = raid10_takeover,
4629 .check_reshape = raid10_check_reshape,
4630 .start_reshape = raid10_start_reshape,
4631 .finish_reshape = raid10_finish_reshape,
4632 .congested = raid10_congested,
4633};
4634
4635static int __init raid_init(void)
4636{
4637 return register_md_personality(&raid10_personality);
4638}
4639
4640static void raid_exit(void)
4641{
4642 unregister_md_personality(&raid10_personality);
4643}
4644
4645module_init(raid_init);
4646module_exit(raid_exit);
4647MODULE_LICENSE("GPL");
4648MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4649MODULE_ALIAS("md-personality-9");
4650MODULE_ALIAS("md-raid10");
4651MODULE_ALIAS("md-level-10");
4652
4653module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4654