/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 */
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *
 * The data to be stored is divided into chunks of size chunk_size, and
 * each device is divided into far_copies sections.  In each section,
 * chunks are laid out in a style similar to raid0, but near_copies
 * copies of each chunk are stored (each on a different drive).  The
 * starting device for each section is offset near_copies from the
 * starting device of the previous section, so there are
 * (near_copies * far_copies) copies of each chunk, each on a different
 * drive.  near_copies and far_copies must be at least one, and their
 * product must be at most raid_disks.
 *
 * If far_offset is true, the far copies are handled a bit differently:
 * the copies are still in different stripes, but instead of being very
 * far apart on disk, they are in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are
 * grouped into sets that are (near_copies * far_copies) in size.  The
 * far copied stripes are still shifted by 'near_copies' devices within
 * each set, but the device sets are now separate from the other sets
 * in the array.
 */

/*
 * Number of guaranteed r10bios in case of extreme VM load:
 */
#define NR_RAID10_BIOS 256

/* When we get a read error on a read-only array, we redirect to another
 * device without failing the first device, or trying to over-write to
 * correct the read error.  To keep track of bad blocks on a per-bio
 * level, we store IO_BLOCKED in the appropriate 'bios' pointer.
 */
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad-block, we need to remove the
 * bad-block marking which must be done from process context.  So we record
 * the success by setting devs[n].bio to IO_MADE_GOOD.
 */
#define IO_MADE_GOOD ((struct bio *)2)

#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)

/* When there are this many requests queued to be written by
 * the raid10 thread, we become 'congested' to provide back-pressure
 * for writeback.
 */
static int max_queued_requests = 1024;

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio, int error);
static void end_reshape(struct r10conf *conf);

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int size = offsetof(struct r10bio, devs[conf->copies]);

	/* allocate a r10bio with room for 'copies' entries in the
	 * devs array */
	return kzalloc(size, gfp_flags);
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
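
/*
 * For illustration (added; not part of the original source): with the
 * common PAGE_SIZE of 4KiB, RESYNC_PAGES works out to 64KiB/4KiB = 16
 * pages per resync bio, and RESYNC_DEPTH to 32MiB/64KiB = 512
 * concurrent resync requests at most.
 */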

/*
 * When performing a resync, we need to read and compare, so we need
 * as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read, one for
 * write (we recover only one drive per r10buf).
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	struct page *page;
	struct r10bio *r10_bio;
	struct bio *bio;
	int i, j;
	int nalloc;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio)
		return NULL;

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
		if (!conf->have_replacement)
			continue;
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].repl_bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * to all the bios.
	 */
	for (j = 0 ; j < nalloc; j++) {
		struct bio *rbio = r10_bio->devs[j].repl_bio;
		bio = r10_bio->devs[j].bio;
		for (i = 0; i < RESYNC_PAGES; i++) {
			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
					       &conf->mddev->recovery)) {
				/* we can share bv_page's during recovery
				 * and reshape */
				struct bio *rbio = r10_bio->devs[0].bio;
				page = rbio->bi_io_vec[i].bv_page;
				get_page(page);
			} else
				page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
			if (rbio)
				rbio->bi_io_vec[i].bv_page = page;
		}
	}

	return r10_bio;

out_free_pages:
	for ( ; i > 0 ; i--)
		safe_put_page(bio->bi_io_vec[i-1].bv_page);
	while (j--)
		for (i = 0; i < RESYNC_PAGES ; i++)
			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
	j = 0;
out_free_bio:
	for ( ; j < nalloc; j++) {
		if (r10_bio->devs[j].bio)
			bio_put(r10_bio->devs[j].bio);
		if (r10_bio->devs[j].repl_bio)
			bio_put(r10_bio->devs[j].repl_bio);
	}
	r10bio_pool_free(r10_bio, conf);
	return NULL;
}

static void r10buf_pool_free(void *__r10_bio, void *data)
{
	int i;
	struct r10conf *conf = data;
	struct r10bio *r10bio = __r10_bio;
	int j;

	for (j = 0; j < conf->copies; j++) {
		struct bio *bio = r10bio->devs[j].bio;
		if (bio) {
			for (i = 0; i < RESYNC_PAGES; i++) {
				safe_put_page(bio->bi_io_vec[i].bv_page);
				bio->bi_io_vec[i].bv_page = NULL;
			}
			bio_put(bio);
		}
		bio = r10bio->devs[j].repl_bio;
		if (bio)
			bio_put(bio);
	}
	r10bio_pool_free(r10bio, conf);
}

static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{
	int i;

	for (i = 0; i < conf->copies; i++) {
		struct bio **bio = &r10_bio->devs[i].bio;
		if (!BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
		bio = &r10_bio->devs[i].repl_bio;
		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
	}
}

static void free_r10bio(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, conf->r10bio_pool);
}

static void put_buf(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	mempool_free(r10_bio, conf->r10buf_pool);

	lower_barrier(conf);
}

static void reschedule_retry(struct r10bio *r10_bio)
{
	unsigned long flags;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	conf->nr_queued++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	/* wake up frozen array... */
	wake_up(&conf->wait_barrier);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(struct r10bio *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;
	int done;
	struct r10conf *conf = r10_bio->mddev->private;

	if (bio->bi_phys_segments) {
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		bio->bi_phys_segments--;
		done = (bio->bi_phys_segments == 0);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	} else
		done = 1;
	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	if (done) {
		bio_endio(bio, 0);
		/*
		 * Wake up any possible resync thread that waits for the device
		 * to go idle.
		 */
		allow_barrier(conf);
	}
	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

/*
 * Find the disk number which triggered given bio
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{
	int slot;
	int repl = 0;

	for (slot = 0; slot < conf->copies; slot++) {
		if (r10_bio->devs[slot].bio == bio)
			break;
		if (r10_bio->devs[slot].repl_bio == bio) {
			repl = 1;
			break;
		}
	}

	BUG_ON(slot == conf->copies);
	update_head_pos(slot, r10_bio);

	if (slotp)
		*slotp = slot;
	if (replp)
		*replp = repl;
	return r10_bio->devs[slot].devnum;
}

static void raid10_end_read_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct r10bio *r10_bio = bio->bi_private;
	int slot, dev;
	struct md_rdev *rdev;
	struct r10conf *conf = r10_bio->mddev->private;

	slot = r10_bio->read_slot;
	dev = r10_bio->devs[slot].devnum;
	rdev = r10_bio->devs[slot].rdev;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(slot, r10_bio);

	if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side.  So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	} else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry".
		 */
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		if (!enough(conf, rdev->raid_disk))
			uptodate = 1;
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	if (uptodate) {
		raid_end_bio_io(r10_bio);
		rdev_dec_pending(rdev, conf->mddev);
	} else {
		/*
		 * oops, read error - keep the refcount on the rdev
		 */
		char b[BDEVNAME_SIZE];
		printk_ratelimited(KERN_ERR
				   "md/raid10:%s: %s: rescheduling sector %llu\n",
				   mdname(conf->mddev),
				   bdevname(rdev->bdev, b),
				   (unsigned long long)r10_bio->sector);
		set_bit(R10BIO_ReadError, &r10_bio->state);
		reschedule_retry(r10_bio);
	}
}

static void close_write(struct r10bio *r10_bio)
{
	/* clear the bitmap if all writes complete successfully */
	bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
			r10_bio->sectors,
			!test_bit(R10BIO_Degraded, &r10_bio->state),
			0);
	md_write_end(r10_bio->mddev);
}

static void one_write_done(struct r10bio *r10_bio)
{
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		if (test_bit(R10BIO_WriteError, &r10_bio->state))
			reschedule_retry(r10_bio);
		else {
			close_write(r10_bio);
			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				raid_end_bio_io(r10_bio);
		}
	}
}

static void raid10_end_write_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct r10bio *r10_bio = bio->bi_private;
	int dev;
	int dec_rdev = 1;
	struct r10conf *conf = r10_bio->mddev->private;
	int slot, repl;
	struct md_rdev *rdev = NULL;

	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);

	if (repl)
		rdev = conf->mirrors[dev].replacement;
	if (!rdev) {
		smp_rmb();
		repl = 0;
		rdev = conf->mirrors[dev].rdev;
	}
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate) {
		if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
			md_error(rdev->mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
			set_bit(R10BIO_WriteError, &r10_bio->state);
			dec_rdev = 0;
		}
	} else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side.  So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		sector_t first_bad;
		int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty.  This is because we cannot use
		 * such a device for properly reading the data back (we
		 * could potentially use it, if the current write landed
		 * before rdev->recovery_offset, but for simplicity we
		 * don't check this here).
		 */
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			set_bit(R10BIO_Uptodate, &r10_bio->state);

		/* Maybe we can clear some bad blocks. */
		if (is_badblock(rdev,
				r10_bio->devs[slot].addr,
				r10_bio->sectors,
				&first_bad, &bad_sectors)) {
			bio_put(bio);
			if (repl)
				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
			else
				r10_bio->devs[slot].bio = IO_MADE_GOOD;
			dec_rdev = 0;
			set_bit(R10BIO_MadeGood, &r10_bio->state);
		}
	}

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	one_write_done(r10_bio);
	if (dec_rdev)
		rdev_dec_pending(rdev, conf->mddev);
}

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored on the same drive as any
 * other block within the same stripe.
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address.
 */
static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{
	int n, f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;
	int slot = 0;
	int last_far_set_start, last_far_set_size;

	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
	last_far_set_start *= geo->far_set_size;

	last_far_set_size = geo->far_set_size;
	last_far_set_size += (geo->raid_disks % geo->far_set_size);

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> geo->chunk_shift;
	sector = r10bio->sector & geo->chunk_mask;

	chunk *= geo->near_copies;
	stripe = chunk;
	dev = sector_div(stripe, geo->raid_disks);
	if (geo->far_offset)
		stripe *= geo->far_copies;

	sector += stripe << geo->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		int set;
		sector_t s = sector;
		r10bio->devs[slot].devnum = d;
		r10bio->devs[slot].addr = s;
		slot++;

		for (f = 1; f < geo->far_copies; f++) {
			set = d / geo->far_set_size;
			d += geo->near_copies;

			if ((geo->raid_disks % geo->far_set_size) &&
			    (d > last_far_set_start)) {
				d -= last_far_set_start;
				d %= last_far_set_size;
				d += last_far_set_start;
			} else {
				d %= geo->far_set_size;
				d += geo->far_set_size * set;
			}
			s += geo->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		if (dev >= geo->raid_disks) {
			dev = 0;
			sector += (geo->chunk_mask + 1);
		}
	}
}
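
/*
 * Worked example (added for illustration; not in the original source):
 * with raid_disks = 4, near_copies = 2, far_copies = 1, a virtual
 * chunk sequence A B C D ... maps as
 *
 *	disk0  disk1  disk2  disk3
 *	  A      A      B      B
 *	  C      C      D      D
 *
 * i.e. the chunk number is multiplied by near_copies, then distributed
 * raid0-style, so each chunk lands on near_copies adjacent devices.
 */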

static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
{
	struct geom *geo = &conf->geo;

	if (conf->reshape_progress != MaxSector &&
	    ((r10bio->sector >= conf->reshape_progress) !=
	     conf->mddev->reshape_backwards)) {
		set_bit(R10BIO_Previous, &r10bio->state);
		geo = &conf->prev;
	} else
		clear_bit(R10BIO_Previous, &r10bio->state);

	__raid10_find_phys(geo, r10bio);
}

static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;
	/* Never use conf->prev as this is only called during resync
	 * or recovery, so reshape isn't happening
	 */
	struct geom *geo = &conf->geo;
	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
	int far_set_size = geo->far_set_size;
	int last_far_set_start;

	if (geo->raid_disks % geo->far_set_size) {
		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
		last_far_set_start *= geo->far_set_size;

		if (dev >= last_far_set_start) {
			far_set_size = geo->far_set_size;
			far_set_size += (geo->raid_disks % geo->far_set_size);
			far_set_start = last_far_set_start;
		}
	}

	offset = sector & geo->chunk_mask;
	if (geo->far_offset) {
		int fc;
		chunk = sector >> geo->chunk_shift;
		fc = sector_div(chunk, geo->far_copies);
		dev -= fc * geo->near_copies;
		if (dev < far_set_start)
			dev += far_set_size;
	} else {
		while (sector >= geo->stride) {
			sector -= geo->stride;
			if (dev < (geo->near_copies + far_set_start))
				dev += far_set_size - geo->near_copies;
			else
				dev -= geo->near_copies;
		}
		chunk = sector >> geo->chunk_shift;
	}
	vchunk = chunk * geo->raid_disks + dev;
	sector_div(vchunk, geo->near_copies);
	return (vchunk << geo->chunk_shift) + offset;
}

/**
 *	raid10_mergeable_bvec -- tell bio layer if two requests can be merged
 *	@q: request queue
 *	@bvm: properties of new bio
 *	@biovec: the request that could be merged to it.
 *
 *	Return amount of bytes we can accept at this offset
 *	This requires checking for end-of-chunk if near_copies != raid_disks,
 *	and for subordinate merge_bvec_fns if merge_check_needed.
 */
static int raid10_mergeable_bvec(struct request_queue *q,
				 struct bvec_merge_data *bvm,
				 struct bio_vec *biovec)
{
	struct mddev *mddev = q->queuedata;
	struct r10conf *conf = mddev->private;
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	int max;
	unsigned int chunk_sectors;
	unsigned int bio_sectors = bvm->bi_size >> 9;
	struct geom *geo = &conf->geo;

	chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
	if (conf->reshape_progress != MaxSector &&
	    ((sector >= conf->reshape_progress) !=
	     conf->mddev->reshape_backwards))
		geo = &conf->prev;

	if (geo->near_copies < geo->raid_disks) {
		max = (chunk_sectors - ((sector & (chunk_sectors - 1))
					+ bio_sectors)) << 9;
		if (max < 0)
			/* bio_add cannot handle a negative return */
			max = 0;
		if (max <= biovec->bv_len && bio_sectors == 0)
			return biovec->bv_len;
	} else
		max = biovec->bv_len;

	if (mddev->merge_check_needed) {
		struct {
			struct r10bio r10_bio;
			struct r10dev devs[conf->copies];
		} on_stack;
		struct r10bio *r10_bio = &on_stack.r10_bio;
		int s;
		if (conf->reshape_progress != MaxSector) {
			/* Cannot give any guidance during reshape */
			if (max <= biovec->bv_len && bio_sectors == 0)
				return biovec->bv_len;
			return 0;
		}
		r10_bio->sector = sector;
		raid10_find_phys(conf, r10_bio);
		rcu_read_lock();
		for (s = 0; s < conf->copies; s++) {
			int disk = r10_bio->devs[s].devnum;
			struct md_rdev *rdev = rcu_dereference(
				conf->mirrors[disk].rdev);
			if (rdev && !test_bit(Faulty, &rdev->flags)) {
				struct request_queue *q =
					bdev_get_queue(rdev->bdev);
				if (q->merge_bvec_fn) {
					bvm->bi_sector = r10_bio->devs[s].addr
						+ rdev->data_offset;
					bvm->bi_bdev = rdev->bdev;
					max = min(max, q->merge_bvec_fn(
							  q, bvm, biovec));
				}
			}
			rdev = rcu_dereference(conf->mirrors[disk].replacement);
			if (rdev && !test_bit(Faulty, &rdev->flags)) {
				struct request_queue *q =
					bdev_get_queue(rdev->bdev);
				if (q->merge_bvec_fn) {
					bvm->bi_sector = r10_bio->devs[s].addr
						+ rdev->data_offset;
					bvm->bi_bdev = rdev->bdev;
					max = min(max, q->merge_bvec_fn(
							  q, bvm, biovec));
				}
			}
		}
		rcu_read_unlock();
	}
	return max;
}

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */

/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies or far_copies geometry.
 */
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot;
	int sectors = r10_bio->sectors;
	int best_good_sectors;
	sector_t new_distance, best_dist;
	struct md_rdev *best_rdev, *rdev = NULL;
	int do_balance;
	int best_slot;
	struct geom *geo = &conf->geo;

	raid10_find_phys(conf, r10_bio);
	rcu_read_lock();
retry:
	sectors = r10_bio->sectors;
	best_slot = -1;
	best_rdev = NULL;
	best_dist = MaxSector;
	best_good_sectors = 0;
	do_balance = 1;
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
	if (conf->mddev->recovery_cp < MaxSector
	    && (this_sector + sectors >= conf->next_resync))
		do_balance = 0;

	for (slot = 0; slot < conf->copies ; slot++) {
		sector_t first_bad;
		int bad_sectors;
		sector_t dev_sector;

		if (r10_bio->devs[slot].bio == IO_BLOCKED)
			continue;
		disk = r10_bio->devs[slot].devnum;
		rdev = rcu_dereference(conf->mirrors[disk].replacement);
		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
		    test_bit(Unmerged, &rdev->flags) ||
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			rdev = rcu_dereference(conf->mirrors[disk].rdev);
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags) ||
		    test_bit(Unmerged, &rdev->flags))
			continue;
		if (!test_bit(In_sync, &rdev->flags) &&
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			continue;

		dev_sector = r10_bio->devs[slot].addr;
		if (is_badblock(rdev, dev_sector, sectors,
				&first_bad, &bad_sectors)) {
			if (best_dist < MaxSector)
				/* Already have a better slot */
				continue;
			if (first_bad <= dev_sector) {
				/* Cannot read here.  If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
				bad_sectors -= (dev_sector - first_bad);
				if (!do_balance && sectors > bad_sectors)
					sectors = bad_sectors;
				if (best_good_sectors > sectors)
					best_good_sectors = sectors;
			} else {
				sector_t good_sectors =
					first_bad - dev_sector;
				if (good_sectors > best_good_sectors) {
					best_good_sectors = good_sectors;
					best_slot = slot;
					best_rdev = rdev;
				}
				if (!do_balance)
					/* Must read from here */
					break;
			}
			continue;
		} else
			best_good_sectors = sectors;

		if (!do_balance)
			break;

		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
			break;

		/* for far > 1 always use the lowest address */
		if (geo->far_copies > 1)
			new_distance = r10_bio->devs[slot].addr;
		else
			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);
		if (new_distance < best_dist) {
			best_dist = new_distance;
			best_slot = slot;
			best_rdev = rdev;
		}
	}
	if (slot >= conf->copies) {
		slot = best_slot;
		rdev = best_rdev;
	}

	if (slot >= 0) {
		atomic_inc(&rdev->nr_pending);
		if (test_bit(Faulty, &rdev->flags)) {
			/* Cannot risk returning a device that failed
			 * before we inc'ed nr_pending
			 */
			rdev_dec_pending(rdev, conf->mddev);
			goto retry;
		}
		r10_bio->read_slot = slot;
	} else
		rdev = NULL;
	rcu_read_unlock();
	*max_sectors = best_good_sectors;

	return rdev;
}
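
/*
 * Illustration (added; not in the original source): for a 'near'
 * layout, an idle mirror (nr_pending == 0) is taken immediately,
 * which favours spreading unrelated reads across devices.  For a
 * 'far' layout the copy with the lowest device address is preferred
 * instead, keeping sequential reads in one zone of each disk rather
 * than bouncing between the near and far halves.
 */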

int md_raid10_congested(struct mddev *mddev, int bits)
{
	struct r10conf *conf = mddev->private;
	int i, ret = 0;

	if ((bits & (1 << BDI_async_congested)) &&
	    conf->pending_count >= max_queued_requests)
		return 1;

	rcu_read_lock();
	for (i = 0;
	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
		     && ret == 0;
	     i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags)) {
			struct request_queue *q = bdev_get_queue(rdev->bdev);

			ret |= bdi_congested(&q->backing_dev_info, bits);
		}
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(md_raid10_congested);

static int raid10_congested(void *data, int bits)
{
	struct mddev *mddev = data;

	return mddev_congested(mddev, bits) ||
		md_raid10_congested(mddev, bits);
}

static void flush_pending_writes(struct r10conf *conf)
{
	/* Any writes that have been queued but are awaiting
	 * submission to the devices, must now be submitted.
	 */
	spin_lock_irq(&conf->device_lock);

	if (conf->pending_bio_list.head) {
		struct bio *bio;
		bio = bio_list_get(&conf->pending_bio_list);
		conf->pending_count = 0;
		spin_unlock_irq(&conf->device_lock);
		/* flush any pending bitmap writes to disk
		 * before proceeding w/ I/O */
		bitmap_unplug(conf->mddev->bitmap);
		wake_up(&conf->wait_barrier);

		while (bio) { /* submit pending writes */
			struct bio *next = bio->bi_next;
			bio->bi_next = NULL;
			if (unlikely((bio->bi_rw & REQ_DISCARD) &&
			    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
				/* Just ignore it */
				bio_endio(bio, 0);
			else
				generic_make_request(bio);
			bio = next;
		}
	} else
		spin_unlock_irq(&conf->device_lock);
}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO,
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which might alter the array will
 * be permitted.
 *
 * So: regular IO calls wait_barrier().  When that returns there is
 * no background IO happening.  It must arrange to call allow_barrier()
 * when it has finished its IO.
 * Background IO must call raise_barrier().  Once that returns there
 * is no normal IO happening.  It must arrange to call lower_barrier()
 * when the particular background IO completes.
 */
static void raise_barrier(struct r10conf *conf, int force)
{
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock);

	/* block any new IO from starting */
	conf->barrier++;

	/* Now wait for all pending IO to complete */
	wait_event_lock_irq(conf->wait_barrier,
			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock);

	spin_unlock_irq(&conf->resync_lock);
}

static void lower_barrier(struct r10conf *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->barrier--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

static void wait_barrier(struct r10conf *conf)
{
	spin_lock_irq(&conf->resync_lock);
	if (conf->barrier) {
		conf->nr_waiting++;
		/* Wait for the barrier to drop.
		 * However if there are already pending
		 * requests (preventing the barrier from
		 * rising completely), and the
		 * pre-process bio queue isn't empty,
		 * then don't wait, as we need to empty
		 * that queue to get the nr_pending
		 * count down.
		 */
		wait_event_lock_irq(conf->wait_barrier,
				    !conf->barrier ||
				    (conf->nr_pending &&
				     current->bio_list &&
				     !bio_list_empty(current->bio_list)),
				    conf->resync_lock);
		conf->nr_waiting--;
	}
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);
}

static void allow_barrier(struct r10conf *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->nr_pending--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

static void freeze_array(struct r10conf *conf, int extra)
{
	/* Stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending matches nr_queued+extra.
	 * This is called in the context of one normal IO request
	 * that has failed.  Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier++;
	conf->nr_waiting++;
	wait_event_lock_irq_cmd(conf->wait_barrier,
				conf->nr_pending == conf->nr_queued+extra,
				conf->resync_lock,
				flush_pending_writes(conf));

	spin_unlock_irq(&conf->resync_lock);
}

static void unfreeze_array(struct r10conf *conf)
{
	/* reverse the effect of the freeze */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier--;
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	spin_unlock_irq(&conf->resync_lock);
}
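
/*
 * Summary of the pairing rules above (added for clarity; not in the
 * original source):
 *
 *	wait_barrier()   ... normal IO ...    allow_barrier()
 *	raise_barrier()  ... resync IO ...    lower_barrier()
 *	freeze_array()   ... error fix-up ... unfreeze_array()
 *
 * Every entry call must be matched by the corresponding exit call,
 * otherwise the array deadlocks on the counters in r10conf.
 */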

static sector_t choose_data_offset(struct r10bio *r10_bio,
				   struct md_rdev *rdev)
{
	if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
	    test_bit(R10BIO_Previous, &r10_bio->state))
		return rdev->data_offset;
	else
		return rdev->new_data_offset;
}

struct raid10_plug_cb {
	struct blk_plug_cb	cb;
	struct bio_list		pending;
	int			pending_cnt;
};

static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
						   cb);
	struct mddev *mddev = plug->cb.data;
	struct r10conf *conf = mddev->private;
	struct bio *bio;

	if (from_schedule || current->bio_list) {
		spin_lock_irq(&conf->device_lock);
		bio_list_merge(&conf->pending_bio_list, &plug->pending);
		conf->pending_count += plug->pending_cnt;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_barrier);
		md_wakeup_thread(mddev->thread);
		kfree(plug);
		return;
	}

	/* we aren't scheduling, so we can do the write-out directly. */
	bio = bio_list_get(&plug->pending);
	bitmap_unplug(mddev->bitmap);
	wake_up(&conf->wait_barrier);

	while (bio) { /* submit pending writes */
		struct bio *next = bio->bi_next;
		bio->bi_next = NULL;
		if (unlikely((bio->bi_rw & REQ_DISCARD) &&
		    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
			/* Just ignore it */
			bio_endio(bio, 0);
		else
			generic_make_request(bio);
		bio = next;
	}
	kfree(plug);
}

static void make_request(struct mddev *mddev, struct bio *bio)
{
	struct r10conf *conf = mddev->private;
	struct r10bio *r10_bio;
	struct bio *read_bio;
	int i;
	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
	int chunk_sects = chunk_mask + 1;
	const int rw = bio_data_dir(bio);
	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
	const unsigned long do_discard = (bio->bi_rw
					  & (REQ_DISCARD | REQ_SECURE));
	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
	unsigned long flags;
	struct md_rdev *blocked_rdev;
	struct blk_plug_cb *cb;
	struct raid10_plug_cb *plug = NULL;
	int sectors_handled;
	int max_sectors;
	int sectors;

	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
		md_flush_request(mddev, bio);
		return;
	}

	/* If this request crosses a chunk boundary, we need to split
	 * it.
	 */
	if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
		     > chunk_sects
		     && (conf->geo.near_copies < conf->geo.raid_disks
			 || conf->prev.near_copies < conf->prev.raid_disks))) {
		struct bio_pair *bp;
		/* Sanity check -- queue functions should prevent this happening */
		if (bio_segments(bio) > 1)
			goto bad_map;
		/* This is a one page bio that upper layers
		 * refuse to split for us, so we need to split it.
		 */
		bp = bio_split(bio,
			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)));

		/* Each of these 'make_request' calls will call 'wait_barrier'.
		 * If the first succeeds but the second blocks due to the resync
		 * thread raising the barrier, we will deadlock because the
		 * IO to the underlying device will be queued in generic_make_request
		 * and will never complete, so will never reduce nr_pending.
		 * So increment nr_waiting here so no new raise_barriers will
		 * succeed, and so the second wait_barrier cannot block.
		 */
		spin_lock_irq(&conf->resync_lock);
		conf->nr_waiting++;
		spin_unlock_irq(&conf->resync_lock);

		make_request(mddev, &bp->bio1);
		make_request(mddev, &bp->bio2);

		spin_lock_irq(&conf->resync_lock);
		conf->nr_waiting--;
		wake_up(&conf->wait_barrier);
		spin_unlock_irq(&conf->resync_lock);

		bio_pair_release(bp);
		return;
	bad_map:
		printk("md/raid10:%s: make_request bug: can't convert block across chunks"
		       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
		       (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);

		bio_io_error(bio);
		return;
	}

	md_write_start(mddev, bio);

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
	wait_barrier(conf);

	sectors = bio_sectors(bio);
	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    bio->bi_sector < conf->reshape_progress &&
	    bio->bi_sector + sectors > conf->reshape_progress) {
		/* IO spans the reshape position.  Need to wait for
		 * reshape to pass.
		 */
		allow_barrier(conf);
		wait_event(conf->wait_barrier,
			   conf->reshape_progress <= bio->bi_sector ||
			   conf->reshape_progress >= bio->bi_sector + sectors);
		wait_barrier(conf);
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    bio_data_dir(bio) == WRITE &&
	    (mddev->reshape_backwards
	     ? (bio->bi_sector < conf->reshape_safe &&
		bio->bi_sector + sectors > conf->reshape_progress)
	     : (bio->bi_sector + sectors > conf->reshape_safe &&
		bio->bi_sector < conf->reshape_progress))) {
		/* Need to update reshape_position in metadata */
		mddev->reshape_position = conf->reshape_progress;
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		set_bit(MD_CHANGE_PENDING, &mddev->flags);
		md_wakeup_thread(mddev->thread);
		wait_event(mddev->sb_wait,
			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));

		conf->reshape_safe = mddev->reshape_position;
	}

	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

	r10_bio->master_bio = bio;
	r10_bio->sectors = sectors;

	r10_bio->mddev = mddev;
	r10_bio->sector = bio->bi_sector;
	r10_bio->state = 0;

	/* We might need to issue multiple reads to different
	 * devices if there are bad blocks around, so we keep
	 * track of the number of reads in bio->bi_phys_segments.
	 * If this is 0, there is only one r10_bio and no locking
	 * will be needed when the request completes.  If it is
	 * non-zero, then it is the number of not-completed requests.
	 */
	bio->bi_phys_segments = 0;
	clear_bit(BIO_SEG_VALID, &bio->bi_flags);

	if (rw == READ) {
		/*
		 * read balancing logic:
		 */
		struct md_rdev *rdev;
		int slot;

read_again:
		rdev = read_balance(conf, r10_bio, &max_sectors);
		if (!rdev) {
			raid_end_bio_io(r10_bio);
			return;
		}
		slot = r10_bio->read_slot;

		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
		md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
			    max_sectors);

		r10_bio->devs[slot].bio = read_bio;
		r10_bio->devs[slot].rdev = rdev;

		read_bio->bi_sector = r10_bio->devs[slot].addr +
			choose_data_offset(r10_bio, rdev);
		read_bio->bi_bdev = rdev->bdev;
		read_bio->bi_end_io = raid10_end_read_request;
		read_bio->bi_rw = READ | do_sync;
		read_bio->bi_private = r10_bio;

		if (max_sectors < r10_bio->sectors) {
			/* Could not read all from this device, so we will
			 * need another r10_bio.
			 */
			sectors_handled = (r10_bio->sector + max_sectors
					   - bio->bi_sector);
			r10_bio->sectors = max_sectors;
			spin_lock_irq(&conf->device_lock);
			if (bio->bi_phys_segments == 0)
				bio->bi_phys_segments = 2;
			else
				bio->bi_phys_segments++;
			spin_unlock_irq(&conf->device_lock);
			/* Cannot call generic_make_request directly
			 * as that will be queued in __generic_make_request
			 * and subsequent mempool_alloc might block
			 * waiting for it.  So hand the bio over to raid10d.
			 */
			reschedule_retry(r10_bio);

			r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

			r10_bio->master_bio = bio;
			r10_bio->sectors = bio_sectors(bio) - sectors_handled;
			r10_bio->state = 0;
			r10_bio->mddev = mddev;
			r10_bio->sector = bio->bi_sector + sectors_handled;
			goto read_again;
		} else
			generic_make_request(read_bio);
		return;
	}

	/*
	 * WRITE:
	 */
	if (conf->pending_count >= max_queued_requests) {
		md_wakeup_thread(mddev->thread);
		wait_event(conf->wait_barrier,
			   conf->pending_count < max_queued_requests);
	}
	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio.
	 * If there are known/acknowledged bad blocks on any device
	 * on which we have seen a write error, we want to avoid
	 * writing to those blocks.  This potentially requires several
	 * writes to write around the bad blocks.  Each set of writes
	 * gets its own r10_bio with a set of bios attached.  The number
	 * of r10_bios is recorded in bio->bi_phys_segments just as with
	 * the read case.
	 */

	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
	raid10_find_phys(conf, r10_bio);
retry_write:
	blocked_rdev = NULL;
	rcu_read_lock();
	max_sectors = r10_bio->sectors;

	for (i = 0;  i < conf->copies; i++) {
		int d = r10_bio->devs[i].devnum;
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
		struct md_rdev *rrdev = rcu_dereference(
			conf->mirrors[d].replacement);
		if (rdev == rrdev)
			rrdev = NULL;
		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
			atomic_inc(&rdev->nr_pending);
			blocked_rdev = rdev;
			break;
		}
		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
			atomic_inc(&rrdev->nr_pending);
			blocked_rdev = rrdev;
			break;
		}
		if (rdev && (test_bit(Faulty, &rdev->flags)
			     || test_bit(Unmerged, &rdev->flags)))
			rdev = NULL;
		if (rrdev && (test_bit(Faulty, &rrdev->flags)
			      || test_bit(Unmerged, &rrdev->flags)))
			rrdev = NULL;

		r10_bio->devs[i].bio = NULL;
		r10_bio->devs[i].repl_bio = NULL;

		if (!rdev && !rrdev) {
			set_bit(R10BIO_Degraded, &r10_bio->state);
			continue;
		}
		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			sector_t dev_sector = r10_bio->devs[i].addr;
			int bad_sectors;
			int is_bad;

			is_bad = is_badblock(rdev, dev_sector,
					     max_sectors,
					     &first_bad, &bad_sectors);
			if (is_bad < 0) {
				/* Mustn't write here until the bad block
				 * is acknowledged
				 */
				atomic_inc(&rdev->nr_pending);
				set_bit(BlockedBadBlocks, &rdev->flags);
				blocked_rdev = rdev;
				break;
			}
			if (is_bad && first_bad <= dev_sector) {
				/* Cannot write here at all */
				bad_sectors -= (dev_sector - first_bad);
				if (bad_sectors < max_sectors)
					/* Mustn't write more than bad_sectors
					 * to other devices yet
					 */
					max_sectors = bad_sectors;
				/* We don't set R10BIO_Degraded as that
				 * only applies if the disk is missing,
				 * so it might be re-added, and we want to
				 * know to recover this chunk.
				 * In this case the device is here, and the
				 * fact that this chunk is not in-sync is
				 * recorded in the bad block log.
				 */
				continue;
			}
			if (is_bad) {
				int good_sectors = first_bad - dev_sector;
				if (good_sectors < max_sectors)
					max_sectors = good_sectors;
			}
		}
		if (rdev) {
			r10_bio->devs[i].bio = bio;
			atomic_inc(&rdev->nr_pending);
		}
		if (rrdev) {
			r10_bio->devs[i].repl_bio = bio;
			atomic_inc(&rrdev->nr_pending);
		}
	}
	rcu_read_unlock();

	if (unlikely(blocked_rdev)) {
		/* Have to wait for this device to get unblocked, then retry */
		int j;
		int d;

		for (j = 0; j < i; j++) {
			if (r10_bio->devs[j].bio) {
				d = r10_bio->devs[j].devnum;
				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
			}
			if (r10_bio->devs[j].repl_bio) {
				struct md_rdev *rdev;
				d = r10_bio->devs[j].devnum;
				rdev = conf->mirrors[d].replacement;
				if (!rdev) {
					/* Race with remove_disk */
					smp_mb();
					rdev = conf->mirrors[d].rdev;
				}
				rdev_dec_pending(rdev, mddev);
			}
		}
		allow_barrier(conf);
		md_wait_for_blocked_rdev(blocked_rdev, mddev);
		wait_barrier(conf);
		goto retry_write;
	}

	if (max_sectors < r10_bio->sectors) {
		/* We are splitting this into multiple parts, so
		 * we need to prepare for allocating another r10_bio.
		 */
		r10_bio->sectors = max_sectors;
		spin_lock_irq(&conf->device_lock);
		if (bio->bi_phys_segments == 0)
			bio->bi_phys_segments = 2;
		else
			bio->bi_phys_segments++;
		spin_unlock_irq(&conf->device_lock);
	}
	sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;

	atomic_set(&r10_bio->remaining, 1);
	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);

	for (i = 0; i < conf->copies; i++) {
		struct bio *mbio;
		int d = r10_bio->devs[i].devnum;
		if (r10_bio->devs[i].bio) {
			struct md_rdev *rdev = conf->mirrors[d].rdev;
			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
				    max_sectors);
			r10_bio->devs[i].bio = mbio;

			mbio->bi_sector	= (r10_bio->devs[i].addr +
					   choose_data_offset(r10_bio,
							      rdev));
			mbio->bi_bdev = rdev->bdev;
			mbio->bi_end_io	= raid10_end_write_request;
			mbio->bi_rw =
				WRITE | do_sync | do_fua | do_discard | do_same;
			mbio->bi_private = r10_bio;

			atomic_inc(&r10_bio->remaining);

			cb = blk_check_plugged(raid10_unplug, mddev,
					       sizeof(*plug));
			if (cb)
				plug = container_of(cb, struct raid10_plug_cb,
						    cb);
			else
				plug = NULL;
			spin_lock_irqsave(&conf->device_lock, flags);
			if (plug) {
				bio_list_add(&plug->pending, mbio);
				plug->pending_cnt++;
			} else {
				bio_list_add(&conf->pending_bio_list, mbio);
				conf->pending_count++;
			}
			spin_unlock_irqrestore(&conf->device_lock, flags);
			if (!plug)
				md_wakeup_thread(mddev->thread);
		}

		if (r10_bio->devs[i].repl_bio) {
			struct md_rdev *rdev = conf->mirrors[d].replacement;
			if (rdev == NULL) {
				/* Replacement just got moved to main 'rdev' */
				smp_mb();
				rdev = conf->mirrors[d].rdev;
			}
			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
				    max_sectors);
			r10_bio->devs[i].repl_bio = mbio;

			mbio->bi_sector	= (r10_bio->devs[i].addr +
					   choose_data_offset(
						   r10_bio, rdev));
			mbio->bi_bdev = rdev->bdev;
			mbio->bi_end_io	= raid10_end_write_request;
			mbio->bi_rw =
				WRITE | do_sync | do_fua | do_discard | do_same;
			mbio->bi_private = r10_bio;

			atomic_inc(&r10_bio->remaining);
			spin_lock_irqsave(&conf->device_lock, flags);
			bio_list_add(&conf->pending_bio_list, mbio);
			conf->pending_count++;
			spin_unlock_irqrestore(&conf->device_lock, flags);
			if (!mddev_check_plugged(mddev))
				md_wakeup_thread(mddev->thread);
		}
	}

	/* Don't remove the bias on 'remaining' (one_write_done) until
	 * after checking if we need to go around again.
	 */

	if (sectors_handled < bio_sectors(bio)) {
		one_write_done(r10_bio);
		/* We need another r10_bio.  It has already been counted
		 * in bio->bi_phys_segments.
		 */
		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

		r10_bio->master_bio = bio;
		r10_bio->sectors = bio_sectors(bio) - sectors_handled;

		r10_bio->mddev = mddev;
		r10_bio->sector = bio->bi_sector + sectors_handled;
		r10_bio->state = 0;
		goto retry_write;
	}
	one_write_done(r10_bio);

	/* In case raid10d snuck in to freeze_array */
	wake_up(&conf->wait_barrier);
}

static void status(struct seq_file *seq, struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;
	int i;

	if (conf->geo.near_copies < conf->geo.raid_disks)
		seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
	if (conf->geo.near_copies > 1)
		seq_printf(seq, " %d near-copies", conf->geo.near_copies);
	if (conf->geo.far_copies > 1) {
		if (conf->geo.far_offset)
			seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
		else
			seq_printf(seq, " %d far-copies", conf->geo.far_copies);
	}
	seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
					conf->geo.raid_disks - mddev->degraded);
	for (i = 0; i < conf->geo.raid_disks; i++)
		seq_printf(seq, "%s",
			      conf->mirrors[i].rdev &&
			      test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
	seq_printf(seq, "]");
}

/* Check if there are enough drives for every block to appear
 * on at least one.  We walk each group of 'copies' consecutive
 * devices (stepping by near_copies) and require at least one
 * working device, other than 'ignore', in each group.
 */
static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
{
	int first = 0;

	do {
		int n = conf->copies;
		int cnt = 0;
		int this = first;
		while (n--) {
			if (conf->mirrors[this].rdev &&
			    this != ignore)
				cnt++;
			this = (this+1) % geo->raid_disks;
		}
		if (cnt == 0)
			return 0;
		first = (first + geo->near_copies) % geo->raid_disks;
	} while (first != 0);
	return 1;
}

static int enough(struct r10conf *conf, int ignore)
{
	return _enough(conf, &conf->geo, ignore) &&
		_enough(conf, &conf->prev, ignore);
}
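
/*
 * Example (added for illustration; not in the original source): with
 * raid_disks = 4 and near_copies = 2, the copy sets are {0,1} and
 * {2,3}.  Losing disks 0 and 2 still leaves one copy of every block,
 * so enough() returns 1; losing both 0 and 1 loses a whole set and
 * enough() returns 0.
 */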

static void error(struct mddev *mddev, struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	struct r10conf *conf = mddev->private;

	/*
	 * If it is not operational, then we have already marked it as dead;
	 * else if it is the last working disk, ignore the error and let the
	 * next level up know;
	 * else mark the drive as failed.
	 */
	if (test_bit(In_sync, &rdev->flags)
	    && !enough(conf, rdev->raid_disk))
		/*
		 * Don't fail the drive, just return an IO error.
		 */
		return;
	if (test_and_clear_bit(In_sync, &rdev->flags)) {
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded++;
		spin_unlock_irqrestore(&conf->device_lock, flags);
		/*
		 * if recovery is running, make sure it aborts.
		 */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	}
	set_bit(Blocked, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	printk(KERN_ALERT
	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
	       "md/raid10:%s: Operation continuing on %d devices.\n",
	       mdname(mddev), bdevname(rdev->bdev, b),
	       mdname(mddev), conf->geo.raid_disks - mddev->degraded);
}

static void print_conf(struct r10conf *conf)
{
	int i;
	struct raid10_info *tmp;

	printk(KERN_DEBUG "RAID10 conf printout:\n");
	if (!conf) {
		printk(KERN_DEBUG "(!conf)\n");
		return;
	}
	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
	       conf->geo.raid_disks);

	for (i = 0; i < conf->geo.raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->mirrors + i;
		if (tmp->rdev)
			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
			       i, !test_bit(In_sync, &tmp->rdev->flags),
			       !test_bit(Faulty, &tmp->rdev->flags),
			       bdevname(tmp->rdev->bdev,b));
	}
}

static void close_sync(struct r10conf *conf)
{
	wait_barrier(conf);
	allow_barrier(conf);

	mempool_destroy(conf->r10buf_pool);
	conf->r10buf_pool = NULL;
}

static int raid10_spare_active(struct mddev *mddev)
{
	int i;
	struct r10conf *conf = mddev->private;
	struct raid10_info *tmp;
	int count = 0;
	unsigned long flags;

	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync.
	 */
	for (i = 0; i < conf->geo.raid_disks; i++) {
		tmp = conf->mirrors + i;
		if (tmp->replacement
		    && tmp->replacement->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			}
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
			   && !test_bit(Faulty, &tmp->rdev->flags)
			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			count++;
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
		}
	}
	spin_lock_irqsave(&conf->device_lock, flags);
	mddev->degraded -= count;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	print_conf(conf);
	return count;
}

static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = -EEXIST;
	int mirror;
	int first = 0;
	int last = conf->geo.raid_disks - 1;
	struct request_queue *q = bdev_get_queue(rdev->bdev);

	if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
		return -EBUSY;
	if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
		return -EINVAL;

	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	if (q->merge_bvec_fn) {
		set_bit(Unmerged, &rdev->flags);
		mddev->merge_check_needed = 1;
	}

	if (rdev->saved_raid_disk >= first &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		mirror = rdev->saved_raid_disk;
	else
		mirror = first;
	for ( ; mirror <= last ; mirror++) {
		struct raid10_info *p = &conf->mirrors[mirror];
		if (p->recovery_disabled == mddev->recovery_disabled)
			continue;
		if (p->rdev) {
			if (!test_bit(WantReplacement, &p->rdev->flags) ||
			    p->replacement != NULL)
				continue;
			clear_bit(In_sync, &rdev->flags);
			set_bit(Replacement, &rdev->flags);
			rdev->raid_disk = mirror;
			err = 0;
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->data_offset << 9);
			conf->fullsync = 1;
			rcu_assign_pointer(p->replacement, rdev);
			break;
		}

		disk_stack_limits(mddev->gendisk, rdev->bdev,
				  rdev->data_offset << 9);

		p->head_position = 0;
		p->recovery_disabled = mddev->recovery_disabled - 1;
		rdev->raid_disk = mirror;
		err = 0;
		if (rdev->saved_raid_disk != mirror)
			conf->fullsync = 1;
		rcu_assign_pointer(p->rdev, rdev);
		break;
	}
	if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
		/* Some requests might not have seen this new
		 * merge_bvec_fn.  We must wait for them to complete
		 * before merging the device fully.
		 * First we make sure any code which has tested
		 * our function has submitted the request, then
		 * we wait for all outstanding requests to complete.
		 */
		synchronize_sched();
		freeze_array(conf, 0);
		unfreeze_array(conf);
		clear_bit(Unmerged, &rdev->flags);
	}
	md_integrity_add_rdev(rdev, mddev);
	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);

	print_conf(conf);
	return err;
}

static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct raid10_info *p = conf->mirrors + number;

	print_conf(conf);
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/* Only remove faulty devices if recovery
	 * is not possible.
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != p->recovery_disabled &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->geo.raid_disks &&
	    enough(conf, -1)) {
		err = -EBUSY;
		goto abort;
	}
	*rdevp = NULL;
	synchronize_rcu();
	if (atomic_read(&rdev->nr_pending)) {
		/* lost the race, try later */
		err = -EBUSY;
		*rdevp = rdev;
		goto abort;
	} else if (p->replacement) {
		/* We must have just cleared 'rdev' */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither -- if they are careful.
			   */
		p->replacement = NULL;
		clear_bit(WantReplacement, &rdev->flags);
	} else
		/* We might have just removed the Replacement as faulty -
		 * clear the bit just in case.
		 */
		clear_bit(WantReplacement, &rdev->flags);

	err = md_integrity_register(mddev);

abort:

	print_conf(conf);
	return err;
}

static void end_sync_read(struct bio *bio, int error)
{
	struct r10bio *r10_bio = bio->bi_private;
	struct r10conf *conf = r10_bio->mddev->private;
	int d;

	if (bio == r10_bio->master_bio) {
		/* this is a reshape read */
		d = r10_bio->read_slot; /* really the read dev */
	} else
		d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);

	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	else
		/* The write handler will notice the lack of
		 * R10BIO_Uptodate and record any errors etc
		 */
		atomic_add(r10_bio->sectors,
			   &conf->mirrors[d].rdev->corrected_errors);

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
	    atomic_dec_and_test(&r10_bio->remaining)) {
		/* we have read all the blocks,
		 * do the comparison in process context in raid10d
		 */
		reschedule_retry(r10_bio);
	}
}

static void end_sync_request(struct r10bio *r10_bio)
{
	struct mddev *mddev = r10_bio->mddev;

	while (atomic_dec_and_test(&r10_bio->remaining)) {
		if (r10_bio->master_bio == NULL) {
			/* the primary of several recovery bios */
			sector_t s = r10_bio->sectors;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			md_done_sync(mddev, s, 1);
			break;
		} else {
			struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			r10_bio = r10_bio2;
		}
	}
}

static void end_sync_write(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct r10bio *r10_bio = bio->bi_private;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	sector_t first_bad;
	int bad_sectors;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	else
		rdev = conf->mirrors[d].rdev;

	if (!uptodate) {
		if (repl)
			md_error(mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
			set_bit(R10BIO_WriteError, &r10_bio->state);
		}
	} else if (is_badblock(rdev,
			     r10_bio->devs[slot].addr,
			     r10_bio->sectors,
			     &first_bad, &bad_sectors))
		set_bit(R10BIO_MadeGood, &r10_bio->state);

	rdev_dec_pending(rdev, mddev);

	end_sync_request(r10_bio);
}

/*
 * Note: sync and recover are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and compare all the
 * copies; the first readable copy is authoritative.  Any copy that
 * differs (or could not be read) is rewritten from it.
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int i, first;
	struct bio *tbio, *fbio;
	int vcnt;

	atomic_set(&r10_bio->remaining, 1);

	/* find the first device with a block */
	for (i=0; i<conf->copies; i++)
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
			break;

	if (i == conf->copies)
		goto done;

	first = i;
	fbio = r10_bio->devs[i].bio;

	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);

	/* now find blocks with errors */
	for (i=0 ; i < conf->copies ; i++) {
		int j, d;

		tbio = r10_bio->devs[i].bio;

		if (tbio->bi_end_io != end_sync_read)
			continue;
		if (i == first)
			continue;
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
			/* We know that the bi_io_vec layout is the same for
			 * both 'first' and 'i', so we just compare them.
			 * All vec entries are PAGE_SIZE.
			 */
			for (j = 0; j < vcnt; j++)
				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
					   page_address(tbio->bi_io_vec[j].bv_page),
					   fbio->bi_io_vec[j].bv_len))
					break;
			if (j == vcnt)
				continue;
			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
				/* Don't fix anything. */
				continue;
		}
		/* Ok, we need to write this bio, either to correct an
		 * inconsistency or to correct an unreadable block.
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted these.
		 */
		bio_reset(tbio);

		tbio->bi_vcnt = vcnt;
		tbio->bi_size = r10_bio->sectors << 9;
		tbio->bi_rw = WRITE;
		tbio->bi_private = r10_bio;
		tbio->bi_sector = r10_bio->devs[i].addr;

		for (j=0; j < vcnt ; j++) {
			tbio->bi_io_vec[j].bv_offset = 0;
			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;

			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
			       page_address(fbio->bi_io_vec[j].bv_page),
			       PAGE_SIZE);
		}
		tbio->bi_end_io = end_sync_write;

		d = r10_bio->devs[i].devnum;
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));

		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		generic_make_request(tbio);
	}

	/* Now write out to any replacement devices
	 * that are active.
	 */
	for (i = 0; i < conf->copies; i++) {
		int j, d;

		tbio = r10_bio->devs[i].repl_bio;
		if (!tbio || !tbio->bi_end_io)
			continue;
		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
		    && r10_bio->devs[i].bio != fbio)
			for (j = 0; j < vcnt; j++)
				memcpy(page_address(tbio->bi_io_vec[j].bv_page),
				       page_address(fbio->bi_io_vec[j].bv_page),
				       PAGE_SIZE);
		d = r10_bio->devs[i].devnum;
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     bio_sectors(tbio));
		generic_make_request(tbio);
	}

done:
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 *
 * We recover all non-in_sync drives by reading from a working copy
 * of the same data and writing it to the device being recovered.
 */
static void fix_recovery_read_error(struct r10bio *r10_bio)
{
	/* We got a read error during recovery.
	 * We repeat the read in smaller page-sized sections.
	 * If a read succeeds, write it to the new device or record
	 * a bad block if we cannot.
	 * If a read fails, record a bad block on both old and
	 * new devices.
	 */
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	struct bio *bio = r10_bio->devs[0].bio;
	sector_t sect = 0;
	int sectors = r10_bio->sectors;
	int idx = 0;
	int dr = r10_bio->devs[0].devnum;
	int dw = r10_bio->devs[1].devnum;

	while (sectors) {
		int s = sectors;
		struct md_rdev *rdev;
		sector_t addr;
		int ok;

		if (s > (PAGE_SIZE>>9))
			s = PAGE_SIZE >> 9;

		rdev = conf->mirrors[dr].rdev;
		addr = r10_bio->devs[0].addr + sect;
		ok = sync_page_io(rdev,
				  addr,
				  s << 9,
				  bio->bi_io_vec[idx].bv_page,
				  READ, false);
		if (ok) {
			rdev = conf->mirrors[dw].rdev;
			addr = r10_bio->devs[1].addr + sect;
			ok = sync_page_io(rdev,
					  addr,
					  s << 9,
					  bio->bi_io_vec[idx].bv_page,
					  WRITE, false);
			if (!ok) {
				set_bit(WriteErrorSeen, &rdev->flags);
				if (!test_and_set_bit(WantReplacement,
						      &rdev->flags))
					set_bit(MD_RECOVERY_NEEDED,
						&rdev->mddev->recovery);
			}
		}
		if (!ok) {
			/* We don't worry if we cannot set a bad block -
			 * it really is bad so there is no loss in not
			 * recording it yet
			 */
			rdev_set_badblocks(rdev, addr, s, 0);

			if (rdev != conf->mirrors[dw].rdev) {
				/* need bad block on destination too */
				struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
				addr = r10_bio->devs[1].addr + sect;
				ok = rdev_set_badblocks(rdev2, addr, s, 0);
				if (!ok) {
					/* just abort the recovery */
					printk(KERN_NOTICE
					       "md/raid10:%s: recovery aborted"
					       " due to read error\n",
					       mdname(mddev));

					conf->mirrors[dw].recovery_disabled
						= mddev->recovery_disabled;
					set_bit(MD_RECOVERY_INTR,
						&mddev->recovery);
					break;
				}
			}
		}

		sectors -= s;
		sect += s;
		idx++;
	}
}

static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int d;
	struct bio *wbio, *wbio2;

	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
		fix_recovery_read_error(r10_bio);
		end_sync_request(r10_bio);
		return;
	}

	/*
	 * share the pages with the first bio
	 * and submit the write request
	 */
	d = r10_bio->devs[1].devnum;
	wbio = r10_bio->devs[1].bio;
	wbio2 = r10_bio->devs[1].repl_bio;
	if (wbio->bi_end_io) {
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
		generic_make_request(wbio);
	}
	if (wbio2 && wbio2->bi_end_io) {
		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     bio_sectors(wbio2));
		generic_make_request(wbio2);
	}
}

/*
 * Used by fix_read_error() to decay the per-rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
	struct timespec cur_time_mon;
	unsigned long hours_since_last;
	unsigned int read_errors = atomic_read(&rdev->read_errors);

	ktime_get_ts(&cur_time_mon);

	if (rdev->last_read_error.tv_sec == 0 &&
	    rdev->last_read_error.tv_nsec == 0) {
		/* first time we've seen a read error */
		rdev->last_read_error = cur_time_mon;
		return;
	}

	hours_since_last = (cur_time_mon.tv_sec -
			    rdev->last_read_error.tv_sec) / 3600;

	rdev->last_read_error = cur_time_mon;

	/*
	 * if hours_since_last is > the number of bits in read_errors
	 * just set read errors to 0. We do this to avoid
	 * overflowing the shift of read_errors by hours_since_last.
	 */
	if (hours_since_last >= 8 * sizeof(read_errors))
		atomic_set(&rdev->read_errors, 0);
	else
		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
}
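
/*
 * Example of the decay (added for illustration; not in the original
 * source): a device with read_errors == 40 that last erred 3 hours
 * ago is reset to 40 >> 3 == 5 before the new error is counted, so
 * only bursts of errors in a short window trip the threshold.
 */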

static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
			    int sectors, struct page *page, int rw)
{
	sector_t first_bad;
	int bad_sectors;

	if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
	    && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
		return -1;
	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
		/* success */
		return 1;
	if (rw == WRITE) {
		set_bit(WriteErrorSeen, &rdev->flags);
		if (!test_and_set_bit(WantReplacement, &rdev->flags))
			set_bit(MD_RECOVERY_NEEDED,
				&rdev->mddev->recovery);
	}
	/* need to record an error - either for the block or the device */
	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
		md_error(rdev->mddev, rdev);
	return 0;
}
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2349{
2350 int sect = 0;
2351 int sectors = r10_bio->sectors;
2352 struct md_rdev*rdev;
2353 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2354 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2355
2356
2357
2358
2359 rdev = conf->mirrors[d].rdev;
2360
2361 if (test_bit(Faulty, &rdev->flags))
2362
2363
2364 return;
2365
2366 check_decay_read_errors(mddev, rdev);
2367 atomic_inc(&rdev->read_errors);
2368 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2369 char b[BDEVNAME_SIZE];
2370 bdevname(rdev->bdev, b);
2371
2372 printk(KERN_NOTICE
2373 "md/raid10:%s: %s: Raid device exceeded "
2374 "read_error threshold [cur %d:max %d]\n",
2375 mdname(mddev), b,
2376 atomic_read(&rdev->read_errors), max_read_errors);
2377 printk(KERN_NOTICE
2378 "md/raid10:%s: %s: Failing raid device\n",
2379 mdname(mddev), b);
2380 md_error(mddev, conf->mirrors[d].rdev);
2381 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2382 return;
2383 }
2384
2385 while(sectors) {
2386 int s = sectors;
2387 int sl = r10_bio->read_slot;
2388 int success = 0;
2389 int start;
2390
2391 if (s > (PAGE_SIZE>>9))
2392 s = PAGE_SIZE >> 9;
2393
2394 rcu_read_lock();
2395 do {
2396 sector_t first_bad;
2397 int bad_sectors;
2398
2399 d = r10_bio->devs[sl].devnum;
2400 rdev = rcu_dereference(conf->mirrors[d].rdev);
2401 if (rdev &&
2402 !test_bit(Unmerged, &rdev->flags) &&
2403 test_bit(In_sync, &rdev->flags) &&
2404 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2405 &first_bad, &bad_sectors) == 0) {
2406 atomic_inc(&rdev->nr_pending);
2407 rcu_read_unlock();
2408 success = sync_page_io(rdev,
2409 r10_bio->devs[sl].addr +
2410 sect,
2411 s<<9,
2412 conf->tmppage, READ, false);
2413 rdev_dec_pending(rdev, mddev);
2414 rcu_read_lock();
2415 if (success)
2416 break;
2417 }
2418 sl++;
2419 if (sl == conf->copies)
2420 sl = 0;
2421 } while (!success && sl != r10_bio->read_slot);
2422 rcu_read_unlock();
2423
2424 if (!success) {
2425
2426
2427
2428
2429 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2430 rdev = conf->mirrors[dn].rdev;
2431
2432 if (!rdev_set_badblocks(
2433 rdev,
2434 r10_bio->devs[r10_bio->read_slot].addr
2435 + sect,
2436 s, 0)) {
2437 md_error(mddev, rdev);
2438 r10_bio->devs[r10_bio->read_slot].bio
2439 = IO_BLOCKED;
2440 }
2441 break;
2442 }
2443
2444 start = sl;
2445
2446 rcu_read_lock();
2447 while (sl != r10_bio->read_slot) {
2448 char b[BDEVNAME_SIZE];
2449
2450 if (sl==0)
2451 sl = conf->copies;
2452 sl--;
2453 d = r10_bio->devs[sl].devnum;
2454 rdev = rcu_dereference(conf->mirrors[d].rdev);
2455 if (!rdev ||
2456 test_bit(Unmerged, &rdev->flags) ||
2457 !test_bit(In_sync, &rdev->flags))
2458 continue;
2459
2460 atomic_inc(&rdev->nr_pending);
2461 rcu_read_unlock();
2462 if (r10_sync_page_io(rdev,
2463 r10_bio->devs[sl].addr +
2464 sect,
2465 s, conf->tmppage, WRITE)
2466 == 0) {
2467
2468 printk(KERN_NOTICE
2469 "md/raid10:%s: read correction "
2470 "write failed"
2471 " (%d sectors at %llu on %s)\n",
2472 mdname(mddev), s,
2473 (unsigned long long)(
2474 sect +
2475 choose_data_offset(r10_bio,
2476 rdev)),
2477 bdevname(rdev->bdev, b));
2478 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2479 "drive\n",
2480 mdname(mddev),
2481 bdevname(rdev->bdev, b));
2482 }
2483 rdev_dec_pending(rdev, mddev);
2484 rcu_read_lock();
2485 }
2486 sl = start;
2487 while (sl != r10_bio->read_slot) {
2488 char b[BDEVNAME_SIZE];
2489
2490 if (sl==0)
2491 sl = conf->copies;
2492 sl--;
2493 d = r10_bio->devs[sl].devnum;
2494 rdev = rcu_dereference(conf->mirrors[d].rdev);
2495 if (!rdev ||
2496 !test_bit(In_sync, &rdev->flags))
2497 continue;
2498
2499 atomic_inc(&rdev->nr_pending);
2500 rcu_read_unlock();
2501 switch (r10_sync_page_io(rdev,
2502 r10_bio->devs[sl].addr +
2503 sect,
2504 s, conf->tmppage,
2505 READ)) {
2506 case 0:
				/* Well, this device is dead */
2508 printk(KERN_NOTICE
2509 "md/raid10:%s: unable to read back "
2510 "corrected sectors"
2511 " (%d sectors at %llu on %s)\n",
2512 mdname(mddev), s,
2513 (unsigned long long)(
2514 sect +
2515 choose_data_offset(r10_bio, rdev)),
2516 bdevname(rdev->bdev, b));
2517 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2518 "drive\n",
2519 mdname(mddev),
2520 bdevname(rdev->bdev, b));
2521 break;
2522 case 1:
2523 printk(KERN_INFO
2524 "md/raid10:%s: read error corrected"
2525 " (%d sectors at %llu on %s)\n",
2526 mdname(mddev), s,
2527 (unsigned long long)(
2528 sect +
2529 choose_data_offset(r10_bio, rdev)),
2530 bdevname(rdev->bdev, b));
2531 atomic_add(s, &rdev->corrected_errors);
2532 }
2533
2534 rdev_dec_pending(rdev, mddev);
2535 rcu_read_lock();
2536 }
2537 rcu_read_unlock();
2538
2539 sectors -= s;
2540 sect += s;
2541 }
2542}
2543
2544static int narrow_write_error(struct r10bio *r10_bio, int i)
2545{
2546 struct bio *bio = r10_bio->master_bio;
2547 struct mddev *mddev = r10_bio->mddev;
2548 struct r10conf *conf = mddev->private;
2549 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
	/* bio has the data to be written to slot 'i' where
	 * we just recently had a write error.
	 * We repeatedly clone the bio and trim down to one block,
	 * then try the write.  Where the write fails we record
	 * a bad block.
	 * It is conceivable that the bio doesn't exactly align with
	 * blocks we are capable of recording a badblock for, so
	 * we need to trim blocks off the start and end with a safe
	 * size, then we need to do it again.
	 */

2561 int block_sectors;
2562 sector_t sector;
2563 int sectors;
2564 int sect_to_write = r10_bio->sectors;
2565 int ok = 1;
2566
2567 if (rdev->badblocks.shift < 0)
2568 return 0;
2569
2570 block_sectors = 1 << rdev->badblocks.shift;
2571 sector = r10_bio->sector;
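	/* the first piece may be short: it runs only up to the next
	 * block_sectors boundary */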
2572 sectors = ((r10_bio->sector + block_sectors)
2573 & ~(sector_t)(block_sectors - 1))
2574 - sector;
2575
2576 while (sect_to_write) {
2577 struct bio *wbio;
2578 if (sectors > sect_to_write)
2579 sectors = sect_to_write;
2580
2581 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2582 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2583 wbio->bi_sector = (r10_bio->devs[i].addr+
2584 choose_data_offset(r10_bio, rdev) +
2585 (sector - r10_bio->sector));
2586 wbio->bi_bdev = rdev->bdev;
		if (submit_bio_wait(WRITE, wbio) == 0)
			/* Failure! */
			ok = rdev_set_badblocks(rdev, sector,
						sectors, 0)
				&& ok;
2592
2593 bio_put(wbio);
2594 sect_to_write -= sectors;
2595 sector += sectors;
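		/* remaining pieces are whole badblock-shift units */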
2596 sectors = block_sectors;
2597 }
2598 return ok;
2599}
2600
2601static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2602{
2603 int slot = r10_bio->read_slot;
2604 struct bio *bio;
2605 struct r10conf *conf = mddev->private;
2606 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2607 char b[BDEVNAME_SIZE];
2608 unsigned long do_sync;
2609 int max_sectors;

	/* we got a read error. Maybe the drive is bad.  Maybe just
	 * the block and we can fix it.
	 * We freeze all other IO, and try reading the block from
	 * other devices.  When we find one, we re-write
	 * and check it that fixes the read error.
	 * This is all done synchronously while the array is
	 * frozen.
	 */
2619 bio = r10_bio->devs[slot].bio;
2620 bdevname(bio->bi_bdev, b);
2621 bio_put(bio);
2622 r10_bio->devs[slot].bio = NULL;
2623
2624 if (mddev->ro == 0) {
2625 freeze_array(conf, 1);
2626 fix_read_error(conf, mddev, r10_bio);
2627 unfreeze_array(conf);
2628 } else
2629 r10_bio->devs[slot].bio = IO_BLOCKED;
2630
2631 rdev_dec_pending(rdev, mddev);
2632
2633read_more:
2634 rdev = read_balance(conf, r10_bio, &max_sectors);
2635 if (rdev == NULL) {
2636 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2637 " read error for block %llu\n",
2638 mdname(mddev), b,
2639 (unsigned long long)r10_bio->sector);
2640 raid_end_bio_io(r10_bio);
2641 return;
2642 }
2643
2644 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2645 slot = r10_bio->read_slot;
2646 printk_ratelimited(
2647 KERN_ERR
2648 "md/raid10:%s: %s: redirecting "
2649 "sector %llu to another mirror\n",
2650 mdname(mddev),
2651 bdevname(rdev->bdev, b),
2652 (unsigned long long)r10_bio->sector);
2653 bio = bio_clone_mddev(r10_bio->master_bio,
2654 GFP_NOIO, mddev);
2655 md_trim_bio(bio,
2656 r10_bio->sector - bio->bi_sector,
2657 max_sectors);
2658 r10_bio->devs[slot].bio = bio;
2659 r10_bio->devs[slot].rdev = rdev;
2660 bio->bi_sector = r10_bio->devs[slot].addr
2661 + choose_data_offset(r10_bio, rdev);
2662 bio->bi_bdev = rdev->bdev;
2663 bio->bi_rw = READ | do_sync;
2664 bio->bi_private = r10_bio;
2665 bio->bi_end_io = raid10_end_read_request;
2666 if (max_sectors < r10_bio->sectors) {
		/* Drat - have to split this up more */
2668 struct bio *mbio = r10_bio->master_bio;
2669 int sectors_handled =
2670 r10_bio->sector + max_sectors
2671 - mbio->bi_sector;
2672 r10_bio->sectors = max_sectors;
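		/* bi_phys_segments is (ab)used here to count the r10_bios
		 * still outstanding against the master bio.
		 */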
2673 spin_lock_irq(&conf->device_lock);
2674 if (mbio->bi_phys_segments == 0)
2675 mbio->bi_phys_segments = 2;
2676 else
2677 mbio->bi_phys_segments++;
2678 spin_unlock_irq(&conf->device_lock);
2679 generic_make_request(bio);
2680
2681 r10_bio = mempool_alloc(conf->r10bio_pool,
2682 GFP_NOIO);
2683 r10_bio->master_bio = mbio;
2684 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2685 r10_bio->state = 0;
2686 set_bit(R10BIO_ReadError,
2687 &r10_bio->state);
2688 r10_bio->mddev = mddev;
2689 r10_bio->sector = mbio->bi_sector
2690 + sectors_handled;
2691
2692 goto read_more;
2693 } else
2694 generic_make_request(bio);
2695}
2696
2697static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2698{
	/* Some sort of write request has finished and it
	 * succeeded in writing where we thought there was a
	 * bad block.  So forget the bad block.
	 * Or possibly if it failed and we need to record
	 * a bad block.
	 */
2705 int m;
2706 struct md_rdev *rdev;
2707
2708 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2709 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2710 for (m = 0; m < conf->copies; m++) {
2711 int dev = r10_bio->devs[m].devnum;
2712 rdev = conf->mirrors[dev].rdev;
2713 if (r10_bio->devs[m].bio == NULL)
2714 continue;
2715 if (test_bit(BIO_UPTODATE,
2716 &r10_bio->devs[m].bio->bi_flags)) {
2717 rdev_clear_badblocks(
2718 rdev,
2719 r10_bio->devs[m].addr,
2720 r10_bio->sectors, 0);
2721 } else {
2722 if (!rdev_set_badblocks(
2723 rdev,
2724 r10_bio->devs[m].addr,
2725 r10_bio->sectors, 0))
2726 md_error(conf->mddev, rdev);
2727 }
2728 rdev = conf->mirrors[dev].replacement;
2729 if (r10_bio->devs[m].repl_bio == NULL)
2730 continue;
2731 if (test_bit(BIO_UPTODATE,
2732 &r10_bio->devs[m].repl_bio->bi_flags)) {
2733 rdev_clear_badblocks(
2734 rdev,
2735 r10_bio->devs[m].addr,
2736 r10_bio->sectors, 0);
2737 } else {
2738 if (!rdev_set_badblocks(
2739 rdev,
2740 r10_bio->devs[m].addr,
2741 r10_bio->sectors, 0))
2742 md_error(conf->mddev, rdev);
2743 }
2744 }
2745 put_buf(r10_bio);
2746 } else {
2747 for (m = 0; m < conf->copies; m++) {
2748 int dev = r10_bio->devs[m].devnum;
2749 struct bio *bio = r10_bio->devs[m].bio;
2750 rdev = conf->mirrors[dev].rdev;
2751 if (bio == IO_MADE_GOOD) {
2752 rdev_clear_badblocks(
2753 rdev,
2754 r10_bio->devs[m].addr,
2755 r10_bio->sectors, 0);
2756 rdev_dec_pending(rdev, conf->mddev);
2757 } else if (bio != NULL &&
2758 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2759 if (!narrow_write_error(r10_bio, m)) {
2760 md_error(conf->mddev, rdev);
2761 set_bit(R10BIO_Degraded,
2762 &r10_bio->state);
2763 }
2764 rdev_dec_pending(rdev, conf->mddev);
2765 }
2766 bio = r10_bio->devs[m].repl_bio;
2767 rdev = conf->mirrors[dev].replacement;
2768 if (rdev && bio == IO_MADE_GOOD) {
2769 rdev_clear_badblocks(
2770 rdev,
2771 r10_bio->devs[m].addr,
2772 r10_bio->sectors, 0);
2773 rdev_dec_pending(rdev, conf->mddev);
2774 }
2775 }
2776 if (test_bit(R10BIO_WriteError,
2777 &r10_bio->state))
2778 close_write(r10_bio);
2779 raid_end_bio_io(r10_bio);
2780 }
2781}
2782
2783static void raid10d(struct md_thread *thread)
2784{
2785 struct mddev *mddev = thread->mddev;
2786 struct r10bio *r10_bio;
2787 unsigned long flags;
2788 struct r10conf *conf = mddev->private;
2789 struct list_head *head = &conf->retry_list;
2790 struct blk_plug plug;
2791
2792 md_check_recovery(mddev);
2793
2794 blk_start_plug(&plug);
2795 for (;;) {
2796
2797 flush_pending_writes(conf);
2798
2799 spin_lock_irqsave(&conf->device_lock, flags);
2800 if (list_empty(head)) {
2801 spin_unlock_irqrestore(&conf->device_lock, flags);
2802 break;
2803 }
2804 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2805 list_del(head->prev);
2806 conf->nr_queued--;
2807 spin_unlock_irqrestore(&conf->device_lock, flags);
2808
2809 mddev = r10_bio->mddev;
2810 conf = mddev->private;
2811 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2812 test_bit(R10BIO_WriteError, &r10_bio->state))
2813 handle_write_completed(conf, r10_bio);
2814 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2815 reshape_request_write(mddev, r10_bio);
2816 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2817 sync_request_write(mddev, r10_bio);
2818 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2819 recovery_request_write(mddev, r10_bio);
2820 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2821 handle_read_error(mddev, r10_bio);
2822 else {
			/* just a partial read to be scheduled from a
			 * separate context
			 */
2826 int slot = r10_bio->read_slot;
2827 generic_make_request(r10_bio->devs[slot].bio);
2828 }
2829
2830 cond_resched();
2831 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2832 md_check_recovery(mddev);
2833 }
2834 blk_finish_plug(&plug);
2835}
2836
2837
2838static int init_resync(struct r10conf *conf)
2839{
2840 int buffs;
2841 int i;
2842
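	/* RESYNC_WINDOW (1MB) / RESYNC_BLOCK_SIZE (64KB) = 16 buffers */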
2843 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2844 BUG_ON(conf->r10buf_pool);
2845 conf->have_replacement = 0;
2846 for (i = 0; i < conf->geo.raid_disks; i++)
2847 if (conf->mirrors[i].replacement)
2848 conf->have_replacement = 1;
2849 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2850 if (!conf->r10buf_pool)
2851 return -ENOMEM;
2852 conf->next_resync = 0;
2853 return 0;
2854}
2855

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
 * For recovery, we iterate over physical addresses, read a good
 * copy of each, and then write it to all active copies which are
 * not in-sync.
 * So, for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by
 * allocating a number of r10_bio structures, one for each out-of-sync
 * device.  As we set up these structures, we collect all bio's together
 * into a list which we then process collectively to add pages, and then
 * process again to pass to generic_make_request.
 *
 * The r10_bio structures are linked using a borrowed master_bio pointer.
 * This link is counted in ->remaining.  When the r10_bio that points to
 * NULL is completed, raid_end_bio_io() is called on it, which unwinds
 * the chain.
 */
2888static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2889 int *skipped, int go_faster)
2890{
2891 struct r10conf *conf = mddev->private;
2892 struct r10bio *r10_bio;
2893 struct bio *biolist = NULL, *bio;
2894 sector_t max_sector, nr_sectors;
2895 int i;
2896 int max_sync;
2897 sector_t sync_blocks;
2898 sector_t sectors_skipped = 0;
2899 int chunks_skipped = 0;
2900 sector_t chunk_mask = conf->geo.chunk_mask;
2901
2902 if (!conf->r10buf_pool)
2903 if (init_resync(conf))
2904 return 0;
2905
	/* Skip the sync entirely when the array is known to be clean:
	 * no bitmap, recovery_cp at the end, no explicitly requested
	 * sync, and no full sync pending.
	 */
2910 if (mddev->bitmap == NULL &&
2911 mddev->recovery_cp == MaxSector &&
2912 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2913 conf->fullsync == 0) {
2914 *skipped = 1;
2915 max_sector = mddev->dev_sectors;
2916 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2917 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2918 max_sector = mddev->resync_max_sectors;
2919 return max_sector - sector_nr;
2920 }
2921
2922 skipped:
2923 max_sector = mddev->dev_sectors;
2924 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2925 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2926 max_sector = mddev->resync_max_sectors;
2927 if (sector_nr >= max_sector) {
		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunks (there can
		 * be several when recovering multiple devices),
		 * as we may have started syncing it but not finished.
		 * We can find the current address in
		 * mddev->curr_resync, but for recovery,
		 * we need to convert that to several
		 * virtual addresses.
		 */
2937 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2938 end_reshape(conf);
2939 return 0;
2940 }
2941
2942 if (mddev->curr_resync < max_sector) {
2943 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2944 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2945 &sync_blocks, 1);
2946 else for (i = 0; i < conf->geo.raid_disks; i++) {
2947 sector_t sect =
2948 raid10_find_virt(conf, mddev->curr_resync, i);
2949 bitmap_end_sync(mddev->bitmap, sect,
2950 &sync_blocks, 1);
2951 }
2952 } else {
			/* completed sync */
2954 if ((!mddev->bitmap || conf->fullsync)
2955 && conf->have_replacement
2956 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
				/* Completed a full sync so the replacements
				 * are now fully recovered.
				 */
2960 for (i = 0; i < conf->geo.raid_disks; i++)
2961 if (conf->mirrors[i].replacement)
2962 conf->mirrors[i].replacement
2963 ->recovery_offset
2964 = MaxSector;
2965 }
2966 conf->fullsync = 0;
2967 }
2968 bitmap_close_sync(mddev->bitmap);
2969 close_sync(conf);
2970 *skipped = 1;
2971 return sectors_skipped;
2972 }
2973
2974 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2975 return reshape_request(mddev, sector_nr, skipped);
2976
2977 if (chunks_skipped >= conf->geo.raid_disks) {
		/* if there has been nothing to do on any drive,
		 * then there is nothing to do at all.
		 */
2981 *skipped = 1;
2982 return (max_sector - sector_nr) + sectors_skipped;
2983 }
2984
2985 if (max_sector > mddev->resync_max)
2986 max_sector = mddev->resync_max;
2987
	/* make sure whole request will fit in a chunk - if chunks
	 * are meaningful
	 */
2991 if (conf->geo.near_copies < conf->geo.raid_disks &&
2992 max_sector > (sector_nr | chunk_mask))
2993 max_sector = (sector_nr | chunk_mask) + 1;
2994
	/* If there is non-resync activity waiting for us then
	 * put in a delay to throttle resync.
	 */
2998 if (!go_faster && conf->nr_waiting)
2999 msleep_interruptible(1000);
3000

	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
	 * have bi_end_io, bi_sector, bi_bdev set,
	 * and bi_private set to the r10bio.
	 * For recovery, we may actually create several r10bios
	 * with 2 bios in each, that correspond to the bios in the main one.
	 * In this case, the subordinate r10bios link back through a
	 * borrowed master_bio pointer, and the counter in the master
	 * includes a ref from each subordinate.
	 */
	/* First, we decide what to do and set ->bi_end_io
	 * to end_sync_read if we want to read, and
	 * end_sync_write if we will want to write.
	 */
3016 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
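	/* i.e. at most one full resync buffer's worth of sectors per pass */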
3017 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* recovery... the complicated one */
3019 int j;
3020 r10_bio = NULL;
3021
3022 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3023 int still_degraded;
3024 struct r10bio *rb2;
3025 sector_t sect;
3026 int must_sync;
3027 int any_working;
3028 struct raid10_info *mirror = &conf->mirrors[i];
3029
3030 if ((mirror->rdev == NULL ||
3031 test_bit(In_sync, &mirror->rdev->flags))
3032 &&
3033 (mirror->replacement == NULL ||
3034 test_bit(Faulty,
3035 &mirror->replacement->flags)))
3036 continue;
3037
3038 still_degraded = 0;
			/* want to reconstruct this device */
3040 rb2 = r10_bio;
3041 sect = raid10_find_virt(conf, sector_nr, i);
3042 if (sect >= mddev->resync_max_sectors) {
				/* last stripe is not complete - don't
				 * try to recover this sector
				 */
3046 continue;
3047 }
3048
			/* Unless we are doing a full sync, or a replacement,
			 * we only need to recover the block if it is set in
			 * the bitmap
			 */
3052 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3053 &sync_blocks, 1);
3054 if (sync_blocks < max_sync)
3055 max_sync = sync_blocks;
3056 if (!must_sync &&
3057 mirror->replacement == NULL &&
3058 !conf->fullsync) {
				/* yep, skip the sync_blocks here, but don't assume
				 * that there will never be anything to do here
				 */
3062 chunks_skipped = -1;
3063 continue;
3064 }
3065
3066 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
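			/* rb2 != NULL means a previous recovery r10_bio from
			 * this pass is still outstanding; force the barrier
			 * rather than wait for pending IO to drain.
			 */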
3067 raise_barrier(conf, rb2 != NULL);
3068 atomic_set(&r10_bio->remaining, 0);
3069
3070 r10_bio->master_bio = (struct bio*)rb2;
3071 if (rb2)
3072 atomic_inc(&rb2->remaining);
3073 r10_bio->mddev = mddev;
3074 set_bit(R10BIO_IsRecover, &r10_bio->state);
3075 r10_bio->sector = sect;
3076
3077 raid10_find_phys(conf, r10_bio);
3078
			/* Need to check if the array will still be
			 * degraded
			 */
3082 for (j = 0; j < conf->geo.raid_disks; j++)
3083 if (conf->mirrors[j].rdev == NULL ||
3084 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3085 still_degraded = 1;
3086 break;
3087 }
3088
3089 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3090 &sync_blocks, still_degraded);
3091
3092 any_working = 0;
3093 for (j=0; j<conf->copies;j++) {
3094 int k;
3095 int d = r10_bio->devs[j].devnum;
3096 sector_t from_addr, to_addr;
3097 struct md_rdev *rdev;
3098 sector_t sector, first_bad;
3099 int bad_sectors;
3100 if (!conf->mirrors[d].rdev ||
3101 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3102 continue;
3103
3104 any_working = 1;
3105 rdev = conf->mirrors[d].rdev;
3106 sector = r10_bio->devs[j].addr;
3107
3108 if (is_badblock(rdev, sector, max_sync,
3109 &first_bad, &bad_sectors)) {
3110 if (first_bad > sector)
3111 max_sync = first_bad - sector;
3112 else {
3113 bad_sectors -= (sector
3114 - first_bad);
3115 if (max_sync > bad_sectors)
3116 max_sync = bad_sectors;
3117 continue;
3118 }
3119 }
3120 bio = r10_bio->devs[0].bio;
3121 bio_reset(bio);
3122 bio->bi_next = biolist;
3123 biolist = bio;
3124 bio->bi_private = r10_bio;
3125 bio->bi_end_io = end_sync_read;
3126 bio->bi_rw = READ;
3127 from_addr = r10_bio->devs[j].addr;
3128 bio->bi_sector = from_addr + rdev->data_offset;
3129 bio->bi_bdev = rdev->bdev;
3130 atomic_inc(&rdev->nr_pending);
				/* and we write to 'i' (if not in_sync) */

3133 for (k=0; k<conf->copies; k++)
3134 if (r10_bio->devs[k].devnum == i)
3135 break;
3136 BUG_ON(k == conf->copies);
3137 to_addr = r10_bio->devs[k].addr;
3138 r10_bio->devs[0].devnum = d;
3139 r10_bio->devs[0].addr = from_addr;
3140 r10_bio->devs[1].devnum = i;
3141 r10_bio->devs[1].addr = to_addr;
3142
3143 rdev = mirror->rdev;
3144 if (!test_bit(In_sync, &rdev->flags)) {
3145 bio = r10_bio->devs[1].bio;
3146 bio_reset(bio);
3147 bio->bi_next = biolist;
3148 biolist = bio;
3149 bio->bi_private = r10_bio;
3150 bio->bi_end_io = end_sync_write;
3151 bio->bi_rw = WRITE;
3152 bio->bi_sector = to_addr
3153 + rdev->data_offset;
3154 bio->bi_bdev = rdev->bdev;
3155 atomic_inc(&r10_bio->remaining);
3156 } else
3157 r10_bio->devs[1].bio->bi_end_io = NULL;
3158
			/* and maybe write to replacement */
3160 bio = r10_bio->devs[1].repl_bio;
3161 if (bio)
3162 bio->bi_end_io = NULL;
3163 rdev = mirror->replacement;
			/* Note: if rdev != NULL, then bio
			 * cannot be NULL as r10buf_pool_alloc will
			 * have allocated it.
			 * So the second test here is pointless.
			 * But it keeps semantic-checkers happy, and
			 * this comment keeps human reviewers
			 * honest.
			 */
3172 if (rdev == NULL || bio == NULL ||
3173 test_bit(Faulty, &rdev->flags))
3174 break;
3175 bio_reset(bio);
3176 bio->bi_next = biolist;
3177 biolist = bio;
3178 bio->bi_private = r10_bio;
3179 bio->bi_end_io = end_sync_write;
3180 bio->bi_rw = WRITE;
3181 bio->bi_sector = to_addr + rdev->data_offset;
3182 bio->bi_bdev = rdev->bdev;
3183 atomic_inc(&r10_bio->remaining);
3184 break;
3185 }
3186 if (j == conf->copies) {
				/* Cannot recover, so abort the recovery or
				 * record a bad block */
3189 put_buf(r10_bio);
3190 if (rb2)
3191 atomic_dec(&rb2->remaining);
3192 r10_bio = rb2;
3193 if (any_working) {
					/* problem is that there are bad blocks
					 * on other device(s)
					 */
3197 int k;
3198 for (k = 0; k < conf->copies; k++)
3199 if (r10_bio->devs[k].devnum == i)
3200 break;
3201 if (!test_bit(In_sync,
3202 &mirror->rdev->flags)
3203 && !rdev_set_badblocks(
3204 mirror->rdev,
3205 r10_bio->devs[k].addr,
3206 max_sync, 0))
3207 any_working = 0;
3208 if (mirror->replacement &&
3209 !rdev_set_badblocks(
3210 mirror->replacement,
3211 r10_bio->devs[k].addr,
3212 max_sync, 0))
3213 any_working = 0;
3214 }
3215 if (!any_working) {
3216 if (!test_and_set_bit(MD_RECOVERY_INTR,
3217 &mddev->recovery))
3218 printk(KERN_INFO "md/raid10:%s: insufficient "
3219 "working devices for recovery.\n",
3220 mdname(mddev));
3221 mirror->recovery_disabled
3222 = mddev->recovery_disabled;
3223 }
3224 break;
3225 }
3226 }
3227 if (biolist == NULL) {
3228 while (r10_bio) {
3229 struct r10bio *rb2 = r10_bio;
3230 r10_bio = (struct r10bio*) rb2->master_bio;
3231 rb2->master_bio = NULL;
3232 put_buf(rb2);
3233 }
3234 goto giveup;
3235 }
3236 } else {
		/* resync: schedule a read for every block at this virtual offset */
3238 int count = 0;
3239
3240 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3241
3242 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3243 &sync_blocks, mddev->degraded) &&
3244 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3245 &mddev->recovery)) {
			/* We can skip this block */
3247 *skipped = 1;
3248 return sync_blocks + sectors_skipped;
3249 }
3250 if (sync_blocks < max_sync)
3251 max_sync = sync_blocks;
3252 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3253
3254 r10_bio->mddev = mddev;
3255 atomic_set(&r10_bio->remaining, 0);
3256 raise_barrier(conf, 0);
3257 conf->next_resync = sector_nr;
3258
3259 r10_bio->master_bio = NULL;
3260 r10_bio->sector = sector_nr;
3261 set_bit(R10BIO_IsSync, &r10_bio->state);
3262 raid10_find_phys(conf, r10_bio);
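		/* provisionally sync up to the end of the current chunk */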
3263 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3264
3265 for (i = 0; i < conf->copies; i++) {
3266 int d = r10_bio->devs[i].devnum;
3267 sector_t first_bad, sector;
3268 int bad_sectors;
3269
3270 if (r10_bio->devs[i].repl_bio)
3271 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3272
3273 bio = r10_bio->devs[i].bio;
3274 bio_reset(bio);
3275 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3276 if (conf->mirrors[d].rdev == NULL ||
3277 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3278 continue;
3279 sector = r10_bio->devs[i].addr;
3280 if (is_badblock(conf->mirrors[d].rdev,
3281 sector, max_sync,
3282 &first_bad, &bad_sectors)) {
3283 if (first_bad > sector)
3284 max_sync = first_bad - sector;
3285 else {
3286 bad_sectors -= (sector - first_bad);
3287 if (max_sync > bad_sectors)
3288 max_sync = bad_sectors;
3289 continue;
3290 }
3291 }
3292 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3293 atomic_inc(&r10_bio->remaining);
3294 bio->bi_next = biolist;
3295 biolist = bio;
3296 bio->bi_private = r10_bio;
3297 bio->bi_end_io = end_sync_read;
3298 bio->bi_rw = READ;
3299 bio->bi_sector = sector +
3300 conf->mirrors[d].rdev->data_offset;
3301 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3302 count++;
3303
3304 if (conf->mirrors[d].replacement == NULL ||
3305 test_bit(Faulty,
3306 &conf->mirrors[d].replacement->flags))
3307 continue;
3308
			/* Need to set up for writing to the replacement */
3310 bio = r10_bio->devs[i].repl_bio;
3311 bio_reset(bio);
3312 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3313
3314 sector = r10_bio->devs[i].addr;
3315 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3316 bio->bi_next = biolist;
3317 biolist = bio;
3318 bio->bi_private = r10_bio;
3319 bio->bi_end_io = end_sync_write;
3320 bio->bi_rw = WRITE;
3321 bio->bi_sector = sector +
3322 conf->mirrors[d].replacement->data_offset;
3323 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3324 count++;
3325 }
3326
3327 if (count < 2) {
3328 for (i=0; i<conf->copies; i++) {
3329 int d = r10_bio->devs[i].devnum;
3330 if (r10_bio->devs[i].bio->bi_end_io)
3331 rdev_dec_pending(conf->mirrors[d].rdev,
3332 mddev);
3333 if (r10_bio->devs[i].repl_bio &&
3334 r10_bio->devs[i].repl_bio->bi_end_io)
3335 rdev_dec_pending(
3336 conf->mirrors[d].replacement,
3337 mddev);
3338 }
3339 put_buf(r10_bio);
3340 biolist = NULL;
3341 goto giveup;
3342 }
3343 }
3344
3345 nr_sectors = 0;
3346 if (sector_nr + max_sync < max_sector)
3347 max_sector = sector_nr + max_sync;
3348 do {
3349 struct page *page;
3350 int len = PAGE_SIZE;
3351 if (sector_nr + (len>>9) > max_sector)
3352 len = (max_sector - sector_nr) << 9;
3353 if (len == 0)
3354 break;
3355 for (bio= biolist ; bio ; bio=bio->bi_next) {
3356 struct bio *bio2;
3357 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3358 if (bio_add_page(bio, page, len, 0))
3359 continue;
3360
			/* stop here */
3362 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3363 for (bio2 = biolist;
3364 bio2 && bio2 != bio;
3365 bio2 = bio2->bi_next) {
				/* remove last page from this bio */
3367 bio2->bi_vcnt--;
3368 bio2->bi_size -= len;
3369 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3370 }
3371 goto bio_full;
3372 }
3373 nr_sectors += len>>9;
3374 sector_nr += len>>9;
3375 } while (biolist->bi_vcnt < RESYNC_PAGES);
3376 bio_full:
3377 r10_bio->sectors = nr_sectors;
3378
3379 while (biolist) {
3380 bio = biolist;
3381 biolist = biolist->bi_next;
3382
3383 bio->bi_next = NULL;
3384 r10_bio = bio->bi_private;
3385 r10_bio->sectors = nr_sectors;
3386
3387 if (bio->bi_end_io == end_sync_read) {
3388 md_sync_acct(bio->bi_bdev, nr_sectors);
3389 generic_make_request(bio);
3390 }
3391 }
3392
3393 if (sectors_skipped)
		/* pretend they weren't skipped, it makes
		 * no important difference to this point
		 */
3397 md_done_sync(mddev, sectors_skipped, 1);
3398
3399 return sectors_skipped + nr_sectors;
3400 giveup:
	/* There is nowhere to write, so all non-sync
	 * drives must be failed or in resync, all drives
	 * have a bad block, so try the next chunk...
	 */
3405 if (sector_nr + max_sync < max_sector)
3406 max_sector = sector_nr + max_sync;
3407
3408 sectors_skipped += (max_sector - sector_nr);
3409 chunks_skipped ++;
3410 sector_nr = max_sector;
3411 goto skipped;
3412}
3413
3414static sector_t
3415raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3416{
3417 sector_t size;
3418 struct r10conf *conf = mddev->private;
3419
3420 if (!raid_disks)
3421 raid_disks = min(conf->geo.raid_disks,
3422 conf->prev.raid_disks);
3423 if (!sectors)
3424 sectors = conf->dev_sectors;
3425
3426 size = sectors >> conf->geo.chunk_shift;
3427 sector_div(size, conf->geo.far_copies);
3428 size = size * raid_disks;
3429 sector_div(size, conf->geo.near_copies);
3430
3431 return size << conf->geo.chunk_shift;
3432}
3433
3434static void calc_sectors(struct r10conf *conf, sector_t size)
3435{
	/* Calculate the number of sectors-per-device that will
	 * actually be used, and set conf->dev_sectors and
	 * conf->stride
	 */
3440
3441 size = size >> conf->geo.chunk_shift;
3442 sector_div(size, conf->geo.far_copies);
3443 size = size * conf->geo.raid_disks;
3444 sector_div(size, conf->geo.near_copies);
	/* 'size' is now the number of chunks in the array */
	/* calculate "used chunks per device" */
3447 size = size * conf->copies;
3448
	/* We need to round up when dividing by raid_disks to
	 * get the stride size.
	 */
3452 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3453
3454 conf->dev_sectors = size << conf->geo.chunk_shift;
3455
3456 if (conf->geo.far_offset)
3457 conf->geo.stride = 1 << conf->geo.chunk_shift;
3458 else {
3459 sector_div(size, conf->geo.far_copies);
3460 conf->geo.stride = size << conf->geo.chunk_shift;
3461 }
3462}
3463
3464enum geo_type {geo_new, geo_old, geo_start};
3465static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3466{
3467 int nc, fc, fo;
3468 int layout, chunk, disks;
3469 switch (new) {
3470 case geo_old:
3471 layout = mddev->layout;
3472 chunk = mddev->chunk_sectors;
3473 disks = mddev->raid_disks - mddev->delta_disks;
3474 break;
3475 case geo_new:
3476 layout = mddev->new_layout;
3477 chunk = mddev->new_chunk_sectors;
3478 disks = mddev->raid_disks;
3479 break;
	default:	/* avoid 'may be unused' warnings */
	case geo_start:
		/* new layout when starting reshape - raid_disks not
		 * updated yet
		 */
3483 layout = mddev->new_layout;
3484 chunk = mddev->new_chunk_sectors;
3485 disks = mddev->raid_disks + mddev->delta_disks;
3486 break;
3487 }
3488 if (layout >> 18)
3489 return -1;
3490 if (chunk < (PAGE_SIZE >> 9) ||
3491 !is_power_of_2(chunk))
3492 return -2;
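	/* layout encoding: bits 0-7 near copies, bits 8-15 far copies,
	 * bit 16 selects 'far offset' mode, bit 17 selects far sets
	 * (see far_set_size below).
	 */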
3493 nc = layout & 255;
3494 fc = (layout >> 8) & 255;
3495 fo = layout & (1<<16);
3496 geo->raid_disks = disks;
3497 geo->near_copies = nc;
3498 geo->far_copies = fc;
3499 geo->far_offset = fo;
3500 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3501 geo->chunk_mask = chunk - 1;
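	/* chunk is a power of two, so ffz(~chunk) == log2(chunk) */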
3502 geo->chunk_shift = ffz(~chunk);
3503 return nc*fc;
3504}
3505
3506static struct r10conf *setup_conf(struct mddev *mddev)
3507{
3508 struct r10conf *conf = NULL;
3509 int err = -EINVAL;
3510 struct geom geo;
3511 int copies;
3512
3513 copies = setup_geo(&geo, mddev, geo_new);
3514
3515 if (copies == -2) {
3516 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3517 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3518 mdname(mddev), PAGE_SIZE);
3519 goto out;
3520 }
3521
3522 if (copies < 2 || copies > mddev->raid_disks) {
3523 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3524 mdname(mddev), mddev->new_layout);
3525 goto out;
3526 }
3527
3528 err = -ENOMEM;
3529 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3530 if (!conf)
3531 goto out;
3532
	/* FIXME calc properly */
3534 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3535 max(0,mddev->delta_disks)),
3536 GFP_KERNEL);
3537 if (!conf->mirrors)
3538 goto out;
3539
3540 conf->tmppage = alloc_page(GFP_KERNEL);
3541 if (!conf->tmppage)
3542 goto out;
3543
3544 conf->geo = geo;
3545 conf->copies = copies;
3546 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3547 r10bio_pool_free, conf);
3548 if (!conf->r10bio_pool)
3549 goto out;
3550
3551 calc_sectors(conf, mddev->dev_sectors);
3552 if (mddev->reshape_position == MaxSector) {
3553 conf->prev = conf->geo;
3554 conf->reshape_progress = MaxSector;
3555 } else {
3556 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3557 err = -EINVAL;
3558 goto out;
3559 }
3560 conf->reshape_progress = mddev->reshape_position;
3561 if (conf->prev.far_offset)
3562 conf->prev.stride = 1 << conf->prev.chunk_shift;
3563 else
			/* far_copies must be 1 */
3565 conf->prev.stride = conf->dev_sectors;
3566 }
3567 spin_lock_init(&conf->device_lock);
3568 INIT_LIST_HEAD(&conf->retry_list);
3569
3570 spin_lock_init(&conf->resync_lock);
3571 init_waitqueue_head(&conf->wait_barrier);
3572
3573 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3574 if (!conf->thread)
3575 goto out;
3576
3577 conf->mddev = mddev;
3578 return conf;
3579
3580 out:
3581 if (err == -ENOMEM)
3582 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3583 mdname(mddev));
3584 if (conf) {
3585 if (conf->r10bio_pool)
3586 mempool_destroy(conf->r10bio_pool);
3587 kfree(conf->mirrors);
3588 safe_put_page(conf->tmppage);
3589 kfree(conf);
3590 }
3591 return ERR_PTR(err);
3592}
3593
3594static int run(struct mddev *mddev)
3595{
3596 struct r10conf *conf;
3597 int i, disk_idx, chunk_size;
3598 struct raid10_info *disk;
3599 struct md_rdev *rdev;
3600 sector_t size;
3601 sector_t min_offset_diff = 0;
3602 int first = 1;
3603 bool discard_supported = false;
3604
3605 if (mddev->private == NULL) {
3606 conf = setup_conf(mddev);
3607 if (IS_ERR(conf))
3608 return PTR_ERR(conf);
3609 mddev->private = conf;
3610 }
3611 conf = mddev->private;
3612 if (!conf)
3613 goto out;
3614
3615 mddev->thread = conf->thread;
3616 conf->thread = NULL;
3617
3618 chunk_size = mddev->chunk_sectors << 9;
3619 if (mddev->queue) {
3620 blk_queue_max_discard_sectors(mddev->queue,
3621 mddev->chunk_sectors);
3622 blk_queue_max_write_same_sectors(mddev->queue, 0);
3623 blk_queue_io_min(mddev->queue, chunk_size);
3624 if (conf->geo.raid_disks % conf->geo.near_copies)
3625 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3626 else
3627 blk_queue_io_opt(mddev->queue, chunk_size *
3628 (conf->geo.raid_disks / conf->geo.near_copies));
3629 }
3630
3631 rdev_for_each(rdev, mddev) {
3632 long long diff;
3633 struct request_queue *q;
3634
3635 disk_idx = rdev->raid_disk;
3636 if (disk_idx < 0)
3637 continue;
3638 if (disk_idx >= conf->geo.raid_disks &&
3639 disk_idx >= conf->prev.raid_disks)
3640 continue;
3641 disk = conf->mirrors + disk_idx;
3642
3643 if (test_bit(Replacement, &rdev->flags)) {
3644 if (disk->replacement)
3645 goto out_free_conf;
3646 disk->replacement = rdev;
3647 } else {
3648 if (disk->rdev)
3649 goto out_free_conf;
3650 disk->rdev = rdev;
3651 }
3652 q = bdev_get_queue(rdev->bdev);
3653 if (q->merge_bvec_fn)
3654 mddev->merge_check_needed = 1;
3655 diff = (rdev->new_data_offset - rdev->data_offset);
3656 if (!mddev->reshape_backwards)
3657 diff = -diff;
3658 if (diff < 0)
3659 diff = 0;
3660 if (first || diff < min_offset_diff)
3661 min_offset_diff = diff;
3662
3663 if (mddev->gendisk)
3664 disk_stack_limits(mddev->gendisk, rdev->bdev,
3665 rdev->data_offset << 9);
3666
3667 disk->head_position = 0;
3668
3669 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3670 discard_supported = true;
3671 }
3672
3673 if (mddev->queue) {
3674 if (discard_supported)
3675 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3676 mddev->queue);
3677 else
3678 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3679 mddev->queue);
3680 }
3681
3682 if (!enough(conf, -1)) {
3683 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3684 mdname(mddev));
3685 goto out_free_conf;
3686 }
3687
	if (conf->reshape_progress != MaxSector) {
		/* must ensure that shape change is supported */
		if (conf->geo.far_copies != 1 &&
		    conf->geo.far_offset == 0)
			goto out_free_conf;
		if (conf->prev.far_copies != 1 &&
		    conf->prev.far_offset == 0)
			goto out_free_conf;
3696 }
3697
3698 mddev->degraded = 0;
3699 for (i = 0;
3700 i < conf->geo.raid_disks
3701 || i < conf->prev.raid_disks;
3702 i++) {
3703
3704 disk = conf->mirrors + i;
3705
3706 if (!disk->rdev && disk->replacement) {
			/* The replacement is all we have - use it */
3708 disk->rdev = disk->replacement;
3709 disk->replacement = NULL;
3710 clear_bit(Replacement, &disk->rdev->flags);
3711 }
3712
3713 if (!disk->rdev ||
3714 !test_bit(In_sync, &disk->rdev->flags)) {
3715 disk->head_position = 0;
3716 mddev->degraded++;
3717 if (disk->rdev)
3718 conf->fullsync = 1;
3719 }
3720 disk->recovery_disabled = mddev->recovery_disabled - 1;
3721 }
3722
3723 if (mddev->recovery_cp != MaxSector)
3724 printk(KERN_NOTICE "md/raid10:%s: not clean"
3725 " -- starting background reconstruction\n",
3726 mdname(mddev));
3727 printk(KERN_INFO
3728 "md/raid10:%s: active with %d out of %d devices\n",
3729 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3730 conf->geo.raid_disks);
3731
	/* Ok, everything is just fine now */
3734 mddev->dev_sectors = conf->dev_sectors;
3735 size = raid10_size(mddev, 0, 0);
3736 md_set_array_sectors(mddev, size);
3737 mddev->resync_max_sectors = size;
3738
3739 if (mddev->queue) {
3740 int stripe = conf->geo.raid_disks *
3741 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3742 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3743 mddev->queue->backing_dev_info.congested_data = mddev;
3744
		/* Calculate max read-ahead size.
		 * We need to readahead at least twice a whole stripe,
		 * maybe more.
		 */
3749 stripe /= conf->geo.near_copies;
3750 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3751 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3752 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3753 }
3754
3755
3756 if (md_integrity_register(mddev))
3757 goto out_free_conf;
3758
3759 if (conf->reshape_progress != MaxSector) {
3760 unsigned long before_length, after_length;
3761
3762 before_length = ((1 << conf->prev.chunk_shift) *
3763 conf->prev.far_copies);
3764 after_length = ((1 << conf->geo.chunk_shift) *
3765 conf->geo.far_copies);
3766
3767 if (max(before_length, after_length) > min_offset_diff) {
			/* This cannot work */
			printk(KERN_ERR "md/raid10: offset difference not enough to continue reshape\n");
3770 goto out_free_conf;
3771 }
3772 conf->offset_diff = min_offset_diff;
3773
3774 conf->reshape_safe = conf->reshape_progress;
3775 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3776 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3777 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3778 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3779 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3780 "reshape");
3781 }
3782
3783 return 0;
3784
3785out_free_conf:
3786 md_unregister_thread(&mddev->thread);
3787 if (conf->r10bio_pool)
3788 mempool_destroy(conf->r10bio_pool);
3789 safe_put_page(conf->tmppage);
3790 kfree(conf->mirrors);
3791 kfree(conf);
3792 mddev->private = NULL;
3793out:
3794 return -EIO;
3795}
3796
3797static int stop(struct mddev *mddev)
3798{
3799 struct r10conf *conf = mddev->private;
3800
3801 raise_barrier(conf, 0);
3802 lower_barrier(conf);
3803
3804 md_unregister_thread(&mddev->thread);
	if (mddev->queue)
		/* the unplug fn references 'conf' */
		blk_sync_queue(mddev->queue);
3808
3809 if (conf->r10bio_pool)
3810 mempool_destroy(conf->r10bio_pool);
3811 safe_put_page(conf->tmppage);
3812 kfree(conf->mirrors);
3813 kfree(conf);
3814 mddev->private = NULL;
3815 return 0;
3816}
3817
3818static void raid10_quiesce(struct mddev *mddev, int state)
3819{
3820 struct r10conf *conf = mddev->private;
3821
3822 switch(state) {
3823 case 1:
3824 raise_barrier(conf, 0);
3825 break;
3826 case 0:
3827 lower_barrier(conf);
3828 break;
3829 }
3830}
3831
3832static int raid10_resize(struct mddev *mddev, sector_t sectors)
3833{
	/* Resize of 'far' arrays is not supported.
	 * For 'near' and 'offset' arrays we can set the
	 * number of sectors used to be an appropriate multiple
	 * of the chunk size.
	 * For 'offset', this is any multiple of far_copies*chunksize.
	 * For 'near' the multiplier is the LCM of
	 * near_copies and raid_disks.
	 * So if far_copies > 1 && !far_offset, fail.
	 * Else find LCM(raid_disks, near_copy)*far_copies and
	 * multiply by chunk_size.  Then round to this number.
	 * This is mostly done by raid10_size()
	 */
3846 struct r10conf *conf = mddev->private;
3847 sector_t oldsize, size;
3848
3849 if (mddev->reshape_position != MaxSector)
3850 return -EBUSY;
3851
3852 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3853 return -EINVAL;
3854
3855 oldsize = raid10_size(mddev, 0, 0);
3856 size = raid10_size(mddev, sectors, 0);
3857 if (mddev->external_size &&
3858 mddev->array_sectors > size)
3859 return -EINVAL;
3860 if (mddev->bitmap) {
3861 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3862 if (ret)
3863 return ret;
3864 }
3865 md_set_array_sectors(mddev, size);
3866 set_capacity(mddev->gendisk, mddev->array_sectors);
3867 revalidate_disk(mddev->gendisk);
3868 if (sectors > mddev->dev_sectors &&
3869 mddev->recovery_cp > oldsize) {
3870 mddev->recovery_cp = oldsize;
3871 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3872 }
3873 calc_sectors(conf, sectors);
3874 mddev->dev_sectors = conf->dev_sectors;
3875 mddev->resync_max_sectors = size;
3876 return 0;
3877}
3878
3879static void *raid10_takeover_raid0(struct mddev *mddev)
3880{
3881 struct md_rdev *rdev;
3882 struct r10conf *conf;
3883
3884 if (mddev->degraded > 0) {
3885 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3886 mdname(mddev));
3887 return ERR_PTR(-EINVAL);
3888 }
3889
	/* Set new parameters */
3891 mddev->new_level = 10;
	/* new layout: far_copies = 1, near_copies = 2 */
3893 mddev->new_layout = (1<<8) + 2;
3894 mddev->new_chunk_sectors = mddev->chunk_sectors;
3895 mddev->delta_disks = mddev->raid_disks;
3896 mddev->raid_disks *= 2;
	/* make sure it will be not marked as dirty */
3898 mddev->recovery_cp = MaxSector;
3899
3900 conf = setup_conf(mddev);
3901 if (!IS_ERR(conf)) {
3902 rdev_for_each(rdev, mddev)
3903 if (rdev->raid_disk >= 0)
3904 rdev->new_raid_disk = rdev->raid_disk * 2;
3905 conf->barrier = 1;
3906 }
3907
3908 return conf;
3909}
3910
3911static void *raid10_takeover(struct mddev *mddev)
3912{
3913 struct r0conf *raid0_conf;
3914
	/* raid10 can take over:
	 *  raid0 - providing it has only one zone
	 */
3918 if (mddev->level == 0) {
		/* for raid0 takeover only one zone is supported */
3920 raid0_conf = mddev->private;
3921 if (raid0_conf->nr_strip_zones > 1) {
3922 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3923 " with more than one zone.\n",
3924 mdname(mddev));
3925 return ERR_PTR(-EINVAL);
3926 }
3927 return raid10_takeover_raid0(mddev);
3928 }
3929 return ERR_PTR(-EINVAL);
3930}
3931
3932static int raid10_check_reshape(struct mddev *mddev)
	/* Called when there is a request to change
	 * - layout (to ->new_layout)
	 * - chunk size (to ->new_chunk_sectors)
	 * - raid_disks (by delta_disks)
	 * or when trying to restart a reshape that was ongoing.
	 *
	 * We need to validate the request and possibly allocate
	 * space if that might be an issue later.
	 *
	 * Currently we reject any reshape of a 'far' mode array,
	 * allow chunk size to change if new is generally acceptable,
	 * allow raid_disks to increase, and allow
	 * a switch between 'near' mode and 'offset' mode.
	 */
3948 struct r10conf *conf = mddev->private;
3949 struct geom geo;
3950
3951 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3952 return -EINVAL;
3953
	if (setup_geo(&geo, mddev, geo_start) != conf->copies)
		/* Cannot change number of copies */
		return -EINVAL;
	if (geo.far_copies > 1 && !geo.far_offset)
		/* Cannot switch to 'far' mode */
		return -EINVAL;
3960
	if (mddev->array_sectors & geo.chunk_mask)
		/* not a factor of the array size */
		return -EINVAL;
3964
3965 if (!enough(conf, -1))
3966 return -EINVAL;
3967
3968 kfree(conf->mirrors_new);
3969 conf->mirrors_new = NULL;
3970 if (mddev->delta_disks > 0) {
		/* allocate a new 'mirrors' list for the larger array */
3972 conf->mirrors_new = kzalloc(
3973 sizeof(struct raid10_info)
3974 *(mddev->raid_disks +
3975 mddev->delta_disks),
3976 GFP_KERNEL);
3977 if (!conf->mirrors_new)
3978 return -ENOMEM;
3979 }
3980 return 0;
3981}
3982
/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
3996static int calc_degraded(struct r10conf *conf)
3997{
3998 int degraded, degraded2;
3999 int i;
4000
4001 rcu_read_lock();
4002 degraded = 0;
	/* 'prev' section first */
4004 for (i = 0; i < conf->prev.raid_disks; i++) {
4005 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4006 if (!rdev || test_bit(Faulty, &rdev->flags))
4007 degraded++;
4008 else if (!test_bit(In_sync, &rdev->flags))
			/* When we can reduce the number of devices in
			 * an array, this might not contribute to
			 * 'degraded'.  It does now.
			 */
4013 degraded++;
4014 }
4015 rcu_read_unlock();
4016 if (conf->geo.raid_disks == conf->prev.raid_disks)
4017 return degraded;
4018 rcu_read_lock();
4019 degraded2 = 0;
4020 for (i = 0; i < conf->geo.raid_disks; i++) {
4021 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4022 if (!rdev || test_bit(Faulty, &rdev->flags))
4023 degraded2++;
4024 else if (!test_bit(In_sync, &rdev->flags)) {
			/* If reshape is increasing the number of devices,
			 * this section has already been recovered, so
			 * it doesn't contribute to degraded.
			 * Otherwise it does.
			 */
4030 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4031 degraded2++;
4032 }
4033 }
4034 rcu_read_unlock();
4035 if (degraded2 > degraded)
4036 return degraded2;
4037 return degraded;
4038}
4039
4040static int raid10_start_reshape(struct mddev *mddev)
4041{
	/* A 'reshape' has been requested. This commits
	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
	 * This also checks if there are enough spares and adds them
	 * to the array.
	 * We currently require enough spares to make the final
	 * array non-degraded.  We also require that the difference
	 * between old and new data_offset - on each device - is
	 * enough that we never risk over-writing.
	 */
4052 unsigned long before_length, after_length;
4053 sector_t min_offset_diff = 0;
4054 int first = 1;
4055 struct geom new;
4056 struct r10conf *conf = mddev->private;
4057 struct md_rdev *rdev;
4058 int spares = 0;
4059 int ret;
4060
4061 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4062 return -EBUSY;
4063
4064 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4065 return -EINVAL;
4066
4067 before_length = ((1 << conf->prev.chunk_shift) *
4068 conf->prev.far_copies);
4069 after_length = ((1 << conf->geo.chunk_shift) *
4070 conf->geo.far_copies);
4071
4072 rdev_for_each(rdev, mddev) {
4073 if (!test_bit(In_sync, &rdev->flags)
4074 && !test_bit(Faulty, &rdev->flags))
4075 spares++;
4076 if (rdev->raid_disk >= 0) {
4077 long long diff = (rdev->new_data_offset
4078 - rdev->data_offset);
4079 if (!mddev->reshape_backwards)
4080 diff = -diff;
4081 if (diff < 0)
4082 diff = 0;
4083 if (first || diff < min_offset_diff)
4084 min_offset_diff = diff;
4085 }
4086 }
4087
4088 if (max(before_length, after_length) > min_offset_diff)
4089 return -EINVAL;
4090
4091 if (spares < mddev->delta_disks)
4092 return -EINVAL;
4093
4094 conf->offset_diff = min_offset_diff;
4095 spin_lock_irq(&conf->device_lock);
4096 if (conf->mirrors_new) {
4097 memcpy(conf->mirrors_new, conf->mirrors,
4098 sizeof(struct raid10_info)*conf->prev.raid_disks);
4099 smp_mb();
4100 kfree(conf->mirrors_old);
4101 conf->mirrors_old = conf->mirrors;
4102 conf->mirrors = conf->mirrors_new;
4103 conf->mirrors_new = NULL;
4104 }
4105 setup_geo(&conf->geo, mddev, geo_start);
4106 smp_mb();
4107 if (mddev->reshape_backwards) {
4108 sector_t size = raid10_size(mddev, 0, 0);
4109 if (size < mddev->array_sectors) {
4110 spin_unlock_irq(&conf->device_lock);
			printk(KERN_ERR "md/raid10:%s: array size must be reduced before the number of disks\n",
4112 mdname(mddev));
4113 return -EINVAL;
4114 }
4115 mddev->resync_max_sectors = size;
4116 conf->reshape_progress = size;
4117 } else
4118 conf->reshape_progress = 0;
4119 spin_unlock_irq(&conf->device_lock);
4120
4121 if (mddev->delta_disks && mddev->bitmap) {
4122 ret = bitmap_resize(mddev->bitmap,
4123 raid10_size(mddev, 0,
4124 conf->geo.raid_disks),
4125 0, 0);
4126 if (ret)
4127 goto abort;
4128 }
4129 if (mddev->delta_disks > 0) {
4130 rdev_for_each(rdev, mddev)
4131 if (rdev->raid_disk < 0 &&
4132 !test_bit(Faulty, &rdev->flags)) {
4133 if (raid10_add_disk(mddev, rdev) == 0) {
4134 if (rdev->raid_disk >=
4135 conf->prev.raid_disks)
4136 set_bit(In_sync, &rdev->flags);
4137 else
4138 rdev->recovery_offset = 0;
4139
				if (sysfs_link_rdev(mddev, rdev))
					/* Failure here is OK */;
4142 }
4143 } else if (rdev->raid_disk >= conf->prev.raid_disks
4144 && !test_bit(Faulty, &rdev->flags)) {
			/* This is a spare that was manually added */
4146 set_bit(In_sync, &rdev->flags);
4147 }
4148 }
4149
	/* When a reshape changes the number of devices,
	 * ->degraded is measured against the larger of the
	 * pre and post numbers.
	 */
4153 spin_lock_irq(&conf->device_lock);
4154 mddev->degraded = calc_degraded(conf);
4155 spin_unlock_irq(&conf->device_lock);
4156 mddev->raid_disks = conf->geo.raid_disks;
4157 mddev->reshape_position = conf->reshape_progress;
4158 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4159
4160 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4161 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4162 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4163 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4164
4165 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4166 "reshape");
4167 if (!mddev->sync_thread) {
4168 ret = -EAGAIN;
4169 goto abort;
4170 }
4171 conf->reshape_checkpoint = jiffies;
4172 md_wakeup_thread(mddev->sync_thread);
4173 md_new_event(mddev);
4174 return 0;
4175
4176abort:
4177 mddev->recovery = 0;
4178 spin_lock_irq(&conf->device_lock);
4179 conf->geo = conf->prev;
4180 mddev->raid_disks = conf->geo.raid_disks;
4181 rdev_for_each(rdev, mddev)
4182 rdev->new_data_offset = rdev->data_offset;
4183 smp_wmb();
4184 conf->reshape_progress = MaxSector;
4185 mddev->reshape_position = MaxSector;
4186 spin_unlock_irq(&conf->device_lock);
4187 return ret;
4188}
4189
/* Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address.
 * i.e. the address returned will be chunk-aligned and after
 * any data that is in the chunk containing 's'.
 */
4196static sector_t last_dev_address(sector_t s, struct geom *geo)
4197{
4198 s = (s | geo->chunk_mask) + 1;
4199 s >>= geo->chunk_shift;
4200 s *= geo->near_copies;
4201 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4202 s *= geo->far_copies;
4203 s <<= geo->chunk_shift;
4204 return s;
4205}
4206
/* Calculate the first device-address that could contain
 * any block from the chunk that includes the array-address 's'.
 * This too will be the start of a chunk.
 */
4211static sector_t first_dev_address(sector_t s, struct geom *geo)
4212{
4213 s >>= geo->chunk_shift;
4214 s *= geo->near_copies;
4215 sector_div(s, geo->raid_disks);
4216 s *= geo->far_copies;
4217 s <<= geo->chunk_shift;
4218 return s;
4219}
4220
4221static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4222 int *skipped)
4223{
	/* We simply copy at most one chunk (smallest of old and new)
	 * at a time, possibly less if that exceeds RESYNC_BLOCK_SIZE,
	 * or we hit a bad block or something.
	 * This might mean we pause for normal IO in the middle of
	 * a chunk, but that is not a problem as mddev->reshape_position
	 * can record any location.
	 *
	 * If we will want to write to a location that isn't
	 * yet recorded as 'safe' (i.e. in metadata on disk) then
	 * we need to flush all reshape requests and update the metadata.
	 *
	 * When reshaping forwards (e.g. to more devices), we interpret
	 * 'safe' as the earliest block which might not have been copied
	 * down yet.  We divide this by previous stripe size and multiply
	 * by previous stripe length to get lowest device offset that we
	 * cannot write to yet.
	 * We interpret 'sector_nr' as an address that we want to write to.
	 * From this we use last_dev_address() to find where we might
	 * write to, and first_dev_address() on the 'safe' position.
	 * If this 'next' write position is after the 'safe' position,
	 * we must update the metadata to increase the 'safe' position.
	 *
	 * When reshaping backwards, we round in the opposite direction
	 * and perform the reverse test: next write position must not be
	 * less than current safe position.
	 *
	 * In all this the minimum difference in data offsets
	 * (conf->offset_diff - always positive) allows a bit of slack,
	 * so next can be after 'safe', but not by more than offset_diff.
	 *
	 * We need to prepare all the bios here before we start any IO
	 * to ensure the size we choose is acceptable to all devices.
	 * That means one bio for each copy for write-out and an extra
	 * one for read-in.
	 * We store the read-in bio in ->master_bio and the others in
	 * ->devs[x].bio and ->devs[x].repl_bio.
	 */
4261 struct r10conf *conf = mddev->private;
4262 struct r10bio *r10_bio;
4263 sector_t next, safe, last;
4264 int max_sectors;
4265 int nr_sectors;
4266 int s;
4267 struct md_rdev *rdev;
4268 int need_flush = 0;
4269 struct bio *blist;
4270 struct bio *bio, *read_bio;
4271 int sectors_done = 0;
4272
4273 if (sector_nr == 0) {
		/* If restarting in the middle, skip the initial sectors */
4275 if (mddev->reshape_backwards &&
4276 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4277 sector_nr = (raid10_size(mddev, 0, 0)
4278 - conf->reshape_progress);
4279 } else if (!mddev->reshape_backwards &&
4280 conf->reshape_progress > 0)
4281 sector_nr = conf->reshape_progress;
4282 if (sector_nr) {
4283 mddev->curr_resync_completed = sector_nr;
4284 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4285 *skipped = 1;
4286 return sector_nr;
4287 }
4288 }
4289
	/* We don't use sector_nr to track where we are up to
	 * as that doesn't work well for ->reshape_backwards.
	 * So just use ->reshape_progress.
	 */
4294 if (mddev->reshape_backwards) {
		/* 'next' is the earliest device address that we might
		 * write to for this chunk in the new layout
		 */
4298 next = first_dev_address(conf->reshape_progress - 1,
4299 &conf->geo);
4300
		/* 'safe' is the last device address that we might read from
		 * in the old layout after a restart
		 */
4304 safe = last_dev_address(conf->reshape_safe - 1,
4305 &conf->prev);
4306
4307 if (next + conf->offset_diff < safe)
4308 need_flush = 1;
4309
4310 last = conf->reshape_progress - 1;
4311 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4312 & conf->prev.chunk_mask);
4313 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4314 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4315 } else {
		/* 'next' is after the last device address that we
		 * might write to for this chunk in the new layout
		 */
4319 next = last_dev_address(conf->reshape_progress, &conf->geo);
4320
		/* 'safe' is the earliest device address that we might
		 * read from in the old layout after a restart
		 */
4324 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4325
		/* Need to update metadata if 'next' might be beyond 'safe'
		 * as that would possibly corrupt data
		 */
4329 if (next > safe + conf->offset_diff)
4330 need_flush = 1;
4331
4332 sector_nr = conf->reshape_progress;
4333 last = sector_nr | (conf->geo.chunk_mask
4334 & conf->prev.chunk_mask);
4335
4336 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4337 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4338 }
4339
4340 if (need_flush ||
4341 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
		/* Need to update reshape_position in metadata */
4343 wait_barrier(conf);
4344 mddev->reshape_position = conf->reshape_progress;
4345 if (mddev->reshape_backwards)
4346 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4347 - conf->reshape_progress;
4348 else
4349 mddev->curr_resync_completed = conf->reshape_progress;
4350 conf->reshape_checkpoint = jiffies;
4351 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4352 md_wakeup_thread(mddev->thread);
4353 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4354 kthread_should_stop());
4355 conf->reshape_safe = mddev->reshape_position;
4356 allow_barrier(conf);
4357 }
4358
4359read_more:
	/* Now schedule reads for blocks from sector_nr to last */
4361 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4362 raise_barrier(conf, sectors_done != 0);
4363 atomic_set(&r10_bio->remaining, 0);
4364 r10_bio->mddev = mddev;
4365 r10_bio->sector = sector_nr;
4366 set_bit(R10BIO_IsReshape, &r10_bio->state);
4367 r10_bio->sectors = last - sector_nr + 1;
4368 rdev = read_balance(conf, r10_bio, &max_sectors);
4369 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4370
4371 if (!rdev) {
		/* Cannot read from here, so need to record bad blocks
		 * on all the target devices.
		 */
		/* FIXME */
4376 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4377 return sectors_done;
4378 }
4379
4380 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4381
4382 read_bio->bi_bdev = rdev->bdev;
4383 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4384 + rdev->data_offset);
4385 read_bio->bi_private = r10_bio;
4386 read_bio->bi_end_io = end_sync_read;
4387 read_bio->bi_rw = READ;
4388 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4389 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4390 read_bio->bi_vcnt = 0;
4391 read_bio->bi_size = 0;
4392 r10_bio->master_bio = read_bio;
4393 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4394
	/* Now find the locations in the new layout */
4396 __raid10_find_phys(&conf->geo, r10_bio);
4397
4398 blist = read_bio;
4399 read_bio->bi_next = NULL;
4400
4401 for (s = 0; s < conf->copies*2; s++) {
4402 struct bio *b;
4403 int d = r10_bio->devs[s/2].devnum;
4404 struct md_rdev *rdev2;
4405 if (s&1) {
4406 rdev2 = conf->mirrors[d].replacement;
4407 b = r10_bio->devs[s/2].repl_bio;
4408 } else {
4409 rdev2 = conf->mirrors[d].rdev;
4410 b = r10_bio->devs[s/2].bio;
4411 }
4412 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4413 continue;
4414
4415 bio_reset(b);
4416 b->bi_bdev = rdev2->bdev;
4417 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4418 b->bi_private = r10_bio;
4419 b->bi_end_io = end_reshape_write;
4420 b->bi_rw = WRITE;
4421 b->bi_next = blist;
4422 blist = b;
4423 }
4424
	/* Now add as many pages as possible to all of these bios. */
4426
4427 nr_sectors = 0;
4428 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4429 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4430 int len = (max_sectors - s) << 9;
4431 if (len > PAGE_SIZE)
4432 len = PAGE_SIZE;
4433 for (bio = blist; bio ; bio = bio->bi_next) {
4434 struct bio *bio2;
4435 if (bio_add_page(bio, page, len, 0))
4436 continue;
4437
			/* Didn't fit, must stop */
4439 for (bio2 = blist;
4440 bio2 && bio2 != bio;
4441 bio2 = bio2->bi_next) {
				/* Remove last page from this bio */
4443 bio2->bi_vcnt--;
4444 bio2->bi_size -= len;
4445 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4446 }
4447 goto bio_full;
4448 }
4449 sector_nr += len >> 9;
4450 nr_sectors += len >> 9;
4451 }
4452bio_full:
4453 r10_bio->sectors = nr_sectors;
4454
	/* Now submit the read */
4456 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4457 atomic_inc(&r10_bio->remaining);
4458 read_bio->bi_next = NULL;
4459 generic_make_request(read_bio);
4460 sector_nr += nr_sectors;
4461 sectors_done += nr_sectors;
4462 if (sector_nr <= last)
4463 goto read_more;
4464
	/* Now that we have done the whole section we can
	 * update reshape_progress
	 */
4468 if (mddev->reshape_backwards)
4469 conf->reshape_progress -= sectors_done;
4470 else
4471 conf->reshape_progress += sectors_done;
4472
4473 return sectors_done;
4474}
4475
4476static void end_reshape_request(struct r10bio *r10_bio);
4477static int handle_reshape_read_error(struct mddev *mddev,
4478 struct r10bio *r10_bio);
4479static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4480{
	/* Reshape read completed.  Hopefully we have a block
	 * to write out.
	 * If we got a read error then we do sync 1-page reads from
	 * elsewhere until we find the data - or give up.
	 */
4486 struct r10conf *conf = mddev->private;
4487 int s;
4488
4489 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4490 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
			/* Reshape has been aborted */
4492 md_done_sync(mddev, r10_bio->sectors, 0);
4493 return;
4494 }
4495
	/* We definitely have the data in the pages, schedule the
	 * writes.
	 */
4499 atomic_set(&r10_bio->remaining, 1);
4500 for (s = 0; s < conf->copies*2; s++) {
4501 struct bio *b;
4502 int d = r10_bio->devs[s/2].devnum;
4503 struct md_rdev *rdev;
4504 if (s&1) {
4505 rdev = conf->mirrors[d].replacement;
4506 b = r10_bio->devs[s/2].repl_bio;
4507 } else {
4508 rdev = conf->mirrors[d].rdev;
4509 b = r10_bio->devs[s/2].bio;
4510 }
4511 if (!rdev || test_bit(Faulty, &rdev->flags))
4512 continue;
4513 atomic_inc(&rdev->nr_pending);
4514 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4515 atomic_inc(&r10_bio->remaining);
4516 b->bi_next = NULL;
4517 generic_make_request(b);
4518 }
4519 end_reshape_request(r10_bio);
4520}
4521
4522static void end_reshape(struct r10conf *conf)
4523{
4524 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4525 return;
4526
4527 spin_lock_irq(&conf->device_lock);
4528 conf->prev = conf->geo;
4529 md_finish_reshape(conf->mddev);
4530 smp_wmb();
4531 conf->reshape_progress = MaxSector;
4532 spin_unlock_irq(&conf->device_lock);
4533
	/* read-ahead size must cover two whole stripes, which is
	 * 2 * (number of data disks) * chunksize
	 */
4537 if (conf->mddev->queue) {
4538 int stripe = conf->geo.raid_disks *
4539 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4540 stripe /= conf->geo.near_copies;
4541 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4542 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4543 }
4544 conf->fullsync = 0;
4545}
4546
4547
4548static int handle_reshape_read_error(struct mddev *mddev,
4549 struct r10bio *r10_bio)
4550{
	/* Use sync reads to get the blocks from somewhere else */
4552 int sectors = r10_bio->sectors;
4553 struct r10conf *conf = mddev->private;
4554 struct {
4555 struct r10bio r10_bio;
4556 struct r10dev devs[conf->copies];
4557 } on_stack;
4558 struct r10bio *r10b = &on_stack.r10_bio;
4559 int slot = 0;
4560 int idx = 0;
4561 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4562
4563 r10b->sector = r10_bio->sector;
4564 __raid10_find_phys(&conf->prev, r10b);
4565
4566 while (sectors) {
4567 int s = sectors;
4568 int success = 0;
4569 int first_slot = slot;
4570
4571 if (s > (PAGE_SIZE >> 9))
4572 s = PAGE_SIZE >> 9;
4573
4574 while (!success) {
4575 int d = r10b->devs[slot].devnum;
4576 struct md_rdev *rdev = conf->mirrors[d].rdev;
4577 sector_t addr;
4578 if (rdev == NULL ||
4579 test_bit(Faulty, &rdev->flags) ||
4580 !test_bit(In_sync, &rdev->flags))
4581 goto failed;
4582
4583 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4584 success = sync_page_io(rdev,
4585 addr,
4586 s << 9,
4587 bvec[idx].bv_page,
4588 READ, false);
4589 if (success)
4590 break;
4591 failed:
4592 slot++;
4593 if (slot >= conf->copies)
4594 slot = 0;
4595 if (slot == first_slot)
4596 break;
4597 }
4598 if (!success) {
			/* couldn't read this block, must give up */
4600 set_bit(MD_RECOVERY_INTR,
4601 &mddev->recovery);
4602 return -EIO;
4603 }
4604 sectors -= s;
4605 idx++;
4606 }
4607 return 0;
4608}
4609
4610static void end_reshape_write(struct bio *bio, int error)
4611{
4612 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4613 struct r10bio *r10_bio = bio->bi_private;
4614 struct mddev *mddev = r10_bio->mddev;
4615 struct r10conf *conf = mddev->private;
4616 int d;
4617 int slot;
4618 int repl;
4619 struct md_rdev *rdev = NULL;
4620
4621 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4622 if (repl)
4623 rdev = conf->mirrors[d].replacement;
4624 if (!rdev) {
4625 smp_mb();
4626 rdev = conf->mirrors[d].rdev;
4627 }
4628
4629 if (!uptodate) {
		/* FIXME should record badblock */
4631 md_error(mddev, rdev);
4632 }
4633
4634 rdev_dec_pending(rdev, mddev);
4635 end_reshape_request(r10_bio);
4636}
4637
4638static void end_reshape_request(struct r10bio *r10_bio)
4639{
4640 if (!atomic_dec_and_test(&r10_bio->remaining))
4641 return;
4642 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4643 bio_put(r10_bio->master_bio);
4644 put_buf(r10_bio);
4645}
4646
4647static void raid10_finish_reshape(struct mddev *mddev)
4648{
4649 struct r10conf *conf = mddev->private;
4650
4651 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4652 return;
4653
4654 if (mddev->delta_disks > 0) {
4655 sector_t size = raid10_size(mddev, 0, 0);
4656 md_set_array_sectors(mddev, size);
4657 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4658 mddev->recovery_cp = mddev->resync_max_sectors;
4659 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4660 }
4661 mddev->resync_max_sectors = size;
4662 set_capacity(mddev->gendisk, mddev->array_sectors);
4663 revalidate_disk(mddev->gendisk);
4664 } else {
4665 int d;
4666 for (d = conf->geo.raid_disks ;
4667 d < conf->geo.raid_disks - mddev->delta_disks;
4668 d++) {
4669 struct md_rdev *rdev = conf->mirrors[d].rdev;
4670 if (rdev)
4671 clear_bit(In_sync, &rdev->flags);
4672 rdev = conf->mirrors[d].replacement;
4673 if (rdev)
4674 clear_bit(In_sync, &rdev->flags);
4675 }
4676 }
4677 mddev->layout = mddev->new_layout;
4678 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4679 mddev->reshape_position = MaxSector;
4680 mddev->delta_disks = 0;
4681 mddev->reshape_backwards = 0;
4682}
4683
4684static struct md_personality raid10_personality =
4685{
4686 .name = "raid10",
4687 .level = 10,
4688 .owner = THIS_MODULE,
4689 .make_request = make_request,
4690 .run = run,
4691 .stop = stop,
4692 .status = status,
4693 .error_handler = error,
4694 .hot_add_disk = raid10_add_disk,
4695 .hot_remove_disk= raid10_remove_disk,
4696 .spare_active = raid10_spare_active,
4697 .sync_request = sync_request,
4698 .quiesce = raid10_quiesce,
4699 .size = raid10_size,
4700 .resize = raid10_resize,
4701 .takeover = raid10_takeover,
4702 .check_reshape = raid10_check_reshape,
4703 .start_reshape = raid10_start_reshape,
4704 .finish_reshape = raid10_finish_reshape,
4705};
4706
4707static int __init raid_init(void)
4708{
4709 return register_md_personality(&raid10_personality);
4710}
4711
4712static void raid_exit(void)
4713{
4714 unregister_md_personality(&raid10_personality);
4715}
4716
4717module_init(raid_init);
4718module_exit(raid_exit);
4719MODULE_LICENSE("GPL");
4720MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
4722MODULE_ALIAS("md-raid10");
4723MODULE_ALIAS("md-level-10");
4724
4725module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4726