1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <linux/blkdev.h>
24#include <linux/module.h>
25#include <linux/seq_file.h>
26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
28#include "md.h"
29#include "raid10.h"
30#include "raid0.h"
31#include "bitmap.h"
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76#define NR_RAID10_BIOS 256
77
78
79
80
81
82
83#define IO_BLOCKED ((struct bio *)1)
84
85
86
87
88#define IO_MADE_GOOD ((struct bio *)2)
89
90#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
91
92
93
94
95
96static int max_queued_requests = 1024;
97
98static void allow_barrier(struct r10conf *conf);
99static void lower_barrier(struct r10conf *conf);
100static int _enough(struct r10conf *conf, int previous, int ignore);
101static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
102 int *skipped);
103static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
104static void end_reshape_write(struct bio *bio, int error);
105static void end_reshape(struct r10conf *conf);
106
107static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
108{
109 struct r10conf *conf = data;
110 int size = offsetof(struct r10bio, devs[conf->copies]);
111
112
113
114 return kzalloc(size, gfp_flags);
115}
116
117static void r10bio_pool_free(void *r10_bio, void *data)
118{
119 kfree(r10_bio);
120}
121
122
123#define RESYNC_BLOCK_SIZE (64*1024)
124#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
125
126#define RESYNC_WINDOW (1024*1024)
127
128#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
129
130
131
132
133
134
135
136
137static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
138{
139 struct r10conf *conf = data;
140 struct page *page;
141 struct r10bio *r10_bio;
142 struct bio *bio;
143 int i, j;
144 int nalloc;
145
146 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
147 if (!r10_bio)
148 return NULL;
149
150 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
151 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
152 nalloc = conf->copies;
153 else
154 nalloc = 2;
155
156
157
158
159 for (j = nalloc ; j-- ; ) {
160 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
161 if (!bio)
162 goto out_free_bio;
163 r10_bio->devs[j].bio = bio;
164 if (!conf->have_replacement)
165 continue;
166 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
167 if (!bio)
168 goto out_free_bio;
169 r10_bio->devs[j].repl_bio = bio;
170 }
171
172
173
174
175 for (j = 0 ; j < nalloc; j++) {
176 struct bio *rbio = r10_bio->devs[j].repl_bio;
177 bio = r10_bio->devs[j].bio;
178 for (i = 0; i < RESYNC_PAGES; i++) {
179 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
180 &conf->mddev->recovery)) {
181
182
183 struct bio *rbio = r10_bio->devs[0].bio;
184 page = rbio->bi_io_vec[i].bv_page;
185 get_page(page);
186 } else
187 page = alloc_page(gfp_flags);
188 if (unlikely(!page))
189 goto out_free_pages;
190
191 bio->bi_io_vec[i].bv_page = page;
192 if (rbio)
193 rbio->bi_io_vec[i].bv_page = page;
194 }
195 }
196
197 return r10_bio;
198
199out_free_pages:
200 for ( ; i > 0 ; i--)
201 safe_put_page(bio->bi_io_vec[i-1].bv_page);
202 while (j--)
203 for (i = 0; i < RESYNC_PAGES ; i++)
204 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
205 j = 0;
206out_free_bio:
207 for ( ; j < nalloc; j++) {
208 if (r10_bio->devs[j].bio)
209 bio_put(r10_bio->devs[j].bio);
210 if (r10_bio->devs[j].repl_bio)
211 bio_put(r10_bio->devs[j].repl_bio);
212 }
213 r10bio_pool_free(r10_bio, conf);
214 return NULL;
215}
216
217static void r10buf_pool_free(void *__r10_bio, void *data)
218{
219 int i;
220 struct r10conf *conf = data;
221 struct r10bio *r10bio = __r10_bio;
222 int j;
223
224 for (j=0; j < conf->copies; j++) {
225 struct bio *bio = r10bio->devs[j].bio;
226 if (bio) {
227 for (i = 0; i < RESYNC_PAGES; i++) {
228 safe_put_page(bio->bi_io_vec[i].bv_page);
229 bio->bi_io_vec[i].bv_page = NULL;
230 }
231 bio_put(bio);
232 }
233 bio = r10bio->devs[j].repl_bio;
234 if (bio)
235 bio_put(bio);
236 }
237 r10bio_pool_free(r10bio, conf);
238}
239
240static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
241{
242 int i;
243
244 for (i = 0; i < conf->copies; i++) {
245 struct bio **bio = & r10_bio->devs[i].bio;
246 if (!BIO_SPECIAL(*bio))
247 bio_put(*bio);
248 *bio = NULL;
249 bio = &r10_bio->devs[i].repl_bio;
250 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
251 bio_put(*bio);
252 *bio = NULL;
253 }
254}
255
256static void free_r10bio(struct r10bio *r10_bio)
257{
258 struct r10conf *conf = r10_bio->mddev->private;
259
260 put_all_bios(conf, r10_bio);
261 mempool_free(r10_bio, conf->r10bio_pool);
262}
263
264static void put_buf(struct r10bio *r10_bio)
265{
266 struct r10conf *conf = r10_bio->mddev->private;
267
268 mempool_free(r10_bio, conf->r10buf_pool);
269
270 lower_barrier(conf);
271}
272
273static void reschedule_retry(struct r10bio *r10_bio)
274{
275 unsigned long flags;
276 struct mddev *mddev = r10_bio->mddev;
277 struct r10conf *conf = mddev->private;
278
279 spin_lock_irqsave(&conf->device_lock, flags);
280 list_add(&r10_bio->retry_list, &conf->retry_list);
281 conf->nr_queued ++;
282 spin_unlock_irqrestore(&conf->device_lock, flags);
283
284
285 wake_up(&conf->wait_barrier);
286
287 md_wakeup_thread(mddev->thread);
288}
289
290
291
292
293
294
295static void raid_end_bio_io(struct r10bio *r10_bio)
296{
297 struct bio *bio = r10_bio->master_bio;
298 int done;
299 struct r10conf *conf = r10_bio->mddev->private;
300
301 if (bio->bi_phys_segments) {
302 unsigned long flags;
303 spin_lock_irqsave(&conf->device_lock, flags);
304 bio->bi_phys_segments--;
305 done = (bio->bi_phys_segments == 0);
306 spin_unlock_irqrestore(&conf->device_lock, flags);
307 } else
308 done = 1;
309 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
310 clear_bit(BIO_UPTODATE, &bio->bi_flags);
311 if (done) {
312 bio_endio(bio, 0);
313
314
315
316
317 allow_barrier(conf);
318 }
319 free_r10bio(r10_bio);
320}
321
322
323
324
325static inline void update_head_pos(int slot, struct r10bio *r10_bio)
326{
327 struct r10conf *conf = r10_bio->mddev->private;
328
329 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
330 r10_bio->devs[slot].addr + (r10_bio->sectors);
331}
332
333
334
335
336static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
337 struct bio *bio, int *slotp, int *replp)
338{
339 int slot;
340 int repl = 0;
341
342 for (slot = 0; slot < conf->copies; slot++) {
343 if (r10_bio->devs[slot].bio == bio)
344 break;
345 if (r10_bio->devs[slot].repl_bio == bio) {
346 repl = 1;
347 break;
348 }
349 }
350
351 BUG_ON(slot == conf->copies);
352 update_head_pos(slot, r10_bio);
353
354 if (slotp)
355 *slotp = slot;
356 if (replp)
357 *replp = repl;
358 return r10_bio->devs[slot].devnum;
359}
360
361static void raid10_end_read_request(struct bio *bio, int error)
362{
363 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
364 struct r10bio *r10_bio = bio->bi_private;
365 int slot, dev;
366 struct md_rdev *rdev;
367 struct r10conf *conf = r10_bio->mddev->private;
368
369
370 slot = r10_bio->read_slot;
371 dev = r10_bio->devs[slot].devnum;
372 rdev = r10_bio->devs[slot].rdev;
373
374
375
376 update_head_pos(slot, r10_bio);
377
378 if (uptodate) {
379
380
381
382
383
384
385
386
387
388 set_bit(R10BIO_Uptodate, &r10_bio->state);
389 } else {
390
391
392
393
394
395 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
396 rdev->raid_disk))
397 uptodate = 1;
398 }
399 if (uptodate) {
400 raid_end_bio_io(r10_bio);
401 rdev_dec_pending(rdev, conf->mddev);
402 } else {
403
404
405
406 char b[BDEVNAME_SIZE];
407 printk_ratelimited(KERN_ERR
408 "md/raid10:%s: %s: rescheduling sector %llu\n",
409 mdname(conf->mddev),
410 bdevname(rdev->bdev, b),
411 (unsigned long long)r10_bio->sector);
412 set_bit(R10BIO_ReadError, &r10_bio->state);
413 reschedule_retry(r10_bio);
414 }
415}
416
417static void close_write(struct r10bio *r10_bio)
418{
419
420 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
421 r10_bio->sectors,
422 !test_bit(R10BIO_Degraded, &r10_bio->state),
423 0);
424 md_write_end(r10_bio->mddev);
425}
426
427static void one_write_done(struct r10bio *r10_bio)
428{
429 if (atomic_dec_and_test(&r10_bio->remaining)) {
430 if (test_bit(R10BIO_WriteError, &r10_bio->state))
431 reschedule_retry(r10_bio);
432 else {
433 close_write(r10_bio);
434 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
435 reschedule_retry(r10_bio);
436 else
437 raid_end_bio_io(r10_bio);
438 }
439 }
440}
441
442static void raid10_end_write_request(struct bio *bio, int error)
443{
444 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
445 struct r10bio *r10_bio = bio->bi_private;
446 int dev;
447 int dec_rdev = 1;
448 struct r10conf *conf = r10_bio->mddev->private;
449 int slot, repl;
450 struct md_rdev *rdev = NULL;
451
452 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
453
454 if (repl)
455 rdev = conf->mirrors[dev].replacement;
456 if (!rdev) {
457 smp_rmb();
458 repl = 0;
459 rdev = conf->mirrors[dev].rdev;
460 }
461
462
463
464 if (!uptodate) {
465 if (repl)
466
467
468
469 md_error(rdev->mddev, rdev);
470 else {
471 set_bit(WriteErrorSeen, &rdev->flags);
472 if (!test_and_set_bit(WantReplacement, &rdev->flags))
473 set_bit(MD_RECOVERY_NEEDED,
474 &rdev->mddev->recovery);
475 set_bit(R10BIO_WriteError, &r10_bio->state);
476 dec_rdev = 0;
477 }
478 } else {
479
480
481
482
483
484
485
486
487
488 sector_t first_bad;
489 int bad_sectors;
490
491
492
493
494
495
496
497
498
499 if (test_bit(In_sync, &rdev->flags) &&
500 !test_bit(Faulty, &rdev->flags))
501 set_bit(R10BIO_Uptodate, &r10_bio->state);
502
503
504 if (is_badblock(rdev,
505 r10_bio->devs[slot].addr,
506 r10_bio->sectors,
507 &first_bad, &bad_sectors)) {
508 bio_put(bio);
509 if (repl)
510 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
511 else
512 r10_bio->devs[slot].bio = IO_MADE_GOOD;
513 dec_rdev = 0;
514 set_bit(R10BIO_MadeGood, &r10_bio->state);
515 }
516 }
517
518
519
520
521
522
523 one_write_done(r10_bio);
524 if (dec_rdev)
525 rdev_dec_pending(rdev, conf->mddev);
526}
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
554{
555 int n,f;
556 sector_t sector;
557 sector_t chunk;
558 sector_t stripe;
559 int dev;
560 int slot = 0;
561 int last_far_set_start, last_far_set_size;
562
563 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
564 last_far_set_start *= geo->far_set_size;
565
566 last_far_set_size = geo->far_set_size;
567 last_far_set_size += (geo->raid_disks % geo->far_set_size);
568
569
570 chunk = r10bio->sector >> geo->chunk_shift;
571 sector = r10bio->sector & geo->chunk_mask;
572
573 chunk *= geo->near_copies;
574 stripe = chunk;
575 dev = sector_div(stripe, geo->raid_disks);
576 if (geo->far_offset)
577 stripe *= geo->far_copies;
578
579 sector += stripe << geo->chunk_shift;
580
581
582 for (n = 0; n < geo->near_copies; n++) {
583 int d = dev;
584 int set;
585 sector_t s = sector;
586 r10bio->devs[slot].devnum = d;
587 r10bio->devs[slot].addr = s;
588 slot++;
589
590 for (f = 1; f < geo->far_copies; f++) {
591 set = d / geo->far_set_size;
592 d += geo->near_copies;
593
594 if ((geo->raid_disks % geo->far_set_size) &&
595 (d > last_far_set_start)) {
596 d -= last_far_set_start;
597 d %= last_far_set_size;
598 d += last_far_set_start;
599 } else {
600 d %= geo->far_set_size;
601 d += geo->far_set_size * set;
602 }
603 s += geo->stride;
604 r10bio->devs[slot].devnum = d;
605 r10bio->devs[slot].addr = s;
606 slot++;
607 }
608 dev++;
609 if (dev >= geo->raid_disks) {
610 dev = 0;
611 sector += (geo->chunk_mask + 1);
612 }
613 }
614}
615
616static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
617{
618 struct geom *geo = &conf->geo;
619
620 if (conf->reshape_progress != MaxSector &&
621 ((r10bio->sector >= conf->reshape_progress) !=
622 conf->mddev->reshape_backwards)) {
623 set_bit(R10BIO_Previous, &r10bio->state);
624 geo = &conf->prev;
625 } else
626 clear_bit(R10BIO_Previous, &r10bio->state);
627
628 __raid10_find_phys(geo, r10bio);
629}
630
631static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
632{
633 sector_t offset, chunk, vchunk;
634
635
636
637 struct geom *geo = &conf->geo;
638 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
639 int far_set_size = geo->far_set_size;
640 int last_far_set_start;
641
642 if (geo->raid_disks % geo->far_set_size) {
643 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
644 last_far_set_start *= geo->far_set_size;
645
646 if (dev >= last_far_set_start) {
647 far_set_size = geo->far_set_size;
648 far_set_size += (geo->raid_disks % geo->far_set_size);
649 far_set_start = last_far_set_start;
650 }
651 }
652
653 offset = sector & geo->chunk_mask;
654 if (geo->far_offset) {
655 int fc;
656 chunk = sector >> geo->chunk_shift;
657 fc = sector_div(chunk, geo->far_copies);
658 dev -= fc * geo->near_copies;
659 if (dev < far_set_start)
660 dev += far_set_size;
661 } else {
662 while (sector >= geo->stride) {
663 sector -= geo->stride;
664 if (dev < (geo->near_copies + far_set_start))
665 dev += far_set_size - geo->near_copies;
666 else
667 dev -= geo->near_copies;
668 }
669 chunk = sector >> geo->chunk_shift;
670 }
671 vchunk = chunk * geo->raid_disks + dev;
672 sector_div(vchunk, geo->near_copies);
673 return (vchunk << geo->chunk_shift) + offset;
674}
675
676
677
678
679
680
681
682
683
684
685
686static int raid10_mergeable_bvec(struct request_queue *q,
687 struct bvec_merge_data *bvm,
688 struct bio_vec *biovec)
689{
690 struct mddev *mddev = q->queuedata;
691 struct r10conf *conf = mddev->private;
692 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
693 int max;
694 unsigned int chunk_sectors;
695 unsigned int bio_sectors = bvm->bi_size >> 9;
696 struct geom *geo = &conf->geo;
697
698 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
699 if (conf->reshape_progress != MaxSector &&
700 ((sector >= conf->reshape_progress) !=
701 conf->mddev->reshape_backwards))
702 geo = &conf->prev;
703
704 if (geo->near_copies < geo->raid_disks) {
705 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
706 + bio_sectors)) << 9;
707 if (max < 0)
708
709 max = 0;
710 if (max <= biovec->bv_len && bio_sectors == 0)
711 return biovec->bv_len;
712 } else
713 max = biovec->bv_len;
714
715 if (mddev->merge_check_needed) {
716 struct {
717 struct r10bio r10_bio;
718 struct r10dev devs[conf->copies];
719 } on_stack;
720 struct r10bio *r10_bio = &on_stack.r10_bio;
721 int s;
722 if (conf->reshape_progress != MaxSector) {
723
724 if (max <= biovec->bv_len && bio_sectors == 0)
725 return biovec->bv_len;
726 return 0;
727 }
728 r10_bio->sector = sector;
729 raid10_find_phys(conf, r10_bio);
730 rcu_read_lock();
731 for (s = 0; s < conf->copies; s++) {
732 int disk = r10_bio->devs[s].devnum;
733 struct md_rdev *rdev = rcu_dereference(
734 conf->mirrors[disk].rdev);
735 if (rdev && !test_bit(Faulty, &rdev->flags)) {
736 struct request_queue *q =
737 bdev_get_queue(rdev->bdev);
738 if (q->merge_bvec_fn) {
739 bvm->bi_sector = r10_bio->devs[s].addr
740 + rdev->data_offset;
741 bvm->bi_bdev = rdev->bdev;
742 max = min(max, q->merge_bvec_fn(
743 q, bvm, biovec));
744 }
745 }
746 rdev = rcu_dereference(conf->mirrors[disk].replacement);
747 if (rdev && !test_bit(Faulty, &rdev->flags)) {
748 struct request_queue *q =
749 bdev_get_queue(rdev->bdev);
750 if (q->merge_bvec_fn) {
751 bvm->bi_sector = r10_bio->devs[s].addr
752 + rdev->data_offset;
753 bvm->bi_bdev = rdev->bdev;
754 max = min(max, q->merge_bvec_fn(
755 q, bvm, biovec));
756 }
757 }
758 }
759 rcu_read_unlock();
760 }
761 return max;
762}
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783static struct md_rdev *read_balance(struct r10conf *conf,
784 struct r10bio *r10_bio,
785 int *max_sectors)
786{
787 const sector_t this_sector = r10_bio->sector;
788 int disk, slot;
789 int sectors = r10_bio->sectors;
790 int best_good_sectors;
791 sector_t new_distance, best_dist;
792 struct md_rdev *best_rdev, *rdev = NULL;
793 int do_balance;
794 int best_slot;
795 struct geom *geo = &conf->geo;
796
797 raid10_find_phys(conf, r10_bio);
798 rcu_read_lock();
799retry:
800 sectors = r10_bio->sectors;
801 best_slot = -1;
802 best_rdev = NULL;
803 best_dist = MaxSector;
804 best_good_sectors = 0;
805 do_balance = 1;
806
807
808
809
810
811
812 if (conf->mddev->recovery_cp < MaxSector
813 && (this_sector + sectors >= conf->next_resync))
814 do_balance = 0;
815
816 for (slot = 0; slot < conf->copies ; slot++) {
817 sector_t first_bad;
818 int bad_sectors;
819 sector_t dev_sector;
820
821 if (r10_bio->devs[slot].bio == IO_BLOCKED)
822 continue;
823 disk = r10_bio->devs[slot].devnum;
824 rdev = rcu_dereference(conf->mirrors[disk].replacement);
825 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
826 test_bit(Unmerged, &rdev->flags) ||
827 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
828 rdev = rcu_dereference(conf->mirrors[disk].rdev);
829 if (rdev == NULL ||
830 test_bit(Faulty, &rdev->flags) ||
831 test_bit(Unmerged, &rdev->flags))
832 continue;
833 if (!test_bit(In_sync, &rdev->flags) &&
834 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
835 continue;
836
837 dev_sector = r10_bio->devs[slot].addr;
838 if (is_badblock(rdev, dev_sector, sectors,
839 &first_bad, &bad_sectors)) {
840 if (best_dist < MaxSector)
841
842 continue;
843 if (first_bad <= dev_sector) {
844
845
846
847
848 bad_sectors -= (dev_sector - first_bad);
849 if (!do_balance && sectors > bad_sectors)
850 sectors = bad_sectors;
851 if (best_good_sectors > sectors)
852 best_good_sectors = sectors;
853 } else {
854 sector_t good_sectors =
855 first_bad - dev_sector;
856 if (good_sectors > best_good_sectors) {
857 best_good_sectors = good_sectors;
858 best_slot = slot;
859 best_rdev = rdev;
860 }
861 if (!do_balance)
862
863 break;
864 }
865 continue;
866 } else
867 best_good_sectors = sectors;
868
869 if (!do_balance)
870 break;
871
872
873
874
875
876 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
877 break;
878
879
880 if (geo->far_copies > 1)
881 new_distance = r10_bio->devs[slot].addr;
882 else
883 new_distance = abs(r10_bio->devs[slot].addr -
884 conf->mirrors[disk].head_position);
885 if (new_distance < best_dist) {
886 best_dist = new_distance;
887 best_slot = slot;
888 best_rdev = rdev;
889 }
890 }
891 if (slot >= conf->copies) {
892 slot = best_slot;
893 rdev = best_rdev;
894 }
895
896 if (slot >= 0) {
897 atomic_inc(&rdev->nr_pending);
898 if (test_bit(Faulty, &rdev->flags)) {
899
900
901
902 rdev_dec_pending(rdev, conf->mddev);
903 goto retry;
904 }
905 r10_bio->read_slot = slot;
906 } else
907 rdev = NULL;
908 rcu_read_unlock();
909 *max_sectors = best_good_sectors;
910
911 return rdev;
912}
913
914int md_raid10_congested(struct mddev *mddev, int bits)
915{
916 struct r10conf *conf = mddev->private;
917 int i, ret = 0;
918
919 if ((bits & (1 << BDI_async_congested)) &&
920 conf->pending_count >= max_queued_requests)
921 return 1;
922
923 rcu_read_lock();
924 for (i = 0;
925 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
926 && ret == 0;
927 i++) {
928 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
929 if (rdev && !test_bit(Faulty, &rdev->flags)) {
930 struct request_queue *q = bdev_get_queue(rdev->bdev);
931
932 ret |= bdi_congested(&q->backing_dev_info, bits);
933 }
934 }
935 rcu_read_unlock();
936 return ret;
937}
938EXPORT_SYMBOL_GPL(md_raid10_congested);
939
940static int raid10_congested(void *data, int bits)
941{
942 struct mddev *mddev = data;
943
944 return mddev_congested(mddev, bits) ||
945 md_raid10_congested(mddev, bits);
946}
947
948static void flush_pending_writes(struct r10conf *conf)
949{
950
951
952
953 spin_lock_irq(&conf->device_lock);
954
955 if (conf->pending_bio_list.head) {
956 struct bio *bio;
957 bio = bio_list_get(&conf->pending_bio_list);
958 conf->pending_count = 0;
959 spin_unlock_irq(&conf->device_lock);
960
961
962 bitmap_unplug(conf->mddev->bitmap);
963 wake_up(&conf->wait_barrier);
964
965 while (bio) {
966 struct bio *next = bio->bi_next;
967 bio->bi_next = NULL;
968 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
969 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
970
971 bio_endio(bio, 0);
972 else
973 generic_make_request(bio);
974 bio = next;
975 }
976 } else
977 spin_unlock_irq(&conf->device_lock);
978}
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002static void raise_barrier(struct r10conf *conf, int force)
1003{
1004 BUG_ON(force && !conf->barrier);
1005 spin_lock_irq(&conf->resync_lock);
1006
1007
1008 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
1009 conf->resync_lock);
1010
1011
1012 conf->barrier++;
1013
1014
1015 wait_event_lock_irq(conf->wait_barrier,
1016 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
1017 conf->resync_lock);
1018
1019 spin_unlock_irq(&conf->resync_lock);
1020}
1021
1022static void lower_barrier(struct r10conf *conf)
1023{
1024 unsigned long flags;
1025 spin_lock_irqsave(&conf->resync_lock, flags);
1026 conf->barrier--;
1027 spin_unlock_irqrestore(&conf->resync_lock, flags);
1028 wake_up(&conf->wait_barrier);
1029}
1030
1031static void wait_barrier(struct r10conf *conf)
1032{
1033 spin_lock_irq(&conf->resync_lock);
1034 if (conf->barrier) {
1035 conf->nr_waiting++;
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045 wait_event_lock_irq(conf->wait_barrier,
1046 !conf->barrier ||
1047 (conf->nr_pending &&
1048 current->bio_list &&
1049 !bio_list_empty(current->bio_list)),
1050 conf->resync_lock);
1051 conf->nr_waiting--;
1052 }
1053 conf->nr_pending++;
1054 spin_unlock_irq(&conf->resync_lock);
1055}
1056
1057static void allow_barrier(struct r10conf *conf)
1058{
1059 unsigned long flags;
1060 spin_lock_irqsave(&conf->resync_lock, flags);
1061 conf->nr_pending--;
1062 spin_unlock_irqrestore(&conf->resync_lock, flags);
1063 wake_up(&conf->wait_barrier);
1064}
1065
1066static void freeze_array(struct r10conf *conf, int extra)
1067{
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080 spin_lock_irq(&conf->resync_lock);
1081 conf->barrier++;
1082 conf->nr_waiting++;
1083 wait_event_lock_irq_cmd(conf->wait_barrier,
1084 conf->nr_pending == conf->nr_queued+extra,
1085 conf->resync_lock,
1086 flush_pending_writes(conf));
1087
1088 spin_unlock_irq(&conf->resync_lock);
1089}
1090
1091static void unfreeze_array(struct r10conf *conf)
1092{
1093
1094 spin_lock_irq(&conf->resync_lock);
1095 conf->barrier--;
1096 conf->nr_waiting--;
1097 wake_up(&conf->wait_barrier);
1098 spin_unlock_irq(&conf->resync_lock);
1099}
1100
1101static sector_t choose_data_offset(struct r10bio *r10_bio,
1102 struct md_rdev *rdev)
1103{
1104 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1105 test_bit(R10BIO_Previous, &r10_bio->state))
1106 return rdev->data_offset;
1107 else
1108 return rdev->new_data_offset;
1109}
1110
1111struct raid10_plug_cb {
1112 struct blk_plug_cb cb;
1113 struct bio_list pending;
1114 int pending_cnt;
1115};
1116
1117static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1118{
1119 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1120 cb);
1121 struct mddev *mddev = plug->cb.data;
1122 struct r10conf *conf = mddev->private;
1123 struct bio *bio;
1124
1125 if (from_schedule || current->bio_list) {
1126 spin_lock_irq(&conf->device_lock);
1127 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1128 conf->pending_count += plug->pending_cnt;
1129 spin_unlock_irq(&conf->device_lock);
1130 wake_up(&conf->wait_barrier);
1131 md_wakeup_thread(mddev->thread);
1132 kfree(plug);
1133 return;
1134 }
1135
1136
1137 bio = bio_list_get(&plug->pending);
1138 bitmap_unplug(mddev->bitmap);
1139 wake_up(&conf->wait_barrier);
1140
1141 while (bio) {
1142 struct bio *next = bio->bi_next;
1143 bio->bi_next = NULL;
1144 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
1145 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
1146
1147 bio_endio(bio, 0);
1148 else
1149 generic_make_request(bio);
1150 bio = next;
1151 }
1152 kfree(plug);
1153}
1154
1155static void make_request(struct mddev *mddev, struct bio * bio)
1156{
1157 struct r10conf *conf = mddev->private;
1158 struct r10bio *r10_bio;
1159 struct bio *read_bio;
1160 int i;
1161 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1162 int chunk_sects = chunk_mask + 1;
1163 const int rw = bio_data_dir(bio);
1164 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1165 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1166 const unsigned long do_discard = (bio->bi_rw
1167 & (REQ_DISCARD | REQ_SECURE));
1168 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1169 unsigned long flags;
1170 struct md_rdev *blocked_rdev;
1171 struct blk_plug_cb *cb;
1172 struct raid10_plug_cb *plug = NULL;
1173 int sectors_handled;
1174 int max_sectors;
1175 int sectors;
1176
1177 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1178 md_flush_request(mddev, bio);
1179 return;
1180 }
1181
1182
1183
1184
1185 if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
1186 > chunk_sects
1187 && (conf->geo.near_copies < conf->geo.raid_disks
1188 || conf->prev.near_copies < conf->prev.raid_disks))) {
1189 struct bio_pair *bp;
1190
1191 if (bio_segments(bio) > 1)
1192 goto bad_map;
1193
1194
1195
1196 bp = bio_split(bio,
1197 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207 spin_lock_irq(&conf->resync_lock);
1208 conf->nr_waiting++;
1209 spin_unlock_irq(&conf->resync_lock);
1210
1211 make_request(mddev, &bp->bio1);
1212 make_request(mddev, &bp->bio2);
1213
1214 spin_lock_irq(&conf->resync_lock);
1215 conf->nr_waiting--;
1216 wake_up(&conf->wait_barrier);
1217 spin_unlock_irq(&conf->resync_lock);
1218
1219 bio_pair_release(bp);
1220 return;
1221 bad_map:
1222 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1223 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1224 (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
1225
1226 bio_io_error(bio);
1227 return;
1228 }
1229
1230 md_write_start(mddev, bio);
1231
1232
1233
1234
1235
1236
1237 wait_barrier(conf);
1238
1239 sectors = bio_sectors(bio);
1240 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1241 bio->bi_sector < conf->reshape_progress &&
1242 bio->bi_sector + sectors > conf->reshape_progress) {
1243
1244
1245
1246 allow_barrier(conf);
1247 wait_event(conf->wait_barrier,
1248 conf->reshape_progress <= bio->bi_sector ||
1249 conf->reshape_progress >= bio->bi_sector + sectors);
1250 wait_barrier(conf);
1251 }
1252 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1253 bio_data_dir(bio) == WRITE &&
1254 (mddev->reshape_backwards
1255 ? (bio->bi_sector < conf->reshape_safe &&
1256 bio->bi_sector + sectors > conf->reshape_progress)
1257 : (bio->bi_sector + sectors > conf->reshape_safe &&
1258 bio->bi_sector < conf->reshape_progress))) {
1259
1260 mddev->reshape_position = conf->reshape_progress;
1261 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1262 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1263 md_wakeup_thread(mddev->thread);
1264 wait_event(mddev->sb_wait,
1265 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1266
1267 conf->reshape_safe = mddev->reshape_position;
1268 }
1269
1270 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1271
1272 r10_bio->master_bio = bio;
1273 r10_bio->sectors = sectors;
1274
1275 r10_bio->mddev = mddev;
1276 r10_bio->sector = bio->bi_sector;
1277 r10_bio->state = 0;
1278
1279
1280
1281
1282
1283
1284
1285
1286 bio->bi_phys_segments = 0;
1287 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1288
1289 if (rw == READ) {
1290
1291
1292
1293 struct md_rdev *rdev;
1294 int slot;
1295
1296read_again:
1297 rdev = read_balance(conf, r10_bio, &max_sectors);
1298 if (!rdev) {
1299 raid_end_bio_io(r10_bio);
1300 return;
1301 }
1302 slot = r10_bio->read_slot;
1303
1304 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1305 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1306 max_sectors);
1307
1308 r10_bio->devs[slot].bio = read_bio;
1309 r10_bio->devs[slot].rdev = rdev;
1310
1311 read_bio->bi_sector = r10_bio->devs[slot].addr +
1312 choose_data_offset(r10_bio, rdev);
1313 read_bio->bi_bdev = rdev->bdev;
1314 read_bio->bi_end_io = raid10_end_read_request;
1315 read_bio->bi_rw = READ | do_sync;
1316 read_bio->bi_private = r10_bio;
1317
1318 if (max_sectors < r10_bio->sectors) {
1319
1320
1321
1322 sectors_handled = (r10_bio->sectors + max_sectors
1323 - bio->bi_sector);
1324 r10_bio->sectors = max_sectors;
1325 spin_lock_irq(&conf->device_lock);
1326 if (bio->bi_phys_segments == 0)
1327 bio->bi_phys_segments = 2;
1328 else
1329 bio->bi_phys_segments++;
1330 spin_unlock(&conf->device_lock);
1331
1332
1333
1334
1335
1336 reschedule_retry(r10_bio);
1337
1338 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1339
1340 r10_bio->master_bio = bio;
1341 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1342 r10_bio->state = 0;
1343 r10_bio->mddev = mddev;
1344 r10_bio->sector = bio->bi_sector + sectors_handled;
1345 goto read_again;
1346 } else
1347 generic_make_request(read_bio);
1348 return;
1349 }
1350
1351
1352
1353
1354 if (conf->pending_count >= max_queued_requests) {
1355 md_wakeup_thread(mddev->thread);
1356 wait_event(conf->wait_barrier,
1357 conf->pending_count < max_queued_requests);
1358 }
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371 r10_bio->read_slot = -1;
1372 raid10_find_phys(conf, r10_bio);
1373retry_write:
1374 blocked_rdev = NULL;
1375 rcu_read_lock();
1376 max_sectors = r10_bio->sectors;
1377
1378 for (i = 0; i < conf->copies; i++) {
1379 int d = r10_bio->devs[i].devnum;
1380 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1381 struct md_rdev *rrdev = rcu_dereference(
1382 conf->mirrors[d].replacement);
1383 if (rdev == rrdev)
1384 rrdev = NULL;
1385 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1386 atomic_inc(&rdev->nr_pending);
1387 blocked_rdev = rdev;
1388 break;
1389 }
1390 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1391 atomic_inc(&rrdev->nr_pending);
1392 blocked_rdev = rrdev;
1393 break;
1394 }
1395 if (rdev && (test_bit(Faulty, &rdev->flags)
1396 || test_bit(Unmerged, &rdev->flags)))
1397 rdev = NULL;
1398 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1399 || test_bit(Unmerged, &rrdev->flags)))
1400 rrdev = NULL;
1401
1402 r10_bio->devs[i].bio = NULL;
1403 r10_bio->devs[i].repl_bio = NULL;
1404
1405 if (!rdev && !rrdev) {
1406 set_bit(R10BIO_Degraded, &r10_bio->state);
1407 continue;
1408 }
1409 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1410 sector_t first_bad;
1411 sector_t dev_sector = r10_bio->devs[i].addr;
1412 int bad_sectors;
1413 int is_bad;
1414
1415 is_bad = is_badblock(rdev, dev_sector,
1416 max_sectors,
1417 &first_bad, &bad_sectors);
1418 if (is_bad < 0) {
1419
1420
1421
1422 atomic_inc(&rdev->nr_pending);
1423 set_bit(BlockedBadBlocks, &rdev->flags);
1424 blocked_rdev = rdev;
1425 break;
1426 }
1427 if (is_bad && first_bad <= dev_sector) {
1428
1429 bad_sectors -= (dev_sector - first_bad);
1430 if (bad_sectors < max_sectors)
1431
1432
1433
1434 max_sectors = bad_sectors;
1435
1436
1437
1438
1439
1440
1441
1442
1443 continue;
1444 }
1445 if (is_bad) {
1446 int good_sectors = first_bad - dev_sector;
1447 if (good_sectors < max_sectors)
1448 max_sectors = good_sectors;
1449 }
1450 }
1451 if (rdev) {
1452 r10_bio->devs[i].bio = bio;
1453 atomic_inc(&rdev->nr_pending);
1454 }
1455 if (rrdev) {
1456 r10_bio->devs[i].repl_bio = bio;
1457 atomic_inc(&rrdev->nr_pending);
1458 }
1459 }
1460 rcu_read_unlock();
1461
1462 if (unlikely(blocked_rdev)) {
1463
1464 int j;
1465 int d;
1466
1467 for (j = 0; j < i; j++) {
1468 if (r10_bio->devs[j].bio) {
1469 d = r10_bio->devs[j].devnum;
1470 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1471 }
1472 if (r10_bio->devs[j].repl_bio) {
1473 struct md_rdev *rdev;
1474 d = r10_bio->devs[j].devnum;
1475 rdev = conf->mirrors[d].replacement;
1476 if (!rdev) {
1477
1478 smp_mb();
1479 rdev = conf->mirrors[d].rdev;
1480 }
1481 rdev_dec_pending(rdev, mddev);
1482 }
1483 }
1484 allow_barrier(conf);
1485 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1486 wait_barrier(conf);
1487 goto retry_write;
1488 }
1489
1490 if (max_sectors < r10_bio->sectors) {
1491
1492
1493
1494 r10_bio->sectors = max_sectors;
1495 spin_lock_irq(&conf->device_lock);
1496 if (bio->bi_phys_segments == 0)
1497 bio->bi_phys_segments = 2;
1498 else
1499 bio->bi_phys_segments++;
1500 spin_unlock_irq(&conf->device_lock);
1501 }
1502 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1503
1504 atomic_set(&r10_bio->remaining, 1);
1505 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1506
1507 for (i = 0; i < conf->copies; i++) {
1508 struct bio *mbio;
1509 int d = r10_bio->devs[i].devnum;
1510 if (r10_bio->devs[i].bio) {
1511 struct md_rdev *rdev = conf->mirrors[d].rdev;
1512 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1513 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1514 max_sectors);
1515 r10_bio->devs[i].bio = mbio;
1516
1517 mbio->bi_sector = (r10_bio->devs[i].addr+
1518 choose_data_offset(r10_bio,
1519 rdev));
1520 mbio->bi_bdev = rdev->bdev;
1521 mbio->bi_end_io = raid10_end_write_request;
1522 mbio->bi_rw =
1523 WRITE | do_sync | do_fua | do_discard | do_same;
1524 mbio->bi_private = r10_bio;
1525
1526 atomic_inc(&r10_bio->remaining);
1527
1528 cb = blk_check_plugged(raid10_unplug, mddev,
1529 sizeof(*plug));
1530 if (cb)
1531 plug = container_of(cb, struct raid10_plug_cb,
1532 cb);
1533 else
1534 plug = NULL;
1535 spin_lock_irqsave(&conf->device_lock, flags);
1536 if (plug) {
1537 bio_list_add(&plug->pending, mbio);
1538 plug->pending_cnt++;
1539 } else {
1540 bio_list_add(&conf->pending_bio_list, mbio);
1541 conf->pending_count++;
1542 }
1543 spin_unlock_irqrestore(&conf->device_lock, flags);
1544 if (!plug)
1545 md_wakeup_thread(mddev->thread);
1546 }
1547
1548 if (r10_bio->devs[i].repl_bio) {
1549 struct md_rdev *rdev = conf->mirrors[d].replacement;
1550 if (rdev == NULL) {
1551
1552 smp_mb();
1553 rdev = conf->mirrors[d].rdev;
1554 }
1555 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1556 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1557 max_sectors);
1558 r10_bio->devs[i].repl_bio = mbio;
1559
1560 mbio->bi_sector = (r10_bio->devs[i].addr +
1561 choose_data_offset(
1562 r10_bio, rdev));
1563 mbio->bi_bdev = rdev->bdev;
1564 mbio->bi_end_io = raid10_end_write_request;
1565 mbio->bi_rw =
1566 WRITE | do_sync | do_fua | do_discard | do_same;
1567 mbio->bi_private = r10_bio;
1568
1569 atomic_inc(&r10_bio->remaining);
1570 spin_lock_irqsave(&conf->device_lock, flags);
1571 bio_list_add(&conf->pending_bio_list, mbio);
1572 conf->pending_count++;
1573 spin_unlock_irqrestore(&conf->device_lock, flags);
1574 if (!mddev_check_plugged(mddev))
1575 md_wakeup_thread(mddev->thread);
1576 }
1577 }
1578
1579
1580
1581
1582
1583 if (sectors_handled < bio_sectors(bio)) {
1584 one_write_done(r10_bio);
1585
1586
1587
1588 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1589
1590 r10_bio->master_bio = bio;
1591 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1592
1593 r10_bio->mddev = mddev;
1594 r10_bio->sector = bio->bi_sector + sectors_handled;
1595 r10_bio->state = 0;
1596 goto retry_write;
1597 }
1598 one_write_done(r10_bio);
1599
1600
1601 wake_up(&conf->wait_barrier);
1602}
1603
1604static void status(struct seq_file *seq, struct mddev *mddev)
1605{
1606 struct r10conf *conf = mddev->private;
1607 int i;
1608
1609 if (conf->geo.near_copies < conf->geo.raid_disks)
1610 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1611 if (conf->geo.near_copies > 1)
1612 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1613 if (conf->geo.far_copies > 1) {
1614 if (conf->geo.far_offset)
1615 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1616 else
1617 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1618 }
1619 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1620 conf->geo.raid_disks - mddev->degraded);
1621 for (i = 0; i < conf->geo.raid_disks; i++)
1622 seq_printf(seq, "%s",
1623 conf->mirrors[i].rdev &&
1624 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1625 seq_printf(seq, "]");
1626}
1627
1628
1629
1630
1631
1632
1633static int _enough(struct r10conf *conf, int previous, int ignore)
1634{
1635 int first = 0;
1636 int has_enough = 0;
1637 int disks, ncopies;
1638 if (previous) {
1639 disks = conf->prev.raid_disks;
1640 ncopies = conf->prev.near_copies;
1641 } else {
1642 disks = conf->geo.raid_disks;
1643 ncopies = conf->geo.near_copies;
1644 }
1645
1646 rcu_read_lock();
1647 do {
1648 int n = conf->copies;
1649 int cnt = 0;
1650 int this = first;
1651 while (n--) {
1652 struct md_rdev *rdev;
1653 if (this != ignore &&
1654 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1655 test_bit(In_sync, &rdev->flags))
1656 cnt++;
1657 this = (this+1) % disks;
1658 }
1659 if (cnt == 0)
1660 goto out;
1661 first = (first + ncopies) % disks;
1662 } while (first != 0);
1663 has_enough = 1;
1664out:
1665 rcu_read_unlock();
1666 return has_enough;
1667}
1668
1669static int enough(struct r10conf *conf, int ignore)
1670{
1671
1672
1673
1674
1675
1676 return _enough(conf, 0, ignore) &&
1677 _enough(conf, 1, ignore);
1678}
1679
1680static void error(struct mddev *mddev, struct md_rdev *rdev)
1681{
1682 char b[BDEVNAME_SIZE];
1683 struct r10conf *conf = mddev->private;
1684 unsigned long flags;
1685
1686
1687
1688
1689
1690
1691
1692 spin_lock_irqsave(&conf->device_lock, flags);
1693 if (test_bit(In_sync, &rdev->flags)
1694 && !enough(conf, rdev->raid_disk)) {
1695
1696
1697
1698 spin_unlock_irqrestore(&conf->device_lock, flags);
1699 return;
1700 }
1701 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1702 mddev->degraded++;
1703
1704
1705
1706 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1707 }
1708 set_bit(Blocked, &rdev->flags);
1709 set_bit(Faulty, &rdev->flags);
1710 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1711 spin_unlock_irqrestore(&conf->device_lock, flags);
1712 printk(KERN_ALERT
1713 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1714 "md/raid10:%s: Operation continuing on %d devices.\n",
1715 mdname(mddev), bdevname(rdev->bdev, b),
1716 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1717}
1718
1719static void print_conf(struct r10conf *conf)
1720{
1721 int i;
1722 struct raid10_info *tmp;
1723
1724 printk(KERN_DEBUG "RAID10 conf printout:\n");
1725 if (!conf) {
1726 printk(KERN_DEBUG "(!conf)\n");
1727 return;
1728 }
1729 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1730 conf->geo.raid_disks);
1731
1732 for (i = 0; i < conf->geo.raid_disks; i++) {
1733 char b[BDEVNAME_SIZE];
1734 tmp = conf->mirrors + i;
1735 if (tmp->rdev)
1736 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1737 i, !test_bit(In_sync, &tmp->rdev->flags),
1738 !test_bit(Faulty, &tmp->rdev->flags),
1739 bdevname(tmp->rdev->bdev,b));
1740 }
1741}
1742
1743static void close_sync(struct r10conf *conf)
1744{
1745 wait_barrier(conf);
1746 allow_barrier(conf);
1747
1748 mempool_destroy(conf->r10buf_pool);
1749 conf->r10buf_pool = NULL;
1750}
1751
1752static int raid10_spare_active(struct mddev *mddev)
1753{
1754 int i;
1755 struct r10conf *conf = mddev->private;
1756 struct raid10_info *tmp;
1757 int count = 0;
1758 unsigned long flags;
1759
1760
1761
1762
1763
1764 for (i = 0; i < conf->geo.raid_disks; i++) {
1765 tmp = conf->mirrors + i;
1766 if (tmp->replacement
1767 && tmp->replacement->recovery_offset == MaxSector
1768 && !test_bit(Faulty, &tmp->replacement->flags)
1769 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1770
1771 if (!tmp->rdev
1772 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1773 count++;
1774 if (tmp->rdev) {
1775
1776
1777
1778
1779 set_bit(Faulty, &tmp->rdev->flags);
1780 sysfs_notify_dirent_safe(
1781 tmp->rdev->sysfs_state);
1782 }
1783 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1784 } else if (tmp->rdev
1785 && !test_bit(Faulty, &tmp->rdev->flags)
1786 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1787 count++;
1788 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1789 }
1790 }
1791 spin_lock_irqsave(&conf->device_lock, flags);
1792 mddev->degraded -= count;
1793 spin_unlock_irqrestore(&conf->device_lock, flags);
1794
1795 print_conf(conf);
1796 return count;
1797}
1798
1799
1800static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1801{
1802 struct r10conf *conf = mddev->private;
1803 int err = -EEXIST;
1804 int mirror;
1805 int first = 0;
1806 int last = conf->geo.raid_disks - 1;
1807 struct request_queue *q = bdev_get_queue(rdev->bdev);
1808
1809 if (mddev->recovery_cp < MaxSector)
1810
1811
1812
1813 return -EBUSY;
1814 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1815 return -EINVAL;
1816
1817 if (rdev->raid_disk >= 0)
1818 first = last = rdev->raid_disk;
1819
1820 if (q->merge_bvec_fn) {
1821 set_bit(Unmerged, &rdev->flags);
1822 mddev->merge_check_needed = 1;
1823 }
1824
1825 if (rdev->saved_raid_disk >= first &&
1826 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1827 mirror = rdev->saved_raid_disk;
1828 else
1829 mirror = first;
1830 for ( ; mirror <= last ; mirror++) {
1831 struct raid10_info *p = &conf->mirrors[mirror];
1832 if (p->recovery_disabled == mddev->recovery_disabled)
1833 continue;
1834 if (p->rdev) {
1835 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1836 p->replacement != NULL)
1837 continue;
1838 clear_bit(In_sync, &rdev->flags);
1839 set_bit(Replacement, &rdev->flags);
1840 rdev->raid_disk = mirror;
1841 err = 0;
1842 if (mddev->gendisk)
1843 disk_stack_limits(mddev->gendisk, rdev->bdev,
1844 rdev->data_offset << 9);
1845 conf->fullsync = 1;
1846 rcu_assign_pointer(p->replacement, rdev);
1847 break;
1848 }
1849
1850 if (mddev->gendisk)
1851 disk_stack_limits(mddev->gendisk, rdev->bdev,
1852 rdev->data_offset << 9);
1853
1854 p->head_position = 0;
1855 p->recovery_disabled = mddev->recovery_disabled - 1;
1856 rdev->raid_disk = mirror;
1857 err = 0;
1858 if (rdev->saved_raid_disk != mirror)
1859 conf->fullsync = 1;
1860 rcu_assign_pointer(p->rdev, rdev);
1861 break;
1862 }
1863 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1864
1865
1866
1867
1868
1869
1870
1871 synchronize_sched();
1872 freeze_array(conf, 0);
1873 unfreeze_array(conf);
1874 clear_bit(Unmerged, &rdev->flags);
1875 }
1876 md_integrity_add_rdev(rdev, mddev);
1877 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1878 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1879
1880 print_conf(conf);
1881 return err;
1882}
1883
1884static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1885{
1886 struct r10conf *conf = mddev->private;
1887 int err = 0;
1888 int number = rdev->raid_disk;
1889 struct md_rdev **rdevp;
1890 struct raid10_info *p = conf->mirrors + number;
1891
1892 print_conf(conf);
1893 if (rdev == p->rdev)
1894 rdevp = &p->rdev;
1895 else if (rdev == p->replacement)
1896 rdevp = &p->replacement;
1897 else
1898 return 0;
1899
1900 if (test_bit(In_sync, &rdev->flags) ||
1901 atomic_read(&rdev->nr_pending)) {
1902 err = -EBUSY;
1903 goto abort;
1904 }
1905
1906
1907
1908 if (!test_bit(Faulty, &rdev->flags) &&
1909 mddev->recovery_disabled != p->recovery_disabled &&
1910 (!p->replacement || p->replacement == rdev) &&
1911 number < conf->geo.raid_disks &&
1912 enough(conf, -1)) {
1913 err = -EBUSY;
1914 goto abort;
1915 }
1916 *rdevp = NULL;
1917 synchronize_rcu();
1918 if (atomic_read(&rdev->nr_pending)) {
1919
1920 err = -EBUSY;
1921 *rdevp = rdev;
1922 goto abort;
1923 } else if (p->replacement) {
1924
1925 p->rdev = p->replacement;
1926 clear_bit(Replacement, &p->replacement->flags);
1927 smp_mb();
1928
1929
1930 p->replacement = NULL;
1931 clear_bit(WantReplacement, &rdev->flags);
1932 } else
1933
1934
1935
1936 clear_bit(WantReplacement, &rdev->flags);
1937
1938 err = md_integrity_register(mddev);
1939
1940abort:
1941
1942 print_conf(conf);
1943 return err;
1944}
1945
1946
1947static void end_sync_read(struct bio *bio, int error)
1948{
1949 struct r10bio *r10_bio = bio->bi_private;
1950 struct r10conf *conf = r10_bio->mddev->private;
1951 int d;
1952
1953 if (bio == r10_bio->master_bio) {
1954
1955 d = r10_bio->read_slot;
1956 } else
1957 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1958
1959 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1960 set_bit(R10BIO_Uptodate, &r10_bio->state);
1961 else
1962
1963
1964
1965 atomic_add(r10_bio->sectors,
1966 &conf->mirrors[d].rdev->corrected_errors);
1967
1968
1969
1970
1971 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1972 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1973 atomic_dec_and_test(&r10_bio->remaining)) {
1974
1975
1976
1977 reschedule_retry(r10_bio);
1978 }
1979}
1980
1981static void end_sync_request(struct r10bio *r10_bio)
1982{
1983 struct mddev *mddev = r10_bio->mddev;
1984
1985 while (atomic_dec_and_test(&r10_bio->remaining)) {
1986 if (r10_bio->master_bio == NULL) {
1987
1988 sector_t s = r10_bio->sectors;
1989 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1990 test_bit(R10BIO_WriteError, &r10_bio->state))
1991 reschedule_retry(r10_bio);
1992 else
1993 put_buf(r10_bio);
1994 md_done_sync(mddev, s, 1);
1995 break;
1996 } else {
1997 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1998 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1999 test_bit(R10BIO_WriteError, &r10_bio->state))
2000 reschedule_retry(r10_bio);
2001 else
2002 put_buf(r10_bio);
2003 r10_bio = r10_bio2;
2004 }
2005 }
2006}
2007
2008static void end_sync_write(struct bio *bio, int error)
2009{
2010 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2011 struct r10bio *r10_bio = bio->bi_private;
2012 struct mddev *mddev = r10_bio->mddev;
2013 struct r10conf *conf = mddev->private;
2014 int d;
2015 sector_t first_bad;
2016 int bad_sectors;
2017 int slot;
2018 int repl;
2019 struct md_rdev *rdev = NULL;
2020
2021 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
2022 if (repl)
2023 rdev = conf->mirrors[d].replacement;
2024 else
2025 rdev = conf->mirrors[d].rdev;
2026
2027 if (!uptodate) {
2028 if (repl)
2029 md_error(mddev, rdev);
2030 else {
2031 set_bit(WriteErrorSeen, &rdev->flags);
2032 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2033 set_bit(MD_RECOVERY_NEEDED,
2034 &rdev->mddev->recovery);
2035 set_bit(R10BIO_WriteError, &r10_bio->state);
2036 }
2037 } else if (is_badblock(rdev,
2038 r10_bio->devs[slot].addr,
2039 r10_bio->sectors,
2040 &first_bad, &bad_sectors))
2041 set_bit(R10BIO_MadeGood, &r10_bio->state);
2042
2043 rdev_dec_pending(rdev, mddev);
2044
2045 end_sync_request(r10_bio);
2046}
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2065{
2066 struct r10conf *conf = mddev->private;
2067 int i, first;
2068 struct bio *tbio, *fbio;
2069 int vcnt;
2070
2071 atomic_set(&r10_bio->remaining, 1);
2072
2073
2074 for (i=0; i<conf->copies; i++)
2075 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
2076 break;
2077
2078 if (i == conf->copies)
2079 goto done;
2080
2081 first = i;
2082 fbio = r10_bio->devs[i].bio;
2083
2084 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2085
2086 for (i=0 ; i < conf->copies ; i++) {
2087 int j, d;
2088
2089 tbio = r10_bio->devs[i].bio;
2090
2091 if (tbio->bi_end_io != end_sync_read)
2092 continue;
2093 if (i == first)
2094 continue;
2095 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
2096
2097
2098
2099
2100 int sectors = r10_bio->sectors;
2101 for (j = 0; j < vcnt; j++) {
2102 int len = PAGE_SIZE;
2103 if (sectors < (len / 512))
2104 len = sectors * 512;
2105 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2106 page_address(tbio->bi_io_vec[j].bv_page),
2107 len))
2108 break;
2109 sectors -= len/512;
2110 }
2111 if (j == vcnt)
2112 continue;
2113 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2114 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2115
2116 continue;
2117 }
2118
2119
2120
2121
2122
2123 bio_reset(tbio);
2124
2125 tbio->bi_vcnt = vcnt;
2126 tbio->bi_size = r10_bio->sectors << 9;
2127 tbio->bi_rw = WRITE;
2128 tbio->bi_private = r10_bio;
2129 tbio->bi_sector = r10_bio->devs[i].addr;
2130
2131 for (j=0; j < vcnt ; j++) {
2132 tbio->bi_io_vec[j].bv_offset = 0;
2133 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
2134
2135 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2136 page_address(fbio->bi_io_vec[j].bv_page),
2137 PAGE_SIZE);
2138 }
2139 tbio->bi_end_io = end_sync_write;
2140
2141 d = r10_bio->devs[i].devnum;
2142 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2143 atomic_inc(&r10_bio->remaining);
2144 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2145
2146 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
2147 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2148 generic_make_request(tbio);
2149 }
2150
2151
2152
2153
2154 for (i = 0; i < conf->copies; i++) {
2155 int j, d;
2156
2157 tbio = r10_bio->devs[i].repl_bio;
2158 if (!tbio || !tbio->bi_end_io)
2159 continue;
2160 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2161 && r10_bio->devs[i].bio != fbio)
2162 for (j = 0; j < vcnt; j++)
2163 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2164 page_address(fbio->bi_io_vec[j].bv_page),
2165 PAGE_SIZE);
2166 d = r10_bio->devs[i].devnum;
2167 atomic_inc(&r10_bio->remaining);
2168 md_sync_acct(conf->mirrors[d].replacement->bdev,
2169 bio_sectors(tbio));
2170 generic_make_request(tbio);
2171 }
2172
2173done:
2174 if (atomic_dec_and_test(&r10_bio->remaining)) {
2175 md_done_sync(mddev, r10_bio->sectors, 1);
2176 put_buf(r10_bio);
2177 }
2178}
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190static void fix_recovery_read_error(struct r10bio *r10_bio)
2191{
2192
2193
2194
2195
2196
2197
2198
2199 struct mddev *mddev = r10_bio->mddev;
2200 struct r10conf *conf = mddev->private;
2201 struct bio *bio = r10_bio->devs[0].bio;
2202 sector_t sect = 0;
2203 int sectors = r10_bio->sectors;
2204 int idx = 0;
2205 int dr = r10_bio->devs[0].devnum;
2206 int dw = r10_bio->devs[1].devnum;
2207
2208 while (sectors) {
2209 int s = sectors;
2210 struct md_rdev *rdev;
2211 sector_t addr;
2212 int ok;
2213
2214 if (s > (PAGE_SIZE>>9))
2215 s = PAGE_SIZE >> 9;
2216
2217 rdev = conf->mirrors[dr].rdev;
2218 addr = r10_bio->devs[0].addr + sect,
2219 ok = sync_page_io(rdev,
2220 addr,
2221 s << 9,
2222 bio->bi_io_vec[idx].bv_page,
2223 READ, false);
2224 if (ok) {
2225 rdev = conf->mirrors[dw].rdev;
2226 addr = r10_bio->devs[1].addr + sect;
2227 ok = sync_page_io(rdev,
2228 addr,
2229 s << 9,
2230 bio->bi_io_vec[idx].bv_page,
2231 WRITE, false);
2232 if (!ok) {
2233 set_bit(WriteErrorSeen, &rdev->flags);
2234 if (!test_and_set_bit(WantReplacement,
2235 &rdev->flags))
2236 set_bit(MD_RECOVERY_NEEDED,
2237 &rdev->mddev->recovery);
2238 }
2239 }
2240 if (!ok) {
2241
2242
2243
2244
2245 rdev_set_badblocks(rdev, addr, s, 0);
2246
2247 if (rdev != conf->mirrors[dw].rdev) {
2248
2249 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2250 addr = r10_bio->devs[1].addr + sect;
2251 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2252 if (!ok) {
2253
2254 printk(KERN_NOTICE
2255 "md/raid10:%s: recovery aborted"
2256 " due to read error\n",
2257 mdname(mddev));
2258
2259 conf->mirrors[dw].recovery_disabled
2260 = mddev->recovery_disabled;
2261 set_bit(MD_RECOVERY_INTR,
2262 &mddev->recovery);
2263 break;
2264 }
2265 }
2266 }
2267
2268 sectors -= s;
2269 sect += s;
2270 idx++;
2271 }
2272}
2273
2274static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2275{
2276 struct r10conf *conf = mddev->private;
2277 int d;
2278 struct bio *wbio, *wbio2;
2279
2280 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2281 fix_recovery_read_error(r10_bio);
2282 end_sync_request(r10_bio);
2283 return;
2284 }
2285
2286
2287
2288
2289
2290 d = r10_bio->devs[1].devnum;
2291 wbio = r10_bio->devs[1].bio;
2292 wbio2 = r10_bio->devs[1].repl_bio;
2293
2294
2295
2296
2297 if (wbio2 && !wbio2->bi_end_io)
2298 wbio2 = NULL;
2299 if (wbio->bi_end_io) {
2300 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2301 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2302 generic_make_request(wbio);
2303 }
2304 if (wbio2) {
2305 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2306 md_sync_acct(conf->mirrors[d].replacement->bdev,
2307 bio_sectors(wbio2));
2308 generic_make_request(wbio2);
2309 }
2310}
2311
2312
2313
2314
2315
2316
2317
2318
2319static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2320{
2321 struct timespec cur_time_mon;
2322 unsigned long hours_since_last;
2323 unsigned int read_errors = atomic_read(&rdev->read_errors);
2324
2325 ktime_get_ts(&cur_time_mon);
2326
2327 if (rdev->last_read_error.tv_sec == 0 &&
2328 rdev->last_read_error.tv_nsec == 0) {
2329
2330 rdev->last_read_error = cur_time_mon;
2331 return;
2332 }
2333
2334 hours_since_last = (cur_time_mon.tv_sec -
2335 rdev->last_read_error.tv_sec) / 3600;
2336
2337 rdev->last_read_error = cur_time_mon;
2338
2339
2340
2341
2342
2343
2344 if (hours_since_last >= 8 * sizeof(read_errors))
2345 atomic_set(&rdev->read_errors, 0);
2346 else
2347 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2348}
2349
2350static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2351 int sectors, struct page *page, int rw)
2352{
2353 sector_t first_bad;
2354 int bad_sectors;
2355
2356 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2357 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2358 return -1;
2359 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2360
2361 return 1;
2362 if (rw == WRITE) {
2363 set_bit(WriteErrorSeen, &rdev->flags);
2364 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2365 set_bit(MD_RECOVERY_NEEDED,
2366 &rdev->mddev->recovery);
2367 }
2368
2369 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2370 md_error(rdev->mddev, rdev);
2371 return 0;
2372}
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2383{
2384 int sect = 0;
2385 int sectors = r10_bio->sectors;
2386 struct md_rdev*rdev;
2387 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2388 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2389
2390
2391
2392
2393 rdev = conf->mirrors[d].rdev;
2394
2395 if (test_bit(Faulty, &rdev->flags))
2396
2397
2398 return;
2399
2400 check_decay_read_errors(mddev, rdev);
2401 atomic_inc(&rdev->read_errors);
2402 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2403 char b[BDEVNAME_SIZE];
2404 bdevname(rdev->bdev, b);
2405
2406 printk(KERN_NOTICE
2407 "md/raid10:%s: %s: Raid device exceeded "
2408 "read_error threshold [cur %d:max %d]\n",
2409 mdname(mddev), b,
2410 atomic_read(&rdev->read_errors), max_read_errors);
2411 printk(KERN_NOTICE
2412 "md/raid10:%s: %s: Failing raid device\n",
2413 mdname(mddev), b);
2414 md_error(mddev, conf->mirrors[d].rdev);
2415 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2416 return;
2417 }
2418
2419 while(sectors) {
2420 int s = sectors;
2421 int sl = r10_bio->read_slot;
2422 int success = 0;
2423 int start;
2424
2425 if (s > (PAGE_SIZE>>9))
2426 s = PAGE_SIZE >> 9;
2427
2428 rcu_read_lock();
2429 do {
2430 sector_t first_bad;
2431 int bad_sectors;
2432
2433 d = r10_bio->devs[sl].devnum;
2434 rdev = rcu_dereference(conf->mirrors[d].rdev);
2435 if (rdev &&
2436 !test_bit(Unmerged, &rdev->flags) &&
2437 test_bit(In_sync, &rdev->flags) &&
2438 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2439 &first_bad, &bad_sectors) == 0) {
2440 atomic_inc(&rdev->nr_pending);
2441 rcu_read_unlock();
2442 success = sync_page_io(rdev,
2443 r10_bio->devs[sl].addr +
2444 sect,
2445 s<<9,
2446 conf->tmppage, READ, false);
2447 rdev_dec_pending(rdev, mddev);
2448 rcu_read_lock();
2449 if (success)
2450 break;
2451 }
2452 sl++;
2453 if (sl == conf->copies)
2454 sl = 0;
2455 } while (!success && sl != r10_bio->read_slot);
2456 rcu_read_unlock();
2457
2458 if (!success) {
2459
2460
2461
2462
2463 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2464 rdev = conf->mirrors[dn].rdev;
2465
2466 if (!rdev_set_badblocks(
2467 rdev,
2468 r10_bio->devs[r10_bio->read_slot].addr
2469 + sect,
2470 s, 0)) {
2471 md_error(mddev, rdev);
2472 r10_bio->devs[r10_bio->read_slot].bio
2473 = IO_BLOCKED;
2474 }
2475 break;
2476 }
2477
2478 start = sl;
2479
2480 rcu_read_lock();
2481 while (sl != r10_bio->read_slot) {
2482 char b[BDEVNAME_SIZE];
2483
2484 if (sl==0)
2485 sl = conf->copies;
2486 sl--;
2487 d = r10_bio->devs[sl].devnum;
2488 rdev = rcu_dereference(conf->mirrors[d].rdev);
2489 if (!rdev ||
2490 test_bit(Unmerged, &rdev->flags) ||
2491 !test_bit(In_sync, &rdev->flags))
2492 continue;
2493
2494 atomic_inc(&rdev->nr_pending);
2495 rcu_read_unlock();
2496 if (r10_sync_page_io(rdev,
2497 r10_bio->devs[sl].addr +
2498 sect,
2499 s, conf->tmppage, WRITE)
2500 == 0) {
2501
2502 printk(KERN_NOTICE
2503 "md/raid10:%s: read correction "
2504 "write failed"
2505 " (%d sectors at %llu on %s)\n",
2506 mdname(mddev), s,
2507 (unsigned long long)(
2508 sect +
2509 choose_data_offset(r10_bio,
2510 rdev)),
2511 bdevname(rdev->bdev, b));
2512 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2513 "drive\n",
2514 mdname(mddev),
2515 bdevname(rdev->bdev, b));
2516 }
2517 rdev_dec_pending(rdev, mddev);
2518 rcu_read_lock();
2519 }
2520 sl = start;
2521 while (sl != r10_bio->read_slot) {
2522 char b[BDEVNAME_SIZE];
2523
2524 if (sl==0)
2525 sl = conf->copies;
2526 sl--;
2527 d = r10_bio->devs[sl].devnum;
2528 rdev = rcu_dereference(conf->mirrors[d].rdev);
2529 if (!rdev ||
2530 !test_bit(In_sync, &rdev->flags))
2531 continue;
2532
2533 atomic_inc(&rdev->nr_pending);
2534 rcu_read_unlock();
2535 switch (r10_sync_page_io(rdev,
2536 r10_bio->devs[sl].addr +
2537 sect,
2538 s, conf->tmppage,
2539 READ)) {
2540 case 0:
2541
2542 printk(KERN_NOTICE
2543 "md/raid10:%s: unable to read back "
2544 "corrected sectors"
2545 " (%d sectors at %llu on %s)\n",
2546 mdname(mddev), s,
2547 (unsigned long long)(
2548 sect +
2549 choose_data_offset(r10_bio, rdev)),
2550 bdevname(rdev->bdev, b));
2551 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2552 "drive\n",
2553 mdname(mddev),
2554 bdevname(rdev->bdev, b));
2555 break;
2556 case 1:
2557 printk(KERN_INFO
2558 "md/raid10:%s: read error corrected"
2559 " (%d sectors at %llu on %s)\n",
2560 mdname(mddev), s,
2561 (unsigned long long)(
2562 sect +
2563 choose_data_offset(r10_bio, rdev)),
2564 bdevname(rdev->bdev, b));
2565 atomic_add(s, &rdev->corrected_errors);
2566 }
2567
2568 rdev_dec_pending(rdev, mddev);
2569 rcu_read_lock();
2570 }
2571 rcu_read_unlock();
2572
2573 sectors -= s;
2574 sect += s;
2575 }
2576}
2577
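/*
 * A write to one device failed.  Retry the write in bad-block-shift
 * sized pieces so that only the blocks which really cannot be written
 * end up recorded as bad.  Returns 1 if every failing piece could at
 * least be marked bad, 0 otherwise.
 */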
2578static int narrow_write_error(struct r10bio *r10_bio, int i)
2579{
2580 struct bio *bio = r10_bio->master_bio;
2581 struct mddev *mddev = r10_bio->mddev;
2582 struct r10conf *conf = mddev->private;
2583 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2584
/*
 * The master bio covers r10_bio->sectors starting at r10_bio->sector.
 * Clone it and trim the clone down to one bad-block-granularity piece
 * at a time, aligned to that granularity, and retry each piece in
 * turn.  Pieces that still fail are entered in the bad-block log
 * rather than failing the whole device.
 */
2595 int block_sectors;
2596 sector_t sector;
2597 int sectors;
2598 int sect_to_write = r10_bio->sectors;
2599 int ok = 1;
2600
2601 if (rdev->badblocks.shift < 0)
2602 return 0;
2603
2604 block_sectors = 1 << rdev->badblocks.shift;
2605 sector = r10_bio->sector;
2606 sectors = ((r10_bio->sector + block_sectors)
2607 & ~(sector_t)(block_sectors - 1))
2608 - sector;
2609
2610 while (sect_to_write) {
2611 struct bio *wbio;
2612 if (sectors > sect_to_write)
2613 sectors = sect_to_write;
2614
2615 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2616 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2617 wbio->bi_sector = (r10_bio->devs[i].addr+
2618 choose_data_offset(r10_bio, rdev) +
2619 (sector - r10_bio->sector));
2620 wbio->bi_bdev = rdev->bdev;
2621 if (submit_bio_wait(WRITE, wbio) == 0)
2622
2623 ok = rdev_set_badblocks(rdev, sector,
2624 sectors, 0)
2625 && ok;
2626
2627 bio_put(wbio);
2628 sect_to_write -= sectors;
2629 sector += sectors;
2630 sectors = block_sectors;
2631 }
2632 return ok;
2633}
2634
2635static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2636{
2637 int slot = r10_bio->read_slot;
2638 struct bio *bio;
2639 struct r10conf *conf = mddev->private;
2640 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2641 char b[BDEVNAME_SIZE];
2642 unsigned long do_sync;
2643 int max_sectors;
2644
/*
 * A read against a working device failed.  We still hold a reference
 * on the rdev via nr_pending, so it cannot disappear under us.  If the
 * array is writable, freeze it and try to repair the bad sectors in
 * place; otherwise just mark this copy as blocked.  Either way the
 * read is then redirected to another mirror below.
 */
2653 bio = r10_bio->devs[slot].bio;
2654 bdevname(bio->bi_bdev, b);
2655 bio_put(bio);
2656 r10_bio->devs[slot].bio = NULL;
2657
2658 if (mddev->ro == 0) {
2659 freeze_array(conf, 1);
2660 fix_read_error(conf, mddev, r10_bio);
2661 unfreeze_array(conf);
2662 } else
2663 r10_bio->devs[slot].bio = IO_BLOCKED;
2664
2665 rdev_dec_pending(rdev, mddev);
2666
2667read_more:
2668 rdev = read_balance(conf, r10_bio, &max_sectors);
2669 if (rdev == NULL) {
2670 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2671 " read error for block %llu\n",
2672 mdname(mddev), b,
2673 (unsigned long long)r10_bio->sector);
2674 raid_end_bio_io(r10_bio);
2675 return;
2676 }
2677
2678 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2679 slot = r10_bio->read_slot;
2680 printk_ratelimited(
2681 KERN_ERR
2682 "md/raid10:%s: %s: redirecting "
2683 "sector %llu to another mirror\n",
2684 mdname(mddev),
2685 bdevname(rdev->bdev, b),
2686 (unsigned long long)r10_bio->sector);
2687 bio = bio_clone_mddev(r10_bio->master_bio,
2688 GFP_NOIO, mddev);
2689 md_trim_bio(bio,
2690 r10_bio->sector - bio->bi_sector,
2691 max_sectors);
2692 r10_bio->devs[slot].bio = bio;
2693 r10_bio->devs[slot].rdev = rdev;
2694 bio->bi_sector = r10_bio->devs[slot].addr
2695 + choose_data_offset(r10_bio, rdev);
2696 bio->bi_bdev = rdev->bdev;
2697 bio->bi_rw = READ | do_sync;
2698 bio->bi_private = r10_bio;
2699 bio->bi_end_io = raid10_end_read_request;
2700 if (max_sectors < r10_bio->sectors) {
2701
2702 struct bio *mbio = r10_bio->master_bio;
2703 int sectors_handled =
2704 r10_bio->sector + max_sectors
2705 - mbio->bi_sector;
2706 r10_bio->sectors = max_sectors;
2707 spin_lock_irq(&conf->device_lock);
2708 if (mbio->bi_phys_segments == 0)
2709 mbio->bi_phys_segments = 2;
2710 else
2711 mbio->bi_phys_segments++;
2712 spin_unlock_irq(&conf->device_lock);
2713 generic_make_request(bio);
2714
2715 r10_bio = mempool_alloc(conf->r10bio_pool,
2716 GFP_NOIO);
2717 r10_bio->master_bio = mbio;
2718 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2719 r10_bio->state = 0;
2720 set_bit(R10BIO_ReadError,
2721 &r10_bio->state);
2722 r10_bio->mddev = mddev;
2723 r10_bio->sector = mbio->bi_sector
2724 + sectors_handled;
2725
2726 goto read_more;
2727 } else
2728 generic_make_request(bio);
2729}
2730
2731static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2732{
/*
 * Some sort of write request has finished and needs to be dealt with:
 * an IO_MADE_GOOD bio means previously-bad blocks can now be cleared,
 * while a failed write must either be narrowed down to individual bad
 * blocks or cause the device to be failed.
 */
2739 int m;
2740 struct md_rdev *rdev;
2741
2742 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2743 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2744 for (m = 0; m < conf->copies; m++) {
2745 int dev = r10_bio->devs[m].devnum;
2746 rdev = conf->mirrors[dev].rdev;
2747 if (r10_bio->devs[m].bio == NULL)
2748 continue;
2749 if (test_bit(BIO_UPTODATE,
2750 &r10_bio->devs[m].bio->bi_flags)) {
2751 rdev_clear_badblocks(
2752 rdev,
2753 r10_bio->devs[m].addr,
2754 r10_bio->sectors, 0);
2755 } else {
2756 if (!rdev_set_badblocks(
2757 rdev,
2758 r10_bio->devs[m].addr,
2759 r10_bio->sectors, 0))
2760 md_error(conf->mddev, rdev);
2761 }
2762 rdev = conf->mirrors[dev].replacement;
2763 if (r10_bio->devs[m].repl_bio == NULL)
2764 continue;
2765 if (test_bit(BIO_UPTODATE,
2766 &r10_bio->devs[m].repl_bio->bi_flags)) {
2767 rdev_clear_badblocks(
2768 rdev,
2769 r10_bio->devs[m].addr,
2770 r10_bio->sectors, 0);
2771 } else {
2772 if (!rdev_set_badblocks(
2773 rdev,
2774 r10_bio->devs[m].addr,
2775 r10_bio->sectors, 0))
2776 md_error(conf->mddev, rdev);
2777 }
2778 }
2779 put_buf(r10_bio);
2780 } else {
2781 for (m = 0; m < conf->copies; m++) {
2782 int dev = r10_bio->devs[m].devnum;
2783 struct bio *bio = r10_bio->devs[m].bio;
2784 rdev = conf->mirrors[dev].rdev;
2785 if (bio == IO_MADE_GOOD) {
2786 rdev_clear_badblocks(
2787 rdev,
2788 r10_bio->devs[m].addr,
2789 r10_bio->sectors, 0);
2790 rdev_dec_pending(rdev, conf->mddev);
2791 } else if (bio != NULL &&
2792 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2793 if (!narrow_write_error(r10_bio, m)) {
2794 md_error(conf->mddev, rdev);
2795 set_bit(R10BIO_Degraded,
2796 &r10_bio->state);
2797 }
2798 rdev_dec_pending(rdev, conf->mddev);
2799 }
2800 bio = r10_bio->devs[m].repl_bio;
2801 rdev = conf->mirrors[dev].replacement;
2802 if (rdev && bio == IO_MADE_GOOD) {
2803 rdev_clear_badblocks(
2804 rdev,
2805 r10_bio->devs[m].addr,
2806 r10_bio->sectors, 0);
2807 rdev_dec_pending(rdev, conf->mddev);
2808 }
2809 }
2810 if (test_bit(R10BIO_WriteError,
2811 &r10_bio->state))
2812 close_write(r10_bio);
2813 raid_end_bio_io(r10_bio);
2814 }
2815}
2816
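/*
 * The RAID10 daemon thread.  Under a block plug it flushes any pending
 * writes, then works through the retry list: finishing writes that saw
 * errors or made bad blocks good, driving reshape/resync/recovery
 * writes, retrying failed reads on another mirror, and resubmitting
 * partial reads that were deferred.
 */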
2817static void raid10d(struct md_thread *thread)
2818{
2819 struct mddev *mddev = thread->mddev;
2820 struct r10bio *r10_bio;
2821 unsigned long flags;
2822 struct r10conf *conf = mddev->private;
2823 struct list_head *head = &conf->retry_list;
2824 struct blk_plug plug;
2825
2826 md_check_recovery(mddev);
2827
2828 blk_start_plug(&plug);
2829 for (;;) {
2830
2831 flush_pending_writes(conf);
2832
2833 spin_lock_irqsave(&conf->device_lock, flags);
2834 if (list_empty(head)) {
2835 spin_unlock_irqrestore(&conf->device_lock, flags);
2836 break;
2837 }
2838 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2839 list_del(head->prev);
2840 conf->nr_queued--;
2841 spin_unlock_irqrestore(&conf->device_lock, flags);
2842
2843 mddev = r10_bio->mddev;
2844 conf = mddev->private;
2845 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2846 test_bit(R10BIO_WriteError, &r10_bio->state))
2847 handle_write_completed(conf, r10_bio);
2848 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2849 reshape_request_write(mddev, r10_bio);
2850 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2851 sync_request_write(mddev, r10_bio);
2852 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2853 recovery_request_write(mddev, r10_bio);
2854 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2855 handle_read_error(mddev, r10_bio);
2856 else {
2857
2858
2859
2860 int slot = r10_bio->read_slot;
2861 generic_make_request(r10_bio->devs[slot].bio);
2862 }
2863
2864 cond_resched();
2865 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2866 md_check_recovery(mddev);
2867 }
2868 blk_finish_plug(&plug);
2869}
2870
2871
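/*
 * Allocate the mempool of resync buffers used by sync_request(), and
 * note whether any replacement devices are present.
 */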
2872static int init_resync(struct r10conf *conf)
2873{
2874 int buffs;
2875 int i;
2876
2877 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2878 BUG_ON(conf->r10buf_pool);
2879 conf->have_replacement = 0;
2880 for (i = 0; i < conf->geo.raid_disks; i++)
2881 if (conf->mirrors[i].replacement)
2882 conf->have_replacement = 1;
2883 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2884 if (!conf->r10buf_pool)
2885 return -ENOMEM;
2886 conf->next_resync = 0;
2887 return 0;
2888}
2889
/*
 * Perform a "sync" on one "block".
 *
 * Normal I/O - particularly writes - must not run concurrently with
 * sync I/O on the same region, so raise_barrier() is taken before any
 * sync requests are issued and normal writes wait while it is held.
 *
 * Two distinct operations are driven from here:
 *
 *  RESYNC: every copy of a block should hold the same data, so all
 *    in-sync copies of a region are read, compared, and any copy that
 *    differs (plus any replacement device) is rewritten; one r10_bio is
 *    used per region with one bio per device (see sync_request_write()).
 *
 *  RECOVERY: one or more devices are missing or being rebuilt, so a
 *    good copy of each block the target should hold is read and written
 *    to the device being recovered and to its replacement, if any; one
 *    r10_bio is used per target device per region.
 *
 * Progress for either pass is reported through md_done_sync().
 */
2922static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2923 int *skipped, int go_faster)
2924{
2925 struct r10conf *conf = mddev->private;
2926 struct r10bio *r10_bio;
2927 struct bio *biolist = NULL, *bio;
2928 sector_t max_sector, nr_sectors;
2929 int i;
2930 int max_sync;
2931 sector_t sync_blocks;
2932 sector_t sectors_skipped = 0;
2933 int chunks_skipped = 0;
2934 sector_t chunk_mask = conf->geo.chunk_mask;
2935
2936 if (!conf->r10buf_pool)
2937 if (init_resync(conf))
2938 return 0;
2939
2940
2941
2942
2943
2944 if (mddev->bitmap == NULL &&
2945 mddev->recovery_cp == MaxSector &&
2946 mddev->reshape_position == MaxSector &&
2947 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2948 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2949 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2950 conf->fullsync == 0) {
2951 *skipped = 1;
2952 return mddev->dev_sectors - sector_nr;
2953 }
2954
2955 skipped:
2956 max_sector = mddev->dev_sectors;
2957 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2958 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2959 max_sector = mddev->resync_max_sectors;
2960 if (sector_nr >= max_sector) {
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2971 end_reshape(conf);
2972 return 0;
2973 }
2974
2975 if (mddev->curr_resync < max_sector) {
2976 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2977 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2978 &sync_blocks, 1);
2979 else for (i = 0; i < conf->geo.raid_disks; i++) {
2980 sector_t sect =
2981 raid10_find_virt(conf, mddev->curr_resync, i);
2982 bitmap_end_sync(mddev->bitmap, sect,
2983 &sync_blocks, 1);
2984 }
2985 } else {
2986
2987 if ((!mddev->bitmap || conf->fullsync)
2988 && conf->have_replacement
2989 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2990
2991
2992
2993 for (i = 0; i < conf->geo.raid_disks; i++)
2994 if (conf->mirrors[i].replacement)
2995 conf->mirrors[i].replacement
2996 ->recovery_offset
2997 = MaxSector;
2998 }
2999 conf->fullsync = 0;
3000 }
3001 bitmap_close_sync(mddev->bitmap);
3002 close_sync(conf);
3003 *skipped = 1;
3004 return sectors_skipped;
3005 }
3006
3007 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3008 return reshape_request(mddev, sector_nr, skipped);
3009
3010 if (chunks_skipped >= conf->geo.raid_disks) {
3011
3012
3013
3014 *skipped = 1;
3015 return (max_sector - sector_nr) + sectors_skipped;
3016 }
3017
3018 if (max_sector > mddev->resync_max)
3019 max_sector = mddev->resync_max;
3020
3021
3022
3023
3024 if (conf->geo.near_copies < conf->geo.raid_disks &&
3025 max_sector > (sector_nr | chunk_mask))
3026 max_sector = (sector_nr | chunk_mask) + 1;
3027
3028
3029
3030
3031 if (!go_faster && conf->nr_waiting)
3032 msleep_interruptible(1000);
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3050 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3051
3052 int j;
3053 r10_bio = NULL;
3054
3055 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3056 int still_degraded;
3057 struct r10bio *rb2;
3058 sector_t sect;
3059 int must_sync;
3060 int any_working;
3061 struct raid10_info *mirror = &conf->mirrors[i];
3062
3063 if ((mirror->rdev == NULL ||
3064 test_bit(In_sync, &mirror->rdev->flags))
3065 &&
3066 (mirror->replacement == NULL ||
3067 test_bit(Faulty,
3068 &mirror->replacement->flags)))
3069 continue;
3070
3071 still_degraded = 0;
3072
3073 rb2 = r10_bio;
3074 sect = raid10_find_virt(conf, sector_nr, i);
3075 if (sect >= mddev->resync_max_sectors) {
3076
3077
3078
3079 continue;
3080 }
3081
3082
3083
3084
3085 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3086 &sync_blocks, 1);
3087 if (sync_blocks < max_sync)
3088 max_sync = sync_blocks;
3089 if (!must_sync &&
3090 mirror->replacement == NULL &&
3091 !conf->fullsync) {
3092
3093
3094
3095 chunks_skipped = -1;
3096 continue;
3097 }
3098
3099 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3100 raise_barrier(conf, rb2 != NULL);
3101 atomic_set(&r10_bio->remaining, 0);
3102
3103 r10_bio->master_bio = (struct bio*)rb2;
3104 if (rb2)
3105 atomic_inc(&rb2->remaining);
3106 r10_bio->mddev = mddev;
3107 set_bit(R10BIO_IsRecover, &r10_bio->state);
3108 r10_bio->sector = sect;
3109
3110 raid10_find_phys(conf, r10_bio);
3111
3112
3113
3114
3115 for (j = 0; j < conf->geo.raid_disks; j++)
3116 if (conf->mirrors[j].rdev == NULL ||
3117 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3118 still_degraded = 1;
3119 break;
3120 }
3121
3122 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3123 &sync_blocks, still_degraded);
3124
3125 any_working = 0;
3126 for (j=0; j<conf->copies;j++) {
3127 int k;
3128 int d = r10_bio->devs[j].devnum;
3129 sector_t from_addr, to_addr;
3130 struct md_rdev *rdev;
3131 sector_t sector, first_bad;
3132 int bad_sectors;
3133 if (!conf->mirrors[d].rdev ||
3134 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3135 continue;
3136
3137 any_working = 1;
3138 rdev = conf->mirrors[d].rdev;
3139 sector = r10_bio->devs[j].addr;
3140
3141 if (is_badblock(rdev, sector, max_sync,
3142 &first_bad, &bad_sectors)) {
3143 if (first_bad > sector)
3144 max_sync = first_bad - sector;
3145 else {
3146 bad_sectors -= (sector
3147 - first_bad);
3148 if (max_sync > bad_sectors)
3149 max_sync = bad_sectors;
3150 continue;
3151 }
3152 }
3153 bio = r10_bio->devs[0].bio;
3154 bio_reset(bio);
3155 bio->bi_next = biolist;
3156 biolist = bio;
3157 bio->bi_private = r10_bio;
3158 bio->bi_end_io = end_sync_read;
3159 bio->bi_rw = READ;
3160 from_addr = r10_bio->devs[j].addr;
3161 bio->bi_sector = from_addr + rdev->data_offset;
3162 bio->bi_bdev = rdev->bdev;
3163 atomic_inc(&rdev->nr_pending);
3164
3165
3166 for (k=0; k<conf->copies; k++)
3167 if (r10_bio->devs[k].devnum == i)
3168 break;
3169 BUG_ON(k == conf->copies);
3170 to_addr = r10_bio->devs[k].addr;
3171 r10_bio->devs[0].devnum = d;
3172 r10_bio->devs[0].addr = from_addr;
3173 r10_bio->devs[1].devnum = i;
3174 r10_bio->devs[1].addr = to_addr;
3175
3176 rdev = mirror->rdev;
3177 if (!test_bit(In_sync, &rdev->flags)) {
3178 bio = r10_bio->devs[1].bio;
3179 bio_reset(bio);
3180 bio->bi_next = biolist;
3181 biolist = bio;
3182 bio->bi_private = r10_bio;
3183 bio->bi_end_io = end_sync_write;
3184 bio->bi_rw = WRITE;
3185 bio->bi_sector = to_addr
3186 + rdev->data_offset;
3187 bio->bi_bdev = rdev->bdev;
3188 atomic_inc(&r10_bio->remaining);
3189 } else
3190 r10_bio->devs[1].bio->bi_end_io = NULL;
3191
3192
3193 bio = r10_bio->devs[1].repl_bio;
3194 if (bio)
3195 bio->bi_end_io = NULL;
3196 rdev = mirror->replacement;
3197
3198
3199
3200
3201
3202
3203
3204
3205 if (rdev == NULL || bio == NULL ||
3206 test_bit(Faulty, &rdev->flags))
3207 break;
3208 bio_reset(bio);
3209 bio->bi_next = biolist;
3210 biolist = bio;
3211 bio->bi_private = r10_bio;
3212 bio->bi_end_io = end_sync_write;
3213 bio->bi_rw = WRITE;
3214 bio->bi_sector = to_addr + rdev->data_offset;
3215 bio->bi_bdev = rdev->bdev;
3216 atomic_inc(&r10_bio->remaining);
3217 break;
3218 }
3219 if (j == conf->copies) {
3220
3221
3222 put_buf(r10_bio);
3223 if (rb2)
3224 atomic_dec(&rb2->remaining);
3225 r10_bio = rb2;
3226 if (any_working) {
3227
3228
3229
3230 int k;
3231 for (k = 0; k < conf->copies; k++)
3232 if (r10_bio->devs[k].devnum == i)
3233 break;
3234 if (!test_bit(In_sync,
3235 &mirror->rdev->flags)
3236 && !rdev_set_badblocks(
3237 mirror->rdev,
3238 r10_bio->devs[k].addr,
3239 max_sync, 0))
3240 any_working = 0;
3241 if (mirror->replacement &&
3242 !rdev_set_badblocks(
3243 mirror->replacement,
3244 r10_bio->devs[k].addr,
3245 max_sync, 0))
3246 any_working = 0;
3247 }
3248 if (!any_working) {
3249 if (!test_and_set_bit(MD_RECOVERY_INTR,
3250 &mddev->recovery))
3251 printk(KERN_INFO "md/raid10:%s: insufficient "
3252 "working devices for recovery.\n",
3253 mdname(mddev));
3254 mirror->recovery_disabled
3255 = mddev->recovery_disabled;
3256 }
3257 break;
3258 }
3259 }
3260 if (biolist == NULL) {
3261 while (r10_bio) {
3262 struct r10bio *rb2 = r10_bio;
3263 r10_bio = (struct r10bio*) rb2->master_bio;
3264 rb2->master_bio = NULL;
3265 put_buf(rb2);
3266 }
3267 goto giveup;
3268 }
3269 } else {
3270
3271 int count = 0;
3272
3273 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3274
3275 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3276 &sync_blocks, mddev->degraded) &&
3277 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3278 &mddev->recovery)) {
3279
3280 *skipped = 1;
3281 return sync_blocks + sectors_skipped;
3282 }
3283 if (sync_blocks < max_sync)
3284 max_sync = sync_blocks;
3285 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3286
3287 r10_bio->mddev = mddev;
3288 atomic_set(&r10_bio->remaining, 0);
3289 raise_barrier(conf, 0);
3290 conf->next_resync = sector_nr;
3291
3292 r10_bio->master_bio = NULL;
3293 r10_bio->sector = sector_nr;
3294 set_bit(R10BIO_IsSync, &r10_bio->state);
3295 raid10_find_phys(conf, r10_bio);
3296 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3297
3298 for (i = 0; i < conf->copies; i++) {
3299 int d = r10_bio->devs[i].devnum;
3300 sector_t first_bad, sector;
3301 int bad_sectors;
3302
3303 if (r10_bio->devs[i].repl_bio)
3304 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3305
3306 bio = r10_bio->devs[i].bio;
3307 bio_reset(bio);
3308 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3309 if (conf->mirrors[d].rdev == NULL ||
3310 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3311 continue;
3312 sector = r10_bio->devs[i].addr;
3313 if (is_badblock(conf->mirrors[d].rdev,
3314 sector, max_sync,
3315 &first_bad, &bad_sectors)) {
3316 if (first_bad > sector)
3317 max_sync = first_bad - sector;
3318 else {
3319 bad_sectors -= (sector - first_bad);
3320 if (max_sync > bad_sectors)
3321 max_sync = bad_sectors;
3322 continue;
3323 }
3324 }
3325 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3326 atomic_inc(&r10_bio->remaining);
3327 bio->bi_next = biolist;
3328 biolist = bio;
3329 bio->bi_private = r10_bio;
3330 bio->bi_end_io = end_sync_read;
3331 bio->bi_rw = READ;
3332 bio->bi_sector = sector +
3333 conf->mirrors[d].rdev->data_offset;
3334 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3335 count++;
3336
3337 if (conf->mirrors[d].replacement == NULL ||
3338 test_bit(Faulty,
3339 &conf->mirrors[d].replacement->flags))
3340 continue;
3341
3342
3343 bio = r10_bio->devs[i].repl_bio;
3344 bio_reset(bio);
3345 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3346
3347 sector = r10_bio->devs[i].addr;
3348 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3349 bio->bi_next = biolist;
3350 biolist = bio;
3351 bio->bi_private = r10_bio;
3352 bio->bi_end_io = end_sync_write;
3353 bio->bi_rw = WRITE;
3354 bio->bi_sector = sector +
3355 conf->mirrors[d].replacement->data_offset;
3356 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3357 count++;
3358 }
3359
3360 if (count < 2) {
3361 for (i=0; i<conf->copies; i++) {
3362 int d = r10_bio->devs[i].devnum;
3363 if (r10_bio->devs[i].bio->bi_end_io)
3364 rdev_dec_pending(conf->mirrors[d].rdev,
3365 mddev);
3366 if (r10_bio->devs[i].repl_bio &&
3367 r10_bio->devs[i].repl_bio->bi_end_io)
3368 rdev_dec_pending(
3369 conf->mirrors[d].replacement,
3370 mddev);
3371 }
3372 put_buf(r10_bio);
3373 biolist = NULL;
3374 goto giveup;
3375 }
3376 }
3377
3378 nr_sectors = 0;
3379 if (sector_nr + max_sync < max_sector)
3380 max_sector = sector_nr + max_sync;
3381 do {
3382 struct page *page;
3383 int len = PAGE_SIZE;
3384 if (sector_nr + (len>>9) > max_sector)
3385 len = (max_sector - sector_nr) << 9;
3386 if (len == 0)
3387 break;
3388 for (bio= biolist ; bio ; bio=bio->bi_next) {
3389 struct bio *bio2;
3390 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3391 if (bio_add_page(bio, page, len, 0))
3392 continue;
3393
3394
3395 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3396 for (bio2 = biolist;
3397 bio2 && bio2 != bio;
3398 bio2 = bio2->bi_next) {
3399
3400 bio2->bi_vcnt--;
3401 bio2->bi_size -= len;
3402 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3403 }
3404 goto bio_full;
3405 }
3406 nr_sectors += len>>9;
3407 sector_nr += len>>9;
3408 } while (biolist->bi_vcnt < RESYNC_PAGES);
3409 bio_full:
3410 r10_bio->sectors = nr_sectors;
3411
3412 while (biolist) {
3413 bio = biolist;
3414 biolist = biolist->bi_next;
3415
3416 bio->bi_next = NULL;
3417 r10_bio = bio->bi_private;
3418 r10_bio->sectors = nr_sectors;
3419
3420 if (bio->bi_end_io == end_sync_read) {
3421 md_sync_acct(bio->bi_bdev, nr_sectors);
3422 set_bit(BIO_UPTODATE, &bio->bi_flags);
3423 generic_make_request(bio);
3424 }
3425 }
3426
3427 if (sectors_skipped)
3428
3429
3430
3431 md_done_sync(mddev, sectors_skipped, 1);
3432
3433 return sectors_skipped + nr_sectors;
3434 giveup:
3435
3436
3437
3438
3439 if (sector_nr + max_sync < max_sector)
3440 max_sector = sector_nr + max_sync;
3441
3442 sectors_skipped += (max_sector - sector_nr);
3443 chunks_skipped ++;
3444 sector_nr = max_sector;
3445 goto skipped;
3446}
3447
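/*
 * Return the number of array sectors that @raid_disks devices of
 * @sectors each can hold with the current geometry.  A zero argument
 * means "use the current disk count" or "use the current device size".
 */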
3448static sector_t
3449raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3450{
3451 sector_t size;
3452 struct r10conf *conf = mddev->private;
3453
3454 if (!raid_disks)
3455 raid_disks = min(conf->geo.raid_disks,
3456 conf->prev.raid_disks);
3457 if (!sectors)
3458 sectors = conf->dev_sectors;
3459
3460 size = sectors >> conf->geo.chunk_shift;
3461 sector_div(size, conf->geo.far_copies);
3462 size = size * raid_disks;
3463 sector_div(size, conf->geo.near_copies);
3464
3465 return size << conf->geo.chunk_shift;
3466}
3467
3468static void calc_sectors(struct r10conf *conf, sector_t size)
3469{
/*
 * Calculate the number of sectors-per-device that will actually be
 * used from the device size given, and set conf->dev_sectors and
 * conf->geo.stride (the distance between the starts of successive
 * 'far' copies) accordingly.
 */
3475 size = size >> conf->geo.chunk_shift;
3476 sector_div(size, conf->geo.far_copies);
3477 size = size * conf->geo.raid_disks;
3478 sector_div(size, conf->geo.near_copies);
3479
3480
3481 size = size * conf->copies;
3482
3483
3484
3485
3486 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3487
3488 conf->dev_sectors = size << conf->geo.chunk_shift;
3489
3490 if (conf->geo.far_offset)
3491 conf->geo.stride = 1 << conf->geo.chunk_shift;
3492 else {
3493 sector_div(size, conf->geo.far_copies);
3494 conf->geo.stride = size << conf->geo.chunk_shift;
3495 }
3496}
3497
3498enum geo_type {geo_new, geo_old, geo_start};
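/*
 * Decode the requested raid10 layout (near/far copies and the
 * far-offset flag) and chunk size into @geo for the old, new or
 * about-to-start geometry.  Returns the total number of copies, or a
 * negative value if the layout or chunk size is invalid.
 */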
3499static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3500{
3501 int nc, fc, fo;
3502 int layout, chunk, disks;
3503 switch (new) {
3504 case geo_old:
3505 layout = mddev->layout;
3506 chunk = mddev->chunk_sectors;
3507 disks = mddev->raid_disks - mddev->delta_disks;
3508 break;
3509 case geo_new:
3510 layout = mddev->new_layout;
3511 chunk = mddev->new_chunk_sectors;
3512 disks = mddev->raid_disks;
3513 break;
3514 default:
3515 case geo_start:
3516
3517 layout = mddev->new_layout;
3518 chunk = mddev->new_chunk_sectors;
3519 disks = mddev->raid_disks + mddev->delta_disks;
3520 break;
3521 }
3522 if (layout >> 18)
3523 return -1;
3524 if (chunk < (PAGE_SIZE >> 9) ||
3525 !is_power_of_2(chunk))
3526 return -2;
3527 nc = layout & 255;
3528 fc = (layout >> 8) & 255;
3529 fo = layout & (1<<16);
3530 geo->raid_disks = disks;
3531 geo->near_copies = nc;
3532 geo->far_copies = fc;
3533 geo->far_offset = fo;
3534 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3535 geo->chunk_mask = chunk - 1;
3536 geo->chunk_shift = ffz(~chunk);
3537 return nc*fc;
3538}
3539
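/*
 * Allocate and initialise an r10conf for @mddev: the mirrors array,
 * the r10bio mempool, the geometry (current and, if a reshape is in
 * progress, previous) and the raid10d thread.  Returns an ERR_PTR on
 * failure.
 */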
3540static struct r10conf *setup_conf(struct mddev *mddev)
3541{
3542 struct r10conf *conf = NULL;
3543 int err = -EINVAL;
3544 struct geom geo;
3545 int copies;
3546
3547 copies = setup_geo(&geo, mddev, geo_new);
3548
3549 if (copies == -2) {
3550 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3551 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3552 mdname(mddev), PAGE_SIZE);
3553 goto out;
3554 }
3555
3556 if (copies < 2 || copies > mddev->raid_disks) {
3557 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3558 mdname(mddev), mddev->new_layout);
3559 goto out;
3560 }
3561
3562 err = -ENOMEM;
3563 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3564 if (!conf)
3565 goto out;
3566
3567
3568 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3569 max(0,-mddev->delta_disks)),
3570 GFP_KERNEL);
3571 if (!conf->mirrors)
3572 goto out;
3573
3574 conf->tmppage = alloc_page(GFP_KERNEL);
3575 if (!conf->tmppage)
3576 goto out;
3577
3578 conf->geo = geo;
3579 conf->copies = copies;
3580 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3581 r10bio_pool_free, conf);
3582 if (!conf->r10bio_pool)
3583 goto out;
3584
3585 calc_sectors(conf, mddev->dev_sectors);
3586 if (mddev->reshape_position == MaxSector) {
3587 conf->prev = conf->geo;
3588 conf->reshape_progress = MaxSector;
3589 } else {
3590 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3591 err = -EINVAL;
3592 goto out;
3593 }
3594 conf->reshape_progress = mddev->reshape_position;
3595 if (conf->prev.far_offset)
3596 conf->prev.stride = 1 << conf->prev.chunk_shift;
3597 else
3598
3599 conf->prev.stride = conf->dev_sectors;
3600 }
3601 spin_lock_init(&conf->device_lock);
3602 INIT_LIST_HEAD(&conf->retry_list);
3603
3604 spin_lock_init(&conf->resync_lock);
3605 init_waitqueue_head(&conf->wait_barrier);
3606
3607 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3608 if (!conf->thread)
3609 goto out;
3610
3611 conf->mddev = mddev;
3612 return conf;
3613
3614 out:
3615 if (err == -ENOMEM)
3616 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3617 mdname(mddev));
3618 if (conf) {
3619 if (conf->r10bio_pool)
3620 mempool_destroy(conf->r10bio_pool);
3621 kfree(conf->mirrors);
3622 safe_put_page(conf->tmppage);
3623 kfree(conf);
3624 }
3625 return ERR_PTR(err);
3626}
3627
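/*
 * Start the array: bind each member device into its mirror slot, set
 * up the request-queue limits, compute the usable size and, if a
 * reshape was interrupted, arrange for it to be restarted.
 */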
3628static int run(struct mddev *mddev)
3629{
3630 struct r10conf *conf;
3631 int i, disk_idx, chunk_size;
3632 struct raid10_info *disk;
3633 struct md_rdev *rdev;
3634 sector_t size;
3635 sector_t min_offset_diff = 0;
3636 int first = 1;
3637 bool discard_supported = false;
3638
3639 if (mddev->private == NULL) {
3640 conf = setup_conf(mddev);
3641 if (IS_ERR(conf))
3642 return PTR_ERR(conf);
3643 mddev->private = conf;
3644 }
3645 conf = mddev->private;
3646 if (!conf)
3647 goto out;
3648
3649 mddev->thread = conf->thread;
3650 conf->thread = NULL;
3651
3652 chunk_size = mddev->chunk_sectors << 9;
3653 if (mddev->queue) {
3654 blk_queue_max_discard_sectors(mddev->queue,
3655 mddev->chunk_sectors);
3656 blk_queue_max_write_same_sectors(mddev->queue, 0);
3657 blk_queue_io_min(mddev->queue, chunk_size);
3658 if (conf->geo.raid_disks % conf->geo.near_copies)
3659 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3660 else
3661 blk_queue_io_opt(mddev->queue, chunk_size *
3662 (conf->geo.raid_disks / conf->geo.near_copies));
3663 }
3664
3665 rdev_for_each(rdev, mddev) {
3666 long long diff;
3667 struct request_queue *q;
3668
3669 disk_idx = rdev->raid_disk;
3670 if (disk_idx < 0)
3671 continue;
3672 if (disk_idx >= conf->geo.raid_disks &&
3673 disk_idx >= conf->prev.raid_disks)
3674 continue;
3675 disk = conf->mirrors + disk_idx;
3676
3677 if (test_bit(Replacement, &rdev->flags)) {
3678 if (disk->replacement)
3679 goto out_free_conf;
3680 disk->replacement = rdev;
3681 } else {
3682 if (disk->rdev)
3683 goto out_free_conf;
3684 disk->rdev = rdev;
3685 }
3686 q = bdev_get_queue(rdev->bdev);
3687 if (q->merge_bvec_fn)
3688 mddev->merge_check_needed = 1;
3689 diff = (rdev->new_data_offset - rdev->data_offset);
3690 if (!mddev->reshape_backwards)
3691 diff = -diff;
3692 if (diff < 0)
3693 diff = 0;
3694 if (first || diff < min_offset_diff)
3695 min_offset_diff = diff;
3696
3697 if (mddev->gendisk)
3698 disk_stack_limits(mddev->gendisk, rdev->bdev,
3699 rdev->data_offset << 9);
3700
3701 disk->head_position = 0;
3702
3703 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3704 discard_supported = true;
3705 }
3706
3707 if (mddev->queue) {
3708 if (discard_supported)
3709 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3710 mddev->queue);
3711 else
3712 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3713 mddev->queue);
3714 }
3715
3716 if (!enough(conf, -1)) {
3717 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3718 mdname(mddev));
3719 goto out_free_conf;
3720 }
3721
3722 if (conf->reshape_progress != MaxSector) {
3723
3724 if (conf->geo.far_copies != 1 &&
3725 conf->geo.far_offset == 0)
3726 goto out_free_conf;
3727 if (conf->prev.far_copies != 1 &&
3728 conf->prev.far_offset == 0)
3729 goto out_free_conf;
3730 }
3731
3732 mddev->degraded = 0;
3733 for (i = 0;
3734 i < conf->geo.raid_disks
3735 || i < conf->prev.raid_disks;
3736 i++) {
3737
3738 disk = conf->mirrors + i;
3739
3740 if (!disk->rdev && disk->replacement) {
3741
3742 disk->rdev = disk->replacement;
3743 disk->replacement = NULL;
3744 clear_bit(Replacement, &disk->rdev->flags);
3745 }
3746
3747 if (!disk->rdev ||
3748 !test_bit(In_sync, &disk->rdev->flags)) {
3749 disk->head_position = 0;
3750 mddev->degraded++;
3751 if (disk->rdev)
3752 conf->fullsync = 1;
3753 }
3754 disk->recovery_disabled = mddev->recovery_disabled - 1;
3755 }
3756
3757 if (mddev->recovery_cp != MaxSector)
3758 printk(KERN_NOTICE "md/raid10:%s: not clean"
3759 " -- starting background reconstruction\n",
3760 mdname(mddev));
3761 printk(KERN_INFO
3762 "md/raid10:%s: active with %d out of %d devices\n",
3763 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3764 conf->geo.raid_disks);
3765
3766
3767
3768 mddev->dev_sectors = conf->dev_sectors;
3769 size = raid10_size(mddev, 0, 0);
3770 md_set_array_sectors(mddev, size);
3771 mddev->resync_max_sectors = size;
3772
3773 if (mddev->queue) {
3774 int stripe = conf->geo.raid_disks *
3775 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3776 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3777 mddev->queue->backing_dev_info.congested_data = mddev;
3778
3779
3780
3781
3782
3783 stripe /= conf->geo.near_copies;
3784 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3785 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3786 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3787 }
3788
3789
3790 if (md_integrity_register(mddev))
3791 goto out_free_conf;
3792
3793 if (conf->reshape_progress != MaxSector) {
3794 unsigned long before_length, after_length;
3795
3796 before_length = ((1 << conf->prev.chunk_shift) *
3797 conf->prev.far_copies);
3798 after_length = ((1 << conf->geo.chunk_shift) *
3799 conf->geo.far_copies);
3800
3801 if (max(before_length, after_length) > min_offset_diff) {
3802
3803 printk("md/raid10: offset difference not enough to continue reshape\n");
3804 goto out_free_conf;
3805 }
3806 conf->offset_diff = min_offset_diff;
3807
3808 conf->reshape_safe = conf->reshape_progress;
3809 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3810 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3811 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3812 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3813 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3814 "reshape");
3815 }
3816
3817 return 0;
3818
3819out_free_conf:
3820 md_unregister_thread(&mddev->thread);
3821 if (conf->r10bio_pool)
3822 mempool_destroy(conf->r10bio_pool);
3823 safe_put_page(conf->tmppage);
3824 kfree(conf->mirrors);
3825 kfree(conf);
3826 mddev->private = NULL;
3827out:
3828 return -EIO;
3829}
3830
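/*
 * Shut the array down: quiesce outstanding I/O, stop the raid10d
 * thread and free the private configuration.
 */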
3831static int stop(struct mddev *mddev)
3832{
3833 struct r10conf *conf = mddev->private;
3834
3835 raise_barrier(conf, 0);
3836 lower_barrier(conf);
3837
3838 md_unregister_thread(&mddev->thread);
3839 if (mddev->queue)
3840
3841 blk_sync_queue(mddev->queue);
3842
3843 if (conf->r10bio_pool)
3844 mempool_destroy(conf->r10bio_pool);
3845 safe_put_page(conf->tmppage);
3846 kfree(conf->mirrors);
3847 kfree(conf);
3848 mddev->private = NULL;
3849 return 0;
3850}
3851
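/*
 * Quiesce (state == 1) or resume (state == 0) normal I/O by raising or
 * lowering the resync barrier.
 */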
3852static void raid10_quiesce(struct mddev *mddev, int state)
3853{
3854 struct r10conf *conf = mddev->private;
3855
3856 switch(state) {
3857 case 1:
3858 raise_barrier(conf, 0);
3859 break;
3860 case 0:
3861 lower_barrier(conf);
3862 break;
3863 }
3864}
3865
3866static int raid10_resize(struct mddev *mddev, sector_t sectors)
3867{
/*
 * Only the size of each component device can change here; the number
 * of devices and the layout stay fixed (that is what reshape is for),
 * and 'far' arrays without far_offset cannot be resized at all.  The
 * usable per-device size is recalculated from the new size and the
 * bitmap, if any, is resized to match.
 */
3880 struct r10conf *conf = mddev->private;
3881 sector_t oldsize, size;
3882
3883 if (mddev->reshape_position != MaxSector)
3884 return -EBUSY;
3885
3886 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3887 return -EINVAL;
3888
3889 oldsize = raid10_size(mddev, 0, 0);
3890 size = raid10_size(mddev, sectors, 0);
3891 if (mddev->external_size &&
3892 mddev->array_sectors > size)
3893 return -EINVAL;
3894 if (mddev->bitmap) {
3895 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3896 if (ret)
3897 return ret;
3898 }
3899 md_set_array_sectors(mddev, size);
3900 set_capacity(mddev->gendisk, mddev->array_sectors);
3901 revalidate_disk(mddev->gendisk);
3902 if (sectors > mddev->dev_sectors &&
3903 mddev->recovery_cp > oldsize) {
3904 mddev->recovery_cp = oldsize;
3905 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3906 }
3907 calc_sectors(conf, sectors);
3908 mddev->dev_sectors = conf->dev_sectors;
3909 mddev->resync_max_sectors = size;
3910 return 0;
3911}
3912
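/*
 * Convert a raid0 array into raid10 by doubling the device count and
 * using a near=2 layout.  Existing members take the even slots; the
 * odd slots stay empty until spares are added and rebuilt.
 */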
3913static void *raid10_takeover_raid0(struct mddev *mddev)
3914{
3915 struct md_rdev *rdev;
3916 struct r10conf *conf;
3917
3918 if (mddev->degraded > 0) {
3919 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3920 mdname(mddev));
3921 return ERR_PTR(-EINVAL);
3922 }
3923
3924
3925 mddev->new_level = 10;
3926
3927 mddev->new_layout = (1<<8) + 2;
3928 mddev->new_chunk_sectors = mddev->chunk_sectors;
3929 mddev->delta_disks = mddev->raid_disks;
3930 mddev->raid_disks *= 2;
3931
3932 mddev->recovery_cp = MaxSector;
3933
3934 conf = setup_conf(mddev);
3935 if (!IS_ERR(conf)) {
3936 rdev_for_each(rdev, mddev)
3937 if (rdev->raid_disk >= 0)
3938 rdev->new_raid_disk = rdev->raid_disk * 2;
3939 conf->barrier = 1;
3940 }
3941
3942 return conf;
3943}
3944
3945static void *raid10_takeover(struct mddev *mddev)
3946{
3947 struct r0conf *raid0_conf;
3948
/*
 * raid10 can currently only take over a raid0 array, and only when
 * that array consists of a single zone (i.e. all members are the
 * same size).
 */
3952 if (mddev->level == 0) {
3953
3954 raid0_conf = mddev->private;
3955 if (raid0_conf->nr_strip_zones > 1) {
3956 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3957 " with more than one zone.\n",
3958 mdname(mddev));
3959 return ERR_PTR(-EINVAL);
3960 }
3961 return raid10_takeover_raid0(mddev);
3962 }
3963 return ERR_PTR(-EINVAL);
3964}
3965
3966static int raid10_check_reshape(struct mddev *mddev)
3967{
/*
 * Called when the md core wants to validate a reshape request.
 * raid10 can change the number of devices, but the new layout must
 * keep the same number of copies, 'far' layouts without far_offset
 * cannot be reshaped, and the array size must be a whole number of
 * new chunks.  Any memory needed for a larger mirrors array is
 * allocated here so that the reshape itself cannot fail on
 * allocation.
 */
3982 struct r10conf *conf = mddev->private;
3983 struct geom geo;
3984
3985 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3986 return -EINVAL;
3987
3988 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3989
3990 return -EINVAL;
3991 if (geo.far_copies > 1 && !geo.far_offset)
3992
3993 return -EINVAL;
3994
3995 if (mddev->array_sectors & geo.chunk_mask)
3996
3997 return -EINVAL;
3998
3999 if (!enough(conf, -1))
4000 return -EINVAL;
4001
4002 kfree(conf->mirrors_new);
4003 conf->mirrors_new = NULL;
4004 if (mddev->delta_disks > 0) {
4005
4006 conf->mirrors_new = kzalloc(
4007 sizeof(struct raid10_info)
4008 *(mddev->raid_disks +
4009 mddev->delta_disks),
4010 GFP_KERNEL);
4011 if (!conf->mirrors_new)
4012 return -ENOMEM;
4013 }
4014 return 0;
4015}
4016
/*
 * During a reshape the array may be described by two geometries at
 * once.  Count the missing or faulty devices under both the old and
 * the new geometry and report whichever count is worse.  While the
 * array is being grown, devices that are still being rebuilt into the
 * new slots are not counted against the new geometry.
 */
4030static int calc_degraded(struct r10conf *conf)
4031{
4032 int degraded, degraded2;
4033 int i;
4034
4035 rcu_read_lock();
4036 degraded = 0;
4037
4038 for (i = 0; i < conf->prev.raid_disks; i++) {
4039 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4040 if (!rdev || test_bit(Faulty, &rdev->flags))
4041 degraded++;
4042 else if (!test_bit(In_sync, &rdev->flags))
4043
4044
4045
4046
4047 degraded++;
4048 }
4049 rcu_read_unlock();
4050 if (conf->geo.raid_disks == conf->prev.raid_disks)
4051 return degraded;
4052 rcu_read_lock();
4053 degraded2 = 0;
4054 for (i = 0; i < conf->geo.raid_disks; i++) {
4055 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4056 if (!rdev || test_bit(Faulty, &rdev->flags))
4057 degraded2++;
4058 else if (!test_bit(In_sync, &rdev->flags)) {
4059
4060
4061
4062
4063
4064 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4065 degraded2++;
4066 }
4067 }
4068 rcu_read_unlock();
4069 if (degraded2 > degraded)
4070 return degraded2;
4071 return degraded;
4072}
4073
4074static int raid10_start_reshape(struct mddev *mddev)
4075{
/*
 * A reshape has been requested; it may change the layout, the chunk
 * size and/or the number of devices.  Check that the new geometry is
 * acceptable, switch to a mirrors array large enough for the new disk
 * count, set reshape_progress to the starting point (the end of the
 * array when reshaping backwards) and start the reshape thread.
 */
4086 unsigned long before_length, after_length;
4087 sector_t min_offset_diff = 0;
4088 int first = 1;
4089 struct geom new;
4090 struct r10conf *conf = mddev->private;
4091 struct md_rdev *rdev;
4092 int spares = 0;
4093 int ret;
4094
4095 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4096 return -EBUSY;
4097
4098 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4099 return -EINVAL;
4100
4101 before_length = ((1 << conf->prev.chunk_shift) *
4102 conf->prev.far_copies);
4103 after_length = ((1 << conf->geo.chunk_shift) *
4104 conf->geo.far_copies);
4105
4106 rdev_for_each(rdev, mddev) {
4107 if (!test_bit(In_sync, &rdev->flags)
4108 && !test_bit(Faulty, &rdev->flags))
4109 spares++;
4110 if (rdev->raid_disk >= 0) {
4111 long long diff = (rdev->new_data_offset
4112 - rdev->data_offset);
4113 if (!mddev->reshape_backwards)
4114 diff = -diff;
4115 if (diff < 0)
4116 diff = 0;
4117 if (first || diff < min_offset_diff)
4118 min_offset_diff = diff;
4119 }
4120 }
4121
4122 if (max(before_length, after_length) > min_offset_diff)
4123 return -EINVAL;
4124
4125 if (spares < mddev->delta_disks)
4126 return -EINVAL;
4127
4128 conf->offset_diff = min_offset_diff;
4129 spin_lock_irq(&conf->device_lock);
4130 if (conf->mirrors_new) {
4131 memcpy(conf->mirrors_new, conf->mirrors,
4132 sizeof(struct raid10_info)*conf->prev.raid_disks);
4133 smp_mb();
4134 kfree(conf->mirrors_old);
4135 conf->mirrors_old = conf->mirrors;
4136 conf->mirrors = conf->mirrors_new;
4137 conf->mirrors_new = NULL;
4138 }
4139 setup_geo(&conf->geo, mddev, geo_start);
4140 smp_mb();
4141 if (mddev->reshape_backwards) {
4142 sector_t size = raid10_size(mddev, 0, 0);
4143 if (size < mddev->array_sectors) {
4144 spin_unlock_irq(&conf->device_lock);
4145 printk(KERN_ERR "md/raid10:%s: array size must be reduced before reducing the number of disks\n",
4146 mdname(mddev));
4147 return -EINVAL;
4148 }
4149 mddev->resync_max_sectors = size;
4150 conf->reshape_progress = size;
4151 } else
4152 conf->reshape_progress = 0;
4153 spin_unlock_irq(&conf->device_lock);
4154
4155 if (mddev->delta_disks && mddev->bitmap) {
4156 ret = bitmap_resize(mddev->bitmap,
4157 raid10_size(mddev, 0,
4158 conf->geo.raid_disks),
4159 0, 0);
4160 if (ret)
4161 goto abort;
4162 }
4163 if (mddev->delta_disks > 0) {
4164 rdev_for_each(rdev, mddev)
4165 if (rdev->raid_disk < 0 &&
4166 !test_bit(Faulty, &rdev->flags)) {
4167 if (raid10_add_disk(mddev, rdev) == 0) {
4168 if (rdev->raid_disk >=
4169 conf->prev.raid_disks)
4170 set_bit(In_sync, &rdev->flags);
4171 else
4172 rdev->recovery_offset = 0;
4173
4174 if (sysfs_link_rdev(mddev, rdev))
4175 ;
4176 }
4177 } else if (rdev->raid_disk >= conf->prev.raid_disks
4178 && !test_bit(Faulty, &rdev->flags)) {
4179
4180 set_bit(In_sync, &rdev->flags);
4181 }
4182 }
4183
4184
4185
4186
4187 spin_lock_irq(&conf->device_lock);
4188 mddev->degraded = calc_degraded(conf);
4189 spin_unlock_irq(&conf->device_lock);
4190 mddev->raid_disks = conf->geo.raid_disks;
4191 mddev->reshape_position = conf->reshape_progress;
4192 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4193
4194 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4195 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4196 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4197 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4198
4199 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4200 "reshape");
4201 if (!mddev->sync_thread) {
4202 ret = -EAGAIN;
4203 goto abort;
4204 }
4205 conf->reshape_checkpoint = jiffies;
4206 md_wakeup_thread(mddev->sync_thread);
4207 md_new_event(mddev);
4208 return 0;
4209
4210abort:
4211 mddev->recovery = 0;
4212 spin_lock_irq(&conf->device_lock);
4213 conf->geo = conf->prev;
4214 mddev->raid_disks = conf->geo.raid_disks;
4215 rdev_for_each(rdev, mddev)
4216 rdev->new_data_offset = rdev->data_offset;
4217 smp_wmb();
4218 conf->reshape_progress = MaxSector;
4219 mddev->reshape_position = MaxSector;
4220 spin_unlock_irq(&conf->device_lock);
4221 return ret;
4222}
4223
/*
 * Calculate the last device-address that could contain any block from
 * the chunk that includes the array-address 's', i.e. the device
 * address at which the following chunk begins.  Used by the reshape
 * code to decide how far it is safe to have progressed.
 */
4230static sector_t last_dev_address(sector_t s, struct geom *geo)
4231{
4232 s = (s | geo->chunk_mask) + 1;
4233 s >>= geo->chunk_shift;
4234 s *= geo->near_copies;
4235 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4236 s *= geo->far_copies;
4237 s <<= geo->chunk_shift;
4238 return s;
4239}
4240
/*
 * Calculate the first device-address that could contain any block from
 * the chunk that includes the array-address 's'.  Also used by the
 * reshape code when deciding which regions may safely be overwritten.
 */
4245static sector_t first_dev_address(sector_t s, struct geom *geo)
4246{
4247 s >>= geo->chunk_shift;
4248 s *= geo->near_copies;
4249 sector_div(s, geo->raid_disks);
4250 s *= geo->far_copies;
4251 s <<= geo->chunk_shift;
4252 return s;
4253}
4254
4255static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4256 int *skipped)
4257{
/*
 * We are reshaping: a region of the array is read through the old
 * geometry and written out again through the new one.  Each call
 * handles the range from reshape_progress up to the next chunk
 * boundary that is safe in both geometries (or down from it when
 * reshaping backwards, typically while the array is shrinking).
 *
 * Before writing over any area that the old layout might still need,
 * the metadata checkpoint (reshape_position) must be committed;
 * 'need_flush' records when that is required, and a checkpoint is also
 * written if more than 10 seconds have passed since the last one so
 * that a crash loses only a bounded amount of progress.
 *
 * The data is read with a single bio from one device, chosen by
 * read_balance() mapping the region through the previous geometry, and
 * is written to every copy in the new geometry once the read completes
 * (see reshape_request_write()).
 */
4295 struct r10conf *conf = mddev->private;
4296 struct r10bio *r10_bio;
4297 sector_t next, safe, last;
4298 int max_sectors;
4299 int nr_sectors;
4300 int s;
4301 struct md_rdev *rdev;
4302 int need_flush = 0;
4303 struct bio *blist;
4304 struct bio *bio, *read_bio;
4305 int sectors_done = 0;
4306
4307 if (sector_nr == 0) {
4308
4309 if (mddev->reshape_backwards &&
4310 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4311 sector_nr = (raid10_size(mddev, 0, 0)
4312 - conf->reshape_progress);
4313 } else if (!mddev->reshape_backwards &&
4314 conf->reshape_progress > 0)
4315 sector_nr = conf->reshape_progress;
4316 if (sector_nr) {
4317 mddev->curr_resync_completed = sector_nr;
4318 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4319 *skipped = 1;
4320 return sector_nr;
4321 }
4322 }
4323
4324
4325
4326
4327
4328 if (mddev->reshape_backwards) {
4329
4330
4331
4332 next = first_dev_address(conf->reshape_progress - 1,
4333 &conf->geo);
4334
4335
4336
4337
4338 safe = last_dev_address(conf->reshape_safe - 1,
4339 &conf->prev);
4340
4341 if (next + conf->offset_diff < safe)
4342 need_flush = 1;
4343
4344 last = conf->reshape_progress - 1;
4345 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4346 & conf->prev.chunk_mask);
4347 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4348 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4349 } else {
4350
4351
4352
4353 next = last_dev_address(conf->reshape_progress, &conf->geo);
4354
4355
4356
4357
4358 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4359
4360
4361
4362
4363 if (next > safe + conf->offset_diff)
4364 need_flush = 1;
4365
4366 sector_nr = conf->reshape_progress;
4367 last = sector_nr | (conf->geo.chunk_mask
4368 & conf->prev.chunk_mask);
4369
4370 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4371 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4372 }
4373
4374 if (need_flush ||
4375 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4376
4377 wait_barrier(conf);
4378 mddev->reshape_position = conf->reshape_progress;
4379 if (mddev->reshape_backwards)
4380 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4381 - conf->reshape_progress;
4382 else
4383 mddev->curr_resync_completed = conf->reshape_progress;
4384 conf->reshape_checkpoint = jiffies;
4385 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4386 md_wakeup_thread(mddev->thread);
4387 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4388 kthread_should_stop());
4389 conf->reshape_safe = mddev->reshape_position;
4390 allow_barrier(conf);
4391 }
4392
4393read_more:
4394
4395 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4396 raise_barrier(conf, sectors_done != 0);
4397 atomic_set(&r10_bio->remaining, 0);
4398 r10_bio->mddev = mddev;
4399 r10_bio->sector = sector_nr;
4400 set_bit(R10BIO_IsReshape, &r10_bio->state);
4401 r10_bio->sectors = last - sector_nr + 1;
4402 rdev = read_balance(conf, r10_bio, &max_sectors);
4403 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4404
4405 if (!rdev) {
4406
4407
4408
4409
4410 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4411 return sectors_done;
4412 }
4413
4414 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4415
4416 read_bio->bi_bdev = rdev->bdev;
4417 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4418 + rdev->data_offset);
4419 read_bio->bi_private = r10_bio;
4420 read_bio->bi_end_io = end_sync_read;
4421 read_bio->bi_rw = READ;
4422 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4423 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4424 read_bio->bi_vcnt = 0;
4425 read_bio->bi_size = 0;
4426 r10_bio->master_bio = read_bio;
4427 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4428
4429
4430 __raid10_find_phys(&conf->geo, r10_bio);
4431
4432 blist = read_bio;
4433 read_bio->bi_next = NULL;
4434
4435 for (s = 0; s < conf->copies*2; s++) {
4436 struct bio *b;
4437 int d = r10_bio->devs[s/2].devnum;
4438 struct md_rdev *rdev2;
4439 if (s&1) {
4440 rdev2 = conf->mirrors[d].replacement;
4441 b = r10_bio->devs[s/2].repl_bio;
4442 } else {
4443 rdev2 = conf->mirrors[d].rdev;
4444 b = r10_bio->devs[s/2].bio;
4445 }
4446 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4447 continue;
4448
4449 bio_reset(b);
4450 b->bi_bdev = rdev2->bdev;
4451 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4452 b->bi_private = r10_bio;
4453 b->bi_end_io = end_reshape_write;
4454 b->bi_rw = WRITE;
4455 b->bi_next = blist;
4456 blist = b;
4457 }
4458
4459
4460
4461 nr_sectors = 0;
4462 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4463 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4464 int len = (max_sectors - s) << 9;
4465 if (len > PAGE_SIZE)
4466 len = PAGE_SIZE;
4467 for (bio = blist; bio ; bio = bio->bi_next) {
4468 struct bio *bio2;
4469 if (bio_add_page(bio, page, len, 0))
4470 continue;
4471
4472
4473 for (bio2 = blist;
4474 bio2 && bio2 != bio;
4475 bio2 = bio2->bi_next) {
4476
4477 bio2->bi_vcnt--;
4478 bio2->bi_size -= len;
4479 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4480 }
4481 goto bio_full;
4482 }
4483 sector_nr += len >> 9;
4484 nr_sectors += len >> 9;
4485 }
4486bio_full:
4487 r10_bio->sectors = nr_sectors;
4488
4489
4490 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4491 atomic_inc(&r10_bio->remaining);
4492 read_bio->bi_next = NULL;
4493 generic_make_request(read_bio);
4494 sector_nr += nr_sectors;
4495 sectors_done += nr_sectors;
4496 if (sector_nr <= last)
4497 goto read_more;
4498
4499
4500
4501
4502 if (mddev->reshape_backwards)
4503 conf->reshape_progress -= sectors_done;
4504 else
4505 conf->reshape_progress += sectors_done;
4506
4507 return sectors_done;
4508}
4509
4510static void end_reshape_request(struct r10bio *r10_bio);
4511static int handle_reshape_read_error(struct mddev *mddev,
4512 struct r10bio *r10_bio);
4513static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4514{
/*
 * The read half of a reshape request has completed.  If it failed, try
 * to reconstruct the data from the old layout first; then issue the
 * writes that place this region into the new layout, one per device
 * (and per replacement) that holds a copy.
 */
4520 struct r10conf *conf = mddev->private;
4521 int s;
4522
4523 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4524 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4525
4526 md_done_sync(mddev, r10_bio->sectors, 0);
4527 return;
4528 }
4529
4530
4531
4532
4533 atomic_set(&r10_bio->remaining, 1);
4534 for (s = 0; s < conf->copies*2; s++) {
4535 struct bio *b;
4536 int d = r10_bio->devs[s/2].devnum;
4537 struct md_rdev *rdev;
4538 if (s&1) {
4539 rdev = conf->mirrors[d].replacement;
4540 b = r10_bio->devs[s/2].repl_bio;
4541 } else {
4542 rdev = conf->mirrors[d].rdev;
4543 b = r10_bio->devs[s/2].bio;
4544 }
4545 if (!rdev || test_bit(Faulty, &rdev->flags))
4546 continue;
4547 atomic_inc(&rdev->nr_pending);
4548 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4549 atomic_inc(&r10_bio->remaining);
4550 b->bi_next = NULL;
4551 generic_make_request(b);
4552 }
4553 end_reshape_request(r10_bio);
4554}
4555
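/*
 * The reshape has reached the end of the array.  Unless it was
 * interrupted, fold the new geometry into 'prev', clear the reshape
 * state and re-align the read-ahead size for the new layout.
 */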
4556static void end_reshape(struct r10conf *conf)
4557{
4558 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4559 return;
4560
4561 spin_lock_irq(&conf->device_lock);
4562 conf->prev = conf->geo;
4563 md_finish_reshape(conf->mddev);
4564 smp_wmb();
4565 conf->reshape_progress = MaxSector;
4566 spin_unlock_irq(&conf->device_lock);
4567
4568
4569
4570
4571 if (conf->mddev->queue) {
4572 int stripe = conf->geo.raid_disks *
4573 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4574 stripe /= conf->geo.near_copies;
4575 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4576 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4577 }
4578 conf->fullsync = 0;
4579}
4580
4581
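/*
 * The read side of a reshape request failed.  Try to reconstruct the
 * data one page at a time from any in-sync device in the old geometry;
 * if no copy of some page can be read, the reshape is aborted.
 */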
4582static int handle_reshape_read_error(struct mddev *mddev,
4583 struct r10bio *r10_bio)
4584{
4585
4586 int sectors = r10_bio->sectors;
4587 struct r10conf *conf = mddev->private;
4588 struct {
4589 struct r10bio r10_bio;
4590 struct r10dev devs[conf->copies];
4591 } on_stack;
4592 struct r10bio *r10b = &on_stack.r10_bio;
4593 int slot = 0;
4594 int idx = 0;
4595 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4596
4597 r10b->sector = r10_bio->sector;
4598 __raid10_find_phys(&conf->prev, r10b);
4599
4600 while (sectors) {
4601 int s = sectors;
4602 int success = 0;
4603 int first_slot = slot;
4604
4605 if (s > (PAGE_SIZE >> 9))
4606 s = PAGE_SIZE >> 9;
4607
4608 while (!success) {
4609 int d = r10b->devs[slot].devnum;
4610 struct md_rdev *rdev = conf->mirrors[d].rdev;
4611 sector_t addr;
4612 if (rdev == NULL ||
4613 test_bit(Faulty, &rdev->flags) ||
4614 !test_bit(In_sync, &rdev->flags))
4615 goto failed;
4616
4617 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4618 success = sync_page_io(rdev,
4619 addr,
4620 s << 9,
4621 bvec[idx].bv_page,
4622 READ, false);
4623 if (success)
4624 break;
4625 failed:
4626 slot++;
4627 if (slot >= conf->copies)
4628 slot = 0;
4629 if (slot == first_slot)
4630 break;
4631 }
4632 if (!success) {
4633
4634 set_bit(MD_RECOVERY_INTR,
4635 &mddev->recovery);
4636 return -EIO;
4637 }
4638 sectors -= s;
4639 idx++;
4640 }
4641 return 0;
4642}
4643
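/*
 * Completion handler for the writes issued by reshape_request_write();
 * a failed write simply marks the device faulty via md_error().
 */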
4644static void end_reshape_write(struct bio *bio, int error)
4645{
4646 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4647 struct r10bio *r10_bio = bio->bi_private;
4648 struct mddev *mddev = r10_bio->mddev;
4649 struct r10conf *conf = mddev->private;
4650 int d;
4651 int slot;
4652 int repl;
4653 struct md_rdev *rdev = NULL;
4654
4655 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4656 if (repl)
4657 rdev = conf->mirrors[d].replacement;
4658 if (!rdev) {
4659 smp_mb();
4660 rdev = conf->mirrors[d].rdev;
4661 }
4662
4663 if (!uptodate) {
4664
4665 md_error(mddev, rdev);
4666 }
4667
4668 rdev_dec_pending(rdev, mddev);
4669 end_reshape_request(r10_bio);
4670}
4671
4672static void end_reshape_request(struct r10bio *r10_bio)
4673{
4674 if (!atomic_dec_and_test(&r10_bio->remaining))
4675 return;
4676 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4677 bio_put(r10_bio->master_bio);
4678 put_buf(r10_bio);
4679}
4680
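/*
 * Called by the md core once the reshape has finished.  Growing arrays
 * pick up their new size here; shrinking arrays mark the now-unused
 * trailing devices (and their replacements) as out of sync so they can
 * be removed.
 */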
4681static void raid10_finish_reshape(struct mddev *mddev)
4682{
4683 struct r10conf *conf = mddev->private;
4684
4685 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4686 return;
4687
4688 if (mddev->delta_disks > 0) {
4689 sector_t size = raid10_size(mddev, 0, 0);
4690 md_set_array_sectors(mddev, size);
4691 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4692 mddev->recovery_cp = mddev->resync_max_sectors;
4693 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4694 }
4695 mddev->resync_max_sectors = size;
4696 set_capacity(mddev->gendisk, mddev->array_sectors);
4697 revalidate_disk(mddev->gendisk);
4698 } else {
4699 int d;
4700 for (d = conf->geo.raid_disks ;
4701 d < conf->geo.raid_disks - mddev->delta_disks;
4702 d++) {
4703 struct md_rdev *rdev = conf->mirrors[d].rdev;
4704 if (rdev)
4705 clear_bit(In_sync, &rdev->flags);
4706 rdev = conf->mirrors[d].replacement;
4707 if (rdev)
4708 clear_bit(In_sync, &rdev->flags);
4709 }
4710 }
4711 mddev->layout = mddev->new_layout;
4712 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4713 mddev->reshape_position = MaxSector;
4714 mddev->delta_disks = 0;
4715 mddev->reshape_backwards = 0;
4716}
4717
4718static struct md_personality raid10_personality =
4719{
4720 .name = "raid10",
4721 .level = 10,
4722 .owner = THIS_MODULE,
4723 .make_request = make_request,
4724 .run = run,
4725 .stop = stop,
4726 .status = status,
4727 .error_handler = error,
4728 .hot_add_disk = raid10_add_disk,
4729 .hot_remove_disk= raid10_remove_disk,
4730 .spare_active = raid10_spare_active,
4731 .sync_request = sync_request,
4732 .quiesce = raid10_quiesce,
4733 .size = raid10_size,
4734 .resize = raid10_resize,
4735 .takeover = raid10_takeover,
4736 .check_reshape = raid10_check_reshape,
4737 .start_reshape = raid10_start_reshape,
4738 .finish_reshape = raid10_finish_reshape,
4739};
4740
4741static int __init raid_init(void)
4742{
4743 return register_md_personality(&raid10_personality);
4744}
4745
4746static void raid_exit(void)
4747{
4748 unregister_md_personality(&raid10_personality);
4749}
4750
4751module_init(raid_init);
4752module_exit(raid_exit);
4753MODULE_LICENSE("GPL");
4754MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4755MODULE_ALIAS("md-personality-9");
4756MODULE_ALIAS("md-raid10");
4757MODULE_ALIAS("md-level-10");
4758
4759module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4760