#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"
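
/*
 * RAID-10 combines striping with mirrored copies.  The geometry in
 * struct geom describes the layout: each chunk is stored on
 * 'near_copies' adjacent devices and/or repeated 'far_copies' times
 * further along every device; with 'far_offset' the far copies follow
 * in the next stripe rather than in a separate zone of the device.
 * For example, near_copies=2 on four devices places chunks as:
 *
 *	D0  D1  D2  D3
 *	A   A   B   B
 *	C   C   D   D
 *
 * __raid10_find_phys() below implements the sector mapping for all of
 * these layouts.
 */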
#define NR_RAID10_BIOS 256

#define IO_BLOCKED ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)

static int max_queued_requests = 1024;

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio, int error);
static void end_reshape(struct r10conf *conf);

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int size = offsetof(struct r10bio, devs[conf->copies]);

	return kzalloc(size, gfp_flags);
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (1024*1024)
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
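
/*
 * For a resync/check, r10buf_pool_alloc() sets up one bio (with
 * RESYNC_PAGES pages) per copy so that all copies can be read and
 * compared.  For a recovery only two bios are needed - one to read
 * from a good device and one to write to the device being rebuilt -
 * and the data pages are shared between the two.
 */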
137static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
138{
139 struct r10conf *conf = data;
140 struct page *page;
141 struct r10bio *r10_bio;
142 struct bio *bio;
143 int i, j;
144 int nalloc;
145
146 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
147 if (!r10_bio)
148 return NULL;
149
150 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
151 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
152 nalloc = conf->copies;
153 else
154 nalloc = 2;
159 for (j = nalloc ; j-- ; ) {
160 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
161 if (!bio)
162 goto out_free_bio;
163 r10_bio->devs[j].bio = bio;
164 if (!conf->have_replacement)
165 continue;
166 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
167 if (!bio)
168 goto out_free_bio;
169 r10_bio->devs[j].repl_bio = bio;
170 }
175 for (j = 0 ; j < nalloc; j++) {
176 struct bio *rbio = r10_bio->devs[j].repl_bio;
177 bio = r10_bio->devs[j].bio;
178 for (i = 0; i < RESYNC_PAGES; i++) {
179 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
180 &conf->mddev->recovery)) {
181
182
183 struct bio *rbio = r10_bio->devs[0].bio;
184 page = rbio->bi_io_vec[i].bv_page;
185 get_page(page);
186 } else
187 page = alloc_page(gfp_flags);
188 if (unlikely(!page))
189 goto out_free_pages;
190
191 bio->bi_io_vec[i].bv_page = page;
192 if (rbio)
193 rbio->bi_io_vec[i].bv_page = page;
194 }
195 }
196
197 return r10_bio;
198
199out_free_pages:
200 for ( ; i > 0 ; i--)
201 safe_put_page(bio->bi_io_vec[i-1].bv_page);
202 while (j--)
203 for (i = 0; i < RESYNC_PAGES ; i++)
204 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
205 j = 0;
206out_free_bio:
207 for ( ; j < nalloc; j++) {
208 if (r10_bio->devs[j].bio)
209 bio_put(r10_bio->devs[j].bio);
210 if (r10_bio->devs[j].repl_bio)
211 bio_put(r10_bio->devs[j].repl_bio);
212 }
213 r10bio_pool_free(r10_bio, conf);
214 return NULL;
215}
216
217static void r10buf_pool_free(void *__r10_bio, void *data)
218{
219 int i;
220 struct r10conf *conf = data;
221 struct r10bio *r10bio = __r10_bio;
222 int j;
223
224 for (j=0; j < conf->copies; j++) {
225 struct bio *bio = r10bio->devs[j].bio;
226 if (bio) {
227 for (i = 0; i < RESYNC_PAGES; i++) {
228 safe_put_page(bio->bi_io_vec[i].bv_page);
229 bio->bi_io_vec[i].bv_page = NULL;
230 }
231 bio_put(bio);
232 }
233 bio = r10bio->devs[j].repl_bio;
234 if (bio)
235 bio_put(bio);
236 }
237 r10bio_pool_free(r10bio, conf);
238}
239
240static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
241{
242 int i;
243
244 for (i = 0; i < conf->copies; i++) {
245 struct bio **bio = & r10_bio->devs[i].bio;
246 if (!BIO_SPECIAL(*bio))
247 bio_put(*bio);
248 *bio = NULL;
249 bio = &r10_bio->devs[i].repl_bio;
250 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
251 bio_put(*bio);
252 *bio = NULL;
253 }
254}
255
256static void free_r10bio(struct r10bio *r10_bio)
257{
258 struct r10conf *conf = r10_bio->mddev->private;
259
260 put_all_bios(conf, r10_bio);
261 mempool_free(r10_bio, conf->r10bio_pool);
262}
263
264static void put_buf(struct r10bio *r10_bio)
265{
266 struct r10conf *conf = r10_bio->mddev->private;
267
268 mempool_free(r10_bio, conf->r10buf_pool);
269
270 lower_barrier(conf);
271}
272
273static void reschedule_retry(struct r10bio *r10_bio)
274{
275 unsigned long flags;
276 struct mddev *mddev = r10_bio->mddev;
277 struct r10conf *conf = mddev->private;
278
279 spin_lock_irqsave(&conf->device_lock, flags);
280 list_add(&r10_bio->retry_list, &conf->retry_list);
281 conf->nr_queued ++;
282 spin_unlock_irqrestore(&conf->device_lock, flags);
283
284
285 wake_up(&conf->wait_barrier);
286
287 md_wakeup_thread(mddev->thread);
288}
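
/*
 * Complete the master bio.  If the original request had to be split
 * across r10_bios, bio->bi_phys_segments counts the pieces still in
 * flight and the bio is only ended when the last piece finishes; the
 * reference taken in wait_barrier() is dropped here as well.
 */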
295static void raid_end_bio_io(struct r10bio *r10_bio)
296{
297 struct bio *bio = r10_bio->master_bio;
298 int done;
299 struct r10conf *conf = r10_bio->mddev->private;
300
301 if (bio->bi_phys_segments) {
302 unsigned long flags;
303 spin_lock_irqsave(&conf->device_lock, flags);
304 bio->bi_phys_segments--;
305 done = (bio->bi_phys_segments == 0);
306 spin_unlock_irqrestore(&conf->device_lock, flags);
307 } else
308 done = 1;
309 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
310 clear_bit(BIO_UPTODATE, &bio->bi_flags);
311 if (done) {
312 bio_endio(bio, 0);
317 allow_barrier(conf);
318 }
319 free_r10bio(r10_bio);
320}
325static inline void update_head_pos(int slot, struct r10bio *r10_bio)
326{
327 struct r10conf *conf = r10_bio->mddev->private;
328
329 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
330 r10_bio->devs[slot].addr + (r10_bio->sectors);
331}
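
/*
 * Find the slot and device to which a completed bio belongs, note
 * whether it was sent to the replacement, and update the recorded
 * head position for that device.
 */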
336static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
337 struct bio *bio, int *slotp, int *replp)
338{
339 int slot;
340 int repl = 0;
341
342 for (slot = 0; slot < conf->copies; slot++) {
343 if (r10_bio->devs[slot].bio == bio)
344 break;
345 if (r10_bio->devs[slot].repl_bio == bio) {
346 repl = 1;
347 break;
348 }
349 }
350
351 BUG_ON(slot == conf->copies);
352 update_head_pos(slot, r10_bio);
353
354 if (slotp)
355 *slotp = slot;
356 if (replp)
357 *replp = repl;
358 return r10_bio->devs[slot].devnum;
359}
360
361static void raid10_end_read_request(struct bio *bio, int error)
362{
363 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
364 struct r10bio *r10_bio = bio->bi_private;
365 int slot, dev;
366 struct md_rdev *rdev;
367 struct r10conf *conf = r10_bio->mddev->private;
368
369
370 slot = r10_bio->read_slot;
371 dev = r10_bio->devs[slot].devnum;
372 rdev = r10_bio->devs[slot].rdev;
376 update_head_pos(slot, r10_bio);
377
378 if (uptodate) {
388 set_bit(R10BIO_Uptodate, &r10_bio->state);
389 } else {
395 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
396 rdev->raid_disk))
397 uptodate = 1;
398 }
399 if (uptodate) {
400 raid_end_bio_io(r10_bio);
401 rdev_dec_pending(rdev, conf->mddev);
402 } else {
406 char b[BDEVNAME_SIZE];
407 printk_ratelimited(KERN_ERR
408 "md/raid10:%s: %s: rescheduling sector %llu\n",
409 mdname(conf->mddev),
410 bdevname(rdev->bdev, b),
411 (unsigned long long)r10_bio->sector);
412 set_bit(R10BIO_ReadError, &r10_bio->state);
413 reschedule_retry(r10_bio);
414 }
415}
416
417static void close_write(struct r10bio *r10_bio)
418{
419
420 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
421 r10_bio->sectors,
422 !test_bit(R10BIO_Degraded, &r10_bio->state),
423 0);
424 md_write_end(r10_bio->mddev);
425}
426
427static void one_write_done(struct r10bio *r10_bio)
428{
429 if (atomic_dec_and_test(&r10_bio->remaining)) {
430 if (test_bit(R10BIO_WriteError, &r10_bio->state))
431 reschedule_retry(r10_bio);
432 else {
433 close_write(r10_bio);
434 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
435 reschedule_retry(r10_bio);
436 else
437 raid_end_bio_io(r10_bio);
438 }
439 }
440}
441
442static void raid10_end_write_request(struct bio *bio, int error)
443{
444 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
445 struct r10bio *r10_bio = bio->bi_private;
446 int dev;
447 int dec_rdev = 1;
448 struct r10conf *conf = r10_bio->mddev->private;
449 int slot, repl;
450 struct md_rdev *rdev = NULL;
451
452 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
453
454 if (repl)
455 rdev = conf->mirrors[dev].replacement;
456 if (!rdev) {
457 smp_rmb();
458 repl = 0;
459 rdev = conf->mirrors[dev].rdev;
460 }
464 if (!uptodate) {
465 if (repl)
469 md_error(rdev->mddev, rdev);
470 else {
471 set_bit(WriteErrorSeen, &rdev->flags);
472 if (!test_and_set_bit(WantReplacement, &rdev->flags))
473 set_bit(MD_RECOVERY_NEEDED,
474 &rdev->mddev->recovery);
475 set_bit(R10BIO_WriteError, &r10_bio->state);
476 dec_rdev = 0;
477 }
478 } else {
488 sector_t first_bad;
489 int bad_sectors;
499 if (test_bit(In_sync, &rdev->flags) &&
500 !test_bit(Faulty, &rdev->flags))
501 set_bit(R10BIO_Uptodate, &r10_bio->state);
502
503
504 if (is_badblock(rdev,
505 r10_bio->devs[slot].addr,
506 r10_bio->sectors,
507 &first_bad, &bad_sectors)) {
508 bio_put(bio);
509 if (repl)
510 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
511 else
512 r10_bio->devs[slot].bio = IO_MADE_GOOD;
513 dec_rdev = 0;
514 set_bit(R10BIO_MadeGood, &r10_bio->state);
515 }
516 }
523 one_write_done(r10_bio);
524 if (dec_rdev)
525 rdev_dec_pending(rdev, conf->mddev);
526}
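
/*
 * Map the array sector in r10bio->sector to a (device, sector) pair
 * for every copy, filling r10bio->devs[].  Near copies go to adjacent
 * devices in the same stripe; far copies are placed geo->stride
 * sectors further along, within the appropriate far set.  The last
 * far set is larger when raid_disks is not a multiple of
 * far_set_size.
 */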
553static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
554{
555 int n,f;
556 sector_t sector;
557 sector_t chunk;
558 sector_t stripe;
559 int dev;
560 int slot = 0;
561 int last_far_set_start, last_far_set_size;
562
563 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
564 last_far_set_start *= geo->far_set_size;
565
566 last_far_set_size = geo->far_set_size;
567 last_far_set_size += (geo->raid_disks % geo->far_set_size);
568
569
570 chunk = r10bio->sector >> geo->chunk_shift;
571 sector = r10bio->sector & geo->chunk_mask;
572
573 chunk *= geo->near_copies;
574 stripe = chunk;
575 dev = sector_div(stripe, geo->raid_disks);
576 if (geo->far_offset)
577 stripe *= geo->far_copies;
578
579 sector += stripe << geo->chunk_shift;
580
581
582 for (n = 0; n < geo->near_copies; n++) {
583 int d = dev;
584 int set;
585 sector_t s = sector;
586 r10bio->devs[slot].devnum = d;
587 r10bio->devs[slot].addr = s;
588 slot++;
589
590 for (f = 1; f < geo->far_copies; f++) {
591 set = d / geo->far_set_size;
592 d += geo->near_copies;
593
594 if ((geo->raid_disks % geo->far_set_size) &&
595 (d > last_far_set_start)) {
596 d -= last_far_set_start;
597 d %= last_far_set_size;
598 d += last_far_set_start;
599 } else {
600 d %= geo->far_set_size;
601 d += geo->far_set_size * set;
602 }
603 s += geo->stride;
604 r10bio->devs[slot].devnum = d;
605 r10bio->devs[slot].addr = s;
606 slot++;
607 }
608 dev++;
609 if (dev >= geo->raid_disks) {
610 dev = 0;
611 sector += (geo->chunk_mask + 1);
612 }
613 }
614}
615
616static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
617{
618 struct geom *geo = &conf->geo;
619
620 if (conf->reshape_progress != MaxSector &&
621 ((r10bio->sector >= conf->reshape_progress) !=
622 conf->mddev->reshape_backwards)) {
623 set_bit(R10BIO_Previous, &r10bio->state);
624 geo = &conf->prev;
625 } else
626 clear_bit(R10BIO_Previous, &r10bio->state);
627
628 __raid10_find_phys(geo, r10bio);
629}
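
/*
 * The inverse of __raid10_find_phys(): given a device number and a
 * sector on that device, return the corresponding array sector.
 */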
631static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
632{
633 sector_t offset, chunk, vchunk;
637 struct geom *geo = &conf->geo;
638 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
639 int far_set_size = geo->far_set_size;
640 int last_far_set_start;
641
642 if (geo->raid_disks % geo->far_set_size) {
643 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
644 last_far_set_start *= geo->far_set_size;
645
646 if (dev >= last_far_set_start) {
647 far_set_size = geo->far_set_size;
648 far_set_size += (geo->raid_disks % geo->far_set_size);
649 far_set_start = last_far_set_start;
650 }
651 }
652
653 offset = sector & geo->chunk_mask;
654 if (geo->far_offset) {
655 int fc;
656 chunk = sector >> geo->chunk_shift;
657 fc = sector_div(chunk, geo->far_copies);
658 dev -= fc * geo->near_copies;
659 if (dev < far_set_start)
660 dev += far_set_size;
661 } else {
662 while (sector >= geo->stride) {
663 sector -= geo->stride;
664 if (dev < (geo->near_copies + far_set_start))
665 dev += far_set_size - geo->near_copies;
666 else
667 dev -= geo->near_copies;
668 }
669 chunk = sector >> geo->chunk_shift;
670 }
671 vchunk = chunk * geo->raid_disks + dev;
672 sector_div(vchunk, geo->near_copies);
673 return (vchunk << geo->chunk_shift) + offset;
674}
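
/*
 * Decide how many bytes of this bio_vec can be accepted at the given
 * offset.  A request must never cross a chunk boundary while the data
 * is striped; when the member devices have their own merge_bvec_fn
 * they are consulted too, and while a reshape is in progress no
 * merging beyond the first page is attempted.
 */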
686static int raid10_mergeable_bvec(struct request_queue *q,
687 struct bvec_merge_data *bvm,
688 struct bio_vec *biovec)
689{
690 struct mddev *mddev = q->queuedata;
691 struct r10conf *conf = mddev->private;
692 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
693 int max;
694 unsigned int chunk_sectors;
695 unsigned int bio_sectors = bvm->bi_size >> 9;
696 struct geom *geo = &conf->geo;
697
698 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
699 if (conf->reshape_progress != MaxSector &&
700 ((sector >= conf->reshape_progress) !=
701 conf->mddev->reshape_backwards))
702 geo = &conf->prev;
703
704 if (geo->near_copies < geo->raid_disks) {
705 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
706 + bio_sectors)) << 9;
707 if (max < 0)
708
709 max = 0;
710 if (max <= biovec->bv_len && bio_sectors == 0)
711 return biovec->bv_len;
712 } else
713 max = biovec->bv_len;
714
715 if (mddev->merge_check_needed) {
716 struct {
717 struct r10bio r10_bio;
718 struct r10dev devs[conf->copies];
719 } on_stack;
720 struct r10bio *r10_bio = &on_stack.r10_bio;
721 int s;
722 if (conf->reshape_progress != MaxSector) {
723
724 if (max <= biovec->bv_len && bio_sectors == 0)
725 return biovec->bv_len;
726 return 0;
727 }
728 r10_bio->sector = sector;
729 raid10_find_phys(conf, r10_bio);
730 rcu_read_lock();
731 for (s = 0; s < conf->copies; s++) {
732 int disk = r10_bio->devs[s].devnum;
733 struct md_rdev *rdev = rcu_dereference(
734 conf->mirrors[disk].rdev);
735 if (rdev && !test_bit(Faulty, &rdev->flags)) {
736 struct request_queue *q =
737 bdev_get_queue(rdev->bdev);
738 if (q->merge_bvec_fn) {
739 bvm->bi_sector = r10_bio->devs[s].addr
740 + rdev->data_offset;
741 bvm->bi_bdev = rdev->bdev;
742 max = min(max, q->merge_bvec_fn(
743 q, bvm, biovec));
744 }
745 }
746 rdev = rcu_dereference(conf->mirrors[disk].replacement);
747 if (rdev && !test_bit(Faulty, &rdev->flags)) {
748 struct request_queue *q =
749 bdev_get_queue(rdev->bdev);
750 if (q->merge_bvec_fn) {
751 bvm->bi_sector = r10_bio->devs[s].addr
752 + rdev->data_offset;
753 bvm->bi_bdev = rdev->bdev;
754 max = min(max, q->merge_bvec_fn(
755 q, bvm, biovec));
756 }
757 }
758 }
759 rcu_read_unlock();
760 }
761 return max;
762}
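
/*
 * Pick the best mirror to read r10_bio from.  Balancing is skipped
 * close to the resync window; faulty, unmerged or not-yet-recovered
 * devices are ignored, and known bad blocks may shrink *max_sectors.
 * Among the remaining copies an idle device is preferred for 'near'
 * layouts, otherwise the lowest address ('far' layouts) or shortest
 * seek distance wins.
 */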
783static struct md_rdev *read_balance(struct r10conf *conf,
784 struct r10bio *r10_bio,
785 int *max_sectors)
786{
787 const sector_t this_sector = r10_bio->sector;
788 int disk, slot;
789 int sectors = r10_bio->sectors;
790 int best_good_sectors;
791 sector_t new_distance, best_dist;
792 struct md_rdev *best_rdev, *rdev = NULL;
793 int do_balance;
794 int best_slot;
795 struct geom *geo = &conf->geo;
796
797 raid10_find_phys(conf, r10_bio);
798 rcu_read_lock();
799retry:
800 sectors = r10_bio->sectors;
801 best_slot = -1;
802 best_rdev = NULL;
803 best_dist = MaxSector;
804 best_good_sectors = 0;
805 do_balance = 1;
812 if (conf->mddev->recovery_cp < MaxSector
813 && (this_sector + sectors >= conf->next_resync))
814 do_balance = 0;
815
816 for (slot = 0; slot < conf->copies ; slot++) {
817 sector_t first_bad;
818 int bad_sectors;
819 sector_t dev_sector;
820
821 if (r10_bio->devs[slot].bio == IO_BLOCKED)
822 continue;
823 disk = r10_bio->devs[slot].devnum;
824 rdev = rcu_dereference(conf->mirrors[disk].replacement);
825 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
826 test_bit(Unmerged, &rdev->flags) ||
827 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
828 rdev = rcu_dereference(conf->mirrors[disk].rdev);
829 if (rdev == NULL ||
830 test_bit(Faulty, &rdev->flags) ||
831 test_bit(Unmerged, &rdev->flags))
832 continue;
833 if (!test_bit(In_sync, &rdev->flags) &&
834 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
835 continue;
836
837 dev_sector = r10_bio->devs[slot].addr;
838 if (is_badblock(rdev, dev_sector, sectors,
839 &first_bad, &bad_sectors)) {
840 if (best_dist < MaxSector)
841
842 continue;
843 if (first_bad <= dev_sector) {
848 bad_sectors -= (dev_sector - first_bad);
849 if (!do_balance && sectors > bad_sectors)
850 sectors = bad_sectors;
851 if (best_good_sectors > sectors)
852 best_good_sectors = sectors;
853 } else {
854 sector_t good_sectors =
855 first_bad - dev_sector;
856 if (good_sectors > best_good_sectors) {
857 best_good_sectors = good_sectors;
858 best_slot = slot;
859 best_rdev = rdev;
860 }
861 if (!do_balance)
862
863 break;
864 }
865 continue;
866 } else
867 best_good_sectors = sectors;
868
869 if (!do_balance)
870 break;
876 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
877 break;
878
879
880 if (geo->far_copies > 1)
881 new_distance = r10_bio->devs[slot].addr;
882 else
883 new_distance = abs(r10_bio->devs[slot].addr -
884 conf->mirrors[disk].head_position);
885 if (new_distance < best_dist) {
886 best_dist = new_distance;
887 best_slot = slot;
888 best_rdev = rdev;
889 }
890 }
891 if (slot >= conf->copies) {
892 slot = best_slot;
893 rdev = best_rdev;
894 }
895
896 if (slot >= 0) {
897 atomic_inc(&rdev->nr_pending);
898 if (test_bit(Faulty, &rdev->flags)) {
902 rdev_dec_pending(rdev, conf->mddev);
903 goto retry;
904 }
905 r10_bio->read_slot = slot;
906 } else
907 rdev = NULL;
908 rcu_read_unlock();
909 *max_sectors = best_good_sectors;
910
911 return rdev;
912}
913
914int md_raid10_congested(struct mddev *mddev, int bits)
915{
916 struct r10conf *conf = mddev->private;
917 int i, ret = 0;
918
919 if ((bits & (1 << BDI_async_congested)) &&
920 conf->pending_count >= max_queued_requests)
921 return 1;
922
923 rcu_read_lock();
924 for (i = 0;
925 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
926 && ret == 0;
927 i++) {
928 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
929 if (rdev && !test_bit(Faulty, &rdev->flags)) {
930 struct request_queue *q = bdev_get_queue(rdev->bdev);
931
932 ret |= bdi_congested(&q->backing_dev_info, bits);
933 }
934 }
935 rcu_read_unlock();
936 return ret;
937}
938EXPORT_SYMBOL_GPL(md_raid10_congested);
939
940static int raid10_congested(void *data, int bits)
941{
942 struct mddev *mddev = data;
943
944 return mddev_congested(mddev, bits) ||
945 md_raid10_congested(mddev, bits);
946}
947
948static void flush_pending_writes(struct r10conf *conf)
949{
953 spin_lock_irq(&conf->device_lock);
954
955 if (conf->pending_bio_list.head) {
956 struct bio *bio;
957 bio = bio_list_get(&conf->pending_bio_list);
958 conf->pending_count = 0;
959 spin_unlock_irq(&conf->device_lock);
960
961
962 bitmap_unplug(conf->mddev->bitmap);
963 wake_up(&conf->wait_barrier);
964
965 while (bio) {
966 struct bio *next = bio->bi_next;
967 bio->bi_next = NULL;
968 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
969 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
970
971 bio_endio(bio, 0);
972 else
973 generic_make_request(bio);
974 bio = next;
975 }
976 } else
977 spin_unlock_irq(&conf->device_lock);
978}
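
/*
 * Resync/recovery and normal IO exclude each other with a simple
 * barrier scheme: raise_barrier() bumps conf->barrier (at most
 * RESYNC_DEPTH deep) and waits for pending normal IO to drain, while
 * wait_barrier()/allow_barrier() count normal requests in
 * conf->nr_pending and make them wait while the barrier is up.
 * freeze_array() goes further and waits until every pending request
 * has completed or been queued for retry, so the array can be
 * reconfigured safely.
 */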
1002static void raise_barrier(struct r10conf *conf, int force)
1003{
1004 BUG_ON(force && !conf->barrier);
1005 spin_lock_irq(&conf->resync_lock);
1006
1007
1008 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
1009 conf->resync_lock);
1010
1011
1012 conf->barrier++;
1013
1014
1015 wait_event_lock_irq(conf->wait_barrier,
1016 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
1017 conf->resync_lock);
1018
1019 spin_unlock_irq(&conf->resync_lock);
1020}
1021
1022static void lower_barrier(struct r10conf *conf)
1023{
1024 unsigned long flags;
1025 spin_lock_irqsave(&conf->resync_lock, flags);
1026 conf->barrier--;
1027 spin_unlock_irqrestore(&conf->resync_lock, flags);
1028 wake_up(&conf->wait_barrier);
1029}
1030
1031static void wait_barrier(struct r10conf *conf)
1032{
1033 spin_lock_irq(&conf->resync_lock);
1034 if (conf->barrier) {
1035 conf->nr_waiting++;
1045 wait_event_lock_irq(conf->wait_barrier,
1046 !conf->barrier ||
1047 (conf->nr_pending &&
1048 current->bio_list &&
1049 !bio_list_empty(current->bio_list)),
1050 conf->resync_lock);
1051 conf->nr_waiting--;
1052 }
1053 conf->nr_pending++;
1054 spin_unlock_irq(&conf->resync_lock);
1055}
1056
1057static void allow_barrier(struct r10conf *conf)
1058{
1059 unsigned long flags;
1060 spin_lock_irqsave(&conf->resync_lock, flags);
1061 conf->nr_pending--;
1062 spin_unlock_irqrestore(&conf->resync_lock, flags);
1063 wake_up(&conf->wait_barrier);
1064}
1065
1066static void freeze_array(struct r10conf *conf, int extra)
1067{
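	/*
	 * Stop all normal IO: wait until the conf->nr_pending requests
	 * are all accounted for in conf->nr_queued (plus 'extra' held
	 * by the caller), flushing the pending write list so that those
	 * writes cannot deadlock the wait.
	 */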
1080 spin_lock_irq(&conf->resync_lock);
1081 conf->barrier++;
1082 conf->nr_waiting++;
1083 wait_event_lock_irq_cmd(conf->wait_barrier,
1084 conf->nr_pending == conf->nr_queued+extra,
1085 conf->resync_lock,
1086 flush_pending_writes(conf));
1087
1088 spin_unlock_irq(&conf->resync_lock);
1089}
1090
1091static void unfreeze_array(struct r10conf *conf)
1092{
1093
1094 spin_lock_irq(&conf->resync_lock);
1095 conf->barrier--;
1096 conf->nr_waiting--;
1097 wake_up(&conf->wait_barrier);
1098 spin_unlock_irq(&conf->resync_lock);
1099}
1100
1101static sector_t choose_data_offset(struct r10bio *r10_bio,
1102 struct md_rdev *rdev)
1103{
1104 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1105 test_bit(R10BIO_Previous, &r10_bio->state))
1106 return rdev->data_offset;
1107 else
1108 return rdev->new_data_offset;
1109}
1110
1111struct raid10_plug_cb {
1112 struct blk_plug_cb cb;
1113 struct bio_list pending;
1114 int pending_cnt;
1115};
1116
1117static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1118{
1119 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1120 cb);
1121 struct mddev *mddev = plug->cb.data;
1122 struct r10conf *conf = mddev->private;
1123 struct bio *bio;
1124
1125 if (from_schedule || current->bio_list) {
1126 spin_lock_irq(&conf->device_lock);
1127 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1128 conf->pending_count += plug->pending_cnt;
1129 spin_unlock_irq(&conf->device_lock);
1130 wake_up(&conf->wait_barrier);
1131 md_wakeup_thread(mddev->thread);
1132 kfree(plug);
1133 return;
1134 }
1135
1136
1137 bio = bio_list_get(&plug->pending);
1138 bitmap_unplug(mddev->bitmap);
1139 wake_up(&conf->wait_barrier);
1140
1141 while (bio) {
1142 struct bio *next = bio->bi_next;
1143 bio->bi_next = NULL;
1144 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
1145 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
1146
1147 bio_endio(bio, 0);
1148 else
1149 generic_make_request(bio);
1150 bio = next;
1151 }
1152 kfree(plug);
1153}
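
/*
 * Handle one bio that is already guaranteed not to cross a chunk
 * boundary: wait out any reshape covering the range, then either pick
 * one mirror with read_balance() for a READ or clone the bio to every
 * available copy for a WRITE.
 */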
1155static void __make_request(struct mddev *mddev, struct bio *bio)
1156{
1157 struct r10conf *conf = mddev->private;
1158 struct r10bio *r10_bio;
1159 struct bio *read_bio;
1160 int i;
1161 const int rw = bio_data_dir(bio);
1162 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1163 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1164 const unsigned long do_discard = (bio->bi_rw
1165 & (REQ_DISCARD | REQ_SECURE));
1166 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1167 unsigned long flags;
1168 struct md_rdev *blocked_rdev;
1169 struct blk_plug_cb *cb;
1170 struct raid10_plug_cb *plug = NULL;
1171 int sectors_handled;
1172 int max_sectors;
1173 int sectors;
1174
1175 sectors = bio_sectors(bio);
1176 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1177 bio->bi_iter.bi_sector < conf->reshape_progress &&
1178 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1182 allow_barrier(conf);
1183 wait_event(conf->wait_barrier,
1184 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1185 conf->reshape_progress >= bio->bi_iter.bi_sector +
1186 sectors);
1187 wait_barrier(conf);
1188 }
1189 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1190 bio_data_dir(bio) == WRITE &&
1191 (mddev->reshape_backwards
1192 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1193 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1194 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1195 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1196
1197 mddev->reshape_position = conf->reshape_progress;
1198 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1199 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1200 md_wakeup_thread(mddev->thread);
1201 wait_event(mddev->sb_wait,
1202 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1203
1204 conf->reshape_safe = mddev->reshape_position;
1205 }
1206
1207 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1208
1209 r10_bio->master_bio = bio;
1210 r10_bio->sectors = sectors;
1211
1212 r10_bio->mddev = mddev;
1213 r10_bio->sector = bio->bi_iter.bi_sector;
1214 r10_bio->state = 0;
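
	/*
	 * The request may have to be split into several r10_bios;
	 * bi_phys_segments of the master bio is reused to count them
	 * (see raid_end_bio_io()).
	 */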
1223 bio->bi_phys_segments = 0;
1224 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1225
1226 if (rw == READ) {
1230 struct md_rdev *rdev;
1231 int slot;
1232
1233read_again:
1234 rdev = read_balance(conf, r10_bio, &max_sectors);
1235 if (!rdev) {
1236 raid_end_bio_io(r10_bio);
1237 return;
1238 }
1239 slot = r10_bio->read_slot;
1240
1241 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1242 bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
1243 max_sectors);
1244
1245 r10_bio->devs[slot].bio = read_bio;
1246 r10_bio->devs[slot].rdev = rdev;
1247
1248 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1249 choose_data_offset(r10_bio, rdev);
1250 read_bio->bi_bdev = rdev->bdev;
1251 read_bio->bi_end_io = raid10_end_read_request;
1252 read_bio->bi_rw = READ | do_sync;
1253 read_bio->bi_private = r10_bio;
1254
1255 if (max_sectors < r10_bio->sectors) {
1259 sectors_handled = (r10_bio->sector + max_sectors
1260 - bio->bi_iter.bi_sector);
1261 r10_bio->sectors = max_sectors;
1262 spin_lock_irq(&conf->device_lock);
1263 if (bio->bi_phys_segments == 0)
1264 bio->bi_phys_segments = 2;
1265 else
1266 bio->bi_phys_segments++;
1267 spin_unlock_irq(&conf->device_lock);
1273 reschedule_retry(r10_bio);
1274
1275 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1276
1277 r10_bio->master_bio = bio;
1278 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1279 r10_bio->state = 0;
1280 r10_bio->mddev = mddev;
1281 r10_bio->sector = bio->bi_iter.bi_sector +
1282 sectors_handled;
1283 goto read_again;
1284 } else
1285 generic_make_request(read_bio);
1286 return;
1287 }
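
	/*
	 * WRITE: throttle on queue depth first, then find all copies of
	 * this block.  Blocked rdevs or unacknowledged bad blocks below
	 * may force a retry or shrink max_sectors before the per-device
	 * clones are created.
	 */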
1292 if (conf->pending_count >= max_queued_requests) {
1293 md_wakeup_thread(mddev->thread);
1294 wait_event(conf->wait_barrier,
1295 conf->pending_count < max_queued_requests);
1296 }
1309 r10_bio->read_slot = -1;
1310 raid10_find_phys(conf, r10_bio);
1311retry_write:
1312 blocked_rdev = NULL;
1313 rcu_read_lock();
1314 max_sectors = r10_bio->sectors;
1315
1316 for (i = 0; i < conf->copies; i++) {
1317 int d = r10_bio->devs[i].devnum;
1318 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1319 struct md_rdev *rrdev = rcu_dereference(
1320 conf->mirrors[d].replacement);
1321 if (rdev == rrdev)
1322 rrdev = NULL;
1323 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1324 atomic_inc(&rdev->nr_pending);
1325 blocked_rdev = rdev;
1326 break;
1327 }
1328 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1329 atomic_inc(&rrdev->nr_pending);
1330 blocked_rdev = rrdev;
1331 break;
1332 }
1333 if (rdev && (test_bit(Faulty, &rdev->flags)
1334 || test_bit(Unmerged, &rdev->flags)))
1335 rdev = NULL;
1336 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1337 || test_bit(Unmerged, &rrdev->flags)))
1338 rrdev = NULL;
1339
1340 r10_bio->devs[i].bio = NULL;
1341 r10_bio->devs[i].repl_bio = NULL;
1342
1343 if (!rdev && !rrdev) {
1344 set_bit(R10BIO_Degraded, &r10_bio->state);
1345 continue;
1346 }
1347 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1348 sector_t first_bad;
1349 sector_t dev_sector = r10_bio->devs[i].addr;
1350 int bad_sectors;
1351 int is_bad;
1352
1353 is_bad = is_badblock(rdev, dev_sector,
1354 max_sectors,
1355 &first_bad, &bad_sectors);
1356 if (is_bad < 0) {
1360 atomic_inc(&rdev->nr_pending);
1361 set_bit(BlockedBadBlocks, &rdev->flags);
1362 blocked_rdev = rdev;
1363 break;
1364 }
1365 if (is_bad && first_bad <= dev_sector) {
1366
1367 bad_sectors -= (dev_sector - first_bad);
1368 if (bad_sectors < max_sectors)
1372 max_sectors = bad_sectors;
1381 continue;
1382 }
1383 if (is_bad) {
1384 int good_sectors = first_bad - dev_sector;
1385 if (good_sectors < max_sectors)
1386 max_sectors = good_sectors;
1387 }
1388 }
1389 if (rdev) {
1390 r10_bio->devs[i].bio = bio;
1391 atomic_inc(&rdev->nr_pending);
1392 }
1393 if (rrdev) {
1394 r10_bio->devs[i].repl_bio = bio;
1395 atomic_inc(&rrdev->nr_pending);
1396 }
1397 }
1398 rcu_read_unlock();
1399
1400 if (unlikely(blocked_rdev)) {
1401
1402 int j;
1403 int d;
1404
1405 for (j = 0; j < i; j++) {
1406 if (r10_bio->devs[j].bio) {
1407 d = r10_bio->devs[j].devnum;
1408 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1409 }
1410 if (r10_bio->devs[j].repl_bio) {
1411 struct md_rdev *rdev;
1412 d = r10_bio->devs[j].devnum;
1413 rdev = conf->mirrors[d].replacement;
1414 if (!rdev) {
1415
1416 smp_mb();
1417 rdev = conf->mirrors[d].rdev;
1418 }
1419 rdev_dec_pending(rdev, mddev);
1420 }
1421 }
1422 allow_barrier(conf);
1423 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1424 wait_barrier(conf);
1425 goto retry_write;
1426 }
1427
1428 if (max_sectors < r10_bio->sectors) {
1432 r10_bio->sectors = max_sectors;
1433 spin_lock_irq(&conf->device_lock);
1434 if (bio->bi_phys_segments == 0)
1435 bio->bi_phys_segments = 2;
1436 else
1437 bio->bi_phys_segments++;
1438 spin_unlock_irq(&conf->device_lock);
1439 }
1440 sectors_handled = r10_bio->sector + max_sectors -
1441 bio->bi_iter.bi_sector;
1442
1443 atomic_set(&r10_bio->remaining, 1);
1444 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1445
1446 for (i = 0; i < conf->copies; i++) {
1447 struct bio *mbio;
1448 int d = r10_bio->devs[i].devnum;
1449 if (r10_bio->devs[i].bio) {
1450 struct md_rdev *rdev = conf->mirrors[d].rdev;
1451 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1452 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1453 max_sectors);
1454 r10_bio->devs[i].bio = mbio;
1455
1456 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
1457 choose_data_offset(r10_bio,
1458 rdev));
1459 mbio->bi_bdev = rdev->bdev;
1460 mbio->bi_end_io = raid10_end_write_request;
1461 mbio->bi_rw =
1462 WRITE | do_sync | do_fua | do_discard | do_same;
1463 mbio->bi_private = r10_bio;
1464
1465 atomic_inc(&r10_bio->remaining);
1466
1467 cb = blk_check_plugged(raid10_unplug, mddev,
1468 sizeof(*plug));
1469 if (cb)
1470 plug = container_of(cb, struct raid10_plug_cb,
1471 cb);
1472 else
1473 plug = NULL;
1474 spin_lock_irqsave(&conf->device_lock, flags);
1475 if (plug) {
1476 bio_list_add(&plug->pending, mbio);
1477 plug->pending_cnt++;
1478 } else {
1479 bio_list_add(&conf->pending_bio_list, mbio);
1480 conf->pending_count++;
1481 }
1482 spin_unlock_irqrestore(&conf->device_lock, flags);
1483 if (!plug)
1484 md_wakeup_thread(mddev->thread);
1485 }
1486
1487 if (r10_bio->devs[i].repl_bio) {
1488 struct md_rdev *rdev = conf->mirrors[d].replacement;
1489 if (rdev == NULL) {
1490
1491 smp_mb();
1492 rdev = conf->mirrors[d].rdev;
1493 }
1494 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1495 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1496 max_sectors);
1497 r10_bio->devs[i].repl_bio = mbio;
1498
1499 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
1500 choose_data_offset(
1501 r10_bio, rdev));
1502 mbio->bi_bdev = rdev->bdev;
1503 mbio->bi_end_io = raid10_end_write_request;
1504 mbio->bi_rw =
1505 WRITE | do_sync | do_fua | do_discard | do_same;
1506 mbio->bi_private = r10_bio;
1507
1508 atomic_inc(&r10_bio->remaining);
1509 spin_lock_irqsave(&conf->device_lock, flags);
1510 bio_list_add(&conf->pending_bio_list, mbio);
1511 conf->pending_count++;
1512 spin_unlock_irqrestore(&conf->device_lock, flags);
1513 if (!mddev_check_plugged(mddev))
1514 md_wakeup_thread(mddev->thread);
1515 }
1516 }
1522 if (sectors_handled < bio_sectors(bio)) {
1523 one_write_done(r10_bio);
1527 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1528
1529 r10_bio->master_bio = bio;
1530 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1531
1532 r10_bio->mddev = mddev;
1533 r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1534 r10_bio->state = 0;
1535 goto retry_write;
1536 }
1537 one_write_done(r10_bio);
1538}
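
/*
 * Top-level request entry point: flushes are handed to
 * md_flush_request(), everything else is split so that no piece
 * crosses a chunk boundary (when the layout stripes data) and each
 * piece is passed to __make_request().
 */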
1540static void make_request(struct mddev *mddev, struct bio *bio)
1541{
1542 struct r10conf *conf = mddev->private;
1543 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1544 int chunk_sects = chunk_mask + 1;
1545
1546 struct bio *split;
1547
1548 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1549 md_flush_request(mddev, bio);
1550 return;
1551 }
1552
1553 md_write_start(mddev, bio);
1560 wait_barrier(conf);
1561
1562 do {
1568 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1569 bio_sectors(bio) > chunk_sects
1570 && (conf->geo.near_copies < conf->geo.raid_disks
1571 || conf->prev.near_copies <
1572 conf->prev.raid_disks))) {
1573 split = bio_split(bio, chunk_sects -
1574 (bio->bi_iter.bi_sector &
1575 (chunk_sects - 1)),
1576 GFP_NOIO, fs_bio_set);
1577 bio_chain(split, bio);
1578 } else {
1579 split = bio;
1580 }
1581
1582 __make_request(mddev, split);
1583 } while (split != bio);
1584
1585
1586 wake_up(&conf->wait_barrier);
1587}
1588
1589static void status(struct seq_file *seq, struct mddev *mddev)
1590{
1591 struct r10conf *conf = mddev->private;
1592 int i;
1593
1594 if (conf->geo.near_copies < conf->geo.raid_disks)
1595 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1596 if (conf->geo.near_copies > 1)
1597 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1598 if (conf->geo.far_copies > 1) {
1599 if (conf->geo.far_offset)
1600 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1601 else
1602 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1603 }
1604 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1605 conf->geo.raid_disks - mddev->degraded);
1606 for (i = 0; i < conf->geo.raid_disks; i++)
1607 seq_printf(seq, "%s",
1608 conf->mirrors[i].rdev &&
1609 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1610 seq_printf(seq, "]");
1611}
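
/*
 * Check whether enough devices remain for the array to function when
 * the device in slot 'ignore' is treated as failed: every group of
 * devices that holds copies of the same data must keep at least one
 * In_sync member.
 */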
1618static int _enough(struct r10conf *conf, int previous, int ignore)
1619{
1620 int first = 0;
1621 int has_enough = 0;
1622 int disks, ncopies;
1623 if (previous) {
1624 disks = conf->prev.raid_disks;
1625 ncopies = conf->prev.near_copies;
1626 } else {
1627 disks = conf->geo.raid_disks;
1628 ncopies = conf->geo.near_copies;
1629 }
1630
1631 rcu_read_lock();
1632 do {
1633 int n = conf->copies;
1634 int cnt = 0;
1635 int this = first;
1636 while (n--) {
1637 struct md_rdev *rdev;
1638 if (this != ignore &&
1639 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1640 test_bit(In_sync, &rdev->flags))
1641 cnt++;
1642 this = (this+1) % disks;
1643 }
1644 if (cnt == 0)
1645 goto out;
1646 first = (first + ncopies) % disks;
1647 } while (first != 0);
1648 has_enough = 1;
1649out:
1650 rcu_read_unlock();
1651 return has_enough;
1652}
1653
1654static int enough(struct r10conf *conf, int ignore)
1655{
1661 return _enough(conf, 0, ignore) &&
1662 _enough(conf, 1, ignore);
1663}
1664
1665static void error(struct mddev *mddev, struct md_rdev *rdev)
1666{
1667 char b[BDEVNAME_SIZE];
1668 struct r10conf *conf = mddev->private;
1669 unsigned long flags;
1677 spin_lock_irqsave(&conf->device_lock, flags);
1678 if (test_bit(In_sync, &rdev->flags)
1679 && !enough(conf, rdev->raid_disk)) {
1683 spin_unlock_irqrestore(&conf->device_lock, flags);
1684 return;
1685 }
1686 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1687 mddev->degraded++;
1691 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1692 }
1693 set_bit(Blocked, &rdev->flags);
1694 set_bit(Faulty, &rdev->flags);
1695 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1696 spin_unlock_irqrestore(&conf->device_lock, flags);
1697 printk(KERN_ALERT
1698 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1699 "md/raid10:%s: Operation continuing on %d devices.\n",
1700 mdname(mddev), bdevname(rdev->bdev, b),
1701 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1702}
1703
1704static void print_conf(struct r10conf *conf)
1705{
1706 int i;
1707 struct raid10_info *tmp;
1708
1709 printk(KERN_DEBUG "RAID10 conf printout:\n");
1710 if (!conf) {
1711 printk(KERN_DEBUG "(!conf)\n");
1712 return;
1713 }
1714 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1715 conf->geo.raid_disks);
1716
1717 for (i = 0; i < conf->geo.raid_disks; i++) {
1718 char b[BDEVNAME_SIZE];
1719 tmp = conf->mirrors + i;
1720 if (tmp->rdev)
1721 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1722 i, !test_bit(In_sync, &tmp->rdev->flags),
1723 !test_bit(Faulty, &tmp->rdev->flags),
1724 bdevname(tmp->rdev->bdev,b));
1725 }
1726}
1727
1728static void close_sync(struct r10conf *conf)
1729{
1730 wait_barrier(conf);
1731 allow_barrier(conf);
1732
1733 mempool_destroy(conf->r10buf_pool);
1734 conf->r10buf_pool = NULL;
1735}
1736
1737static int raid10_spare_active(struct mddev *mddev)
1738{
1739 int i;
1740 struct r10conf *conf = mddev->private;
1741 struct raid10_info *tmp;
1742 int count = 0;
1743 unsigned long flags;
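
	/*
	 * Count devices that have completed recovery, or replacements
	 * that have taken over, mark them In_sync and reduce
	 * mddev->degraded by the number of newly active devices.
	 */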
1749 for (i = 0; i < conf->geo.raid_disks; i++) {
1750 tmp = conf->mirrors + i;
1751 if (tmp->replacement
1752 && tmp->replacement->recovery_offset == MaxSector
1753 && !test_bit(Faulty, &tmp->replacement->flags)
1754 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1755
1756 if (!tmp->rdev
1757 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1758 count++;
1759 if (tmp->rdev) {
1764 set_bit(Faulty, &tmp->rdev->flags);
1765 sysfs_notify_dirent_safe(
1766 tmp->rdev->sysfs_state);
1767 }
1768 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1769 } else if (tmp->rdev
1770 && tmp->rdev->recovery_offset == MaxSector
1771 && !test_bit(Faulty, &tmp->rdev->flags)
1772 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1773 count++;
1774 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1775 }
1776 }
1777 spin_lock_irqsave(&conf->device_lock, flags);
1778 mddev->degraded -= count;
1779 spin_unlock_irqrestore(&conf->device_lock, flags);
1780
1781 print_conf(conf);
1782 return count;
1783}
1784
1785
1786static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1787{
1788 struct r10conf *conf = mddev->private;
1789 int err = -EEXIST;
1790 int mirror;
1791 int first = 0;
1792 int last = conf->geo.raid_disks - 1;
1793 struct request_queue *q = bdev_get_queue(rdev->bdev);
1794
1795 if (mddev->recovery_cp < MaxSector)
1799 return -EBUSY;
1800 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1801 return -EINVAL;
1802
1803 if (rdev->raid_disk >= 0)
1804 first = last = rdev->raid_disk;
1805
1806 if (q->merge_bvec_fn) {
1807 set_bit(Unmerged, &rdev->flags);
1808 mddev->merge_check_needed = 1;
1809 }
1810
1811 if (rdev->saved_raid_disk >= first &&
1812 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1813 mirror = rdev->saved_raid_disk;
1814 else
1815 mirror = first;
1816 for ( ; mirror <= last ; mirror++) {
1817 struct raid10_info *p = &conf->mirrors[mirror];
1818 if (p->recovery_disabled == mddev->recovery_disabled)
1819 continue;
1820 if (p->rdev) {
1821 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1822 p->replacement != NULL)
1823 continue;
1824 clear_bit(In_sync, &rdev->flags);
1825 set_bit(Replacement, &rdev->flags);
1826 rdev->raid_disk = mirror;
1827 err = 0;
1828 if (mddev->gendisk)
1829 disk_stack_limits(mddev->gendisk, rdev->bdev,
1830 rdev->data_offset << 9);
1831 conf->fullsync = 1;
1832 rcu_assign_pointer(p->replacement, rdev);
1833 break;
1834 }
1835
1836 if (mddev->gendisk)
1837 disk_stack_limits(mddev->gendisk, rdev->bdev,
1838 rdev->data_offset << 9);
1839
1840 p->head_position = 0;
1841 p->recovery_disabled = mddev->recovery_disabled - 1;
1842 rdev->raid_disk = mirror;
1843 err = 0;
1844 if (rdev->saved_raid_disk != mirror)
1845 conf->fullsync = 1;
1846 rcu_assign_pointer(p->rdev, rdev);
1847 break;
1848 }
1849 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1857 synchronize_sched();
1858 freeze_array(conf, 0);
1859 unfreeze_array(conf);
1860 clear_bit(Unmerged, &rdev->flags);
1861 }
1862 md_integrity_add_rdev(rdev, mddev);
1863 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1864 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1865
1866 print_conf(conf);
1867 return err;
1868}
1869
1870static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1871{
1872 struct r10conf *conf = mddev->private;
1873 int err = 0;
1874 int number = rdev->raid_disk;
1875 struct md_rdev **rdevp;
1876 struct raid10_info *p = conf->mirrors + number;
1877
1878 print_conf(conf);
1879 if (rdev == p->rdev)
1880 rdevp = &p->rdev;
1881 else if (rdev == p->replacement)
1882 rdevp = &p->replacement;
1883 else
1884 return 0;
1885
1886 if (test_bit(In_sync, &rdev->flags) ||
1887 atomic_read(&rdev->nr_pending)) {
1888 err = -EBUSY;
1889 goto abort;
1890 }
1894 if (!test_bit(Faulty, &rdev->flags) &&
1895 mddev->recovery_disabled != p->recovery_disabled &&
1896 (!p->replacement || p->replacement == rdev) &&
1897 number < conf->geo.raid_disks &&
1898 enough(conf, -1)) {
1899 err = -EBUSY;
1900 goto abort;
1901 }
1902 *rdevp = NULL;
1903 synchronize_rcu();
1904 if (atomic_read(&rdev->nr_pending)) {
1905
1906 err = -EBUSY;
1907 *rdevp = rdev;
1908 goto abort;
1909 } else if (p->replacement) {
1910
1911 p->rdev = p->replacement;
1912 clear_bit(Replacement, &p->replacement->flags);
1913 smp_mb();
1914
1915
1916 p->replacement = NULL;
1917 clear_bit(WantReplacement, &rdev->flags);
1918 } else
1922 clear_bit(WantReplacement, &rdev->flags);
1923
1924 err = md_integrity_register(mddev);
1925
1926abort:
1927
1928 print_conf(conf);
1929 return err;
1930}
1931
1932
1933static void end_sync_read(struct bio *bio, int error)
1934{
1935 struct r10bio *r10_bio = bio->bi_private;
1936 struct r10conf *conf = r10_bio->mddev->private;
1937 int d;
1938
1939 if (bio == r10_bio->master_bio) {
1940
1941 d = r10_bio->read_slot;
1942 } else
1943 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1944
1945 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1946 set_bit(R10BIO_Uptodate, &r10_bio->state);
1947 else
1951 atomic_add(r10_bio->sectors,
1952 &conf->mirrors[d].rdev->corrected_errors);
1957 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1958 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1959 atomic_dec_and_test(&r10_bio->remaining)) {
1963 reschedule_retry(r10_bio);
1964 }
1965}
1966
1967static void end_sync_request(struct r10bio *r10_bio)
1968{
1969 struct mddev *mddev = r10_bio->mddev;
1970
1971 while (atomic_dec_and_test(&r10_bio->remaining)) {
1972 if (r10_bio->master_bio == NULL) {
1973
1974 sector_t s = r10_bio->sectors;
1975 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1976 test_bit(R10BIO_WriteError, &r10_bio->state))
1977 reschedule_retry(r10_bio);
1978 else
1979 put_buf(r10_bio);
1980 md_done_sync(mddev, s, 1);
1981 break;
1982 } else {
1983 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1984 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1985 test_bit(R10BIO_WriteError, &r10_bio->state))
1986 reschedule_retry(r10_bio);
1987 else
1988 put_buf(r10_bio);
1989 r10_bio = r10_bio2;
1990 }
1991 }
1992}
1993
1994static void end_sync_write(struct bio *bio, int error)
1995{
1996 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1997 struct r10bio *r10_bio = bio->bi_private;
1998 struct mddev *mddev = r10_bio->mddev;
1999 struct r10conf *conf = mddev->private;
2000 int d;
2001 sector_t first_bad;
2002 int bad_sectors;
2003 int slot;
2004 int repl;
2005 struct md_rdev *rdev = NULL;
2006
2007 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
2008 if (repl)
2009 rdev = conf->mirrors[d].replacement;
2010 else
2011 rdev = conf->mirrors[d].rdev;
2012
2013 if (!uptodate) {
2014 if (repl)
2015 md_error(mddev, rdev);
2016 else {
2017 set_bit(WriteErrorSeen, &rdev->flags);
2018 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2019 set_bit(MD_RECOVERY_NEEDED,
2020 &rdev->mddev->recovery);
2021 set_bit(R10BIO_WriteError, &r10_bio->state);
2022 }
2023 } else if (is_badblock(rdev,
2024 r10_bio->devs[slot].addr,
2025 r10_bio->sectors,
2026 &first_bad, &bad_sectors))
2027 set_bit(R10BIO_MadeGood, &r10_bio->state);
2028
2029 rdev_dec_pending(rdev, mddev);
2030
2031 end_sync_request(r10_bio);
2032}
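
/*
 * Called from raid10d once all reads of a resync r10_bio have
 * completed: use the first up-to-date copy as the reference, compare
 * the other copies with it and (unless this is a read-only 'check')
 * rewrite any copy that differs or failed to read, including the
 * active replacement devices.
 */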
2050static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2051{
2052 struct r10conf *conf = mddev->private;
2053 int i, first;
2054 struct bio *tbio, *fbio;
2055 int vcnt;
2056
2057 atomic_set(&r10_bio->remaining, 1);
2058
2059
2060 for (i=0; i<conf->copies; i++)
2061 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
2062 break;
2063
2064 if (i == conf->copies)
2065 goto done;
2066
2067 first = i;
2068 fbio = r10_bio->devs[i].bio;
2069
2070 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2071
2072 for (i=0 ; i < conf->copies ; i++) {
2073 int j, d;
2074
2075 tbio = r10_bio->devs[i].bio;
2076
2077 if (tbio->bi_end_io != end_sync_read)
2078 continue;
2079 if (i == first)
2080 continue;
2081 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
2086 int sectors = r10_bio->sectors;
2087 for (j = 0; j < vcnt; j++) {
2088 int len = PAGE_SIZE;
2089 if (sectors < (len / 512))
2090 len = sectors * 512;
2091 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2092 page_address(tbio->bi_io_vec[j].bv_page),
2093 len))
2094 break;
2095 sectors -= len/512;
2096 }
2097 if (j == vcnt)
2098 continue;
2099 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2100 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2101
2102 continue;
2103 }
2109 bio_reset(tbio);
2110
2111 tbio->bi_vcnt = vcnt;
2112 tbio->bi_iter.bi_size = r10_bio->sectors << 9;
2113 tbio->bi_rw = WRITE;
2114 tbio->bi_private = r10_bio;
2115 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2116
2117 for (j=0; j < vcnt ; j++) {
2118 tbio->bi_io_vec[j].bv_offset = 0;
2119 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
2120
2121 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2122 page_address(fbio->bi_io_vec[j].bv_page),
2123 PAGE_SIZE);
2124 }
2125 tbio->bi_end_io = end_sync_write;
2126
2127 d = r10_bio->devs[i].devnum;
2128 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2129 atomic_inc(&r10_bio->remaining);
2130 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2131
2132 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2133 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2134 generic_make_request(tbio);
2135 }
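
	/* Now write out to any replacement devices that are active. */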
2140 for (i = 0; i < conf->copies; i++) {
2141 int j, d;
2142
2143 tbio = r10_bio->devs[i].repl_bio;
2144 if (!tbio || !tbio->bi_end_io)
2145 continue;
2146 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2147 && r10_bio->devs[i].bio != fbio)
2148 for (j = 0; j < vcnt; j++)
2149 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2150 page_address(fbio->bi_io_vec[j].bv_page),
2151 PAGE_SIZE);
2152 d = r10_bio->devs[i].devnum;
2153 atomic_inc(&r10_bio->remaining);
2154 md_sync_acct(conf->mirrors[d].replacement->bdev,
2155 bio_sectors(tbio));
2156 generic_make_request(tbio);
2157 }
2158
2159done:
2160 if (atomic_dec_and_test(&r10_bio->remaining)) {
2161 md_done_sync(mddev, r10_bio->sectors, 1);
2162 put_buf(r10_bio);
2163 }
2164}
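
/*
 * Recovery hit a read error on the one device we can read from.
 * Retry the range in page-sized pieces; anything that still cannot be
 * read (or written to the device being rebuilt) is recorded as a bad
 * block, and if the bad block cannot be recorded the recovery of this
 * device is aborted.
 */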
2176static void fix_recovery_read_error(struct r10bio *r10_bio)
2177{
2185 struct mddev *mddev = r10_bio->mddev;
2186 struct r10conf *conf = mddev->private;
2187 struct bio *bio = r10_bio->devs[0].bio;
2188 sector_t sect = 0;
2189 int sectors = r10_bio->sectors;
2190 int idx = 0;
2191 int dr = r10_bio->devs[0].devnum;
2192 int dw = r10_bio->devs[1].devnum;
2193
2194 while (sectors) {
2195 int s = sectors;
2196 struct md_rdev *rdev;
2197 sector_t addr;
2198 int ok;
2199
2200 if (s > (PAGE_SIZE>>9))
2201 s = PAGE_SIZE >> 9;
2202
2203 rdev = conf->mirrors[dr].rdev;
2204 addr = r10_bio->devs[0].addr + sect,
2205 ok = sync_page_io(rdev,
2206 addr,
2207 s << 9,
2208 bio->bi_io_vec[idx].bv_page,
2209 READ, false);
2210 if (ok) {
2211 rdev = conf->mirrors[dw].rdev;
2212 addr = r10_bio->devs[1].addr + sect;
2213 ok = sync_page_io(rdev,
2214 addr,
2215 s << 9,
2216 bio->bi_io_vec[idx].bv_page,
2217 WRITE, false);
2218 if (!ok) {
2219 set_bit(WriteErrorSeen, &rdev->flags);
2220 if (!test_and_set_bit(WantReplacement,
2221 &rdev->flags))
2222 set_bit(MD_RECOVERY_NEEDED,
2223 &rdev->mddev->recovery);
2224 }
2225 }
2226 if (!ok) {
2231 rdev_set_badblocks(rdev, addr, s, 0);
2232
2233 if (rdev != conf->mirrors[dw].rdev) {
2234
2235 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2236 addr = r10_bio->devs[1].addr + sect;
2237 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2238 if (!ok) {
2239
2240 printk(KERN_NOTICE
2241 "md/raid10:%s: recovery aborted"
2242 " due to read error\n",
2243 mdname(mddev));
2244
2245 conf->mirrors[dw].recovery_disabled
2246 = mddev->recovery_disabled;
2247 set_bit(MD_RECOVERY_INTR,
2248 &mddev->recovery);
2249 break;
2250 }
2251 }
2252 }
2253
2254 sectors -= s;
2255 sect += s;
2256 idx++;
2257 }
2258}
2259
2260static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2261{
2262 struct r10conf *conf = mddev->private;
2263 int d;
2264 struct bio *wbio, *wbio2;
2265
2266 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2267 fix_recovery_read_error(r10_bio);
2268 end_sync_request(r10_bio);
2269 return;
2270 }
2276 d = r10_bio->devs[1].devnum;
2277 wbio = r10_bio->devs[1].bio;
2278 wbio2 = r10_bio->devs[1].repl_bio;
2283 if (wbio2 && !wbio2->bi_end_io)
2284 wbio2 = NULL;
2285 if (wbio->bi_end_io) {
2286 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2287 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2288 generic_make_request(wbio);
2289 }
2290 if (wbio2) {
2291 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2292 md_sync_acct(conf->mirrors[d].replacement->bdev,
2293 bio_sectors(wbio2));
2294 generic_make_request(wbio2);
2295 }
2296}
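
/*
 * Decay the accumulated read error count of an rdev: the count is
 * halved for every hour since the last error, so that old errors on a
 * long-running array do not eventually push it over the
 * max_corr_read_errors limit.
 */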
2305static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2306{
2307 struct timespec cur_time_mon;
2308 unsigned long hours_since_last;
2309 unsigned int read_errors = atomic_read(&rdev->read_errors);
2310
2311 ktime_get_ts(&cur_time_mon);
2312
2313 if (rdev->last_read_error.tv_sec == 0 &&
2314 rdev->last_read_error.tv_nsec == 0) {
2315
2316 rdev->last_read_error = cur_time_mon;
2317 return;
2318 }
2319
2320 hours_since_last = (cur_time_mon.tv_sec -
2321 rdev->last_read_error.tv_sec) / 3600;
2322
2323 rdev->last_read_error = cur_time_mon;
2330 if (hours_since_last >= 8 * sizeof(read_errors))
2331 atomic_set(&rdev->read_errors, 0);
2332 else
2333 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2334}
2335
2336static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2337 int sectors, struct page *page, int rw)
2338{
2339 sector_t first_bad;
2340 int bad_sectors;
2341
2342 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2343 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2344 return -1;
2345 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2346
2347 return 1;
2348 if (rw == WRITE) {
2349 set_bit(WriteErrorSeen, &rdev->flags);
2350 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2351 set_bit(MD_RECOVERY_NEEDED,
2352 &rdev->mddev->recovery);
2353 }
2354
2355 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2356 md_error(rdev->mddev, rdev);
2357 return 0;
2358}
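
/*
 * Called from raid10d in process context after a read error: try to
 * read each failing page-sized range from another In_sync mirror,
 * write the good data back to the device that failed and re-read it
 * to verify.  Devices whose error count exceeds max_corr_read_errors
 * are failed outright; ranges that cannot be rewritten or re-read are
 * recorded as bad blocks (failing the device if that is not possible).
 */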
2368static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2369{
2370 int sect = 0;
2371 int sectors = r10_bio->sectors;
2372 struct md_rdev*rdev;
2373 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2374 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2379 rdev = conf->mirrors[d].rdev;
2380
2381 if (test_bit(Faulty, &rdev->flags))
2382
2383
2384 return;
2385
2386 check_decay_read_errors(mddev, rdev);
2387 atomic_inc(&rdev->read_errors);
2388 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2389 char b[BDEVNAME_SIZE];
2390 bdevname(rdev->bdev, b);
2391
2392 printk(KERN_NOTICE
2393 "md/raid10:%s: %s: Raid device exceeded "
2394 "read_error threshold [cur %d:max %d]\n",
2395 mdname(mddev), b,
2396 atomic_read(&rdev->read_errors), max_read_errors);
2397 printk(KERN_NOTICE
2398 "md/raid10:%s: %s: Failing raid device\n",
2399 mdname(mddev), b);
2400 md_error(mddev, conf->mirrors[d].rdev);
2401 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2402 return;
2403 }
2404
2405 while(sectors) {
2406 int s = sectors;
2407 int sl = r10_bio->read_slot;
2408 int success = 0;
2409 int start;
2410
2411 if (s > (PAGE_SIZE>>9))
2412 s = PAGE_SIZE >> 9;
2413
2414 rcu_read_lock();
2415 do {
2416 sector_t first_bad;
2417 int bad_sectors;
2418
2419 d = r10_bio->devs[sl].devnum;
2420 rdev = rcu_dereference(conf->mirrors[d].rdev);
2421 if (rdev &&
2422 !test_bit(Unmerged, &rdev->flags) &&
2423 test_bit(In_sync, &rdev->flags) &&
2424 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2425 &first_bad, &bad_sectors) == 0) {
2426 atomic_inc(&rdev->nr_pending);
2427 rcu_read_unlock();
2428 success = sync_page_io(rdev,
2429 r10_bio->devs[sl].addr +
2430 sect,
2431 s<<9,
2432 conf->tmppage, READ, false);
2433 rdev_dec_pending(rdev, mddev);
2434 rcu_read_lock();
2435 if (success)
2436 break;
2437 }
2438 sl++;
2439 if (sl == conf->copies)
2440 sl = 0;
2441 } while (!success && sl != r10_bio->read_slot);
2442 rcu_read_unlock();
2443
2444 if (!success) {
2449 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2450 rdev = conf->mirrors[dn].rdev;
2451
2452 if (!rdev_set_badblocks(
2453 rdev,
2454 r10_bio->devs[r10_bio->read_slot].addr
2455 + sect,
2456 s, 0)) {
2457 md_error(mddev, rdev);
2458 r10_bio->devs[r10_bio->read_slot].bio
2459 = IO_BLOCKED;
2460 }
2461 break;
2462 }
2463
2464 start = sl;
2465
2466 rcu_read_lock();
2467 while (sl != r10_bio->read_slot) {
2468 char b[BDEVNAME_SIZE];
2469
2470 if (sl==0)
2471 sl = conf->copies;
2472 sl--;
2473 d = r10_bio->devs[sl].devnum;
2474 rdev = rcu_dereference(conf->mirrors[d].rdev);
2475 if (!rdev ||
2476 test_bit(Unmerged, &rdev->flags) ||
2477 !test_bit(In_sync, &rdev->flags))
2478 continue;
2479
2480 atomic_inc(&rdev->nr_pending);
2481 rcu_read_unlock();
2482 if (r10_sync_page_io(rdev,
2483 r10_bio->devs[sl].addr +
2484 sect,
2485 s, conf->tmppage, WRITE)
2486 == 0) {
2487
2488 printk(KERN_NOTICE
2489 "md/raid10:%s: read correction "
2490 "write failed"
2491 " (%d sectors at %llu on %s)\n",
2492 mdname(mddev), s,
2493 (unsigned long long)(
2494 sect +
2495 choose_data_offset(r10_bio,
2496 rdev)),
2497 bdevname(rdev->bdev, b));
2498 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2499 "drive\n",
2500 mdname(mddev),
2501 bdevname(rdev->bdev, b));
2502 }
2503 rdev_dec_pending(rdev, mddev);
2504 rcu_read_lock();
2505 }
2506 sl = start;
2507 while (sl != r10_bio->read_slot) {
2508 char b[BDEVNAME_SIZE];
2509
2510 if (sl==0)
2511 sl = conf->copies;
2512 sl--;
2513 d = r10_bio->devs[sl].devnum;
2514 rdev = rcu_dereference(conf->mirrors[d].rdev);
2515 if (!rdev ||
2516 !test_bit(In_sync, &rdev->flags))
2517 continue;
2518
2519 atomic_inc(&rdev->nr_pending);
2520 rcu_read_unlock();
2521 switch (r10_sync_page_io(rdev,
2522 r10_bio->devs[sl].addr +
2523 sect,
2524 s, conf->tmppage,
2525 READ)) {
2526 case 0:
2527
2528 printk(KERN_NOTICE
2529 "md/raid10:%s: unable to read back "
2530 "corrected sectors"
2531 " (%d sectors at %llu on %s)\n",
2532 mdname(mddev), s,
2533 (unsigned long long)(
2534 sect +
2535 choose_data_offset(r10_bio, rdev)),
2536 bdevname(rdev->bdev, b));
2537 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2538 "drive\n",
2539 mdname(mddev),
2540 bdevname(rdev->bdev, b));
2541 break;
2542 case 1:
2543 printk(KERN_INFO
2544 "md/raid10:%s: read error corrected"
2545 " (%d sectors at %llu on %s)\n",
2546 mdname(mddev), s,
2547 (unsigned long long)(
2548 sect +
2549 choose_data_offset(r10_bio, rdev)),
2550 bdevname(rdev->bdev, b));
2551 atomic_add(s, &rdev->corrected_errors);
2552 }
2553
2554 rdev_dec_pending(rdev, mddev);
2555 rcu_read_lock();
2556 }
2557 rcu_read_unlock();
2558
2559 sectors -= s;
2560 sect += s;
2561 }
2562}
2563
2564static int narrow_write_error(struct r10bio *r10_bio, int i)
2565{
2566 struct bio *bio = r10_bio->master_bio;
2567 struct mddev *mddev = r10_bio->mddev;
2568 struct r10conf *conf = mddev->private;
2569 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
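	/* bio is the master bio that contains the data to be written to
	 * slot 'i' where we just had a write error.
	 * We repeatedly clone the bio and trim down to one block, then
	 * try the write.  Where the write fails we record a bad block.
	 * It is conceivable that the bio doesn't exactly align with
	 * blocks; we must handle this.
	 *
	 * We currently own a reference on the rdev.
	 */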
2581 int block_sectors;
2582 sector_t sector;
2583 int sectors;
2584 int sect_to_write = r10_bio->sectors;
2585 int ok = 1;
2586
2587 if (rdev->badblocks.shift < 0)
2588 return 0;
2589
2590 block_sectors = 1 << rdev->badblocks.shift;
2591 sector = r10_bio->sector;
2592 sectors = ((r10_bio->sector + block_sectors)
2593 & ~(sector_t)(block_sectors - 1))
2594 - sector;
2595
2596 while (sect_to_write) {
2597 struct bio *wbio;
2598 if (sectors > sect_to_write)
2599 sectors = sect_to_write;
2600
2601 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2602 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2603 wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
2604 choose_data_offset(r10_bio, rdev) +
2605 (sector - r10_bio->sector));
2606 wbio->bi_bdev = rdev->bdev;
2607 if (submit_bio_wait(WRITE, wbio) == 0)
2608
2609 ok = rdev_set_badblocks(rdev, sector,
2610 sectors, 0)
2611 && ok;
2612
2613 bio_put(wbio);
2614 sect_to_write -= sectors;
2615 sector += sectors;
2616 sectors = block_sectors;
2617 }
2618 return ok;
2619}
2620
2621static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2622{
2623 int slot = r10_bio->read_slot;
2624 struct bio *bio;
2625 struct r10conf *conf = mddev->private;
2626 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2627 char b[BDEVNAME_SIZE];
2628 unsigned long do_sync;
2629 int max_sectors;
2630
2631
2632
2633
2634
2635
2636
2637
2638
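	/* We got a read error.  Maybe the drive is bad, maybe just this
	 * block and we can fix it.
	 * We freeze all other IO, and try reading the block from other
	 * devices.  When we find one, we re-write and check whether that
	 * fixes the read error.
	 * This is all done synchronously while the array is frozen.
	 */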
2639 bio = r10_bio->devs[slot].bio;
2640 bdevname(bio->bi_bdev, b);
2641 bio_put(bio);
2642 r10_bio->devs[slot].bio = NULL;
2643
2644 if (mddev->ro == 0) {
2645 freeze_array(conf, 1);
2646 fix_read_error(conf, mddev, r10_bio);
2647 unfreeze_array(conf);
2648 } else
2649 r10_bio->devs[slot].bio = IO_BLOCKED;
2650
2651 rdev_dec_pending(rdev, mddev);
2652
2653read_more:
2654 rdev = read_balance(conf, r10_bio, &max_sectors);
2655 if (rdev == NULL) {
2656 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2657 " read error for block %llu\n",
2658 mdname(mddev), b,
2659 (unsigned long long)r10_bio->sector);
2660 raid_end_bio_io(r10_bio);
2661 return;
2662 }
2663
2664 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2665 slot = r10_bio->read_slot;
2666 printk_ratelimited(
2667 KERN_ERR
2668 "md/raid10:%s: %s: redirecting "
2669 "sector %llu to another mirror\n",
2670 mdname(mddev),
2671 bdevname(rdev->bdev, b),
2672 (unsigned long long)r10_bio->sector);
2673 bio = bio_clone_mddev(r10_bio->master_bio,
2674 GFP_NOIO, mddev);
2675 bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
2676 r10_bio->devs[slot].bio = bio;
2677 r10_bio->devs[slot].rdev = rdev;
2678 bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
2679 + choose_data_offset(r10_bio, rdev);
2680 bio->bi_bdev = rdev->bdev;
2681 bio->bi_rw = READ | do_sync;
2682 bio->bi_private = r10_bio;
2683 bio->bi_end_io = raid10_end_read_request;
2684 if (max_sectors < r10_bio->sectors) {
2685
2686 struct bio *mbio = r10_bio->master_bio;
2687 int sectors_handled =
2688 r10_bio->sector + max_sectors
2689 - mbio->bi_iter.bi_sector;
2690 r10_bio->sectors = max_sectors;
2691 spin_lock_irq(&conf->device_lock);
2692 if (mbio->bi_phys_segments == 0)
2693 mbio->bi_phys_segments = 2;
2694 else
2695 mbio->bi_phys_segments++;
2696 spin_unlock_irq(&conf->device_lock);
2697 generic_make_request(bio);
2698
2699 r10_bio = mempool_alloc(conf->r10bio_pool,
2700 GFP_NOIO);
2701 r10_bio->master_bio = mbio;
2702 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2703 r10_bio->state = 0;
2704 set_bit(R10BIO_ReadError,
2705 &r10_bio->state);
2706 r10_bio->mddev = mddev;
2707 r10_bio->sector = mbio->bi_iter.bi_sector
2708 + sectors_handled;
2709
2710 goto read_more;
2711 } else
2712 generic_make_request(bio);
2713}
2714
2715static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2716{
2717
2718
2719
2720
2721
2722
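	/* Some sort of write request has finished and it
	 * succeeded in writing where we thought there was a
	 * bad block.  So forget the bad block.
	 * Or possibly it failed and we need to record
	 * a bad block.
	 */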
2723 int m;
2724 struct md_rdev *rdev;
2725
2726 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2727 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2728 for (m = 0; m < conf->copies; m++) {
2729 int dev = r10_bio->devs[m].devnum;
2730 rdev = conf->mirrors[dev].rdev;
2731 if (r10_bio->devs[m].bio == NULL)
2732 continue;
2733 if (test_bit(BIO_UPTODATE,
2734 &r10_bio->devs[m].bio->bi_flags)) {
2735 rdev_clear_badblocks(
2736 rdev,
2737 r10_bio->devs[m].addr,
2738 r10_bio->sectors, 0);
2739 } else {
2740 if (!rdev_set_badblocks(
2741 rdev,
2742 r10_bio->devs[m].addr,
2743 r10_bio->sectors, 0))
2744 md_error(conf->mddev, rdev);
2745 }
2746 rdev = conf->mirrors[dev].replacement;
2747 if (r10_bio->devs[m].repl_bio == NULL)
2748 continue;
2749 if (test_bit(BIO_UPTODATE,
2750 &r10_bio->devs[m].repl_bio->bi_flags)) {
2751 rdev_clear_badblocks(
2752 rdev,
2753 r10_bio->devs[m].addr,
2754 r10_bio->sectors, 0);
2755 } else {
2756 if (!rdev_set_badblocks(
2757 rdev,
2758 r10_bio->devs[m].addr,
2759 r10_bio->sectors, 0))
2760 md_error(conf->mddev, rdev);
2761 }
2762 }
2763 put_buf(r10_bio);
2764 } else {
2765 for (m = 0; m < conf->copies; m++) {
2766 int dev = r10_bio->devs[m].devnum;
2767 struct bio *bio = r10_bio->devs[m].bio;
2768 rdev = conf->mirrors[dev].rdev;
2769 if (bio == IO_MADE_GOOD) {
2770 rdev_clear_badblocks(
2771 rdev,
2772 r10_bio->devs[m].addr,
2773 r10_bio->sectors, 0);
2774 rdev_dec_pending(rdev, conf->mddev);
2775 } else if (bio != NULL &&
2776 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2777 if (!narrow_write_error(r10_bio, m)) {
2778 md_error(conf->mddev, rdev);
2779 set_bit(R10BIO_Degraded,
2780 &r10_bio->state);
2781 }
2782 rdev_dec_pending(rdev, conf->mddev);
2783 }
2784 bio = r10_bio->devs[m].repl_bio;
2785 rdev = conf->mirrors[dev].replacement;
2786 if (rdev && bio == IO_MADE_GOOD) {
2787 rdev_clear_badblocks(
2788 rdev,
2789 r10_bio->devs[m].addr,
2790 r10_bio->sectors, 0);
2791 rdev_dec_pending(rdev, conf->mddev);
2792 }
2793 }
2794 if (test_bit(R10BIO_WriteError,
2795 &r10_bio->state))
2796 close_write(r10_bio);
2797 raid_end_bio_io(r10_bio);
2798 }
2799}
2800
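/*
 * raid10d is the per-array management thread: it flushes any pending
 * (plugged) writes and then services the r10bios queued on
 * conf->retry_list - read errors, writes that need bad-block handling,
 * and sync/recovery/reshape follow-up writes.
 */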
2801static void raid10d(struct md_thread *thread)
2802{
2803 struct mddev *mddev = thread->mddev;
2804 struct r10bio *r10_bio;
2805 unsigned long flags;
2806 struct r10conf *conf = mddev->private;
2807 struct list_head *head = &conf->retry_list;
2808 struct blk_plug plug;
2809
2810 md_check_recovery(mddev);
2811
2812 blk_start_plug(&plug);
2813 for (;;) {
2814
2815 flush_pending_writes(conf);
2816
2817 spin_lock_irqsave(&conf->device_lock, flags);
2818 if (list_empty(head)) {
2819 spin_unlock_irqrestore(&conf->device_lock, flags);
2820 break;
2821 }
2822 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2823 list_del(head->prev);
2824 conf->nr_queued--;
2825 spin_unlock_irqrestore(&conf->device_lock, flags);
2826
2827 mddev = r10_bio->mddev;
2828 conf = mddev->private;
2829 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2830 test_bit(R10BIO_WriteError, &r10_bio->state))
2831 handle_write_completed(conf, r10_bio);
2832 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2833 reshape_request_write(mddev, r10_bio);
2834 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2835 sync_request_write(mddev, r10_bio);
2836 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2837 recovery_request_write(mddev, r10_bio);
2838 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2839 handle_read_error(mddev, r10_bio);
2840 else {
2841 /* just a partial read to be scheduled from a
2842  * separate context
2843  */
2844 int slot = r10_bio->read_slot;
2845 generic_make_request(r10_bio->devs[slot].bio);
2846 }
2847
2848 cond_resched();
2849 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2850 md_check_recovery(mddev);
2851 }
2852 blk_finish_plug(&plug);
2853}
2854
2855
2856static int init_resync(struct r10conf *conf)
2857{
2858 int buffs;
2859 int i;
2860
2861 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2862 BUG_ON(conf->r10buf_pool);
2863 conf->have_replacement = 0;
2864 for (i = 0; i < conf->geo.raid_disks; i++)
2865 if (conf->mirrors[i].replacement)
2866 conf->have_replacement = 1;
2867 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2868 if (!conf->r10buf_pool)
2869 return -ENOMEM;
2870 conf->next_resync = 0;
2871 return 0;
2872}
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
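/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.  This is achieved by
 * tracking pending requests and a 'barrier' concept that can be
 * installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently; we differentiate by
 * looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies, and
 * update if there are differences.  If only one copy is live, skip it.
 * For recovery, we iterate over physical addresses, read a good value
 * for each non-in_sync drive, and over-write.
 *
 * For recovery there may be several outstanding requests for a given
 * address, one for each out-of-sync device, so several r10_bio
 * structures are allocated and linked through a borrowed master_bio
 * pointer; ->remaining counts these links, and the whole operation is
 * complete when the r10_bio whose master_bio is NULL reaches zero.
 */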
2906static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2907 int *skipped, int go_faster)
2908{
2909 struct r10conf *conf = mddev->private;
2910 struct r10bio *r10_bio;
2911 struct bio *biolist = NULL, *bio;
2912 sector_t max_sector, nr_sectors;
2913 int i;
2914 int max_sync;
2915 sector_t sync_blocks;
2916 sector_t sectors_skipped = 0;
2917 int chunks_skipped = 0;
2918 sector_t chunk_mask = conf->geo.chunk_mask;
2919
2920 if (!conf->r10buf_pool)
2921 if (init_resync(conf))
2922 return 0;
2923
2924
2925
2926
2927
2928 if (mddev->bitmap == NULL &&
2929 mddev->recovery_cp == MaxSector &&
2930 mddev->reshape_position == MaxSector &&
2931 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2932 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2933 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2934 conf->fullsync == 0) {
2935 *skipped = 1;
2936 return mddev->dev_sectors - sector_nr;
2937 }
2938
2939 skipped:
2940 max_sector = mddev->dev_sectors;
2941 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2942 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2943 max_sector = mddev->resync_max_sectors;
2944 if (sector_nr >= max_sector) {
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2955 end_reshape(conf);
2956 return 0;
2957 }
2958
2959 if (mddev->curr_resync < max_sector) {
2960 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2961 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2962 &sync_blocks, 1);
2963 else for (i = 0; i < conf->geo.raid_disks; i++) {
2964 sector_t sect =
2965 raid10_find_virt(conf, mddev->curr_resync, i);
2966 bitmap_end_sync(mddev->bitmap, sect,
2967 &sync_blocks, 1);
2968 }
2969 } else {
2970 /* completed sync */
2971 if ((!mddev->bitmap || conf->fullsync)
2972 && conf->have_replacement
2973 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2974
2975
2976
2977 for (i = 0; i < conf->geo.raid_disks; i++)
2978 if (conf->mirrors[i].replacement)
2979 conf->mirrors[i].replacement
2980 ->recovery_offset
2981 = MaxSector;
2982 }
2983 conf->fullsync = 0;
2984 }
2985 bitmap_close_sync(mddev->bitmap);
2986 close_sync(conf);
2987 *skipped = 1;
2988 return sectors_skipped;
2989 }
2990
2991 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2992 return reshape_request(mddev, sector_nr, skipped);
2993
2994 if (chunks_skipped >= conf->geo.raid_disks) {
2995
2996
2997
2998 *skipped = 1;
2999 return (max_sector - sector_nr) + sectors_skipped;
3000 }
3001
3002 if (max_sector > mddev->resync_max)
3003 max_sector = mddev->resync_max;
3004
3005
3006
3007
3008 if (conf->geo.near_copies < conf->geo.raid_disks &&
3009 max_sector > (sector_nr | chunk_mask))
3010 max_sector = (sector_nr | chunk_mask) + 1;
3011
3012
3013
3014
3015 if (!go_faster && conf->nr_waiting)
3016 msleep_interruptible(1000);
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
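	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
	 * have bi_end_io, bi_sector, bi_bdev set,
	 * and bi_private set to the r10bio.
	 * For recovery, we may actually create several r10bios
	 * with 2 bios in each, that correspond to the bios in the main one.
	 * In this case, the subordinate r10bios link back through a
	 * borrowed master_bio pointer, and the counter in the master
	 * includes a ref from each subordinate.
	 */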
3033 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3034 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3035 /* recovery... the complicated one */
3036 int j;
3037 r10_bio = NULL;
3038
3039 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3040 int still_degraded;
3041 struct r10bio *rb2;
3042 sector_t sect;
3043 int must_sync;
3044 int any_working;
3045 struct raid10_info *mirror = &conf->mirrors[i];
3046
3047 if ((mirror->rdev == NULL ||
3048 test_bit(In_sync, &mirror->rdev->flags))
3049 &&
3050 (mirror->replacement == NULL ||
3051 test_bit(Faulty,
3052 &mirror->replacement->flags)))
3053 continue;
3054
3055 still_degraded = 0;
3056
3057 rb2 = r10_bio;
3058 sect = raid10_find_virt(conf, sector_nr, i);
3059 if (sect >= mddev->resync_max_sectors) {
3060
3061
3062
3063 continue;
3064 }
3065
3066
3067
3068
3069 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3070 &sync_blocks, 1);
3071 if (sync_blocks < max_sync)
3072 max_sync = sync_blocks;
3073 if (!must_sync &&
3074 mirror->replacement == NULL &&
3075 !conf->fullsync) {
3076
3077
3078
3079 chunks_skipped = -1;
3080 continue;
3081 }
3082
3083 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3084 raise_barrier(conf, rb2 != NULL);
3085 atomic_set(&r10_bio->remaining, 0);
3086
3087 r10_bio->master_bio = (struct bio*)rb2;
3088 if (rb2)
3089 atomic_inc(&rb2->remaining);
3090 r10_bio->mddev = mddev;
3091 set_bit(R10BIO_IsRecover, &r10_bio->state);
3092 r10_bio->sector = sect;
3093
3094 raid10_find_phys(conf, r10_bio);
3095
3096
3097
3098
3099 for (j = 0; j < conf->geo.raid_disks; j++)
3100 if (conf->mirrors[j].rdev == NULL ||
3101 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3102 still_degraded = 1;
3103 break;
3104 }
3105
3106 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3107 &sync_blocks, still_degraded);
3108
3109 any_working = 0;
3110 for (j = 0; j < conf->copies; j++) {
3111 int k;
3112 int d = r10_bio->devs[j].devnum;
3113 sector_t from_addr, to_addr;
3114 struct md_rdev *rdev;
3115 sector_t sector, first_bad;
3116 int bad_sectors;
3117 if (!conf->mirrors[d].rdev ||
3118 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3119 continue;
3120
3121 any_working = 1;
3122 rdev = conf->mirrors[d].rdev;
3123 sector = r10_bio->devs[j].addr;
3124
3125 if (is_badblock(rdev, sector, max_sync,
3126 &first_bad, &bad_sectors)) {
3127 if (first_bad > sector)
3128 max_sync = first_bad - sector;
3129 else {
3130 bad_sectors -= (sector
3131 - first_bad);
3132 if (max_sync > bad_sectors)
3133 max_sync = bad_sectors;
3134 continue;
3135 }
3136 }
3137 bio = r10_bio->devs[0].bio;
3138 bio_reset(bio);
3139 bio->bi_next = biolist;
3140 biolist = bio;
3141 bio->bi_private = r10_bio;
3142 bio->bi_end_io = end_sync_read;
3143 bio->bi_rw = READ;
3144 from_addr = r10_bio->devs[j].addr;
3145 bio->bi_iter.bi_sector = from_addr +
3146 rdev->data_offset;
3147 bio->bi_bdev = rdev->bdev;
3148 atomic_inc(&rdev->nr_pending);
3149 /* and we write to 'i' (if not in_sync) */
3150
3151 for (k=0; k<conf->copies; k++)
3152 if (r10_bio->devs[k].devnum == i)
3153 break;
3154 BUG_ON(k == conf->copies);
3155 to_addr = r10_bio->devs[k].addr;
3156 r10_bio->devs[0].devnum = d;
3157 r10_bio->devs[0].addr = from_addr;
3158 r10_bio->devs[1].devnum = i;
3159 r10_bio->devs[1].addr = to_addr;
3160
3161 rdev = mirror->rdev;
3162 if (!test_bit(In_sync, &rdev->flags)) {
3163 bio = r10_bio->devs[1].bio;
3164 bio_reset(bio);
3165 bio->bi_next = biolist;
3166 biolist = bio;
3167 bio->bi_private = r10_bio;
3168 bio->bi_end_io = end_sync_write;
3169 bio->bi_rw = WRITE;
3170 bio->bi_iter.bi_sector = to_addr
3171 + rdev->data_offset;
3172 bio->bi_bdev = rdev->bdev;
3173 atomic_inc(&r10_bio->remaining);
3174 } else
3175 r10_bio->devs[1].bio->bi_end_io = NULL;
3176
3177
3178 bio = r10_bio->devs[1].repl_bio;
3179 if (bio)
3180 bio->bi_end_io = NULL;
3181 rdev = mirror->replacement;
3182
3183
3184
3185
3186
3187
3188
3189
3190 if (rdev == NULL || bio == NULL ||
3191 test_bit(Faulty, &rdev->flags))
3192 break;
3193 bio_reset(bio);
3194 bio->bi_next = biolist;
3195 biolist = bio;
3196 bio->bi_private = r10_bio;
3197 bio->bi_end_io = end_sync_write;
3198 bio->bi_rw = WRITE;
3199 bio->bi_iter.bi_sector = to_addr +
3200 rdev->data_offset;
3201 bio->bi_bdev = rdev->bdev;
3202 atomic_inc(&r10_bio->remaining);
3203 break;
3204 }
3205 if (j == conf->copies) {
3206
3207
3208 if (any_working) {
3209
3210
3211
3212 int k;
3213 for (k = 0; k < conf->copies; k++)
3214 if (r10_bio->devs[k].devnum == i)
3215 break;
3216 if (!test_bit(In_sync,
3217 &mirror->rdev->flags)
3218 && !rdev_set_badblocks(
3219 mirror->rdev,
3220 r10_bio->devs[k].addr,
3221 max_sync, 0))
3222 any_working = 0;
3223 if (mirror->replacement &&
3224 !rdev_set_badblocks(
3225 mirror->replacement,
3226 r10_bio->devs[k].addr,
3227 max_sync, 0))
3228 any_working = 0;
3229 }
3230 if (!any_working) {
3231 if (!test_and_set_bit(MD_RECOVERY_INTR,
3232 &mddev->recovery))
3233 printk(KERN_INFO "md/raid10:%s: insufficient "
3234 "working devices for recovery.\n",
3235 mdname(mddev));
3236 mirror->recovery_disabled
3237 = mddev->recovery_disabled;
3238 }
3239 put_buf(r10_bio);
3240 if (rb2)
3241 atomic_dec(&rb2->remaining);
3242 r10_bio = rb2;
3243 break;
3244 }
3245 }
3246 if (biolist == NULL) {
3247 while (r10_bio) {
3248 struct r10bio *rb2 = r10_bio;
3249 r10_bio = (struct r10bio*) rb2->master_bio;
3250 rb2->master_bio = NULL;
3251 put_buf(rb2);
3252 }
3253 goto giveup;
3254 }
3255 } else {
3256 /* resync: schedule a read for every block at this virtual offset */
3257 int count = 0;
3258
3259 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3260
3261 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3262 &sync_blocks, mddev->degraded) &&
3263 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3264 &mddev->recovery)) {
3265
3266 *skipped = 1;
3267 return sync_blocks + sectors_skipped;
3268 }
3269 if (sync_blocks < max_sync)
3270 max_sync = sync_blocks;
3271 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3272
3273 r10_bio->mddev = mddev;
3274 atomic_set(&r10_bio->remaining, 0);
3275 raise_barrier(conf, 0);
3276 conf->next_resync = sector_nr;
3277
3278 r10_bio->master_bio = NULL;
3279 r10_bio->sector = sector_nr;
3280 set_bit(R10BIO_IsSync, &r10_bio->state);
3281 raid10_find_phys(conf, r10_bio);
3282 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3283
3284 for (i = 0; i < conf->copies; i++) {
3285 int d = r10_bio->devs[i].devnum;
3286 sector_t first_bad, sector;
3287 int bad_sectors;
3288
3289 if (r10_bio->devs[i].repl_bio)
3290 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3291
3292 bio = r10_bio->devs[i].bio;
3293 bio_reset(bio);
3294 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3295 if (conf->mirrors[d].rdev == NULL ||
3296 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3297 continue;
3298 sector = r10_bio->devs[i].addr;
3299 if (is_badblock(conf->mirrors[d].rdev,
3300 sector, max_sync,
3301 &first_bad, &bad_sectors)) {
3302 if (first_bad > sector)
3303 max_sync = first_bad - sector;
3304 else {
3305 bad_sectors -= (sector - first_bad);
3306 if (max_sync > bad_sectors)
3307 max_sync = bad_sectors;
3308 continue;
3309 }
3310 }
3311 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3312 atomic_inc(&r10_bio->remaining);
3313 bio->bi_next = biolist;
3314 biolist = bio;
3315 bio->bi_private = r10_bio;
3316 bio->bi_end_io = end_sync_read;
3317 bio->bi_rw = READ;
3318 bio->bi_iter.bi_sector = sector +
3319 conf->mirrors[d].rdev->data_offset;
3320 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3321 count++;
3322
3323 if (conf->mirrors[d].replacement == NULL ||
3324 test_bit(Faulty,
3325 &conf->mirrors[d].replacement->flags))
3326 continue;
3327
3328
3329 bio = r10_bio->devs[i].repl_bio;
3330 bio_reset(bio);
3331 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3332
3333 sector = r10_bio->devs[i].addr;
3334 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3335 bio->bi_next = biolist;
3336 biolist = bio;
3337 bio->bi_private = r10_bio;
3338 bio->bi_end_io = end_sync_write;
3339 bio->bi_rw = WRITE;
3340 bio->bi_iter.bi_sector = sector +
3341 conf->mirrors[d].replacement->data_offset;
3342 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3343 count++;
3344 }
3345
3346 if (count < 2) {
3347 for (i=0; i<conf->copies; i++) {
3348 int d = r10_bio->devs[i].devnum;
3349 if (r10_bio->devs[i].bio->bi_end_io)
3350 rdev_dec_pending(conf->mirrors[d].rdev,
3351 mddev);
3352 if (r10_bio->devs[i].repl_bio &&
3353 r10_bio->devs[i].repl_bio->bi_end_io)
3354 rdev_dec_pending(
3355 conf->mirrors[d].replacement,
3356 mddev);
3357 }
3358 put_buf(r10_bio);
3359 biolist = NULL;
3360 goto giveup;
3361 }
3362 }
3363
3364 nr_sectors = 0;
3365 if (sector_nr + max_sync < max_sector)
3366 max_sector = sector_nr + max_sync;
3367 do {
3368 struct page *page;
3369 int len = PAGE_SIZE;
3370 if (sector_nr + (len>>9) > max_sector)
3371 len = (max_sector - sector_nr) << 9;
3372 if (len == 0)
3373 break;
3374 for (bio = biolist; bio; bio = bio->bi_next) {
3375 struct bio *bio2;
3376 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3377 if (bio_add_page(bio, page, len, 0))
3378 continue;
3379
3380
3381 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3382 for (bio2 = biolist;
3383 bio2 && bio2 != bio;
3384 bio2 = bio2->bi_next) {
3385 /* remove last page from this bio */
3386 bio2->bi_vcnt--;
3387 bio2->bi_iter.bi_size -= len;
3388 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3389 }
3390 goto bio_full;
3391 }
3392 nr_sectors += len>>9;
3393 sector_nr += len>>9;
3394 } while (biolist->bi_vcnt < RESYNC_PAGES);
3395 bio_full:
3396 r10_bio->sectors = nr_sectors;
3397
3398 while (biolist) {
3399 bio = biolist;
3400 biolist = biolist->bi_next;
3401
3402 bio->bi_next = NULL;
3403 r10_bio = bio->bi_private;
3404 r10_bio->sectors = nr_sectors;
3405
3406 if (bio->bi_end_io == end_sync_read) {
3407 md_sync_acct(bio->bi_bdev, nr_sectors);
3408 set_bit(BIO_UPTODATE, &bio->bi_flags);
3409 generic_make_request(bio);
3410 }
3411 }
3412
3413 if (sectors_skipped)
3414
3415
3416
3417 md_done_sync(mddev, sectors_skipped, 1);
3418
3419 return sectors_skipped + nr_sectors;
3420 giveup:
3421
3422
3423
3424
3425 if (sector_nr + max_sync < max_sector)
3426 max_sector = sector_nr + max_sync;
3427
3428 sectors_skipped += (max_sector - sector_nr);
3429 chunks_skipped++;
3430 sector_nr = max_sector;
3431 goto skipped;
3432}
3433
3434static sector_t
3435raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3436{
3437 sector_t size;
3438 struct r10conf *conf = mddev->private;
3439
3440 if (!raid_disks)
3441 raid_disks = min(conf->geo.raid_disks,
3442 conf->prev.raid_disks);
3443 if (!sectors)
3444 sectors = conf->dev_sectors;
3445
3446 size = sectors >> conf->geo.chunk_shift;
3447 sector_div(size, conf->geo.far_copies);
3448 size = size * raid_disks;
3449 sector_div(size, conf->geo.near_copies);
3450
3451 return size << conf->geo.chunk_shift;
3452}
3453
3454static void calc_sectors(struct r10conf *conf, sector_t size)
3455{
3456
3457
3458
3459
3460
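	/* Calculate the number of sectors-per-device that will
	 * actually be used, and set conf->dev_sectors and
	 * conf->geo.stride accordingly.
	 */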
3461 size = size >> conf->geo.chunk_shift;
3462 sector_div(size, conf->geo.far_copies);
3463 size = size * conf->geo.raid_disks;
3464 sector_div(size, conf->geo.near_copies);
3465
3466
3467 size = size * conf->copies;
3468
3469
3470
3471
3472 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3473
3474 conf->dev_sectors = size << conf->geo.chunk_shift;
3475
3476 if (conf->geo.far_offset)
3477 conf->geo.stride = 1 << conf->geo.chunk_shift;
3478 else {
3479 sector_div(size, conf->geo.far_copies);
3480 conf->geo.stride = size << conf->geo.chunk_shift;
3481 }
3482}
3483
3484enum geo_type {geo_new, geo_old, geo_start};
3485static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3486{
3487 int nc, fc, fo;
3488 int layout, chunk, disks;
3489 switch (new) {
3490 case geo_old:
3491 layout = mddev->layout;
3492 chunk = mddev->chunk_sectors;
3493 disks = mddev->raid_disks - mddev->delta_disks;
3494 break;
3495 case geo_new:
3496 layout = mddev->new_layout;
3497 chunk = mddev->new_chunk_sectors;
3498 disks = mddev->raid_disks;
3499 break;
3500 default:
3501 case geo_start:
3502
3503 layout = mddev->new_layout;
3504 chunk = mddev->new_chunk_sectors;
3505 disks = mddev->raid_disks + mddev->delta_disks;
3506 break;
3507 }
3508 if (layout >> 18)
3509 return -1;
3510 if (chunk < (PAGE_SIZE >> 9) ||
3511 !is_power_of_2(chunk))
3512 return -2;
3513 nc = layout & 255;
3514 fc = (layout >> 8) & 255;
3515 fo = layout & (1<<16);
3516 geo->raid_disks = disks;
3517 geo->near_copies = nc;
3518 geo->far_copies = fc;
3519 geo->far_offset = fo;
3520 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3521 geo->chunk_mask = chunk - 1;
3522 geo->chunk_shift = ffz(~chunk);
3523 return nc*fc;
3524}
3525
3526static struct r10conf *setup_conf(struct mddev *mddev)
3527{
3528 struct r10conf *conf = NULL;
3529 int err = -EINVAL;
3530 struct geom geo;
3531 int copies;
3532
3533 copies = setup_geo(&geo, mddev, geo_new);
3534
3535 if (copies == -2) {
3536 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3537 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3538 mdname(mddev), PAGE_SIZE);
3539 goto out;
3540 }
3541
3542 if (copies < 2 || copies > mddev->raid_disks) {
3543 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3544 mdname(mddev), mddev->new_layout);
3545 goto out;
3546 }
3547
3548 err = -ENOMEM;
3549 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3550 if (!conf)
3551 goto out;
3552
3553 /* allocate enough mirror slots for both the old and new geometry */
3554 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3555 max(0,-mddev->delta_disks)),
3556 GFP_KERNEL);
3557 if (!conf->mirrors)
3558 goto out;
3559
3560 conf->tmppage = alloc_page(GFP_KERNEL);
3561 if (!conf->tmppage)
3562 goto out;
3563
3564 conf->geo = geo;
3565 conf->copies = copies;
3566 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3567 r10bio_pool_free, conf);
3568 if (!conf->r10bio_pool)
3569 goto out;
3570
3571 calc_sectors(conf, mddev->dev_sectors);
3572 if (mddev->reshape_position == MaxSector) {
3573 conf->prev = conf->geo;
3574 conf->reshape_progress = MaxSector;
3575 } else {
3576 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3577 err = -EINVAL;
3578 goto out;
3579 }
3580 conf->reshape_progress = mddev->reshape_position;
3581 if (conf->prev.far_offset)
3582 conf->prev.stride = 1 << conf->prev.chunk_shift;
3583 else
3584 /* far_copies must be 1 */
3585 conf->prev.stride = conf->dev_sectors;
3586 }
3587 spin_lock_init(&conf->device_lock);
3588 INIT_LIST_HEAD(&conf->retry_list);
3589
3590 spin_lock_init(&conf->resync_lock);
3591 init_waitqueue_head(&conf->wait_barrier);
3592
3593 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3594 if (!conf->thread)
3595 goto out;
3596
3597 conf->mddev = mddev;
3598 return conf;
3599
3600 out:
3601 if (err == -ENOMEM)
3602 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3603 mdname(mddev));
3604 if (conf) {
3605 if (conf->r10bio_pool)
3606 mempool_destroy(conf->r10bio_pool);
3607 kfree(conf->mirrors);
3608 safe_put_page(conf->tmppage);
3609 kfree(conf);
3610 }
3611 return ERR_PTR(err);
3612}
3613
3614static int run(struct mddev *mddev)
3615{
3616 struct r10conf *conf;
3617 int i, disk_idx, chunk_size;
3618 struct raid10_info *disk;
3619 struct md_rdev *rdev;
3620 sector_t size;
3621 sector_t min_offset_diff = 0;
3622 int first = 1;
3623 bool discard_supported = false;
3624
3625 if (mddev->private == NULL) {
3626 conf = setup_conf(mddev);
3627 if (IS_ERR(conf))
3628 return PTR_ERR(conf);
3629 mddev->private = conf;
3630 }
3631 conf = mddev->private;
3632 if (!conf)
3633 goto out;
3634
3635 mddev->thread = conf->thread;
3636 conf->thread = NULL;
3637
3638 chunk_size = mddev->chunk_sectors << 9;
3639 if (mddev->queue) {
3640 blk_queue_max_discard_sectors(mddev->queue,
3641 mddev->chunk_sectors);
3642 blk_queue_max_write_same_sectors(mddev->queue, 0);
3643 blk_queue_io_min(mddev->queue, chunk_size);
3644 if (conf->geo.raid_disks % conf->geo.near_copies)
3645 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3646 else
3647 blk_queue_io_opt(mddev->queue, chunk_size *
3648 (conf->geo.raid_disks / conf->geo.near_copies));
3649 }
3650
3651 rdev_for_each(rdev, mddev) {
3652 long long diff;
3653 struct request_queue *q;
3654
3655 disk_idx = rdev->raid_disk;
3656 if (disk_idx < 0)
3657 continue;
3658 if (disk_idx >= conf->geo.raid_disks &&
3659 disk_idx >= conf->prev.raid_disks)
3660 continue;
3661 disk = conf->mirrors + disk_idx;
3662
3663 if (test_bit(Replacement, &rdev->flags)) {
3664 if (disk->replacement)
3665 goto out_free_conf;
3666 disk->replacement = rdev;
3667 } else {
3668 if (disk->rdev)
3669 goto out_free_conf;
3670 disk->rdev = rdev;
3671 }
3672 q = bdev_get_queue(rdev->bdev);
3673 if (q->merge_bvec_fn)
3674 mddev->merge_check_needed = 1;
3675 diff = (rdev->new_data_offset - rdev->data_offset);
3676 if (!mddev->reshape_backwards)
3677 diff = -diff;
3678 if (diff < 0)
3679 diff = 0;
3680 if (first || diff < min_offset_diff)
3681 min_offset_diff = diff;
3682
3683 if (mddev->gendisk)
3684 disk_stack_limits(mddev->gendisk, rdev->bdev,
3685 rdev->data_offset << 9);
3686
3687 disk->head_position = 0;
3688
3689 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3690 discard_supported = true;
3691 }
3692
3693 if (mddev->queue) {
3694 if (discard_supported)
3695 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3696 mddev->queue);
3697 else
3698 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3699 mddev->queue);
3700 }
3701
3702 if (!enough(conf, -1)) {
3703 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3704 mdname(mddev));
3705 goto out_free_conf;
3706 }
3707
3708 if (conf->reshape_progress != MaxSector) {
3709 /* must ensure that shape change is supported */
3710 if (conf->geo.far_copies != 1 &&
3711 conf->geo.far_offset == 0)
3712 goto out_free_conf;
3713 if (conf->prev.far_copies != 1 &&
3714 conf->prev.far_offset == 0)
3715 goto out_free_conf;
3716 }
3717
3718 mddev->degraded = 0;
3719 for (i = 0;
3720 i < conf->geo.raid_disks
3721 || i < conf->prev.raid_disks;
3722 i++) {
3723
3724 disk = conf->mirrors + i;
3725
3726 if (!disk->rdev && disk->replacement) {
3727
3728 disk->rdev = disk->replacement;
3729 disk->replacement = NULL;
3730 clear_bit(Replacement, &disk->rdev->flags);
3731 }
3732
3733 if (!disk->rdev ||
3734 !test_bit(In_sync, &disk->rdev->flags)) {
3735 disk->head_position = 0;
3736 mddev->degraded++;
3737 if (disk->rdev &&
3738 disk->rdev->saved_raid_disk < 0)
3739 conf->fullsync = 1;
3740 }
3741 disk->recovery_disabled = mddev->recovery_disabled - 1;
3742 }
3743
3744 if (mddev->recovery_cp != MaxSector)
3745 printk(KERN_NOTICE "md/raid10:%s: not clean"
3746 " -- starting background reconstruction\n",
3747 mdname(mddev));
3748 printk(KERN_INFO
3749 "md/raid10:%s: active with %d out of %d devices\n",
3750 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3751 conf->geo.raid_disks);
3752
3753
3754
3755 mddev->dev_sectors = conf->dev_sectors;
3756 size = raid10_size(mddev, 0, 0);
3757 md_set_array_sectors(mddev, size);
3758 mddev->resync_max_sectors = size;
3759
3760 if (mddev->queue) {
3761 int stripe = conf->geo.raid_disks *
3762 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3763 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3764 mddev->queue->backing_dev_info.congested_data = mddev;
3765
3766
3767
3768
3769
3770 stripe /= conf->geo.near_copies;
3771 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3772 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3773 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3774 }
3775
3776
3777 if (md_integrity_register(mddev))
3778 goto out_free_conf;
3779
3780 if (conf->reshape_progress != MaxSector) {
3781 unsigned long before_length, after_length;
3782
3783 before_length = ((1 << conf->prev.chunk_shift) *
3784 conf->prev.far_copies);
3785 after_length = ((1 << conf->geo.chunk_shift) *
3786 conf->geo.far_copies);
3787
3788 if (max(before_length, after_length) > min_offset_diff) {
3789
3790 printk("md/raid10: offset difference not enough to continue reshape\n");
3791 goto out_free_conf;
3792 }
3793 conf->offset_diff = min_offset_diff;
3794
3795 conf->reshape_safe = conf->reshape_progress;
3796 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3797 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3798 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3799 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3800 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3801 "reshape");
3802 }
3803
3804 return 0;
3805
3806out_free_conf:
3807 md_unregister_thread(&mddev->thread);
3808 if (conf->r10bio_pool)
3809 mempool_destroy(conf->r10bio_pool);
3810 safe_put_page(conf->tmppage);
3811 kfree(conf->mirrors);
3812 kfree(conf);
3813 mddev->private = NULL;
3814out:
3815 return -EIO;
3816}
3817
3818static int stop(struct mddev *mddev)
3819{
3820 struct r10conf *conf = mddev->private;
3821
3822 raise_barrier(conf, 0);
3823 lower_barrier(conf);
3824
3825 md_unregister_thread(&mddev->thread);
3826 if (mddev->queue)
3827 /* the unplug fn references 'conf' */
3828 blk_sync_queue(mddev->queue);
3829
3830 if (conf->r10bio_pool)
3831 mempool_destroy(conf->r10bio_pool);
3832 safe_put_page(conf->tmppage);
3833 kfree(conf->mirrors);
3834 kfree(conf);
3835 mddev->private = NULL;
3836 return 0;
3837}
3838
3839static void raid10_quiesce(struct mddev *mddev, int state)
3840{
3841 struct r10conf *conf = mddev->private;
3842
3843 switch(state) {
3844 case 1:
3845 raise_barrier(conf, 0);
3846 break;
3847 case 0:
3848 lower_barrier(conf);
3849 break;
3850 }
3851}
3852
3853static int raid10_resize(struct mddev *mddev, sector_t sectors)
3854{
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
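	/* Resize of 'far' arrays is not supported.
	 * For 'near' and 'offset' arrays we can set the
	 * number of sectors used to be an appropriate multiple
	 * of the chunk size.
	 * For 'offset', this is far_copies * chunksize.
	 * For 'near' the multiplier is the LCM of
	 * near_copies and raid_disks.
	 * So if far_copies > 1 && !far_offset, fail.
	 * Otherwise find LCM(raid_disks, near_copies) * far_copies,
	 * multiply by chunk_size and round to that number;
	 * this is mostly done by raid10_size().
	 */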
3867 struct r10conf *conf = mddev->private;
3868 sector_t oldsize, size;
3869
3870 if (mddev->reshape_position != MaxSector)
3871 return -EBUSY;
3872
3873 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3874 return -EINVAL;
3875
3876 oldsize = raid10_size(mddev, 0, 0);
3877 size = raid10_size(mddev, sectors, 0);
3878 if (mddev->external_size &&
3879 mddev->array_sectors > size)
3880 return -EINVAL;
3881 if (mddev->bitmap) {
3882 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3883 if (ret)
3884 return ret;
3885 }
3886 md_set_array_sectors(mddev, size);
3887 set_capacity(mddev->gendisk, mddev->array_sectors);
3888 revalidate_disk(mddev->gendisk);
3889 if (sectors > mddev->dev_sectors &&
3890 mddev->recovery_cp > oldsize) {
3891 mddev->recovery_cp = oldsize;
3892 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3893 }
3894 calc_sectors(conf, sectors);
3895 mddev->dev_sectors = conf->dev_sectors;
3896 mddev->resync_max_sectors = size;
3897 return 0;
3898}
3899
3900static void *raid10_takeover_raid0(struct mddev *mddev)
3901{
3902 struct md_rdev *rdev;
3903 struct r10conf *conf;
3904
3905 if (mddev->degraded > 0) {
3906 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3907 mdname(mddev));
3908 return ERR_PTR(-EINVAL);
3909 }
3910
3911
3912 mddev->new_level = 10;
3913
3914 mddev->new_layout = (1<<8) + 2;
3915 mddev->new_chunk_sectors = mddev->chunk_sectors;
3916 mddev->delta_disks = mddev->raid_disks;
3917 mddev->raid_disks *= 2;
3918
3919 mddev->recovery_cp = MaxSector;
3920
3921 conf = setup_conf(mddev);
3922 if (!IS_ERR(conf)) {
3923 rdev_for_each(rdev, mddev)
3924 if (rdev->raid_disk >= 0)
3925 rdev->new_raid_disk = rdev->raid_disk * 2;
3926 conf->barrier = 1;
3927 }
3928
3929 return conf;
3930}
3931
3932static void *raid10_takeover(struct mddev *mddev)
3933{
3934 struct r0conf *raid0_conf;
3935
3936
3937
3938
3939 if (mddev->level == 0) {
3940
3941 raid0_conf = mddev->private;
3942 if (raid0_conf->nr_strip_zones > 1) {
3943 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3944 " with more than one zone.\n",
3945 mdname(mddev));
3946 return ERR_PTR(-EINVAL);
3947 }
3948 return raid10_takeover_raid0(mddev);
3949 }
3950 return ERR_PTR(-EINVAL);
3951}
3952
3953static int raid10_check_reshape(struct mddev *mddev)
3954{
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
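	/* Called when there is a request to change
	 * - layout (to ->new_layout)
	 * - chunk size (to ->new_chunk_sectors)
	 * - raid_disks (by delta_disks)
	 * or when trying to restart a reshape that was ongoing.
	 *
	 * We need to validate the request and possibly allocate
	 * space if that might be an issue later.
	 *
	 * Currently we reject any reshape of a 'far' mode array,
	 * allow chunk size to change if the new value is acceptable,
	 * allow raid_disks to increase, and allow
	 * a switch between 'near' and 'offset' modes.
	 */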
3969 struct r10conf *conf = mddev->private;
3970 struct geom geo;
3971
3972 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3973 return -EINVAL;
3974
3975 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3976
3977 return -EINVAL;
3978 if (geo.far_copies > 1 && !geo.far_offset)
3979
3980 return -EINVAL;
3981
3982 if (mddev->array_sectors & geo.chunk_mask)
3983
3984 return -EINVAL;
3985
3986 if (!enough(conf, -1))
3987 return -EINVAL;
3988
3989 kfree(conf->mirrors_new);
3990 conf->mirrors_new = NULL;
3991 if (mddev->delta_disks > 0) {
3992
3993 conf->mirrors_new = kzalloc(
3994 sizeof(struct raid10_info)
3995 *(mddev->raid_disks +
3996 mddev->delta_disks),
3997 GFP_KERNEL);
3998 if (!conf->mirrors_new)
3999 return -ENOMEM;
4000 }
4001 return 0;
4002}
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
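/*
 * Need to check if the array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */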
4017static int calc_degraded(struct r10conf *conf)
4018{
4019 int degraded, degraded2;
4020 int i;
4021
4022 rcu_read_lock();
4023 degraded = 0;
4024
4025 for (i = 0; i < conf->prev.raid_disks; i++) {
4026 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4027 if (!rdev || test_bit(Faulty, &rdev->flags))
4028 degraded++;
4029 else if (!test_bit(In_sync, &rdev->flags))
4030
4031
4032
4033
4034 degraded++;
4035 }
4036 rcu_read_unlock();
4037 if (conf->geo.raid_disks == conf->prev.raid_disks)
4038 return degraded;
4039 rcu_read_lock();
4040 degraded2 = 0;
4041 for (i = 0; i < conf->geo.raid_disks; i++) {
4042 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4043 if (!rdev || test_bit(Faulty, &rdev->flags))
4044 degraded2++;
4045 else if (!test_bit(In_sync, &rdev->flags)) {
4046
4047
4048
4049
4050
4051 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4052 degraded2++;
4053 }
4054 }
4055 rcu_read_unlock();
4056 if (degraded2 > degraded)
4057 return degraded2;
4058 return degraded;
4059}
4060
4061static int raid10_start_reshape(struct mddev *mddev)
4062{
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
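	/* A 'reshape' has been requested.  This commits
	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
	 * This also checks if there are enough spares and adds them
	 * to the array.
	 * We currently require enough spares to make the final
	 * array non-degraded.  We also require that the difference
	 * between old and new data_offset - on each device - is
	 * enough that we never risk over-writing.
	 */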
4073 unsigned long before_length, after_length;
4074 sector_t min_offset_diff = 0;
4075 int first = 1;
4076 struct geom new;
4077 struct r10conf *conf = mddev->private;
4078 struct md_rdev *rdev;
4079 int spares = 0;
4080 int ret;
4081
4082 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4083 return -EBUSY;
4084
4085 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4086 return -EINVAL;
4087
4088 before_length = ((1 << conf->prev.chunk_shift) *
4089 conf->prev.far_copies);
4090 after_length = ((1 << conf->geo.chunk_shift) *
4091 conf->geo.far_copies);
4092
4093 rdev_for_each(rdev, mddev) {
4094 if (!test_bit(In_sync, &rdev->flags)
4095 && !test_bit(Faulty, &rdev->flags))
4096 spares++;
4097 if (rdev->raid_disk >= 0) {
4098 long long diff = (rdev->new_data_offset
4099 - rdev->data_offset);
4100 if (!mddev->reshape_backwards)
4101 diff = -diff;
4102 if (diff < 0)
4103 diff = 0;
4104 if (first || diff < min_offset_diff)
4105 min_offset_diff = diff;
4106 }
4107 }
4108
4109 if (max(before_length, after_length) > min_offset_diff)
4110 return -EINVAL;
4111
4112 if (spares < mddev->delta_disks)
4113 return -EINVAL;
4114
4115 conf->offset_diff = min_offset_diff;
4116 spin_lock_irq(&conf->device_lock);
4117 if (conf->mirrors_new) {
4118 memcpy(conf->mirrors_new, conf->mirrors,
4119 sizeof(struct raid10_info)*conf->prev.raid_disks);
4120 smp_mb();
4121 kfree(conf->mirrors_old);
4122 conf->mirrors_old = conf->mirrors;
4123 conf->mirrors = conf->mirrors_new;
4124 conf->mirrors_new = NULL;
4125 }
4126 setup_geo(&conf->geo, mddev, geo_start);
4127 smp_mb();
4128 if (mddev->reshape_backwards) {
4129 sector_t size = raid10_size(mddev, 0, 0);
4130 if (size < mddev->array_sectors) {
4131 spin_unlock_irq(&conf->device_lock);
4132 printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n",
4133 mdname(mddev));
4134 return -EINVAL;
4135 }
4136 mddev->resync_max_sectors = size;
4137 conf->reshape_progress = size;
4138 } else
4139 conf->reshape_progress = 0;
4140 spin_unlock_irq(&conf->device_lock);
4141
4142 if (mddev->delta_disks && mddev->bitmap) {
4143 ret = bitmap_resize(mddev->bitmap,
4144 raid10_size(mddev, 0,
4145 conf->geo.raid_disks),
4146 0, 0);
4147 if (ret)
4148 goto abort;
4149 }
4150 if (mddev->delta_disks > 0) {
4151 rdev_for_each(rdev, mddev)
4152 if (rdev->raid_disk < 0 &&
4153 !test_bit(Faulty, &rdev->flags)) {
4154 if (raid10_add_disk(mddev, rdev) == 0) {
4155 if (rdev->raid_disk >=
4156 conf->prev.raid_disks)
4157 set_bit(In_sync, &rdev->flags);
4158 else
4159 rdev->recovery_offset = 0;
4160
4161 if (sysfs_link_rdev(mddev, rdev))
4162 /* Failure here is OK */;
4163 }
4164 } else if (rdev->raid_disk >= conf->prev.raid_disks
4165 && !test_bit(Faulty, &rdev->flags)) {
4166
4167 set_bit(In_sync, &rdev->flags);
4168 }
4169 }
4170
4171
4172
4173
4174 spin_lock_irq(&conf->device_lock);
4175 mddev->degraded = calc_degraded(conf);
4176 spin_unlock_irq(&conf->device_lock);
4177 mddev->raid_disks = conf->geo.raid_disks;
4178 mddev->reshape_position = conf->reshape_progress;
4179 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4180
4181 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4182 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4183 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4184 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4185
4186 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4187 "reshape");
4188 if (!mddev->sync_thread) {
4189 ret = -EAGAIN;
4190 goto abort;
4191 }
4192 conf->reshape_checkpoint = jiffies;
4193 md_wakeup_thread(mddev->sync_thread);
4194 md_new_event(mddev);
4195 return 0;
4196
4197abort:
4198 mddev->recovery = 0;
4199 spin_lock_irq(&conf->device_lock);
4200 conf->geo = conf->prev;
4201 mddev->raid_disks = conf->geo.raid_disks;
4202 rdev_for_each(rdev, mddev)
4203 rdev->new_data_offset = rdev->data_offset;
4204 smp_wmb();
4205 conf->reshape_progress = MaxSector;
4206 mddev->reshape_position = MaxSector;
4207 spin_unlock_irq(&conf->device_lock);
4208 return ret;
4209}
4210
4211
4212
4213
4214
4215
4216
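/* Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address.
 * i.e. the address returned will be chunk-aligned and after
 * all blocks that are contained in the chunk.
 */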
4217static sector_t last_dev_address(sector_t s, struct geom *geo)
4218{
4219 s = (s | geo->chunk_mask) + 1;
4220 s >>= geo->chunk_shift;
4221 s *= geo->near_copies;
4222 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4223 s *= geo->far_copies;
4224 s <<= geo->chunk_shift;
4225 return s;
4226}
4227
4228
4229
4230
4231
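/* Calculate the first device-address that could contain
 * any block from the chunk that includes the array-address 's'.
 * This too will be the start of a chunk.
 */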
4232static sector_t first_dev_address(sector_t s, struct geom *geo)
4233{
4234 s >>= geo->chunk_shift;
4235 s *= geo->near_copies;
4236 sector_div(s, geo->raid_disks);
4237 s *= geo->far_copies;
4238 s <<= geo->chunk_shift;
4239 return s;
4240}
4241
4242static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4243 int *skipped)
4244{
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
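	/* Reshape one region (at most a chunk, and at most
	 * RESYNC_BLOCK_SIZE) at a time: read the data through the 'old'
	 * geometry, then write it out through the 'new' geometry to
	 * every copy (including any replacement).
	 *
	 * reshape_progress tracks how far we have got; the superblock
	 * (reshape_position) is only updated - via 'need_flush' or a
	 * 10-second timeout - before writes could overwrite data that a
	 * restart would still need to read through the old layout, so a
	 * crash can never leave the array reading stale data.
	 */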
4282 struct r10conf *conf = mddev->private;
4283 struct r10bio *r10_bio;
4284 sector_t next, safe, last;
4285 int max_sectors;
4286 int nr_sectors;
4287 int s;
4288 struct md_rdev *rdev;
4289 int need_flush = 0;
4290 struct bio *blist;
4291 struct bio *bio, *read_bio;
4292 int sectors_done = 0;
4293
4294 if (sector_nr == 0) {
4295
4296 if (mddev->reshape_backwards &&
4297 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4298 sector_nr = (raid10_size(mddev, 0, 0)
4299 - conf->reshape_progress);
4300 } else if (!mddev->reshape_backwards &&
4301 conf->reshape_progress > 0)
4302 sector_nr = conf->reshape_progress;
4303 if (sector_nr) {
4304 mddev->curr_resync_completed = sector_nr;
4305 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4306 *skipped = 1;
4307 return sector_nr;
4308 }
4309 }
4310
4311
4312
4313
4314
4315 if (mddev->reshape_backwards) {
4316
4317
4318
4319 next = first_dev_address(conf->reshape_progress - 1,
4320 &conf->geo);
4321
4322
4323
4324
4325 safe = last_dev_address(conf->reshape_safe - 1,
4326 &conf->prev);
4327
4328 if (next + conf->offset_diff < safe)
4329 need_flush = 1;
4330
4331 last = conf->reshape_progress - 1;
4332 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4333 & conf->prev.chunk_mask);
4334 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4335 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4336 } else {
4337
4338
4339
4340 next = last_dev_address(conf->reshape_progress, &conf->geo);
4341
4342
4343
4344
4345 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4346
4347
4348
4349
4350 if (next > safe + conf->offset_diff)
4351 need_flush = 1;
4352
4353 sector_nr = conf->reshape_progress;
4354 last = sector_nr | (conf->geo.chunk_mask
4355 & conf->prev.chunk_mask);
4356
4357 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4358 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4359 }
4360
4361 if (need_flush ||
4362 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4363
4364 wait_barrier(conf);
4365 mddev->reshape_position = conf->reshape_progress;
4366 if (mddev->reshape_backwards)
4367 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4368 - conf->reshape_progress;
4369 else
4370 mddev->curr_resync_completed = conf->reshape_progress;
4371 conf->reshape_checkpoint = jiffies;
4372 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4373 md_wakeup_thread(mddev->thread);
4374 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4375 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4376 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4377 allow_barrier(conf);
4378 return sectors_done;
4379 }
4380 conf->reshape_safe = mddev->reshape_position;
4381 allow_barrier(conf);
4382 }
4383
4384read_more:
4385 /* Now schedule reads for blocks from sector_nr to last */
4386 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4387 raise_barrier(conf, sectors_done != 0);
4388 atomic_set(&r10_bio->remaining, 0);
4389 r10_bio->mddev = mddev;
4390 r10_bio->sector = sector_nr;
4391 set_bit(R10BIO_IsReshape, &r10_bio->state);
4392 r10_bio->sectors = last - sector_nr + 1;
4393 rdev = read_balance(conf, r10_bio, &max_sectors);
4394 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4395
4396 if (!rdev) {
4397
4398
4399
4400
4401 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4402 return sectors_done;
4403 }
4404
4405 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4406
4407 read_bio->bi_bdev = rdev->bdev;
4408 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4409 + rdev->data_offset);
4410 read_bio->bi_private = r10_bio;
4411 read_bio->bi_end_io = end_sync_read;
4412 read_bio->bi_rw = READ;
4413 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4414 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4415 read_bio->bi_vcnt = 0;
4416 read_bio->bi_iter.bi_size = 0;
4417 r10_bio->master_bio = read_bio;
4418 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4419
4420
4421 __raid10_find_phys(&conf->geo, r10_bio);
4422
4423 blist = read_bio;
4424 read_bio->bi_next = NULL;
4425
4426 for (s = 0; s < conf->copies*2; s++) {
4427 struct bio *b;
4428 int d = r10_bio->devs[s/2].devnum;
4429 struct md_rdev *rdev2;
4430 if (s&1) {
4431 rdev2 = conf->mirrors[d].replacement;
4432 b = r10_bio->devs[s/2].repl_bio;
4433 } else {
4434 rdev2 = conf->mirrors[d].rdev;
4435 b = r10_bio->devs[s/2].bio;
4436 }
4437 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4438 continue;
4439
4440 bio_reset(b);
4441 b->bi_bdev = rdev2->bdev;
4442 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4443 rdev2->new_data_offset;
4444 b->bi_private = r10_bio;
4445 b->bi_end_io = end_reshape_write;
4446 b->bi_rw = WRITE;
4447 b->bi_next = blist;
4448 blist = b;
4449 }
4450
4451
4452
4453 nr_sectors = 0;
4454 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4455 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4456 int len = (max_sectors - s) << 9;
4457 if (len > PAGE_SIZE)
4458 len = PAGE_SIZE;
4459 for (bio = blist; bio ; bio = bio->bi_next) {
4460 struct bio *bio2;
4461 if (bio_add_page(bio, page, len, 0))
4462 continue;
4463
4464 /* Didn't fit, must stop adding pages to these bios */
4465 for (bio2 = blist;
4466 bio2 && bio2 != bio;
4467 bio2 = bio2->bi_next) {
4468
4469 bio2->bi_vcnt--;
4470 bio2->bi_iter.bi_size -= len;
4471 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4472 }
4473 goto bio_full;
4474 }
4475 sector_nr += len >> 9;
4476 nr_sectors += len >> 9;
4477 }
4478bio_full:
4479 r10_bio->sectors = nr_sectors;
4480
4481 /* Now submit the read */
4482 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4483 atomic_inc(&r10_bio->remaining);
4484 read_bio->bi_next = NULL;
4485 generic_make_request(read_bio);
4486 sector_nr += nr_sectors;
4487 sectors_done += nr_sectors;
4488 if (sector_nr <= last)
4489 goto read_more;
4490
4491
4492
4493
4494 if (mddev->reshape_backwards)
4495 conf->reshape_progress -= sectors_done;
4496 else
4497 conf->reshape_progress += sectors_done;
4498
4499 return sectors_done;
4500}
4501
4502static void end_reshape_request(struct r10bio *r10_bio);
4503static int handle_reshape_read_error(struct mddev *mddev,
4504 struct r10bio *r10_bio);
4505static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4506{
4507
4508
4509
4510
4511
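	/* Reshape read completed.  Hopefully we have a block
	 * to write out.
	 * If we got a read error then we do sync 1-page reads from
	 * elsewhere until we find the data - or give up.
	 */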
4512 struct r10conf *conf = mddev->private;
4513 int s;
4514
4515 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4516 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4517
4518 md_done_sync(mddev, r10_bio->sectors, 0);
4519 return;
4520 }
4521
4522
4523
4524
4525 atomic_set(&r10_bio->remaining, 1);
4526 for (s = 0; s < conf->copies*2; s++) {
4527 struct bio *b;
4528 int d = r10_bio->devs[s/2].devnum;
4529 struct md_rdev *rdev;
4530 if (s&1) {
4531 rdev = conf->mirrors[d].replacement;
4532 b = r10_bio->devs[s/2].repl_bio;
4533 } else {
4534 rdev = conf->mirrors[d].rdev;
4535 b = r10_bio->devs[s/2].bio;
4536 }
4537 if (!rdev || test_bit(Faulty, &rdev->flags))
4538 continue;
4539 atomic_inc(&rdev->nr_pending);
4540 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4541 atomic_inc(&r10_bio->remaining);
4542 b->bi_next = NULL;
4543 generic_make_request(b);
4544 }
4545 end_reshape_request(r10_bio);
4546}
4547
4548static void end_reshape(struct r10conf *conf)
4549{
4550 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4551 return;
4552
4553 spin_lock_irq(&conf->device_lock);
4554 conf->prev = conf->geo;
4555 md_finish_reshape(conf->mddev);
4556 smp_wmb();
4557 conf->reshape_progress = MaxSector;
4558 spin_unlock_irq(&conf->device_lock);
4559
4560
4561
4562
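	/* read-ahead size must cover two whole stripes, which is
	 * 2 * (raid_disks / near_copies) * chunksize.
	 */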
4563 if (conf->mddev->queue) {
4564 int stripe = conf->geo.raid_disks *
4565 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4566 stripe /= conf->geo.near_copies;
4567 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4568 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4569 }
4570 conf->fullsync = 0;
4571}
4572
4573
4574static int handle_reshape_read_error(struct mddev *mddev,
4575 struct r10bio *r10_bio)
4576{
4577 /* Use sync reads to get the blocks from somewhere else */
4578 int sectors = r10_bio->sectors;
4579 struct r10conf *conf = mddev->private;
4580 struct {
4581 struct r10bio r10_bio;
4582 struct r10dev devs[conf->copies];
4583 } on_stack;
4584 struct r10bio *r10b = &on_stack.r10_bio;
4585 int slot = 0;
4586 int idx = 0;
4587 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4588
4589 r10b->sector = r10_bio->sector;
4590 __raid10_find_phys(&conf->prev, r10b);
4591
4592 while (sectors) {
4593 int s = sectors;
4594 int success = 0;
4595 int first_slot = slot;
4596
4597 if (s > (PAGE_SIZE >> 9))
4598 s = PAGE_SIZE >> 9;
4599
4600 while (!success) {
4601 int d = r10b->devs[slot].devnum;
4602 struct md_rdev *rdev = conf->mirrors[d].rdev;
4603 sector_t addr;
4604 if (rdev == NULL ||
4605 test_bit(Faulty, &rdev->flags) ||
4606 !test_bit(In_sync, &rdev->flags))
4607 goto failed;
4608
4609 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4610 success = sync_page_io(rdev,
4611 addr,
4612 s << 9,
4613 bvec[idx].bv_page,
4614 READ, false);
4615 if (success)
4616 break;
4617 failed:
4618 slot++;
4619 if (slot >= conf->copies)
4620 slot = 0;
4621 if (slot == first_slot)
4622 break;
4623 }
4624 if (!success) {
4625
4626 set_bit(MD_RECOVERY_INTR,
4627 &mddev->recovery);
4628 return -EIO;
4629 }
4630 sectors -= s;
4631 idx++;
4632 }
4633 return 0;
4634}
4635
4636static void end_reshape_write(struct bio *bio, int error)
4637{
4638 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4639 struct r10bio *r10_bio = bio->bi_private;
4640 struct mddev *mddev = r10_bio->mddev;
4641 struct r10conf *conf = mddev->private;
4642 int d;
4643 int slot;
4644 int repl;
4645 struct md_rdev *rdev = NULL;
4646
4647 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4648 if (repl)
4649 rdev = conf->mirrors[d].replacement;
4650 if (!rdev) {
4651 smp_mb();
4652 rdev = conf->mirrors[d].rdev;
4653 }
4654
4655 if (!uptodate) {
4656
4657 md_error(mddev, rdev);
4658 }
4659
4660 rdev_dec_pending(rdev, mddev);
4661 end_reshape_request(r10_bio);
4662}
4663
4664static void end_reshape_request(struct r10bio *r10_bio)
4665{
4666 if (!atomic_dec_and_test(&r10_bio->remaining))
4667 return;
4668 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4669 bio_put(r10_bio->master_bio);
4670 put_buf(r10_bio);
4671}
4672
4673static void raid10_finish_reshape(struct mddev *mddev)
4674{
4675 struct r10conf *conf = mddev->private;
4676
4677 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4678 return;
4679
4680 if (mddev->delta_disks > 0) {
4681 sector_t size = raid10_size(mddev, 0, 0);
4682 md_set_array_sectors(mddev, size);
4683 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4684 mddev->recovery_cp = mddev->resync_max_sectors;
4685 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4686 }
4687 mddev->resync_max_sectors = size;
4688 set_capacity(mddev->gendisk, mddev->array_sectors);
4689 revalidate_disk(mddev->gendisk);
4690 } else {
4691 int d;
4692 for (d = conf->geo.raid_disks ;
4693 d < conf->geo.raid_disks - mddev->delta_disks;
4694 d++) {
4695 struct md_rdev *rdev = conf->mirrors[d].rdev;
4696 if (rdev)
4697 clear_bit(In_sync, &rdev->flags);
4698 rdev = conf->mirrors[d].replacement;
4699 if (rdev)
4700 clear_bit(In_sync, &rdev->flags);
4701 }
4702 }
4703 mddev->layout = mddev->new_layout;
4704 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4705 mddev->reshape_position = MaxSector;
4706 mddev->delta_disks = 0;
4707 mddev->reshape_backwards = 0;
4708}
4709
4710static struct md_personality raid10_personality =
4711{
4712 .name = "raid10",
4713 .level = 10,
4714 .owner = THIS_MODULE,
4715 .make_request = make_request,
4716 .run = run,
4717 .stop = stop,
4718 .status = status,
4719 .error_handler = error,
4720 .hot_add_disk = raid10_add_disk,
4721 .hot_remove_disk= raid10_remove_disk,
4722 .spare_active = raid10_spare_active,
4723 .sync_request = sync_request,
4724 .quiesce = raid10_quiesce,
4725 .size = raid10_size,
4726 .resize = raid10_resize,
4727 .takeover = raid10_takeover,
4728 .check_reshape = raid10_check_reshape,
4729 .start_reshape = raid10_start_reshape,
4730 .finish_reshape = raid10_finish_reshape,
4731};
4732
4733static int __init raid_init(void)
4734{
4735 return register_md_personality(&raid10_personality);
4736}
4737
4738static void raid_exit(void)
4739{
4740 unregister_md_personality(&raid10_personality);
4741}
4742
4743module_init(raid_init);
4744module_exit(raid_exit);
4745MODULE_LICENSE("GPL");
4746MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4747MODULE_ALIAS("md-personality-9");
4748MODULE_ALIAS("md-raid10");
4749MODULE_ALIAS("md-level-10");
4750
4751module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4752