/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *
 * The data to be stored is divided into chunks of size chunk_size, and each
 * device is divided into far_copies sections.  In each section, chunks are
 * laid out in a style similar to raid0, but near_copies copies of each chunk
 * are stored (each on a different drive).  The starting device for each
 * section is offset near_copies from the starting device of the previous
 * section, other than the first section whose starting device is 0.
 * Thus there are (near_copies * far_copies) copies of each chunk, and each
 * is on a different drive.
 * near_copies and far_copies must be at least one, and their product is at
 * most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped
 * into sets, and the far copied stripes are still shifted by 'near_copies'
 * devices, but this shifting stays confined to the set rather than the
 * entire array.
 */
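/*
 * Illustration (not from the original source): a 4-device array with
 * near_copies=2, far_copies=1 lays chunks A, B, C, D, ... out as
 *
 *	dev0	dev1	dev2	dev3
 *	 A	 A	 B	 B
 *	 C	 C	 D	 D
 *
 * i.e. each chunk appears on two adjacent devices, and consecutive
 * chunks advance across the array as in raid0.
 */
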
#define NR_RAID10_BIOS 256

/* When we get a read error on a read-only array, we redirect to another
 * device without failing the first device, or trying to over-write to
 * correct the read error.  To keep track of bad blocks on a per-bio
 * level, we store IO_BLOCKED in the appropriate 'bios' pointer.
 */
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad block, we need to remove the
 * bad-block marking, which must be done in process context.  We record
 * the success by setting devs[n].bio to IO_MADE_GOOD.
 */
#define IO_MADE_GOOD ((struct bio *)2)

#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)

/* When the pending write queue grows beyond this, further writers block
 * until raid10d has flushed some of it; see __make_request() and
 * md_raid10_congested().
 */
static int max_queued_requests = 1024;

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio, int error);
static void end_reshape(struct r10conf *conf);

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int size = offsetof(struct r10bio, devs[conf->copies]);

	/* allocate a r10bio with room for raid_disks entries in the
	 * bios array */
	return kzalloc(size, gfp_flags);
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf).
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	struct page *page;
	struct r10bio *r10_bio;
	struct bio *bio;
	int i, j;
	int nalloc;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio)
		return NULL;

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
		if (!conf->have_replacement)
			continue;
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].repl_bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0 ; j < nalloc; j++) {
		struct bio *rbio = r10_bio->devs[j].repl_bio;
		bio = r10_bio->devs[j].bio;
		for (i = 0; i < RESYNC_PAGES; i++) {
			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
					       &conf->mddev->recovery)) {
				/* we can share bv_page's during recovery
				 * and reshape */
				struct bio *rbio = r10_bio->devs[0].bio;
				page = rbio->bi_io_vec[i].bv_page;
				get_page(page);
			} else
				page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
			if (rbio)
				rbio->bi_io_vec[i].bv_page = page;
		}
	}

	return r10_bio;

out_free_pages:
	for ( ; i > 0 ; i--)
		safe_put_page(bio->bi_io_vec[i-1].bv_page);
	while (j--)
		for (i = 0; i < RESYNC_PAGES ; i++)
			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
	j = 0;
out_free_bio:
	for ( ; j < nalloc; j++) {
		if (r10_bio->devs[j].bio)
			bio_put(r10_bio->devs[j].bio);
		if (r10_bio->devs[j].repl_bio)
			bio_put(r10_bio->devs[j].repl_bio);
	}
	r10bio_pool_free(r10_bio, conf);
	return NULL;
}
216
217static void r10buf_pool_free(void *__r10_bio, void *data)
218{
219 int i;
220 struct r10conf *conf = data;
221 struct r10bio *r10bio = __r10_bio;
222 int j;
223
224 for (j=0; j < conf->copies; j++) {
225 struct bio *bio = r10bio->devs[j].bio;
226 if (bio) {
227 for (i = 0; i < RESYNC_PAGES; i++) {
228 safe_put_page(bio->bi_io_vec[i].bv_page);
229 bio->bi_io_vec[i].bv_page = NULL;
230 }
231 bio_put(bio);
232 }
233 bio = r10bio->devs[j].repl_bio;
234 if (bio)
235 bio_put(bio);
236 }
237 r10bio_pool_free(r10bio, conf);
238}
239
240static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
241{
242 int i;
243
244 for (i = 0; i < conf->copies; i++) {
245 struct bio **bio = & r10_bio->devs[i].bio;
246 if (!BIO_SPECIAL(*bio))
247 bio_put(*bio);
248 *bio = NULL;
249 bio = &r10_bio->devs[i].repl_bio;
250 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
251 bio_put(*bio);
252 *bio = NULL;
253 }
254}
255
256static void free_r10bio(struct r10bio *r10_bio)
257{
258 struct r10conf *conf = r10_bio->mddev->private;
259
260 put_all_bios(conf, r10_bio);
261 mempool_free(r10_bio, conf->r10bio_pool);
262}
263
264static void put_buf(struct r10bio *r10_bio)
265{
266 struct r10conf *conf = r10_bio->mddev->private;
267
268 mempool_free(r10_bio, conf->r10buf_pool);
269
270 lower_barrier(conf);
271}
272
273static void reschedule_retry(struct r10bio *r10_bio)
274{
275 unsigned long flags;
276 struct mddev *mddev = r10_bio->mddev;
277 struct r10conf *conf = mddev->private;
278
279 spin_lock_irqsave(&conf->device_lock, flags);
280 list_add(&r10_bio->retry_list, &conf->retry_list);
281 conf->nr_queued ++;
282 spin_unlock_irqrestore(&conf->device_lock, flags);
283
284
285 wake_up(&conf->wait_barrier);
286
287 md_wakeup_thread(mddev->thread);
288}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
295static void raid_end_bio_io(struct r10bio *r10_bio)
296{
297 struct bio *bio = r10_bio->master_bio;
298 int done;
299 struct r10conf *conf = r10_bio->mddev->private;
300
301 if (bio->bi_phys_segments) {
302 unsigned long flags;
303 spin_lock_irqsave(&conf->device_lock, flags);
304 bio->bi_phys_segments--;
305 done = (bio->bi_phys_segments == 0);
306 spin_unlock_irqrestore(&conf->device_lock, flags);
307 } else
308 done = 1;
309 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
310 clear_bit(BIO_UPTODATE, &bio->bi_flags);
311 if (done) {
312 bio_endio(bio, 0);
		/*
		 * Wake up any possible resync thread that waits for the
		 * device to go idle.
		 */
317 allow_barrier(conf);
318 }
319 free_r10bio(r10_bio);
320}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
325static inline void update_head_pos(int slot, struct r10bio *r10_bio)
326{
327 struct r10conf *conf = r10_bio->mddev->private;
328
329 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
330 r10_bio->devs[slot].addr + (r10_bio->sectors);
331}

/*
 * Find the disk number which triggered given bio.
 */
336static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
337 struct bio *bio, int *slotp, int *replp)
338{
339 int slot;
340 int repl = 0;
341
342 for (slot = 0; slot < conf->copies; slot++) {
343 if (r10_bio->devs[slot].bio == bio)
344 break;
345 if (r10_bio->devs[slot].repl_bio == bio) {
346 repl = 1;
347 break;
348 }
349 }
350
351 BUG_ON(slot == conf->copies);
352 update_head_pos(slot, r10_bio);
353
354 if (slotp)
355 *slotp = slot;
356 if (replp)
357 *replp = repl;
358 return r10_bio->devs[slot].devnum;
359}
360
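/*
 * Completion handler for a read on one copy.  On success the master bio
 * is marked uptodate; on failure the read is either retried on another
 * copy (via raid10d) or, if this was the last usable copy, failed.
 */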
361static void raid10_end_read_request(struct bio *bio, int error)
362{
363 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
364 struct r10bio *r10_bio = bio->bi_private;
365 int slot, dev;
366 struct md_rdev *rdev;
367 struct r10conf *conf = r10_bio->mddev->private;
368
369
370 slot = r10_bio->read_slot;
371 dev = r10_bio->devs[slot].devnum;
372 rdev = r10_bio->devs[slot].rdev;
373
374
375
376 update_head_pos(slot, r10_bio);
377
378 if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
388 set_bit(R10BIO_Uptodate, &r10_bio->state);
389 } else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry".
		 */
395 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
396 rdev->raid_disk))
397 uptodate = 1;
398 }
399 if (uptodate) {
400 raid_end_bio_io(r10_bio);
401 rdev_dec_pending(rdev, conf->mddev);
402 } else {
		/*
		 * oops, read error - keep the refcount on the rdev
		 */
406 char b[BDEVNAME_SIZE];
407 printk_ratelimited(KERN_ERR
408 "md/raid10:%s: %s: rescheduling sector %llu\n",
409 mdname(conf->mddev),
410 bdevname(rdev->bdev, b),
411 (unsigned long long)r10_bio->sector);
412 set_bit(R10BIO_ReadError, &r10_bio->state);
413 reschedule_retry(r10_bio);
414 }
415}
416
417static void close_write(struct r10bio *r10_bio)
418{
419
420 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
421 r10_bio->sectors,
422 !test_bit(R10BIO_Degraded, &r10_bio->state),
423 0);
424 md_write_end(r10_bio->mddev);
425}
426
427static void one_write_done(struct r10bio *r10_bio)
428{
429 if (atomic_dec_and_test(&r10_bio->remaining)) {
430 if (test_bit(R10BIO_WriteError, &r10_bio->state))
431 reschedule_retry(r10_bio);
432 else {
433 close_write(r10_bio);
434 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
435 reschedule_retry(r10_bio);
436 else
437 raid_end_bio_io(r10_bio);
438 }
439 }
440}
441
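/*
 * Completion handler for a write to one copy (or to a replacement).
 * Write errors are recorded so raid10d can retry or mark bad blocks;
 * writes that land on known bad blocks are flagged IO_MADE_GOOD so the
 * bad-block record can be cleared from process context.
 */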
442static void raid10_end_write_request(struct bio *bio, int error)
443{
444 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
445 struct r10bio *r10_bio = bio->bi_private;
446 int dev;
447 int dec_rdev = 1;
448 struct r10conf *conf = r10_bio->mddev->private;
449 int slot, repl;
450 struct md_rdev *rdev = NULL;
451
452 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
453
454 if (repl)
455 rdev = conf->mirrors[dev].replacement;
456 if (!rdev) {
457 smp_rmb();
458 repl = 0;
459 rdev = conf->mirrors[dev].rdev;
460 }
461
462
463
464 if (!uptodate) {
465 if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
469 md_error(rdev->mddev, rdev);
470 else {
471 set_bit(WriteErrorSeen, &rdev->flags);
472 if (!test_and_set_bit(WantReplacement, &rdev->flags))
473 set_bit(MD_RECOVERY_NEEDED,
474 &rdev->mddev->recovery);
475 set_bit(R10BIO_WriteError, &r10_bio->state);
476 dec_rdev = 0;
477 }
478 } else {
479
480
481
482
483
484
485
486
487
488 sector_t first_bad;
489 int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty. This is because we cannot use
		 * such device for properly reading the data back (we could
		 * potentially use it, if the current write would have felt
		 * before rdev->recovery_offset, but for simplicity we don't
		 * check this here.)
		 */
499 if (test_bit(In_sync, &rdev->flags) &&
500 !test_bit(Faulty, &rdev->flags))
501 set_bit(R10BIO_Uptodate, &r10_bio->state);
502
503
504 if (is_badblock(rdev,
505 r10_bio->devs[slot].addr,
506 r10_bio->sectors,
507 &first_bad, &bad_sectors)) {
508 bio_put(bio);
509 if (repl)
510 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
511 else
512 r10_bio->devs[slot].bio = IO_MADE_GOOD;
513 dec_rdev = 0;
514 set_bit(R10BIO_MadeGood, &r10_bio->state);
515 }
516 }
517
518
519
520
521
522
523 one_write_done(r10_bio);
524 if (dec_rdev)
525 rdev_dec_pending(rdev, conf->mddev);
526}

/*
 * Map a virtual sector of the array to the set of (device, sector)
 * pairs that hold its copies, according to the geometry 'geo'.
 * For each of the near_copies adjacent devices a slot is filled in,
 * and for each of those a further far_copies - 1 slots point at the
 * rotated (or offset) copies further into each device, giving
 * conf->copies entries in r10bio->devs[] in total.
 */
553static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
554{
555 int n,f;
556 sector_t sector;
557 sector_t chunk;
558 sector_t stripe;
559 int dev;
560 int slot = 0;
561 int last_far_set_start, last_far_set_size;
562
563 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
564 last_far_set_start *= geo->far_set_size;
565
566 last_far_set_size = geo->far_set_size;
567 last_far_set_size += (geo->raid_disks % geo->far_set_size);
568
569
570 chunk = r10bio->sector >> geo->chunk_shift;
571 sector = r10bio->sector & geo->chunk_mask;
572
573 chunk *= geo->near_copies;
574 stripe = chunk;
575 dev = sector_div(stripe, geo->raid_disks);
576 if (geo->far_offset)
577 stripe *= geo->far_copies;
578
579 sector += stripe << geo->chunk_shift;
580
581
582 for (n = 0; n < geo->near_copies; n++) {
583 int d = dev;
584 int set;
585 sector_t s = sector;
586 r10bio->devs[slot].devnum = d;
587 r10bio->devs[slot].addr = s;
588 slot++;
589
590 for (f = 1; f < geo->far_copies; f++) {
591 set = d / geo->far_set_size;
592 d += geo->near_copies;
593
594 if ((geo->raid_disks % geo->far_set_size) &&
595 (d > last_far_set_start)) {
596 d -= last_far_set_start;
597 d %= last_far_set_size;
598 d += last_far_set_start;
599 } else {
600 d %= geo->far_set_size;
601 d += geo->far_set_size * set;
602 }
603 s += geo->stride;
604 r10bio->devs[slot].devnum = d;
605 r10bio->devs[slot].addr = s;
606 slot++;
607 }
608 dev++;
609 if (dev >= geo->raid_disks) {
610 dev = 0;
611 sector += (geo->chunk_mask + 1);
612 }
613 }
614}
615
616static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
617{
618 struct geom *geo = &conf->geo;
619
620 if (conf->reshape_progress != MaxSector &&
621 ((r10bio->sector >= conf->reshape_progress) !=
622 conf->mddev->reshape_backwards)) {
623 set_bit(R10BIO_Previous, &r10bio->state);
624 geo = &conf->prev;
625 } else
626 clear_bit(R10BIO_Previous, &r10bio->state);
627
628 __raid10_find_phys(geo, r10bio);
629}
630
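/*
 * Inverse of __raid10_find_phys: given a sector on device 'dev',
 * return the corresponding virtual sector in the array.
 */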
631static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
632{
633 sector_t offset, chunk, vchunk;
634
635
636
637 struct geom *geo = &conf->geo;
638 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
639 int far_set_size = geo->far_set_size;
640 int last_far_set_start;
641
642 if (geo->raid_disks % geo->far_set_size) {
643 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
644 last_far_set_start *= geo->far_set_size;
645
646 if (dev >= last_far_set_start) {
647 far_set_size = geo->far_set_size;
648 far_set_size += (geo->raid_disks % geo->far_set_size);
649 far_set_start = last_far_set_start;
650 }
651 }
652
653 offset = sector & geo->chunk_mask;
654 if (geo->far_offset) {
655 int fc;
656 chunk = sector >> geo->chunk_shift;
657 fc = sector_div(chunk, geo->far_copies);
658 dev -= fc * geo->near_copies;
659 if (dev < far_set_start)
660 dev += far_set_size;
661 } else {
662 while (sector >= geo->stride) {
663 sector -= geo->stride;
664 if (dev < (geo->near_copies + far_set_start))
665 dev += far_set_size - geo->near_copies;
666 else
667 dev -= geo->near_copies;
668 }
669 chunk = sector >> geo->chunk_shift;
670 }
671 vchunk = chunk * geo->raid_disks + dev;
672 sector_div(vchunk, geo->near_copies);
673 return (vchunk << geo->chunk_shift) + offset;
674}

/**
 *	raid10_mergeable_bvec -- tell bio layer if two requests can be merged
 *	@q: request queue
 *	@bvm: properties of new bio
 *	@biovec: the request that could be merged to it.
 *
 *	Return amount of bytes we can accept at this offset
 *	This requires checking for end-of-chunk if near_copies != raid_disks,
 *	and for subordinate merge_bvec_fns if merge_check_needed.
 */
686static int raid10_mergeable_bvec(struct request_queue *q,
687 struct bvec_merge_data *bvm,
688 struct bio_vec *biovec)
689{
690 struct mddev *mddev = q->queuedata;
691 struct r10conf *conf = mddev->private;
692 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
693 int max;
694 unsigned int chunk_sectors;
695 unsigned int bio_sectors = bvm->bi_size >> 9;
696 struct geom *geo = &conf->geo;
697
698 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
699 if (conf->reshape_progress != MaxSector &&
700 ((sector >= conf->reshape_progress) !=
701 conf->mddev->reshape_backwards))
702 geo = &conf->prev;
703
704 if (geo->near_copies < geo->raid_disks) {
705 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
706 + bio_sectors)) << 9;
707 if (max < 0)
708
709 max = 0;
710 if (max <= biovec->bv_len && bio_sectors == 0)
711 return biovec->bv_len;
712 } else
713 max = biovec->bv_len;
714
715 if (mddev->merge_check_needed) {
716 struct {
717 struct r10bio r10_bio;
718 struct r10dev devs[conf->copies];
719 } on_stack;
720 struct r10bio *r10_bio = &on_stack.r10_bio;
721 int s;
722 if (conf->reshape_progress != MaxSector) {
723
724 if (max <= biovec->bv_len && bio_sectors == 0)
725 return biovec->bv_len;
726 return 0;
727 }
728 r10_bio->sector = sector;
729 raid10_find_phys(conf, r10_bio);
730 rcu_read_lock();
731 for (s = 0; s < conf->copies; s++) {
732 int disk = r10_bio->devs[s].devnum;
733 struct md_rdev *rdev = rcu_dereference(
734 conf->mirrors[disk].rdev);
735 if (rdev && !test_bit(Faulty, &rdev->flags)) {
736 struct request_queue *q =
737 bdev_get_queue(rdev->bdev);
738 if (q->merge_bvec_fn) {
739 bvm->bi_sector = r10_bio->devs[s].addr
740 + rdev->data_offset;
741 bvm->bi_bdev = rdev->bdev;
742 max = min(max, q->merge_bvec_fn(
743 q, bvm, biovec));
744 }
745 }
746 rdev = rcu_dereference(conf->mirrors[disk].replacement);
747 if (rdev && !test_bit(Faulty, &rdev->flags)) {
748 struct request_queue *q =
749 bdev_get_queue(rdev->bdev);
750 if (q->merge_bvec_fn) {
751 bvm->bi_sector = r10_bio->devs[s].addr
752 + rdev->data_offset;
753 bvm->bi_bdev = rdev->bdev;
754 max = min(max, q->merge_bvec_fn(
755 q, bvm, biovec));
756 }
757 }
758 }
759 rcu_read_unlock();
760 }
761 return max;
762}

/*
 * This routine returns the disk from which the requested read should
 * be done.  We pick the copy whose recorded head position is closest
 * to the requested sector (or an idle device for 'near' layouts),
 * skipping copies that are faulty, still recovering, or that have a
 * bad block covering the start of the range.  *max_sectors is reduced
 * if only part of the range can be read from the chosen device.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
783static struct md_rdev *read_balance(struct r10conf *conf,
784 struct r10bio *r10_bio,
785 int *max_sectors)
786{
787 const sector_t this_sector = r10_bio->sector;
788 int disk, slot;
789 int sectors = r10_bio->sectors;
790 int best_good_sectors;
791 sector_t new_distance, best_dist;
792 struct md_rdev *best_rdev, *rdev = NULL;
793 int do_balance;
794 int best_slot;
795 struct geom *geo = &conf->geo;
796
797 raid10_find_phys(conf, r10_bio);
798 rcu_read_lock();
799retry:
800 sectors = r10_bio->sectors;
801 best_slot = -1;
802 best_rdev = NULL;
803 best_dist = MaxSector;
804 best_good_sectors = 0;
805 do_balance = 1;
806
807
808
809
810
811
812 if (conf->mddev->recovery_cp < MaxSector
813 && (this_sector + sectors >= conf->next_resync))
814 do_balance = 0;
815
816 for (slot = 0; slot < conf->copies ; slot++) {
817 sector_t first_bad;
818 int bad_sectors;
819 sector_t dev_sector;
820
821 if (r10_bio->devs[slot].bio == IO_BLOCKED)
822 continue;
823 disk = r10_bio->devs[slot].devnum;
824 rdev = rcu_dereference(conf->mirrors[disk].replacement);
825 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
826 test_bit(Unmerged, &rdev->flags) ||
827 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
828 rdev = rcu_dereference(conf->mirrors[disk].rdev);
829 if (rdev == NULL ||
830 test_bit(Faulty, &rdev->flags) ||
831 test_bit(Unmerged, &rdev->flags))
832 continue;
833 if (!test_bit(In_sync, &rdev->flags) &&
834 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
835 continue;
836
837 dev_sector = r10_bio->devs[slot].addr;
838 if (is_badblock(rdev, dev_sector, sectors,
839 &first_bad, &bad_sectors)) {
840 if (best_dist < MaxSector)
841
842 continue;
843 if (first_bad <= dev_sector) {
844
845
846
847
848 bad_sectors -= (dev_sector - first_bad);
849 if (!do_balance && sectors > bad_sectors)
850 sectors = bad_sectors;
851 if (best_good_sectors > sectors)
852 best_good_sectors = sectors;
853 } else {
854 sector_t good_sectors =
855 first_bad - dev_sector;
856 if (good_sectors > best_good_sectors) {
857 best_good_sectors = good_sectors;
858 best_slot = slot;
859 best_rdev = rdev;
860 }
861 if (!do_balance)
862
863 break;
864 }
865 continue;
866 } else
867 best_good_sectors = sectors;
868
869 if (!do_balance)
870 break;
871
872
873
874
875
876 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
877 break;
878
879
880 if (geo->far_copies > 1)
881 new_distance = r10_bio->devs[slot].addr;
882 else
883 new_distance = abs(r10_bio->devs[slot].addr -
884 conf->mirrors[disk].head_position);
885 if (new_distance < best_dist) {
886 best_dist = new_distance;
887 best_slot = slot;
888 best_rdev = rdev;
889 }
890 }
891 if (slot >= conf->copies) {
892 slot = best_slot;
893 rdev = best_rdev;
894 }
895
896 if (slot >= 0) {
897 atomic_inc(&rdev->nr_pending);
898 if (test_bit(Faulty, &rdev->flags)) {
899
900
901
902 rdev_dec_pending(rdev, conf->mddev);
903 goto retry;
904 }
905 r10_bio->read_slot = slot;
906 } else
907 rdev = NULL;
908 rcu_read_unlock();
909 *max_sectors = best_good_sectors;
910
911 return rdev;
912}
913
914int md_raid10_congested(struct mddev *mddev, int bits)
915{
916 struct r10conf *conf = mddev->private;
917 int i, ret = 0;
918
919 if ((bits & (1 << BDI_async_congested)) &&
920 conf->pending_count >= max_queued_requests)
921 return 1;
922
923 rcu_read_lock();
924 for (i = 0;
925 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
926 && ret == 0;
927 i++) {
928 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
929 if (rdev && !test_bit(Faulty, &rdev->flags)) {
930 struct request_queue *q = bdev_get_queue(rdev->bdev);
931
932 ret |= bdi_congested(&q->backing_dev_info, bits);
933 }
934 }
935 rcu_read_unlock();
936 return ret;
937}
938EXPORT_SYMBOL_GPL(md_raid10_congested);
939
940static int raid10_congested(void *data, int bits)
941{
942 struct mddev *mddev = data;
943
944 return mddev_congested(mddev, bits) ||
945 md_raid10_congested(mddev, bits);
946}
947
948static void flush_pending_writes(struct r10conf *conf)
949{
	/* Any writes that have been queued but are awaiting
	 * bitmap updates get flushed here.
	 */
953 spin_lock_irq(&conf->device_lock);
954
955 if (conf->pending_bio_list.head) {
956 struct bio *bio;
957 bio = bio_list_get(&conf->pending_bio_list);
958 conf->pending_count = 0;
959 spin_unlock_irq(&conf->device_lock);
960
961
962 bitmap_unplug(conf->mddev->bitmap);
963 wake_up(&conf->wait_barrier);
964
965 while (bio) {
966 struct bio *next = bio->bi_next;
967 bio->bi_next = NULL;
968 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
969 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
970
971 bio_endio(bio, 0);
972 else
973 generic_make_request(bio);
974 bio = next;
975 }
976 } else
977 spin_unlock_irq(&conf->device_lock);
978}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reshape.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO,
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which might alter the array will
 * be permitted.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO (particularly resync) must call raise_barrier,
 * clearing it when the relevant IO is complete.
 */
1002static void raise_barrier(struct r10conf *conf, int force)
1003{
1004 BUG_ON(force && !conf->barrier);
1005 spin_lock_irq(&conf->resync_lock);
1006
1007
1008 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
1009 conf->resync_lock);
1010
1011
1012 conf->barrier++;
1013
1014
1015 wait_event_lock_irq(conf->wait_barrier,
1016 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
1017 conf->resync_lock);
1018
1019 spin_unlock_irq(&conf->resync_lock);
1020}
1021
1022static void lower_barrier(struct r10conf *conf)
1023{
1024 unsigned long flags;
1025 spin_lock_irqsave(&conf->resync_lock, flags);
1026 conf->barrier--;
1027 spin_unlock_irqrestore(&conf->resync_lock, flags);
1028 wake_up(&conf->wait_barrier);
1029}
1030
1031static void wait_barrier(struct r10conf *conf)
1032{
1033 spin_lock_irq(&conf->resync_lock);
1034 if (conf->barrier) {
1035 conf->nr_waiting++;
		/* Wait for the barrier to drop.
		 * However if there are already pending
		 * requests (preventing the barrier from
		 * rising completely), and the
		 * pre-process bio queue isn't empty,
		 * then don't wait, as we need to empty
		 * that queue to get the nr_pending
		 * count down.
		 */
1045 wait_event_lock_irq(conf->wait_barrier,
1046 !conf->barrier ||
1047 (conf->nr_pending &&
1048 current->bio_list &&
1049 !bio_list_empty(current->bio_list)),
1050 conf->resync_lock);
1051 conf->nr_waiting--;
1052 }
1053 conf->nr_pending++;
1054 spin_unlock_irq(&conf->resync_lock);
1055}
1056
1057static void allow_barrier(struct r10conf *conf)
1058{
1059 unsigned long flags;
1060 spin_lock_irqsave(&conf->resync_lock, flags);
1061 conf->nr_pending--;
1062 spin_unlock_irqrestore(&conf->resync_lock, flags);
1063 wake_up(&conf->wait_barrier);
1064}
1065
1066static void freeze_array(struct r10conf *conf, int extra)
1067{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending match nr_queued+extra.
	 * This is called in the context of one normal IO request
	 * that has failed. Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
1080 spin_lock_irq(&conf->resync_lock);
1081 conf->barrier++;
1082 conf->nr_waiting++;
1083 wait_event_lock_irq_cmd(conf->wait_barrier,
1084 conf->nr_pending == conf->nr_queued+extra,
1085 conf->resync_lock,
1086 flush_pending_writes(conf));
1087
1088 spin_unlock_irq(&conf->resync_lock);
1089}
1090
1091static void unfreeze_array(struct r10conf *conf)
1092{
1093
1094 spin_lock_irq(&conf->resync_lock);
1095 conf->barrier--;
1096 conf->nr_waiting--;
1097 wake_up(&conf->wait_barrier);
1098 spin_unlock_irq(&conf->resync_lock);
1099}
1100
1101static sector_t choose_data_offset(struct r10bio *r10_bio,
1102 struct md_rdev *rdev)
1103{
1104 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1105 test_bit(R10BIO_Previous, &r10_bio->state))
1106 return rdev->data_offset;
1107 else
1108 return rdev->new_data_offset;
1109}
1110
1111struct raid10_plug_cb {
1112 struct blk_plug_cb cb;
1113 struct bio_list pending;
1114 int pending_cnt;
1115};
1116
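/*
 * Called when a plug is released: either submit the writes gathered on
 * the plug directly, or (from a scheduler context) hand them to raid10d
 * via conf->pending_bio_list.
 */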
1117static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1118{
1119 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1120 cb);
1121 struct mddev *mddev = plug->cb.data;
1122 struct r10conf *conf = mddev->private;
1123 struct bio *bio;
1124
1125 if (from_schedule || current->bio_list) {
1126 spin_lock_irq(&conf->device_lock);
1127 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1128 conf->pending_count += plug->pending_cnt;
1129 spin_unlock_irq(&conf->device_lock);
1130 wake_up(&conf->wait_barrier);
1131 md_wakeup_thread(mddev->thread);
1132 kfree(plug);
1133 return;
1134 }
1135
1136
1137 bio = bio_list_get(&plug->pending);
1138 bitmap_unplug(mddev->bitmap);
1139 wake_up(&conf->wait_barrier);
1140
1141 while (bio) {
1142 struct bio *next = bio->bi_next;
1143 bio->bi_next = NULL;
1144 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
1145 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
1146
1147 bio_endio(bio, 0);
1148 else
1149 generic_make_request(bio);
1150 bio = next;
1151 }
1152 kfree(plug);
1153}
1154
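/*
 * Handle one bio that is known not to cross a chunk boundary: issue the
 * read to the best available copy, or clone and queue a write for every
 * copy, splitting further if bad blocks limit what can be done at once.
 */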
1155static void __make_request(struct mddev *mddev, struct bio *bio)
1156{
1157 struct r10conf *conf = mddev->private;
1158 struct r10bio *r10_bio;
1159 struct bio *read_bio;
1160 int i;
1161 const int rw = bio_data_dir(bio);
1162 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1163 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1164 const unsigned long do_discard = (bio->bi_rw
1165 & (REQ_DISCARD | REQ_SECURE));
1166 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1167 unsigned long flags;
1168 struct md_rdev *blocked_rdev;
1169 struct blk_plug_cb *cb;
1170 struct raid10_plug_cb *plug = NULL;
1171 int sectors_handled;
1172 int max_sectors;
1173 int sectors;
1174

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
1180 wait_barrier(conf);
1181
1182 sectors = bio_sectors(bio);
1183 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1184 bio->bi_iter.bi_sector < conf->reshape_progress &&
1185 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
		/* IO spans the reshape position.  Need to wait for
		 * reshape to pass.
		 */
1189 allow_barrier(conf);
1190 wait_event(conf->wait_barrier,
1191 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1192 conf->reshape_progress >= bio->bi_iter.bi_sector +
1193 sectors);
1194 wait_barrier(conf);
1195 }
1196 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1197 bio_data_dir(bio) == WRITE &&
1198 (mddev->reshape_backwards
1199 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1200 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1201 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1202 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1203
1204 mddev->reshape_position = conf->reshape_progress;
1205 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1206 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1207 md_wakeup_thread(mddev->thread);
1208 wait_event(mddev->sb_wait,
1209 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1210
1211 conf->reshape_safe = mddev->reshape_position;
1212 }
1213
1214 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1215
1216 r10_bio->master_bio = bio;
1217 r10_bio->sectors = sectors;
1218
1219 r10_bio->mddev = mddev;
1220 r10_bio->sector = bio->bi_iter.bi_sector;
1221 r10_bio->state = 0;
1222
	/* We might need to issue multiple reads to different
	 * devices if there are bad blocks around, so we keep
	 * track of the number of reads in bio->bi_phys_segments.
	 * If this is 0, there is only one r10_bio and no locking
	 * will be needed when the request completes.  If it is
	 * non-zero, then it is the number of not-completed requests.
	 */
1230 bio->bi_phys_segments = 0;
1231 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1232
1233 if (rw == READ) {
		/*
		 * read balancing logic:
		 */
1237 struct md_rdev *rdev;
1238 int slot;
1239
1240read_again:
1241 rdev = read_balance(conf, r10_bio, &max_sectors);
1242 if (!rdev) {
1243 raid_end_bio_io(r10_bio);
1244 return;
1245 }
1246 slot = r10_bio->read_slot;
1247
1248 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1249 bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
1250 max_sectors);
1251
1252 r10_bio->devs[slot].bio = read_bio;
1253 r10_bio->devs[slot].rdev = rdev;
1254
1255 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1256 choose_data_offset(r10_bio, rdev);
1257 read_bio->bi_bdev = rdev->bdev;
1258 read_bio->bi_end_io = raid10_end_read_request;
1259 read_bio->bi_rw = READ | do_sync;
1260 read_bio->bi_private = r10_bio;
1261
1262 if (max_sectors < r10_bio->sectors) {
1263
1264
1265
1266 sectors_handled = (r10_bio->sector + max_sectors
1267 - bio->bi_iter.bi_sector);
1268 r10_bio->sectors = max_sectors;
1269 spin_lock_irq(&conf->device_lock);
1270 if (bio->bi_phys_segments == 0)
1271 bio->bi_phys_segments = 2;
1272 else
1273 bio->bi_phys_segments++;
1274 spin_unlock_irq(&conf->device_lock);
1275
			/* Cannot call generic_make_request directly
			 * as that will be queued in __generic_make_request
			 * and subsequent mempool_alloc might block
			 * waiting for it.  So hand the bio over to raid10d.
			 */
1280 reschedule_retry(r10_bio);
1281
1282 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1283
1284 r10_bio->master_bio = bio;
1285 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1286 r10_bio->state = 0;
1287 r10_bio->mddev = mddev;
1288 r10_bio->sector = bio->bi_iter.bi_sector +
1289 sectors_handled;
1290 goto read_again;
1291 } else
1292 generic_make_request(read_bio);
1293 return;
1294 }
1295
	/*
	 * WRITE:
	 */
1299 if (conf->pending_count >= max_queued_requests) {
1300 md_wakeup_thread(mddev->thread);
1301 wait_event(conf->wait_barrier,
1302 conf->pending_count < max_queued_requests);
1303 }

	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio.
	 * If there are known/acknowledged bad blocks on any device on
	 * which we have seen a write error, we want to avoid writing
	 * to those blocks.
	 * This potentially requires several writes to write around
	 * the bad blocks.  Separate r10bio's are used for this
	 * (counted via bio->bi_phys_segments, as for reads).
	 */
1316 r10_bio->read_slot = -1;
1317 raid10_find_phys(conf, r10_bio);
1318retry_write:
1319 blocked_rdev = NULL;
1320 rcu_read_lock();
1321 max_sectors = r10_bio->sectors;
1322
1323 for (i = 0; i < conf->copies; i++) {
1324 int d = r10_bio->devs[i].devnum;
1325 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1326 struct md_rdev *rrdev = rcu_dereference(
1327 conf->mirrors[d].replacement);
1328 if (rdev == rrdev)
1329 rrdev = NULL;
1330 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1331 atomic_inc(&rdev->nr_pending);
1332 blocked_rdev = rdev;
1333 break;
1334 }
1335 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1336 atomic_inc(&rrdev->nr_pending);
1337 blocked_rdev = rrdev;
1338 break;
1339 }
1340 if (rdev && (test_bit(Faulty, &rdev->flags)
1341 || test_bit(Unmerged, &rdev->flags)))
1342 rdev = NULL;
1343 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1344 || test_bit(Unmerged, &rrdev->flags)))
1345 rrdev = NULL;
1346
1347 r10_bio->devs[i].bio = NULL;
1348 r10_bio->devs[i].repl_bio = NULL;
1349
1350 if (!rdev && !rrdev) {
1351 set_bit(R10BIO_Degraded, &r10_bio->state);
1352 continue;
1353 }
1354 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1355 sector_t first_bad;
1356 sector_t dev_sector = r10_bio->devs[i].addr;
1357 int bad_sectors;
1358 int is_bad;
1359
1360 is_bad = is_badblock(rdev, dev_sector,
1361 max_sectors,
1362 &first_bad, &bad_sectors);
1363 if (is_bad < 0) {
1364
1365
1366
1367 atomic_inc(&rdev->nr_pending);
1368 set_bit(BlockedBadBlocks, &rdev->flags);
1369 blocked_rdev = rdev;
1370 break;
1371 }
1372 if (is_bad && first_bad <= dev_sector) {
1373
1374 bad_sectors -= (dev_sector - first_bad);
1375 if (bad_sectors < max_sectors)
1376
1377
1378
1379 max_sectors = bad_sectors;
1380
1381
1382
1383
1384
1385
1386
1387
1388 continue;
1389 }
1390 if (is_bad) {
1391 int good_sectors = first_bad - dev_sector;
1392 if (good_sectors < max_sectors)
1393 max_sectors = good_sectors;
1394 }
1395 }
1396 if (rdev) {
1397 r10_bio->devs[i].bio = bio;
1398 atomic_inc(&rdev->nr_pending);
1399 }
1400 if (rrdev) {
1401 r10_bio->devs[i].repl_bio = bio;
1402 atomic_inc(&rrdev->nr_pending);
1403 }
1404 }
1405 rcu_read_unlock();
1406
1407 if (unlikely(blocked_rdev)) {
1408
1409 int j;
1410 int d;
1411
1412 for (j = 0; j < i; j++) {
1413 if (r10_bio->devs[j].bio) {
1414 d = r10_bio->devs[j].devnum;
1415 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1416 }
1417 if (r10_bio->devs[j].repl_bio) {
1418 struct md_rdev *rdev;
1419 d = r10_bio->devs[j].devnum;
1420 rdev = conf->mirrors[d].replacement;
1421 if (!rdev) {
1422
1423 smp_mb();
1424 rdev = conf->mirrors[d].rdev;
1425 }
1426 rdev_dec_pending(rdev, mddev);
1427 }
1428 }
1429 allow_barrier(conf);
1430 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1431 wait_barrier(conf);
1432 goto retry_write;
1433 }
1434
1435 if (max_sectors < r10_bio->sectors) {
1436
1437
1438
1439 r10_bio->sectors = max_sectors;
1440 spin_lock_irq(&conf->device_lock);
1441 if (bio->bi_phys_segments == 0)
1442 bio->bi_phys_segments = 2;
1443 else
1444 bio->bi_phys_segments++;
1445 spin_unlock_irq(&conf->device_lock);
1446 }
1447 sectors_handled = r10_bio->sector + max_sectors -
1448 bio->bi_iter.bi_sector;
1449
1450 atomic_set(&r10_bio->remaining, 1);
1451 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1452
1453 for (i = 0; i < conf->copies; i++) {
1454 struct bio *mbio;
1455 int d = r10_bio->devs[i].devnum;
1456 if (r10_bio->devs[i].bio) {
1457 struct md_rdev *rdev = conf->mirrors[d].rdev;
1458 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1459 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1460 max_sectors);
1461 r10_bio->devs[i].bio = mbio;
1462
1463 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
1464 choose_data_offset(r10_bio,
1465 rdev));
1466 mbio->bi_bdev = rdev->bdev;
1467 mbio->bi_end_io = raid10_end_write_request;
1468 mbio->bi_rw =
1469 WRITE | do_sync | do_fua | do_discard | do_same;
1470 mbio->bi_private = r10_bio;
1471
1472 atomic_inc(&r10_bio->remaining);
1473
1474 cb = blk_check_plugged(raid10_unplug, mddev,
1475 sizeof(*plug));
1476 if (cb)
1477 plug = container_of(cb, struct raid10_plug_cb,
1478 cb);
1479 else
1480 plug = NULL;
1481 spin_lock_irqsave(&conf->device_lock, flags);
1482 if (plug) {
1483 bio_list_add(&plug->pending, mbio);
1484 plug->pending_cnt++;
1485 } else {
1486 bio_list_add(&conf->pending_bio_list, mbio);
1487 conf->pending_count++;
1488 }
1489 spin_unlock_irqrestore(&conf->device_lock, flags);
1490 if (!plug)
1491 md_wakeup_thread(mddev->thread);
1492 }
1493
1494 if (r10_bio->devs[i].repl_bio) {
1495 struct md_rdev *rdev = conf->mirrors[d].replacement;
1496 if (rdev == NULL) {
1497
1498 smp_mb();
1499 rdev = conf->mirrors[d].rdev;
1500 }
1501 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1502 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1503 max_sectors);
1504 r10_bio->devs[i].repl_bio = mbio;
1505
1506 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
1507 choose_data_offset(
1508 r10_bio, rdev));
1509 mbio->bi_bdev = rdev->bdev;
1510 mbio->bi_end_io = raid10_end_write_request;
1511 mbio->bi_rw =
1512 WRITE | do_sync | do_fua | do_discard | do_same;
1513 mbio->bi_private = r10_bio;
1514
1515 atomic_inc(&r10_bio->remaining);
1516 spin_lock_irqsave(&conf->device_lock, flags);
1517 bio_list_add(&conf->pending_bio_list, mbio);
1518 conf->pending_count++;
1519 spin_unlock_irqrestore(&conf->device_lock, flags);
1520 if (!mddev_check_plugged(mddev))
1521 md_wakeup_thread(mddev->thread);
1522 }
1523 }
1524
1525
1526
1527
1528
1529 if (sectors_handled < bio_sectors(bio)) {
1530 one_write_done(r10_bio);
1531
1532
1533
1534 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1535
1536 r10_bio->master_bio = bio;
1537 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1538
1539 r10_bio->mddev = mddev;
1540 r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1541 r10_bio->state = 0;
1542 goto retry_write;
1543 }
1544 one_write_done(r10_bio);
1545}
1546
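/*
 * Entry point for all requests: flushes are handled by md, and other
 * bios are split at chunk boundaries (where the geometry requires it)
 * before being passed to __make_request().
 */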
1547static void make_request(struct mddev *mddev, struct bio *bio)
1548{
1549 struct r10conf *conf = mddev->private;
1550 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1551 int chunk_sects = chunk_mask + 1;
1552
1553 struct bio *split;
1554
1555 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1556 md_flush_request(mddev, bio);
1557 return;
1558 }
1559
1560 md_write_start(mddev, bio);
1561
1562
1563 do {
1564
1565
1566
1567
1568
1569 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1570 bio_sectors(bio) > chunk_sects
1571 && (conf->geo.near_copies < conf->geo.raid_disks
1572 || conf->prev.near_copies <
1573 conf->prev.raid_disks))) {
1574 split = bio_split(bio, chunk_sects -
1575 (bio->bi_iter.bi_sector &
1576 (chunk_sects - 1)),
1577 GFP_NOIO, fs_bio_set);
1578 bio_chain(split, bio);
1579 } else {
1580 split = bio;
1581 }
1582
1583 __make_request(mddev, split);
1584 } while (split != bio);
1585
1586
1587 wake_up(&conf->wait_barrier);
1588}
1589
1590static void status(struct seq_file *seq, struct mddev *mddev)
1591{
1592 struct r10conf *conf = mddev->private;
1593 int i;
1594
1595 if (conf->geo.near_copies < conf->geo.raid_disks)
1596 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1597 if (conf->geo.near_copies > 1)
1598 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1599 if (conf->geo.far_copies > 1) {
1600 if (conf->geo.far_offset)
1601 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1602 else
1603 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1604 }
1605 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1606 conf->geo.raid_disks - mddev->degraded);
1607 for (i = 0; i < conf->geo.raid_disks; i++)
1608 seq_printf(seq, "%s",
1609 conf->mirrors[i].rdev &&
1610 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1611 seq_printf(seq, "]");
1612}

/*
 * Return 1 if every block in the array would still have at least one
 * In_sync copy if the device numbered 'ignore' were removed.
 * 'previous' selects the old geometry during a reshape.
 */
1619static int _enough(struct r10conf *conf, int previous, int ignore)
1620{
1621 int first = 0;
1622 int has_enough = 0;
1623 int disks, ncopies;
1624 if (previous) {
1625 disks = conf->prev.raid_disks;
1626 ncopies = conf->prev.near_copies;
1627 } else {
1628 disks = conf->geo.raid_disks;
1629 ncopies = conf->geo.near_copies;
1630 }
1631
1632 rcu_read_lock();
1633 do {
1634 int n = conf->copies;
1635 int cnt = 0;
1636 int this = first;
1637 while (n--) {
1638 struct md_rdev *rdev;
1639 if (this != ignore &&
1640 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1641 test_bit(In_sync, &rdev->flags))
1642 cnt++;
1643 this = (this+1) % disks;
1644 }
1645 if (cnt == 0)
1646 goto out;
1647 first = (first + ncopies) % disks;
1648 } while (first != 0);
1649 has_enough = 1;
1650out:
1651 rcu_read_unlock();
1652 return has_enough;
1653}
1654
1655static int enough(struct r10conf *conf, int ignore)
1656{
1657
1658
1659
1660
1661
1662 return _enough(conf, 0, ignore) &&
1663 _enough(conf, 1, ignore);
1664}
1665
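/*
 * Mark a device as failed, unless it is the last In_sync device whose
 * loss would leave some data with no usable copy.
 */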
1666static void error(struct mddev *mddev, struct md_rdev *rdev)
1667{
1668 char b[BDEVNAME_SIZE];
1669 struct r10conf *conf = mddev->private;
1670 unsigned long flags;
1671
1672
1673
1674
1675
1676
1677
1678 spin_lock_irqsave(&conf->device_lock, flags);
1679 if (test_bit(In_sync, &rdev->flags)
1680 && !enough(conf, rdev->raid_disk)) {
1681
1682
1683
1684 spin_unlock_irqrestore(&conf->device_lock, flags);
1685 return;
1686 }
1687 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1688 mddev->degraded++;
1689
1690
1691
1692 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1693 }
1694 set_bit(Blocked, &rdev->flags);
1695 set_bit(Faulty, &rdev->flags);
1696 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1697 spin_unlock_irqrestore(&conf->device_lock, flags);
1698 printk(KERN_ALERT
1699 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1700 "md/raid10:%s: Operation continuing on %d devices.\n",
1701 mdname(mddev), bdevname(rdev->bdev, b),
1702 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1703}
1704
1705static void print_conf(struct r10conf *conf)
1706{
1707 int i;
1708 struct raid10_info *tmp;
1709
1710 printk(KERN_DEBUG "RAID10 conf printout:\n");
1711 if (!conf) {
1712 printk(KERN_DEBUG "(!conf)\n");
1713 return;
1714 }
1715 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1716 conf->geo.raid_disks);
1717
1718 for (i = 0; i < conf->geo.raid_disks; i++) {
1719 char b[BDEVNAME_SIZE];
1720 tmp = conf->mirrors + i;
1721 if (tmp->rdev)
1722 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1723 i, !test_bit(In_sync, &tmp->rdev->flags),
1724 !test_bit(Faulty, &tmp->rdev->flags),
1725 bdevname(tmp->rdev->bdev,b));
1726 }
1727}
1728
1729static void close_sync(struct r10conf *conf)
1730{
1731 wait_barrier(conf);
1732 allow_barrier(conf);
1733
1734 mempool_destroy(conf->r10buf_pool);
1735 conf->r10buf_pool = NULL;
1736}
1737
1738static int raid10_spare_active(struct mddev *mddev)
1739{
1740 int i;
1741 struct r10conf *conf = mddev->private;
1742 struct raid10_info *tmp;
1743 int count = 0;
1744 unsigned long flags;
1745
	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync.
	 */
1750 for (i = 0; i < conf->geo.raid_disks; i++) {
1751 tmp = conf->mirrors + i;
1752 if (tmp->replacement
1753 && tmp->replacement->recovery_offset == MaxSector
1754 && !test_bit(Faulty, &tmp->replacement->flags)
1755 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1756
1757 if (!tmp->rdev
1758 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1759 count++;
1760 if (tmp->rdev) {
1761
1762
1763
1764
1765 set_bit(Faulty, &tmp->rdev->flags);
1766 sysfs_notify_dirent_safe(
1767 tmp->rdev->sysfs_state);
1768 }
1769 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1770 } else if (tmp->rdev
1771 && tmp->rdev->recovery_offset == MaxSector
1772 && !test_bit(Faulty, &tmp->rdev->flags)
1773 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1774 count++;
1775 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1776 }
1777 }
1778 spin_lock_irqsave(&conf->device_lock, flags);
1779 mddev->degraded -= count;
1780 spin_unlock_irqrestore(&conf->device_lock, flags);
1781
1782 print_conf(conf);
1783 return count;
1784}
1785
1786
1787static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1788{
1789 struct r10conf *conf = mddev->private;
1790 int err = -EEXIST;
1791 int mirror;
1792 int first = 0;
1793 int last = conf->geo.raid_disks - 1;
1794 struct request_queue *q = bdev_get_queue(rdev->bdev);
1795
1796 if (mddev->recovery_cp < MaxSector)
1797
1798
1799
1800 return -EBUSY;
1801 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1802 return -EINVAL;
1803
1804 if (rdev->raid_disk >= 0)
1805 first = last = rdev->raid_disk;
1806
1807 if (q->merge_bvec_fn) {
1808 set_bit(Unmerged, &rdev->flags);
1809 mddev->merge_check_needed = 1;
1810 }
1811
1812 if (rdev->saved_raid_disk >= first &&
1813 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1814 mirror = rdev->saved_raid_disk;
1815 else
1816 mirror = first;
1817 for ( ; mirror <= last ; mirror++) {
1818 struct raid10_info *p = &conf->mirrors[mirror];
1819 if (p->recovery_disabled == mddev->recovery_disabled)
1820 continue;
1821 if (p->rdev) {
1822 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1823 p->replacement != NULL)
1824 continue;
1825 clear_bit(In_sync, &rdev->flags);
1826 set_bit(Replacement, &rdev->flags);
1827 rdev->raid_disk = mirror;
1828 err = 0;
1829 if (mddev->gendisk)
1830 disk_stack_limits(mddev->gendisk, rdev->bdev,
1831 rdev->data_offset << 9);
1832 conf->fullsync = 1;
1833 rcu_assign_pointer(p->replacement, rdev);
1834 break;
1835 }
1836
1837 if (mddev->gendisk)
1838 disk_stack_limits(mddev->gendisk, rdev->bdev,
1839 rdev->data_offset << 9);
1840
1841 p->head_position = 0;
1842 p->recovery_disabled = mddev->recovery_disabled - 1;
1843 rdev->raid_disk = mirror;
1844 err = 0;
1845 if (rdev->saved_raid_disk != mirror)
1846 conf->fullsync = 1;
1847 rcu_assign_pointer(p->rdev, rdev);
1848 break;
1849 }
1850 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1851
1852
1853
1854
1855
1856
1857
1858 synchronize_sched();
1859 freeze_array(conf, 0);
1860 unfreeze_array(conf);
1861 clear_bit(Unmerged, &rdev->flags);
1862 }
1863 md_integrity_add_rdev(rdev, mddev);
1864 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1865 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1866
1867 print_conf(conf);
1868 return err;
1869}
1870
1871static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1872{
1873 struct r10conf *conf = mddev->private;
1874 int err = 0;
1875 int number = rdev->raid_disk;
1876 struct md_rdev **rdevp;
1877 struct raid10_info *p = conf->mirrors + number;
1878
1879 print_conf(conf);
1880 if (rdev == p->rdev)
1881 rdevp = &p->rdev;
1882 else if (rdev == p->replacement)
1883 rdevp = &p->replacement;
1884 else
1885 return 0;
1886
1887 if (test_bit(In_sync, &rdev->flags) ||
1888 atomic_read(&rdev->nr_pending)) {
1889 err = -EBUSY;
1890 goto abort;
1891 }
1892
1893
1894
1895 if (!test_bit(Faulty, &rdev->flags) &&
1896 mddev->recovery_disabled != p->recovery_disabled &&
1897 (!p->replacement || p->replacement == rdev) &&
1898 number < conf->geo.raid_disks &&
1899 enough(conf, -1)) {
1900 err = -EBUSY;
1901 goto abort;
1902 }
1903 *rdevp = NULL;
1904 synchronize_rcu();
1905 if (atomic_read(&rdev->nr_pending)) {
1906
1907 err = -EBUSY;
1908 *rdevp = rdev;
1909 goto abort;
1910 } else if (p->replacement) {
1911
1912 p->rdev = p->replacement;
1913 clear_bit(Replacement, &p->replacement->flags);
1914 smp_mb();
1915
1916
1917 p->replacement = NULL;
1918 clear_bit(WantReplacement, &rdev->flags);
1919 } else
1920
1921
1922
1923 clear_bit(WantReplacement, &rdev->flags);
1924
1925 err = md_integrity_register(mddev);
1926
1927abort:
1928
1929 print_conf(conf);
1930 return err;
1931}
1932
1933
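/*
 * Completion handler for the reads issued during resync/recovery.
 */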
1934static void end_sync_read(struct bio *bio, int error)
1935{
1936 struct r10bio *r10_bio = bio->bi_private;
1937 struct r10conf *conf = r10_bio->mddev->private;
1938 int d;
1939
1940 if (bio == r10_bio->master_bio) {
1941
1942 d = r10_bio->read_slot;
1943 } else
1944 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1945
1946 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1947 set_bit(R10BIO_Uptodate, &r10_bio->state);
1948 else
1949
1950
1951
1952 atomic_add(r10_bio->sectors,
1953 &conf->mirrors[d].rdev->corrected_errors);
1954
1955
1956
1957
1958 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1959 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1960 atomic_dec_and_test(&r10_bio->remaining)) {
1961
1962
1963
1964 reschedule_retry(r10_bio);
1965 }
1966}
1967
1968static void end_sync_request(struct r10bio *r10_bio)
1969{
1970 struct mddev *mddev = r10_bio->mddev;
1971
1972 while (atomic_dec_and_test(&r10_bio->remaining)) {
1973 if (r10_bio->master_bio == NULL) {
1974
1975 sector_t s = r10_bio->sectors;
1976 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1977 test_bit(R10BIO_WriteError, &r10_bio->state))
1978 reschedule_retry(r10_bio);
1979 else
1980 put_buf(r10_bio);
1981 md_done_sync(mddev, s, 1);
1982 break;
1983 } else {
1984 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1985 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1986 test_bit(R10BIO_WriteError, &r10_bio->state))
1987 reschedule_retry(r10_bio);
1988 else
1989 put_buf(r10_bio);
1990 r10_bio = r10_bio2;
1991 }
1992 }
1993}
1994
1995static void end_sync_write(struct bio *bio, int error)
1996{
1997 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1998 struct r10bio *r10_bio = bio->bi_private;
1999 struct mddev *mddev = r10_bio->mddev;
2000 struct r10conf *conf = mddev->private;
2001 int d;
2002 sector_t first_bad;
2003 int bad_sectors;
2004 int slot;
2005 int repl;
2006 struct md_rdev *rdev = NULL;
2007
2008 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
2009 if (repl)
2010 rdev = conf->mirrors[d].replacement;
2011 else
2012 rdev = conf->mirrors[d].rdev;
2013
2014 if (!uptodate) {
2015 if (repl)
2016 md_error(mddev, rdev);
2017 else {
2018 set_bit(WriteErrorSeen, &rdev->flags);
2019 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2020 set_bit(MD_RECOVERY_NEEDED,
2021 &rdev->mddev->recovery);
2022 set_bit(R10BIO_WriteError, &r10_bio->state);
2023 }
2024 } else if (is_badblock(rdev,
2025 r10_bio->devs[slot].addr,
2026 r10_bio->sectors,
2027 &first_bad, &bad_sectors))
2028 set_bit(R10BIO_MadeGood, &r10_bio->state);
2029
2030 rdev_dec_pending(rdev, mddev);
2031
2032 end_sync_request(r10_bio);
2033}

/*
 * Note: sync and recover are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical addresses, so we need to calculate
 * the physical addresses here.
 *
 * After reading, each copy is compared against the first up-to-date
 * copy; any copy that differs (or whose read failed) is rewritten
 * from that first copy, and replacement devices are always written.
 */
2051static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2052{
2053 struct r10conf *conf = mddev->private;
2054 int i, first;
2055 struct bio *tbio, *fbio;
2056 int vcnt;
2057
2058 atomic_set(&r10_bio->remaining, 1);
2059
2060
2061 for (i=0; i<conf->copies; i++)
2062 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
2063 break;
2064
2065 if (i == conf->copies)
2066 goto done;
2067
2068 first = i;
2069 fbio = r10_bio->devs[i].bio;
2070
2071 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2072
2073 for (i=0 ; i < conf->copies ; i++) {
2074 int j, d;
2075
2076 tbio = r10_bio->devs[i].bio;
2077
2078 if (tbio->bi_end_io != end_sync_read)
2079 continue;
2080 if (i == first)
2081 continue;
2082 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
2083
2084
2085
2086
2087 int sectors = r10_bio->sectors;
2088 for (j = 0; j < vcnt; j++) {
2089 int len = PAGE_SIZE;
2090 if (sectors < (len / 512))
2091 len = sectors * 512;
2092 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2093 page_address(tbio->bi_io_vec[j].bv_page),
2094 len))
2095 break;
2096 sectors -= len/512;
2097 }
2098 if (j == vcnt)
2099 continue;
2100 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2101 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2102
2103 continue;
2104 }
2105
2106
2107
2108
2109
2110 bio_reset(tbio);
2111
2112 tbio->bi_vcnt = vcnt;
2113 tbio->bi_iter.bi_size = r10_bio->sectors << 9;
2114 tbio->bi_rw = WRITE;
2115 tbio->bi_private = r10_bio;
2116 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2117
2118 for (j=0; j < vcnt ; j++) {
2119 tbio->bi_io_vec[j].bv_offset = 0;
2120 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
2121
2122 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2123 page_address(fbio->bi_io_vec[j].bv_page),
2124 PAGE_SIZE);
2125 }
2126 tbio->bi_end_io = end_sync_write;
2127
2128 d = r10_bio->devs[i].devnum;
2129 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2130 atomic_inc(&r10_bio->remaining);
2131 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2132
2133 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2134 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2135 generic_make_request(tbio);
2136 }
2137
2138
2139
2140
2141 for (i = 0; i < conf->copies; i++) {
2142 int j, d;
2143
2144 tbio = r10_bio->devs[i].repl_bio;
2145 if (!tbio || !tbio->bi_end_io)
2146 continue;
2147 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2148 && r10_bio->devs[i].bio != fbio)
2149 for (j = 0; j < vcnt; j++)
2150 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2151 page_address(fbio->bi_io_vec[j].bv_page),
2152 PAGE_SIZE);
2153 d = r10_bio->devs[i].devnum;
2154 atomic_inc(&r10_bio->remaining);
2155 md_sync_acct(conf->mirrors[d].replacement->bdev,
2156 bio_sectors(tbio));
2157 generic_make_request(tbio);
2158 }
2159
2160done:
2161 if (atomic_dec_and_test(&r10_bio->remaining)) {
2162 md_done_sync(mddev, r10_bio->sectors, 1);
2163 put_buf(r10_bio);
2164 }
2165}

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 *
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virtual
 * address.  There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use: the first for reading,
 * the second for writing.
 */
2177static void fix_recovery_read_error(struct r10bio *r10_bio)
2178{
2179
2180
2181
2182
2183
2184
2185
2186 struct mddev *mddev = r10_bio->mddev;
2187 struct r10conf *conf = mddev->private;
2188 struct bio *bio = r10_bio->devs[0].bio;
2189 sector_t sect = 0;
2190 int sectors = r10_bio->sectors;
2191 int idx = 0;
2192 int dr = r10_bio->devs[0].devnum;
2193 int dw = r10_bio->devs[1].devnum;
2194
2195 while (sectors) {
2196 int s = sectors;
2197 struct md_rdev *rdev;
2198 sector_t addr;
2199 int ok;
2200
2201 if (s > (PAGE_SIZE>>9))
2202 s = PAGE_SIZE >> 9;
2203
2204 rdev = conf->mirrors[dr].rdev;
2205 addr = r10_bio->devs[0].addr + sect,
2206 ok = sync_page_io(rdev,
2207 addr,
2208 s << 9,
2209 bio->bi_io_vec[idx].bv_page,
2210 READ, false);
2211 if (ok) {
2212 rdev = conf->mirrors[dw].rdev;
2213 addr = r10_bio->devs[1].addr + sect;
2214 ok = sync_page_io(rdev,
2215 addr,
2216 s << 9,
2217 bio->bi_io_vec[idx].bv_page,
2218 WRITE, false);
2219 if (!ok) {
2220 set_bit(WriteErrorSeen, &rdev->flags);
2221 if (!test_and_set_bit(WantReplacement,
2222 &rdev->flags))
2223 set_bit(MD_RECOVERY_NEEDED,
2224 &rdev->mddev->recovery);
2225 }
2226 }
2227 if (!ok) {
2228
2229
2230
2231
2232 rdev_set_badblocks(rdev, addr, s, 0);
2233
2234 if (rdev != conf->mirrors[dw].rdev) {
2235
2236 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2237 addr = r10_bio->devs[1].addr + sect;
2238 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2239 if (!ok) {
2240
2241 printk(KERN_NOTICE
2242 "md/raid10:%s: recovery aborted"
2243 " due to read error\n",
2244 mdname(mddev));
2245
2246 conf->mirrors[dw].recovery_disabled
2247 = mddev->recovery_disabled;
2248 set_bit(MD_RECOVERY_INTR,
2249 &mddev->recovery);
2250 break;
2251 }
2252 }
2253 }
2254
2255 sectors -= s;
2256 sect += s;
2257 idx++;
2258 }
2259}
2260
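/*
 * Called by raid10d for a recovery r10_bio: the block has been read from
 * a working device (slot 0); write it out to the device being recovered
 * (slot 1) and to its replacement, if any.
 */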
2261static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2262{
2263 struct r10conf *conf = mddev->private;
2264 int d;
2265 struct bio *wbio, *wbio2;
2266
2267 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2268 fix_recovery_read_error(r10_bio);
2269 end_sync_request(r10_bio);
2270 return;
2271 }
2272
2273
2274
2275
2276
2277 d = r10_bio->devs[1].devnum;
2278 wbio = r10_bio->devs[1].bio;
2279 wbio2 = r10_bio->devs[1].repl_bio;
2280
2281
2282
2283
2284 if (wbio2 && !wbio2->bi_end_io)
2285 wbio2 = NULL;
2286 if (wbio->bi_end_io) {
2287 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2288 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2289 generic_make_request(wbio);
2290 }
2291 if (wbio2) {
2292 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2293 md_sync_acct(conf->mirrors[d].replacement->bdev,
2294 bio_sectors(wbio2));
2295 generic_make_request(wbio2);
2296 }
2297}

/*
 * Used by fix_read_error() to decay the per-rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
2306static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2307{
2308 struct timespec cur_time_mon;
2309 unsigned long hours_since_last;
2310 unsigned int read_errors = atomic_read(&rdev->read_errors);
2311
2312 ktime_get_ts(&cur_time_mon);
2313
2314 if (rdev->last_read_error.tv_sec == 0 &&
2315 rdev->last_read_error.tv_nsec == 0) {
2316
2317 rdev->last_read_error = cur_time_mon;
2318 return;
2319 }
2320
2321 hours_since_last = (cur_time_mon.tv_sec -
2322 rdev->last_read_error.tv_sec) / 3600;
2323
2324 rdev->last_read_error = cur_time_mon;
2325
2326
2327
2328
2329
2330
2331 if (hours_since_last >= 8 * sizeof(read_errors))
2332 atomic_set(&rdev->read_errors, 0);
2333 else
2334 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2335}
2336
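/*
 * Synchronously read or write one range on an rdev, avoiding known bad
 * blocks.  Returns 1 on success, 0 on I/O error (after recording a bad
 * block or failing the device), and -1 if the range overlaps a bad block.
 */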
2337static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2338 int sectors, struct page *page, int rw)
2339{
2340 sector_t first_bad;
2341 int bad_sectors;
2342
2343 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2344 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2345 return -1;
2346 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2347
2348 return 1;
2349 if (rw == WRITE) {
2350 set_bit(WriteErrorSeen, &rdev->flags);
2351 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2352 set_bit(MD_RECOVERY_NEEDED,
2353 &rdev->mddev->recovery);
2354 }
2355
2356 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2357 md_error(rdev->mddev, rdev);
2358 return 0;
2359}

/*
 * Try to repair a read error by re-reading the failing range from other
 * in-sync copies and, when a good copy is found, re-writing it over the
 * bad copies (recording bad blocks or failing devices when that fails).
 */
2369static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2370{
2371 int sect = 0;
2372 int sectors = r10_bio->sectors;
2373 struct md_rdev*rdev;
2374 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2375 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2376
2377
2378
2379
2380 rdev = conf->mirrors[d].rdev;
2381
2382 if (test_bit(Faulty, &rdev->flags))
2383
2384
2385 return;
2386
2387 check_decay_read_errors(mddev, rdev);
2388 atomic_inc(&rdev->read_errors);
2389 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2390 char b[BDEVNAME_SIZE];
2391 bdevname(rdev->bdev, b);
2392
2393 printk(KERN_NOTICE
2394 "md/raid10:%s: %s: Raid device exceeded "
2395 "read_error threshold [cur %d:max %d]\n",
2396 mdname(mddev), b,
2397 atomic_read(&rdev->read_errors), max_read_errors);
2398 printk(KERN_NOTICE
2399 "md/raid10:%s: %s: Failing raid device\n",
2400 mdname(mddev), b);
2401 md_error(mddev, conf->mirrors[d].rdev);
2402 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2403 return;
2404 }
2405
2406 while(sectors) {
2407 int s = sectors;
2408 int sl = r10_bio->read_slot;
2409 int success = 0;
2410 int start;
2411
2412 if (s > (PAGE_SIZE>>9))
2413 s = PAGE_SIZE >> 9;
2414
2415 rcu_read_lock();
2416 do {
2417 sector_t first_bad;
2418 int bad_sectors;
2419
2420 d = r10_bio->devs[sl].devnum;
2421 rdev = rcu_dereference(conf->mirrors[d].rdev);
2422 if (rdev &&
2423 !test_bit(Unmerged, &rdev->flags) &&
2424 test_bit(In_sync, &rdev->flags) &&
2425 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2426 &first_bad, &bad_sectors) == 0) {
2427 atomic_inc(&rdev->nr_pending);
2428 rcu_read_unlock();
2429 success = sync_page_io(rdev,
2430 r10_bio->devs[sl].addr +
2431 sect,
2432 s<<9,
2433 conf->tmppage, READ, false);
2434 rdev_dec_pending(rdev, mddev);
2435 rcu_read_lock();
2436 if (success)
2437 break;
2438 }
2439 sl++;
2440 if (sl == conf->copies)
2441 sl = 0;
2442 } while (!success && sl != r10_bio->read_slot);
2443 rcu_read_unlock();
2444
2445 if (!success) {
2446
2447
2448
2449
2450 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2451 rdev = conf->mirrors[dn].rdev;
2452
2453 if (!rdev_set_badblocks(
2454 rdev,
2455 r10_bio->devs[r10_bio->read_slot].addr
2456 + sect,
2457 s, 0)) {
2458 md_error(mddev, rdev);
2459 r10_bio->devs[r10_bio->read_slot].bio
2460 = IO_BLOCKED;
2461 }
2462 break;
2463 }
2464
2465 start = sl;
2466
2467 rcu_read_lock();
2468 while (sl != r10_bio->read_slot) {
2469 char b[BDEVNAME_SIZE];
2470
2471 if (sl==0)
2472 sl = conf->copies;
2473 sl--;
2474 d = r10_bio->devs[sl].devnum;
2475 rdev = rcu_dereference(conf->mirrors[d].rdev);
2476 if (!rdev ||
2477 test_bit(Unmerged, &rdev->flags) ||
2478 !test_bit(In_sync, &rdev->flags))
2479 continue;
2480
2481 atomic_inc(&rdev->nr_pending);
2482 rcu_read_unlock();
2483 if (r10_sync_page_io(rdev,
2484 r10_bio->devs[sl].addr +
2485 sect,
2486 s, conf->tmppage, WRITE)
2487 == 0) {
2488 /* the corrective write failed on this device */
2489 printk(KERN_NOTICE
2490 "md/raid10:%s: read correction "
2491 "write failed"
2492 " (%d sectors at %llu on %s)\n",
2493 mdname(mddev), s,
2494 (unsigned long long)(
2495 sect +
2496 choose_data_offset(r10_bio,
2497 rdev)),
2498 bdevname(rdev->bdev, b));
2499 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2500 "drive\n",
2501 mdname(mddev),
2502 bdevname(rdev->bdev, b));
2503 }
2504 rdev_dec_pending(rdev, mddev);
2505 rcu_read_lock();
2506 }
2507 sl = start;
2508 while (sl != r10_bio->read_slot) {
2509 char b[BDEVNAME_SIZE];
2510
2511 if (sl == 0)
2512 sl = conf->copies;
2513 sl--;
2514 d = r10_bio->devs[sl].devnum;
2515 rdev = rcu_dereference(conf->mirrors[d].rdev);
2516 if (!rdev ||
2517 !test_bit(In_sync, &rdev->flags))
2518 continue;
2519
2520 atomic_inc(&rdev->nr_pending);
2521 rcu_read_unlock();
2522 switch (r10_sync_page_io(rdev,
2523 r10_bio->devs[sl].addr +
2524 sect,
2525 s, conf->tmppage,
2526 READ)) {
2527 case 0:
2528 /* the re-read of the corrected area failed on this device */
2529 printk(KERN_NOTICE
2530 "md/raid10:%s: unable to read back "
2531 "corrected sectors"
2532 " (%d sectors at %llu on %s)\n",
2533 mdname(mddev), s,
2534 (unsigned long long)(
2535 sect +
2536 choose_data_offset(r10_bio, rdev)),
2537 bdevname(rdev->bdev, b));
2538 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2539 "drive\n",
2540 mdname(mddev),
2541 bdevname(rdev->bdev, b));
2542 break;
2543 case 1:
2544 printk(KERN_INFO
2545 "md/raid10:%s: read error corrected"
2546 " (%d sectors at %llu on %s)\n",
2547 mdname(mddev), s,
2548 (unsigned long long)(
2549 sect +
2550 choose_data_offset(r10_bio, rdev)),
2551 bdevname(rdev->bdev, b));
2552 atomic_add(s, &rdev->corrected_errors);
2553 }
2554
2555 rdev_dec_pending(rdev, mddev);
2556 rcu_read_lock();
2557 }
2558 rcu_read_unlock();
2559
2560 sectors -= s;
2561 sect += s;
2562 }
2563}
2564
2565static int narrow_write_error(struct r10bio *r10_bio, int i)
2566{
2567 struct bio *bio = r10_bio->master_bio;
2568 struct mddev *mddev = r10_bio->mddev;
2569 struct r10conf *conf = mddev->private;
2570 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
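/* A write to slot 'i' failed. Clone the master bio one badblock-sized
 * chunk at a time and retry each chunk; any chunk that still fails is
 * recorded as a bad block on this device.
 */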
2582 int block_sectors;
2583 sector_t sector;
2584 int sectors;
2585 int sect_to_write = r10_bio->sectors;
2586 int ok = 1;
2587
2588 if (rdev->badblocks.shift < 0)
2589 return 0;
2590
2591 block_sectors = 1 << rdev->badblocks.shift;
2592 sector = r10_bio->sector;
2593 sectors = ((r10_bio->sector + block_sectors)
2594 & ~(sector_t)(block_sectors - 1))
2595 - sector;
2596
2597 while (sect_to_write) {
2598 struct bio *wbio;
2599 if (sectors > sect_to_write)
2600 sectors = sect_to_write;
2601
2602 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2603 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2604 wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
2605 choose_data_offset(r10_bio, rdev) +
2606 (sector - r10_bio->sector));
2607 wbio->bi_bdev = rdev->bdev;
2608 if (submit_bio_wait(WRITE, wbio) == 0)
2609
2610 ok = rdev_set_badblocks(rdev, sector,
2611 sectors, 0)
2612 && ok;
2613
2614 bio_put(wbio);
2615 sect_to_write -= sectors;
2616 sector += sectors;
2617 sectors = block_sectors;
2618 }
2619 return ok;
2620}
2621
2622static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2623{
2624 int slot = r10_bio->read_slot;
2625 struct bio *bio;
2626 struct r10conf *conf = mddev->private;
2627 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2628 char b[BDEVNAME_SIZE];
2629 unsigned long do_sync;
2630 int max_sectors;
2631
2632
2633
2634
2635
2636
2637
2638
2639
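/* A read failed. If the array is writable, freeze it and let
 * fix_read_error() try to repair the block from the other mirrors,
 * then redirect the original request to a different device.
 */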
2640 bio = r10_bio->devs[slot].bio;
2641 bdevname(bio->bi_bdev, b);
2642 bio_put(bio);
2643 r10_bio->devs[slot].bio = NULL;
2644
2645 if (mddev->ro == 0) {
2646 freeze_array(conf, 1);
2647 fix_read_error(conf, mddev, r10_bio);
2648 unfreeze_array(conf);
2649 } else
2650 r10_bio->devs[slot].bio = IO_BLOCKED;
2651
2652 rdev_dec_pending(rdev, mddev);
2653
2654read_more:
2655 rdev = read_balance(conf, r10_bio, &max_sectors);
2656 if (rdev == NULL) {
2657 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2658 " read error for block %llu\n",
2659 mdname(mddev), b,
2660 (unsigned long long)r10_bio->sector);
2661 raid_end_bio_io(r10_bio);
2662 return;
2663 }
2664
2665 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2666 slot = r10_bio->read_slot;
2667 printk_ratelimited(
2668 KERN_ERR
2669 "md/raid10:%s: %s: redirecting "
2670 "sector %llu to another mirror\n",
2671 mdname(mddev),
2672 bdevname(rdev->bdev, b),
2673 (unsigned long long)r10_bio->sector);
2674 bio = bio_clone_mddev(r10_bio->master_bio,
2675 GFP_NOIO, mddev);
2676 bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
2677 r10_bio->devs[slot].bio = bio;
2678 r10_bio->devs[slot].rdev = rdev;
2679 bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
2680 + choose_data_offset(r10_bio, rdev);
2681 bio->bi_bdev = rdev->bdev;
2682 bio->bi_rw = READ | do_sync;
2683 bio->bi_private = r10_bio;
2684 bio->bi_end_io = raid10_end_read_request;
2685 if (max_sectors < r10_bio->sectors) {
2686
2687 struct bio *mbio = r10_bio->master_bio;
2688 int sectors_handled =
2689 r10_bio->sector + max_sectors
2690 - mbio->bi_iter.bi_sector;
2691 r10_bio->sectors = max_sectors;
2692 spin_lock_irq(&conf->device_lock);
2693 if (mbio->bi_phys_segments == 0)
2694 mbio->bi_phys_segments = 2;
2695 else
2696 mbio->bi_phys_segments++;
2697 spin_unlock_irq(&conf->device_lock);
2698 generic_make_request(bio);
2699
2700 r10_bio = mempool_alloc(conf->r10bio_pool,
2701 GFP_NOIO);
2702 r10_bio->master_bio = mbio;
2703 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2704 r10_bio->state = 0;
2705 set_bit(R10BIO_ReadError,
2706 &r10_bio->state);
2707 r10_bio->mddev = mddev;
2708 r10_bio->sector = mbio->bi_iter.bi_sector
2709 + sectors_handled;
2710
2711 goto read_more;
2712 } else
2713 generic_make_request(bio);
2714}
2715
2716static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2717{
2718
2719
2720
2721
2722
2723
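/* Called from raid10d when a write (normal, sync or recovery) finished
 * with an error or after writing over a known bad block: clear bad
 * blocks that were successfully overwritten, record new ones where the
 * write failed, and finish off the r10_bio.
 */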
2724 int m;
2725 struct md_rdev *rdev;
2726
2727 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2728 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2729 for (m = 0; m < conf->copies; m++) {
2730 int dev = r10_bio->devs[m].devnum;
2731 rdev = conf->mirrors[dev].rdev;
2732 if (r10_bio->devs[m].bio == NULL)
2733 continue;
2734 if (test_bit(BIO_UPTODATE,
2735 &r10_bio->devs[m].bio->bi_flags)) {
2736 rdev_clear_badblocks(
2737 rdev,
2738 r10_bio->devs[m].addr,
2739 r10_bio->sectors, 0);
2740 } else {
2741 if (!rdev_set_badblocks(
2742 rdev,
2743 r10_bio->devs[m].addr,
2744 r10_bio->sectors, 0))
2745 md_error(conf->mddev, rdev);
2746 }
2747 rdev = conf->mirrors[dev].replacement;
2748 if (r10_bio->devs[m].repl_bio == NULL)
2749 continue;
2750 if (test_bit(BIO_UPTODATE,
2751 &r10_bio->devs[m].repl_bio->bi_flags)) {
2752 rdev_clear_badblocks(
2753 rdev,
2754 r10_bio->devs[m].addr,
2755 r10_bio->sectors, 0);
2756 } else {
2757 if (!rdev_set_badblocks(
2758 rdev,
2759 r10_bio->devs[m].addr,
2760 r10_bio->sectors, 0))
2761 md_error(conf->mddev, rdev);
2762 }
2763 }
2764 put_buf(r10_bio);
2765 } else {
2766 for (m = 0; m < conf->copies; m++) {
2767 int dev = r10_bio->devs[m].devnum;
2768 struct bio *bio = r10_bio->devs[m].bio;
2769 rdev = conf->mirrors[dev].rdev;
2770 if (bio == IO_MADE_GOOD) {
2771 rdev_clear_badblocks(
2772 rdev,
2773 r10_bio->devs[m].addr,
2774 r10_bio->sectors, 0);
2775 rdev_dec_pending(rdev, conf->mddev);
2776 } else if (bio != NULL &&
2777 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2778 if (!narrow_write_error(r10_bio, m)) {
2779 md_error(conf->mddev, rdev);
2780 set_bit(R10BIO_Degraded,
2781 &r10_bio->state);
2782 }
2783 rdev_dec_pending(rdev, conf->mddev);
2784 }
2785 bio = r10_bio->devs[m].repl_bio;
2786 rdev = conf->mirrors[dev].replacement;
2787 if (rdev && bio == IO_MADE_GOOD) {
2788 rdev_clear_badblocks(
2789 rdev,
2790 r10_bio->devs[m].addr,
2791 r10_bio->sectors, 0);
2792 rdev_dec_pending(rdev, conf->mddev);
2793 }
2794 }
2795 if (test_bit(R10BIO_WriteError,
2796 &r10_bio->state))
2797 close_write(r10_bio);
2798 raid_end_bio_io(r10_bio);
2799 }
2800}
2801
2802static void raid10d(struct md_thread *thread)
2803{
2804 struct mddev *mddev = thread->mddev;
2805 struct r10bio *r10_bio;
2806 unsigned long flags;
2807 struct r10conf *conf = mddev->private;
2808 struct list_head *head = &conf->retry_list;
2809 struct blk_plug plug;
2810
2811 md_check_recovery(mddev);
2812
2813 blk_start_plug(&plug);
2814 for (;;) {
2815
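/* first flush out any writes that were queued up by make_request() */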
2816 flush_pending_writes(conf);
2817
2818 spin_lock_irqsave(&conf->device_lock, flags);
2819 if (list_empty(head)) {
2820 spin_unlock_irqrestore(&conf->device_lock, flags);
2821 break;
2822 }
2823 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2824 list_del(head->prev);
2825 conf->nr_queued--;
2826 spin_unlock_irqrestore(&conf->device_lock, flags);
2827
2828 mddev = r10_bio->mddev;
2829 conf = mddev->private;
2830 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2831 test_bit(R10BIO_WriteError, &r10_bio->state))
2832 handle_write_completed(conf, r10_bio);
2833 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2834 reshape_request_write(mddev, r10_bio);
2835 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2836 sync_request_write(mddev, r10_bio);
2837 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2838 recovery_request_write(mddev, r10_bio);
2839 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2840 handle_read_error(mddev, r10_bio);
2841 else {
2842
2843
2844
2845 int slot = r10_bio->read_slot;
2846 generic_make_request(r10_bio->devs[slot].bio);
2847 }
2848
2849 cond_resched();
2850 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2851 md_check_recovery(mddev);
2852 }
2853 blk_finish_plug(&plug);
2854}
2855
2856
2857static int init_resync(struct r10conf *conf)
2858{
2859 int buffs;
2860 int i;
2861
2862 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2863 BUG_ON(conf->r10buf_pool);
2864 conf->have_replacement = 0;
2865 for (i = 0; i < conf->geo.raid_disks; i++)
2866 if (conf->mirrors[i].replacement)
2867 conf->have_replacement = 1;
2868 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2869 if (!conf->r10buf_pool)
2870 return -ENOMEM;
2871 conf->next_resync = 0;
2872 return 0;
2873}
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
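/*
 * sync_request() drives both resync and recovery.
 *
 * For recovery we scan every drive that needs rebuilding, find the
 * virtual address that maps onto it at this point, read a good copy of
 * that range from an In_sync device and queue a write to the drive
 * being recovered (and to its replacement, if any).
 *
 * For resync we read the same range from every In_sync device; when the
 * reads complete, raid10d compares the copies and rewrites any that
 * differ (see sync_request_write()).
 *
 * Returns the number of sectors scheduled, or, with *skipped set, the
 * number of sectors that could be skipped entirely.
 */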
2907static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2908 int *skipped, int go_faster)
2909{
2910 struct r10conf *conf = mddev->private;
2911 struct r10bio *r10_bio;
2912 struct bio *biolist = NULL, *bio;
2913 sector_t max_sector, nr_sectors;
2914 int i;
2915 int max_sync;
2916 sector_t sync_blocks;
2917 sector_t sectors_skipped = 0;
2918 int chunks_skipped = 0;
2919 sector_t chunk_mask = conf->geo.chunk_mask;
2920
2921 if (!conf->r10buf_pool)
2922 if (init_resync(conf))
2923 return 0;
2924
2925
2926
2927
2928
2929 if (mddev->bitmap == NULL &&
2930 mddev->recovery_cp == MaxSector &&
2931 mddev->reshape_position == MaxSector &&
2932 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2933 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2934 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2935 conf->fullsync == 0) {
2936 *skipped = 1;
2937 return mddev->dev_sectors - sector_nr;
2938 }
2939
2940 skipped:
2941 max_sector = mddev->dev_sectors;
2942 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2943 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2944 max_sector = mddev->resync_max_sectors;
2945 if (sector_nr >= max_sector) {
2946
2947
2948
2949
2950
2951
2952
2953
2954
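/* We have reached the end: if this was a reshape, finish it off;
 * otherwise tell the bitmap which regions are now in sync, mark any
 * replacement devices as fully recovered when a full sync completed,
 * and close the sync window.
 */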
2955 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2956 end_reshape(conf);
2957 return 0;
2958 }
2959
2960 if (mddev->curr_resync < max_sector) {
2961 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2962 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2963 &sync_blocks, 1);
2964 else for (i = 0; i < conf->geo.raid_disks; i++) {
2965 sector_t sect =
2966 raid10_find_virt(conf, mddev->curr_resync, i);
2967 bitmap_end_sync(mddev->bitmap, sect,
2968 &sync_blocks, 1);
2969 }
2970 } else {
2971
2972 if ((!mddev->bitmap || conf->fullsync)
2973 && conf->have_replacement
2974 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2975
2976
2977
2978 for (i = 0; i < conf->geo.raid_disks; i++)
2979 if (conf->mirrors[i].replacement)
2980 conf->mirrors[i].replacement
2981 ->recovery_offset
2982 = MaxSector;
2983 }
2984 conf->fullsync = 0;
2985 }
2986 bitmap_close_sync(mddev->bitmap);
2987 close_sync(conf);
2988 *skipped = 1;
2989 return sectors_skipped;
2990 }
2991
2992 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2993 return reshape_request(mddev, sector_nr, skipped);
2994
2995 if (chunks_skipped >= conf->geo.raid_disks) {
2996
2997
2998
2999 *skipped = 1;
3000 return (max_sector - sector_nr) + sectors_skipped;
3001 }
3002
3003 if (max_sector > mddev->resync_max)
3004 max_sector = mddev->resync_max;
3005
3006
3007
3008
3009 if (conf->geo.near_copies < conf->geo.raid_disks &&
3010 max_sector > (sector_nr | chunk_mask))
3011 max_sector = (sector_nr | chunk_mask) + 1;
3012
3013
3014
3015
3016 if (!go_faster && conf->nr_waiting)
3017 msleep_interruptible(1000);
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
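/* A single request never covers more than RESYNC_PAGES worth of data;
 * build a list of bios (biolist) for at most that much, then feed
 * pages into every bio on the list further down.
 */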
3034 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3035 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3036
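/* recovery: for each drive that needs rebuilding, set up a read from a
 * working mirror and a write to the drive (and its replacement).
 */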
3037 int j;
3038 r10_bio = NULL;
3039
3040 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3041 int still_degraded;
3042 struct r10bio *rb2;
3043 sector_t sect;
3044 int must_sync;
3045 int any_working;
3046 struct raid10_info *mirror = &conf->mirrors[i];
3047
3048 if ((mirror->rdev == NULL ||
3049 test_bit(In_sync, &mirror->rdev->flags))
3050 &&
3051 (mirror->replacement == NULL ||
3052 test_bit(Faulty,
3053 &mirror->replacement->flags)))
3054 continue;
3055
3056 still_degraded = 0;
3057
3058 rb2 = r10_bio;
3059 sect = raid10_find_virt(conf, sector_nr, i);
3060 if (sect >= mddev->resync_max_sectors) {
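/* The virtual address for this drive lies beyond the area
 * being synced, so there is nothing to recover here.
 */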
3061
3062
3063
3064 continue;
3065 }
3066
3067
3068
3069
3070 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3071 &sync_blocks, 1);
3072 if (sync_blocks < max_sync)
3073 max_sync = sync_blocks;
3074 if (!must_sync &&
3075 mirror->replacement == NULL &&
3076 !conf->fullsync) {
3077
3078
3079
3080 chunks_skipped = -1;
3081 continue;
3082 }
3083
3084 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3085 raise_barrier(conf, rb2 != NULL);
3086 atomic_set(&r10_bio->remaining, 0);
3087
3088 r10_bio->master_bio = (struct bio *)rb2;
3089 if (rb2)
3090 atomic_inc(&rb2->remaining);
3091 r10_bio->mddev = mddev;
3092 set_bit(R10BIO_IsRecover, &r10_bio->state);
3093 r10_bio->sector = sect;
3094
3095 raid10_find_phys(conf, r10_bio);
3096
3097
3098
3099
3100 for (j = 0; j < conf->geo.raid_disks; j++)
3101 if (conf->mirrors[j].rdev == NULL ||
3102 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3103 still_degraded = 1;
3104 break;
3105 }
3106
3107 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3108 &sync_blocks, still_degraded);
3109
3110 any_working = 0;
3111 for (j = 0; j < conf->copies; j++) {
3112 int k;
3113 int d = r10_bio->devs[j].devnum;
3114 sector_t from_addr, to_addr;
3115 struct md_rdev *rdev;
3116 sector_t sector, first_bad;
3117 int bad_sectors;
3118 if (!conf->mirrors[d].rdev ||
3119 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3120 continue;
3121
3122 any_working = 1;
3123 rdev = conf->mirrors[d].rdev;
3124 sector = r10_bio->devs[j].addr;
3125
3126 if (is_badblock(rdev, sector, max_sync,
3127 &first_bad, &bad_sectors)) {
3128 if (first_bad > sector)
3129 max_sync = first_bad - sector;
3130 else {
3131 bad_sectors -= (sector
3132 - first_bad);
3133 if (max_sync > bad_sectors)
3134 max_sync = bad_sectors;
3135 continue;
3136 }
3137 }
3138 bio = r10_bio->devs[0].bio;
3139 bio_reset(bio);
3140 bio->bi_next = biolist;
3141 biolist = bio;
3142 bio->bi_private = r10_bio;
3143 bio->bi_end_io = end_sync_read;
3144 bio->bi_rw = READ;
3145 from_addr = r10_bio->devs[j].addr;
3146 bio->bi_iter.bi_sector = from_addr +
3147 rdev->data_offset;
3148 bio->bi_bdev = rdev->bdev;
3149 atomic_inc(&rdev->nr_pending);
3150
3151
3152 for (k = 0; k < conf->copies; k++)
3153 if (r10_bio->devs[k].devnum == i)
3154 break;
3155 BUG_ON(k == conf->copies);
3156 to_addr = r10_bio->devs[k].addr;
3157 r10_bio->devs[0].devnum = d;
3158 r10_bio->devs[0].addr = from_addr;
3159 r10_bio->devs[1].devnum = i;
3160 r10_bio->devs[1].addr = to_addr;
3161
3162 rdev = mirror->rdev;
3163 if (!test_bit(In_sync, &rdev->flags)) {
3164 bio = r10_bio->devs[1].bio;
3165 bio_reset(bio);
3166 bio->bi_next = biolist;
3167 biolist = bio;
3168 bio->bi_private = r10_bio;
3169 bio->bi_end_io = end_sync_write;
3170 bio->bi_rw = WRITE;
3171 bio->bi_iter.bi_sector = to_addr
3172 + rdev->data_offset;
3173 bio->bi_bdev = rdev->bdev;
3174 atomic_inc(&r10_bio->remaining);
3175 } else
3176 r10_bio->devs[1].bio->bi_end_io = NULL;
3177
3178
3179 bio = r10_bio->devs[1].repl_bio;
3180 if (bio)
3181 bio->bi_end_io = NULL;
3182 rdev = mirror->replacement;
3183
3184
3185
3186
3187
3188
3189
3190
3191 if (rdev == NULL || bio == NULL ||
3192 test_bit(Faulty, &rdev->flags))
3193 break;
3194 bio_reset(bio);
3195 bio->bi_next = biolist;
3196 biolist = bio;
3197 bio->bi_private = r10_bio;
3198 bio->bi_end_io = end_sync_write;
3199 bio->bi_rw = WRITE;
3200 bio->bi_iter.bi_sector = to_addr +
3201 rdev->data_offset;
3202 bio->bi_bdev = rdev->bdev;
3203 atomic_inc(&r10_bio->remaining);
3204 break;
3205 }
3206 if (j == conf->copies) {
3207
3208
3209 if (any_working) {
3210
3211
3212
3213 int k;
3214 for (k = 0; k < conf->copies; k++)
3215 if (r10_bio->devs[k].devnum == i)
3216 break;
3217 if (!test_bit(In_sync,
3218 &mirror->rdev->flags)
3219 && !rdev_set_badblocks(
3220 mirror->rdev,
3221 r10_bio->devs[k].addr,
3222 max_sync, 0))
3223 any_working = 0;
3224 if (mirror->replacement &&
3225 !rdev_set_badblocks(
3226 mirror->replacement,
3227 r10_bio->devs[k].addr,
3228 max_sync, 0))
3229 any_working = 0;
3230 }
3231 if (!any_working) {
3232 if (!test_and_set_bit(MD_RECOVERY_INTR,
3233 &mddev->recovery))
3234 printk(KERN_INFO "md/raid10:%s: insufficient "
3235 "working devices for recovery.\n",
3236 mdname(mddev));
3237 mirror->recovery_disabled
3238 = mddev->recovery_disabled;
3239 }
3240 put_buf(r10_bio);
3241 if (rb2)
3242 atomic_dec(&rb2->remaining);
3243 r10_bio = rb2;
3244 break;
3245 }
3246 }
3247 if (biolist == NULL) {
3248 while (r10_bio) {
3249 struct r10bio *rb2 = r10_bio;
3250 r10_bio = (struct r10bio *)rb2->master_bio;
3251 rb2->master_bio = NULL;
3252 put_buf(rb2);
3253 }
3254 goto giveup;
3255 }
3256 } else {
3257
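/* resync: read this range from every In_sync device; the copies are
 * compared and corrected later by sync_request_write().
 */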
3258 int count = 0;
3259
3260 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3261
3262 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3263 &sync_blocks, mddev->degraded) &&
3264 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3265 &mddev->recovery)) {
3266
3267 *skipped = 1;
3268 return sync_blocks + sectors_skipped;
3269 }
3270 if (sync_blocks < max_sync)
3271 max_sync = sync_blocks;
3272 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3273
3274 r10_bio->mddev = mddev;
3275 atomic_set(&r10_bio->remaining, 0);
3276 raise_barrier(conf, 0);
3277 conf->next_resync = sector_nr;
3278
3279 r10_bio->master_bio = NULL;
3280 r10_bio->sector = sector_nr;
3281 set_bit(R10BIO_IsSync, &r10_bio->state);
3282 raid10_find_phys(conf, r10_bio);
3283 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3284
3285 for (i = 0; i < conf->copies; i++) {
3286 int d = r10_bio->devs[i].devnum;
3287 sector_t first_bad, sector;
3288 int bad_sectors;
3289
3290 if (r10_bio->devs[i].repl_bio)
3291 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3292
3293 bio = r10_bio->devs[i].bio;
3294 bio_reset(bio);
3295 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3296 if (conf->mirrors[d].rdev == NULL ||
3297 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3298 continue;
3299 sector = r10_bio->devs[i].addr;
3300 if (is_badblock(conf->mirrors[d].rdev,
3301 sector, max_sync,
3302 &first_bad, &bad_sectors)) {
3303 if (first_bad > sector)
3304 max_sync = first_bad - sector;
3305 else {
3306 bad_sectors -= (sector - first_bad);
3307 if (max_sync > bad_sectors)
3308 max_sync = bad_sectors;
3309 continue;
3310 }
3311 }
3312 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3313 atomic_inc(&r10_bio->remaining);
3314 bio->bi_next = biolist;
3315 biolist = bio;
3316 bio->bi_private = r10_bio;
3317 bio->bi_end_io = end_sync_read;
3318 bio->bi_rw = READ;
3319 bio->bi_iter.bi_sector = sector +
3320 conf->mirrors[d].rdev->data_offset;
3321 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3322 count++;
3323
3324 if (conf->mirrors[d].replacement == NULL ||
3325 test_bit(Faulty,
3326 &conf->mirrors[d].replacement->flags))
3327 continue;
3328
3329
3330 bio = r10_bio->devs[i].repl_bio;
3331 bio_reset(bio);
3332 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3333
3334 sector = r10_bio->devs[i].addr;
3335 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3336 bio->bi_next = biolist;
3337 biolist = bio;
3338 bio->bi_private = r10_bio;
3339 bio->bi_end_io = end_sync_write;
3340 bio->bi_rw = WRITE;
3341 bio->bi_iter.bi_sector = sector +
3342 conf->mirrors[d].replacement->data_offset;
3343 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3344 count++;
3345 }
3346
3347 if (count < 2) {
3348 for (i = 0; i < conf->copies; i++) {
3349 int d = r10_bio->devs[i].devnum;
3350 if (r10_bio->devs[i].bio->bi_end_io)
3351 rdev_dec_pending(conf->mirrors[d].rdev,
3352 mddev);
3353 if (r10_bio->devs[i].repl_bio &&
3354 r10_bio->devs[i].repl_bio->bi_end_io)
3355 rdev_dec_pending(
3356 conf->mirrors[d].replacement,
3357 mddev);
3358 }
3359 put_buf(r10_bio);
3360 biolist = NULL;
3361 goto giveup;
3362 }
3363 }
3364
3365 nr_sectors = 0;
3366 if (sector_nr + max_sync < max_sector)
3367 max_sector = sector_nr + max_sync;
3368 do {
3369 struct page *page;
3370 int len = PAGE_SIZE;
3371 if (sector_nr + (len>>9) > max_sector)
3372 len = (max_sector - sector_nr) << 9;
3373 if (len == 0)
3374 break;
3375 for (bio = biolist; bio; bio = bio->bi_next) {
3376 struct bio *bio2;
3377 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3378 if (bio_add_page(bio, page, len, 0))
3379 continue;
3380
3381
3382 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3383 for (bio2 = biolist;
3384 bio2 && bio2 != bio;
3385 bio2 = bio2->bi_next) {
3386
3387 bio2->bi_vcnt--;
3388 bio2->bi_iter.bi_size -= len;
3389 bio2->bi_flags &= ~(1 << BIO_SEG_VALID);
3390 }
3391 goto bio_full;
3392 }
3393 nr_sectors += len>>9;
3394 sector_nr += len>>9;
3395 } while (biolist->bi_vcnt < RESYNC_PAGES);
3396 bio_full:
3397 r10_bio->sectors = nr_sectors;
3398
3399 while (biolist) {
3400 bio = biolist;
3401 biolist = biolist->bi_next;
3402
3403 bio->bi_next = NULL;
3404 r10_bio = bio->bi_private;
3405 r10_bio->sectors = nr_sectors;
3406
3407 if (bio->bi_end_io == end_sync_read) {
3408 md_sync_acct(bio->bi_bdev, nr_sectors);
3409 set_bit(BIO_UPTODATE, &bio->bi_flags);
3410 generic_make_request(bio);
3411 }
3412 }
3413
3414 if (sectors_skipped)
3415
3416
3417
3418 md_done_sync(mddev, sectors_skipped, 1);
3419
3420 return sectors_skipped + nr_sectors;
3421 giveup:
3422
3423
3424
3425
3426 if (sector_nr + max_sync < max_sector)
3427 max_sector = sector_nr + max_sync;
3428
3429 sectors_skipped += (max_sector - sector_nr);
3430 chunks_skipped++;
3431 sector_nr = max_sector;
3432 goto skipped;
3433}
3434
3435static sector_t
3436raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3437{
3438 sector_t size;
3439 struct r10conf *conf = mddev->private;
3440
3441 if (!raid_disks)
3442 raid_disks = min(conf->geo.raid_disks,
3443 conf->prev.raid_disks);
3444 if (!sectors)
3445 sectors = conf->dev_sectors;
3446
3447 size = sectors >> conf->geo.chunk_shift;
3448 sector_div(size, conf->geo.far_copies);
3449 size = size * raid_disks;
3450 sector_div(size, conf->geo.near_copies);
3451
3452 return size << conf->geo.chunk_shift;
3453}
3454
3455static void calc_sectors(struct r10conf *conf, sector_t size)
3456{
3457
3458
3459
3460
3461
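/* Given the total usable array size in sectors, work out how many
 * sectors each device must contribute (conf->dev_sectors) and the
 * 'stride' between far copies, rounding so every copy fits.
 */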
3462 size = size >> conf->geo.chunk_shift;
3463 sector_div(size, conf->geo.far_copies);
3464 size = size * conf->geo.raid_disks;
3465 sector_div(size, conf->geo.near_copies);
3466
3467
3468 size = size * conf->copies;
3469
3470
3471
3472
3473 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3474
3475 conf->dev_sectors = size << conf->geo.chunk_shift;
3476
3477 if (conf->geo.far_offset)
3478 conf->geo.stride = 1 << conf->geo.chunk_shift;
3479 else {
3480 sector_div(size, conf->geo.far_copies);
3481 conf->geo.stride = size << conf->geo.chunk_shift;
3482 }
3483}
3484
3485enum geo_type {geo_new, geo_old, geo_start};
3486static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3487{
3488 int nc, fc, fo;
3489 int layout, chunk, disks;
3490 switch (new) {
3491 case geo_old:
3492 layout = mddev->layout;
3493 chunk = mddev->chunk_sectors;
3494 disks = mddev->raid_disks - mddev->delta_disks;
3495 break;
3496 case geo_new:
3497 layout = mddev->new_layout;
3498 chunk = mddev->new_chunk_sectors;
3499 disks = mddev->raid_disks;
3500 break;
3501 default:
3502 case geo_start:
3503
3504 layout = mddev->new_layout;
3505 chunk = mddev->new_chunk_sectors;
3506 disks = mddev->raid_disks + mddev->delta_disks;
3507 break;
3508 }
3509 if (layout >> 18)
3510 return -1;
3511 if (chunk < (PAGE_SIZE >> 9) ||
3512 !is_power_of_2(chunk))
3513 return -2;
3514 nc = layout & 255;
3515 fc = (layout >> 8) & 255;
3516 fo = layout & (1<<16);
3517 geo->raid_disks = disks;
3518 geo->near_copies = nc;
3519 geo->far_copies = fc;
3520 geo->far_offset = fo;
3521 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3522 geo->chunk_mask = chunk - 1;
3523 geo->chunk_shift = ffz(~chunk);
3524 return nc*fc;
3525}
3526
3527static struct r10conf *setup_conf(struct mddev *mddev)
3528{
3529 struct r10conf *conf = NULL;
3530 int err = -EINVAL;
3531 struct geom geo;
3532 int copies;
3533
3534 copies = setup_geo(&geo, mddev, geo_new);
3535
3536 if (copies == -2) {
3537 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3538 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3539 mdname(mddev), PAGE_SIZE);
3540 goto out;
3541 }
3542
3543 if (copies < 2 || copies > mddev->raid_disks) {
3544 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3545 mdname(mddev), mddev->new_layout);
3546 goto out;
3547 }
3548
3549 err = -ENOMEM;
3550 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3551 if (!conf)
3552 goto out;
3553
3554
3555 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3556 max(0,-mddev->delta_disks)),
3557 GFP_KERNEL);
3558 if (!conf->mirrors)
3559 goto out;
3560
3561 conf->tmppage = alloc_page(GFP_KERNEL);
3562 if (!conf->tmppage)
3563 goto out;
3564
3565 conf->geo = geo;
3566 conf->copies = copies;
3567 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3568 r10bio_pool_free, conf);
3569 if (!conf->r10bio_pool)
3570 goto out;
3571
3572 calc_sectors(conf, mddev->dev_sectors);
3573 if (mddev->reshape_position == MaxSector) {
3574 conf->prev = conf->geo;
3575 conf->reshape_progress = MaxSector;
3576 } else {
3577 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3578 err = -EINVAL;
3579 goto out;
3580 }
3581 conf->reshape_progress = mddev->reshape_position;
3582 if (conf->prev.far_offset)
3583 conf->prev.stride = 1 << conf->prev.chunk_shift;
3584 else
3585
3586 conf->prev.stride = conf->dev_sectors;
3587 }
3588 spin_lock_init(&conf->device_lock);
3589 INIT_LIST_HEAD(&conf->retry_list);
3590
3591 spin_lock_init(&conf->resync_lock);
3592 init_waitqueue_head(&conf->wait_barrier);
3593
3594 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3595 if (!conf->thread)
3596 goto out;
3597
3598 conf->mddev = mddev;
3599 return conf;
3600
3601 out:
3602 if (err == -ENOMEM)
3603 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3604 mdname(mddev));
3605 if (conf) {
3606 if (conf->r10bio_pool)
3607 mempool_destroy(conf->r10bio_pool);
3608 kfree(conf->mirrors);
3609 safe_put_page(conf->tmppage);
3610 kfree(conf);
3611 }
3612 return ERR_PTR(err);
3613}
3614
3615static int run(struct mddev *mddev)
3616{
3617 struct r10conf *conf;
3618 int i, disk_idx, chunk_size;
3619 struct raid10_info *disk;
3620 struct md_rdev *rdev;
3621 sector_t size;
3622 sector_t min_offset_diff = 0;
3623 int first = 1;
3624 bool discard_supported = false;
3625
3626 if (mddev->private == NULL) {
3627 conf = setup_conf(mddev);
3628 if (IS_ERR(conf))
3629 return PTR_ERR(conf);
3630 mddev->private = conf;
3631 }
3632 conf = mddev->private;
3633 if (!conf)
3634 goto out;
3635
3636 mddev->thread = conf->thread;
3637 conf->thread = NULL;
3638
3639 chunk_size = mddev->chunk_sectors << 9;
3640 if (mddev->queue) {
3641 blk_queue_max_discard_sectors(mddev->queue,
3642 mddev->chunk_sectors);
3643 blk_queue_max_write_same_sectors(mddev->queue, 0);
3644 blk_queue_io_min(mddev->queue, chunk_size);
3645 if (conf->geo.raid_disks % conf->geo.near_copies)
3646 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3647 else
3648 blk_queue_io_opt(mddev->queue, chunk_size *
3649 (conf->geo.raid_disks / conf->geo.near_copies));
3650 }
3651
3652 rdev_for_each(rdev, mddev) {
3653 long long diff;
3654 struct request_queue *q;
3655
3656 disk_idx = rdev->raid_disk;
3657 if (disk_idx < 0)
3658 continue;
3659 if (disk_idx >= conf->geo.raid_disks &&
3660 disk_idx >= conf->prev.raid_disks)
3661 continue;
3662 disk = conf->mirrors + disk_idx;
3663
3664 if (test_bit(Replacement, &rdev->flags)) {
3665 if (disk->replacement)
3666 goto out_free_conf;
3667 disk->replacement = rdev;
3668 } else {
3669 if (disk->rdev)
3670 goto out_free_conf;
3671 disk->rdev = rdev;
3672 }
3673 q = bdev_get_queue(rdev->bdev);
3674 if (q->merge_bvec_fn)
3675 mddev->merge_check_needed = 1;
3676 diff = (rdev->new_data_offset - rdev->data_offset);
3677 if (!mddev->reshape_backwards)
3678 diff = -diff;
3679 if (diff < 0)
3680 diff = 0;
3681 if (first || diff < min_offset_diff)
3682 min_offset_diff = diff;
3683
3684 if (mddev->gendisk)
3685 disk_stack_limits(mddev->gendisk, rdev->bdev,
3686 rdev->data_offset << 9);
3687
3688 disk->head_position = 0;
3689
3690 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3691 discard_supported = true;
3692 }
3693
3694 if (mddev->queue) {
3695 if (discard_supported)
3696 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3697 mddev->queue);
3698 else
3699 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3700 mddev->queue);
3701 }
3702
3703 if (!enough(conf, -1)) {
3704 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3705 mdname(mddev));
3706 goto out_free_conf;
3707 }
3708
3709 if (conf->reshape_progress != MaxSector) {
3710
3711 if (conf->geo.far_copies != 1 &&
3712 conf->geo.far_offset == 0)
3713 goto out_free_conf;
3714 if (conf->prev.far_copies != 1 &&
3715 conf->prev.far_offset == 0)
3716 goto out_free_conf;
3717 }
3718
3719 mddev->degraded = 0;
3720 for (i = 0;
3721 i < conf->geo.raid_disks
3722 || i < conf->prev.raid_disks;
3723 i++) {
3724
3725 disk = conf->mirrors + i;
3726
3727 if (!disk->rdev && disk->replacement) {
3728
3729 disk->rdev = disk->replacement;
3730 disk->replacement = NULL;
3731 clear_bit(Replacement, &disk->rdev->flags);
3732 }
3733
3734 if (!disk->rdev ||
3735 !test_bit(In_sync, &disk->rdev->flags)) {
3736 disk->head_position = 0;
3737 mddev->degraded++;
3738 if (disk->rdev &&
3739 disk->rdev->saved_raid_disk < 0)
3740 conf->fullsync = 1;
3741 }
3742 disk->recovery_disabled = mddev->recovery_disabled - 1;
3743 }
3744
3745 if (mddev->recovery_cp != MaxSector)
3746 printk(KERN_NOTICE "md/raid10:%s: not clean"
3747 " -- starting background reconstruction\n",
3748 mdname(mddev));
3749 printk(KERN_INFO
3750 "md/raid10:%s: active with %d out of %d devices\n",
3751 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3752 conf->geo.raid_disks);
3753
3754
3755
3756 mddev->dev_sectors = conf->dev_sectors;
3757 size = raid10_size(mddev, 0, 0);
3758 md_set_array_sectors(mddev, size);
3759 mddev->resync_max_sectors = size;
3760
3761 if (mddev->queue) {
3762 int stripe = conf->geo.raid_disks *
3763 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3764 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3765 mddev->queue->backing_dev_info.congested_data = mddev;
3766
3767
3768
3769
3770
3771 stripe /= conf->geo.near_copies;
3772 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3773 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3774 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3775 }
3776
3777
3778 if (md_integrity_register(mddev))
3779 goto out_free_conf;
3780
3781 if (conf->reshape_progress != MaxSector) {
3782 unsigned long before_length, after_length;
3783
3784 before_length = ((1 << conf->prev.chunk_shift) *
3785 conf->prev.far_copies);
3786 after_length = ((1 << conf->geo.chunk_shift) *
3787 conf->geo.far_copies);
3788
3789 if (max(before_length, after_length) > min_offset_diff) {
3790
3791 printk(KERN_ERR "md/raid10: offset difference not enough to continue reshape\n");
3792 goto out_free_conf;
3793 }
3794 conf->offset_diff = min_offset_diff;
3795
3796 conf->reshape_safe = conf->reshape_progress;
3797 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3798 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3799 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3800 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3801 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3802 "reshape");
3803 }
3804
3805 return 0;
3806
3807out_free_conf:
3808 md_unregister_thread(&mddev->thread);
3809 if (conf->r10bio_pool)
3810 mempool_destroy(conf->r10bio_pool);
3811 safe_put_page(conf->tmppage);
3812 kfree(conf->mirrors);
3813 kfree(conf);
3814 mddev->private = NULL;
3815out:
3816 return -EIO;
3817}
3818
3819static int stop(struct mddev *mddev)
3820{
3821 struct r10conf *conf = mddev->private;
3822
3823 raise_barrier(conf, 0);
3824 lower_barrier(conf);
3825
3826 md_unregister_thread(&mddev->thread);
3827 if (mddev->queue)
3828
3829 blk_sync_queue(mddev->queue);
3830
3831 if (conf->r10bio_pool)
3832 mempool_destroy(conf->r10bio_pool);
3833 safe_put_page(conf->tmppage);
3834 kfree(conf->mirrors);
3835 kfree(conf);
3836 mddev->private = NULL;
3837 return 0;
3838}
3839
3840static void raid10_quiesce(struct mddev *mddev, int state)
3841{
3842 struct r10conf *conf = mddev->private;
3843
3844 switch(state) {
3845 case 1:
3846 raise_barrier(conf, 0);
3847 break;
3848 case 0:
3849 lower_barrier(conf);
3850 break;
3851 }
3852}
3853
3854static int raid10_resize(struct mddev *mddev, sector_t sectors)
3855{
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
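/* Resize the array to 'sectors' per device. Only supported when the
 * geometry has no unaligned far copies and no reshape is in progress;
 * the bitmap (if any) is resized first, and recovery_cp is pulled back
 * when the array grows so the new space gets resynced.
 */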
3868 struct r10conf *conf = mddev->private;
3869 sector_t oldsize, size;
3870
3871 if (mddev->reshape_position != MaxSector)
3872 return -EBUSY;
3873
3874 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3875 return -EINVAL;
3876
3877 oldsize = raid10_size(mddev, 0, 0);
3878 size = raid10_size(mddev, sectors, 0);
3879 if (mddev->external_size &&
3880 mddev->array_sectors > size)
3881 return -EINVAL;
3882 if (mddev->bitmap) {
3883 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3884 if (ret)
3885 return ret;
3886 }
3887 md_set_array_sectors(mddev, size);
3888 set_capacity(mddev->gendisk, mddev->array_sectors);
3889 revalidate_disk(mddev->gendisk);
3890 if (sectors > mddev->dev_sectors &&
3891 mddev->recovery_cp > oldsize) {
3892 mddev->recovery_cp = oldsize;
3893 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3894 }
3895 calc_sectors(conf, sectors);
3896 mddev->dev_sectors = conf->dev_sectors;
3897 mddev->resync_max_sectors = size;
3898 return 0;
3899}
3900
3901static void *raid10_takeover_raid0(struct mddev *mddev)
3902{
3903 struct md_rdev *rdev;
3904 struct r10conf *conf;
3905
3906 if (mddev->degraded > 0) {
3907 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3908 mdname(mddev));
3909 return ERR_PTR(-EINVAL);
3910 }
3911
3912
3913 mddev->new_level = 10;
3914
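/* layout 0x102: one far copy, two near copies of each block */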
3915 mddev->new_layout = (1<<8) + 2;
3916 mddev->new_chunk_sectors = mddev->chunk_sectors;
3917 mddev->delta_disks = mddev->raid_disks;
3918 mddev->raid_disks *= 2;
3919
3920 mddev->recovery_cp = MaxSector;
3921
3922 conf = setup_conf(mddev);
3923 if (!IS_ERR(conf)) {
3924 rdev_for_each(rdev, mddev)
3925 if (rdev->raid_disk >= 0)
3926 rdev->new_raid_disk = rdev->raid_disk * 2;
3927 conf->barrier = 1;
3928 }
3929
3930 return conf;
3931}
3932
3933static void *raid10_takeover(struct mddev *mddev)
3934{
3935 struct r0conf *raid0_conf;
3936
3937
3938
3939
3940 if (mddev->level == 0) {
3941
3942 raid0_conf = mddev->private;
3943 if (raid0_conf->nr_strip_zones > 1) {
3944 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3945 " with more than one zone.\n",
3946 mdname(mddev));
3947 return ERR_PTR(-EINVAL);
3948 }
3949 return raid10_takeover_raid0(mddev);
3950 }
3951 return ERR_PTR(-EINVAL);
3952}
3953
3954static int raid10_check_reshape(struct mddev *mddev)
3955{
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
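/* Validate a requested reshape: the layout change must keep the same
 * total number of copies, 'far' layouts without far_offset cannot be
 * reshaped, the array size must stay chunk-aligned, and enough devices
 * must be present. If disks are being added, pre-allocate the larger
 * mirrors array here so start_reshape cannot fail on memory.
 */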
3970 struct r10conf *conf = mddev->private;
3971 struct geom geo;
3972
3973 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3974 return -EINVAL;
3975
3976 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3977
3978 return -EINVAL;
3979 if (geo.far_copies > 1 && !geo.far_offset)
3980
3981 return -EINVAL;
3982
3983 if (mddev->array_sectors & geo.chunk_mask)
3984
3985 return -EINVAL;
3986
3987 if (!enough(conf, -1))
3988 return -EINVAL;
3989
3990 kfree(conf->mirrors_new);
3991 conf->mirrors_new = NULL;
3992 if (mddev->delta_disks > 0) {
3993
3994 conf->mirrors_new = kzalloc(
3995 sizeof(struct raid10_info)
3996 *(mddev->raid_disks +
3997 mddev->delta_disks),
3998 GFP_KERNEL);
3999 if (!conf->mirrors_new)
4000 return -ENOMEM;
4001 }
4002 return 0;
4003}
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
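/* Work out how many devices are missing or not yet In_sync, checking
 * both the previous and the new geometry during a reshape and
 * returning the larger (more pessimistic) of the two counts.
 */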
4018static int calc_degraded(struct r10conf *conf)
4019{
4020 int degraded, degraded2;
4021 int i;
4022
4023 rcu_read_lock();
4024 degraded = 0;
4025
4026 for (i = 0; i < conf->prev.raid_disks; i++) {
4027 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4028 if (!rdev || test_bit(Faulty, &rdev->flags))
4029 degraded++;
4030 else if (!test_bit(In_sync, &rdev->flags))
4031
4032
4033
4034
4035 degraded++;
4036 }
4037 rcu_read_unlock();
4038 if (conf->geo.raid_disks == conf->prev.raid_disks)
4039 return degraded;
4040 rcu_read_lock();
4041 degraded2 = 0;
4042 for (i = 0; i < conf->geo.raid_disks; i++) {
4043 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4044 if (!rdev || test_bit(Faulty, &rdev->flags))
4045 degraded2++;
4046 else if (!test_bit(In_sync, &rdev->flags)) {
4047
4048
4049
4050
4051
4052 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4053 degraded2++;
4054 }
4055 }
4056 rcu_read_unlock();
4057 if (degraded2 > degraded)
4058 return degraded2;
4059 return degraded;
4060}
4061
4062static int raid10_start_reshape(struct mddev *mddev)
4063{
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
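/* Begin a reshape to the geometry described by mddev->new_* fields:
 * check data-offset headroom and spare count, switch conf->geo over
 * under the device lock, add any new spares, resize the bitmap and
 * start the "reshape" sync thread. On failure everything is rolled
 * back to the previous geometry.
 */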
4074 unsigned long before_length, after_length;
4075 sector_t min_offset_diff = 0;
4076 int first = 1;
4077 struct geom new;
4078 struct r10conf *conf = mddev->private;
4079 struct md_rdev *rdev;
4080 int spares = 0;
4081 int ret;
4082
4083 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4084 return -EBUSY;
4085
4086 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4087 return -EINVAL;
4088
4089 before_length = ((1 << conf->prev.chunk_shift) *
4090 conf->prev.far_copies);
4091 after_length = ((1 << conf->geo.chunk_shift) *
4092 conf->geo.far_copies);
4093
4094 rdev_for_each(rdev, mddev) {
4095 if (!test_bit(In_sync, &rdev->flags)
4096 && !test_bit(Faulty, &rdev->flags))
4097 spares++;
4098 if (rdev->raid_disk >= 0) {
4099 long long diff = (rdev->new_data_offset
4100 - rdev->data_offset);
4101 if (!mddev->reshape_backwards)
4102 diff = -diff;
4103 if (diff < 0)
4104 diff = 0;
4105 if (first || diff < min_offset_diff)
4106 min_offset_diff = diff;
4107 }
4108 }
4109
4110 if (max(before_length, after_length) > min_offset_diff)
4111 return -EINVAL;
4112
4113 if (spares < mddev->delta_disks)
4114 return -EINVAL;
4115
4116 conf->offset_diff = min_offset_diff;
4117 spin_lock_irq(&conf->device_lock);
4118 if (conf->mirrors_new) {
4119 memcpy(conf->mirrors_new, conf->mirrors,
4120 sizeof(struct raid10_info)*conf->prev.raid_disks);
4121 smp_mb();
4122 kfree(conf->mirrors_old);
4123 conf->mirrors_old = conf->mirrors;
4124 conf->mirrors = conf->mirrors_new;
4125 conf->mirrors_new = NULL;
4126 }
4127 setup_geo(&conf->geo, mddev, geo_start);
4128 smp_mb();
4129 if (mddev->reshape_backwards) {
4130 sector_t size = raid10_size(mddev, 0, 0);
4131 if (size < mddev->array_sectors) {
4132 spin_unlock_irq(&conf->device_lock);
4133 printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n",
4134 mdname(mddev));
4135 return -EINVAL;
4136 }
4137 mddev->resync_max_sectors = size;
4138 conf->reshape_progress = size;
4139 } else
4140 conf->reshape_progress = 0;
4141 spin_unlock_irq(&conf->device_lock);
4142
4143 if (mddev->delta_disks && mddev->bitmap) {
4144 ret = bitmap_resize(mddev->bitmap,
4145 raid10_size(mddev, 0,
4146 conf->geo.raid_disks),
4147 0, 0);
4148 if (ret)
4149 goto abort;
4150 }
4151 if (mddev->delta_disks > 0) {
4152 rdev_for_each(rdev, mddev)
4153 if (rdev->raid_disk < 0 &&
4154 !test_bit(Faulty, &rdev->flags)) {
4155 if (raid10_add_disk(mddev, rdev) == 0) {
4156 if (rdev->raid_disk >=
4157 conf->prev.raid_disks)
4158 set_bit(In_sync, &rdev->flags);
4159 else
4160 rdev->recovery_offset = 0;
4161
4162 if (sysfs_link_rdev(mddev, rdev))
4163 /* failure here is harmless */;
4164 }
4165 } else if (rdev->raid_disk >= conf->prev.raid_disks
4166 && !test_bit(Faulty, &rdev->flags)) {
4167
4168 set_bit(In_sync, &rdev->flags);
4169 }
4170 }
4171
4172
4173
4174
4175 spin_lock_irq(&conf->device_lock);
4176 mddev->degraded = calc_degraded(conf);
4177 spin_unlock_irq(&conf->device_lock);
4178 mddev->raid_disks = conf->geo.raid_disks;
4179 mddev->reshape_position = conf->reshape_progress;
4180 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4181
4182 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4183 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4184 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4185 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4186
4187 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4188 "reshape");
4189 if (!mddev->sync_thread) {
4190 ret = -EAGAIN;
4191 goto abort;
4192 }
4193 conf->reshape_checkpoint = jiffies;
4194 md_wakeup_thread(mddev->sync_thread);
4195 md_new_event(mddev);
4196 return 0;
4197
4198abort:
4199 mddev->recovery = 0;
4200 spin_lock_irq(&conf->device_lock);
4201 conf->geo = conf->prev;
4202 mddev->raid_disks = conf->geo.raid_disks;
4203 rdev_for_each(rdev, mddev)
4204 rdev->new_data_offset = rdev->data_offset;
4205 smp_wmb();
4206 conf->reshape_progress = MaxSector;
4207 mddev->reshape_position = MaxSector;
4208 spin_unlock_irq(&conf->device_lock);
4209 return ret;
4210}
4211
4212
4213
4214
4215
4216
4217
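/* Calculate the last device address that could contain any block from
 * the chunk holding array address 's' in geometry 'geo', and return
 * the next (chunk-aligned) device address after it.
 */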
4218static sector_t last_dev_address(sector_t s, struct geom *geo)
4219{
4220 s = (s | geo->chunk_mask) + 1;
4221 s >>= geo->chunk_shift;
4222 s *= geo->near_copies;
4223 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4224 s *= geo->far_copies;
4225 s <<= geo->chunk_shift;
4226 return s;
4227}
4228
4229
4230
4231
4232
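/* Calculate the first device address that could contain any block
 * from the chunk holding array address 's' in geometry 'geo'
 * (rounding down, so nothing from that chunk lies before it).
 */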
4233static sector_t first_dev_address(sector_t s, struct geom *geo)
4234{
4235 s >>= geo->chunk_shift;
4236 s *= geo->near_copies;
4237 sector_div(s, geo->raid_disks);
4238 s *= geo->far_copies;
4239 s <<= geo->chunk_shift;
4240 return s;
4241}
4242
4243static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4244 int *skipped)
4245{
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
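/*
 * reshape_request() moves one window of data to its new layout.
 * We read a range (using the previous geometry) into the r10_bio
 * buffer pages, and raid10d later writes it out to every copy in the
 * new geometry (reshape_request_write). Before reading we make sure
 * the region we are about to overwrite has already been copied and,
 * when the read and write ranges could collide, update the superblock
 * checkpoint (reshape_position) and wait for it to be written so a
 * crash cannot lose data. Forwards and backwards reshapes differ only
 * in which end of the device the window starts from.
 */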
4283 struct r10conf *conf = mddev->private;
4284 struct r10bio *r10_bio;
4285 sector_t next, safe, last;
4286 int max_sectors;
4287 int nr_sectors;
4288 int s;
4289 struct md_rdev *rdev;
4290 int need_flush = 0;
4291 struct bio *blist;
4292 struct bio *bio, *read_bio;
4293 int sectors_done = 0;
4294
4295 if (sector_nr == 0) {
4296
4297 if (mddev->reshape_backwards &&
4298 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4299 sector_nr = (raid10_size(mddev, 0, 0)
4300 - conf->reshape_progress);
4301 } else if (!mddev->reshape_backwards &&
4302 conf->reshape_progress > 0)
4303 sector_nr = conf->reshape_progress;
4304 if (sector_nr) {
4305 mddev->curr_resync_completed = sector_nr;
4306 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4307 *skipped = 1;
4308 return sector_nr;
4309 }
4310 }
4311
4312
4313
4314
4315
4316 if (mddev->reshape_backwards) {
4317
4318
4319
4320 next = first_dev_address(conf->reshape_progress - 1,
4321 &conf->geo);
4322
4323
4324
4325
4326 safe = last_dev_address(conf->reshape_safe - 1,
4327 &conf->prev);
4328
4329 if (next + conf->offset_diff < safe)
4330 need_flush = 1;
4331
4332 last = conf->reshape_progress - 1;
4333 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4334 & conf->prev.chunk_mask);
4335 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4336 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4337 } else {
4338
4339
4340
4341 next = last_dev_address(conf->reshape_progress, &conf->geo);
4342
4343
4344
4345
4346 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4347
4348
4349
4350
4351 if (next > safe + conf->offset_diff)
4352 need_flush = 1;
4353
4354 sector_nr = conf->reshape_progress;
4355 last = sector_nr | (conf->geo.chunk_mask
4356 & conf->prev.chunk_mask);
4357
4358 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4359 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4360 }
4361
4362 if (need_flush ||
4363 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4364
4365 wait_barrier(conf);
4366 mddev->reshape_position = conf->reshape_progress;
4367 if (mddev->reshape_backwards)
4368 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4369 - conf->reshape_progress;
4370 else
4371 mddev->curr_resync_completed = conf->reshape_progress;
4372 conf->reshape_checkpoint = jiffies;
4373 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4374 md_wakeup_thread(mddev->thread);
4375 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4376 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4377 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4378 allow_barrier(conf);
4379 return sectors_done;
4380 }
4381 conf->reshape_safe = mddev->reshape_position;
4382 allow_barrier(conf);
4383 }
4384
4385read_more:
4386
4387 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4388 raise_barrier(conf, sectors_done != 0);
4389 atomic_set(&r10_bio->remaining, 0);
4390 r10_bio->mddev = mddev;
4391 r10_bio->sector = sector_nr;
4392 set_bit(R10BIO_IsReshape, &r10_bio->state);
4393 r10_bio->sectors = last - sector_nr + 1;
4394 rdev = read_balance(conf, r10_bio, &max_sectors);
4395 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4396
4397 if (!rdev) {
4398
4399
4400
4401
4402 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4403 return sectors_done;
4404 }
4405
4406 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4407
4408 read_bio->bi_bdev = rdev->bdev;
4409 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4410 + rdev->data_offset);
4411 read_bio->bi_private = r10_bio;
4412 read_bio->bi_end_io = end_sync_read;
4413 read_bio->bi_rw = READ;
4414 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4415 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4416 read_bio->bi_vcnt = 0;
4417 read_bio->bi_iter.bi_size = 0;
4418 r10_bio->master_bio = read_bio;
4419 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4420
4421
4422 __raid10_find_phys(&conf->geo, r10_bio);
4423
4424 blist = read_bio;
4425 read_bio->bi_next = NULL;
4426
4427 for (s = 0; s < conf->copies*2; s++) {
4428 struct bio *b;
4429 int d = r10_bio->devs[s/2].devnum;
4430 struct md_rdev *rdev2;
4431 if (s & 1) {
4432 rdev2 = conf->mirrors[d].replacement;
4433 b = r10_bio->devs[s/2].repl_bio;
4434 } else {
4435 rdev2 = conf->mirrors[d].rdev;
4436 b = r10_bio->devs[s/2].bio;
4437 }
4438 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4439 continue;
4440
4441 bio_reset(b);
4442 b->bi_bdev = rdev2->bdev;
4443 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4444 rdev2->new_data_offset;
4445 b->bi_private = r10_bio;
4446 b->bi_end_io = end_reshape_write;
4447 b->bi_rw = WRITE;
4448 b->bi_next = blist;
4449 blist = b;
4450 }
4451
4452
4453
4454 nr_sectors = 0;
4455 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4456 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4457 int len = (max_sectors - s) << 9;
4458 if (len > PAGE_SIZE)
4459 len = PAGE_SIZE;
4460 for (bio = blist; bio ; bio = bio->bi_next) {
4461 struct bio *bio2;
4462 if (bio_add_page(bio, page, len, 0))
4463 continue;
4464
4465
4466 for (bio2 = blist;
4467 bio2 && bio2 != bio;
4468 bio2 = bio2->bi_next) {
4469
4470 bio2->bi_vcnt--;
4471 bio2->bi_iter.bi_size -= len;
4472 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4473 }
4474 goto bio_full;
4475 }
4476 sector_nr += len >> 9;
4477 nr_sectors += len >> 9;
4478 }
4479bio_full:
4480 r10_bio->sectors = nr_sectors;
4481
4482
4483 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4484 atomic_inc(&r10_bio->remaining);
4485 read_bio->bi_next = NULL;
4486 generic_make_request(read_bio);
4487 sector_nr += nr_sectors;
4488 sectors_done += nr_sectors;
4489 if (sector_nr <= last)
4490 goto read_more;
4491
4492
4493
4494
4495 if (mddev->reshape_backwards)
4496 conf->reshape_progress -= sectors_done;
4497 else
4498 conf->reshape_progress += sectors_done;
4499
4500 return sectors_done;
4501}
4502
4503static void end_reshape_request(struct r10bio *r10_bio);
4504static int handle_reshape_read_error(struct mddev *mddev,
4505 struct r10bio *r10_bio);
4506static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4507{
4508
4509
4510
4511
4512
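/* Called from raid10d once the reshape read has completed (or failed):
 * re-read from the old layout if necessary, then issue the buffered
 * data as writes to every device and replacement in the new layout.
 */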
4513 struct r10conf *conf = mddev->private;
4514 int s;
4515
4516 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4517 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4518
4519 md_done_sync(mddev, r10_bio->sectors, 0);
4520 return;
4521 }
4522
4523
4524
4525
4526 atomic_set(&r10_bio->remaining, 1);
4527 for (s = 0; s < conf->copies*2; s++) {
4528 struct bio *b;
4529 int d = r10_bio->devs[s/2].devnum;
4530 struct md_rdev *rdev;
4531 if (s & 1) {
4532 rdev = conf->mirrors[d].replacement;
4533 b = r10_bio->devs[s/2].repl_bio;
4534 } else {
4535 rdev = conf->mirrors[d].rdev;
4536 b = r10_bio->devs[s/2].bio;
4537 }
4538 if (!rdev || test_bit(Faulty, &rdev->flags))
4539 continue;
4540 atomic_inc(&rdev->nr_pending);
4541 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4542 atomic_inc(&r10_bio->remaining);
4543 b->bi_next = NULL;
4544 generic_make_request(b);
4545 }
4546 end_reshape_request(r10_bio);
4547}
4548
4549static void end_reshape(struct r10conf *conf)
4550{
4551 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4552 return;
4553
4554 spin_lock_irq(&conf->device_lock);
4555 conf->prev = conf->geo;
4556 md_finish_reshape(conf->mddev);
4557 smp_wmb();
4558 conf->reshape_progress = MaxSector;
4559 spin_unlock_irq(&conf->device_lock);
4560
4561
4562
4563
4564 if (conf->mddev->queue) {
4565 int stripe = conf->geo.raid_disks *
4566 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4567 stripe /= conf->geo.near_copies;
4568 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4569 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4570 }
4571 conf->fullsync = 0;
4572}
4573
4574
4575static int handle_reshape_read_error(struct mddev *mddev,
4576 struct r10bio *r10_bio)
4577{
4578
4579 int sectors = r10_bio->sectors;
4580 struct r10conf *conf = mddev->private;
4581 struct {
4582 struct r10bio r10_bio;
4583 struct r10dev devs[conf->copies];
4584 } on_stack;
4585 struct r10bio *r10b = &on_stack.r10_bio;
4586 int slot = 0;
4587 int idx = 0;
4588 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4589
4590 r10b->sector = r10_bio->sector;
4591 __raid10_find_phys(&conf->prev, r10b);
4592
4593 while (sectors) {
4594 int s = sectors;
4595 int success = 0;
4596 int first_slot = slot;
4597
4598 if (s > (PAGE_SIZE >> 9))
4599 s = PAGE_SIZE >> 9;
4600
4601 while (!success) {
4602 int d = r10b->devs[slot].devnum;
4603 struct md_rdev *rdev = conf->mirrors[d].rdev;
4604 sector_t addr;
4605 if (rdev == NULL ||
4606 test_bit(Faulty, &rdev->flags) ||
4607 !test_bit(In_sync, &rdev->flags))
4608 goto failed;
4609
4610 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4611 success = sync_page_io(rdev,
4612 addr,
4613 s << 9,
4614 bvec[idx].bv_page,
4615 READ, false);
4616 if (success)
4617 break;
4618 failed:
4619 slot++;
4620 if (slot >= conf->copies)
4621 slot = 0;
4622 if (slot == first_slot)
4623 break;
4624 }
4625 if (!success) {
4626
4627 set_bit(MD_RECOVERY_INTR,
4628 &mddev->recovery);
4629 return -EIO;
4630 }
4631 sectors -= s;
4632 idx++;
4633 }
4634 return 0;
4635}
4636
4637static void end_reshape_write(struct bio *bio, int error)
4638{
4639 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4640 struct r10bio *r10_bio = bio->bi_private;
4641 struct mddev *mddev = r10_bio->mddev;
4642 struct r10conf *conf = mddev->private;
4643 int d;
4644 int slot;
4645 int repl;
4646 struct md_rdev *rdev = NULL;
4647
4648 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4649 if (repl)
4650 rdev = conf->mirrors[d].replacement;
4651 if (!rdev) {
4652 smp_mb();
4653 rdev = conf->mirrors[d].rdev;
4654 }
4655
4656 if (!uptodate) {
4657
4658 md_error(mddev, rdev);
4659 }
4660
4661 rdev_dec_pending(rdev, mddev);
4662 end_reshape_request(r10_bio);
4663}
4664
4665static void end_reshape_request(struct r10bio *r10_bio)
4666{
4667 if (!atomic_dec_and_test(&r10_bio->remaining))
4668 return;
4669 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4670 bio_put(r10_bio->master_bio);
4671 put_buf(r10_bio);
4672}
4673
4674static void raid10_finish_reshape(struct mddev *mddev)
4675{
4676 struct r10conf *conf = mddev->private;
4677
4678 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4679 return;
4680
4681 if (mddev->delta_disks > 0) {
4682 sector_t size = raid10_size(mddev, 0, 0);
4683 md_set_array_sectors(mddev, size);
4684 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4685 mddev->recovery_cp = mddev->resync_max_sectors;
4686 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4687 }
4688 mddev->resync_max_sectors = size;
4689 set_capacity(mddev->gendisk, mddev->array_sectors);
4690 revalidate_disk(mddev->gendisk);
4691 } else {
4692 int d;
4693 for (d = conf->geo.raid_disks ;
4694 d < conf->geo.raid_disks - mddev->delta_disks;
4695 d++) {
4696 struct md_rdev *rdev = conf->mirrors[d].rdev;
4697 if (rdev)
4698 clear_bit(In_sync, &rdev->flags);
4699 rdev = conf->mirrors[d].replacement;
4700 if (rdev)
4701 clear_bit(In_sync, &rdev->flags);
4702 }
4703 }
4704 mddev->layout = mddev->new_layout;
4705 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4706 mddev->reshape_position = MaxSector;
4707 mddev->delta_disks = 0;
4708 mddev->reshape_backwards = 0;
4709}
4710
4711static struct md_personality raid10_personality =
4712{
4713 .name = "raid10",
4714 .level = 10,
4715 .owner = THIS_MODULE,
4716 .make_request = make_request,
4717 .run = run,
4718 .stop = stop,
4719 .status = status,
4720 .error_handler = error,
4721 .hot_add_disk = raid10_add_disk,
4722 .hot_remove_disk= raid10_remove_disk,
4723 .spare_active = raid10_spare_active,
4724 .sync_request = sync_request,
4725 .quiesce = raid10_quiesce,
4726 .size = raid10_size,
4727 .resize = raid10_resize,
4728 .takeover = raid10_takeover,
4729 .check_reshape = raid10_check_reshape,
4730 .start_reshape = raid10_start_reshape,
4731 .finish_reshape = raid10_finish_reshape,
4732};
4733
4734static int __init raid_init(void)
4735{
4736 return register_md_personality(&raid10_personality);
4737}
4738
4739static void raid_exit(void)
4740{
4741 unregister_md_personality(&raid10_personality);
4742}
4743
4744module_init(raid_init);
4745module_exit(raid_exit);
4746MODULE_LICENSE("GPL");
4747MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4748MODULE_ALIAS("md-personality-9");
4749MODULE_ALIAS("md-raid10");
4750MODULE_ALIAS("md-level-10");
4751
4752module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4753