/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Base on code in raid1.c.  See raid1.c for further copyright information.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.  In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk is stored
 * (each on a different drive).  The starting device for each section is offset
 * near_copies from the starting device of the previous section.  Thus there
 * are (near_copies * far_copies) of each chunk, and each is on a different
 * drive.  near_copies and far_copies must be at least one, and their product
 * is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays confined
 * to the set rather than the entire array.  This is done to improve the number
 * of device combinations that can fail without causing the array to fail.
 */
/*
 * Number of guaranteed r10bios in case of extreme VM load:
 */
#define NR_RAID10_BIOS 256

/* when we get a read error on a read-only array, we redirect to another
 * device without failing the first device, or trying to over-write to
 * correct the read error.  To keep track of bad blocks on a per-bio
 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
 */
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad-block, we need to remove the
 * bad-block marking which must be done from process context.  So we record
 * the success by setting devs[n].bio to IO_MADE_GOOD
 */
#define IO_MADE_GOOD ((struct bio *)2)

#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)

/* When there are this many requests queued to be written by
 * the raid10 thread, we become 'congested' to provide back-pressure
 * for writeback.
 */
static int max_queued_requests = 1024;

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio, int error);
static void end_reshape(struct r10conf *conf);
static void *r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int size = offsetof(struct r10bio, devs[conf->copies]);

	/* allocate a r10bio with room for raid_disks entries in the
	 * bios array */
	return kzalloc(size, gfp_flags);
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf)
 *
 */
static void *r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	struct page *page;
	struct r10bio *r10_bio;
	struct bio *bio;
	int i, j;
	int nalloc;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio)
		return NULL;

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
		if (!conf->have_replacement)
			continue;
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].repl_bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them to the bios;
	 * during recovery/reshape the pages of the first bio are shared
	 * with the other bios rather than allocated separately.
	 */
	for (j = 0 ; j < nalloc; j++) {
		struct bio *rbio = r10_bio->devs[j].repl_bio;
		bio = r10_bio->devs[j].bio;
		for (i = 0; i < RESYNC_PAGES; i++) {
			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
					       &conf->mddev->recovery)) {
				/* we can share bv_page's during recovery
				 * and reshape */
				struct bio *rbio = r10_bio->devs[0].bio;
				page = rbio->bi_io_vec[i].bv_page;
				get_page(page);
			} else
				page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
			if (rbio)
				rbio->bi_io_vec[i].bv_page = page;
		}
	}

	return r10_bio;

out_free_pages:
	for ( ; i > 0 ; i--)
		safe_put_page(bio->bi_io_vec[i-1].bv_page);
	while (j--)
		for (i = 0; i < RESYNC_PAGES ; i++)
			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
	j = 0;
out_free_bio:
	for ( ; j < nalloc; j++) {
		if (r10_bio->devs[j].bio)
			bio_put(r10_bio->devs[j].bio);
		if (r10_bio->devs[j].repl_bio)
			bio_put(r10_bio->devs[j].repl_bio);
	}
	r10bio_pool_free(r10_bio, conf);
	return NULL;
}

static void r10buf_pool_free(void *__r10_bio, void *data)
{
	int i;
	struct r10conf *conf = data;
	struct r10bio *r10bio = __r10_bio;
	int j;

	for (j = 0; j < conf->copies; j++) {
		struct bio *bio = r10bio->devs[j].bio;
		if (bio) {
			for (i = 0; i < RESYNC_PAGES; i++) {
				safe_put_page(bio->bi_io_vec[i].bv_page);
				bio->bi_io_vec[i].bv_page = NULL;
			}
			bio_put(bio);
		}
		bio = r10bio->devs[j].repl_bio;
		if (bio)
			bio_put(bio);
	}
	r10bio_pool_free(r10bio, conf);
}

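/* Drop our reference on the bios attached to an r10bio, skipping the
 * IO_BLOCKED/IO_MADE_GOOD special markers.  Replacement bios are only
 * put when the r10bio was a write (read_slot < 0).
 */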
static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{
	int i;

	for (i = 0; i < conf->copies; i++) {
		struct bio **bio = &r10_bio->devs[i].bio;
		if (!BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
		bio = &r10_bio->devs[i].repl_bio;
		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
	}
}

static void free_r10bio(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, conf->r10bio_pool);
}

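/* Return a resync/recovery buffer to the r10buf pool and drop the
 * barrier reference that was taken for it.
 */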
static void put_buf(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	mempool_free(r10_bio, conf->r10buf_pool);

	lower_barrier(conf);
}

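/* Queue an r10bio for handling by the raid10d thread, typically after
 * a read error or when bad-block bookkeeping must run in process
 * context.
 */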
static void reschedule_retry(struct r10bio *r10_bio)
{
	unsigned long flags;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	conf->nr_queued++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	/* wake up frozen array... */
	wake_up(&conf->wait_barrier);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(struct r10bio *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;
	int done;
	struct r10conf *conf = r10_bio->mddev->private;

	if (bio->bi_phys_segments) {
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		bio->bi_phys_segments--;
		done = (bio->bi_phys_segments == 0);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	} else
		done = 1;
	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	if (done) {
		bio_endio(bio, 0);
		/*
		 * Wake up any possible resync thread that waits for the device
		 * to go idle.
		 */
		allow_barrier(conf);
	}
	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

/*
 * Find the disk number which triggered given bio
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{
	int slot;
	int repl = 0;

	for (slot = 0; slot < conf->copies; slot++) {
		if (r10_bio->devs[slot].bio == bio)
			break;
		if (r10_bio->devs[slot].repl_bio == bio) {
			repl = 1;
			break;
		}
	}

	BUG_ON(slot == conf->copies);
	update_head_pos(slot, r10_bio);

	if (slotp)
		*slotp = slot;
	if (replp)
		*replp = repl;
	return r10_bio->devs[slot].devnum;
}
static void raid10_end_read_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct r10bio *r10_bio = bio->bi_private;
	int slot, dev;
	struct md_rdev *rdev;
	struct r10conf *conf = r10_bio->mddev->private;

	slot = r10_bio->read_slot;
	dev = r10_bio->devs[slot].devnum;
	rdev = r10_bio->devs[slot].rdev;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(slot, r10_bio);

	if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-level so we can copy the sector argument easily.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	} else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry"
		 */
		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
			     rdev->raid_disk))
			uptodate = 1;
	}
	if (uptodate) {
		raid_end_bio_io(r10_bio);
		rdev_dec_pending(rdev, conf->mddev);
	} else {
		/*
		 * oops, read error - keep the refcount on the rdev
		 */
		char b[BDEVNAME_SIZE];
		printk_ratelimited(KERN_ERR
				   "md/raid10:%s: %s: rescheduling sector %llu\n",
				   mdname(conf->mddev),
				   bdevname(rdev->bdev, b),
				   (unsigned long long)r10_bio->sector);
		set_bit(R10BIO_ReadError, &r10_bio->state);
		reschedule_retry(r10_bio);
	}
}

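/* A mirrored write has completed on all devices: update the bitmap and
 * let md account the finished write.
 */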
static void close_write(struct r10bio *r10_bio)
{
	/* clear the bitmap if all writes complete successfully */
	bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
			r10_bio->sectors,
			!test_bit(R10BIO_Degraded, &r10_bio->state),
			0);
	md_write_end(r10_bio->mddev);
}

static void one_write_done(struct r10bio *r10_bio)
{
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		if (test_bit(R10BIO_WriteError, &r10_bio->state))
			reschedule_retry(r10_bio);
		else {
			close_write(r10_bio);
			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				raid_end_bio_io(r10_bio);
		}
	}
}

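/* Per-device completion handler for writes: record write errors for
 * later retry/bad-block handling instead of failing the device
 * immediately, and note writes that landed on known-bad blocks so the
 * bad-block list can be trimmed from process context.
 */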
static void raid10_end_write_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct r10bio *r10_bio = bio->bi_private;
	int dev;
	int dec_rdev = 1;
	struct r10conf *conf = r10_bio->mddev->private;
	int slot, repl;
	struct md_rdev *rdev = NULL;

	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);

	if (repl)
		rdev = conf->mirrors[dev].replacement;
	if (!rdev) {
		smp_rmb();
		repl = 0;
		rdev = conf->mirrors[dev].rdev;
	}
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate) {
		if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
			md_error(rdev->mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
			set_bit(R10BIO_WriteError, &r10_bio->state);
			dec_rdev = 0;
		}
	} else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-level so we can copy the sector argument easily.
		 */
		sector_t first_bad;
		int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty. This is because we cannot use
		 * such device for properly reading the data back (we could
		 * potentially use it, if the current write would have felt
		 * like blocking on the device, we could write back to it)
		 */
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			set_bit(R10BIO_Uptodate, &r10_bio->state);

		/* Maybe we can clear some bad blocks. */
		if (is_badblock(rdev,
				r10_bio->devs[slot].addr,
				r10_bio->sectors,
				&first_bad, &bad_sectors)) {
			bio_put(bio);
			if (repl)
				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
			else
				r10_bio->devs[slot].bio = IO_MADE_GOOD;
			dec_rdev = 0;
			set_bit(R10BIO_MadeGood, &r10_bio->state);
		}
	}

	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	one_write_done(r10_bio);
	if (dec_rdev)
		rdev_dec_pending(rdev, conf->mddev);
}

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Chunks with extra copies are stored 'near_copies' times near
 * each other on different devices, and 'far_copies' times apart
 * from each other.
 *
 * __raid10_find_phys computes the chunk-to-device mapping: it fills
 * in the device number and device-relative sector of every copy of
 * the r10bio's logical sector.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address.
 */

static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{
	int n, f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;
	int slot = 0;
	int last_far_set_start, last_far_set_size;

	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
	last_far_set_start *= geo->far_set_size;

	last_far_set_size = geo->far_set_size;
	last_far_set_size += (geo->raid_disks % geo->far_set_size);

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> geo->chunk_shift;
	sector = r10bio->sector & geo->chunk_mask;

	chunk *= geo->near_copies;
	stripe = chunk;
	dev = sector_div(stripe, geo->raid_disks);
	if (geo->far_offset)
		stripe *= geo->far_copies;

	sector += stripe << geo->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		int set;
		sector_t s = sector;
		r10bio->devs[slot].devnum = d;
		r10bio->devs[slot].addr = s;
		slot++;

		for (f = 1; f < geo->far_copies; f++) {
			set = d / geo->far_set_size;
			d += geo->near_copies;

			if ((geo->raid_disks % geo->far_set_size) &&
			    (d > last_far_set_start)) {
				d -= last_far_set_start;
				d %= last_far_set_size;
				d += last_far_set_start;
			} else {
				d %= geo->far_set_size;
				d += geo->far_set_size * set;
			}
			s += geo->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		if (dev >= geo->raid_disks) {
			dev = 0;
			sector += (geo->chunk_mask + 1);
		}
	}
}

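/* Map an r10bio through the current geometry, or through the previous
 * geometry if the bio falls on the not-yet-reshaped side of
 * reshape_progress.
 */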
static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
{
	struct geom *geo = &conf->geo;

	if (conf->reshape_progress != MaxSector &&
	    ((r10bio->sector >= conf->reshape_progress) !=
	     conf->mddev->reshape_backwards)) {
		set_bit(R10BIO_Previous, &r10bio->state);
		geo = &conf->prev;
	} else
		clear_bit(R10BIO_Previous, &r10bio->state);

	__raid10_find_phys(geo, r10bio);
}

static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;

	/* Never use conf->prev as this is only called during resync
	 * or recovery, so reshape isn't happening
	 */
	struct geom *geo = &conf->geo;
	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
	int far_set_size = geo->far_set_size;
	int last_far_set_start;

	if (geo->raid_disks % geo->far_set_size) {
		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
		last_far_set_start *= geo->far_set_size;

		if (dev >= last_far_set_start) {
			far_set_size = geo->far_set_size;
			far_set_size += (geo->raid_disks % geo->far_set_size);
			far_set_start = last_far_set_start;
		}
	}

	offset = sector & geo->chunk_mask;
	if (geo->far_offset) {
		int fc;
		chunk = sector >> geo->chunk_shift;
		fc = sector_div(chunk, geo->far_copies);
		dev -= fc * geo->near_copies;
		if (dev < far_set_start)
			dev += far_set_size;
	} else {
		while (sector >= geo->stride) {
			sector -= geo->stride;
			if (dev < (geo->near_copies + far_set_start))
				dev += far_set_size - geo->near_copies;
			else
				dev -= geo->near_copies;
		}
		chunk = sector >> geo->chunk_shift;
	}
	vchunk = chunk * geo->raid_disks + dev;
	sector_div(vchunk, geo->near_copies);
	return (vchunk << geo->chunk_shift) + offset;
}

/**
 *	raid10_mergeable_bvec -- tell bio layer if two requests can be merged
 *	@q: request queue
 *	@bvm: properties of new bio
 *	@biovec: the request that could be merged to it.
 *
 *	Return amount of bytes we can accept at this offset
 *	This requires checking for end-of-chunk if near_copies != raid_disks,
 *	and for subordinate merge_bvec_fns if merge_check_needed.
 */
static int raid10_mergeable_bvec(struct request_queue *q,
				 struct bvec_merge_data *bvm,
				 struct bio_vec *biovec)
{
	struct mddev *mddev = q->queuedata;
	struct r10conf *conf = mddev->private;
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	int max;
	unsigned int chunk_sectors;
	unsigned int bio_sectors = bvm->bi_size >> 9;
	struct geom *geo = &conf->geo;

	chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
	if (conf->reshape_progress != MaxSector &&
	    ((sector >= conf->reshape_progress) !=
	     conf->mddev->reshape_backwards))
		geo = &conf->prev;

	if (geo->near_copies < geo->raid_disks) {
		max = (chunk_sectors - ((sector & (chunk_sectors - 1))
					+ bio_sectors)) << 9;
		if (max < 0)
			/* bio_add cannot handle a negative return */
			max = 0;
		if (max <= biovec->bv_len && bio_sectors == 0)
			return biovec->bv_len;
	} else
		max = biovec->bv_len;

	if (mddev->merge_check_needed) {
		struct {
			struct r10bio r10_bio;
			struct r10dev devs[conf->copies];
		} on_stack;
		struct r10bio *r10_bio = &on_stack.r10_bio;
		int s;
		if (conf->reshape_progress != MaxSector) {
			/* Cannot give any guidance during reshape */
			if (max <= biovec->bv_len && bio_sectors == 0)
				return biovec->bv_len;
			return 0;
		}
		r10_bio->sector = sector;
		raid10_find_phys(conf, r10_bio);
		rcu_read_lock();
		for (s = 0; s < conf->copies; s++) {
			int disk = r10_bio->devs[s].devnum;
			struct md_rdev *rdev = rcu_dereference(
				conf->mirrors[disk].rdev);
			if (rdev && !test_bit(Faulty, &rdev->flags)) {
				struct request_queue *q =
					bdev_get_queue(rdev->bdev);
				if (q->merge_bvec_fn) {
					bvm->bi_sector = r10_bio->devs[s].addr
						+ rdev->data_offset;
					bvm->bi_bdev = rdev->bdev;
					max = min(max, q->merge_bvec_fn(
							  q, bvm, biovec));
				}
			}
			rdev = rcu_dereference(conf->mirrors[disk].replacement);
			if (rdev && !test_bit(Faulty, &rdev->flags)) {
				struct request_queue *q =
					bdev_get_queue(rdev->bdev);
				if (q->merge_bvec_fn) {
					bvm->bi_sector = r10_bio->devs[s].addr
						+ rdev->data_offset;
					bvm->bi_bdev = rdev->bdev;
					max = min(max, q->merge_bvec_fn(
							  q, bvm, biovec));
				}
			}
		}
		rcu_read_unlock();
	}
	return max;
}

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */

/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies or far_copies geometry.
 */
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot;
	int sectors = r10_bio->sectors;
	int best_good_sectors;
	sector_t new_distance, best_dist;
	struct md_rdev *best_rdev, *rdev = NULL;
	int do_balance;
	int best_slot;
	struct geom *geo = &conf->geo;

	raid10_find_phys(conf, r10_bio);
	rcu_read_lock();
retry:
	sectors = r10_bio->sectors;
	best_slot = -1;
	best_rdev = NULL;
	best_dist = MaxSector;
	best_good_sectors = 0;
	do_balance = 1;
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
	if (conf->mddev->recovery_cp < MaxSector
	    && (this_sector + sectors >= conf->next_resync))
		do_balance = 0;

	for (slot = 0; slot < conf->copies ; slot++) {
		sector_t first_bad;
		int bad_sectors;
		sector_t dev_sector;

		if (r10_bio->devs[slot].bio == IO_BLOCKED)
			continue;
		disk = r10_bio->devs[slot].devnum;
		rdev = rcu_dereference(conf->mirrors[disk].replacement);
		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
		    test_bit(Unmerged, &rdev->flags) ||
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			rdev = rcu_dereference(conf->mirrors[disk].rdev);
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags) ||
		    test_bit(Unmerged, &rdev->flags))
			continue;
		if (!test_bit(In_sync, &rdev->flags) &&
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			continue;

		dev_sector = r10_bio->devs[slot].addr;
		if (is_badblock(rdev, dev_sector, sectors,
				&first_bad, &bad_sectors)) {
			if (best_dist < MaxSector)
				/* Already have a better slot */
				continue;
			if (first_bad <= dev_sector) {
				/* Cannot read here.  If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
				bad_sectors -= (dev_sector - first_bad);
				if (!do_balance && sectors > bad_sectors)
					sectors = bad_sectors;
				if (best_good_sectors > sectors)
					best_good_sectors = sectors;
			} else {
				sector_t good_sectors =
					first_bad - dev_sector;
				if (good_sectors > best_good_sectors) {
					best_good_sectors = good_sectors;
					best_slot = slot;
					best_rdev = rdev;
				}
				if (!do_balance)
					/* Must read from here */
					break;
			}
			continue;
		} else
			best_good_sectors = sectors;

		if (!do_balance)
			break;

		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
			break;

		/* for far > 1 always use the lowest address */
		if (geo->far_copies > 1)
			new_distance = r10_bio->devs[slot].addr;
		else
			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);
		if (new_distance < best_dist) {
			best_dist = new_distance;
			best_slot = slot;
			best_rdev = rdev;
		}
	}
	if (slot >= conf->copies) {
		slot = best_slot;
		rdev = best_rdev;
	}

	if (slot >= 0) {
		atomic_inc(&rdev->nr_pending);
		if (test_bit(Faulty, &rdev->flags)) {
			/* Cannot risk returning a device that failed
			 * before we inc'ed nr_pending
			 */
			rdev_dec_pending(rdev, conf->mddev);
			goto retry;
		}
		r10_bio->read_slot = slot;
	} else
		rdev = NULL;
	rcu_read_unlock();
	*max_sectors = best_good_sectors;

	return rdev;
}

int md_raid10_congested(struct mddev *mddev, int bits)
{
	struct r10conf *conf = mddev->private;
	int i, ret = 0;

	if ((bits & (1 << BDI_async_congested)) &&
	    conf->pending_count >= max_queued_requests)
		return 1;

	rcu_read_lock();
	for (i = 0;
	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
		     && ret == 0;
	     i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags)) {
			struct request_queue *q = bdev_get_queue(rdev->bdev);

			ret |= bdi_congested(&q->backing_dev_info, bits);
		}
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(md_raid10_congested);

static int raid10_congested(void *data, int bits)
{
	struct mddev *mddev = data;

	return mddev_congested(mddev, bits) ||
		md_raid10_congested(mddev, bits);
}

static void flush_pending_writes(struct r10conf *conf)
{
	/* Any writes that have been queued but are awaiting
	 * bitmap updates get flushed here.
	 */
	spin_lock_irq(&conf->device_lock);

	if (conf->pending_bio_list.head) {
		struct bio *bio;
		bio = bio_list_get(&conf->pending_bio_list);
		conf->pending_count = 0;
		spin_unlock_irq(&conf->device_lock);
		/* flush any pending bitmap writes to disk
		 * before proceeding w/ I/O */
		bitmap_unplug(conf->mddev->bitmap);
		wake_up(&conf->wait_barrier);

		while (bio) { /* submit pending writes */
			struct bio *next = bio->bi_next;
			bio->bi_next = NULL;
			if (unlikely((bio->bi_rw & REQ_DISCARD) &&
			    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
				/* Just ignore it */
				bio_endio(bio, 0);
			else
				generic_make_request(bio);
			bio = next;
		}
	} else
		spin_unlock_irq(&conf->device_lock);
}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the background IO completes.
 */
static void raise_barrier(struct r10conf *conf, int force)
{
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock);

	/* block any new IO from starting */
	conf->barrier++;

	/* Now wait for all pending IO to complete */
	wait_event_lock_irq(conf->wait_barrier,
			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock);

	spin_unlock_irq(&conf->resync_lock);
}

static void lower_barrier(struct r10conf *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->barrier--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

static void wait_barrier(struct r10conf *conf)
{
	spin_lock_irq(&conf->resync_lock);
	if (conf->barrier) {
		conf->nr_waiting++;
		/* Wait for the barrier to drop.
		 * However if there are already pending
		 * requests (preventing the barrier from
		 * rising completely), and the
		 * pre-process bio queue isn't empty,
		 * then don't wait, as we need to empty
		 * that queue to get the nr_pending
		 * count down.
		 */
		wait_event_lock_irq(conf->wait_barrier,
				    !conf->barrier ||
				    (conf->nr_pending &&
				     current->bio_list &&
				     !bio_list_empty(current->bio_list)),
				    conf->resync_lock);
		conf->nr_waiting--;
	}
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);
}

static void allow_barrier(struct r10conf *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->nr_pending--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

static void freeze_array(struct r10conf *conf, int extra)
{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending match nr_queued+extra
	 * This is called in the context of one normal IO request
	 * that has failed. Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier++;
	conf->nr_waiting++;
	wait_event_lock_irq_cmd(conf->wait_barrier,
				conf->nr_pending == conf->nr_queued+extra,
				conf->resync_lock,
				flush_pending_writes(conf));

	spin_unlock_irq(&conf->resync_lock);
}

static void unfreeze_array(struct r10conf *conf)
{
	/* reverse the effect of the freeze */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier--;
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	spin_unlock_irq(&conf->resync_lock);
}

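/* During a reshape the data may live at either the old or the new
 * offset on each device; pick the right one for this r10bio.
 */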
static sector_t choose_data_offset(struct r10bio *r10_bio,
				   struct md_rdev *rdev)
{
	if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
	    test_bit(R10BIO_Previous, &r10_bio->state))
		return rdev->data_offset;
	else
		return rdev->new_data_offset;
}

struct raid10_plug_cb {
	struct blk_plug_cb cb;
	struct bio_list pending;
	int pending_cnt;
};

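/* Called when a blk_plug is released: either hand the plugged writes
 * to the raid10d thread (if unplugged from schedule or while inside
 * generic_make_request) or submit them directly.
 */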
static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
						   cb);
	struct mddev *mddev = plug->cb.data;
	struct r10conf *conf = mddev->private;
	struct bio *bio;

	if (from_schedule || current->bio_list) {
		spin_lock_irq(&conf->device_lock);
		bio_list_merge(&conf->pending_bio_list, &plug->pending);
		conf->pending_count += plug->pending_cnt;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_barrier);
		md_wakeup_thread(mddev->thread);
		kfree(plug);
		return;
	}

	/* we aren't scheduling, so we can do the write-out directly. */
	bio = bio_list_get(&plug->pending);
	bitmap_unplug(mddev->bitmap);
	wake_up(&conf->wait_barrier);

	while (bio) {
		struct bio *next = bio->bi_next;
		bio->bi_next = NULL;
		if (unlikely((bio->bi_rw & REQ_DISCARD) &&
		    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
			/* Just ignore it */
			bio_endio(bio, 0);
		else
			generic_make_request(bio);
		bio = next;
	}
	kfree(plug);
}

static void make_request(struct mddev *mddev, struct bio *bio)
{
	struct r10conf *conf = mddev->private;
	struct r10bio *r10_bio;
	struct bio *read_bio;
	int i;
	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
	int chunk_sects = chunk_mask + 1;
	const int rw = bio_data_dir(bio);
	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
	const unsigned long do_discard = (bio->bi_rw
					  & (REQ_DISCARD | REQ_SECURE));
	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
	unsigned long flags;
	struct md_rdev *blocked_rdev;
	struct blk_plug_cb *cb;
	struct raid10_plug_cb *plug = NULL;
	int sectors_handled;
	int max_sectors;
	int sectors;

	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
		md_flush_request(mddev, bio);
		return;
	}

	/* If this request crosses a chunk boundary, we need to split
	 * it.  This will only happen for 1 PAGE (or less) requests.
	 */
	if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
		     > chunk_sects
		     && (conf->geo.near_copies < conf->geo.raid_disks
			 || conf->prev.near_copies < conf->prev.raid_disks))) {
		struct bio_pair *bp;

		if (bio_segments(bio) > 1)
			goto bad_map;
		/* This is a one page bio that upper layers
		 * refuse to split for us, so we need to split it.
		 */
		bp = bio_split(bio,
			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)));

		/* Each of these 'make_request' calls will call 'wait_barrier'.
		 * If the first succeeds but the second blocks due to the resync
		 * thread raising the barrier, we will deadlock because the
		 * IO to the underlying device will be queued in generic_make_request
		 * and will never complete, so will never reduce nr_pending.
		 * So increment nr_waiting here so no new raise_barriers will
		 * succeed, and so the second wait_barrier can proceed.
		 */
		spin_lock_irq(&conf->resync_lock);
		conf->nr_waiting++;
		spin_unlock_irq(&conf->resync_lock);

		make_request(mddev, &bp->bio1);
		make_request(mddev, &bp->bio2);

		spin_lock_irq(&conf->resync_lock);
		conf->nr_waiting--;
		wake_up(&conf->wait_barrier);
		spin_unlock_irq(&conf->resync_lock);

		bio_pair_release(bp);
		return;
	bad_map:
		printk("md/raid10:%s: make_request bug: can't convert block across chunks"
		       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
		       (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);

		bio_io_error(bio);
		return;
	}

	md_write_start(mddev, bio);

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
	wait_barrier(conf);

	sectors = bio_sectors(bio);
	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	       bio->bi_sector < conf->reshape_progress &&
	       bio->bi_sector + sectors > conf->reshape_progress) {
		/* IO spans the reshape position.  Need to wait for
		 * reshape to pass
		 */
		allow_barrier(conf);
		wait_event(conf->wait_barrier,
			   conf->reshape_progress <= bio->bi_sector ||
			   conf->reshape_progress >= bio->bi_sector + sectors);
		wait_barrier(conf);
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    bio_data_dir(bio) == WRITE &&
	    (mddev->reshape_backwards
	     ? (bio->bi_sector < conf->reshape_safe &&
		bio->bi_sector + sectors > conf->reshape_progress)
	     : (bio->bi_sector + sectors > conf->reshape_safe &&
		bio->bi_sector < conf->reshape_progress))) {
		/* Need to update reshape_position in metadata */
		mddev->reshape_position = conf->reshape_progress;
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		set_bit(MD_CHANGE_PENDING, &mddev->flags);
		md_wakeup_thread(mddev->thread);
		wait_event(mddev->sb_wait,
			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));

		conf->reshape_safe = mddev->reshape_position;
	}

	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

	r10_bio->master_bio = bio;
	r10_bio->sectors = sectors;

	r10_bio->mddev = mddev;
	r10_bio->sector = bio->bi_sector;
	r10_bio->state = 0;

	/* We might need to issue multiple reads to different
	 * devices if there are bad blocks around, so we keep
	 * track of the number of reads in bio->bi_phys_segments.
	 * If this is 0, there is only one r10_bio and no locking
	 * will be needed when the request completes.  If it is
	 * non-zero, then it is the number of not-completed requests.
	 */
	bio->bi_phys_segments = 0;
	clear_bit(BIO_SEG_VALID, &bio->bi_flags);

	if (rw == READ) {
		/*
		 * read balancing logic:
		 */
		struct md_rdev *rdev;
		int slot;

read_again:
		rdev = read_balance(conf, r10_bio, &max_sectors);
		if (!rdev) {
			raid_end_bio_io(r10_bio);
			return;
		}
		slot = r10_bio->read_slot;

		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
		md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
			    max_sectors);

		r10_bio->devs[slot].bio = read_bio;
		r10_bio->devs[slot].rdev = rdev;

		read_bio->bi_sector = r10_bio->devs[slot].addr +
			choose_data_offset(r10_bio, rdev);
		read_bio->bi_bdev = rdev->bdev;
		read_bio->bi_end_io = raid10_end_read_request;
		read_bio->bi_rw = READ | do_sync;
		read_bio->bi_private = r10_bio;

		if (max_sectors < r10_bio->sectors) {
			/* Could not read all from this device, so we will
			 * need another r10_bio.
			 */
			sectors_handled = (r10_bio->sector + max_sectors
					   - bio->bi_sector);
			r10_bio->sectors = max_sectors;
			spin_lock_irq(&conf->device_lock);
			if (bio->bi_phys_segments == 0)
				bio->bi_phys_segments = 2;
			else
				bio->bi_phys_segments++;
			spin_unlock_irq(&conf->device_lock);
			/* Cannot call generic_make_request directly
			 * as that will be queued in __generic_make_request
			 * and subsequent mempool_alloc might block
			 * waiting for it.  so hand bio over to raid10d.
			 */
			reschedule_retry(r10_bio);

			r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

			r10_bio->master_bio = bio;
			r10_bio->sectors = bio_sectors(bio) - sectors_handled;
			r10_bio->state = 0;
			r10_bio->mddev = mddev;
			r10_bio->sector = bio->bi_sector + sectors_handled;
			goto read_again;
		} else
			generic_make_request(read_bio);
		return;
	}

	/*
	 * WRITE:
	 */
	if (conf->pending_count >= max_queued_requests) {
		md_wakeup_thread(mddev->thread);
		wait_event(conf->wait_barrier,
			   conf->pending_count < max_queued_requests);
	}
	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 * If there are known/acknowledged bad blocks on any device
	 * on which we have seen a write error, we want to avoid
	 * writing to those blocks.  This potentially requires several
	 * writes to write around the bad blocks.  Each set of writes
	 * gets its own r10_bio with a set of bios attached.  The number
	 * of r10_bios is recorded in bio->bi_phys_segments just as with
	 * the read case.
	 */

	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
	raid10_find_phys(conf, r10_bio);
retry_write:
	blocked_rdev = NULL;
	rcu_read_lock();
	max_sectors = r10_bio->sectors;

	for (i = 0; i < conf->copies; i++) {
		int d = r10_bio->devs[i].devnum;
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
		struct md_rdev *rrdev = rcu_dereference(
			conf->mirrors[d].replacement);
		if (rdev == rrdev)
			rrdev = NULL;
		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
			atomic_inc(&rdev->nr_pending);
			blocked_rdev = rdev;
			break;
		}
		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
			atomic_inc(&rrdev->nr_pending);
			blocked_rdev = rrdev;
			break;
		}
		if (rdev && (test_bit(Faulty, &rdev->flags)
			     || test_bit(Unmerged, &rdev->flags)))
			rdev = NULL;
		if (rrdev && (test_bit(Faulty, &rrdev->flags)
			      || test_bit(Unmerged, &rrdev->flags)))
			rrdev = NULL;

		r10_bio->devs[i].bio = NULL;
		r10_bio->devs[i].repl_bio = NULL;

		if (!rdev && !rrdev) {
			set_bit(R10BIO_Degraded, &r10_bio->state);
			continue;
		}
		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			sector_t dev_sector = r10_bio->devs[i].addr;
			int bad_sectors;
			int is_bad;

			is_bad = is_badblock(rdev, dev_sector,
					     max_sectors,
					     &first_bad, &bad_sectors);
			if (is_bad < 0) {
				/* Mustn't write here until the bad block
				 * is acknowledged
				 */
				atomic_inc(&rdev->nr_pending);
				set_bit(BlockedBadBlocks, &rdev->flags);
				blocked_rdev = rdev;
				break;
			}
			if (is_bad && first_bad <= dev_sector) {
				/* Cannot write here at all */
				bad_sectors -= (dev_sector - first_bad);
				if (bad_sectors < max_sectors)
					/* Mustn't write more than bad_sectors
					 * to other devices yet
					 */
					max_sectors = bad_sectors;
				/* We don't set R10BIO_Degraded as that
				 * only applies if the disk is missing,
				 * so it might be re-added, and we want to
				 * know to recover this chunk.
				 * In this case the device is here, and the
				 * fact that this chunk is not in-sync is
				 * recorded in the bad block log.
				 */
				continue;
			}
			if (is_bad) {
				int good_sectors = first_bad - dev_sector;
				if (good_sectors < max_sectors)
					max_sectors = good_sectors;
			}
		}
		if (rdev) {
			r10_bio->devs[i].bio = bio;
			atomic_inc(&rdev->nr_pending);
		}
		if (rrdev) {
			r10_bio->devs[i].repl_bio = bio;
			atomic_inc(&rrdev->nr_pending);
		}
	}
	rcu_read_unlock();

	if (unlikely(blocked_rdev)) {
		/* Have to wait for this device to get unblocked, then retry */
		int j;
		int d;

		for (j = 0; j < i; j++) {
			if (r10_bio->devs[j].bio) {
				d = r10_bio->devs[j].devnum;
				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
			}
			if (r10_bio->devs[j].repl_bio) {
				struct md_rdev *rdev;
				d = r10_bio->devs[j].devnum;
				rdev = conf->mirrors[d].replacement;
				if (!rdev) {
					/* Race with remove_disk */
					smp_mb();
					rdev = conf->mirrors[d].rdev;
				}
				rdev_dec_pending(rdev, mddev);
			}
		}
		allow_barrier(conf);
		md_wait_for_blocked_rdev(blocked_rdev, mddev);
		wait_barrier(conf);
		goto retry_write;
	}

	if (max_sectors < r10_bio->sectors) {
		/* We are splitting this into multiple parts, so
		 * we need to prepare for allocating another r10_bio.
		 */
		r10_bio->sectors = max_sectors;
		spin_lock_irq(&conf->device_lock);
		if (bio->bi_phys_segments == 0)
			bio->bi_phys_segments = 2;
		else
			bio->bi_phys_segments++;
		spin_unlock_irq(&conf->device_lock);
	}
	sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;

	atomic_set(&r10_bio->remaining, 1);
	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);

	for (i = 0; i < conf->copies; i++) {
		struct bio *mbio;
		int d = r10_bio->devs[i].devnum;
		if (r10_bio->devs[i].bio) {
			struct md_rdev *rdev = conf->mirrors[d].rdev;
			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
				    max_sectors);
			r10_bio->devs[i].bio = mbio;

			mbio->bi_sector = (r10_bio->devs[i].addr +
					   choose_data_offset(r10_bio,
							      rdev));
			mbio->bi_bdev = rdev->bdev;
			mbio->bi_end_io = raid10_end_write_request;
			mbio->bi_rw =
				WRITE | do_sync | do_fua | do_discard | do_same;
			mbio->bi_private = r10_bio;

			atomic_inc(&r10_bio->remaining);

			cb = blk_check_plugged(raid10_unplug, mddev,
					       sizeof(*plug));
			if (cb)
				plug = container_of(cb, struct raid10_plug_cb,
						    cb);
			else
				plug = NULL;
			spin_lock_irqsave(&conf->device_lock, flags);
			if (plug) {
				bio_list_add(&plug->pending, mbio);
				plug->pending_cnt++;
			} else {
				bio_list_add(&conf->pending_bio_list, mbio);
				conf->pending_count++;
			}
			spin_unlock_irqrestore(&conf->device_lock, flags);
			if (!plug)
				md_wakeup_thread(mddev->thread);
		}

		if (r10_bio->devs[i].repl_bio) {
			struct md_rdev *rdev = conf->mirrors[d].replacement;
			if (rdev == NULL) {
				/* Replacement just got moved to main 'rdev' */
				smp_mb();
				rdev = conf->mirrors[d].rdev;
			}
			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
				    max_sectors);
			r10_bio->devs[i].repl_bio = mbio;

			mbio->bi_sector = (r10_bio->devs[i].addr +
					   choose_data_offset(
						   r10_bio, rdev));
			mbio->bi_bdev = rdev->bdev;
			mbio->bi_end_io = raid10_end_write_request;
			mbio->bi_rw =
				WRITE | do_sync | do_fua | do_discard | do_same;
			mbio->bi_private = r10_bio;

			atomic_inc(&r10_bio->remaining);
			spin_lock_irqsave(&conf->device_lock, flags);
			bio_list_add(&conf->pending_bio_list, mbio);
			conf->pending_count++;
			spin_unlock_irqrestore(&conf->device_lock, flags);
			if (!mddev_check_plugged(mddev))
				md_wakeup_thread(mddev->thread);
		}
	}

	/* Don't remove the bias on 'remaining' (one_write_done) until
	 * after checking if we need to go around again.
	 */

	if (sectors_handled < bio_sectors(bio)) {
		one_write_done(r10_bio);
		/* We need another r10_bio.  It has already been counted
		 * in bio->bi_phys_segments.
		 */
		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

		r10_bio->master_bio = bio;
		r10_bio->sectors = bio_sectors(bio) - sectors_handled;

		r10_bio->mddev = mddev;
		r10_bio->sector = bio->bi_sector + sectors_handled;
		r10_bio->state = 0;
		goto retry_write;
	}
	one_write_done(r10_bio);

	/* In case raid10d snuck in to freeze_array */
	wake_up(&conf->wait_barrier);
}

static void status(struct seq_file *seq, struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;
	int i;

	if (conf->geo.near_copies < conf->geo.raid_disks)
		seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
	if (conf->geo.near_copies > 1)
		seq_printf(seq, " %d near-copies", conf->geo.near_copies);
	if (conf->geo.far_copies > 1) {
		if (conf->geo.far_offset)
			seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
		else
			seq_printf(seq, " %d far-copies", conf->geo.far_copies);
	}
	seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
		   conf->geo.raid_disks - mddev->degraded);
	for (i = 0; i < conf->geo.raid_disks; i++)
		seq_printf(seq, "%s",
			   conf->mirrors[i].rdev &&
			   test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
	seq_printf(seq, "]");
}

/* check if there are enough drives for
 * every block to appear on at least one.
 * Don't consider the device numbered 'ignore'
 * as we might be about to remove it.
 */
static int _enough(struct r10conf *conf, int previous, int ignore)
{
	int first = 0;
	int has_enough = 0;
	int disks, ncopies;
	if (previous) {
		disks = conf->prev.raid_disks;
		ncopies = conf->prev.near_copies;
	} else {
		disks = conf->geo.raid_disks;
		ncopies = conf->geo.near_copies;
	}

	rcu_read_lock();
	do {
		int n = conf->copies;
		int cnt = 0;
		int this = first;
		while (n--) {
			struct md_rdev *rdev;
			if (this != ignore &&
			    (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
			    test_bit(In_sync, &rdev->flags))
				cnt++;
			this = (this+1) % disks;
		}
		if (cnt == 0)
			goto out;
		first = (first + ncopies) % disks;
	} while (first != 0);
	has_enough = 1;
out:
	rcu_read_unlock();
	return has_enough;
}

static int enough(struct r10conf *conf, int ignore)
{
	/* when calling 'enough', both 'prev' and 'geo' must
	 * be stable.
	 * This is ensured if ->reconfig_mutex or ->device_lock
	 * is held.
	 */
	return _enough(conf, 0, ignore) &&
		_enough(conf, 1, ignore);
}

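/* Mark an rdev Faulty after an unrecoverable error, unless it is the
 * last device keeping some block readable, in which case the error is
 * ignored and the IO simply fails upwards.
 */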
static void error(struct mddev *mddev, struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	struct r10conf *conf = mddev->private;
	unsigned long flags;

	/*
	 * If it is not operational, then we have already marked it as dead
	 * else if it is the last working disk, ignore the error, let the
	 * next level up know.
	 * else mark the drive as failed
	 */
	spin_lock_irqsave(&conf->device_lock, flags);
	if (test_bit(In_sync, &rdev->flags)
	    && !enough(conf, rdev->raid_disk)) {
		/*
		 * Don't fail the drive, just return an IO error.
		 */
		spin_unlock_irqrestore(&conf->device_lock, flags);
		return;
	}
	if (test_and_clear_bit(In_sync, &rdev->flags)) {
		mddev->degraded++;
		/*
		 * If recovery is running, make sure it aborts.
		 */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	}
	set_bit(Blocked, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	printk(KERN_ALERT
	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
	       "md/raid10:%s: Operation continuing on %d devices.\n",
	       mdname(mddev), bdevname(rdev->bdev, b),
	       mdname(mddev), conf->geo.raid_disks - mddev->degraded);
}

static void print_conf(struct r10conf *conf)
{
	int i;
	struct raid10_info *tmp;

	printk(KERN_DEBUG "RAID10 conf printout:\n");
	if (!conf) {
		printk(KERN_DEBUG "(!conf)\n");
		return;
	}
	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
	       conf->geo.raid_disks);

	for (i = 0; i < conf->geo.raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->mirrors + i;
		if (tmp->rdev)
			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
			       i, !test_bit(In_sync, &tmp->rdev->flags),
			       !test_bit(Faulty, &tmp->rdev->flags),
			       bdevname(tmp->rdev->bdev, b));
	}
}

static void close_sync(struct r10conf *conf)
{
	wait_barrier(conf);
	allow_barrier(conf);

	mempool_destroy(conf->r10buf_pool);
	conf->r10buf_pool = NULL;
}

static int raid10_spare_active(struct mddev *mddev)
{
	int i;
	struct r10conf *conf = mddev->private;
	struct raid10_info *tmp;
	int count = 0;
	unsigned long flags;

	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync
	 */
	for (i = 0; i < conf->geo.raid_disks; i++) {
		tmp = conf->mirrors + i;
		if (tmp->replacement
		    && tmp->replacement->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			}
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
			   && tmp->rdev->recovery_offset == MaxSector
			   && !test_bit(Faulty, &tmp->rdev->flags)
			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			count++;
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
		}
	}
	spin_lock_irqsave(&conf->device_lock, flags);
	mddev->degraded -= count;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	print_conf(conf);
	return count;
}

static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = -EEXIST;
	int mirror;
	int first = 0;
	int last = conf->geo.raid_disks - 1;
	struct request_queue *q = bdev_get_queue(rdev->bdev);

	if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
		return -EBUSY;
	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
		return -EINVAL;

	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	if (q->merge_bvec_fn) {
		set_bit(Unmerged, &rdev->flags);
		mddev->merge_check_needed = 1;
	}

	if (rdev->saved_raid_disk >= first &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		mirror = rdev->saved_raid_disk;
	else
		mirror = first;
	for ( ; mirror <= last ; mirror++) {
		struct raid10_info *p = &conf->mirrors[mirror];
		if (p->recovery_disabled == mddev->recovery_disabled)
			continue;
		if (p->rdev) {
			if (!test_bit(WantReplacement, &p->rdev->flags) ||
			    p->replacement != NULL)
				continue;
			clear_bit(In_sync, &rdev->flags);
			set_bit(Replacement, &rdev->flags);
			rdev->raid_disk = mirror;
			err = 0;
			if (mddev->gendisk)
				disk_stack_limits(mddev->gendisk, rdev->bdev,
						  rdev->data_offset << 9);
			conf->fullsync = 1;
			rcu_assign_pointer(p->replacement, rdev);
			break;
		}

		if (mddev->gendisk)
			disk_stack_limits(mddev->gendisk, rdev->bdev,
					  rdev->data_offset << 9);

		p->head_position = 0;
		p->recovery_disabled = mddev->recovery_disabled - 1;
		rdev->raid_disk = mirror;
		err = 0;
		if (rdev->saved_raid_disk != mirror)
			conf->fullsync = 1;
		rcu_assign_pointer(p->rdev, rdev);
		break;
	}
	if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
		/* Some requests might not have seen this new
		 * merge_bvec_fn.  We must wait for requests to complete
		 * before merging the device fully.
		 * First we make sure any code which has tested
		 * our function has submitted the request, then
		 * we wait for all outstanding requests to complete.
		 */
		synchronize_sched();
		freeze_array(conf, 0);
		unfreeze_array(conf);
		clear_bit(Unmerged, &rdev->flags);
	}
	md_integrity_add_rdev(rdev, mddev);
	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);

	print_conf(conf);
	return err;
}

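/* Detach an rdev (or its replacement) from the array, refusing if it
 * is still in_sync, has IO pending, or is the last source of data for
 * some block that could still be recovered.
 */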
static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct raid10_info *p = conf->mirrors + number;

	print_conf(conf);
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/* Only remove faulty devices if recovery
	 * is not possible.
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != p->recovery_disabled &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->geo.raid_disks &&
	    enough(conf, -1)) {
		err = -EBUSY;
		goto abort;
	}
	*rdevp = NULL;
	synchronize_rcu();
	if (atomic_read(&rdev->nr_pending)) {
		/* lost the race, try later */
		err = -EBUSY;
		*rdevp = rdev;
		goto abort;
	} else if (p->replacement) {
		/* We must have just cleared 'rdev' */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		smp_mb();
		/* Make sure other CPUs may see both as identical
		 * but will never see neither - if they are careful */
		p->replacement = NULL;
		clear_bit(WantReplacement, &rdev->flags);
	} else
		/* We might have just removed the Replacement as faulty -
		 * clear the bit just in case
		 */
		clear_bit(WantReplacement, &rdev->flags);

	err = md_integrity_register(mddev);

abort:

	print_conf(conf);
	return err;
}

static void end_sync_read(struct bio *bio, int error)
{
	struct r10bio *r10_bio = bio->bi_private;
	struct r10conf *conf = r10_bio->mddev->private;
	int d;

	if (bio == r10_bio->master_bio) {
		/* this is a reshape read */
		d = r10_bio->read_slot; /* really the read dev */
	} else
		d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);

	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	else
		/* The write handler will notice the lack of
		 * R10BIO_Uptodate and record any errors etc
		 */
		atomic_add(r10_bio->sectors,
			   &conf->mirrors[d].rdev->corrected_errors);

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
	    atomic_dec_and_test(&r10_bio->remaining)) {
		/* we have read all the blocks,
		 * do the comparison in process context in raid10d
		 */
		reschedule_retry(r10_bio);
	}
}

static void end_sync_request(struct r10bio *r10_bio)
{
	struct mddev *mddev = r10_bio->mddev;

	while (atomic_dec_and_test(&r10_bio->remaining)) {
		if (r10_bio->master_bio == NULL) {
			/* the primary of several recovery bios */
			sector_t s = r10_bio->sectors;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			md_done_sync(mddev, s, 1);
			break;
		} else {
			struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
			    test_bit(R10BIO_WriteError, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				put_buf(r10_bio);
			r10_bio = r10_bio2;
		}
	}
}

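/* Completion handler for resync/recovery writes: record write errors
 * (or newly-fixed bad blocks) for processing in raid10d, then account
 * the finished request.
 */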
static void end_sync_write(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct r10bio *r10_bio = bio->bi_private;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	sector_t first_bad;
	int bad_sectors;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	else
		rdev = conf->mirrors[d].rdev;

	if (!uptodate) {
		if (repl)
			md_error(mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
			set_bit(R10BIO_WriteError, &r10_bio->state);
		}
	} else if (is_badblock(rdev,
			       r10_bio->devs[slot].addr,
			       r10_bio->sectors,
			       &first_bad, &bad_sectors))
		set_bit(R10BIO_MadeGood, &r10_bio->state);

	rdev_dec_pending(rdev, mddev);

	end_sync_request(r10_bio);
}

/*
 * Note: sync and recover and handled very differently for raid10
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical address, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integral number
 * of times larger than raid_disks.
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int i, first;
	struct bio *tbio, *fbio;
	int vcnt;

	atomic_set(&r10_bio->remaining, 1);

	/* find the first device with a block */
	for (i = 0; i < conf->copies; i++)
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
			break;

	if (i == conf->copies)
		goto done;

	first = i;
	fbio = r10_bio->devs[i].bio;

	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
	/* now find blocks with errors */
	for (i = 0; i < conf->copies; i++) {
		int j, d;

		tbio = r10_bio->devs[i].bio;

		if (tbio->bi_end_io != end_sync_read)
			continue;
		if (i == first)
			continue;
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
			/* We know that the bi_io_vec layout is the same for
			 * both 'first' and 'i', so we just compare them.
			 * All vec entries are PAGE_SIZE;
			 */
			int sectors = r10_bio->sectors;
			for (j = 0; j < vcnt; j++) {
				int len = PAGE_SIZE;
				if (sectors < (len / 512))
					len = sectors * 512;
				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
					   page_address(tbio->bi_io_vec[j].bv_page),
					   len))
					break;
				sectors -= len/512;
			}
			if (j == vcnt)
				continue;
			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
				/* Don't fix anything. */
				continue;
		}
		/* Ok, we need to write this bio, either to correct an
		 * inconsistency or to correct an unreadable block.
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted these
		 */
		bio_reset(tbio);

		tbio->bi_vcnt = vcnt;
		tbio->bi_size = r10_bio->sectors << 9;
		tbio->bi_rw = WRITE;
		tbio->bi_private = r10_bio;
		tbio->bi_sector = r10_bio->devs[i].addr;

		for (j = 0; j < vcnt ; j++) {
			tbio->bi_io_vec[j].bv_offset = 0;
			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;

			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
			       page_address(fbio->bi_io_vec[j].bv_page),
			       PAGE_SIZE);
		}
		tbio->bi_end_io = end_sync_write;

		d = r10_bio->devs[i].devnum;
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));

		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		generic_make_request(tbio);
	}

	/* Now write out to any replacement devices
	 * that are active
	 */
	for (i = 0; i < conf->copies; i++) {
		int j, d;

		tbio = r10_bio->devs[i].repl_bio;
		if (!tbio || !tbio->bi_end_io)
			continue;
		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
		    && r10_bio->devs[i].bio != fbio)
			for (j = 0; j < vcnt; j++)
				memcpy(page_address(tbio->bi_io_vec[j].bv_page),
				       page_address(fbio->bi_io_vec[j].bv_page),
				       PAGE_SIZE);
		d = r10_bio->devs[i].devnum;
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     bio_sectors(tbio));
		generic_make_request(tbio);
	}

done:
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-is_sync drives by finding the virtual address of
 * each, and then choose a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use. The first for reading,
 * The second for writing.
 *
 */
static void fix_recovery_read_error(struct r10bio *r10_bio)
{
	/* We got a read error during recovery.
	 * We repeat the read in smaller page-sized chunks, recording
	 * bad blocks as we go: on the source device if a re-read fails,
	 * and on the destination device if the copy-out fails.  If a
	 * bad block cannot be recorded, recovery is aborted.
	 */
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	struct bio *bio = r10_bio->devs[0].bio;
	sector_t sect = 0;
	int sectors = r10_bio->sectors;
	int idx = 0;
	int dr = r10_bio->devs[0].devnum;
	int dw = r10_bio->devs[1].devnum;

	while (sectors) {
		int s = sectors;
		struct md_rdev *rdev;
		sector_t addr;
		int ok;

		if (s > (PAGE_SIZE>>9))
			s = PAGE_SIZE >> 9;

		rdev = conf->mirrors[dr].rdev;
		addr = r10_bio->devs[0].addr + sect;
		ok = sync_page_io(rdev,
				  addr,
				  s << 9,
				  bio->bi_io_vec[idx].bv_page,
				  READ, false);
		if (ok) {
			rdev = conf->mirrors[dw].rdev;
			addr = r10_bio->devs[1].addr + sect;
			ok = sync_page_io(rdev,
					  addr,
					  s << 9,
					  bio->bi_io_vec[idx].bv_page,
					  WRITE, false);
			if (!ok) {
				set_bit(WriteErrorSeen, &rdev->flags);
				if (!test_and_set_bit(WantReplacement,
						      &rdev->flags))
					set_bit(MD_RECOVERY_NEEDED,
						&rdev->mddev->recovery);
			}
		}
		if (!ok) {
			/* We don't worry if we cannot set a bad block -
			 * it really is bad so there is no loss in not
			 * recording it yet
			 */
			rdev_set_badblocks(rdev, addr, s, 0);

			if (rdev != conf->mirrors[dw].rdev) {
				/* need bad blocks on destination too */
				struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
				addr = r10_bio->devs[1].addr + sect;
				ok = rdev_set_badblocks(rdev2, addr, s, 0);
				if (!ok) {
					/* just abort the recovery */
					printk(KERN_NOTICE
					       "md/raid10:%s: recovery aborted"
					       " due to read error\n",
					       mdname(mddev));

					conf->mirrors[dw].recovery_disabled
						= mddev->recovery_disabled;
					set_bit(MD_RECOVERY_INTR,
						&mddev->recovery);
					break;
				}
			}
		}

		sectors -= s;
		sect += s;
		idx++;
	}
}

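/* Submit the recovery write(s) for one r10bio once the source read has
 * completed, falling back to fix_recovery_read_error() if it failed.
 */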
static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int d;
	struct bio *wbio, *wbio2;

	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
		fix_recovery_read_error(r10_bio);
		end_sync_request(r10_bio);
		return;
	}

	/*
	 * share the pages with the first bio
	 * and submit the write request
	 */
	d = r10_bio->devs[1].devnum;
	wbio = r10_bio->devs[1].bio;
	wbio2 = r10_bio->devs[1].repl_bio;
	/* Need to test wbio2->bi_end_io before we call
	 * generic_make_request as if the former is NULL,
	 * the latter is free to free wbio2.
	 */
	if (wbio2 && !wbio2->bi_end_io)
		wbio2 = NULL;
	if (wbio->bi_end_io) {
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
		generic_make_request(wbio);
	}
	if (wbio2) {
		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     bio_sectors(wbio2));
		generic_make_request(wbio2);
	}
}

/*
 * Used by fix_read_error() to decay the per rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 *
 */
static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
	struct timespec cur_time_mon;
	unsigned long hours_since_last;
	unsigned int read_errors = atomic_read(&rdev->read_errors);

	ktime_get_ts(&cur_time_mon);

	if (rdev->last_read_error.tv_sec == 0 &&
	    rdev->last_read_error.tv_nsec == 0) {
		/* first time we've seen a read error */
		rdev->last_read_error = cur_time_mon;
		return;
	}

	hours_since_last = (cur_time_mon.tv_sec -
			    rdev->last_read_error.tv_sec) / 3600;

	rdev->last_read_error = cur_time_mon;

	/*
	 * if hours_since_last is > the number of bits in read_errors
	 * just set read errors to 0. We do this to avoid
	 * overflowing the shift of read_errors by hours_since_last.
	 */
	if (hours_since_last >= 8 * sizeof(read_errors))
		atomic_set(&rdev->read_errors, 0);
	else
		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
}

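/* Synchronously read or write one region, returning 1 on success,
 * 0 on failure (after recording a bad block or failing the device),
 * and -1 if the region overlaps a known bad block and must be skipped.
 */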
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
			    int sectors, struct page *page, int rw)
{
	sector_t first_bad;
	int bad_sectors;

	if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
	    && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
		return -1;
	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
		/* success */
		return 1;
	if (rw == WRITE) {
		set_bit(WriteErrorSeen, &rdev->flags);
		if (!test_and_set_bit(WantReplacement, &rdev->flags))
			set_bit(MD_RECOVERY_NEEDED,
				&rdev->mddev->recovery);
	}
	/* need to record an error - either for the block or the device */
	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
		md_error(rdev->mddev, rdev);
	return 0;
}
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2384{
2385 int sect = 0;
2386 int sectors = r10_bio->sectors;
2387 struct md_rdev*rdev;
2388 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2389 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2390
2391
2392
2393
2394 rdev = conf->mirrors[d].rdev;
2395
2396 if (test_bit(Faulty, &rdev->flags))
2397
2398
2399 return;
2400
2401 check_decay_read_errors(mddev, rdev);
2402 atomic_inc(&rdev->read_errors);
2403 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2404 char b[BDEVNAME_SIZE];
2405 bdevname(rdev->bdev, b);
2406
2407 printk(KERN_NOTICE
2408 "md/raid10:%s: %s: Raid device exceeded "
2409 "read_error threshold [cur %d:max %d]\n",
2410 mdname(mddev), b,
2411 atomic_read(&rdev->read_errors), max_read_errors);
2412 printk(KERN_NOTICE
2413 "md/raid10:%s: %s: Failing raid device\n",
2414 mdname(mddev), b);
2415 md_error(mddev, conf->mirrors[d].rdev);
2416 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2417 return;
2418 }
2419
2420 while(sectors) {
2421 int s = sectors;
2422 int sl = r10_bio->read_slot;
2423 int success = 0;
2424 int start;
2425
2426 if (s > (PAGE_SIZE>>9))
2427 s = PAGE_SIZE >> 9;
2428
2429 rcu_read_lock();
2430 do {
2431 sector_t first_bad;
2432 int bad_sectors;
2433
2434 d = r10_bio->devs[sl].devnum;
2435 rdev = rcu_dereference(conf->mirrors[d].rdev);
2436 if (rdev &&
2437 !test_bit(Unmerged, &rdev->flags) &&
2438 test_bit(In_sync, &rdev->flags) &&
2439 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2440 &first_bad, &bad_sectors) == 0) {
2441 atomic_inc(&rdev->nr_pending);
2442 rcu_read_unlock();
2443 success = sync_page_io(rdev,
2444 r10_bio->devs[sl].addr +
2445 sect,
2446 s<<9,
2447 conf->tmppage, READ, false);
2448 rdev_dec_pending(rdev, mddev);
2449 rcu_read_lock();
2450 if (success)
2451 break;
2452 }
2453 sl++;
2454 if (sl == conf->copies)
2455 sl = 0;
2456 } while (!success && sl != r10_bio->read_slot);
2457 rcu_read_unlock();
2458
2459 if (!success) {
2460
2461
2462
2463
2464 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2465 rdev = conf->mirrors[dn].rdev;
2466
2467 if (!rdev_set_badblocks(
2468 rdev,
2469 r10_bio->devs[r10_bio->read_slot].addr
2470 + sect,
2471 s, 0)) {
2472 md_error(mddev, rdev);
2473 r10_bio->devs[r10_bio->read_slot].bio
2474 = IO_BLOCKED;
2475 }
2476 break;
2477 }
2478
2479 start = sl;
		/* write it back and re-read */
2481 rcu_read_lock();
2482 while (sl != r10_bio->read_slot) {
2483 char b[BDEVNAME_SIZE];
2484
			if (sl == 0)
2486 sl = conf->copies;
2487 sl--;
2488 d = r10_bio->devs[sl].devnum;
2489 rdev = rcu_dereference(conf->mirrors[d].rdev);
2490 if (!rdev ||
2491 test_bit(Unmerged, &rdev->flags) ||
2492 !test_bit(In_sync, &rdev->flags))
2493 continue;
2494
2495 atomic_inc(&rdev->nr_pending);
2496 rcu_read_unlock();
2497 if (r10_sync_page_io(rdev,
2498 r10_bio->devs[sl].addr +
2499 sect,
2500 s, conf->tmppage, WRITE)
2501 == 0) {
				/* Well, this device is dead */
2503 printk(KERN_NOTICE
2504 "md/raid10:%s: read correction "
2505 "write failed"
2506 " (%d sectors at %llu on %s)\n",
2507 mdname(mddev), s,
2508 (unsigned long long)(
2509 sect +
2510 choose_data_offset(r10_bio,
2511 rdev)),
2512 bdevname(rdev->bdev, b));
2513 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2514 "drive\n",
2515 mdname(mddev),
2516 bdevname(rdev->bdev, b));
2517 }
2518 rdev_dec_pending(rdev, mddev);
2519 rcu_read_lock();
2520 }
2521 sl = start;
2522 while (sl != r10_bio->read_slot) {
2523 char b[BDEVNAME_SIZE];
2524
			if (sl == 0)
2526 sl = conf->copies;
2527 sl--;
2528 d = r10_bio->devs[sl].devnum;
2529 rdev = rcu_dereference(conf->mirrors[d].rdev);
2530 if (!rdev ||
2531 !test_bit(In_sync, &rdev->flags))
2532 continue;
2533
2534 atomic_inc(&rdev->nr_pending);
2535 rcu_read_unlock();
2536 switch (r10_sync_page_io(rdev,
2537 r10_bio->devs[sl].addr +
2538 sect,
2539 s, conf->tmppage,
2540 READ)) {
2541 case 0:
				/* Well, this device is dead */
2543 printk(KERN_NOTICE
2544 "md/raid10:%s: unable to read back "
2545 "corrected sectors"
2546 " (%d sectors at %llu on %s)\n",
2547 mdname(mddev), s,
2548 (unsigned long long)(
2549 sect +
2550 choose_data_offset(r10_bio, rdev)),
2551 bdevname(rdev->bdev, b));
2552 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2553 "drive\n",
2554 mdname(mddev),
2555 bdevname(rdev->bdev, b));
2556 break;
2557 case 1:
2558 printk(KERN_INFO
2559 "md/raid10:%s: read error corrected"
2560 " (%d sectors at %llu on %s)\n",
2561 mdname(mddev), s,
2562 (unsigned long long)(
2563 sect +
2564 choose_data_offset(r10_bio, rdev)),
2565 bdevname(rdev->bdev, b));
2566 atomic_add(s, &rdev->corrected_errors);
2567 }
2568
2569 rdev_dec_pending(rdev, mddev);
2570 rcu_read_lock();
2571 }
2572 rcu_read_unlock();
2573
2574 sectors -= s;
2575 sect += s;
2576 }
2577}
2578
2579static int narrow_write_error(struct r10bio *r10_bio, int i)
2580{
2581 struct bio *bio = r10_bio->master_bio;
2582 struct mddev *mddev = r10_bio->mddev;
2583 struct r10conf *conf = mddev->private;
2584 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2585
	/* bio is the master bio with the data to be written to
	 * slot 'i' where we just had a write error.
	 * We repeatedly clone the bio and trim down to one block,
	 * then try the write.  Where the write fails we record
	 * a bad block.
	 * It is conceivable that the bio doesn't exactly align with
	 * blocks.  We must handle this.
	 *
	 * We currently own a reference to the rdev.
	 */
2596 int block_sectors;
2597 sector_t sector;
2598 int sectors;
2599 int sect_to_write = r10_bio->sectors;
2600 int ok = 1;
2601
2602 if (rdev->badblocks.shift < 0)
2603 return 0;
2604
2605 block_sectors = 1 << rdev->badblocks.shift;
2606 sector = r10_bio->sector;
2607 sectors = ((r10_bio->sector + block_sectors)
2608 & ~(sector_t)(block_sectors - 1))
2609 - sector;
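	/* e.g. with block_sectors == 8 and r10_bio->sector == 21 the
	 * first pass covers sectors 21-23, so that every later pass
	 * starts on an 8-sector badblock boundary (24, 32, ...).
	 */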
2610
2611 while (sect_to_write) {
2612 struct bio *wbio;
2613 if (sectors > sect_to_write)
2614 sectors = sect_to_write;
2615
2616 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2617 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2618 wbio->bi_sector = (r10_bio->devs[i].addr+
2619 choose_data_offset(r10_bio, rdev) +
2620 (sector - r10_bio->sector));
2621 wbio->bi_bdev = rdev->bdev;
2622 if (submit_bio_wait(WRITE, wbio) == 0)
			/* Failure! */
2624 ok = rdev_set_badblocks(rdev, sector,
2625 sectors, 0)
2626 && ok;
2627
2628 bio_put(wbio);
2629 sect_to_write -= sectors;
2630 sector += sectors;
2631 sectors = block_sectors;
2632 }
2633 return ok;
2634}
2635
2636static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2637{
2638 int slot = r10_bio->read_slot;
2639 struct bio *bio;
2640 struct r10conf *conf = mddev->private;
2641 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2642 char b[BDEVNAME_SIZE];
2643 unsigned long do_sync;
2644 int max_sectors;
2645
	/* we got a read error.  Maybe the drive is bad.  Maybe just
	 * the block and we can fix it.
	 * We freeze all other IO, and try reading the block from
	 * other devices.  When we find one, we re-write
	 * and check it that fixes the read error.
	 * This is all done synchronously while the array is
	 * frozen.
	 */
2654 bio = r10_bio->devs[slot].bio;
2655 bdevname(bio->bi_bdev, b);
2656 bio_put(bio);
2657 r10_bio->devs[slot].bio = NULL;
2658
2659 if (mddev->ro == 0) {
2660 freeze_array(conf, 1);
2661 fix_read_error(conf, mddev, r10_bio);
2662 unfreeze_array(conf);
2663 } else
2664 r10_bio->devs[slot].bio = IO_BLOCKED;
2665
2666 rdev_dec_pending(rdev, mddev);
2667
2668read_more:
2669 rdev = read_balance(conf, r10_bio, &max_sectors);
2670 if (rdev == NULL) {
2671 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2672 " read error for block %llu\n",
2673 mdname(mddev), b,
2674 (unsigned long long)r10_bio->sector);
2675 raid_end_bio_io(r10_bio);
2676 return;
2677 }
2678
2679 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2680 slot = r10_bio->read_slot;
2681 printk_ratelimited(
2682 KERN_ERR
2683 "md/raid10:%s: %s: redirecting "
2684 "sector %llu to another mirror\n",
2685 mdname(mddev),
2686 bdevname(rdev->bdev, b),
2687 (unsigned long long)r10_bio->sector);
2688 bio = bio_clone_mddev(r10_bio->master_bio,
2689 GFP_NOIO, mddev);
2690 md_trim_bio(bio,
2691 r10_bio->sector - bio->bi_sector,
2692 max_sectors);
2693 r10_bio->devs[slot].bio = bio;
2694 r10_bio->devs[slot].rdev = rdev;
2695 bio->bi_sector = r10_bio->devs[slot].addr
2696 + choose_data_offset(r10_bio, rdev);
2697 bio->bi_bdev = rdev->bdev;
2698 bio->bi_rw = READ | do_sync;
2699 bio->bi_private = r10_bio;
2700 bio->bi_end_io = raid10_end_read_request;
2701 if (max_sectors < r10_bio->sectors) {
		/* Drat - have to split this up more */
2703 struct bio *mbio = r10_bio->master_bio;
2704 int sectors_handled =
2705 r10_bio->sector + max_sectors
2706 - mbio->bi_sector;
2707 r10_bio->sectors = max_sectors;
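		/* bi_phys_segments is borrowed to count the r10_bios
		 * attached to this master bio; 0 means just one.
		 */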
2708 spin_lock_irq(&conf->device_lock);
2709 if (mbio->bi_phys_segments == 0)
2710 mbio->bi_phys_segments = 2;
2711 else
2712 mbio->bi_phys_segments++;
2713 spin_unlock_irq(&conf->device_lock);
2714 generic_make_request(bio);
2715
2716 r10_bio = mempool_alloc(conf->r10bio_pool,
2717 GFP_NOIO);
2718 r10_bio->master_bio = mbio;
2719 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2720 r10_bio->state = 0;
2721 set_bit(R10BIO_ReadError,
2722 &r10_bio->state);
2723 r10_bio->mddev = mddev;
2724 r10_bio->sector = mbio->bi_sector
2725 + sectors_handled;
2726
2727 goto read_more;
2728 } else
2729 generic_make_request(bio);
2730}
2731
2732static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2733{
	/* Some sort of write request has finished and it
	 * succeeded in writing where we thought there was a
	 * bad block.  So forget the bad block.
	 * Or possibly if failed and we need to record
	 * a bad block.
	 */
2740 int m;
2741 struct md_rdev *rdev;
2742
2743 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2744 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2745 for (m = 0; m < conf->copies; m++) {
2746 int dev = r10_bio->devs[m].devnum;
2747 rdev = conf->mirrors[dev].rdev;
2748 if (r10_bio->devs[m].bio == NULL)
2749 continue;
2750 if (test_bit(BIO_UPTODATE,
2751 &r10_bio->devs[m].bio->bi_flags)) {
2752 rdev_clear_badblocks(
2753 rdev,
2754 r10_bio->devs[m].addr,
2755 r10_bio->sectors, 0);
2756 } else {
2757 if (!rdev_set_badblocks(
2758 rdev,
2759 r10_bio->devs[m].addr,
2760 r10_bio->sectors, 0))
2761 md_error(conf->mddev, rdev);
2762 }
2763 rdev = conf->mirrors[dev].replacement;
2764 if (r10_bio->devs[m].repl_bio == NULL)
2765 continue;
2766 if (test_bit(BIO_UPTODATE,
2767 &r10_bio->devs[m].repl_bio->bi_flags)) {
2768 rdev_clear_badblocks(
2769 rdev,
2770 r10_bio->devs[m].addr,
2771 r10_bio->sectors, 0);
2772 } else {
2773 if (!rdev_set_badblocks(
2774 rdev,
2775 r10_bio->devs[m].addr,
2776 r10_bio->sectors, 0))
2777 md_error(conf->mddev, rdev);
2778 }
2779 }
2780 put_buf(r10_bio);
2781 } else {
2782 for (m = 0; m < conf->copies; m++) {
2783 int dev = r10_bio->devs[m].devnum;
2784 struct bio *bio = r10_bio->devs[m].bio;
2785 rdev = conf->mirrors[dev].rdev;
2786 if (bio == IO_MADE_GOOD) {
2787 rdev_clear_badblocks(
2788 rdev,
2789 r10_bio->devs[m].addr,
2790 r10_bio->sectors, 0);
2791 rdev_dec_pending(rdev, conf->mddev);
2792 } else if (bio != NULL &&
2793 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2794 if (!narrow_write_error(r10_bio, m)) {
2795 md_error(conf->mddev, rdev);
2796 set_bit(R10BIO_Degraded,
2797 &r10_bio->state);
2798 }
2799 rdev_dec_pending(rdev, conf->mddev);
2800 }
2801 bio = r10_bio->devs[m].repl_bio;
2802 rdev = conf->mirrors[dev].replacement;
2803 if (rdev && bio == IO_MADE_GOOD) {
2804 rdev_clear_badblocks(
2805 rdev,
2806 r10_bio->devs[m].addr,
2807 r10_bio->sectors, 0);
2808 rdev_dec_pending(rdev, conf->mddev);
2809 }
2810 }
2811 if (test_bit(R10BIO_WriteError,
2812 &r10_bio->state))
2813 close_write(r10_bio);
2814 raid_end_bio_io(r10_bio);
2815 }
2816}
2817
2818static void raid10d(struct md_thread *thread)
2819{
2820 struct mddev *mddev = thread->mddev;
2821 struct r10bio *r10_bio;
2822 unsigned long flags;
2823 struct r10conf *conf = mddev->private;
2824 struct list_head *head = &conf->retry_list;
2825 struct blk_plug plug;
2826
2827 md_check_recovery(mddev);
2828
2829 blk_start_plug(&plug);
2830 for (;;) {
2831
2832 flush_pending_writes(conf);
2833
2834 spin_lock_irqsave(&conf->device_lock, flags);
2835 if (list_empty(head)) {
2836 spin_unlock_irqrestore(&conf->device_lock, flags);
2837 break;
2838 }
2839 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2840 list_del(head->prev);
2841 conf->nr_queued--;
2842 spin_unlock_irqrestore(&conf->device_lock, flags);
2843
2844 mddev = r10_bio->mddev;
2845 conf = mddev->private;
2846 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2847 test_bit(R10BIO_WriteError, &r10_bio->state))
2848 handle_write_completed(conf, r10_bio);
2849 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2850 reshape_request_write(mddev, r10_bio);
2851 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2852 sync_request_write(mddev, r10_bio);
2853 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2854 recovery_request_write(mddev, r10_bio);
2855 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2856 handle_read_error(mddev, r10_bio);
2857 else {
			/* just a partial read to be scheduled from a
			 * separate context
			 */
2861 int slot = r10_bio->read_slot;
2862 generic_make_request(r10_bio->devs[slot].bio);
2863 }
2864
2865 cond_resched();
2866 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2867 md_check_recovery(mddev);
2868 }
2869 blk_finish_plug(&plug);
2870}
2871
2872
2873static int init_resync(struct r10conf *conf)
2874{
2875 int buffs;
2876 int i;
2877
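	/* Enough buffers to cover one full resync window:
	 * RESYNC_WINDOW / RESYNC_BLOCK_SIZE = 1MB / 64KB = 16.
	 */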
2878 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2879 BUG_ON(conf->r10buf_pool);
2880 conf->have_replacement = 0;
2881 for (i = 0; i < conf->geo.raid_disks; i++)
2882 if (conf->mirrors[i].replacement)
2883 conf->have_replacement = 1;
2884 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2885 if (!conf->r10buf_pool)
2886 return -ENOMEM;
2887 conf->next_resync = 0;
2888 return 0;
2889}
2890
/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
 * For recovery, we iterate over physical addresses, read a good
 * copy, and then write it to all out-of-sync devices.
 *
 * So, for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by
 * allocating a number of r10_bio structures, one for each out-of-sync
 * device.  As we setup these structures, we collect all bio's together
 * into a list which we then process collectively to add pages, and then
 * process again to pass to generic_make_request.
 *
 * The r10_bio structures are linked using a borrowed master_bio pointer.
 * This link is counted in ->remaining.  When the r10_bio that points to
 * NULL is completed, raid_end_bio_io() is called on it, so we need to
 * keep a count of the number of r10_bios whose master_bio is not NULL.
 *
 * When we set up an r10_bio which is not for normal IO, we allocate it
 * from the non-integrity pool.
 */
2923static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2924 int *skipped, int go_faster)
2925{
2926 struct r10conf *conf = mddev->private;
2927 struct r10bio *r10_bio;
2928 struct bio *biolist = NULL, *bio;
2929 sector_t max_sector, nr_sectors;
2930 int i;
2931 int max_sync;
2932 sector_t sync_blocks;
2933 sector_t sectors_skipped = 0;
2934 int chunks_skipped = 0;
2935 sector_t chunk_mask = conf->geo.chunk_mask;
2936
2937 if (!conf->r10buf_pool)
2938 if (init_resync(conf))
2939 return 0;
2940
	/*
	 * Allow skipping a full rebuild for incremental assembly
	 * of a clean array, like RAID1 does.
	 */
2945 if (mddev->bitmap == NULL &&
2946 mddev->recovery_cp == MaxSector &&
2947 mddev->reshape_position == MaxSector &&
2948 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2949 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2950 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2951 conf->fullsync == 0) {
2952 *skipped = 1;
2953 return mddev->dev_sectors - sector_nr;
2954 }
2955
2956 skipped:
2957 max_sector = mddev->dev_sectors;
2958 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2959 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2960 max_sector = mddev->resync_max_sectors;
2961 if (sector_nr >= max_sector) {
		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunks (there can
		 * be several when recovering multiple devices),
		 * as we may have started syncing it but not finished.
		 * We can find the current address in
		 * mddev->curr_resync, but for recovery,
		 * we need to convert that to several
		 * virtual addresses.
		 */
2971 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2972 end_reshape(conf);
2973 return 0;
2974 }
2975
2976 if (mddev->curr_resync < max_sector) {
2977 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2978 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2979 &sync_blocks, 1);
2980 else for (i = 0; i < conf->geo.raid_disks; i++) {
2981 sector_t sect =
2982 raid10_find_virt(conf, mddev->curr_resync, i);
2983 bitmap_end_sync(mddev->bitmap, sect,
2984 &sync_blocks, 1);
2985 }
2986 } else {
			/* completed sync */
2988 if ((!mddev->bitmap || conf->fullsync)
2989 && conf->have_replacement
2990 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
				/* Completed a full sync so the replacements
				 * are now fully recovered.
				 */
2994 for (i = 0; i < conf->geo.raid_disks; i++)
2995 if (conf->mirrors[i].replacement)
2996 conf->mirrors[i].replacement
2997 ->recovery_offset
2998 = MaxSector;
2999 }
3000 conf->fullsync = 0;
3001 }
3002 bitmap_close_sync(mddev->bitmap);
3003 close_sync(conf);
3004 *skipped = 1;
3005 return sectors_skipped;
3006 }
3007
3008 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3009 return reshape_request(mddev, sector_nr, skipped);
3010
3011 if (chunks_skipped >= conf->geo.raid_disks) {
		/* if there has been nothing to do on any drive,
		 * then there is nothing to do at all..
		 */
3015 *skipped = 1;
3016 return (max_sector - sector_nr) + sectors_skipped;
3017 }
3018
3019 if (max_sector > mddev->resync_max)
3020 max_sector = mddev->resync_max;
3021
	/* make sure whole request will fit in a chunk - if chunks
	 * are meaningful
	 */
3025 if (conf->geo.near_copies < conf->geo.raid_disks &&
3026 max_sector > (sector_nr | chunk_mask))
3027 max_sector = (sector_nr | chunk_mask) + 1;
3028
	/*
	 * If there is non-resync activity waiting for us then
	 * put in a delay to throttle resync.
	 */
3032 if (!go_faster && conf->nr_waiting)
3033 msleep_interruptible(1000);
3034
	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
	 * have bi_end_io, bi_sector, bi_bdev set,
	 * and bi_private set to the r10bio.
	 * For recovery, we may actually create several r10bios
	 * with 2 bios in each, that correspond to the bios in the main one.
	 * In this case, the subordinate r10bios link back through a
	 * borrowed master_bio pointer, and the counter in the master
	 * includes a ref from each subordinate.
	 */
	/* First, we decide what to do and set ->bi_end_io
	 * to end_sync_read if we want to read, and
	 * end_sync_write if we will want to write.
	 */

3050 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
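	/* Never sync more than fits in one resync buffer:
	 * RESYNC_PAGES pages, i.e. 64KB (128 sectors) with 4KB pages.
	 */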
3051 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* recovery... the complicated one */
3053 int j;
3054 r10_bio = NULL;
3055
3056 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3057 int still_degraded;
3058 struct r10bio *rb2;
3059 sector_t sect;
3060 int must_sync;
3061 int any_working;
3062 struct raid10_info *mirror = &conf->mirrors[i];
3063
3064 if ((mirror->rdev == NULL ||
3065 test_bit(In_sync, &mirror->rdev->flags))
3066 &&
3067 (mirror->replacement == NULL ||
3068 test_bit(Faulty,
3069 &mirror->replacement->flags)))
3070 continue;
3071
3072 still_degraded = 0;
			/* want to reconstruct this device */
3074 rb2 = r10_bio;
3075 sect = raid10_find_virt(conf, sector_nr, i);
3076 if (sect >= mddev->resync_max_sectors) {
				/* last stripe is not complete - don't
				 * try to recover this sector.
				 */
3080 continue;
3081 }

			/* Unless we are doing a full sync, or a replacement
			 * we only need to recover the block if it is set in
			 * the bitmap
			 */
3086 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3087 &sync_blocks, 1);
3088 if (sync_blocks < max_sync)
3089 max_sync = sync_blocks;
3090 if (!must_sync &&
3091 mirror->replacement == NULL &&
3092 !conf->fullsync) {
				/* yep, skip the sync_blocks here, but don't assume
				 * that there will never be anything to do here
				 */
3096 chunks_skipped = -1;
3097 continue;
3098 }
3099
3100 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3101 raise_barrier(conf, rb2 != NULL);
3102 atomic_set(&r10_bio->remaining, 0);
3103
			r10_bio->master_bio = (struct bio *)rb2;
3105 if (rb2)
3106 atomic_inc(&rb2->remaining);
3107 r10_bio->mddev = mddev;
3108 set_bit(R10BIO_IsRecover, &r10_bio->state);
3109 r10_bio->sector = sect;
3110
3111 raid10_find_phys(conf, r10_bio);
3112
			/* Need to check if the array will still be
			 * degraded
			 */
3116 for (j = 0; j < conf->geo.raid_disks; j++)
3117 if (conf->mirrors[j].rdev == NULL ||
3118 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3119 still_degraded = 1;
3120 break;
3121 }
3122
3123 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3124 &sync_blocks, still_degraded);
3125
3126 any_working = 0;
			for (j = 0; j < conf->copies; j++) {
3128 int k;
3129 int d = r10_bio->devs[j].devnum;
3130 sector_t from_addr, to_addr;
3131 struct md_rdev *rdev;
3132 sector_t sector, first_bad;
3133 int bad_sectors;
3134 if (!conf->mirrors[d].rdev ||
3135 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3136 continue;
				/* This is where we read from */
3138 any_working = 1;
3139 rdev = conf->mirrors[d].rdev;
3140 sector = r10_bio->devs[j].addr;
3141
3142 if (is_badblock(rdev, sector, max_sync,
3143 &first_bad, &bad_sectors)) {
3144 if (first_bad > sector)
3145 max_sync = first_bad - sector;
3146 else {
3147 bad_sectors -= (sector
3148 - first_bad);
3149 if (max_sync > bad_sectors)
3150 max_sync = bad_sectors;
3151 continue;
3152 }
3153 }
3154 bio = r10_bio->devs[0].bio;
3155 bio_reset(bio);
3156 bio->bi_next = biolist;
3157 biolist = bio;
3158 bio->bi_private = r10_bio;
3159 bio->bi_end_io = end_sync_read;
3160 bio->bi_rw = READ;
3161 from_addr = r10_bio->devs[j].addr;
3162 bio->bi_sector = from_addr + rdev->data_offset;
3163 bio->bi_bdev = rdev->bdev;
3164 atomic_inc(&rdev->nr_pending);
				/* and we write to 'i' (if not in_sync) */

				for (k = 0; k < conf->copies; k++)
3168 if (r10_bio->devs[k].devnum == i)
3169 break;
3170 BUG_ON(k == conf->copies);
3171 to_addr = r10_bio->devs[k].addr;
3172 r10_bio->devs[0].devnum = d;
3173 r10_bio->devs[0].addr = from_addr;
3174 r10_bio->devs[1].devnum = i;
3175 r10_bio->devs[1].addr = to_addr;
3176
3177 rdev = mirror->rdev;
3178 if (!test_bit(In_sync, &rdev->flags)) {
3179 bio = r10_bio->devs[1].bio;
3180 bio_reset(bio);
3181 bio->bi_next = biolist;
3182 biolist = bio;
3183 bio->bi_private = r10_bio;
3184 bio->bi_end_io = end_sync_write;
3185 bio->bi_rw = WRITE;
3186 bio->bi_sector = to_addr
3187 + rdev->data_offset;
3188 bio->bi_bdev = rdev->bdev;
3189 atomic_inc(&r10_bio->remaining);
3190 } else
3191 r10_bio->devs[1].bio->bi_end_io = NULL;
3192
				/* and maybe write to replacement */
3194 bio = r10_bio->devs[1].repl_bio;
3195 if (bio)
3196 bio->bi_end_io = NULL;
3197 rdev = mirror->replacement;
				/* Note: if rdev != NULL, then bio
				 * cannot be NULL as r10buf_pool_alloc will
				 * have allocated it.
				 * So the second test here is pointless.
				 * But it keeps semantic-checkers happy, and
				 * this comment keeps human reviewers
				 * confused.
				 */
3206 if (rdev == NULL || bio == NULL ||
3207 test_bit(Faulty, &rdev->flags))
3208 break;
3209 bio_reset(bio);
3210 bio->bi_next = biolist;
3211 biolist = bio;
3212 bio->bi_private = r10_bio;
3213 bio->bi_end_io = end_sync_write;
3214 bio->bi_rw = WRITE;
3215 bio->bi_sector = to_addr + rdev->data_offset;
3216 bio->bi_bdev = rdev->bdev;
3217 atomic_inc(&r10_bio->remaining);
3218 break;
3219 }
3220 if (j == conf->copies) {
				/* Cannot recover, so abort the recovery or
				 * record a bad block */
3223 put_buf(r10_bio);
3224 if (rb2)
3225 atomic_dec(&rb2->remaining);
3226 r10_bio = rb2;
3227 if (any_working) {
					/* problem is that there are bad blocks
					 * on other device(s)
					 */
3231 int k;
3232 for (k = 0; k < conf->copies; k++)
3233 if (r10_bio->devs[k].devnum == i)
3234 break;
3235 if (!test_bit(In_sync,
3236 &mirror->rdev->flags)
3237 && !rdev_set_badblocks(
3238 mirror->rdev,
3239 r10_bio->devs[k].addr,
3240 max_sync, 0))
3241 any_working = 0;
3242 if (mirror->replacement &&
3243 !rdev_set_badblocks(
3244 mirror->replacement,
3245 r10_bio->devs[k].addr,
3246 max_sync, 0))
3247 any_working = 0;
3248 }
3249 if (!any_working) {
3250 if (!test_and_set_bit(MD_RECOVERY_INTR,
3251 &mddev->recovery))
3252 printk(KERN_INFO "md/raid10:%s: insufficient "
3253 "working devices for recovery.\n",
3254 mdname(mddev));
3255 mirror->recovery_disabled
3256 = mddev->recovery_disabled;
3257 }
3258 break;
3259 }
3260 }
3261 if (biolist == NULL) {
3262 while (r10_bio) {
3263 struct r10bio *rb2 = r10_bio;
				r10_bio = (struct r10bio *)rb2->master_bio;
3265 rb2->master_bio = NULL;
3266 put_buf(rb2);
3267 }
3268 goto giveup;
3269 }
3270 } else {
		/* resync. Schedule a read for every block at this virt offset */
3272 int count = 0;
3273
3274 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3275
3276 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3277 &sync_blocks, mddev->degraded) &&
3278 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3279 &mddev->recovery)) {
			/* We can skip this block */
3281 *skipped = 1;
3282 return sync_blocks + sectors_skipped;
3283 }
3284 if (sync_blocks < max_sync)
3285 max_sync = sync_blocks;
3286 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3287
3288 r10_bio->mddev = mddev;
3289 atomic_set(&r10_bio->remaining, 0);
3290 raise_barrier(conf, 0);
3291 conf->next_resync = sector_nr;
3292
3293 r10_bio->master_bio = NULL;
3294 r10_bio->sector = sector_nr;
3295 set_bit(R10BIO_IsSync, &r10_bio->state);
3296 raid10_find_phys(conf, r10_bio);
3297 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
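		/* provisionally sync to the end of the current chunk;
		 * bad blocks may trim this back below.
		 */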
3298
3299 for (i = 0; i < conf->copies; i++) {
3300 int d = r10_bio->devs[i].devnum;
3301 sector_t first_bad, sector;
3302 int bad_sectors;
3303
3304 if (r10_bio->devs[i].repl_bio)
3305 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3306
3307 bio = r10_bio->devs[i].bio;
3308 bio_reset(bio);
3309 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3310 if (conf->mirrors[d].rdev == NULL ||
3311 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3312 continue;
3313 sector = r10_bio->devs[i].addr;
3314 if (is_badblock(conf->mirrors[d].rdev,
3315 sector, max_sync,
3316 &first_bad, &bad_sectors)) {
3317 if (first_bad > sector)
3318 max_sync = first_bad - sector;
3319 else {
3320 bad_sectors -= (sector - first_bad);
3321 if (max_sync > bad_sectors)
3322 max_sync = bad_sectors;
3323 continue;
3324 }
3325 }
3326 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3327 atomic_inc(&r10_bio->remaining);
3328 bio->bi_next = biolist;
3329 biolist = bio;
3330 bio->bi_private = r10_bio;
3331 bio->bi_end_io = end_sync_read;
3332 bio->bi_rw = READ;
3333 bio->bi_sector = sector +
3334 conf->mirrors[d].rdev->data_offset;
3335 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3336 count++;
3337
3338 if (conf->mirrors[d].replacement == NULL ||
3339 test_bit(Faulty,
3340 &conf->mirrors[d].replacement->flags))
3341 continue;
3342
			/* Need to set up for writing to the replacement */
3344 bio = r10_bio->devs[i].repl_bio;
3345 bio_reset(bio);
3346 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3347
3348 sector = r10_bio->devs[i].addr;
3349 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3350 bio->bi_next = biolist;
3351 biolist = bio;
3352 bio->bi_private = r10_bio;
3353 bio->bi_end_io = end_sync_write;
3354 bio->bi_rw = WRITE;
3355 bio->bi_sector = sector +
3356 conf->mirrors[d].replacement->data_offset;
3357 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3358 count++;
3359 }
3360
3361 if (count < 2) {
			for (i = 0; i < conf->copies; i++) {
3363 int d = r10_bio->devs[i].devnum;
3364 if (r10_bio->devs[i].bio->bi_end_io)
3365 rdev_dec_pending(conf->mirrors[d].rdev,
3366 mddev);
3367 if (r10_bio->devs[i].repl_bio &&
3368 r10_bio->devs[i].repl_bio->bi_end_io)
3369 rdev_dec_pending(
3370 conf->mirrors[d].replacement,
3371 mddev);
3372 }
3373 put_buf(r10_bio);
3374 biolist = NULL;
3375 goto giveup;
3376 }
3377 }
3378
3379 nr_sectors = 0;
3380 if (sector_nr + max_sync < max_sector)
3381 max_sector = sector_nr + max_sync;
3382 do {
3383 struct page *page;
3384 int len = PAGE_SIZE;
3385 if (sector_nr + (len>>9) > max_sector)
3386 len = (max_sector - sector_nr) << 9;
3387 if (len == 0)
3388 break;
		for (bio = biolist; bio; bio = bio->bi_next) {
3390 struct bio *bio2;
3391 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3392 if (bio_add_page(bio, page, len, 0))
3393 continue;
3394
			/* stop here */
3396 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3397 for (bio2 = biolist;
3398 bio2 && bio2 != bio;
3399 bio2 = bio2->bi_next) {
				/* remove last page from this bio */
3401 bio2->bi_vcnt--;
3402 bio2->bi_size -= len;
3403 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3404 }
3405 goto bio_full;
3406 }
3407 nr_sectors += len>>9;
3408 sector_nr += len>>9;
3409 } while (biolist->bi_vcnt < RESYNC_PAGES);
3410 bio_full:
3411 r10_bio->sectors = nr_sectors;
3412
3413 while (biolist) {
3414 bio = biolist;
3415 biolist = biolist->bi_next;
3416
3417 bio->bi_next = NULL;
3418 r10_bio = bio->bi_private;
3419 r10_bio->sectors = nr_sectors;
3420
3421 if (bio->bi_end_io == end_sync_read) {
3422 md_sync_acct(bio->bi_bdev, nr_sectors);
3423 set_bit(BIO_UPTODATE, &bio->bi_flags);
3424 generic_make_request(bio);
3425 }
3426 }
3427
3428 if (sectors_skipped)
		/* pretend they weren't skipped, it makes
		 * no important difference in this case
		 */
3432 md_done_sync(mddev, sectors_skipped, 1);
3433
3434 return sectors_skipped + nr_sectors;
3435 giveup:
	/* There is nowhere to write, so all non-sync
	 * drives must be failed or in resync, all drives
	 * have a bad block, so try the next chunk...
	 */
3440 if (sector_nr + max_sync < max_sector)
3441 max_sector = sector_nr + max_sync;
3442
3443 sectors_skipped += (max_sector - sector_nr);
	chunks_skipped++;
3445 sector_nr = max_sector;
3446 goto skipped;
3447}
3448
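/*
 * Report the array size that 'raid_disks' devices of 'sectors' each
 * provide.  Working in whole chunks this is
 *	chunks_per_device / far_copies * raid_disks / near_copies.
 * e.g. 4 devices of 1000 chunks each with near_copies=2 and
 * far_copies=1 give a 2000-chunk array.
 */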
3449static sector_t
3450raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3451{
3452 sector_t size;
3453 struct r10conf *conf = mddev->private;
3454
3455 if (!raid_disks)
3456 raid_disks = min(conf->geo.raid_disks,
3457 conf->prev.raid_disks);
3458 if (!sectors)
3459 sectors = conf->dev_sectors;
3460
3461 size = sectors >> conf->geo.chunk_shift;
3462 sector_div(size, conf->geo.far_copies);
3463 size = size * raid_disks;
3464 sector_div(size, conf->geo.near_copies);
3465
3466 return size << conf->geo.chunk_shift;
3467}
3468
3469static void calc_sectors(struct r10conf *conf, sector_t size)
3470{
	/* Calculate the number of sectors-per-device that will
	 * actually be used, and set conf->dev_sectors and
	 * conf->stride
	 */

3476 size = size >> conf->geo.chunk_shift;
3477 sector_div(size, conf->geo.far_copies);
3478 size = size * conf->geo.raid_disks;
3479 sector_div(size, conf->geo.near_copies);
	/* 'size' is now the number of chunks in the array */
	/* calculate "used chunks per device" */
3482 size = size * conf->copies;

	/* We need to round up when dividing by raid_disks to
	 * get the stride size.
	 */
3487 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3488
3489 conf->dev_sectors = size << conf->geo.chunk_shift;
3490
3491 if (conf->geo.far_offset)
3492 conf->geo.stride = 1 << conf->geo.chunk_shift;
3493 else {
3494 sector_div(size, conf->geo.far_copies);
3495 conf->geo.stride = size << conf->geo.chunk_shift;
3496 }
3497}
3498
3499enum geo_type {geo_new, geo_old, geo_start};
3500static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3501{
3502 int nc, fc, fo;
3503 int layout, chunk, disks;
3504 switch (new) {
3505 case geo_old:
3506 layout = mddev->layout;
3507 chunk = mddev->chunk_sectors;
3508 disks = mddev->raid_disks - mddev->delta_disks;
3509 break;
3510 case geo_new:
3511 layout = mddev->new_layout;
3512 chunk = mddev->new_chunk_sectors;
3513 disks = mddev->raid_disks;
3514 break;
3515 default:
3516 case geo_start:
		/* new when starting reshape - raid_disks not updated yet */
3518 layout = mddev->new_layout;
3519 chunk = mddev->new_chunk_sectors;
3520 disks = mddev->raid_disks + mddev->delta_disks;
3521 break;
3522 }
3523 if (layout >> 18)
3524 return -1;
3525 if (chunk < (PAGE_SIZE >> 9) ||
3526 !is_power_of_2(chunk))
3527 return -2;
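	/* Decode the layout word: bits 0-7 are near_copies, bits 8-15
	 * far_copies, bit 16 selects 'offset' mode and bit 17 divides
	 * the devices into smaller far sets (disks / far_copies).
	 * e.g. layout 0x102 is the common 'n2' geometry:
	 * near_copies = 2, far_copies = 1.
	 */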
3528 nc = layout & 255;
3529 fc = (layout >> 8) & 255;
3530 fo = layout & (1<<16);
3531 geo->raid_disks = disks;
3532 geo->near_copies = nc;
3533 geo->far_copies = fc;
3534 geo->far_offset = fo;
3535 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3536 geo->chunk_mask = chunk - 1;
3537 geo->chunk_shift = ffz(~chunk);
	/* total number of copies the layout provides */
	return nc * fc;
3539}
3540
3541static struct r10conf *setup_conf(struct mddev *mddev)
3542{
3543 struct r10conf *conf = NULL;
3544 int err = -EINVAL;
3545 struct geom geo;
3546 int copies;
3547
3548 copies = setup_geo(&geo, mddev, geo_new);
3549
3550 if (copies == -2) {
3551 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3552 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3553 mdname(mddev), PAGE_SIZE);
3554 goto out;
3555 }
3556
3557 if (copies < 2 || copies > mddev->raid_disks) {
		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%08x\n",
		       mdname(mddev), mddev->new_layout);
3560 goto out;
3561 }
3562
3563 err = -ENOMEM;
3564 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3565 if (!conf)
3566 goto out;
3567
	/* FIXME calc properly */
3569 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3570 max(0,-mddev->delta_disks)),
3571 GFP_KERNEL);
3572 if (!conf->mirrors)
3573 goto out;
3574
3575 conf->tmppage = alloc_page(GFP_KERNEL);
3576 if (!conf->tmppage)
3577 goto out;
3578
3579 conf->geo = geo;
3580 conf->copies = copies;
3581 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3582 r10bio_pool_free, conf);
3583 if (!conf->r10bio_pool)
3584 goto out;
3585
3586 calc_sectors(conf, mddev->dev_sectors);
3587 if (mddev->reshape_position == MaxSector) {
3588 conf->prev = conf->geo;
3589 conf->reshape_progress = MaxSector;
3590 } else {
3591 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3592 err = -EINVAL;
3593 goto out;
3594 }
3595 conf->reshape_progress = mddev->reshape_position;
3596 if (conf->prev.far_offset)
3597 conf->prev.stride = 1 << conf->prev.chunk_shift;
3598 else
		/* far_copies must be 1 */
3600 conf->prev.stride = conf->dev_sectors;
3601 }
3602 spin_lock_init(&conf->device_lock);
3603 INIT_LIST_HEAD(&conf->retry_list);
3604
3605 spin_lock_init(&conf->resync_lock);
3606 init_waitqueue_head(&conf->wait_barrier);
3607
3608 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3609 if (!conf->thread)
3610 goto out;
3611
3612 conf->mddev = mddev;
3613 return conf;
3614
3615 out:
3616 if (err == -ENOMEM)
3617 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3618 mdname(mddev));
3619 if (conf) {
3620 if (conf->r10bio_pool)
3621 mempool_destroy(conf->r10bio_pool);
3622 kfree(conf->mirrors);
3623 safe_put_page(conf->tmppage);
3624 kfree(conf);
3625 }
3626 return ERR_PTR(err);
3627}
3628
3629static int run(struct mddev *mddev)
3630{
3631 struct r10conf *conf;
3632 int i, disk_idx, chunk_size;
3633 struct raid10_info *disk;
3634 struct md_rdev *rdev;
3635 sector_t size;
3636 sector_t min_offset_diff = 0;
3637 int first = 1;
3638 bool discard_supported = false;
3639
3640 if (mddev->private == NULL) {
3641 conf = setup_conf(mddev);
3642 if (IS_ERR(conf))
3643 return PTR_ERR(conf);
3644 mddev->private = conf;
3645 }
3646 conf = mddev->private;
3647 if (!conf)
3648 goto out;
3649
3650 mddev->thread = conf->thread;
3651 conf->thread = NULL;
3652
3653 chunk_size = mddev->chunk_sectors << 9;
3654 if (mddev->queue) {
3655 blk_queue_max_discard_sectors(mddev->queue,
3656 mddev->chunk_sectors);
3657 blk_queue_max_write_same_sectors(mddev->queue, 0);
3658 blk_queue_io_min(mddev->queue, chunk_size);
3659 if (conf->geo.raid_disks % conf->geo.near_copies)
3660 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3661 else
3662 blk_queue_io_opt(mddev->queue, chunk_size *
3663 (conf->geo.raid_disks / conf->geo.near_copies));
3664 }
3665
3666 rdev_for_each(rdev, mddev) {
3667 long long diff;
3668 struct request_queue *q;
3669
3670 disk_idx = rdev->raid_disk;
3671 if (disk_idx < 0)
3672 continue;
3673 if (disk_idx >= conf->geo.raid_disks &&
3674 disk_idx >= conf->prev.raid_disks)
3675 continue;
3676 disk = conf->mirrors + disk_idx;
3677
3678 if (test_bit(Replacement, &rdev->flags)) {
3679 if (disk->replacement)
3680 goto out_free_conf;
3681 disk->replacement = rdev;
3682 } else {
3683 if (disk->rdev)
3684 goto out_free_conf;
3685 disk->rdev = rdev;
3686 }
3687 q = bdev_get_queue(rdev->bdev);
3688 if (q->merge_bvec_fn)
3689 mddev->merge_check_needed = 1;
3690 diff = (rdev->new_data_offset - rdev->data_offset);
3691 if (!mddev->reshape_backwards)
3692 diff = -diff;
3693 if (diff < 0)
3694 diff = 0;
3695 if (first || diff < min_offset_diff)
3696 min_offset_diff = diff;
3697
3698 if (mddev->gendisk)
3699 disk_stack_limits(mddev->gendisk, rdev->bdev,
3700 rdev->data_offset << 9);
3701
3702 disk->head_position = 0;
3703
3704 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3705 discard_supported = true;
3706 }
3707
3708 if (mddev->queue) {
3709 if (discard_supported)
3710 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3711 mddev->queue);
3712 else
3713 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3714 mddev->queue);
3715 }
3716
3717 if (!enough(conf, -1)) {
3718 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3719 mdname(mddev));
3720 goto out_free_conf;
3721 }
3722
3723 if (conf->reshape_progress != MaxSector) {
		/* must ensure that shape change is supported */
3725 if (conf->geo.far_copies != 1 &&
3726 conf->geo.far_offset == 0)
3727 goto out_free_conf;
3728 if (conf->prev.far_copies != 1 &&
3729 conf->prev.far_offset == 0)
3730 goto out_free_conf;
3731 }
3732
3733 mddev->degraded = 0;
3734 for (i = 0;
3735 i < conf->geo.raid_disks
3736 || i < conf->prev.raid_disks;
3737 i++) {
3738
3739 disk = conf->mirrors + i;
3740
3741 if (!disk->rdev && disk->replacement) {
			/* The replacement is all we have - use it */
3743 disk->rdev = disk->replacement;
3744 disk->replacement = NULL;
3745 clear_bit(Replacement, &disk->rdev->flags);
3746 }
3747
3748 if (!disk->rdev ||
3749 !test_bit(In_sync, &disk->rdev->flags)) {
3750 disk->head_position = 0;
3751 mddev->degraded++;
3752 if (disk->rdev)
3753 conf->fullsync = 1;
3754 }
3755 disk->recovery_disabled = mddev->recovery_disabled - 1;
3756 }
3757
3758 if (mddev->recovery_cp != MaxSector)
3759 printk(KERN_NOTICE "md/raid10:%s: not clean"
3760 " -- starting background reconstruction\n",
3761 mdname(mddev));
3762 printk(KERN_INFO
3763 "md/raid10:%s: active with %d out of %d devices\n",
3764 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3765 conf->geo.raid_disks);
	/*
	 * Ok, everything is just fine now
	 */
3769 mddev->dev_sectors = conf->dev_sectors;
3770 size = raid10_size(mddev, 0, 0);
3771 md_set_array_sectors(mddev, size);
3772 mddev->resync_max_sectors = size;
3773
3774 if (mddev->queue) {
3775 int stripe = conf->geo.raid_disks *
3776 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3777 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3778 mddev->queue->backing_dev_info.congested_data = mddev;
3779
		/* Calculate max read-ahead size.
		 * We need to readahead at least twice a whole stripe....
		 * maybe...
		 */
3784 stripe /= conf->geo.near_copies;
3785 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3786 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3787 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3788 }
3789
3790
3791 if (md_integrity_register(mddev))
3792 goto out_free_conf;
3793
3794 if (conf->reshape_progress != MaxSector) {
3795 unsigned long before_length, after_length;
3796
3797 before_length = ((1 << conf->prev.chunk_shift) *
3798 conf->prev.far_copies);
3799 after_length = ((1 << conf->geo.chunk_shift) *
3800 conf->geo.far_copies);
3801
3802 if (max(before_length, after_length) > min_offset_diff) {
			/* This cannot work */
			printk(KERN_ERR "md/raid10: offset difference not enough to continue reshape\n");
3805 goto out_free_conf;
3806 }
3807 conf->offset_diff = min_offset_diff;
3808
3809 conf->reshape_safe = conf->reshape_progress;
3810 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3811 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3812 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3813 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3814 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3815 "reshape");
3816 }
3817
3818 return 0;
3819
3820out_free_conf:
3821 md_unregister_thread(&mddev->thread);
3822 if (conf->r10bio_pool)
3823 mempool_destroy(conf->r10bio_pool);
3824 safe_put_page(conf->tmppage);
3825 kfree(conf->mirrors);
3826 kfree(conf);
3827 mddev->private = NULL;
3828out:
3829 return -EIO;
3830}
3831
3832static int stop(struct mddev *mddev)
3833{
3834 struct r10conf *conf = mddev->private;
3835
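	/* Briefly raising and lowering the barrier waits for all
	 * in-flight normal IO to drain before tearing down.
	 */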
3836 raise_barrier(conf, 0);
3837 lower_barrier(conf);
3838
3839 md_unregister_thread(&mddev->thread);
3840 if (mddev->queue)
		/* the unplug fn references 'conf' */
3842 blk_sync_queue(mddev->queue);
3843
3844 if (conf->r10bio_pool)
3845 mempool_destroy(conf->r10bio_pool);
3846 safe_put_page(conf->tmppage);
3847 kfree(conf->mirrors);
3848 kfree(conf);
3849 mddev->private = NULL;
3850 return 0;
3851}
3852
3853static void raid10_quiesce(struct mddev *mddev, int state)
3854{
3855 struct r10conf *conf = mddev->private;
3856
3857 switch(state) {
3858 case 1:
3859 raise_barrier(conf, 0);
3860 break;
3861 case 0:
3862 lower_barrier(conf);
3863 break;
3864 }
3865}
3866
3867static int raid10_resize(struct mddev *mddev, sector_t sectors)
3868{
	/* Resize of 'far' arrays is not supported.
	 * For 'near' and 'offset' arrays we can set the
	 * number of sectors used to be an appropriate multiple
	 * of the chunk size.
	 * For 'offset', this is far_copies*chunksize.
	 * For 'near' the multiplier is the LCM of
	 * near_copies and raid_disks.
	 * So if far_copies > 1 && !far_offset, fail.
	 * Else find LCM(raid_disks, near_copy)*far_copies and
	 * multiply by chunk_size.  Then round to this number.
	 * This is mostly done by raid10_size()
	 */
3881 struct r10conf *conf = mddev->private;
3882 sector_t oldsize, size;
3883
3884 if (mddev->reshape_position != MaxSector)
3885 return -EBUSY;
3886
3887 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3888 return -EINVAL;
3889
3890 oldsize = raid10_size(mddev, 0, 0);
3891 size = raid10_size(mddev, sectors, 0);
3892 if (mddev->external_size &&
3893 mddev->array_sectors > size)
3894 return -EINVAL;
3895 if (mddev->bitmap) {
3896 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3897 if (ret)
3898 return ret;
3899 }
3900 md_set_array_sectors(mddev, size);
3901 set_capacity(mddev->gendisk, mddev->array_sectors);
3902 revalidate_disk(mddev->gendisk);
3903 if (sectors > mddev->dev_sectors &&
3904 mddev->recovery_cp > oldsize) {
3905 mddev->recovery_cp = oldsize;
3906 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3907 }
3908 calc_sectors(conf, sectors);
3909 mddev->dev_sectors = conf->dev_sectors;
3910 mddev->resync_max_sectors = size;
3911 return 0;
3912}
3913
3914static void *raid10_takeover_raid0(struct mddev *mddev)
3915{
3916 struct md_rdev *rdev;
3917 struct r10conf *conf;
3918
3919 if (mddev->degraded > 0) {
3920 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3921 mdname(mddev));
3922 return ERR_PTR(-EINVAL);
3923 }
3924
	/* Set new parameters */
3926 mddev->new_level = 10;
	/* new layout: far_copies = 1, near_copies = 2 */
3928 mddev->new_layout = (1<<8) + 2;
3929 mddev->new_chunk_sectors = mddev->chunk_sectors;
3930 mddev->delta_disks = mddev->raid_disks;
3931 mddev->raid_disks *= 2;
	/* make sure it will be not marked as dirty */
3933 mddev->recovery_cp = MaxSector;
3934
3935 conf = setup_conf(mddev);
3936 if (!IS_ERR(conf)) {
3937 rdev_for_each(rdev, mddev)
3938 if (rdev->raid_disk >= 0)
3939 rdev->new_raid_disk = rdev->raid_disk * 2;
3940 conf->barrier = 1;
3941 }
3942
3943 return conf;
3944}
3945
3946static void *raid10_takeover(struct mddev *mddev)
3947{
3948 struct r0conf *raid0_conf;
3949
	/* raid10 can take over:
	 *  raid0 - providing it has only one strip zone
	 */
3953 if (mddev->level == 0) {
		/* for raid0 takeover only one zone is supported */
3955 raid0_conf = mddev->private;
3956 if (raid0_conf->nr_strip_zones > 1) {
3957 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3958 " with more than one zone.\n",
3959 mdname(mddev));
3960 return ERR_PTR(-EINVAL);
3961 }
3962 return raid10_takeover_raid0(mddev);
3963 }
3964 return ERR_PTR(-EINVAL);
3965}
3966
3967static int raid10_check_reshape(struct mddev *mddev)
3968{
	/* Called when there is a request to change
	 * - layout (to ->new_layout)
	 * - chunk size (to ->new_chunk_sectors)
	 * - raid_disks (by delta_disks)
	 * or when trying to restart a reshape that was ongoing.
	 *
	 * We need to validate the request and possibly allocate
	 * space if that might be an issue later.
	 *
	 * Currently we reject any reshape of a 'far' mode array,
	 * allow chunk size to change if new is generally acceptable,
	 * allow raid_disks to increase, and allow
	 * a switch between 'near' mode and 'offset' mode.
	 */
3983 struct r10conf *conf = mddev->private;
3984 struct geom geo;
3985
3986 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3987 return -EINVAL;
3988
3989 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
		/* Cannot change number of copies */
3991 return -EINVAL;
3992 if (geo.far_copies > 1 && !geo.far_offset)
		/* Cannot switch to 'far' mode */
3994 return -EINVAL;
3995
3996 if (mddev->array_sectors & geo.chunk_mask)
			/* not factor of array size */
3998 return -EINVAL;
3999
4000 if (!enough(conf, -1))
4001 return -EINVAL;
4002
4003 kfree(conf->mirrors_new);
4004 conf->mirrors_new = NULL;
4005 if (mddev->delta_disks > 0) {
		/* allocate new 'mirrors' list */
4007 conf->mirrors_new = kzalloc(
4008 sizeof(struct raid10_info)
4009 *(mddev->raid_disks +
4010 mddev->delta_disks),
4011 GFP_KERNEL);
4012 if (!conf->mirrors_new)
4013 return -ENOMEM;
4014 }
4015 return 0;
4016}
4017
/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
4031static int calc_degraded(struct r10conf *conf)
4032{
4033 int degraded, degraded2;
4034 int i;
4035
4036 rcu_read_lock();
4037 degraded = 0;
	/* 'prev' section first */
4039 for (i = 0; i < conf->prev.raid_disks; i++) {
4040 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4041 if (!rdev || test_bit(Faulty, &rdev->flags))
4042 degraded++;
4043 else if (!test_bit(In_sync, &rdev->flags))
			/* When we can reduce the number of devices in
			 * an array, this might not contribute to
			 * 'degraded'.  It does now.
			 */
4048 degraded++;
4049 }
4050 rcu_read_unlock();
4051 if (conf->geo.raid_disks == conf->prev.raid_disks)
4052 return degraded;
4053 rcu_read_lock();
4054 degraded2 = 0;
4055 for (i = 0; i < conf->geo.raid_disks; i++) {
4056 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4057 if (!rdev || test_bit(Faulty, &rdev->flags))
4058 degraded2++;
4059 else if (!test_bit(In_sync, &rdev->flags)) {
			/* If reshape is increasing the number of devices,
			 * this section has already been recovered, so
			 * it doesn't contribute to degraded.
			 * else it does.
			 */
4065 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4066 degraded2++;
4067 }
4068 }
4069 rcu_read_unlock();
4070 if (degraded2 > degraded)
4071 return degraded2;
4072 return degraded;
4073}
4074
4075static int raid10_start_reshape(struct mddev *mddev)
4076{
	/* A 'reshape' has been requested. This commits
	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
	 * This also checks if there are enough spares and adds them
	 * to the array.
	 * We currently require enough spares to make the final
	 * array non-degraded.  We also require that the difference
	 * between old and new data_offset - on each device - is
	 * enough that we never risk over-writing.
	 */

4087 unsigned long before_length, after_length;
4088 sector_t min_offset_diff = 0;
4089 int first = 1;
4090 struct geom new;
4091 struct r10conf *conf = mddev->private;
4092 struct md_rdev *rdev;
4093 int spares = 0;
4094 int ret;
4095
4096 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4097 return -EBUSY;
4098
4099 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4100 return -EINVAL;
4101
4102 before_length = ((1 << conf->prev.chunk_shift) *
4103 conf->prev.far_copies);
4104 after_length = ((1 << conf->geo.chunk_shift) *
4105 conf->geo.far_copies);
4106
4107 rdev_for_each(rdev, mddev) {
4108 if (!test_bit(In_sync, &rdev->flags)
4109 && !test_bit(Faulty, &rdev->flags))
4110 spares++;
4111 if (rdev->raid_disk >= 0) {
4112 long long diff = (rdev->new_data_offset
4113 - rdev->data_offset);
4114 if (!mddev->reshape_backwards)
4115 diff = -diff;
4116 if (diff < 0)
4117 diff = 0;
4118 if (first || diff < min_offset_diff)
4119 min_offset_diff = diff;
4120 }
4121 }
4122
4123 if (max(before_length, after_length) > min_offset_diff)
4124 return -EINVAL;
4125
4126 if (spares < mddev->delta_disks)
4127 return -EINVAL;
4128
4129 conf->offset_diff = min_offset_diff;
4130 spin_lock_irq(&conf->device_lock);
4131 if (conf->mirrors_new) {
4132 memcpy(conf->mirrors_new, conf->mirrors,
4133 sizeof(struct raid10_info)*conf->prev.raid_disks);
4134 smp_mb();
4135 kfree(conf->mirrors_old);
4136 conf->mirrors_old = conf->mirrors;
4137 conf->mirrors = conf->mirrors_new;
4138 conf->mirrors_new = NULL;
4139 }
4140 setup_geo(&conf->geo, mddev, geo_start);
4141 smp_mb();
4142 if (mddev->reshape_backwards) {
4143 sector_t size = raid10_size(mddev, 0, 0);
4144 if (size < mddev->array_sectors) {
4145 spin_unlock_irq(&conf->device_lock);
			printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n",
			       mdname(mddev));
4148 return -EINVAL;
4149 }
4150 mddev->resync_max_sectors = size;
4151 conf->reshape_progress = size;
4152 } else
4153 conf->reshape_progress = 0;
4154 spin_unlock_irq(&conf->device_lock);
4155
4156 if (mddev->delta_disks && mddev->bitmap) {
4157 ret = bitmap_resize(mddev->bitmap,
4158 raid10_size(mddev, 0,
4159 conf->geo.raid_disks),
4160 0, 0);
4161 if (ret)
4162 goto abort;
4163 }
4164 if (mddev->delta_disks > 0) {
4165 rdev_for_each(rdev, mddev)
4166 if (rdev->raid_disk < 0 &&
4167 !test_bit(Faulty, &rdev->flags)) {
4168 if (raid10_add_disk(mddev, rdev) == 0) {
4169 if (rdev->raid_disk >=
4170 conf->prev.raid_disks)
4171 set_bit(In_sync, &rdev->flags);
4172 else
4173 rdev->recovery_offset = 0;
4174
4175 if (sysfs_link_rdev(mddev, rdev))
					/* Failure here is OK */;
4177 }
4178 } else if (rdev->raid_disk >= conf->prev.raid_disks
4179 && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
4181 set_bit(In_sync, &rdev->flags);
4182 }
4183 }
4184
	/* When a reshape changes the number of devices,
	 * ->degraded is measured against the larger of the
	 * pre and post numbers.
	 */
4188 spin_lock_irq(&conf->device_lock);
4189 mddev->degraded = calc_degraded(conf);
4190 spin_unlock_irq(&conf->device_lock);
4191 mddev->raid_disks = conf->geo.raid_disks;
4192 mddev->reshape_position = conf->reshape_progress;
4193 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4194
4195 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4196 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4197 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4198 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4199
4200 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4201 "reshape");
4202 if (!mddev->sync_thread) {
4203 ret = -EAGAIN;
4204 goto abort;
4205 }
4206 conf->reshape_checkpoint = jiffies;
4207 md_wakeup_thread(mddev->sync_thread);
4208 md_new_event(mddev);
4209 return 0;
4210
4211abort:
4212 mddev->recovery = 0;
4213 spin_lock_irq(&conf->device_lock);
4214 conf->geo = conf->prev;
4215 mddev->raid_disks = conf->geo.raid_disks;
4216 rdev_for_each(rdev, mddev)
4217 rdev->new_data_offset = rdev->data_offset;
4218 smp_wmb();
4219 conf->reshape_progress = MaxSector;
4220 mddev->reshape_position = MaxSector;
4221 spin_unlock_irq(&conf->device_lock);
4222 return ret;
4223}
4224
/* Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address.
 * i.e. the address returned will be chunk-aligned and after
 * any data that is in the chunk containing 's'.
 */
4231static sector_t last_dev_address(sector_t s, struct geom *geo)
4232{
4233 s = (s | geo->chunk_mask) + 1;
4234 s >>= geo->chunk_shift;
4235 s *= geo->near_copies;
4236 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4237 s *= geo->far_copies;
4238 s <<= geo->chunk_shift;
4239 return s;
4240}
4241
/* Calculate the first device-address that could contain
 * any block from the chunk that includes the array-address 's'.
 * This too will be the start of a chunk.
 */
4246static sector_t first_dev_address(sector_t s, struct geom *geo)
4247{
4248 s >>= geo->chunk_shift;
4249 s *= geo->near_copies;
4250 sector_div(s, geo->raid_disks);
4251 s *= geo->far_copies;
4252 s <<= geo->chunk_shift;
4253 return s;
4254}
4255
4256static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4257 int *skipped)
4258{
	/* Reshaping is handled quite separately from recovery/resync.
	 *
	 * Each call copies one range of the array: the data is read
	 * from one device using the old ('prev') layout and written to
	 * every device (and replacement) using the new layout.
	 *
	 * Two pointers are maintained: conf->reshape_progress records
	 * how far the copy has gone, conf->reshape_safe how far the
	 * on-disk metadata knows about.  Before writing a region in the
	 * new layout that could overlap device addresses which the old
	 * layout still needs (compared via first_dev_address() /
	 * last_dev_address() and ->offset_diff), or roughly every 10
	 * seconds, the metadata must be updated and flushed - the
	 * 'need_flush' handling below.
	 *
	 * If ->reshape_backwards is set we work from the end of the
	 * array towards the start, otherwise from the start upwards.
	 * Each pass handles at most RESYNC_BLOCK_SIZE/512 sectors,
	 * aligned so that whole chunks of both the old and new layout
	 * are covered.
	 *
	 * The return value is the number of sectors handled, as for a
	 * normal resync request.
	 */
4296 struct r10conf *conf = mddev->private;
4297 struct r10bio *r10_bio;
4298 sector_t next, safe, last;
4299 int max_sectors;
4300 int nr_sectors;
4301 int s;
4302 struct md_rdev *rdev;
4303 int need_flush = 0;
4304 struct bio *blist;
4305 struct bio *bio, *read_bio;
4306 int sectors_done = 0;
4307
4308 if (sector_nr == 0) {
		/* If restarting in the middle, skip the initial sectors */
4310 if (mddev->reshape_backwards &&
4311 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4312 sector_nr = (raid10_size(mddev, 0, 0)
4313 - conf->reshape_progress);
4314 } else if (!mddev->reshape_backwards &&
4315 conf->reshape_progress > 0)
4316 sector_nr = conf->reshape_progress;
4317 if (sector_nr) {
4318 mddev->curr_resync_completed = sector_nr;
4319 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4320 *skipped = 1;
4321 return sector_nr;
4322 }
4323 }
4324
	/* We don't use sector_nr to track where we are up to
	 * as that doesn't work well for ->reshape_backwards.
	 * So just use ->reshape_progress.
	 */
4329 if (mddev->reshape_backwards) {
		/* 'next' is the earliest device address that we might
		 * write to for this chunk in the new layout
		 */
4333 next = first_dev_address(conf->reshape_progress - 1,
4334 &conf->geo);
4335
		/* 'safe' is the last device address that we might read from
		 * in the old layout after a restart
		 */
4339 safe = last_dev_address(conf->reshape_safe - 1,
4340 &conf->prev);
4341
4342 if (next + conf->offset_diff < safe)
4343 need_flush = 1;
4344
4345 last = conf->reshape_progress - 1;
4346 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4347 & conf->prev.chunk_mask);
4348 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4349 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4350 } else {
		/* 'next' is after the last device address that we
		 * might write to for this chunk in the new layout
		 */
4354 next = last_dev_address(conf->reshape_progress, &conf->geo);
4355
		/* 'safe' is the earliest device address that we might
		 * read from in the old layout after a restart
		 */
4359 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4360
		/* Need to update metadata if 'next' might be beyond 'safe'
		 * as that would possibly corrupt data
		 */
4364 if (next > safe + conf->offset_diff)
4365 need_flush = 1;
4366
4367 sector_nr = conf->reshape_progress;
4368 last = sector_nr | (conf->geo.chunk_mask
4369 & conf->prev.chunk_mask);
4370
4371 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4372 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4373 }
4374
4375 if (need_flush ||
4376 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
		/* Need to update reshape_position in metadata */
4378 wait_barrier(conf);
4379 mddev->reshape_position = conf->reshape_progress;
4380 if (mddev->reshape_backwards)
4381 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4382 - conf->reshape_progress;
4383 else
4384 mddev->curr_resync_completed = conf->reshape_progress;
4385 conf->reshape_checkpoint = jiffies;
4386 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4387 md_wakeup_thread(mddev->thread);
4388 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4389 kthread_should_stop());
4390 conf->reshape_safe = mddev->reshape_position;
4391 allow_barrier(conf);
4392 }
4393
4394read_more:
	/* Now schedule reads for blocks from sector_nr to last */
4396 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4397 raise_barrier(conf, sectors_done != 0);
4398 atomic_set(&r10_bio->remaining, 0);
4399 r10_bio->mddev = mddev;
4400 r10_bio->sector = sector_nr;
4401 set_bit(R10BIO_IsReshape, &r10_bio->state);
4402 r10_bio->sectors = last - sector_nr + 1;
4403 rdev = read_balance(conf, r10_bio, &max_sectors);
4404 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4405
4406 if (!rdev) {
		/* Cannot read from here, so need to record bad blocks
		 * on all the target devices.
		 */
		/* FIXME */
4411 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4412 return sectors_done;
4413 }
4414
4415 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4416
4417 read_bio->bi_bdev = rdev->bdev;
4418 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4419 + rdev->data_offset);
4420 read_bio->bi_private = r10_bio;
4421 read_bio->bi_end_io = end_sync_read;
4422 read_bio->bi_rw = READ;
4423 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4424 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4425 read_bio->bi_vcnt = 0;
4426 read_bio->bi_size = 0;
4427 r10_bio->master_bio = read_bio;
4428 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4429
	/* Now find the locations in the new layout */
4431 __raid10_find_phys(&conf->geo, r10_bio);
4432
4433 blist = read_bio;
4434 read_bio->bi_next = NULL;
4435
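	/* Queue one write per copy: even 's' addresses the main rdev
	 * for the slot, odd 's' its replacement.
	 */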
4436 for (s = 0; s < conf->copies*2; s++) {
4437 struct bio *b;
4438 int d = r10_bio->devs[s/2].devnum;
4439 struct md_rdev *rdev2;
4440 if (s&1) {
4441 rdev2 = conf->mirrors[d].replacement;
4442 b = r10_bio->devs[s/2].repl_bio;
4443 } else {
4444 rdev2 = conf->mirrors[d].rdev;
4445 b = r10_bio->devs[s/2].bio;
4446 }
4447 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4448 continue;
4449
4450 bio_reset(b);
4451 b->bi_bdev = rdev2->bdev;
4452 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4453 b->bi_private = r10_bio;
4454 b->bi_end_io = end_reshape_write;
4455 b->bi_rw = WRITE;
4456 b->bi_next = blist;
4457 blist = b;
4458 }
4459
	/* Now add as many pages as possible to all of these bios. */
4461
4462 nr_sectors = 0;
4463 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4464 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4465 int len = (max_sectors - s) << 9;
4466 if (len > PAGE_SIZE)
4467 len = PAGE_SIZE;
4468 for (bio = blist; bio ; bio = bio->bi_next) {
4469 struct bio *bio2;
4470 if (bio_add_page(bio, page, len, 0))
4471 continue;
4472
			/* Didn't fit, must stop */
4474 for (bio2 = blist;
4475 bio2 && bio2 != bio;
4476 bio2 = bio2->bi_next) {
				/* Remove last page from this bio */
4478 bio2->bi_vcnt--;
4479 bio2->bi_size -= len;
4480 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4481 }
4482 goto bio_full;
4483 }
4484 sector_nr += len >> 9;
4485 nr_sectors += len >> 9;
4486 }
4487bio_full:
4488 r10_bio->sectors = nr_sectors;
4489
	/* Now submit the read */
4491 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4492 atomic_inc(&r10_bio->remaining);
4493 read_bio->bi_next = NULL;
4494 generic_make_request(read_bio);
4495 sector_nr += nr_sectors;
4496 sectors_done += nr_sectors;
4497 if (sector_nr <= last)
4498 goto read_more;
4499
	/* Now that we have done the whole section we can
	 * update reshape_progress
	 */
4503 if (mddev->reshape_backwards)
4504 conf->reshape_progress -= sectors_done;
4505 else
4506 conf->reshape_progress += sectors_done;
4507
4508 return sectors_done;
4509}
4510
4511static void end_reshape_request(struct r10bio *r10_bio);
4512static int handle_reshape_read_error(struct mddev *mddev,
4513 struct r10bio *r10_bio);
4514static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4515{
	/* Reshape read completed.  Hopefully we have a block
	 * to write out.
	 * If we got a read error then we do sync 1-page reads from
	 * elsewhere until we find the data - or give up.
	 */
4521 struct r10conf *conf = mddev->private;
4522 int s;
4523
4524 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4525 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
			/* Reshape has been aborted */
4527 md_done_sync(mddev, r10_bio->sectors, 0);
4528 return;
4529 }

	/* We definitely have the data in the pages, schedule the
	 * writes.
	 */
4534 atomic_set(&r10_bio->remaining, 1);
4535 for (s = 0; s < conf->copies*2; s++) {
4536 struct bio *b;
4537 int d = r10_bio->devs[s/2].devnum;
4538 struct md_rdev *rdev;
4539 if (s&1) {
4540 rdev = conf->mirrors[d].replacement;
4541 b = r10_bio->devs[s/2].repl_bio;
4542 } else {
4543 rdev = conf->mirrors[d].rdev;
4544 b = r10_bio->devs[s/2].bio;
4545 }
4546 if (!rdev || test_bit(Faulty, &rdev->flags))
4547 continue;
4548 atomic_inc(&rdev->nr_pending);
4549 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4550 atomic_inc(&r10_bio->remaining);
4551 b->bi_next = NULL;
4552 generic_make_request(b);
4553 }
4554 end_reshape_request(r10_bio);
4555}
4556
4557static void end_reshape(struct r10conf *conf)
4558{
4559 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4560 return;
4561
4562 spin_lock_irq(&conf->device_lock);
4563 conf->prev = conf->geo;
4564 md_finish_reshape(conf->mddev);
4565 smp_wmb();
4566 conf->reshape_progress = MaxSector;
4567 spin_unlock_irq(&conf->device_lock);
4568
	/* read-ahead size must cover two whole stripes, which is
	 * 2 * (datadisks) * chunksize where 'n' is number of raid devices
	 */
4572 if (conf->mddev->queue) {
4573 int stripe = conf->geo.raid_disks *
4574 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4575 stripe /= conf->geo.near_copies;
4576 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4577 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4578 }
4579 conf->fullsync = 0;
4580}
4581
4582
4583static int handle_reshape_read_error(struct mddev *mddev,
4584 struct r10bio *r10_bio)
4585{
	/* Use sync reads to get the blocks from somewhere else */
4587 int sectors = r10_bio->sectors;
4588 struct r10conf *conf = mddev->private;
4589 struct {
4590 struct r10bio r10_bio;
4591 struct r10dev devs[conf->copies];
4592 } on_stack;
4593 struct r10bio *r10b = &on_stack.r10_bio;
4594 int slot = 0;
4595 int idx = 0;
4596 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4597
4598 r10b->sector = r10_bio->sector;
4599 __raid10_find_phys(&conf->prev, r10b);
4600
4601 while (sectors) {
4602 int s = sectors;
4603 int success = 0;
4604 int first_slot = slot;
4605
4606 if (s > (PAGE_SIZE >> 9))
4607 s = PAGE_SIZE >> 9;
4608
4609 while (!success) {
4610 int d = r10b->devs[slot].devnum;
4611 struct md_rdev *rdev = conf->mirrors[d].rdev;
4612 sector_t addr;
4613 if (rdev == NULL ||
4614 test_bit(Faulty, &rdev->flags) ||
4615 !test_bit(In_sync, &rdev->flags))
4616 goto failed;
4617
4618 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4619 success = sync_page_io(rdev,
4620 addr,
4621 s << 9,
4622 bvec[idx].bv_page,
4623 READ, false);
4624 if (success)
4625 break;
4626 failed:
4627 slot++;
4628 if (slot >= conf->copies)
4629 slot = 0;
4630 if (slot == first_slot)
4631 break;
4632 }
4633 if (!success) {
			/* couldn't read this block, must give up */
4635 set_bit(MD_RECOVERY_INTR,
4636 &mddev->recovery);
4637 return -EIO;
4638 }
4639 sectors -= s;
4640 idx++;
4641 }
4642 return 0;
4643}
4644
4645static void end_reshape_write(struct bio *bio, int error)
4646{
4647 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4648 struct r10bio *r10_bio = bio->bi_private;
4649 struct mddev *mddev = r10_bio->mddev;
4650 struct r10conf *conf = mddev->private;
4651 int d;
4652 int slot;
4653 int repl;
4654 struct md_rdev *rdev = NULL;
4655
4656 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4657 if (repl)
4658 rdev = conf->mirrors[d].replacement;
4659 if (!rdev) {
4660 smp_mb();
4661 rdev = conf->mirrors[d].rdev;
4662 }
4663
4664 if (!uptodate) {
		/* FIXME should record badblock */
4666 md_error(mddev, rdev);
4667 }
4668
4669 rdev_dec_pending(rdev, mddev);
4670 end_reshape_request(r10_bio);
4671}
4672
4673static void end_reshape_request(struct r10bio *r10_bio)
4674{
4675 if (!atomic_dec_and_test(&r10_bio->remaining))
4676 return;
4677 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4678 bio_put(r10_bio->master_bio);
4679 put_buf(r10_bio);
4680}
4681
4682static void raid10_finish_reshape(struct mddev *mddev)
4683{
4684 struct r10conf *conf = mddev->private;
4685
4686 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4687 return;
4688
4689 if (mddev->delta_disks > 0) {
4690 sector_t size = raid10_size(mddev, 0, 0);
4691 md_set_array_sectors(mddev, size);
4692 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4693 mddev->recovery_cp = mddev->resync_max_sectors;
4694 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4695 }
4696 mddev->resync_max_sectors = size;
4697 set_capacity(mddev->gendisk, mddev->array_sectors);
4698 revalidate_disk(mddev->gendisk);
4699 } else {
4700 int d;
		for (d = conf->geo.raid_disks;
		     d < conf->geo.raid_disks - mddev->delta_disks;
		     d++) {
4704 struct md_rdev *rdev = conf->mirrors[d].rdev;
4705 if (rdev)
4706 clear_bit(In_sync, &rdev->flags);
4707 rdev = conf->mirrors[d].replacement;
4708 if (rdev)
4709 clear_bit(In_sync, &rdev->flags);
4710 }
4711 }
4712 mddev->layout = mddev->new_layout;
4713 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4714 mddev->reshape_position = MaxSector;
4715 mddev->delta_disks = 0;
4716 mddev->reshape_backwards = 0;
4717}
4718
4719static struct md_personality raid10_personality =
4720{
4721 .name = "raid10",
4722 .level = 10,
4723 .owner = THIS_MODULE,
4724 .make_request = make_request,
4725 .run = run,
4726 .stop = stop,
4727 .status = status,
4728 .error_handler = error,
4729 .hot_add_disk = raid10_add_disk,
4730 .hot_remove_disk= raid10_remove_disk,
4731 .spare_active = raid10_spare_active,
4732 .sync_request = sync_request,
4733 .quiesce = raid10_quiesce,
4734 .size = raid10_size,
4735 .resize = raid10_resize,
4736 .takeover = raid10_takeover,
4737 .check_reshape = raid10_check_reshape,
4738 .start_reshape = raid10_start_reshape,
4739 .finish_reshape = raid10_finish_reshape,
4740};
4741
4742static int __init raid_init(void)
4743{
4744 return register_md_personality(&raid10_personality);
4745}
4746
4747static void raid_exit(void)
4748{
4749 unregister_md_personality(&raid10_personality);
4750}
4751
4752module_init(raid_init);
4753module_exit(raid_exit);
4754MODULE_LICENSE("GPL");
4755MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4756MODULE_ALIAS("md-personality-9");
4757MODULE_ALIAS("md-raid10");
4758MODULE_ALIAS("md-level-10");
4759
4760module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4761