/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * RAID-10 support for md.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"
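
/*
 * RAID-10 layout (see __raid10_find_phys() below):
 *
 * The array is divided into chunks of (chunk_mask + 1) sectors.
 * 'near_copies' copies of each chunk are placed on consecutive
 * devices, and 'far_copies' further copies are placed 'stride'
 * sectors later on a rotated device (or in the adjacent stripe
 * when 'far_offset' is set).  Devices are grouped into sets of
 * 'far_set_size' for the far-copy rotation.
 */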
#define NR_RAID10_BIOS 256

/*
 * 'IO_BLOCKED' and 'IO_MADE_GOOD' are placeholder values stored in the
 * per-device bio pointers of an r10bio instead of a real struct bio.
 * BIO_SPECIAL() tests for either of them (or NULL).
 */
#define IO_BLOCKED ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)

#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)

/* When this many requests are queued for the raid10 thread, the array
 * reports itself congested and new writers are throttled.
 */
static int max_queued_requests = 1024;

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio, int error);
static void end_reshape(struct r10conf *conf);

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int size = offsetof(struct r10bio, devs[conf->copies]);

	/* allocate an r10bio with room for conf->copies entries in the
	 * devs array */
	return kzalloc(size, gfp_flags);
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

/* Buffers for resync/recovery are allocated from r10buf_pool in units of
 * RESYNC_BLOCK_SIZE, with a page vector of RESYNC_PAGES pages per bio.
 */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

#define RESYNC_WINDOW (1024*1024)

#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)

/*
 * When performing a resync we need to read and compare, so we need a bio
 * for every copy.  When performing a recovery we only need two bios:
 * one to read and one to write.
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	struct page *page;
	struct r10bio *r10_bio;
	struct bio *bio;
	int i, j;
	int nalloc;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio)
		return NULL;

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
		if (!conf->have_replacement)
			continue;
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].repl_bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0 ; j < nalloc; j++) {
		struct bio *rbio = r10_bio->devs[j].repl_bio;
		bio = r10_bio->devs[j].bio;
		for (i = 0; i < RESYNC_PAGES; i++) {
			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
					       &conf->mddev->recovery)) {
				/* we can share bv_page's during recovery
				 * and reshape */
				struct bio *rbio = r10_bio->devs[0].bio;
				page = rbio->bi_io_vec[i].bv_page;
				get_page(page);
			} else
				page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
			if (rbio)
				rbio->bi_io_vec[i].bv_page = page;
		}
	}

	return r10_bio;

out_free_pages:
	for ( ; i > 0 ; i--)
		safe_put_page(bio->bi_io_vec[i-1].bv_page);
	while (j--)
		for (i = 0; i < RESYNC_PAGES ; i++)
			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
	j = 0;
out_free_bio:
	for ( ; j < nalloc; j++) {
		if (r10_bio->devs[j].bio)
			bio_put(r10_bio->devs[j].bio);
		if (r10_bio->devs[j].repl_bio)
			bio_put(r10_bio->devs[j].repl_bio);
	}
	r10bio_pool_free(r10_bio, conf);
	return NULL;
}

static void r10buf_pool_free(void *__r10_bio, void *data)
{
	int i;
	struct r10conf *conf = data;
	struct r10bio *r10bio = __r10_bio;
	int j;

	for (j=0; j < conf->copies; j++) {
		struct bio *bio = r10bio->devs[j].bio;
		if (bio) {
			for (i = 0; i < RESYNC_PAGES; i++) {
				safe_put_page(bio->bi_io_vec[i].bv_page);
				bio->bi_io_vec[i].bv_page = NULL;
			}
			bio_put(bio);
		}
		bio = r10bio->devs[j].repl_bio;
		if (bio)
			bio_put(bio);
	}
	r10bio_pool_free(r10bio, conf);
}

static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{
	int i;

	for (i = 0; i < conf->copies; i++) {
		struct bio **bio = & r10_bio->devs[i].bio;
		if (!BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
		bio = &r10_bio->devs[i].repl_bio;
		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
	}
}

static void free_r10bio(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, conf->r10bio_pool);
}

static void put_buf(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	mempool_free(r10_bio, conf->r10buf_pool);

	lower_barrier(conf);
}

static void reschedule_retry(struct r10bio *r10_bio)
{
	unsigned long flags;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	conf->nr_queued ++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	/* wake up frozen array... */
	wake_up(&conf->wait_barrier);

	md_wakeup_thread(mddev->thread);
}
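
/*
 * Complete the master bio for an r10bio.  bi_phys_segments is (ab)used to
 * count how many r10bios still reference the master bio when a request had
 * to be split; the bio is only ended once that count drops to zero, after
 * which we release our hold on the barrier and free the r10bio.
 */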
static void raid_end_bio_io(struct r10bio *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;
	int done;
	struct r10conf *conf = r10_bio->mddev->private;

	if (bio->bi_phys_segments) {
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		bio->bi_phys_segments--;
		done = (bio->bi_phys_segments == 0);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	} else
		done = 1;
	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	if (done) {
		if (bio_data_dir(bio) == WRITE)
			md_write_end(r10_bio->mddev);
		bio_endio(bio, 0);
		/*
		 * Wake up any possible resync thread that waits for the device
		 * to go idle.
		 */
		allow_barrier(conf);
	}
	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

/*
 * Find the disk number which triggered given bio
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{
	int slot;
	int repl = 0;

	for (slot = 0; slot < conf->copies; slot++) {
		if (r10_bio->devs[slot].bio == bio)
			break;
		if (r10_bio->devs[slot].repl_bio == bio) {
			repl = 1;
			break;
		}
	}

	BUG_ON(slot == conf->copies);
	update_head_pos(slot, r10_bio);

	if (slotp)
		*slotp = slot;
	if (replp)
		*replp = repl;
	return r10_bio->devs[slot].devnum;
}

static void raid10_end_read_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct r10bio *r10_bio = bio->bi_private;
	int slot, dev;
	struct md_rdev *rdev;
	struct r10conf *conf = r10_bio->mddev->private;

	slot = r10_bio->read_slot;
	dev = r10_bio->devs[slot].devnum;
	rdev = r10_bio->devs[slot].rdev;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(slot, r10_bio);

	if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	} else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry".
		 */
		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
			     rdev->raid_disk))
			uptodate = 1;
	}
	if (uptodate) {
		raid_end_bio_io(r10_bio);
		rdev_dec_pending(rdev, conf->mddev);
	} else {
		/*
		 * oops, read error - keep the refcount on the rdev
		 */
		char b[BDEVNAME_SIZE];
		pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
				   mdname(conf->mddev),
				   bdevname(rdev->bdev, b),
				   (unsigned long long)r10_bio->sector);
		set_bit(R10BIO_ReadError, &r10_bio->state);
		reschedule_retry(r10_bio);
	}
}

static void close_write(struct r10bio *r10_bio)
{
	/* clear the bitmap if all writes complete successfully */
	bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
			r10_bio->sectors,
			!test_bit(R10BIO_Degraded, &r10_bio->state),
			0);
}

static void one_write_done(struct r10bio *r10_bio)
{
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		if (test_bit(R10BIO_WriteError, &r10_bio->state))
			reschedule_retry(r10_bio);
		else {
			close_write(r10_bio);
			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				raid_end_bio_io(r10_bio);
		}
	}
}

static void raid10_end_write_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct r10bio *r10_bio = bio->bi_private;
	int dev;
	int dec_rdev = 1;
	struct r10conf *conf = r10_bio->mddev->private;
	int slot, repl;
	struct md_rdev *rdev = NULL;
	struct bio *to_put = NULL;

	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);

	if (repl)
		rdev = conf->mirrors[dev].replacement;
	if (!rdev) {
		smp_rmb();
		repl = 0;
		rdev = conf->mirrors[dev].rdev;
	}
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate) {
		if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
			md_error(rdev->mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);

			dec_rdev = 0;
			if (test_bit(FailFast, &rdev->flags) &&
			    (bio->bi_rw & MD_FAILFAST)) {
				md_error(rdev->mddev, rdev);
				if (!test_bit(Faulty, &rdev->flags))
					/* This is the only remaining device,
					 * we need to retry the write without
					 * FailFast.
					 */
					set_bit(R10BIO_WriteError, &r10_bio->state);
				else {
					r10_bio->devs[slot].bio = NULL;
					to_put = bio;
					dec_rdev = 1;
				}
			} else
				set_bit(R10BIO_WriteError, &r10_bio->state);
		}
	} else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		sector_t first_bad;
		int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty. This is because we cannot use
		 * such device for properly reading the data back (we could
		 * potentially have a read error on the device, which
		 * would bring us into a deep recursion if we directly
		 * modify the original bio and do not do this trick).
		 */
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			set_bit(R10BIO_Uptodate, &r10_bio->state);

		/* Maybe we can clear some bad blocks. */
		if (is_badblock(rdev,
				r10_bio->devs[slot].addr,
				r10_bio->sectors,
				&first_bad, &bad_sectors)) {
			bio_put(bio);
			if (repl)
				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
			else
				r10_bio->devs[slot].bio = IO_MADE_GOOD;
			dec_rdev = 0;
			set_bit(R10BIO_MadeGood, &r10_bio->state);
		}
	}

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	one_write_done(r10_bio);
	if (dec_rdev)
		rdev_dec_pending(rdev, conf->mddev);
	if (to_put)
		bio_put(to_put);
}
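
/*
 * The geometry mapping in __raid10_find_phys() works on chunks:
 * logical chunk numbers are multiplied by near_copies and spread
 * round-robin across the raid_disks; each far copy is placed
 * near_copies devices further on (wrapping within its far set) and
 * 'stride' sectors later on that device.
 *
 * For example, with raid_disks=4, near_copies=2, far_copies=1,
 * logical chunk 0 lands on devices 0 and 1, chunk 1 on devices 2
 * and 3, chunk 2 on devices 0 and 1 one chunk further in, and so on.
 */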
static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{
	int n,f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;
	int slot = 0;
	int last_far_set_start, last_far_set_size;

	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
	last_far_set_start *= geo->far_set_size;

	last_far_set_size = geo->far_set_size;
	last_far_set_size += (geo->raid_disks % geo->far_set_size);

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> geo->chunk_shift;
	sector = r10bio->sector & geo->chunk_mask;

	chunk *= geo->near_copies;
	stripe = chunk;
	dev = sector_div(stripe, geo->raid_disks);
	if (geo->far_offset)
		stripe *= geo->far_copies;

	sector += stripe << geo->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		int set;
		sector_t s = sector;
		r10bio->devs[slot].devnum = d;
		r10bio->devs[slot].addr = s;
		slot++;

		for (f = 1; f < geo->far_copies; f++) {
			set = d / geo->far_set_size;
			d += geo->near_copies;

			if ((geo->raid_disks % geo->far_set_size) &&
			    (d > last_far_set_start)) {
				d -= last_far_set_start;
				d %= last_far_set_size;
				d += last_far_set_start;
			} else {
				d %= geo->far_set_size;
				d += geo->far_set_size * set;
			}
			s += geo->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		if (dev >= geo->raid_disks) {
			dev = 0;
			sector += (geo->chunk_mask + 1);
		}
	}
}

static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
{
	struct geom *geo = &conf->geo;

	if (conf->reshape_progress != MaxSector &&
	    ((r10bio->sector >= conf->reshape_progress) !=
	     conf->mddev->reshape_backwards)) {
		set_bit(R10BIO_Previous, &r10bio->state);
		geo = &conf->prev;
	} else
		clear_bit(R10BIO_Previous, &r10bio->state);

	__raid10_find_phys(geo, r10bio);
}

static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;
	/* Never use conf->prev as this is only called during resync
	 * or recovery, so reshape isn't happening
	 */
	struct geom *geo = &conf->geo;
	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
	int far_set_size = geo->far_set_size;
	int last_far_set_start;

	if (geo->raid_disks % geo->far_set_size) {
		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
		last_far_set_start *= geo->far_set_size;

		if (dev >= last_far_set_start) {
			far_set_size = geo->far_set_size;
			far_set_size += (geo->raid_disks % geo->far_set_size);
			far_set_start = last_far_set_start;
		}
	}

	offset = sector & geo->chunk_mask;
	if (geo->far_offset) {
		int fc;
		chunk = sector >> geo->chunk_shift;
		fc = sector_div(chunk, geo->far_copies);
		dev -= fc * geo->near_copies;
		if (dev < far_set_start)
			dev += far_set_size;
	} else {
		while (sector >= geo->stride) {
			sector -= geo->stride;
			if (dev < (geo->near_copies + far_set_start))
				dev += far_set_size - geo->near_copies;
			else
				dev -= geo->near_copies;
		}
		chunk = sector >> geo->chunk_shift;
	}
	vchunk = chunk * geo->raid_disks + dev;
	sector_div(vchunk, geo->near_copies);
	return (vchunk << geo->chunk_shift) + offset;
}
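
/*
 * raid10_mergeable_bvec() tells the block layer how many bytes can be
 * appended to a bio: at most up to the chunk boundary, and no more than
 * each underlying device's own merge_bvec_fn will accept for every copy
 * the request would touch.
 */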
static int raid10_mergeable_bvec(struct mddev *mddev,
				 struct bvec_merge_data *bvm,
				 struct bio_vec *biovec)
{
	struct r10conf *conf = mddev->private;
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	int max;
	unsigned int chunk_sectors;
	unsigned int bio_sectors = bvm->bi_size >> 9;
	struct geom *geo = &conf->geo;

	chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
	if (conf->reshape_progress != MaxSector &&
	    ((sector >= conf->reshape_progress) !=
	     conf->mddev->reshape_backwards))
		geo = &conf->prev;

	if (geo->near_copies < geo->raid_disks) {
		max = (chunk_sectors - ((sector & (chunk_sectors - 1))
					+ bio_sectors)) << 9;
		if (max < 0)
			/* bio_add cannot handle a negative return */
			max = 0;
		if (max <= biovec->bv_len && bio_sectors == 0)
			return biovec->bv_len;
	} else
		max = biovec->bv_len;

	if (mddev->merge_check_needed) {
		struct {
			struct r10bio r10_bio;
			struct r10dev devs[conf->copies];
		} on_stack;
		struct r10bio *r10_bio = &on_stack.r10_bio;
		int s;
		if (conf->reshape_progress != MaxSector) {
			/* Cannot give any guidance during reshape */
			if (max <= biovec->bv_len && bio_sectors == 0)
				return biovec->bv_len;
			return 0;
		}
		r10_bio->sector = sector;
		raid10_find_phys(conf, r10_bio);
		rcu_read_lock();
		for (s = 0; s < conf->copies; s++) {
			int disk = r10_bio->devs[s].devnum;
			struct md_rdev *rdev = rcu_dereference(
				conf->mirrors[disk].rdev);
			if (rdev && !test_bit(Faulty, &rdev->flags)) {
				struct request_queue *q =
					bdev_get_queue(rdev->bdev);
				if (q->merge_bvec_fn) {
					bvm->bi_sector = r10_bio->devs[s].addr
						+ rdev->data_offset;
					bvm->bi_bdev = rdev->bdev;
					max = min(max, q->merge_bvec_fn(
							  q, bvm, biovec));
				}
			}
			rdev = rcu_dereference(conf->mirrors[disk].replacement);
			if (rdev && !test_bit(Faulty, &rdev->flags)) {
				struct request_queue *q =
					bdev_get_queue(rdev->bdev);
				if (q->merge_bvec_fn) {
					bvm->bi_sector = r10_bio->devs[s].addr
						+ rdev->data_offset;
					bvm->bi_bdev = rdev->bdev;
					max = min(max, q->merge_bvec_fn(
							  q, bvm, biovec));
				}
			}
		}
		rcu_read_unlock();
	}
	return max;
}
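
/*
 * read_balance() picks the rdev to read from for this r10bio.  Slots with
 * known bad blocks covering the start of the range are avoided (or the
 * request is shortened, reported via *max_sectors).  When balancing is
 * allowed, an idle device is preferred for 'near' layouts, the lowest
 * device address for 'far' layouts, and otherwise the device whose head
 * is closest to the target sector.
 *
 * The chosen rdev is returned with nr_pending incremented.
 */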
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot;
	int sectors = r10_bio->sectors;
	int best_good_sectors;
	sector_t new_distance, best_dist;
	struct md_rdev *best_rdev, *rdev = NULL;
	int do_balance;
	int best_slot;
	struct geom *geo = &conf->geo;

	raid10_find_phys(conf, r10_bio);
	rcu_read_lock();
	sectors = r10_bio->sectors;
	best_slot = -1;
	best_rdev = NULL;
	best_dist = MaxSector;
	best_good_sectors = 0;
	do_balance = 1;
	clear_bit(R10BIO_FailFast, &r10_bio->state);
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
	if (conf->mddev->recovery_cp < MaxSector
	    && (this_sector + sectors >= conf->next_resync))
		do_balance = 0;

	for (slot = 0; slot < conf->copies ; slot++) {
		sector_t first_bad;
		int bad_sectors;
		sector_t dev_sector;

		if (r10_bio->devs[slot].bio == IO_BLOCKED)
			continue;
		disk = r10_bio->devs[slot].devnum;
		rdev = rcu_dereference(conf->mirrors[disk].replacement);
		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
		    test_bit(Unmerged, &rdev->flags) ||
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			rdev = rcu_dereference(conf->mirrors[disk].rdev);
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags) ||
		    test_bit(Unmerged, &rdev->flags))
			continue;
		if (!test_bit(In_sync, &rdev->flags) &&
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			continue;

		dev_sector = r10_bio->devs[slot].addr;
		if (is_badblock(rdev, dev_sector, sectors,
				&first_bad, &bad_sectors)) {
			if (best_dist < MaxSector)
				/* Already have a better slot */
				continue;
			if (first_bad <= dev_sector) {
				/* Cannot read here.  If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
				bad_sectors -= (dev_sector - first_bad);
				if (!do_balance && sectors > bad_sectors)
					sectors = bad_sectors;
				if (best_good_sectors > sectors)
					best_good_sectors = sectors;
			} else {
				sector_t good_sectors =
					first_bad - dev_sector;
				if (good_sectors > best_good_sectors) {
					best_good_sectors = good_sectors;
					best_slot = slot;
					best_rdev = rdev;
				}
				if (!do_balance)
					/* Must read from here */
					break;
			}
			continue;
		} else
			best_good_sectors = sectors;

		if (!do_balance)
			break;

		if (best_slot >= 0)
			/* At least 2 disks to choose from so failfast is OK */
			set_bit(R10BIO_FailFast, &r10_bio->state);
		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
			new_distance = 0;

		/* for far > 1 always use the lowest address */
		else if (geo->far_copies > 1)
			new_distance = r10_bio->devs[slot].addr;
		else
			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);
		if (new_distance < best_dist) {
			best_dist = new_distance;
			best_slot = slot;
			best_rdev = rdev;
		}
	}
	if (slot >= conf->copies) {
		slot = best_slot;
		rdev = best_rdev;
	}

	if (slot >= 0) {
		atomic_inc(&rdev->nr_pending);
		r10_bio->read_slot = slot;
	} else
		rdev = NULL;
	rcu_read_unlock();
	*max_sectors = best_good_sectors;

	return rdev;
}

static int raid10_congested(struct mddev *mddev, int bits)
{
	struct r10conf *conf = mddev->private;
	int i, ret = 0;

	if ((bits & (1 << BDI_async_congested)) &&
	    conf->pending_count >= max_queued_requests)
		return 1;

	rcu_read_lock();
	for (i = 0;
	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
		     && ret == 0;
	     i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags)) {
			struct request_queue *q = bdev_get_queue(rdev->bdev);

			ret |= bdi_congested(&q->backing_dev_info, bits);
		}
	}
	rcu_read_unlock();
	return ret;
}

static void flush_pending_writes(struct r10conf *conf)
{
	/* Any writes that have been queued but are awaiting
	 * bitmap updates get flushed here.
	 */
	spin_lock_irq(&conf->device_lock);

	if (conf->pending_bio_list.head) {
		struct bio *bio;
		bio = bio_list_get(&conf->pending_bio_list);
		conf->pending_count = 0;
		spin_unlock_irq(&conf->device_lock);
		/* flush any pending bitmap writes to disk
		 * before proceeding w/ I/O */
		bitmap_unplug(conf->mddev->bitmap);
		wake_up(&conf->wait_barrier);

		while (bio) { /* submit pending writes */
			struct bio *next = bio->bi_next;
			bio->bi_next = NULL;
			if (unlikely((bio->bi_rw & REQ_DISCARD) &&
			    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
				/* Just ignore it */
				bio_endio(bio, 0);
			else
				generic_make_request(bio);
			bio = next;
		}
	} else
		spin_unlock_irq(&conf->device_lock);
}
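
/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 *
 * So: regular IO calls wait_barrier() and, once its IO completes,
 * allow_barrier().  Background (resync/recovery) IO calls
 * raise_barrier() before starting and lower_barrier() when done.
 * freeze_array()/unfreeze_array() additionally wait for all pending
 * IO to either complete or be queued for the raid10d thread.
 */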
static void raise_barrier(struct r10conf *conf, int force)
{
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock);

	/* block any new IO from starting */
	conf->barrier++;

	/* Now wait for all pending IO to complete */
	wait_event_lock_irq(conf->wait_barrier,
			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock);

	spin_unlock_irq(&conf->resync_lock);
}

static void lower_barrier(struct r10conf *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->barrier--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

static void wait_barrier(struct r10conf *conf)
{
	spin_lock_irq(&conf->resync_lock);
	if (conf->barrier) {
		conf->nr_waiting++;
		/* Wait for the barrier to drop.
		 * However if there are already pending
		 * requests (preventing the barrier from
		 * rising completely), and the
		 * pre-process bio queue isn't empty,
		 * then don't wait, as we need to empty
		 * that queue to get the nr_pending
		 * count down.
		 */
		wait_event_lock_irq(conf->wait_barrier,
				    !conf->barrier ||
				    (atomic_read(&conf->nr_pending) &&
				     current->bio_list &&
				     (!bio_list_empty(&current->bio_list[0]) ||
				      !bio_list_empty(&current->bio_list[1]))),
				    conf->resync_lock);
		conf->nr_waiting--;
		if (!conf->nr_waiting)
			wake_up(&conf->wait_barrier);
	}
	atomic_inc(&conf->nr_pending);
	spin_unlock_irq(&conf->resync_lock);
}

static void allow_barrier(struct r10conf *conf)
{
	if ((atomic_dec_and_test(&conf->nr_pending)) ||
	    (conf->array_freeze_pending))
		wake_up(&conf->wait_barrier);
}

static void freeze_array(struct r10conf *conf, int extra)
{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending matches nr_queued+extra.
	 * This is called in the context of one normal IO request
	 * that has failed.  Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
	spin_lock_irq(&conf->resync_lock);
	conf->array_freeze_pending++;
	conf->barrier++;
	conf->nr_waiting++;
	wait_event_lock_irq_cmd(conf->wait_barrier,
				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
				conf->resync_lock,
				flush_pending_writes(conf));

	conf->array_freeze_pending--;
	spin_unlock_irq(&conf->resync_lock);
}

static void unfreeze_array(struct r10conf *conf)
{
	/* reverse the effect of the freeze */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier--;
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	spin_unlock_irq(&conf->resync_lock);
}

static sector_t choose_data_offset(struct r10bio *r10_bio,
				   struct md_rdev *rdev)
{
	if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
	    test_bit(R10BIO_Previous, &r10_bio->state))
		return rdev->data_offset;
	else
		return rdev->new_data_offset;
}

struct raid10_plug_cb {
	struct blk_plug_cb	cb;
	struct bio_list		pending;
	int			pending_cnt;
};

static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
						   cb);
	struct mddev *mddev = plug->cb.data;
	struct r10conf *conf = mddev->private;
	struct bio *bio;

	if (from_schedule || current->bio_list) {
		spin_lock_irq(&conf->device_lock);
		bio_list_merge(&conf->pending_bio_list, &plug->pending);
		conf->pending_count += plug->pending_cnt;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_barrier);
		md_wakeup_thread(mddev->thread);
		kfree(plug);
		return;
	}

	/* we aren't scheduling, so we can do the write-out directly. */
	bio = bio_list_get(&plug->pending);
	bitmap_unplug(mddev->bitmap);
	wake_up(&conf->wait_barrier);

	while (bio) { /* submit pending writes */
		struct bio *next = bio->bi_next;
		bio->bi_next = NULL;
		if (unlikely((bio->bi_rw & REQ_DISCARD) &&
		    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
			/* Just ignore it */
			bio_endio(bio, 0);
		else
			generic_make_request(bio);
		bio = next;
	}
	kfree(plug);
}

1165static bool raid10_make_request(struct mddev *mddev, struct bio * bio)
1166{
1167 struct r10conf *conf = mddev->private;
1168 struct r10bio *r10_bio;
1169 struct bio *read_bio;
1170 int i;
1171 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1172 int chunk_sects = chunk_mask + 1;
1173 const int rw = bio_data_dir(bio);
1174 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1175 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1176 const unsigned long do_discard = (bio->bi_rw
1177 & (REQ_DISCARD | REQ_SECURE));
1178 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1179 unsigned long flags;
1180 struct md_rdev *blocked_rdev;
1181 struct blk_plug_cb *cb;
1182 struct raid10_plug_cb *plug = NULL;
1183 int sectors_handled;
1184 int max_sectors;
1185 int sectors;
1186
1187 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1188 md_flush_request(mddev, bio);
1189 return true;
1190 }
1191
1192
1193
1194
1195 if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
1196 > chunk_sects
1197 && (conf->geo.near_copies < conf->geo.raid_disks
1198 || conf->prev.near_copies < conf->prev.raid_disks))) {
1199 struct bio_pair *bp;
1200
1201 if (bio_segments(bio) > 1)
1202 goto bad_map;
1203
1204
1205
1206 bp = bio_split(bio,
1207 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217 spin_lock_irq(&conf->resync_lock);
1218 conf->nr_waiting++;
1219 spin_unlock_irq(&conf->resync_lock);
1220
1221 raid10_make_request(mddev, &bp->bio1);
1222 raid10_make_request(mddev, &bp->bio2);
1223
1224 spin_lock_irq(&conf->resync_lock);
1225 conf->nr_waiting--;
1226 wake_up(&conf->wait_barrier);
1227 spin_unlock_irq(&conf->resync_lock);
1228
1229 bio_pair_release(bp);
1230 return true;
1231 bad_map:
1232 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1233 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1234 (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
1235
1236 bio_io_error(bio);
1237 return true;
1238 }
1239
1240 md_write_start(mddev, bio);
1241
1242
1243
1244
1245
1246
1247 wait_barrier(conf);
1248
1249 sectors = bio_sectors(bio);
1250 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1251 bio->bi_sector < conf->reshape_progress &&
1252 bio->bi_sector + sectors > conf->reshape_progress) {
1253
1254
1255
1256 allow_barrier(conf);
1257 wait_event(conf->wait_barrier,
1258 conf->reshape_progress <= bio->bi_sector ||
1259 conf->reshape_progress >= bio->bi_sector + sectors);
1260 wait_barrier(conf);
1261 }
1262 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1263 bio_data_dir(bio) == WRITE &&
1264 (mddev->reshape_backwards
1265 ? (bio->bi_sector < conf->reshape_safe &&
1266 bio->bi_sector + sectors > conf->reshape_progress)
1267 : (bio->bi_sector + sectors > conf->reshape_safe &&
1268 bio->bi_sector < conf->reshape_progress))) {
1270
1271 mddev->reshape_position = conf->reshape_progress;
1272 set_mask_bits(&mddev->sb_flags, 0,
1273 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1274 md_wakeup_thread(mddev->thread);
1275 wait_event(mddev->sb_wait,
1276 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1277
1278 conf->reshape_safe = mddev->reshape_position;
1279 }
1280
1281 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1282
1283 r10_bio->master_bio = bio;
1284 r10_bio->sectors = sectors;
1285
1286 r10_bio->mddev = mddev;
1287 r10_bio->sector = bio->bi_sector;
1288 r10_bio->state = 0;
1289
1290
1291
1292
1293
1294
1295
1296
1297 bio->bi_phys_segments = 0;
1298 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1299
1300 if (rw == READ) {
1301
1302
1303
1304 struct md_rdev *rdev;
1305 int slot;
1306
1307read_again:
1308 rdev = read_balance(conf, r10_bio, &max_sectors);
1309 if (!rdev) {
1310 raid_end_bio_io(r10_bio);
1311 return true;
1312 }
1313 slot = r10_bio->read_slot;
1314
1315 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1316 bio_trim(read_bio, r10_bio->sector - bio->bi_sector,
1317 max_sectors);
1318
1319 r10_bio->devs[slot].bio = read_bio;
1320 r10_bio->devs[slot].rdev = rdev;
1321
1322 read_bio->bi_sector = r10_bio->devs[slot].addr +
1323 choose_data_offset(r10_bio, rdev);
1324 read_bio->bi_bdev = rdev->bdev;
1325 read_bio->bi_end_io = raid10_end_read_request;
1326 read_bio->bi_rw = READ | do_sync;
1327 if (test_bit(FailFast, &rdev->flags) &&
1328 test_bit(R10BIO_FailFast, &r10_bio->state))
1329 read_bio->bi_rw |= MD_FAILFAST;
1330 read_bio->bi_private = r10_bio;
1331
1332 if (max_sectors < r10_bio->sectors) {
1333
1334
1335
1336 sectors_handled = (r10_bio->sector + max_sectors
1337 - bio->bi_sector);
1338 r10_bio->sectors = max_sectors;
1339 spin_lock_irq(&conf->device_lock);
1340 if (bio->bi_phys_segments == 0)
1341 bio->bi_phys_segments = 2;
1342 else
1343 bio->bi_phys_segments++;
1344 spin_unlock_irq(&conf->device_lock);
1345
1346
1347
1348
1349
1350 reschedule_retry(r10_bio);
1351
1352 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1353
1354 r10_bio->master_bio = bio;
1355 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1356 r10_bio->state = 0;
1357 r10_bio->mddev = mddev;
1358 r10_bio->sector = bio->bi_sector + sectors_handled;
1359 goto read_again;
1360 } else
1361 generic_make_request(read_bio);
1362 return true;
1363 }
1364
1365
1366
1367
1368 if (conf->pending_count >= max_queued_requests) {
1369 md_wakeup_thread(mddev->thread);
1370 wait_event(conf->wait_barrier,
1371 conf->pending_count < max_queued_requests);
1372 }
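
	/* WRITE:
	 * First select the target devices under rcu_lock and increment
	 * nr_pending for each.  If any device has a bad block in the
	 * range, the request is shortened (max_sectors) or that copy is
	 * skipped; if any device is Blocked we drop the references and
	 * the barrier and retry once it is unblocked.  Only then are the
	 * per-device bios cloned and queued.
	 */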
1385 r10_bio->read_slot = -1;
1386 raid10_find_phys(conf, r10_bio);
1387retry_write:
1388 blocked_rdev = NULL;
1389 rcu_read_lock();
1390 max_sectors = r10_bio->sectors;
1391
1392 for (i = 0; i < conf->copies; i++) {
1393 int d = r10_bio->devs[i].devnum;
1394 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1395 struct md_rdev *rrdev = rcu_dereference(
1396 conf->mirrors[d].replacement);
1397 if (rdev == rrdev)
1398 rrdev = NULL;
1399 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1400 atomic_inc(&rdev->nr_pending);
1401 blocked_rdev = rdev;
1402 break;
1403 }
1404 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1405 atomic_inc(&rrdev->nr_pending);
1406 blocked_rdev = rrdev;
1407 break;
1408 }
1409 if (rdev && (test_bit(Faulty, &rdev->flags)
1410 || test_bit(Unmerged, &rdev->flags)))
1411 rdev = NULL;
1412 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1413 || test_bit(Unmerged, &rrdev->flags)))
1414 rrdev = NULL;
1415
1416 r10_bio->devs[i].bio = NULL;
1417 r10_bio->devs[i].repl_bio = NULL;
1418
1419 if (!rdev && !rrdev) {
1420 set_bit(R10BIO_Degraded, &r10_bio->state);
1421 continue;
1422 }
1423 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1424 sector_t first_bad;
1425 sector_t dev_sector = r10_bio->devs[i].addr;
1426 int bad_sectors;
1427 int is_bad;
1428
1429 is_bad = is_badblock(rdev, dev_sector,
1430 max_sectors,
1431 &first_bad, &bad_sectors);
1432 if (is_bad < 0) {
1433
1434
1435
1436 atomic_inc(&rdev->nr_pending);
1437 set_bit(BlockedBadBlocks, &rdev->flags);
1438 blocked_rdev = rdev;
1439 break;
1440 }
1441 if (is_bad && first_bad <= dev_sector) {
1442
1443 bad_sectors -= (dev_sector - first_bad);
1444 if (bad_sectors < max_sectors)
1445
1446
1447
1448 max_sectors = bad_sectors;
1449
1450
1451
1452
1453
1454
1455
1456
1457 continue;
1458 }
1459 if (is_bad) {
1460 int good_sectors = first_bad - dev_sector;
1461 if (good_sectors < max_sectors)
1462 max_sectors = good_sectors;
1463 }
1464 }
1465 if (rdev) {
1466 r10_bio->devs[i].bio = bio;
1467 atomic_inc(&rdev->nr_pending);
1468 }
1469 if (rrdev) {
1470 r10_bio->devs[i].repl_bio = bio;
1471 atomic_inc(&rrdev->nr_pending);
1472 }
1473 }
1474 rcu_read_unlock();
1475
1476 if (unlikely(blocked_rdev)) {
1477
1478 int j;
1479 int d;
1480
1481 for (j = 0; j < i; j++) {
1482 if (r10_bio->devs[j].bio) {
1483 d = r10_bio->devs[j].devnum;
1484 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1485 }
1486 if (r10_bio->devs[j].repl_bio) {
1487 struct md_rdev *rdev;
1488 d = r10_bio->devs[j].devnum;
1489 rdev = conf->mirrors[d].replacement;
1490 if (!rdev) {
1491
1492 smp_mb();
1493 rdev = conf->mirrors[d].rdev;
1494 }
1495 rdev_dec_pending(rdev, mddev);
1496 }
1497 }
1498 allow_barrier(conf);
1499 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1500 wait_barrier(conf);
1501 goto retry_write;
1502 }
1503
1504 if (max_sectors < r10_bio->sectors)
1505 r10_bio->sectors = max_sectors;
1506 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1507
1508 atomic_set(&r10_bio->remaining, 1);
1509 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1510
1511 for (i = 0; i < conf->copies; i++) {
1512 struct bio *mbio;
1513 int d = r10_bio->devs[i].devnum;
1514 if (r10_bio->devs[i].bio) {
1515 struct md_rdev *rdev = conf->mirrors[d].rdev;
1516 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1517 bio_trim(mbio, r10_bio->sector - bio->bi_sector,
1518 max_sectors);
1519 r10_bio->devs[i].bio = mbio;
1520
1521 mbio->bi_sector = (r10_bio->devs[i].addr+
1522 choose_data_offset(r10_bio,
1523 rdev));
1524 mbio->bi_bdev = rdev->bdev;
1525 mbio->bi_end_io = raid10_end_write_request;
1526 mbio->bi_rw =
1527 WRITE | do_sync | do_fua | do_discard | do_same;
1528 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) &&
1529 enough(conf, d))
1530 mbio->bi_rw |= MD_FAILFAST;
1531 mbio->bi_private = r10_bio;
1532
1533 atomic_inc(&r10_bio->remaining);
1534
1535 cb = blk_check_plugged(raid10_unplug, mddev,
1536 sizeof(*plug));
1537 if (cb)
1538 plug = container_of(cb, struct raid10_plug_cb,
1539 cb);
1540 else
1541 plug = NULL;
1542 spin_lock_irqsave(&conf->device_lock, flags);
1543 if (plug) {
1544 bio_list_add(&plug->pending, mbio);
1545 plug->pending_cnt++;
1546 } else {
1547 bio_list_add(&conf->pending_bio_list, mbio);
1548 conf->pending_count++;
1549 }
1550 spin_unlock_irqrestore(&conf->device_lock, flags);
1551 if (!plug)
1552 md_wakeup_thread(mddev->thread);
1553 }
1554
1555 if (r10_bio->devs[i].repl_bio) {
1556 struct md_rdev *rdev = conf->mirrors[d].replacement;
1557 if (rdev == NULL) {
1558
1559 smp_mb();
1560 rdev = conf->mirrors[d].rdev;
1561 }
1562 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1563 bio_trim(mbio, r10_bio->sector - bio->bi_sector,
1564 max_sectors);
1565 r10_bio->devs[i].repl_bio = mbio;
1566
1567 mbio->bi_sector = (r10_bio->devs[i].addr +
1568 choose_data_offset(
1569 r10_bio, rdev));
1570 mbio->bi_bdev = rdev->bdev;
1571 mbio->bi_end_io = raid10_end_write_request;
1572 mbio->bi_rw =
1573 WRITE | do_sync | do_fua | do_discard | do_same;
1574 mbio->bi_private = r10_bio;
1575
1576 atomic_inc(&r10_bio->remaining);
1577 spin_lock_irqsave(&conf->device_lock, flags);
1578 bio_list_add(&conf->pending_bio_list, mbio);
1579 conf->pending_count++;
1580 spin_unlock_irqrestore(&conf->device_lock, flags);
1581 if (!mddev_check_plugged(mddev))
1582 md_wakeup_thread(mddev->thread);
1583 }
1584 }
1585
1586
1587
1588
1589
1590 if (sectors_handled < bio_sectors(bio)) {
1591
1592
1593
1594 spin_lock_irq(&conf->device_lock);
1595 if (bio->bi_phys_segments == 0)
1596 bio->bi_phys_segments = 2;
1597 else
1598 bio->bi_phys_segments++;
1599 spin_unlock_irq(&conf->device_lock);
1600 one_write_done(r10_bio);
1601 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1602
1603 r10_bio->master_bio = bio;
1604 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1605
1606 r10_bio->mddev = mddev;
1607 r10_bio->sector = bio->bi_sector + sectors_handled;
1608 r10_bio->state = 0;
1609 goto retry_write;
1610 }
1611 one_write_done(r10_bio);
1612
1613
1614 wake_up(&conf->wait_barrier);
1615 return true;
1616}
1617
1618static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1619{
1620 struct r10conf *conf = mddev->private;
1621 int i;
1622
1623 if (conf->geo.near_copies < conf->geo.raid_disks)
1624 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1625 if (conf->geo.near_copies > 1)
1626 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1627 if (conf->geo.far_copies > 1) {
1628 if (conf->geo.far_offset)
1629 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1630 else
1631 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1632 if (conf->geo.far_set_size != conf->geo.raid_disks)
1633 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1634 }
1635 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1636 conf->geo.raid_disks - mddev->degraded);
1637 rcu_read_lock();
1638 for (i = 0; i < conf->geo.raid_disks; i++) {
1639 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1640 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1641 }
1642 rcu_read_unlock();
1643 seq_printf(seq, "]");
1644}
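
/*
 * _enough() reports whether enough working devices remain to keep the
 * array usable: every group of 'copies' adjacent slots must still contain
 * at least one In_sync device (optionally pretending that device
 * 'ignore' has failed).  'previous' selects the pre-reshape geometry.
 */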
1651static int _enough(struct r10conf *conf, int previous, int ignore)
1652{
1653 int first = 0;
1654 int has_enough = 0;
1655 int disks, ncopies;
1656 if (previous) {
1657 disks = conf->prev.raid_disks;
1658 ncopies = conf->prev.near_copies;
1659 } else {
1660 disks = conf->geo.raid_disks;
1661 ncopies = conf->geo.near_copies;
1662 }
1663
1664 rcu_read_lock();
1665 do {
1666 int n = conf->copies;
1667 int cnt = 0;
1668 int this = first;
1669 while (n--) {
1670 struct md_rdev *rdev;
1671 if (this != ignore &&
1672 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1673 test_bit(In_sync, &rdev->flags))
1674 cnt++;
1675 this = (this+1) % disks;
1676 }
1677 if (cnt == 0)
1678 goto out;
1679 first = (first + ncopies) % disks;
1680 } while (first != 0);
1681 has_enough = 1;
1682out:
1683 rcu_read_unlock();
1684 return has_enough;
1685}
1686
1687static int enough(struct r10conf *conf, int ignore)
1688{
1689
1690
1691
1692
1693
1694 return _enough(conf, 0, ignore) &&
1695 _enough(conf, 1, ignore);
1696}
1697
1698static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1699{
1700 char b[BDEVNAME_SIZE];
1701 struct r10conf *conf = mddev->private;
1702 unsigned long flags;
1703
1704
1705
1706
1707
1708
1709
1710 spin_lock_irqsave(&conf->device_lock, flags);
1711 if (test_bit(In_sync, &rdev->flags)
1712 && !enough(conf, rdev->raid_disk)) {
1713
1714
1715
1716 spin_unlock_irqrestore(&conf->device_lock, flags);
1717 return;
1718 }
1719 if (test_and_clear_bit(In_sync, &rdev->flags))
1720 mddev->degraded++;
1721
1722
1723
1724 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1725 set_bit(Blocked, &rdev->flags);
1726 set_bit(Faulty, &rdev->flags);
1727 set_mask_bits(&mddev->sb_flags, 0,
1728 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1729 spin_unlock_irqrestore(&conf->device_lock, flags);
1730 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1731 "md/raid10:%s: Operation continuing on %d devices.\n",
1732 mdname(mddev), bdevname(rdev->bdev, b),
1733 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1734}
1735
1736static void print_conf(struct r10conf *conf)
1737{
1738 int i;
1739 struct md_rdev *rdev;
1740
1741 pr_debug("RAID10 conf printout:\n");
1742 if (!conf) {
1743 pr_debug("(!conf)\n");
1744 return;
1745 }
1746 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1747 conf->geo.raid_disks);
1748
1749
1750
1751 for (i = 0; i < conf->geo.raid_disks; i++) {
1752 char b[BDEVNAME_SIZE];
1753 rdev = conf->mirrors[i].rdev;
1754 if (rdev)
1755 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1756 i, !test_bit(In_sync, &rdev->flags),
1757 !test_bit(Faulty, &rdev->flags),
1758 bdevname(rdev->bdev,b));
1759 }
1760}
1761
1762static void close_sync(struct r10conf *conf)
1763{
1764 wait_barrier(conf);
1765 allow_barrier(conf);
1766
1767 mempool_destroy(conf->r10buf_pool);
1768 conf->r10buf_pool = NULL;
1769}
1770
1771static int raid10_spare_active(struct mddev *mddev)
1772{
1773 int i;
1774 struct r10conf *conf = mddev->private;
1775 struct raid10_info *tmp;
1776 int count = 0;
1777 unsigned long flags;
1778
1779
1780
1781
1782
1783 for (i = 0; i < conf->geo.raid_disks; i++) {
1784 tmp = conf->mirrors + i;
1785 if (tmp->replacement
1786 && tmp->replacement->recovery_offset == MaxSector
1787 && !test_bit(Faulty, &tmp->replacement->flags)
1788 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1789
1790 if (!tmp->rdev
1791 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1792 count++;
1793 if (tmp->rdev) {
1794
1795
1796
1797
1798 set_bit(Faulty, &tmp->rdev->flags);
1799 sysfs_notify_dirent_safe(
1800 tmp->rdev->sysfs_state);
1801 }
1802 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1803 } else if (tmp->rdev
1804 && tmp->rdev->recovery_offset == MaxSector
1805 && !test_bit(Faulty, &tmp->rdev->flags)
1806 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1807 count++;
1808 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1809 }
1810 }
1811 spin_lock_irqsave(&conf->device_lock, flags);
1812 mddev->degraded -= count;
1813 spin_unlock_irqrestore(&conf->device_lock, flags);
1814
1815 print_conf(conf);
1816 return count;
1817}
1818
1819static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1820{
1821 struct r10conf *conf = mddev->private;
1822 int err = -EEXIST;
1823 int mirror;
1824 int first = 0;
1825 int last = conf->geo.raid_disks - 1;
1826 struct request_queue *q = bdev_get_queue(rdev->bdev);
1827
1828 if (mddev->recovery_cp < MaxSector)
1829
1830
1831
1832 return -EBUSY;
1833 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1834 return -EINVAL;
1835
1836 if (md_integrity_add_rdev(rdev, mddev))
1837 return -ENXIO;
1838
1839 if (rdev->raid_disk >= 0)
1840 first = last = rdev->raid_disk;
1841
1842 if (q->merge_bvec_fn) {
1843 set_bit(Unmerged, &rdev->flags);
1844 mddev->merge_check_needed = 1;
1845 }
1846
1847 if (rdev->saved_raid_disk >= first &&
1848 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1849 mirror = rdev->saved_raid_disk;
1850 else
1851 mirror = first;
1852 for ( ; mirror <= last ; mirror++) {
1853 struct raid10_info *p = &conf->mirrors[mirror];
1854 if (p->recovery_disabled == mddev->recovery_disabled)
1855 continue;
1856 if (p->rdev) {
1857 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1858 p->replacement != NULL)
1859 continue;
1860 clear_bit(In_sync, &rdev->flags);
1861 set_bit(Replacement, &rdev->flags);
1862 rdev->raid_disk = mirror;
1863 err = 0;
1864 if (mddev->gendisk)
1865 disk_stack_limits(mddev->gendisk, rdev->bdev,
1866 rdev->data_offset << 9);
1867 conf->fullsync = 1;
1868 rcu_assign_pointer(p->replacement, rdev);
1869 break;
1870 }
1871
1872 if (mddev->gendisk)
1873 disk_stack_limits(mddev->gendisk, rdev->bdev,
1874 rdev->data_offset << 9);
1875
1876 p->head_position = 0;
1877 p->recovery_disabled = mddev->recovery_disabled - 1;
1878 rdev->raid_disk = mirror;
1879 err = 0;
1880 if (rdev->saved_raid_disk != mirror)
1881 conf->fullsync = 1;
1882 rcu_assign_pointer(p->rdev, rdev);
1883 break;
1884 }
1885 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1886
1887
1888
1889
1890
1891
1892
1893 synchronize_sched();
1894 freeze_array(conf, 0);
1895 unfreeze_array(conf);
1896 clear_bit(Unmerged, &rdev->flags);
1897 }
1898 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1899 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1900
1901 print_conf(conf);
1902 return err;
1903}
1904
1905static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1906{
1907 struct r10conf *conf = mddev->private;
1908 int err = 0;
1909 int number = rdev->raid_disk;
1910 struct md_rdev **rdevp;
1911 struct raid10_info *p = conf->mirrors + number;
1912
1913 print_conf(conf);
1914 if (rdev == p->rdev)
1915 rdevp = &p->rdev;
1916 else if (rdev == p->replacement)
1917 rdevp = &p->replacement;
1918 else
1919 return 0;
1920
1921 if (test_bit(In_sync, &rdev->flags) ||
1922 atomic_read(&rdev->nr_pending)) {
1923 err = -EBUSY;
1924 goto abort;
1925 }
1926
1927
1928
1929 if (!test_bit(Faulty, &rdev->flags) &&
1930 mddev->recovery_disabled != p->recovery_disabled &&
1931 (!p->replacement || p->replacement == rdev) &&
1932 number < conf->geo.raid_disks &&
1933 enough(conf, -1)) {
1934 err = -EBUSY;
1935 goto abort;
1936 }
1937 *rdevp = NULL;
1938 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1939 synchronize_rcu();
1940 if (atomic_read(&rdev->nr_pending)) {
1941
1942 err = -EBUSY;
1943 *rdevp = rdev;
1944 goto abort;
1945 }
1946 }
1947 if (p->replacement) {
1948
1949 p->rdev = p->replacement;
1950 clear_bit(Replacement, &p->replacement->flags);
1951 smp_mb();
1952
1953
1954 p->replacement = NULL;
1955 clear_bit(WantReplacement, &rdev->flags);
1956 } else
1957
1958
1959
1960 clear_bit(WantReplacement, &rdev->flags);
1961
1962 err = md_integrity_register(mddev);
1963
1964abort:
1965
1966 print_conf(conf);
1967 return err;
1968}
1969
1970static void end_sync_read(struct bio *bio, int error)
1971{
1972 struct r10bio *r10_bio = bio->bi_private;
1973 struct r10conf *conf = r10_bio->mddev->private;
1974 int d;
1975
1976 if (bio == r10_bio->master_bio) {
1977
1978 d = r10_bio->read_slot;
1979 } else
1980 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1981
1982 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1983 set_bit(R10BIO_Uptodate, &r10_bio->state);
1984 else
1985
1986
1987
1988 atomic_add(r10_bio->sectors,
1989 &conf->mirrors[d].rdev->corrected_errors);
1990
1991
1992
1993
1994 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1995 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1996 atomic_dec_and_test(&r10_bio->remaining)) {
1997
1998
1999
2000 reschedule_retry(r10_bio);
2001 }
2002}
2003
2004static void end_sync_request(struct r10bio *r10_bio)
2005{
2006 struct mddev *mddev = r10_bio->mddev;
2007
2008 while (atomic_dec_and_test(&r10_bio->remaining)) {
2009 if (r10_bio->master_bio == NULL) {
2010
2011 sector_t s = r10_bio->sectors;
2012 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2013 test_bit(R10BIO_WriteError, &r10_bio->state))
2014 reschedule_retry(r10_bio);
2015 else
2016 put_buf(r10_bio);
2017 md_done_sync(mddev, s, 1);
2018 break;
2019 } else {
2020 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
2021 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2022 test_bit(R10BIO_WriteError, &r10_bio->state))
2023 reschedule_retry(r10_bio);
2024 else
2025 put_buf(r10_bio);
2026 r10_bio = r10_bio2;
2027 }
2028 }
2029}
2030
2031static void end_sync_write(struct bio *bio, int error)
2032{
2033 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2034 struct r10bio *r10_bio = bio->bi_private;
2035 struct mddev *mddev = r10_bio->mddev;
2036 struct r10conf *conf = mddev->private;
2037 int d;
2038 sector_t first_bad;
2039 int bad_sectors;
2040 int slot;
2041 int repl;
2042 struct md_rdev *rdev = NULL;
2043
2044 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
2045 if (repl)
2046 rdev = conf->mirrors[d].replacement;
2047 else
2048 rdev = conf->mirrors[d].rdev;
2049
2050 if (!uptodate) {
2051 if (repl)
2052 md_error(mddev, rdev);
2053 else {
2054 set_bit(WriteErrorSeen, &rdev->flags);
2055 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2056 set_bit(MD_RECOVERY_NEEDED,
2057 &rdev->mddev->recovery);
2058 set_bit(R10BIO_WriteError, &r10_bio->state);
2059 }
2060 } else if (is_badblock(rdev,
2061 r10_bio->devs[slot].addr,
2062 r10_bio->sectors,
2063 &first_bad, &bad_sectors))
2064 set_bit(R10BIO_MadeGood, &r10_bio->state);
2065
2066 rdev_dec_pending(rdev, mddev);
2067
2068 end_sync_request(r10_bio);
2069}
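
/*
 * sync_request_write() runs once resync reads for all copies have
 * completed: each good copy is compared against the first good one and,
 * unless this is only a check pass, any copy that differs or failed to
 * read is rewritten from that first copy.  Outstanding replacement
 * devices are always written.
 */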
2087static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2088{
2089 struct r10conf *conf = mddev->private;
2090 int i, first;
2091 struct bio *tbio, *fbio;
2092 int vcnt;
2093
2094 atomic_set(&r10_bio->remaining, 1);
2095
2096
2097 for (i=0; i<conf->copies; i++)
2098 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
2099 break;
2100
2101 if (i == conf->copies)
2102 goto done;
2103
2104 first = i;
2105 fbio = r10_bio->devs[i].bio;
2106 fbio->bi_size = r10_bio->sectors << 9;
2107 fbio->bi_idx = 0;
2108
2109 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2110
2111 for (i=0 ; i < conf->copies ; i++) {
2112 int j, d;
2113 struct md_rdev *rdev;
2114
2115 tbio = r10_bio->devs[i].bio;
2116
2117 if (tbio->bi_end_io != end_sync_read)
2118 continue;
2119 if (i == first)
2120 continue;
2121 d = r10_bio->devs[i].devnum;
2122 rdev = conf->mirrors[d].rdev;
2123 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
2124
2125
2126
2127
2128 int sectors = r10_bio->sectors;
2129 for (j = 0; j < vcnt; j++) {
2130 int len = PAGE_SIZE;
2131 if (sectors < (len / 512))
2132 len = sectors * 512;
2133 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2134 page_address(tbio->bi_io_vec[j].bv_page),
2135 len))
2136 break;
2137 sectors -= len/512;
2138 }
2139 if (j == vcnt)
2140 continue;
2141 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2142 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2143
2144 continue;
2145 } else if (test_bit(FailFast, &rdev->flags)) {
2146
2147 md_error(rdev->mddev, rdev);
2148 continue;
2149 }
2150
2151
2152
2153
2154
2155 bio_reset(tbio);
2156
2157 tbio->bi_vcnt = vcnt;
2158 tbio->bi_size = fbio->bi_size;
2159 tbio->bi_rw = WRITE;
2160 tbio->bi_private = r10_bio;
2161 tbio->bi_sector = r10_bio->devs[i].addr;
2162
2163 for (j=0; j < vcnt ; j++) {
2164 tbio->bi_io_vec[j].bv_offset = 0;
2165 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
2166
2167 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2168 page_address(fbio->bi_io_vec[j].bv_page),
2169 PAGE_SIZE);
2170 }
2171 tbio->bi_end_io = end_sync_write;
2172
2173 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2174 atomic_inc(&r10_bio->remaining);
2175 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2176
2177 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2178 tbio->bi_rw |= MD_FAILFAST;
2179 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
2180 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2181 generic_make_request(tbio);
2182 }
2183
2184
2185
2186
2187 for (i = 0; i < conf->copies; i++) {
2188 int j, d;
2189
2190 tbio = r10_bio->devs[i].repl_bio;
2191 if (!tbio || !tbio->bi_end_io)
2192 continue;
2193 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2194 && r10_bio->devs[i].bio != fbio)
2195 for (j = 0; j < vcnt; j++)
2196 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2197 page_address(fbio->bi_io_vec[j].bv_page),
2198 PAGE_SIZE);
2199 d = r10_bio->devs[i].devnum;
2200 atomic_inc(&r10_bio->remaining);
2201 md_sync_acct(conf->mirrors[d].replacement->bdev,
2202 bio_sectors(tbio));
2203 generic_make_request(tbio);
2204 }
2205
2206done:
2207 if (atomic_dec_and_test(&r10_bio->remaining)) {
2208 md_done_sync(mddev, r10_bio->sectors, 1);
2209 put_buf(r10_bio);
2210 }
2211}
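
/*
 * A read error occurred during recovery.  fix_recovery_read_error()
 * retries the read from the working device in PAGE_SIZE chunks and
 * writes each chunk to the device being recovered; chunks that still
 * fail are recorded as bad blocks, and if that is not possible the
 * whole recovery is aborted.
 */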
2223static void fix_recovery_read_error(struct r10bio *r10_bio)
2224{
2225
2226
2227
2228
2229
2230
2231
2232 struct mddev *mddev = r10_bio->mddev;
2233 struct r10conf *conf = mddev->private;
2234 struct bio *bio = r10_bio->devs[0].bio;
2235 sector_t sect = 0;
2236 int sectors = r10_bio->sectors;
2237 int idx = 0;
2238 int dr = r10_bio->devs[0].devnum;
2239 int dw = r10_bio->devs[1].devnum;
2240
2241 while (sectors) {
2242 int s = sectors;
2243 struct md_rdev *rdev;
2244 sector_t addr;
2245 int ok;
2246
2247 if (s > (PAGE_SIZE>>9))
2248 s = PAGE_SIZE >> 9;
2249
2250 rdev = conf->mirrors[dr].rdev;
2251 addr = r10_bio->devs[0].addr + sect,
2252 ok = sync_page_io(rdev,
2253 addr,
2254 s << 9,
2255 bio->bi_io_vec[idx].bv_page,
2256 READ, false);
2257 if (ok) {
2258 rdev = conf->mirrors[dw].rdev;
2259 addr = r10_bio->devs[1].addr + sect;
2260 ok = sync_page_io(rdev,
2261 addr,
2262 s << 9,
2263 bio->bi_io_vec[idx].bv_page,
2264 WRITE, false);
2265 if (!ok) {
2266 set_bit(WriteErrorSeen, &rdev->flags);
2267 if (!test_and_set_bit(WantReplacement,
2268 &rdev->flags))
2269 set_bit(MD_RECOVERY_NEEDED,
2270 &rdev->mddev->recovery);
2271 }
2272 }
2273 if (!ok) {
2274
2275
2276
2277
2278 rdev_set_badblocks(rdev, addr, s, 0);
2279
2280 if (rdev != conf->mirrors[dw].rdev) {
2281
2282 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2283 addr = r10_bio->devs[1].addr + sect;
2284 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2285 if (!ok) {
2286
2287 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2288 mdname(mddev));
2289
2290 conf->mirrors[dw].recovery_disabled
2291 = mddev->recovery_disabled;
2292 set_bit(MD_RECOVERY_INTR,
2293 &mddev->recovery);
2294 break;
2295 }
2296 }
2297 }
2298
2299 sectors -= s;
2300 sect += s;
2301 idx++;
2302 }
2303}
2304
2305static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2306{
2307 struct r10conf *conf = mddev->private;
2308 int d;
2309 struct bio *wbio, *wbio2;
2310
2311 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2312 fix_recovery_read_error(r10_bio);
2313 end_sync_request(r10_bio);
2314 return;
2315 }
2316
2317
2318
2319
2320
2321 d = r10_bio->devs[1].devnum;
2322 wbio = r10_bio->devs[1].bio;
2323 wbio2 = r10_bio->devs[1].repl_bio;
2324
2325
2326
2327
2328 if (wbio2 && !wbio2->bi_end_io)
2329 wbio2 = NULL;
2330 if (wbio->bi_end_io) {
2331 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2332 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2333 generic_make_request(wbio);
2334 }
2335 if (wbio2) {
2336 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2337 md_sync_acct(conf->mirrors[d].replacement->bdev,
2338 bio_sectors(wbio2));
2339 generic_make_request(wbio2);
2340 }
2341}
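
/*
 * check_decay_read_errors() ages the per-device read error count:
 * the stored count is halved for every hour that has passed since the
 * last recorded read error, so that only a sustained error rate can
 * push a device over max_corr_read_errors.
 */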
2349static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2350{
2351 struct timespec cur_time_mon;
2352 unsigned long hours_since_last;
2353 unsigned int read_errors = atomic_read(&rdev->read_errors);
2354
2355 ktime_get_ts(&cur_time_mon);
2356
2357 if (rdev->last_read_error.tv_sec == 0 &&
2358 rdev->last_read_error.tv_nsec == 0) {
2359
2360 rdev->last_read_error = cur_time_mon;
2361 return;
2362 }
2363
2364 hours_since_last = (cur_time_mon.tv_sec -
2365 rdev->last_read_error.tv_sec) / 3600;
2366
2367 rdev->last_read_error = cur_time_mon;
2368
2369
2370
2371
2372
2373
2374 if (hours_since_last >= 8 * sizeof(read_errors))
2375 atomic_set(&rdev->read_errors, 0);
2376 else
2377 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2378}
2379
2380static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2381 int sectors, struct page *page, int rw)
2382{
2383 sector_t first_bad;
2384 int bad_sectors;
2385
2386 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2387 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2388 return -1;
2389 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2390
2391 return 1;
2392 if (rw == WRITE) {
2393 set_bit(WriteErrorSeen, &rdev->flags);
2394 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2395 set_bit(MD_RECOVERY_NEEDED,
2396 &rdev->mddev->recovery);
2397 }
2398
2399 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2400 md_error(rdev->mddev, rdev);
2401 return 0;
2402}
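
/*
 * fix_read_error() is called by raid10d for an r10bio that saw a read
 * error while the array is operational.  It re-reads the failed range,
 * in page-sized pieces, from some other mirror and then rewrites and
 * re-reads it on the remaining mirrors to repair the bad sectors,
 * failing or marking bad blocks on devices that cannot be corrected.
 */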
2412static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2413{
2414 int sect = 0;
2415 int sectors = r10_bio->sectors;
2416 struct md_rdev*rdev;
2417 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2418 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2419
2420
2421
2422
2423 rdev = conf->mirrors[d].rdev;
2424
2425 if (test_bit(Faulty, &rdev->flags))
2426
2427
2428 return;
2429
2430 check_decay_read_errors(mddev, rdev);
2431 atomic_inc(&rdev->read_errors);
2432 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2433 char b[BDEVNAME_SIZE];
2434 bdevname(rdev->bdev, b);
2435
2436 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2437 mdname(mddev), b,
2438 atomic_read(&rdev->read_errors), max_read_errors);
2439 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2440 mdname(mddev), b);
2441 md_error(mddev, rdev);
2442 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2443 return;
2444 }
2445
2446 while(sectors) {
2447 int s = sectors;
2448 int sl = r10_bio->read_slot;
2449 int success = 0;
2450 int start;
2451
2452 if (s > (PAGE_SIZE>>9))
2453 s = PAGE_SIZE >> 9;
2454
2455 rcu_read_lock();
2456 do {
2457 sector_t first_bad;
2458 int bad_sectors;
2459
2460 d = r10_bio->devs[sl].devnum;
2461 rdev = rcu_dereference(conf->mirrors[d].rdev);
2462 if (rdev &&
2463 !test_bit(Unmerged, &rdev->flags) &&
2464 test_bit(In_sync, &rdev->flags) &&
2465 !test_bit(Faulty, &rdev->flags) &&
2466 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2467 &first_bad, &bad_sectors) == 0) {
2468 atomic_inc(&rdev->nr_pending);
2469 rcu_read_unlock();
2470 success = sync_page_io(rdev,
2471 r10_bio->devs[sl].addr +
2472 sect,
2473 s<<9,
2474 conf->tmppage, READ, false);
2475 rdev_dec_pending(rdev, mddev);
2476 rcu_read_lock();
2477 if (success)
2478 break;
2479 }
2480 sl++;
2481 if (sl == conf->copies)
2482 sl = 0;
2483 } while (!success && sl != r10_bio->read_slot);
2484 rcu_read_unlock();
2485
2486 if (!success) {
 /* Cannot read this range from anywhere.  Mark the block as bad
  * on the original read device to discourage future reads, and
  * fail the device if even that is not possible.
  */
2491 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2492 rdev = conf->mirrors[dn].rdev;
2493
2494 if (!rdev_set_badblocks(
2495 rdev,
2496 r10_bio->devs[r10_bio->read_slot].addr
2497 + sect,
2498 s, 0)) {
2499 md_error(mddev, rdev);
2500 r10_bio->devs[r10_bio->read_slot].bio
2501 = IO_BLOCKED;
2502 }
2503 break;
2504 }
2505
2506 start = sl;
2507
2508 rcu_read_lock();
2509 while (sl != r10_bio->read_slot) {
2510 char b[BDEVNAME_SIZE];
2511
2512 if (sl==0)
2513 sl = conf->copies;
2514 sl--;
2515 d = r10_bio->devs[sl].devnum;
2516 rdev = rcu_dereference(conf->mirrors[d].rdev);
2517 if (!rdev ||
2518 test_bit(Unmerged, &rdev->flags) ||
2519 test_bit(Faulty, &rdev->flags) ||
2520 !test_bit(In_sync, &rdev->flags))
2521 continue;
2522
2523 atomic_inc(&rdev->nr_pending);
2524 rcu_read_unlock();
2525 if (r10_sync_page_io(rdev,
2526 r10_bio->devs[sl].addr +
2527 sect,
2528 s, conf->tmppage, WRITE)
2529 == 0) {
2530
2531 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2532 mdname(mddev), s,
2533 (unsigned long long)(
2534 sect +
2535 choose_data_offset(r10_bio,
2536 rdev)),
2537 bdevname(rdev->bdev, b));
2538 pr_notice("md/raid10:%s: %s: failing drive\n",
2539 mdname(mddev),
2540 bdevname(rdev->bdev, b));
2541 }
2542 rdev_dec_pending(rdev, mddev);
2543 rcu_read_lock();
2544 }
2545 sl = start;
2546 while (sl != r10_bio->read_slot) {
2547 char b[BDEVNAME_SIZE];
2548
2549 if (sl==0)
2550 sl = conf->copies;
2551 sl--;
2552 d = r10_bio->devs[sl].devnum;
2553 rdev = rcu_dereference(conf->mirrors[d].rdev);
2554 if (!rdev ||
2555 test_bit(Faulty, &rdev->flags) ||
2556 !test_bit(In_sync, &rdev->flags))
2557 continue;
2558
2559 atomic_inc(&rdev->nr_pending);
2560 rcu_read_unlock();
2561 switch (r10_sync_page_io(rdev,
2562 r10_bio->devs[sl].addr +
2563 sect,
2564 s, conf->tmppage,
2565 READ)) {
2566 case 0:
2567
2568 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2569 mdname(mddev), s,
2570 (unsigned long long)(
2571 sect +
2572 choose_data_offset(r10_bio, rdev)),
2573 bdevname(rdev->bdev, b));
2574 pr_notice("md/raid10:%s: %s: failing drive\n",
2575 mdname(mddev),
2576 bdevname(rdev->bdev, b));
2577 break;
2578 case 1:
2579 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2580 mdname(mddev), s,
2581 (unsigned long long)(
2582 sect +
2583 choose_data_offset(r10_bio, rdev)),
2584 bdevname(rdev->bdev, b));
2585 atomic_add(s, &rdev->corrected_errors);
2586 }
2587
2588 rdev_dec_pending(rdev, mddev);
2589 rcu_read_lock();
2590 }
2591 rcu_read_unlock();
2592
2593 sectors -= s;
2594 sect += s;
2595 }
2596}
2597
2598static int narrow_write_error(struct r10bio *r10_bio, int i)
2599{
2600 struct bio *bio = r10_bio->master_bio;
2601 struct mddev *mddev = r10_bio->mddev;
2602 struct r10conf *conf = mddev->private;
2603 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2604
 /* The master bio holds data that failed to write to slot 'i'.
  * Clone it repeatedly, trimming each clone down to one badblock-sized
  * block, and retry the writes one block at a time.  Each block that
  * still fails is recorded as a bad block.
  *
  * The bio may not be aligned to the badblock granularity, so the
  * first block written can be shorter than the rest.
  *
  * We currently own a reference to the rdev.
  */
2615 int block_sectors;
2616 sector_t sector;
2617 int sectors;
2618 int sect_to_write = r10_bio->sectors;
2619 int ok = 1;
2620
2621 if (rdev->badblocks.shift < 0)
2622 return 0;
2623
2624 block_sectors = roundup(1 << rdev->badblocks.shift,
2625 bdev_logical_block_size(rdev->bdev) >> 9);
2626 sector = r10_bio->sector;
2627 sectors = ((r10_bio->sector + block_sectors)
2628 & ~(sector_t)(block_sectors - 1))
2629 - sector;
2630
2631 while (sect_to_write) {
2632 struct bio *wbio;
2633 sector_t wsector;
2634 if (sectors > sect_to_write)
2635 sectors = sect_to_write;
2636
2637 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2638 bio_trim(wbio, sector - bio->bi_sector, sectors);
2639 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2640 wbio->bi_sector = wsector +
2641 choose_data_offset(r10_bio, rdev);
2642 wbio->bi_bdev = rdev->bdev;
2643 if (submit_bio_wait(WRITE, wbio) < 0)
 /* the write failed again: record a bad block instead */
2645 ok = rdev_set_badblocks(rdev, wsector,
2646 sectors, 0)
2647 && ok;
2648
2649 bio_put(wbio);
2650 sect_to_write -= sectors;
2651 sector += sectors;
2652 sectors = block_sectors;
2653 }
2654 return ok;
2655}
2656
2657static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2658{
2659 int slot = r10_bio->read_slot;
2660 struct bio *bio;
2661 struct r10conf *conf = mddev->private;
2662 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2663 char b[BDEVNAME_SIZE];
2664 unsigned long do_sync;
2665 int max_sectors;

 /* We got a read error.  Maybe the drive is failing, maybe just this
  * block is bad and we can fix it.
  *
  * Unless the array is read-only, we freeze all other I/O and try to
  * read the block from the other mirrors, writing a good copy back
  * over the bad one (see fix_read_error()).  FailFast devices are
  * simply failed instead.  The request is then retried, possibly on
  * a different mirror.
  */
2675 bio = r10_bio->devs[slot].bio;
2676 bdevname(bio->bi_bdev, b);
2677 bio_put(bio);
2678 r10_bio->devs[slot].bio = NULL;
2679
2680 if (mddev->ro)
2681 r10_bio->devs[slot].bio = IO_BLOCKED;
2682 else if (!test_bit(FailFast, &rdev->flags)) {
2683 freeze_array(conf, 1);
2684 fix_read_error(conf, mddev, r10_bio);
2685 unfreeze_array(conf);
2686 } else
2687 md_error(mddev, rdev);
2688
2689 rdev_dec_pending(rdev, mddev);
2690
2691read_more:
2692 rdev = read_balance(conf, r10_bio, &max_sectors);
2693 if (rdev == NULL) {
2694 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
2695 mdname(mddev), b,
2696 (unsigned long long)r10_bio->sector);
2697 raid_end_bio_io(r10_bio);
2698 return;
2699 }
2700
2701 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2702 slot = r10_bio->read_slot;
2703 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
2704 mdname(mddev),
2705 bdevname(rdev->bdev, b),
2706 (unsigned long long)r10_bio->sector);
2707 bio = bio_clone_mddev(r10_bio->master_bio,
2708 GFP_NOIO, mddev);
2709 bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors);
2710 r10_bio->devs[slot].bio = bio;
2711 r10_bio->devs[slot].rdev = rdev;
2712 bio->bi_sector = r10_bio->devs[slot].addr
2713 + choose_data_offset(r10_bio, rdev);
2714 bio->bi_bdev = rdev->bdev;
2715 bio->bi_rw = READ | do_sync;
2716 if (test_bit(FailFast, &rdev->flags) &&
2717 test_bit(R10BIO_FailFast, &r10_bio->state))
2718 bio->bi_rw |= MD_FAILFAST;
2719 bio->bi_private = r10_bio;
2720 bio->bi_end_io = raid10_end_read_request;
2721 if (max_sectors < r10_bio->sectors) {
 /* could not service the whole request from one device: split it */
2723 struct bio *mbio = r10_bio->master_bio;
2724 int sectors_handled =
2725 r10_bio->sector + max_sectors
2726 - mbio->bi_sector;
2727 r10_bio->sectors = max_sectors;
2728 spin_lock_irq(&conf->device_lock);
2729 if (mbio->bi_phys_segments == 0)
2730 mbio->bi_phys_segments = 2;
2731 else
2732 mbio->bi_phys_segments++;
2733 spin_unlock_irq(&conf->device_lock);
2734 generic_make_request(bio);
2735
2736 r10_bio = mempool_alloc(conf->r10bio_pool,
2737 GFP_NOIO);
2738 r10_bio->master_bio = mbio;
2739 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2740 r10_bio->state = 0;
2741 set_bit(R10BIO_ReadError,
2742 &r10_bio->state);
2743 r10_bio->mddev = mddev;
2744 r10_bio->sector = mbio->bi_sector
2745 + sectors_handled;
2746
2747 goto read_more;
2748 } else
2749 generic_make_request(bio);
2750}
2751
2752static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2753{
 /* Some kind of write request has finished, and it either succeeded
  * in writing where we thought there was a bad block (so the bad
  * block can be forgotten), or it failed and we may need to record a
  * new bad block, retry the write in smaller pieces, or fail the
  * device.
  */
2760 int m;
2761 struct md_rdev *rdev;
2762
2763 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2764 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2765 for (m = 0; m < conf->copies; m++) {
2766 int dev = r10_bio->devs[m].devnum;
2767 rdev = conf->mirrors[dev].rdev;
2768 if (r10_bio->devs[m].bio == NULL)
2769 continue;
2770 if (test_bit(BIO_UPTODATE,
2771 &r10_bio->devs[m].bio->bi_flags)) {
2772 rdev_clear_badblocks(
2773 rdev,
2774 r10_bio->devs[m].addr,
2775 r10_bio->sectors, 0);
2776 } else {
2777 if (!rdev_set_badblocks(
2778 rdev,
2779 r10_bio->devs[m].addr,
2780 r10_bio->sectors, 0))
2781 md_error(conf->mddev, rdev);
2782 }
2783 rdev = conf->mirrors[dev].replacement;
2784 if (r10_bio->devs[m].repl_bio == NULL)
2785 continue;
2786 if (test_bit(BIO_UPTODATE,
2787 &r10_bio->devs[m].repl_bio->bi_flags)) {
2788 rdev_clear_badblocks(
2789 rdev,
2790 r10_bio->devs[m].addr,
2791 r10_bio->sectors, 0);
2792 } else {
2793 if (!rdev_set_badblocks(
2794 rdev,
2795 r10_bio->devs[m].addr,
2796 r10_bio->sectors, 0))
2797 md_error(conf->mddev, rdev);
2798 }
2799 }
2800 put_buf(r10_bio);
2801 } else {
2802 bool fail = false;
2803 for (m = 0; m < conf->copies; m++) {
2804 int dev = r10_bio->devs[m].devnum;
2805 struct bio *bio = r10_bio->devs[m].bio;
2806 rdev = conf->mirrors[dev].rdev;
2807 if (bio == IO_MADE_GOOD) {
2808 rdev_clear_badblocks(
2809 rdev,
2810 r10_bio->devs[m].addr,
2811 r10_bio->sectors, 0);
2812 rdev_dec_pending(rdev, conf->mddev);
2813 } else if (bio != NULL &&
2814 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2815 fail = true;
2816 if (!narrow_write_error(r10_bio, m)) {
2817 md_error(conf->mddev, rdev);
2818 set_bit(R10BIO_Degraded,
2819 &r10_bio->state);
2820 }
2821 rdev_dec_pending(rdev, conf->mddev);
2822 }
2823 bio = r10_bio->devs[m].repl_bio;
2824 rdev = conf->mirrors[dev].replacement;
2825 if (rdev && bio == IO_MADE_GOOD) {
2826 rdev_clear_badblocks(
2827 rdev,
2828 r10_bio->devs[m].addr,
2829 r10_bio->sectors, 0);
2830 rdev_dec_pending(rdev, conf->mddev);
2831 }
2832 }
2833 if (fail) {
2834 spin_lock_irq(&conf->device_lock);
2835 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2836 conf->nr_queued++;
2837 spin_unlock_irq(&conf->device_lock);
2838 md_wakeup_thread(conf->mddev->thread);
2839 } else {
2840 if (test_bit(R10BIO_WriteError,
2841 &r10_bio->state))
2842 close_write(r10_bio);
2843 raid_end_bio_io(r10_bio);
2844 }
2845 }
2846}
2847
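/*
 * This is the raid10 management thread.  It finishes writes that need
 * bad-block handling, retries failed reads, schedules the writes that
 * follow resync/recovery/reshape reads, and flushes any queued writes,
 * all under a block plug to keep the I/O batched.
 */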
2848static void raid10d(struct md_thread *thread)
2849{
2850 struct mddev *mddev = thread->mddev;
2851 struct r10bio *r10_bio;
2852 unsigned long flags;
2853 struct r10conf *conf = mddev->private;
2854 struct list_head *head = &conf->retry_list;
2855 struct blk_plug plug;
2856
2857 md_check_recovery(mddev);
2858
2859 if (!list_empty_careful(&conf->bio_end_io_list) &&
2860 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2861 LIST_HEAD(tmp);
2862 spin_lock_irqsave(&conf->device_lock, flags);
2863 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2864 while (!list_empty(&conf->bio_end_io_list)) {
2865 list_move(conf->bio_end_io_list.prev, &tmp);
2866 conf->nr_queued--;
2867 }
2868 }
2869 spin_unlock_irqrestore(&conf->device_lock, flags);
2870 while (!list_empty(&tmp)) {
2871 r10_bio = list_first_entry(&tmp, struct r10bio,
2872 retry_list);
2873 list_del(&r10_bio->retry_list);
2874 if (mddev->degraded)
2875 set_bit(R10BIO_Degraded, &r10_bio->state);
2876
2877 if (test_bit(R10BIO_WriteError,
2878 &r10_bio->state))
2879 close_write(r10_bio);
2880 raid_end_bio_io(r10_bio);
2881 }
2882 }
2883
2884 blk_start_plug(&plug);
2885 for (;;) {
2886
2887 flush_pending_writes(conf);
2888
2889 spin_lock_irqsave(&conf->device_lock, flags);
2890 if (list_empty(head)) {
2891 spin_unlock_irqrestore(&conf->device_lock, flags);
2892 break;
2893 }
2894 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2895 list_del(head->prev);
2896 conf->nr_queued--;
2897 spin_unlock_irqrestore(&conf->device_lock, flags);
2898
2899 mddev = r10_bio->mddev;
2900 conf = mddev->private;
2901 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2902 test_bit(R10BIO_WriteError, &r10_bio->state))
2903 handle_write_completed(conf, r10_bio);
2904 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2905 reshape_request_write(mddev, r10_bio);
2906 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2907 sync_request_write(mddev, r10_bio);
2908 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2909 recovery_request_write(mddev, r10_bio);
2910 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2911 handle_read_error(mddev, r10_bio);
2912 else {
 /* just a partial read to be scheduled
  * from a separate (process) context
  */
2916 int slot = r10_bio->read_slot;
2917 generic_make_request(r10_bio->devs[slot].bio);
2918 }
2919
2920 cond_resched();
2921 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2922 md_check_recovery(mddev);
2923 }
2924 blk_finish_plug(&plug);
2925}
2926
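/*
 * Allocate the mempool of resync buffers (r10buf_pool) used by
 * raid10_sync_request(), noting first whether any replacement devices
 * are present so the buffers get the extra bios they need.
 */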
2927static int init_resync(struct r10conf *conf)
2928{
2929 int buffs;
2930 int i;
2931
2932 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2933 BUG_ON(conf->r10buf_pool);
2934 conf->have_replacement = 0;
2935 for (i = 0; i < conf->geo.raid_disks; i++)
2936 if (conf->mirrors[i].replacement)
2937 conf->have_replacement = 1;
2938 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2939 if (!conf->r10buf_pool)
2940 return -ENOMEM;
2941 conf->next_resync = 0;
2942 return 0;
2943}
2944

/*
 * Perform a "sync" on one "block".
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflicts with active sync requests.  This is achieved by
 * tracking pending requests and a 'barrier' that can be raised to
 * exclude normal I/O while a sync window is being processed.
 *
 * Resync and recovery are handled very differently; we distinguish them
 * by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync we iterate over virtual addresses, read all copies, and
 * let sync_request_write() update any copies that differ.
 *
 * For recovery we iterate over physical addresses: for each block we
 * read a good copy from an in-sync device and write it to the device
 * being rebuilt (and to its replacement, if any).  So we may build
 * several r10_bio structures for a given virtual address, one for each
 * device that needs reconstruction.  These are linked together via the
 * (borrowed) master_bio pointer and counted in ->remaining, so the
 * whole group completes together.
 *
 * As the bios are set up they are collected into a list ('biolist'),
 * which is then walked twice: once to attach pages, and once more to
 * submit the reads with generic_make_request().
 */
2977static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2978 int *skipped)
2979{
2980 struct r10conf *conf = mddev->private;
2981 struct r10bio *r10_bio;
2982 struct bio *biolist = NULL, *bio;
2983 sector_t max_sector, nr_sectors;
2984 int i;
2985 int max_sync;
2986 sector_t sync_blocks;
2987 sector_t sectors_skipped = 0;
2988 int chunks_skipped = 0;
2989 sector_t chunk_mask = conf->geo.chunk_mask;
2990
2991 if (!conf->r10buf_pool)
2992 if (init_resync(conf))
2993 return 0;
2994
 /*
  * Allow skipping a full rebuild for the incremental assembly of a
  * clean array when there is nothing to do.
  */
2999 if (mddev->bitmap == NULL &&
3000 mddev->recovery_cp == MaxSector &&
3001 mddev->reshape_position == MaxSector &&
3002 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
3003 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
3004 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
3005 conf->fullsync == 0) {
3006 *skipped = 1;
3007 return mddev->dev_sectors - sector_nr;
3008 }
3009
3010 skipped:
3011 max_sector = mddev->dev_sectors;
3012 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
3013 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3014 max_sector = mddev->resync_max_sectors;
3015 if (sector_nr >= max_sector) {
 /* If we aborted, we need to end the bitmap sync on the 'current'
  * chunks (there can be several when recovering multiple devices),
  * as we may have started syncing them without finishing.
  * We can find the current position in mddev->curr_resync; for
  * recovery that position has to be converted to the matching
  * virtual address on each raid disk.
  */
3025 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
3026 end_reshape(conf);
3027 close_sync(conf);
3028 return 0;
3029 }
3030
3031 if (mddev->curr_resync < max_sector) {
3032 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3033 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
3034 &sync_blocks, 1);
3035 else for (i = 0; i < conf->geo.raid_disks; i++) {
3036 sector_t sect =
3037 raid10_find_virt(conf, mddev->curr_resync, i);
3038 bitmap_end_sync(mddev->bitmap, sect,
3039 &sync_blocks, 1);
3040 }
3041 } else {
3042
3043 if ((!mddev->bitmap || conf->fullsync)
3044 && conf->have_replacement
3045 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 /* Completed a full sync, so the replacements
  * are now fully recovered.
  */
3049 rcu_read_lock();
3050 for (i = 0; i < conf->geo.raid_disks; i++) {
3051 struct md_rdev *rdev =
3052 rcu_dereference(conf->mirrors[i].replacement);
3053 if (rdev)
3054 rdev->recovery_offset = MaxSector;
3055 }
3056 rcu_read_unlock();
3057 }
3058 conf->fullsync = 0;
3059 }
3060 bitmap_close_sync(mddev->bitmap);
3061 close_sync(conf);
3062 *skipped = 1;
3063 return sectors_skipped;
3064 }
3065
3066 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3067 return reshape_request(mddev, sector_nr, skipped);
3068
3069 if (chunks_skipped >= conf->geo.raid_disks) {
 /* If there has been nothing to do on any drive,
  * then there is nothing to do at all.
  */
3073 *skipped = 1;
3074 return (max_sector - sector_nr) + sectors_skipped;
3075 }
3076
3077 if (max_sector > mddev->resync_max)
3078 max_sector = mddev->resync_max;
3079
 /* Make sure the whole request fits inside one chunk, if chunks
  * are meaningful for this layout.
  */
3083 if (conf->geo.near_copies < conf->geo.raid_disks &&
3084 max_sector > (sector_nr | chunk_mask))
3085 max_sector = (sector_nr | chunk_mask) + 1;
3086
 /* If there is non-resync activity waiting for a turn, let it
  * through before starting on this new sync request.
  */
3091 if (conf->nr_waiting)
3092 schedule_timeout_uninterruptible(1);

 /* Again, very different code paths for resync and recovery.
  * Both must result in an r10bio with a list of bios that have
  * bi_end_io, bi_sector and bi_bdev set, and bi_private pointing
  * back to the r10bio.
  *
  * For recovery we may actually create several r10bios, each with two
  * bios (a read and a write) corresponding to one out-of-sync device.
  * The subordinate r10bios link back through a borrowed master_bio
  * pointer, and the counter in the master includes a reference from
  * each subordinate.
  *
  * First we decide what to do and set ->bi_end_io:
  * end_sync_read for bios we want to read, and end_sync_write for
  * bios we will want to write.
  */
3109 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3110 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3111
3112 int j;
3113 r10_bio = NULL;
3114
3115 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3116 int still_degraded;
3117 struct r10bio *rb2;
3118 sector_t sect;
3119 int must_sync;
3120 int any_working;
3121 struct raid10_info *mirror = &conf->mirrors[i];
3122 struct md_rdev *mrdev, *mreplace;
3123
3124 rcu_read_lock();
3125 mrdev = rcu_dereference(mirror->rdev);
3126 mreplace = rcu_dereference(mirror->replacement);
3127
3128 if ((mrdev == NULL ||
3129 test_bit(Faulty, &mrdev->flags) ||
3130 test_bit(In_sync, &mrdev->flags)) &&
3131 (mreplace == NULL ||
3132 test_bit(Faulty, &mreplace->flags))) {
3133 rcu_read_unlock();
3134 continue;
3135 }
3136
3137 still_degraded = 0;
3138
3139 rb2 = r10_bio;
3140 sect = raid10_find_virt(conf, sector_nr, i);
3141 if (sect >= mddev->resync_max_sectors) {
 /* The last stripe is not complete - don't
  * try to recover this sector.
  */
3145 rcu_read_unlock();
3146 continue;
3147 }
3148 if (mreplace && test_bit(Faulty, &mreplace->flags))
3149 mreplace = NULL;

 /* Unless we are doing a full sync, or recovering onto a
  * replacement, we only need to rebuild blocks that are set
  * in the bitmap.
  */
3154 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3155 &sync_blocks, 1);
3156 if (sync_blocks < max_sync)
3157 max_sync = sync_blocks;
3158 if (!must_sync &&
3159 mreplace == NULL &&
3160 !conf->fullsync) {
 /* Nothing to rebuild here, so skip these blocks - but don't
  * assume that there will never be anything to do in this chunk.
  */
3164 chunks_skipped = -1;
3165 rcu_read_unlock();
3166 continue;
3167 }
3168 atomic_inc(&mrdev->nr_pending);
3169 if (mreplace)
3170 atomic_inc(&mreplace->nr_pending);
3171 rcu_read_unlock();
3172
3173 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3174 r10_bio->state = 0;
3175 raise_barrier(conf, rb2 != NULL);
3176 atomic_set(&r10_bio->remaining, 0);
3177
3178 r10_bio->master_bio = (struct bio*)rb2;
3179 if (rb2)
3180 atomic_inc(&rb2->remaining);
3181 r10_bio->mddev = mddev;
3182 set_bit(R10BIO_IsRecover, &r10_bio->state);
3183 r10_bio->sector = sect;
3184
3185 raid10_find_phys(conf, r10_bio);

 /* Check whether the array will still be degraded after this
  * device is recovered, so the bitmap sync is started correctly.
  */
3190 rcu_read_lock();
3191 for (j = 0; j < conf->geo.raid_disks; j++) {
3192 struct md_rdev *rdev = rcu_dereference(
3193 conf->mirrors[j].rdev);
3194 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3195 still_degraded = 1;
3196 break;
3197 }
3198 }
3199
3200 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3201 &sync_blocks, still_degraded);
3202
3203 any_working = 0;
 for (j = 0; j < conf->copies; j++) {
3205 int k;
3206 int d = r10_bio->devs[j].devnum;
3207 sector_t from_addr, to_addr;
3208 struct md_rdev *rdev =
3209 rcu_dereference(conf->mirrors[d].rdev);
3210 sector_t sector, first_bad;
3211 int bad_sectors;
3212 if (!rdev ||
3213 !test_bit(In_sync, &rdev->flags))
3214 continue;
3215
3216 any_working = 1;
3217 sector = r10_bio->devs[j].addr;
3218
3219 if (is_badblock(rdev, sector, max_sync,
3220 &first_bad, &bad_sectors)) {
3221 if (first_bad > sector)
3222 max_sync = first_bad - sector;
3223 else {
3224 bad_sectors -= (sector
3225 - first_bad);
3226 if (max_sync > bad_sectors)
3227 max_sync = bad_sectors;
3228 continue;
3229 }
3230 }
3231 bio = r10_bio->devs[0].bio;
3232 bio_reset(bio);
3233 bio->bi_next = biolist;
3234 biolist = bio;
3235 bio->bi_private = r10_bio;
3236 bio->bi_end_io = end_sync_read;
3237 bio->bi_rw = READ;
3238 if (test_bit(FailFast, &rdev->flags))
3239 bio->bi_rw |= MD_FAILFAST;
3240 from_addr = r10_bio->devs[j].addr;
3241 bio->bi_sector = from_addr + rdev->data_offset;
3242 bio->bi_bdev = rdev->bdev;
3243 atomic_inc(&rdev->nr_pending);
 /* ... and we write to device 'i' (if it is not in_sync) */

3246 for (k=0; k<conf->copies; k++)
3247 if (r10_bio->devs[k].devnum == i)
3248 break;
3249 BUG_ON(k == conf->copies);
3250 to_addr = r10_bio->devs[k].addr;
3251 r10_bio->devs[0].devnum = d;
3252 r10_bio->devs[0].addr = from_addr;
3253 r10_bio->devs[1].devnum = i;
3254 r10_bio->devs[1].addr = to_addr;
3255
3256 if (!test_bit(In_sync, &mrdev->flags)) {
3257 bio = r10_bio->devs[1].bio;
3258 bio_reset(bio);
3259 bio->bi_next = biolist;
3260 biolist = bio;
3261 bio->bi_private = r10_bio;
3262 bio->bi_end_io = end_sync_write;
3263 bio->bi_rw = WRITE;
3264 bio->bi_sector = to_addr
3265 + mrdev->data_offset;
3266 bio->bi_bdev = mrdev->bdev;
3267 atomic_inc(&r10_bio->remaining);
3268 } else
3269 r10_bio->devs[1].bio->bi_end_io = NULL;

 /* ... and maybe write to the replacement device as well */
3272 bio = r10_bio->devs[1].repl_bio;
3273 if (bio)
3274 bio->bi_end_io = NULL;
 /* Note: if mreplace != NULL then bio cannot be NULL, because
  * r10buf_pool_alloc will have allocated it when a replacement
  * is configured.  The extra NULL test just keeps static
  * checkers happy.
  */
3283 if (mreplace == NULL || bio == NULL ||
3284 test_bit(Faulty, &mreplace->flags))
3285 break;
3286 bio_reset(bio);
3287 bio->bi_next = biolist;
3288 biolist = bio;
3289 bio->bi_private = r10_bio;
3290 bio->bi_end_io = end_sync_write;
3291 bio->bi_rw = WRITE;
3292 bio->bi_sector = to_addr +
3293 mreplace->data_offset;
3294 bio->bi_bdev = mreplace->bdev;
3295 atomic_inc(&r10_bio->remaining);
3296 break;
3297 }
3298 rcu_read_unlock();
3299 if (j == conf->copies) {
 /* Cannot recover this device, so abort the recovery
  * or record a bad block.
  */
3302 if (any_working) {
 /* The problem is that there are bad blocks
  * on other device(s).
  */
3306 int k;
3307 for (k = 0; k < conf->copies; k++)
3308 if (r10_bio->devs[k].devnum == i)
3309 break;
3310 if (!test_bit(In_sync,
3311 &mrdev->flags)
3312 && !rdev_set_badblocks(
3313 mrdev,
3314 r10_bio->devs[k].addr,
3315 max_sync, 0))
3316 any_working = 0;
3317 if (mreplace &&
3318 !rdev_set_badblocks(
3319 mreplace,
3320 r10_bio->devs[k].addr,
3321 max_sync, 0))
3322 any_working = 0;
3323 }
3324 if (!any_working) {
3325 if (!test_and_set_bit(MD_RECOVERY_INTR,
3326 &mddev->recovery))
3327 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3328 mdname(mddev));
3329 mirror->recovery_disabled
3330 = mddev->recovery_disabled;
3331 }
3332 put_buf(r10_bio);
3333 if (rb2)
3334 atomic_dec(&rb2->remaining);
3335 r10_bio = rb2;
3336 rdev_dec_pending(mrdev, mddev);
3337 if (mreplace)
3338 rdev_dec_pending(mreplace, mddev);
3339 break;
3340 }
3341 rdev_dec_pending(mrdev, mddev);
3342 if (mreplace)
3343 rdev_dec_pending(mreplace, mddev);
3344 if (r10_bio->devs[0].bio->bi_rw & MD_FAILFAST) {
 /* We only want FAILFAST on this read if there is somewhere
  * else to read from: 'j' is currently the first readable
  * copy, so count the other in-sync devices.
  */
3349 int targets = 1;
3350 for (; j < conf->copies; j++) {
3351 int d = r10_bio->devs[j].devnum;
3352 if (conf->mirrors[d].rdev &&
3353 test_bit(In_sync,
3354 &conf->mirrors[d].rdev->flags))
3355 targets++;
3356 }
3357 if (targets == 1)
3358 r10_bio->devs[0].bio->bi_rw
3359 &= ~MD_FAILFAST;
3360 }
3361 }
3362 if (biolist == NULL) {
3363 while (r10_bio) {
3364 struct r10bio *rb2 = r10_bio;
3365 r10_bio = (struct r10bio*) rb2->master_bio;
3366 rb2->master_bio = NULL;
3367 put_buf(rb2);
3368 }
3369 goto giveup;
3370 }
3371 } else {
3372
3373 int count = 0;
3374
3375 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3376
3377 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3378 &sync_blocks, mddev->degraded) &&
3379 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3380 &mddev->recovery)) {
3381
3382 *skipped = 1;
3383 return sync_blocks + sectors_skipped;
3384 }
3385 if (sync_blocks < max_sync)
3386 max_sync = sync_blocks;
3387 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3388 r10_bio->state = 0;
3389
3390 r10_bio->mddev = mddev;
3391 atomic_set(&r10_bio->remaining, 0);
3392 raise_barrier(conf, 0);
3393 conf->next_resync = sector_nr;
3394
3395 r10_bio->master_bio = NULL;
3396 r10_bio->sector = sector_nr;
3397 set_bit(R10BIO_IsSync, &r10_bio->state);
3398 raid10_find_phys(conf, r10_bio);
3399 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3400
3401 for (i = 0; i < conf->copies; i++) {
3402 int d = r10_bio->devs[i].devnum;
3403 sector_t first_bad, sector;
3404 int bad_sectors;
3405 struct md_rdev *rdev;
3406
3407 if (r10_bio->devs[i].repl_bio)
3408 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3409
3410 bio = r10_bio->devs[i].bio;
3411 bio_reset(bio);
3412 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3413 rcu_read_lock();
3414 rdev = rcu_dereference(conf->mirrors[d].rdev);
3415 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3416 rcu_read_unlock();
3417 continue;
3418 }
3419 sector = r10_bio->devs[i].addr;
3420 if (is_badblock(rdev, sector, max_sync,
3421 &first_bad, &bad_sectors)) {
3422 if (first_bad > sector)
3423 max_sync = first_bad - sector;
3424 else {
3425 bad_sectors -= (sector - first_bad);
3426 if (max_sync > bad_sectors)
3427 max_sync = bad_sectors;
3428 rcu_read_unlock();
3429 continue;
3430 }
3431 }
3432 atomic_inc(&rdev->nr_pending);
3433 atomic_inc(&r10_bio->remaining);
3434 bio->bi_next = biolist;
3435 biolist = bio;
3436 bio->bi_private = r10_bio;
3437 bio->bi_end_io = end_sync_read;
3438 bio->bi_rw = READ;
3439 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
3440 bio->bi_rw |= MD_FAILFAST;
3441 bio->bi_sector = sector + rdev->data_offset;
3442 bio->bi_bdev = rdev->bdev;
3443 count++;
3444
3445 rdev = rcu_dereference(conf->mirrors[d].replacement);
3446 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3447 rcu_read_unlock();
3448 continue;
3449 }
3450 atomic_inc(&rdev->nr_pending);
3451 rcu_read_unlock();

 /* Need to set up the write to the replacement device */
3454 bio = r10_bio->devs[i].repl_bio;
3455 bio_reset(bio);
3456 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3457
3458 sector = r10_bio->devs[i].addr;
3459 bio->bi_next = biolist;
3460 biolist = bio;
3461 bio->bi_private = r10_bio;
3462 bio->bi_end_io = end_sync_write;
3463 bio->bi_rw = WRITE;
3464 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
3465 bio->bi_rw |= MD_FAILFAST;
3466 bio->bi_sector = sector + rdev->data_offset;
3467 bio->bi_bdev = rdev->bdev;
3468 count++;
3469 }
3470
3471 if (count < 2) {
3472 for (i=0; i<conf->copies; i++) {
3473 int d = r10_bio->devs[i].devnum;
3474 if (r10_bio->devs[i].bio->bi_end_io)
3475 rdev_dec_pending(conf->mirrors[d].rdev,
3476 mddev);
3477 if (r10_bio->devs[i].repl_bio &&
3478 r10_bio->devs[i].repl_bio->bi_end_io)
3479 rdev_dec_pending(
3480 conf->mirrors[d].replacement,
3481 mddev);
3482 }
3483 put_buf(r10_bio);
3484 biolist = NULL;
3485 goto giveup;
3486 }
3487 }
3488
3489 nr_sectors = 0;
3490 if (sector_nr + max_sync < max_sector)
3491 max_sector = sector_nr + max_sync;
3492 do {
3493 struct page *page;
3494 int len = PAGE_SIZE;
3495 if (sector_nr + (len>>9) > max_sector)
3496 len = (max_sector - sector_nr) << 9;
3497 if (len == 0)
3498 break;
3499 for (bio= biolist ; bio ; bio=bio->bi_next) {
3500 struct bio *bio2;
3501 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3502 if (bio_add_page(bio, page, len, 0))
3503 continue;

 /* Didn't fit: stop here and remove this page again from the
  * bios that already accepted it. */
3506 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3507 for (bio2 = biolist;
3508 bio2 && bio2 != bio;
3509 bio2 = bio2->bi_next) {
3510
3511 bio2->bi_vcnt--;
3512 bio2->bi_size -= len;
3513 __clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
3514 }
3515 goto bio_full;
3516 }
3517 nr_sectors += len>>9;
3518 sector_nr += len>>9;
3519 } while (biolist->bi_vcnt < RESYNC_PAGES);
3520 bio_full:
3521 r10_bio->sectors = nr_sectors;
3522
3523 while (biolist) {
3524 bio = biolist;
3525 biolist = biolist->bi_next;
3526
3527 bio->bi_next = NULL;
3528 r10_bio = bio->bi_private;
3529 r10_bio->sectors = nr_sectors;
3530
3531 if (bio->bi_end_io == end_sync_read) {
3532 md_sync_acct(bio->bi_bdev, nr_sectors);
3533 set_bit(BIO_UPTODATE, &bio->bi_flags);
3534 generic_make_request(bio);
3535 }
3536 }
3537
3538 if (sectors_skipped)
 /* Pretend the skipped sectors were actually synced; it makes
  * no important difference in this case.
  */
3542 md_done_sync(mddev, sectors_skipped, 1);
3543
3544 return sectors_skipped + nr_sectors;
3545 giveup:
 /* There is nowhere to write, so all non-sync drives must be
  * failed or in resync, or all drives have a bad block here -
  * try the next chunk.
  */
3550 if (sector_nr + max_sync < max_sector)
3551 max_sector = sector_nr + max_sync;
3552
3553 sectors_skipped += (max_sector - sector_nr);
3554 chunks_skipped ++;
3555 sector_nr = max_sector;
3556 goto skipped;
3557}
3558
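/*
 * Work out how many sectors the array provides for the given number of
 * raid disks and per-device size, rounding to whole chunks and taking
 * the near/far copy counts into account.  With both arguments zero it
 * reports the current size of the array.
 */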
3559static sector_t
3560raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3561{
3562 sector_t size;
3563 struct r10conf *conf = mddev->private;
3564
3565 if (!raid_disks)
3566 raid_disks = min(conf->geo.raid_disks,
3567 conf->prev.raid_disks);
3568 if (!sectors)
3569 sectors = conf->dev_sectors;
3570
3571 size = sectors >> conf->geo.chunk_shift;
3572 sector_div(size, conf->geo.far_copies);
3573 size = size * raid_disks;
3574 sector_div(size, conf->geo.near_copies);
3575
3576 return size << conf->geo.chunk_shift;
3577}
3578
3579static void calc_sectors(struct r10conf *conf, sector_t size)
3580{
 /* Calculate the number of sectors-per-device that will actually
  * be used, and set conf->dev_sectors and conf->geo.stride
  * accordingly.
  */

3586 size = size >> conf->geo.chunk_shift;
3587 sector_div(size, conf->geo.far_copies);
3588 size = size * conf->geo.raid_disks;
3589 sector_div(size, conf->geo.near_copies);
 /* 'size' is now the number of chunks in the array */
 /* calculate "used chunks per device" */
3592 size = size * conf->copies;
3593
 /* We need to round up when dividing by raid_disks to get the
  * stride size.
  */
3597 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3598
3599 conf->dev_sectors = size << conf->geo.chunk_shift;
3600
3601 if (conf->geo.far_offset)
3602 conf->geo.stride = 1 << conf->geo.chunk_shift;
3603 else {
3604 sector_div(size, conf->geo.far_copies);
3605 conf->geo.stride = size << conf->geo.chunk_shift;
3606 }
3607}
3608
3609enum geo_type {geo_new, geo_old, geo_start};
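/*
 * Decode the raid10 layout word into a struct geom: the low byte is the
 * number of near copies, the next byte the number of far copies, bit 16
 * selects the "far offset" variant, and bits 17-18 choose how the far
 * sets are arranged.  Returns the total number of data copies, or a
 * negative value if the layout or chunk size is not usable.
 */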
3610static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3611{
3612 int nc, fc, fo;
3613 int layout, chunk, disks;
3614 switch (new) {
3615 case geo_old:
3616 layout = mddev->layout;
3617 chunk = mddev->chunk_sectors;
3618 disks = mddev->raid_disks - mddev->delta_disks;
3619 break;
3620 case geo_new:
3621 layout = mddev->new_layout;
3622 chunk = mddev->new_chunk_sectors;
3623 disks = mddev->raid_disks;
3624 break;
3625 default:
3626 case geo_start:
3627
3628 layout = mddev->new_layout;
3629 chunk = mddev->new_chunk_sectors;
3630 disks = mddev->raid_disks + mddev->delta_disks;
3631 break;
3632 }
3633 if (layout >> 19)
3634 return -1;
3635 if (chunk < (PAGE_SIZE >> 9) ||
3636 !is_power_of_2(chunk))
3637 return -2;
3638 nc = layout & 255;
3639 fc = (layout >> 8) & 255;
3640 fo = layout & (1<<16);
3641 geo->raid_disks = disks;
3642 geo->near_copies = nc;
3643 geo->far_copies = fc;
3644 geo->far_offset = fo;
3645 switch (layout >> 17) {
3646 case 0:
3647 geo->far_set_size = disks;
3648 break;
3649 case 1:
3650
3651 geo->far_set_size = disks/fc;
3652 WARN(geo->far_set_size < fc,
3653 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3654 break;
3655 case 2:
3656 geo->far_set_size = fc * nc;
3657 break;
3658 default:
3659 return -1;
3660 }
3661 geo->chunk_mask = chunk - 1;
3662 geo->chunk_shift = ffz(~chunk);
3663 return nc*fc;
3664}
3665
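/*
 * Allocate and initialise an r10conf for this array: validate the
 * requested geometry, allocate the mirrors array, the r10bio mempool
 * and the spare page used for read-error correction, and record any
 * in-progress reshape so that both old and new geometries are known.
 */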
3666static struct r10conf *setup_conf(struct mddev *mddev)
3667{
3668 struct r10conf *conf = NULL;
3669 int err = -EINVAL;
3670 struct geom geo;
3671 int copies;
3672
3673 copies = setup_geo(&geo, mddev, geo_new);
3674
3675 if (copies == -2) {
3676 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3677 mdname(mddev), PAGE_SIZE);
3678 goto out;
3679 }
3680
3681 if (copies < 2 || copies > mddev->raid_disks) {
3682 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3683 mdname(mddev), mddev->new_layout);
3684 goto out;
3685 }
3686
3687 err = -ENOMEM;
3688 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3689 if (!conf)
3690 goto out;
3691
3692
3693 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3694 max(0,-mddev->delta_disks)),
3695 GFP_KERNEL);
3696 if (!conf->mirrors)
3697 goto out;
3698
3699 conf->tmppage = alloc_page(GFP_KERNEL);
3700 if (!conf->tmppage)
3701 goto out;
3702
3703 conf->geo = geo;
3704 conf->copies = copies;
3705 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3706 r10bio_pool_free, conf);
3707 if (!conf->r10bio_pool)
3708 goto out;
3709
3710 calc_sectors(conf, mddev->dev_sectors);
3711 if (mddev->reshape_position == MaxSector) {
3712 conf->prev = conf->geo;
3713 conf->reshape_progress = MaxSector;
3714 } else {
3715 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3716 err = -EINVAL;
3717 goto out;
3718 }
3719 conf->reshape_progress = mddev->reshape_position;
3720 if (conf->prev.far_offset)
3721 conf->prev.stride = 1 << conf->prev.chunk_shift;
3722 else
3723
3724 conf->prev.stride = conf->dev_sectors;
3725 }
3726 conf->reshape_safe = conf->reshape_progress;
3727 spin_lock_init(&conf->device_lock);
3728 INIT_LIST_HEAD(&conf->retry_list);
3729 INIT_LIST_HEAD(&conf->bio_end_io_list);
3730
3731 spin_lock_init(&conf->resync_lock);
3732 init_waitqueue_head(&conf->wait_barrier);
3733 atomic_set(&conf->nr_pending, 0);
3734
3735 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3736 if (!conf->thread)
3737 goto out;
3738
3739 conf->mddev = mddev;
3740 return conf;
3741
3742 out:
3743 if (conf) {
3744 mempool_destroy(conf->r10bio_pool);
3745 kfree(conf->mirrors);
3746 safe_put_page(conf->tmppage);
3747 kfree(conf);
3748 }
3749 return ERR_PTR(err);
3750}
3751
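/*
 * Start the array: attach the configuration built by setup_conf(), set
 * queue limits from the member devices, check that enough mirrors are
 * operational, compute the array size, and - if the superblock says a
 * reshape was interrupted - restart the reshape thread.
 */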
3752static int raid10_run(struct mddev *mddev)
3753{
3754 struct r10conf *conf;
3755 int i, disk_idx, chunk_size;
3756 struct raid10_info *disk;
3757 struct md_rdev *rdev;
3758 sector_t size;
3759 sector_t min_offset_diff = 0;
3760 int first = 1;
3761 bool discard_supported = false;
3762
3763 if (mddev_init_writes_pending(mddev) < 0)
3764 return -ENOMEM;
3765
3766 if (mddev->private == NULL) {
3767 conf = setup_conf(mddev);
3768 if (IS_ERR(conf))
3769 return PTR_ERR(conf);
3770 mddev->private = conf;
3771 }
3772 conf = mddev->private;
3773 if (!conf)
3774 goto out;
3775
3776 mddev->thread = conf->thread;
3777 conf->thread = NULL;
3778
3779 chunk_size = mddev->chunk_sectors << 9;
3780 if (mddev->queue) {
3781 blk_queue_max_discard_sectors(mddev->queue,
3782 mddev->chunk_sectors);
3783 blk_queue_max_write_same_sectors(mddev->queue, 0);
3784 blk_queue_io_min(mddev->queue, chunk_size);
3785 if (conf->geo.raid_disks % conf->geo.near_copies)
3786 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3787 else
3788 blk_queue_io_opt(mddev->queue, chunk_size *
3789 (conf->geo.raid_disks / conf->geo.near_copies));
3790 }
3791
3792 rdev_for_each(rdev, mddev) {
3793 long long diff;
3794 struct request_queue *q;
3795
3796 disk_idx = rdev->raid_disk;
3797 if (disk_idx < 0)
3798 continue;
3799 if (disk_idx >= conf->geo.raid_disks &&
3800 disk_idx >= conf->prev.raid_disks)
3801 continue;
3802 disk = conf->mirrors + disk_idx;
3803
3804 if (test_bit(Replacement, &rdev->flags)) {
3805 if (disk->replacement)
3806 goto out_free_conf;
3807 disk->replacement = rdev;
3808 } else {
3809 if (disk->rdev)
3810 goto out_free_conf;
3811 disk->rdev = rdev;
3812 }
3813 q = bdev_get_queue(rdev->bdev);
3814 if (q->merge_bvec_fn)
3815 mddev->merge_check_needed = 1;
3816 diff = (rdev->new_data_offset - rdev->data_offset);
3817 if (!mddev->reshape_backwards)
3818 diff = -diff;
3819 if (diff < 0)
3820 diff = 0;
3821 if (first || diff < min_offset_diff)
3822 min_offset_diff = diff;
3823
3824 if (mddev->gendisk)
3825 disk_stack_limits(mddev->gendisk, rdev->bdev,
3826 rdev->data_offset << 9);
3827
3828 disk->head_position = 0;
3829
3830 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3831 discard_supported = true;
3832 }
3833
3834 if (mddev->queue) {
3835 if (discard_supported)
3836 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3837 mddev->queue);
3838 else
3839 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3840 mddev->queue);
3841 }
3842
3843 if (!enough(conf, -1)) {
3844 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3845 mdname(mddev));
3846 goto out_free_conf;
3847 }
3848
3849 if (conf->reshape_progress != MaxSector) {
3850
3851 if (conf->geo.far_copies != 1 &&
3852 conf->geo.far_offset == 0)
3853 goto out_free_conf;
3854 if (conf->prev.far_copies != 1 &&
3855 conf->prev.far_offset == 0)
3856 goto out_free_conf;
3857 }
3858
3859 mddev->degraded = 0;
3860 for (i = 0;
3861 i < conf->geo.raid_disks
3862 || i < conf->prev.raid_disks;
3863 i++) {
3864
3865 disk = conf->mirrors + i;
3866
3867 if (!disk->rdev && disk->replacement) {
3868
3869 disk->rdev = disk->replacement;
3870 disk->replacement = NULL;
3871 clear_bit(Replacement, &disk->rdev->flags);
3872 }
3873
3874 if (!disk->rdev ||
3875 !test_bit(In_sync, &disk->rdev->flags)) {
3876 disk->head_position = 0;
3877 mddev->degraded++;
3878 if (disk->rdev &&
3879 disk->rdev->saved_raid_disk < 0)
3880 conf->fullsync = 1;
3881 }
3882 disk->recovery_disabled = mddev->recovery_disabled - 1;
3883 }
3884
3885 if (mddev->recovery_cp != MaxSector)
3886 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3887 mdname(mddev));
3888 pr_info("md/raid10:%s: active with %d out of %d devices\n",
3889 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3890 conf->geo.raid_disks);
3891
3892
3893
3894 mddev->dev_sectors = conf->dev_sectors;
3895 size = raid10_size(mddev, 0, 0);
3896 md_set_array_sectors(mddev, size);
3897 mddev->resync_max_sectors = size;
3898 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3899
3900 if (mddev->queue) {
3901 int stripe = conf->geo.raid_disks *
3902 ((mddev->chunk_sectors << 9) / PAGE_SIZE);

 /* Calculate the maximum read-ahead size.
  * We want read-ahead to cover at least two whole stripes.
  */
3908 stripe /= conf->geo.near_copies;
3909 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3910 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3911 }
3912
3913 if (md_integrity_register(mddev))
3914 goto out_free_conf;
3915
3916 if (conf->reshape_progress != MaxSector) {
3917 unsigned long before_length, after_length;
3918
3919 before_length = ((1 << conf->prev.chunk_shift) *
3920 conf->prev.far_copies);
3921 after_length = ((1 << conf->geo.chunk_shift) *
3922 conf->geo.far_copies);
3923
3924 if (max(before_length, after_length) > min_offset_diff) {
3925
3926 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3927 goto out_free_conf;
3928 }
3929 conf->offset_diff = min_offset_diff;
3930
3931 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3932 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3933 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3934 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3935 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3936 "reshape");
3937 }
3938
3939 return 0;
3940
3941out_free_conf:
3942 md_unregister_thread(&mddev->thread);
3943 mempool_destroy(conf->r10bio_pool);
3944 safe_put_page(conf->tmppage);
3945 kfree(conf->mirrors);
3946 kfree(conf);
3947 mddev->private = NULL;
3948out:
3949 return -EIO;
3950}
3951
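/* Release everything allocated in setup_conf() when the array is stopped. */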
3952static void raid10_free(struct mddev *mddev, void *priv)
3953{
3954 struct r10conf *conf = priv;
3955
3956 mempool_destroy(conf->r10bio_pool);
3957 safe_put_page(conf->tmppage);
3958 kfree(conf->mirrors);
3959 kfree(conf->mirrors_old);
3960 kfree(conf->mirrors_new);
3961 kfree(conf);
3962}
3963
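/*
 * Quiesce (state 1) or resume (state 0) the array by raising or
 * lowering the resync barrier, so no new normal I/O is started while
 * the array is quiesced.
 */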
3964static void raid10_quiesce(struct mddev *mddev, int state)
3965{
3966 struct r10conf *conf = mddev->private;
3967
3968 switch(state) {
3969 case 1:
3970 raise_barrier(conf, 0);
3971 break;
3972 case 0:
3973 lower_barrier(conf);
3974 break;
3975 }
3976}
3977
3978static int raid10_resize(struct mddev *mddev, sector_t sectors)
3979{
 /* Resizing 'far' arrays is not supported.
  * For 'near' and 'offset' arrays we can set the number of sectors
  * used on each device to an appropriate multiple of the chunk size:
  * for 'offset' that is far_copies * chunksize, for 'near' it is the
  * chunk size times the LCM of near_copies and raid_disks.
  * So if far_copies > 1 && !far_offset, fail.
  * Otherwise most of the work is done by raid10_size() and
  * calc_sectors().
  */
3992 struct r10conf *conf = mddev->private;
3993 sector_t oldsize, size;
3994
3995 if (mddev->reshape_position != MaxSector)
3996 return -EBUSY;
3997
3998 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3999 return -EINVAL;
4000
4001 oldsize = raid10_size(mddev, 0, 0);
4002 size = raid10_size(mddev, sectors, 0);
4003 if (mddev->external_size &&
4004 mddev->array_sectors > size)
4005 return -EINVAL;
4006 if (mddev->bitmap) {
4007 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
4008 if (ret)
4009 return ret;
4010 }
4011 md_set_array_sectors(mddev, size);
4012 if (mddev->queue) {
4013 set_capacity(mddev->gendisk, mddev->array_sectors);
4014 revalidate_disk(mddev->gendisk);
4015 }
4016 if (sectors > mddev->dev_sectors &&
4017 mddev->recovery_cp > oldsize) {
4018 mddev->recovery_cp = oldsize;
4019 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4020 }
4021 calc_sectors(conf, sectors);
4022 mddev->dev_sectors = conf->dev_sectors;
4023 mddev->resync_max_sectors = size;
4024 return 0;
4025}
4026
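/*
 * Convert a single-zone raid0 array into a two-near-copy raid10 layout:
 * each existing member moves to an even slot and the odd slots are left
 * for mirrors to be added and recovered later.
 */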
4027static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
4028{
4029 struct md_rdev *rdev;
4030 struct r10conf *conf;
4031
4032 if (mddev->degraded > 0) {
4033 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
4034 mdname(mddev));
4035 return ERR_PTR(-EINVAL);
4036 }
4037 sector_div(size, devs);
4038
4039
4040 mddev->new_level = 10;
4041
4042 mddev->new_layout = (1<<8) + 2;
4043 mddev->new_chunk_sectors = mddev->chunk_sectors;
4044 mddev->delta_disks = mddev->raid_disks;
4045 mddev->raid_disks *= 2;
4046
4047 mddev->recovery_cp = MaxSector;
4048 mddev->dev_sectors = size;
4049
4050 conf = setup_conf(mddev);
4051 if (!IS_ERR(conf)) {
4052 rdev_for_each(rdev, mddev)
4053 if (rdev->raid_disk >= 0) {
4054 rdev->new_raid_disk = rdev->raid_disk * 2;
4055 rdev->sectors = size;
4056 }
4057 conf->barrier = 1;
4058 }
4059
4060 return conf;
4061}
4062
4063static void *raid10_takeover(struct mddev *mddev)
4064{
4065 struct r0conf *raid0_conf;

 /* raid10 can currently only take over a raid0 array, and only
  * if it has a single zone (i.e. all devices contribute the same
  * amount of space).
  */
4070 if (mddev->level == 0) {
4071
4072 raid0_conf = mddev->private;
4073 if (raid0_conf->nr_strip_zones > 1) {
4074 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4075 mdname(mddev));
4076 return ERR_PTR(-EINVAL);
4077 }
4078 return raid10_takeover_raid0(mddev,
4079 raid0_conf->strip_zone->zone_end,
4080 raid0_conf->strip_zone->nb_dev);
4081 }
4082 return ERR_PTR(-EINVAL);
4083}
4084
4085static int raid10_check_reshape(struct mddev *mddev)
4086{
 /* Called when there is a request to change
  * - layout (to ->new_layout)
  * - chunk size (to ->new_chunk_sectors)
  * - raid_disks (by delta_disks)
  * or when trying to restart a reshape that was interrupted.
  *
  * We need to validate the request and possibly allocate space
  * (a larger 'mirrors' array) if that might be needed later.
  *
  * Currently we reject any reshape of a 'far' mode array, allow
  * the chunk size and layout to change as long as the result is
  * still valid, and allow raid_disks to increase.
  */
4101 struct r10conf *conf = mddev->private;
4102 struct geom geo;
4103
4104 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4105 return -EINVAL;
4106
4107 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
4108
4109 return -EINVAL;
4110 if (geo.far_copies > 1 && !geo.far_offset)
4111
4112 return -EINVAL;
4113
4114 if (mddev->array_sectors & geo.chunk_mask)
4115
4116 return -EINVAL;
4117
4118 if (!enough(conf, -1))
4119 return -EINVAL;
4120
4121 kfree(conf->mirrors_new);
4122 conf->mirrors_new = NULL;
4123 if (mddev->delta_disks > 0) {
4124
4125 conf->mirrors_new = kzalloc(
4126 sizeof(struct raid10_info)
4127 *(mddev->raid_disks +
4128 mddev->delta_disks),
4129 GFP_KERNEL);
4130 if (!conf->mirrors_new)
4131 return -ENOMEM;
4132 }
4133 return 0;
4134}
4135

/*
 * Work out how many devices are effectively missing.  This is needed
 * when deciding whether to start an array, remove non-faulty devices,
 * add a spare, or allow a reshape.
 *
 * The calculation is simple when no reshape is happening, but while a
 * reshape is in progress we must check both the old ('prev') and the
 * new geometry, because a failed device may only affect one of the two
 * sections, and a non-in_sync device may already be in sync in the
 * section that matters most.
 */
4149static int calc_degraded(struct r10conf *conf)
4150{
4151 int degraded, degraded2;
4152 int i;
4153
4154 rcu_read_lock();
4155 degraded = 0;
4156
4157 for (i = 0; i < conf->prev.raid_disks; i++) {
4158 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4159 if (!rdev || test_bit(Faulty, &rdev->flags))
4160 degraded++;
4161 else if (!test_bit(In_sync, &rdev->flags))
 /* When the number of devices can be reduced, a device that is
  * not in_sync might not need to count as 'degraded' - but for
  * now it does.
  */
4166 degraded++;
4167 }
4168 rcu_read_unlock();
4169 if (conf->geo.raid_disks == conf->prev.raid_disks)
4170 return degraded;
4171 rcu_read_lock();
4172 degraded2 = 0;
4173 for (i = 0; i < conf->geo.raid_disks; i++) {
4174 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4175 if (!rdev || test_bit(Faulty, &rdev->flags))
4176 degraded2++;
4177 else if (!test_bit(In_sync, &rdev->flags)) {
 /* If the reshape is increasing the number of devices, this
  * section has already been recovered, so it does not
  * contribute to 'degraded'; otherwise it does.
  */
4183 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4184 degraded2++;
4185 }
4186 }
4187 rcu_read_unlock();
4188 if (degraded2 > degraded)
4189 return degraded2;
4190 return degraded;
4191}
4192
4193static int raid10_start_reshape(struct mddev *mddev)
4194{
 /* A 'reshape' has been requested.  This commits the various 'new'
  * geometry fields and sets MD_RECOVERY_RESHAPE, after checking that
  * there are enough spares to end up with a non-degraded array and
  * that the difference between old and new data_offset on each device
  * is large enough that we never risk over-writing live data.
  * It also adds any spare devices that the larger geometry needs.
  */
4205 unsigned long before_length, after_length;
4206 sector_t min_offset_diff = 0;
4207 int first = 1;
4208 struct geom new;
4209 struct r10conf *conf = mddev->private;
4210 struct md_rdev *rdev;
4211 int spares = 0;
4212 int ret;
4213
4214 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4215 return -EBUSY;
4216
4217 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4218 return -EINVAL;
4219
4220 before_length = ((1 << conf->prev.chunk_shift) *
4221 conf->prev.far_copies);
4222 after_length = ((1 << conf->geo.chunk_shift) *
4223 conf->geo.far_copies);
4224
4225 rdev_for_each(rdev, mddev) {
4226 if (!test_bit(In_sync, &rdev->flags)
4227 && !test_bit(Faulty, &rdev->flags))
4228 spares++;
4229 if (rdev->raid_disk >= 0) {
4230 long long diff = (rdev->new_data_offset
4231 - rdev->data_offset);
4232 if (!mddev->reshape_backwards)
4233 diff = -diff;
4234 if (diff < 0)
4235 diff = 0;
4236 if (first || diff < min_offset_diff)
4237 min_offset_diff = diff;
4238 }
4239 }
4240
4241 if (max(before_length, after_length) > min_offset_diff)
4242 return -EINVAL;
4243
4244 if (spares < mddev->delta_disks)
4245 return -EINVAL;
4246
4247 conf->offset_diff = min_offset_diff;
4248 spin_lock_irq(&conf->device_lock);
4249 if (conf->mirrors_new) {
4250 memcpy(conf->mirrors_new, conf->mirrors,
4251 sizeof(struct raid10_info)*conf->prev.raid_disks);
4252 smp_mb();
4253 kfree(conf->mirrors_old);
4254 conf->mirrors_old = conf->mirrors;
4255 conf->mirrors = conf->mirrors_new;
4256 conf->mirrors_new = NULL;
4257 }
4258 setup_geo(&conf->geo, mddev, geo_start);
4259 smp_mb();
4260 if (mddev->reshape_backwards) {
4261 sector_t size = raid10_size(mddev, 0, 0);
4262 if (size < mddev->array_sectors) {
4263 spin_unlock_irq(&conf->device_lock);
 pr_warn("md/raid10:%s: array size must be reduced before number of disks\n",
4265 mdname(mddev));
4266 return -EINVAL;
4267 }
4268 mddev->resync_max_sectors = size;
4269 conf->reshape_progress = size;
4270 } else
4271 conf->reshape_progress = 0;
4272 conf->reshape_safe = conf->reshape_progress;
4273 spin_unlock_irq(&conf->device_lock);
4274
4275 if (mddev->delta_disks && mddev->bitmap) {
4276 ret = bitmap_resize(mddev->bitmap,
4277 raid10_size(mddev, 0,
4278 conf->geo.raid_disks),
4279 0, 0);
4280 if (ret)
4281 goto abort;
4282 }
4283 if (mddev->delta_disks > 0) {
4284 rdev_for_each(rdev, mddev)
4285 if (rdev->raid_disk < 0 &&
4286 !test_bit(Faulty, &rdev->flags)) {
4287 if (raid10_add_disk(mddev, rdev) == 0) {
4288 if (rdev->raid_disk >=
4289 conf->prev.raid_disks)
4290 set_bit(In_sync, &rdev->flags);
4291 else
4292 rdev->recovery_offset = 0;
4293
4294 if (sysfs_link_rdev(mddev, rdev))
4295 ;
4296 }
4297 } else if (rdev->raid_disk >= conf->prev.raid_disks
4298 && !test_bit(Faulty, &rdev->flags)) {
4299
4300 set_bit(In_sync, &rdev->flags);
4301 }
4302 }

 /* When a reshape changes the number of devices,
  * ->degraded is measured against the larger of the
  * pre- and post-reshape device counts.
  */
4307 spin_lock_irq(&conf->device_lock);
4308 mddev->degraded = calc_degraded(conf);
4309 spin_unlock_irq(&conf->device_lock);
4310 mddev->raid_disks = conf->geo.raid_disks;
4311 mddev->reshape_position = conf->reshape_progress;
4312 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4313
4314 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4315 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4316 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4317 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4318 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4319
4320 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4321 "reshape");
4322 if (!mddev->sync_thread) {
4323 ret = -EAGAIN;
4324 goto abort;
4325 }
4326 conf->reshape_checkpoint = jiffies;
4327 md_wakeup_thread(mddev->sync_thread);
4328 md_new_event(mddev);
4329 return 0;
4330
4331abort:
4332 mddev->recovery = 0;
4333 spin_lock_irq(&conf->device_lock);
4334 conf->geo = conf->prev;
4335 mddev->raid_disks = conf->geo.raid_disks;
4336 rdev_for_each(rdev, mddev)
4337 rdev->new_data_offset = rdev->data_offset;
4338 smp_wmb();
4339 conf->reshape_progress = MaxSector;
4340 conf->reshape_safe = MaxSector;
4341 mddev->reshape_position = MaxSector;
4342 spin_unlock_irq(&conf->device_lock);
4343 return ret;
4344}
4345

/*
 * Calculate the last device-address that could contain any block from
 * the chunk that includes the array-address 's', and report the next
 * address - i.e. the result is chunk-aligned and lies after any data
 * in the chunk containing 's'.  Used during reshape with either the
 * old or the new geometry.
 */
4352static sector_t last_dev_address(sector_t s, struct geom *geo)
4353{
4354 s = (s | geo->chunk_mask) + 1;
4355 s >>= geo->chunk_shift;
4356 s *= geo->near_copies;
4357 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4358 s *= geo->far_copies;
4359 s <<= geo->chunk_shift;
4360 return s;
4361}

/*
 * Calculate the first device-address that could contain any block from
 * the chunk that includes the array-address 's'.  This too is used
 * when reshaping, with either geometry.
 */
4367static sector_t first_dev_address(sector_t s, struct geom *geo)
4368{
4369 s >>= geo->chunk_shift;
4370 s *= geo->near_copies;
4371 sector_div(s, geo->raid_disks);
4372 s *= geo->far_copies;
4373 s <<= geo->chunk_shift;
4374 return s;
4375}
4376
4377static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4378 int *skipped)
4379{
 /* Reshape proceeds one window (at most RESYNC_BLOCK_SIZE, and never
  * more than a chunk of either geometry) at a time:
  *
  *  - read the data for the window from one device using the old
  *    ('prev') layout;
  *  - write it out to every device using the new geometry;
  *  - advance conf->reshape_progress.
  *
  * conf->reshape_progress records how far the data has been relocated,
  * and conf->reshape_safe records how far the metadata on disk knows
  * about.  Before any window whose new-layout writes could overlap
  * device ranges that the old layout still considers live (tracked via
  * the data_offset difference), and roughly every ten seconds anyway,
  * the superblock is updated and we wait for it to be written before
  * continuing ('need_flush' below).  This way an interrupted reshape
  * can be restarted without losing data.
  *
  * When reshaping 'backwards', progress runs from the end of the array
  * towards the start, so the address arithmetic below is mirrored.
  *
  * Each window is described by one r10_bio: the read is issued here
  * and, when it completes, raid10d calls reshape_request_write() to
  * issue the writes.
  */
4417 struct r10conf *conf = mddev->private;
4418 struct r10bio *r10_bio;
4419 sector_t next, safe, last;
4420 int max_sectors;
4421 int nr_sectors;
4422 int s;
4423 struct md_rdev *rdev;
4424 int need_flush = 0;
4425 struct bio *blist;
4426 struct bio *bio, *read_bio;
4427 int sectors_done = 0;
4428
4429 if (sector_nr == 0) {
4430
4431 if (mddev->reshape_backwards &&
4432 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4433 sector_nr = (raid10_size(mddev, 0, 0)
4434 - conf->reshape_progress);
4435 } else if (!mddev->reshape_backwards &&
4436 conf->reshape_progress > 0)
4437 sector_nr = conf->reshape_progress;
4438 if (sector_nr) {
4439 mddev->curr_resync_completed = sector_nr;
4440 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4441 *skipped = 1;
4442 return sector_nr;
4443 }
4444 }
4445
 /* We don't use sector_nr to track where we are up to, as that
  * doesn't work well for ->reshape_backwards; just use
  * conf->reshape_progress.
  */
4450 if (mddev->reshape_backwards) {
 /* 'next' is the lowest device address that writes for this
  * window might touch in the new layout.
  */
4454 next = first_dev_address(conf->reshape_progress - 1,
4455 &conf->geo);

 /* 'safe' bounds the device addresses that still hold old-layout
  * data which the on-disk metadata does not yet record as moved;
  * writes must not cross into that region without first updating
  * the metadata.
  */
4460 safe = last_dev_address(conf->reshape_safe - 1,
4461 &conf->prev);
4462
4463 if (next + conf->offset_diff < safe)
4464 need_flush = 1;
4465
4466 last = conf->reshape_progress - 1;
4467 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4468 & conf->prev.chunk_mask);
4469 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4470 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4471 } else {
 /* 'next' is one past the highest device address that writes for
  * this window might touch in the new layout.
  */
4475 next = last_dev_address(conf->reshape_progress, &conf->geo);

 /* 'safe' is the lowest device address still holding old-layout
  * data that the on-disk metadata does not yet record as moved.
  */
4480 safe = first_dev_address(conf->reshape_safe, &conf->prev);

 /* We need to update the metadata if 'next' might reach beyond
  * 'safe', as the writes could otherwise corrupt data that has
  * not yet been recorded as relocated.
  */
4485 if (next > safe + conf->offset_diff)
4486 need_flush = 1;
4487
4488 sector_nr = conf->reshape_progress;
4489 last = sector_nr | (conf->geo.chunk_mask
4490 & conf->prev.chunk_mask);
4491
4492 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4493 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4494 }
4495
4496 if (need_flush ||
4497 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4498
4499 wait_barrier(conf);
4500 mddev->reshape_position = conf->reshape_progress;
4501 if (mddev->reshape_backwards)
4502 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4503 - conf->reshape_progress;
4504 else
4505 mddev->curr_resync_completed = conf->reshape_progress;
4506 conf->reshape_checkpoint = jiffies;
4507 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4508 md_wakeup_thread(mddev->thread);
4509 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4510 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4511 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4512 allow_barrier(conf);
4513 return sectors_done;
4514 }
4515 conf->reshape_safe = mddev->reshape_position;
4516 allow_barrier(conf);
4517 }
4518
4519read_more:
4520
4521 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4522 r10_bio->state = 0;
4523 raise_barrier(conf, sectors_done != 0);
4524 atomic_set(&r10_bio->remaining, 0);
4525 r10_bio->mddev = mddev;
4526 r10_bio->sector = sector_nr;
4527 set_bit(R10BIO_IsReshape, &r10_bio->state);
4528 r10_bio->sectors = last - sector_nr + 1;
4529 rdev = read_balance(conf, r10_bio, &max_sectors);
4530 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4531
4532 if (!rdev) {
 /* Cannot read this window from any device: give up and abort
  * the reshape.  (Recording bad blocks on all the target devices
  * would be an alternative, but is not done here.)
  */
4537 mempool_free(r10_bio, conf->r10buf_pool);
4538 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4539 return sectors_done;
4540 }
4541
4542 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4543
4544 read_bio->bi_bdev = rdev->bdev;
4545 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4546 + rdev->data_offset);
4547 read_bio->bi_private = r10_bio;
4548 read_bio->bi_end_io = end_sync_read;
4549 read_bio->bi_rw = READ;
4550 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4551 __set_bit(BIO_UPTODATE, &read_bio->bi_flags);
4552 read_bio->bi_vcnt = 0;
4553 read_bio->bi_size = 0;
4554 r10_bio->master_bio = read_bio;
4555 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4556
4557
4558 __raid10_find_phys(&conf->geo, r10_bio);
4559
4560 blist = read_bio;
4561 read_bio->bi_next = NULL;
4562
4563 rcu_read_lock();
4564 for (s = 0; s < conf->copies*2; s++) {
4565 struct bio *b;
4566 int d = r10_bio->devs[s/2].devnum;
4567 struct md_rdev *rdev2;
4568 if (s&1) {
4569 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4570 b = r10_bio->devs[s/2].repl_bio;
4571 } else {
4572 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4573 b = r10_bio->devs[s/2].bio;
4574 }
4575 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4576 continue;
4577
4578 bio_reset(b);
4579 b->bi_bdev = rdev2->bdev;
4580 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4581 b->bi_private = r10_bio;
4582 b->bi_end_io = end_reshape_write;
4583 b->bi_rw = WRITE;
4584 b->bi_next = blist;
4585 blist = b;
4586 }

 /* Now add as many pages as possible to all of these bios. */

4590 nr_sectors = 0;
4591 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4592 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4593 int len = (max_sectors - s) << 9;
4594 if (len > PAGE_SIZE)
4595 len = PAGE_SIZE;
4596 for (bio = blist; bio ; bio = bio->bi_next) {
4597 struct bio *bio2;
4598 if (bio_add_page(bio, page, len, 0))
4599 continue;

 /* Didn't fit, must stop */
4602 for (bio2 = blist;
4603 bio2 && bio2 != bio;
4604 bio2 = bio2->bi_next) {
4605
4606 bio2->bi_vcnt--;
4607 bio2->bi_size -= len;
4608 __clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
4609 }
4610 goto bio_full;
4611 }
4612 sector_nr += len >> 9;
4613 nr_sectors += len >> 9;
4614 }
4615bio_full:
4616 rcu_read_unlock();
4617 r10_bio->sectors = nr_sectors;
4618
4619
4620 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4621 atomic_inc(&r10_bio->remaining);
4622 read_bio->bi_next = NULL;
4623 generic_make_request(read_bio);
4624 sector_nr += nr_sectors;
4625 sectors_done += nr_sectors;
4626 if (sector_nr <= last)
4627 goto read_more;

 /* Now that we have done the whole section we can
  * update reshape_progress.
  */
4632 if (mddev->reshape_backwards)
4633 conf->reshape_progress -= sectors_done;
4634 else
4635 conf->reshape_progress += sectors_done;
4636
4637 return sectors_done;
4638}
4639
4640static void end_reshape_request(struct r10bio *r10_bio);
4641static int handle_reshape_read_error(struct mddev *mddev,
4642 struct r10bio *r10_bio);
4643static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4644{
 /* The reshape read has completed; hopefully we have a block of data
  * to write out.  If the read failed, try to reassemble the data with
  * synchronous single-page reads from the other old-layout copies
  * before giving up.
  */
4650 struct r10conf *conf = mddev->private;
4651 int s;
4652
4653 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4654 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4655
4656 md_done_sync(mddev, r10_bio->sectors, 0);
4657 return;
4658 }

 /* We definitely have the data in the pages; schedule the
  * writes to every device in the new layout.
  */
4663 atomic_set(&r10_bio->remaining, 1);
4664 for (s = 0; s < conf->copies*2; s++) {
4665 struct bio *b;
4666 int d = r10_bio->devs[s/2].devnum;
4667 struct md_rdev *rdev;
4668 rcu_read_lock();
4669 if (s&1) {
4670 rdev = rcu_dereference(conf->mirrors[d].replacement);
4671 b = r10_bio->devs[s/2].repl_bio;
4672 } else {
4673 rdev = rcu_dereference(conf->mirrors[d].rdev);
4674 b = r10_bio->devs[s/2].bio;
4675 }
4676 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4677 rcu_read_unlock();
4678 continue;
4679 }
4680 atomic_inc(&rdev->nr_pending);
4681 rcu_read_unlock();
4682 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4683 atomic_inc(&r10_bio->remaining);
4684 b->bi_next = NULL;
4685 generic_make_request(b);
4686 }
4687 end_reshape_request(r10_bio);
4688}
4689
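/*
 * Called when the reshape has processed the whole array: make the new
 * geometry the only geometry, clear the reshape markers, and adjust the
 * read-ahead window for the new stripe width.
 */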
4690static void end_reshape(struct r10conf *conf)
4691{
4692 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4693 return;
4694
4695 spin_lock_irq(&conf->device_lock);
4696 conf->prev = conf->geo;
4697 md_finish_reshape(conf->mddev);
4698 smp_wmb();
4699 conf->reshape_progress = MaxSector;
4700 conf->reshape_safe = MaxSector;
4701 spin_unlock_irq(&conf->device_lock);

 /* The read-ahead size must cover two whole stripes, which is
  * 2 * (number of data disks) * chunksize.
  */
4706 if (conf->mddev->queue) {
4707 int stripe = conf->geo.raid_disks *
4708 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4709 stripe /= conf->geo.near_copies;
4710 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4711 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4712 }
4713 conf->fullsync = 0;
4714}
4715
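/*
 * The bulk reshape read failed.  Re-read the window one page at a time,
 * trying each copy in the old layout, so that a single bad sector does
 * not abort the whole reshape.  Returns 0 on success or -EIO if some
 * page could not be read from any device.
 */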
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio)
{
	/* Use sync reads to get the blocks from somewhere else */
	int sectors = r10_bio->sectors;
	struct r10conf *conf = mddev->private;
	struct {
		struct r10bio r10_bio;
		struct r10dev devs[conf->copies];
	} on_stack;
	struct r10bio *r10b = &on_stack.r10_bio;
	int slot = 0;
	int idx = 0;
	struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;

	r10b->sector = r10_bio->sector;
	__raid10_find_phys(&conf->prev, r10b);

	while (sectors) {
		int s = sectors;
		int success = 0;
		int first_slot = slot;

		if (s > (PAGE_SIZE >> 9))
			s = PAGE_SIZE >> 9;

		rcu_read_lock();
		while (!success) {
			int d = r10b->devs[slot].devnum;
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			sector_t addr;
			if (rdev == NULL ||
			    test_bit(Faulty, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				goto failed;

			/* Device address, in sectors, of page 'idx':
			 * each page covers PAGE_SIZE >> 9 sectors.
			 */
			addr = r10b->devs[slot].addr + idx * (PAGE_SIZE >> 9);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			success = sync_page_io(rdev,
					       addr,
					       s << 9,
					       bvec[idx].bv_page,
					       READ, false);
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
			if (success)
				break;
		failed:
			slot++;
			if (slot >= conf->copies)
				slot = 0;
			if (slot == first_slot)
				break;
		}
		rcu_read_unlock();
		if (!success) {
			/* couldn't read this block, must give up */
			set_bit(MD_RECOVERY_INTR,
				&mddev->recovery);
			return -EIO;
		}
		sectors -= s;
		idx++;
	}
	return 0;
}

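/*
 * Completion handler for the per-device writes issued by
 * reshape_request_write().  A failed write fails the device; either way
 * the pending count is dropped and the r10bio reference is released.
 */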
static void end_reshape_write(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct r10bio *r10_bio = bio->bi_private;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	if (!rdev) {
		smp_mb();
		rdev = conf->mirrors[d].rdev;
	}

	if (!uptodate) {
		/* Write failed: fail the device (no bad-block
		 * recording is attempted here).
		 */
		md_error(mddev, rdev);
	}

	rdev_dec_pending(rdev, mddev);
	end_reshape_request(r10_bio);
}

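/*
 * Drop one reference on a reshape r10bio; the final reference reports
 * the completed sectors to the sync machinery and frees the buffers.
 */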
static void end_reshape_request(struct r10bio *r10_bio)
{
	if (!atomic_dec_and_test(&r10_bio->remaining))
		return;
	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
	bio_put(r10_bio->master_bio);
	put_buf(r10_bio);
}

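/*
 * Called by the MD core when the reshape has finished (unless it was
 * interrupted): grow the array size when disks were added, or mark the
 * now-unused tail devices out of sync when disks were removed, then
 * fold the new geometry into the standard mddev fields.
 */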
static void raid10_finish_reshape(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		return;

	if (mddev->delta_disks > 0) {
		sector_t size = raid10_size(mddev, 0, 0);
		md_set_array_sectors(mddev, size);
		if (mddev->recovery_cp > mddev->resync_max_sectors) {
			mddev->recovery_cp = mddev->resync_max_sectors;
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		}
		mddev->resync_max_sectors = size;
		if (mddev->queue) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	} else {
		/* Shrinking: the devices beyond the new end of the
		 * array are no longer part of it.
		 */
		int d;
		rcu_read_lock();
		for (d = conf->geo.raid_disks ;
		     d < conf->geo.raid_disks - mddev->delta_disks;
		     d++) {
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
		}
		rcu_read_unlock();
	}
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
	mddev->reshape_position = MaxSector;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
}

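/*
 * Entry points the MD core uses to drive this personality; registered
 * at module load time via register_md_personality().
 */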
static struct md_personality raid10_personality =
{
	.name		= "raid10",
	.level		= 10,
	.owner		= THIS_MODULE,
	.make_request	= raid10_make_request,
	.run		= raid10_run,
	.free		= raid10_free,
	.status		= raid10_status,
	.error_handler	= raid10_error,
	.hot_add_disk	= raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= raid10_sync_request,
	.quiesce	= raid10_quiesce,
	.size		= raid10_size,
	.resize		= raid10_resize,
	.takeover	= raid10_takeover,
	.check_reshape	= raid10_check_reshape,
	.start_reshape	= raid10_start_reshape,
	.finish_reshape	= raid10_finish_reshape,
	.congested	= raid10_congested,
	.mergeable_bvec	= raid10_mergeable_bvec,
};

static int __init raid_init(void)
{
	return register_md_personality(&raid10_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9");
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");

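/*
 * Bounds the number of write requests allowed to queue up before the
 * array reports itself congested and new writers wait (see
 * raid10_congested()).  Writable at runtime through
 * /sys/module/raid10/parameters/max_queued_requests.
 */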
module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);