/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * RAID-10 support for md.
 *
 * Base on code in raid1.c.  See raid1.c for further copyright information.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *    use_far_sets_bugfixed (stored in bit 18 of layout)
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.  In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are
 * stored (each on a different drive).  The starting device for each section
 * is offset near_copies from the starting device of the previous section.
 * Thus there are (near_copies * far_copies) of each chunk, and each is on a
 * different drive.  near_copies and far_copies must be at least one, and
 * their product is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays
 * confined to the set rather than the entire array.  This is done to improve
 * the number of device combinations that can fail without causing the array
 * to fail.
 */

#define NR_RAID10_BIOS 256

/* When we get a read error on a read-only array, we redirect to another
 * device without failing the first device, or trying to over-write to
 * correct the read error.  To keep track of bad blocks on a per-bio
 * level, we store IO_BLOCKED in the appropriate 'bios' pointer.
 */
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad-block, we need to remove the
 * bad-block marking which must be done from process context.  So we record
 * the success by setting devs[n].bio to IO_MADE_GOOD.
 */
#define IO_MADE_GOOD ((struct bio *)2)

#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)

/* When there are this many requests queued to be written by
 * the raid10 thread, we become 'congested' to provide back-pressure
 * for writeback.
 */
static int max_queued_requests = 1024;

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

#define raid10_log(md, fmt, args...) \
	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
112
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int size = offsetof(struct r10bio, devs[conf->copies]);

	/* allocate a r10bio with room for raid_disks entries in the
	 * bios array */
	return kzalloc(size, gfp_flags);
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

#define RESYNC_WINDOW (1024*1024)

#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf).
 */
143static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
144{
145 struct r10conf *conf = data;
146 struct page *page;
147 struct r10bio *r10_bio;
148 struct bio *bio;
149 int i, j;
150 int nalloc;
151
152 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
153 if (!r10_bio)
154 return NULL;
155
156 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
157 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
158 nalloc = conf->copies;
159 else
160 nalloc = 2;

	/*
	 * Allocate bios.
	 */
165 for (j = nalloc ; j-- ; ) {
166 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
167 if (!bio)
168 goto out_free_bio;
169 r10_bio->devs[j].bio = bio;
170 if (!conf->have_replacement)
171 continue;
172 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
173 if (!bio)
174 goto out_free_bio;
175 r10_bio->devs[j].repl_bio = bio;
176 }

	/*
	 * Allocate RESYNC_PAGES data pages and attach them where needed.
	 */
181 for (j = 0 ; j < nalloc; j++) {
182 struct bio *rbio = r10_bio->devs[j].repl_bio;
183 bio = r10_bio->devs[j].bio;
184 for (i = 0; i < RESYNC_PAGES; i++) {
185 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
186 &conf->mddev->recovery)) {
187
188
189 struct bio *rbio = r10_bio->devs[0].bio;
190 page = rbio->bi_io_vec[i].bv_page;
191 get_page(page);
192 } else
193 page = alloc_page(gfp_flags);
194 if (unlikely(!page))
195 goto out_free_pages;
196
197 bio->bi_io_vec[i].bv_page = page;
198 if (rbio)
199 rbio->bi_io_vec[i].bv_page = page;
200 }
201 }
202
203 return r10_bio;
204
205out_free_pages:
206 for ( ; i > 0 ; i--)
207 safe_put_page(bio->bi_io_vec[i-1].bv_page);
208 while (j--)
209 for (i = 0; i < RESYNC_PAGES ; i++)
210 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
211 j = 0;
212out_free_bio:
213 for ( ; j < nalloc; j++) {
214 if (r10_bio->devs[j].bio)
215 bio_put(r10_bio->devs[j].bio);
216 if (r10_bio->devs[j].repl_bio)
217 bio_put(r10_bio->devs[j].repl_bio);
218 }
219 r10bio_pool_free(r10_bio, conf);
220 return NULL;
221}
222
223static void r10buf_pool_free(void *__r10_bio, void *data)
224{
225 int i;
226 struct r10conf *conf = data;
227 struct r10bio *r10bio = __r10_bio;
228 int j;
229
230 for (j=0; j < conf->copies; j++) {
231 struct bio *bio = r10bio->devs[j].bio;
232 if (bio) {
233 for (i = 0; i < RESYNC_PAGES; i++) {
234 safe_put_page(bio->bi_io_vec[i].bv_page);
235 bio->bi_io_vec[i].bv_page = NULL;
236 }
237 bio_put(bio);
238 }
239 bio = r10bio->devs[j].repl_bio;
240 if (bio)
241 bio_put(bio);
242 }
243 r10bio_pool_free(r10bio, conf);
244}
245
246static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
247{
248 int i;
249
250 for (i = 0; i < conf->copies; i++) {
251 struct bio **bio = & r10_bio->devs[i].bio;
252 if (!BIO_SPECIAL(*bio))
253 bio_put(*bio);
254 *bio = NULL;
255 bio = &r10_bio->devs[i].repl_bio;
256 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
257 bio_put(*bio);
258 *bio = NULL;
259 }
260}
261
262static void free_r10bio(struct r10bio *r10_bio)
263{
264 struct r10conf *conf = r10_bio->mddev->private;
265
266 put_all_bios(conf, r10_bio);
267 mempool_free(r10_bio, conf->r10bio_pool);
268}
269
270static void put_buf(struct r10bio *r10_bio)
271{
272 struct r10conf *conf = r10_bio->mddev->private;
273
274 mempool_free(r10_bio, conf->r10buf_pool);
275
276 lower_barrier(conf);
277}
278
279static void reschedule_retry(struct r10bio *r10_bio)
280{
281 unsigned long flags;
282 struct mddev *mddev = r10_bio->mddev;
283 struct r10conf *conf = mddev->private;
284
285 spin_lock_irqsave(&conf->device_lock, flags);
286 list_add(&r10_bio->retry_list, &conf->retry_list);
287 conf->nr_queued ++;
288 spin_unlock_irqrestore(&conf->device_lock, flags);
289
290
291 wake_up(&conf->wait_barrier);
292
293 md_wakeup_thread(mddev->thread);
294}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
301static void raid_end_bio_io(struct r10bio *r10_bio)
302{
303 struct bio *bio = r10_bio->master_bio;
304 int done;
305 struct r10conf *conf = r10_bio->mddev->private;
306
307 if (bio->bi_phys_segments) {
308 unsigned long flags;
309 spin_lock_irqsave(&conf->device_lock, flags);
310 bio->bi_phys_segments--;
311 done = (bio->bi_phys_segments == 0);
312 spin_unlock_irqrestore(&conf->device_lock, flags);
313 } else
314 done = 1;
315 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
316 bio->bi_error = -EIO;
317 if (done) {
318 bio_endio(bio);
		/*
		 * Wake up any possible resync thread that waits for the
		 * device to go idle.
		 */
323 allow_barrier(conf);
324 }
325 free_r10bio(r10_bio);
326}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
331static inline void update_head_pos(int slot, struct r10bio *r10_bio)
332{
333 struct r10conf *conf = r10_bio->mddev->private;
334
335 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
336 r10_bio->devs[slot].addr + (r10_bio->sectors);
337}

/*
 * Find the disk number which triggered given bio
 */
342static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
343 struct bio *bio, int *slotp, int *replp)
344{
345 int slot;
346 int repl = 0;
347
348 for (slot = 0; slot < conf->copies; slot++) {
349 if (r10_bio->devs[slot].bio == bio)
350 break;
351 if (r10_bio->devs[slot].repl_bio == bio) {
352 repl = 1;
353 break;
354 }
355 }
356
357 BUG_ON(slot == conf->copies);
358 update_head_pos(slot, r10_bio);
359
360 if (slotp)
361 *slotp = slot;
362 if (replp)
363 *replp = repl;
364 return r10_bio->devs[slot].devnum;
365}
366
367static void raid10_end_read_request(struct bio *bio)
368{
369 int uptodate = !bio->bi_error;
370 struct r10bio *r10_bio = bio->bi_private;
371 int slot, dev;
372 struct md_rdev *rdev;
373 struct r10conf *conf = r10_bio->mddev->private;
374
375 slot = r10_bio->read_slot;
376 dev = r10_bio->devs[slot].devnum;
377 rdev = r10_bio->devs[slot].rdev;
378
379
380
381 update_head_pos(slot, r10_bio);
382
383 if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
393 set_bit(R10BIO_Uptodate, &r10_bio->state);
394 } else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry".
		 */
400 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
401 rdev->raid_disk))
402 uptodate = 1;
403 }
404 if (uptodate) {
405 raid_end_bio_io(r10_bio);
406 rdev_dec_pending(rdev, conf->mddev);
407 } else {
408
409
410
411 char b[BDEVNAME_SIZE];
412 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
413 mdname(conf->mddev),
414 bdevname(rdev->bdev, b),
415 (unsigned long long)r10_bio->sector);
416 set_bit(R10BIO_ReadError, &r10_bio->state);
417 reschedule_retry(r10_bio);
418 }
419}
420
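/*
 * Called once per r10_bio when all of its writes have completed or been
 * abandoned: clear the in-flight range in the write-intent bitmap and drop
 * the md_write_start() reference taken for this request.
 */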
421static void close_write(struct r10bio *r10_bio)
422{
423
424 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
425 r10_bio->sectors,
426 !test_bit(R10BIO_Degraded, &r10_bio->state),
427 0);
428 md_write_end(r10_bio->mddev);
429}
430
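/*
 * Drop one reference on r10_bio->remaining; the last writer either hands the
 * r10_bio to raid10d (write-error or bad-block bookkeeping still needed) or
 * completes it immediately.
 */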
431static void one_write_done(struct r10bio *r10_bio)
432{
433 if (atomic_dec_and_test(&r10_bio->remaining)) {
434 if (test_bit(R10BIO_WriteError, &r10_bio->state))
435 reschedule_retry(r10_bio);
436 else {
437 close_write(r10_bio);
438 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
439 reschedule_retry(r10_bio);
440 else
441 raid_end_bio_io(r10_bio);
442 }
443 }
444}
445
446static void raid10_end_write_request(struct bio *bio)
447{
448 struct r10bio *r10_bio = bio->bi_private;
449 int dev;
450 int dec_rdev = 1;
451 struct r10conf *conf = r10_bio->mddev->private;
452 int slot, repl;
453 struct md_rdev *rdev = NULL;
454 struct bio *to_put = NULL;
455 bool discard_error;
456
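	/*
	 * Note whether this is a failed DISCARD; such failures are not
	 * treated as device errors below.
	 */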
457 discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
458
459 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
460
461 if (repl)
462 rdev = conf->mirrors[dev].replacement;
463 if (!rdev) {
464 smp_rmb();
465 repl = 0;
466 rdev = conf->mirrors[dev].rdev;
467 }
468
469
470
471 if (bio->bi_error && !discard_error) {
472 if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
476 md_error(rdev->mddev, rdev);
477 else {
478 set_bit(WriteErrorSeen, &rdev->flags);
479 if (!test_and_set_bit(WantReplacement, &rdev->flags))
480 set_bit(MD_RECOVERY_NEEDED,
481 &rdev->mddev->recovery);
482
483 dec_rdev = 0;
484 if (test_bit(FailFast, &rdev->flags) &&
485 (bio->bi_opf & MD_FAILFAST)) {
486 md_error(rdev->mddev, rdev);
487 if (!test_bit(Faulty, &rdev->flags))
488
489
490
491
492 set_bit(R10BIO_WriteError, &r10_bio->state);
493 else {
494 r10_bio->devs[slot].bio = NULL;
495 to_put = bio;
496 dec_rdev = 1;
497 }
498 } else
499 set_bit(R10BIO_WriteError, &r10_bio->state);
500 }
501 } else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		sector_t first_bad;
		int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty.  We cannot trust such a device to
		 * hold valid data, so reads must not be redirected to this
		 * copy on the strength of this write.
		 */
522 if (test_bit(In_sync, &rdev->flags) &&
523 !test_bit(Faulty, &rdev->flags))
524 set_bit(R10BIO_Uptodate, &r10_bio->state);
525
526
527 if (is_badblock(rdev,
528 r10_bio->devs[slot].addr,
529 r10_bio->sectors,
530 &first_bad, &bad_sectors) && !discard_error) {
531 bio_put(bio);
532 if (repl)
533 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
534 else
535 r10_bio->devs[slot].bio = IO_MADE_GOOD;
536 dec_rdev = 0;
537 set_bit(R10BIO_MadeGood, &r10_bio->state);
538 }
539 }

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */

546 one_write_done(r10_bio);
547 if (dec_rdev)
548 rdev_dec_pending(rdev, conf->mddev);
549 if (to_put)
550 bio_put(to_put);
551}

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the one device.
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address.
 */

578static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
579{
580 int n,f;
581 sector_t sector;
582 sector_t chunk;
583 sector_t stripe;
584 int dev;
585 int slot = 0;
586 int last_far_set_start, last_far_set_size;
587
588 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
589 last_far_set_start *= geo->far_set_size;
590
591 last_far_set_size = geo->far_set_size;
592 last_far_set_size += (geo->raid_disks % geo->far_set_size);
593
594
595 chunk = r10bio->sector >> geo->chunk_shift;
596 sector = r10bio->sector & geo->chunk_mask;
597
598 chunk *= geo->near_copies;
599 stripe = chunk;
600 dev = sector_div(stripe, geo->raid_disks);
601 if (geo->far_offset)
602 stripe *= geo->far_copies;
603
604 sector += stripe << geo->chunk_shift;
605
606
607 for (n = 0; n < geo->near_copies; n++) {
608 int d = dev;
609 int set;
610 sector_t s = sector;
611 r10bio->devs[slot].devnum = d;
612 r10bio->devs[slot].addr = s;
613 slot++;
614
615 for (f = 1; f < geo->far_copies; f++) {
616 set = d / geo->far_set_size;
617 d += geo->near_copies;
618
619 if ((geo->raid_disks % geo->far_set_size) &&
620 (d > last_far_set_start)) {
621 d -= last_far_set_start;
622 d %= last_far_set_size;
623 d += last_far_set_start;
624 } else {
625 d %= geo->far_set_size;
626 d += geo->far_set_size * set;
627 }
628 s += geo->stride;
629 r10bio->devs[slot].devnum = d;
630 r10bio->devs[slot].addr = s;
631 slot++;
632 }
633 dev++;
634 if (dev >= geo->raid_disks) {
635 dev = 0;
636 sector += (geo->chunk_mask + 1);
637 }
638 }
639}
640
641static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
642{
643 struct geom *geo = &conf->geo;
644
645 if (conf->reshape_progress != MaxSector &&
646 ((r10bio->sector >= conf->reshape_progress) !=
647 conf->mddev->reshape_backwards)) {
648 set_bit(R10BIO_Previous, &r10bio->state);
649 geo = &conf->prev;
650 } else
651 clear_bit(R10BIO_Previous, &r10bio->state);
652
653 __raid10_find_phys(geo, r10bio);
654}
655
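/*
 * Map a (device, device-sector) pair back to the array's virtual sector:
 * the inverse of __raid10_find_phys() for a single copy.
 */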
656static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
657{
658 sector_t offset, chunk, vchunk;
659
660
661
662 struct geom *geo = &conf->geo;
663 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
664 int far_set_size = geo->far_set_size;
665 int last_far_set_start;
666
667 if (geo->raid_disks % geo->far_set_size) {
668 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
669 last_far_set_start *= geo->far_set_size;
670
671 if (dev >= last_far_set_start) {
672 far_set_size = geo->far_set_size;
673 far_set_size += (geo->raid_disks % geo->far_set_size);
674 far_set_start = last_far_set_start;
675 }
676 }
677
678 offset = sector & geo->chunk_mask;
679 if (geo->far_offset) {
680 int fc;
681 chunk = sector >> geo->chunk_shift;
682 fc = sector_div(chunk, geo->far_copies);
683 dev -= fc * geo->near_copies;
684 if (dev < far_set_start)
685 dev += far_set_size;
686 } else {
687 while (sector >= geo->stride) {
688 sector -= geo->stride;
689 if (dev < (geo->near_copies + far_set_start))
690 dev += far_set_size - geo->near_copies;
691 else
692 dev -= geo->near_copies;
693 }
694 chunk = sector >> geo->chunk_shift;
695 }
696 vchunk = chunk * geo->raid_disks + dev;
697 sector_div(vchunk, geo->near_copies);
698 return (vchunk << geo->chunk_shift) + offset;
699}

/*
 * This routine returns the disk from which the requested read should
 * be done.
 *
 * If no resync is in progress we balance the reads: a device with no
 * pending IO is preferred on 'near' layouts, otherwise we pick the disk
 * whose last known head position is closest to the requested sector
 * (head position is maintained from IRQ context by the completion
 * handlers).  Devices with unacknowledged bad blocks covering the range
 * are avoided where possible, and *max_sectors is reduced so the caller
 * never reads into a bad range.
 *
 * While a resync is active and the request is above the resync window,
 * no balancing is done and the first readable disk is used.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
720static struct md_rdev *read_balance(struct r10conf *conf,
721 struct r10bio *r10_bio,
722 int *max_sectors)
723{
724 const sector_t this_sector = r10_bio->sector;
725 int disk, slot;
726 int sectors = r10_bio->sectors;
727 int best_good_sectors;
728 sector_t new_distance, best_dist;
729 struct md_rdev *best_rdev, *rdev = NULL;
730 int do_balance;
731 int best_slot;
732 struct geom *geo = &conf->geo;
733
734 raid10_find_phys(conf, r10_bio);
735 rcu_read_lock();
736 sectors = r10_bio->sectors;
737 best_slot = -1;
738 best_rdev = NULL;
739 best_dist = MaxSector;
740 best_good_sectors = 0;
741 do_balance = 1;
742 clear_bit(R10BIO_FailFast, &r10_bio->state);
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
749 if (conf->mddev->recovery_cp < MaxSector
750 && (this_sector + sectors >= conf->next_resync))
751 do_balance = 0;
752
753 for (slot = 0; slot < conf->copies ; slot++) {
754 sector_t first_bad;
755 int bad_sectors;
756 sector_t dev_sector;
757
758 if (r10_bio->devs[slot].bio == IO_BLOCKED)
759 continue;
760 disk = r10_bio->devs[slot].devnum;
761 rdev = rcu_dereference(conf->mirrors[disk].replacement);
762 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
763 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
764 rdev = rcu_dereference(conf->mirrors[disk].rdev);
765 if (rdev == NULL ||
766 test_bit(Faulty, &rdev->flags))
767 continue;
768 if (!test_bit(In_sync, &rdev->flags) &&
769 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
770 continue;
771
772 dev_sector = r10_bio->devs[slot].addr;
773 if (is_badblock(rdev, dev_sector, sectors,
774 &first_bad, &bad_sectors)) {
775 if (best_dist < MaxSector)
776
777 continue;
778 if (first_bad <= dev_sector) {
779
780
781
782
783 bad_sectors -= (dev_sector - first_bad);
784 if (!do_balance && sectors > bad_sectors)
785 sectors = bad_sectors;
786 if (best_good_sectors > sectors)
787 best_good_sectors = sectors;
788 } else {
789 sector_t good_sectors =
790 first_bad - dev_sector;
791 if (good_sectors > best_good_sectors) {
792 best_good_sectors = good_sectors;
793 best_slot = slot;
794 best_rdev = rdev;
795 }
796 if (!do_balance)
797
798 break;
799 }
800 continue;
801 } else
802 best_good_sectors = sectors;
803
804 if (!do_balance)
805 break;
806
807 if (best_slot >= 0)
808
809 set_bit(R10BIO_FailFast, &r10_bio->state);
810
811
812
813
814 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
815 new_distance = 0;
816
817
818 else if (geo->far_copies > 1)
819 new_distance = r10_bio->devs[slot].addr;
820 else
821 new_distance = abs(r10_bio->devs[slot].addr -
822 conf->mirrors[disk].head_position);
823 if (new_distance < best_dist) {
824 best_dist = new_distance;
825 best_slot = slot;
826 best_rdev = rdev;
827 }
828 }
829 if (slot >= conf->copies) {
830 slot = best_slot;
831 rdev = best_rdev;
832 }
833
834 if (slot >= 0) {
835 atomic_inc(&rdev->nr_pending);
836 r10_bio->read_slot = slot;
837 } else
838 rdev = NULL;
839 rcu_read_unlock();
840 *max_sectors = best_good_sectors;
841
842 return rdev;
843}
844
845static int raid10_congested(struct mddev *mddev, int bits)
846{
847 struct r10conf *conf = mddev->private;
848 int i, ret = 0;
849
850 if ((bits & (1 << WB_async_congested)) &&
851 conf->pending_count >= max_queued_requests)
852 return 1;
853
854 rcu_read_lock();
855 for (i = 0;
856 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
857 && ret == 0;
858 i++) {
859 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
860 if (rdev && !test_bit(Faulty, &rdev->flags)) {
861 struct request_queue *q = bdev_get_queue(rdev->bdev);
862
863 ret |= bdi_congested(q->backing_dev_info, bits);
864 }
865 }
866 rcu_read_unlock();
867 return ret;
868}
869
870static void flush_pending_writes(struct r10conf *conf)
871{
872
873
874
875 spin_lock_irq(&conf->device_lock);
876
877 if (conf->pending_bio_list.head) {
878 struct bio *bio;
879 bio = bio_list_get(&conf->pending_bio_list);
880 conf->pending_count = 0;
881 spin_unlock_irq(&conf->device_lock);
882
883
884 bitmap_unplug(conf->mddev->bitmap);
885 wake_up(&conf->wait_barrier);
886
887 while (bio) {
888 struct bio *next = bio->bi_next;
889 struct md_rdev *rdev = (void*)bio->bi_bdev;
890 bio->bi_next = NULL;
891 bio->bi_bdev = rdev->bdev;
892 if (test_bit(Faulty, &rdev->flags)) {
893 bio->bi_error = -EIO;
894 bio_endio(bio);
895 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
896 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
897
898 bio_endio(bio);
899 else
900 generic_make_request(bio);
901 bio = next;
902 }
903 } else
904 spin_unlock_irq(&conf->device_lock);
905}

/*
 * Barriers:
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reshape.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO,
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls wait_barrier().  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier() when it has finished its IO.
 * Background IO must call raise_barrier().  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier() when the particular background IO completes.
 */
929static void raise_barrier(struct r10conf *conf, int force)
930{
931 BUG_ON(force && !conf->barrier);
932 spin_lock_irq(&conf->resync_lock);
933
934
935 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
936 conf->resync_lock);
937
938
939 conf->barrier++;
940
941
942 wait_event_lock_irq(conf->wait_barrier,
943 !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
944 conf->resync_lock);
945
946 spin_unlock_irq(&conf->resync_lock);
947}
948
949static void lower_barrier(struct r10conf *conf)
950{
951 unsigned long flags;
952 spin_lock_irqsave(&conf->resync_lock, flags);
953 conf->barrier--;
954 spin_unlock_irqrestore(&conf->resync_lock, flags);
955 wake_up(&conf->wait_barrier);
956}
957
958static void wait_barrier(struct r10conf *conf)
959{
960 spin_lock_irq(&conf->resync_lock);
961 if (conf->barrier) {
962 conf->nr_waiting++;
		/*
		 * Wait for the barrier to drop.
		 * However, if there are already pending requests
		 * (preventing the barrier from rising) and those requests
		 * are sitting behind us in current->bio_list (i.e. they were
		 * queued by this task and cannot be submitted until we
		 * return), waiting here would deadlock, so we let this
		 * request through instead.
		 */
972 raid10_log(conf->mddev, "wait barrier");
973 wait_event_lock_irq(conf->wait_barrier,
974 !conf->barrier ||
975 (atomic_read(&conf->nr_pending) &&
976 current->bio_list &&
977 (!bio_list_empty(¤t->bio_list[0]) ||
978 !bio_list_empty(¤t->bio_list[1]))),
979 conf->resync_lock);
980 conf->nr_waiting--;
981 if (!conf->nr_waiting)
982 wake_up(&conf->wait_barrier);
983 }
984 atomic_inc(&conf->nr_pending);
985 spin_unlock_irq(&conf->resync_lock);
986}
987
988static void allow_barrier(struct r10conf *conf)
989{
990 if ((atomic_dec_and_test(&conf->nr_pending)) ||
991 (conf->array_freeze_pending))
992 wake_up(&conf->wait_barrier);
993}
994
995static void freeze_array(struct r10conf *conf, int extra)
996{
	/* Stop sync IO and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending matches nr_queued+extra.
	 * This is called in the context of one normal IO request
	 * that has failed. Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
1009 spin_lock_irq(&conf->resync_lock);
1010 conf->array_freeze_pending++;
1011 conf->barrier++;
1012 conf->nr_waiting++;
1013 wait_event_lock_irq_cmd(conf->wait_barrier,
1014 atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
1015 conf->resync_lock,
1016 flush_pending_writes(conf));
1017
1018 conf->array_freeze_pending--;
1019 spin_unlock_irq(&conf->resync_lock);
1020}
1021
1022static void unfreeze_array(struct r10conf *conf)
1023{
1024
1025 spin_lock_irq(&conf->resync_lock);
1026 conf->barrier--;
1027 conf->nr_waiting--;
1028 wake_up(&conf->wait_barrier);
1029 spin_unlock_irq(&conf->resync_lock);
1030}
1031
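/*
 * During a reshape the old and new layouts live at different data offsets;
 * pick the offset that matches the region this r10_bio targets.
 */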
1032static sector_t choose_data_offset(struct r10bio *r10_bio,
1033 struct md_rdev *rdev)
1034{
1035 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1036 test_bit(R10BIO_Previous, &r10_bio->state))
1037 return rdev->data_offset;
1038 else
1039 return rdev->new_data_offset;
1040}
1041
1042struct raid10_plug_cb {
1043 struct blk_plug_cb cb;
1044 struct bio_list pending;
1045 int pending_cnt;
1046};
1047
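/*
 * blk-plug callback: on unplug, either hand the batched writes to raid10d
 * (when called from schedule() or while another bio is still being built)
 * or issue them directly.
 */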
1048static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1049{
1050 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1051 cb);
1052 struct mddev *mddev = plug->cb.data;
1053 struct r10conf *conf = mddev->private;
1054 struct bio *bio;
1055
1056 if (from_schedule || current->bio_list) {
1057 spin_lock_irq(&conf->device_lock);
1058 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1059 conf->pending_count += plug->pending_cnt;
1060 spin_unlock_irq(&conf->device_lock);
1061 wake_up(&conf->wait_barrier);
1062 md_wakeup_thread(mddev->thread);
1063 kfree(plug);
1064 return;
1065 }
1066
1067
1068 bio = bio_list_get(&plug->pending);
1069 bitmap_unplug(mddev->bitmap);
1070 wake_up(&conf->wait_barrier);
1071
1072 while (bio) {
1073 struct bio *next = bio->bi_next;
1074 struct md_rdev *rdev = (void*)bio->bi_bdev;
1075 bio->bi_next = NULL;
1076 bio->bi_bdev = rdev->bdev;
1077 if (test_bit(Faulty, &rdev->flags)) {
1078 bio->bi_error = -EIO;
1079 bio_endio(bio);
1080 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1081 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
1082
1083 bio_endio(bio);
1084 else
1085 generic_make_request(bio);
1086 bio = next;
1087 }
1088 kfree(plug);
1089}
1090
1091static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1092 struct r10bio *r10_bio)
1093{
1094 struct r10conf *conf = mddev->private;
1095 struct bio *read_bio;
1096 const int op = bio_op(bio);
1097 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1098 int sectors_handled;
1099 int max_sectors;
1100 sector_t sectors;
1101 struct md_rdev *rdev;
1102 int slot;

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a barrier for new requests.
	 * Continue immediately if no resync is active currently.
	 */
1109 wait_barrier(conf);
1110
1111 sectors = bio_sectors(bio);
1112 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1113 bio->bi_iter.bi_sector < conf->reshape_progress &&
1114 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
		/*
		 * IO spans the reshape position.  Need to
		 * wait for reshape to pass.
		 */
1119 raid10_log(conf->mddev, "wait reshape");
1120 allow_barrier(conf);
1121 wait_event(conf->wait_barrier,
1122 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1123 conf->reshape_progress >= bio->bi_iter.bi_sector +
1124 sectors);
1125 wait_barrier(conf);
1126 }
1127
1128read_again:
1129 rdev = read_balance(conf, r10_bio, &max_sectors);
1130 if (!rdev) {
1131 raid_end_bio_io(r10_bio);
1132 return;
1133 }
1134 slot = r10_bio->read_slot;
1135
1136 read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
1137 bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
1138 max_sectors);
1139
1140 r10_bio->devs[slot].bio = read_bio;
1141 r10_bio->devs[slot].rdev = rdev;
1142
1143 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1144 choose_data_offset(r10_bio, rdev);
1145 read_bio->bi_bdev = rdev->bdev;
1146 read_bio->bi_end_io = raid10_end_read_request;
1147 bio_set_op_attrs(read_bio, op, do_sync);
1148 if (test_bit(FailFast, &rdev->flags) &&
1149 test_bit(R10BIO_FailFast, &r10_bio->state))
1150 read_bio->bi_opf |= MD_FAILFAST;
1151 read_bio->bi_private = r10_bio;
1152
1153 if (mddev->gendisk)
1154 trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
1155 read_bio, disk_devt(mddev->gendisk),
1156 r10_bio->sector);
1157 if (max_sectors < r10_bio->sectors) {
1158
1159
1160
1161
1162 sectors_handled = (r10_bio->sector + max_sectors
1163 - bio->bi_iter.bi_sector);
1164 r10_bio->sectors = max_sectors;
1165 spin_lock_irq(&conf->device_lock);
1166 if (bio->bi_phys_segments == 0)
1167 bio->bi_phys_segments = 2;
1168 else
1169 bio->bi_phys_segments++;
1170 spin_unlock_irq(&conf->device_lock);

		/*
		 * Cannot call generic_make_request directly as that will be
		 * queued in __generic_make_request and subsequent
		 * mempool_alloc might block waiting for it.  So hand the bio
		 * over to raid10d.
		 */
1177 reschedule_retry(r10_bio);
1178
1179 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1180
1181 r10_bio->master_bio = bio;
1182 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1183 r10_bio->state = 0;
1184 r10_bio->mddev = mddev;
1185 r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1186 goto read_again;
1187 } else
1188 generic_make_request(read_bio);
1189 return;
1190}
1191
1192static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1193 struct r10bio *r10_bio)
1194{
1195 struct r10conf *conf = mddev->private;
1196 int i;
1197 const int op = bio_op(bio);
1198 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1199 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1200 unsigned long flags;
1201 struct md_rdev *blocked_rdev;
1202 struct blk_plug_cb *cb;
1203 struct raid10_plug_cb *plug = NULL;
1204 sector_t sectors;
1205 int sectors_handled;
1206 int max_sectors;
1207
1208 md_write_start(mddev, bio);

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a barrier for new requests.
	 * Continue immediately if no resync is active currently.
	 */
1215 wait_barrier(conf);
1216
1217 sectors = bio_sectors(bio);
1218 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1219 bio->bi_iter.bi_sector < conf->reshape_progress &&
1220 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1221
1222
1223
1224
1225 raid10_log(conf->mddev, "wait reshape");
1226 allow_barrier(conf);
1227 wait_event(conf->wait_barrier,
1228 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1229 conf->reshape_progress >= bio->bi_iter.bi_sector +
1230 sectors);
1231 wait_barrier(conf);
1232 }
1233
1234 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1235 (mddev->reshape_backwards
1236 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1237 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1238 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1239 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1240
1241 mddev->reshape_position = conf->reshape_progress;
1242 set_mask_bits(&mddev->sb_flags, 0,
1243 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1244 md_wakeup_thread(mddev->thread);
1245 raid10_log(conf->mddev, "wait reshape metadata");
1246 wait_event(mddev->sb_wait,
1247 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1248
1249 conf->reshape_safe = mddev->reshape_position;
1250 }
1251
1252 if (conf->pending_count >= max_queued_requests) {
1253 md_wakeup_thread(mddev->thread);
1254 raid10_log(mddev, "wait queued");
1255 wait_event(conf->wait_barrier,
1256 conf->pending_count < max_queued_requests);
1257 }

	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio.
	 * If there are known/acknowledged bad blocks on any device
	 * on which we have seen a write error, we want to avoid
	 * writing to those blocks.  This potentially requires several
	 * writes to write around the bad blocks.  Each set of writes
	 * gets its own r10_bio with a set of bios attached.  The number
	 * of r10_bios is recorded in bio->bi_phys_segments just as with
	 * the read case.
	 */
1270 r10_bio->read_slot = -1;
1271 raid10_find_phys(conf, r10_bio);
1272retry_write:
1273 blocked_rdev = NULL;
1274 rcu_read_lock();
1275 max_sectors = r10_bio->sectors;
1276
1277 for (i = 0; i < conf->copies; i++) {
1278 int d = r10_bio->devs[i].devnum;
1279 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1280 struct md_rdev *rrdev = rcu_dereference(
1281 conf->mirrors[d].replacement);
1282 if (rdev == rrdev)
1283 rrdev = NULL;
1284 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1285 atomic_inc(&rdev->nr_pending);
1286 blocked_rdev = rdev;
1287 break;
1288 }
1289 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1290 atomic_inc(&rrdev->nr_pending);
1291 blocked_rdev = rrdev;
1292 break;
1293 }
1294 if (rdev && (test_bit(Faulty, &rdev->flags)))
1295 rdev = NULL;
1296 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1297 rrdev = NULL;
1298
1299 r10_bio->devs[i].bio = NULL;
1300 r10_bio->devs[i].repl_bio = NULL;
1301
1302 if (!rdev && !rrdev) {
1303 set_bit(R10BIO_Degraded, &r10_bio->state);
1304 continue;
1305 }
1306 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1307 sector_t first_bad;
1308 sector_t dev_sector = r10_bio->devs[i].addr;
1309 int bad_sectors;
1310 int is_bad;
1311
1312 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1313 &first_bad, &bad_sectors);
1314 if (is_bad < 0) {
1315
1316
1317
1318 atomic_inc(&rdev->nr_pending);
1319 set_bit(BlockedBadBlocks, &rdev->flags);
1320 blocked_rdev = rdev;
1321 break;
1322 }
1323 if (is_bad && first_bad <= dev_sector) {
1324
1325 bad_sectors -= (dev_sector - first_bad);
1326 if (bad_sectors < max_sectors)
1327
1328
1329
1330 max_sectors = bad_sectors;
				/*
				 * We don't set R10BIO_Degraded as that
				 * only applies if the disk is missing,
				 * so it might be re-added, and we want to
				 * know to recover this chunk.
				 * In this case the device is here, and the
				 * fact that this chunk is not in-sync is
				 * recorded in the bad block log.
				 */
1339 continue;
1340 }
1341 if (is_bad) {
1342 int good_sectors = first_bad - dev_sector;
1343 if (good_sectors < max_sectors)
1344 max_sectors = good_sectors;
1345 }
1346 }
1347 if (rdev) {
1348 r10_bio->devs[i].bio = bio;
1349 atomic_inc(&rdev->nr_pending);
1350 }
1351 if (rrdev) {
1352 r10_bio->devs[i].repl_bio = bio;
1353 atomic_inc(&rrdev->nr_pending);
1354 }
1355 }
1356 rcu_read_unlock();
1357
1358 if (unlikely(blocked_rdev)) {
1359
1360 int j;
1361 int d;
1362
1363 for (j = 0; j < i; j++) {
1364 if (r10_bio->devs[j].bio) {
1365 d = r10_bio->devs[j].devnum;
1366 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1367 }
1368 if (r10_bio->devs[j].repl_bio) {
1369 struct md_rdev *rdev;
1370 d = r10_bio->devs[j].devnum;
1371 rdev = conf->mirrors[d].replacement;
1372 if (!rdev) {
1373
1374 smp_mb();
1375 rdev = conf->mirrors[d].rdev;
1376 }
1377 rdev_dec_pending(rdev, mddev);
1378 }
1379 }
1380 allow_barrier(conf);
1381 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1382 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1383 wait_barrier(conf);
1384 goto retry_write;
1385 }
1386
1387 if (max_sectors < r10_bio->sectors) {
1388
1389
1390
1391 r10_bio->sectors = max_sectors;
1392 spin_lock_irq(&conf->device_lock);
1393 if (bio->bi_phys_segments == 0)
1394 bio->bi_phys_segments = 2;
1395 else
1396 bio->bi_phys_segments++;
1397 spin_unlock_irq(&conf->device_lock);
1398 }
1399 sectors_handled = r10_bio->sector + max_sectors -
1400 bio->bi_iter.bi_sector;
1401
1402 atomic_set(&r10_bio->remaining, 1);
1403 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1404
1405 for (i = 0; i < conf->copies; i++) {
1406 struct bio *mbio;
1407 int d = r10_bio->devs[i].devnum;
1408 if (r10_bio->devs[i].bio) {
1409 struct md_rdev *rdev = conf->mirrors[d].rdev;
1410 mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
1411 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1412 max_sectors);
1413 r10_bio->devs[i].bio = mbio;
1414
1415 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
1416 choose_data_offset(r10_bio, rdev));
1417 mbio->bi_bdev = rdev->bdev;
1418 mbio->bi_end_io = raid10_end_write_request;
1419 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1420 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) &&
1421 enough(conf, d))
1422 mbio->bi_opf |= MD_FAILFAST;
1423 mbio->bi_private = r10_bio;
1424
1425 if (conf->mddev->gendisk)
1426 trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
1427 mbio, disk_devt(conf->mddev->gendisk),
1428 r10_bio->sector);
1429
1430 mbio->bi_bdev = (void*)rdev;
1431
1432 atomic_inc(&r10_bio->remaining);
1433
1434 cb = blk_check_plugged(raid10_unplug, mddev,
1435 sizeof(*plug));
1436 if (cb)
1437 plug = container_of(cb, struct raid10_plug_cb,
1438 cb);
1439 else
1440 plug = NULL;
1441 spin_lock_irqsave(&conf->device_lock, flags);
1442 if (plug) {
1443 bio_list_add(&plug->pending, mbio);
1444 plug->pending_cnt++;
1445 } else {
1446 bio_list_add(&conf->pending_bio_list, mbio);
1447 conf->pending_count++;
1448 }
1449 spin_unlock_irqrestore(&conf->device_lock, flags);
1450 if (!plug)
1451 md_wakeup_thread(mddev->thread);
1452 }
1453
1454 if (r10_bio->devs[i].repl_bio) {
1455 struct md_rdev *rdev = conf->mirrors[d].replacement;
1456 if (rdev == NULL) {
1457
1458 smp_mb();
1459 rdev = conf->mirrors[d].rdev;
1460 }
1461 mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
1462 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1463 max_sectors);
1464 r10_bio->devs[i].repl_bio = mbio;
1465
1466 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
1467 choose_data_offset(r10_bio, rdev));
1468 mbio->bi_bdev = rdev->bdev;
1469 mbio->bi_end_io = raid10_end_write_request;
1470 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1471 mbio->bi_private = r10_bio;
1472
1473 if (conf->mddev->gendisk)
1474 trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
1475 mbio, disk_devt(conf->mddev->gendisk),
1476 r10_bio->sector);
1477
1478 mbio->bi_bdev = (void*)rdev;
1479
1480 atomic_inc(&r10_bio->remaining);
1481
1482 cb = blk_check_plugged(raid10_unplug, mddev,
1483 sizeof(*plug));
1484 if (cb)
1485 plug = container_of(cb, struct raid10_plug_cb,
1486 cb);
1487 else
1488 plug = NULL;
1489 spin_lock_irqsave(&conf->device_lock, flags);
1490 if (plug) {
1491 bio_list_add(&plug->pending, mbio);
1492 plug->pending_cnt++;
1493 } else {
1494 bio_list_add(&conf->pending_bio_list, mbio);
1495 conf->pending_count++;
1496 }
1497 spin_unlock_irqrestore(&conf->device_lock, flags);
1498 if (!plug)
1499 md_wakeup_thread(mddev->thread);
1500 }
1501 }

	/* Don't remove the bias on 'remaining' (one_write_done) until
	 * after checking if we need to go around again.
	 */

1507 if (sectors_handled < bio_sectors(bio)) {
1508 one_write_done(r10_bio);
1509
1510
1511
1512 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1513
1514 r10_bio->master_bio = bio;
1515 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1516
1517 r10_bio->mddev = mddev;
1518 r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1519 r10_bio->state = 0;
1520 goto retry_write;
1521 }
1522 one_write_done(r10_bio);
1523}
1524
1525static void __make_request(struct mddev *mddev, struct bio *bio)
1526{
1527 struct r10conf *conf = mddev->private;
1528 struct r10bio *r10_bio;
1529
1530 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1531
1532 r10_bio->master_bio = bio;
1533 r10_bio->sectors = bio_sectors(bio);
1534
1535 r10_bio->mddev = mddev;
1536 r10_bio->sector = bio->bi_iter.bi_sector;
1537 r10_bio->state = 0;

	/*
	 * We might need to issue multiple reads to different devices if there
	 * are bad blocks around, so we keep track of the number of reads in
	 * bio->bi_phys_segments.  If this is 0, there is only one r10_bio and
	 * no locking will be needed when the state is updated or when it is
	 * to be freed.
	 */
1546 bio->bi_phys_segments = 0;
1547 bio_clear_flag(bio, BIO_SEG_VALID);
1548
1549 if (bio_data_dir(bio) == READ)
1550 raid10_read_request(mddev, bio, r10_bio);
1551 else
1552 raid10_write_request(mddev, bio, r10_bio);
1553}
1554
1555static void raid10_make_request(struct mddev *mddev, struct bio *bio)
1556{
1557 struct r10conf *conf = mddev->private;
1558 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1559 int chunk_sects = chunk_mask + 1;
1560
1561 struct bio *split;
1562
1563 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1564 md_flush_request(mddev, bio);
1565 return;
1566 }
1567
1568 do {

		/*
		 * If this request crosses a chunk boundary, we need to split
		 * it.
		 */
1574 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1575 bio_sectors(bio) > chunk_sects
1576 && (conf->geo.near_copies < conf->geo.raid_disks
1577 || conf->prev.near_copies <
1578 conf->prev.raid_disks))) {
1579 split = bio_split(bio, chunk_sects -
1580 (bio->bi_iter.bi_sector &
1581 (chunk_sects - 1)),
1582 GFP_NOIO, fs_bio_set);
1583 bio_chain(split, bio);
1584 } else {
1585 split = bio;
1586 }

		/*
		 * If a bio is split, the first part of the bio will pass the
		 * barrier but the bio is queued in current->bio_list (see
		 * generic_make_request).  If raise_barrier() is called here,
		 * the second part of the bio can't pass the barrier.  But
		 * since the first part isn't dispatched to the underlying
		 * disks yet, the barrier is never released, so raise_barrier
		 * would wait forever: a deadlock.
		 * Note, this only happens in the read path.  For the write
		 * path, the first part of the bio is dispatched in a
		 * schedule() call (because of blk plug) or offloaded to
		 * raid10d.
		 * Quitting from the function immediately can change the bio
		 * order queued in bio_list and avoid the deadlock.
		 */
1602 __make_request(mddev, split);
1603 if (split != bio && bio_data_dir(bio) == READ) {
1604 generic_make_request(bio);
1605 break;
1606 }
1607 } while (split != bio);
1608
1609
1610 wake_up(&conf->wait_barrier);
1611}
1612
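/*
 * Report array geometry and per-device state ("U" in sync, "_" otherwise)
 * for /proc/mdstat.
 */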
1613static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1614{
1615 struct r10conf *conf = mddev->private;
1616 int i;
1617
1618 if (conf->geo.near_copies < conf->geo.raid_disks)
1619 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1620 if (conf->geo.near_copies > 1)
1621 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1622 if (conf->geo.far_copies > 1) {
1623 if (conf->geo.far_offset)
1624 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1625 else
1626 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1627 if (conf->geo.far_set_size != conf->geo.raid_disks)
1628 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1629 }
1630 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1631 conf->geo.raid_disks - mddev->degraded);
1632 rcu_read_lock();
1633 for (i = 0; i < conf->geo.raid_disks; i++) {
1634 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1635 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1636 }
1637 rcu_read_unlock();
1638 seq_printf(seq, "]");
1639}

/*
 * Check if there are enough working devices for every block to appear
 * on at least one device.  Don't count the device numbered 'ignore',
 * as we might be about to remove it.
 */
1646static int _enough(struct r10conf *conf, int previous, int ignore)
1647{
1648 int first = 0;
1649 int has_enough = 0;
1650 int disks, ncopies;
1651 if (previous) {
1652 disks = conf->prev.raid_disks;
1653 ncopies = conf->prev.near_copies;
1654 } else {
1655 disks = conf->geo.raid_disks;
1656 ncopies = conf->geo.near_copies;
1657 }
1658
1659 rcu_read_lock();
1660 do {
1661 int n = conf->copies;
1662 int cnt = 0;
1663 int this = first;
1664 while (n--) {
1665 struct md_rdev *rdev;
1666 if (this != ignore &&
1667 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1668 test_bit(In_sync, &rdev->flags))
1669 cnt++;
1670 this = (this+1) % disks;
1671 }
1672 if (cnt == 0)
1673 goto out;
1674 first = (first + ncopies) % disks;
1675 } while (first != 0);
1676 has_enough = 1;
1677out:
1678 rcu_read_unlock();
1679 return has_enough;
1680}
1681
1682static int enough(struct r10conf *conf, int ignore)
1683{
	/* when calling 'enough', both 'prev' and 'geo' must
	 * be stable.
	 * This is ensured if ->reconfig_mutex or ->device_lock
	 * is held.
	 */
1689 return _enough(conf, 0, ignore) &&
1690 _enough(conf, 1, ignore);
1691}
1692
1693static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1694{
1695 char b[BDEVNAME_SIZE];
1696 struct r10conf *conf = mddev->private;
1697 unsigned long flags;

	/*
	 * If it is not operational, then we have already marked it as dead;
	 * else if it is the last working disk, ignore the error and let the
	 * next level up know;
	 * else mark the drive as failed.
	 */
1705 spin_lock_irqsave(&conf->device_lock, flags);
1706 if (test_bit(In_sync, &rdev->flags)
1707 && !enough(conf, rdev->raid_disk)) {
1708
1709
1710
1711 spin_unlock_irqrestore(&conf->device_lock, flags);
1712 return;
1713 }
1714 if (test_and_clear_bit(In_sync, &rdev->flags))
1715 mddev->degraded++;
1716
1717
1718
1719 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1720 set_bit(Blocked, &rdev->flags);
1721 set_bit(Faulty, &rdev->flags);
1722 set_mask_bits(&mddev->sb_flags, 0,
1723 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1724 spin_unlock_irqrestore(&conf->device_lock, flags);
1725 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1726 "md/raid10:%s: Operation continuing on %d devices.\n",
1727 mdname(mddev), bdevname(rdev->bdev, b),
1728 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1729}
1730
1731static void print_conf(struct r10conf *conf)
1732{
1733 int i;
1734 struct md_rdev *rdev;
1735
1736 pr_debug("RAID10 conf printout:\n");
1737 if (!conf) {
1738 pr_debug("(!conf)\n");
1739 return;
1740 }
1741 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1742 conf->geo.raid_disks);
1743
1744
1745
1746 for (i = 0; i < conf->geo.raid_disks; i++) {
1747 char b[BDEVNAME_SIZE];
1748 rdev = conf->mirrors[i].rdev;
1749 if (rdev)
1750 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1751 i, !test_bit(In_sync, &rdev->flags),
1752 !test_bit(Faulty, &rdev->flags),
1753 bdevname(rdev->bdev,b));
1754 }
1755}
1756
1757static void close_sync(struct r10conf *conf)
1758{
1759 wait_barrier(conf);
1760 allow_barrier(conf);
1761
1762 mempool_destroy(conf->r10buf_pool);
1763 conf->r10buf_pool = NULL;
1764}
1765
1766static int raid10_spare_active(struct mddev *mddev)
1767{
1768 int i;
1769 struct r10conf *conf = mddev->private;
1770 struct raid10_info *tmp;
1771 int count = 0;
1772 unsigned long flags;

	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync.
	 */
1778 for (i = 0; i < conf->geo.raid_disks; i++) {
1779 tmp = conf->mirrors + i;
1780 if (tmp->replacement
1781 && tmp->replacement->recovery_offset == MaxSector
1782 && !test_bit(Faulty, &tmp->replacement->flags)
1783 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1784
1785 if (!tmp->rdev
1786 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1787 count++;
1788 if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
1793 set_bit(Faulty, &tmp->rdev->flags);
1794 sysfs_notify_dirent_safe(
1795 tmp->rdev->sysfs_state);
1796 }
1797 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1798 } else if (tmp->rdev
1799 && tmp->rdev->recovery_offset == MaxSector
1800 && !test_bit(Faulty, &tmp->rdev->flags)
1801 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1802 count++;
1803 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1804 }
1805 }
1806 spin_lock_irqsave(&conf->device_lock, flags);
1807 mddev->degraded -= count;
1808 spin_unlock_irqrestore(&conf->device_lock, flags);
1809
1810 print_conf(conf);
1811 return count;
1812}
1813
1814static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1815{
1816 struct r10conf *conf = mddev->private;
1817 int err = -EEXIST;
1818 int mirror;
1819 int first = 0;
1820 int last = conf->geo.raid_disks - 1;
1821
1822 if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
1826 return -EBUSY;
1827 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1828 return -EINVAL;
1829
1830 if (md_integrity_add_rdev(rdev, mddev))
1831 return -ENXIO;
1832
1833 if (rdev->raid_disk >= 0)
1834 first = last = rdev->raid_disk;
1835
1836 if (rdev->saved_raid_disk >= first &&
1837 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1838 mirror = rdev->saved_raid_disk;
1839 else
1840 mirror = first;
1841 for ( ; mirror <= last ; mirror++) {
1842 struct raid10_info *p = &conf->mirrors[mirror];
1843 if (p->recovery_disabled == mddev->recovery_disabled)
1844 continue;
1845 if (p->rdev) {
1846 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1847 p->replacement != NULL)
1848 continue;
1849 clear_bit(In_sync, &rdev->flags);
1850 set_bit(Replacement, &rdev->flags);
1851 rdev->raid_disk = mirror;
1852 err = 0;
1853 if (mddev->gendisk)
1854 disk_stack_limits(mddev->gendisk, rdev->bdev,
1855 rdev->data_offset << 9);
1856 conf->fullsync = 1;
1857 rcu_assign_pointer(p->replacement, rdev);
1858 break;
1859 }
1860
1861 if (mddev->gendisk)
1862 disk_stack_limits(mddev->gendisk, rdev->bdev,
1863 rdev->data_offset << 9);
1864
1865 p->head_position = 0;
1866 p->recovery_disabled = mddev->recovery_disabled - 1;
1867 rdev->raid_disk = mirror;
1868 err = 0;
1869 if (rdev->saved_raid_disk != mirror)
1870 conf->fullsync = 1;
1871 rcu_assign_pointer(p->rdev, rdev);
1872 break;
1873 }
1874 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1875 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1876
1877 print_conf(conf);
1878 return err;
1879}
1880
1881static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1882{
1883 struct r10conf *conf = mddev->private;
1884 int err = 0;
1885 int number = rdev->raid_disk;
1886 struct md_rdev **rdevp;
1887 struct raid10_info *p = conf->mirrors + number;
1888
1889 print_conf(conf);
1890 if (rdev == p->rdev)
1891 rdevp = &p->rdev;
1892 else if (rdev == p->replacement)
1893 rdevp = &p->replacement;
1894 else
1895 return 0;
1896
1897 if (test_bit(In_sync, &rdev->flags) ||
1898 atomic_read(&rdev->nr_pending)) {
1899 err = -EBUSY;
1900 goto abort;
1901 }
1902
1903
1904
1905 if (!test_bit(Faulty, &rdev->flags) &&
1906 mddev->recovery_disabled != p->recovery_disabled &&
1907 (!p->replacement || p->replacement == rdev) &&
1908 number < conf->geo.raid_disks &&
1909 enough(conf, -1)) {
1910 err = -EBUSY;
1911 goto abort;
1912 }
1913 *rdevp = NULL;
1914 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1915 synchronize_rcu();
1916 if (atomic_read(&rdev->nr_pending)) {
1917
1918 err = -EBUSY;
1919 *rdevp = rdev;
1920 goto abort;
1921 }
1922 }
1923 if (p->replacement) {
1924
1925 p->rdev = p->replacement;
1926 clear_bit(Replacement, &p->replacement->flags);
1927 smp_mb();
1928
1929
1930 p->replacement = NULL;
1931 clear_bit(WantReplacement, &rdev->flags);
1932 } else
1933
1934
1935
1936 clear_bit(WantReplacement, &rdev->flags);
1937
1938 err = md_integrity_register(mddev);
1939
1940abort:
1941
1942 print_conf(conf);
1943 return err;
1944}
1945
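/*
 * Completion handler for resync/recovery/reshape reads.  A failed read is
 * not fatal here; the write side will notice the missing R10BIO_Uptodate.
 */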
1946static void end_sync_read(struct bio *bio)
1947{
1948 struct r10bio *r10_bio = bio->bi_private;
1949 struct r10conf *conf = r10_bio->mddev->private;
1950 int d;
1951
1952 if (bio == r10_bio->master_bio) {
1953
1954 d = r10_bio->read_slot;
1955 } else
1956 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1957
1958 if (!bio->bi_error)
1959 set_bit(R10BIO_Uptodate, &r10_bio->state);
1960 else
1961
1962
1963
1964 atomic_add(r10_bio->sectors,
1965 &conf->mirrors[d].rdev->corrected_errors);
1966
1967
1968
1969
1970 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1971 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1972 atomic_dec_and_test(&r10_bio->remaining)) {
1973
1974
1975
1976 reschedule_retry(r10_bio);
1977 }
1978}
1979
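/*
 * Drop the 'remaining' reference(s) on a sync/recovery r10_bio, walking up
 * the chain of master bios used for recovery, and either hand the r10_bio
 * to raid10d (bad-block work pending) or release the buffer.
 */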
1980static void end_sync_request(struct r10bio *r10_bio)
1981{
1982 struct mddev *mddev = r10_bio->mddev;
1983
1984 while (atomic_dec_and_test(&r10_bio->remaining)) {
1985 if (r10_bio->master_bio == NULL) {
1986
1987 sector_t s = r10_bio->sectors;
1988 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1989 test_bit(R10BIO_WriteError, &r10_bio->state))
1990 reschedule_retry(r10_bio);
1991 else
1992 put_buf(r10_bio);
1993 md_done_sync(mddev, s, 1);
1994 break;
1995 } else {
1996 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1997 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1998 test_bit(R10BIO_WriteError, &r10_bio->state))
1999 reschedule_retry(r10_bio);
2000 else
2001 put_buf(r10_bio);
2002 r10_bio = r10_bio2;
2003 }
2004 }
2005}
2006
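/*
 * Completion handler for resync/recovery writes: record write errors (or
 * newly discovered bad blocks) on the target rdev before finishing the
 * request.
 */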
2007static void end_sync_write(struct bio *bio)
2008{
2009 struct r10bio *r10_bio = bio->bi_private;
2010 struct mddev *mddev = r10_bio->mddev;
2011 struct r10conf *conf = mddev->private;
2012 int d;
2013 sector_t first_bad;
2014 int bad_sectors;
2015 int slot;
2016 int repl;
2017 struct md_rdev *rdev = NULL;
2018
2019 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
2020 if (repl)
2021 rdev = conf->mirrors[d].replacement;
2022 else
2023 rdev = conf->mirrors[d].rdev;
2024
2025 if (bio->bi_error) {
2026 if (repl)
2027 md_error(mddev, rdev);
2028 else {
2029 set_bit(WriteErrorSeen, &rdev->flags);
2030 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2031 set_bit(MD_RECOVERY_NEEDED,
2032 &rdev->mddev->recovery);
2033 set_bit(R10BIO_WriteError, &r10_bio->state);
2034 }
2035 } else if (is_badblock(rdev,
2036 r10_bio->devs[slot].addr,
2037 r10_bio->sectors,
2038 &first_bad, &bad_sectors))
2039 set_bit(R10BIO_MadeGood, &r10_bio->state);
2040
2041 rdev_dec_pending(rdev, mddev);
2042
2043 end_sync_request(r10_bio);
2044}

/*
 * Note: sync and recover are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical addresses, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */
/*
 * We check if all blocks are in-sync and only write to blocks that
 * aren't in sync.
 */
2062static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2063{
2064 struct r10conf *conf = mddev->private;
2065 int i, first;
2066 struct bio *tbio, *fbio;
2067 int vcnt;
2068
2069 atomic_set(&r10_bio->remaining, 1);
2070
2071
2072 for (i=0; i<conf->copies; i++)
2073 if (!r10_bio->devs[i].bio->bi_error)
2074 break;
2075
2076 if (i == conf->copies)
2077 goto done;
2078
2079 first = i;
2080 fbio = r10_bio->devs[i].bio;
2081 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2082 fbio->bi_iter.bi_idx = 0;
2083
2084 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2085
2086 for (i=0 ; i < conf->copies ; i++) {
2087 int j, d;
2088 struct md_rdev *rdev;
2089
2090 tbio = r10_bio->devs[i].bio;
2091
2092 if (tbio->bi_end_io != end_sync_read)
2093 continue;
2094 if (i == first)
2095 continue;
2096 d = r10_bio->devs[i].devnum;
2097 rdev = conf->mirrors[d].rdev;
2098 if (!r10_bio->devs[i].bio->bi_error) {
2099
2100
2101
2102
2103 int sectors = r10_bio->sectors;
2104 for (j = 0; j < vcnt; j++) {
2105 int len = PAGE_SIZE;
2106 if (sectors < (len / 512))
2107 len = sectors * 512;
2108 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2109 page_address(tbio->bi_io_vec[j].bv_page),
2110 len))
2111 break;
2112 sectors -= len/512;
2113 }
2114 if (j == vcnt)
2115 continue;
2116 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2117 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2118
2119 continue;
2120 } else if (test_bit(FailFast, &rdev->flags)) {
2121
2122 md_error(rdev->mddev, rdev);
2123 continue;
2124 }
2125
2126
2127
2128
2129
2130 bio_reset(tbio);
2131
2132 tbio->bi_vcnt = vcnt;
2133 tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
2134 tbio->bi_private = r10_bio;
2135 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2136 tbio->bi_end_io = end_sync_write;
2137 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2138
2139 bio_copy_data(tbio, fbio);
2140
2141 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2142 atomic_inc(&r10_bio->remaining);
2143 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2144
2145 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2146 tbio->bi_opf |= MD_FAILFAST;
2147 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2148 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2149 generic_make_request(tbio);
2150 }

	/*
	 * Now write out to any replacement devices
	 * that are active.
	 */
2155 for (i = 0; i < conf->copies; i++) {
2156 int d;
2157
2158 tbio = r10_bio->devs[i].repl_bio;
2159 if (!tbio || !tbio->bi_end_io)
2160 continue;
2161 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2162 && r10_bio->devs[i].bio != fbio)
2163 bio_copy_data(tbio, fbio);
2164 d = r10_bio->devs[i].devnum;
2165 atomic_inc(&r10_bio->remaining);
2166 md_sync_acct(conf->mirrors[d].replacement->bdev,
2167 bio_sectors(tbio));
2168 generic_make_request(tbio);
2169 }
2170
2171done:
2172 if (atomic_dec_and_test(&r10_bio->remaining)) {
2173 md_done_sync(mddev, r10_bio->sectors, 1);
2174 put_buf(r10_bio);
2175 }
2176}

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use: the first for reading,
 * the second for writing.
 */
2188static void fix_recovery_read_error(struct r10bio *r10_bio)
2189{
	/*
	 * We got a read error during recovery.
	 * We repeat the read in smaller page-sized chunks.
	 * If the read succeeds, write it to the new device or record
	 * a bad block if we cannot.
	 * If the read fails, record a bad block on both old and
	 * new devices.
	 */
2197 struct mddev *mddev = r10_bio->mddev;
2198 struct r10conf *conf = mddev->private;
2199 struct bio *bio = r10_bio->devs[0].bio;
2200 sector_t sect = 0;
2201 int sectors = r10_bio->sectors;
2202 int idx = 0;
2203 int dr = r10_bio->devs[0].devnum;
2204 int dw = r10_bio->devs[1].devnum;
2205
2206 while (sectors) {
2207 int s = sectors;
2208 struct md_rdev *rdev;
2209 sector_t addr;
2210 int ok;
2211
2212 if (s > (PAGE_SIZE>>9))
2213 s = PAGE_SIZE >> 9;
2214
2215 rdev = conf->mirrors[dr].rdev;
2216 addr = r10_bio->devs[0].addr + sect,
2217 ok = sync_page_io(rdev,
2218 addr,
2219 s << 9,
2220 bio->bi_io_vec[idx].bv_page,
2221 REQ_OP_READ, 0, false);
2222 if (ok) {
2223 rdev = conf->mirrors[dw].rdev;
2224 addr = r10_bio->devs[1].addr + sect;
2225 ok = sync_page_io(rdev,
2226 addr,
2227 s << 9,
2228 bio->bi_io_vec[idx].bv_page,
2229 REQ_OP_WRITE, 0, false);
2230 if (!ok) {
2231 set_bit(WriteErrorSeen, &rdev->flags);
2232 if (!test_and_set_bit(WantReplacement,
2233 &rdev->flags))
2234 set_bit(MD_RECOVERY_NEEDED,
2235 &rdev->mddev->recovery);
2236 }
2237 }
2238 if (!ok) {
2239
2240
2241
2242
2243 rdev_set_badblocks(rdev, addr, s, 0);
2244
2245 if (rdev != conf->mirrors[dw].rdev) {
2246
2247 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2248 addr = r10_bio->devs[1].addr + sect;
2249 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2250 if (!ok) {
2251
2252 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2253 mdname(mddev));
2254
2255 conf->mirrors[dw].recovery_disabled
2256 = mddev->recovery_disabled;
2257 set_bit(MD_RECOVERY_INTR,
2258 &mddev->recovery);
2259 break;
2260 }
2261 }
2262 }
2263
2264 sectors -= s;
2265 sect += s;
2266 idx++;
2267 }
2268}
2269
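/*
 * Called by raid10d for a recovery r10_bio whose read has completed: submit
 * the corresponding write(s) to the device being recovered and, if present,
 * to its replacement.
 */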
2270static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2271{
2272 struct r10conf *conf = mddev->private;
2273 int d;
2274 struct bio *wbio, *wbio2;
2275
2276 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2277 fix_recovery_read_error(r10_bio);
2278 end_sync_request(r10_bio);
2279 return;
2280 }
2281
2282
2283
2284
2285
2286 d = r10_bio->devs[1].devnum;
2287 wbio = r10_bio->devs[1].bio;
2288 wbio2 = r10_bio->devs[1].repl_bio;
2289
2290
2291
2292
2293 if (wbio2 && !wbio2->bi_end_io)
2294 wbio2 = NULL;
2295 if (wbio->bi_end_io) {
2296 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2297 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2298 generic_make_request(wbio);
2299 }
2300 if (wbio2) {
2301 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2302 md_sync_acct(conf->mirrors[d].replacement->bdev,
2303 bio_sectors(wbio2));
2304 generic_make_request(wbio2);
2305 }
2306}

/*
 * Used by fix_read_error() to decay the per-rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
2314static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2315{
2316 long cur_time_mon;
2317 unsigned long hours_since_last;
2318 unsigned int read_errors = atomic_read(&rdev->read_errors);
2319
2320 cur_time_mon = ktime_get_seconds();
2321
2322 if (rdev->last_read_error == 0) {
2323
2324 rdev->last_read_error = cur_time_mon;
2325 return;
2326 }
2327
2328 hours_since_last = (long)(cur_time_mon -
2329 rdev->last_read_error) / 3600;
2330
2331 rdev->last_read_error = cur_time_mon;
2332
2333
2334
2335
2336
2337
2338 if (hours_since_last >= 8 * sizeof(read_errors))
2339 atomic_set(&rdev->read_errors, 0);
2340 else
2341 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2342}
2343
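/*
 * Synchronously read or write up to one page.  Returns 1 on success, 0 on
 * I/O failure (after recording a bad block or failing the device), and -1
 * if the range is already known bad and should not be attempted.
 */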
2344static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2345 int sectors, struct page *page, int rw)
2346{
2347 sector_t first_bad;
2348 int bad_sectors;
2349
2350 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2351 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2352 return -1;
2353 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2354
2355 return 1;
2356 if (rw == WRITE) {
2357 set_bit(WriteErrorSeen, &rdev->flags);
2358 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2359 set_bit(MD_RECOVERY_NEEDED,
2360 &rdev->mddev->recovery);
2361 }
2362
2363 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2364 md_error(rdev->mddev, rdev);
2365 return 0;
2366}

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
2376static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2377{
2378 int sect = 0;
2379 int sectors = r10_bio->sectors;
2380 struct md_rdev*rdev;
2381 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2382 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2383
2384
2385
2386
2387 rdev = conf->mirrors[d].rdev;
2388
2389 if (test_bit(Faulty, &rdev->flags))
2390
2391
2392 return;
2393
2394 check_decay_read_errors(mddev, rdev);
2395 atomic_inc(&rdev->read_errors);
2396 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2397 char b[BDEVNAME_SIZE];
2398 bdevname(rdev->bdev, b);
2399
2400 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2401 mdname(mddev), b,
2402 atomic_read(&rdev->read_errors), max_read_errors);
2403 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2404 mdname(mddev), b);
2405 md_error(mddev, rdev);
2406 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2407 return;
2408 }
2409
2410 while(sectors) {
2411 int s = sectors;
2412 int sl = r10_bio->read_slot;
2413 int success = 0;
2414 int start;
2415
2416 if (s > (PAGE_SIZE>>9))
2417 s = PAGE_SIZE >> 9;
2418
2419 rcu_read_lock();
2420 do {
2421 sector_t first_bad;
2422 int bad_sectors;
2423
2424 d = r10_bio->devs[sl].devnum;
2425 rdev = rcu_dereference(conf->mirrors[d].rdev);
2426 if (rdev &&
2427 test_bit(In_sync, &rdev->flags) &&
2428 !test_bit(Faulty, &rdev->flags) &&
2429 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2430 &first_bad, &bad_sectors) == 0) {
2431 atomic_inc(&rdev->nr_pending);
2432 rcu_read_unlock();
2433 success = sync_page_io(rdev,
2434 r10_bio->devs[sl].addr +
2435 sect,
2436 s<<9,
2437 conf->tmppage,
2438 REQ_OP_READ, 0, false);
2439 rdev_dec_pending(rdev, mddev);
2440 rcu_read_lock();
2441 if (success)
2442 break;
2443 }
2444 sl++;
2445 if (sl == conf->copies)
2446 sl = 0;
2447 } while (!success && sl != r10_bio->read_slot);
2448 rcu_read_unlock();
2449
2450 if (!success) {
2451
2452
2453
2454
2455 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2456 rdev = conf->mirrors[dn].rdev;
2457
2458 if (!rdev_set_badblocks(
2459 rdev,
2460 r10_bio->devs[r10_bio->read_slot].addr
2461 + sect,
2462 s, 0)) {
2463 md_error(mddev, rdev);
2464 r10_bio->devs[r10_bio->read_slot].bio
2465 = IO_BLOCKED;
2466 }
2467 break;
2468 }
2469
2470 start = sl;
2471
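 /* write the corrected data back to the other copies,
 * then re-read to verify the correction
 */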
2472 rcu_read_lock();
2473 while (sl != r10_bio->read_slot) {
2474 char b[BDEVNAME_SIZE];
2475
2476 if (sl==0)
2477 sl = conf->copies;
2478 sl--;
2479 d = r10_bio->devs[sl].devnum;
2480 rdev = rcu_dereference(conf->mirrors[d].rdev);
2481 if (!rdev ||
2482 test_bit(Faulty, &rdev->flags) ||
2483 !test_bit(In_sync, &rdev->flags))
2484 continue;
2485
2486 atomic_inc(&rdev->nr_pending);
2487 rcu_read_unlock();
2488 if (r10_sync_page_io(rdev,
2489 r10_bio->devs[sl].addr +
2490 sect,
2491 s, conf->tmppage, WRITE)
2492 == 0) {
2493
2494 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2495 mdname(mddev), s,
2496 (unsigned long long)(
2497 sect +
2498 choose_data_offset(r10_bio,
2499 rdev)),
2500 bdevname(rdev->bdev, b));
2501 pr_notice("md/raid10:%s: %s: failing drive\n",
2502 mdname(mddev),
2503 bdevname(rdev->bdev, b));
2504 }
2505 rdev_dec_pending(rdev, mddev);
2506 rcu_read_lock();
2507 }
2508 sl = start;
2509 while (sl != r10_bio->read_slot) {
2510 char b[BDEVNAME_SIZE];
2511
2512 if (sl==0)
2513 sl = conf->copies;
2514 sl--;
2515 d = r10_bio->devs[sl].devnum;
2516 rdev = rcu_dereference(conf->mirrors[d].rdev);
2517 if (!rdev ||
2518 test_bit(Faulty, &rdev->flags) ||
2519 !test_bit(In_sync, &rdev->flags))
2520 continue;
2521
2522 atomic_inc(&rdev->nr_pending);
2523 rcu_read_unlock();
2524 switch (r10_sync_page_io(rdev,
2525 r10_bio->devs[sl].addr +
2526 sect,
2527 s, conf->tmppage,
2528 READ)) {
2529 case 0:
2530
2531 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2532 mdname(mddev), s,
2533 (unsigned long long)(
2534 sect +
2535 choose_data_offset(r10_bio, rdev)),
2536 bdevname(rdev->bdev, b));
2537 pr_notice("md/raid10:%s: %s: failing drive\n",
2538 mdname(mddev),
2539 bdevname(rdev->bdev, b));
2540 break;
2541 case 1:
2542 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2543 mdname(mddev), s,
2544 (unsigned long long)(
2545 sect +
2546 choose_data_offset(r10_bio, rdev)),
2547 bdevname(rdev->bdev, b));
2548 atomic_add(s, &rdev->corrected_errors);
2549 }
2550
2551 rdev_dec_pending(rdev, mddev);
2552 rcu_read_lock();
2553 }
2554 rcu_read_unlock();
2555
2556 sectors -= s;
2557 sect += s;
2558 }
2559}
2560
2561static int narrow_write_error(struct r10bio *r10_bio, int i)
2562{
2563 struct bio *bio = r10_bio->master_bio;
2564 struct mddev *mddev = r10_bio->mddev;
2565 struct r10conf *conf = mddev->private;
2566 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
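 /*
 * The master bio holds the data for a write that failed on slot 'i'.
 * Retry it in badblock-sized pieces so that only the blocks which
 * really fail get recorded; any piece that cannot be written is
 * added to the bad block list instead of failing the whole device.
 */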
2578 int block_sectors;
2579 sector_t sector;
2580 int sectors;
2581 int sect_to_write = r10_bio->sectors;
2582 int ok = 1;
2583
2584 if (rdev->badblocks.shift < 0)
2585 return 0;
2586
2587 block_sectors = roundup(1 << rdev->badblocks.shift,
2588 bdev_logical_block_size(rdev->bdev) >> 9);
2589 sector = r10_bio->sector;
2590 sectors = ((r10_bio->sector + block_sectors)
2591 & ~(sector_t)(block_sectors - 1))
2592 - sector;
2593
2594 while (sect_to_write) {
2595 struct bio *wbio;
2596 sector_t wsector;
2597 if (sectors > sect_to_write)
2598 sectors = sect_to_write;
2599
2600 wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
2601 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2602 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2603 wbio->bi_iter.bi_sector = wsector +
2604 choose_data_offset(r10_bio, rdev);
2605 wbio->bi_bdev = rdev->bdev;
2606 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2607
2608 if (submit_bio_wait(wbio) < 0)
2609
2610 ok = rdev_set_badblocks(rdev, wsector,
2611 sectors, 0)
2612 && ok;
2613
2614 bio_put(wbio);
2615 sect_to_write -= sectors;
2616 sector += sectors;
2617 sectors = block_sectors;
2618 }
2619 return ok;
2620}
2621
2622static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2623{
2624 int slot = r10_bio->read_slot;
2625 struct bio *bio;
2626 struct r10conf *conf = mddev->private;
2627 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2628 char b[BDEVNAME_SIZE];
2629 unsigned long do_sync;
2630 int max_sectors;
2631 dev_t bio_dev;
2632 sector_t bio_last_sector;
2633
2634
2635
2636
2637
2638
2639
2640
2641
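 /* We got a read error.  Maybe the drive is bad, or maybe just
 * this block: freeze normal IO, try to fix the block from the
 * other copies, then redirect the request to a different mirror.
 */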
2642 bio = r10_bio->devs[slot].bio;
2643 bdevname(bio->bi_bdev, b);
2644 bio_dev = bio->bi_bdev->bd_dev;
2645 bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
2646 bio_put(bio);
2647 r10_bio->devs[slot].bio = NULL;
2648
2649 if (mddev->ro)
2650 r10_bio->devs[slot].bio = IO_BLOCKED;
2651 else if (!test_bit(FailFast, &rdev->flags)) {
2652 freeze_array(conf, 1);
2653 fix_read_error(conf, mddev, r10_bio);
2654 unfreeze_array(conf);
2655 } else
2656 md_error(mddev, rdev);
2657
2658 rdev_dec_pending(rdev, mddev);
2659
2660read_more:
2661 rdev = read_balance(conf, r10_bio, &max_sectors);
2662 if (rdev == NULL) {
2663 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
2664 mdname(mddev), b,
2665 (unsigned long long)r10_bio->sector);
2666 raid_end_bio_io(r10_bio);
2667 return;
2668 }
2669
2670 do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
2671 slot = r10_bio->read_slot;
2672 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
2673 mdname(mddev),
2674 bdevname(rdev->bdev, b),
2675 (unsigned long long)r10_bio->sector);
2676 bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set);
2677 bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
2678 r10_bio->devs[slot].bio = bio;
2679 r10_bio->devs[slot].rdev = rdev;
2680 bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
2681 + choose_data_offset(r10_bio, rdev);
2682 bio->bi_bdev = rdev->bdev;
2683 bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
2684 if (test_bit(FailFast, &rdev->flags) &&
2685 test_bit(R10BIO_FailFast, &r10_bio->state))
2686 bio->bi_opf |= MD_FAILFAST;
2687 bio->bi_private = r10_bio;
2688 bio->bi_end_io = raid10_end_read_request;
2689 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
2690 bio, bio_dev,
2691 bio_last_sector - r10_bio->sectors);
2692
2693 if (max_sectors < r10_bio->sectors) {
2694
2695 struct bio *mbio = r10_bio->master_bio;
2696 int sectors_handled =
2697 r10_bio->sector + max_sectors
2698 - mbio->bi_iter.bi_sector;
2699 r10_bio->sectors = max_sectors;
2700 spin_lock_irq(&conf->device_lock);
2701 if (mbio->bi_phys_segments == 0)
2702 mbio->bi_phys_segments = 2;
2703 else
2704 mbio->bi_phys_segments++;
2705 spin_unlock_irq(&conf->device_lock);
2706 generic_make_request(bio);
2707
2708 r10_bio = mempool_alloc(conf->r10bio_pool,
2709 GFP_NOIO);
2710 r10_bio->master_bio = mbio;
2711 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2712 r10_bio->state = 0;
2713 set_bit(R10BIO_ReadError,
2714 &r10_bio->state);
2715 r10_bio->mddev = mddev;
2716 r10_bio->sector = mbio->bi_iter.bi_sector
2717 + sectors_handled;
2718
2719 goto read_more;
2720 } else
2721 generic_make_request(bio);
2722}
2723
2724static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2725{
2726
2727
2728
2729
2730
2731
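 /* Some sort of write request has finished and it
 * succeeded in writing where we thought there was a
 * bad block.  So forget the bad block.
 * Or possibly it failed and we need to record
 * a bad block instead.
 */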
2732 int m;
2733 struct md_rdev *rdev;
2734
2735 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2736 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2737 for (m = 0; m < conf->copies; m++) {
2738 int dev = r10_bio->devs[m].devnum;
2739 rdev = conf->mirrors[dev].rdev;
2740 if (r10_bio->devs[m].bio == NULL)
2741 continue;
2742 if (!r10_bio->devs[m].bio->bi_error) {
2743 rdev_clear_badblocks(
2744 rdev,
2745 r10_bio->devs[m].addr,
2746 r10_bio->sectors, 0);
2747 } else {
2748 if (!rdev_set_badblocks(
2749 rdev,
2750 r10_bio->devs[m].addr,
2751 r10_bio->sectors, 0))
2752 md_error(conf->mddev, rdev);
2753 }
2754 rdev = conf->mirrors[dev].replacement;
2755 if (r10_bio->devs[m].repl_bio == NULL)
2756 continue;
2757
2758 if (!r10_bio->devs[m].repl_bio->bi_error) {
2759 rdev_clear_badblocks(
2760 rdev,
2761 r10_bio->devs[m].addr,
2762 r10_bio->sectors, 0);
2763 } else {
2764 if (!rdev_set_badblocks(
2765 rdev,
2766 r10_bio->devs[m].addr,
2767 r10_bio->sectors, 0))
2768 md_error(conf->mddev, rdev);
2769 }
2770 }
2771 put_buf(r10_bio);
2772 } else {
2773 bool fail = false;
2774 for (m = 0; m < conf->copies; m++) {
2775 int dev = r10_bio->devs[m].devnum;
2776 struct bio *bio = r10_bio->devs[m].bio;
2777 rdev = conf->mirrors[dev].rdev;
2778 if (bio == IO_MADE_GOOD) {
2779 rdev_clear_badblocks(
2780 rdev,
2781 r10_bio->devs[m].addr,
2782 r10_bio->sectors, 0);
2783 rdev_dec_pending(rdev, conf->mddev);
2784 } else if (bio != NULL && bio->bi_error) {
2785 fail = true;
2786 if (!narrow_write_error(r10_bio, m)) {
2787 md_error(conf->mddev, rdev);
2788 set_bit(R10BIO_Degraded,
2789 &r10_bio->state);
2790 }
2791 rdev_dec_pending(rdev, conf->mddev);
2792 }
2793 bio = r10_bio->devs[m].repl_bio;
2794 rdev = conf->mirrors[dev].replacement;
2795 if (rdev && bio == IO_MADE_GOOD) {
2796 rdev_clear_badblocks(
2797 rdev,
2798 r10_bio->devs[m].addr,
2799 r10_bio->sectors, 0);
2800 rdev_dec_pending(rdev, conf->mddev);
2801 }
2802 }
2803 if (fail) {
2804 spin_lock_irq(&conf->device_lock);
2805 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2806 conf->nr_queued++;
2807 spin_unlock_irq(&conf->device_lock);
2808 md_wakeup_thread(conf->mddev->thread);
2809 } else {
2810 if (test_bit(R10BIO_WriteError,
2811 &r10_bio->state))
2812 close_write(r10_bio);
2813 raid_end_bio_io(r10_bio);
2814 }
2815 }
2816}
2817
2818static void raid10d(struct md_thread *thread)
2819{
2820 struct mddev *mddev = thread->mddev;
2821 struct r10bio *r10_bio;
2822 unsigned long flags;
2823 struct r10conf *conf = mddev->private;
2824 struct list_head *head = &conf->retry_list;
2825 struct blk_plug plug;
2826
2827 md_check_recovery(mddev);
2828
2829 if (!list_empty_careful(&conf->bio_end_io_list) &&
2830 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2831 LIST_HEAD(tmp);
2832 spin_lock_irqsave(&conf->device_lock, flags);
2833 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2834 while (!list_empty(&conf->bio_end_io_list)) {
2835 list_move(conf->bio_end_io_list.prev, &tmp);
2836 conf->nr_queued--;
2837 }
2838 }
2839 spin_unlock_irqrestore(&conf->device_lock, flags);
2840 while (!list_empty(&tmp)) {
2841 r10_bio = list_first_entry(&tmp, struct r10bio,
2842 retry_list);
2843 list_del(&r10_bio->retry_list);
2844 if (mddev->degraded)
2845 set_bit(R10BIO_Degraded, &r10_bio->state);
2846
2847 if (test_bit(R10BIO_WriteError,
2848 &r10_bio->state))
2849 close_write(r10_bio);
2850 raid_end_bio_io(r10_bio);
2851 }
2852 }
2853
2854 blk_start_plug(&plug);
2855 for (;;) {
2856
2857 flush_pending_writes(conf);
2858
2859 spin_lock_irqsave(&conf->device_lock, flags);
2860 if (list_empty(head)) {
2861 spin_unlock_irqrestore(&conf->device_lock, flags);
2862 break;
2863 }
2864 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2865 list_del(head->prev);
2866 conf->nr_queued--;
2867 spin_unlock_irqrestore(&conf->device_lock, flags);
2868
2869 mddev = r10_bio->mddev;
2870 conf = mddev->private;
2871 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2872 test_bit(R10BIO_WriteError, &r10_bio->state))
2873 handle_write_completed(conf, r10_bio);
2874 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2875 reshape_request_write(mddev, r10_bio);
2876 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2877 sync_request_write(mddev, r10_bio);
2878 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2879 recovery_request_write(mddev, r10_bio);
2880 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2881 handle_read_error(mddev, r10_bio);
2882 else {
2883
2884
2885
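 /* just a partial read to be scheduled from a
 * separate context
 */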
2886 int slot = r10_bio->read_slot;
2887 generic_make_request(r10_bio->devs[slot].bio);
2888 }
2889
2890 cond_resched();
2891 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2892 md_check_recovery(mddev);
2893 }
2894 blk_finish_plug(&plug);
2895}
2896
2897static int init_resync(struct r10conf *conf)
2898{
2899 int buffs;
2900 int i;
2901
2902 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2903 BUG_ON(conf->r10buf_pool);
2904 conf->have_replacement = 0;
2905 for (i = 0; i < conf->geo.raid_disks; i++)
2906 if (conf->mirrors[i].replacement)
2907 conf->have_replacement = 1;
2908 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2909 if (!conf->r10buf_pool)
2910 return -ENOMEM;
2911 conf->next_resync = 0;
2912 return 0;
2913}
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
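/*
 * Perform a "sync" on one "block".
 *
 * Resync and recovery are handled very differently; they are
 * distinguished by MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync we walk virtual addresses, read every copy and write
 * back where they differ.  For recovery we walk physical addresses,
 * read a good copy for each out-of-sync device and over-write it.
 * During recovery several r10_bio structures may be in flight for one
 * virtual address; they are linked through a borrowed master_bio
 * pointer and counted in ->remaining, and the operation completes
 * when that chain drains.
 */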
2947static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2948 int *skipped)
2949{
2950 struct r10conf *conf = mddev->private;
2951 struct r10bio *r10_bio;
2952 struct bio *biolist = NULL, *bio;
2953 sector_t max_sector, nr_sectors;
2954 int i;
2955 int max_sync;
2956 sector_t sync_blocks;
2957 sector_t sectors_skipped = 0;
2958 int chunks_skipped = 0;
2959 sector_t chunk_mask = conf->geo.chunk_mask;
2960
2961 if (!conf->r10buf_pool)
2962 if (init_resync(conf))
2963 return 0;
2964
2965
2966
2967
2968
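 /* Allow skipping a full rebuild for incremental assembly
 * of a clean array, like RAID1 does.
 */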
2969 if (mddev->bitmap == NULL &&
2970 mddev->recovery_cp == MaxSector &&
2971 mddev->reshape_position == MaxSector &&
2972 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2973 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2974 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2975 conf->fullsync == 0) {
2976 *skipped = 1;
2977 return mddev->dev_sectors - sector_nr;
2978 }
2979
2980 skipped:
2981 max_sector = mddev->dev_sectors;
2982 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2983 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2984 max_sector = mddev->resync_max_sectors;
2985 if (sector_nr >= max_sector) {
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2996 end_reshape(conf);
2997 close_sync(conf);
2998 return 0;
2999 }
3000
3001 if (mddev->curr_resync < max_sector) {
3002 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3003 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
3004 &sync_blocks, 1);
3005 else for (i = 0; i < conf->geo.raid_disks; i++) {
3006 sector_t sect =
3007 raid10_find_virt(conf, mddev->curr_resync, i);
3008 bitmap_end_sync(mddev->bitmap, sect,
3009 &sync_blocks, 1);
3010 }
3011 } else {
3012
3013 if ((!mddev->bitmap || conf->fullsync)
3014 && conf->have_replacement
3015 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3016
3017
3018
3019 rcu_read_lock();
3020 for (i = 0; i < conf->geo.raid_disks; i++) {
3021 struct md_rdev *rdev =
3022 rcu_dereference(conf->mirrors[i].replacement);
3023 if (rdev)
3024 rdev->recovery_offset = MaxSector;
3025 }
3026 rcu_read_unlock();
3027 }
3028 conf->fullsync = 0;
3029 }
3030 bitmap_close_sync(mddev->bitmap);
3031 close_sync(conf);
3032 *skipped = 1;
3033 return sectors_skipped;
3034 }
3035
3036 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3037 return reshape_request(mddev, sector_nr, skipped);
3038
3039 if (chunks_skipped >= conf->geo.raid_disks) {
3040
3041
3042
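 /* if there has been nothing to do on any drive,
 * then there is nothing to do at all..
 */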
3043 *skipped = 1;
3044 return (max_sector - sector_nr) + sectors_skipped;
3045 }
3046
3047 if (max_sector > mddev->resync_max)
3048 max_sector = mddev->resync_max;
3049
3050
3051
3052
3053 if (conf->geo.near_copies < conf->geo.raid_disks &&
3054 max_sector > (sector_nr | chunk_mask))
3055 max_sector = (sector_nr | chunk_mask) + 1;
3056
3057
3058
3059
3060
3061 if (conf->nr_waiting)
3062 schedule_timeout_uninterruptible(1);
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
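 /* Again, very different code for resync and recovery.
 * Both must result in an r10bio with a list of bios that
 * have bi_end_io, bi_sector, bi_bdev set,
 * and bi_private set to the r10bio.
 * For recovery, we may actually create several r10bios
 * with 2 bios in each, that correspond to the bios in the main one.
 * In this case, the subordinate r10bios link back through a
 * borrowed master_bio pointer, and the counter in the master
 * includes a ref from each subordinate.
 */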
3079 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3080 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3081
3082 int j;
3083 r10_bio = NULL;
3084
3085 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3086 int still_degraded;
3087 struct r10bio *rb2;
3088 sector_t sect;
3089 int must_sync;
3090 int any_working;
3091 struct raid10_info *mirror = &conf->mirrors[i];
3092 struct md_rdev *mrdev, *mreplace;
3093
3094 rcu_read_lock();
3095 mrdev = rcu_dereference(mirror->rdev);
3096 mreplace = rcu_dereference(mirror->replacement);
3097
3098 if ((mrdev == NULL ||
3099 test_bit(Faulty, &mrdev->flags) ||
3100 test_bit(In_sync, &mrdev->flags)) &&
3101 (mreplace == NULL ||
3102 test_bit(Faulty, &mreplace->flags))) {
3103 rcu_read_unlock();
3104 continue;
3105 }
3106
3107 still_degraded = 0;
3108
3109 rb2 = r10_bio;
3110 sect = raid10_find_virt(conf, sector_nr, i);
3111 if (sect >= mddev->resync_max_sectors) {
3112
3113
3114
3115 rcu_read_unlock();
3116 continue;
3117 }
3118 if (mreplace && test_bit(Faulty, &mreplace->flags))
3119 mreplace = NULL;
3120
3121
3122
3123
3124 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3125 &sync_blocks, 1);
3126 if (sync_blocks < max_sync)
3127 max_sync = sync_blocks;
3128 if (!must_sync &&
3129 mreplace == NULL &&
3130 !conf->fullsync) {
3131
3132
3133
3134 chunks_skipped = -1;
3135 rcu_read_unlock();
3136 continue;
3137 }
3138 atomic_inc(&mrdev->nr_pending);
3139 if (mreplace)
3140 atomic_inc(&mreplace->nr_pending);
3141 rcu_read_unlock();
3142
3143 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3144 r10_bio->state = 0;
3145 raise_barrier(conf, rb2 != NULL);
3146 atomic_set(&r10_bio->remaining, 0);
3147
3148 r10_bio->master_bio = (struct bio*)rb2;
3149 if (rb2)
3150 atomic_inc(&rb2->remaining);
3151 r10_bio->mddev = mddev;
3152 set_bit(R10BIO_IsRecover, &r10_bio->state);
3153 r10_bio->sector = sect;
3154
3155 raid10_find_phys(conf, r10_bio);
3156
3157
3158
3159
3160 rcu_read_lock();
3161 for (j = 0; j < conf->geo.raid_disks; j++) {
3162 struct md_rdev *rdev = rcu_dereference(
3163 conf->mirrors[j].rdev);
3164 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3165 still_degraded = 1;
3166 break;
3167 }
3168 }
3169
3170 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3171 &sync_blocks, still_degraded);
3172
3173 any_working = 0;
3174 for (j=0; j<conf->copies;j++) {
3175 int k;
3176 int d = r10_bio->devs[j].devnum;
3177 sector_t from_addr, to_addr;
3178 struct md_rdev *rdev =
3179 rcu_dereference(conf->mirrors[d].rdev);
3180 sector_t sector, first_bad;
3181 int bad_sectors;
3182 if (!rdev ||
3183 !test_bit(In_sync, &rdev->flags))
3184 continue;
3185
3186 any_working = 1;
3187 sector = r10_bio->devs[j].addr;
3188
3189 if (is_badblock(rdev, sector, max_sync,
3190 &first_bad, &bad_sectors)) {
3191 if (first_bad > sector)
3192 max_sync = first_bad - sector;
3193 else {
3194 bad_sectors -= (sector
3195 - first_bad);
3196 if (max_sync > bad_sectors)
3197 max_sync = bad_sectors;
3198 continue;
3199 }
3200 }
3201 bio = r10_bio->devs[0].bio;
3202 bio_reset(bio);
3203 bio->bi_next = biolist;
3204 biolist = bio;
3205 bio->bi_private = r10_bio;
3206 bio->bi_end_io = end_sync_read;
3207 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3208 if (test_bit(FailFast, &rdev->flags))
3209 bio->bi_opf |= MD_FAILFAST;
3210 from_addr = r10_bio->devs[j].addr;
3211 bio->bi_iter.bi_sector = from_addr +
3212 rdev->data_offset;
3213 bio->bi_bdev = rdev->bdev;
3214 atomic_inc(&rdev->nr_pending);
3215
3216
3217 for (k=0; k<conf->copies; k++)
3218 if (r10_bio->devs[k].devnum == i)
3219 break;
3220 BUG_ON(k == conf->copies);
3221 to_addr = r10_bio->devs[k].addr;
3222 r10_bio->devs[0].devnum = d;
3223 r10_bio->devs[0].addr = from_addr;
3224 r10_bio->devs[1].devnum = i;
3225 r10_bio->devs[1].addr = to_addr;
3226
3227 if (!test_bit(In_sync, &mrdev->flags)) {
3228 bio = r10_bio->devs[1].bio;
3229 bio_reset(bio);
3230 bio->bi_next = biolist;
3231 biolist = bio;
3232 bio->bi_private = r10_bio;
3233 bio->bi_end_io = end_sync_write;
3234 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3235 bio->bi_iter.bi_sector = to_addr
3236 + mrdev->data_offset;
3237 bio->bi_bdev = mrdev->bdev;
3238 atomic_inc(&r10_bio->remaining);
3239 } else
3240 r10_bio->devs[1].bio->bi_end_io = NULL;
3241
3242
3243 bio = r10_bio->devs[1].repl_bio;
3244 if (bio)
3245 bio->bi_end_io = NULL;
3246
3247
3248
3249
3250
3251
3252
3253
3254 if (mreplace == NULL || bio == NULL ||
3255 test_bit(Faulty, &mreplace->flags))
3256 break;
3257 bio_reset(bio);
3258 bio->bi_next = biolist;
3259 biolist = bio;
3260 bio->bi_private = r10_bio;
3261 bio->bi_end_io = end_sync_write;
3262 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3263 bio->bi_iter.bi_sector = to_addr +
3264 mreplace->data_offset;
3265 bio->bi_bdev = mreplace->bdev;
3266 atomic_inc(&r10_bio->remaining);
3267 break;
3268 }
3269 rcu_read_unlock();
3270 if (j == conf->copies) {
3271
3272
3273 if (any_working) {
3274
3275
3276
3277 int k;
3278 for (k = 0; k < conf->copies; k++)
3279 if (r10_bio->devs[k].devnum == i)
3280 break;
3281 if (!test_bit(In_sync,
3282 &mrdev->flags)
3283 && !rdev_set_badblocks(
3284 mrdev,
3285 r10_bio->devs[k].addr,
3286 max_sync, 0))
3287 any_working = 0;
3288 if (mreplace &&
3289 !rdev_set_badblocks(
3290 mreplace,
3291 r10_bio->devs[k].addr,
3292 max_sync, 0))
3293 any_working = 0;
3294 }
3295 if (!any_working) {
3296 if (!test_and_set_bit(MD_RECOVERY_INTR,
3297 &mddev->recovery))
3298 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3299 mdname(mddev));
3300 mirror->recovery_disabled
3301 = mddev->recovery_disabled;
3302 }
3303 put_buf(r10_bio);
3304 if (rb2)
3305 atomic_dec(&rb2->remaining);
3306 r10_bio = rb2;
3307 rdev_dec_pending(mrdev, mddev);
3308 if (mreplace)
3309 rdev_dec_pending(mreplace, mddev);
3310 break;
3311 }
3312 rdev_dec_pending(mrdev, mddev);
3313 if (mreplace)
3314 rdev_dec_pending(mreplace, mddev);
3315 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
3316
3317
3318
3319
3320 int targets = 1;
3321 for (; j < conf->copies; j++) {
3322 int d = r10_bio->devs[j].devnum;
3323 if (conf->mirrors[d].rdev &&
3324 test_bit(In_sync,
3325 &conf->mirrors[d].rdev->flags))
3326 targets++;
3327 }
3328 if (targets == 1)
3329 r10_bio->devs[0].bio->bi_opf
3330 &= ~MD_FAILFAST;
3331 }
3332 }
3333 if (biolist == NULL) {
3334 while (r10_bio) {
3335 struct r10bio *rb2 = r10_bio;
3336 r10_bio = (struct r10bio*) rb2->master_bio;
3337 rb2->master_bio = NULL;
3338 put_buf(rb2);
3339 }
3340 goto giveup;
3341 }
3342 } else {
3343
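 /* resync: schedule a read for every in-sync copy of this block
 * so that mismatches can be detected and repaired
 */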
3344 int count = 0;
3345
3346 bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
3347
3348 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3349 &sync_blocks, mddev->degraded) &&
3350 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3351 &mddev->recovery)) {
3352
3353 *skipped = 1;
3354 return sync_blocks + sectors_skipped;
3355 }
3356 if (sync_blocks < max_sync)
3357 max_sync = sync_blocks;
3358 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3359 r10_bio->state = 0;
3360
3361 r10_bio->mddev = mddev;
3362 atomic_set(&r10_bio->remaining, 0);
3363 raise_barrier(conf, 0);
3364 conf->next_resync = sector_nr;
3365
3366 r10_bio->master_bio = NULL;
3367 r10_bio->sector = sector_nr;
3368 set_bit(R10BIO_IsSync, &r10_bio->state);
3369 raid10_find_phys(conf, r10_bio);
3370 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3371
3372 for (i = 0; i < conf->copies; i++) {
3373 int d = r10_bio->devs[i].devnum;
3374 sector_t first_bad, sector;
3375 int bad_sectors;
3376 struct md_rdev *rdev;
3377
3378 if (r10_bio->devs[i].repl_bio)
3379 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3380
3381 bio = r10_bio->devs[i].bio;
3382 bio_reset(bio);
3383 bio->bi_error = -EIO;
3384 rcu_read_lock();
3385 rdev = rcu_dereference(conf->mirrors[d].rdev);
3386 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3387 rcu_read_unlock();
3388 continue;
3389 }
3390 sector = r10_bio->devs[i].addr;
3391 if (is_badblock(rdev, sector, max_sync,
3392 &first_bad, &bad_sectors)) {
3393 if (first_bad > sector)
3394 max_sync = first_bad - sector;
3395 else {
3396 bad_sectors -= (sector - first_bad);
3397 if (max_sync > bad_sectors)
3398 max_sync = bad_sectors;
3399 rcu_read_unlock();
3400 continue;
3401 }
3402 }
3403 atomic_inc(&rdev->nr_pending);
3404 atomic_inc(&r10_bio->remaining);
3405 bio->bi_next = biolist;
3406 biolist = bio;
3407 bio->bi_private = r10_bio;
3408 bio->bi_end_io = end_sync_read;
3409 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3410 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
3411 bio->bi_opf |= MD_FAILFAST;
3412 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3413 bio->bi_bdev = rdev->bdev;
3414 count++;
3415
3416 rdev = rcu_dereference(conf->mirrors[d].replacement);
3417 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3418 rcu_read_unlock();
3419 continue;
3420 }
3421 atomic_inc(&rdev->nr_pending);
3422 rcu_read_unlock();
3423
3424
3425 bio = r10_bio->devs[i].repl_bio;
3426 bio_reset(bio);
3427 bio->bi_error = -EIO;
3428
3429 sector = r10_bio->devs[i].addr;
3430 bio->bi_next = biolist;
3431 biolist = bio;
3432 bio->bi_private = r10_bio;
3433 bio->bi_end_io = end_sync_write;
3434 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3435 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
3436 bio->bi_opf |= MD_FAILFAST;
3437 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3438 bio->bi_bdev = rdev->bdev;
3439 count++;
3440 }
3441
3442 if (count < 2) {
3443 for (i=0; i<conf->copies; i++) {
3444 int d = r10_bio->devs[i].devnum;
3445 if (r10_bio->devs[i].bio->bi_end_io)
3446 rdev_dec_pending(conf->mirrors[d].rdev,
3447 mddev);
3448 if (r10_bio->devs[i].repl_bio &&
3449 r10_bio->devs[i].repl_bio->bi_end_io)
3450 rdev_dec_pending(
3451 conf->mirrors[d].replacement,
3452 mddev);
3453 }
3454 put_buf(r10_bio);
3455 biolist = NULL;
3456 goto giveup;
3457 }
3458 }
3459
3460 nr_sectors = 0;
3461 if (sector_nr + max_sync < max_sector)
3462 max_sector = sector_nr + max_sync;
3463 do {
3464 struct page *page;
3465 int len = PAGE_SIZE;
3466 if (sector_nr + (len>>9) > max_sector)
3467 len = (max_sector - sector_nr) << 9;
3468 if (len == 0)
3469 break;
3470 for (bio= biolist ; bio ; bio=bio->bi_next) {
3471 struct bio *bio2;
3472 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3473 if (bio_add_page(bio, page, len, 0))
3474 continue;
3475
3476
3477 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3478 for (bio2 = biolist;
3479 bio2 && bio2 != bio;
3480 bio2 = bio2->bi_next) {
3481
3482 bio2->bi_vcnt--;
3483 bio2->bi_iter.bi_size -= len;
3484 bio_clear_flag(bio2, BIO_SEG_VALID);
3485 }
3486 goto bio_full;
3487 }
3488 nr_sectors += len>>9;
3489 sector_nr += len>>9;
3490 } while (biolist->bi_vcnt < RESYNC_PAGES);
3491 bio_full:
3492 r10_bio->sectors = nr_sectors;
3493
3494 while (biolist) {
3495 bio = biolist;
3496 biolist = biolist->bi_next;
3497
3498 bio->bi_next = NULL;
3499 r10_bio = bio->bi_private;
3500 r10_bio->sectors = nr_sectors;
3501
3502 if (bio->bi_end_io == end_sync_read) {
3503 md_sync_acct(bio->bi_bdev, nr_sectors);
3504 bio->bi_error = 0;
3505 generic_make_request(bio);
3506 }
3507 }
3508
3509 if (sectors_skipped)
3510
3511
3512
3513 md_done_sync(mddev, sectors_skipped, 1);
3514
3515 return sectors_skipped + nr_sectors;
3516 giveup:
3517
3518
3519
3520
3521 if (sector_nr + max_sync < max_sector)
3522 max_sector = sector_nr + max_sync;
3523
3524 sectors_skipped += (max_sector - sector_nr);
3525 chunks_skipped ++;
3526 sector_nr = max_sector;
3527 goto skipped;
3528}
3529
3530static sector_t
3531raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3532{
3533 sector_t size;
3534 struct r10conf *conf = mddev->private;
3535
3536 if (!raid_disks)
3537 raid_disks = min(conf->geo.raid_disks,
3538 conf->prev.raid_disks);
3539 if (!sectors)
3540 sectors = conf->dev_sectors;
3541
3542 size = sectors >> conf->geo.chunk_shift;
3543 sector_div(size, conf->geo.far_copies);
3544 size = size * raid_disks;
3545 sector_div(size, conf->geo.near_copies);
3546
3547 return size << conf->geo.chunk_shift;
3548}
3549
3550static void calc_sectors(struct r10conf *conf, sector_t size)
3551{
3552
3553
3554
3555
3556
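 /* Calculate the number of sectors-per-device that will
 * actually be used, and set conf->dev_sectors and
 * conf->stride.
 * Illustrative example (not from the source): 4 devices with
 * near_copies=2, far_copies=1 and 100 chunks offered per device
 * give 200 data chunks, 400 chunk copies, and so 100 used chunks
 * on each device.
 */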
3557 size = size >> conf->geo.chunk_shift;
3558 sector_div(size, conf->geo.far_copies);
3559 size = size * conf->geo.raid_disks;
3560 sector_div(size, conf->geo.near_copies);
3561
3562
3563 size = size * conf->copies;
3564
3565
3566
3567
3568 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3569
3570 conf->dev_sectors = size << conf->geo.chunk_shift;
3571
3572 if (conf->geo.far_offset)
3573 conf->geo.stride = 1 << conf->geo.chunk_shift;
3574 else {
3575 sector_div(size, conf->geo.far_copies);
3576 conf->geo.stride = size << conf->geo.chunk_shift;
3577 }
3578}
3579
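/*
 * A raid10 layout word encodes the geometry: bits 0-7 hold the number
 * of near copies, bits 8-15 the number of far copies, bit 16 selects
 * far-offset mode and bits 17-18 select how the far sets are arranged;
 * any higher bit is invalid.  As an illustrative example (values not
 * taken from any particular array): layout 0x102 describes one far
 * copy and two near copies - the common "n2" arrangement - while
 * 0x10201 adds far_offset to a two-far, one-near geometry.
 */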
3580enum geo_type {geo_new, geo_old, geo_start};
3581static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3582{
3583 int nc, fc, fo;
3584 int layout, chunk, disks;
3585 switch (new) {
3586 case geo_old:
3587 layout = mddev->layout;
3588 chunk = mddev->chunk_sectors;
3589 disks = mddev->raid_disks - mddev->delta_disks;
3590 break;
3591 case geo_new:
3592 layout = mddev->new_layout;
3593 chunk = mddev->new_chunk_sectors;
3594 disks = mddev->raid_disks;
3595 break;
3596 default:
3597 case geo_start:
3598
3599 layout = mddev->new_layout;
3600 chunk = mddev->new_chunk_sectors;
3601 disks = mddev->raid_disks + mddev->delta_disks;
3602 break;
3603 }
3604 if (layout >> 19)
3605 return -1;
3606 if (chunk < (PAGE_SIZE >> 9) ||
3607 !is_power_of_2(chunk))
3608 return -2;
3609 nc = layout & 255;
3610 fc = (layout >> 8) & 255;
3611 fo = layout & (1<<16);
3612 geo->raid_disks = disks;
3613 geo->near_copies = nc;
3614 geo->far_copies = fc;
3615 geo->far_offset = fo;
3616 switch (layout >> 17) {
3617 case 0:
3618 geo->far_set_size = disks;
3619 break;
3620 case 1:
3621
3622 geo->far_set_size = disks/fc;
3623 WARN(geo->far_set_size < fc,
3624 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3625 break;
3626 case 2:
3627 geo->far_set_size = fc * nc;
3628 break;
3629 default:
3630 return -1;
3631 }
3632 geo->chunk_mask = chunk - 1;
3633 geo->chunk_shift = ffz(~chunk);
3634 return nc*fc;
3635}
3636
3637static struct r10conf *setup_conf(struct mddev *mddev)
3638{
3639 struct r10conf *conf = NULL;
3640 int err = -EINVAL;
3641 struct geom geo;
3642 int copies;
3643
3644 copies = setup_geo(&geo, mddev, geo_new);
3645
3646 if (copies == -2) {
3647 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3648 mdname(mddev), PAGE_SIZE);
3649 goto out;
3650 }
3651
3652 if (copies < 2 || copies > mddev->raid_disks) {
3653 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3654 mdname(mddev), mddev->new_layout);
3655 goto out;
3656 }
3657
3658 err = -ENOMEM;
3659 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3660 if (!conf)
3661 goto out;
3662
3663
3664 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3665 max(0,-mddev->delta_disks)),
3666 GFP_KERNEL);
3667 if (!conf->mirrors)
3668 goto out;
3669
3670 conf->tmppage = alloc_page(GFP_KERNEL);
3671 if (!conf->tmppage)
3672 goto out;
3673
3674 conf->geo = geo;
3675 conf->copies = copies;
3676 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3677 r10bio_pool_free, conf);
3678 if (!conf->r10bio_pool)
3679 goto out;
3680
3681 calc_sectors(conf, mddev->dev_sectors);
3682 if (mddev->reshape_position == MaxSector) {
3683 conf->prev = conf->geo;
3684 conf->reshape_progress = MaxSector;
3685 } else {
3686 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3687 err = -EINVAL;
3688 goto out;
3689 }
3690 conf->reshape_progress = mddev->reshape_position;
3691 if (conf->prev.far_offset)
3692 conf->prev.stride = 1 << conf->prev.chunk_shift;
3693 else
3694
3695 conf->prev.stride = conf->dev_sectors;
3696 }
3697 conf->reshape_safe = conf->reshape_progress;
3698 spin_lock_init(&conf->device_lock);
3699 INIT_LIST_HEAD(&conf->retry_list);
3700 INIT_LIST_HEAD(&conf->bio_end_io_list);
3701
3702 spin_lock_init(&conf->resync_lock);
3703 init_waitqueue_head(&conf->wait_barrier);
3704 atomic_set(&conf->nr_pending, 0);
3705
3706 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3707 if (!conf->thread)
3708 goto out;
3709
3710 conf->mddev = mddev;
3711 return conf;
3712
3713 out:
3714 if (conf) {
3715 mempool_destroy(conf->r10bio_pool);
3716 kfree(conf->mirrors);
3717 safe_put_page(conf->tmppage);
3718 kfree(conf);
3719 }
3720 return ERR_PTR(err);
3721}
3722
3723static int raid10_run(struct mddev *mddev)
3724{
3725 struct r10conf *conf;
3726 int i, disk_idx, chunk_size;
3727 struct raid10_info *disk;
3728 struct md_rdev *rdev;
3729 sector_t size;
3730 sector_t min_offset_diff = 0;
3731 int first = 1;
3732 bool discard_supported = false;
3733
3734 if (mddev->private == NULL) {
3735 conf = setup_conf(mddev);
3736 if (IS_ERR(conf))
3737 return PTR_ERR(conf);
3738 mddev->private = conf;
3739 }
3740 conf = mddev->private;
3741 if (!conf)
3742 goto out;
3743
3744 mddev->thread = conf->thread;
3745 conf->thread = NULL;
3746
3747 chunk_size = mddev->chunk_sectors << 9;
3748 if (mddev->queue) {
3749 blk_queue_max_discard_sectors(mddev->queue,
3750 mddev->chunk_sectors);
3751 blk_queue_max_write_same_sectors(mddev->queue, 0);
3752 blk_queue_io_min(mddev->queue, chunk_size);
3753 if (conf->geo.raid_disks % conf->geo.near_copies)
3754 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3755 else
3756 blk_queue_io_opt(mddev->queue, chunk_size *
3757 (conf->geo.raid_disks / conf->geo.near_copies));
3758 }
3759
3760 rdev_for_each(rdev, mddev) {
3761 long long diff;
3762 struct request_queue *q;
3763
3764 disk_idx = rdev->raid_disk;
3765 if (disk_idx < 0)
3766 continue;
3767 if (disk_idx >= conf->geo.raid_disks &&
3768 disk_idx >= conf->prev.raid_disks)
3769 continue;
3770 disk = conf->mirrors + disk_idx;
3771
3772 if (test_bit(Replacement, &rdev->flags)) {
3773 if (disk->replacement)
3774 goto out_free_conf;
3775 disk->replacement = rdev;
3776 } else {
3777 if (disk->rdev)
3778 goto out_free_conf;
3779 disk->rdev = rdev;
3780 }
3781 q = bdev_get_queue(rdev->bdev);
3782 diff = (rdev->new_data_offset - rdev->data_offset);
3783 if (!mddev->reshape_backwards)
3784 diff = -diff;
3785 if (diff < 0)
3786 diff = 0;
3787 if (first || diff < min_offset_diff)
3788 min_offset_diff = diff;
3789
3790 if (mddev->gendisk)
3791 disk_stack_limits(mddev->gendisk, rdev->bdev,
3792 rdev->data_offset << 9);
3793
3794 disk->head_position = 0;
3795
3796 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3797 discard_supported = true;
3798 }
3799
3800 if (mddev->queue) {
3801 if (discard_supported)
3802 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3803 mddev->queue);
3804 else
3805 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3806 mddev->queue);
3807 }
3808
3809 if (!enough(conf, -1)) {
3810 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3811 mdname(mddev));
3812 goto out_free_conf;
3813 }
3814
3815 if (conf->reshape_progress != MaxSector) {
3816
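 /* must ensure that shape change is supported */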
3817 if (conf->geo.far_copies != 1 &&
3818 conf->geo.far_offset == 0)
3819 goto out_free_conf;
3820 if (conf->prev.far_copies != 1 &&
3821 conf->prev.far_offset == 0)
3822 goto out_free_conf;
3823 }
3824
3825 mddev->degraded = 0;
3826 for (i = 0;
3827 i < conf->geo.raid_disks
3828 || i < conf->prev.raid_disks;
3829 i++) {
3830
3831 disk = conf->mirrors + i;
3832
3833 if (!disk->rdev && disk->replacement) {
3834
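 /* The replacement is all we have - use it */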
3835 disk->rdev = disk->replacement;
3836 disk->replacement = NULL;
3837 clear_bit(Replacement, &disk->rdev->flags);
3838 }
3839
3840 if (!disk->rdev ||
3841 !test_bit(In_sync, &disk->rdev->flags)) {
3842 disk->head_position = 0;
3843 mddev->degraded++;
3844 if (disk->rdev &&
3845 disk->rdev->saved_raid_disk < 0)
3846 conf->fullsync = 1;
3847 }
3848 disk->recovery_disabled = mddev->recovery_disabled - 1;
3849 }
3850
3851 if (mddev->recovery_cp != MaxSector)
3852 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3853 mdname(mddev));
3854 pr_info("md/raid10:%s: active with %d out of %d devices\n",
3855 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3856 conf->geo.raid_disks);
3857
3858
3859
3860 mddev->dev_sectors = conf->dev_sectors;
3861 size = raid10_size(mddev, 0, 0);
3862 md_set_array_sectors(mddev, size);
3863 mddev->resync_max_sectors = size;
3864 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3865
3866 if (mddev->queue) {
3867 int stripe = conf->geo.raid_disks *
3868 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3869
3870
3871
3872
3873
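 /* Want read-ahead to cover at least two whole data stripes,
 * where a stripe here is chunk_size * raid_disks / near_copies.
 */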
3874 stripe /= conf->geo.near_copies;
3875 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
3876 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
3877 }
3878
3879 if (md_integrity_register(mddev))
3880 goto out_free_conf;
3881
3882 if (conf->reshape_progress != MaxSector) {
3883 unsigned long before_length, after_length;
3884
3885 before_length = ((1 << conf->prev.chunk_shift) *
3886 conf->prev.far_copies);
3887 after_length = ((1 << conf->geo.chunk_shift) *
3888 conf->geo.far_copies);
3889
3890 if (max(before_length, after_length) > min_offset_diff) {
3891
3892 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3893 goto out_free_conf;
3894 }
3895 conf->offset_diff = min_offset_diff;
3896
3897 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3898 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3899 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3900 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3901 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3902 "reshape");
3903 }
3904
3905 return 0;
3906
3907out_free_conf:
3908 md_unregister_thread(&mddev->thread);
3909 mempool_destroy(conf->r10bio_pool);
3910 safe_put_page(conf->tmppage);
3911 kfree(conf->mirrors);
3912 kfree(conf);
3913 mddev->private = NULL;
3914out:
3915 return -EIO;
3916}
3917
3918static void raid10_free(struct mddev *mddev, void *priv)
3919{
3920 struct r10conf *conf = priv;
3921
3922 mempool_destroy(conf->r10bio_pool);
3923 safe_put_page(conf->tmppage);
3924 kfree(conf->mirrors);
3925 kfree(conf->mirrors_old);
3926 kfree(conf->mirrors_new);
3927 kfree(conf);
3928}
3929
3930static void raid10_quiesce(struct mddev *mddev, int state)
3931{
3932 struct r10conf *conf = mddev->private;
3933
3934 switch(state) {
3935 case 1:
3936 raise_barrier(conf, 0);
3937 break;
3938 case 0:
3939 lower_barrier(conf);
3940 break;
3941 }
3942}
3943
3944static int raid10_resize(struct mddev *mddev, sector_t sectors)
3945{
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
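 /* Resize of 'far' arrays is not supported.
 * For 'near' and 'offset' arrays we can set the
 * number of sectors used to be an appropriate multiple
 * of the chunk size.
 * Resizing never changes the layout or the number of
 * devices, only how much of each device is used.
 */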
3958 struct r10conf *conf = mddev->private;
3959 sector_t oldsize, size;
3960
3961 if (mddev->reshape_position != MaxSector)
3962 return -EBUSY;
3963
3964 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3965 return -EINVAL;
3966
3967 oldsize = raid10_size(mddev, 0, 0);
3968 size = raid10_size(mddev, sectors, 0);
3969 if (mddev->external_size &&
3970 mddev->array_sectors > size)
3971 return -EINVAL;
3972 if (mddev->bitmap) {
3973 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3974 if (ret)
3975 return ret;
3976 }
3977 md_set_array_sectors(mddev, size);
3978 if (sectors > mddev->dev_sectors &&
3979 mddev->recovery_cp > oldsize) {
3980 mddev->recovery_cp = oldsize;
3981 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3982 }
3983 calc_sectors(conf, sectors);
3984 mddev->dev_sectors = conf->dev_sectors;
3985 mddev->resync_max_sectors = size;
3986 return 0;
3987}
3988
3989static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
3990{
3991 struct md_rdev *rdev;
3992 struct r10conf *conf;
3993
3994 if (mddev->degraded > 0) {
3995 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
3996 mdname(mddev));
3997 return ERR_PTR(-EINVAL);
3998 }
3999 sector_div(size, devs);
4000
4001
4002 mddev->new_level = 10;
4003
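 /* new layout: far_copies = 1, near_copies = 2 */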
4004 mddev->new_layout = (1<<8) + 2;
4005 mddev->new_chunk_sectors = mddev->chunk_sectors;
4006 mddev->delta_disks = mddev->raid_disks;
4007 mddev->raid_disks *= 2;
4008
4009 mddev->recovery_cp = MaxSector;
4010 mddev->dev_sectors = size;
4011
4012 conf = setup_conf(mddev);
4013 if (!IS_ERR(conf)) {
4014 rdev_for_each(rdev, mddev)
4015 if (rdev->raid_disk >= 0) {
4016 rdev->new_raid_disk = rdev->raid_disk * 2;
4017 rdev->sectors = size;
4018 }
4019 conf->barrier = 1;
4020 }
4021
4022 return conf;
4023}
4024
4025static void *raid10_takeover(struct mddev *mddev)
4026{
4027 struct r0conf *raid0_conf;
4028
4029
4030
4031
4032 if (mddev->level == 0) {
4033
4034 raid0_conf = mddev->private;
4035 if (raid0_conf->nr_strip_zones > 1) {
4036 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4037 mdname(mddev));
4038 return ERR_PTR(-EINVAL);
4039 }
4040 return raid10_takeover_raid0(mddev,
4041 raid0_conf->strip_zone->zone_end,
4042 raid0_conf->strip_zone->nb_dev);
4043 }
4044 return ERR_PTR(-EINVAL);
4045}
4046
4047static int raid10_check_reshape(struct mddev *mddev)
4048{
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
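 /* Called when there is a request to change
 * - layout (to ->new_layout)
 * - chunk size (to ->new_chunk_sectors)
 * - raid_disks (by delta_disks)
 * or when trying to restart a reshape that was ongoing.
 *
 * We need to validate the request and possibly allocate
 * space if that might be an issue later.
 *
 * Currently we reject any reshape of a 'far' mode array,
 * allow chunk size to change if the new value is acceptable,
 * allow raid_disks to increase, and allow
 * a switch between 'near' mode and 'offset' mode.
 */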
4063 struct r10conf *conf = mddev->private;
4064 struct geom geo;
4065
4066 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4067 return -EINVAL;
4068
4069 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
4070
4071 return -EINVAL;
4072 if (geo.far_copies > 1 && !geo.far_offset)
4073
4074 return -EINVAL;
4075
4076 if (mddev->array_sectors & geo.chunk_mask)
4077
4078 return -EINVAL;
4079
4080 if (!enough(conf, -1))
4081 return -EINVAL;
4082
4083 kfree(conf->mirrors_new);
4084 conf->mirrors_new = NULL;
4085 if (mddev->delta_disks > 0) {
4086
4087 conf->mirrors_new = kzalloc(
4088 sizeof(struct raid10_info)
4089 *(mddev->raid_disks +
4090 mddev->delta_disks),
4091 GFP_KERNEL);
4092 if (!conf->mirrors_new)
4093 return -ENOMEM;
4094 }
4095 return 0;
4096}
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
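/*
 * Count how many devices should be considered failed or missing,
 * checking both the pre-reshape ('prev') and post-reshape ('geo')
 * layouts when a reshape is in progress and returning the worse of
 * the two counts.
 */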
4111static int calc_degraded(struct r10conf *conf)
4112{
4113 int degraded, degraded2;
4114 int i;
4115
4116 rcu_read_lock();
4117 degraded = 0;
4118
4119 for (i = 0; i < conf->prev.raid_disks; i++) {
4120 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4121 if (!rdev || test_bit(Faulty, &rdev->flags))
4122 degraded++;
4123 else if (!test_bit(In_sync, &rdev->flags))
4124
4125
4126
4127
4128 degraded++;
4129 }
4130 rcu_read_unlock();
4131 if (conf->geo.raid_disks == conf->prev.raid_disks)
4132 return degraded;
4133 rcu_read_lock();
4134 degraded2 = 0;
4135 for (i = 0; i < conf->geo.raid_disks; i++) {
4136 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4137 if (!rdev || test_bit(Faulty, &rdev->flags))
4138 degraded2++;
4139 else if (!test_bit(In_sync, &rdev->flags)) {
4140
4141
4142
4143
4144
4145 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4146 degraded2++;
4147 }
4148 }
4149 rcu_read_unlock();
4150 if (degraded2 > degraded)
4151 return degraded2;
4152 return degraded;
4153}
4154
4155static int raid10_start_reshape(struct mddev *mddev)
4156{
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
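 /* A 'reshape' has been requested.  This commits the various
 * 'new' fields taken from the geo_start geometry, checks that the
 * data offsets leave enough headroom and that enough spares are
 * available, adds the spares, and starts the reshape thread.
 */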
4167 unsigned long before_length, after_length;
4168 sector_t min_offset_diff = 0;
4169 int first = 1;
4170 struct geom new;
4171 struct r10conf *conf = mddev->private;
4172 struct md_rdev *rdev;
4173 int spares = 0;
4174 int ret;
4175
4176 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4177 return -EBUSY;
4178
4179 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4180 return -EINVAL;
4181
4182 before_length = ((1 << conf->prev.chunk_shift) *
4183 conf->prev.far_copies);
4184 after_length = ((1 << conf->geo.chunk_shift) *
4185 conf->geo.far_copies);
4186
4187 rdev_for_each(rdev, mddev) {
4188 if (!test_bit(In_sync, &rdev->flags)
4189 && !test_bit(Faulty, &rdev->flags))
4190 spares++;
4191 if (rdev->raid_disk >= 0) {
4192 long long diff = (rdev->new_data_offset
4193 - rdev->data_offset);
4194 if (!mddev->reshape_backwards)
4195 diff = -diff;
4196 if (diff < 0)
4197 diff = 0;
4198 if (first || diff < min_offset_diff)
4199 min_offset_diff = diff;
4200 }
4201 }
4202
4203 if (max(before_length, after_length) > min_offset_diff)
4204 return -EINVAL;
4205
4206 if (spares < mddev->delta_disks)
4207 return -EINVAL;
4208
4209 conf->offset_diff = min_offset_diff;
4210 spin_lock_irq(&conf->device_lock);
4211 if (conf->mirrors_new) {
4212 memcpy(conf->mirrors_new, conf->mirrors,
4213 sizeof(struct raid10_info)*conf->prev.raid_disks);
4214 smp_mb();
4215 kfree(conf->mirrors_old);
4216 conf->mirrors_old = conf->mirrors;
4217 conf->mirrors = conf->mirrors_new;
4218 conf->mirrors_new = NULL;
4219 }
4220 setup_geo(&conf->geo, mddev, geo_start);
4221 smp_mb();
4222 if (mddev->reshape_backwards) {
4223 sector_t size = raid10_size(mddev, 0, 0);
4224 if (size < mddev->array_sectors) {
4225 spin_unlock_irq(&conf->device_lock);
4226 pr_warn("md/raid10:%s: array size must be reduce before number of disks\n",
4227 mdname(mddev));
4228 return -EINVAL;
4229 }
4230 mddev->resync_max_sectors = size;
4231 conf->reshape_progress = size;
4232 } else
4233 conf->reshape_progress = 0;
4234 conf->reshape_safe = conf->reshape_progress;
4235 spin_unlock_irq(&conf->device_lock);
4236
4237 if (mddev->delta_disks && mddev->bitmap) {
4238 ret = bitmap_resize(mddev->bitmap,
4239 raid10_size(mddev, 0,
4240 conf->geo.raid_disks),
4241 0, 0);
4242 if (ret)
4243 goto abort;
4244 }
4245 if (mddev->delta_disks > 0) {
4246 rdev_for_each(rdev, mddev)
4247 if (rdev->raid_disk < 0 &&
4248 !test_bit(Faulty, &rdev->flags)) {
4249 if (raid10_add_disk(mddev, rdev) == 0) {
4250 if (rdev->raid_disk >=
4251 conf->prev.raid_disks)
4252 set_bit(In_sync, &rdev->flags);
4253 else
4254 rdev->recovery_offset = 0;
4255
4256 if (sysfs_link_rdev(mddev, rdev))
4257 /* failure here is OK */;
4258 }
4259 } else if (rdev->raid_disk >= conf->prev.raid_disks
4260 && !test_bit(Faulty, &rdev->flags)) {
4261
4262 set_bit(In_sync, &rdev->flags);
4263 }
4264 }
4265
4266
4267
4268
4269 spin_lock_irq(&conf->device_lock);
4270 mddev->degraded = calc_degraded(conf);
4271 spin_unlock_irq(&conf->device_lock);
4272 mddev->raid_disks = conf->geo.raid_disks;
4273 mddev->reshape_position = conf->reshape_progress;
4274 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4275
4276 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4277 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4278 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4279 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4280 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4281
4282 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4283 "reshape");
4284 if (!mddev->sync_thread) {
4285 ret = -EAGAIN;
4286 goto abort;
4287 }
4288 conf->reshape_checkpoint = jiffies;
4289 md_wakeup_thread(mddev->sync_thread);
4290 md_new_event(mddev);
4291 return 0;
4292
4293abort:
4294 mddev->recovery = 0;
4295 spin_lock_irq(&conf->device_lock);
4296 conf->geo = conf->prev;
4297 mddev->raid_disks = conf->geo.raid_disks;
4298 rdev_for_each(rdev, mddev)
4299 rdev->new_data_offset = rdev->data_offset;
4300 smp_wmb();
4301 conf->reshape_progress = MaxSector;
4302 conf->reshape_safe = MaxSector;
4303 mddev->reshape_position = MaxSector;
4304 spin_unlock_irq(&conf->device_lock);
4305 return ret;
4306}
4307
4308
4309
4310
4311
4312
4313
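/*
 * Calculate the highest device address that any copy of the chunk
 * containing array address 's' could occupy in the given geometry.
 * The result is always the start of the following chunk.
 */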
4314static sector_t last_dev_address(sector_t s, struct geom *geo)
4315{
4316 s = (s | geo->chunk_mask) + 1;
4317 s >>= geo->chunk_shift;
4318 s *= geo->near_copies;
4319 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4320 s *= geo->far_copies;
4321 s <<= geo->chunk_shift;
4322 return s;
4323}
4324
4325
4326
4327
4328
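/*
 * Calculate the first device address that could contain any block
 * from the chunk that includes array address 's'.  This too is the
 * start of a chunk.
 */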
4329static sector_t first_dev_address(sector_t s, struct geom *geo)
4330{
4331 s >>= geo->chunk_shift;
4332 s *= geo->near_copies;
4333 sector_div(s, geo->raid_disks);
4334 s *= geo->far_copies;
4335 s <<= geo->chunk_shift;
4336 return s;
4337}
4338
4339static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4340 int *skipped)
4341{
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
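 /* We copy at most one chunk (the smaller of the old and new chunk
 * sizes) at a time, possibly less if that exceeds RESYNC_BLOCK_SIZE
 * or we hit a bad block.  mddev->reshape_position can record any
 * location, so pausing mid-chunk for normal IO is fine.
 *
 * Before writing to a device offset that is not yet recorded as
 * 'safe' in the on-disk metadata we must flush outstanding reshape
 * writes and update the metadata: last_dev_address() gives the
 * furthest device offset this request might touch, and
 * first_dev_address() gives the earliest offset still protected by
 * conf->reshape_safe; when reshaping backwards the comparison is
 * reversed.
 */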
4379 struct r10conf *conf = mddev->private;
4380 struct r10bio *r10_bio;
4381 sector_t next, safe, last;
4382 int max_sectors;
4383 int nr_sectors;
4384 int s;
4385 struct md_rdev *rdev;
4386 int need_flush = 0;
4387 struct bio *blist;
4388 struct bio *bio, *read_bio;
4389 int sectors_done = 0;
4390
4391 if (sector_nr == 0) {
4392
4393 if (mddev->reshape_backwards &&
4394 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4395 sector_nr = (raid10_size(mddev, 0, 0)
4396 - conf->reshape_progress);
4397 } else if (!mddev->reshape_backwards &&
4398 conf->reshape_progress > 0)
4399 sector_nr = conf->reshape_progress;
4400 if (sector_nr) {
4401 mddev->curr_resync_completed = sector_nr;
4402 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4403 *skipped = 1;
4404 return sector_nr;
4405 }
4406 }
4407
4408
4409
4410
4411
4412 if (mddev->reshape_backwards) {
4413
4414
4415
4416 next = first_dev_address(conf->reshape_progress - 1,
4417 &conf->geo);
4418
4419
4420
4421
4422 safe = last_dev_address(conf->reshape_safe - 1,
4423 &conf->prev);
4424
4425 if (next + conf->offset_diff < safe)
4426 need_flush = 1;
4427
4428 last = conf->reshape_progress - 1;
4429 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4430 & conf->prev.chunk_mask);
4431 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4432 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4433 } else {
4434
4435
4436
4437 next = last_dev_address(conf->reshape_progress, &conf->geo);
4438
4439
4440
4441
4442 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4443
4444
4445
4446
4447 if (next > safe + conf->offset_diff)
4448 need_flush = 1;
4449
4450 sector_nr = conf->reshape_progress;
4451 last = sector_nr | (conf->geo.chunk_mask
4452 & conf->prev.chunk_mask);
4453
4454 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4455 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4456 }
4457
4458 if (need_flush ||
4459 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4460
4461 wait_barrier(conf);
4462 mddev->reshape_position = conf->reshape_progress;
4463 if (mddev->reshape_backwards)
4464 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4465 - conf->reshape_progress;
4466 else
4467 mddev->curr_resync_completed = conf->reshape_progress;
4468 conf->reshape_checkpoint = jiffies;
4469 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4470 md_wakeup_thread(mddev->thread);
4471 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4472 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4473 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4474 allow_barrier(conf);
4475 return sectors_done;
4476 }
4477 conf->reshape_safe = mddev->reshape_position;
4478 allow_barrier(conf);
4479 }
4480
4481read_more:
4482
4483 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4484 r10_bio->state = 0;
4485 raise_barrier(conf, sectors_done != 0);
4486 atomic_set(&r10_bio->remaining, 0);
4487 r10_bio->mddev = mddev;
4488 r10_bio->sector = sector_nr;
4489 set_bit(R10BIO_IsReshape, &r10_bio->state);
4490 r10_bio->sectors = last - sector_nr + 1;
4491 rdev = read_balance(conf, r10_bio, &max_sectors);
4492 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4493
4494 if (!rdev) {
4495
4496
4497
4498
4499 mempool_free(r10_bio, conf->r10buf_pool);
4500 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4501 return sectors_done;
4502 }
4503
4504 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4505
4506 read_bio->bi_bdev = rdev->bdev;
4507 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4508 + rdev->data_offset);
4509 read_bio->bi_private = r10_bio;
4510 read_bio->bi_end_io = end_sync_read;
4511 bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4512 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4513 read_bio->bi_error = 0;
4514 read_bio->bi_vcnt = 0;
4515 read_bio->bi_iter.bi_size = 0;
4516 r10_bio->master_bio = read_bio;
4517 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4518
4519
4520 __raid10_find_phys(&conf->geo, r10_bio);
4521
4522 blist = read_bio;
4523 read_bio->bi_next = NULL;
4524
4525 rcu_read_lock();
4526 for (s = 0; s < conf->copies*2; s++) {
4527 struct bio *b;
4528 int d = r10_bio->devs[s/2].devnum;
4529 struct md_rdev *rdev2;
4530 if (s&1) {
4531 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4532 b = r10_bio->devs[s/2].repl_bio;
4533 } else {
4534 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4535 b = r10_bio->devs[s/2].bio;
4536 }
4537 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4538 continue;
4539
4540 bio_reset(b);
4541 b->bi_bdev = rdev2->bdev;
4542 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4543 rdev2->new_data_offset;
4544 b->bi_private = r10_bio;
4545 b->bi_end_io = end_reshape_write;
4546 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4547 b->bi_next = blist;
4548 blist = b;
4549 }
4550
4551
4552
4553 nr_sectors = 0;
4554 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4555 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4556 int len = (max_sectors - s) << 9;
4557 if (len > PAGE_SIZE)
4558 len = PAGE_SIZE;
4559 for (bio = blist; bio ; bio = bio->bi_next) {
4560 struct bio *bio2;
4561 if (bio_add_page(bio, page, len, 0))
4562 continue;
4563
4564
4565 for (bio2 = blist;
4566 bio2 && bio2 != bio;
4567 bio2 = bio2->bi_next) {
4568
4569 bio2->bi_vcnt--;
4570 bio2->bi_iter.bi_size -= len;
4571 bio_clear_flag(bio2, BIO_SEG_VALID);
4572 }
4573 goto bio_full;
4574 }
4575 sector_nr += len >> 9;
4576 nr_sectors += len >> 9;
4577 }
4578bio_full:
4579 rcu_read_unlock();
4580 r10_bio->sectors = nr_sectors;
4581
4582
4583 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4584 atomic_inc(&r10_bio->remaining);
4585 read_bio->bi_next = NULL;
4586 generic_make_request(read_bio);
4587 sector_nr += nr_sectors;
4588 sectors_done += nr_sectors;
4589 if (sector_nr <= last)
4590 goto read_more;
4591
4592
4593
4594
4595 if (mddev->reshape_backwards)
4596 conf->reshape_progress -= sectors_done;
4597 else
4598 conf->reshape_progress += sectors_done;
4599
4600 return sectors_done;
4601}
4602
4603static void end_reshape_request(struct r10bio *r10_bio);
4604static int handle_reshape_read_error(struct mddev *mddev,
4605 struct r10bio *r10_bio);
4606static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4607{
4608
4609
4610
4611
4612
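 /* Reshape read completed.  Hopefully we have a block to write out.
 * If we got a read error then we do sync 1-page reads from elsewhere
 * until we find the data - or give up.
 */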
4613 struct r10conf *conf = mddev->private;
4614 int s;
4615
4616 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4617 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4618
4619 md_done_sync(mddev, r10_bio->sectors, 0);
4620 return;
4621 }
4622
4623
4624
4625
4626 atomic_set(&r10_bio->remaining, 1);
4627 for (s = 0; s < conf->copies*2; s++) {
4628 struct bio *b;
4629 int d = r10_bio->devs[s/2].devnum;
4630 struct md_rdev *rdev;
4631 rcu_read_lock();
4632 if (s&1) {
4633 rdev = rcu_dereference(conf->mirrors[d].replacement);
4634 b = r10_bio->devs[s/2].repl_bio;
4635 } else {
4636 rdev = rcu_dereference(conf->mirrors[d].rdev);
4637 b = r10_bio->devs[s/2].bio;
4638 }
4639 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4640 rcu_read_unlock();
4641 continue;
4642 }
4643 atomic_inc(&rdev->nr_pending);
4644 rcu_read_unlock();
4645 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4646 atomic_inc(&r10_bio->remaining);
4647 b->bi_next = NULL;
4648 generic_make_request(b);
4649 }
4650 end_reshape_request(r10_bio);
4651}
4652
4653static void end_reshape(struct r10conf *conf)
4654{
4655 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4656 return;
4657
4658 spin_lock_irq(&conf->device_lock);
4659 conf->prev = conf->geo;
4660 md_finish_reshape(conf->mddev);
4661 smp_wmb();
4662 conf->reshape_progress = MaxSector;
4663 conf->reshape_safe = MaxSector;
4664 spin_unlock_irq(&conf->device_lock);
4665
4666
4667
4668
4669 if (conf->mddev->queue) {
4670 int stripe = conf->geo.raid_disks *
4671 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4672 stripe /= conf->geo.near_copies;
4673 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
4674 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
4675 }
4676 conf->fullsync = 0;
4677}
4678
4679static int handle_reshape_read_error(struct mddev *mddev,
4680 struct r10bio *r10_bio)
4681{
4682
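 /* Use sync reads to get the blocks from somewhere else */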
4683 int sectors = r10_bio->sectors;
4684 struct r10conf *conf = mddev->private;
4685 struct {
4686 struct r10bio r10_bio;
4687 struct r10dev devs[conf->copies];
4688 } on_stack;
4689 struct r10bio *r10b = &on_stack.r10_bio;
4690 int slot = 0;
4691 int idx = 0;
4692 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4693
4694 r10b->sector = r10_bio->sector;
4695 __raid10_find_phys(&conf->prev, r10b);
4696
4697 while (sectors) {
4698 int s = sectors;
4699 int success = 0;
4700 int first_slot = slot;
4701
4702 if (s > (PAGE_SIZE >> 9))
4703 s = PAGE_SIZE >> 9;
4704
4705 rcu_read_lock();
4706 while (!success) {
4707 int d = r10b->devs[slot].devnum;
4708 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4709 sector_t addr;
4710 if (rdev == NULL ||
4711 test_bit(Faulty, &rdev->flags) ||
4712 !test_bit(In_sync, &rdev->flags))
4713 goto failed;
4714
4715 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4716 atomic_inc(&rdev->nr_pending);
4717 rcu_read_unlock();
4718 success = sync_page_io(rdev,
4719 addr,
4720 s << 9,
4721 bvec[idx].bv_page,
4722 REQ_OP_READ, 0, false);
4723 rdev_dec_pending(rdev, mddev);
4724 rcu_read_lock();
4725 if (success)
4726 break;
4727 failed:
4728 slot++;
4729 if (slot >= conf->copies)
4730 slot = 0;
4731 if (slot == first_slot)
4732 break;
4733 }
4734 rcu_read_unlock();
4735 if (!success) {
4736
4737 set_bit(MD_RECOVERY_INTR,
4738 &mddev->recovery);
4739 return -EIO;
4740 }
4741 sectors -= s;
4742 idx++;
4743 }
4744 return 0;
4745}
4746
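/*
 * Completion handler for the writes submitted by reshape_request_write().
 * A write error fails the target device (recording a bad block instead is
 * still a FIXME); in every case the rdev's pending count is dropped and one
 * reference on the r10_bio is released.
 */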
static void end_reshape_write(struct bio *bio)
{
        struct r10bio *r10_bio = bio->bi_private;
        struct mddev *mddev = r10_bio->mddev;
        struct r10conf *conf = mddev->private;
        int d;
        int slot;
        int repl;
        struct md_rdev *rdev = NULL;

        d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
        if (repl)
                rdev = conf->mirrors[d].replacement;
        if (!rdev) {
                smp_mb();
                rdev = conf->mirrors[d].rdev;
        }

        if (bio->bi_error) {
                /* FIXME should record badblock */
                md_error(mddev, rdev);
        }

        rdev_dec_pending(rdev, mddev);
        end_reshape_request(r10_bio);
}

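/*
 * Drop one reference on a reshape r10_bio.  When the last reference goes
 * (every submitted write plus the initial one taken in
 * reshape_request_write()), credit the sectors to the sync accounting,
 * release the master bio and return the buffer to the pool.
 */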
static void end_reshape_request(struct r10bio *r10_bio)
{
        if (!atomic_dec_and_test(&r10_bio->remaining))
                return;
        md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
        bio_put(r10_bio->master_bio);
        put_buf(r10_bio);
}

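/*
 * Invoked through the personality's ->finish_reshape hook once md has wound
 * up a reshape (typically one started from user space, e.g. with something
 * like "mdadm --grow /dev/md0 --raid-devices=6").  A grown array takes its
 * new size and may request a further recovery pass over the added space; a
 * shrunk array marks the now-unused devices as no longer In_sync.  Finally
 * the new layout, chunk size and reshape state are committed to the mddev.
 */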
static void raid10_finish_reshape(struct mddev *mddev)
{
        struct r10conf *conf = mddev->private;

        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                return;

        if (mddev->delta_disks > 0) {
                sector_t size = raid10_size(mddev, 0, 0);
                md_set_array_sectors(mddev, size);
                if (mddev->recovery_cp > mddev->resync_max_sectors) {
                        mddev->recovery_cp = mddev->resync_max_sectors;
                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                }
                mddev->resync_max_sectors = size;
                if (mddev->queue) {
                        set_capacity(mddev->gendisk, mddev->array_sectors);
                        revalidate_disk(mddev->gendisk);
                }
        } else {
                /* If the array has shrunk, devices beyond the new
                 * raid_disks are no longer in use; drop their In_sync flag.
                 */
                int d;
                rcu_read_lock();
                for (d = conf->geo.raid_disks ;
                     d < conf->geo.raid_disks - mddev->delta_disks;
                     d++) {
                        struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
                        if (rdev)
                                clear_bit(In_sync, &rdev->flags);
                        rdev = rcu_dereference(conf->mirrors[d].replacement);
                        if (rdev)
                                clear_bit(In_sync, &rdev->flags);
                }
                rcu_read_unlock();
        }
        mddev->layout = mddev->new_layout;
        mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
        mddev->reshape_position = MaxSector;
        mddev->delta_disks = 0;
        mddev->reshape_backwards = 0;
}

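/*
 * The raid10 personality: the operations the md core calls for level-10
 * arrays (for example, arrays created with something like
 * "mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/sd[b-e]1").
 * It is registered from raid_init() below.
 */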
static struct md_personality raid10_personality =
{
        .name           = "raid10",
        .level          = 10,
        .owner          = THIS_MODULE,
        .make_request   = raid10_make_request,
        .run            = raid10_run,
        .free           = raid10_free,
        .status         = raid10_status,
        .error_handler  = raid10_error,
        .hot_add_disk   = raid10_add_disk,
        .hot_remove_disk= raid10_remove_disk,
        .spare_active   = raid10_spare_active,
        .sync_request   = raid10_sync_request,
        .quiesce        = raid10_quiesce,
        .size           = raid10_size,
        .resize         = raid10_resize,
        .takeover       = raid10_takeover,
        .check_reshape  = raid10_check_reshape,
        .start_reshape  = raid10_start_reshape,
        .finish_reshape = raid10_finish_reshape,
        .congested      = raid10_congested,
};

static int __init raid_init(void)
{
        return register_md_personality(&raid10_personality);
}

static void raid_exit(void)
{
        unregister_md_personality(&raid10_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");

module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);