1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <linux/blkdev.h>
24#include <linux/module.h>
25#include <linux/seq_file.h>
26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
28#include <trace/events/block.h>
29#include "md.h"
30#include "raid10.h"
31#include "raid0.h"
32#include "bitmap.h"
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78#define NR_RAID10_BIOS 256
79
80
81
82
83
84
85#define IO_BLOCKED ((struct bio *)1)
86
87
88
89
90#define IO_MADE_GOOD ((struct bio *)2)
91
92#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
93
94
95
96
97
98static int max_queued_requests = 1024;
99
100static void allow_barrier(struct r10conf *conf);
101static void lower_barrier(struct r10conf *conf);
102static int _enough(struct r10conf *conf, int previous, int ignore);
103static int enough(struct r10conf *conf, int ignore);
104static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
105 int *skipped);
106static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
107static void end_reshape_write(struct bio *bio);
108static void end_reshape(struct r10conf *conf);
109
110#define raid10_log(md, fmt, args...) \
111 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
112
113#include "raid1-10.c"
114
115
116
117
118
119static inline struct r10bio *get_resync_r10bio(struct bio *bio)
120{
121 return get_resync_pages(bio)->raid_bio;
122}
123
124static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
125{
126 struct r10conf *conf = data;
127 int size = offsetof(struct r10bio, devs[conf->copies]);
128
129
130
131 return kzalloc(size, gfp_flags);
132}
133
134static void r10bio_pool_free(void *r10_bio, void *data)
135{
136 kfree(r10_bio);
137}
138
139
140#define RESYNC_WINDOW (1024*1024)
141
142#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
143
144
145
146
147
148
149
150
151static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
152{
153 struct r10conf *conf = data;
154 struct r10bio *r10_bio;
155 struct bio *bio;
156 int j;
157 int nalloc, nalloc_rp;
158 struct resync_pages *rps;
159
160 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
161 if (!r10_bio)
162 return NULL;
163
164 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
165 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
166 nalloc = conf->copies;
167 else
168 nalloc = 2;
169
170
171 if (!conf->have_replacement)
172 nalloc_rp = nalloc;
173 else
174 nalloc_rp = nalloc * 2;
175 rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
176 if (!rps)
177 goto out_free_r10bio;
178
179
180
181
182 for (j = nalloc ; j-- ; ) {
183 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
184 if (!bio)
185 goto out_free_bio;
186 r10_bio->devs[j].bio = bio;
187 if (!conf->have_replacement)
188 continue;
189 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
190 if (!bio)
191 goto out_free_bio;
192 r10_bio->devs[j].repl_bio = bio;
193 }
194
195
196
197
198 for (j = 0; j < nalloc; j++) {
199 struct bio *rbio = r10_bio->devs[j].repl_bio;
200 struct resync_pages *rp, *rp_repl;
201
202 rp = &rps[j];
203 if (rbio)
204 rp_repl = &rps[nalloc + j];
205
206 bio = r10_bio->devs[j].bio;
207
208 if (!j || test_bit(MD_RECOVERY_SYNC,
209 &conf->mddev->recovery)) {
210 if (resync_alloc_pages(rp, gfp_flags))
211 goto out_free_pages;
212 } else {
213 memcpy(rp, &rps[0], sizeof(*rp));
214 resync_get_all_pages(rp);
215 }
216
217 rp->raid_bio = r10_bio;
218 bio->bi_private = rp;
219 if (rbio) {
220 memcpy(rp_repl, rp, sizeof(*rp));
221 rbio->bi_private = rp_repl;
222 }
223 }
224
225 return r10_bio;
226
227out_free_pages:
228 while (--j >= 0)
229 resync_free_pages(&rps[j * 2]);
230
231 j = 0;
232out_free_bio:
233 for ( ; j < nalloc; j++) {
234 if (r10_bio->devs[j].bio)
235 bio_put(r10_bio->devs[j].bio);
236 if (r10_bio->devs[j].repl_bio)
237 bio_put(r10_bio->devs[j].repl_bio);
238 }
239 kfree(rps);
240out_free_r10bio:
241 r10bio_pool_free(r10_bio, conf);
242 return NULL;
243}
244
245static void r10buf_pool_free(void *__r10_bio, void *data)
246{
247 struct r10conf *conf = data;
248 struct r10bio *r10bio = __r10_bio;
249 int j;
250 struct resync_pages *rp = NULL;
251
252 for (j = conf->copies; j--; ) {
253 struct bio *bio = r10bio->devs[j].bio;
254
255 rp = get_resync_pages(bio);
256 resync_free_pages(rp);
257 bio_put(bio);
258
259 bio = r10bio->devs[j].repl_bio;
260 if (bio)
261 bio_put(bio);
262 }
263
264
265 kfree(rp);
266
267 r10bio_pool_free(r10bio, conf);
268}
269
270static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
271{
272 int i;
273
274 for (i = 0; i < conf->copies; i++) {
275 struct bio **bio = & r10_bio->devs[i].bio;
276 if (!BIO_SPECIAL(*bio))
277 bio_put(*bio);
278 *bio = NULL;
279 bio = &r10_bio->devs[i].repl_bio;
280 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
281 bio_put(*bio);
282 *bio = NULL;
283 }
284}
285
286static void free_r10bio(struct r10bio *r10_bio)
287{
288 struct r10conf *conf = r10_bio->mddev->private;
289
290 put_all_bios(conf, r10_bio);
291 mempool_free(r10_bio, conf->r10bio_pool);
292}
293
294static void put_buf(struct r10bio *r10_bio)
295{
296 struct r10conf *conf = r10_bio->mddev->private;
297
298 mempool_free(r10_bio, conf->r10buf_pool);
299
300 lower_barrier(conf);
301}
302
303static void reschedule_retry(struct r10bio *r10_bio)
304{
305 unsigned long flags;
306 struct mddev *mddev = r10_bio->mddev;
307 struct r10conf *conf = mddev->private;
308
309 spin_lock_irqsave(&conf->device_lock, flags);
310 list_add(&r10_bio->retry_list, &conf->retry_list);
311 conf->nr_queued ++;
312 spin_unlock_irqrestore(&conf->device_lock, flags);
313
314
315 wake_up(&conf->wait_barrier);
316
317 md_wakeup_thread(mddev->thread);
318}
319
320
321
322
323
324
325static void raid_end_bio_io(struct r10bio *r10_bio)
326{
327 struct bio *bio = r10_bio->master_bio;
328 struct r10conf *conf = r10_bio->mddev->private;
329
330 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
331 bio->bi_status = BLK_STS_IOERR;
332
333 bio_endio(bio);
334
335
336
337
338 allow_barrier(conf);
339
340 free_r10bio(r10_bio);
341}
342
343
344
345
346static inline void update_head_pos(int slot, struct r10bio *r10_bio)
347{
348 struct r10conf *conf = r10_bio->mddev->private;
349
350 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
351 r10_bio->devs[slot].addr + (r10_bio->sectors);
352}
353
354
355
356
357static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
358 struct bio *bio, int *slotp, int *replp)
359{
360 int slot;
361 int repl = 0;
362
363 for (slot = 0; slot < conf->copies; slot++) {
364 if (r10_bio->devs[slot].bio == bio)
365 break;
366 if (r10_bio->devs[slot].repl_bio == bio) {
367 repl = 1;
368 break;
369 }
370 }
371
372 BUG_ON(slot == conf->copies);
373 update_head_pos(slot, r10_bio);
374
375 if (slotp)
376 *slotp = slot;
377 if (replp)
378 *replp = repl;
379 return r10_bio->devs[slot].devnum;
380}
381
382static void raid10_end_read_request(struct bio *bio)
383{
384 int uptodate = !bio->bi_status;
385 struct r10bio *r10_bio = bio->bi_private;
386 int slot, dev;
387 struct md_rdev *rdev;
388 struct r10conf *conf = r10_bio->mddev->private;
389
390 slot = r10_bio->read_slot;
391 dev = r10_bio->devs[slot].devnum;
392 rdev = r10_bio->devs[slot].rdev;
393
394
395
396 update_head_pos(slot, r10_bio);
397
398 if (uptodate) {
399
400
401
402
403
404
405
406
407
408 set_bit(R10BIO_Uptodate, &r10_bio->state);
409 } else {
410
411
412
413
414
415 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
416 rdev->raid_disk))
417 uptodate = 1;
418 }
419 if (uptodate) {
420 raid_end_bio_io(r10_bio);
421 rdev_dec_pending(rdev, conf->mddev);
422 } else {
423
424
425
426 char b[BDEVNAME_SIZE];
427 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
428 mdname(conf->mddev),
429 bdevname(rdev->bdev, b),
430 (unsigned long long)r10_bio->sector);
431 set_bit(R10BIO_ReadError, &r10_bio->state);
432 reschedule_retry(r10_bio);
433 }
434}
435
436static void close_write(struct r10bio *r10_bio)
437{
438
439 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
440 r10_bio->sectors,
441 !test_bit(R10BIO_Degraded, &r10_bio->state),
442 0);
443 md_write_end(r10_bio->mddev);
444}
445
446static void one_write_done(struct r10bio *r10_bio)
447{
448 if (atomic_dec_and_test(&r10_bio->remaining)) {
449 if (test_bit(R10BIO_WriteError, &r10_bio->state))
450 reschedule_retry(r10_bio);
451 else {
452 close_write(r10_bio);
453 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
454 reschedule_retry(r10_bio);
455 else
456 raid_end_bio_io(r10_bio);
457 }
458 }
459}
460
461static void raid10_end_write_request(struct bio *bio)
462{
463 struct r10bio *r10_bio = bio->bi_private;
464 int dev;
465 int dec_rdev = 1;
466 struct r10conf *conf = r10_bio->mddev->private;
467 int slot, repl;
468 struct md_rdev *rdev = NULL;
469 struct bio *to_put = NULL;
470 bool discard_error;
471
472 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
473
474 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
475
476 if (repl)
477 rdev = conf->mirrors[dev].replacement;
478 if (!rdev) {
479 smp_rmb();
480 repl = 0;
481 rdev = conf->mirrors[dev].rdev;
482 }
483
484
485
486 if (bio->bi_status && !discard_error) {
487 if (repl)
488
489
490
491 md_error(rdev->mddev, rdev);
492 else {
493 set_bit(WriteErrorSeen, &rdev->flags);
494 if (!test_and_set_bit(WantReplacement, &rdev->flags))
495 set_bit(MD_RECOVERY_NEEDED,
496 &rdev->mddev->recovery);
497
498 dec_rdev = 0;
499 if (test_bit(FailFast, &rdev->flags) &&
500 (bio->bi_opf & MD_FAILFAST)) {
501 md_error(rdev->mddev, rdev);
502 if (!test_bit(Faulty, &rdev->flags))
503
504
505
506
507 set_bit(R10BIO_WriteError, &r10_bio->state);
508 else {
509 r10_bio->devs[slot].bio = NULL;
510 to_put = bio;
511 dec_rdev = 1;
512 }
513 } else
514 set_bit(R10BIO_WriteError, &r10_bio->state);
515 }
516 } else {
517
518
519
520
521
522
523
524
525
526 sector_t first_bad;
527 int bad_sectors;
528
529
530
531
532
533
534
535
536
537 if (test_bit(In_sync, &rdev->flags) &&
538 !test_bit(Faulty, &rdev->flags))
539 set_bit(R10BIO_Uptodate, &r10_bio->state);
540
541
542 if (is_badblock(rdev,
543 r10_bio->devs[slot].addr,
544 r10_bio->sectors,
545 &first_bad, &bad_sectors) && !discard_error) {
546 bio_put(bio);
547 if (repl)
548 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
549 else
550 r10_bio->devs[slot].bio = IO_MADE_GOOD;
551 dec_rdev = 0;
552 set_bit(R10BIO_MadeGood, &r10_bio->state);
553 }
554 }
555
556
557
558
559
560
561 one_write_done(r10_bio);
562 if (dec_rdev)
563 rdev_dec_pending(rdev, conf->mddev);
564 if (to_put)
565 bio_put(to_put);
566}
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
594{
595 int n,f;
596 sector_t sector;
597 sector_t chunk;
598 sector_t stripe;
599 int dev;
600 int slot = 0;
601 int last_far_set_start, last_far_set_size;
602
603 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
604 last_far_set_start *= geo->far_set_size;
605
606 last_far_set_size = geo->far_set_size;
607 last_far_set_size += (geo->raid_disks % geo->far_set_size);
608
609
610 chunk = r10bio->sector >> geo->chunk_shift;
611 sector = r10bio->sector & geo->chunk_mask;
612
613 chunk *= geo->near_copies;
614 stripe = chunk;
615 dev = sector_div(stripe, geo->raid_disks);
616 if (geo->far_offset)
617 stripe *= geo->far_copies;
618
619 sector += stripe << geo->chunk_shift;
620
621
622 for (n = 0; n < geo->near_copies; n++) {
623 int d = dev;
624 int set;
625 sector_t s = sector;
626 r10bio->devs[slot].devnum = d;
627 r10bio->devs[slot].addr = s;
628 slot++;
629
630 for (f = 1; f < geo->far_copies; f++) {
631 set = d / geo->far_set_size;
632 d += geo->near_copies;
633
634 if ((geo->raid_disks % geo->far_set_size) &&
635 (d > last_far_set_start)) {
636 d -= last_far_set_start;
637 d %= last_far_set_size;
638 d += last_far_set_start;
639 } else {
640 d %= geo->far_set_size;
641 d += geo->far_set_size * set;
642 }
643 s += geo->stride;
644 r10bio->devs[slot].devnum = d;
645 r10bio->devs[slot].addr = s;
646 slot++;
647 }
648 dev++;
649 if (dev >= geo->raid_disks) {
650 dev = 0;
651 sector += (geo->chunk_mask + 1);
652 }
653 }
654}
655
656static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
657{
658 struct geom *geo = &conf->geo;
659
660 if (conf->reshape_progress != MaxSector &&
661 ((r10bio->sector >= conf->reshape_progress) !=
662 conf->mddev->reshape_backwards)) {
663 set_bit(R10BIO_Previous, &r10bio->state);
664 geo = &conf->prev;
665 } else
666 clear_bit(R10BIO_Previous, &r10bio->state);
667
668 __raid10_find_phys(geo, r10bio);
669}
670
671static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
672{
673 sector_t offset, chunk, vchunk;
674
675
676
677 struct geom *geo = &conf->geo;
678 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
679 int far_set_size = geo->far_set_size;
680 int last_far_set_start;
681
682 if (geo->raid_disks % geo->far_set_size) {
683 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
684 last_far_set_start *= geo->far_set_size;
685
686 if (dev >= last_far_set_start) {
687 far_set_size = geo->far_set_size;
688 far_set_size += (geo->raid_disks % geo->far_set_size);
689 far_set_start = last_far_set_start;
690 }
691 }
692
693 offset = sector & geo->chunk_mask;
694 if (geo->far_offset) {
695 int fc;
696 chunk = sector >> geo->chunk_shift;
697 fc = sector_div(chunk, geo->far_copies);
698 dev -= fc * geo->near_copies;
699 if (dev < far_set_start)
700 dev += far_set_size;
701 } else {
702 while (sector >= geo->stride) {
703 sector -= geo->stride;
704 if (dev < (geo->near_copies + far_set_start))
705 dev += far_set_size - geo->near_copies;
706 else
707 dev -= geo->near_copies;
708 }
709 chunk = sector >> geo->chunk_shift;
710 }
711 vchunk = chunk * geo->raid_disks + dev;
712 sector_div(vchunk, geo->near_copies);
713 return (vchunk << geo->chunk_shift) + offset;
714}
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735static struct md_rdev *read_balance(struct r10conf *conf,
736 struct r10bio *r10_bio,
737 int *max_sectors)
738{
739 const sector_t this_sector = r10_bio->sector;
740 int disk, slot;
741 int sectors = r10_bio->sectors;
742 int best_good_sectors;
743 sector_t new_distance, best_dist;
744 struct md_rdev *best_rdev, *rdev = NULL;
745 int do_balance;
746 int best_slot;
747 struct geom *geo = &conf->geo;
748
749 raid10_find_phys(conf, r10_bio);
750 rcu_read_lock();
751 sectors = r10_bio->sectors;
752 best_slot = -1;
753 best_rdev = NULL;
754 best_dist = MaxSector;
755 best_good_sectors = 0;
756 do_balance = 1;
757 clear_bit(R10BIO_FailFast, &r10_bio->state);
758
759
760
761
762
763
764 if (conf->mddev->recovery_cp < MaxSector
765 && (this_sector + sectors >= conf->next_resync))
766 do_balance = 0;
767
768 for (slot = 0; slot < conf->copies ; slot++) {
769 sector_t first_bad;
770 int bad_sectors;
771 sector_t dev_sector;
772
773 if (r10_bio->devs[slot].bio == IO_BLOCKED)
774 continue;
775 disk = r10_bio->devs[slot].devnum;
776 rdev = rcu_dereference(conf->mirrors[disk].replacement);
777 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
778 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
779 rdev = rcu_dereference(conf->mirrors[disk].rdev);
780 if (rdev == NULL ||
781 test_bit(Faulty, &rdev->flags))
782 continue;
783 if (!test_bit(In_sync, &rdev->flags) &&
784 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
785 continue;
786
787 dev_sector = r10_bio->devs[slot].addr;
788 if (is_badblock(rdev, dev_sector, sectors,
789 &first_bad, &bad_sectors)) {
790 if (best_dist < MaxSector)
791
792 continue;
793 if (first_bad <= dev_sector) {
794
795
796
797
798 bad_sectors -= (dev_sector - first_bad);
799 if (!do_balance && sectors > bad_sectors)
800 sectors = bad_sectors;
801 if (best_good_sectors > sectors)
802 best_good_sectors = sectors;
803 } else {
804 sector_t good_sectors =
805 first_bad - dev_sector;
806 if (good_sectors > best_good_sectors) {
807 best_good_sectors = good_sectors;
808 best_slot = slot;
809 best_rdev = rdev;
810 }
811 if (!do_balance)
812
813 break;
814 }
815 continue;
816 } else
817 best_good_sectors = sectors;
818
819 if (!do_balance)
820 break;
821
822 if (best_slot >= 0)
823
824 set_bit(R10BIO_FailFast, &r10_bio->state);
825
826
827
828
829 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
830 new_distance = 0;
831
832
833 else if (geo->far_copies > 1)
834 new_distance = r10_bio->devs[slot].addr;
835 else
836 new_distance = abs(r10_bio->devs[slot].addr -
837 conf->mirrors[disk].head_position);
838 if (new_distance < best_dist) {
839 best_dist = new_distance;
840 best_slot = slot;
841 best_rdev = rdev;
842 }
843 }
844 if (slot >= conf->copies) {
845 slot = best_slot;
846 rdev = best_rdev;
847 }
848
849 if (slot >= 0) {
850 atomic_inc(&rdev->nr_pending);
851 r10_bio->read_slot = slot;
852 } else
853 rdev = NULL;
854 rcu_read_unlock();
855 *max_sectors = best_good_sectors;
856
857 return rdev;
858}
859
860static int raid10_congested(struct mddev *mddev, int bits)
861{
862 struct r10conf *conf = mddev->private;
863 int i, ret = 0;
864
865 if ((bits & (1 << WB_async_congested)) &&
866 conf->pending_count >= max_queued_requests)
867 return 1;
868
869 rcu_read_lock();
870 for (i = 0;
871 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
872 && ret == 0;
873 i++) {
874 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
875 if (rdev && !test_bit(Faulty, &rdev->flags)) {
876 struct request_queue *q = bdev_get_queue(rdev->bdev);
877
878 ret |= bdi_congested(q->backing_dev_info, bits);
879 }
880 }
881 rcu_read_unlock();
882 return ret;
883}
884
885static void flush_pending_writes(struct r10conf *conf)
886{
887
888
889
890 spin_lock_irq(&conf->device_lock);
891
892 if (conf->pending_bio_list.head) {
893 struct bio *bio;
894 bio = bio_list_get(&conf->pending_bio_list);
895 conf->pending_count = 0;
896 spin_unlock_irq(&conf->device_lock);
897
898
899 bitmap_unplug(conf->mddev->bitmap);
900 wake_up(&conf->wait_barrier);
901
902 while (bio) {
903 struct bio *next = bio->bi_next;
904 struct md_rdev *rdev = (void*)bio->bi_disk;
905 bio->bi_next = NULL;
906 bio_set_dev(bio, rdev->bdev);
907 if (test_bit(Faulty, &rdev->flags)) {
908 bio_io_error(bio);
909 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
910 !blk_queue_discard(bio->bi_disk->queue)))
911
912 bio_endio(bio);
913 else
914 generic_make_request(bio);
915 bio = next;
916 }
917 } else
918 spin_unlock_irq(&conf->device_lock);
919}
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943static void raise_barrier(struct r10conf *conf, int force)
944{
945 BUG_ON(force && !conf->barrier);
946 spin_lock_irq(&conf->resync_lock);
947
948
949 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
950 conf->resync_lock);
951
952
953 conf->barrier++;
954
955
956 wait_event_lock_irq(conf->wait_barrier,
957 !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
958 conf->resync_lock);
959
960 spin_unlock_irq(&conf->resync_lock);
961}
962
963static void lower_barrier(struct r10conf *conf)
964{
965 unsigned long flags;
966 spin_lock_irqsave(&conf->resync_lock, flags);
967 conf->barrier--;
968 spin_unlock_irqrestore(&conf->resync_lock, flags);
969 wake_up(&conf->wait_barrier);
970}
971
972static void wait_barrier(struct r10conf *conf)
973{
974 spin_lock_irq(&conf->resync_lock);
975 if (conf->barrier) {
976 conf->nr_waiting++;
977
978
979
980
981
982
983
984
985
986 raid10_log(conf->mddev, "wait barrier");
987 wait_event_lock_irq(conf->wait_barrier,
988 !conf->barrier ||
989 (atomic_read(&conf->nr_pending) &&
990 current->bio_list &&
991 (!bio_list_empty(¤t->bio_list[0]) ||
992 !bio_list_empty(¤t->bio_list[1]))),
993 conf->resync_lock);
994 conf->nr_waiting--;
995 if (!conf->nr_waiting)
996 wake_up(&conf->wait_barrier);
997 }
998 atomic_inc(&conf->nr_pending);
999 spin_unlock_irq(&conf->resync_lock);
1000}
1001
1002static void allow_barrier(struct r10conf *conf)
1003{
1004 if ((atomic_dec_and_test(&conf->nr_pending)) ||
1005 (conf->array_freeze_pending))
1006 wake_up(&conf->wait_barrier);
1007}
1008
1009static void freeze_array(struct r10conf *conf, int extra)
1010{
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023 spin_lock_irq(&conf->resync_lock);
1024 conf->array_freeze_pending++;
1025 conf->barrier++;
1026 conf->nr_waiting++;
1027 wait_event_lock_irq_cmd(conf->wait_barrier,
1028 atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
1029 conf->resync_lock,
1030 flush_pending_writes(conf));
1031
1032 conf->array_freeze_pending--;
1033 spin_unlock_irq(&conf->resync_lock);
1034}
1035
1036static void unfreeze_array(struct r10conf *conf)
1037{
1038
1039 spin_lock_irq(&conf->resync_lock);
1040 conf->barrier--;
1041 conf->nr_waiting--;
1042 wake_up(&conf->wait_barrier);
1043 spin_unlock_irq(&conf->resync_lock);
1044}
1045
1046static sector_t choose_data_offset(struct r10bio *r10_bio,
1047 struct md_rdev *rdev)
1048{
1049 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1050 test_bit(R10BIO_Previous, &r10_bio->state))
1051 return rdev->data_offset;
1052 else
1053 return rdev->new_data_offset;
1054}
1055
1056struct raid10_plug_cb {
1057 struct blk_plug_cb cb;
1058 struct bio_list pending;
1059 int pending_cnt;
1060};
1061
1062static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1063{
1064 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1065 cb);
1066 struct mddev *mddev = plug->cb.data;
1067 struct r10conf *conf = mddev->private;
1068 struct bio *bio;
1069
1070 if (from_schedule || current->bio_list) {
1071 spin_lock_irq(&conf->device_lock);
1072 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1073 conf->pending_count += plug->pending_cnt;
1074 spin_unlock_irq(&conf->device_lock);
1075 wake_up(&conf->wait_barrier);
1076 md_wakeup_thread(mddev->thread);
1077 kfree(plug);
1078 return;
1079 }
1080
1081
1082 bio = bio_list_get(&plug->pending);
1083 bitmap_unplug(mddev->bitmap);
1084 wake_up(&conf->wait_barrier);
1085
1086 while (bio) {
1087 struct bio *next = bio->bi_next;
1088 struct md_rdev *rdev = (void*)bio->bi_disk;
1089 bio->bi_next = NULL;
1090 bio_set_dev(bio, rdev->bdev);
1091 if (test_bit(Faulty, &rdev->flags)) {
1092 bio_io_error(bio);
1093 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1094 !blk_queue_discard(bio->bi_disk->queue)))
1095
1096 bio_endio(bio);
1097 else
1098 generic_make_request(bio);
1099 bio = next;
1100 }
1101 kfree(plug);
1102}
1103
1104static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1105 struct r10bio *r10_bio)
1106{
1107 struct r10conf *conf = mddev->private;
1108 struct bio *read_bio;
1109 const int op = bio_op(bio);
1110 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1111 int max_sectors;
1112 sector_t sectors;
1113 struct md_rdev *rdev;
1114 char b[BDEVNAME_SIZE];
1115 int slot = r10_bio->read_slot;
1116 struct md_rdev *err_rdev = NULL;
1117 gfp_t gfp = GFP_NOIO;
1118
1119 if (r10_bio->devs[slot].rdev) {
1120
1121
1122
1123
1124
1125
1126
1127 int disk;
1128
1129
1130
1131
1132 gfp = GFP_NOIO | __GFP_HIGH;
1133
1134 rcu_read_lock();
1135 disk = r10_bio->devs[slot].devnum;
1136 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1137 if (err_rdev)
1138 bdevname(err_rdev->bdev, b);
1139 else {
1140 strcpy(b, "???");
1141
1142 err_rdev = r10_bio->devs[slot].rdev;
1143 }
1144 rcu_read_unlock();
1145 }
1146
1147
1148
1149
1150
1151 wait_barrier(conf);
1152
1153 sectors = r10_bio->sectors;
1154 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1155 bio->bi_iter.bi_sector < conf->reshape_progress &&
1156 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1157
1158
1159
1160
1161 raid10_log(conf->mddev, "wait reshape");
1162 allow_barrier(conf);
1163 wait_event(conf->wait_barrier,
1164 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1165 conf->reshape_progress >= bio->bi_iter.bi_sector +
1166 sectors);
1167 wait_barrier(conf);
1168 }
1169
1170 rdev = read_balance(conf, r10_bio, &max_sectors);
1171 if (!rdev) {
1172 if (err_rdev) {
1173 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1174 mdname(mddev), b,
1175 (unsigned long long)r10_bio->sector);
1176 }
1177 raid_end_bio_io(r10_bio);
1178 return;
1179 }
1180 if (err_rdev)
1181 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1182 mdname(mddev),
1183 bdevname(rdev->bdev, b),
1184 (unsigned long long)r10_bio->sector);
1185 if (max_sectors < bio_sectors(bio)) {
1186 struct bio *split = bio_split(bio, max_sectors,
1187 gfp, conf->bio_split);
1188 bio_chain(split, bio);
1189 generic_make_request(bio);
1190 bio = split;
1191 r10_bio->master_bio = bio;
1192 r10_bio->sectors = max_sectors;
1193 }
1194 slot = r10_bio->read_slot;
1195
1196 read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
1197
1198 r10_bio->devs[slot].bio = read_bio;
1199 r10_bio->devs[slot].rdev = rdev;
1200
1201 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1202 choose_data_offset(r10_bio, rdev);
1203 bio_set_dev(read_bio, rdev->bdev);
1204 read_bio->bi_end_io = raid10_end_read_request;
1205 bio_set_op_attrs(read_bio, op, do_sync);
1206 if (test_bit(FailFast, &rdev->flags) &&
1207 test_bit(R10BIO_FailFast, &r10_bio->state))
1208 read_bio->bi_opf |= MD_FAILFAST;
1209 read_bio->bi_private = r10_bio;
1210
1211 if (mddev->gendisk)
1212 trace_block_bio_remap(read_bio->bi_disk->queue,
1213 read_bio, disk_devt(mddev->gendisk),
1214 r10_bio->sector);
1215 generic_make_request(read_bio);
1216 return;
1217}
1218
1219static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1220 struct bio *bio, bool replacement,
1221 int n_copy)
1222{
1223 const int op = bio_op(bio);
1224 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1225 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1226 unsigned long flags;
1227 struct blk_plug_cb *cb;
1228 struct raid10_plug_cb *plug = NULL;
1229 struct r10conf *conf = mddev->private;
1230 struct md_rdev *rdev;
1231 int devnum = r10_bio->devs[n_copy].devnum;
1232 struct bio *mbio;
1233
1234 if (replacement) {
1235 rdev = conf->mirrors[devnum].replacement;
1236 if (rdev == NULL) {
1237
1238 smp_mb();
1239 rdev = conf->mirrors[devnum].rdev;
1240 }
1241 } else
1242 rdev = conf->mirrors[devnum].rdev;
1243
1244 mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
1245 if (replacement)
1246 r10_bio->devs[n_copy].repl_bio = mbio;
1247 else
1248 r10_bio->devs[n_copy].bio = mbio;
1249
1250 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1251 choose_data_offset(r10_bio, rdev));
1252 bio_set_dev(mbio, rdev->bdev);
1253 mbio->bi_end_io = raid10_end_write_request;
1254 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1255 if (!replacement && test_bit(FailFast,
1256 &conf->mirrors[devnum].rdev->flags)
1257 && enough(conf, devnum))
1258 mbio->bi_opf |= MD_FAILFAST;
1259 mbio->bi_private = r10_bio;
1260
1261 if (conf->mddev->gendisk)
1262 trace_block_bio_remap(mbio->bi_disk->queue,
1263 mbio, disk_devt(conf->mddev->gendisk),
1264 r10_bio->sector);
1265
1266 mbio->bi_disk = (void *)rdev;
1267
1268 atomic_inc(&r10_bio->remaining);
1269
1270 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1271 if (cb)
1272 plug = container_of(cb, struct raid10_plug_cb, cb);
1273 else
1274 plug = NULL;
1275 if (plug) {
1276 bio_list_add(&plug->pending, mbio);
1277 plug->pending_cnt++;
1278 } else {
1279 spin_lock_irqsave(&conf->device_lock, flags);
1280 bio_list_add(&conf->pending_bio_list, mbio);
1281 conf->pending_count++;
1282 spin_unlock_irqrestore(&conf->device_lock, flags);
1283 md_wakeup_thread(mddev->thread);
1284 }
1285}
1286
1287static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1288 struct r10bio *r10_bio)
1289{
1290 struct r10conf *conf = mddev->private;
1291 int i;
1292 struct md_rdev *blocked_rdev;
1293 sector_t sectors;
1294 int max_sectors;
1295
1296
1297
1298
1299
1300
1301 wait_barrier(conf);
1302
1303 sectors = r10_bio->sectors;
1304 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1305 bio->bi_iter.bi_sector < conf->reshape_progress &&
1306 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1307
1308
1309
1310
1311 raid10_log(conf->mddev, "wait reshape");
1312 allow_barrier(conf);
1313 wait_event(conf->wait_barrier,
1314 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1315 conf->reshape_progress >= bio->bi_iter.bi_sector +
1316 sectors);
1317 wait_barrier(conf);
1318 }
1319
1320 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1321 (mddev->reshape_backwards
1322 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1323 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1324 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1325 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1326
1327 mddev->reshape_position = conf->reshape_progress;
1328 set_mask_bits(&mddev->sb_flags, 0,
1329 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1330 md_wakeup_thread(mddev->thread);
1331 raid10_log(conf->mddev, "wait reshape metadata");
1332 wait_event(mddev->sb_wait,
1333 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1334
1335 conf->reshape_safe = mddev->reshape_position;
1336 }
1337
1338 if (conf->pending_count >= max_queued_requests) {
1339 md_wakeup_thread(mddev->thread);
1340 raid10_log(mddev, "wait queued");
1341 wait_event(conf->wait_barrier,
1342 conf->pending_count < max_queued_requests);
1343 }
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354 r10_bio->read_slot = -1;
1355 raid10_find_phys(conf, r10_bio);
1356retry_write:
1357 blocked_rdev = NULL;
1358 rcu_read_lock();
1359 max_sectors = r10_bio->sectors;
1360
1361 for (i = 0; i < conf->copies; i++) {
1362 int d = r10_bio->devs[i].devnum;
1363 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1364 struct md_rdev *rrdev = rcu_dereference(
1365 conf->mirrors[d].replacement);
1366 if (rdev == rrdev)
1367 rrdev = NULL;
1368 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1369 atomic_inc(&rdev->nr_pending);
1370 blocked_rdev = rdev;
1371 break;
1372 }
1373 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1374 atomic_inc(&rrdev->nr_pending);
1375 blocked_rdev = rrdev;
1376 break;
1377 }
1378 if (rdev && (test_bit(Faulty, &rdev->flags)))
1379 rdev = NULL;
1380 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1381 rrdev = NULL;
1382
1383 r10_bio->devs[i].bio = NULL;
1384 r10_bio->devs[i].repl_bio = NULL;
1385
1386 if (!rdev && !rrdev) {
1387 set_bit(R10BIO_Degraded, &r10_bio->state);
1388 continue;
1389 }
1390 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1391 sector_t first_bad;
1392 sector_t dev_sector = r10_bio->devs[i].addr;
1393 int bad_sectors;
1394 int is_bad;
1395
1396 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1397 &first_bad, &bad_sectors);
1398 if (is_bad < 0) {
1399
1400
1401
1402 atomic_inc(&rdev->nr_pending);
1403 set_bit(BlockedBadBlocks, &rdev->flags);
1404 blocked_rdev = rdev;
1405 break;
1406 }
1407 if (is_bad && first_bad <= dev_sector) {
1408
1409 bad_sectors -= (dev_sector - first_bad);
1410 if (bad_sectors < max_sectors)
1411
1412
1413
1414 max_sectors = bad_sectors;
1415
1416
1417
1418
1419
1420
1421
1422
1423 continue;
1424 }
1425 if (is_bad) {
1426 int good_sectors = first_bad - dev_sector;
1427 if (good_sectors < max_sectors)
1428 max_sectors = good_sectors;
1429 }
1430 }
1431 if (rdev) {
1432 r10_bio->devs[i].bio = bio;
1433 atomic_inc(&rdev->nr_pending);
1434 }
1435 if (rrdev) {
1436 r10_bio->devs[i].repl_bio = bio;
1437 atomic_inc(&rrdev->nr_pending);
1438 }
1439 }
1440 rcu_read_unlock();
1441
1442 if (unlikely(blocked_rdev)) {
1443
1444 int j;
1445 int d;
1446
1447 for (j = 0; j < i; j++) {
1448 if (r10_bio->devs[j].bio) {
1449 d = r10_bio->devs[j].devnum;
1450 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1451 }
1452 if (r10_bio->devs[j].repl_bio) {
1453 struct md_rdev *rdev;
1454 d = r10_bio->devs[j].devnum;
1455 rdev = conf->mirrors[d].replacement;
1456 if (!rdev) {
1457
1458 smp_mb();
1459 rdev = conf->mirrors[d].rdev;
1460 }
1461 rdev_dec_pending(rdev, mddev);
1462 }
1463 }
1464 allow_barrier(conf);
1465 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1466 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1467 wait_barrier(conf);
1468 goto retry_write;
1469 }
1470
1471 if (max_sectors < r10_bio->sectors)
1472 r10_bio->sectors = max_sectors;
1473
1474 if (r10_bio->sectors < bio_sectors(bio)) {
1475 struct bio *split = bio_split(bio, r10_bio->sectors,
1476 GFP_NOIO, conf->bio_split);
1477 bio_chain(split, bio);
1478 generic_make_request(bio);
1479 bio = split;
1480 r10_bio->master_bio = bio;
1481 }
1482
1483 atomic_set(&r10_bio->remaining, 1);
1484 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1485
1486 for (i = 0; i < conf->copies; i++) {
1487 if (r10_bio->devs[i].bio)
1488 raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1489 if (r10_bio->devs[i].repl_bio)
1490 raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1491 }
1492 one_write_done(r10_bio);
1493}
1494
1495static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1496{
1497 struct r10conf *conf = mddev->private;
1498 struct r10bio *r10_bio;
1499
1500 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1501
1502 r10_bio->master_bio = bio;
1503 r10_bio->sectors = sectors;
1504
1505 r10_bio->mddev = mddev;
1506 r10_bio->sector = bio->bi_iter.bi_sector;
1507 r10_bio->state = 0;
1508 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
1509
1510 if (bio_data_dir(bio) == READ)
1511 raid10_read_request(mddev, bio, r10_bio);
1512 else
1513 raid10_write_request(mddev, bio, r10_bio);
1514}
1515
1516static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1517{
1518 struct r10conf *conf = mddev->private;
1519 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1520 int chunk_sects = chunk_mask + 1;
1521 int sectors = bio_sectors(bio);
1522
1523 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1524 md_flush_request(mddev, bio);
1525 return true;
1526 }
1527
1528 if (!md_write_start(mddev, bio))
1529 return false;
1530
1531
1532
1533
1534
1535 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1536 sectors > chunk_sects
1537 && (conf->geo.near_copies < conf->geo.raid_disks
1538 || conf->prev.near_copies <
1539 conf->prev.raid_disks)))
1540 sectors = chunk_sects -
1541 (bio->bi_iter.bi_sector &
1542 (chunk_sects - 1));
1543 __make_request(mddev, bio, sectors);
1544
1545
1546 wake_up(&conf->wait_barrier);
1547 return true;
1548}
1549
1550static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1551{
1552 struct r10conf *conf = mddev->private;
1553 int i;
1554
1555 if (conf->geo.near_copies < conf->geo.raid_disks)
1556 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1557 if (conf->geo.near_copies > 1)
1558 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1559 if (conf->geo.far_copies > 1) {
1560 if (conf->geo.far_offset)
1561 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1562 else
1563 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1564 if (conf->geo.far_set_size != conf->geo.raid_disks)
1565 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1566 }
1567 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1568 conf->geo.raid_disks - mddev->degraded);
1569 rcu_read_lock();
1570 for (i = 0; i < conf->geo.raid_disks; i++) {
1571 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1572 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1573 }
1574 rcu_read_unlock();
1575 seq_printf(seq, "]");
1576}
1577
1578
1579
1580
1581
1582
1583static int _enough(struct r10conf *conf, int previous, int ignore)
1584{
1585 int first = 0;
1586 int has_enough = 0;
1587 int disks, ncopies;
1588 if (previous) {
1589 disks = conf->prev.raid_disks;
1590 ncopies = conf->prev.near_copies;
1591 } else {
1592 disks = conf->geo.raid_disks;
1593 ncopies = conf->geo.near_copies;
1594 }
1595
1596 rcu_read_lock();
1597 do {
1598 int n = conf->copies;
1599 int cnt = 0;
1600 int this = first;
1601 while (n--) {
1602 struct md_rdev *rdev;
1603 if (this != ignore &&
1604 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1605 test_bit(In_sync, &rdev->flags))
1606 cnt++;
1607 this = (this+1) % disks;
1608 }
1609 if (cnt == 0)
1610 goto out;
1611 first = (first + ncopies) % disks;
1612 } while (first != 0);
1613 has_enough = 1;
1614out:
1615 rcu_read_unlock();
1616 return has_enough;
1617}
1618
1619static int enough(struct r10conf *conf, int ignore)
1620{
1621
1622
1623
1624
1625
1626 return _enough(conf, 0, ignore) &&
1627 _enough(conf, 1, ignore);
1628}
1629
1630static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1631{
1632 char b[BDEVNAME_SIZE];
1633 struct r10conf *conf = mddev->private;
1634 unsigned long flags;
1635
1636
1637
1638
1639
1640
1641
1642 spin_lock_irqsave(&conf->device_lock, flags);
1643 if (test_bit(In_sync, &rdev->flags)
1644 && !enough(conf, rdev->raid_disk)) {
1645
1646
1647
1648 spin_unlock_irqrestore(&conf->device_lock, flags);
1649 return;
1650 }
1651 if (test_and_clear_bit(In_sync, &rdev->flags))
1652 mddev->degraded++;
1653
1654
1655
1656 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1657 set_bit(Blocked, &rdev->flags);
1658 set_bit(Faulty, &rdev->flags);
1659 set_mask_bits(&mddev->sb_flags, 0,
1660 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1661 spin_unlock_irqrestore(&conf->device_lock, flags);
1662 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1663 "md/raid10:%s: Operation continuing on %d devices.\n",
1664 mdname(mddev), bdevname(rdev->bdev, b),
1665 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1666}
1667
1668static void print_conf(struct r10conf *conf)
1669{
1670 int i;
1671 struct md_rdev *rdev;
1672
1673 pr_debug("RAID10 conf printout:\n");
1674 if (!conf) {
1675 pr_debug("(!conf)\n");
1676 return;
1677 }
1678 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1679 conf->geo.raid_disks);
1680
1681
1682
1683 for (i = 0; i < conf->geo.raid_disks; i++) {
1684 char b[BDEVNAME_SIZE];
1685 rdev = conf->mirrors[i].rdev;
1686 if (rdev)
1687 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1688 i, !test_bit(In_sync, &rdev->flags),
1689 !test_bit(Faulty, &rdev->flags),
1690 bdevname(rdev->bdev,b));
1691 }
1692}
1693
1694static void close_sync(struct r10conf *conf)
1695{
1696 wait_barrier(conf);
1697 allow_barrier(conf);
1698
1699 mempool_destroy(conf->r10buf_pool);
1700 conf->r10buf_pool = NULL;
1701}
1702
1703static int raid10_spare_active(struct mddev *mddev)
1704{
1705 int i;
1706 struct r10conf *conf = mddev->private;
1707 struct raid10_info *tmp;
1708 int count = 0;
1709 unsigned long flags;
1710
1711
1712
1713
1714
1715 for (i = 0; i < conf->geo.raid_disks; i++) {
1716 tmp = conf->mirrors + i;
1717 if (tmp->replacement
1718 && tmp->replacement->recovery_offset == MaxSector
1719 && !test_bit(Faulty, &tmp->replacement->flags)
1720 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1721
1722 if (!tmp->rdev
1723 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1724 count++;
1725 if (tmp->rdev) {
1726
1727
1728
1729
1730 set_bit(Faulty, &tmp->rdev->flags);
1731 sysfs_notify_dirent_safe(
1732 tmp->rdev->sysfs_state);
1733 }
1734 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1735 } else if (tmp->rdev
1736 && tmp->rdev->recovery_offset == MaxSector
1737 && !test_bit(Faulty, &tmp->rdev->flags)
1738 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1739 count++;
1740 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1741 }
1742 }
1743 spin_lock_irqsave(&conf->device_lock, flags);
1744 mddev->degraded -= count;
1745 spin_unlock_irqrestore(&conf->device_lock, flags);
1746
1747 print_conf(conf);
1748 return count;
1749}
1750
1751static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1752{
1753 struct r10conf *conf = mddev->private;
1754 int err = -EEXIST;
1755 int mirror;
1756 int first = 0;
1757 int last = conf->geo.raid_disks - 1;
1758
1759 if (mddev->recovery_cp < MaxSector)
1760
1761
1762
1763 return -EBUSY;
1764 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1765 return -EINVAL;
1766
1767 if (md_integrity_add_rdev(rdev, mddev))
1768 return -ENXIO;
1769
1770 if (rdev->raid_disk >= 0)
1771 first = last = rdev->raid_disk;
1772
1773 if (rdev->saved_raid_disk >= first &&
1774 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1775 mirror = rdev->saved_raid_disk;
1776 else
1777 mirror = first;
1778 for ( ; mirror <= last ; mirror++) {
1779 struct raid10_info *p = &conf->mirrors[mirror];
1780 if (p->recovery_disabled == mddev->recovery_disabled)
1781 continue;
1782 if (p->rdev) {
1783 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1784 p->replacement != NULL)
1785 continue;
1786 clear_bit(In_sync, &rdev->flags);
1787 set_bit(Replacement, &rdev->flags);
1788 rdev->raid_disk = mirror;
1789 err = 0;
1790 if (mddev->gendisk)
1791 disk_stack_limits(mddev->gendisk, rdev->bdev,
1792 rdev->data_offset << 9);
1793 conf->fullsync = 1;
1794 rcu_assign_pointer(p->replacement, rdev);
1795 break;
1796 }
1797
1798 if (mddev->gendisk)
1799 disk_stack_limits(mddev->gendisk, rdev->bdev,
1800 rdev->data_offset << 9);
1801
1802 p->head_position = 0;
1803 p->recovery_disabled = mddev->recovery_disabled - 1;
1804 rdev->raid_disk = mirror;
1805 err = 0;
1806 if (rdev->saved_raid_disk != mirror)
1807 conf->fullsync = 1;
1808 rcu_assign_pointer(p->rdev, rdev);
1809 break;
1810 }
1811 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1812 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1813
1814 print_conf(conf);
1815 return err;
1816}
1817
1818static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1819{
1820 struct r10conf *conf = mddev->private;
1821 int err = 0;
1822 int number = rdev->raid_disk;
1823 struct md_rdev **rdevp;
1824 struct raid10_info *p = conf->mirrors + number;
1825
1826 print_conf(conf);
1827 if (rdev == p->rdev)
1828 rdevp = &p->rdev;
1829 else if (rdev == p->replacement)
1830 rdevp = &p->replacement;
1831 else
1832 return 0;
1833
1834 if (test_bit(In_sync, &rdev->flags) ||
1835 atomic_read(&rdev->nr_pending)) {
1836 err = -EBUSY;
1837 goto abort;
1838 }
1839
1840
1841
1842 if (!test_bit(Faulty, &rdev->flags) &&
1843 mddev->recovery_disabled != p->recovery_disabled &&
1844 (!p->replacement || p->replacement == rdev) &&
1845 number < conf->geo.raid_disks &&
1846 enough(conf, -1)) {
1847 err = -EBUSY;
1848 goto abort;
1849 }
1850 *rdevp = NULL;
1851 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1852 synchronize_rcu();
1853 if (atomic_read(&rdev->nr_pending)) {
1854
1855 err = -EBUSY;
1856 *rdevp = rdev;
1857 goto abort;
1858 }
1859 }
1860 if (p->replacement) {
1861
1862 p->rdev = p->replacement;
1863 clear_bit(Replacement, &p->replacement->flags);
1864 smp_mb();
1865
1866
1867 p->replacement = NULL;
1868 }
1869
1870 clear_bit(WantReplacement, &rdev->flags);
1871 err = md_integrity_register(mddev);
1872
1873abort:
1874
1875 print_conf(conf);
1876 return err;
1877}
1878
1879static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
1880{
1881 struct r10conf *conf = r10_bio->mddev->private;
1882
1883 if (!bio->bi_status)
1884 set_bit(R10BIO_Uptodate, &r10_bio->state);
1885 else
1886
1887
1888
1889 atomic_add(r10_bio->sectors,
1890 &conf->mirrors[d].rdev->corrected_errors);
1891
1892
1893
1894
1895 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1896 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1897 atomic_dec_and_test(&r10_bio->remaining)) {
1898
1899
1900
1901 reschedule_retry(r10_bio);
1902 }
1903}
1904
1905static void end_sync_read(struct bio *bio)
1906{
1907 struct r10bio *r10_bio = get_resync_r10bio(bio);
1908 struct r10conf *conf = r10_bio->mddev->private;
1909 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1910
1911 __end_sync_read(r10_bio, bio, d);
1912}
1913
1914static void end_reshape_read(struct bio *bio)
1915{
1916
1917 struct r10bio *r10_bio = bio->bi_private;
1918
1919 __end_sync_read(r10_bio, bio, r10_bio->read_slot);
1920}
1921
1922static void end_sync_request(struct r10bio *r10_bio)
1923{
1924 struct mddev *mddev = r10_bio->mddev;
1925
1926 while (atomic_dec_and_test(&r10_bio->remaining)) {
1927 if (r10_bio->master_bio == NULL) {
1928
1929 sector_t s = r10_bio->sectors;
1930 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1931 test_bit(R10BIO_WriteError, &r10_bio->state))
1932 reschedule_retry(r10_bio);
1933 else
1934 put_buf(r10_bio);
1935 md_done_sync(mddev, s, 1);
1936 break;
1937 } else {
1938 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1939 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1940 test_bit(R10BIO_WriteError, &r10_bio->state))
1941 reschedule_retry(r10_bio);
1942 else
1943 put_buf(r10_bio);
1944 r10_bio = r10_bio2;
1945 }
1946 }
1947}
1948
1949static void end_sync_write(struct bio *bio)
1950{
1951 struct r10bio *r10_bio = get_resync_r10bio(bio);
1952 struct mddev *mddev = r10_bio->mddev;
1953 struct r10conf *conf = mddev->private;
1954 int d;
1955 sector_t first_bad;
1956 int bad_sectors;
1957 int slot;
1958 int repl;
1959 struct md_rdev *rdev = NULL;
1960
1961 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1962 if (repl)
1963 rdev = conf->mirrors[d].replacement;
1964 else
1965 rdev = conf->mirrors[d].rdev;
1966
1967 if (bio->bi_status) {
1968 if (repl)
1969 md_error(mddev, rdev);
1970 else {
1971 set_bit(WriteErrorSeen, &rdev->flags);
1972 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1973 set_bit(MD_RECOVERY_NEEDED,
1974 &rdev->mddev->recovery);
1975 set_bit(R10BIO_WriteError, &r10_bio->state);
1976 }
1977 } else if (is_badblock(rdev,
1978 r10_bio->devs[slot].addr,
1979 r10_bio->sectors,
1980 &first_bad, &bad_sectors))
1981 set_bit(R10BIO_MadeGood, &r10_bio->state);
1982
1983 rdev_dec_pending(rdev, mddev);
1984
1985 end_sync_request(r10_bio);
1986}
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2005{
2006 struct r10conf *conf = mddev->private;
2007 int i, first;
2008 struct bio *tbio, *fbio;
2009 int vcnt;
2010 struct page **tpages, **fpages;
2011
2012 atomic_set(&r10_bio->remaining, 1);
2013
2014
2015 for (i=0; i<conf->copies; i++)
2016 if (!r10_bio->devs[i].bio->bi_status)
2017 break;
2018
2019 if (i == conf->copies)
2020 goto done;
2021
2022 first = i;
2023 fbio = r10_bio->devs[i].bio;
2024 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2025 fbio->bi_iter.bi_idx = 0;
2026 fpages = get_resync_pages(fbio)->pages;
2027
2028 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2029
2030 for (i=0 ; i < conf->copies ; i++) {
2031 int j, d;
2032 struct md_rdev *rdev;
2033 struct resync_pages *rp;
2034
2035 tbio = r10_bio->devs[i].bio;
2036
2037 if (tbio->bi_end_io != end_sync_read)
2038 continue;
2039 if (i == first)
2040 continue;
2041
2042 tpages = get_resync_pages(tbio)->pages;
2043 d = r10_bio->devs[i].devnum;
2044 rdev = conf->mirrors[d].rdev;
2045 if (!r10_bio->devs[i].bio->bi_status) {
2046
2047
2048
2049
2050 int sectors = r10_bio->sectors;
2051 for (j = 0; j < vcnt; j++) {
2052 int len = PAGE_SIZE;
2053 if (sectors < (len / 512))
2054 len = sectors * 512;
2055 if (memcmp(page_address(fpages[j]),
2056 page_address(tpages[j]),
2057 len))
2058 break;
2059 sectors -= len/512;
2060 }
2061 if (j == vcnt)
2062 continue;
2063 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2064 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2065
2066 continue;
2067 } else if (test_bit(FailFast, &rdev->flags)) {
2068
2069 md_error(rdev->mddev, rdev);
2070 continue;
2071 }
2072
2073
2074
2075
2076
2077 rp = get_resync_pages(tbio);
2078 bio_reset(tbio);
2079
2080 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2081
2082 rp->raid_bio = r10_bio;
2083 tbio->bi_private = rp;
2084 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2085 tbio->bi_end_io = end_sync_write;
2086 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2087
2088 bio_copy_data(tbio, fbio);
2089
2090 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2091 atomic_inc(&r10_bio->remaining);
2092 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2093
2094 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2095 tbio->bi_opf |= MD_FAILFAST;
2096 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2097 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2098 generic_make_request(tbio);
2099 }
2100
2101
2102
2103
2104 for (i = 0; i < conf->copies; i++) {
2105 int d;
2106
2107 tbio = r10_bio->devs[i].repl_bio;
2108 if (!tbio || !tbio->bi_end_io)
2109 continue;
2110 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2111 && r10_bio->devs[i].bio != fbio)
2112 bio_copy_data(tbio, fbio);
2113 d = r10_bio->devs[i].devnum;
2114 atomic_inc(&r10_bio->remaining);
2115 md_sync_acct(conf->mirrors[d].replacement->bdev,
2116 bio_sectors(tbio));
2117 generic_make_request(tbio);
2118 }
2119
2120done:
2121 if (atomic_dec_and_test(&r10_bio->remaining)) {
2122 md_done_sync(mddev, r10_bio->sectors, 1);
2123 put_buf(r10_bio);
2124 }
2125}
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137static void fix_recovery_read_error(struct r10bio *r10_bio)
2138{
2139
2140
2141
2142
2143
2144
2145
2146 struct mddev *mddev = r10_bio->mddev;
2147 struct r10conf *conf = mddev->private;
2148 struct bio *bio = r10_bio->devs[0].bio;
2149 sector_t sect = 0;
2150 int sectors = r10_bio->sectors;
2151 int idx = 0;
2152 int dr = r10_bio->devs[0].devnum;
2153 int dw = r10_bio->devs[1].devnum;
2154 struct page **pages = get_resync_pages(bio)->pages;
2155
2156 while (sectors) {
2157 int s = sectors;
2158 struct md_rdev *rdev;
2159 sector_t addr;
2160 int ok;
2161
2162 if (s > (PAGE_SIZE>>9))
2163 s = PAGE_SIZE >> 9;
2164
2165 rdev = conf->mirrors[dr].rdev;
2166 addr = r10_bio->devs[0].addr + sect,
2167 ok = sync_page_io(rdev,
2168 addr,
2169 s << 9,
2170 pages[idx],
2171 REQ_OP_READ, 0, false);
2172 if (ok) {
2173 rdev = conf->mirrors[dw].rdev;
2174 addr = r10_bio->devs[1].addr + sect;
2175 ok = sync_page_io(rdev,
2176 addr,
2177 s << 9,
2178 pages[idx],
2179 REQ_OP_WRITE, 0, false);
2180 if (!ok) {
2181 set_bit(WriteErrorSeen, &rdev->flags);
2182 if (!test_and_set_bit(WantReplacement,
2183 &rdev->flags))
2184 set_bit(MD_RECOVERY_NEEDED,
2185 &rdev->mddev->recovery);
2186 }
2187 }
2188 if (!ok) {
2189
2190
2191
2192
2193 rdev_set_badblocks(rdev, addr, s, 0);
2194
2195 if (rdev != conf->mirrors[dw].rdev) {
2196
2197 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2198 addr = r10_bio->devs[1].addr + sect;
2199 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2200 if (!ok) {
2201
2202 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2203 mdname(mddev));
2204
2205 conf->mirrors[dw].recovery_disabled
2206 = mddev->recovery_disabled;
2207 set_bit(MD_RECOVERY_INTR,
2208 &mddev->recovery);
2209 break;
2210 }
2211 }
2212 }
2213
2214 sectors -= s;
2215 sect += s;
2216 idx++;
2217 }
2218}
2219
2220static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2221{
2222 struct r10conf *conf = mddev->private;
2223 int d;
2224 struct bio *wbio, *wbio2;
2225
2226 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2227 fix_recovery_read_error(r10_bio);
2228 end_sync_request(r10_bio);
2229 return;
2230 }
2231
2232
2233
2234
2235
2236 d = r10_bio->devs[1].devnum;
2237 wbio = r10_bio->devs[1].bio;
2238 wbio2 = r10_bio->devs[1].repl_bio;
2239
2240
2241
2242
2243 if (wbio2 && !wbio2->bi_end_io)
2244 wbio2 = NULL;
2245 if (wbio->bi_end_io) {
2246 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2247 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2248 generic_make_request(wbio);
2249 }
2250 if (wbio2) {
2251 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2252 md_sync_acct(conf->mirrors[d].replacement->bdev,
2253 bio_sectors(wbio2));
2254 generic_make_request(wbio2);
2255 }
2256}
2257
2258
2259
2260
2261
2262
2263
2264static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2265{
2266 long cur_time_mon;
2267 unsigned long hours_since_last;
2268 unsigned int read_errors = atomic_read(&rdev->read_errors);
2269
2270 cur_time_mon = ktime_get_seconds();
2271
2272 if (rdev->last_read_error == 0) {
2273
2274 rdev->last_read_error = cur_time_mon;
2275 return;
2276 }
2277
2278 hours_since_last = (long)(cur_time_mon -
2279 rdev->last_read_error) / 3600;
2280
2281 rdev->last_read_error = cur_time_mon;
2282
2283
2284
2285
2286
2287
2288 if (hours_since_last >= 8 * sizeof(read_errors))
2289 atomic_set(&rdev->read_errors, 0);
2290 else
2291 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2292}
2293
2294static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2295 int sectors, struct page *page, int rw)
2296{
2297 sector_t first_bad;
2298 int bad_sectors;
2299
2300 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2301 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2302 return -1;
2303 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2304
2305 return 1;
2306 if (rw == WRITE) {
2307 set_bit(WriteErrorSeen, &rdev->flags);
2308 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2309 set_bit(MD_RECOVERY_NEEDED,
2310 &rdev->mddev->recovery);
2311 }
2312
2313 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2314 md_error(rdev->mddev, rdev);
2315 return 0;
2316}
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2327{
2328 int sect = 0;
2329 int sectors = r10_bio->sectors;
2330 struct md_rdev*rdev;
2331 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2332 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2333
2334
2335
2336
2337 rdev = conf->mirrors[d].rdev;
2338
2339 if (test_bit(Faulty, &rdev->flags))
2340
2341
2342 return;
2343
2344 check_decay_read_errors(mddev, rdev);
2345 atomic_inc(&rdev->read_errors);
2346 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2347 char b[BDEVNAME_SIZE];
2348 bdevname(rdev->bdev, b);
2349
2350 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2351 mdname(mddev), b,
2352 atomic_read(&rdev->read_errors), max_read_errors);
2353 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2354 mdname(mddev), b);
2355 md_error(mddev, rdev);
2356 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2357 return;
2358 }
2359
2360 while(sectors) {
2361 int s = sectors;
2362 int sl = r10_bio->read_slot;
2363 int success = 0;
2364 int start;
2365
2366 if (s > (PAGE_SIZE>>9))
2367 s = PAGE_SIZE >> 9;
2368
2369 rcu_read_lock();
2370 do {
2371 sector_t first_bad;
2372 int bad_sectors;
2373
2374 d = r10_bio->devs[sl].devnum;
2375 rdev = rcu_dereference(conf->mirrors[d].rdev);
2376 if (rdev &&
2377 test_bit(In_sync, &rdev->flags) &&
2378 !test_bit(Faulty, &rdev->flags) &&
2379 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2380 &first_bad, &bad_sectors) == 0) {
2381 atomic_inc(&rdev->nr_pending);
2382 rcu_read_unlock();
2383 success = sync_page_io(rdev,
2384 r10_bio->devs[sl].addr +
2385 sect,
2386 s<<9,
2387 conf->tmppage,
2388 REQ_OP_READ, 0, false);
2389 rdev_dec_pending(rdev, mddev);
2390 rcu_read_lock();
2391 if (success)
2392 break;
2393 }
2394 sl++;
2395 if (sl == conf->copies)
2396 sl = 0;
2397 } while (!success && sl != r10_bio->read_slot);
2398 rcu_read_unlock();
2399
2400 if (!success) {
2401
2402
2403
2404
2405 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2406 rdev = conf->mirrors[dn].rdev;
2407
2408 if (!rdev_set_badblocks(
2409 rdev,
2410 r10_bio->devs[r10_bio->read_slot].addr
2411 + sect,
2412 s, 0)) {
2413 md_error(mddev, rdev);
2414 r10_bio->devs[r10_bio->read_slot].bio
2415 = IO_BLOCKED;
2416 }
2417 break;
2418 }
2419
2420 start = sl;
2421
2422 rcu_read_lock();
2423 while (sl != r10_bio->read_slot) {
2424 char b[BDEVNAME_SIZE];
2425
2426 if (sl==0)
2427 sl = conf->copies;
2428 sl--;
2429 d = r10_bio->devs[sl].devnum;
2430 rdev = rcu_dereference(conf->mirrors[d].rdev);
2431 if (!rdev ||
2432 test_bit(Faulty, &rdev->flags) ||
2433 !test_bit(In_sync, &rdev->flags))
2434 continue;
2435
2436 atomic_inc(&rdev->nr_pending);
2437 rcu_read_unlock();
2438 if (r10_sync_page_io(rdev,
2439 r10_bio->devs[sl].addr +
2440 sect,
2441 s, conf->tmppage, WRITE)
2442 == 0) {
2443
2444 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2445 mdname(mddev), s,
2446 (unsigned long long)(
2447 sect +
2448 choose_data_offset(r10_bio,
2449 rdev)),
2450 bdevname(rdev->bdev, b));
2451 pr_notice("md/raid10:%s: %s: failing drive\n",
2452 mdname(mddev),
2453 bdevname(rdev->bdev, b));
2454 }
2455 rdev_dec_pending(rdev, mddev);
2456 rcu_read_lock();
2457 }
2458 sl = start;
2459 while (sl != r10_bio->read_slot) {
2460 char b[BDEVNAME_SIZE];
2461
2462 if (sl==0)
2463 sl = conf->copies;
2464 sl--;
2465 d = r10_bio->devs[sl].devnum;
2466 rdev = rcu_dereference(conf->mirrors[d].rdev);
2467 if (!rdev ||
2468 test_bit(Faulty, &rdev->flags) ||
2469 !test_bit(In_sync, &rdev->flags))
2470 continue;
2471
2472 atomic_inc(&rdev->nr_pending);
2473 rcu_read_unlock();
2474 switch (r10_sync_page_io(rdev,
2475 r10_bio->devs[sl].addr +
2476 sect,
2477 s, conf->tmppage,
2478 READ)) {
2479 case 0:
2480
2481 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2482 mdname(mddev), s,
2483 (unsigned long long)(
2484 sect +
2485 choose_data_offset(r10_bio, rdev)),
2486 bdevname(rdev->bdev, b));
2487 pr_notice("md/raid10:%s: %s: failing drive\n",
2488 mdname(mddev),
2489 bdevname(rdev->bdev, b));
2490 break;
2491 case 1:
2492 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2493 mdname(mddev), s,
2494 (unsigned long long)(
2495 sect +
2496 choose_data_offset(r10_bio, rdev)),
2497 bdevname(rdev->bdev, b));
2498 atomic_add(s, &rdev->corrected_errors);
2499 }
2500
2501 rdev_dec_pending(rdev, mddev);
2502 rcu_read_lock();
2503 }
2504 rcu_read_unlock();
2505
2506 sectors -= s;
2507 sect += s;
2508 }
2509}
2510
2511static int narrow_write_error(struct r10bio *r10_bio, int i)
2512{
2513 struct bio *bio = r10_bio->master_bio;
2514 struct mddev *mddev = r10_bio->mddev;
2515 struct r10conf *conf = mddev->private;
2516 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528 int block_sectors;
2529 sector_t sector;
2530 int sectors;
2531 int sect_to_write = r10_bio->sectors;
2532 int ok = 1;
2533
2534 if (rdev->badblocks.shift < 0)
2535 return 0;
2536
2537 block_sectors = roundup(1 << rdev->badblocks.shift,
2538 bdev_logical_block_size(rdev->bdev) >> 9);
2539 sector = r10_bio->sector;
2540 sectors = ((r10_bio->sector + block_sectors)
2541 & ~(sector_t)(block_sectors - 1))
2542 - sector;
2543
2544 while (sect_to_write) {
2545 struct bio *wbio;
2546 sector_t wsector;
2547 if (sectors > sect_to_write)
2548 sectors = sect_to_write;
2549
2550 wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
2551 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2552 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2553 wbio->bi_iter.bi_sector = wsector +
2554 choose_data_offset(r10_bio, rdev);
2555 bio_set_dev(wbio, rdev->bdev);
2556 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2557
2558 if (submit_bio_wait(wbio) < 0)
2559
2560 ok = rdev_set_badblocks(rdev, wsector,
2561 sectors, 0)
2562 && ok;
2563
2564 bio_put(wbio);
2565 sect_to_write -= sectors;
2566 sector += sectors;
2567 sectors = block_sectors;
2568 }
2569 return ok;
2570}
2571
2572static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2573{
2574 int slot = r10_bio->read_slot;
2575 struct bio *bio;
2576 struct r10conf *conf = mddev->private;
2577 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2578 sector_t bio_last_sector;
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588 bio = r10_bio->devs[slot].bio;
2589 bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
2590 bio_put(bio);
2591 r10_bio->devs[slot].bio = NULL;
2592
2593 if (mddev->ro)
2594 r10_bio->devs[slot].bio = IO_BLOCKED;
2595 else if (!test_bit(FailFast, &rdev->flags)) {
2596 freeze_array(conf, 1);
2597 fix_read_error(conf, mddev, r10_bio);
2598 unfreeze_array(conf);
2599 } else
2600 md_error(mddev, rdev);
2601
2602 rdev_dec_pending(rdev, mddev);
2603 allow_barrier(conf);
2604 r10_bio->state = 0;
2605 raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2606}
2607
2608static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2609{
2610
2611
2612
2613
2614
2615
2616 int m;
2617 struct md_rdev *rdev;
2618
2619 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2620 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2621 for (m = 0; m < conf->copies; m++) {
2622 int dev = r10_bio->devs[m].devnum;
2623 rdev = conf->mirrors[dev].rdev;
2624 if (r10_bio->devs[m].bio == NULL)
2625 continue;
2626 if (!r10_bio->devs[m].bio->bi_status) {
2627 rdev_clear_badblocks(
2628 rdev,
2629 r10_bio->devs[m].addr,
2630 r10_bio->sectors, 0);
2631 } else {
2632 if (!rdev_set_badblocks(
2633 rdev,
2634 r10_bio->devs[m].addr,
2635 r10_bio->sectors, 0))
2636 md_error(conf->mddev, rdev);
2637 }
2638 rdev = conf->mirrors[dev].replacement;
2639 if (r10_bio->devs[m].repl_bio == NULL)
2640 continue;
2641
2642 if (!r10_bio->devs[m].repl_bio->bi_status) {
2643 rdev_clear_badblocks(
2644 rdev,
2645 r10_bio->devs[m].addr,
2646 r10_bio->sectors, 0);
2647 } else {
2648 if (!rdev_set_badblocks(
2649 rdev,
2650 r10_bio->devs[m].addr,
2651 r10_bio->sectors, 0))
2652 md_error(conf->mddev, rdev);
2653 }
2654 }
2655 put_buf(r10_bio);
2656 } else {
2657 bool fail = false;
2658 for (m = 0; m < conf->copies; m++) {
2659 int dev = r10_bio->devs[m].devnum;
2660 struct bio *bio = r10_bio->devs[m].bio;
2661 rdev = conf->mirrors[dev].rdev;
2662 if (bio == IO_MADE_GOOD) {
2663 rdev_clear_badblocks(
2664 rdev,
2665 r10_bio->devs[m].addr,
2666 r10_bio->sectors, 0);
2667 rdev_dec_pending(rdev, conf->mddev);
2668 } else if (bio != NULL && bio->bi_status) {
2669 fail = true;
2670 if (!narrow_write_error(r10_bio, m)) {
2671 md_error(conf->mddev, rdev);
2672 set_bit(R10BIO_Degraded,
2673 &r10_bio->state);
2674 }
2675 rdev_dec_pending(rdev, conf->mddev);
2676 }
2677 bio = r10_bio->devs[m].repl_bio;
2678 rdev = conf->mirrors[dev].replacement;
2679 if (rdev && bio == IO_MADE_GOOD) {
2680 rdev_clear_badblocks(
2681 rdev,
2682 r10_bio->devs[m].addr,
2683 r10_bio->sectors, 0);
2684 rdev_dec_pending(rdev, conf->mddev);
2685 }
2686 }
2687 if (fail) {
2688 spin_lock_irq(&conf->device_lock);
2689 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2690 conf->nr_queued++;
2691 spin_unlock_irq(&conf->device_lock);
2692
2693
2694
2695
2696 wake_up(&conf->wait_barrier);
2697 md_wakeup_thread(conf->mddev->thread);
2698 } else {
2699 if (test_bit(R10BIO_WriteError,
2700 &r10_bio->state))
2701 close_write(r10_bio);
2702 raid_end_bio_io(r10_bio);
2703 }
2704 }
2705}
2706
2707static void raid10d(struct md_thread *thread)
2708{
2709 struct mddev *mddev = thread->mddev;
2710 struct r10bio *r10_bio;
2711 unsigned long flags;
2712 struct r10conf *conf = mddev->private;
2713 struct list_head *head = &conf->retry_list;
2714 struct blk_plug plug;
2715
2716 md_check_recovery(mddev);
2717
2718 if (!list_empty_careful(&conf->bio_end_io_list) &&
2719 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2720 LIST_HEAD(tmp);
2721 spin_lock_irqsave(&conf->device_lock, flags);
2722 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2723 while (!list_empty(&conf->bio_end_io_list)) {
2724 list_move(conf->bio_end_io_list.prev, &tmp);
2725 conf->nr_queued--;
2726 }
2727 }
2728 spin_unlock_irqrestore(&conf->device_lock, flags);
2729 while (!list_empty(&tmp)) {
2730 r10_bio = list_first_entry(&tmp, struct r10bio,
2731 retry_list);
2732 list_del(&r10_bio->retry_list);
2733 if (mddev->degraded)
2734 set_bit(R10BIO_Degraded, &r10_bio->state);
2735
2736 if (test_bit(R10BIO_WriteError,
2737 &r10_bio->state))
2738 close_write(r10_bio);
2739 raid_end_bio_io(r10_bio);
2740 }
2741 }
2742
2743 blk_start_plug(&plug);
2744 for (;;) {
2745
2746 flush_pending_writes(conf);
2747
2748 spin_lock_irqsave(&conf->device_lock, flags);
2749 if (list_empty(head)) {
2750 spin_unlock_irqrestore(&conf->device_lock, flags);
2751 break;
2752 }
2753 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2754 list_del(head->prev);
2755 conf->nr_queued--;
2756 spin_unlock_irqrestore(&conf->device_lock, flags);
2757
2758 mddev = r10_bio->mddev;
2759 conf = mddev->private;
2760 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2761 test_bit(R10BIO_WriteError, &r10_bio->state))
2762 handle_write_completed(conf, r10_bio);
2763 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2764 reshape_request_write(mddev, r10_bio);
2765 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2766 sync_request_write(mddev, r10_bio);
2767 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2768 recovery_request_write(mddev, r10_bio);
2769 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2770 handle_read_error(mddev, r10_bio);
2771 else
2772 WARN_ON_ONCE(1);
2773
2774 cond_resched();
2775 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2776 md_check_recovery(mddev);
2777 }
2778 blk_finish_plug(&plug);
2779}
2780
2781static int init_resync(struct r10conf *conf)
2782{
2783 int buffs;
2784 int i;
2785
2786 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2787 BUG_ON(conf->r10buf_pool);
2788 conf->have_replacement = 0;
2789 for (i = 0; i < conf->geo.raid_disks; i++)
2790 if (conf->mirrors[i].replacement)
2791 conf->have_replacement = 1;
2792 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2793 if (!conf->r10buf_pool)
2794 return -ENOMEM;
2795 conf->next_resync = 0;
2796 return 0;
2797}
2798
2799static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
2800{
2801 struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2802 struct rsync_pages *rp;
2803 struct bio *bio;
2804 int nalloc;
2805 int i;
2806
2807 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
2808 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
2809 nalloc = conf->copies;
2810 else
2811 nalloc = 2;
2812
2813 for (i = 0; i < nalloc; i++) {
2814 bio = r10bio->devs[i].bio;
2815 rp = bio->bi_private;
2816 bio_reset(bio);
2817 bio->bi_private = rp;
2818 bio = r10bio->devs[i].repl_bio;
2819 if (bio) {
2820 rp = bio->bi_private;
2821 bio_reset(bio);
2822 bio->bi_private = rp;
2823 }
2824 }
2825 return r10bio;
2826}
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2861 int *skipped)
2862{
2863 struct r10conf *conf = mddev->private;
2864 struct r10bio *r10_bio;
2865 struct bio *biolist = NULL, *bio;
2866 sector_t max_sector, nr_sectors;
2867 int i;
2868 int max_sync;
2869 sector_t sync_blocks;
2870 sector_t sectors_skipped = 0;
2871 int chunks_skipped = 0;
2872 sector_t chunk_mask = conf->geo.chunk_mask;
2873 int page_idx = 0;
2874
2875 if (!conf->r10buf_pool)
2876 if (init_resync(conf))
2877 return 0;
2878
2879
2880
2881
2882
2883 if (mddev->bitmap == NULL &&
2884 mddev->recovery_cp == MaxSector &&
2885 mddev->reshape_position == MaxSector &&
2886 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2887 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2888 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2889 conf->fullsync == 0) {
2890 *skipped = 1;
2891 return mddev->dev_sectors - sector_nr;
2892 }
2893
2894 skipped:
2895 max_sector = mddev->dev_sectors;
2896 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2897 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2898 max_sector = mddev->resync_max_sectors;
2899 if (sector_nr >= max_sector) {
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2910 end_reshape(conf);
2911 close_sync(conf);
2912 return 0;
2913 }
2914
2915 if (mddev->curr_resync < max_sector) {
2916 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2917 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2918 &sync_blocks, 1);
2919 else for (i = 0; i < conf->geo.raid_disks; i++) {
2920 sector_t sect =
2921 raid10_find_virt(conf, mddev->curr_resync, i);
2922 bitmap_end_sync(mddev->bitmap, sect,
2923 &sync_blocks, 1);
2924 }
2925 } else {
2926
2927 if ((!mddev->bitmap || conf->fullsync)
2928 && conf->have_replacement
2929 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2930
2931
2932
2933 rcu_read_lock();
2934 for (i = 0; i < conf->geo.raid_disks; i++) {
2935 struct md_rdev *rdev =
2936 rcu_dereference(conf->mirrors[i].replacement);
2937 if (rdev)
2938 rdev->recovery_offset = MaxSector;
2939 }
2940 rcu_read_unlock();
2941 }
2942 conf->fullsync = 0;
2943 }
2944 bitmap_close_sync(mddev->bitmap);
2945 close_sync(conf);
2946 *skipped = 1;
2947 return sectors_skipped;
2948 }
2949
2950 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2951 return reshape_request(mddev, sector_nr, skipped);
2952
2953 if (chunks_skipped >= conf->geo.raid_disks) {
2954
2955
2956
2957 *skipped = 1;
2958 return (max_sector - sector_nr) + sectors_skipped;
2959 }
2960
2961 if (max_sector > mddev->resync_max)
2962 max_sector = mddev->resync_max;
2963
2964
2965
2966
2967 if (conf->geo.near_copies < conf->geo.raid_disks &&
2968 max_sector > (sector_nr | chunk_mask))
2969 max_sector = (sector_nr | chunk_mask) + 1;
2970
2971
2972
2973
2974
2975 if (conf->nr_waiting)
2976 schedule_timeout_uninterruptible(1);
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2994 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2995
2996 int j;
2997 r10_bio = NULL;
2998
2999 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3000 int still_degraded;
3001 struct r10bio *rb2;
3002 sector_t sect;
3003 int must_sync;
3004 int any_working;
3005 struct raid10_info *mirror = &conf->mirrors[i];
3006 struct md_rdev *mrdev, *mreplace;
3007
3008 rcu_read_lock();
3009 mrdev = rcu_dereference(mirror->rdev);
3010 mreplace = rcu_dereference(mirror->replacement);
3011
3012 if ((mrdev == NULL ||
3013 test_bit(Faulty, &mrdev->flags) ||
3014 test_bit(In_sync, &mrdev->flags)) &&
3015 (mreplace == NULL ||
3016 test_bit(Faulty, &mreplace->flags))) {
3017 rcu_read_unlock();
3018 continue;
3019 }
3020
3021 still_degraded = 0;
3022
3023 rb2 = r10_bio;
3024 sect = raid10_find_virt(conf, sector_nr, i);
3025 if (sect >= mddev->resync_max_sectors) {
3026
3027
3028
3029 rcu_read_unlock();
3030 continue;
3031 }
3032 if (mreplace && test_bit(Faulty, &mreplace->flags))
3033 mreplace = NULL;
3034
3035
3036
3037
3038 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3039 &sync_blocks, 1);
3040 if (sync_blocks < max_sync)
3041 max_sync = sync_blocks;
3042 if (!must_sync &&
3043 mreplace == NULL &&
3044 !conf->fullsync) {
3045
3046
3047
3048 chunks_skipped = -1;
3049 rcu_read_unlock();
3050 continue;
3051 }
3052 atomic_inc(&mrdev->nr_pending);
3053 if (mreplace)
3054 atomic_inc(&mreplace->nr_pending);
3055 rcu_read_unlock();
3056
3057 r10_bio = raid10_alloc_init_r10buf(conf);
3058 r10_bio->state = 0;
3059 raise_barrier(conf, rb2 != NULL);
3060 atomic_set(&r10_bio->remaining, 0);
3061
3062 r10_bio->master_bio = (struct bio*)rb2;
3063 if (rb2)
3064 atomic_inc(&rb2->remaining);
3065 r10_bio->mddev = mddev;
3066 set_bit(R10BIO_IsRecover, &r10_bio->state);
3067 r10_bio->sector = sect;
3068
3069 raid10_find_phys(conf, r10_bio);
3070
3071
3072
3073
3074 rcu_read_lock();
3075 for (j = 0; j < conf->geo.raid_disks; j++) {
3076 struct md_rdev *rdev = rcu_dereference(
3077 conf->mirrors[j].rdev);
3078 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3079 still_degraded = 1;
3080 break;
3081 }
3082 }
3083
3084 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3085 &sync_blocks, still_degraded);
3086
3087 any_working = 0;
3088 for (j=0; j<conf->copies;j++) {
3089 int k;
3090 int d = r10_bio->devs[j].devnum;
3091 sector_t from_addr, to_addr;
3092 struct md_rdev *rdev =
3093 rcu_dereference(conf->mirrors[d].rdev);
3094 sector_t sector, first_bad;
3095 int bad_sectors;
3096 if (!rdev ||
3097 !test_bit(In_sync, &rdev->flags))
3098 continue;
3099
3100 any_working = 1;
3101 sector = r10_bio->devs[j].addr;
3102
3103 if (is_badblock(rdev, sector, max_sync,
3104 &first_bad, &bad_sectors)) {
3105 if (first_bad > sector)
3106 max_sync = first_bad - sector;
3107 else {
3108 bad_sectors -= (sector
3109 - first_bad);
3110 if (max_sync > bad_sectors)
3111 max_sync = bad_sectors;
3112 continue;
3113 }
3114 }
3115 bio = r10_bio->devs[0].bio;
3116 bio->bi_next = biolist;
3117 biolist = bio;
3118 bio->bi_end_io = end_sync_read;
3119 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3120 if (test_bit(FailFast, &rdev->flags))
3121 bio->bi_opf |= MD_FAILFAST;
3122 from_addr = r10_bio->devs[j].addr;
3123 bio->bi_iter.bi_sector = from_addr +
3124 rdev->data_offset;
3125 bio_set_dev(bio, rdev->bdev);
3126 atomic_inc(&rdev->nr_pending);
3127
3128
3129 for (k=0; k<conf->copies; k++)
3130 if (r10_bio->devs[k].devnum == i)
3131 break;
3132 BUG_ON(k == conf->copies);
3133 to_addr = r10_bio->devs[k].addr;
3134 r10_bio->devs[0].devnum = d;
3135 r10_bio->devs[0].addr = from_addr;
3136 r10_bio->devs[1].devnum = i;
3137 r10_bio->devs[1].addr = to_addr;
3138
3139 if (!test_bit(In_sync, &mrdev->flags)) {
3140 bio = r10_bio->devs[1].bio;
3141 bio->bi_next = biolist;
3142 biolist = bio;
3143 bio->bi_end_io = end_sync_write;
3144 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3145 bio->bi_iter.bi_sector = to_addr
3146 + mrdev->data_offset;
3147 bio_set_dev(bio, mrdev->bdev);
3148 atomic_inc(&r10_bio->remaining);
3149 } else
3150 r10_bio->devs[1].bio->bi_end_io = NULL;
3151
3152
3153 bio = r10_bio->devs[1].repl_bio;
3154 if (bio)
3155 bio->bi_end_io = NULL;
3156
3157
3158
3159
3160
3161
3162
3163
3164 if (mreplace == NULL || bio == NULL ||
3165 test_bit(Faulty, &mreplace->flags))
3166 break;
3167 bio->bi_next = biolist;
3168 biolist = bio;
3169 bio->bi_end_io = end_sync_write;
3170 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3171 bio->bi_iter.bi_sector = to_addr +
3172 mreplace->data_offset;
3173 bio_set_dev(bio, mreplace->bdev);
3174 atomic_inc(&r10_bio->remaining);
3175 break;
3176 }
3177 rcu_read_unlock();
3178 if (j == conf->copies) {
3179
3180
3181 if (any_working) {
3182
3183
3184
3185 int k;
3186 for (k = 0; k < conf->copies; k++)
3187 if (r10_bio->devs[k].devnum == i)
3188 break;
3189 if (!test_bit(In_sync,
3190 &mrdev->flags)
3191 && !rdev_set_badblocks(
3192 mrdev,
3193 r10_bio->devs[k].addr,
3194 max_sync, 0))
3195 any_working = 0;
3196 if (mreplace &&
3197 !rdev_set_badblocks(
3198 mreplace,
3199 r10_bio->devs[k].addr,
3200 max_sync, 0))
3201 any_working = 0;
3202 }
3203 if (!any_working) {
3204 if (!test_and_set_bit(MD_RECOVERY_INTR,
3205 &mddev->recovery))
3206 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3207 mdname(mddev));
3208 mirror->recovery_disabled
3209 = mddev->recovery_disabled;
3210 }
3211 put_buf(r10_bio);
3212 if (rb2)
3213 atomic_dec(&rb2->remaining);
3214 r10_bio = rb2;
3215 rdev_dec_pending(mrdev, mddev);
3216 if (mreplace)
3217 rdev_dec_pending(mreplace, mddev);
3218 break;
3219 }
3220 rdev_dec_pending(mrdev, mddev);
3221 if (mreplace)
3222 rdev_dec_pending(mreplace, mddev);
3223 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
3224
3225
3226
3227
3228 int targets = 1;
3229 for (; j < conf->copies; j++) {
3230 int d = r10_bio->devs[j].devnum;
3231 if (conf->mirrors[d].rdev &&
3232 test_bit(In_sync,
3233 &conf->mirrors[d].rdev->flags))
3234 targets++;
3235 }
3236 if (targets == 1)
3237 r10_bio->devs[0].bio->bi_opf
3238 &= ~MD_FAILFAST;
3239 }
3240 }
3241 if (biolist == NULL) {
3242 while (r10_bio) {
3243 struct r10bio *rb2 = r10_bio;
3244 r10_bio = (struct r10bio*) rb2->master_bio;
3245 rb2->master_bio = NULL;
3246 put_buf(rb2);
3247 }
3248 goto giveup;
3249 }
3250 } else {
3251
3252 int count = 0;
3253
3254 bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
3255
3256 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3257 &sync_blocks, mddev->degraded) &&
3258 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3259 &mddev->recovery)) {
3260
3261 *skipped = 1;
3262 return sync_blocks + sectors_skipped;
3263 }
3264 if (sync_blocks < max_sync)
3265 max_sync = sync_blocks;
3266 r10_bio = raid10_alloc_init_r10buf(conf);
3267 r10_bio->state = 0;
3268
3269 r10_bio->mddev = mddev;
3270 atomic_set(&r10_bio->remaining, 0);
3271 raise_barrier(conf, 0);
3272 conf->next_resync = sector_nr;
3273
3274 r10_bio->master_bio = NULL;
3275 r10_bio->sector = sector_nr;
3276 set_bit(R10BIO_IsSync, &r10_bio->state);
3277 raid10_find_phys(conf, r10_bio);
3278 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3279
3280 for (i = 0; i < conf->copies; i++) {
3281 int d = r10_bio->devs[i].devnum;
3282 sector_t first_bad, sector;
3283 int bad_sectors;
3284 struct md_rdev *rdev;
3285
3286 if (r10_bio->devs[i].repl_bio)
3287 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3288
3289 bio = r10_bio->devs[i].bio;
3290 bio->bi_status = BLK_STS_IOERR;
3291 rcu_read_lock();
3292 rdev = rcu_dereference(conf->mirrors[d].rdev);
3293 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3294 rcu_read_unlock();
3295 continue;
3296 }
3297 sector = r10_bio->devs[i].addr;
3298 if (is_badblock(rdev, sector, max_sync,
3299 &first_bad, &bad_sectors)) {
3300 if (first_bad > sector)
3301 max_sync = first_bad - sector;
3302 else {
3303 bad_sectors -= (sector - first_bad);
3304 if (max_sync > bad_sectors)
3305 max_sync = bad_sectors;
3306 rcu_read_unlock();
3307 continue;
3308 }
3309 }
3310 atomic_inc(&rdev->nr_pending);
3311 atomic_inc(&r10_bio->remaining);
3312 bio->bi_next = biolist;
3313 biolist = bio;
3314 bio->bi_end_io = end_sync_read;
3315 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3316 if (test_bit(FailFast, &rdev->flags))
3317 bio->bi_opf |= MD_FAILFAST;
3318 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3319 bio_set_dev(bio, rdev->bdev);
3320 count++;
3321
3322 rdev = rcu_dereference(conf->mirrors[d].replacement);
3323 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3324 rcu_read_unlock();
3325 continue;
3326 }
3327 atomic_inc(&rdev->nr_pending);
3328
3329
3330 bio = r10_bio->devs[i].repl_bio;
3331 bio->bi_status = BLK_STS_IOERR;
3332
3333 sector = r10_bio->devs[i].addr;
3334 bio->bi_next = biolist;
3335 biolist = bio;
3336 bio->bi_end_io = end_sync_write;
3337 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3338 if (test_bit(FailFast, &rdev->flags))
3339 bio->bi_opf |= MD_FAILFAST;
3340 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3341 bio_set_dev(bio, rdev->bdev);
3342 count++;
3343 rcu_read_unlock();
3344 }
3345
3346 if (count < 2) {
3347 for (i=0; i<conf->copies; i++) {
3348 int d = r10_bio->devs[i].devnum;
3349 if (r10_bio->devs[i].bio->bi_end_io)
3350 rdev_dec_pending(conf->mirrors[d].rdev,
3351 mddev);
3352 if (r10_bio->devs[i].repl_bio &&
3353 r10_bio->devs[i].repl_bio->bi_end_io)
3354 rdev_dec_pending(
3355 conf->mirrors[d].replacement,
3356 mddev);
3357 }
3358 put_buf(r10_bio);
3359 biolist = NULL;
3360 goto giveup;
3361 }
3362 }
3363
3364 nr_sectors = 0;
3365 if (sector_nr + max_sync < max_sector)
3366 max_sector = sector_nr + max_sync;
3367 do {
3368 struct page *page;
3369 int len = PAGE_SIZE;
3370 if (sector_nr + (len>>9) > max_sector)
3371 len = (max_sector - sector_nr) << 9;
3372 if (len == 0)
3373 break;
3374 for (bio= biolist ; bio ; bio=bio->bi_next) {
3375 struct resync_pages *rp = get_resync_pages(bio);
3376 page = resync_fetch_page(rp, page_idx);
3377
3378
3379
3380
3381 bio_add_page(bio, page, len, 0);
3382 }
3383 nr_sectors += len>>9;
3384 sector_nr += len>>9;
3385 } while (++page_idx < RESYNC_PAGES);
3386 r10_bio->sectors = nr_sectors;
3387
3388 while (biolist) {
3389 bio = biolist;
3390 biolist = biolist->bi_next;
3391
3392 bio->bi_next = NULL;
3393 r10_bio = get_resync_r10bio(bio);
3394 r10_bio->sectors = nr_sectors;
3395
3396 if (bio->bi_end_io == end_sync_read) {
3397 md_sync_acct_bio(bio, nr_sectors);
3398 bio->bi_status = 0;
3399 generic_make_request(bio);
3400 }
3401 }
3402
3403 if (sectors_skipped)
3404
3405
3406
3407 md_done_sync(mddev, sectors_skipped, 1);
3408
3409 return sectors_skipped + nr_sectors;
3410 giveup:
3411
3412
3413
3414
3415 if (sector_nr + max_sync < max_sector)
3416 max_sector = sector_nr + max_sync;
3417
3418 sectors_skipped += (max_sector - sector_nr);
3419 chunks_skipped ++;
3420 sector_nr = max_sector;
3421 goto skipped;
3422}
3423
3424static sector_t
3425raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3426{
3427 sector_t size;
3428 struct r10conf *conf = mddev->private;
3429
3430 if (!raid_disks)
3431 raid_disks = min(conf->geo.raid_disks,
3432 conf->prev.raid_disks);
3433 if (!sectors)
3434 sectors = conf->dev_sectors;
3435
3436 size = sectors >> conf->geo.chunk_shift;
3437 sector_div(size, conf->geo.far_copies);
3438 size = size * raid_disks;
3439 sector_div(size, conf->geo.near_copies);
3440
3441 return size << conf->geo.chunk_shift;
3442}
3443
3444static void calc_sectors(struct r10conf *conf, sector_t size)
3445{
3446
3447
3448
3449
3450
3451 size = size >> conf->geo.chunk_shift;
3452 sector_div(size, conf->geo.far_copies);
3453 size = size * conf->geo.raid_disks;
3454 sector_div(size, conf->geo.near_copies);
3455
3456
3457 size = size * conf->copies;
3458
3459
3460
3461
3462 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3463
3464 conf->dev_sectors = size << conf->geo.chunk_shift;
3465
3466 if (conf->geo.far_offset)
3467 conf->geo.stride = 1 << conf->geo.chunk_shift;
3468 else {
3469 sector_div(size, conf->geo.far_copies);
3470 conf->geo.stride = size << conf->geo.chunk_shift;
3471 }
3472}
3473
3474enum geo_type {geo_new, geo_old, geo_start};
3475static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3476{
3477 int nc, fc, fo;
3478 int layout, chunk, disks;
3479 switch (new) {
3480 case geo_old:
3481 layout = mddev->layout;
3482 chunk = mddev->chunk_sectors;
3483 disks = mddev->raid_disks - mddev->delta_disks;
3484 break;
3485 case geo_new:
3486 layout = mddev->new_layout;
3487 chunk = mddev->new_chunk_sectors;
3488 disks = mddev->raid_disks;
3489 break;
3490 default:
3491 case geo_start:
3492
3493 layout = mddev->new_layout;
3494 chunk = mddev->new_chunk_sectors;
3495 disks = mddev->raid_disks + mddev->delta_disks;
3496 break;
3497 }
3498 if (layout >> 19)
3499 return -1;
3500 if (chunk < (PAGE_SIZE >> 9) ||
3501 !is_power_of_2(chunk))
3502 return -2;
3503 nc = layout & 255;
3504 fc = (layout >> 8) & 255;
3505 fo = layout & (1<<16);
3506 geo->raid_disks = disks;
3507 geo->near_copies = nc;
3508 geo->far_copies = fc;
3509 geo->far_offset = fo;
3510 switch (layout >> 17) {
3511 case 0:
3512 geo->far_set_size = disks;
3513 break;
3514 case 1:
3515
3516 geo->far_set_size = disks/fc;
3517 WARN(geo->far_set_size < fc,
3518 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3519 break;
3520 case 2:
3521 geo->far_set_size = fc * nc;
3522 break;
3523 default:
3524 return -1;
3525 }
3526 geo->chunk_mask = chunk - 1;
3527 geo->chunk_shift = ffz(~chunk);
3528 return nc*fc;
3529}
3530
3531static struct r10conf *setup_conf(struct mddev *mddev)
3532{
3533 struct r10conf *conf = NULL;
3534 int err = -EINVAL;
3535 struct geom geo;
3536 int copies;
3537
3538 copies = setup_geo(&geo, mddev, geo_new);
3539
3540 if (copies == -2) {
3541 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3542 mdname(mddev), PAGE_SIZE);
3543 goto out;
3544 }
3545
3546 if (copies < 2 || copies > mddev->raid_disks) {
3547 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3548 mdname(mddev), mddev->new_layout);
3549 goto out;
3550 }
3551
3552 err = -ENOMEM;
3553 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3554 if (!conf)
3555 goto out;
3556
3557
3558 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3559 max(0,-mddev->delta_disks)),
3560 GFP_KERNEL);
3561 if (!conf->mirrors)
3562 goto out;
3563
3564 conf->tmppage = alloc_page(GFP_KERNEL);
3565 if (!conf->tmppage)
3566 goto out;
3567
3568 conf->geo = geo;
3569 conf->copies = copies;
3570 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3571 r10bio_pool_free, conf);
3572 if (!conf->r10bio_pool)
3573 goto out;
3574
3575 conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
3576 if (!conf->bio_split)
3577 goto out;
3578
3579 calc_sectors(conf, mddev->dev_sectors);
3580 if (mddev->reshape_position == MaxSector) {
3581 conf->prev = conf->geo;
3582 conf->reshape_progress = MaxSector;
3583 } else {
3584 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3585 err = -EINVAL;
3586 goto out;
3587 }
3588 conf->reshape_progress = mddev->reshape_position;
3589 if (conf->prev.far_offset)
3590 conf->prev.stride = 1 << conf->prev.chunk_shift;
3591 else
3592
3593 conf->prev.stride = conf->dev_sectors;
3594 }
3595 conf->reshape_safe = conf->reshape_progress;
3596 spin_lock_init(&conf->device_lock);
3597 INIT_LIST_HEAD(&conf->retry_list);
3598 INIT_LIST_HEAD(&conf->bio_end_io_list);
3599
3600 spin_lock_init(&conf->resync_lock);
3601 init_waitqueue_head(&conf->wait_barrier);
3602 atomic_set(&conf->nr_pending, 0);
3603
3604 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3605 if (!conf->thread)
3606 goto out;
3607
3608 conf->mddev = mddev;
3609 return conf;
3610
3611 out:
3612 if (conf) {
3613 mempool_destroy(conf->r10bio_pool);
3614 kfree(conf->mirrors);
3615 safe_put_page(conf->tmppage);
3616 if (conf->bio_split)
3617 bioset_free(conf->bio_split);
3618 kfree(conf);
3619 }
3620 return ERR_PTR(err);
3621}
3622
3623static int raid10_run(struct mddev *mddev)
3624{
3625 struct r10conf *conf;
3626 int i, disk_idx, chunk_size;
3627 struct raid10_info *disk;
3628 struct md_rdev *rdev;
3629 sector_t size;
3630 sector_t min_offset_diff = 0;
3631 int first = 1;
3632 bool discard_supported = false;
3633
3634 if (mddev_init_writes_pending(mddev) < 0)
3635 return -ENOMEM;
3636
3637 if (mddev->private == NULL) {
3638 conf = setup_conf(mddev);
3639 if (IS_ERR(conf))
3640 return PTR_ERR(conf);
3641 mddev->private = conf;
3642 }
3643 conf = mddev->private;
3644 if (!conf)
3645 goto out;
3646
3647 mddev->thread = conf->thread;
3648 conf->thread = NULL;
3649
3650 chunk_size = mddev->chunk_sectors << 9;
3651 if (mddev->queue) {
3652 blk_queue_max_discard_sectors(mddev->queue,
3653 mddev->chunk_sectors);
3654 blk_queue_max_write_same_sectors(mddev->queue, 0);
3655 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
3656 blk_queue_io_min(mddev->queue, chunk_size);
3657 if (conf->geo.raid_disks % conf->geo.near_copies)
3658 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3659 else
3660 blk_queue_io_opt(mddev->queue, chunk_size *
3661 (conf->geo.raid_disks / conf->geo.near_copies));
3662 }
3663
3664 rdev_for_each(rdev, mddev) {
3665 long long diff;
3666
3667 disk_idx = rdev->raid_disk;
3668 if (disk_idx < 0)
3669 continue;
3670 if (disk_idx >= conf->geo.raid_disks &&
3671 disk_idx >= conf->prev.raid_disks)
3672 continue;
3673 disk = conf->mirrors + disk_idx;
3674
3675 if (test_bit(Replacement, &rdev->flags)) {
3676 if (disk->replacement)
3677 goto out_free_conf;
3678 disk->replacement = rdev;
3679 } else {
3680 if (disk->rdev)
3681 goto out_free_conf;
3682 disk->rdev = rdev;
3683 }
3684 diff = (rdev->new_data_offset - rdev->data_offset);
3685 if (!mddev->reshape_backwards)
3686 diff = -diff;
3687 if (diff < 0)
3688 diff = 0;
3689 if (first || diff < min_offset_diff)
3690 min_offset_diff = diff;
3691
3692 if (mddev->gendisk)
3693 disk_stack_limits(mddev->gendisk, rdev->bdev,
3694 rdev->data_offset << 9);
3695
3696 disk->head_position = 0;
3697
3698 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3699 discard_supported = true;
3700 first = 0;
3701 }
3702
3703 if (mddev->queue) {
3704 if (discard_supported)
3705 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3706 mddev->queue);
3707 else
3708 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3709 mddev->queue);
3710 }
3711
3712 if (!enough(conf, -1)) {
3713 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3714 mdname(mddev));
3715 goto out_free_conf;
3716 }
3717
3718 if (conf->reshape_progress != MaxSector) {
3719
3720 if (conf->geo.far_copies != 1 &&
3721 conf->geo.far_offset == 0)
3722 goto out_free_conf;
3723 if (conf->prev.far_copies != 1 &&
3724 conf->prev.far_offset == 0)
3725 goto out_free_conf;
3726 }
3727
3728 mddev->degraded = 0;
3729 for (i = 0;
3730 i < conf->geo.raid_disks
3731 || i < conf->prev.raid_disks;
3732 i++) {
3733
3734 disk = conf->mirrors + i;
3735
3736 if (!disk->rdev && disk->replacement) {
3737
3738 disk->rdev = disk->replacement;
3739 disk->replacement = NULL;
3740 clear_bit(Replacement, &disk->rdev->flags);
3741 }
3742
3743 if (!disk->rdev ||
3744 !test_bit(In_sync, &disk->rdev->flags)) {
3745 disk->head_position = 0;
3746 mddev->degraded++;
3747 if (disk->rdev &&
3748 disk->rdev->saved_raid_disk < 0)
3749 conf->fullsync = 1;
3750 }
3751 disk->recovery_disabled = mddev->recovery_disabled - 1;
3752 }
3753
3754 if (mddev->recovery_cp != MaxSector)
3755 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3756 mdname(mddev));
3757 pr_info("md/raid10:%s: active with %d out of %d devices\n",
3758 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3759 conf->geo.raid_disks);
3760
3761
3762
3763 mddev->dev_sectors = conf->dev_sectors;
3764 size = raid10_size(mddev, 0, 0);
3765 md_set_array_sectors(mddev, size);
3766 mddev->resync_max_sectors = size;
3767 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3768
3769 if (mddev->queue) {
3770 int stripe = conf->geo.raid_disks *
3771 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3772
3773
3774
3775
3776
3777 stripe /= conf->geo.near_copies;
3778 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
3779 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
3780 }
3781
3782 if (md_integrity_register(mddev))
3783 goto out_free_conf;
3784
3785 if (conf->reshape_progress != MaxSector) {
3786 unsigned long before_length, after_length;
3787
3788 before_length = ((1 << conf->prev.chunk_shift) *
3789 conf->prev.far_copies);
3790 after_length = ((1 << conf->geo.chunk_shift) *
3791 conf->geo.far_copies);
3792
3793 if (max(before_length, after_length) > min_offset_diff) {
3794
3795 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3796 goto out_free_conf;
3797 }
3798 conf->offset_diff = min_offset_diff;
3799
3800 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3801 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3802 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3803 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3804 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3805 "reshape");
3806 }
3807
3808 return 0;
3809
3810out_free_conf:
3811 md_unregister_thread(&mddev->thread);
3812 mempool_destroy(conf->r10bio_pool);
3813 safe_put_page(conf->tmppage);
3814 kfree(conf->mirrors);
3815 kfree(conf);
3816 mddev->private = NULL;
3817out:
3818 return -EIO;
3819}
3820
3821static void raid10_free(struct mddev *mddev, void *priv)
3822{
3823 struct r10conf *conf = priv;
3824
3825 mempool_destroy(conf->r10bio_pool);
3826 safe_put_page(conf->tmppage);
3827 kfree(conf->mirrors);
3828 kfree(conf->mirrors_old);
3829 kfree(conf->mirrors_new);
3830 if (conf->bio_split)
3831 bioset_free(conf->bio_split);
3832 kfree(conf);
3833}
3834
3835static void raid10_quiesce(struct mddev *mddev, int state)
3836{
3837 struct r10conf *conf = mddev->private;
3838
3839 switch(state) {
3840 case 1:
3841 raise_barrier(conf, 0);
3842 break;
3843 case 0:
3844 lower_barrier(conf);
3845 break;
3846 }
3847}
3848
3849static int raid10_resize(struct mddev *mddev, sector_t sectors)
3850{
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863 struct r10conf *conf = mddev->private;
3864 sector_t oldsize, size;
3865
3866 if (mddev->reshape_position != MaxSector)
3867 return -EBUSY;
3868
3869 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3870 return -EINVAL;
3871
3872 oldsize = raid10_size(mddev, 0, 0);
3873 size = raid10_size(mddev, sectors, 0);
3874 if (mddev->external_size &&
3875 mddev->array_sectors > size)
3876 return -EINVAL;
3877 if (mddev->bitmap) {
3878 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3879 if (ret)
3880 return ret;
3881 }
3882 md_set_array_sectors(mddev, size);
3883 if (sectors > mddev->dev_sectors &&
3884 mddev->recovery_cp > oldsize) {
3885 mddev->recovery_cp = oldsize;
3886 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3887 }
3888 calc_sectors(conf, sectors);
3889 mddev->dev_sectors = conf->dev_sectors;
3890 mddev->resync_max_sectors = size;
3891 return 0;
3892}
3893
3894static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
3895{
3896 struct md_rdev *rdev;
3897 struct r10conf *conf;
3898
3899 if (mddev->degraded > 0) {
3900 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
3901 mdname(mddev));
3902 return ERR_PTR(-EINVAL);
3903 }
3904 sector_div(size, devs);
3905
3906
3907 mddev->new_level = 10;
3908
3909 mddev->new_layout = (1<<8) + 2;
3910 mddev->new_chunk_sectors = mddev->chunk_sectors;
3911 mddev->delta_disks = mddev->raid_disks;
3912 mddev->raid_disks *= 2;
3913
3914 mddev->recovery_cp = MaxSector;
3915 mddev->dev_sectors = size;
3916
3917 conf = setup_conf(mddev);
3918 if (!IS_ERR(conf)) {
3919 rdev_for_each(rdev, mddev)
3920 if (rdev->raid_disk >= 0) {
3921 rdev->new_raid_disk = rdev->raid_disk * 2;
3922 rdev->sectors = size;
3923 }
3924 conf->barrier = 1;
3925 }
3926
3927 return conf;
3928}
3929
3930static void *raid10_takeover(struct mddev *mddev)
3931{
3932 struct r0conf *raid0_conf;
3933
3934
3935
3936
3937 if (mddev->level == 0) {
3938
3939 raid0_conf = mddev->private;
3940 if (raid0_conf->nr_strip_zones > 1) {
3941 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
3942 mdname(mddev));
3943 return ERR_PTR(-EINVAL);
3944 }
3945 return raid10_takeover_raid0(mddev,
3946 raid0_conf->strip_zone->zone_end,
3947 raid0_conf->strip_zone->nb_dev);
3948 }
3949 return ERR_PTR(-EINVAL);
3950}
3951
3952static int raid10_check_reshape(struct mddev *mddev)
3953{
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968 struct r10conf *conf = mddev->private;
3969 struct geom geo;
3970
3971 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3972 return -EINVAL;
3973
3974 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3975
3976 return -EINVAL;
3977 if (geo.far_copies > 1 && !geo.far_offset)
3978
3979 return -EINVAL;
3980
3981 if (mddev->array_sectors & geo.chunk_mask)
3982
3983 return -EINVAL;
3984
3985 if (!enough(conf, -1))
3986 return -EINVAL;
3987
3988 kfree(conf->mirrors_new);
3989 conf->mirrors_new = NULL;
3990 if (mddev->delta_disks > 0) {
3991
3992 conf->mirrors_new = kzalloc(
3993 sizeof(struct raid10_info)
3994 *(mddev->raid_disks +
3995 mddev->delta_disks),
3996 GFP_KERNEL);
3997 if (!conf->mirrors_new)
3998 return -ENOMEM;
3999 }
4000 return 0;
4001}
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016static int calc_degraded(struct r10conf *conf)
4017{
4018 int degraded, degraded2;
4019 int i;
4020
4021 rcu_read_lock();
4022 degraded = 0;
4023
4024 for (i = 0; i < conf->prev.raid_disks; i++) {
4025 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4026 if (!rdev || test_bit(Faulty, &rdev->flags))
4027 degraded++;
4028 else if (!test_bit(In_sync, &rdev->flags))
4029
4030
4031
4032
4033 degraded++;
4034 }
4035 rcu_read_unlock();
4036 if (conf->geo.raid_disks == conf->prev.raid_disks)
4037 return degraded;
4038 rcu_read_lock();
4039 degraded2 = 0;
4040 for (i = 0; i < conf->geo.raid_disks; i++) {
4041 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4042 if (!rdev || test_bit(Faulty, &rdev->flags))
4043 degraded2++;
4044 else if (!test_bit(In_sync, &rdev->flags)) {
4045
4046
4047
4048
4049
4050 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4051 degraded2++;
4052 }
4053 }
4054 rcu_read_unlock();
4055 if (degraded2 > degraded)
4056 return degraded2;
4057 return degraded;
4058}
4059
4060static int raid10_start_reshape(struct mddev *mddev)
4061{
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072 unsigned long before_length, after_length;
4073 sector_t min_offset_diff = 0;
4074 int first = 1;
4075 struct geom new;
4076 struct r10conf *conf = mddev->private;
4077 struct md_rdev *rdev;
4078 int spares = 0;
4079 int ret;
4080
4081 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4082 return -EBUSY;
4083
4084 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4085 return -EINVAL;
4086
4087 before_length = ((1 << conf->prev.chunk_shift) *
4088 conf->prev.far_copies);
4089 after_length = ((1 << conf->geo.chunk_shift) *
4090 conf->geo.far_copies);
4091
4092 rdev_for_each(rdev, mddev) {
4093 if (!test_bit(In_sync, &rdev->flags)
4094 && !test_bit(Faulty, &rdev->flags))
4095 spares++;
4096 if (rdev->raid_disk >= 0) {
4097 long long diff = (rdev->new_data_offset
4098 - rdev->data_offset);
4099 if (!mddev->reshape_backwards)
4100 diff = -diff;
4101 if (diff < 0)
4102 diff = 0;
4103 if (first || diff < min_offset_diff)
4104 min_offset_diff = diff;
4105 first = 0;
4106 }
4107 }
4108
4109 if (max(before_length, after_length) > min_offset_diff)
4110 return -EINVAL;
4111
4112 if (spares < mddev->delta_disks)
4113 return -EINVAL;
4114
4115 conf->offset_diff = min_offset_diff;
4116 spin_lock_irq(&conf->device_lock);
4117 if (conf->mirrors_new) {
4118 memcpy(conf->mirrors_new, conf->mirrors,
4119 sizeof(struct raid10_info)*conf->prev.raid_disks);
4120 smp_mb();
4121 kfree(conf->mirrors_old);
4122 conf->mirrors_old = conf->mirrors;
4123 conf->mirrors = conf->mirrors_new;
4124 conf->mirrors_new = NULL;
4125 }
4126 setup_geo(&conf->geo, mddev, geo_start);
4127 smp_mb();
4128 if (mddev->reshape_backwards) {
4129 sector_t size = raid10_size(mddev, 0, 0);
4130 if (size < mddev->array_sectors) {
4131 spin_unlock_irq(&conf->device_lock);
4132 pr_warn("md/raid10:%s: array size must be reduce before number of disks\n",
4133 mdname(mddev));
4134 return -EINVAL;
4135 }
4136 mddev->resync_max_sectors = size;
4137 conf->reshape_progress = size;
4138 } else
4139 conf->reshape_progress = 0;
4140 conf->reshape_safe = conf->reshape_progress;
4141 spin_unlock_irq(&conf->device_lock);
4142
4143 if (mddev->delta_disks && mddev->bitmap) {
4144 ret = bitmap_resize(mddev->bitmap,
4145 raid10_size(mddev, 0,
4146 conf->geo.raid_disks),
4147 0, 0);
4148 if (ret)
4149 goto abort;
4150 }
4151 if (mddev->delta_disks > 0) {
4152 rdev_for_each(rdev, mddev)
4153 if (rdev->raid_disk < 0 &&
4154 !test_bit(Faulty, &rdev->flags)) {
4155 if (raid10_add_disk(mddev, rdev) == 0) {
4156 if (rdev->raid_disk >=
4157 conf->prev.raid_disks)
4158 set_bit(In_sync, &rdev->flags);
4159 else
4160 rdev->recovery_offset = 0;
4161
4162 if (sysfs_link_rdev(mddev, rdev))
4163 ;
4164 }
4165 } else if (rdev->raid_disk >= conf->prev.raid_disks
4166 && !test_bit(Faulty, &rdev->flags)) {
4167
4168 set_bit(In_sync, &rdev->flags);
4169 }
4170 }
4171
4172
4173
4174
4175 spin_lock_irq(&conf->device_lock);
4176 mddev->degraded = calc_degraded(conf);
4177 spin_unlock_irq(&conf->device_lock);
4178 mddev->raid_disks = conf->geo.raid_disks;
4179 mddev->reshape_position = conf->reshape_progress;
4180 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4181
4182 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4183 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4184 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4185 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4186 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4187
4188 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4189 "reshape");
4190 if (!mddev->sync_thread) {
4191 ret = -EAGAIN;
4192 goto abort;
4193 }
4194 conf->reshape_checkpoint = jiffies;
4195 md_wakeup_thread(mddev->sync_thread);
4196 md_new_event(mddev);
4197 return 0;
4198
4199abort:
4200 mddev->recovery = 0;
4201 spin_lock_irq(&conf->device_lock);
4202 conf->geo = conf->prev;
4203 mddev->raid_disks = conf->geo.raid_disks;
4204 rdev_for_each(rdev, mddev)
4205 rdev->new_data_offset = rdev->data_offset;
4206 smp_wmb();
4207 conf->reshape_progress = MaxSector;
4208 conf->reshape_safe = MaxSector;
4209 mddev->reshape_position = MaxSector;
4210 spin_unlock_irq(&conf->device_lock);
4211 return ret;
4212}
4213
4214
4215
4216
4217
4218
4219
4220static sector_t last_dev_address(sector_t s, struct geom *geo)
4221{
4222 s = (s | geo->chunk_mask) + 1;
4223 s >>= geo->chunk_shift;
4224 s *= geo->near_copies;
4225 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4226 s *= geo->far_copies;
4227 s <<= geo->chunk_shift;
4228 return s;
4229}
4230
4231
4232
4233
4234
4235static sector_t first_dev_address(sector_t s, struct geom *geo)
4236{
4237 s >>= geo->chunk_shift;
4238 s *= geo->near_copies;
4239 sector_div(s, geo->raid_disks);
4240 s *= geo->far_copies;
4241 s <<= geo->chunk_shift;
4242 return s;
4243}
4244
4245static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4246 int *skipped)
4247{
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285 struct r10conf *conf = mddev->private;
4286 struct r10bio *r10_bio;
4287 sector_t next, safe, last;
4288 int max_sectors;
4289 int nr_sectors;
4290 int s;
4291 struct md_rdev *rdev;
4292 int need_flush = 0;
4293 struct bio *blist;
4294 struct bio *bio, *read_bio;
4295 int sectors_done = 0;
4296 struct page **pages;
4297
4298 if (sector_nr == 0) {
4299
4300 if (mddev->reshape_backwards &&
4301 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4302 sector_nr = (raid10_size(mddev, 0, 0)
4303 - conf->reshape_progress);
4304 } else if (!mddev->reshape_backwards &&
4305 conf->reshape_progress > 0)
4306 sector_nr = conf->reshape_progress;
4307 if (sector_nr) {
4308 mddev->curr_resync_completed = sector_nr;
4309 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4310 *skipped = 1;
4311 return sector_nr;
4312 }
4313 }
4314
4315
4316
4317
4318
4319 if (mddev->reshape_backwards) {
4320
4321
4322
4323 next = first_dev_address(conf->reshape_progress - 1,
4324 &conf->geo);
4325
4326
4327
4328
4329 safe = last_dev_address(conf->reshape_safe - 1,
4330 &conf->prev);
4331
4332 if (next + conf->offset_diff < safe)
4333 need_flush = 1;
4334
4335 last = conf->reshape_progress - 1;
4336 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4337 & conf->prev.chunk_mask);
4338 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4339 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4340 } else {
4341
4342
4343
4344 next = last_dev_address(conf->reshape_progress, &conf->geo);
4345
4346
4347
4348
4349 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4350
4351
4352
4353
4354 if (next > safe + conf->offset_diff)
4355 need_flush = 1;
4356
4357 sector_nr = conf->reshape_progress;
4358 last = sector_nr | (conf->geo.chunk_mask
4359 & conf->prev.chunk_mask);
4360
4361 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4362 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4363 }
4364
4365 if (need_flush ||
4366 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4367
4368 wait_barrier(conf);
4369 mddev->reshape_position = conf->reshape_progress;
4370 if (mddev->reshape_backwards)
4371 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4372 - conf->reshape_progress;
4373 else
4374 mddev->curr_resync_completed = conf->reshape_progress;
4375 conf->reshape_checkpoint = jiffies;
4376 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4377 md_wakeup_thread(mddev->thread);
4378 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4379 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4380 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4381 allow_barrier(conf);
4382 return sectors_done;
4383 }
4384 conf->reshape_safe = mddev->reshape_position;
4385 allow_barrier(conf);
4386 }
4387
4388read_more:
4389
4390 r10_bio = raid10_alloc_init_r10buf(conf);
4391 r10_bio->state = 0;
4392 raise_barrier(conf, sectors_done != 0);
4393 atomic_set(&r10_bio->remaining, 0);
4394 r10_bio->mddev = mddev;
4395 r10_bio->sector = sector_nr;
4396 set_bit(R10BIO_IsReshape, &r10_bio->state);
4397 r10_bio->sectors = last - sector_nr + 1;
4398 rdev = read_balance(conf, r10_bio, &max_sectors);
4399 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4400
4401 if (!rdev) {
4402
4403
4404
4405
4406 mempool_free(r10_bio, conf->r10buf_pool);
4407 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4408 return sectors_done;
4409 }
4410
4411 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4412
4413 bio_set_dev(read_bio, rdev->bdev);
4414 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4415 + rdev->data_offset);
4416 read_bio->bi_private = r10_bio;
4417 read_bio->bi_end_io = end_reshape_read;
4418 bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4419 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4420 read_bio->bi_status = 0;
4421 read_bio->bi_vcnt = 0;
4422 read_bio->bi_iter.bi_size = 0;
4423 r10_bio->master_bio = read_bio;
4424 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4425
4426
4427 __raid10_find_phys(&conf->geo, r10_bio);
4428
4429 blist = read_bio;
4430 read_bio->bi_next = NULL;
4431
4432 rcu_read_lock();
4433 for (s = 0; s < conf->copies*2; s++) {
4434 struct bio *b;
4435 int d = r10_bio->devs[s/2].devnum;
4436 struct md_rdev *rdev2;
4437 if (s&1) {
4438 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4439 b = r10_bio->devs[s/2].repl_bio;
4440 } else {
4441 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4442 b = r10_bio->devs[s/2].bio;
4443 }
4444 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4445 continue;
4446
4447 bio_set_dev(b, rdev2->bdev);
4448 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4449 rdev2->new_data_offset;
4450 b->bi_end_io = end_reshape_write;
4451 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4452 b->bi_next = blist;
4453 blist = b;
4454 }
4455
4456
4457
4458 nr_sectors = 0;
4459 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4460 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4461 struct page *page = pages[s / (PAGE_SIZE >> 9)];
4462 int len = (max_sectors - s) << 9;
4463 if (len > PAGE_SIZE)
4464 len = PAGE_SIZE;
4465 for (bio = blist; bio ; bio = bio->bi_next) {
4466
4467
4468
4469
4470 bio_add_page(bio, page, len, 0);
4471 }
4472 sector_nr += len >> 9;
4473 nr_sectors += len >> 9;
4474 }
4475 rcu_read_unlock();
4476 r10_bio->sectors = nr_sectors;
4477
4478
4479 md_sync_acct_bio(read_bio, r10_bio->sectors);
4480 atomic_inc(&r10_bio->remaining);
4481 read_bio->bi_next = NULL;
4482 generic_make_request(read_bio);
4483 sector_nr += nr_sectors;
4484 sectors_done += nr_sectors;
4485 if (sector_nr <= last)
4486 goto read_more;
4487
4488
4489
4490
4491 if (mddev->reshape_backwards)
4492 conf->reshape_progress -= sectors_done;
4493 else
4494 conf->reshape_progress += sectors_done;
4495
4496 return sectors_done;
4497}
4498
4499static void end_reshape_request(struct r10bio *r10_bio);
4500static int handle_reshape_read_error(struct mddev *mddev,
4501 struct r10bio *r10_bio);
4502static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4503{
4504
4505
4506
4507
4508
4509 struct r10conf *conf = mddev->private;
4510 int s;
4511
4512 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4513 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4514
4515 md_done_sync(mddev, r10_bio->sectors, 0);
4516 return;
4517 }
4518
4519
4520
4521
4522 atomic_set(&r10_bio->remaining, 1);
4523 for (s = 0; s < conf->copies*2; s++) {
4524 struct bio *b;
4525 int d = r10_bio->devs[s/2].devnum;
4526 struct md_rdev *rdev;
4527 rcu_read_lock();
4528 if (s&1) {
4529 rdev = rcu_dereference(conf->mirrors[d].replacement);
4530 b = r10_bio->devs[s/2].repl_bio;
4531 } else {
4532 rdev = rcu_dereference(conf->mirrors[d].rdev);
4533 b = r10_bio->devs[s/2].bio;
4534 }
4535 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4536 rcu_read_unlock();
4537 continue;
4538 }
4539 atomic_inc(&rdev->nr_pending);
4540 rcu_read_unlock();
4541 md_sync_acct_bio(b, r10_bio->sectors);
4542 atomic_inc(&r10_bio->remaining);
4543 b->bi_next = NULL;
4544 generic_make_request(b);
4545 }
4546 end_reshape_request(r10_bio);
4547}
4548
4549static void end_reshape(struct r10conf *conf)
4550{
4551 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4552 return;
4553
4554 spin_lock_irq(&conf->device_lock);
4555 conf->prev = conf->geo;
4556 md_finish_reshape(conf->mddev);
4557 smp_wmb();
4558 conf->reshape_progress = MaxSector;
4559 conf->reshape_safe = MaxSector;
4560 spin_unlock_irq(&conf->device_lock);
4561
4562
4563
4564
4565 if (conf->mddev->queue) {
4566 int stripe = conf->geo.raid_disks *
4567 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4568 stripe /= conf->geo.near_copies;
4569 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
4570 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
4571 }
4572 conf->fullsync = 0;
4573}
4574
4575static int handle_reshape_read_error(struct mddev *mddev,
4576 struct r10bio *r10_bio)
4577{
4578
4579 int sectors = r10_bio->sectors;
4580 struct r10conf *conf = mddev->private;
4581 struct {
4582 struct r10bio r10_bio;
4583 struct r10dev devs[conf->copies];
4584 } on_stack;
4585 struct r10bio *r10b = &on_stack.r10_bio;
4586 int slot = 0;
4587 int idx = 0;
4588 struct page **pages;
4589
4590
4591 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4592
4593 r10b->sector = r10_bio->sector;
4594 __raid10_find_phys(&conf->prev, r10b);
4595
4596 while (sectors) {
4597 int s = sectors;
4598 int success = 0;
4599 int first_slot = slot;
4600
4601 if (s > (PAGE_SIZE >> 9))
4602 s = PAGE_SIZE >> 9;
4603
4604 rcu_read_lock();
4605 while (!success) {
4606 int d = r10b->devs[slot].devnum;
4607 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4608 sector_t addr;
4609 if (rdev == NULL ||
4610 test_bit(Faulty, &rdev->flags) ||
4611 !test_bit(In_sync, &rdev->flags))
4612 goto failed;
4613
4614 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4615 atomic_inc(&rdev->nr_pending);
4616 rcu_read_unlock();
4617 success = sync_page_io(rdev,
4618 addr,
4619 s << 9,
4620 pages[idx],
4621 REQ_OP_READ, 0, false);
4622 rdev_dec_pending(rdev, mddev);
4623 rcu_read_lock();
4624 if (success)
4625 break;
4626 failed:
4627 slot++;
4628 if (slot >= conf->copies)
4629 slot = 0;
4630 if (slot == first_slot)
4631 break;
4632 }
4633 rcu_read_unlock();
4634 if (!success) {
4635
4636 set_bit(MD_RECOVERY_INTR,
4637 &mddev->recovery);
4638 return -EIO;
4639 }
4640 sectors -= s;
4641 idx++;
4642 }
4643 return 0;
4644}
4645
4646static void end_reshape_write(struct bio *bio)
4647{
4648 struct r10bio *r10_bio = get_resync_r10bio(bio);
4649 struct mddev *mddev = r10_bio->mddev;
4650 struct r10conf *conf = mddev->private;
4651 int d;
4652 int slot;
4653 int repl;
4654 struct md_rdev *rdev = NULL;
4655
4656 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4657 if (repl)
4658 rdev = conf->mirrors[d].replacement;
4659 if (!rdev) {
4660 smp_mb();
4661 rdev = conf->mirrors[d].rdev;
4662 }
4663
4664 if (bio->bi_status) {
4665
4666 md_error(mddev, rdev);
4667 }
4668
4669 rdev_dec_pending(rdev, mddev);
4670 end_reshape_request(r10_bio);
4671}
4672
4673static void end_reshape_request(struct r10bio *r10_bio)
4674{
4675 if (!atomic_dec_and_test(&r10_bio->remaining))
4676 return;
4677 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4678 bio_put(r10_bio->master_bio);
4679 put_buf(r10_bio);
4680}
4681
4682static void raid10_finish_reshape(struct mddev *mddev)
4683{
4684 struct r10conf *conf = mddev->private;
4685
4686 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4687 return;
4688
4689 if (mddev->delta_disks > 0) {
4690 sector_t size = raid10_size(mddev, 0, 0);
4691 md_set_array_sectors(mddev, size);
4692 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4693 mddev->recovery_cp = mddev->resync_max_sectors;
4694 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4695 }
4696 mddev->resync_max_sectors = size;
4697 if (mddev->queue) {
4698 set_capacity(mddev->gendisk, mddev->array_sectors);
4699 revalidate_disk(mddev->gendisk);
4700 }
4701 } else {
4702 int d;
4703 rcu_read_lock();
4704 for (d = conf->geo.raid_disks ;
4705 d < conf->geo.raid_disks - mddev->delta_disks;
4706 d++) {
4707 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4708 if (rdev)
4709 clear_bit(In_sync, &rdev->flags);
4710 rdev = rcu_dereference(conf->mirrors[d].replacement);
4711 if (rdev)
4712 clear_bit(In_sync, &rdev->flags);
4713 }
4714 rcu_read_unlock();
4715 }
4716 mddev->layout = mddev->new_layout;
4717 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4718 mddev->reshape_position = MaxSector;
4719 mddev->delta_disks = 0;
4720 mddev->reshape_backwards = 0;
4721}
4722
4723static struct md_personality raid10_personality =
4724{
4725 .name = "raid10",
4726 .level = 10,
4727 .owner = THIS_MODULE,
4728 .make_request = raid10_make_request,
4729 .run = raid10_run,
4730 .free = raid10_free,
4731 .status = raid10_status,
4732 .error_handler = raid10_error,
4733 .hot_add_disk = raid10_add_disk,
4734 .hot_remove_disk= raid10_remove_disk,
4735 .spare_active = raid10_spare_active,
4736 .sync_request = raid10_sync_request,
4737 .quiesce = raid10_quiesce,
4738 .size = raid10_size,
4739 .resize = raid10_resize,
4740 .takeover = raid10_takeover,
4741 .check_reshape = raid10_check_reshape,
4742 .start_reshape = raid10_start_reshape,
4743 .finish_reshape = raid10_finish_reshape,
4744 .congested = raid10_congested,
4745};
4746
4747static int __init raid_init(void)
4748{
4749 return register_md_personality(&raid10_personality);
4750}
4751
4752static void raid_exit(void)
4753{
4754 unregister_md_personality(&raid10_personality);
4755}
4756
4757module_init(raid_init);
4758module_exit(raid_exit);
4759MODULE_LICENSE("GPL");
4760MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4761MODULE_ALIAS("md-personality-9");
4762MODULE_ALIAS("md-raid10");
4763MODULE_ALIAS("md-level-10");
4764
4765module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4766