/*
 * raid10.c : Multiple Devices driver for Linux -- RAID-10 personality for md.
 */

12#include <linux/slab.h>
13#include <linux/delay.h>
14#include <linux/blkdev.h>
15#include <linux/module.h>
16#include <linux/seq_file.h>
17#include <linux/ratelimit.h>
18#include <linux/kthread.h>
19#include <linux/raid/md_p.h>
20#include <trace/events/block.h>
21#include "md.h"
22#include "raid10.h"
23#include "raid0.h"
24#include "md-bitmap.h"

/*
 * RAID-10 combines striping (RAID-0) with mirroring (RAID-1).  The layout
 * is described by struct geom:
 *
 *   raid_disks   - number of member devices
 *   near_copies  - copies of each chunk stored at the same offset on
 *                  adjacent devices
 *   far_copies   - additional copies stored further into each device
 *                  (or, with far_offset set, in the next stripe), with
 *                  the device order rotated within each far set
 *   far_set_size - number of devices in each far rotation set
 *
 * near_copies * far_copies gives conf->copies, the total number of copies
 * of each chunk.  __raid10_find_phys() maps a logical array sector to the
 * (device, sector) pairs holding its copies; raid10_find_virt() is the
 * reverse mapping.
 */

67static void allow_barrier(struct r10conf *conf);
68static void lower_barrier(struct r10conf *conf);
69static int _enough(struct r10conf *conf, int previous, int ignore);
70static int enough(struct r10conf *conf, int ignore);
71static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
72 int *skipped);
73static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
74static void end_reshape_write(struct bio *bio);
75static void end_reshape(struct r10conf *conf);
76
77#define raid10_log(md, fmt, args...) \
78 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
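
/*
 * For example, raid10_log(mddev, "wait barrier") shows up as the message
 * "raid10 wait barrier" in the array's blktrace stream; the macro is a
 * no-op when the mddev has no request queue.
 */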
79
80#include "raid1-10.c"

/*
 * For a resync/recovery bio the owning r10bio is reached through the
 * per-bio 'struct resync_pages' stored in bi_private (see raid1-10.c).
 */
86static inline struct r10bio *get_resync_r10bio(struct bio *bio)
87{
88 return get_resync_pages(bio)->raid_bio;
89}
90
91static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
92{
93 struct r10conf *conf = data;
94 int size = offsetof(struct r10bio, devs[conf->copies]);

 /* allocate an r10bio with room for a devs[] slot per copy */
98 return kzalloc(size, gfp_flags);
99}
100
101#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
102
103#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
105#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
106#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
107#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
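
/*
 * With the 64KiB RESYNC_BLOCK_SIZE defined in raid1-10.c this works out
 * to roughly: RESYNC_SECTORS = 128 sectors per resync block, RESYNC_DEPTH
 * = 512 concurrent resync requests, and a 32MiB (65536-sector) resync
 * window for clustered arrays.
 */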

/*
 * When performing a resync we need to read and compare, so we need as many
 * pages as there are copies.
 * When performing a recovery, we need two bios, one for the read and one
 * for the write (we recover only one drive per r10buf), so two sets of
 * pages are enough.
 */
116static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
117{
118 struct r10conf *conf = data;
119 struct r10bio *r10_bio;
120 struct bio *bio;
121 int j;
122 int nalloc, nalloc_rp;
123 struct resync_pages *rps;
124
125 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
126 if (!r10_bio)
127 return NULL;
128
129 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
130 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
131 nalloc = conf->copies;
132 else
133 nalloc = 2;
134
135
136 if (!conf->have_replacement)
137 nalloc_rp = nalloc;
138 else
139 nalloc_rp = nalloc * 2;
140 rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
141 if (!rps)
142 goto out_free_r10bio;

 /*
 * Allocate the bios: one per copy, plus one more per copy if there
 * is a replacement device.
 */
147 for (j = nalloc ; j-- ; ) {
148 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
149 if (!bio)
150 goto out_free_bio;
151 r10_bio->devs[j].bio = bio;
152 if (!conf->have_replacement)
153 continue;
154 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
155 if (!bio)
156 goto out_free_bio;
157 r10_bio->devs[j].repl_bio = bio;
158 }

 /*
 * Allocate RESYNC_PAGES data pages and attach them.  Only the first
 * bio gets its own pages unless this is a resync (MD_RECOVERY_SYNC),
 * in which case every bio does; the remaining bios share the pages
 * of bio 0.
 */
163 for (j = 0; j < nalloc; j++) {
164 struct bio *rbio = r10_bio->devs[j].repl_bio;
165 struct resync_pages *rp, *rp_repl;
166
167 rp = &rps[j];
168 if (rbio)
169 rp_repl = &rps[nalloc + j];
170
171 bio = r10_bio->devs[j].bio;
172
173 if (!j || test_bit(MD_RECOVERY_SYNC,
174 &conf->mddev->recovery)) {
175 if (resync_alloc_pages(rp, gfp_flags))
176 goto out_free_pages;
177 } else {
178 memcpy(rp, &rps[0], sizeof(*rp));
179 resync_get_all_pages(rp);
180 }
181
182 rp->raid_bio = r10_bio;
183 bio->bi_private = rp;
184 if (rbio) {
185 memcpy(rp_repl, rp, sizeof(*rp));
186 rbio->bi_private = rp_repl;
187 }
188 }
189
190 return r10_bio;
191
192out_free_pages:
193 while (--j >= 0)
194 resync_free_pages(&rps[j]);
195
196 j = 0;
197out_free_bio:
198 for ( ; j < nalloc; j++) {
199 if (r10_bio->devs[j].bio)
200 bio_put(r10_bio->devs[j].bio);
201 if (r10_bio->devs[j].repl_bio)
202 bio_put(r10_bio->devs[j].repl_bio);
203 }
204 kfree(rps);
205out_free_r10bio:
206 rbio_pool_free(r10_bio, conf);
207 return NULL;
208}
209
210static void r10buf_pool_free(void *__r10_bio, void *data)
211{
212 struct r10conf *conf = data;
213 struct r10bio *r10bio = __r10_bio;
214 int j;
215 struct resync_pages *rp = NULL;
216
217 for (j = conf->copies; j--; ) {
218 struct bio *bio = r10bio->devs[j].bio;
219
220 if (bio) {
221 rp = get_resync_pages(bio);
222 resync_free_pages(rp);
223 bio_put(bio);
224 }
225
226 bio = r10bio->devs[j].repl_bio;
227 if (bio)
228 bio_put(bio);
229 }

 /*
 * rp now points at the first bio's resync_pages, i.e. the start of the
 * array that r10buf_pool_alloc() allocated, so one kfree() releases it.
 */
232 kfree(rp);
233
234 rbio_pool_free(r10bio, conf);
235}
236
237static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
238{
239 int i;
240
241 for (i = 0; i < conf->copies; i++) {
242 struct bio **bio = & r10_bio->devs[i].bio;
243 if (!BIO_SPECIAL(*bio))
244 bio_put(*bio);
245 *bio = NULL;
246 bio = &r10_bio->devs[i].repl_bio;
247 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
248 bio_put(*bio);
249 *bio = NULL;
250 }
251}
252
253static void free_r10bio(struct r10bio *r10_bio)
254{
255 struct r10conf *conf = r10_bio->mddev->private;
256
257 put_all_bios(conf, r10_bio);
258 mempool_free(r10_bio, &conf->r10bio_pool);
259}
260
261static void put_buf(struct r10bio *r10_bio)
262{
263 struct r10conf *conf = r10_bio->mddev->private;
264
265 mempool_free(r10_bio, &conf->r10buf_pool);
266
267 lower_barrier(conf);
268}
269
270static void reschedule_retry(struct r10bio *r10_bio)
271{
272 unsigned long flags;
273 struct mddev *mddev = r10_bio->mddev;
274 struct r10conf *conf = mddev->private;
275
276 spin_lock_irqsave(&conf->device_lock, flags);
277 list_add(&r10_bio->retry_list, &conf->retry_list);
278 conf->nr_queued ++;
279 spin_unlock_irqrestore(&conf->device_lock, flags);
280
281
282 wake_up(&conf->wait_barrier);
283
284 md_wakeup_thread(mddev->thread);
285}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure status to the
 * owner of the original bio.
 */
292static void raid_end_bio_io(struct r10bio *r10_bio)
293{
294 struct bio *bio = r10_bio->master_bio;
295 struct r10conf *conf = r10_bio->mddev->private;
296
297 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
298 bio->bi_status = BLK_STS_IOERR;
299
300 bio_endio(bio);

 /*
 * Wake up any possible resync thread that waits for the device
 * to go idle.
 */
305 allow_barrier(conf);
306
307 free_r10bio(r10_bio);
308}

/*
 * Update the disk head-position estimator from IRQ completion info.
 */
313static inline void update_head_pos(int slot, struct r10bio *r10_bio)
314{
315 struct r10conf *conf = r10_bio->mddev->private;
316
317 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
318 r10_bio->devs[slot].addr + (r10_bio->sectors);
319}

/*
 * Find the slot in r10_bio->devs[] to which this bio belongs and return
 * the corresponding device number.
 */
324static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
325 struct bio *bio, int *slotp, int *replp)
326{
327 int slot;
328 int repl = 0;
329
330 for (slot = 0; slot < conf->copies; slot++) {
331 if (r10_bio->devs[slot].bio == bio)
332 break;
333 if (r10_bio->devs[slot].repl_bio == bio) {
334 repl = 1;
335 break;
336 }
337 }
338
339 BUG_ON(slot == conf->copies);
340 update_head_pos(slot, r10_bio);
341
342 if (slotp)
343 *slotp = slot;
344 if (replp)
345 *replp = repl;
346 return r10_bio->devs[slot].devnum;
347}
348
349static void raid10_end_read_request(struct bio *bio)
350{
351 int uptodate = !bio->bi_status;
352 struct r10bio *r10_bio = bio->bi_private;
353 int slot;
354 struct md_rdev *rdev;
355 struct r10conf *conf = r10_bio->mddev->private;
356
357 slot = r10_bio->read_slot;
358 rdev = r10_bio->devs[slot].rdev;
359
360
361
362 update_head_pos(slot, r10_bio);
363
364 if (uptodate) {
 /*
 * Set R10BIO_Uptodate in our master bio, so that we will return a
 * good status to the higher levels even if IO on some other
 * mirrored buffer fails.  The 'master' bio represents the composite
 * operation as seen by the caller.
 */
374 set_bit(R10BIO_Uptodate, &r10_bio->state);
375 } else {
 /*
 * If all other devices that store this block have failed, we want
 * to return the error upwards rather than fail the last device.
 * Here we redefine "uptodate" to mean "don't want to retry".
 */
381 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
382 rdev->raid_disk))
383 uptodate = 1;
384 }
385 if (uptodate) {
386 raid_end_bio_io(r10_bio);
387 rdev_dec_pending(rdev, conf->mddev);
388 } else {
389
390
391
392 char b[BDEVNAME_SIZE];
393 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
394 mdname(conf->mddev),
395 bdevname(rdev->bdev, b),
396 (unsigned long long)r10_bio->sector);
397 set_bit(R10BIO_ReadError, &r10_bio->state);
398 reschedule_retry(r10_bio);
399 }
400}
401
402static void close_write(struct r10bio *r10_bio)
403{
404
405 md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
406 r10_bio->sectors,
407 !test_bit(R10BIO_Degraded, &r10_bio->state),
408 0);
409 md_write_end(r10_bio->mddev);
410}
411
412static void one_write_done(struct r10bio *r10_bio)
413{
414 if (atomic_dec_and_test(&r10_bio->remaining)) {
415 if (test_bit(R10BIO_WriteError, &r10_bio->state))
416 reschedule_retry(r10_bio);
417 else {
418 close_write(r10_bio);
419 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
420 reschedule_retry(r10_bio);
421 else
422 raid_end_bio_io(r10_bio);
423 }
424 }
425}
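
/*
 * Rough summary of the write completion path above: every sub-write ends
 * in raid10_end_write_request(), which calls one_write_done().  When the
 * last copy finishes, either the write is clean (close_write() updates
 * the bitmap and the master bio is completed), or the r10bio is handed to
 * the raid10d thread via reschedule_retry() so it can record bad blocks
 * or retry/narrow the failed writes.
 */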
426
427static void raid10_end_write_request(struct bio *bio)
428{
429 struct r10bio *r10_bio = bio->bi_private;
430 int dev;
431 int dec_rdev = 1;
432 struct r10conf *conf = r10_bio->mddev->private;
433 int slot, repl;
434 struct md_rdev *rdev = NULL;
435 struct bio *to_put = NULL;
436 bool discard_error;
437
438 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
439
440 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
441
442 if (repl)
443 rdev = conf->mirrors[dev].replacement;
444 if (!rdev) {
445 smp_rmb();
446 repl = 0;
447 rdev = conf->mirrors[dev].rdev;
448 }
449
450
451
452 if (bio->bi_status && !discard_error) {
453 if (repl)
454
455
456
457 md_error(rdev->mddev, rdev);
458 else {
459 set_bit(WriteErrorSeen, &rdev->flags);
460 if (!test_and_set_bit(WantReplacement, &rdev->flags))
461 set_bit(MD_RECOVERY_NEEDED,
462 &rdev->mddev->recovery);
463
464 dec_rdev = 0;
465 if (test_bit(FailFast, &rdev->flags) &&
466 (bio->bi_opf & MD_FAILFAST)) {
467 md_error(rdev->mddev, rdev);
468 }
469
470
471
472
473
474
475
476 if (!test_bit(Faulty, &rdev->flags))
477 set_bit(R10BIO_WriteError, &r10_bio->state);
478 else {
479 r10_bio->devs[slot].bio = NULL;
480 to_put = bio;
481 dec_rdev = 1;
482 }
483 }
484 } else {
 /*
 * Set R10BIO_Uptodate in our master bio, so that we will return a
 * good status to the higher levels even if IO on some other
 * mirrored buffer fails.  A write is only failed once every copy
 * of the block has failed.
 */
494 sector_t first_bad;
495 int bad_sectors;

 /*
 * Do not set R10BIO_Uptodate if the current device is rebuilding
 * or Faulty: even though this particular write succeeded, such a
 * device cannot be trusted to return valid data for the whole
 * range on a later read.
 */
505 if (test_bit(In_sync, &rdev->flags) &&
506 !test_bit(Faulty, &rdev->flags))
507 set_bit(R10BIO_Uptodate, &r10_bio->state);
508
509
510 if (is_badblock(rdev,
511 r10_bio->devs[slot].addr,
512 r10_bio->sectors,
513 &first_bad, &bad_sectors) && !discard_error) {
514 bio_put(bio);
515 if (repl)
516 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
517 else
518 r10_bio->devs[slot].bio = IO_MADE_GOOD;
519 dec_rdev = 0;
520 set_bit(R10BIO_MadeGood, &r10_bio->state);
521 }
522 }
523
524
525
526
527
528
529 one_write_done(r10_bio);
530 if (dec_rdev)
531 rdev_dec_pending(rdev, conf->mddev);
532 if (to_put)
533 bio_put(to_put);
534}

/*
 * RAID-10 layout manager.
 *
 * Besides the chunk size and raid_disks count there are two layout
 * parameters: near_copies and far_copies; their product must not exceed
 * raid_disks.
 *
 *  - "near" copies of a chunk are stored at the same device offset on
 *    near_copies adjacent devices.
 *  - "far" copies are stored far_copies times in total; each additional
 *    copy lives 'stride' sectors further into the devices (or, when
 *    far_offset is set, in the following stripe) with the device order
 *    rotated within each far set of far_set_size devices.
 *
 * __raid10_find_phys() computes, for one r10bio, the device number and
 * device sector of every copy of the addressed chunk.
 */
561static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
562{
563 int n,f;
564 sector_t sector;
565 sector_t chunk;
566 sector_t stripe;
567 int dev;
568 int slot = 0;
569 int last_far_set_start, last_far_set_size;
570
571 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
572 last_far_set_start *= geo->far_set_size;
573
574 last_far_set_size = geo->far_set_size;
575 last_far_set_size += (geo->raid_disks % geo->far_set_size);
576
577
578 chunk = r10bio->sector >> geo->chunk_shift;
579 sector = r10bio->sector & geo->chunk_mask;
580
581 chunk *= geo->near_copies;
582 stripe = chunk;
583 dev = sector_div(stripe, geo->raid_disks);
584 if (geo->far_offset)
585 stripe *= geo->far_copies;
586
587 sector += stripe << geo->chunk_shift;
588
589
590 for (n = 0; n < geo->near_copies; n++) {
591 int d = dev;
592 int set;
593 sector_t s = sector;
594 r10bio->devs[slot].devnum = d;
595 r10bio->devs[slot].addr = s;
596 slot++;
597
598 for (f = 1; f < geo->far_copies; f++) {
599 set = d / geo->far_set_size;
600 d += geo->near_copies;
601
602 if ((geo->raid_disks % geo->far_set_size) &&
603 (d > last_far_set_start)) {
604 d -= last_far_set_start;
605 d %= last_far_set_size;
606 d += last_far_set_start;
607 } else {
608 d %= geo->far_set_size;
609 d += geo->far_set_size * set;
610 }
611 s += geo->stride;
612 r10bio->devs[slot].devnum = d;
613 r10bio->devs[slot].addr = s;
614 slot++;
615 }
616 dev++;
617 if (dev >= geo->raid_disks) {
618 dev = 0;
619 sector += (geo->chunk_mask + 1);
620 }
621 }
622}
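
/*
 * A worked example with a hypothetical geometry (illustrative values
 * only): raid_disks = 4, near_copies = 2, far_copies = 1 and a 128-sector
 * chunk (chunk_shift = 7):
 *
 *   logical chunk 0 -> devices 0 and 1 at device sector 0
 *   logical chunk 1 -> devices 2 and 3 at device sector 0
 *   logical chunk 2 -> devices 0 and 1 at device sector 128
 *
 * With near_copies = 1 and far_copies = 2 instead, chunk 0 lands on
 * device 0 at sector 0 and on device 1 at sector 'stride'; chunk 1 lands
 * on device 1 at sector 0 and on device 2 at sector 'stride'.
 */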
623
624static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
625{
626 struct geom *geo = &conf->geo;
627
628 if (conf->reshape_progress != MaxSector &&
629 ((r10bio->sector >= conf->reshape_progress) !=
630 conf->mddev->reshape_backwards)) {
631 set_bit(R10BIO_Previous, &r10bio->state);
632 geo = &conf->prev;
633 } else
634 clear_bit(R10BIO_Previous, &r10bio->state);
635
636 __raid10_find_phys(geo, r10bio);
637}
638
639static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
640{
641 sector_t offset, chunk, vchunk;
642
643
644
645 struct geom *geo = &conf->geo;
646 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
647 int far_set_size = geo->far_set_size;
648 int last_far_set_start;
649
650 if (geo->raid_disks % geo->far_set_size) {
651 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
652 last_far_set_start *= geo->far_set_size;
653
654 if (dev >= last_far_set_start) {
655 far_set_size = geo->far_set_size;
656 far_set_size += (geo->raid_disks % geo->far_set_size);
657 far_set_start = last_far_set_start;
658 }
659 }
660
661 offset = sector & geo->chunk_mask;
662 if (geo->far_offset) {
663 int fc;
664 chunk = sector >> geo->chunk_shift;
665 fc = sector_div(chunk, geo->far_copies);
666 dev -= fc * geo->near_copies;
667 if (dev < far_set_start)
668 dev += far_set_size;
669 } else {
670 while (sector >= geo->stride) {
671 sector -= geo->stride;
672 if (dev < (geo->near_copies + far_set_start))
673 dev += far_set_size - geo->near_copies;
674 else
675 dev -= geo->near_copies;
676 }
677 chunk = sector >> geo->chunk_shift;
678 }
679 vchunk = chunk * geo->raid_disks + dev;
680 sector_div(vchunk, geo->near_copies);
681 return (vchunk << geo->chunk_shift) + offset;
682}

/*
 * This routine returns the disk from which the requested read should be
 * done.  A per-disk 'last known head position' sector is maintained from
 * IRQ context by both the normal and the resync IO completion handlers.
 * If there is no perfect sequential match we pick the disk whose head is
 * closest; for non-rotational devices we prefer the disk with the fewest
 * pending requests instead.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
703static struct md_rdev *read_balance(struct r10conf *conf,
704 struct r10bio *r10_bio,
705 int *max_sectors)
706{
707 const sector_t this_sector = r10_bio->sector;
708 int disk, slot;
709 int sectors = r10_bio->sectors;
710 int best_good_sectors;
711 sector_t new_distance, best_dist;
712 struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
713 int do_balance;
714 int best_dist_slot, best_pending_slot;
715 bool has_nonrot_disk = false;
716 unsigned int min_pending;
717 struct geom *geo = &conf->geo;
718
719 raid10_find_phys(conf, r10_bio);
720 rcu_read_lock();
721 best_dist_slot = -1;
722 min_pending = UINT_MAX;
723 best_dist_rdev = NULL;
724 best_pending_rdev = NULL;
725 best_dist = MaxSector;
726 best_good_sectors = 0;
727 do_balance = 1;
728 clear_bit(R10BIO_FailFast, &r10_bio->state);

 /*
 * Check if we can balance.  We can balance on the whole device if no
 * resync is going on (recovery is ok), or below the resync window.
 * We take the first readable disk when above the resync window.
 */
735 if ((conf->mddev->recovery_cp < MaxSector
736 && (this_sector + sectors >= conf->next_resync)) ||
737 (mddev_is_clustered(conf->mddev) &&
738 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
739 this_sector + sectors)))
740 do_balance = 0;
741
742 for (slot = 0; slot < conf->copies ; slot++) {
743 sector_t first_bad;
744 int bad_sectors;
745 sector_t dev_sector;
746 unsigned int pending;
747 bool nonrot;
748
749 if (r10_bio->devs[slot].bio == IO_BLOCKED)
750 continue;
751 disk = r10_bio->devs[slot].devnum;
752 rdev = rcu_dereference(conf->mirrors[disk].replacement);
753 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
754 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
755 rdev = rcu_dereference(conf->mirrors[disk].rdev);
756 if (rdev == NULL ||
757 test_bit(Faulty, &rdev->flags))
758 continue;
759 if (!test_bit(In_sync, &rdev->flags) &&
760 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
761 continue;
762
763 dev_sector = r10_bio->devs[slot].addr;
764 if (is_badblock(rdev, dev_sector, sectors,
765 &first_bad, &bad_sectors)) {
766 if (best_dist < MaxSector)
767
768 continue;
769 if (first_bad <= dev_sector) {
770
771
772
773
774 bad_sectors -= (dev_sector - first_bad);
775 if (!do_balance && sectors > bad_sectors)
776 sectors = bad_sectors;
777 if (best_good_sectors > sectors)
778 best_good_sectors = sectors;
779 } else {
780 sector_t good_sectors =
781 first_bad - dev_sector;
782 if (good_sectors > best_good_sectors) {
783 best_good_sectors = good_sectors;
784 best_dist_slot = slot;
785 best_dist_rdev = rdev;
786 }
787 if (!do_balance)
788
789 break;
790 }
791 continue;
792 } else
793 best_good_sectors = sectors;
794
795 if (!do_balance)
796 break;
797
798 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
799 has_nonrot_disk |= nonrot;
800 pending = atomic_read(&rdev->nr_pending);
801 if (min_pending > pending && nonrot) {
802 min_pending = pending;
803 best_pending_slot = slot;
804 best_pending_rdev = rdev;
805 }
806
807 if (best_dist_slot >= 0)
808
809 set_bit(R10BIO_FailFast, &r10_bio->state);
810
811
812
813
814 if (geo->near_copies > 1 && !pending)
815 new_distance = 0;
816
817
818 else if (geo->far_copies > 1)
819 new_distance = r10_bio->devs[slot].addr;
820 else
821 new_distance = abs(r10_bio->devs[slot].addr -
822 conf->mirrors[disk].head_position);
823
824 if (new_distance < best_dist) {
825 best_dist = new_distance;
826 best_dist_slot = slot;
827 best_dist_rdev = rdev;
828 }
829 }
830 if (slot >= conf->copies) {
831 if (has_nonrot_disk) {
832 slot = best_pending_slot;
833 rdev = best_pending_rdev;
834 } else {
835 slot = best_dist_slot;
836 rdev = best_dist_rdev;
837 }
838 }
839
840 if (slot >= 0) {
841 atomic_inc(&rdev->nr_pending);
842 r10_bio->read_slot = slot;
843 } else
844 rdev = NULL;
845 rcu_read_unlock();
846 *max_sectors = best_good_sectors;
847
848 return rdev;
849}
850
851static void flush_pending_writes(struct r10conf *conf)
852{
 /*
 * Any writes that have been queued but are awaiting bitmap updates
 * get flushed here.
 */
856 spin_lock_irq(&conf->device_lock);
857
858 if (conf->pending_bio_list.head) {
859 struct blk_plug plug;
860 struct bio *bio;
861
862 bio = bio_list_get(&conf->pending_bio_list);
863 conf->pending_count = 0;
864 spin_unlock_irq(&conf->device_lock);

 /*
 * As this is called in a wait_event() loop (see freeze_array),
 * current->state might be TASK_UNINTERRUPTIBLE which will cause a
 * warning when we prepare to wait again.  As it is rare that this
 * path is taken, it is perfectly safe to force us to go around the
 * wait_event() loop again, so the warning is a false-positive.
 * Silence the warning by resetting thread state.
 */
875 __set_current_state(TASK_RUNNING);
876
877 blk_start_plug(&plug);
878
879
880 md_bitmap_unplug(conf->mddev->bitmap);
881 wake_up(&conf->wait_barrier);
882
883 while (bio) {
884 struct bio *next = bio->bi_next;
885 struct md_rdev *rdev = (void*)bio->bi_disk;
886 bio->bi_next = NULL;
887 bio_set_dev(bio, rdev->bdev);
888 if (test_bit(Faulty, &rdev->flags)) {
889 bio_io_error(bio);
890 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
891 !blk_queue_discard(bio->bi_disk->queue)))
892
893 bio_endio(bio);
894 else
895 submit_bio_noacct(bio);
896 bio = next;
897 }
898 blk_finish_plug(&plug);
899 } else
900 spin_unlock_irq(&conf->device_lock);
901}

/*
 * Barriers....
 * Sometimes we need to suspend IO while we do something else, either some
 * resync/recovery or reshape.  To do this we raise a 'barrier'.
 *
 * The 'barrier' is a counter that can be raised multiple times to count
 * how many activities are happening which preclude normal IO.
 * We can only raise the barrier if there is no pending IO, i.e. if
 * nr_pending == 0.  We choose only to raise the barrier if no-one is
 * waiting for the barrier to go down.  This means that as soon as an IO
 * request is ready, no other operations which require a barrier will
 * start until the IO request has had a chance.
 *
 * So: regular IO calls wait_barrier().  When that returns there is no
 *   background IO happening.  It must arrange to call allow_barrier()
 *   when it has finished its IO.
 * Background IO calls must call raise_barrier().  Once that returns
 *   there is no normal IO happening.  It must arrange to call
 *   lower_barrier() when the particular background IO completes.
 */
925static void raise_barrier(struct r10conf *conf, int force)
926{
927 BUG_ON(force && !conf->barrier);
928 spin_lock_irq(&conf->resync_lock);
929
930
931 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
932 conf->resync_lock);
933
934
935 conf->barrier++;
936
937
938 wait_event_lock_irq(conf->wait_barrier,
939 !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
940 conf->resync_lock);
941
942 spin_unlock_irq(&conf->resync_lock);
943}
944
945static void lower_barrier(struct r10conf *conf)
946{
947 unsigned long flags;
948 spin_lock_irqsave(&conf->resync_lock, flags);
949 conf->barrier--;
950 spin_unlock_irqrestore(&conf->resync_lock, flags);
951 wake_up(&conf->wait_barrier);
952}
953
954static void wait_barrier(struct r10conf *conf)
955{
956 spin_lock_irq(&conf->resync_lock);
957 if (conf->barrier) {
958 struct bio_list *bio_list = current->bio_list;
959 conf->nr_waiting++;
 /*
 * Wait for the barrier to drop.  However, if there are already
 * pending requests (preventing the barrier from rising completely),
 * and the pre-process bio queue isn't empty, then don't wait, as we
 * need to empty that queue to get the nr_pending count down.
 */
969 raid10_log(conf->mddev, "wait barrier");
970 wait_event_lock_irq(conf->wait_barrier,
971 !conf->barrier ||
972 (atomic_read(&conf->nr_pending) &&
973 bio_list &&
974 (!bio_list_empty(&bio_list[0]) ||
975 !bio_list_empty(&bio_list[1]))) ||
976
977
978
979 (conf->mddev->thread->tsk == current &&
980 test_bit(MD_RECOVERY_RUNNING,
981 &conf->mddev->recovery) &&
982 conf->nr_queued > 0),
983 conf->resync_lock);
984 conf->nr_waiting--;
985 if (!conf->nr_waiting)
986 wake_up(&conf->wait_barrier);
987 }
988 atomic_inc(&conf->nr_pending);
989 spin_unlock_irq(&conf->resync_lock);
990}
991
992static void allow_barrier(struct r10conf *conf)
993{
994 if ((atomic_dec_and_test(&conf->nr_pending)) ||
995 (conf->array_freeze_pending))
996 wake_up(&conf->wait_barrier);
997}
998
999static void freeze_array(struct r10conf *conf, int extra)
1000{
 /*
 * Stop sync IO and normal IO and wait for everything to go quiet.
 * We increment barrier and nr_waiting, and then wait until nr_pending
 * matches nr_queued+extra.  This is called in the context of one
 * normal IO request that has failed, so any sync request that might
 * be pending was issued before this request: the number queued
 * (nr_queued) plus this request (extra) must therefore match the
 * number of pending IOs (nr_pending) before we continue.
 */
1013 spin_lock_irq(&conf->resync_lock);
1014 conf->array_freeze_pending++;
1015 conf->barrier++;
1016 conf->nr_waiting++;
1017 wait_event_lock_irq_cmd(conf->wait_barrier,
1018 atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
1019 conf->resync_lock,
1020 flush_pending_writes(conf));
1021
1022 conf->array_freeze_pending--;
1023 spin_unlock_irq(&conf->resync_lock);
1024}
1025
1026static void unfreeze_array(struct r10conf *conf)
1027{
1028
1029 spin_lock_irq(&conf->resync_lock);
1030 conf->barrier--;
1031 conf->nr_waiting--;
1032 wake_up(&conf->wait_barrier);
1033 spin_unlock_irq(&conf->resync_lock);
1034}
1035
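
/*
 * While a reshape is in progress an r10bio mapped with the previous
 * geometry (R10BIO_Previous set by raid10_find_phys()) keeps using the
 * old data_offset of the device, while bios mapped with the new geometry
 * use new_data_offset; outside of a reshape, data_offset is used.
 */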
1036static sector_t choose_data_offset(struct r10bio *r10_bio,
1037 struct md_rdev *rdev)
1038{
1039 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1040 test_bit(R10BIO_Previous, &r10_bio->state))
1041 return rdev->data_offset;
1042 else
1043 return rdev->new_data_offset;
1044}
1045
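
/*
 * Writes are not submitted directly: raid10_write_one_disk() adds them to
 * a per-task raid10_plug_cb (via blk_check_plugged) when possible, and
 * raid10_unplug() later submits them after flushing the bitmap.  If the
 * unplug happens from schedule(), or no plug is active, the bios are
 * instead queued on conf->pending_bio_list for raid10d to submit through
 * flush_pending_writes().
 */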
1046struct raid10_plug_cb {
1047 struct blk_plug_cb cb;
1048 struct bio_list pending;
1049 int pending_cnt;
1050};
1051
1052static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1053{
1054 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1055 cb);
1056 struct mddev *mddev = plug->cb.data;
1057 struct r10conf *conf = mddev->private;
1058 struct bio *bio;
1059
1060 if (from_schedule || current->bio_list) {
1061 spin_lock_irq(&conf->device_lock);
1062 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1063 conf->pending_count += plug->pending_cnt;
1064 spin_unlock_irq(&conf->device_lock);
1065 wake_up(&conf->wait_barrier);
1066 md_wakeup_thread(mddev->thread);
1067 kfree(plug);
1068 return;
1069 }
1070
1071
1072 bio = bio_list_get(&plug->pending);
1073 md_bitmap_unplug(mddev->bitmap);
1074 wake_up(&conf->wait_barrier);
1075
1076 while (bio) {
1077 struct bio *next = bio->bi_next;
1078 struct md_rdev *rdev = (void*)bio->bi_disk;
1079 bio->bi_next = NULL;
1080 bio_set_dev(bio, rdev->bdev);
1081 if (test_bit(Faulty, &rdev->flags)) {
1082 bio_io_error(bio);
1083 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1084 !blk_queue_discard(bio->bi_disk->queue)))
1085
1086 bio_endio(bio);
1087 else
1088 submit_bio_noacct(bio);
1089 bio = next;
1090 }
1091 kfree(plug);
1092}

/*
 * Register the new request and wait if the reconstruction thread has put
 * up a barrier for new requests.  Continue immediately if no resync is
 * active.  If the request spans the reshape position, also wait for the
 * reshape to move past it.
 */
1100static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
1101 struct bio *bio, sector_t sectors)
1102{
1103 wait_barrier(conf);
1104 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1105 bio->bi_iter.bi_sector < conf->reshape_progress &&
1106 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1107 raid10_log(conf->mddev, "wait reshape");
1108 allow_barrier(conf);
1109 wait_event(conf->wait_barrier,
1110 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1111 conf->reshape_progress >= bio->bi_iter.bi_sector +
1112 sectors);
1113 wait_barrier(conf);
1114 }
1115}
1116
1117static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1118 struct r10bio *r10_bio)
1119{
1120 struct r10conf *conf = mddev->private;
1121 struct bio *read_bio;
1122 const int op = bio_op(bio);
1123 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1124 int max_sectors;
1125 struct md_rdev *rdev;
1126 char b[BDEVNAME_SIZE];
1127 int slot = r10_bio->read_slot;
1128 struct md_rdev *err_rdev = NULL;
1129 gfp_t gfp = GFP_NOIO;
1130
1131 if (r10_bio->devs[slot].rdev) {
 /*
 * This is an error retry, but we cannot safely dereference the
 * rdev in the r10_bio, we must use the one in conf.  If it has
 * already been disconnected (unlikely) we lose the device name
 * in error messages.
 */
1139 int disk;
 /*
 * As we are blocking raid10, it is a little safer to use
 * __GFP_HIGH.
 */
1144 gfp = GFP_NOIO | __GFP_HIGH;
1145
1146 rcu_read_lock();
1147 disk = r10_bio->devs[slot].devnum;
1148 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1149 if (err_rdev)
1150 bdevname(err_rdev->bdev, b);
1151 else {
1152 strcpy(b, "???");
1153
1154 err_rdev = r10_bio->devs[slot].rdev;
1155 }
1156 rcu_read_unlock();
1157 }
1158
1159 regular_request_wait(mddev, conf, bio, r10_bio->sectors);
1160 rdev = read_balance(conf, r10_bio, &max_sectors);
1161 if (!rdev) {
1162 if (err_rdev) {
1163 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1164 mdname(mddev), b,
1165 (unsigned long long)r10_bio->sector);
1166 }
1167 raid_end_bio_io(r10_bio);
1168 return;
1169 }
1170 if (err_rdev)
1171 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1172 mdname(mddev),
1173 bdevname(rdev->bdev, b),
1174 (unsigned long long)r10_bio->sector);
1175 if (max_sectors < bio_sectors(bio)) {
1176 struct bio *split = bio_split(bio, max_sectors,
1177 gfp, &conf->bio_split);
1178 bio_chain(split, bio);
1179 allow_barrier(conf);
1180 submit_bio_noacct(bio);
1181 wait_barrier(conf);
1182 bio = split;
1183 r10_bio->master_bio = bio;
1184 r10_bio->sectors = max_sectors;
1185 }
1186 slot = r10_bio->read_slot;
1187
1188 read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
1189
1190 r10_bio->devs[slot].bio = read_bio;
1191 r10_bio->devs[slot].rdev = rdev;
1192
1193 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1194 choose_data_offset(r10_bio, rdev);
1195 bio_set_dev(read_bio, rdev->bdev);
1196 read_bio->bi_end_io = raid10_end_read_request;
1197 bio_set_op_attrs(read_bio, op, do_sync);
1198 if (test_bit(FailFast, &rdev->flags) &&
1199 test_bit(R10BIO_FailFast, &r10_bio->state))
1200 read_bio->bi_opf |= MD_FAILFAST;
1201 read_bio->bi_private = r10_bio;
1202
1203 if (mddev->gendisk)
1204 trace_block_bio_remap(read_bio->bi_disk->queue,
1205 read_bio, disk_devt(mddev->gendisk),
1206 r10_bio->sector);
1207 submit_bio_noacct(read_bio);
1208 return;
1209}
1210
1211static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1212 struct bio *bio, bool replacement,
1213 int n_copy)
1214{
1215 const int op = bio_op(bio);
1216 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1217 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1218 unsigned long flags;
1219 struct blk_plug_cb *cb;
1220 struct raid10_plug_cb *plug = NULL;
1221 struct r10conf *conf = mddev->private;
1222 struct md_rdev *rdev;
1223 int devnum = r10_bio->devs[n_copy].devnum;
1224 struct bio *mbio;
1225
1226 if (replacement) {
1227 rdev = conf->mirrors[devnum].replacement;
1228 if (rdev == NULL) {
1229
1230 smp_mb();
1231 rdev = conf->mirrors[devnum].rdev;
1232 }
1233 } else
1234 rdev = conf->mirrors[devnum].rdev;
1235
1236 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1237 if (replacement)
1238 r10_bio->devs[n_copy].repl_bio = mbio;
1239 else
1240 r10_bio->devs[n_copy].bio = mbio;
1241
1242 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1243 choose_data_offset(r10_bio, rdev));
1244 bio_set_dev(mbio, rdev->bdev);
1245 mbio->bi_end_io = raid10_end_write_request;
1246 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1247 if (!replacement && test_bit(FailFast,
1248 &conf->mirrors[devnum].rdev->flags)
1249 && enough(conf, devnum))
1250 mbio->bi_opf |= MD_FAILFAST;
1251 mbio->bi_private = r10_bio;
1252
1253 if (conf->mddev->gendisk)
1254 trace_block_bio_remap(mbio->bi_disk->queue,
1255 mbio, disk_devt(conf->mddev->gendisk),
1256 r10_bio->sector);
1257
1258 mbio->bi_disk = (void *)rdev;
1259
1260 atomic_inc(&r10_bio->remaining);
1261
1262 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1263 if (cb)
1264 plug = container_of(cb, struct raid10_plug_cb, cb);
1265 else
1266 plug = NULL;
1267 if (plug) {
1268 bio_list_add(&plug->pending, mbio);
1269 plug->pending_cnt++;
1270 } else {
1271 spin_lock_irqsave(&conf->device_lock, flags);
1272 bio_list_add(&conf->pending_bio_list, mbio);
1273 conf->pending_count++;
1274 spin_unlock_irqrestore(&conf->device_lock, flags);
1275 md_wakeup_thread(mddev->thread);
1276 }
1277}
1278
1279static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1280 struct r10bio *r10_bio)
1281{
1282 struct r10conf *conf = mddev->private;
1283 int i;
1284 struct md_rdev *blocked_rdev;
1285 sector_t sectors;
1286 int max_sectors;
1287
1288 if ((mddev_is_clustered(mddev) &&
1289 md_cluster_ops->area_resyncing(mddev, WRITE,
1290 bio->bi_iter.bi_sector,
1291 bio_end_sector(bio)))) {
1292 DEFINE_WAIT(w);
1293 for (;;) {
1294 prepare_to_wait(&conf->wait_barrier,
1295 &w, TASK_IDLE);
1296 if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1297 bio->bi_iter.bi_sector, bio_end_sector(bio)))
1298 break;
1299 schedule();
1300 }
1301 finish_wait(&conf->wait_barrier, &w);
1302 }
1303
1304 sectors = r10_bio->sectors;
1305 regular_request_wait(mddev, conf, bio, sectors);
1306 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1307 (mddev->reshape_backwards
1308 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1309 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1310 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1311 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1312
1313 mddev->reshape_position = conf->reshape_progress;
1314 set_mask_bits(&mddev->sb_flags, 0,
1315 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1316 md_wakeup_thread(mddev->thread);
1317 raid10_log(conf->mddev, "wait reshape metadata");
1318 wait_event(mddev->sb_wait,
1319 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1320
1321 conf->reshape_safe = mddev->reshape_position;
1322 }
1323
1324 if (conf->pending_count >= max_queued_requests) {
1325 md_wakeup_thread(mddev->thread);
1326 raid10_log(mddev, "wait queued");
1327 wait_event(conf->wait_barrier,
1328 conf->pending_count < max_queued_requests);
1329 }

 /*
 * First select target devices under rcu_lock and increment the
 * refcount on their rdev, recording them by setting bios[x] to bio.
 * If there are known/acknowledged bad blocks on any device on which
 * we have seen a write error, we want to avoid writing to those
 * blocks.  This potentially requires several writes to write around
 * the bad blocks.  Each set of writes gets its own r10_bio with a
 * set of bios attached.
 */
1340 r10_bio->read_slot = -1;
1341 raid10_find_phys(conf, r10_bio);
1342retry_write:
1343 blocked_rdev = NULL;
1344 rcu_read_lock();
1345 max_sectors = r10_bio->sectors;
1346
1347 for (i = 0; i < conf->copies; i++) {
1348 int d = r10_bio->devs[i].devnum;
1349 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1350 struct md_rdev *rrdev = rcu_dereference(
1351 conf->mirrors[d].replacement);
1352 if (rdev == rrdev)
1353 rrdev = NULL;
1354 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1355 atomic_inc(&rdev->nr_pending);
1356 blocked_rdev = rdev;
1357 break;
1358 }
1359 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1360 atomic_inc(&rrdev->nr_pending);
1361 blocked_rdev = rrdev;
1362 break;
1363 }
1364 if (rdev && (test_bit(Faulty, &rdev->flags)))
1365 rdev = NULL;
1366 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1367 rrdev = NULL;
1368
1369 r10_bio->devs[i].bio = NULL;
1370 r10_bio->devs[i].repl_bio = NULL;
1371
1372 if (!rdev && !rrdev) {
1373 set_bit(R10BIO_Degraded, &r10_bio->state);
1374 continue;
1375 }
1376 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1377 sector_t first_bad;
1378 sector_t dev_sector = r10_bio->devs[i].addr;
1379 int bad_sectors;
1380 int is_bad;
1381
1382 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1383 &first_bad, &bad_sectors);
1384 if (is_bad < 0) {
1385
1386
1387
1388 atomic_inc(&rdev->nr_pending);
1389 set_bit(BlockedBadBlocks, &rdev->flags);
1390 blocked_rdev = rdev;
1391 break;
1392 }
1393 if (is_bad && first_bad <= dev_sector) {
1394
1395 bad_sectors -= (dev_sector - first_bad);
1396 if (bad_sectors < max_sectors)
1397
1398
1399
1400 max_sectors = bad_sectors;
1401
1402
1403
1404
1405
1406
1407
1408
1409 continue;
1410 }
1411 if (is_bad) {
1412 int good_sectors = first_bad - dev_sector;
1413 if (good_sectors < max_sectors)
1414 max_sectors = good_sectors;
1415 }
1416 }
1417 if (rdev) {
1418 r10_bio->devs[i].bio = bio;
1419 atomic_inc(&rdev->nr_pending);
1420 }
1421 if (rrdev) {
1422 r10_bio->devs[i].repl_bio = bio;
1423 atomic_inc(&rrdev->nr_pending);
1424 }
1425 }
1426 rcu_read_unlock();
1427
1428 if (unlikely(blocked_rdev)) {
1429
1430 int j;
1431 int d;
1432
1433 for (j = 0; j < i; j++) {
1434 if (r10_bio->devs[j].bio) {
1435 d = r10_bio->devs[j].devnum;
1436 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1437 }
1438 if (r10_bio->devs[j].repl_bio) {
1439 struct md_rdev *rdev;
1440 d = r10_bio->devs[j].devnum;
1441 rdev = conf->mirrors[d].replacement;
1442 if (!rdev) {
1443
1444 smp_mb();
1445 rdev = conf->mirrors[d].rdev;
1446 }
1447 rdev_dec_pending(rdev, mddev);
1448 }
1449 }
1450 allow_barrier(conf);
1451 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1452 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1453 wait_barrier(conf);
1454 goto retry_write;
1455 }
1456
1457 if (max_sectors < r10_bio->sectors)
1458 r10_bio->sectors = max_sectors;
1459
1460 if (r10_bio->sectors < bio_sectors(bio)) {
1461 struct bio *split = bio_split(bio, r10_bio->sectors,
1462 GFP_NOIO, &conf->bio_split);
1463 bio_chain(split, bio);
1464 allow_barrier(conf);
1465 submit_bio_noacct(bio);
1466 wait_barrier(conf);
1467 bio = split;
1468 r10_bio->master_bio = bio;
1469 }
1470
1471 atomic_set(&r10_bio->remaining, 1);
1472 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1473
1474 for (i = 0; i < conf->copies; i++) {
1475 if (r10_bio->devs[i].bio)
1476 raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1477 if (r10_bio->devs[i].repl_bio)
1478 raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1479 }
1480 one_write_done(r10_bio);
1481}
1482
1483static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1484{
1485 struct r10conf *conf = mddev->private;
1486 struct r10bio *r10_bio;
1487
1488 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1489
1490 r10_bio->master_bio = bio;
1491 r10_bio->sectors = sectors;
1492
1493 r10_bio->mddev = mddev;
1494 r10_bio->sector = bio->bi_iter.bi_sector;
1495 r10_bio->state = 0;
1496 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
1497
1498 if (bio_data_dir(bio) == READ)
1499 raid10_read_request(mddev, bio, r10_bio);
1500 else
1501 raid10_write_request(mddev, bio, r10_bio);
1502}
1503
1504static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1505{
1506 struct r10conf *conf = mddev->private;
1507 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1508 int chunk_sects = chunk_mask + 1;
1509 int sectors = bio_sectors(bio);
1510
1511 if (unlikely(bio->bi_opf & REQ_PREFLUSH)
1512 && md_flush_request(mddev, bio))
1513 return true;
1514
1515 if (!md_write_start(mddev, bio))
1516 return false;
1517
1518
1519
1520
1521
1522 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1523 sectors > chunk_sects
1524 && (conf->geo.near_copies < conf->geo.raid_disks
1525 || conf->prev.near_copies <
1526 conf->prev.raid_disks)))
1527 sectors = chunk_sects -
1528 (bio->bi_iter.bi_sector &
1529 (chunk_sects - 1));
1530 __make_request(mddev, bio, sectors);
1531
1532
1533 wake_up(&conf->wait_barrier);
1534 return true;
1535}
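
/*
 * Illustrative example of the chunk-boundary handling above: with
 * 128-sector chunks, a 256-sector write starting at sector 100 of a
 * striped (non pure-mirror) layout is first trimmed to the 28 sectors
 * that reach the chunk boundary; the remainder of the bio is split off
 * inside raid10_write_request() and resubmitted, so it comes back
 * through raid10_make_request() as a new request.
 */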
1536
1537static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1538{
1539 struct r10conf *conf = mddev->private;
1540 int i;
1541
1542 if (conf->geo.near_copies < conf->geo.raid_disks)
1543 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1544 if (conf->geo.near_copies > 1)
1545 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1546 if (conf->geo.far_copies > 1) {
1547 if (conf->geo.far_offset)
1548 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1549 else
1550 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1551 if (conf->geo.far_set_size != conf->geo.raid_disks)
1552 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1553 }
1554 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1555 conf->geo.raid_disks - mddev->degraded);
1556 rcu_read_lock();
1557 for (i = 0; i < conf->geo.raid_disks; i++) {
1558 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1559 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1560 }
1561 rcu_read_unlock();
1562 seq_printf(seq, "]");
1563}

/*
 * Check if there are enough working drives for every block to appear on
 * at least one.  Don't consider the device numbered 'ignore', as we might
 * be about to remove it.
 */
1570static int _enough(struct r10conf *conf, int previous, int ignore)
1571{
1572 int first = 0;
1573 int has_enough = 0;
1574 int disks, ncopies;
1575 if (previous) {
1576 disks = conf->prev.raid_disks;
1577 ncopies = conf->prev.near_copies;
1578 } else {
1579 disks = conf->geo.raid_disks;
1580 ncopies = conf->geo.near_copies;
1581 }
1582
1583 rcu_read_lock();
1584 do {
1585 int n = conf->copies;
1586 int cnt = 0;
1587 int this = first;
1588 while (n--) {
1589 struct md_rdev *rdev;
1590 if (this != ignore &&
1591 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1592 test_bit(In_sync, &rdev->flags))
1593 cnt++;
1594 this = (this+1) % disks;
1595 }
1596 if (cnt == 0)
1597 goto out;
1598 first = (first + ncopies) % disks;
1599 } while (first != 0);
1600 has_enough = 1;
1601out:
1602 rcu_read_unlock();
1603 return has_enough;
1604}
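
/*
 * For example (hypothetical array): with 4 devices, near_copies = 2 and
 * far_copies = 1, the copy sets are {0,1} and {2,3}.  Losing devices 0
 * and 2 still leaves one working member in each set, so _enough() returns
 * 1; losing devices 0 and 1 empties the first set and it returns 0.
 */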
1605
1606static int enough(struct r10conf *conf, int ignore)
1607{
1608
1609
1610
1611
1612
1613 return _enough(conf, 0, ignore) &&
1614 _enough(conf, 1, ignore);
1615}
1616
1617static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1618{
1619 char b[BDEVNAME_SIZE];
1620 struct r10conf *conf = mddev->private;
1621 unsigned long flags;

 /*
 * If it is not operational, then we have already marked it as dead;
 * else if it is the last working disk and fail_last_dev is not set,
 * ignore the error and let the next level up know;
 * else mark the drive as failed.
 */
1629 spin_lock_irqsave(&conf->device_lock, flags);
1630 if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
1631 && !enough(conf, rdev->raid_disk)) {
1632
1633
1634
1635 spin_unlock_irqrestore(&conf->device_lock, flags);
1636 return;
1637 }
1638 if (test_and_clear_bit(In_sync, &rdev->flags))
1639 mddev->degraded++;
1640
1641
1642
1643 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1644 set_bit(Blocked, &rdev->flags);
1645 set_bit(Faulty, &rdev->flags);
1646 set_mask_bits(&mddev->sb_flags, 0,
1647 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1648 spin_unlock_irqrestore(&conf->device_lock, flags);
1649 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1650 "md/raid10:%s: Operation continuing on %d devices.\n",
1651 mdname(mddev), bdevname(rdev->bdev, b),
1652 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1653}
1654
1655static void print_conf(struct r10conf *conf)
1656{
1657 int i;
1658 struct md_rdev *rdev;
1659
1660 pr_debug("RAID10 conf printout:\n");
1661 if (!conf) {
1662 pr_debug("(!conf)\n");
1663 return;
1664 }
1665 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1666 conf->geo.raid_disks);
1667
1668
1669
1670 for (i = 0; i < conf->geo.raid_disks; i++) {
1671 char b[BDEVNAME_SIZE];
1672 rdev = conf->mirrors[i].rdev;
1673 if (rdev)
1674 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1675 i, !test_bit(In_sync, &rdev->flags),
1676 !test_bit(Faulty, &rdev->flags),
1677 bdevname(rdev->bdev,b));
1678 }
1679}
1680
1681static void close_sync(struct r10conf *conf)
1682{
1683 wait_barrier(conf);
1684 allow_barrier(conf);
1685
1686 mempool_exit(&conf->r10buf_pool);
1687}
1688
1689static int raid10_spare_active(struct mddev *mddev)
1690{
1691 int i;
1692 struct r10conf *conf = mddev->private;
1693 struct raid10_info *tmp;
1694 int count = 0;
1695 unsigned long flags;
1696
1697
1698
1699
1700
1701 for (i = 0; i < conf->geo.raid_disks; i++) {
1702 tmp = conf->mirrors + i;
1703 if (tmp->replacement
1704 && tmp->replacement->recovery_offset == MaxSector
1705 && !test_bit(Faulty, &tmp->replacement->flags)
1706 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1707
1708 if (!tmp->rdev
1709 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1710 count++;
1711 if (tmp->rdev) {
1712
1713
1714
1715
1716 set_bit(Faulty, &tmp->rdev->flags);
1717 sysfs_notify_dirent_safe(
1718 tmp->rdev->sysfs_state);
1719 }
1720 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1721 } else if (tmp->rdev
1722 && tmp->rdev->recovery_offset == MaxSector
1723 && !test_bit(Faulty, &tmp->rdev->flags)
1724 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1725 count++;
1726 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1727 }
1728 }
1729 spin_lock_irqsave(&conf->device_lock, flags);
1730 mddev->degraded -= count;
1731 spin_unlock_irqrestore(&conf->device_lock, flags);
1732
1733 print_conf(conf);
1734 return count;
1735}
1736
1737static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1738{
1739 struct r10conf *conf = mddev->private;
1740 int err = -EEXIST;
1741 int mirror;
1742 int first = 0;
1743 int last = conf->geo.raid_disks - 1;
1744
1745 if (mddev->recovery_cp < MaxSector)
1746
1747
1748
1749 return -EBUSY;
1750 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1751 return -EINVAL;
1752
1753 if (md_integrity_add_rdev(rdev, mddev))
1754 return -ENXIO;
1755
1756 if (rdev->raid_disk >= 0)
1757 first = last = rdev->raid_disk;
1758
1759 if (rdev->saved_raid_disk >= first &&
1760 rdev->saved_raid_disk < conf->geo.raid_disks &&
1761 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1762 mirror = rdev->saved_raid_disk;
1763 else
1764 mirror = first;
1765 for ( ; mirror <= last ; mirror++) {
1766 struct raid10_info *p = &conf->mirrors[mirror];
1767 if (p->recovery_disabled == mddev->recovery_disabled)
1768 continue;
1769 if (p->rdev) {
1770 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1771 p->replacement != NULL)
1772 continue;
1773 clear_bit(In_sync, &rdev->flags);
1774 set_bit(Replacement, &rdev->flags);
1775 rdev->raid_disk = mirror;
1776 err = 0;
1777 if (mddev->gendisk)
1778 disk_stack_limits(mddev->gendisk, rdev->bdev,
1779 rdev->data_offset << 9);
1780 conf->fullsync = 1;
1781 rcu_assign_pointer(p->replacement, rdev);
1782 break;
1783 }
1784
1785 if (mddev->gendisk)
1786 disk_stack_limits(mddev->gendisk, rdev->bdev,
1787 rdev->data_offset << 9);
1788
1789 p->head_position = 0;
1790 p->recovery_disabled = mddev->recovery_disabled - 1;
1791 rdev->raid_disk = mirror;
1792 err = 0;
1793 if (rdev->saved_raid_disk != mirror)
1794 conf->fullsync = 1;
1795 rcu_assign_pointer(p->rdev, rdev);
1796 break;
1797 }
1798 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1799 blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
1800
1801 print_conf(conf);
1802 return err;
1803}
1804
1805static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1806{
1807 struct r10conf *conf = mddev->private;
1808 int err = 0;
1809 int number = rdev->raid_disk;
1810 struct md_rdev **rdevp;
1811 struct raid10_info *p = conf->mirrors + number;
1812
1813 print_conf(conf);
1814 if (rdev == p->rdev)
1815 rdevp = &p->rdev;
1816 else if (rdev == p->replacement)
1817 rdevp = &p->replacement;
1818 else
1819 return 0;
1820
1821 if (test_bit(In_sync, &rdev->flags) ||
1822 atomic_read(&rdev->nr_pending)) {
1823 err = -EBUSY;
1824 goto abort;
1825 }
1826
1827
1828
1829 if (!test_bit(Faulty, &rdev->flags) &&
1830 mddev->recovery_disabled != p->recovery_disabled &&
1831 (!p->replacement || p->replacement == rdev) &&
1832 number < conf->geo.raid_disks &&
1833 enough(conf, -1)) {
1834 err = -EBUSY;
1835 goto abort;
1836 }
1837 *rdevp = NULL;
1838 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1839 synchronize_rcu();
1840 if (atomic_read(&rdev->nr_pending)) {
1841
1842 err = -EBUSY;
1843 *rdevp = rdev;
1844 goto abort;
1845 }
1846 }
1847 if (p->replacement) {
1848
1849 p->rdev = p->replacement;
1850 clear_bit(Replacement, &p->replacement->flags);
1851 smp_mb();
1852
1853
1854 p->replacement = NULL;
1855 }
1856
1857 clear_bit(WantReplacement, &rdev->flags);
1858 err = md_integrity_register(mddev);
1859
1860abort:
1861
1862 print_conf(conf);
1863 return err;
1864}
1865
1866static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
1867{
1868 struct r10conf *conf = r10_bio->mddev->private;
1869
1870 if (!bio->bi_status)
1871 set_bit(R10BIO_Uptodate, &r10_bio->state);
1872 else
1873
1874
1875
1876 atomic_add(r10_bio->sectors,
1877 &conf->mirrors[d].rdev->corrected_errors);
1878
1879
1880
1881
1882 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1883 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1884 atomic_dec_and_test(&r10_bio->remaining)) {
1885
1886
1887
1888 reschedule_retry(r10_bio);
1889 }
1890}
1891
1892static void end_sync_read(struct bio *bio)
1893{
1894 struct r10bio *r10_bio = get_resync_r10bio(bio);
1895 struct r10conf *conf = r10_bio->mddev->private;
1896 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1897
1898 __end_sync_read(r10_bio, bio, d);
1899}
1900
1901static void end_reshape_read(struct bio *bio)
1902{
1903
1904 struct r10bio *r10_bio = bio->bi_private;
1905
1906 __end_sync_read(r10_bio, bio, r10_bio->read_slot);
1907}
1908
1909static void end_sync_request(struct r10bio *r10_bio)
1910{
1911 struct mddev *mddev = r10_bio->mddev;
1912
1913 while (atomic_dec_and_test(&r10_bio->remaining)) {
1914 if (r10_bio->master_bio == NULL) {
1915
1916 sector_t s = r10_bio->sectors;
1917 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1918 test_bit(R10BIO_WriteError, &r10_bio->state))
1919 reschedule_retry(r10_bio);
1920 else
1921 put_buf(r10_bio);
1922 md_done_sync(mddev, s, 1);
1923 break;
1924 } else {
1925 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1926 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1927 test_bit(R10BIO_WriteError, &r10_bio->state))
1928 reschedule_retry(r10_bio);
1929 else
1930 put_buf(r10_bio);
1931 r10_bio = r10_bio2;
1932 }
1933 }
1934}
1935
1936static void end_sync_write(struct bio *bio)
1937{
1938 struct r10bio *r10_bio = get_resync_r10bio(bio);
1939 struct mddev *mddev = r10_bio->mddev;
1940 struct r10conf *conf = mddev->private;
1941 int d;
1942 sector_t first_bad;
1943 int bad_sectors;
1944 int slot;
1945 int repl;
1946 struct md_rdev *rdev = NULL;
1947
1948 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1949 if (repl)
1950 rdev = conf->mirrors[d].replacement;
1951 else
1952 rdev = conf->mirrors[d].rdev;
1953
1954 if (bio->bi_status) {
1955 if (repl)
1956 md_error(mddev, rdev);
1957 else {
1958 set_bit(WriteErrorSeen, &rdev->flags);
1959 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1960 set_bit(MD_RECOVERY_NEEDED,
1961 &rdev->mddev->recovery);
1962 set_bit(R10BIO_WriteError, &r10_bio->state);
1963 }
1964 } else if (is_badblock(rdev,
1965 r10_bio->devs[slot].addr,
1966 r10_bio->sectors,
1967 &first_bad, &bad_sectors))
1968 set_bit(R10BIO_MadeGood, &r10_bio->state);
1969
1970 rdev_dec_pending(rdev, mddev);
1971
1972 end_sync_request(r10_bio);
1973}

/*
 * Used for resync: every readable copy of a block has been read into the
 * r10bio by end_sync_read().  Compare each copy with the first good one
 * and re-write any copy that differs from it or whose read failed (unless
 * this is just a 'check' pass), so that all copies agree again.
 */
1991static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1992{
1993 struct r10conf *conf = mddev->private;
1994 int i, first;
1995 struct bio *tbio, *fbio;
1996 int vcnt;
1997 struct page **tpages, **fpages;
1998
1999 atomic_set(&r10_bio->remaining, 1);
2000
2001
2002 for (i=0; i<conf->copies; i++)
2003 if (!r10_bio->devs[i].bio->bi_status)
2004 break;
2005
2006 if (i == conf->copies)
2007 goto done;
2008
2009 first = i;
2010 fbio = r10_bio->devs[i].bio;
2011 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2012 fbio->bi_iter.bi_idx = 0;
2013 fpages = get_resync_pages(fbio)->pages;
2014
2015 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2016
2017 for (i=0 ; i < conf->copies ; i++) {
2018 int j, d;
2019 struct md_rdev *rdev;
2020 struct resync_pages *rp;
2021
2022 tbio = r10_bio->devs[i].bio;
2023
2024 if (tbio->bi_end_io != end_sync_read)
2025 continue;
2026 if (i == first)
2027 continue;
2028
2029 tpages = get_resync_pages(tbio)->pages;
2030 d = r10_bio->devs[i].devnum;
2031 rdev = conf->mirrors[d].rdev;
2032 if (!r10_bio->devs[i].bio->bi_status) {
2033
2034
2035
2036
2037 int sectors = r10_bio->sectors;
2038 for (j = 0; j < vcnt; j++) {
2039 int len = PAGE_SIZE;
2040 if (sectors < (len / 512))
2041 len = sectors * 512;
2042 if (memcmp(page_address(fpages[j]),
2043 page_address(tpages[j]),
2044 len))
2045 break;
2046 sectors -= len/512;
2047 }
2048 if (j == vcnt)
2049 continue;
2050 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2051 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2052
2053 continue;
2054 } else if (test_bit(FailFast, &rdev->flags)) {
2055
2056 md_error(rdev->mddev, rdev);
2057 continue;
2058 }
2059
2060
2061
2062
2063
2064 rp = get_resync_pages(tbio);
2065 bio_reset(tbio);
2066
2067 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2068
2069 rp->raid_bio = r10_bio;
2070 tbio->bi_private = rp;
2071 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2072 tbio->bi_end_io = end_sync_write;
2073 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2074
2075 bio_copy_data(tbio, fbio);
2076
2077 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2078 atomic_inc(&r10_bio->remaining);
2079 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2080
2081 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2082 tbio->bi_opf |= MD_FAILFAST;
2083 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2084 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2085 submit_bio_noacct(tbio);
2086 }
2087
2088
2089
2090
2091 for (i = 0; i < conf->copies; i++) {
2092 int d;
2093
2094 tbio = r10_bio->devs[i].repl_bio;
2095 if (!tbio || !tbio->bi_end_io)
2096 continue;
2097 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2098 && r10_bio->devs[i].bio != fbio)
2099 bio_copy_data(tbio, fbio);
2100 d = r10_bio->devs[i].devnum;
2101 atomic_inc(&r10_bio->remaining);
2102 md_sync_acct(conf->mirrors[d].replacement->bdev,
2103 bio_sectors(tbio));
2104 submit_bio_noacct(tbio);
2105 }
2106
2107done:
2108 if (atomic_dec_and_test(&r10_bio->remaining)) {
2109 md_done_sync(mddev, r10_bio->sectors, 1);
2110 put_buf(r10_bio);
2111 }
2112}

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 *
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virtual
 * address.  There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use: the first for reading, the
 * second for writing.
 */
2124static void fix_recovery_read_error(struct r10bio *r10_bio)
2125{
 /*
 * We got a read error during recovery.
 * We repeat the read in smaller page-sized sections.
 * If a read succeeds, write it to the new device or record a bad
 * block if we cannot.  If a read fails, record a bad block on both
 * the old and the new devices.
 */
2133 struct mddev *mddev = r10_bio->mddev;
2134 struct r10conf *conf = mddev->private;
2135 struct bio *bio = r10_bio->devs[0].bio;
2136 sector_t sect = 0;
2137 int sectors = r10_bio->sectors;
2138 int idx = 0;
2139 int dr = r10_bio->devs[0].devnum;
2140 int dw = r10_bio->devs[1].devnum;
2141 struct page **pages = get_resync_pages(bio)->pages;
2142
2143 while (sectors) {
2144 int s = sectors;
2145 struct md_rdev *rdev;
2146 sector_t addr;
2147 int ok;
2148
2149 if (s > (PAGE_SIZE>>9))
2150 s = PAGE_SIZE >> 9;
2151
2152 rdev = conf->mirrors[dr].rdev;
2153 addr = r10_bio->devs[0].addr + sect,
2154 ok = sync_page_io(rdev,
2155 addr,
2156 s << 9,
2157 pages[idx],
2158 REQ_OP_READ, 0, false);
2159 if (ok) {
2160 rdev = conf->mirrors[dw].rdev;
2161 addr = r10_bio->devs[1].addr + sect;
2162 ok = sync_page_io(rdev,
2163 addr,
2164 s << 9,
2165 pages[idx],
2166 REQ_OP_WRITE, 0, false);
2167 if (!ok) {
2168 set_bit(WriteErrorSeen, &rdev->flags);
2169 if (!test_and_set_bit(WantReplacement,
2170 &rdev->flags))
2171 set_bit(MD_RECOVERY_NEEDED,
2172 &rdev->mddev->recovery);
2173 }
2174 }
2175 if (!ok) {
2176
2177
2178
2179
2180 rdev_set_badblocks(rdev, addr, s, 0);
2181
2182 if (rdev != conf->mirrors[dw].rdev) {
2183
2184 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2185 addr = r10_bio->devs[1].addr + sect;
2186 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2187 if (!ok) {
2188
2189 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2190 mdname(mddev));
2191
2192 conf->mirrors[dw].recovery_disabled
2193 = mddev->recovery_disabled;
2194 set_bit(MD_RECOVERY_INTR,
2195 &mddev->recovery);
2196 break;
2197 }
2198 }
2199 }
2200
2201 sectors -= s;
2202 sect += s;
2203 idx++;
2204 }
2205}
2206
2207static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2208{
2209 struct r10conf *conf = mddev->private;
2210 int d;
2211 struct bio *wbio, *wbio2;
2212
2213 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2214 fix_recovery_read_error(r10_bio);
2215 end_sync_request(r10_bio);
2216 return;
2217 }
2218
2219
2220
2221
2222
2223 d = r10_bio->devs[1].devnum;
2224 wbio = r10_bio->devs[1].bio;
2225 wbio2 = r10_bio->devs[1].repl_bio;
2226
2227
2228
2229
2230 if (wbio2 && !wbio2->bi_end_io)
2231 wbio2 = NULL;
2232 if (wbio->bi_end_io) {
2233 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2234 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2235 submit_bio_noacct(wbio);
2236 }
2237 if (wbio2) {
2238 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2239 md_sync_acct(conf->mirrors[d].replacement->bdev,
2240 bio_sectors(wbio2));
2241 submit_bio_noacct(wbio2);
2242 }
2243}

/*
 * Used by fix_read_error() to decay the per-rdev read_errors count.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
2251static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2252{
2253 long cur_time_mon;
2254 unsigned long hours_since_last;
2255 unsigned int read_errors = atomic_read(&rdev->read_errors);
2256
2257 cur_time_mon = ktime_get_seconds();
2258
2259 if (rdev->last_read_error == 0) {
2260
2261 rdev->last_read_error = cur_time_mon;
2262 return;
2263 }
2264
2265 hours_since_last = (long)(cur_time_mon -
2266 rdev->last_read_error) / 3600;
2267
2268 rdev->last_read_error = cur_time_mon;
2269
2270
2271
2272
2273
2274
2275 if (hours_since_last >= 8 * sizeof(read_errors))
2276 atomic_set(&rdev->read_errors, 0);
2277 else
2278 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2279}
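
/*
 * For example, a device with 40 recorded read errors seen again 3 hours
 * after the previous error is decayed to 40 >> 3 = 5; once at least
 * 8 * sizeof(read_errors) (i.e. 32) hours have passed, the count is
 * simply reset to zero to avoid an oversized shift.
 */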
2280
2281static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2282 int sectors, struct page *page, int rw)
2283{
2284 sector_t first_bad;
2285 int bad_sectors;
2286
2287 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2288 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2289 return -1;
2290 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2291
2292 return 1;
2293 if (rw == WRITE) {
2294 set_bit(WriteErrorSeen, &rdev->flags);
2295 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2296 set_bit(MD_RECOVERY_NEEDED,
2297 &rdev->mddev->recovery);
2298 }
2299
2300 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2301 md_error(rdev->mddev, rdev);
2302 return 0;
2303}

/*
 * Handle a read error on a mirror during normal IO: try to read the
 * failing range from the other mirrors and, where that succeeds, write
 * the good data back (and read it back again to verify), recording bad
 * blocks or failing devices that cannot be corrected.
 */
2313static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2314{
2315 int sect = 0;
2316 int sectors = r10_bio->sectors;
2317 struct md_rdev *rdev;
2318 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2319 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2320
2321
2322
2323
2324 rdev = conf->mirrors[d].rdev;
2325
2326 if (test_bit(Faulty, &rdev->flags))
2327
2328
2329 return;
2330
2331 check_decay_read_errors(mddev, rdev);
2332 atomic_inc(&rdev->read_errors);
2333 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2334 char b[BDEVNAME_SIZE];
2335 bdevname(rdev->bdev, b);
2336
2337 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2338 mdname(mddev), b,
2339 atomic_read(&rdev->read_errors), max_read_errors);
2340 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2341 mdname(mddev), b);
2342 md_error(mddev, rdev);
2343 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2344 return;
2345 }
2346
2347 while(sectors) {
2348 int s = sectors;
2349 int sl = r10_bio->read_slot;
2350 int success = 0;
2351 int start;
2352
2353 if (s > (PAGE_SIZE>>9))
2354 s = PAGE_SIZE >> 9;
2355
2356 rcu_read_lock();
2357 do {
2358 sector_t first_bad;
2359 int bad_sectors;
2360
2361 d = r10_bio->devs[sl].devnum;
2362 rdev = rcu_dereference(conf->mirrors[d].rdev);
2363 if (rdev &&
2364 test_bit(In_sync, &rdev->flags) &&
2365 !test_bit(Faulty, &rdev->flags) &&
2366 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2367 &first_bad, &bad_sectors) == 0) {
2368 atomic_inc(&rdev->nr_pending);
2369 rcu_read_unlock();
2370 success = sync_page_io(rdev,
2371 r10_bio->devs[sl].addr +
2372 sect,
2373 s<<9,
2374 conf->tmppage,
2375 REQ_OP_READ, 0, false);
2376 rdev_dec_pending(rdev, mddev);
2377 rcu_read_lock();
2378 if (success)
2379 break;
2380 }
2381 sl++;
2382 if (sl == conf->copies)
2383 sl = 0;
2384 } while (!success && sl != r10_bio->read_slot);
2385 rcu_read_unlock();
2386
2387 if (!success) {
2388
2389
2390
2391
2392 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2393 rdev = conf->mirrors[dn].rdev;
2394
2395 if (!rdev_set_badblocks(
2396 rdev,
2397 r10_bio->devs[r10_bio->read_slot].addr
2398 + sect,
2399 s, 0)) {
2400 md_error(mddev, rdev);
2401 r10_bio->devs[r10_bio->read_slot].bio
2402 = IO_BLOCKED;
2403 }
2404 break;
2405 }
2406
2407 start = sl;
2408
2409 rcu_read_lock();
2410 while (sl != r10_bio->read_slot) {
2411 char b[BDEVNAME_SIZE];
2412
2413 if (sl==0)
2414 sl = conf->copies;
2415 sl--;
2416 d = r10_bio->devs[sl].devnum;
2417 rdev = rcu_dereference(conf->mirrors[d].rdev);
2418 if (!rdev ||
2419 test_bit(Faulty, &rdev->flags) ||
2420 !test_bit(In_sync, &rdev->flags))
2421 continue;
2422
2423 atomic_inc(&rdev->nr_pending);
2424 rcu_read_unlock();
2425 if (r10_sync_page_io(rdev,
2426 r10_bio->devs[sl].addr +
2427 sect,
2428 s, conf->tmppage, WRITE)
2429 == 0) {
2430
2431 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2432 mdname(mddev), s,
2433 (unsigned long long)(
2434 sect +
2435 choose_data_offset(r10_bio,
2436 rdev)),
2437 bdevname(rdev->bdev, b));
2438 pr_notice("md/raid10:%s: %s: failing drive\n",
2439 mdname(mddev),
2440 bdevname(rdev->bdev, b));
2441 }
2442 rdev_dec_pending(rdev, mddev);
2443 rcu_read_lock();
2444 }
2445 sl = start;
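/* second pass: re-read from those mirrors to verify the correction */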
2446 while (sl != r10_bio->read_slot) {
2447 char b[BDEVNAME_SIZE];
2448
2449 if (sl == 0)
2450 sl = conf->copies;
2451 sl--;
2452 d = r10_bio->devs[sl].devnum;
2453 rdev = rcu_dereference(conf->mirrors[d].rdev);
2454 if (!rdev ||
2455 test_bit(Faulty, &rdev->flags) ||
2456 !test_bit(In_sync, &rdev->flags))
2457 continue;
2458
2459 atomic_inc(&rdev->nr_pending);
2460 rcu_read_unlock();
2461 switch (r10_sync_page_io(rdev,
2462 r10_bio->devs[sl].addr +
2463 sect,
2464 s, conf->tmppage,
2465 READ)) {
2466 case 0:
2467
2468 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2469 mdname(mddev), s,
2470 (unsigned long long)(
2471 sect +
2472 choose_data_offset(r10_bio, rdev)),
2473 bdevname(rdev->bdev, b));
2474 pr_notice("md/raid10:%s: %s: failing drive\n",
2475 mdname(mddev),
2476 bdevname(rdev->bdev, b));
2477 break;
2478 case 1:
2479 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2480 mdname(mddev), s,
2481 (unsigned long long)(
2482 sect +
2483 choose_data_offset(r10_bio, rdev)),
2484 bdevname(rdev->bdev, b));
2485 atomic_add(s, &rdev->corrected_errors);
2486 }
2487
2488 rdev_dec_pending(rdev, mddev);
2489 rcu_read_lock();
2490 }
2491 rcu_read_unlock();
2492
2493 sectors -= s;
2494 sect += s;
2495 }
2496}
2497
2498static int narrow_write_error(struct r10bio *r10_bio, int i)
2499{
2500 struct bio *bio = r10_bio->master_bio;
2501 struct mddev *mddev = r10_bio->mddev;
2502 struct r10conf *conf = mddev->private;
2503 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
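/*
 * bio holds the data we failed to write to slot 'i'.  Clone it, trim the
 * clone down to one badblock-sized piece at a time and retry the write;
 * any piece that still fails is recorded as a bad block on the device.
 */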
2515 int block_sectors;
2516 sector_t sector;
2517 int sectors;
2518 int sect_to_write = r10_bio->sectors;
2519 int ok = 1;
2520
2521 if (rdev->badblocks.shift < 0)
2522 return 0;
2523
2524 block_sectors = roundup(1 << rdev->badblocks.shift,
2525 bdev_logical_block_size(rdev->bdev) >> 9);
2526 sector = r10_bio->sector;
2527 sectors = ((r10_bio->sector + block_sectors)
2528 & ~(sector_t)(block_sectors - 1))
2529 - sector;
2530
2531 while (sect_to_write) {
2532 struct bio *wbio;
2533 sector_t wsector;
2534 if (sectors > sect_to_write)
2535 sectors = sect_to_write;
2536
2537 wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
2538 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2539 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2540 wbio->bi_iter.bi_sector = wsector +
2541 choose_data_offset(r10_bio, rdev);
2542 bio_set_dev(wbio, rdev->bdev);
2543 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2544
2545 if (submit_bio_wait(wbio) < 0)
2546
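/* failure: record these sectors as bad on this device */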
2547 ok = rdev_set_badblocks(rdev, wsector,
2548 sectors, 0)
2549 && ok;
2550
2551 bio_put(wbio);
2552 sect_to_write -= sectors;
2553 sector += sectors;
2554 sectors = block_sectors;
2555 }
2556 return ok;
2557}
2558
2559static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2560{
2561 int slot = r10_bio->read_slot;
2562 struct bio *bio;
2563 struct r10conf *conf = mddev->private;
2564 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2565
2566
2567
2568
2569
2570
2571
2572
2573
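/*
 * We got a read error.  Maybe the drive is bad, maybe just this block.
 * Unless the array is read-only (or the device is marked FailFast),
 * freeze all other I/O and try to correct the block by reading it from
 * the other mirrors, then retry the original read.
 */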
2574 bio = r10_bio->devs[slot].bio;
2575 bio_put(bio);
2576 r10_bio->devs[slot].bio = NULL;
2577
2578 if (mddev->ro)
2579 r10_bio->devs[slot].bio = IO_BLOCKED;
2580 else if (!test_bit(FailFast, &rdev->flags)) {
2581 freeze_array(conf, 1);
2582 fix_read_error(conf, mddev, r10_bio);
2583 unfreeze_array(conf);
2584 } else
2585 md_error(mddev, rdev);
2586
2587 rdev_dec_pending(rdev, mddev);
2588 allow_barrier(conf);
2589 r10_bio->state = 0;
2590 raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2591}
2592
2593static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2594{
2595
2596
2597
2598
2599
2600
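/*
 * Some kind of write has finished.  Either it succeeded where we thought
 * there was a bad block (so clear the bad block), or it failed (so record
 * a new bad block, or fail the device if that is not possible).
 */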
2601 int m;
2602 struct md_rdev *rdev;
2603
2604 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2605 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2606 for (m = 0; m < conf->copies; m++) {
2607 int dev = r10_bio->devs[m].devnum;
2608 rdev = conf->mirrors[dev].rdev;
2609 if (r10_bio->devs[m].bio == NULL ||
2610 r10_bio->devs[m].bio->bi_end_io == NULL)
2611 continue;
2612 if (!r10_bio->devs[m].bio->bi_status) {
2613 rdev_clear_badblocks(
2614 rdev,
2615 r10_bio->devs[m].addr,
2616 r10_bio->sectors, 0);
2617 } else {
2618 if (!rdev_set_badblocks(
2619 rdev,
2620 r10_bio->devs[m].addr,
2621 r10_bio->sectors, 0))
2622 md_error(conf->mddev, rdev);
2623 }
2624 rdev = conf->mirrors[dev].replacement;
2625 if (r10_bio->devs[m].repl_bio == NULL ||
2626 r10_bio->devs[m].repl_bio->bi_end_io == NULL)
2627 continue;
2628
2629 if (!r10_bio->devs[m].repl_bio->bi_status) {
2630 rdev_clear_badblocks(
2631 rdev,
2632 r10_bio->devs[m].addr,
2633 r10_bio->sectors, 0);
2634 } else {
2635 if (!rdev_set_badblocks(
2636 rdev,
2637 r10_bio->devs[m].addr,
2638 r10_bio->sectors, 0))
2639 md_error(conf->mddev, rdev);
2640 }
2641 }
2642 put_buf(r10_bio);
2643 } else {
2644 bool fail = false;
2645 for (m = 0; m < conf->copies; m++) {
2646 int dev = r10_bio->devs[m].devnum;
2647 struct bio *bio = r10_bio->devs[m].bio;
2648 rdev = conf->mirrors[dev].rdev;
2649 if (bio == IO_MADE_GOOD) {
2650 rdev_clear_badblocks(
2651 rdev,
2652 r10_bio->devs[m].addr,
2653 r10_bio->sectors, 0);
2654 rdev_dec_pending(rdev, conf->mddev);
2655 } else if (bio != NULL && bio->bi_status) {
2656 fail = true;
2657 if (!narrow_write_error(r10_bio, m)) {
2658 md_error(conf->mddev, rdev);
2659 set_bit(R10BIO_Degraded,
2660 &r10_bio->state);
2661 }
2662 rdev_dec_pending(rdev, conf->mddev);
2663 }
2664 bio = r10_bio->devs[m].repl_bio;
2665 rdev = conf->mirrors[dev].replacement;
2666 if (rdev && bio == IO_MADE_GOOD) {
2667 rdev_clear_badblocks(
2668 rdev,
2669 r10_bio->devs[m].addr,
2670 r10_bio->sectors, 0);
2671 rdev_dec_pending(rdev, conf->mddev);
2672 }
2673 }
2674 if (fail) {
2675 spin_lock_irq(&conf->device_lock);
2676 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2677 conf->nr_queued++;
2678 spin_unlock_irq(&conf->device_lock);
2679
2680
2681
2682
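/* in case freeze_array() is waiting for nr_pending == nr_queued + extra */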
2683 wake_up(&conf->wait_barrier);
2684 md_wakeup_thread(conf->mddev->thread);
2685 } else {
2686 if (test_bit(R10BIO_WriteError,
2687 &r10_bio->state))
2688 close_write(r10_bio);
2689 raid_end_bio_io(r10_bio);
2690 }
2691 }
2692}
2693
2694static void raid10d(struct md_thread *thread)
2695{
2696 struct mddev *mddev = thread->mddev;
2697 struct r10bio *r10_bio;
2698 unsigned long flags;
2699 struct r10conf *conf = mddev->private;
2700 struct list_head *head = &conf->retry_list;
2701 struct blk_plug plug;
2702
2703 md_check_recovery(mddev);
2704
2705 if (!list_empty_careful(&conf->bio_end_io_list) &&
2706 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2707 LIST_HEAD(tmp);
2708 spin_lock_irqsave(&conf->device_lock, flags);
2709 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2710 while (!list_empty(&conf->bio_end_io_list)) {
2711 list_move(conf->bio_end_io_list.prev, &tmp);
2712 conf->nr_queued--;
2713 }
2714 }
2715 spin_unlock_irqrestore(&conf->device_lock, flags);
2716 while (!list_empty(&tmp)) {
2717 r10_bio = list_first_entry(&tmp, struct r10bio,
2718 retry_list);
2719 list_del(&r10_bio->retry_list);
2720 if (mddev->degraded)
2721 set_bit(R10BIO_Degraded, &r10_bio->state);
2722
2723 if (test_bit(R10BIO_WriteError,
2724 &r10_bio->state))
2725 close_write(r10_bio);
2726 raid_end_bio_io(r10_bio);
2727 }
2728 }
2729
2730 blk_start_plug(&plug);
2731 for (;;) {
2732
2733 flush_pending_writes(conf);
2734
2735 spin_lock_irqsave(&conf->device_lock, flags);
2736 if (list_empty(head)) {
2737 spin_unlock_irqrestore(&conf->device_lock, flags);
2738 break;
2739 }
2740 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2741 list_del(head->prev);
2742 conf->nr_queued--;
2743 spin_unlock_irqrestore(&conf->device_lock, flags);
2744
2745 mddev = r10_bio->mddev;
2746 conf = mddev->private;
2747 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2748 test_bit(R10BIO_WriteError, &r10_bio->state))
2749 handle_write_completed(conf, r10_bio);
2750 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2751 reshape_request_write(mddev, r10_bio);
2752 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2753 sync_request_write(mddev, r10_bio);
2754 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2755 recovery_request_write(mddev, r10_bio);
2756 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2757 handle_read_error(mddev, r10_bio);
2758 else
2759 WARN_ON_ONCE(1);
2760
2761 cond_resched();
2762 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2763 md_check_recovery(mddev);
2764 }
2765 blk_finish_plug(&plug);
2766}
2767
2768static int init_resync(struct r10conf *conf)
2769{
2770 int ret, buffs, i;
2771
2772 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2773 BUG_ON(mempool_initialized(&conf->r10buf_pool));
2774 conf->have_replacement = 0;
2775 for (i = 0; i < conf->geo.raid_disks; i++)
2776 if (conf->mirrors[i].replacement)
2777 conf->have_replacement = 1;
2778 ret = mempool_init(&conf->r10buf_pool, buffs,
2779 r10buf_pool_alloc, r10buf_pool_free, conf);
2780 if (ret)
2781 return ret;
2782 conf->next_resync = 0;
2783 return 0;
2784}
2785
2786static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
2787{
2788 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
2789 struct resync_pages *rp;
2790 struct bio *bio;
2791 int nalloc;
2792 int i;
2793
2794 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
2795 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
2796 nalloc = conf->copies;
2797 else
2798 nalloc = 2;
2799
2800 for (i = 0; i < nalloc; i++) {
2801 bio = r10bio->devs[i].bio;
2802 rp = bio->bi_private;
2803 bio_reset(bio);
2804 bio->bi_private = rp;
2805 bio = r10bio->devs[i].repl_bio;
2806 if (bio) {
2807 rp = bio->bi_private;
2808 bio_reset(bio);
2809 bio->bi_private = rp;
2810 }
2811 }
2812 return r10bio;
2813}
2814
2815
2816
2817
2818
2819static void raid10_set_cluster_sync_high(struct r10conf *conf)
2820{
2821 sector_t window_size;
2822 int extra_chunk, chunks;
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
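/*
 * One "stripe" covers every member device once, i.e.
 * raid_disks / near_copies chunks (plus one extra chunk when raid_disks
 * is not divisible by near_copies).  Use at least one stripe, but never
 * less than CLUSTER_RESYNC_WINDOW_SECTORS, as the window that other
 * cluster nodes must suspend.
 */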
2836 chunks = conf->geo.raid_disks / conf->geo.near_copies;
2837 if (conf->geo.raid_disks % conf->geo.near_copies == 0)
2838 extra_chunk = 0;
2839 else
2840 extra_chunk = 1;
2841 window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
2842
2843
2844
2845
2846 window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
2847 CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
2848
2849 conf->cluster_sync_high = conf->cluster_sync_low + window_size;
2850}
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
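/*
 * Perform a "sync" on one "block".
 *
 * Normal I/O, particularly writes, must not conflict with active sync
 * requests; this is ensured by the barrier that raise_barrier() installs.
 *
 * Resync and recovery are handled very differently (MD_RECOVERY_SYNC in
 * mddev->recovery distinguishes them):
 *  - for resync we iterate over virtual addresses, read every copy and
 *    fix any copy that differs;
 *  - for recovery we iterate over physical (device) addresses, read a
 *    good copy and write it to each device being rebuilt or replaced.
 *
 * For recovery several r10bios may be in flight for one region, linked
 * through their master_bio pointers, with ->remaining counting the
 * outstanding pieces.
 */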
2884static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2885 int *skipped)
2886{
2887 struct r10conf *conf = mddev->private;
2888 struct r10bio *r10_bio;
2889 struct bio *biolist = NULL, *bio;
2890 sector_t max_sector, nr_sectors;
2891 int i;
2892 int max_sync;
2893 sector_t sync_blocks;
2894 sector_t sectors_skipped = 0;
2895 int chunks_skipped = 0;
2896 sector_t chunk_mask = conf->geo.chunk_mask;
2897 int page_idx = 0;
2898
2899 if (!mempool_initialized(&conf->r10buf_pool))
2900 if (init_resync(conf))
2901 return 0;
2902
2903
2904
2905
2906
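/*
 * Allow skipping a full rebuild for incremental assembly of a clean
 * array, like RAID1 does.
 */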
2907 if (mddev->bitmap == NULL &&
2908 mddev->recovery_cp == MaxSector &&
2909 mddev->reshape_position == MaxSector &&
2910 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2911 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2912 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2913 conf->fullsync == 0) {
2914 *skipped = 1;
2915 return mddev->dev_sectors - sector_nr;
2916 }
2917
2918 skipped:
2919 max_sector = mddev->dev_sectors;
2920 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2921 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2922 max_sector = mddev->resync_max_sectors;
2923 if (sector_nr >= max_sector) {
2924 conf->cluster_sync_low = 0;
2925 conf->cluster_sync_high = 0;
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
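/*
 * We have reached the end (or were interrupted).  If we aborted, end the
 * bitmap sync on the chunks we had started but not finished.
 * mddev->curr_resync holds the current address, but for recovery it is a
 * device address and must be converted to a virtual address per disk.
 */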
2936 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2937 end_reshape(conf);
2938 close_sync(conf);
2939 return 0;
2940 }
2941
2942 if (mddev->curr_resync < max_sector) {
2943 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2944 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2945 &sync_blocks, 1);
2946 else for (i = 0; i < conf->geo.raid_disks; i++) {
2947 sector_t sect =
2948 raid10_find_virt(conf, mddev->curr_resync, i);
2949 md_bitmap_end_sync(mddev->bitmap, sect,
2950 &sync_blocks, 1);
2951 }
2952 } else {
2953
2954 if ((!mddev->bitmap || conf->fullsync)
2955 && conf->have_replacement
2956 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2957
2958
2959
2960 rcu_read_lock();
2961 for (i = 0; i < conf->geo.raid_disks; i++) {
2962 struct md_rdev *rdev =
2963 rcu_dereference(conf->mirrors[i].replacement);
2964 if (rdev)
2965 rdev->recovery_offset = MaxSector;
2966 }
2967 rcu_read_unlock();
2968 }
2969 conf->fullsync = 0;
2970 }
2971 md_bitmap_close_sync(mddev->bitmap);
2972 close_sync(conf);
2973 *skipped = 1;
2974 return sectors_skipped;
2975 }
2976
2977 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2978 return reshape_request(mddev, sector_nr, skipped);
2979
2980 if (chunks_skipped >= conf->geo.raid_disks) {
2981
2982
2983
2984 *skipped = 1;
2985 return (max_sector - sector_nr) + sectors_skipped;
2986 }
2987
2988 if (max_sector > mddev->resync_max)
2989 max_sector = mddev->resync_max;
2990
2991
2992
2993
2994 if (conf->geo.near_copies < conf->geo.raid_disks &&
2995 max_sector > (sector_nr | chunk_mask))
2996 max_sector = (sector_nr | chunk_mask) + 1;
2997
2998
2999
3000
3001
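/*
 * If normal I/O is waiting for its turn at the barrier, let it through
 * before starting this new sync request.
 */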
3002 if (conf->nr_waiting)
3003 schedule_timeout_uninterruptible(1);
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
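/*
 * Resync and recovery build very different bio lists.  Both end up with
 * an r10bio whose per-device bios have bi_end_io, the target sector and
 * device, and bi_private set.  For recovery we may create several
 * subordinate r10bios (one per device being rebuilt), linked back to the
 * first through master_bio.  bi_end_io is end_sync_read for reads and
 * end_sync_write for writes.
 */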
3020 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3021 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3022
3023 int j;
3024 r10_bio = NULL;
3025
3026 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3027 int still_degraded;
3028 struct r10bio *rb2;
3029 sector_t sect;
3030 int must_sync;
3031 int any_working;
3032 int need_recover = 0;
3033 int need_replace = 0;
3034 struct raid10_info *mirror = &conf->mirrors[i];
3035 struct md_rdev *mrdev, *mreplace;
3036
3037 rcu_read_lock();
3038 mrdev = rcu_dereference(mirror->rdev);
3039 mreplace = rcu_dereference(mirror->replacement);
3040
3041 if (mrdev != NULL &&
3042 !test_bit(Faulty, &mrdev->flags) &&
3043 !test_bit(In_sync, &mrdev->flags))
3044 need_recover = 1;
3045 if (mreplace != NULL &&
3046 !test_bit(Faulty, &mreplace->flags))
3047 need_replace = 1;
3048
3049 if (!need_recover && !need_replace) {
3050 rcu_read_unlock();
3051 continue;
3052 }
3053
3054 still_degraded = 0;
3055
3056 rb2 = r10_bio;
3057 sect = raid10_find_virt(conf, sector_nr, i);
3058 if (sect >= mddev->resync_max_sectors) {
3059
3060
3061
3062 rcu_read_unlock();
3063 continue;
3064 }
3065 if (mreplace && test_bit(Faulty, &mreplace->flags))
3066 mreplace = NULL;
3067
3068
3069
3070
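/*
 * Unless we are doing a full sync or recovering a replacement, we only
 * need to recover this block if it is set in the bitmap.
 */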
3071 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3072 &sync_blocks, 1);
3073 if (sync_blocks < max_sync)
3074 max_sync = sync_blocks;
3075 if (!must_sync &&
3076 mreplace == NULL &&
3077 !conf->fullsync) {
3078
3079
3080
3081 chunks_skipped = -1;
3082 rcu_read_unlock();
3083 continue;
3084 }
3085 atomic_inc(&mrdev->nr_pending);
3086 if (mreplace)
3087 atomic_inc(&mreplace->nr_pending);
3088 rcu_read_unlock();
3089
3090 r10_bio = raid10_alloc_init_r10buf(conf);
3091 r10_bio->state = 0;
3092 raise_barrier(conf, rb2 != NULL);
3093 atomic_set(&r10_bio->remaining, 0);
3094
3095 r10_bio->master_bio = (struct bio*)rb2;
3096 if (rb2)
3097 atomic_inc(&rb2->remaining);
3098 r10_bio->mddev = mddev;
3099 set_bit(R10BIO_IsRecover, &r10_bio->state);
3100 r10_bio->sector = sect;
3101
3102 raid10_find_phys(conf, r10_bio);
3103
3104
3105
3106
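/*
 * Check whether the array will still be degraded after this recovery,
 * so that the bitmap sync is started with the right 'degraded' argument.
 */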
3107 rcu_read_lock();
3108 for (j = 0; j < conf->geo.raid_disks; j++) {
3109 struct md_rdev *rdev = rcu_dereference(
3110 conf->mirrors[j].rdev);
3111 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3112 still_degraded = 1;
3113 break;
3114 }
3115 }
3116
3117 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3118 &sync_blocks, still_degraded);
3119
3120 any_working = 0;
3121 for (j = 0; j < conf->copies; j++) {
3122 int k;
3123 int d = r10_bio->devs[j].devnum;
3124 sector_t from_addr, to_addr;
3125 struct md_rdev *rdev =
3126 rcu_dereference(conf->mirrors[d].rdev);
3127 sector_t sector, first_bad;
3128 int bad_sectors;
3129 if (!rdev ||
3130 !test_bit(In_sync, &rdev->flags))
3131 continue;
3132
3133 any_working = 1;
3134 sector = r10_bio->devs[j].addr;
3135
3136 if (is_badblock(rdev, sector, max_sync,
3137 &first_bad, &bad_sectors)) {
3138 if (first_bad > sector)
3139 max_sync = first_bad - sector;
3140 else {
3141 bad_sectors -= (sector
3142 - first_bad);
3143 if (max_sync > bad_sectors)
3144 max_sync = bad_sectors;
3145 continue;
3146 }
3147 }
3148 bio = r10_bio->devs[0].bio;
3149 bio->bi_next = biolist;
3150 biolist = bio;
3151 bio->bi_end_io = end_sync_read;
3152 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3153 if (test_bit(FailFast, &rdev->flags))
3154 bio->bi_opf |= MD_FAILFAST;
3155 from_addr = r10_bio->devs[j].addr;
3156 bio->bi_iter.bi_sector = from_addr +
3157 rdev->data_offset;
3158 bio_set_dev(bio, rdev->bdev);
3159 atomic_inc(&rdev->nr_pending);
3160
3161
3162 for (k = 0; k < conf->copies; k++)
3163 if (r10_bio->devs[k].devnum == i)
3164 break;
3165 BUG_ON(k == conf->copies);
3166 to_addr = r10_bio->devs[k].addr;
3167 r10_bio->devs[0].devnum = d;
3168 r10_bio->devs[0].addr = from_addr;
3169 r10_bio->devs[1].devnum = i;
3170 r10_bio->devs[1].addr = to_addr;
3171
3172 if (need_recover) {
3173 bio = r10_bio->devs[1].bio;
3174 bio->bi_next = biolist;
3175 biolist = bio;
3176 bio->bi_end_io = end_sync_write;
3177 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3178 bio->bi_iter.bi_sector = to_addr
3179 + mrdev->data_offset;
3180 bio_set_dev(bio, mrdev->bdev);
3181 atomic_inc(&r10_bio->remaining);
3182 } else
3183 r10_bio->devs[1].bio->bi_end_io = NULL;
3184
3185
3186 bio = r10_bio->devs[1].repl_bio;
3187 if (bio)
3188 bio->bi_end_io = NULL;
3189
3190
3191
3192
3193 if (!need_replace)
3194 break;
3195 bio->bi_next = biolist;
3196 biolist = bio;
3197 bio->bi_end_io = end_sync_write;
3198 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3199 bio->bi_iter.bi_sector = to_addr +
3200 mreplace->data_offset;
3201 bio_set_dev(bio, mreplace->bdev);
3202 atomic_inc(&r10_bio->remaining);
3203 break;
3204 }
3205 rcu_read_unlock();
3206 if (j == conf->copies) {
3207
3208
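/*
 * No readable source was found for this block: record bad blocks on the
 * devices being rebuilt, or give up on recovery altogether.
 */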
3209 if (any_working) {
3210
3211
3212
3213 int k;
3214 for (k = 0; k < conf->copies; k++)
3215 if (r10_bio->devs[k].devnum == i)
3216 break;
3217 if (!test_bit(In_sync,
3218 &mrdev->flags)
3219 && !rdev_set_badblocks(
3220 mrdev,
3221 r10_bio->devs[k].addr,
3222 max_sync, 0))
3223 any_working = 0;
3224 if (mreplace &&
3225 !rdev_set_badblocks(
3226 mreplace,
3227 r10_bio->devs[k].addr,
3228 max_sync, 0))
3229 any_working = 0;
3230 }
3231 if (!any_working) {
3232 if (!test_and_set_bit(MD_RECOVERY_INTR,
3233 &mddev->recovery))
3234 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3235 mdname(mddev));
3236 mirror->recovery_disabled
3237 = mddev->recovery_disabled;
3238 }
3239 put_buf(r10_bio);
3240 if (rb2)
3241 atomic_dec(&rb2->remaining);
3242 r10_bio = rb2;
3243 rdev_dec_pending(mrdev, mddev);
3244 if (mreplace)
3245 rdev_dec_pending(mreplace, mddev);
3246 break;
3247 }
3248 rdev_dec_pending(mrdev, mddev);
3249 if (mreplace)
3250 rdev_dec_pending(mreplace, mddev);
3251 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
3252
3253
3254
3255
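/*
 * Only keep MD_FAILFAST on the read if there is somewhere else to read
 * from, i.e. more than one in-sync copy.
 */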
3256 int targets = 1;
3257 for (; j < conf->copies; j++) {
3258 int d = r10_bio->devs[j].devnum;
3259 if (conf->mirrors[d].rdev &&
3260 test_bit(In_sync,
3261 &conf->mirrors[d].rdev->flags))
3262 targets++;
3263 }
3264 if (targets == 1)
3265 r10_bio->devs[0].bio->bi_opf
3266 &= ~MD_FAILFAST;
3267 }
3268 }
3269 if (biolist == NULL) {
3270 while (r10_bio) {
3271 struct r10bio *rb2 = r10_bio;
3272 r10_bio = (struct r10bio*) rb2->master_bio;
3273 rb2->master_bio = NULL;
3274 put_buf(rb2);
3275 }
3276 goto giveup;
3277 }
3278 } else {
3279
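/* resync: schedule a read from every in-sync copy of this block */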
3280 int count = 0;
3281
3282
3283
3284
3285
3286
3287
3288
3289 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
3290 mddev_is_clustered(mddev) &&
3291 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
3292
3293 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
3294 &sync_blocks, mddev->degraded) &&
3295 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3296 &mddev->recovery)) {
3297
3298 *skipped = 1;
3299 return sync_blocks + sectors_skipped;
3300 }
3301 if (sync_blocks < max_sync)
3302 max_sync = sync_blocks;
3303 r10_bio = raid10_alloc_init_r10buf(conf);
3304 r10_bio->state = 0;
3305
3306 r10_bio->mddev = mddev;
3307 atomic_set(&r10_bio->remaining, 0);
3308 raise_barrier(conf, 0);
3309 conf->next_resync = sector_nr;
3310
3311 r10_bio->master_bio = NULL;
3312 r10_bio->sector = sector_nr;
3313 set_bit(R10BIO_IsSync, &r10_bio->state);
3314 raid10_find_phys(conf, r10_bio);
3315 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3316
3317 for (i = 0; i < conf->copies; i++) {
3318 int d = r10_bio->devs[i].devnum;
3319 sector_t first_bad, sector;
3320 int bad_sectors;
3321 struct md_rdev *rdev;
3322
3323 if (r10_bio->devs[i].repl_bio)
3324 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3325
3326 bio = r10_bio->devs[i].bio;
3327 bio->bi_status = BLK_STS_IOERR;
3328 rcu_read_lock();
3329 rdev = rcu_dereference(conf->mirrors[d].rdev);
3330 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3331 rcu_read_unlock();
3332 continue;
3333 }
3334 sector = r10_bio->devs[i].addr;
3335 if (is_badblock(rdev, sector, max_sync,
3336 &first_bad, &bad_sectors)) {
3337 if (first_bad > sector)
3338 max_sync = first_bad - sector;
3339 else {
3340 bad_sectors -= (sector - first_bad);
3341 if (max_sync > bad_sectors)
3342 max_sync = bad_sectors;
3343 rcu_read_unlock();
3344 continue;
3345 }
3346 }
3347 atomic_inc(&rdev->nr_pending);
3348 atomic_inc(&r10_bio->remaining);
3349 bio->bi_next = biolist;
3350 biolist = bio;
3351 bio->bi_end_io = end_sync_read;
3352 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3353 if (test_bit(FailFast, &rdev->flags))
3354 bio->bi_opf |= MD_FAILFAST;
3355 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3356 bio_set_dev(bio, rdev->bdev);
3357 count++;
3358
3359 rdev = rcu_dereference(conf->mirrors[d].replacement);
3360 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3361 rcu_read_unlock();
3362 continue;
3363 }
3364 atomic_inc(&rdev->nr_pending);
3365
3366
3367 bio = r10_bio->devs[i].repl_bio;
3368 bio->bi_status = BLK_STS_IOERR;
3369
3370 sector = r10_bio->devs[i].addr;
3371 bio->bi_next = biolist;
3372 biolist = bio;
3373 bio->bi_end_io = end_sync_write;
3374 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3375 if (test_bit(FailFast, &rdev->flags))
3376 bio->bi_opf |= MD_FAILFAST;
3377 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3378 bio_set_dev(bio, rdev->bdev);
3379 count++;
3380 rcu_read_unlock();
3381 }
3382
3383 if (count < 2) {
3384 for (i = 0; i < conf->copies; i++) {
3385 int d = r10_bio->devs[i].devnum;
3386 if (r10_bio->devs[i].bio->bi_end_io)
3387 rdev_dec_pending(conf->mirrors[d].rdev,
3388 mddev);
3389 if (r10_bio->devs[i].repl_bio &&
3390 r10_bio->devs[i].repl_bio->bi_end_io)
3391 rdev_dec_pending(
3392 conf->mirrors[d].replacement,
3393 mddev);
3394 }
3395 put_buf(r10_bio);
3396 biolist = NULL;
3397 goto giveup;
3398 }
3399 }
3400
3401 nr_sectors = 0;
3402 if (sector_nr + max_sync < max_sector)
3403 max_sector = sector_nr + max_sync;
3404 do {
3405 struct page *page;
3406 int len = PAGE_SIZE;
3407 if (sector_nr + (len>>9) > max_sector)
3408 len = (max_sector - sector_nr) << 9;
3409 if (len == 0)
3410 break;
3411 for (bio = biolist; bio; bio = bio->bi_next) {
3412 struct resync_pages *rp = get_resync_pages(bio);
3413 page = resync_fetch_page(rp, page_idx);
3414
3415
3416
3417
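/* cannot fail: each bio was allocated with room for RESYNC_PAGES pages */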
3418 bio_add_page(bio, page, len, 0);
3419 }
3420 nr_sectors += len>>9;
3421 sector_nr += len>>9;
3422 } while (++page_idx < RESYNC_PAGES);
3423 r10_bio->sectors = nr_sectors;
3424
3425 if (mddev_is_clustered(mddev) &&
3426 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3427
3428 if (conf->cluster_sync_high < sector_nr + nr_sectors) {
3429 conf->cluster_sync_low = mddev->curr_resync_completed;
3430 raid10_set_cluster_sync_high(conf);
3431
3432 md_cluster_ops->resync_info_update(mddev,
3433 conf->cluster_sync_low,
3434 conf->cluster_sync_high);
3435 }
3436 } else if (mddev_is_clustered(mddev)) {
3437
3438 sector_t sect_va1, sect_va2;
3439 bool broadcast_msg = false;
3440
3441 for (i = 0; i < conf->geo.raid_disks; i++) {
3442
3443
3444
3445
3446
3447 sect_va1 = raid10_find_virt(conf, sector_nr, i);
3448
3449 if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
3450 broadcast_msg = true;
3451
3452
3453
3454
3455 sect_va2 = raid10_find_virt(conf,
3456 mddev->curr_resync_completed, i);
3457
3458 if (conf->cluster_sync_low == 0 ||
3459 conf->cluster_sync_low > sect_va2)
3460 conf->cluster_sync_low = sect_va2;
3461 }
3462 }
3463 if (broadcast_msg) {
3464 raid10_set_cluster_sync_high(conf);
3465 md_cluster_ops->resync_info_update(mddev,
3466 conf->cluster_sync_low,
3467 conf->cluster_sync_high);
3468 }
3469 }
3470
3471 while (biolist) {
3472 bio = biolist;
3473 biolist = biolist->bi_next;
3474
3475 bio->bi_next = NULL;
3476 r10_bio = get_resync_r10bio(bio);
3477 r10_bio->sectors = nr_sectors;
3478
3479 if (bio->bi_end_io == end_sync_read) {
3480 md_sync_acct_bio(bio, nr_sectors);
3481 bio->bi_status = 0;
3482 submit_bio_noacct(bio);
3483 }
3484 }
3485
3486 if (sectors_skipped)
3487
3488
3489
3490 md_done_sync(mddev, sectors_skipped, 1);
3491
3492 return sectors_skipped + nr_sectors;
3493 giveup:
3494
3495
3496
3497
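/*
 * There is nowhere to write, so every drive must either have a bad block
 * here or be failed/resyncing.  Skip this chunk and try the next one.
 */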
3498 if (sector_nr + max_sync < max_sector)
3499 max_sector = sector_nr + max_sync;
3500
3501 sectors_skipped += (max_sector - sector_nr);
3502 chunks_skipped++;
3503 sector_nr = max_sector;
3504 goto skipped;
3505}
3506
3507static sector_t
3508raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3509{
3510 sector_t size;
3511 struct r10conf *conf = mddev->private;
3512
3513 if (!raid_disks)
3514 raid_disks = min(conf->geo.raid_disks,
3515 conf->prev.raid_disks);
3516 if (!sectors)
3517 sectors = conf->dev_sectors;
3518
3519 size = sectors >> conf->geo.chunk_shift;
3520 sector_div(size, conf->geo.far_copies);
3521 size = size * raid_disks;
3522 sector_div(size, conf->geo.near_copies);
3523
3524 return size << conf->geo.chunk_shift;
3525}
3526
3527static void calc_sectors(struct r10conf *conf, sector_t size)
3528{
3529
3530
3531
3532
3533
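/*
 * Calculate the number of sectors-per-device that will actually be used,
 * and set conf->dev_sectors and conf->geo.stride.
 */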
3534 size = size >> conf->geo.chunk_shift;
3535 sector_div(size, conf->geo.far_copies);
3536 size = size * conf->geo.raid_disks;
3537 sector_div(size, conf->geo.near_copies);
3538
3539
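/* 'size' is now the number of data chunks in the array */
/* convert to 'used chunks per device' */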
3540 size = size * conf->copies;
3541
3542
3543
3544
3545 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3546
3547 conf->dev_sectors = size << conf->geo.chunk_shift;
3548
3549 if (conf->geo.far_offset)
3550 conf->geo.stride = 1 << conf->geo.chunk_shift;
3551 else {
3552 sector_div(size, conf->geo.far_copies);
3553 conf->geo.stride = size << conf->geo.chunk_shift;
3554 }
3555}
3556
3557enum geo_type {geo_new, geo_old, geo_start};
3558static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3559{
3560 int nc, fc, fo;
3561 int layout, chunk, disks;
3562 switch (new) {
3563 case geo_old:
3564 layout = mddev->layout;
3565 chunk = mddev->chunk_sectors;
3566 disks = mddev->raid_disks - mddev->delta_disks;
3567 break;
3568 case geo_new:
3569 layout = mddev->new_layout;
3570 chunk = mddev->new_chunk_sectors;
3571 disks = mddev->raid_disks;
3572 break;
3573 default:
3574 case geo_start:
3575
3576 layout = mddev->new_layout;
3577 chunk = mddev->new_chunk_sectors;
3578 disks = mddev->raid_disks + mddev->delta_disks;
3579 break;
3580 }
3581 if (layout >> 19)
3582 return -1;
3583 if (chunk < (PAGE_SIZE >> 9) ||
3584 !is_power_of_2(chunk))
3585 return -2;
3586 nc = layout & 255;
3587 fc = (layout >> 8) & 255;
3588 fo = layout & (1<<16);
3589 geo->raid_disks = disks;
3590 geo->near_copies = nc;
3591 geo->far_copies = fc;
3592 geo->far_offset = fo;
3593 switch (layout >> 17) {
3594 case 0:
3595 geo->far_set_size = disks;
3596 break;
3597 case 1:
3598
3599 geo->far_set_size = disks/fc;
3600 WARN(geo->far_set_size < fc,
3601 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3602 break;
3603 case 2:
3604 geo->far_set_size = fc * nc;
3605 break;
3606 default:
3607 return -1;
3608 }
3609 geo->chunk_mask = chunk - 1;
3610 geo->chunk_shift = ffz(~chunk);
3611 return nc*fc;
3612}
3613
3614static struct r10conf *setup_conf(struct mddev *mddev)
3615{
3616 struct r10conf *conf = NULL;
3617 int err = -EINVAL;
3618 struct geom geo;
3619 int copies;
3620
3621 copies = setup_geo(&geo, mddev, geo_new);
3622
3623 if (copies == -2) {
3624 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3625 mdname(mddev), PAGE_SIZE);
3626 goto out;
3627 }
3628
3629 if (copies < 2 || copies > mddev->raid_disks) {
3630 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3631 mdname(mddev), mddev->new_layout);
3632 goto out;
3633 }
3634
3635 err = -ENOMEM;
3636 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3637 if (!conf)
3638 goto out;
3639
3640
3641 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
3642 sizeof(struct raid10_info),
3643 GFP_KERNEL);
3644 if (!conf->mirrors)
3645 goto out;
3646
3647 conf->tmppage = alloc_page(GFP_KERNEL);
3648 if (!conf->tmppage)
3649 goto out;
3650
3651 conf->geo = geo;
3652 conf->copies = copies;
3653 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
3654 rbio_pool_free, conf);
3655 if (err)
3656 goto out;
3657
3658 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
3659 if (err)
3660 goto out;
3661
3662 calc_sectors(conf, mddev->dev_sectors);
3663 if (mddev->reshape_position == MaxSector) {
3664 conf->prev = conf->geo;
3665 conf->reshape_progress = MaxSector;
3666 } else {
3667 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3668 err = -EINVAL;
3669 goto out;
3670 }
3671 conf->reshape_progress = mddev->reshape_position;
3672 if (conf->prev.far_offset)
3673 conf->prev.stride = 1 << conf->prev.chunk_shift;
3674 else
3675
3676 conf->prev.stride = conf->dev_sectors;
3677 }
3678 conf->reshape_safe = conf->reshape_progress;
3679 spin_lock_init(&conf->device_lock);
3680 INIT_LIST_HEAD(&conf->retry_list);
3681 INIT_LIST_HEAD(&conf->bio_end_io_list);
3682
3683 spin_lock_init(&conf->resync_lock);
3684 init_waitqueue_head(&conf->wait_barrier);
3685 atomic_set(&conf->nr_pending, 0);
3686
3687 err = -ENOMEM;
3688 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3689 if (!conf->thread)
3690 goto out;
3691
3692 conf->mddev = mddev;
3693 return conf;
3694
3695 out:
3696 if (conf) {
3697 mempool_exit(&conf->r10bio_pool);
3698 kfree(conf->mirrors);
3699 safe_put_page(conf->tmppage);
3700 bioset_exit(&conf->bio_split);
3701 kfree(conf);
3702 }
3703 return ERR_PTR(err);
3704}
3705
3706static void raid10_set_io_opt(struct r10conf *conf)
3707{
3708 int raid_disks = conf->geo.raid_disks;
3709
3710 if (!(conf->geo.raid_disks % conf->geo.near_copies))
3711 raid_disks /= conf->geo.near_copies;
3712 blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
3713 raid_disks);
3714}
3715
3716static int raid10_run(struct mddev *mddev)
3717{
3718 struct r10conf *conf;
3719 int i, disk_idx;
3720 struct raid10_info *disk;
3721 struct md_rdev *rdev;
3722 sector_t size;
3723 sector_t min_offset_diff = 0;
3724 int first = 1;
3725 bool discard_supported = false;
3726
3727 if (mddev_init_writes_pending(mddev) < 0)
3728 return -ENOMEM;
3729
3730 if (mddev->private == NULL) {
3731 conf = setup_conf(mddev);
3732 if (IS_ERR(conf))
3733 return PTR_ERR(conf);
3734 mddev->private = conf;
3735 }
3736 conf = mddev->private;
3737 if (!conf)
3738 goto out;
3739
3740 if (mddev_is_clustered(conf->mddev)) {
3741 int fc, fo;
3742
3743 fc = (mddev->layout >> 8) & 255;
3744 fo = mddev->layout & (1<<16);
3745 if (fc > 1 || fo > 0) {
3746 pr_err("only near layout is supported by clustered raid10\n");
3748 goto out_free_conf;
3749 }
3750 }
3751
3752 mddev->thread = conf->thread;
3753 conf->thread = NULL;
3754
3755 if (mddev->queue) {
3756 blk_queue_max_discard_sectors(mddev->queue,
3757 mddev->chunk_sectors);
3758 blk_queue_max_write_same_sectors(mddev->queue, 0);
3759 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
3760 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
3761 raid10_set_io_opt(conf);
3762 }
3763
3764 rdev_for_each(rdev, mddev) {
3765 long long diff;
3766
3767 disk_idx = rdev->raid_disk;
3768 if (disk_idx < 0)
3769 continue;
3770 if (disk_idx >= conf->geo.raid_disks &&
3771 disk_idx >= conf->prev.raid_disks)
3772 continue;
3773 disk = conf->mirrors + disk_idx;
3774
3775 if (test_bit(Replacement, &rdev->flags)) {
3776 if (disk->replacement)
3777 goto out_free_conf;
3778 disk->replacement = rdev;
3779 } else {
3780 if (disk->rdev)
3781 goto out_free_conf;
3782 disk->rdev = rdev;
3783 }
3784 diff = (rdev->new_data_offset - rdev->data_offset);
3785 if (!mddev->reshape_backwards)
3786 diff = -diff;
3787 if (diff < 0)
3788 diff = 0;
3789 if (first || diff < min_offset_diff)
3790 min_offset_diff = diff;
3791
3792 if (mddev->gendisk)
3793 disk_stack_limits(mddev->gendisk, rdev->bdev,
3794 rdev->data_offset << 9);
3795
3796 disk->head_position = 0;
3797
3798 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3799 discard_supported = true;
3800 first = 0;
3801 }
3802
3803 if (mddev->queue) {
3804 if (discard_supported)
3805 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
3806 mddev->queue);
3807 else
3808 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
3809 mddev->queue);
3810 }
3811
3812 if (!enough(conf, -1)) {
3813 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3814 mdname(mddev));
3815 goto out_free_conf;
3816 }
3817
3818 if (conf->reshape_progress != MaxSector) {
3819
3820 if (conf->geo.far_copies != 1 &&
3821 conf->geo.far_offset == 0)
3822 goto out_free_conf;
3823 if (conf->prev.far_copies != 1 &&
3824 conf->prev.far_offset == 0)
3825 goto out_free_conf;
3826 }
3827
3828 mddev->degraded = 0;
3829 for (i = 0;
3830 i < conf->geo.raid_disks
3831 || i < conf->prev.raid_disks;
3832 i++) {
3833
3834 disk = conf->mirrors + i;
3835
3836 if (!disk->rdev && disk->replacement) {
3837
3838 disk->rdev = disk->replacement;
3839 disk->replacement = NULL;
3840 clear_bit(Replacement, &disk->rdev->flags);
3841 }
3842
3843 if (!disk->rdev ||
3844 !test_bit(In_sync, &disk->rdev->flags)) {
3845 disk->head_position = 0;
3846 mddev->degraded++;
3847 if (disk->rdev &&
3848 disk->rdev->saved_raid_disk < 0)
3849 conf->fullsync = 1;
3850 }
3851
3852 if (disk->replacement &&
3853 !test_bit(In_sync, &disk->replacement->flags) &&
3854 disk->replacement->saved_raid_disk < 0) {
3855 conf->fullsync = 1;
3856 }
3857
3858 disk->recovery_disabled = mddev->recovery_disabled - 1;
3859 }
3860
3861 if (mddev->recovery_cp != MaxSector)
3862 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3863 mdname(mddev));
3864 pr_info("md/raid10:%s: active with %d out of %d devices\n",
3865 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3866 conf->geo.raid_disks);
3867
3868
3869
3870 mddev->dev_sectors = conf->dev_sectors;
3871 size = raid10_size(mddev, 0, 0);
3872 md_set_array_sectors(mddev, size);
3873 mddev->resync_max_sectors = size;
3874 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3875
3876 if (md_integrity_register(mddev))
3877 goto out_free_conf;
3878
3879 if (conf->reshape_progress != MaxSector) {
3880 unsigned long before_length, after_length;
3881
3882 before_length = ((1 << conf->prev.chunk_shift) *
3883 conf->prev.far_copies);
3884 after_length = ((1 << conf->geo.chunk_shift) *
3885 conf->geo.far_copies);
3886
3887 if (max(before_length, after_length) > min_offset_diff) {
3888
3889 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3890 goto out_free_conf;
3891 }
3892 conf->offset_diff = min_offset_diff;
3893
3894 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3895 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3896 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3897 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3898 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3899 "reshape");
3900 if (!mddev->sync_thread)
3901 goto out_free_conf;
3902 }
3903
3904 return 0;
3905
3906out_free_conf:
3907 md_unregister_thread(&mddev->thread);
3908 mempool_exit(&conf->r10bio_pool);
3909 safe_put_page(conf->tmppage);
3910 kfree(conf->mirrors);
3911 kfree(conf);
3912 mddev->private = NULL;
3913out:
3914 return -EIO;
3915}
3916
3917static void raid10_free(struct mddev *mddev, void *priv)
3918{
3919 struct r10conf *conf = priv;
3920
3921 mempool_exit(&conf->r10bio_pool);
3922 safe_put_page(conf->tmppage);
3923 kfree(conf->mirrors);
3924 kfree(conf->mirrors_old);
3925 kfree(conf->mirrors_new);
3926 bioset_exit(&conf->bio_split);
3927 kfree(conf);
3928}
3929
3930static void raid10_quiesce(struct mddev *mddev, int quiesce)
3931{
3932 struct r10conf *conf = mddev->private;
3933
3934 if (quiesce)
3935 raise_barrier(conf, 0);
3936 else
3937 lower_barrier(conf);
3938}
3939
3940static int raid10_resize(struct mddev *mddev, sector_t sectors)
3941{
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
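/*
 * Resizing 'far' layouts (without far_offset) is not supported.  For
 * 'near' and 'offset' layouts the used size must be a suitable multiple
 * of the chunk size; raid10_size() and calc_sectors() do that rounding.
 */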
3954 struct r10conf *conf = mddev->private;
3955 sector_t oldsize, size;
3956
3957 if (mddev->reshape_position != MaxSector)
3958 return -EBUSY;
3959
3960 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3961 return -EINVAL;
3962
3963 oldsize = raid10_size(mddev, 0, 0);
3964 size = raid10_size(mddev, sectors, 0);
3965 if (mddev->external_size &&
3966 mddev->array_sectors > size)
3967 return -EINVAL;
3968 if (mddev->bitmap) {
3969 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
3970 if (ret)
3971 return ret;
3972 }
3973 md_set_array_sectors(mddev, size);
3974 if (sectors > mddev->dev_sectors &&
3975 mddev->recovery_cp > oldsize) {
3976 mddev->recovery_cp = oldsize;
3977 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3978 }
3979 calc_sectors(conf, sectors);
3980 mddev->dev_sectors = conf->dev_sectors;
3981 mddev->resync_max_sectors = size;
3982 return 0;
3983}
3984
3985static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
3986{
3987 struct md_rdev *rdev;
3988 struct r10conf *conf;
3989
3990 if (mddev->degraded > 0) {
3991 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
3992 mdname(mddev));
3993 return ERR_PTR(-EINVAL);
3994 }
3995 sector_div(size, devs);
3996
3997
3998 mddev->new_level = 10;
3999
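/* new layout: near_copies = 2, far_copies = 1 */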
4000 mddev->new_layout = (1<<8) + 2;
4001 mddev->new_chunk_sectors = mddev->chunk_sectors;
4002 mddev->delta_disks = mddev->raid_disks;
4003 mddev->raid_disks *= 2;
4004
4005 mddev->recovery_cp = MaxSector;
4006 mddev->dev_sectors = size;
4007
4008 conf = setup_conf(mddev);
4009 if (!IS_ERR(conf)) {
4010 rdev_for_each(rdev, mddev)
4011 if (rdev->raid_disk >= 0) {
4012 rdev->new_raid_disk = rdev->raid_disk * 2;
4013 rdev->sectors = size;
4014 }
4015 conf->barrier = 1;
4016 }
4017
4018 return conf;
4019}
4020
4021static void *raid10_takeover(struct mddev *mddev)
4022{
4023 struct r0conf *raid0_conf;
4024
4025
4026
4027
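/*
 * raid10 can take over a raid0 array, provided the raid0 consists of a
 * single zone (i.e. all member devices contribute the same size).
 */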
4028 if (mddev->level == 0) {
4029
4030 raid0_conf = mddev->private;
4031 if (raid0_conf->nr_strip_zones > 1) {
4032 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4033 mdname(mddev));
4034 return ERR_PTR(-EINVAL);
4035 }
4036 return raid10_takeover_raid0(mddev,
4037 raid0_conf->strip_zone->zone_end,
4038 raid0_conf->strip_zone->nb_dev);
4039 }
4040 return ERR_PTR(-EINVAL);
4041}
4042
4043static int raid10_check_reshape(struct mddev *mddev)
4044{
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
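/*
 * Called when there is a request to change layout, chunk size or
 * raid_disks, or to restart an interrupted reshape.
 *
 * Validate the request: the number of copies cannot change, 'far'
 * layouts (without far_offset) cannot be reshaped, and the new chunk
 * size must divide the array size.  If disks are being added, the larger
 * mirrors array is pre-allocated here.
 */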
4059 struct r10conf *conf = mddev->private;
4060 struct geom geo;
4061
4062 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4063 return -EINVAL;
4064
4065 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
4066
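/* cannot change the number of copies */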
4067 return -EINVAL;
4068 if (geo.far_copies > 1 && !geo.far_offset)
4069
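/* cannot switch to a 'far' layout */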
4070 return -EINVAL;
4071
4072 if (mddev->array_sectors & geo.chunk_mask)
4073
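/* new chunk size does not divide the array size */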
4074 return -EINVAL;
4075
4076 if (!enough(conf, -1))
4077 return -EINVAL;
4078
4079 kfree(conf->mirrors_new);
4080 conf->mirrors_new = NULL;
4081 if (mddev->delta_disks > 0) {
4082
4083 conf->mirrors_new =
4084 kcalloc(mddev->raid_disks + mddev->delta_disks,
4085 sizeof(struct raid10_info),
4086 GFP_KERNEL);
4087 if (!conf->mirrors_new)
4088 return -ENOMEM;
4089 }
4090 return 0;
4091}
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
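/*
 * Count the degraded devices.  During a reshape this is computed against
 * both the old ('prev') and the new geometry, and the larger count is
 * returned.
 */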
4106static int calc_degraded(struct r10conf *conf)
4107{
4108 int degraded, degraded2;
4109 int i;
4110
4111 rcu_read_lock();
4112 degraded = 0;
4113
4114 for (i = 0; i < conf->prev.raid_disks; i++) {
4115 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4116 if (!rdev || test_bit(Faulty, &rdev->flags))
4117 degraded++;
4118 else if (!test_bit(In_sync, &rdev->flags))
4119
4120
4121
4122
4123 degraded++;
4124 }
4125 rcu_read_unlock();
4126 if (conf->geo.raid_disks == conf->prev.raid_disks)
4127 return degraded;
4128 rcu_read_lock();
4129 degraded2 = 0;
4130 for (i = 0; i < conf->geo.raid_disks; i++) {
4131 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4132 if (!rdev || test_bit(Faulty, &rdev->flags))
4133 degraded2++;
4134 else if (!test_bit(In_sync, &rdev->flags)) {
4135
4136
4137
4138
4139
4140 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4141 degraded2++;
4142 }
4143 }
4144 rcu_read_unlock();
4145 if (degraded2 > degraded)
4146 return degraded2;
4147 return degraded;
4148}
4149
4150static int raid10_start_reshape(struct mddev *mddev)
4151{
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
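/*
 * A 'reshape' has been requested.  This commits the various 'new' fields
 * and sets MD_RECOVERY_RESHAPE.  It checks that enough spares are
 * available and adds them to the array, and verifies that the old/new
 * data_offset difference on every device is large enough that we never
 * risk overwriting data that has not been copied yet.
 */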
4162 unsigned long before_length, after_length;
4163 sector_t min_offset_diff = 0;
4164 int first = 1;
4165 struct geom new;
4166 struct r10conf *conf = mddev->private;
4167 struct md_rdev *rdev;
4168 int spares = 0;
4169 int ret;
4170
4171 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4172 return -EBUSY;
4173
4174 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4175 return -EINVAL;
4176
4177 before_length = ((1 << conf->prev.chunk_shift) *
4178 conf->prev.far_copies);
4179 after_length = ((1 << conf->geo.chunk_shift) *
4180 conf->geo.far_copies);
4181
4182 rdev_for_each(rdev, mddev) {
4183 if (!test_bit(In_sync, &rdev->flags)
4184 && !test_bit(Faulty, &rdev->flags))
4185 spares++;
4186 if (rdev->raid_disk >= 0) {
4187 long long diff = (rdev->new_data_offset
4188 - rdev->data_offset);
4189 if (!mddev->reshape_backwards)
4190 diff = -diff;
4191 if (diff < 0)
4192 diff = 0;
4193 if (first || diff < min_offset_diff)
4194 min_offset_diff = diff;
4195 first = 0;
4196 }
4197 }
4198
4199 if (max(before_length, after_length) > min_offset_diff)
4200 return -EINVAL;
4201
4202 if (spares < mddev->delta_disks)
4203 return -EINVAL;
4204
4205 conf->offset_diff = min_offset_diff;
4206 spin_lock_irq(&conf->device_lock);
4207 if (conf->mirrors_new) {
4208 memcpy(conf->mirrors_new, conf->mirrors,
4209 sizeof(struct raid10_info)*conf->prev.raid_disks);
4210 smp_mb();
4211 kfree(conf->mirrors_old);
4212 conf->mirrors_old = conf->mirrors;
4213 conf->mirrors = conf->mirrors_new;
4214 conf->mirrors_new = NULL;
4215 }
4216 setup_geo(&conf->geo, mddev, geo_start);
4217 smp_mb();
4218 if (mddev->reshape_backwards) {
4219 sector_t size = raid10_size(mddev, 0, 0);
4220 if (size < mddev->array_sectors) {
4221 spin_unlock_irq(&conf->device_lock);
4222 pr_warn("md/raid10:%s: array size must be reduced before number of disks\n",
4223 mdname(mddev));
4224 return -EINVAL;
4225 }
4226 mddev->resync_max_sectors = size;
4227 conf->reshape_progress = size;
4228 } else
4229 conf->reshape_progress = 0;
4230 conf->reshape_safe = conf->reshape_progress;
4231 spin_unlock_irq(&conf->device_lock);
4232
4233 if (mddev->delta_disks && mddev->bitmap) {
4234 struct mdp_superblock_1 *sb = NULL;
4235 sector_t oldsize, newsize;
4236
4237 oldsize = raid10_size(mddev, 0, 0);
4238 newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
4239
4240 if (!mddev_is_clustered(mddev)) {
4241 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4242 if (ret)
4243 goto abort;
4244 else
4245 goto out;
4246 }
4247
4248 rdev_for_each(rdev, mddev) {
4249 if (rdev->raid_disk > -1 &&
4250 !test_bit(Faulty, &rdev->flags))
4251 sb = page_address(rdev->sb_page);
4252 }
4253
4254
4255
4256
4257
4258
4259 if ((sb && (le32_to_cpu(sb->feature_map) &
4260 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
4261 goto out;
4262
4263 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4264 if (ret)
4265 goto abort;
4266
4267 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
4268 if (ret) {
4269 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
4270 goto abort;
4271 }
4272 }
4273out:
4274 if (mddev->delta_disks > 0) {
4275 rdev_for_each(rdev, mddev)
4276 if (rdev->raid_disk < 0 &&
4277 !test_bit(Faulty, &rdev->flags)) {
4278 if (raid10_add_disk(mddev, rdev) == 0) {
4279 if (rdev->raid_disk >=
4280 conf->prev.raid_disks)
4281 set_bit(In_sync, &rdev->flags);
4282 else
4283 rdev->recovery_offset = 0;
4284
4285
4286 sysfs_link_rdev(mddev, rdev);
4287 }
4288 } else if (rdev->raid_disk >= conf->prev.raid_disks
4289 && !test_bit(Faulty, &rdev->flags)) {
4290
4291 set_bit(In_sync, &rdev->flags);
4292 }
4293 }
4294
4295
4296
4297
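/*
 * When a reshape changes the number of devices, ->degraded is measured
 * against the larger of the old and new counts.
 */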
4298 spin_lock_irq(&conf->device_lock);
4299 mddev->degraded = calc_degraded(conf);
4300 spin_unlock_irq(&conf->device_lock);
4301 mddev->raid_disks = conf->geo.raid_disks;
4302 mddev->reshape_position = conf->reshape_progress;
4303 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4304
4305 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4306 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4307 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4308 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4309 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4310
4311 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4312 "reshape");
4313 if (!mddev->sync_thread) {
4314 ret = -EAGAIN;
4315 goto abort;
4316 }
4317 conf->reshape_checkpoint = jiffies;
4318 md_wakeup_thread(mddev->sync_thread);
4319 md_new_event(mddev);
4320 return 0;
4321
4322abort:
4323 mddev->recovery = 0;
4324 spin_lock_irq(&conf->device_lock);
4325 conf->geo = conf->prev;
4326 mddev->raid_disks = conf->geo.raid_disks;
4327 rdev_for_each(rdev, mddev)
4328 rdev->new_data_offset = rdev->data_offset;
4329 smp_wmb();
4330 conf->reshape_progress = MaxSector;
4331 conf->reshape_safe = MaxSector;
4332 mddev->reshape_position = MaxSector;
4333 spin_unlock_irq(&conf->device_lock);
4334 return ret;
4335}
4336
4337
4338
4339
4340
4341
4342
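/*
 * Calculate the last device-address that could contain any block from
 * the chunk that includes the array-address 's': the returned address is
 * chunk-aligned and past any data in that chunk.  Used for both
 * reshape_progress and reshape_safe.
 */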
4343static sector_t last_dev_address(sector_t s, struct geom *geo)
4344{
4345 s = (s | geo->chunk_mask) + 1;
4346 s >>= geo->chunk_shift;
4347 s *= geo->near_copies;
4348 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4349 s *= geo->far_copies;
4350 s <<= geo->chunk_shift;
4351 return s;
4352}
4353
4354
4355
4356
4357
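/*
 * Calculate the first device-address that could contain any block from
 * the chunk that includes the array-address 's'.
 */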
4358static sector_t first_dev_address(sector_t s, struct geom *geo)
4359{
4360 s >>= geo->chunk_shift;
4361 s *= geo->near_copies;
4362 sector_div(s, geo->raid_disks);
4363 s *= geo->far_copies;
4364 s <<= geo->chunk_shift;
4365 return s;
4366}
4367
4368static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4369 int *skipped)
4370{
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
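/*
 * We copy at most one chunk (the smaller of the old and new chunk sizes,
 * limited to RESYNC_PAGES worth of data) at a time.
 *
 * Before writing to any location that is not yet recorded as 'safe' in
 * the metadata, outstanding reshape requests are flushed and
 * mddev->reshape_position is committed to disk.  'next' is the furthest
 * device address this pass might write to and 'safe' is the earliest
 * device address that might still be read from the old layout;
 * conf->offset_diff provides the allowed slack between them.  When
 * reshaping backwards the comparison is reversed.
 *
 * All bios are prepared before any I/O is issued: the read-in bio is
 * stored in ->master_bio and one write-out bio per copy in
 * ->devs[x].bio / ->devs[x].repl_bio.
 */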
4408 struct r10conf *conf = mddev->private;
4409 struct r10bio *r10_bio;
4410 sector_t next, safe, last;
4411 int max_sectors;
4412 int nr_sectors;
4413 int s;
4414 struct md_rdev *rdev;
4415 int need_flush = 0;
4416 struct bio *blist;
4417 struct bio *bio, *read_bio;
4418 int sectors_done = 0;
4419 struct page **pages;
4420
4421 if (sector_nr == 0) {
4422
4423 if (mddev->reshape_backwards &&
4424 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4425 sector_nr = (raid10_size(mddev, 0, 0)
4426 - conf->reshape_progress);
4427 } else if (!mddev->reshape_backwards &&
4428 conf->reshape_progress > 0)
4429 sector_nr = conf->reshape_progress;
4430 if (sector_nr) {
4431 mddev->curr_resync_completed = sector_nr;
4432 sysfs_notify_dirent_safe(mddev->sysfs_completed);
4433 *skipped = 1;
4434 return sector_nr;
4435 }
4436 }
4437
4438
4439
4440
4441
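/*
 * We don't use sector_nr to track progress, as that does not work well
 * for reshape_backwards; conf->reshape_progress is used instead.
 */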
4442 if (mddev->reshape_backwards) {
4443
4444
4445
4446 next = first_dev_address(conf->reshape_progress - 1,
4447 &conf->geo);
4448
4449
4450
4451
4452 safe = last_dev_address(conf->reshape_safe - 1,
4453 &conf->prev);
4454
4455 if (next + conf->offset_diff < safe)
4456 need_flush = 1;
4457
4458 last = conf->reshape_progress - 1;
4459 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4460 & conf->prev.chunk_mask);
4461 if (sector_nr + RESYNC_SECTORS < last)
4462 sector_nr = last + 1 - RESYNC_SECTORS;
4463 } else {
4464
4465
4466
4467 next = last_dev_address(conf->reshape_progress, &conf->geo);
4468
4469
4470
4471
4472 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4473
4474
4475
4476
4477 if (next > safe + conf->offset_diff)
4478 need_flush = 1;
4479
4480 sector_nr = conf->reshape_progress;
4481 last = sector_nr | (conf->geo.chunk_mask
4482 & conf->prev.chunk_mask);
4483
4484 if (sector_nr + RESYNC_SECTORS <= last)
4485 last = sector_nr + RESYNC_SECTORS - 1;
4486 }
4487
4488 if (need_flush ||
4489 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4490
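/*
 * Need to commit the new reshape position to the metadata (and wait for
 * that write to complete) before going any further.
 */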
4491 wait_barrier(conf);
4492 mddev->reshape_position = conf->reshape_progress;
4493 if (mddev->reshape_backwards)
4494 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4495 - conf->reshape_progress;
4496 else
4497 mddev->curr_resync_completed = conf->reshape_progress;
4498 conf->reshape_checkpoint = jiffies;
4499 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4500 md_wakeup_thread(mddev->thread);
4501 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4502 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4503 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4504 allow_barrier(conf);
4505 return sectors_done;
4506 }
4507 conf->reshape_safe = mddev->reshape_position;
4508 allow_barrier(conf);
4509 }
4510
4511 raise_barrier(conf, 0);
4512read_more:
4513
4514 r10_bio = raid10_alloc_init_r10buf(conf);
4515 r10_bio->state = 0;
4516 raise_barrier(conf, 1);
4517 atomic_set(&r10_bio->remaining, 0);
4518 r10_bio->mddev = mddev;
4519 r10_bio->sector = sector_nr;
4520 set_bit(R10BIO_IsReshape, &r10_bio->state);
4521 r10_bio->sectors = last - sector_nr + 1;
4522 rdev = read_balance(conf, r10_bio, &max_sectors);
4523 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4524
4525 if (!rdev) {
4526
4527
4528
4529
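/* no device from which this chunk can be read: abort the reshape */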
4530 mempool_free(r10_bio, &conf->r10buf_pool);
4531 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4532 return sectors_done;
4533 }
4534
4535 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4536
4537 bio_set_dev(read_bio, rdev->bdev);
4538 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4539 + rdev->data_offset);
4540 read_bio->bi_private = r10_bio;
4541 read_bio->bi_end_io = end_reshape_read;
4542 bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4543 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4544 read_bio->bi_status = 0;
4545 read_bio->bi_vcnt = 0;
4546 read_bio->bi_iter.bi_size = 0;
4547 r10_bio->master_bio = read_bio;
4548 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4549
4550
4551
4552
4553
4554 if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
4555 struct mdp_superblock_1 *sb = NULL;
4556 int sb_reshape_pos = 0;
4557
4558 conf->cluster_sync_low = sector_nr;
4559 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
4560 sb = page_address(rdev->sb_page);
4561 if (sb) {
4562 sb_reshape_pos = le64_to_cpu(sb->reshape_position);
4563
4564
4565
4566
4567
4568 if (sb_reshape_pos < conf->cluster_sync_low)
4569 conf->cluster_sync_low = sb_reshape_pos;
4570 }
4571
4572 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
4573 conf->cluster_sync_high);
4574 }
4575
4576
4577 __raid10_find_phys(&conf->geo, r10_bio);
4578
4579 blist = read_bio;
4580 read_bio->bi_next = NULL;
4581
4582 rcu_read_lock();
4583 for (s = 0; s < conf->copies*2; s++) {
4584 struct bio *b;
4585 int d = r10_bio->devs[s/2].devnum;
4586 struct md_rdev *rdev2;
4587 if (s&1) {
4588 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4589 b = r10_bio->devs[s/2].repl_bio;
4590 } else {
4591 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4592 b = r10_bio->devs[s/2].bio;
4593 }
4594 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4595 continue;
4596
4597 bio_set_dev(b, rdev2->bdev);
4598 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4599 rdev2->new_data_offset;
4600 b->bi_end_io = end_reshape_write;
4601 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4602 b->bi_next = blist;
4603 blist = b;
4604 }
4605
4606
4607
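/* now add as many pages as possible to all of these bios */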
4608 nr_sectors = 0;
4609 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4610 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4611 struct page *page = pages[s / (PAGE_SIZE >> 9)];
4612 int len = (max_sectors - s) << 9;
4613 if (len > PAGE_SIZE)
4614 len = PAGE_SIZE;
4615 for (bio = blist; bio; bio = bio->bi_next) {
4616
4617
4618
4619
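/* cannot fail: each bio has room for RESYNC_PAGES pages */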
4620 bio_add_page(bio, page, len, 0);
4621 }
4622 sector_nr += len >> 9;
4623 nr_sectors += len >> 9;
4624 }
4625 rcu_read_unlock();
4626 r10_bio->sectors = nr_sectors;
4627
4628
4629 md_sync_acct_bio(read_bio, r10_bio->sectors);
4630 atomic_inc(&r10_bio->remaining);
4631 read_bio->bi_next = NULL;
4632 submit_bio_noacct(read_bio);
4633 sectors_done += nr_sectors;
4634 if (sector_nr <= last)
4635 goto read_more;
4636
4637 lower_barrier(conf);
4638
4639
4640
4641
4642 if (mddev->reshape_backwards)
4643 conf->reshape_progress -= sectors_done;
4644 else
4645 conf->reshape_progress += sectors_done;
4646
4647 return sectors_done;
4648}
4649
4650static void end_reshape_request(struct r10bio *r10_bio);
4651static int handle_reshape_read_error(struct mddev *mddev,
4652 struct r10bio *r10_bio);
4653static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4654{
4655
4656
4657
4658
4659
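/*
 * The reshape read has completed.  If it failed, try to rebuild the data
 * with synchronous page reads from the old layout; if even that fails,
 * report the error.  Otherwise schedule the write-out to every copy in
 * the new layout.
 */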
4660 struct r10conf *conf = mddev->private;
4661 int s;
4662
4663 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4664 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4665
4666 md_done_sync(mddev, r10_bio->sectors, 0);
4667 return;
4668 }
4669
4670
4671
4672
4673 atomic_set(&r10_bio->remaining, 1);
4674 for (s = 0; s < conf->copies*2; s++) {
4675 struct bio *b;
4676 int d = r10_bio->devs[s/2].devnum;
4677 struct md_rdev *rdev;
4678 rcu_read_lock();
4679 if (s&1) {
4680 rdev = rcu_dereference(conf->mirrors[d].replacement);
4681 b = r10_bio->devs[s/2].repl_bio;
4682 } else {
4683 rdev = rcu_dereference(conf->mirrors[d].rdev);
4684 b = r10_bio->devs[s/2].bio;
4685 }
4686 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4687 rcu_read_unlock();
4688 continue;
4689 }
4690 atomic_inc(&rdev->nr_pending);
4691 rcu_read_unlock();
4692 md_sync_acct_bio(b, r10_bio->sectors);
4693 atomic_inc(&r10_bio->remaining);
4694 b->bi_next = NULL;
4695 submit_bio_noacct(b);
4696 }
4697 end_reshape_request(r10_bio);
4698}
4699
4700static void end_reshape(struct r10conf *conf)
4701{
4702 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4703 return;
4704
4705 spin_lock_irq(&conf->device_lock);
4706 conf->prev = conf->geo;
4707 md_finish_reshape(conf->mddev);
4708 smp_wmb();
4709 conf->reshape_progress = MaxSector;
4710 conf->reshape_safe = MaxSector;
4711 spin_unlock_irq(&conf->device_lock);
4712
4713 if (conf->mddev->queue)
4714 raid10_set_io_opt(conf);
4715 conf->fullsync = 0;
4716}
4717
4718static void raid10_update_reshape_pos(struct mddev *mddev)
4719{
4720 struct r10conf *conf = mddev->private;
4721 sector_t lo, hi;
4722
4723 md_cluster_ops->resync_info_get(mddev, &lo, &hi);
4724 if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
4725 || mddev->reshape_position == MaxSector)
4726 conf->reshape_progress = mddev->reshape_position;
4727 else
4728 WARN_ON_ONCE(1);
4729}
4730
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio)
{
	/* Use sync reads to get the blocks from somewhere else */
	int sectors = r10_bio->sectors;
	struct r10conf *conf = mddev->private;
	struct r10bio *r10b;
	int slot = 0;
	int idx = 0;
	struct page **pages;

	r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
	if (!r10b) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return -ENOMEM;
	}

	/* reshape IOs share pages from .devs[0].bio */
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;

	r10b->sector = r10_bio->sector;
	__raid10_find_phys(&conf->prev, r10b);

	while (sectors) {
		int s = sectors;
		int success = 0;
		int first_slot = slot;

		if (s > (PAGE_SIZE >> 9))
			s = PAGE_SIZE >> 9;

		rcu_read_lock();
		while (!success) {
			int d = r10b->devs[slot].devnum;
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			sector_t addr;
			if (rdev == NULL ||
			    test_bit(Faulty, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				goto failed;

			addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			success = sync_page_io(rdev,
					       addr,
					       s << 9,
					       pages[idx],
					       REQ_OP_READ, 0, false);
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
			if (success)
				break;
		failed:
			slot++;
			if (slot >= conf->copies)
				slot = 0;
			if (slot == first_slot)
				break;
		}
		rcu_read_unlock();
		if (!success) {
			/* couldn't read this block, must give up */
			set_bit(MD_RECOVERY_INTR,
				&mddev->recovery);
			kfree(r10b);
			return -EIO;
		}
		sectors -= s;
		idx++;
	}
	kfree(r10b);
	return 0;
}

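/*
 * Completion for one of the writes issued by reshape_request_write().
 * Identify which device the bio went to (preferring the replacement if
 * this was a replacement slot), record any error against that device,
 * and drop the pending and remaining references.
 */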
static void end_reshape_write(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	if (!rdev) {
		smp_mb();
		rdev = conf->mirrors[d].rdev;
	}

	if (bio->bi_status) {
		/* FIXME should record badblock */
		md_error(mddev, rdev);
	}

	rdev_dec_pending(rdev, mddev);
	end_reshape_request(r10_bio);
}

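/*
 * Drop one reference on the reshape r10bio; the last reference reports
 * the sectors as done to the sync thread, frees the read bio used as
 * master_bio and returns the buffer to the resync pool.
 */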
static void end_reshape_request(struct r10bio *r10_bio)
{
	if (!atomic_dec_and_test(&r10_bio->remaining))
		return;
	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
	bio_put(r10_bio->master_bio);
	put_buf(r10_bio);
}

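/*
 * Called once the reshape has fully completed.  If the array grew,
 * expose the new capacity for resync; if it shrank, mark the now-unused
 * trailing devices (and their replacements) as no longer In_sync.
 * Finally commit the new layout and chunk size and clear the reshape
 * bookkeeping in the mddev.
 */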
static void raid10_finish_reshape(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		return;

	if (mddev->delta_disks > 0) {
		if (mddev->recovery_cp > mddev->resync_max_sectors) {
			mddev->recovery_cp = mddev->resync_max_sectors;
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		}
		mddev->resync_max_sectors = mddev->array_sectors;
	} else {
		int d;
		rcu_read_lock();
		for (d = conf->geo.raid_disks ;
		     d < conf->geo.raid_disks - mddev->delta_disks;
		     d++) {
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
		}
		rcu_read_unlock();
	}
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
	mddev->reshape_position = MaxSector;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
}

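/*
 * Personality hooks registered with the md core; see struct
 * md_personality in md.h for the meaning of each callback.
 */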
static struct md_personality raid10_personality =
{
	.name		= "raid10",
	.level		= 10,
	.owner		= THIS_MODULE,
	.make_request	= raid10_make_request,
	.run		= raid10_run,
	.free		= raid10_free,
	.status		= raid10_status,
	.error_handler	= raid10_error,
	.hot_add_disk	= raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= raid10_sync_request,
	.quiesce	= raid10_quiesce,
	.size		= raid10_size,
	.resize		= raid10_resize,
	.takeover	= raid10_takeover,
	.check_reshape	= raid10_check_reshape,
	.start_reshape	= raid10_start_reshape,
	.finish_reshape	= raid10_finish_reshape,
	.update_reshape_pos = raid10_update_reshape_pos,
};

static int __init raid_init(void)
{
	return register_md_personality(&raid10_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");

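/*
 * max_queued_requests is defined in raid1-10.c, which is #included above;
 * the module parameter below makes raid10's copy of the limit writable at
 * module load time and through sysfs.
 */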
module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);