/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * RAID-10 support for md.
 */
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"

/*
 * RAID-10 layout: each data block is stored on 'near_copies' adjacent
 * devices and repeated in 'far_copies' positions further along the
 * devices; 'far_offset' keeps the far copies in the adjacent chunk
 * instead of a distant section.  The geometry parameters live in
 * struct geom and are interpreted by __raid10_find_phys() below.
 */

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

#define raid10_log(md, fmt, args...) \
	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)

#include "raid1-10.c"
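
/*
 * For a resync/recovery bio the owning r10bio is reachable through the
 * per-bio resync_pages attached via bio->bi_private.
 */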
static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{
	return get_resync_pages(bio)->raid_bio;
}

static void *r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int size = offsetof(struct r10bio, devs[conf->copies]);

	/* allocate an r10bio with room for conf->copies entries in devs[] */
	return kzalloc(size, gfp_flags);
}

#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW (1024*1024)
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

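/*
 * For a full resync or reshape we allocate one bio per copy so the copies
 * can be read and compared; for recovery two bios (one read, one write)
 * are enough and they share the same data pages.
 */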
static void *r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	struct r10bio *r10_bio;
	struct bio *bio;
	int j;
	int nalloc, nalloc_rp;
	struct resync_pages *rps;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio)
		return NULL;

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/* allocate the resync_pages array once for all bios */
	if (!conf->have_replacement)
		nalloc_rp = nalloc;
	else
		nalloc_rp = nalloc * 2;
	rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
	if (!rps)
		goto out_free_r10bio;

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
		if (!conf->have_replacement)
			continue;
		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].repl_bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them where needed.
	 */
	for (j = 0; j < nalloc; j++) {
		struct bio *rbio = r10_bio->devs[j].repl_bio;
		struct resync_pages *rp, *rp_repl;

		rp = &rps[j];
		if (rbio)
			rp_repl = &rps[nalloc + j];

		bio = r10_bio->devs[j].bio;

		if (!j || test_bit(MD_RECOVERY_SYNC,
				   &conf->mddev->recovery)) {
			if (resync_alloc_pages(rp, gfp_flags))
				goto out_free_pages;
		} else {
			memcpy(rp, &rps[0], sizeof(*rp));
			resync_get_all_pages(rp);
		}

		rp->raid_bio = r10_bio;
		bio->bi_private = rp;
		if (rbio) {
			memcpy(rp_repl, rp, sizeof(*rp));
			rbio->bi_private = rp_repl;
		}
	}

	return r10_bio;

out_free_pages:
	while (--j >= 0)
		resync_free_pages(&rps[j]);

	j = 0;
out_free_bio:
	for ( ; j < nalloc; j++) {
		if (r10_bio->devs[j].bio)
			bio_put(r10_bio->devs[j].bio);
		if (r10_bio->devs[j].repl_bio)
			bio_put(r10_bio->devs[j].repl_bio);
	}
	kfree(rps);
out_free_r10bio:
	rbio_pool_free(r10_bio, conf);
	return NULL;
}

static void r10buf_pool_free(void *__r10_bio, void *data)
{
	struct r10conf *conf = data;
	struct r10bio *r10bio = __r10_bio;
	int j;
	struct resync_pages *rp = NULL;

	for (j = conf->copies; j--; ) {
		struct bio *bio = r10bio->devs[j].bio;

		if (bio) {
			rp = get_resync_pages(bio);
			resync_free_pages(rp);
			bio_put(bio);
		}

		bio = r10bio->devs[j].repl_bio;
		if (bio)
			bio_put(bio);
	}

	/* resync pages array stored in the 1st bio's .bi_private */
	kfree(rp);

	rbio_pool_free(r10bio, conf);
}

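/*
 * Release the bios attached to an r10bio.  The IO_BLOCKED/IO_MADE_GOOD
 * markers used elsewhere in this file are not real bios and are skipped
 * via BIO_SPECIAL().
 */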
static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{
	int i;

	for (i = 0; i < conf->copies; i++) {
		struct bio **bio = &r10_bio->devs[i].bio;
		if (!BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
		bio = &r10_bio->devs[i].repl_bio;
		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
	}
}

static void free_r10bio(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, &conf->r10bio_pool);
}

static void put_buf(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	mempool_free(r10_bio, &conf->r10buf_pool);

	lower_barrier(conf);
}

static void reschedule_retry(struct r10bio *r10_bio)
{
	unsigned long flags;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	conf->nr_queued++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	/* wake up a possibly frozen array */
	wake_up(&conf->wait_barrier);

	md_wakeup_thread(mddev->thread);
}

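/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure status to the
 * upper layer via bio_endio().
 */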
static void raid_end_bio_io(struct r10bio *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;
	struct r10conf *conf = r10_bio->mddev->private;

	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		bio->bi_status = BLK_STS_IOERR;

	bio_endio(bio);
	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	allow_barrier(conf);

	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

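/*
 * Find which copy of the r10bio a completed bio belongs to; returns the
 * disk number and optionally reports the slot and whether it was the
 * replacement device.
 */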
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{
	int slot;
	int repl = 0;

	for (slot = 0; slot < conf->copies; slot++) {
		if (r10_bio->devs[slot].bio == bio)
			break;
		if (r10_bio->devs[slot].repl_bio == bio) {
			repl = 1;
			break;
		}
	}

	BUG_ON(slot == conf->copies);
	update_head_pos(slot, r10_bio);

	if (slotp)
		*slotp = slot;
	if (replp)
		*replp = repl;
	return r10_bio->devs[slot].devnum;
}

static void raid10_end_read_request(struct bio *bio)
{
	int uptodate = !bio->bi_status;
	struct r10bio *r10_bio = bio->bi_private;
	int slot;
	struct md_rdev *rdev;
	struct r10conf *conf = r10_bio->mddev->private;

	slot = r10_bio->read_slot;
	rdev = r10_bio->devs[slot].rdev;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(slot, r10_bio);

	if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that we will
		 * return a good status to the higher levels even if IO on
		 * some other mirrored buffer fails.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	} else {
		/*
		 * If all other devices that store this block have failed,
		 * we want to return the error upwards rather than fail the
		 * last device.  Here we redefine "uptodate" to mean "Don't
		 * want to retry".
		 */
		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
			     rdev->raid_disk))
			uptodate = 1;
	}
	if (uptodate) {
		raid_end_bio_io(r10_bio);
		rdev_dec_pending(rdev, conf->mddev);
	} else {
		/*
		 * oops, read error - keep the refcount on the rdev
		 */
		char b[BDEVNAME_SIZE];
		pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
				   mdname(conf->mddev),
				   bdevname(rdev->bdev, b),
				   (unsigned long long)r10_bio->sector);
		set_bit(R10BIO_ReadError, &r10_bio->state);
		reschedule_retry(r10_bio);
	}
}

static void close_write(struct r10bio *r10_bio)
{
	/* clear the bitmap if all writes complete successfully */
	md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
			   r10_bio->sectors,
			   !test_bit(R10BIO_Degraded, &r10_bio->state),
			   0);
	md_write_end(r10_bio->mddev);
}

static void one_write_done(struct r10bio *r10_bio)
{
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		if (test_bit(R10BIO_WriteError, &r10_bio->state))
			reschedule_retry(r10_bio);
		else {
			close_write(r10_bio);
			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				raid_end_bio_io(r10_bio);
		}
	}
}

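/*
 * Write completion: record write errors and successful writes over known
 * bad blocks, then let one_write_done() decide whether the whole request
 * can be completed or must be handed to raid10d for follow-up.
 */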
static void raid10_end_write_request(struct bio *bio)
{
	struct r10bio *r10_bio = bio->bi_private;
	int dev;
	int dec_rdev = 1;
	struct r10conf *conf = r10_bio->mddev->private;
	int slot, repl;
	struct md_rdev *rdev = NULL;
	struct bio *to_put = NULL;
	bool discard_error;

	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;

	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);

	if (repl)
		rdev = conf->mirrors[dev].replacement;
	if (!rdev) {
		smp_rmb();
		repl = 0;
		rdev = conf->mirrors[dev].rdev;
	}
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (bio->bi_status && !discard_error) {
		if (repl)
			/*
			 * Never record new bad blocks to replacement,
			 * just fail it.
			 */
			md_error(rdev->mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);

			dec_rdev = 0;
			if (test_bit(FailFast, &rdev->flags) &&
			    (bio->bi_opf & MD_FAILFAST)) {
				md_error(rdev->mddev, rdev);
			}

			/*
			 * When the device is faulty, it is not necessary
			 * to handle the write error.
			 */
			if (!test_bit(Faulty, &rdev->flags))
				set_bit(R10BIO_WriteError, &r10_bio->state);
			else {
				r10_bio->devs[slot].bio = NULL;
				to_put = bio;
				dec_rdev = 1;
			}
		}
	} else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that we will
		 * return a good status to the higher levels even if IO on
		 * some other mirrored buffer fails.
		 */
		sector_t first_bad;
		int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty, as we cannot trust such a device
		 * to read the data back correctly.
		 */
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			set_bit(R10BIO_Uptodate, &r10_bio->state);

		/* Maybe we can clear some bad blocks. */
		if (is_badblock(rdev,
				r10_bio->devs[slot].addr,
				r10_bio->sectors,
				&first_bad, &bad_sectors) && !discard_error) {
			bio_put(bio);
			if (repl)
				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
			else
				r10_bio->devs[slot].bio = IO_MADE_GOOD;
			dec_rdev = 0;
			set_bit(R10BIO_MadeGood, &r10_bio->state);
		}
	}

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	one_write_done(r10_bio);
	if (dec_rdev)
		rdev_dec_pending(rdev, conf->mddev);
	if (to_put)
		bio_put(to_put);
}

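/*
 * Map the logical sector of an r10bio to the physical (device, sector)
 * location of every copy, honouring the near/far/offset layouts and the
 * irregular last far-set when raid_disks is not a multiple of
 * far_set_size.
 */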
static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{
	int n, f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;
	int slot = 0;
	int last_far_set_start, last_far_set_size;

	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
	last_far_set_start *= geo->far_set_size;

	last_far_set_size = geo->far_set_size;
	last_far_set_size += (geo->raid_disks % geo->far_set_size);

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> geo->chunk_shift;
	sector = r10bio->sector & geo->chunk_mask;

	chunk *= geo->near_copies;
	stripe = chunk;
	dev = sector_div(stripe, geo->raid_disks);
	if (geo->far_offset)
		stripe *= geo->far_copies;

	sector += stripe << geo->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		int set;
		sector_t s = sector;
		r10bio->devs[slot].devnum = d;
		r10bio->devs[slot].addr = s;
		slot++;

		for (f = 1; f < geo->far_copies; f++) {
			set = d / geo->far_set_size;
			d += geo->near_copies;

			if ((geo->raid_disks % geo->far_set_size) &&
			    (d > last_far_set_start)) {
				d -= last_far_set_start;
				d %= last_far_set_size;
				d += last_far_set_start;
			} else {
				d %= geo->far_set_size;
				d += geo->far_set_size * set;
			}
			s += geo->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		if (dev >= geo->raid_disks) {
			dev = 0;
			sector += (geo->chunk_mask + 1);
		}
	}
}

static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
{
	struct geom *geo = &conf->geo;

	if (conf->reshape_progress != MaxSector &&
	    ((r10bio->sector >= conf->reshape_progress) !=
	     conf->mddev->reshape_backwards)) {
		set_bit(R10BIO_Previous, &r10bio->state);
		geo = &conf->prev;
	} else
		clear_bit(R10BIO_Previous, &r10bio->state);

	__raid10_find_phys(geo, r10bio);
}

static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;
	/*
	 * Never use conf->prev as this is only called during resync
	 * or recovery, so reshape isn't happening.
	 */
	struct geom *geo = &conf->geo;
	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
	int far_set_size = geo->far_set_size;
	int last_far_set_start;

	if (geo->raid_disks % geo->far_set_size) {
		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
		last_far_set_start *= geo->far_set_size;

		if (dev >= last_far_set_start) {
			far_set_size = geo->far_set_size;
			far_set_size += (geo->raid_disks % geo->far_set_size);
			far_set_start = last_far_set_start;
		}
	}

	offset = sector & geo->chunk_mask;
	if (geo->far_offset) {
		int fc;
		chunk = sector >> geo->chunk_shift;
		fc = sector_div(chunk, geo->far_copies);
		dev -= fc * geo->near_copies;
		if (dev < far_set_start)
			dev += far_set_size;
	} else {
		while (sector >= geo->stride) {
			sector -= geo->stride;
			if (dev < (geo->near_copies + far_set_start))
				dev += far_set_size - geo->near_copies;
			else
				dev -= geo->near_copies;
		}
		chunk = sector >> geo->chunk_shift;
	}
	vchunk = chunk * geo->raid_disks + dev;
	sector_div(vchunk, geo->near_copies);
	return (vchunk << geo->chunk_shift) + offset;
}

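/*
 * read_balance() picks the best device from which to read the requested
 * sectors: it skips faulty or not-yet-recovered copies, avoids known bad
 * blocks, and otherwise balances on pending IO (for non-rotational disks)
 * or on head distance.  The rdev returned has nr_pending incremented.
 */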
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot;
	int sectors = r10_bio->sectors;
	int best_good_sectors;
	sector_t new_distance, best_dist;
	struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
	int do_balance;
	int best_dist_slot, best_pending_slot;
	bool has_nonrot_disk = false;
	unsigned int min_pending;
	struct geom *geo = &conf->geo;

	raid10_find_phys(conf, r10_bio);
	rcu_read_lock();
	best_dist_slot = -1;
	min_pending = UINT_MAX;
	best_dist_rdev = NULL;
	best_pending_rdev = NULL;
	best_dist = MaxSector;
	best_good_sectors = 0;
	do_balance = 1;
	clear_bit(R10BIO_FailFast, &r10_bio->state);
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
	if ((conf->mddev->recovery_cp < MaxSector
	     && (this_sector + sectors >= conf->next_resync)) ||
	    (mddev_is_clustered(conf->mddev) &&
	     md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
					    this_sector + sectors)))
		do_balance = 0;

	for (slot = 0; slot < conf->copies ; slot++) {
		sector_t first_bad;
		int bad_sectors;
		sector_t dev_sector;
		unsigned int pending;
		bool nonrot;

		if (r10_bio->devs[slot].bio == IO_BLOCKED)
			continue;
		disk = r10_bio->devs[slot].devnum;
		rdev = rcu_dereference(conf->mirrors[disk].replacement);
		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			rdev = rcu_dereference(conf->mirrors[disk].rdev);
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags))
			continue;
		if (!test_bit(In_sync, &rdev->flags) &&
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			continue;

		dev_sector = r10_bio->devs[slot].addr;
		if (is_badblock(rdev, dev_sector, sectors,
				&first_bad, &bad_sectors)) {
			if (best_dist < MaxSector)
				/* Already have a better slot */
				continue;
			if (first_bad <= dev_sector) {
				/*
				 * Cannot read here.  If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
				bad_sectors -= (dev_sector - first_bad);
				if (!do_balance && sectors > bad_sectors)
					sectors = bad_sectors;
				if (best_good_sectors > sectors)
					best_good_sectors = sectors;
			} else {
				sector_t good_sectors =
					first_bad - dev_sector;
				if (good_sectors > best_good_sectors) {
					best_good_sectors = good_sectors;
					best_dist_slot = slot;
					best_dist_rdev = rdev;
				}
				if (!do_balance)
					/* Must read from here */
					break;
			}
			continue;
		} else
			best_good_sectors = sectors;

		if (!do_balance)
			break;

		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
		has_nonrot_disk |= nonrot;
		pending = atomic_read(&rdev->nr_pending);
		if (min_pending > pending && nonrot) {
			min_pending = pending;
			best_pending_slot = slot;
			best_pending_rdev = rdev;
		}

		if (best_dist_slot >= 0)
			/* At least 2 disks to choose from so failfast is OK */
			set_bit(R10BIO_FailFast, &r10_bio->state);
		/*
		 * This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  See also
		 * comment in raid1.c
		 */
		if (geo->near_copies > 1 && !pending)
			new_distance = 0;

		/* for far > 1 always use the lowest address */
		else if (geo->far_copies > 1)
			new_distance = r10_bio->devs[slot].addr;
		else
			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);

		if (new_distance < best_dist) {
			best_dist = new_distance;
			best_dist_slot = slot;
			best_dist_rdev = rdev;
		}
	}
	if (slot >= conf->copies) {
		if (has_nonrot_disk) {
			slot = best_pending_slot;
			rdev = best_pending_rdev;
		} else {
			slot = best_dist_slot;
			rdev = best_dist_rdev;
		}
	}

	if (slot >= 0) {
		atomic_inc(&rdev->nr_pending);
		r10_bio->read_slot = slot;
	} else
		rdev = NULL;
	rcu_read_unlock();
	*max_sectors = best_good_sectors;

	return rdev;
}

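/*
 * flush_pending_writes() drains conf->pending_bio_list: the bitmap is
 * unplugged first so bitmap bits reach disk before the data writes they
 * cover, and each queued bio is then issued (or failed if its rdev has
 * gone Faulty).
 */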
static void flush_pending_writes(struct r10conf *conf)
{
	/*
	 * Any writes that have been queued but are awaiting
	 * submission to the underlying device must be flushed.
	 */
	spin_lock_irq(&conf->device_lock);

	if (conf->pending_bio_list.head) {
		struct blk_plug plug;
		struct bio *bio;

		bio = bio_list_get(&conf->pending_bio_list);
		conf->pending_count = 0;
		spin_unlock_irq(&conf->device_lock);

		/*
		 * As this is called in a wait_event() loop (see freeze_array),
		 * current->state might be TASK_UNINTERRUPTIBLE which will
		 * cause a warning when we prepare to wait again.  As it is
		 * rare that this path is taken, it is perfectly safe to force
		 * us to go around the wait_event() loop again, so the warning
		 * is a false-positive.  Silence the warning by resetting
		 * thread state.
		 */
		__set_current_state(TASK_RUNNING);

		blk_start_plug(&plug);
		/*
		 * Flush any pending bitmap writes to disk
		 * before proceeding w/ I/O.
		 */
		md_bitmap_unplug(conf->mddev->bitmap);
		wake_up(&conf->wait_barrier);

		while (bio) {
			struct bio *next = bio->bi_next;
			struct md_rdev *rdev = (void *)bio->bi_disk;
			bio->bi_next = NULL;
			bio_set_dev(bio, rdev->bdev);
			if (test_bit(Faulty, &rdev->flags)) {
				bio_io_error(bio);
			} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
					    !blk_queue_discard(bio->bi_disk->queue)))
				/* Just ignore it */
				bio_endio(bio);
			else
				submit_bio_noacct(bio);
			bio = next;
		}
		blk_finish_plug(&plug);
	} else
		spin_unlock_irq(&conf->device_lock);
}

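/*
 * Barrier handling: regular IO increments conf->nr_pending under
 * wait_barrier()/allow_barrier(), while resync/recovery raises
 * conf->barrier with raise_barrier()/lower_barrier().  The two sides
 * exclude each other on conf->wait_barrier, and freeze_array() uses the
 * same mechanism to quiesce all IO while the array is reconfigured or
 * errors are handled.
 */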
static void raise_barrier(struct r10conf *conf, int force)
{
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock);

	/* block any new IO from starting */
	conf->barrier++;

	/* Now wait for all pending IO to complete */
	wait_event_lock_irq(conf->wait_barrier,
			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock);

	spin_unlock_irq(&conf->resync_lock);
}

static void lower_barrier(struct r10conf *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->barrier--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

static void wait_barrier(struct r10conf *conf)
{
	spin_lock_irq(&conf->resync_lock);
	if (conf->barrier) {
		struct bio_list *bio_list = current->bio_list;
		conf->nr_waiting++;
		/*
		 * Wait for the barrier to drop.
		 * However if there are already pending requests (preventing
		 * the barrier from rising), and there are pending requests
		 * in the current process's bio queue, then don't wait, as we
		 * need to empty that queue to get the nr_pending count down.
		 */
		raid10_log(conf->mddev, "wait barrier");
		wait_event_lock_irq(conf->wait_barrier,
				    !conf->barrier ||
				    (atomic_read(&conf->nr_pending) &&
				     bio_list &&
				     (!bio_list_empty(&bio_list[0]) ||
				      !bio_list_empty(&bio_list[1]))) ||
				     /* move on if recovery thread is
				      * blocked by us
				      */
				     (conf->mddev->thread->tsk == current &&
				      test_bit(MD_RECOVERY_RUNNING,
					       &conf->mddev->recovery) &&
				      conf->nr_queued > 0),
				    conf->resync_lock);
		conf->nr_waiting--;
		if (!conf->nr_waiting)
			wake_up(&conf->wait_barrier);
	}
	atomic_inc(&conf->nr_pending);
	spin_unlock_irq(&conf->resync_lock);
}

static void allow_barrier(struct r10conf *conf)
{
	if ((atomic_dec_and_test(&conf->nr_pending)) ||
	    (conf->array_freeze_pending))
		wake_up(&conf->wait_barrier);
}

static void freeze_array(struct r10conf *conf, int extra)
{
	/*
	 * Stop syncio and normal IO and wait for everything to go quiet.
	 * We increment barrier and nr_waiting, and then wait until
	 * nr_pending matches nr_queued+extra.  This is called in the
	 * context of one normal IO request that has failed, so any sync
	 * request that might be pending will be blocked by nr_pending,
	 * and we need to wait for pending IO requests to complete or be
	 * queued for re-try.  Thus the number queued (nr_queued) plus
	 * this request (extra) must match the number of pending IOs
	 * (nr_pending) before we continue.
	 */
	spin_lock_irq(&conf->resync_lock);
	conf->array_freeze_pending++;
	conf->barrier++;
	conf->nr_waiting++;
	wait_event_lock_irq_cmd(conf->wait_barrier,
				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
				conf->resync_lock,
				flush_pending_writes(conf));

	conf->array_freeze_pending--;
	spin_unlock_irq(&conf->resync_lock);
}

static void unfreeze_array(struct r10conf *conf)
{
	/* reverse the effect of the freeze */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier--;
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	spin_unlock_irq(&conf->resync_lock);
}

1036static sector_t choose_data_offset(struct r10bio *r10_bio,
1037 struct md_rdev *rdev)
1038{
1039 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1040 test_bit(R10BIO_Previous, &r10_bio->state))
1041 return rdev->data_offset;
1042 else
1043 return rdev->new_data_offset;
1044}
1045
1046struct raid10_plug_cb {
1047 struct blk_plug_cb cb;
1048 struct bio_list pending;
1049 int pending_cnt;
1050};
1051
1052static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1053{
1054 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1055 cb);
1056 struct mddev *mddev = plug->cb.data;
1057 struct r10conf *conf = mddev->private;
1058 struct bio *bio;
1059
1060 if (from_schedule || current->bio_list) {
1061 spin_lock_irq(&conf->device_lock);
1062 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1063 conf->pending_count += plug->pending_cnt;
1064 spin_unlock_irq(&conf->device_lock);
1065 wake_up(&conf->wait_barrier);
1066 md_wakeup_thread(mddev->thread);
1067 kfree(plug);
1068 return;
1069 }
1070
1071
1072 bio = bio_list_get(&plug->pending);
1073 md_bitmap_unplug(mddev->bitmap);
1074 wake_up(&conf->wait_barrier);
1075
1076 while (bio) {
1077 struct bio *next = bio->bi_next;
1078 struct md_rdev *rdev = (void*)bio->bi_disk;
1079 bio->bi_next = NULL;
1080 bio_set_dev(bio, rdev->bdev);
1081 if (test_bit(Faulty, &rdev->flags)) {
1082 bio_io_error(bio);
1083 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1084 !blk_queue_discard(bio->bi_disk->queue)))
1085
1086 bio_endio(bio);
1087 else
1088 submit_bio_noacct(bio);
1089 bio = next;
1090 }
1091 kfree(plug);
1092}
1093
1094
1095
1096
1097
1098
1099
1100static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
1101 struct bio *bio, sector_t sectors)
1102{
1103 wait_barrier(conf);
1104 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1105 bio->bi_iter.bi_sector < conf->reshape_progress &&
1106 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1107 raid10_log(conf->mddev, "wait reshape");
1108 allow_barrier(conf);
1109 wait_event(conf->wait_barrier,
1110 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1111 conf->reshape_progress >= bio->bi_iter.bi_sector +
1112 sectors);
1113 wait_barrier(conf);
1114 }
1115}
1116
1117static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1118 struct r10bio *r10_bio)
1119{
1120 struct r10conf *conf = mddev->private;
1121 struct bio *read_bio;
1122 const int op = bio_op(bio);
1123 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1124 int max_sectors;
1125 struct md_rdev *rdev;
1126 char b[BDEVNAME_SIZE];
1127 int slot = r10_bio->read_slot;
1128 struct md_rdev *err_rdev = NULL;
1129 gfp_t gfp = GFP_NOIO;
1130
1131 if (r10_bio->devs[slot].rdev) {
1132
1133
1134
1135
1136
1137
1138
1139 int disk;
1140
1141
1142
1143
1144 gfp = GFP_NOIO | __GFP_HIGH;
1145
1146 rcu_read_lock();
1147 disk = r10_bio->devs[slot].devnum;
1148 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1149 if (err_rdev)
1150 bdevname(err_rdev->bdev, b);
1151 else {
1152 strcpy(b, "???");
1153
1154 err_rdev = r10_bio->devs[slot].rdev;
1155 }
1156 rcu_read_unlock();
1157 }
1158
1159 regular_request_wait(mddev, conf, bio, r10_bio->sectors);
1160 rdev = read_balance(conf, r10_bio, &max_sectors);
1161 if (!rdev) {
1162 if (err_rdev) {
1163 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1164 mdname(mddev), b,
1165 (unsigned long long)r10_bio->sector);
1166 }
1167 raid_end_bio_io(r10_bio);
1168 return;
1169 }
1170 if (err_rdev)
1171 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1172 mdname(mddev),
1173 bdevname(rdev->bdev, b),
1174 (unsigned long long)r10_bio->sector);
1175 if (max_sectors < bio_sectors(bio)) {
1176 struct bio *split = bio_split(bio, max_sectors,
1177 gfp, &conf->bio_split);
1178 bio_chain(split, bio);
1179 allow_barrier(conf);
1180 submit_bio_noacct(bio);
1181 wait_barrier(conf);
1182 bio = split;
1183 r10_bio->master_bio = bio;
1184 r10_bio->sectors = max_sectors;
1185 }
1186 slot = r10_bio->read_slot;
1187
1188 read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
1189
1190 r10_bio->devs[slot].bio = read_bio;
1191 r10_bio->devs[slot].rdev = rdev;
1192
1193 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1194 choose_data_offset(r10_bio, rdev);
1195 bio_set_dev(read_bio, rdev->bdev);
1196 read_bio->bi_end_io = raid10_end_read_request;
1197 bio_set_op_attrs(read_bio, op, do_sync);
1198 if (test_bit(FailFast, &rdev->flags) &&
1199 test_bit(R10BIO_FailFast, &r10_bio->state))
1200 read_bio->bi_opf |= MD_FAILFAST;
1201 read_bio->bi_private = r10_bio;
1202
1203 if (mddev->gendisk)
1204 trace_block_bio_remap(read_bio->bi_disk->queue,
1205 read_bio, disk_devt(mddev->gendisk),
1206 r10_bio->sector);
1207 submit_bio_noacct(read_bio);
1208 return;
1209}
1210
1211static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1212 struct bio *bio, bool replacement,
1213 int n_copy)
1214{
1215 const int op = bio_op(bio);
1216 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1217 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1218 unsigned long flags;
1219 struct blk_plug_cb *cb;
1220 struct raid10_plug_cb *plug = NULL;
1221 struct r10conf *conf = mddev->private;
1222 struct md_rdev *rdev;
1223 int devnum = r10_bio->devs[n_copy].devnum;
1224 struct bio *mbio;
1225
1226 if (replacement) {
1227 rdev = conf->mirrors[devnum].replacement;
1228 if (rdev == NULL) {
1229
1230 smp_mb();
1231 rdev = conf->mirrors[devnum].rdev;
1232 }
1233 } else
1234 rdev = conf->mirrors[devnum].rdev;
1235
1236 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1237 if (replacement)
1238 r10_bio->devs[n_copy].repl_bio = mbio;
1239 else
1240 r10_bio->devs[n_copy].bio = mbio;
1241
1242 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1243 choose_data_offset(r10_bio, rdev));
1244 bio_set_dev(mbio, rdev->bdev);
1245 mbio->bi_end_io = raid10_end_write_request;
1246 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1247 if (!replacement && test_bit(FailFast,
1248 &conf->mirrors[devnum].rdev->flags)
1249 && enough(conf, devnum))
1250 mbio->bi_opf |= MD_FAILFAST;
1251 mbio->bi_private = r10_bio;
1252
1253 if (conf->mddev->gendisk)
1254 trace_block_bio_remap(mbio->bi_disk->queue,
1255 mbio, disk_devt(conf->mddev->gendisk),
1256 r10_bio->sector);
1257
1258 mbio->bi_disk = (void *)rdev;
1259
1260 atomic_inc(&r10_bio->remaining);
1261
1262 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1263 if (cb)
1264 plug = container_of(cb, struct raid10_plug_cb, cb);
1265 else
1266 plug = NULL;
1267 if (plug) {
1268 bio_list_add(&plug->pending, mbio);
1269 plug->pending_cnt++;
1270 } else {
1271 spin_lock_irqsave(&conf->device_lock, flags);
1272 bio_list_add(&conf->pending_bio_list, mbio);
1273 conf->pending_count++;
1274 spin_unlock_irqrestore(&conf->device_lock, flags);
1275 md_wakeup_thread(mddev->thread);
1276 }
1277}
1278
1279static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1280 struct r10bio *r10_bio)
1281{
1282 struct r10conf *conf = mddev->private;
1283 int i;
1284 struct md_rdev *blocked_rdev;
1285 sector_t sectors;
1286 int max_sectors;
1287
1288 if ((mddev_is_clustered(mddev) &&
1289 md_cluster_ops->area_resyncing(mddev, WRITE,
1290 bio->bi_iter.bi_sector,
1291 bio_end_sector(bio)))) {
1292 DEFINE_WAIT(w);
1293 for (;;) {
1294 prepare_to_wait(&conf->wait_barrier,
1295 &w, TASK_IDLE);
1296 if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1297 bio->bi_iter.bi_sector, bio_end_sector(bio)))
1298 break;
1299 schedule();
1300 }
1301 finish_wait(&conf->wait_barrier, &w);
1302 }
1303
1304 sectors = r10_bio->sectors;
1305 regular_request_wait(mddev, conf, bio, sectors);
1306 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1307 (mddev->reshape_backwards
1308 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1309 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1310 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1311 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1312
1313 mddev->reshape_position = conf->reshape_progress;
1314 set_mask_bits(&mddev->sb_flags, 0,
1315 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1316 md_wakeup_thread(mddev->thread);
1317 raid10_log(conf->mddev, "wait reshape metadata");
1318 wait_event(mddev->sb_wait,
1319 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1320
1321 conf->reshape_safe = mddev->reshape_position;
1322 }
1323
1324 if (conf->pending_count >= max_queued_requests) {
1325 md_wakeup_thread(mddev->thread);
1326 raid10_log(mddev, "wait queued");
1327 wait_event(conf->wait_barrier,
1328 conf->pending_count < max_queued_requests);
1329 }
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340 r10_bio->read_slot = -1;
1341 raid10_find_phys(conf, r10_bio);
1342retry_write:
1343 blocked_rdev = NULL;
1344 rcu_read_lock();
1345 max_sectors = r10_bio->sectors;
1346
1347 for (i = 0; i < conf->copies; i++) {
1348 int d = r10_bio->devs[i].devnum;
1349 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1350 struct md_rdev *rrdev = rcu_dereference(
1351 conf->mirrors[d].replacement);
1352 if (rdev == rrdev)
1353 rrdev = NULL;
1354 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1355 atomic_inc(&rdev->nr_pending);
1356 blocked_rdev = rdev;
1357 break;
1358 }
1359 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1360 atomic_inc(&rrdev->nr_pending);
1361 blocked_rdev = rrdev;
1362 break;
1363 }
1364 if (rdev && (test_bit(Faulty, &rdev->flags)))
1365 rdev = NULL;
1366 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1367 rrdev = NULL;
1368
1369 r10_bio->devs[i].bio = NULL;
1370 r10_bio->devs[i].repl_bio = NULL;
1371
1372 if (!rdev && !rrdev) {
1373 set_bit(R10BIO_Degraded, &r10_bio->state);
1374 continue;
1375 }
1376 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1377 sector_t first_bad;
1378 sector_t dev_sector = r10_bio->devs[i].addr;
1379 int bad_sectors;
1380 int is_bad;
1381
1382 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1383 &first_bad, &bad_sectors);
1384 if (is_bad < 0) {
1385
1386
1387
1388 atomic_inc(&rdev->nr_pending);
1389 set_bit(BlockedBadBlocks, &rdev->flags);
1390 blocked_rdev = rdev;
1391 break;
1392 }
1393 if (is_bad && first_bad <= dev_sector) {
1394
1395 bad_sectors -= (dev_sector - first_bad);
1396 if (bad_sectors < max_sectors)
1397
1398
1399
1400 max_sectors = bad_sectors;
1401
1402
1403
1404
1405
1406
1407
1408
1409 continue;
1410 }
1411 if (is_bad) {
1412 int good_sectors = first_bad - dev_sector;
1413 if (good_sectors < max_sectors)
1414 max_sectors = good_sectors;
1415 }
1416 }
1417 if (rdev) {
1418 r10_bio->devs[i].bio = bio;
1419 atomic_inc(&rdev->nr_pending);
1420 }
1421 if (rrdev) {
1422 r10_bio->devs[i].repl_bio = bio;
1423 atomic_inc(&rrdev->nr_pending);
1424 }
1425 }
1426 rcu_read_unlock();
1427
1428 if (unlikely(blocked_rdev)) {
1429
1430 int j;
1431 int d;
1432
1433 for (j = 0; j < i; j++) {
1434 if (r10_bio->devs[j].bio) {
1435 d = r10_bio->devs[j].devnum;
1436 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1437 }
1438 if (r10_bio->devs[j].repl_bio) {
1439 struct md_rdev *rdev;
1440 d = r10_bio->devs[j].devnum;
1441 rdev = conf->mirrors[d].replacement;
1442 if (!rdev) {
1443
1444 smp_mb();
1445 rdev = conf->mirrors[d].rdev;
1446 }
1447 rdev_dec_pending(rdev, mddev);
1448 }
1449 }
1450 allow_barrier(conf);
1451 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1452 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1453 wait_barrier(conf);
1454 goto retry_write;
1455 }
1456
1457 if (max_sectors < r10_bio->sectors)
1458 r10_bio->sectors = max_sectors;
1459
1460 if (r10_bio->sectors < bio_sectors(bio)) {
1461 struct bio *split = bio_split(bio, r10_bio->sectors,
1462 GFP_NOIO, &conf->bio_split);
1463 bio_chain(split, bio);
1464 allow_barrier(conf);
1465 submit_bio_noacct(bio);
1466 wait_barrier(conf);
1467 bio = split;
1468 r10_bio->master_bio = bio;
1469 }
1470
1471 atomic_set(&r10_bio->remaining, 1);
1472 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1473
1474 for (i = 0; i < conf->copies; i++) {
1475 if (r10_bio->devs[i].bio)
1476 raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1477 if (r10_bio->devs[i].repl_bio)
1478 raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1479 }
1480 one_write_done(r10_bio);
1481}
1482
1483static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1484{
1485 struct r10conf *conf = mddev->private;
1486 struct r10bio *r10_bio;
1487
1488 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1489
1490 r10_bio->master_bio = bio;
1491 r10_bio->sectors = sectors;
1492
1493 r10_bio->mddev = mddev;
1494 r10_bio->sector = bio->bi_iter.bi_sector;
1495 r10_bio->state = 0;
1496 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
1497
1498 if (bio_data_dir(bio) == READ)
1499 raid10_read_request(mddev, bio, r10_bio);
1500 else
1501 raid10_write_request(mddev, bio, r10_bio);
1502}
1503
1504static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1505{
1506 struct r10conf *conf = mddev->private;
1507 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1508 int chunk_sects = chunk_mask + 1;
1509 int sectors = bio_sectors(bio);
1510
1511 if (unlikely(bio->bi_opf & REQ_PREFLUSH)
1512 && md_flush_request(mddev, bio))
1513 return true;
1514
1515 if (!md_write_start(mddev, bio))
1516 return false;
1517
1518
1519
1520
1521
1522 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1523 sectors > chunk_sects
1524 && (conf->geo.near_copies < conf->geo.raid_disks
1525 || conf->prev.near_copies <
1526 conf->prev.raid_disks)))
1527 sectors = chunk_sects -
1528 (bio->bi_iter.bi_sector &
1529 (chunk_sects - 1));
1530 __make_request(mddev, bio, sectors);
1531
1532
1533 wake_up(&conf->wait_barrier);
1534 return true;
1535}
1536
1537static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1538{
1539 struct r10conf *conf = mddev->private;
1540 int i;
1541
1542 if (conf->geo.near_copies < conf->geo.raid_disks)
1543 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1544 if (conf->geo.near_copies > 1)
1545 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1546 if (conf->geo.far_copies > 1) {
1547 if (conf->geo.far_offset)
1548 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1549 else
1550 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1551 if (conf->geo.far_set_size != conf->geo.raid_disks)
1552 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1553 }
1554 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1555 conf->geo.raid_disks - mddev->degraded);
1556 rcu_read_lock();
1557 for (i = 0; i < conf->geo.raid_disks; i++) {
1558 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1559 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1560 }
1561 rcu_read_unlock();
1562 seq_printf(seq, "]");
1563}
1564
1565
1566
1567
1568
1569
1570static int _enough(struct r10conf *conf, int previous, int ignore)
1571{
1572 int first = 0;
1573 int has_enough = 0;
1574 int disks, ncopies;
1575 if (previous) {
1576 disks = conf->prev.raid_disks;
1577 ncopies = conf->prev.near_copies;
1578 } else {
1579 disks = conf->geo.raid_disks;
1580 ncopies = conf->geo.near_copies;
1581 }
1582
1583 rcu_read_lock();
1584 do {
1585 int n = conf->copies;
1586 int cnt = 0;
1587 int this = first;
1588 while (n--) {
1589 struct md_rdev *rdev;
1590 if (this != ignore &&
1591 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1592 test_bit(In_sync, &rdev->flags))
1593 cnt++;
1594 this = (this+1) % disks;
1595 }
1596 if (cnt == 0)
1597 goto out;
1598 first = (first + ncopies) % disks;
1599 } while (first != 0);
1600 has_enough = 1;
1601out:
1602 rcu_read_unlock();
1603 return has_enough;
1604}
1605
1606static int enough(struct r10conf *conf, int ignore)
1607{
1608
1609
1610
1611
1612
1613 return _enough(conf, 0, ignore) &&
1614 _enough(conf, 1, ignore);
1615}
1616
1617static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1618{
1619 char b[BDEVNAME_SIZE];
1620 struct r10conf *conf = mddev->private;
1621 unsigned long flags;
1622
1623
1624
1625
1626
1627
1628
1629 spin_lock_irqsave(&conf->device_lock, flags);
1630 if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
1631 && !enough(conf, rdev->raid_disk)) {
1632
1633
1634
1635 spin_unlock_irqrestore(&conf->device_lock, flags);
1636 return;
1637 }
1638 if (test_and_clear_bit(In_sync, &rdev->flags))
1639 mddev->degraded++;
1640
1641
1642
1643 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1644 set_bit(Blocked, &rdev->flags);
1645 set_bit(Faulty, &rdev->flags);
1646 set_mask_bits(&mddev->sb_flags, 0,
1647 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1648 spin_unlock_irqrestore(&conf->device_lock, flags);
1649 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1650 "md/raid10:%s: Operation continuing on %d devices.\n",
1651 mdname(mddev), bdevname(rdev->bdev, b),
1652 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1653}
1654
1655static void print_conf(struct r10conf *conf)
1656{
1657 int i;
1658 struct md_rdev *rdev;
1659
1660 pr_debug("RAID10 conf printout:\n");
1661 if (!conf) {
1662 pr_debug("(!conf)\n");
1663 return;
1664 }
1665 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1666 conf->geo.raid_disks);
1667
1668
1669
1670 for (i = 0; i < conf->geo.raid_disks; i++) {
1671 char b[BDEVNAME_SIZE];
1672 rdev = conf->mirrors[i].rdev;
1673 if (rdev)
1674 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1675 i, !test_bit(In_sync, &rdev->flags),
1676 !test_bit(Faulty, &rdev->flags),
1677 bdevname(rdev->bdev,b));
1678 }
1679}
1680
1681static void close_sync(struct r10conf *conf)
1682{
1683 wait_barrier(conf);
1684 allow_barrier(conf);
1685
1686 mempool_exit(&conf->r10buf_pool);
1687}
1688
1689static int raid10_spare_active(struct mddev *mddev)
1690{
1691 int i;
1692 struct r10conf *conf = mddev->private;
1693 struct raid10_info *tmp;
1694 int count = 0;
1695 unsigned long flags;
1696
1697
1698
1699
1700
1701 for (i = 0; i < conf->geo.raid_disks; i++) {
1702 tmp = conf->mirrors + i;
1703 if (tmp->replacement
1704 && tmp->replacement->recovery_offset == MaxSector
1705 && !test_bit(Faulty, &tmp->replacement->flags)
1706 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1707
1708 if (!tmp->rdev
1709 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1710 count++;
1711 if (tmp->rdev) {
1712
1713
1714
1715
1716 set_bit(Faulty, &tmp->rdev->flags);
1717 sysfs_notify_dirent_safe(
1718 tmp->rdev->sysfs_state);
1719 }
1720 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1721 } else if (tmp->rdev
1722 && tmp->rdev->recovery_offset == MaxSector
1723 && !test_bit(Faulty, &tmp->rdev->flags)
1724 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1725 count++;
1726 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1727 }
1728 }
1729 spin_lock_irqsave(&conf->device_lock, flags);
1730 mddev->degraded -= count;
1731 spin_unlock_irqrestore(&conf->device_lock, flags);
1732
1733 print_conf(conf);
1734 return count;
1735}
1736
1737static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1738{
1739 struct r10conf *conf = mddev->private;
1740 int err = -EEXIST;
1741 int mirror;
1742 int first = 0;
1743 int last = conf->geo.raid_disks - 1;
1744
1745 if (mddev->recovery_cp < MaxSector)
1746
1747
1748
1749 return -EBUSY;
1750 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1751 return -EINVAL;
1752
1753 if (md_integrity_add_rdev(rdev, mddev))
1754 return -ENXIO;
1755
1756 if (rdev->raid_disk >= 0)
1757 first = last = rdev->raid_disk;
1758
1759 if (rdev->saved_raid_disk >= first &&
1760 rdev->saved_raid_disk < conf->geo.raid_disks &&
1761 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1762 mirror = rdev->saved_raid_disk;
1763 else
1764 mirror = first;
1765 for ( ; mirror <= last ; mirror++) {
1766 struct raid10_info *p = &conf->mirrors[mirror];
1767 if (p->recovery_disabled == mddev->recovery_disabled)
1768 continue;
1769 if (p->rdev) {
1770 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1771 p->replacement != NULL)
1772 continue;
1773 clear_bit(In_sync, &rdev->flags);
1774 set_bit(Replacement, &rdev->flags);
1775 rdev->raid_disk = mirror;
1776 err = 0;
1777 if (mddev->gendisk)
1778 disk_stack_limits(mddev->gendisk, rdev->bdev,
1779 rdev->data_offset << 9);
1780 conf->fullsync = 1;
1781 rcu_assign_pointer(p->replacement, rdev);
1782 break;
1783 }
1784
1785 if (mddev->gendisk)
1786 disk_stack_limits(mddev->gendisk, rdev->bdev,
1787 rdev->data_offset << 9);
1788
1789 p->head_position = 0;
1790 p->recovery_disabled = mddev->recovery_disabled - 1;
1791 rdev->raid_disk = mirror;
1792 err = 0;
1793 if (rdev->saved_raid_disk != mirror)
1794 conf->fullsync = 1;
1795 rcu_assign_pointer(p->rdev, rdev);
1796 break;
1797 }
1798 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1799 blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
1800
1801 print_conf(conf);
1802 return err;
1803}
1804
1805static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1806{
1807 struct r10conf *conf = mddev->private;
1808 int err = 0;
1809 int number = rdev->raid_disk;
1810 struct md_rdev **rdevp;
1811 struct raid10_info *p = conf->mirrors + number;
1812
1813 print_conf(conf);
1814 if (rdev == p->rdev)
1815 rdevp = &p->rdev;
1816 else if (rdev == p->replacement)
1817 rdevp = &p->replacement;
1818 else
1819 return 0;
1820
1821 if (test_bit(In_sync, &rdev->flags) ||
1822 atomic_read(&rdev->nr_pending)) {
1823 err = -EBUSY;
1824 goto abort;
1825 }
1826
1827
1828
1829 if (!test_bit(Faulty, &rdev->flags) &&
1830 mddev->recovery_disabled != p->recovery_disabled &&
1831 (!p->replacement || p->replacement == rdev) &&
1832 number < conf->geo.raid_disks &&
1833 enough(conf, -1)) {
1834 err = -EBUSY;
1835 goto abort;
1836 }
1837 *rdevp = NULL;
1838 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1839 synchronize_rcu();
1840 if (atomic_read(&rdev->nr_pending)) {
1841
1842 err = -EBUSY;
1843 *rdevp = rdev;
1844 goto abort;
1845 }
1846 }
1847 if (p->replacement) {
1848
1849 p->rdev = p->replacement;
1850 clear_bit(Replacement, &p->replacement->flags);
1851 smp_mb();
1852
1853
1854 p->replacement = NULL;
1855 }
1856
1857 clear_bit(WantReplacement, &rdev->flags);
1858 err = md_integrity_register(mddev);
1859
1860abort:
1861
1862 print_conf(conf);
1863 return err;
1864}
1865
1866static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
1867{
1868 struct r10conf *conf = r10_bio->mddev->private;
1869
1870 if (!bio->bi_status)
1871 set_bit(R10BIO_Uptodate, &r10_bio->state);
1872 else
1873
1874
1875
1876 atomic_add(r10_bio->sectors,
1877 &conf->mirrors[d].rdev->corrected_errors);
1878
1879
1880
1881
1882 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1883 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1884 atomic_dec_and_test(&r10_bio->remaining)) {
1885
1886
1887
1888 reschedule_retry(r10_bio);
1889 }
1890}
1891
1892static void end_sync_read(struct bio *bio)
1893{
1894 struct r10bio *r10_bio = get_resync_r10bio(bio);
1895 struct r10conf *conf = r10_bio->mddev->private;
1896 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1897
1898 __end_sync_read(r10_bio, bio, d);
1899}
1900
1901static void end_reshape_read(struct bio *bio)
1902{
1903
1904 struct r10bio *r10_bio = bio->bi_private;
1905
1906 __end_sync_read(r10_bio, bio, r10_bio->read_slot);
1907}
1908
1909static void end_sync_request(struct r10bio *r10_bio)
1910{
1911 struct mddev *mddev = r10_bio->mddev;
1912
1913 while (atomic_dec_and_test(&r10_bio->remaining)) {
1914 if (r10_bio->master_bio == NULL) {
1915
1916 sector_t s = r10_bio->sectors;
1917 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1918 test_bit(R10BIO_WriteError, &r10_bio->state))
1919 reschedule_retry(r10_bio);
1920 else
1921 put_buf(r10_bio);
1922 md_done_sync(mddev, s, 1);
1923 break;
1924 } else {
1925 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1926 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1927 test_bit(R10BIO_WriteError, &r10_bio->state))
1928 reschedule_retry(r10_bio);
1929 else
1930 put_buf(r10_bio);
1931 r10_bio = r10_bio2;
1932 }
1933 }
1934}
1935
1936static void end_sync_write(struct bio *bio)
1937{
1938 struct r10bio *r10_bio = get_resync_r10bio(bio);
1939 struct mddev *mddev = r10_bio->mddev;
1940 struct r10conf *conf = mddev->private;
1941 int d;
1942 sector_t first_bad;
1943 int bad_sectors;
1944 int slot;
1945 int repl;
1946 struct md_rdev *rdev = NULL;
1947
1948 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1949 if (repl)
1950 rdev = conf->mirrors[d].replacement;
1951 else
1952 rdev = conf->mirrors[d].rdev;
1953
1954 if (bio->bi_status) {
1955 if (repl)
1956 md_error(mddev, rdev);
1957 else {
1958 set_bit(WriteErrorSeen, &rdev->flags);
1959 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1960 set_bit(MD_RECOVERY_NEEDED,
1961 &rdev->mddev->recovery);
1962 set_bit(R10BIO_WriteError, &r10_bio->state);
1963 }
1964 } else if (is_badblock(rdev,
1965 r10_bio->devs[slot].addr,
1966 r10_bio->sectors,
1967 &first_bad, &bad_sectors))
1968 set_bit(R10BIO_MadeGood, &r10_bio->state);
1969
1970 rdev_dec_pending(rdev, mddev);
1971
1972 end_sync_request(r10_bio);
1973}
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1992{
1993 struct r10conf *conf = mddev->private;
1994 int i, first;
1995 struct bio *tbio, *fbio;
1996 int vcnt;
1997 struct page **tpages, **fpages;
1998
1999 atomic_set(&r10_bio->remaining, 1);
2000
2001
2002 for (i=0; i<conf->copies; i++)
2003 if (!r10_bio->devs[i].bio->bi_status)
2004 break;
2005
2006 if (i == conf->copies)
2007 goto done;
2008
2009 first = i;
2010 fbio = r10_bio->devs[i].bio;
2011 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2012 fbio->bi_iter.bi_idx = 0;
2013 fpages = get_resync_pages(fbio)->pages;
2014
2015 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2016
2017 for (i=0 ; i < conf->copies ; i++) {
2018 int j, d;
2019 struct md_rdev *rdev;
2020 struct resync_pages *rp;
2021
2022 tbio = r10_bio->devs[i].bio;
2023
2024 if (tbio->bi_end_io != end_sync_read)
2025 continue;
2026 if (i == first)
2027 continue;
2028
2029 tpages = get_resync_pages(tbio)->pages;
2030 d = r10_bio->devs[i].devnum;
2031 rdev = conf->mirrors[d].rdev;
2032 if (!r10_bio->devs[i].bio->bi_status) {
2033
2034
2035
2036
2037 int sectors = r10_bio->sectors;
2038 for (j = 0; j < vcnt; j++) {
2039 int len = PAGE_SIZE;
2040 if (sectors < (len / 512))
2041 len = sectors * 512;
2042 if (memcmp(page_address(fpages[j]),
2043 page_address(tpages[j]),
2044 len))
2045 break;
2046 sectors -= len/512;
2047 }
2048 if (j == vcnt)
2049 continue;
2050 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2051 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2052
2053 continue;
2054 } else if (test_bit(FailFast, &rdev->flags)) {
2055
2056 md_error(rdev->mddev, rdev);
2057 continue;
2058 }
2059
2060
2061
2062
2063
2064 rp = get_resync_pages(tbio);
2065 bio_reset(tbio);
2066
2067 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2068
2069 rp->raid_bio = r10_bio;
2070 tbio->bi_private = rp;
2071 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2072 tbio->bi_end_io = end_sync_write;
2073 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2074
2075 bio_copy_data(tbio, fbio);
2076
2077 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2078 atomic_inc(&r10_bio->remaining);
2079 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2080
2081 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2082 tbio->bi_opf |= MD_FAILFAST;
2083 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2084 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2085 submit_bio_noacct(tbio);
2086 }
2087
2088
2089
2090
2091 for (i = 0; i < conf->copies; i++) {
2092 int d;
2093
2094 tbio = r10_bio->devs[i].repl_bio;
2095 if (!tbio || !tbio->bi_end_io)
2096 continue;
2097 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2098 && r10_bio->devs[i].bio != fbio)
2099 bio_copy_data(tbio, fbio);
2100 d = r10_bio->devs[i].devnum;
2101 atomic_inc(&r10_bio->remaining);
2102 md_sync_acct(conf->mirrors[d].replacement->bdev,
2103 bio_sectors(tbio));
2104 submit_bio_noacct(tbio);
2105 }
2106
2107done:
2108 if (atomic_dec_and_test(&r10_bio->remaining)) {
2109 md_done_sync(mddev, r10_bio->sectors, 1);
2110 put_buf(r10_bio);
2111 }
2112}
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124static void fix_recovery_read_error(struct r10bio *r10_bio)
2125{
2126
2127
2128
2129
2130
2131
2132
2133 struct mddev *mddev = r10_bio->mddev;
2134 struct r10conf *conf = mddev->private;
2135 struct bio *bio = r10_bio->devs[0].bio;
2136 sector_t sect = 0;
2137 int sectors = r10_bio->sectors;
2138 int idx = 0;
2139 int dr = r10_bio->devs[0].devnum;
2140 int dw = r10_bio->devs[1].devnum;
2141 struct page **pages = get_resync_pages(bio)->pages;
2142
2143 while (sectors) {
2144 int s = sectors;
2145 struct md_rdev *rdev;
2146 sector_t addr;
2147 int ok;
2148
2149 if (s > (PAGE_SIZE>>9))
2150 s = PAGE_SIZE >> 9;
2151
2152 rdev = conf->mirrors[dr].rdev;
2153 addr = r10_bio->devs[0].addr + sect,
2154 ok = sync_page_io(rdev,
2155 addr,
2156 s << 9,
2157 pages[idx],
2158 REQ_OP_READ, 0, false);
2159 if (ok) {
2160 rdev = conf->mirrors[dw].rdev;
2161 addr = r10_bio->devs[1].addr + sect;
2162 ok = sync_page_io(rdev,
2163 addr,
2164 s << 9,
2165 pages[idx],
2166 REQ_OP_WRITE, 0, false);
2167 if (!ok) {
2168 set_bit(WriteErrorSeen, &rdev->flags);
2169 if (!test_and_set_bit(WantReplacement,
2170 &rdev->flags))
2171 set_bit(MD_RECOVERY_NEEDED,
2172 &rdev->mddev->recovery);
2173 }
2174 }
2175 if (!ok) {
2176
2177
2178
2179
2180 rdev_set_badblocks(rdev, addr, s, 0);
2181
2182 if (rdev != conf->mirrors[dw].rdev) {
2183
2184 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2185 addr = r10_bio->devs[1].addr + sect;
2186 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2187 if (!ok) {
2188
2189 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2190 mdname(mddev));
2191
2192 conf->mirrors[dw].recovery_disabled
2193 = mddev->recovery_disabled;
2194 set_bit(MD_RECOVERY_INTR,
2195 &mddev->recovery);
2196 break;
2197 }
2198 }
2199 }
2200
2201 sectors -= s;
2202 sect += s;
2203 idx++;
2204 }
2205}
2206
2207static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2208{
2209 struct r10conf *conf = mddev->private;
2210 int d;
2211 struct bio *wbio, *wbio2;
2212
2213 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2214 fix_recovery_read_error(r10_bio);
2215 end_sync_request(r10_bio);
2216 return;
2217 }
2218
2219
2220
2221
2222
2223 d = r10_bio->devs[1].devnum;
2224 wbio = r10_bio->devs[1].bio;
2225 wbio2 = r10_bio->devs[1].repl_bio;
2226
2227
2228
2229
2230 if (wbio2 && !wbio2->bi_end_io)
2231 wbio2 = NULL;
2232 if (wbio->bi_end_io) {
2233 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2234 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2235 submit_bio_noacct(wbio);
2236 }
2237 if (wbio2) {
2238 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2239 md_sync_acct(conf->mirrors[d].replacement->bdev,
2240 bio_sectors(wbio2));
2241 submit_bio_noacct(wbio2);
2242 }
2243}
2244
2245
2246
2247
2248
2249
2250
2251static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2252{
2253 long cur_time_mon;
2254 unsigned long hours_since_last;
2255 unsigned int read_errors = atomic_read(&rdev->read_errors);
2256
2257 cur_time_mon = ktime_get_seconds();
2258
2259 if (rdev->last_read_error == 0) {
2260
2261 rdev->last_read_error = cur_time_mon;
2262 return;
2263 }
2264
2265 hours_since_last = (long)(cur_time_mon -
2266 rdev->last_read_error) / 3600;
2267
2268 rdev->last_read_error = cur_time_mon;
2269
2270
2271
2272
2273
2274
2275 if (hours_since_last >= 8 * sizeof(read_errors))
2276 atomic_set(&rdev->read_errors, 0);
2277 else
2278 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2279}
2280
2281static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2282 int sectors, struct page *page, int rw)
2283{
2284 sector_t first_bad;
2285 int bad_sectors;
2286
2287 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2288 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2289 return -1;
2290 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2291
2292 return 1;
2293 if (rw == WRITE) {
2294 set_bit(WriteErrorSeen, &rdev->flags);
2295 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2296 set_bit(MD_RECOVERY_NEEDED,
2297 &rdev->mddev->recovery);
2298 }
2299
2300 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2301 md_error(rdev->mddev, rdev);
2302 return 0;
2303}
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2314{
2315 int sect = 0;
2316 int sectors = r10_bio->sectors;
2317 struct md_rdev *rdev;
2318 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2319 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2320
2321
2322
2323
2324 rdev = conf->mirrors[d].rdev;
2325
2326 if (test_bit(Faulty, &rdev->flags))
2327
2328
2329 return;
2330
2331 check_decay_read_errors(mddev, rdev);
2332 atomic_inc(&rdev->read_errors);
2333 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2334 char b[BDEVNAME_SIZE];
2335 bdevname(rdev->bdev, b);
2336
2337 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2338 mdname(mddev), b,
2339 atomic_read(&rdev->read_errors), max_read_errors);
2340 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2341 mdname(mddev), b);
2342 md_error(mddev, rdev);
2343 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2344 return;
2345 }
2346
2347 while(sectors) {
2348 int s = sectors;
2349 int sl = r10_bio->read_slot;
2350 int success = 0;
2351 int start;
2352
2353 if (s > (PAGE_SIZE>>9))
2354 s = PAGE_SIZE >> 9;
2355
2356 rcu_read_lock();
2357 do {
2358 sector_t first_bad;
2359 int bad_sectors;
2360
2361 d = r10_bio->devs[sl].devnum;
2362 rdev = rcu_dereference(conf->mirrors[d].rdev);
2363 if (rdev &&
2364 test_bit(In_sync, &rdev->flags) &&
2365 !test_bit(Faulty, &rdev->flags) &&
2366 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2367 &first_bad, &bad_sectors) == 0) {
2368 atomic_inc(&rdev->nr_pending);
2369 rcu_read_unlock();
2370 success = sync_page_io(rdev,
2371 r10_bio->devs[sl].addr +
2372 sect,
2373 s<<9,
2374 conf->tmppage,
2375 REQ_OP_READ, 0, false);
2376 rdev_dec_pending(rdev, mddev);
2377 rcu_read_lock();
2378 if (success)
2379 break;
2380 }
2381 sl++;
2382 if (sl == conf->copies)
2383 sl = 0;
2384 } while (!success && sl != r10_bio->read_slot);
2385 rcu_read_unlock();
2386
2387 if (!success) {
2388
2389
2390
2391
2392 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2393 rdev = conf->mirrors[dn].rdev;
2394
2395 if (!rdev_set_badblocks(
2396 rdev,
2397 r10_bio->devs[r10_bio->read_slot].addr
2398 + sect,
2399 s, 0)) {
2400 md_error(mddev, rdev);
2401 r10_bio->devs[r10_bio->read_slot].bio
2402 = IO_BLOCKED;
2403 }
2404 break;
2405 }
2406
2407 start = sl;
2408
2409 rcu_read_lock();
2410 while (sl != r10_bio->read_slot) {
2411 char b[BDEVNAME_SIZE];
2412
2413 if (sl==0)
2414 sl = conf->copies;
2415 sl--;
2416 d = r10_bio->devs[sl].devnum;
2417 rdev = rcu_dereference(conf->mirrors[d].rdev);
2418 if (!rdev ||
2419 test_bit(Faulty, &rdev->flags) ||
2420 !test_bit(In_sync, &rdev->flags))
2421 continue;
2422
2423 atomic_inc(&rdev->nr_pending);
2424 rcu_read_unlock();
2425 if (r10_sync_page_io(rdev,
2426 r10_bio->devs[sl].addr +
2427 sect,
2428 s, conf->tmppage, WRITE)
2429 == 0) {
				/* Well, this device is dead */
2431 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2432 mdname(mddev), s,
2433 (unsigned long long)(
2434 sect +
2435 choose_data_offset(r10_bio,
2436 rdev)),
2437 bdevname(rdev->bdev, b));
2438 pr_notice("md/raid10:%s: %s: failing drive\n",
2439 mdname(mddev),
2440 bdevname(rdev->bdev, b));
2441 }
2442 rdev_dec_pending(rdev, mddev);
2443 rcu_read_lock();
2444 }
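		/* now re-read those sectors from the same mirrors to
		 * verify the correction
		 */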
2445 sl = start;
2446 while (sl != r10_bio->read_slot) {
2447 char b[BDEVNAME_SIZE];
2448
			if (sl == 0)
				sl = conf->copies;
			sl--;
2452 d = r10_bio->devs[sl].devnum;
2453 rdev = rcu_dereference(conf->mirrors[d].rdev);
2454 if (!rdev ||
2455 test_bit(Faulty, &rdev->flags) ||
2456 !test_bit(In_sync, &rdev->flags))
2457 continue;
2458
2459 atomic_inc(&rdev->nr_pending);
2460 rcu_read_unlock();
2461 switch (r10_sync_page_io(rdev,
2462 r10_bio->devs[sl].addr +
2463 sect,
2464 s, conf->tmppage,
2465 READ)) {
2466 case 0:
2467
2468 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2469 mdname(mddev), s,
2470 (unsigned long long)(
2471 sect +
2472 choose_data_offset(r10_bio, rdev)),
2473 bdevname(rdev->bdev, b));
2474 pr_notice("md/raid10:%s: %s: failing drive\n",
2475 mdname(mddev),
2476 bdevname(rdev->bdev, b));
2477 break;
2478 case 1:
2479 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2480 mdname(mddev), s,
2481 (unsigned long long)(
2482 sect +
2483 choose_data_offset(r10_bio, rdev)),
2484 bdevname(rdev->bdev, b));
2485 atomic_add(s, &rdev->corrected_errors);
2486 }
2487
2488 rdev_dec_pending(rdev, mddev);
2489 rcu_read_lock();
2490 }
2491 rcu_read_unlock();
2492
2493 sectors -= s;
2494 sect += s;
2495 }
2496}
2497
2498static int narrow_write_error(struct r10bio *r10_bio, int i)
2499{
2500 struct bio *bio = r10_bio->master_bio;
2501 struct mddev *mddev = r10_bio->mddev;
2502 struct r10conf *conf = mddev->private;
2503 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2504
	/* The write to slot 'i' failed.  Retry it in smaller pieces: clone
	 * the master bio, trim it to one "badblock unit" (the larger of the
	 * badblock shift and the device's logical block size) and submit it
	 * synchronously, recording a bad block for each piece that still
	 * fails.  Return 0 if any bad block could not be recorded, so the
	 * caller fails the whole device instead.
	 */
2515 int block_sectors;
2516 sector_t sector;
2517 int sectors;
2518 int sect_to_write = r10_bio->sectors;
2519 int ok = 1;
2520
2521 if (rdev->badblocks.shift < 0)
2522 return 0;
2523
2524 block_sectors = roundup(1 << rdev->badblocks.shift,
2525 bdev_logical_block_size(rdev->bdev) >> 9);
2526 sector = r10_bio->sector;
2527 sectors = ((r10_bio->sector + block_sectors)
2528 & ~(sector_t)(block_sectors - 1))
2529 - sector;
2530
2531 while (sect_to_write) {
2532 struct bio *wbio;
2533 sector_t wsector;
2534 if (sectors > sect_to_write)
2535 sectors = sect_to_write;
2536
2537 wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
2538 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2539 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2540 wbio->bi_iter.bi_sector = wsector +
2541 choose_data_offset(r10_bio, rdev);
2542 bio_set_dev(wbio, rdev->bdev);
2543 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2544
		if (submit_bio_wait(wbio) < 0)
			/* failure! */
			ok = rdev_set_badblocks(rdev, wsector,
						sectors, 0)
				&& ok;
2550
2551 bio_put(wbio);
2552 sect_to_write -= sectors;
2553 sector += sectors;
2554 sectors = block_sectors;
2555 }
2556 return ok;
2557}
2558
2559static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2560{
2561 int slot = r10_bio->read_slot;
2562 struct bio *bio;
2563 struct r10conf *conf = mddev->private;
2564 struct md_rdev *rdev = r10_bio->devs[slot].rdev;

	/* We got a read error on a normal read.  If the array is read-only
	 * just block this slot; otherwise freeze the array and try to
	 * repair the failed sectors from the other mirrors - unless the
	 * device is marked FailFast, in which case fail it immediately.
	 * Either way the read is then retried through the normal path.
	 */
2574 bio = r10_bio->devs[slot].bio;
2575 bio_put(bio);
2576 r10_bio->devs[slot].bio = NULL;
2577
2578 if (mddev->ro)
2579 r10_bio->devs[slot].bio = IO_BLOCKED;
2580 else if (!test_bit(FailFast, &rdev->flags)) {
2581 freeze_array(conf, 1);
2582 fix_read_error(conf, mddev, r10_bio);
2583 unfreeze_array(conf);
2584 } else
2585 md_error(mddev, rdev);
2586
2587 rdev_dec_pending(rdev, mddev);
2588 allow_barrier(conf);
2589 r10_bio->state = 0;
2590 raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2591}
2592
2593static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2594{
	/* A write (normal, resync or recovery) has finished and needs some
	 * follow-up: clear bad blocks that were unexpectedly written
	 * successfully, record new bad blocks for failed writes (failing
	 * the device if that is impossible), and retry failed normal
	 * writes in smaller chunks via narrow_write_error().
	 */
2601 int m;
2602 struct md_rdev *rdev;
2603
2604 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2605 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2606 for (m = 0; m < conf->copies; m++) {
2607 int dev = r10_bio->devs[m].devnum;
2608 rdev = conf->mirrors[dev].rdev;
2609 if (r10_bio->devs[m].bio == NULL ||
2610 r10_bio->devs[m].bio->bi_end_io == NULL)
2611 continue;
2612 if (!r10_bio->devs[m].bio->bi_status) {
2613 rdev_clear_badblocks(
2614 rdev,
2615 r10_bio->devs[m].addr,
2616 r10_bio->sectors, 0);
2617 } else {
2618 if (!rdev_set_badblocks(
2619 rdev,
2620 r10_bio->devs[m].addr,
2621 r10_bio->sectors, 0))
2622 md_error(conf->mddev, rdev);
2623 }
2624 rdev = conf->mirrors[dev].replacement;
2625 if (r10_bio->devs[m].repl_bio == NULL ||
2626 r10_bio->devs[m].repl_bio->bi_end_io == NULL)
2627 continue;
2628
2629 if (!r10_bio->devs[m].repl_bio->bi_status) {
2630 rdev_clear_badblocks(
2631 rdev,
2632 r10_bio->devs[m].addr,
2633 r10_bio->sectors, 0);
2634 } else {
2635 if (!rdev_set_badblocks(
2636 rdev,
2637 r10_bio->devs[m].addr,
2638 r10_bio->sectors, 0))
2639 md_error(conf->mddev, rdev);
2640 }
2641 }
2642 put_buf(r10_bio);
2643 } else {
2644 bool fail = false;
2645 for (m = 0; m < conf->copies; m++) {
2646 int dev = r10_bio->devs[m].devnum;
2647 struct bio *bio = r10_bio->devs[m].bio;
2648 rdev = conf->mirrors[dev].rdev;
2649 if (bio == IO_MADE_GOOD) {
2650 rdev_clear_badblocks(
2651 rdev,
2652 r10_bio->devs[m].addr,
2653 r10_bio->sectors, 0);
2654 rdev_dec_pending(rdev, conf->mddev);
2655 } else if (bio != NULL && bio->bi_status) {
2656 fail = true;
2657 if (!narrow_write_error(r10_bio, m)) {
2658 md_error(conf->mddev, rdev);
2659 set_bit(R10BIO_Degraded,
2660 &r10_bio->state);
2661 }
2662 rdev_dec_pending(rdev, conf->mddev);
2663 }
2664 bio = r10_bio->devs[m].repl_bio;
2665 rdev = conf->mirrors[dev].replacement;
2666 if (rdev && bio == IO_MADE_GOOD) {
2667 rdev_clear_badblocks(
2668 rdev,
2669 r10_bio->devs[m].addr,
2670 r10_bio->sectors, 0);
2671 rdev_dec_pending(rdev, conf->mddev);
2672 }
2673 }
2674 if (fail) {
2675 spin_lock_irq(&conf->device_lock);
2676 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2677 conf->nr_queued++;
2678 spin_unlock_irq(&conf->device_lock);
			/*
			 * In case freeze_array() is waiting for
			 * nr_pending == nr_queued + extra to become true.
			 */
2683 wake_up(&conf->wait_barrier);
2684 md_wakeup_thread(conf->mddev->thread);
2685 } else {
2686 if (test_bit(R10BIO_WriteError,
2687 &r10_bio->state))
2688 close_write(r10_bio);
2689 raid_end_bio_io(r10_bio);
2690 }
2691 }
2692}
2693
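/*
 * raid10d is the per-array management thread.  It completes bios that
 * were queued for deferred handling, flushes plugged writes, and retries
 * the read/write/resync/recovery/reshape requests on conf->retry_list.
 */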
2694static void raid10d(struct md_thread *thread)
2695{
2696 struct mddev *mddev = thread->mddev;
2697 struct r10bio *r10_bio;
2698 unsigned long flags;
2699 struct r10conf *conf = mddev->private;
2700 struct list_head *head = &conf->retry_list;
2701 struct blk_plug plug;
2702
2703 md_check_recovery(mddev);
2704
2705 if (!list_empty_careful(&conf->bio_end_io_list) &&
2706 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2707 LIST_HEAD(tmp);
2708 spin_lock_irqsave(&conf->device_lock, flags);
2709 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2710 while (!list_empty(&conf->bio_end_io_list)) {
2711 list_move(conf->bio_end_io_list.prev, &tmp);
2712 conf->nr_queued--;
2713 }
2714 }
2715 spin_unlock_irqrestore(&conf->device_lock, flags);
2716 while (!list_empty(&tmp)) {
2717 r10_bio = list_first_entry(&tmp, struct r10bio,
2718 retry_list);
2719 list_del(&r10_bio->retry_list);
2720 if (mddev->degraded)
2721 set_bit(R10BIO_Degraded, &r10_bio->state);
2722
2723 if (test_bit(R10BIO_WriteError,
2724 &r10_bio->state))
2725 close_write(r10_bio);
2726 raid_end_bio_io(r10_bio);
2727 }
2728 }
2729
2730 blk_start_plug(&plug);
2731 for (;;) {
2732
2733 flush_pending_writes(conf);
2734
2735 spin_lock_irqsave(&conf->device_lock, flags);
2736 if (list_empty(head)) {
2737 spin_unlock_irqrestore(&conf->device_lock, flags);
2738 break;
2739 }
2740 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2741 list_del(head->prev);
2742 conf->nr_queued--;
2743 spin_unlock_irqrestore(&conf->device_lock, flags);
2744
2745 mddev = r10_bio->mddev;
2746 conf = mddev->private;
2747 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2748 test_bit(R10BIO_WriteError, &r10_bio->state))
2749 handle_write_completed(conf, r10_bio);
2750 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2751 reshape_request_write(mddev, r10_bio);
2752 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2753 sync_request_write(mddev, r10_bio);
2754 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2755 recovery_request_write(mddev, r10_bio);
2756 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2757 handle_read_error(mddev, r10_bio);
2758 else
2759 WARN_ON_ONCE(1);
2760
2761 cond_resched();
2762 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2763 md_check_recovery(mddev);
2764 }
2765 blk_finish_plug(&plug);
2766}
2767
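/*
 * Set up the mempool of resync buffers (r10buf_pool) used by resync,
 * recovery and reshape, and note whether any replacement devices exist
 * so that r10buf_pool_alloc() knows to allocate the extra bios.
 */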
2768static int init_resync(struct r10conf *conf)
2769{
2770 int ret, buffs, i;
2771
2772 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2773 BUG_ON(mempool_initialized(&conf->r10buf_pool));
2774 conf->have_replacement = 0;
2775 for (i = 0; i < conf->geo.raid_disks; i++)
2776 if (conf->mirrors[i].replacement)
2777 conf->have_replacement = 1;
2778 ret = mempool_init(&conf->r10buf_pool, buffs,
2779 r10buf_pool_alloc, r10buf_pool_free, conf);
2780 if (ret)
2781 return ret;
2782 conf->next_resync = 0;
2783 return 0;
2784}
2785
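/*
 * Get a pre-allocated r10bio from the resync buffer pool and reset its
 * bios for reuse, taking care to preserve the resync_pages structure
 * hanging off each bio's bi_private.
 */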
2786static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
2787{
2788 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
	struct resync_pages *rp;
2790 struct bio *bio;
2791 int nalloc;
2792 int i;
2793
2794 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
2795 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
2796 nalloc = conf->copies;
2797 else
2798 nalloc = 2;
2799
2800 for (i = 0; i < nalloc; i++) {
2801 bio = r10bio->devs[i].bio;
2802 rp = bio->bi_private;
2803 bio_reset(bio);
2804 bio->bi_private = rp;
2805 bio = r10bio->devs[i].repl_bio;
2806 if (bio) {
2807 rp = bio->bi_private;
2808 bio_reset(bio);
2809 bio->bi_private = rp;
2810 }
2811 }
2812 return r10bio;
2813}
2814
/*
 * Set cluster_sync_high so the other cluster nodes know which range
 * [cluster_sync_low, cluster_sync_high] to suspend while this node
 * resyncs it.
 */
2819static void raid10_set_cluster_sync_high(struct r10conf *conf)
2820{
2821 sector_t window_size;
2822 int extra_chunk, chunks;
2823
	/*
	 * A "stripe" here is one pass across all member devices, i.e.
	 * raid_disks / near_copies chunks; if raid_disks is not divisible
	 * by near_copies an extra chunk is needed to cover the whole
	 * stripe.  Basing the window on whole stripes stops it growing
	 * linearly with raid_disks when near_copies is large, which would
	 * suspend a much larger I/O range than necessary on other nodes.
	 */
2836 chunks = conf->geo.raid_disks / conf->geo.near_copies;
2837 if (conf->geo.raid_disks % conf->geo.near_copies == 0)
2838 extra_chunk = 0;
2839 else
2840 extra_chunk = 1;
2841 window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
2842
2843
2844
2845
2846 window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
2847 CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
2848
2849 conf->cluster_sync_high = conf->cluster_sync_low + window_size;
2850}
2851
/*
 * perform a "sync" on one "block"
 *
 * Normal I/O, particularly writes, must not conflict with an active sync
 * request; this is arranged through the barrier machinery
 * (raise_barrier/wait_barrier).
 *
 * Resync and recovery are handled very differently and are distinguished
 * by MD_RECOVERY_SYNC in mddev->recovery:
 *
 *  - For resync we iterate over virtual addresses, read every copy and
 *    rewrite the ones that differ (done later by sync_request_write()).
 *  - For recovery we iterate over physical (device) addresses, read a
 *    good copy for each out-of-sync device and overwrite that device
 *    (and its replacement, if any).
 *
 * For recovery several r10_bio structures may be in flight for one
 * virtual address, one per out-of-sync device.  They are chained through
 * a borrowed master_bio pointer and counted in ->remaining; the complex
 * operation is complete when the r10_bio whose master_bio is NULL drops
 * its count to zero.
 *
 * All bios are first collected on a list so pages can be added to each;
 * only the read bios are submitted here - the writes are issued later
 * from raid10d once the reads have completed.
 */
2884static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2885 int *skipped)
2886{
2887 struct r10conf *conf = mddev->private;
2888 struct r10bio *r10_bio;
2889 struct bio *biolist = NULL, *bio;
2890 sector_t max_sector, nr_sectors;
2891 int i;
2892 int max_sync;
2893 sector_t sync_blocks;
2894 sector_t sectors_skipped = 0;
2895 int chunks_skipped = 0;
2896 sector_t chunk_mask = conf->geo.chunk_mask;
2897 int page_idx = 0;
2898
2899 if (!mempool_initialized(&conf->r10buf_pool))
2900 if (init_resync(conf))
2901 return 0;
2902
	/*
	 * Allow skipping a full rebuild for incremental assembly
	 * of a clean array, like RAID1 does.
	 */
2907 if (mddev->bitmap == NULL &&
2908 mddev->recovery_cp == MaxSector &&
2909 mddev->reshape_position == MaxSector &&
2910 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2911 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2912 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2913 conf->fullsync == 0) {
2914 *skipped = 1;
2915 return mddev->dev_sectors - sector_nr;
2916 }
2917
2918 skipped:
2919 max_sector = mddev->dev_sectors;
2920 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2921 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2922 max_sector = mddev->resync_max_sectors;
2923 if (sector_nr >= max_sector) {
2924 conf->cluster_sync_low = 0;
2925 conf->cluster_sync_high = 0;
		/* If we aborted, we need to finish the sync on the
		 * 'current' bitmap chunks (there can be several when
		 * recovering multiple devices), as we may have started
		 * syncing them but not finished.  mddev->curr_resync gives
		 * the current address; for recovery it must be translated
		 * to a virtual address for each device.
		 */
2936 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2937 end_reshape(conf);
2938 close_sync(conf);
2939 return 0;
2940 }
2941
2942 if (mddev->curr_resync < max_sector) {
2943 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2944 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2945 &sync_blocks, 1);
2946 else for (i = 0; i < conf->geo.raid_disks; i++) {
2947 sector_t sect =
2948 raid10_find_virt(conf, mddev->curr_resync, i);
2949 md_bitmap_end_sync(mddev->bitmap, sect,
2950 &sync_blocks, 1);
2951 }
2952 } else {
2953
2954 if ((!mddev->bitmap || conf->fullsync)
2955 && conf->have_replacement
2956 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2957
2958
2959
2960 rcu_read_lock();
2961 for (i = 0; i < conf->geo.raid_disks; i++) {
2962 struct md_rdev *rdev =
2963 rcu_dereference(conf->mirrors[i].replacement);
2964 if (rdev)
2965 rdev->recovery_offset = MaxSector;
2966 }
2967 rcu_read_unlock();
2968 }
2969 conf->fullsync = 0;
2970 }
2971 md_bitmap_close_sync(mddev->bitmap);
2972 close_sync(conf);
2973 *skipped = 1;
2974 return sectors_skipped;
2975 }
2976
2977 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2978 return reshape_request(mddev, sector_nr, skipped);
2979
2980 if (chunks_skipped >= conf->geo.raid_disks) {
2981
2982
2983
2984 *skipped = 1;
2985 return (max_sector - sector_nr) + sectors_skipped;
2986 }
2987
2988 if (max_sector > mddev->resync_max)
2989 max_sector = mddev->resync_max;
2990
	/* make sure whole request will fit in a chunk - if
	 * chunks are meaningful
	 */
2994 if (conf->geo.near_copies < conf->geo.raid_disks &&
2995 max_sector > (sector_nr | chunk_mask))
2996 max_sector = (sector_nr | chunk_mask) + 1;
2997
	/*
	 * If there is non-resync activity waiting for a turn, then let it
	 * through before starting on this new sync request.
	 */
3002 if (conf->nr_waiting)
3003 schedule_timeout_uninterruptible(1);
3004
	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
	 * have bi_end_io, bi_sector, bi_bdev set,
	 * and bi_private set to the r10bio.
	 * For recovery, we may actually create several r10bios
	 * with 2 bios in each, that correspond to the bios in the main one.
	 * In this case, the subordinate r10bios link back through a
	 * borrowed master_bio pointer, and the counter in the master
	 * includes a ref from each subordinate.
	 *
	 * We decide what to do and set ->bi_end_io to end_sync_read if we
	 * want to read, or end_sync_write if we will want to write.
	 */
3020 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3021 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3022
3023 int j;
3024 r10_bio = NULL;
3025
3026 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3027 int still_degraded;
3028 struct r10bio *rb2;
3029 sector_t sect;
3030 int must_sync;
3031 int any_working;
3032 int need_recover = 0;
3033 int need_replace = 0;
3034 struct raid10_info *mirror = &conf->mirrors[i];
3035 struct md_rdev *mrdev, *mreplace;
3036
3037 rcu_read_lock();
3038 mrdev = rcu_dereference(mirror->rdev);
3039 mreplace = rcu_dereference(mirror->replacement);
3040
3041 if (mrdev != NULL &&
3042 !test_bit(Faulty, &mrdev->flags) &&
3043 !test_bit(In_sync, &mrdev->flags))
3044 need_recover = 1;
3045 if (mreplace != NULL &&
3046 !test_bit(Faulty, &mreplace->flags))
3047 need_replace = 1;
3048
3049 if (!need_recover && !need_replace) {
3050 rcu_read_unlock();
3051 continue;
3052 }
3053
3054 still_degraded = 0;
3055
3056 rb2 = r10_bio;
3057 sect = raid10_find_virt(conf, sector_nr, i);
3058 if (sect >= mddev->resync_max_sectors) {
3059
3060
3061
3062 rcu_read_unlock();
3063 continue;
3064 }
3065 if (mreplace && test_bit(Faulty, &mreplace->flags))
3066 mreplace = NULL;
3067
3068
3069
3070
3071 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3072 &sync_blocks, 1);
3073 if (sync_blocks < max_sync)
3074 max_sync = sync_blocks;
3075 if (!must_sync &&
3076 mreplace == NULL &&
3077 !conf->fullsync) {
3078
3079
3080
3081 chunks_skipped = -1;
3082 rcu_read_unlock();
3083 continue;
3084 }
3085 atomic_inc(&mrdev->nr_pending);
3086 if (mreplace)
3087 atomic_inc(&mreplace->nr_pending);
3088 rcu_read_unlock();
3089
3090 r10_bio = raid10_alloc_init_r10buf(conf);
3091 r10_bio->state = 0;
3092 raise_barrier(conf, rb2 != NULL);
3093 atomic_set(&r10_bio->remaining, 0);
3094
3095 r10_bio->master_bio = (struct bio*)rb2;
3096 if (rb2)
3097 atomic_inc(&rb2->remaining);
3098 r10_bio->mddev = mddev;
3099 set_bit(R10BIO_IsRecover, &r10_bio->state);
3100 r10_bio->sector = sect;
3101
3102 raid10_find_phys(conf, r10_bio);
3103
3104
3105
3106
3107 rcu_read_lock();
3108 for (j = 0; j < conf->geo.raid_disks; j++) {
3109 struct md_rdev *rdev = rcu_dereference(
3110 conf->mirrors[j].rdev);
3111 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3112 still_degraded = 1;
3113 break;
3114 }
3115 }
3116
3117 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3118 &sync_blocks, still_degraded);
3119
3120 any_working = 0;
			for (j = 0; j < conf->copies; j++) {
3122 int k;
3123 int d = r10_bio->devs[j].devnum;
3124 sector_t from_addr, to_addr;
3125 struct md_rdev *rdev =
3126 rcu_dereference(conf->mirrors[d].rdev);
3127 sector_t sector, first_bad;
3128 int bad_sectors;
3129 if (!rdev ||
3130 !test_bit(In_sync, &rdev->flags))
3131 continue;
3132
3133 any_working = 1;
3134 sector = r10_bio->devs[j].addr;
3135
3136 if (is_badblock(rdev, sector, max_sync,
3137 &first_bad, &bad_sectors)) {
3138 if (first_bad > sector)
3139 max_sync = first_bad - sector;
3140 else {
3141 bad_sectors -= (sector
3142 - first_bad);
3143 if (max_sync > bad_sectors)
3144 max_sync = bad_sectors;
3145 continue;
3146 }
3147 }
3148 bio = r10_bio->devs[0].bio;
3149 bio->bi_next = biolist;
3150 biolist = bio;
3151 bio->bi_end_io = end_sync_read;
3152 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3153 if (test_bit(FailFast, &rdev->flags))
3154 bio->bi_opf |= MD_FAILFAST;
3155 from_addr = r10_bio->devs[j].addr;
3156 bio->bi_iter.bi_sector = from_addr +
3157 rdev->data_offset;
3158 bio_set_dev(bio, rdev->bdev);
3159 atomic_inc(&rdev->nr_pending);
3160
3161
3162 for (k=0; k<conf->copies; k++)
3163 if (r10_bio->devs[k].devnum == i)
3164 break;
3165 BUG_ON(k == conf->copies);
3166 to_addr = r10_bio->devs[k].addr;
3167 r10_bio->devs[0].devnum = d;
3168 r10_bio->devs[0].addr = from_addr;
3169 r10_bio->devs[1].devnum = i;
3170 r10_bio->devs[1].addr = to_addr;
3171
3172 if (need_recover) {
3173 bio = r10_bio->devs[1].bio;
3174 bio->bi_next = biolist;
3175 biolist = bio;
3176 bio->bi_end_io = end_sync_write;
3177 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3178 bio->bi_iter.bi_sector = to_addr
3179 + mrdev->data_offset;
3180 bio_set_dev(bio, mrdev->bdev);
3181 atomic_inc(&r10_bio->remaining);
3182 } else
3183 r10_bio->devs[1].bio->bi_end_io = NULL;
3184
3185
3186 bio = r10_bio->devs[1].repl_bio;
3187 if (bio)
3188 bio->bi_end_io = NULL;
3189
3190
3191
3192
3193 if (!need_replace)
3194 break;
3195 bio->bi_next = biolist;
3196 biolist = bio;
3197 bio->bi_end_io = end_sync_write;
3198 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3199 bio->bi_iter.bi_sector = to_addr +
3200 mreplace->data_offset;
3201 bio_set_dev(bio, mreplace->bdev);
3202 atomic_inc(&r10_bio->remaining);
3203 break;
3204 }
3205 rcu_read_unlock();
3206 if (j == conf->copies) {
3207
3208
3209 if (any_working) {
3210
3211
3212
3213 int k;
3214 for (k = 0; k < conf->copies; k++)
3215 if (r10_bio->devs[k].devnum == i)
3216 break;
3217 if (!test_bit(In_sync,
3218 &mrdev->flags)
3219 && !rdev_set_badblocks(
3220 mrdev,
3221 r10_bio->devs[k].addr,
3222 max_sync, 0))
3223 any_working = 0;
3224 if (mreplace &&
3225 !rdev_set_badblocks(
3226 mreplace,
3227 r10_bio->devs[k].addr,
3228 max_sync, 0))
3229 any_working = 0;
3230 }
3231 if (!any_working) {
3232 if (!test_and_set_bit(MD_RECOVERY_INTR,
3233 &mddev->recovery))
3234 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3235 mdname(mddev));
3236 mirror->recovery_disabled
3237 = mddev->recovery_disabled;
3238 }
3239 put_buf(r10_bio);
3240 if (rb2)
3241 atomic_dec(&rb2->remaining);
3242 r10_bio = rb2;
3243 rdev_dec_pending(mrdev, mddev);
3244 if (mreplace)
3245 rdev_dec_pending(mreplace, mddev);
3246 break;
3247 }
3248 rdev_dec_pending(mrdev, mddev);
3249 if (mreplace)
3250 rdev_dec_pending(mreplace, mddev);
3251 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
3252
3253
3254
3255
3256 int targets = 1;
3257 for (; j < conf->copies; j++) {
3258 int d = r10_bio->devs[j].devnum;
3259 if (conf->mirrors[d].rdev &&
3260 test_bit(In_sync,
3261 &conf->mirrors[d].rdev->flags))
3262 targets++;
3263 }
3264 if (targets == 1)
3265 r10_bio->devs[0].bio->bi_opf
3266 &= ~MD_FAILFAST;
3267 }
3268 }
3269 if (biolist == NULL) {
3270 while (r10_bio) {
3271 struct r10bio *rb2 = r10_bio;
3272 r10_bio = (struct r10bio*) rb2->master_bio;
3273 rb2->master_bio = NULL;
3274 put_buf(rb2);
3275 }
3276 goto giveup;
3277 }
3278 } else {
3279
3280 int count = 0;
3281
3282
3283
3284
3285
3286
3287
3288
3289 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
3290 mddev_is_clustered(mddev) &&
3291 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
3292
3293 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
3294 &sync_blocks, mddev->degraded) &&
3295 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3296 &mddev->recovery)) {
3297
3298 *skipped = 1;
3299 return sync_blocks + sectors_skipped;
3300 }
3301 if (sync_blocks < max_sync)
3302 max_sync = sync_blocks;
3303 r10_bio = raid10_alloc_init_r10buf(conf);
3304 r10_bio->state = 0;
3305
3306 r10_bio->mddev = mddev;
3307 atomic_set(&r10_bio->remaining, 0);
3308 raise_barrier(conf, 0);
3309 conf->next_resync = sector_nr;
3310
3311 r10_bio->master_bio = NULL;
3312 r10_bio->sector = sector_nr;
3313 set_bit(R10BIO_IsSync, &r10_bio->state);
3314 raid10_find_phys(conf, r10_bio);
3315 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3316
3317 for (i = 0; i < conf->copies; i++) {
3318 int d = r10_bio->devs[i].devnum;
3319 sector_t first_bad, sector;
3320 int bad_sectors;
3321 struct md_rdev *rdev;
3322
3323 if (r10_bio->devs[i].repl_bio)
3324 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3325
3326 bio = r10_bio->devs[i].bio;
3327 bio->bi_status = BLK_STS_IOERR;
3328 rcu_read_lock();
3329 rdev = rcu_dereference(conf->mirrors[d].rdev);
3330 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3331 rcu_read_unlock();
3332 continue;
3333 }
3334 sector = r10_bio->devs[i].addr;
3335 if (is_badblock(rdev, sector, max_sync,
3336 &first_bad, &bad_sectors)) {
3337 if (first_bad > sector)
3338 max_sync = first_bad - sector;
3339 else {
3340 bad_sectors -= (sector - first_bad);
3341 if (max_sync > bad_sectors)
3342 max_sync = bad_sectors;
3343 rcu_read_unlock();
3344 continue;
3345 }
3346 }
3347 atomic_inc(&rdev->nr_pending);
3348 atomic_inc(&r10_bio->remaining);
3349 bio->bi_next = biolist;
3350 biolist = bio;
3351 bio->bi_end_io = end_sync_read;
3352 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3353 if (test_bit(FailFast, &rdev->flags))
3354 bio->bi_opf |= MD_FAILFAST;
3355 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3356 bio_set_dev(bio, rdev->bdev);
3357 count++;
3358
3359 rdev = rcu_dereference(conf->mirrors[d].replacement);
3360 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3361 rcu_read_unlock();
3362 continue;
3363 }
3364 atomic_inc(&rdev->nr_pending);
3365
3366
3367 bio = r10_bio->devs[i].repl_bio;
3368 bio->bi_status = BLK_STS_IOERR;
3369
3370 sector = r10_bio->devs[i].addr;
3371 bio->bi_next = biolist;
3372 biolist = bio;
3373 bio->bi_end_io = end_sync_write;
3374 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3375 if (test_bit(FailFast, &rdev->flags))
3376 bio->bi_opf |= MD_FAILFAST;
3377 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3378 bio_set_dev(bio, rdev->bdev);
3379 count++;
3380 rcu_read_unlock();
3381 }
3382
3383 if (count < 2) {
3384 for (i=0; i<conf->copies; i++) {
3385 int d = r10_bio->devs[i].devnum;
3386 if (r10_bio->devs[i].bio->bi_end_io)
3387 rdev_dec_pending(conf->mirrors[d].rdev,
3388 mddev);
3389 if (r10_bio->devs[i].repl_bio &&
3390 r10_bio->devs[i].repl_bio->bi_end_io)
3391 rdev_dec_pending(
3392 conf->mirrors[d].replacement,
3393 mddev);
3394 }
3395 put_buf(r10_bio);
3396 biolist = NULL;
3397 goto giveup;
3398 }
3399 }
3400
3401 nr_sectors = 0;
3402 if (sector_nr + max_sync < max_sector)
3403 max_sector = sector_nr + max_sync;
3404 do {
3405 struct page *page;
3406 int len = PAGE_SIZE;
3407 if (sector_nr + (len>>9) > max_sector)
3408 len = (max_sector - sector_nr) << 9;
3409 if (len == 0)
3410 break;
3411 for (bio= biolist ; bio ; bio=bio->bi_next) {
3412 struct resync_pages *rp = get_resync_pages(bio);
3413 page = resync_fetch_page(rp, page_idx);
3414
3415
3416
3417
3418 bio_add_page(bio, page, len, 0);
3419 }
3420 nr_sectors += len>>9;
3421 sector_nr += len>>9;
3422 } while (++page_idx < RESYNC_PAGES);
3423 r10_bio->sectors = nr_sectors;
3424
3425 if (mddev_is_clustered(mddev) &&
3426 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3427
3428 if (conf->cluster_sync_high < sector_nr + nr_sectors) {
3429 conf->cluster_sync_low = mddev->curr_resync_completed;
3430 raid10_set_cluster_sync_high(conf);
3431
3432 md_cluster_ops->resync_info_update(mddev,
3433 conf->cluster_sync_low,
3434 conf->cluster_sync_high);
3435 }
3436 } else if (mddev_is_clustered(mddev)) {
3437
3438 sector_t sect_va1, sect_va2;
3439 bool broadcast_msg = false;
3440
3441 for (i = 0; i < conf->geo.raid_disks; i++) {
3442
3443
3444
3445
3446
3447 sect_va1 = raid10_find_virt(conf, sector_nr, i);
3448
3449 if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
3450 broadcast_msg = true;
3451
3452
3453
3454
3455 sect_va2 = raid10_find_virt(conf,
3456 mddev->curr_resync_completed, i);
3457
3458 if (conf->cluster_sync_low == 0 ||
3459 conf->cluster_sync_low > sect_va2)
3460 conf->cluster_sync_low = sect_va2;
3461 }
3462 }
3463 if (broadcast_msg) {
3464 raid10_set_cluster_sync_high(conf);
3465 md_cluster_ops->resync_info_update(mddev,
3466 conf->cluster_sync_low,
3467 conf->cluster_sync_high);
3468 }
3469 }
3470
3471 while (biolist) {
3472 bio = biolist;
3473 biolist = biolist->bi_next;
3474
3475 bio->bi_next = NULL;
3476 r10_bio = get_resync_r10bio(bio);
3477 r10_bio->sectors = nr_sectors;
3478
3479 if (bio->bi_end_io == end_sync_read) {
3480 md_sync_acct_bio(bio, nr_sectors);
3481 bio->bi_status = 0;
3482 submit_bio_noacct(bio);
3483 }
3484 }
3485
3486 if (sectors_skipped)
		/* pretend they weren't skipped, it makes
		 * no important difference in this case
		 */
3490 md_done_sync(mddev, sectors_skipped, 1);
3491
3492 return sectors_skipped + nr_sectors;
 giveup:
	/* There is nowhere to write, so all non-sync
	 * drives must be failed or in resync, all drives
	 * have a bad block, so try the next chunk...
	 */
3498 if (sector_nr + max_sync < max_sector)
3499 max_sector = sector_nr + max_sync;
3500
3501 sectors_skipped += (max_sector - sector_nr);
	chunks_skipped++;
3503 sector_nr = max_sector;
3504 goto skipped;
3505}
3506
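/*
 * Return the array size (in sectors) implied by 'sectors' per device and
 * 'raid_disks' devices; with 0 arguments the current configuration is
 * used.  During a reshape the smaller of the old and new disk counts is
 * assumed.
 */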
3507static sector_t
3508raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3509{
3510 sector_t size;
3511 struct r10conf *conf = mddev->private;
3512
3513 if (!raid_disks)
3514 raid_disks = min(conf->geo.raid_disks,
3515 conf->prev.raid_disks);
3516 if (!sectors)
3517 sectors = conf->dev_sectors;
3518
3519 size = sectors >> conf->geo.chunk_shift;
3520 sector_div(size, conf->geo.far_copies);
3521 size = size * raid_disks;
3522 sector_div(size, conf->geo.near_copies);
3523
3524 return size << conf->geo.chunk_shift;
3525}
3526
3527static void calc_sectors(struct r10conf *conf, sector_t size)
3528{
	/* Calculate the number of sectors-per-device that will
	 * actually be used, and set conf->dev_sectors and
	 * conf->stride
	 */

3534 size = size >> conf->geo.chunk_shift;
3535 sector_div(size, conf->geo.far_copies);
3536 size = size * conf->geo.raid_disks;
3537 sector_div(size, conf->geo.near_copies);
3538
3539
3540 size = size * conf->copies;
3541
3542
3543
3544
3545 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3546
3547 conf->dev_sectors = size << conf->geo.chunk_shift;
3548
3549 if (conf->geo.far_offset)
3550 conf->geo.stride = 1 << conf->geo.chunk_shift;
3551 else {
3552 sector_div(size, conf->geo.far_copies);
3553 conf->geo.stride = size << conf->geo.chunk_shift;
3554 }
3555}
3556
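/*
 * setup_geo() decodes the raid10 layout word into a struct geom:
 * bits 0-7 give the number of 'near' copies, bits 8-15 the number of
 * 'far' copies, bit 16 selects 'offset' mode and bits 17-18 choose how
 * the far sets are sized.  It returns the total number of copies
 * (near * far), or a negative value if the layout or chunk size is
 * invalid.  geo_old, geo_new and geo_start select which set of mddev
 * parameters (old, new, or new with the pending disk count) to decode.
 */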
3557enum geo_type {geo_new, geo_old, geo_start};
3558static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3559{
3560 int nc, fc, fo;
3561 int layout, chunk, disks;
3562 switch (new) {
3563 case geo_old:
3564 layout = mddev->layout;
3565 chunk = mddev->chunk_sectors;
3566 disks = mddev->raid_disks - mddev->delta_disks;
3567 break;
3568 case geo_new:
3569 layout = mddev->new_layout;
3570 chunk = mddev->new_chunk_sectors;
3571 disks = mddev->raid_disks;
3572 break;
3573 default:
3574 case geo_start:
3575
3576 layout = mddev->new_layout;
3577 chunk = mddev->new_chunk_sectors;
3578 disks = mddev->raid_disks + mddev->delta_disks;
3579 break;
3580 }
3581 if (layout >> 19)
3582 return -1;
3583 if (chunk < (PAGE_SIZE >> 9) ||
3584 !is_power_of_2(chunk))
3585 return -2;
3586 nc = layout & 255;
3587 fc = (layout >> 8) & 255;
3588 fo = layout & (1<<16);
3589 geo->raid_disks = disks;
3590 geo->near_copies = nc;
3591 geo->far_copies = fc;
3592 geo->far_offset = fo;
3593 switch (layout >> 17) {
3594 case 0:
3595 geo->far_set_size = disks;
3596 break;
3597 case 1:
3598
3599 geo->far_set_size = disks/fc;
3600 WARN(geo->far_set_size < fc,
3601 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3602 break;
3603 case 2:
3604 geo->far_set_size = fc * nc;
3605 break;
3606 default:
3607 return -1;
3608 }
3609 geo->chunk_mask = chunk - 1;
3610 geo->chunk_shift = ffz(~chunk);
3611 return nc*fc;
3612}
3613
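/*
 * Allocate and initialise the r10conf for this array: decode the
 * geometry, allocate the mirrors array, r10bio mempool, split bioset
 * and management thread, and work out the per-device size.  Returns an
 * ERR_PTR() on failure.
 */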
3614static struct r10conf *setup_conf(struct mddev *mddev)
3615{
3616 struct r10conf *conf = NULL;
3617 int err = -EINVAL;
3618 struct geom geo;
3619 int copies;
3620
3621 copies = setup_geo(&geo, mddev, geo_new);
3622
3623 if (copies == -2) {
3624 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3625 mdname(mddev), PAGE_SIZE);
3626 goto out;
3627 }
3628
3629 if (copies < 2 || copies > mddev->raid_disks) {
3630 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3631 mdname(mddev), mddev->new_layout);
3632 goto out;
3633 }
3634
3635 err = -ENOMEM;
3636 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3637 if (!conf)
3638 goto out;
3639
3640
3641 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
3642 sizeof(struct raid10_info),
3643 GFP_KERNEL);
3644 if (!conf->mirrors)
3645 goto out;
3646
3647 conf->tmppage = alloc_page(GFP_KERNEL);
3648 if (!conf->tmppage)
3649 goto out;
3650
3651 conf->geo = geo;
3652 conf->copies = copies;
3653 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
3654 rbio_pool_free, conf);
3655 if (err)
3656 goto out;
3657
3658 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
3659 if (err)
3660 goto out;
3661
3662 calc_sectors(conf, mddev->dev_sectors);
3663 if (mddev->reshape_position == MaxSector) {
3664 conf->prev = conf->geo;
3665 conf->reshape_progress = MaxSector;
3666 } else {
3667 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3668 err = -EINVAL;
3669 goto out;
3670 }
3671 conf->reshape_progress = mddev->reshape_position;
3672 if (conf->prev.far_offset)
3673 conf->prev.stride = 1 << conf->prev.chunk_shift;
3674 else
3675
3676 conf->prev.stride = conf->dev_sectors;
3677 }
3678 conf->reshape_safe = conf->reshape_progress;
3679 spin_lock_init(&conf->device_lock);
3680 INIT_LIST_HEAD(&conf->retry_list);
3681 INIT_LIST_HEAD(&conf->bio_end_io_list);
3682
3683 spin_lock_init(&conf->resync_lock);
3684 init_waitqueue_head(&conf->wait_barrier);
3685 atomic_set(&conf->nr_pending, 0);
3686
3687 err = -ENOMEM;
3688 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3689 if (!conf->thread)
3690 goto out;
3691
3692 conf->mddev = mddev;
3693 return conf;
3694
3695 out:
3696 if (conf) {
3697 mempool_exit(&conf->r10bio_pool);
3698 kfree(conf->mirrors);
3699 safe_put_page(conf->tmppage);
3700 bioset_exit(&conf->bio_split);
3701 kfree(conf);
3702 }
3703 return ERR_PTR(err);
3704}
3705
3706static int raid10_run(struct mddev *mddev)
3707{
3708 struct r10conf *conf;
3709 int i, disk_idx, chunk_size;
3710 struct raid10_info *disk;
3711 struct md_rdev *rdev;
3712 sector_t size;
3713 sector_t min_offset_diff = 0;
3714 int first = 1;
3715 bool discard_supported = false;
3716
3717 if (mddev_init_writes_pending(mddev) < 0)
3718 return -ENOMEM;
3719
3720 if (mddev->private == NULL) {
3721 conf = setup_conf(mddev);
3722 if (IS_ERR(conf))
3723 return PTR_ERR(conf);
3724 mddev->private = conf;
3725 }
3726 conf = mddev->private;
3727 if (!conf)
3728 goto out;
3729
3730 if (mddev_is_clustered(conf->mddev)) {
3731 int fc, fo;
3732
3733 fc = (mddev->layout >> 8) & 255;
3734 fo = mddev->layout & (1<<16);
3735 if (fc > 1 || fo > 0) {
			pr_err("only near layout is supported by clustered raid10\n");
3738 goto out_free_conf;
3739 }
3740 }
3741
3742 mddev->thread = conf->thread;
3743 conf->thread = NULL;
3744
3745 chunk_size = mddev->chunk_sectors << 9;
3746 if (mddev->queue) {
3747 blk_queue_max_discard_sectors(mddev->queue,
3748 mddev->chunk_sectors);
3749 blk_queue_max_write_same_sectors(mddev->queue, 0);
3750 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
3751 blk_queue_io_min(mddev->queue, chunk_size);
3752 if (conf->geo.raid_disks % conf->geo.near_copies)
3753 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3754 else
3755 blk_queue_io_opt(mddev->queue, chunk_size *
3756 (conf->geo.raid_disks / conf->geo.near_copies));
3757 }
3758
3759 rdev_for_each(rdev, mddev) {
3760 long long diff;
3761
3762 disk_idx = rdev->raid_disk;
3763 if (disk_idx < 0)
3764 continue;
3765 if (disk_idx >= conf->geo.raid_disks &&
3766 disk_idx >= conf->prev.raid_disks)
3767 continue;
3768 disk = conf->mirrors + disk_idx;
3769
3770 if (test_bit(Replacement, &rdev->flags)) {
3771 if (disk->replacement)
3772 goto out_free_conf;
3773 disk->replacement = rdev;
3774 } else {
3775 if (disk->rdev)
3776 goto out_free_conf;
3777 disk->rdev = rdev;
3778 }
3779 diff = (rdev->new_data_offset - rdev->data_offset);
3780 if (!mddev->reshape_backwards)
3781 diff = -diff;
3782 if (diff < 0)
3783 diff = 0;
3784 if (first || diff < min_offset_diff)
3785 min_offset_diff = diff;
3786
3787 if (mddev->gendisk)
3788 disk_stack_limits(mddev->gendisk, rdev->bdev,
3789 rdev->data_offset << 9);
3790
3791 disk->head_position = 0;
3792
3793 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3794 discard_supported = true;
3795 first = 0;
3796 }
3797
3798 if (mddev->queue) {
3799 if (discard_supported)
3800 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
3801 mddev->queue);
3802 else
3803 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
3804 mddev->queue);
3805 }
3806
3807 if (!enough(conf, -1)) {
3808 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3809 mdname(mddev));
3810 goto out_free_conf;
3811 }
3812
3813 if (conf->reshape_progress != MaxSector) {
3814
3815 if (conf->geo.far_copies != 1 &&
3816 conf->geo.far_offset == 0)
3817 goto out_free_conf;
3818 if (conf->prev.far_copies != 1 &&
3819 conf->prev.far_offset == 0)
3820 goto out_free_conf;
3821 }
3822
3823 mddev->degraded = 0;
3824 for (i = 0;
3825 i < conf->geo.raid_disks
3826 || i < conf->prev.raid_disks;
3827 i++) {
3828
3829 disk = conf->mirrors + i;
3830
3831 if (!disk->rdev && disk->replacement) {
3832
3833 disk->rdev = disk->replacement;
3834 disk->replacement = NULL;
3835 clear_bit(Replacement, &disk->rdev->flags);
3836 }
3837
3838 if (!disk->rdev ||
3839 !test_bit(In_sync, &disk->rdev->flags)) {
3840 disk->head_position = 0;
3841 mddev->degraded++;
3842 if (disk->rdev &&
3843 disk->rdev->saved_raid_disk < 0)
3844 conf->fullsync = 1;
3845 }
3846
3847 if (disk->replacement &&
3848 !test_bit(In_sync, &disk->replacement->flags) &&
3849 disk->replacement->saved_raid_disk < 0) {
3850 conf->fullsync = 1;
3851 }
3852
3853 disk->recovery_disabled = mddev->recovery_disabled - 1;
3854 }
3855
3856 if (mddev->recovery_cp != MaxSector)
3857 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3858 mdname(mddev));
3859 pr_info("md/raid10:%s: active with %d out of %d devices\n",
3860 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3861 conf->geo.raid_disks);
3862
3863
3864
3865 mddev->dev_sectors = conf->dev_sectors;
3866 size = raid10_size(mddev, 0, 0);
3867 md_set_array_sectors(mddev, size);
3868 mddev->resync_max_sectors = size;
3869 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3870
3871 if (mddev->queue) {
3872 int stripe = conf->geo.raid_disks *
3873 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3874
3875
3876
3877
3878
3879 stripe /= conf->geo.near_copies;
3880 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
3881 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
3882 }
3883
3884 if (md_integrity_register(mddev))
3885 goto out_free_conf;
3886
3887 if (conf->reshape_progress != MaxSector) {
3888 unsigned long before_length, after_length;
3889
3890 before_length = ((1 << conf->prev.chunk_shift) *
3891 conf->prev.far_copies);
3892 after_length = ((1 << conf->geo.chunk_shift) *
3893 conf->geo.far_copies);
3894
3895 if (max(before_length, after_length) > min_offset_diff) {
3896
3897 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3898 goto out_free_conf;
3899 }
3900 conf->offset_diff = min_offset_diff;
3901
3902 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3903 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3904 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3905 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3906 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3907 "reshape");
3908 if (!mddev->sync_thread)
3909 goto out_free_conf;
3910 }
3911
3912 return 0;
3913
3914out_free_conf:
3915 md_unregister_thread(&mddev->thread);
3916 mempool_exit(&conf->r10bio_pool);
3917 safe_put_page(conf->tmppage);
3918 kfree(conf->mirrors);
3919 kfree(conf);
3920 mddev->private = NULL;
3921out:
3922 return -EIO;
3923}
3924
3925static void raid10_free(struct mddev *mddev, void *priv)
3926{
3927 struct r10conf *conf = priv;
3928
3929 mempool_exit(&conf->r10bio_pool);
3930 safe_put_page(conf->tmppage);
3931 kfree(conf->mirrors);
3932 kfree(conf->mirrors_old);
3933 kfree(conf->mirrors_new);
3934 bioset_exit(&conf->bio_split);
3935 kfree(conf);
3936}
3937
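/*
 * Quiesce (or resume) the array by raising or lowering the resync
 * barrier, which blocks all new normal I/O until it is lowered again.
 */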
3938static void raid10_quiesce(struct mddev *mddev, int quiesce)
3939{
3940 struct r10conf *conf = mddev->private;
3941
3942 if (quiesce)
3943 raise_barrier(conf, 0);
3944 else
3945 lower_barrier(conf);
3946}
3947
3948static int raid10_resize(struct mddev *mddev, sector_t sectors)
3949{
	/* Resize of 'far' arrays is not supported.
	 * For 'near' and 'offset' arrays we can set the
	 * number of sectors used to be an appropriate multiple
	 * of the chunk size.
	 * For 'offset', this is far_copies*chunksize.
	 * For 'near' the multiplier is the LCM of
	 * near_copies and raid_disks.
	 * So if far_copies > 1 && !far_offset, fail.
	 * Else find LCM(raid_disks, near_copy)*far_copies and
	 * multiply by chunk_size.  Then round to this number.
	 * This is mostly done by raid10_size()
	 */
3962 struct r10conf *conf = mddev->private;
3963 sector_t oldsize, size;
3964
3965 if (mddev->reshape_position != MaxSector)
3966 return -EBUSY;
3967
3968 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3969 return -EINVAL;
3970
3971 oldsize = raid10_size(mddev, 0, 0);
3972 size = raid10_size(mddev, sectors, 0);
3973 if (mddev->external_size &&
3974 mddev->array_sectors > size)
3975 return -EINVAL;
3976 if (mddev->bitmap) {
3977 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
3978 if (ret)
3979 return ret;
3980 }
3981 md_set_array_sectors(mddev, size);
3982 if (sectors > mddev->dev_sectors &&
3983 mddev->recovery_cp > oldsize) {
3984 mddev->recovery_cp = oldsize;
3985 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3986 }
3987 calc_sectors(conf, sectors);
3988 mddev->dev_sectors = conf->dev_sectors;
3989 mddev->resync_max_sectors = size;
3990 return 0;
3991}
3992
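/*
 * Convert a single-zone RAID0 into a degraded 'near-2' RAID10: the disk
 * count is doubled, the existing devices move to the even slots and the
 * odd (mirror) slots are left empty, to be populated later by adding
 * spares and letting recovery run.
 */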
3993static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
3994{
3995 struct md_rdev *rdev;
3996 struct r10conf *conf;
3997
3998 if (mddev->degraded > 0) {
3999 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
4000 mdname(mddev));
4001 return ERR_PTR(-EINVAL);
4002 }
4003 sector_div(size, devs);

	/* Set new parameters */
	mddev->new_level = 10;
	/* new layout: far_copies = 1, near_copies = 2 */
	mddev->new_layout = (1<<8) + 2;
4009 mddev->new_chunk_sectors = mddev->chunk_sectors;
4010 mddev->delta_disks = mddev->raid_disks;
4011 mddev->raid_disks *= 2;
4012
4013 mddev->recovery_cp = MaxSector;
4014 mddev->dev_sectors = size;
4015
4016 conf = setup_conf(mddev);
4017 if (!IS_ERR(conf)) {
4018 rdev_for_each(rdev, mddev)
4019 if (rdev->raid_disk >= 0) {
4020 rdev->new_raid_disk = rdev->raid_disk * 2;
4021 rdev->sectors = size;
4022 }
4023 conf->barrier = 1;
4024 }
4025
4026 return conf;
4027}
4028
4029static void *raid10_takeover(struct mddev *mddev)
4030{
4031 struct r0conf *raid0_conf;
4032
	/* raid10 can currently only take over a raid0 array, and only
	 * if it has a single zone (i.e. all devices the same size).
	 */
	if (mddev->level == 0) {
		/* for raid0 takeover only one zone is supported */
4038 raid0_conf = mddev->private;
4039 if (raid0_conf->nr_strip_zones > 1) {
4040 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4041 mdname(mddev));
4042 return ERR_PTR(-EINVAL);
4043 }
4044 return raid10_takeover_raid0(mddev,
4045 raid0_conf->strip_zone->zone_end,
4046 raid0_conf->strip_zone->nb_dev);
4047 }
4048 return ERR_PTR(-EINVAL);
4049}
4050
4051static int raid10_check_reshape(struct mddev *mddev)
4052{
	/* Called when there is a request to change
	 * - layout (to ->new_layout)
	 * - chunk size (to ->new_chunk_sectors)
	 * - raid_disks (by delta_disks)
	 * or when trying to restart a reshape that was ongoing.
	 *
	 * We need to validate the request and possibly allocate
	 * space if that might be an issue later.
	 *
	 * Currently we reject any reshape of a 'far' mode array,
	 * allow chunk size and raid_disks to change, and allow
	 * a switch between 'near' and 'offset' modes.
	 */
4067 struct r10conf *conf = mddev->private;
4068 struct geom geo;
4069
4070 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4071 return -EINVAL;
4072
	if (setup_geo(&geo, mddev, geo_start) != conf->copies)
		/* Cannot change number of copies */
		return -EINVAL;
	if (geo.far_copies > 1 && !geo.far_offset)
		/* Cannot switch to 'far' mode */
		return -EINVAL;

	if (mddev->array_sectors & geo.chunk_mask)
		/* not factor of array size */
		return -EINVAL;
4083
4084 if (!enough(conf, -1))
4085 return -EINVAL;
4086
4087 kfree(conf->mirrors_new);
4088 conf->mirrors_new = NULL;
4089 if (mddev->delta_disks > 0) {
4090
4091 conf->mirrors_new =
4092 kcalloc(mddev->raid_disks + mddev->delta_disks,
4093 sizeof(struct raid10_info),
4094 GFP_KERNEL);
4095 if (!conf->mirrors_new)
4096 return -ENOMEM;
4097 }
4098 return 0;
4099}
4100
/*
 * Need to check if the array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
4114static int calc_degraded(struct r10conf *conf)
4115{
4116 int degraded, degraded2;
4117 int i;
4118
4119 rcu_read_lock();
4120 degraded = 0;
4121
4122 for (i = 0; i < conf->prev.raid_disks; i++) {
4123 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4124 if (!rdev || test_bit(Faulty, &rdev->flags))
4125 degraded++;
4126 else if (!test_bit(In_sync, &rdev->flags))
4127
4128
4129
4130
4131 degraded++;
4132 }
4133 rcu_read_unlock();
4134 if (conf->geo.raid_disks == conf->prev.raid_disks)
4135 return degraded;
4136 rcu_read_lock();
4137 degraded2 = 0;
4138 for (i = 0; i < conf->geo.raid_disks; i++) {
4139 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4140 if (!rdev || test_bit(Faulty, &rdev->flags))
4141 degraded2++;
4142 else if (!test_bit(In_sync, &rdev->flags)) {
4143
4144
4145
4146
4147
4148 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4149 degraded2++;
4150 }
4151 }
4152 rcu_read_unlock();
4153 if (degraded2 > degraded)
4154 return degraded2;
4155 return degraded;
4156}
4157
4158static int raid10_start_reshape(struct mddev *mddev)
4159{
	/* A 'reshape' has been requested.  This commits
	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
	 * This also checks if there are enough spares and adds them
	 * to the array.
	 * We currently require enough spares to make the final
	 * array non-degraded.  We also require that the difference
	 * between old and new data_offset - on each device - is
	 * enough that we never risk over-writing.
	 */
4170 unsigned long before_length, after_length;
4171 sector_t min_offset_diff = 0;
4172 int first = 1;
4173 struct geom new;
4174 struct r10conf *conf = mddev->private;
4175 struct md_rdev *rdev;
4176 int spares = 0;
4177 int ret;
4178
4179 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4180 return -EBUSY;
4181
4182 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4183 return -EINVAL;
4184
4185 before_length = ((1 << conf->prev.chunk_shift) *
4186 conf->prev.far_copies);
4187 after_length = ((1 << conf->geo.chunk_shift) *
4188 conf->geo.far_copies);
4189
4190 rdev_for_each(rdev, mddev) {
4191 if (!test_bit(In_sync, &rdev->flags)
4192 && !test_bit(Faulty, &rdev->flags))
4193 spares++;
4194 if (rdev->raid_disk >= 0) {
4195 long long diff = (rdev->new_data_offset
4196 - rdev->data_offset);
4197 if (!mddev->reshape_backwards)
4198 diff = -diff;
4199 if (diff < 0)
4200 diff = 0;
4201 if (first || diff < min_offset_diff)
4202 min_offset_diff = diff;
4203 first = 0;
4204 }
4205 }
4206
4207 if (max(before_length, after_length) > min_offset_diff)
4208 return -EINVAL;
4209
4210 if (spares < mddev->delta_disks)
4211 return -EINVAL;
4212
4213 conf->offset_diff = min_offset_diff;
4214 spin_lock_irq(&conf->device_lock);
4215 if (conf->mirrors_new) {
4216 memcpy(conf->mirrors_new, conf->mirrors,
4217 sizeof(struct raid10_info)*conf->prev.raid_disks);
4218 smp_mb();
4219 kfree(conf->mirrors_old);
4220 conf->mirrors_old = conf->mirrors;
4221 conf->mirrors = conf->mirrors_new;
4222 conf->mirrors_new = NULL;
4223 }
4224 setup_geo(&conf->geo, mddev, geo_start);
4225 smp_mb();
4226 if (mddev->reshape_backwards) {
4227 sector_t size = raid10_size(mddev, 0, 0);
4228 if (size < mddev->array_sectors) {
4229 spin_unlock_irq(&conf->device_lock);
			pr_warn("md/raid10:%s: array size must be reduced before number of disks\n",
4231 mdname(mddev));
4232 return -EINVAL;
4233 }
4234 mddev->resync_max_sectors = size;
4235 conf->reshape_progress = size;
4236 } else
4237 conf->reshape_progress = 0;
4238 conf->reshape_safe = conf->reshape_progress;
4239 spin_unlock_irq(&conf->device_lock);
4240
4241 if (mddev->delta_disks && mddev->bitmap) {
4242 struct mdp_superblock_1 *sb = NULL;
4243 sector_t oldsize, newsize;
4244
4245 oldsize = raid10_size(mddev, 0, 0);
4246 newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
4247
4248 if (!mddev_is_clustered(mddev)) {
4249 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4250 if (ret)
4251 goto abort;
4252 else
4253 goto out;
4254 }
4255
4256 rdev_for_each(rdev, mddev) {
4257 if (rdev->raid_disk > -1 &&
4258 !test_bit(Faulty, &rdev->flags))
4259 sb = page_address(rdev->sb_page);
4260 }
4261
4262
4263
4264
4265
4266
4267 if ((sb && (le32_to_cpu(sb->feature_map) &
4268 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
4269 goto out;
4270
4271 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4272 if (ret)
4273 goto abort;
4274
4275 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
4276 if (ret) {
4277 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
4278 goto abort;
4279 }
4280 }
4281out:
4282 if (mddev->delta_disks > 0) {
4283 rdev_for_each(rdev, mddev)
4284 if (rdev->raid_disk < 0 &&
4285 !test_bit(Faulty, &rdev->flags)) {
4286 if (raid10_add_disk(mddev, rdev) == 0) {
4287 if (rdev->raid_disk >=
4288 conf->prev.raid_disks)
4289 set_bit(In_sync, &rdev->flags);
4290 else
4291 rdev->recovery_offset = 0;
4292
4293
4294 sysfs_link_rdev(mddev, rdev);
4295 }
4296 } else if (rdev->raid_disk >= conf->prev.raid_disks
4297 && !test_bit(Faulty, &rdev->flags)) {
4298
4299 set_bit(In_sync, &rdev->flags);
4300 }
4301 }
4302
4303
4304
4305
4306 spin_lock_irq(&conf->device_lock);
4307 mddev->degraded = calc_degraded(conf);
4308 spin_unlock_irq(&conf->device_lock);
4309 mddev->raid_disks = conf->geo.raid_disks;
4310 mddev->reshape_position = conf->reshape_progress;
4311 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4312
4313 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4314 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4315 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4316 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4317 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4318
4319 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4320 "reshape");
4321 if (!mddev->sync_thread) {
4322 ret = -EAGAIN;
4323 goto abort;
4324 }
4325 conf->reshape_checkpoint = jiffies;
4326 md_wakeup_thread(mddev->sync_thread);
4327 md_new_event(mddev);
4328 return 0;
4329
4330abort:
4331 mddev->recovery = 0;
4332 spin_lock_irq(&conf->device_lock);
4333 conf->geo = conf->prev;
4334 mddev->raid_disks = conf->geo.raid_disks;
4335 rdev_for_each(rdev, mddev)
4336 rdev->new_data_offset = rdev->data_offset;
4337 smp_wmb();
4338 conf->reshape_progress = MaxSector;
4339 conf->reshape_safe = MaxSector;
4340 mddev->reshape_position = MaxSector;
4341 spin_unlock_irq(&conf->device_lock);
4342 return ret;
4343}
4344
/*
 * Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address that should contain a different chunk.
 */
4351static sector_t last_dev_address(sector_t s, struct geom *geo)
4352{
4353 s = (s | geo->chunk_mask) + 1;
4354 s >>= geo->chunk_shift;
4355 s *= geo->near_copies;
4356 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4357 s *= geo->far_copies;
4358 s <<= geo->chunk_shift;
4359 return s;
4360}
4361
/*
 * Calculate the first device-address that could contain
 * any block from the chunk that includes the array-address 's'.
 * This too will be the start of a chunk.
 */
4366static sector_t first_dev_address(sector_t s, struct geom *geo)
4367{
4368 s >>= geo->chunk_shift;
4369 s *= geo->near_copies;
4370 sector_div(s, geo->raid_disks);
4371 s *= geo->far_copies;
4372 s <<= geo->chunk_shift;
4373 return s;
4374}
4375
4376static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4377 int *skipped)
4378{
	/* We simply copy at most one chunk (smallest of old and new)
	 * at a time, possibly less if that exceeds RESYNC_PAGES,
	 * or we hit a bad block or something.
	 * This might mean we pause for normal IO in the middle of
	 * a chunk, but that is not a problem because
	 * mddev->reshape_position can record any location.
	 *
	 * If we will want to write to a location that isn't
	 * yet recorded as 'safe' (i.e. in metadata on disk) then
	 * we need to flush all reshape requests and update the metadata.
	 *
	 * When reshaping forwards (e.g. to more devices), we interpret
	 * 'safe' as the earliest block which might not have been copied
	 * down yet.  We divide this by the previous stripe size and
	 * multiply by the previous stripe length to get the lowest device
	 * offset that we cannot write to yet.
	 * We interpret 'sector_nr' as an address that we want to write to.
	 * From this we use last_dev_address() to find where we might
	 * write to, and first_dev_address() on the 'safe' position.
	 * If this 'next' write position is after the 'safe' position,
	 * we must update the metadata to increase the 'safe' position.
	 *
	 * When reshaping backwards, we round in the opposite direction
	 * and perform the reverse test: the next write position must not
	 * be less than the current safe position.
	 *
	 * In all this the minimum difference in data offsets
	 * (conf->offset_diff - always positive) allows a bit of slack,
	 * so next can be after 'safe', but not by more than offset_diff.
	 *
	 * We need to prepare all the bios here before we start any IO
	 * to ensure the size we choose is acceptable to all devices.
	 * That means one read bio (stored in ->master_bio) plus one write
	 * bio for each copy, in ->devs[x].bio and ->devs[x].repl_bio.
	 */
4416 struct r10conf *conf = mddev->private;
4417 struct r10bio *r10_bio;
4418 sector_t next, safe, last;
4419 int max_sectors;
4420 int nr_sectors;
4421 int s;
4422 struct md_rdev *rdev;
4423 int need_flush = 0;
4424 struct bio *blist;
4425 struct bio *bio, *read_bio;
4426 int sectors_done = 0;
4427 struct page **pages;
4428
4429 if (sector_nr == 0) {
4430
4431 if (mddev->reshape_backwards &&
4432 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4433 sector_nr = (raid10_size(mddev, 0, 0)
4434 - conf->reshape_progress);
4435 } else if (!mddev->reshape_backwards &&
4436 conf->reshape_progress > 0)
4437 sector_nr = conf->reshape_progress;
4438 if (sector_nr) {
4439 mddev->curr_resync_completed = sector_nr;
4440 sysfs_notify_dirent_safe(mddev->sysfs_completed);
4441 *skipped = 1;
4442 return sector_nr;
4443 }
4444 }
4445
4446
4447
4448
4449
4450 if (mddev->reshape_backwards) {
4451
4452
4453
4454 next = first_dev_address(conf->reshape_progress - 1,
4455 &conf->geo);
4456
4457
4458
4459
4460 safe = last_dev_address(conf->reshape_safe - 1,
4461 &conf->prev);
4462
4463 if (next + conf->offset_diff < safe)
4464 need_flush = 1;
4465
4466 last = conf->reshape_progress - 1;
4467 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4468 & conf->prev.chunk_mask);
4469 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4470 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4471 } else {
4472
4473
4474
4475 next = last_dev_address(conf->reshape_progress, &conf->geo);
4476
4477
4478
4479
4480 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4481
4482
4483
4484
4485 if (next > safe + conf->offset_diff)
4486 need_flush = 1;
4487
4488 sector_nr = conf->reshape_progress;
4489 last = sector_nr | (conf->geo.chunk_mask
4490 & conf->prev.chunk_mask);
4491
4492 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4493 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4494 }
4495
4496 if (need_flush ||
4497 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4498
4499 wait_barrier(conf);
4500 mddev->reshape_position = conf->reshape_progress;
4501 if (mddev->reshape_backwards)
4502 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4503 - conf->reshape_progress;
4504 else
4505 mddev->curr_resync_completed = conf->reshape_progress;
4506 conf->reshape_checkpoint = jiffies;
4507 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4508 md_wakeup_thread(mddev->thread);
4509 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4510 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4511 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4512 allow_barrier(conf);
4513 return sectors_done;
4514 }
4515 conf->reshape_safe = mddev->reshape_position;
4516 allow_barrier(conf);
4517 }
4518
4519 raise_barrier(conf, 0);
4520read_more:
4521
4522 r10_bio = raid10_alloc_init_r10buf(conf);
4523 r10_bio->state = 0;
4524 raise_barrier(conf, 1);
4525 atomic_set(&r10_bio->remaining, 0);
4526 r10_bio->mddev = mddev;
4527 r10_bio->sector = sector_nr;
4528 set_bit(R10BIO_IsReshape, &r10_bio->state);
4529 r10_bio->sectors = last - sector_nr + 1;
4530 rdev = read_balance(conf, r10_bio, &max_sectors);
4531 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4532
4533 if (!rdev) {
		/* Cannot read this chunk from anywhere - abort the
		 * reshape rather than writing bad data.
		 */
4538 mempool_free(r10_bio, &conf->r10buf_pool);
4539 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4540 return sectors_done;
4541 }
4542
4543 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4544
4545 bio_set_dev(read_bio, rdev->bdev);
4546 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4547 + rdev->data_offset);
4548 read_bio->bi_private = r10_bio;
4549 read_bio->bi_end_io = end_reshape_read;
4550 bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4551 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4552 read_bio->bi_status = 0;
4553 read_bio->bi_vcnt = 0;
4554 read_bio->bi_iter.bi_size = 0;
4555 r10_bio->master_bio = read_bio;
4556 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4557
4558
4559
4560
4561
4562 if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
4563 struct mdp_superblock_1 *sb = NULL;
4564 int sb_reshape_pos = 0;
4565
4566 conf->cluster_sync_low = sector_nr;
4567 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
4568 sb = page_address(rdev->sb_page);
4569 if (sb) {
4570 sb_reshape_pos = le64_to_cpu(sb->reshape_position);
4571
4572
4573
4574
4575
4576 if (sb_reshape_pos < conf->cluster_sync_low)
4577 conf->cluster_sync_low = sb_reshape_pos;
4578 }
4579
4580 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
4581 conf->cluster_sync_high);
4582 }
4583
4584
4585 __raid10_find_phys(&conf->geo, r10_bio);
4586
4587 blist = read_bio;
4588 read_bio->bi_next = NULL;
4589
4590 rcu_read_lock();
4591 for (s = 0; s < conf->copies*2; s++) {
4592 struct bio *b;
4593 int d = r10_bio->devs[s/2].devnum;
4594 struct md_rdev *rdev2;
4595 if (s&1) {
4596 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4597 b = r10_bio->devs[s/2].repl_bio;
4598 } else {
4599 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4600 b = r10_bio->devs[s/2].bio;
4601 }
4602 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4603 continue;
4604
4605 bio_set_dev(b, rdev2->bdev);
4606 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4607 rdev2->new_data_offset;
4608 b->bi_end_io = end_reshape_write;
4609 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4610 b->bi_next = blist;
4611 blist = b;
4612 }
4613
4614
4615
4616 nr_sectors = 0;
4617 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4618 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4619 struct page *page = pages[s / (PAGE_SIZE >> 9)];
4620 int len = (max_sectors - s) << 9;
4621 if (len > PAGE_SIZE)
4622 len = PAGE_SIZE;
4623 for (bio = blist; bio ; bio = bio->bi_next) {
4624
4625
4626
4627
4628 bio_add_page(bio, page, len, 0);
4629 }
4630 sector_nr += len >> 9;
4631 nr_sectors += len >> 9;
4632 }
4633 rcu_read_unlock();
4634 r10_bio->sectors = nr_sectors;
4635
4636
4637 md_sync_acct_bio(read_bio, r10_bio->sectors);
4638 atomic_inc(&r10_bio->remaining);
4639 read_bio->bi_next = NULL;
4640 submit_bio_noacct(read_bio);
4641 sectors_done += nr_sectors;
4642 if (sector_nr <= last)
4643 goto read_more;
4644
4645 lower_barrier(conf);
4646
4647
4648
4649
4650 if (mddev->reshape_backwards)
4651 conf->reshape_progress -= sectors_done;
4652 else
4653 conf->reshape_progress += sectors_done;
4654
4655 return sectors_done;
4656}
4657
4658static void end_reshape_request(struct r10bio *r10_bio);
4659static int handle_reshape_read_error(struct mddev *mddev,
4660 struct r10bio *r10_bio);
4661static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4662{
	/* Reshape read completed.  Hopefully we have a block to write out.
	 * If we got a read error then we do sync 1-page reads from
	 * elsewhere till the block is ok, then write to the target.
	 */
4668 struct r10conf *conf = mddev->private;
4669 int s;
4670
4671 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4672 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4673
4674 md_done_sync(mddev, r10_bio->sectors, 0);
4675 return;
4676 }
4677
4678
4679
4680
4681 atomic_set(&r10_bio->remaining, 1);
4682 for (s = 0; s < conf->copies*2; s++) {
4683 struct bio *b;
4684 int d = r10_bio->devs[s/2].devnum;
4685 struct md_rdev *rdev;
4686 rcu_read_lock();
4687 if (s&1) {
4688 rdev = rcu_dereference(conf->mirrors[d].replacement);
4689 b = r10_bio->devs[s/2].repl_bio;
4690 } else {
4691 rdev = rcu_dereference(conf->mirrors[d].rdev);
4692 b = r10_bio->devs[s/2].bio;
4693 }
4694 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4695 rcu_read_unlock();
4696 continue;
4697 }
4698 atomic_inc(&rdev->nr_pending);
4699 rcu_read_unlock();
4700 md_sync_acct_bio(b, r10_bio->sectors);
4701 atomic_inc(&r10_bio->remaining);
4702 b->bi_next = NULL;
4703 submit_bio_noacct(b);
4704 }
4705 end_reshape_request(r10_bio);
4706}
4707
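/*
 * If the reshape completed (was not interrupted), commit the new
 * geometry as the only one, clear the reshape progress markers and
 * increase the read-ahead window to suit the new layout.
 */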
4708static void end_reshape(struct r10conf *conf)
4709{
4710 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4711 return;
4712
4713 spin_lock_irq(&conf->device_lock);
4714 conf->prev = conf->geo;
4715 md_finish_reshape(conf->mddev);
4716 smp_wmb();
4717 conf->reshape_progress = MaxSector;
4718 conf->reshape_safe = MaxSector;
4719 spin_unlock_irq(&conf->device_lock);
4720
4721
4722
4723
4724 if (conf->mddev->queue) {
4725 int stripe = conf->geo.raid_disks *
4726 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4727 stripe /= conf->geo.near_copies;
4728 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
4729 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
4730 }
4731 conf->fullsync = 0;
4732}
4733
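/*
 * Adopt mddev->reshape_position as our reshape_progress, but only if it
 * lies inside the resync window reported by the cluster code (or is
 * MaxSector, meaning no reshape is pending).
 */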
4734static void raid10_update_reshape_pos(struct mddev *mddev)
4735{
4736 struct r10conf *conf = mddev->private;
4737 sector_t lo, hi;
4738
4739 md_cluster_ops->resync_info_get(mddev, &lo, &hi);
4740 if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
4741 || mddev->reshape_position == MaxSector)
4742 conf->reshape_progress = mddev->reshape_position;
4743 else
4744 WARN_ON_ONCE(1);
4745}
4746
4747static int handle_reshape_read_error(struct mddev *mddev,
4748 struct r10bio *r10_bio)
4749{
4750
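	/* The read from the old layout failed; try to recover the data with
	 * synchronous reads of the other copies, still using the
	 * pre-reshape geometry (conf->prev).
	 */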
4751 int sectors = r10_bio->sectors;
4752 struct r10conf *conf = mddev->private;
4753 struct r10bio *r10b;
4754 int slot = 0;
4755 int idx = 0;
4756 struct page **pages;
4757
4758 r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
4759 if (!r10b) {
4760 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4761 return -ENOMEM;
4762 }
4763
4764
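	/* Reshape IOs share the pages attached to .devs[0].bio. */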
4765 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4766
4767 r10b->sector = r10_bio->sector;
4768 __raid10_find_phys(&conf->prev, r10b);
4769
4770 while (sectors) {
4771 int s = sectors;
4772 int success = 0;
4773 int first_slot = slot;
4774
4775 if (s > (PAGE_SIZE >> 9))
4776 s = PAGE_SIZE >> 9;
4777
4778 rcu_read_lock();
4779 while (!success) {
4780 int d = r10b->devs[slot].devnum;
4781 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4782 sector_t addr;
4783 if (rdev == NULL ||
4784 test_bit(Faulty, &rdev->flags) ||
4785 !test_bit(In_sync, &rdev->flags))
4786 goto failed;
4787
			/* idx counts whole pages: each one steps PAGE_SIZE >> 9 sectors into the range */
4788			addr = r10b->devs[slot].addr + idx * (PAGE_SIZE >> 9);
4789 atomic_inc(&rdev->nr_pending);
4790 rcu_read_unlock();
4791 success = sync_page_io(rdev,
4792 addr,
4793 s << 9,
4794 pages[idx],
4795 REQ_OP_READ, 0, false);
4796 rdev_dec_pending(rdev, mddev);
4797 rcu_read_lock();
4798 if (success)
4799 break;
4800 failed:
4801 slot++;
4802 if (slot >= conf->copies)
4803 slot = 0;
4804 if (slot == first_slot)
4805 break;
4806 }
4807 rcu_read_unlock();
4808 if (!success) {
4809
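			/* Couldn't read this block from any copy; abort the reshape. */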
4810 set_bit(MD_RECOVERY_INTR,
4811 &mddev->recovery);
4812 kfree(r10b);
4813 return -EIO;
4814 }
4815 sectors -= s;
4816 idx++;
4817 }
4818 kfree(r10b);
4819 return 0;
4820}
4821
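/*
 * bi_end_io for the writes issued by reshape_request_write(): on error fail
 * the device, then drop the pending count and the r10bio reference.
 */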
4822static void end_reshape_write(struct bio *bio)
4823{
4824 struct r10bio *r10_bio = get_resync_r10bio(bio);
4825 struct mddev *mddev = r10_bio->mddev;
4826 struct r10conf *conf = mddev->private;
4827 int d;
4828 int slot;
4829 int repl;
4830 struct md_rdev *rdev = NULL;
4831
4832 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4833 if (repl)
4834 rdev = conf->mirrors[d].replacement;
4835 if (!rdev) {
4836 smp_mb();
4837 rdev = conf->mirrors[d].rdev;
4838 }
4839
4840 if (bio->bi_status) {
4841
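		/* FIXME: should record a bad block rather than failing the device. */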
4842 md_error(mddev, rdev);
4843 }
4844
4845 rdev_dec_pending(rdev, mddev);
4846 end_reshape_request(r10_bio);
4847}
4848
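/*
 * Drop one reference on the r10bio; the final drop reports the sectors as
 * in sync and releases the read bio and the resync buffer.
 */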
4849static void end_reshape_request(struct r10bio *r10_bio)
4850{
4851 if (!atomic_dec_and_test(&r10_bio->remaining))
4852 return;
4853 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4854 bio_put(r10_bio->master_bio);
4855 put_buf(r10_bio);
4856}
4857
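/*
 * Commit the completed reshape: when disks were added, update the resync
 * limits so the new space gets resynced; when disks were removed, mark the
 * no-longer-used devices out of sync; then record the new layout and chunk
 * size and clear the reshape state.
 */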
4858static void raid10_finish_reshape(struct mddev *mddev)
4859{
4860 struct r10conf *conf = mddev->private;
4861
4862 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4863 return;
4864
4865 if (mddev->delta_disks > 0) {
4866 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4867 mddev->recovery_cp = mddev->resync_max_sectors;
4868 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4869 }
4870 mddev->resync_max_sectors = mddev->array_sectors;
4871 } else {
4872 int d;
4873 rcu_read_lock();
4874 for (d = conf->geo.raid_disks ;
4875 d < conf->geo.raid_disks - mddev->delta_disks;
4876 d++) {
4877 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4878 if (rdev)
4879 clear_bit(In_sync, &rdev->flags);
4880 rdev = rcu_dereference(conf->mirrors[d].replacement);
4881 if (rdev)
4882 clear_bit(In_sync, &rdev->flags);
4883 }
4884 rcu_read_unlock();
4885 }
4886 mddev->layout = mddev->new_layout;
4887 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4888 mddev->reshape_position = MaxSector;
4889 mddev->delta_disks = 0;
4890 mddev->reshape_backwards = 0;
4891}
4892
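/* Operations the md core uses to drive a RAID10 array. */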
4893static struct md_personality raid10_personality =
4894{
4895 .name = "raid10",
4896 .level = 10,
4897 .owner = THIS_MODULE,
4898 .make_request = raid10_make_request,
4899 .run = raid10_run,
4900 .free = raid10_free,
4901 .status = raid10_status,
4902 .error_handler = raid10_error,
4903 .hot_add_disk = raid10_add_disk,
4904	.hot_remove_disk = raid10_remove_disk,
4905 .spare_active = raid10_spare_active,
4906 .sync_request = raid10_sync_request,
4907 .quiesce = raid10_quiesce,
4908 .size = raid10_size,
4909 .resize = raid10_resize,
4910 .takeover = raid10_takeover,
4911 .check_reshape = raid10_check_reshape,
4912 .start_reshape = raid10_start_reshape,
4913 .finish_reshape = raid10_finish_reshape,
4914 .update_reshape_pos = raid10_update_reshape_pos,
4915};
4916
4917static int __init raid_init(void)
4918{
4919 return register_md_personality(&raid10_personality);
4920}
4921
4922static void raid_exit(void)
4923{
4924 unregister_md_personality(&raid10_personality);
4925}
4926
4927module_init(raid_init);
4928module_exit(raid_exit);
4929MODULE_LICENSE("GPL");
4930MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4931MODULE_ALIAS("md-personality-9");
4932MODULE_ALIAS("md-raid10");
4933MODULE_ALIAS("md-level-10");
4934
4935module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4936