// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Base on code in raid1.c.  See raid1.c for further copyright information.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *    use_far_sets_bugfixed (stored in bit 18 of layout)
 *
 * The data to be stored is divided into chunks of size chunk_size, and
 * each chunk is stored on near_copies different devices.  The first
 * chunk is stored near_copies times on consecutive devices, the next
 * chunk on the following devices, and so on, much like RAID0 with
 * mirroring.
 *
 * If far_copies > 1, extra copies of each chunk are stored further down
 * each device, so the whole array is effectively repeated (rotated by
 * one device) in the later part of every drive.  This keeps the copies
 * of a chunk on different devices while still allowing large sequential
 * reads to be striped across all drives.
 *
 * If far_offset is set, the far copies are placed immediately after the
 * near copies (offset by one stripe) instead of in the second half of
 * the devices.
 *
 * If use_far_sets is set, the devices are divided into sets of
 * near_copies * far_copies adjacent devices, and the far copies are
 * rotated only within their set.  use_far_sets_bugfixed selects the
 * corrected placement for the last (possibly smaller) set when
 * raid_disks is not a multiple of the set size.
 */

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

#define raid10_log(md, fmt, args...) \
	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)

#include "raid1-10.c"
81
/*
 * for resync bio, r10bio pointer can be retrieved from the per-bio
 * 'struct resync_pages'.
 */
86static inline struct r10bio *get_resync_r10bio(struct bio *bio)
87{
88 return get_resync_pages(bio)->raid_bio;
89}
90
91static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
92{
93 struct r10conf *conf = data;
94 int size = offsetof(struct r10bio, devs[conf->copies]);

	/* allocate a r10bio with room for raid_disks entries in the devs array */
98 return kzalloc(size, gfp_flags);
99}

#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)

/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)

/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf).
 */
116static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
117{
118 struct r10conf *conf = data;
119 struct r10bio *r10_bio;
120 struct bio *bio;
121 int j;
122 int nalloc, nalloc_rp;
123 struct resync_pages *rps;
124
125 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
126 if (!r10_bio)
127 return NULL;
128
129 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
130 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
131 nalloc = conf->copies;
132 else
133 nalloc = 2;

	/* allocate once for all bios */
136 if (!conf->have_replacement)
137 nalloc_rp = nalloc;
138 else
139 nalloc_rp = nalloc * 2;
140 rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
141 if (!rps)
142 goto out_free_r10bio;

	/*
	 * Allocate bios.
	 */
147 for (j = nalloc ; j-- ; ) {
148 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
149 if (!bio)
150 goto out_free_bio;
151 r10_bio->devs[j].bio = bio;
152 if (!conf->have_replacement)
153 continue;
154 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
155 if (!bio)
156 goto out_free_bio;
157 r10_bio->devs[j].repl_bio = bio;
158 }
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * up to bios.
	 */
163 for (j = 0; j < nalloc; j++) {
164 struct bio *rbio = r10_bio->devs[j].repl_bio;
165 struct resync_pages *rp, *rp_repl;
166
167 rp = &rps[j];
168 if (rbio)
169 rp_repl = &rps[nalloc + j];
170
171 bio = r10_bio->devs[j].bio;
172
173 if (!j || test_bit(MD_RECOVERY_SYNC,
174 &conf->mddev->recovery)) {
175 if (resync_alloc_pages(rp, gfp_flags))
176 goto out_free_pages;
177 } else {
178 memcpy(rp, &rps[0], sizeof(*rp));
179 resync_get_all_pages(rp);
180 }
181
182 rp->raid_bio = r10_bio;
183 bio->bi_private = rp;
184 if (rbio) {
185 memcpy(rp_repl, rp, sizeof(*rp));
186 rbio->bi_private = rp_repl;
187 }
188 }
189
190 return r10_bio;
191
192out_free_pages:
193 while (--j >= 0)
194 resync_free_pages(&rps[j]);
195
196 j = 0;
197out_free_bio:
198 for ( ; j < nalloc; j++) {
199 if (r10_bio->devs[j].bio)
200 bio_put(r10_bio->devs[j].bio);
201 if (r10_bio->devs[j].repl_bio)
202 bio_put(r10_bio->devs[j].repl_bio);
203 }
204 kfree(rps);
205out_free_r10bio:
206 rbio_pool_free(r10_bio, conf);
207 return NULL;
208}
209
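/* Undo r10buf_pool_alloc(): free the resync pages, drop the bios, free the r10bio. */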
210static void r10buf_pool_free(void *__r10_bio, void *data)
211{
212 struct r10conf *conf = data;
213 struct r10bio *r10bio = __r10_bio;
214 int j;
215 struct resync_pages *rp = NULL;
216
217 for (j = conf->copies; j--; ) {
218 struct bio *bio = r10bio->devs[j].bio;
219
220 if (bio) {
221 rp = get_resync_pages(bio);
222 resync_free_pages(rp);
223 bio_put(bio);
224 }
225
226 bio = r10bio->devs[j].repl_bio;
227 if (bio)
228 bio_put(bio);
229 }

	/* resync pages array stored in the 1st bio's .bi_private */
232 kfree(rp);
233
234 rbio_pool_free(r10bio, conf);
235}
236
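/* Release all per-copy bios held by an r10bio (unless they are 'special' markers). */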
237static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
238{
239 int i;
240
241 for (i = 0; i < conf->copies; i++) {
242 struct bio **bio = & r10_bio->devs[i].bio;
243 if (!BIO_SPECIAL(*bio))
244 bio_put(*bio);
245 *bio = NULL;
246 bio = &r10_bio->devs[i].repl_bio;
247 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
248 bio_put(*bio);
249 *bio = NULL;
250 }
251}
252
253static void free_r10bio(struct r10bio *r10_bio)
254{
255 struct r10conf *conf = r10_bio->mddev->private;
256
257 put_all_bios(conf, r10_bio);
258 mempool_free(r10_bio, &conf->r10bio_pool);
259}
260
261static void put_buf(struct r10bio *r10_bio)
262{
263 struct r10conf *conf = r10_bio->mddev->private;
264
265 mempool_free(r10_bio, &conf->r10buf_pool);
266
267 lower_barrier(conf);
268}
269
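/* Queue an r10bio for retry by raid10d and wake the md thread. */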
270static void reschedule_retry(struct r10bio *r10_bio)
271{
272 unsigned long flags;
273 struct mddev *mddev = r10_bio->mddev;
274 struct r10conf *conf = mddev->private;
275
276 spin_lock_irqsave(&conf->device_lock, flags);
277 list_add(&r10_bio->retry_list, &conf->retry_list);
278 conf->nr_queued ++;
279 spin_unlock_irqrestore(&conf->device_lock, flags);
280
281
282 wake_up(&conf->wait_barrier);
283
284 md_wakeup_thread(mddev->thread);
285}
286
/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
292static void raid_end_bio_io(struct r10bio *r10_bio)
293{
294 struct bio *bio = r10_bio->master_bio;
295 struct r10conf *conf = r10_bio->mddev->private;
296
297 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
298 bio->bi_status = BLK_STS_IOERR;
299
300 bio_endio(bio);
	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
305 allow_barrier(conf);
306
307 free_r10bio(r10_bio);
308}
309
/*
 * Update disk head position estimator based on IRQ completion info.
 */
313static inline void update_head_pos(int slot, struct r10bio *r10_bio)
314{
315 struct r10conf *conf = r10_bio->mddev->private;
316
317 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
318 r10_bio->devs[slot].addr + (r10_bio->sectors);
319}
320
/*
 * Find the disk number which triggered given bio
 */
324static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
325 struct bio *bio, int *slotp, int *replp)
326{
327 int slot;
328 int repl = 0;
329
330 for (slot = 0; slot < conf->copies; slot++) {
331 if (r10_bio->devs[slot].bio == bio)
332 break;
333 if (r10_bio->devs[slot].repl_bio == bio) {
334 repl = 1;
335 break;
336 }
337 }
338
339 BUG_ON(slot == conf->copies);
340 update_head_pos(slot, r10_bio);
341
342 if (slotp)
343 *slotp = slot;
344 if (replp)
345 *replp = repl;
346 return r10_bio->devs[slot].devnum;
347}
348
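/* End-of-IO handler for a read request issued to one mirror. */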
349static void raid10_end_read_request(struct bio *bio)
350{
351 int uptodate = !bio->bi_status;
352 struct r10bio *r10_bio = bio->bi_private;
353 int slot;
354 struct md_rdev *rdev;
355 struct r10conf *conf = r10_bio->mddev->private;
356
357 slot = r10_bio->read_slot;
358 rdev = r10_bio->devs[slot].rdev;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
362 update_head_pos(slot, r10_bio);
363
364 if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
374 set_bit(R10BIO_Uptodate, &r10_bio->state);
375 } else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry".
		 */
381 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
382 rdev->raid_disk))
383 uptodate = 1;
384 }
385 if (uptodate) {
386 raid_end_bio_io(r10_bio);
387 rdev_dec_pending(rdev, conf->mddev);
388 } else {
		/*
		 * oops, read error - keep the refcount on the rdev
		 */
392 char b[BDEVNAME_SIZE];
393 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
394 mdname(conf->mddev),
395 bdevname(rdev->bdev, b),
396 (unsigned long long)r10_bio->sector);
397 set_bit(R10BIO_ReadError, &r10_bio->state);
398 reschedule_retry(r10_bio);
399 }
400}
401
402static void close_write(struct r10bio *r10_bio)
403{
	/* clear the bitmap if all writes complete successfully */
405 md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
406 r10_bio->sectors,
407 !test_bit(R10BIO_Degraded, &r10_bio->state),
408 0);
409 md_write_end(r10_bio->mddev);
410}
411
412static void one_write_done(struct r10bio *r10_bio)
413{
414 if (atomic_dec_and_test(&r10_bio->remaining)) {
415 if (test_bit(R10BIO_WriteError, &r10_bio->state))
416 reschedule_retry(r10_bio);
417 else {
418 close_write(r10_bio);
419 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
420 reschedule_retry(r10_bio);
421 else
422 raid_end_bio_io(r10_bio);
423 }
424 }
425}
426
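/* End-of-IO handler for a write issued to one mirror (or its replacement). */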
427static void raid10_end_write_request(struct bio *bio)
428{
429 struct r10bio *r10_bio = bio->bi_private;
430 int dev;
431 int dec_rdev = 1;
432 struct r10conf *conf = r10_bio->mddev->private;
433 int slot, repl;
434 struct md_rdev *rdev = NULL;
435 struct bio *to_put = NULL;
436 bool discard_error;
437
438 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
439
440 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
441
442 if (repl)
443 rdev = conf->mirrors[dev].replacement;
444 if (!rdev) {
445 smp_rmb();
446 repl = 0;
447 rdev = conf->mirrors[dev].rdev;
448 }
449
450
451
452 if (bio->bi_status && !discard_error) {
453 if (repl)
454
455
456
457 md_error(rdev->mddev, rdev);
458 else {
459 set_bit(WriteErrorSeen, &rdev->flags);
460 if (!test_and_set_bit(WantReplacement, &rdev->flags))
461 set_bit(MD_RECOVERY_NEEDED,
462 &rdev->mddev->recovery);
463
464 dec_rdev = 0;
465 if (test_bit(FailFast, &rdev->flags) &&
466 (bio->bi_opf & MD_FAILFAST)) {
467 md_error(rdev->mddev, rdev);
468 }

			/*
			 * When the device is faulty, it is not necessary to
			 * handle the write error.
			 * For failfast, this is the only remaining device,
			 * so we need to retry the write without FailFast.
			 */
476 if (!test_bit(Faulty, &rdev->flags))
477 set_bit(R10BIO_WriteError, &r10_bio->state);
478 else {
479 r10_bio->devs[slot].bio = NULL;
480 to_put = bio;
481 dec_rdev = 1;
482 }
483 }
484 } else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that we will
		 * return a good error code to the higher levels even if
		 * IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
494 sector_t first_bad;
495 int bad_sectors;
496

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty. We cannot trust such a device to
		 * read the data back correctly, so it must not be the
		 * only record of a successful write.
		 */
505 if (test_bit(In_sync, &rdev->flags) &&
506 !test_bit(Faulty, &rdev->flags))
507 set_bit(R10BIO_Uptodate, &r10_bio->state);
508
509
510 if (is_badblock(rdev,
511 r10_bio->devs[slot].addr,
512 r10_bio->sectors,
513 &first_bad, &bad_sectors) && !discard_error) {
514 bio_put(bio);
515 if (repl)
516 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
517 else
518 r10_bio->devs[slot].bio = IO_MADE_GOOD;
519 dec_rdev = 0;
520 set_bit(R10BIO_MadeGood, &r10_bio->state);
521 }
522 }

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
529 one_write_done(r10_bio);
530 if (dec_rdev)
531 rdev_dec_pending(rdev, conf->mddev);
532 if (to_put)
533 bio_put(to_put);
534}
535

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 * If both are > 1, we get a 'striped mirror' layout.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been
 * assigned as described above, we start again with a device offset
 * of near_copies.  So we effectively have another copy of the whole
 * array further down all the drives, but with blocks on different
 * drives.
 * With this layout, a block is never stored twice on the same device,
 * which gives us a better chance of surviving a single-disk failure.
 */

561static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
562{
563 int n,f;
564 sector_t sector;
565 sector_t chunk;
566 sector_t stripe;
567 int dev;
568 int slot = 0;
569 int last_far_set_start, last_far_set_size;
570
571 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
572 last_far_set_start *= geo->far_set_size;
573
574 last_far_set_size = geo->far_set_size;
575 last_far_set_size += (geo->raid_disks % geo->far_set_size);
576
577
578 chunk = r10bio->sector >> geo->chunk_shift;
579 sector = r10bio->sector & geo->chunk_mask;
580
581 chunk *= geo->near_copies;
582 stripe = chunk;
583 dev = sector_div(stripe, geo->raid_disks);
584 if (geo->far_offset)
585 stripe *= geo->far_copies;
586
587 sector += stripe << geo->chunk_shift;
588
589
590 for (n = 0; n < geo->near_copies; n++) {
591 int d = dev;
592 int set;
593 sector_t s = sector;
594 r10bio->devs[slot].devnum = d;
595 r10bio->devs[slot].addr = s;
596 slot++;
597
598 for (f = 1; f < geo->far_copies; f++) {
599 set = d / geo->far_set_size;
600 d += geo->near_copies;
601
602 if ((geo->raid_disks % geo->far_set_size) &&
603 (d > last_far_set_start)) {
604 d -= last_far_set_start;
605 d %= last_far_set_size;
606 d += last_far_set_start;
607 } else {
608 d %= geo->far_set_size;
609 d += geo->far_set_size * set;
610 }
611 s += geo->stride;
612 r10bio->devs[slot].devnum = d;
613 r10bio->devs[slot].addr = s;
614 slot++;
615 }
616 dev++;
617 if (dev >= geo->raid_disks) {
618 dev = 0;
619 sector += (geo->chunk_mask + 1);
620 }
621 }
622}
623
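/*
 * Map the virtual sector of an r10bio to physical (device, sector) pairs,
 * using the previous geometry if the request falls on the not-yet-reshaped
 * side of a running reshape.
 */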
624static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
625{
626 struct geom *geo = &conf->geo;
627
628 if (conf->reshape_progress != MaxSector &&
629 ((r10bio->sector >= conf->reshape_progress) !=
630 conf->mddev->reshape_backwards)) {
631 set_bit(R10BIO_Previous, &r10bio->state);
632 geo = &conf->prev;
633 } else
634 clear_bit(R10BIO_Previous, &r10bio->state);
635
636 __raid10_find_phys(geo, r10bio);
637}
638
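/* Map a physical sector on device 'dev' back to the array's virtual sector. */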
639static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
640{
641 sector_t offset, chunk, vchunk;
642
643
644
645 struct geom *geo = &conf->geo;
646 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
647 int far_set_size = geo->far_set_size;
648 int last_far_set_start;
649
650 if (geo->raid_disks % geo->far_set_size) {
651 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
652 last_far_set_start *= geo->far_set_size;
653
654 if (dev >= last_far_set_start) {
655 far_set_size = geo->far_set_size;
656 far_set_size += (geo->raid_disks % geo->far_set_size);
657 far_set_start = last_far_set_start;
658 }
659 }
660
661 offset = sector & geo->chunk_mask;
662 if (geo->far_offset) {
663 int fc;
664 chunk = sector >> geo->chunk_shift;
665 fc = sector_div(chunk, geo->far_copies);
666 dev -= fc * geo->near_copies;
667 if (dev < far_set_start)
668 dev += far_set_size;
669 } else {
670 while (sector >= geo->stride) {
671 sector -= geo->stride;
672 if (dev < (geo->near_copies + far_set_start))
673 dev += far_set_size - geo->near_copies;
674 else
675 dev -= geo->near_copies;
676 }
677 chunk = sector >> geo->chunk_shift;
678 }
679 vchunk = chunk * geo->raid_disks + dev;
680 sector_div(vchunk, geo->near_copies);
681 return (vchunk << geo->chunk_shift) + offset;
682}
/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * slightly because the read balancing logic does not allow for this case.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */

/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies / far_copies geometry.
 */
703static struct md_rdev *read_balance(struct r10conf *conf,
704 struct r10bio *r10_bio,
705 int *max_sectors)
706{
707 const sector_t this_sector = r10_bio->sector;
708 int disk, slot;
709 int sectors = r10_bio->sectors;
710 int best_good_sectors;
711 sector_t new_distance, best_dist;
712 struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
713 int do_balance;
714 int best_dist_slot, best_pending_slot;
715 bool has_nonrot_disk = false;
716 unsigned int min_pending;
717 struct geom *geo = &conf->geo;
718
719 raid10_find_phys(conf, r10_bio);
720 rcu_read_lock();
721 best_dist_slot = -1;
722 min_pending = UINT_MAX;
723 best_dist_rdev = NULL;
724 best_pending_rdev = NULL;
725 best_dist = MaxSector;
726 best_good_sectors = 0;
727 do_balance = 1;
728 clear_bit(R10BIO_FailFast, &r10_bio->state);
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
735 if ((conf->mddev->recovery_cp < MaxSector
736 && (this_sector + sectors >= conf->next_resync)) ||
737 (mddev_is_clustered(conf->mddev) &&
738 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
739 this_sector + sectors)))
740 do_balance = 0;
741
742 for (slot = 0; slot < conf->copies ; slot++) {
743 sector_t first_bad;
744 int bad_sectors;
745 sector_t dev_sector;
746 unsigned int pending;
747 bool nonrot;
748
749 if (r10_bio->devs[slot].bio == IO_BLOCKED)
750 continue;
751 disk = r10_bio->devs[slot].devnum;
752 rdev = rcu_dereference(conf->mirrors[disk].replacement);
753 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
754 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
755 rdev = rcu_dereference(conf->mirrors[disk].rdev);
756 if (rdev == NULL ||
757 test_bit(Faulty, &rdev->flags))
758 continue;
759 if (!test_bit(In_sync, &rdev->flags) &&
760 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
761 continue;
762
763 dev_sector = r10_bio->devs[slot].addr;
764 if (is_badblock(rdev, dev_sector, sectors,
765 &first_bad, &bad_sectors)) {
766 if (best_dist < MaxSector)
767
768 continue;
769 if (first_bad <= dev_sector) {
770
771
772
773
774 bad_sectors -= (dev_sector - first_bad);
775 if (!do_balance && sectors > bad_sectors)
776 sectors = bad_sectors;
777 if (best_good_sectors > sectors)
778 best_good_sectors = sectors;
779 } else {
780 sector_t good_sectors =
781 first_bad - dev_sector;
782 if (good_sectors > best_good_sectors) {
783 best_good_sectors = good_sectors;
784 best_dist_slot = slot;
785 best_dist_rdev = rdev;
786 }
787 if (!do_balance)
788
789 break;
790 }
791 continue;
792 } else
793 best_good_sectors = sectors;
794
795 if (!do_balance)
796 break;
797
798 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
799 has_nonrot_disk |= nonrot;
800 pending = atomic_read(&rdev->nr_pending);
801 if (min_pending > pending && nonrot) {
802 min_pending = pending;
803 best_pending_slot = slot;
804 best_pending_rdev = rdev;
805 }

		if (best_dist_slot >= 0)
			/* At least 2 disks to choose from so failfast is OK */
			set_bit(R10BIO_FailFast, &r10_bio->state);
		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
		if (geo->near_copies > 1 && !pending)
			new_distance = 0;

		/* for far > 1 always use the lowest address */
818 else if (geo->far_copies > 1)
819 new_distance = r10_bio->devs[slot].addr;
820 else
821 new_distance = abs(r10_bio->devs[slot].addr -
822 conf->mirrors[disk].head_position);
823
824 if (new_distance < best_dist) {
825 best_dist = new_distance;
826 best_dist_slot = slot;
827 best_dist_rdev = rdev;
828 }
829 }
830 if (slot >= conf->copies) {
831 if (has_nonrot_disk) {
832 slot = best_pending_slot;
833 rdev = best_pending_rdev;
834 } else {
835 slot = best_dist_slot;
836 rdev = best_dist_rdev;
837 }
838 }
839
840 if (slot >= 0) {
841 atomic_inc(&rdev->nr_pending);
842 r10_bio->read_slot = slot;
843 } else
844 rdev = NULL;
845 rcu_read_unlock();
846 *max_sectors = best_good_sectors;
847
848 return rdev;
849}
850
851static void flush_pending_writes(struct r10conf *conf)
852{
	/* Any writes that have been queued but are awaiting
	 * bitmap updates get flushed here.
	 */
856 spin_lock_irq(&conf->device_lock);
857
858 if (conf->pending_bio_list.head) {
859 struct blk_plug plug;
860 struct bio *bio;
861
862 bio = bio_list_get(&conf->pending_bio_list);
863 conf->pending_count = 0;
864 spin_unlock_irq(&conf->device_lock);

		/*
		 * As this is called in a wait_event() loop (see freeze_array),
		 * current->state might be TASK_UNINTERRUPTIBLE which will
		 * cause a warning when we prepare to wait again.  As it is
		 * rare that this path is taken, it is perhaps safer to simply
		 * ignore the warning and wake the task manually before
		 * submitting the bios.
		 */
875 __set_current_state(TASK_RUNNING);
876
877 blk_start_plug(&plug);
878
879
880 md_bitmap_unplug(conf->mddev->bitmap);
881 wake_up(&conf->wait_barrier);
882
883 while (bio) {
884 struct bio *next = bio->bi_next;
885 struct md_rdev *rdev = (void*)bio->bi_disk;
886 bio->bi_next = NULL;
887 bio_set_dev(bio, rdev->bdev);
888 if (test_bit(Faulty, &rdev->flags)) {
889 bio_io_error(bio);
890 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
891 !blk_queue_discard(bio->bi_disk->queue)))
892
893 bio_endio(bio);
894 else
895 submit_bio_noacct(bio);
896 bio = next;
897 }
898 blk_finish_plug(&plug);
899 } else
900 spin_unlock_irq(&conf->device_lock);
901}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which might alter the array will
 * be permitted without sacrificing the chance that the barrier will
 * be raised and lowered on the request.
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *  is no background IO happening.  It must arrange to call
 *  allow_barrier when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 *  there is no normal IO happening.  It must arrange to call
 *  lower_barrier when the Resync/Recovery is completed.
 */

925static void raise_barrier(struct r10conf *conf, int force)
926{
927 BUG_ON(force && !conf->barrier);
928 spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock);

	/* block any new IO from starting */
	conf->barrier++;

	/* Now wait for all pending IO to complete */
	wait_event_lock_irq(conf->wait_barrier,
			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock);
941
942 spin_unlock_irq(&conf->resync_lock);
943}
944
945static void lower_barrier(struct r10conf *conf)
946{
947 unsigned long flags;
948 spin_lock_irqsave(&conf->resync_lock, flags);
949 conf->barrier--;
950 spin_unlock_irqrestore(&conf->resync_lock, flags);
951 wake_up(&conf->wait_barrier);
952}
953
954static void wait_barrier(struct r10conf *conf)
955{
956 spin_lock_irq(&conf->resync_lock);
957 if (conf->barrier) {
958 struct bio_list *bio_list = current->bio_list;
959 conf->nr_waiting++;
		/* Wait for the barrier to drop.
		 * However if there are already pending
		 * requests (preventing the barrier from
		 * rising completely), and the
		 * pre-process bio queue isn't empty,
		 * then don't wait, as we need to empty
		 * that queue to get the nr_pending
		 * count down.
		 */
969 raid10_log(conf->mddev, "wait barrier");
970 wait_event_lock_irq(conf->wait_barrier,
971 !conf->barrier ||
972 (atomic_read(&conf->nr_pending) &&
973 bio_list &&
974 (!bio_list_empty(&bio_list[0]) ||
975 !bio_list_empty(&bio_list[1]))) ||
				     /* move on if recovery thread is
				      * blocked by us
				      */
979 (conf->mddev->thread->tsk == current &&
980 test_bit(MD_RECOVERY_RUNNING,
981 &conf->mddev->recovery) &&
982 conf->nr_queued > 0),
983 conf->resync_lock);
984 conf->nr_waiting--;
985 if (!conf->nr_waiting)
986 wake_up(&conf->wait_barrier);
987 }
988 atomic_inc(&conf->nr_pending);
989 spin_unlock_irq(&conf->resync_lock);
990}
991
992static void allow_barrier(struct r10conf *conf)
993{
994 if ((atomic_dec_and_test(&conf->nr_pending)) ||
995 (conf->array_freeze_pending))
996 wake_up(&conf->wait_barrier);
997}
998
999static void freeze_array(struct r10conf *conf, int extra)
1000{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending matches nr_queued+extra.
	 * This is called in the context of one normal IO request
	 * that has failed. Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
1013 spin_lock_irq(&conf->resync_lock);
1014 conf->array_freeze_pending++;
1015 conf->barrier++;
1016 conf->nr_waiting++;
1017 wait_event_lock_irq_cmd(conf->wait_barrier,
1018 atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
1019 conf->resync_lock,
1020 flush_pending_writes(conf));
1021
1022 conf->array_freeze_pending--;
1023 spin_unlock_irq(&conf->resync_lock);
1024}
1025
1026static void unfreeze_array(struct r10conf *conf)
1027{
1028
1029 spin_lock_irq(&conf->resync_lock);
1030 conf->barrier--;
1031 conf->nr_waiting--;
1032 wake_up(&conf->wait_barrier);
1033 spin_unlock_irq(&conf->resync_lock);
1034}
1035
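/*
 * Pick the data_offset to use for this rdev: during a reshape the old and
 * new data offsets may differ, depending on which side of the reshape the
 * r10bio falls.
 */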
1036static sector_t choose_data_offset(struct r10bio *r10_bio,
1037 struct md_rdev *rdev)
1038{
1039 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1040 test_bit(R10BIO_Previous, &r10_bio->state))
1041 return rdev->data_offset;
1042 else
1043 return rdev->new_data_offset;
1044}
1045
1046struct raid10_plug_cb {
1047 struct blk_plug_cb cb;
1048 struct bio_list pending;
1049 int pending_cnt;
1050};
1051
1052static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1053{
1054 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1055 cb);
1056 struct mddev *mddev = plug->cb.data;
1057 struct r10conf *conf = mddev->private;
1058 struct bio *bio;
1059
1060 if (from_schedule || current->bio_list) {
1061 spin_lock_irq(&conf->device_lock);
1062 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1063 conf->pending_count += plug->pending_cnt;
1064 spin_unlock_irq(&conf->device_lock);
1065 wake_up(&conf->wait_barrier);
1066 md_wakeup_thread(mddev->thread);
1067 kfree(plug);
1068 return;
1069 }
1070
1071
1072 bio = bio_list_get(&plug->pending);
1073 md_bitmap_unplug(mddev->bitmap);
1074 wake_up(&conf->wait_barrier);
1075
1076 while (bio) {
1077 struct bio *next = bio->bi_next;
1078 struct md_rdev *rdev = (void*)bio->bi_disk;
1079 bio->bi_next = NULL;
1080 bio_set_dev(bio, rdev->bdev);
1081 if (test_bit(Faulty, &rdev->flags)) {
1082 bio_io_error(bio);
1083 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1084 !blk_queue_discard(bio->bi_disk->queue)))
1085
1086 bio_endio(bio);
1087 else
1088 submit_bio_noacct(bio);
1089 bio = next;
1090 }
1091 kfree(plug);
1092}
1093
/*
 * 1. Register the new request and wait if the reconstruction thread has put
 * up a bar for new requests. Continue immediately if no resync is active
 * currently.
 * 2. If IO spans the reshape position, we need to wait for the reshape to pass.
 */
1100static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
1101 struct bio *bio, sector_t sectors)
1102{
1103 wait_barrier(conf);
1104 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1105 bio->bi_iter.bi_sector < conf->reshape_progress &&
1106 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1107 raid10_log(conf->mddev, "wait reshape");
1108 allow_barrier(conf);
1109 wait_event(conf->wait_barrier,
1110 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1111 conf->reshape_progress >= bio->bi_iter.bi_sector +
1112 sectors);
1113 wait_barrier(conf);
1114 }
1115}
1116
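/* Service a read request: pick a mirror via read_balance() and submit a clone of the bio to it. */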
1117static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1118 struct r10bio *r10_bio)
1119{
1120 struct r10conf *conf = mddev->private;
1121 struct bio *read_bio;
1122 const int op = bio_op(bio);
1123 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1124 int max_sectors;
1125 struct md_rdev *rdev;
1126 char b[BDEVNAME_SIZE];
1127 int slot = r10_bio->read_slot;
1128 struct md_rdev *err_rdev = NULL;
1129 gfp_t gfp = GFP_NOIO;
1130
1131 if (slot >= 0 && r10_bio->devs[slot].rdev) {
		/*
		 * This is an error retry, but we cannot
		 * safely dereference the rdev in the r10_bio,
		 * we must use the one in conf.
		 * If it has already been disconnected (because
		 * user sent a STOP_ARRAY_RO command) then we cannot trust it.
		 */
1139 int disk;

		/* As we are blocking raid10, it is a little safer to
		 * use __GFP_HIGH.
		 */
1144 gfp = GFP_NOIO | __GFP_HIGH;
1145
1146 rcu_read_lock();
1147 disk = r10_bio->devs[slot].devnum;
1148 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1149 if (err_rdev)
1150 bdevname(err_rdev->bdev, b);
1151 else {
1152 strcpy(b, "???");
1153
1154 err_rdev = r10_bio->devs[slot].rdev;
1155 }
1156 rcu_read_unlock();
1157 }
1158
1159 regular_request_wait(mddev, conf, bio, r10_bio->sectors);
1160 rdev = read_balance(conf, r10_bio, &max_sectors);
1161 if (!rdev) {
1162 if (err_rdev) {
1163 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1164 mdname(mddev), b,
1165 (unsigned long long)r10_bio->sector);
1166 }
1167 raid_end_bio_io(r10_bio);
1168 return;
1169 }
1170 if (err_rdev)
1171 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1172 mdname(mddev),
1173 bdevname(rdev->bdev, b),
1174 (unsigned long long)r10_bio->sector);
1175 if (max_sectors < bio_sectors(bio)) {
1176 struct bio *split = bio_split(bio, max_sectors,
1177 gfp, &conf->bio_split);
1178 bio_chain(split, bio);
1179 allow_barrier(conf);
1180 submit_bio_noacct(bio);
1181 wait_barrier(conf);
1182 bio = split;
1183 r10_bio->master_bio = bio;
1184 r10_bio->sectors = max_sectors;
1185 }
1186 slot = r10_bio->read_slot;
1187
1188 read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
1189
1190 r10_bio->devs[slot].bio = read_bio;
1191 r10_bio->devs[slot].rdev = rdev;
1192
1193 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1194 choose_data_offset(r10_bio, rdev);
1195 bio_set_dev(read_bio, rdev->bdev);
1196 read_bio->bi_end_io = raid10_end_read_request;
1197 bio_set_op_attrs(read_bio, op, do_sync);
1198 if (test_bit(FailFast, &rdev->flags) &&
1199 test_bit(R10BIO_FailFast, &r10_bio->state))
1200 read_bio->bi_opf |= MD_FAILFAST;
1201 read_bio->bi_private = r10_bio;
1202
1203 if (mddev->gendisk)
1204 trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
1205 r10_bio->sector);
1206 submit_bio_noacct(read_bio);
1207 return;
1208}
1209
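/* Clone the master bio and queue the write to one device (or its replacement). */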
1210static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1211 struct bio *bio, bool replacement,
1212 int n_copy)
1213{
1214 const int op = bio_op(bio);
1215 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1216 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1217 unsigned long flags;
1218 struct blk_plug_cb *cb;
1219 struct raid10_plug_cb *plug = NULL;
1220 struct r10conf *conf = mddev->private;
1221 struct md_rdev *rdev;
1222 int devnum = r10_bio->devs[n_copy].devnum;
1223 struct bio *mbio;
1224
1225 if (replacement) {
1226 rdev = conf->mirrors[devnum].replacement;
1227 if (rdev == NULL) {
1228
1229 smp_mb();
1230 rdev = conf->mirrors[devnum].rdev;
1231 }
1232 } else
1233 rdev = conf->mirrors[devnum].rdev;
1234
1235 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1236 if (replacement)
1237 r10_bio->devs[n_copy].repl_bio = mbio;
1238 else
1239 r10_bio->devs[n_copy].bio = mbio;
1240
1241 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1242 choose_data_offset(r10_bio, rdev));
1243 bio_set_dev(mbio, rdev->bdev);
1244 mbio->bi_end_io = raid10_end_write_request;
1245 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1246 if (!replacement && test_bit(FailFast,
1247 &conf->mirrors[devnum].rdev->flags)
1248 && enough(conf, devnum))
1249 mbio->bi_opf |= MD_FAILFAST;
1250 mbio->bi_private = r10_bio;
1251
1252 if (conf->mddev->gendisk)
1253 trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
1254 r10_bio->sector);
1255
1256 mbio->bi_disk = (void *)rdev;
1257
1258 atomic_inc(&r10_bio->remaining);
1259
1260 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1261 if (cb)
1262 plug = container_of(cb, struct raid10_plug_cb, cb);
1263 else
1264 plug = NULL;
1265 if (plug) {
1266 bio_list_add(&plug->pending, mbio);
1267 plug->pending_cnt++;
1268 } else {
1269 spin_lock_irqsave(&conf->device_lock, flags);
1270 bio_list_add(&conf->pending_bio_list, mbio);
1271 conf->pending_count++;
1272 spin_unlock_irqrestore(&conf->device_lock, flags);
1273 md_wakeup_thread(mddev->thread);
1274 }
1275}
1276
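/*
 * Service a write request: pick the target devices for every copy, narrow the
 * request around known bad blocks if needed, and queue one clone per device.
 */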
1277static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1278 struct r10bio *r10_bio)
1279{
1280 struct r10conf *conf = mddev->private;
1281 int i;
1282 struct md_rdev *blocked_rdev;
1283 sector_t sectors;
1284 int max_sectors;
1285
1286 if ((mddev_is_clustered(mddev) &&
1287 md_cluster_ops->area_resyncing(mddev, WRITE,
1288 bio->bi_iter.bi_sector,
1289 bio_end_sector(bio)))) {
1290 DEFINE_WAIT(w);
1291 for (;;) {
1292 prepare_to_wait(&conf->wait_barrier,
1293 &w, TASK_IDLE);
1294 if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1295 bio->bi_iter.bi_sector, bio_end_sector(bio)))
1296 break;
1297 schedule();
1298 }
1299 finish_wait(&conf->wait_barrier, &w);
1300 }
1301
1302 sectors = r10_bio->sectors;
1303 regular_request_wait(mddev, conf, bio, sectors);
1304 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1305 (mddev->reshape_backwards
1306 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1307 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1308 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1309 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1310
1311 mddev->reshape_position = conf->reshape_progress;
1312 set_mask_bits(&mddev->sb_flags, 0,
1313 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1314 md_wakeup_thread(mddev->thread);
1315 raid10_log(conf->mddev, "wait reshape metadata");
1316 wait_event(mddev->sb_wait,
1317 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1318
1319 conf->reshape_safe = mddev->reshape_position;
1320 }
1321
1322 if (conf->pending_count >= max_queued_requests) {
1323 md_wakeup_thread(mddev->thread);
1324 raid10_log(mddev, "wait queued");
1325 wait_event(conf->wait_barrier,
1326 conf->pending_count < max_queued_requests);
1327 }

	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio.
	 * If there are known/acknowledged bad blocks on any device
	 * on which we have seen a write error, we want to avoid
	 * writing to those blocks.  This potentially requires several
	 * writes to write around the bad blocks.  Each set of writes
	 * gets its own r10_bio with a set of bios attached.
	 */
1338 r10_bio->read_slot = -1;
1339 raid10_find_phys(conf, r10_bio);
1340retry_write:
1341 blocked_rdev = NULL;
1342 rcu_read_lock();
1343 max_sectors = r10_bio->sectors;
1344
1345 for (i = 0; i < conf->copies; i++) {
1346 int d = r10_bio->devs[i].devnum;
1347 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1348 struct md_rdev *rrdev = rcu_dereference(
1349 conf->mirrors[d].replacement);
1350 if (rdev == rrdev)
1351 rrdev = NULL;
1352 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1353 atomic_inc(&rdev->nr_pending);
1354 blocked_rdev = rdev;
1355 break;
1356 }
1357 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1358 atomic_inc(&rrdev->nr_pending);
1359 blocked_rdev = rrdev;
1360 break;
1361 }
1362 if (rdev && (test_bit(Faulty, &rdev->flags)))
1363 rdev = NULL;
1364 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1365 rrdev = NULL;
1366
1367 r10_bio->devs[i].bio = NULL;
1368 r10_bio->devs[i].repl_bio = NULL;
1369
1370 if (!rdev && !rrdev) {
1371 set_bit(R10BIO_Degraded, &r10_bio->state);
1372 continue;
1373 }
1374 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1375 sector_t first_bad;
1376 sector_t dev_sector = r10_bio->devs[i].addr;
1377 int bad_sectors;
1378 int is_bad;
1379
1380 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1381 &first_bad, &bad_sectors);
1382 if (is_bad < 0) {
1383
1384
1385
1386 atomic_inc(&rdev->nr_pending);
1387 set_bit(BlockedBadBlocks, &rdev->flags);
1388 blocked_rdev = rdev;
1389 break;
1390 }
1391 if (is_bad && first_bad <= dev_sector) {
1392
1393 bad_sectors -= (dev_sector - first_bad);
1394 if (bad_sectors < max_sectors)
1395
1396
1397
1398 max_sectors = bad_sectors;
				/* We don't set R10BIO_Degraded as that
				 * only applies if the disk is missing,
				 * so it might be re-added, and we want to
				 * know to recover this chunk.
				 * In this case the device is here, and the
				 * fact that this chunk is not in-sync is
				 * recorded in the bad block log.
				 */
1407 continue;
1408 }
1409 if (is_bad) {
1410 int good_sectors = first_bad - dev_sector;
1411 if (good_sectors < max_sectors)
1412 max_sectors = good_sectors;
1413 }
1414 }
1415 if (rdev) {
1416 r10_bio->devs[i].bio = bio;
1417 atomic_inc(&rdev->nr_pending);
1418 }
1419 if (rrdev) {
1420 r10_bio->devs[i].repl_bio = bio;
1421 atomic_inc(&rrdev->nr_pending);
1422 }
1423 }
1424 rcu_read_unlock();
1425
1426 if (unlikely(blocked_rdev)) {
1427
1428 int j;
1429 int d;
1430
1431 for (j = 0; j < i; j++) {
1432 if (r10_bio->devs[j].bio) {
1433 d = r10_bio->devs[j].devnum;
1434 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1435 }
1436 if (r10_bio->devs[j].repl_bio) {
1437 struct md_rdev *rdev;
1438 d = r10_bio->devs[j].devnum;
1439 rdev = conf->mirrors[d].replacement;
1440 if (!rdev) {
1441
1442 smp_mb();
1443 rdev = conf->mirrors[d].rdev;
1444 }
1445 rdev_dec_pending(rdev, mddev);
1446 }
1447 }
1448 allow_barrier(conf);
1449 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1450 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1451 wait_barrier(conf);
1452 goto retry_write;
1453 }
1454
1455 if (max_sectors < r10_bio->sectors)
1456 r10_bio->sectors = max_sectors;
1457
1458 if (r10_bio->sectors < bio_sectors(bio)) {
1459 struct bio *split = bio_split(bio, r10_bio->sectors,
1460 GFP_NOIO, &conf->bio_split);
1461 bio_chain(split, bio);
1462 allow_barrier(conf);
1463 submit_bio_noacct(bio);
1464 wait_barrier(conf);
1465 bio = split;
1466 r10_bio->master_bio = bio;
1467 }
1468
1469 atomic_set(&r10_bio->remaining, 1);
1470 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1471
1472 for (i = 0; i < conf->copies; i++) {
1473 if (r10_bio->devs[i].bio)
1474 raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1475 if (r10_bio->devs[i].repl_bio)
1476 raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1477 }
1478 one_write_done(r10_bio);
1479}
1480
1481static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1482{
1483 struct r10conf *conf = mddev->private;
1484 struct r10bio *r10_bio;
1485
1486 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1487
1488 r10_bio->master_bio = bio;
1489 r10_bio->sectors = sectors;
1490
1491 r10_bio->mddev = mddev;
1492 r10_bio->sector = bio->bi_iter.bi_sector;
1493 r10_bio->state = 0;
1494 r10_bio->read_slot = -1;
1495 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
1496
1497 if (bio_data_dir(bio) == READ)
1498 raid10_read_request(mddev, bio, r10_bio);
1499 else
1500 raid10_write_request(mddev, bio, r10_bio);
1501}
1502
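/* Entry point for normal IO: trim requests so they do not cross a chunk boundary, then pass them on. */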
1503static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1504{
1505 struct r10conf *conf = mddev->private;
1506 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1507 int chunk_sects = chunk_mask + 1;
1508 int sectors = bio_sectors(bio);
1509
1510 if (unlikely(bio->bi_opf & REQ_PREFLUSH)
1511 && md_flush_request(mddev, bio))
1512 return true;
1513
1514 if (!md_write_start(mddev, bio))
1515 return false;
1516
	/*
	 * If this request crosses a chunk boundary, we need to split
	 * it.
	 */
1521 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1522 sectors > chunk_sects
1523 && (conf->geo.near_copies < conf->geo.raid_disks
1524 || conf->prev.near_copies <
1525 conf->prev.raid_disks)))
1526 sectors = chunk_sects -
1527 (bio->bi_iter.bi_sector &
1528 (chunk_sects - 1));
1529 __make_request(mddev, bio, sectors);

	/* In case raid10d snuck in to freeze_array */
1532 wake_up(&conf->wait_barrier);
1533 return true;
1534}
1535
1536static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1537{
1538 struct r10conf *conf = mddev->private;
1539 int i;
1540
1541 if (conf->geo.near_copies < conf->geo.raid_disks)
1542 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1543 if (conf->geo.near_copies > 1)
1544 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1545 if (conf->geo.far_copies > 1) {
1546 if (conf->geo.far_offset)
1547 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1548 else
1549 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1550 if (conf->geo.far_set_size != conf->geo.raid_disks)
1551 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1552 }
1553 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1554 conf->geo.raid_disks - mddev->degraded);
1555 rcu_read_lock();
1556 for (i = 0; i < conf->geo.raid_disks; i++) {
1557 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1558 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1559 }
1560 rcu_read_unlock();
1561 seq_printf(seq, "]");
1562}
1563
/*
 * Check whether, ignoring the device 'ignore', there are enough working
 * devices for every block to appear on at least one.  We walk the devices
 * in groups of 'copies' (starting points staggered by ncopies) and require
 * an In_sync member in each group.
 */
1569static int _enough(struct r10conf *conf, int previous, int ignore)
1570{
1571 int first = 0;
1572 int has_enough = 0;
1573 int disks, ncopies;
1574 if (previous) {
1575 disks = conf->prev.raid_disks;
1576 ncopies = conf->prev.near_copies;
1577 } else {
1578 disks = conf->geo.raid_disks;
1579 ncopies = conf->geo.near_copies;
1580 }
1581
1582 rcu_read_lock();
1583 do {
1584 int n = conf->copies;
1585 int cnt = 0;
1586 int this = first;
1587 while (n--) {
1588 struct md_rdev *rdev;
1589 if (this != ignore &&
1590 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1591 test_bit(In_sync, &rdev->flags))
1592 cnt++;
1593 this = (this+1) % disks;
1594 }
1595 if (cnt == 0)
1596 goto out;
1597 first = (first + ncopies) % disks;
1598 } while (first != 0);
1599 has_enough = 1;
1600out:
1601 rcu_read_unlock();
1602 return has_enough;
1603}
1604
1605static int enough(struct r10conf *conf, int ignore)
1606{
	/* when calling 'enough', both 'prev' and 'geo' must
	 * be stable.
	 * This is ensured if ->reconfig_mutex or ->device_lock
	 * is held.
	 */
1612 return _enough(conf, 0, ignore) &&
1613 _enough(conf, 1, ignore);
1614}
1615
1616static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1617{
1618 char b[BDEVNAME_SIZE];
1619 struct r10conf *conf = mddev->private;
1620 unsigned long flags;

	/*
	 * If it is not operational, then we have already marked it as dead;
	 * else if it is the last working disk and "fail_last_dev" is off,
	 * ignore the error, let the next level up know;
	 * else mark the drive as failed.
	 */
1628 spin_lock_irqsave(&conf->device_lock, flags);
1629 if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
1630 && !enough(conf, rdev->raid_disk)) {
1631
1632
1633
1634 spin_unlock_irqrestore(&conf->device_lock, flags);
1635 return;
1636 }
1637 if (test_and_clear_bit(In_sync, &rdev->flags))
1638 mddev->degraded++;
1639
1640
1641
1642 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1643 set_bit(Blocked, &rdev->flags);
1644 set_bit(Faulty, &rdev->flags);
1645 set_mask_bits(&mddev->sb_flags, 0,
1646 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1647 spin_unlock_irqrestore(&conf->device_lock, flags);
1648 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1649 "md/raid10:%s: Operation continuing on %d devices.\n",
1650 mdname(mddev), bdevname(rdev->bdev, b),
1651 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1652}
1653
1654static void print_conf(struct r10conf *conf)
1655{
1656 int i;
1657 struct md_rdev *rdev;
1658
1659 pr_debug("RAID10 conf printout:\n");
1660 if (!conf) {
1661 pr_debug("(!conf)\n");
1662 return;
1663 }
1664 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1665 conf->geo.raid_disks);
1666
1667
1668
1669 for (i = 0; i < conf->geo.raid_disks; i++) {
1670 char b[BDEVNAME_SIZE];
1671 rdev = conf->mirrors[i].rdev;
1672 if (rdev)
1673 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1674 i, !test_bit(In_sync, &rdev->flags),
1675 !test_bit(Faulty, &rdev->flags),
1676 bdevname(rdev->bdev,b));
1677 }
1678}
1679
1680static void close_sync(struct r10conf *conf)
1681{
1682 wait_barrier(conf);
1683 allow_barrier(conf);
1684
1685 mempool_exit(&conf->r10buf_pool);
1686}
1687
1688static int raid10_spare_active(struct mddev *mddev)
1689{
1690 int i;
1691 struct r10conf *conf = mddev->private;
1692 struct raid10_info *tmp;
1693 int count = 0;
1694 unsigned long flags;
1695
	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync
	 */
1700 for (i = 0; i < conf->geo.raid_disks; i++) {
1701 tmp = conf->mirrors + i;
1702 if (tmp->replacement
1703 && tmp->replacement->recovery_offset == MaxSector
1704 && !test_bit(Faulty, &tmp->replacement->flags)
1705 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1706
1707 if (!tmp->rdev
1708 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1709 count++;
1710 if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
1715 set_bit(Faulty, &tmp->rdev->flags);
1716 sysfs_notify_dirent_safe(
1717 tmp->rdev->sysfs_state);
1718 }
1719 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1720 } else if (tmp->rdev
1721 && tmp->rdev->recovery_offset == MaxSector
1722 && !test_bit(Faulty, &tmp->rdev->flags)
1723 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1724 count++;
1725 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1726 }
1727 }
1728 spin_lock_irqsave(&conf->device_lock, flags);
1729 mddev->degraded -= count;
1730 spin_unlock_irqrestore(&conf->device_lock, flags);
1731
1732 print_conf(conf);
1733 return count;
1734}
1735
1736static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1737{
1738 struct r10conf *conf = mddev->private;
1739 int err = -EEXIST;
1740 int mirror;
1741 int first = 0;
1742 int last = conf->geo.raid_disks - 1;
1743
1744 if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
1748 return -EBUSY;
1749 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1750 return -EINVAL;
1751
1752 if (md_integrity_add_rdev(rdev, mddev))
1753 return -ENXIO;
1754
1755 if (rdev->raid_disk >= 0)
1756 first = last = rdev->raid_disk;
1757
1758 if (rdev->saved_raid_disk >= first &&
1759 rdev->saved_raid_disk < conf->geo.raid_disks &&
1760 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1761 mirror = rdev->saved_raid_disk;
1762 else
1763 mirror = first;
1764 for ( ; mirror <= last ; mirror++) {
1765 struct raid10_info *p = &conf->mirrors[mirror];
1766 if (p->recovery_disabled == mddev->recovery_disabled)
1767 continue;
1768 if (p->rdev) {
1769 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1770 p->replacement != NULL)
1771 continue;
1772 clear_bit(In_sync, &rdev->flags);
1773 set_bit(Replacement, &rdev->flags);
1774 rdev->raid_disk = mirror;
1775 err = 0;
1776 if (mddev->gendisk)
1777 disk_stack_limits(mddev->gendisk, rdev->bdev,
1778 rdev->data_offset << 9);
1779 conf->fullsync = 1;
1780 rcu_assign_pointer(p->replacement, rdev);
1781 break;
1782 }
1783
1784 if (mddev->gendisk)
1785 disk_stack_limits(mddev->gendisk, rdev->bdev,
1786 rdev->data_offset << 9);
1787
1788 p->head_position = 0;
1789 p->recovery_disabled = mddev->recovery_disabled - 1;
1790 rdev->raid_disk = mirror;
1791 err = 0;
1792 if (rdev->saved_raid_disk != mirror)
1793 conf->fullsync = 1;
1794 rcu_assign_pointer(p->rdev, rdev);
1795 break;
1796 }
1797 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1798 blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
1799
1800 print_conf(conf);
1801 return err;
1802}
1803
1804static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1805{
1806 struct r10conf *conf = mddev->private;
1807 int err = 0;
1808 int number = rdev->raid_disk;
1809 struct md_rdev **rdevp;
1810 struct raid10_info *p = conf->mirrors + number;
1811
1812 print_conf(conf);
1813 if (rdev == p->rdev)
1814 rdevp = &p->rdev;
1815 else if (rdev == p->replacement)
1816 rdevp = &p->replacement;
1817 else
1818 return 0;
1819
1820 if (test_bit(In_sync, &rdev->flags) ||
1821 atomic_read(&rdev->nr_pending)) {
1822 err = -EBUSY;
1823 goto abort;
1824 }
1825
1826
1827
1828 if (!test_bit(Faulty, &rdev->flags) &&
1829 mddev->recovery_disabled != p->recovery_disabled &&
1830 (!p->replacement || p->replacement == rdev) &&
1831 number < conf->geo.raid_disks &&
1832 enough(conf, -1)) {
1833 err = -EBUSY;
1834 goto abort;
1835 }
1836 *rdevp = NULL;
1837 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1838 synchronize_rcu();
1839 if (atomic_read(&rdev->nr_pending)) {
1840
1841 err = -EBUSY;
1842 *rdevp = rdev;
1843 goto abort;
1844 }
1845 }
1846 if (p->replacement) {
1847
1848 p->rdev = p->replacement;
1849 clear_bit(Replacement, &p->replacement->flags);
1850 smp_mb();
1851
1852
1853 p->replacement = NULL;
1854 }
1855
1856 clear_bit(WantReplacement, &rdev->flags);
1857 err = md_integrity_register(mddev);
1858
1859abort:
1860
1861 print_conf(conf);
1862 return err;
1863}
1864
1865static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
1866{
1867 struct r10conf *conf = r10_bio->mddev->private;
1868
1869 if (!bio->bi_status)
1870 set_bit(R10BIO_Uptodate, &r10_bio->state);
1871 else
		/* The write handler will notice the lack of
		 * R10BIO_Uptodate and record any errors etc
		 */
1875 atomic_add(r10_bio->sectors,
1876 &conf->mirrors[d].rdev->corrected_errors);

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
1881 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1882 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1883 atomic_dec_and_test(&r10_bio->remaining)) {
		/* we have read all the blocks,
		 * do the comparison in process context in raid10d
		 */
1887 reschedule_retry(r10_bio);
1888 }
1889}
1890
1891static void end_sync_read(struct bio *bio)
1892{
1893 struct r10bio *r10_bio = get_resync_r10bio(bio);
1894 struct r10conf *conf = r10_bio->mddev->private;
1895 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1896
1897 __end_sync_read(r10_bio, bio, d);
1898}
1899
1900static void end_reshape_read(struct bio *bio)
1901{
1902
1903 struct r10bio *r10_bio = bio->bi_private;
1904
1905 __end_sync_read(r10_bio, bio, r10_bio->read_slot);
1906}
1907
1908static void end_sync_request(struct r10bio *r10_bio)
1909{
1910 struct mddev *mddev = r10_bio->mddev;
1911
1912 while (atomic_dec_and_test(&r10_bio->remaining)) {
1913 if (r10_bio->master_bio == NULL) {
1914
1915 sector_t s = r10_bio->sectors;
1916 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1917 test_bit(R10BIO_WriteError, &r10_bio->state))
1918 reschedule_retry(r10_bio);
1919 else
1920 put_buf(r10_bio);
1921 md_done_sync(mddev, s, 1);
1922 break;
1923 } else {
1924 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1925 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1926 test_bit(R10BIO_WriteError, &r10_bio->state))
1927 reschedule_retry(r10_bio);
1928 else
1929 put_buf(r10_bio);
1930 r10_bio = r10_bio2;
1931 }
1932 }
1933}
1934
1935static void end_sync_write(struct bio *bio)
1936{
1937 struct r10bio *r10_bio = get_resync_r10bio(bio);
1938 struct mddev *mddev = r10_bio->mddev;
1939 struct r10conf *conf = mddev->private;
1940 int d;
1941 sector_t first_bad;
1942 int bad_sectors;
1943 int slot;
1944 int repl;
1945 struct md_rdev *rdev = NULL;
1946
1947 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1948 if (repl)
1949 rdev = conf->mirrors[d].replacement;
1950 else
1951 rdev = conf->mirrors[d].rdev;
1952
1953 if (bio->bi_status) {
1954 if (repl)
1955 md_error(mddev, rdev);
1956 else {
1957 set_bit(WriteErrorSeen, &rdev->flags);
1958 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1959 set_bit(MD_RECOVERY_NEEDED,
1960 &rdev->mddev->recovery);
1961 set_bit(R10BIO_WriteError, &r10_bio->state);
1962 }
1963 } else if (is_badblock(rdev,
1964 r10_bio->devs[slot].addr,
1965 r10_bio->sectors,
1966 &first_bad, &bad_sectors))
1967 set_bit(R10BIO_MadeGood, &r10_bio->state);
1968
1969 rdev_dec_pending(rdev, mddev);
1970
1971 end_sync_request(r10_bio);
1972}
1973

/*
 * Note: sync and recover are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical addresses, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * one of which is always lower numbered than all the others.
 * Our set of virtual addresses must be accounted for.
 *
 * As we setup these structures, we collect all bio's together into a list
 * which we then process collectively to add pages, and then process again
 * to pass to submit_bio_noacct.
 */
1990static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1991{
1992 struct r10conf *conf = mddev->private;
1993 int i, first;
1994 struct bio *tbio, *fbio;
1995 int vcnt;
1996 struct page **tpages, **fpages;
1997
1998 atomic_set(&r10_bio->remaining, 1);
1999
2000
2001 for (i=0; i<conf->copies; i++)
2002 if (!r10_bio->devs[i].bio->bi_status)
2003 break;
2004
2005 if (i == conf->copies)
2006 goto done;
2007
2008 first = i;
2009 fbio = r10_bio->devs[i].bio;
2010 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2011 fbio->bi_iter.bi_idx = 0;
2012 fpages = get_resync_pages(fbio)->pages;
2013
2014 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2015
2016 for (i=0 ; i < conf->copies ; i++) {
2017 int j, d;
2018 struct md_rdev *rdev;
2019 struct resync_pages *rp;
2020
2021 tbio = r10_bio->devs[i].bio;
2022
2023 if (tbio->bi_end_io != end_sync_read)
2024 continue;
2025 if (i == first)
2026 continue;
2027
2028 tpages = get_resync_pages(tbio)->pages;
2029 d = r10_bio->devs[i].devnum;
2030 rdev = conf->mirrors[d].rdev;
2031 if (!r10_bio->devs[i].bio->bi_status) {
			/* We know that the bi_io_vec layout is the same for
			 * both 'first' and 'i', so we just compare them.
			 * All vec entries are PAGE_SIZE;
			 */
2036 int sectors = r10_bio->sectors;
2037 for (j = 0; j < vcnt; j++) {
2038 int len = PAGE_SIZE;
2039 if (sectors < (len / 512))
2040 len = sectors * 512;
2041 if (memcmp(page_address(fpages[j]),
2042 page_address(tpages[j]),
2043 len))
2044 break;
2045 sectors -= len/512;
2046 }
2047 if (j == vcnt)
2048 continue;
2049 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2050 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2051
2052 continue;
2053 } else if (test_bit(FailFast, &rdev->flags)) {
2054
2055 md_error(rdev->mddev, rdev);
2056 continue;
2057 }

		/* Ok, we need to write this bio, either to correct an
		 * inconsistency or to correct an unreadable block.
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted these.
		 */
2063 rp = get_resync_pages(tbio);
2064 bio_reset(tbio);
2065
2066 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2067
2068 rp->raid_bio = r10_bio;
2069 tbio->bi_private = rp;
2070 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2071 tbio->bi_end_io = end_sync_write;
2072 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2073
2074 bio_copy_data(tbio, fbio);
2075
2076 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2077 atomic_inc(&r10_bio->remaining);
2078 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2079
2080 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2081 tbio->bi_opf |= MD_FAILFAST;
2082 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2083 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2084 submit_bio_noacct(tbio);
2085 }

	/*
	 * Now write out to any replacement devices that are active.
	 */
2090 for (i = 0; i < conf->copies; i++) {
2091 int d;
2092
2093 tbio = r10_bio->devs[i].repl_bio;
2094 if (!tbio || !tbio->bi_end_io)
2095 continue;
2096 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2097 && r10_bio->devs[i].bio != fbio)
2098 bio_copy_data(tbio, fbio);
2099 d = r10_bio->devs[i].devnum;
2100 atomic_inc(&r10_bio->remaining);
2101 md_sync_acct(conf->mirrors[d].replacement->bdev,
2102 bio_sectors(tbio));
2103 submit_bio_noacct(tbio);
2104 }
2105
2106done:
2107 if (atomic_dec_and_test(&r10_bio->remaining)) {
2108 md_done_sync(mddev, r10_bio->sectors, 1);
2109 put_buf(r10_bio);
2110 }
2111}
2112

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use. The first for reading,
 * the second for writing.
 */
2123static void fix_recovery_read_error(struct r10bio *r10_bio)
2124{
	/* We got a read error during recovery.
	 * We repeat the read in smaller page-sized sections.
	 * If a read succeeds, write it to the new device or record
	 * a bad block if we cannot.
	 * If a read fails, record a bad block on both old and
	 * new devices.
	 */
2132 struct mddev *mddev = r10_bio->mddev;
2133 struct r10conf *conf = mddev->private;
2134 struct bio *bio = r10_bio->devs[0].bio;
2135 sector_t sect = 0;
2136 int sectors = r10_bio->sectors;
2137 int idx = 0;
2138 int dr = r10_bio->devs[0].devnum;
2139 int dw = r10_bio->devs[1].devnum;
2140 struct page **pages = get_resync_pages(bio)->pages;
2141
2142 while (sectors) {
2143 int s = sectors;
2144 struct md_rdev *rdev;
2145 sector_t addr;
2146 int ok;
2147
2148 if (s > (PAGE_SIZE>>9))
2149 s = PAGE_SIZE >> 9;
2150
2151 rdev = conf->mirrors[dr].rdev;
2152 addr = r10_bio->devs[0].addr + sect,
2153 ok = sync_page_io(rdev,
2154 addr,
2155 s << 9,
2156 pages[idx],
2157 REQ_OP_READ, 0, false);
2158 if (ok) {
2159 rdev = conf->mirrors[dw].rdev;
2160 addr = r10_bio->devs[1].addr + sect;
2161 ok = sync_page_io(rdev,
2162 addr,
2163 s << 9,
2164 pages[idx],
2165 REQ_OP_WRITE, 0, false);
2166 if (!ok) {
2167 set_bit(WriteErrorSeen, &rdev->flags);
2168 if (!test_and_set_bit(WantReplacement,
2169 &rdev->flags))
2170 set_bit(MD_RECOVERY_NEEDED,
2171 &rdev->mddev->recovery);
2172 }
2173 }
2174 if (!ok) {
			/* We don't worry if we cannot set a bad block -
			 * it really is bad so there is no loss in not
			 * recording it yet
			 */
2179 rdev_set_badblocks(rdev, addr, s, 0);
2180
2181 if (rdev != conf->mirrors[dw].rdev) {
2182
2183 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2184 addr = r10_bio->devs[1].addr + sect;
2185 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2186 if (!ok) {
2187
2188 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2189 mdname(mddev));
2190
2191 conf->mirrors[dw].recovery_disabled
2192 = mddev->recovery_disabled;
2193 set_bit(MD_RECOVERY_INTR,
2194 &mddev->recovery);
2195 break;
2196 }
2197 }
2198 }
2199
2200 sectors -= s;
2201 sect += s;
2202 idx++;
2203 }
2204}
2205
2206static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2207{
2208 struct r10conf *conf = mddev->private;
2209 int d;
2210 struct bio *wbio, *wbio2;
2211
2212 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2213 fix_recovery_read_error(r10_bio);
2214 end_sync_request(r10_bio);
2215 return;
2216 }

	/*
	 * share the pages with the first bio
	 * and submit the write request
	 */
2222 d = r10_bio->devs[1].devnum;
2223 wbio = r10_bio->devs[1].bio;
2224 wbio2 = r10_bio->devs[1].repl_bio;
	/* Need to test wbio2->bi_end_io before we call
	 * submit_bio_noacct as if the former is NULL,
	 * the latter is free to free wbio2.
	 */
2229 if (wbio2 && !wbio2->bi_end_io)
2230 wbio2 = NULL;
2231 if (wbio->bi_end_io) {
2232 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2233 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2234 submit_bio_noacct(wbio);
2235 }
2236 if (wbio2) {
2237 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2238 md_sync_acct(conf->mirrors[d].replacement->bdev,
2239 bio_sectors(wbio2));
2240 submit_bio_noacct(wbio2);
2241 }
2242}
2243
/*
 * Used by fix_read_error() to decay the per rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
2250static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2251{
2252 long cur_time_mon;
2253 unsigned long hours_since_last;
2254 unsigned int read_errors = atomic_read(&rdev->read_errors);
2255
2256 cur_time_mon = ktime_get_seconds();
2257
2258 if (rdev->last_read_error == 0) {
2259
2260 rdev->last_read_error = cur_time_mon;
2261 return;
2262 }
2263
2264 hours_since_last = (long)(cur_time_mon -
2265 rdev->last_read_error) / 3600;
2266
2267 rdev->last_read_error = cur_time_mon;
2268
	/*
	 * If hours_since_last is > the number of bits in read_errors
	 * just set read errors to 0. We do this to avoid
	 * overflowing the shift of read_errors by hours_since_last.
	 */
2274 if (hours_since_last >= 8 * sizeof(read_errors))
2275 atomic_set(&rdev->read_errors, 0);
2276 else
2277 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2278}
2279
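/*
 * Synchronously read or write one range on an rdev, recording a bad block
 * (and possibly failing the device) if the IO cannot be completed.
 */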
2280static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2281 int sectors, struct page *page, int rw)
2282{
2283 sector_t first_bad;
2284 int bad_sectors;
2285
2286 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2287 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2288 return -1;
2289 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2290
2291 return 1;
2292 if (rw == WRITE) {
2293 set_bit(WriteErrorSeen, &rdev->flags);
2294 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2295 set_bit(MD_RECOVERY_NEEDED,
2296 &rdev->mddev->recovery);
2297 }
2298
2299 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2300 md_error(rdev->mddev, rdev);
2301 return 0;
2302}
2303

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems encounter.
 *	3.	Performs writes following reads for array synchronising.
 */
2312static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2313{
2314 int sect = 0;
2315 int sectors = r10_bio->sectors;
2316 struct md_rdev *rdev;
2317 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2318 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2319
	/* still own a reference to this rdev, so it cannot
	 * have been cleared recently.
	 */
2323 rdev = conf->mirrors[d].rdev;
2324
2325 if (test_bit(Faulty, &rdev->flags))
2326
2327
2328 return;
2329
2330 check_decay_read_errors(mddev, rdev);
2331 atomic_inc(&rdev->read_errors);
2332 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2333 char b[BDEVNAME_SIZE];
2334 bdevname(rdev->bdev, b);
2335
2336 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2337 mdname(mddev), b,
2338 atomic_read(&rdev->read_errors), max_read_errors);
2339 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2340 mdname(mddev), b);
2341 md_error(mddev, rdev);
2342 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2343 return;
2344 }
2345
2346 while(sectors) {
2347 int s = sectors;
2348 int sl = r10_bio->read_slot;
2349 int success = 0;
2350 int start;
2351
2352 if (s > (PAGE_SIZE>>9))
2353 s = PAGE_SIZE >> 9;
2354
2355 rcu_read_lock();
2356 do {
2357 sector_t first_bad;
2358 int bad_sectors;
2359
2360 d = r10_bio->devs[sl].devnum;
2361 rdev = rcu_dereference(conf->mirrors[d].rdev);
2362 if (rdev &&
2363 test_bit(In_sync, &rdev->flags) &&
2364 !test_bit(Faulty, &rdev->flags) &&
2365 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2366 &first_bad, &bad_sectors) == 0) {
2367 atomic_inc(&rdev->nr_pending);
2368 rcu_read_unlock();
2369 success = sync_page_io(rdev,
2370 r10_bio->devs[sl].addr +
2371 sect,
2372 s<<9,
2373 conf->tmppage,
2374 REQ_OP_READ, 0, false);
2375 rdev_dec_pending(rdev, mddev);
2376 rcu_read_lock();
2377 if (success)
2378 break;
2379 }
2380 sl++;
2381 if (sl == conf->copies)
2382 sl = 0;
2383 } while (!success && sl != r10_bio->read_slot);
2384 rcu_read_unlock();
2385
2386 if (!success) {
			/* Cannot read from anywhere, just mark the block
			 * as bad on the first device to discourage future
			 * reads.
			 */
2391 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2392 rdev = conf->mirrors[dn].rdev;
2393
2394 if (!rdev_set_badblocks(
2395 rdev,
2396 r10_bio->devs[r10_bio->read_slot].addr
2397 + sect,
2398 s, 0)) {
2399 md_error(mddev, rdev);
2400 r10_bio->devs[r10_bio->read_slot].bio
2401 = IO_BLOCKED;
2402 }
2403 break;
2404 }
2405
2406 start = sl;
2407
2408 rcu_read_lock();
2409 while (sl != r10_bio->read_slot) {
2410 char b[BDEVNAME_SIZE];
2411
2412 if (sl==0)
2413 sl = conf->copies;
2414 sl--;
2415 d = r10_bio->devs[sl].devnum;
2416 rdev = rcu_dereference(conf->mirrors[d].rdev);
2417 if (!rdev ||
2418 test_bit(Faulty, &rdev->flags) ||
2419 !test_bit(In_sync, &rdev->flags))
2420 continue;
2421
2422 atomic_inc(&rdev->nr_pending);
2423 rcu_read_unlock();
2424 if (r10_sync_page_io(rdev,
2425 r10_bio->devs[sl].addr +
2426 sect,
2427 s, conf->tmppage, WRITE)
2428 == 0) {
				/* write correction failed: this device is dead */
2430 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2431 mdname(mddev), s,
2432 (unsigned long long)(
2433 sect +
2434 choose_data_offset(r10_bio,
2435 rdev)),
2436 bdevname(rdev->bdev, b));
2437 pr_notice("md/raid10:%s: %s: failing drive\n",
2438 mdname(mddev),
2439 bdevname(rdev->bdev, b));
2440 }
2441 rdev_dec_pending(rdev, mddev);
2442 rcu_read_lock();
2443 }
2444 sl = start;
2445 while (sl != r10_bio->read_slot) {
2446 char b[BDEVNAME_SIZE];
2447
2448 if (sl==0)
2449 sl = conf->copies;
2450 sl--;
2451 d = r10_bio->devs[sl].devnum;
2452 rdev = rcu_dereference(conf->mirrors[d].rdev);
2453 if (!rdev ||
2454 test_bit(Faulty, &rdev->flags) ||
2455 !test_bit(In_sync, &rdev->flags))
2456 continue;
2457
2458 atomic_inc(&rdev->nr_pending);
2459 rcu_read_unlock();
2460 switch (r10_sync_page_io(rdev,
2461 r10_bio->devs[sl].addr +
2462 sect,
2463 s, conf->tmppage,
2464 READ)) {
2465 case 0:
				/* read-back failed: this device is dead */
2467 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2468 mdname(mddev), s,
2469 (unsigned long long)(
2470 sect +
2471 choose_data_offset(r10_bio, rdev)),
2472 bdevname(rdev->bdev, b));
2473 pr_notice("md/raid10:%s: %s: failing drive\n",
2474 mdname(mddev),
2475 bdevname(rdev->bdev, b));
2476 break;
2477 case 1:
2478 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2479 mdname(mddev), s,
2480 (unsigned long long)(
2481 sect +
2482 choose_data_offset(r10_bio, rdev)),
2483 bdevname(rdev->bdev, b));
2484 atomic_add(s, &rdev->corrected_errors);
2485 }
2486
2487 rdev_dec_pending(rdev, mddev);
2488 rcu_read_lock();
2489 }
2490 rcu_read_unlock();
2491
2492 sectors -= s;
2493 sect += s;
2494 }
2495}
2496
2497static int narrow_write_error(struct r10bio *r10_bio, int i)
2498{
2499 struct bio *bio = r10_bio->master_bio;
2500 struct mddev *mddev = r10_bio->mddev;
2501 struct r10conf *conf = mddev->private;
2502 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2503
	/*
	 * The master bio holds the data we just failed to write to slot 'i'.
	 * Re-issue it one badblock-sized chunk at a time: clone the bio,
	 * trim it down to one chunk and retry the write; wherever the write
	 * still fails, record a bad block.  The bio may not be aligned to
	 * badblock boundaries, so the first chunk can be short.
	 *
	 * We currently own a reference to the rdev.
	 */
2514 int block_sectors;
2515 sector_t sector;
2516 int sectors;
2517 int sect_to_write = r10_bio->sectors;
2518 int ok = 1;
2519
2520 if (rdev->badblocks.shift < 0)
2521 return 0;
2522
2523 block_sectors = roundup(1 << rdev->badblocks.shift,
2524 bdev_logical_block_size(rdev->bdev) >> 9);
2525 sector = r10_bio->sector;
2526 sectors = ((r10_bio->sector + block_sectors)
2527 & ~(sector_t)(block_sectors - 1))
2528 - sector;
2529
2530 while (sect_to_write) {
2531 struct bio *wbio;
2532 sector_t wsector;
2533 if (sectors > sect_to_write)
2534 sectors = sect_to_write;
2535
2536 wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
2537 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2538 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2539 wbio->bi_iter.bi_sector = wsector +
2540 choose_data_offset(r10_bio, rdev);
2541 bio_set_dev(wbio, rdev->bdev);
2542 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2543
2544 if (submit_bio_wait(wbio) < 0)
			/* write failed: record these sectors as bad */
2546 ok = rdev_set_badblocks(rdev, wsector,
2547 sectors, 0)
2548 && ok;
2549
2550 bio_put(wbio);
2551 sect_to_write -= sectors;
2552 sector += sectors;
2553 sectors = block_sectors;
2554 }
2555 return ok;
2556}
2557
2558static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2559{
2560 int slot = r10_bio->read_slot;
2561 struct bio *bio;
2562 struct r10conf *conf = mddev->private;
2563 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2564
	/*
	 * We got a read error.  Maybe the drive is bad, or maybe just this
	 * block and we can fix it.  Freeze all other I/O and try reading the
	 * block from the other copies; when a good copy is found it is
	 * written back over the bad block and verified.  This is all done
	 * synchronously while the array is frozen, after which the original
	 * request is retried.
	 */
2573 bio = r10_bio->devs[slot].bio;
2574 bio_put(bio);
2575 r10_bio->devs[slot].bio = NULL;
2576
2577 if (mddev->ro)
2578 r10_bio->devs[slot].bio = IO_BLOCKED;
2579 else if (!test_bit(FailFast, &rdev->flags)) {
2580 freeze_array(conf, 1);
2581 fix_read_error(conf, mddev, r10_bio);
2582 unfreeze_array(conf);
2583 } else
2584 md_error(mddev, rdev);
2585
2586 rdev_dec_pending(rdev, mddev);
2587 allow_barrier(conf);
2588 r10_bio->state = 0;
2589 raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2590}
2591
2592static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2593{
	/*
	 * Some sort of write request has finished and it succeeded in
	 * writing where we thought there was a bad block, so forget the bad
	 * block.  Or possibly it failed and we need to record a bad block.
	 */
2600 int m;
2601 struct md_rdev *rdev;
2602
2603 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2604 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2605 for (m = 0; m < conf->copies; m++) {
2606 int dev = r10_bio->devs[m].devnum;
2607 rdev = conf->mirrors[dev].rdev;
2608 if (r10_bio->devs[m].bio == NULL ||
2609 r10_bio->devs[m].bio->bi_end_io == NULL)
2610 continue;
2611 if (!r10_bio->devs[m].bio->bi_status) {
2612 rdev_clear_badblocks(
2613 rdev,
2614 r10_bio->devs[m].addr,
2615 r10_bio->sectors, 0);
2616 } else {
2617 if (!rdev_set_badblocks(
2618 rdev,
2619 r10_bio->devs[m].addr,
2620 r10_bio->sectors, 0))
2621 md_error(conf->mddev, rdev);
2622 }
2623 rdev = conf->mirrors[dev].replacement;
2624 if (r10_bio->devs[m].repl_bio == NULL ||
2625 r10_bio->devs[m].repl_bio->bi_end_io == NULL)
2626 continue;
2627
2628 if (!r10_bio->devs[m].repl_bio->bi_status) {
2629 rdev_clear_badblocks(
2630 rdev,
2631 r10_bio->devs[m].addr,
2632 r10_bio->sectors, 0);
2633 } else {
2634 if (!rdev_set_badblocks(
2635 rdev,
2636 r10_bio->devs[m].addr,
2637 r10_bio->sectors, 0))
2638 md_error(conf->mddev, rdev);
2639 }
2640 }
2641 put_buf(r10_bio);
2642 } else {
2643 bool fail = false;
2644 for (m = 0; m < conf->copies; m++) {
2645 int dev = r10_bio->devs[m].devnum;
2646 struct bio *bio = r10_bio->devs[m].bio;
2647 rdev = conf->mirrors[dev].rdev;
2648 if (bio == IO_MADE_GOOD) {
2649 rdev_clear_badblocks(
2650 rdev,
2651 r10_bio->devs[m].addr,
2652 r10_bio->sectors, 0);
2653 rdev_dec_pending(rdev, conf->mddev);
2654 } else if (bio != NULL && bio->bi_status) {
2655 fail = true;
2656 if (!narrow_write_error(r10_bio, m)) {
2657 md_error(conf->mddev, rdev);
2658 set_bit(R10BIO_Degraded,
2659 &r10_bio->state);
2660 }
2661 rdev_dec_pending(rdev, conf->mddev);
2662 }
2663 bio = r10_bio->devs[m].repl_bio;
2664 rdev = conf->mirrors[dev].replacement;
2665 if (rdev && bio == IO_MADE_GOOD) {
2666 rdev_clear_badblocks(
2667 rdev,
2668 r10_bio->devs[m].addr,
2669 r10_bio->sectors, 0);
2670 rdev_dec_pending(rdev, conf->mddev);
2671 }
2672 }
2673 if (fail) {
2674 spin_lock_irq(&conf->device_lock);
2675 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2676 conf->nr_queued++;
2677 spin_unlock_irq(&conf->device_lock);
			/*
			 * In case freeze_array() is waiting for
			 * nr_pending to catch up with the queued count.
			 */
2682 wake_up(&conf->wait_barrier);
2683 md_wakeup_thread(conf->mddev->thread);
2684 } else {
2685 if (test_bit(R10BIO_WriteError,
2686 &r10_bio->state))
2687 close_write(r10_bio);
2688 raid_end_bio_io(r10_bio);
2689 }
2690 }
2691}
2692
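/*
 * The raid10 daemon: retries failed reads, hands completed sync, recovery
 * and reshape writes to the appropriate handler, and finishes bios that
 * were queued until the superblock could be updated.
 */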
2693static void raid10d(struct md_thread *thread)
2694{
2695 struct mddev *mddev = thread->mddev;
2696 struct r10bio *r10_bio;
2697 unsigned long flags;
2698 struct r10conf *conf = mddev->private;
2699 struct list_head *head = &conf->retry_list;
2700 struct blk_plug plug;
2701
2702 md_check_recovery(mddev);
2703
2704 if (!list_empty_careful(&conf->bio_end_io_list) &&
2705 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2706 LIST_HEAD(tmp);
2707 spin_lock_irqsave(&conf->device_lock, flags);
2708 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2709 while (!list_empty(&conf->bio_end_io_list)) {
2710 list_move(conf->bio_end_io_list.prev, &tmp);
2711 conf->nr_queued--;
2712 }
2713 }
2714 spin_unlock_irqrestore(&conf->device_lock, flags);
2715 while (!list_empty(&tmp)) {
2716 r10_bio = list_first_entry(&tmp, struct r10bio,
2717 retry_list);
2718 list_del(&r10_bio->retry_list);
2719 if (mddev->degraded)
2720 set_bit(R10BIO_Degraded, &r10_bio->state);
2721
2722 if (test_bit(R10BIO_WriteError,
2723 &r10_bio->state))
2724 close_write(r10_bio);
2725 raid_end_bio_io(r10_bio);
2726 }
2727 }
2728
2729 blk_start_plug(&plug);
2730 for (;;) {
2731
2732 flush_pending_writes(conf);
2733
2734 spin_lock_irqsave(&conf->device_lock, flags);
2735 if (list_empty(head)) {
2736 spin_unlock_irqrestore(&conf->device_lock, flags);
2737 break;
2738 }
2739 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2740 list_del(head->prev);
2741 conf->nr_queued--;
2742 spin_unlock_irqrestore(&conf->device_lock, flags);
2743
2744 mddev = r10_bio->mddev;
2745 conf = mddev->private;
2746 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2747 test_bit(R10BIO_WriteError, &r10_bio->state))
2748 handle_write_completed(conf, r10_bio);
2749 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2750 reshape_request_write(mddev, r10_bio);
2751 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2752 sync_request_write(mddev, r10_bio);
2753 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2754 recovery_request_write(mddev, r10_bio);
2755 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2756 handle_read_error(mddev, r10_bio);
2757 else
2758 WARN_ON_ONCE(1);
2759
2760 cond_resched();
2761 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2762 md_check_recovery(mddev);
2763 }
2764 blk_finish_plug(&plug);
2765}
2766
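/*
 * Allocate the pool of r10buf buffers used for resync/recovery and note
 * whether any replacement devices are present.
 */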
2767static int init_resync(struct r10conf *conf)
2768{
2769 int ret, buffs, i;
2770
2771 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2772 BUG_ON(mempool_initialized(&conf->r10buf_pool));
2773 conf->have_replacement = 0;
2774 for (i = 0; i < conf->geo.raid_disks; i++)
2775 if (conf->mirrors[i].replacement)
2776 conf->have_replacement = 1;
2777 ret = mempool_init(&conf->r10buf_pool, buffs,
2778 r10buf_pool_alloc, r10buf_pool_free, conf);
2779 if (ret)
2780 return ret;
2781 conf->next_resync = 0;
2782 return 0;
2783}
2784
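/*
 * Get an r10bio from the resync buffer pool and reset its bios while
 * preserving the resync_pages pointer stashed in bi_private.
 */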
2785static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
2786{
2787 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
	struct resync_pages *rp;
2789 struct bio *bio;
2790 int nalloc;
2791 int i;
2792
2793 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
2794 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
2795 nalloc = conf->copies;
2796 else
2797 nalloc = 2;
2798
2799 for (i = 0; i < nalloc; i++) {
2800 bio = r10bio->devs[i].bio;
2801 rp = bio->bi_private;
2802 bio_reset(bio);
2803 bio->bi_private = rp;
2804 bio = r10bio->devs[i].repl_bio;
2805 if (bio) {
2806 rp = bio->bi_private;
2807 bio_reset(bio);
2808 bio->bi_private = rp;
2809 }
2810 }
2811 return r10bio;
2812}
2813
/*
 * Set cluster_sync_high so that other nodes can add the range
 * [cluster_sync_low, cluster_sync_high] to their suspend list.
 */
2818static void raid10_set_cluster_sync_high(struct r10conf *conf)
2819{
2820 sector_t window_size;
2821 int extra_chunk, chunks;
2822
	/*
	 * One "stripe" spans every member device once, i.e.
	 * raid_disks / near_copies chunks (rounded up by one extra chunk
	 * when that does not divide evenly).  The resync window that other
	 * nodes must suspend is sized as one such stripe.
	 */
2835 chunks = conf->geo.raid_disks / conf->geo.near_copies;
2836 if (conf->geo.raid_disks % conf->geo.near_copies == 0)
2837 extra_chunk = 0;
2838 else
2839 extra_chunk = 1;
2840 window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
2841
	/*
	 * Use at least CLUSTER_RESYNC_WINDOW_SECTORS so the window is
	 * never too small.
	 */
2845 window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
2846 CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
2847
2848 conf->cluster_sync_high = conf->cluster_sync_low + window_size;
2849}
2850
/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.  This is achieved by
 * tracking pending requests and a 'barrier' concept that can be installed
 * to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently; we distinguish them
 * with MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync we iterate over virtual addresses, read all copies and update
 * if there are differences.  If only one copy is live, skip it.
 * For recovery we iterate over physical addresses, read a good value for
 * each non-in_sync drive and over-write.
 *
 * So for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by
 * allocating one r10_bio per out-of-sync device, linked together through
 * the (borrowed) master_bio pointer and counted in ->remaining.  All their
 * bios are collected into one list, processed once to add pages and again
 * to submit.
 */
2883static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2884 int *skipped)
2885{
2886 struct r10conf *conf = mddev->private;
2887 struct r10bio *r10_bio;
2888 struct bio *biolist = NULL, *bio;
2889 sector_t max_sector, nr_sectors;
2890 int i;
2891 int max_sync;
2892 sector_t sync_blocks;
2893 sector_t sectors_skipped = 0;
2894 int chunks_skipped = 0;
2895 sector_t chunk_mask = conf->geo.chunk_mask;
2896 int page_idx = 0;
2897
2898 if (!mempool_initialized(&conf->r10buf_pool))
2899 if (init_resync(conf))
2900 return 0;
2901
	/*
	 * Allow skipping a full rebuild for incremental assembly of a clean
	 * array, like RAID1 does.
	 */
2906 if (mddev->bitmap == NULL &&
2907 mddev->recovery_cp == MaxSector &&
2908 mddev->reshape_position == MaxSector &&
2909 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2910 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2911 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2912 conf->fullsync == 0) {
2913 *skipped = 1;
2914 return mddev->dev_sectors - sector_nr;
2915 }
2916
2917 skipped:
2918 max_sector = mddev->dev_sectors;
2919 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2920 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2921 max_sector = mddev->resync_max_sectors;
2922 if (sector_nr >= max_sector) {
2923 conf->cluster_sync_low = 0;
2924 conf->cluster_sync_high = 0;
		/*
		 * If we aborted, we need to end the sync on the 'current'
		 * bitmap chunks (there can be several when recovering
		 * multiple devices), as we may have started syncing them
		 * but not finished.  We can find the current address in
		 * mddev->curr_resync, but for recovery we need to convert
		 * that to several virtual addresses, one per raid disk.
		 */
2935 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2936 end_reshape(conf);
2937 close_sync(conf);
2938 return 0;
2939 }
2940
2941 if (mddev->curr_resync < max_sector) {
2942 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2943 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2944 &sync_blocks, 1);
2945 else for (i = 0; i < conf->geo.raid_disks; i++) {
2946 sector_t sect =
2947 raid10_find_virt(conf, mddev->curr_resync, i);
2948 md_bitmap_end_sync(mddev->bitmap, sect,
2949 &sync_blocks, 1);
2950 }
2951 } else {
			/* completed sync */
2953 if ((!mddev->bitmap || conf->fullsync)
2954 && conf->have_replacement
2955 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
				/*
				 * Completed a full sync, so the replacements
				 * are now fully recovered.
				 */
2959 rcu_read_lock();
2960 for (i = 0; i < conf->geo.raid_disks; i++) {
2961 struct md_rdev *rdev =
2962 rcu_dereference(conf->mirrors[i].replacement);
2963 if (rdev)
2964 rdev->recovery_offset = MaxSector;
2965 }
2966 rcu_read_unlock();
2967 }
2968 conf->fullsync = 0;
2969 }
2970 md_bitmap_close_sync(mddev->bitmap);
2971 close_sync(conf);
2972 *skipped = 1;
2973 return sectors_skipped;
2974 }
2975
2976 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2977 return reshape_request(mddev, sector_nr, skipped);
2978
2979 if (chunks_skipped >= conf->geo.raid_disks) {
		/*
		 * If there has been nothing to do on any drive,
		 * then there is nothing to do at all.
		 */
2983 *skipped = 1;
2984 return (max_sector - sector_nr) + sectors_skipped;
2985 }
2986
2987 if (max_sector > mddev->resync_max)
2988 max_sector = mddev->resync_max;
2989
	/*
	 * Make sure the whole request will fit in a chunk, if chunks are
	 * meaningful (i.e. there is more than one near set).
	 */
2993 if (conf->geo.near_copies < conf->geo.raid_disks &&
2994 max_sector > (sector_nr | chunk_mask))
2995 max_sector = (sector_nr | chunk_mask) + 1;
2996
	/*
	 * If there is non-resync activity waiting for a turn, then let it
	 * through before starting on this new sync request.
	 */
3001 if (conf->nr_waiting)
3002 schedule_timeout_uninterruptible(1);
3003
	/*
	 * Again, very different code for resync and recovery.  Both must
	 * result in an r10bio with a list of bios that have bi_end_io, the
	 * target device and sector set, and bi_private pointing at the
	 * resync pages.  For recovery we may actually create several
	 * r10bios, one per device being recovered, linked back through a
	 * borrowed master_bio pointer and counted in ->remaining.
	 *
	 * First decide what to do with each slot and set bi_end_io:
	 * end_sync_read where we want to read, end_sync_write where we
	 * will want to write.
	 */
3019 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3020 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* recovery... the complicated one */
3022 int j;
3023 r10_bio = NULL;
3024
3025 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3026 int still_degraded;
3027 struct r10bio *rb2;
3028 sector_t sect;
3029 int must_sync;
3030 int any_working;
3031 int need_recover = 0;
3032 int need_replace = 0;
3033 struct raid10_info *mirror = &conf->mirrors[i];
3034 struct md_rdev *mrdev, *mreplace;
3035
3036 rcu_read_lock();
3037 mrdev = rcu_dereference(mirror->rdev);
3038 mreplace = rcu_dereference(mirror->replacement);
3039
3040 if (mrdev != NULL &&
3041 !test_bit(Faulty, &mrdev->flags) &&
3042 !test_bit(In_sync, &mrdev->flags))
3043 need_recover = 1;
3044 if (mreplace != NULL &&
3045 !test_bit(Faulty, &mreplace->flags))
3046 need_replace = 1;
3047
3048 if (!need_recover && !need_replace) {
3049 rcu_read_unlock();
3050 continue;
3051 }
3052
3053 still_degraded = 0;
3054
3055 rb2 = r10_bio;
3056 sect = raid10_find_virt(conf, sector_nr, i);
3057 if (sect >= mddev->resync_max_sectors) {
				/*
				 * The last stripe is not complete, so do not
				 * try to recover this sector.
				 */
3061 rcu_read_unlock();
3062 continue;
3063 }
3064 if (mreplace && test_bit(Faulty, &mreplace->flags))
3065 mreplace = NULL;
			/*
			 * Unless we are doing a full sync, or a replacement,
			 * we only need to recover the block if it is set in
			 * the bitmap.
			 */
3070 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3071 &sync_blocks, 1);
3072 if (sync_blocks < max_sync)
3073 max_sync = sync_blocks;
3074 if (!must_sync &&
3075 mreplace == NULL &&
3076 !conf->fullsync) {
				/*
				 * Yes, we can skip the sync for this block,
				 * but don't assume there will never be
				 * anything to do here.
				 */
3080 chunks_skipped = -1;
3081 rcu_read_unlock();
3082 continue;
3083 }
3084 atomic_inc(&mrdev->nr_pending);
3085 if (mreplace)
3086 atomic_inc(&mreplace->nr_pending);
3087 rcu_read_unlock();
3088
3089 r10_bio = raid10_alloc_init_r10buf(conf);
3090 r10_bio->state = 0;
3091 raise_barrier(conf, rb2 != NULL);
3092 atomic_set(&r10_bio->remaining, 0);
3093
3094 r10_bio->master_bio = (struct bio*)rb2;
3095 if (rb2)
3096 atomic_inc(&rb2->remaining);
3097 r10_bio->mddev = mddev;
3098 set_bit(R10BIO_IsRecover, &r10_bio->state);
3099 r10_bio->sector = sect;
3100
3101 raid10_find_phys(conf, r10_bio);
3102
			/*
			 * Need to check whether the array will still be
			 * degraded after recovering this device.
			 */
3106 rcu_read_lock();
3107 for (j = 0; j < conf->geo.raid_disks; j++) {
3108 struct md_rdev *rdev = rcu_dereference(
3109 conf->mirrors[j].rdev);
3110 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3111 still_degraded = 1;
3112 break;
3113 }
3114 }
3115
3116 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3117 &sync_blocks, still_degraded);
3118
3119 any_working = 0;
3120 for (j=0; j<conf->copies;j++) {
3121 int k;
3122 int d = r10_bio->devs[j].devnum;
3123 sector_t from_addr, to_addr;
3124 struct md_rdev *rdev =
3125 rcu_dereference(conf->mirrors[d].rdev);
3126 sector_t sector, first_bad;
3127 int bad_sectors;
3128 if (!rdev ||
3129 !test_bit(In_sync, &rdev->flags))
3130 continue;
3131
3132 any_working = 1;
3133 sector = r10_bio->devs[j].addr;
3134
3135 if (is_badblock(rdev, sector, max_sync,
3136 &first_bad, &bad_sectors)) {
3137 if (first_bad > sector)
3138 max_sync = first_bad - sector;
3139 else {
3140 bad_sectors -= (sector
3141 - first_bad);
3142 if (max_sync > bad_sectors)
3143 max_sync = bad_sectors;
3144 continue;
3145 }
3146 }
3147 bio = r10_bio->devs[0].bio;
3148 bio->bi_next = biolist;
3149 biolist = bio;
3150 bio->bi_end_io = end_sync_read;
3151 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3152 if (test_bit(FailFast, &rdev->flags))
3153 bio->bi_opf |= MD_FAILFAST;
3154 from_addr = r10_bio->devs[j].addr;
3155 bio->bi_iter.bi_sector = from_addr +
3156 rdev->data_offset;
3157 bio_set_dev(bio, rdev->bdev);
3158 atomic_inc(&rdev->nr_pending);
3159
3160
3161 for (k=0; k<conf->copies; k++)
3162 if (r10_bio->devs[k].devnum == i)
3163 break;
3164 BUG_ON(k == conf->copies);
3165 to_addr = r10_bio->devs[k].addr;
3166 r10_bio->devs[0].devnum = d;
3167 r10_bio->devs[0].addr = from_addr;
3168 r10_bio->devs[1].devnum = i;
3169 r10_bio->devs[1].addr = to_addr;
3170
3171 if (need_recover) {
3172 bio = r10_bio->devs[1].bio;
3173 bio->bi_next = biolist;
3174 biolist = bio;
3175 bio->bi_end_io = end_sync_write;
3176 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3177 bio->bi_iter.bi_sector = to_addr
3178 + mrdev->data_offset;
3179 bio_set_dev(bio, mrdev->bdev);
3180 atomic_inc(&r10_bio->remaining);
3181 } else
3182 r10_bio->devs[1].bio->bi_end_io = NULL;
3183
				/* and maybe write to the replacement */
3185 bio = r10_bio->devs[1].repl_bio;
3186 if (bio)
3187 bio->bi_end_io = NULL;
				/*
				 * Note: if need_replace, then bio cannot be
				 * NULL as r10buf_pool_alloc will have
				 * allocated it.
				 */
3192 if (!need_replace)
3193 break;
3194 bio->bi_next = biolist;
3195 biolist = bio;
3196 bio->bi_end_io = end_sync_write;
3197 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3198 bio->bi_iter.bi_sector = to_addr +
3199 mreplace->data_offset;
3200 bio_set_dev(bio, mreplace->bdev);
3201 atomic_inc(&r10_bio->remaining);
3202 break;
3203 }
3204 rcu_read_unlock();
3205 if (j == conf->copies) {
				/* Cannot recover, so abort the recovery or
				 * record a bad block */
3208 if (any_working) {
					/*
					 * The problem is that there are bad
					 * blocks on other device(s).
					 */
3212 int k;
3213 for (k = 0; k < conf->copies; k++)
3214 if (r10_bio->devs[k].devnum == i)
3215 break;
3216 if (!test_bit(In_sync,
3217 &mrdev->flags)
3218 && !rdev_set_badblocks(
3219 mrdev,
3220 r10_bio->devs[k].addr,
3221 max_sync, 0))
3222 any_working = 0;
3223 if (mreplace &&
3224 !rdev_set_badblocks(
3225 mreplace,
3226 r10_bio->devs[k].addr,
3227 max_sync, 0))
3228 any_working = 0;
3229 }
3230 if (!any_working) {
3231 if (!test_and_set_bit(MD_RECOVERY_INTR,
3232 &mddev->recovery))
3233 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3234 mdname(mddev));
3235 mirror->recovery_disabled
3236 = mddev->recovery_disabled;
3237 }
3238 put_buf(r10_bio);
3239 if (rb2)
3240 atomic_dec(&rb2->remaining);
3241 r10_bio = rb2;
3242 rdev_dec_pending(mrdev, mddev);
3243 if (mreplace)
3244 rdev_dec_pending(mreplace, mddev);
3245 break;
3246 }
3247 rdev_dec_pending(mrdev, mddev);
3248 if (mreplace)
3249 rdev_dec_pending(mreplace, mddev);
3250 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
				/*
				 * Only keep FAILFAST if there is somewhere
				 * else to read from; 'j' is currently the
				 * first readable copy.
				 */
3255 int targets = 1;
3256 for (; j < conf->copies; j++) {
3257 int d = r10_bio->devs[j].devnum;
3258 if (conf->mirrors[d].rdev &&
3259 test_bit(In_sync,
3260 &conf->mirrors[d].rdev->flags))
3261 targets++;
3262 }
3263 if (targets == 1)
3264 r10_bio->devs[0].bio->bi_opf
3265 &= ~MD_FAILFAST;
3266 }
3267 }
3268 if (biolist == NULL) {
3269 while (r10_bio) {
3270 struct r10bio *rb2 = r10_bio;
3271 r10_bio = (struct r10bio*) rb2->master_bio;
3272 rb2->master_bio = NULL;
3273 put_buf(rb2);
3274 }
3275 goto giveup;
3276 }
3277 } else {
		/* resync: schedule a read for every block at this virtual offset */
3279 int count = 0;
		/*
		 * Since curr_resync_completed may not have been updated yet,
		 * and cluster_sync_low is set from it, check against
		 * "sector_nr + 2 * RESYNC_SECTORS" so that
		 * curr_resync_completed gets refreshed inside
		 * md_bitmap_cond_end_sync().
		 */
3288 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
3289 mddev_is_clustered(mddev) &&
3290 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
3291
3292 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
3293 &sync_blocks, mddev->degraded) &&
3294 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3295 &mddev->recovery)) {
3296
3297 *skipped = 1;
3298 return sync_blocks + sectors_skipped;
3299 }
3300 if (sync_blocks < max_sync)
3301 max_sync = sync_blocks;
3302 r10_bio = raid10_alloc_init_r10buf(conf);
3303 r10_bio->state = 0;
3304
3305 r10_bio->mddev = mddev;
3306 atomic_set(&r10_bio->remaining, 0);
3307 raise_barrier(conf, 0);
3308 conf->next_resync = sector_nr;
3309
3310 r10_bio->master_bio = NULL;
3311 r10_bio->sector = sector_nr;
3312 set_bit(R10BIO_IsSync, &r10_bio->state);
3313 raid10_find_phys(conf, r10_bio);
3314 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3315
3316 for (i = 0; i < conf->copies; i++) {
3317 int d = r10_bio->devs[i].devnum;
3318 sector_t first_bad, sector;
3319 int bad_sectors;
3320 struct md_rdev *rdev;
3321
3322 if (r10_bio->devs[i].repl_bio)
3323 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3324
3325 bio = r10_bio->devs[i].bio;
3326 bio->bi_status = BLK_STS_IOERR;
3327 rcu_read_lock();
3328 rdev = rcu_dereference(conf->mirrors[d].rdev);
3329 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3330 rcu_read_unlock();
3331 continue;
3332 }
3333 sector = r10_bio->devs[i].addr;
3334 if (is_badblock(rdev, sector, max_sync,
3335 &first_bad, &bad_sectors)) {
3336 if (first_bad > sector)
3337 max_sync = first_bad - sector;
3338 else {
3339 bad_sectors -= (sector - first_bad);
3340 if (max_sync > bad_sectors)
3341 max_sync = bad_sectors;
3342 rcu_read_unlock();
3343 continue;
3344 }
3345 }
3346 atomic_inc(&rdev->nr_pending);
3347 atomic_inc(&r10_bio->remaining);
3348 bio->bi_next = biolist;
3349 biolist = bio;
3350 bio->bi_end_io = end_sync_read;
3351 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3352 if (test_bit(FailFast, &rdev->flags))
3353 bio->bi_opf |= MD_FAILFAST;
3354 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3355 bio_set_dev(bio, rdev->bdev);
3356 count++;
3357
3358 rdev = rcu_dereference(conf->mirrors[d].replacement);
3359 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3360 rcu_read_unlock();
3361 continue;
3362 }
3363 atomic_inc(&rdev->nr_pending);
3364
3365
3366 bio = r10_bio->devs[i].repl_bio;
3367 bio->bi_status = BLK_STS_IOERR;
3368
3369 sector = r10_bio->devs[i].addr;
3370 bio->bi_next = biolist;
3371 biolist = bio;
3372 bio->bi_end_io = end_sync_write;
3373 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3374 if (test_bit(FailFast, &rdev->flags))
3375 bio->bi_opf |= MD_FAILFAST;
3376 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3377 bio_set_dev(bio, rdev->bdev);
3378 count++;
3379 rcu_read_unlock();
3380 }
3381
3382 if (count < 2) {
3383 for (i=0; i<conf->copies; i++) {
3384 int d = r10_bio->devs[i].devnum;
3385 if (r10_bio->devs[i].bio->bi_end_io)
3386 rdev_dec_pending(conf->mirrors[d].rdev,
3387 mddev);
3388 if (r10_bio->devs[i].repl_bio &&
3389 r10_bio->devs[i].repl_bio->bi_end_io)
3390 rdev_dec_pending(
3391 conf->mirrors[d].replacement,
3392 mddev);
3393 }
3394 put_buf(r10_bio);
3395 biolist = NULL;
3396 goto giveup;
3397 }
3398 }
3399
3400 nr_sectors = 0;
3401 if (sector_nr + max_sync < max_sector)
3402 max_sector = sector_nr + max_sync;
3403 do {
3404 struct page *page;
3405 int len = PAGE_SIZE;
3406 if (sector_nr + (len>>9) > max_sector)
3407 len = (max_sector - sector_nr) << 9;
3408 if (len == 0)
3409 break;
3410 for (bio= biolist ; bio ; bio=bio->bi_next) {
3411 struct resync_pages *rp = get_resync_pages(bio);
3412 page = resync_fetch_page(rp, page_idx);
			/*
			 * This cannot fail: the bio's vec table is big
			 * enough to hold all of these pages.
			 */
3417 bio_add_page(bio, page, len, 0);
3418 }
3419 nr_sectors += len>>9;
3420 sector_nr += len>>9;
3421 } while (++page_idx < RESYNC_PAGES);
3422 r10_bio->sectors = nr_sectors;
3423
3424 if (mddev_is_clustered(mddev) &&
3425 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3426
3427 if (conf->cluster_sync_high < sector_nr + nr_sectors) {
3428 conf->cluster_sync_low = mddev->curr_resync_completed;
3429 raid10_set_cluster_sync_high(conf);
3430
3431 md_cluster_ops->resync_info_update(mddev,
3432 conf->cluster_sync_low,
3433 conf->cluster_sync_high);
3434 }
3435 } else if (mddev_is_clustered(mddev)) {
3436
3437 sector_t sect_va1, sect_va2;
3438 bool broadcast_msg = false;
3439
3440 for (i = 0; i < conf->geo.raid_disks; i++) {
3441
3442
3443
3444
3445
3446 sect_va1 = raid10_find_virt(conf, sector_nr, i);
3447
3448 if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
3449 broadcast_msg = true;
3450
3451
3452
3453
3454 sect_va2 = raid10_find_virt(conf,
3455 mddev->curr_resync_completed, i);
3456
3457 if (conf->cluster_sync_low == 0 ||
3458 conf->cluster_sync_low > sect_va2)
3459 conf->cluster_sync_low = sect_va2;
3460 }
3461 }
3462 if (broadcast_msg) {
3463 raid10_set_cluster_sync_high(conf);
3464 md_cluster_ops->resync_info_update(mddev,
3465 conf->cluster_sync_low,
3466 conf->cluster_sync_high);
3467 }
3468 }
3469
3470 while (biolist) {
3471 bio = biolist;
3472 biolist = biolist->bi_next;
3473
3474 bio->bi_next = NULL;
3475 r10_bio = get_resync_r10bio(bio);
3476 r10_bio->sectors = nr_sectors;
3477
3478 if (bio->bi_end_io == end_sync_read) {
3479 md_sync_acct_bio(bio, nr_sectors);
3480 bio->bi_status = 0;
3481 submit_bio_noacct(bio);
3482 }
3483 }
3484
3485 if (sectors_skipped)
		/*
		 * Pretend they weren't skipped; it makes no important
		 * difference at this point.
		 */
3489 md_done_sync(mddev, sectors_skipped, 1);
3490
3491 return sectors_skipped + nr_sectors;
3492 giveup:
	/*
	 * There is nowhere to write, so all non-sync drives must be failed
	 * or in resync, or every drive has a bad block here; try the next
	 * chunk instead.
	 */
3497 if (sector_nr + max_sync < max_sector)
3498 max_sector = sector_nr + max_sync;
3499
3500 sectors_skipped += (max_sector - sector_nr);
3501 chunks_skipped ++;
3502 sector_nr = max_sector;
3503 goto skipped;
3504}
3505
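/*
 * Return the array size (in sectors) that @raid_disks devices of @sectors
 * each provide with the current geometry.  Zero arguments mean "use the
 * configured values".
 */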
3506static sector_t
3507raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3508{
3509 sector_t size;
3510 struct r10conf *conf = mddev->private;
3511
3512 if (!raid_disks)
3513 raid_disks = min(conf->geo.raid_disks,
3514 conf->prev.raid_disks);
3515 if (!sectors)
3516 sectors = conf->dev_sectors;
3517
3518 size = sectors >> conf->geo.chunk_shift;
3519 sector_div(size, conf->geo.far_copies);
3520 size = size * raid_disks;
3521 sector_div(size, conf->geo.near_copies);
3522
3523 return size << conf->geo.chunk_shift;
3524}
3525
3526static void calc_sectors(struct r10conf *conf, sector_t size)
3527{
	/*
	 * Calculate the number of sectors per device that will actually be
	 * used, and set conf->dev_sectors and conf->geo.stride.
	 */
3533 size = size >> conf->geo.chunk_shift;
3534 sector_div(size, conf->geo.far_copies);
3535 size = size * conf->geo.raid_disks;
3536 sector_div(size, conf->geo.near_copies);
3537
	/* 'size' is now the number of distinct data chunks in the array */
	/* multiply by copies to get the total number of chunk copies stored */
3540
	/*
	 * Round up when dividing by raid_disks to get the number of chunks
	 * that must fit on each device.
	 */
3544 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3545
3546 conf->dev_sectors = size << conf->geo.chunk_shift;
3547
3548 if (conf->geo.far_offset)
3549 conf->geo.stride = 1 << conf->geo.chunk_shift;
3550 else {
3551 sector_div(size, conf->geo.far_copies);
3552 conf->geo.stride = size << conf->geo.chunk_shift;
3553 }
3554}
3555
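/*
 * Decode the raid10 layout word into a struct geom.  The low byte is the
 * number of near copies, the next byte the number of far copies, bit 16
 * selects the "far offset" variant and bits 17-18 select how far sets are
 * sized.  Returns the total number of copies, or a negative value if the
 * layout or chunk size is invalid.
 */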
3556enum geo_type {geo_new, geo_old, geo_start};
3557static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3558{
3559 int nc, fc, fo;
3560 int layout, chunk, disks;
3561 switch (new) {
3562 case geo_old:
3563 layout = mddev->layout;
3564 chunk = mddev->chunk_sectors;
3565 disks = mddev->raid_disks - mddev->delta_disks;
3566 break;
3567 case geo_new:
3568 layout = mddev->new_layout;
3569 chunk = mddev->new_chunk_sectors;
3570 disks = mddev->raid_disks;
3571 break;
3572 default:
3573 case geo_start:
3574
3575 layout = mddev->new_layout;
3576 chunk = mddev->new_chunk_sectors;
3577 disks = mddev->raid_disks + mddev->delta_disks;
3578 break;
3579 }
3580 if (layout >> 19)
3581 return -1;
3582 if (chunk < (PAGE_SIZE >> 9) ||
3583 !is_power_of_2(chunk))
3584 return -2;
3585 nc = layout & 255;
3586 fc = (layout >> 8) & 255;
3587 fo = layout & (1<<16);
3588 geo->raid_disks = disks;
3589 geo->near_copies = nc;
3590 geo->far_copies = fc;
3591 geo->far_offset = fo;
3592 switch (layout >> 17) {
3593 case 0:
3594 geo->far_set_size = disks;
3595 break;
3596 case 1:
3597
3598 geo->far_set_size = disks/fc;
3599 WARN(geo->far_set_size < fc,
3600 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3601 break;
3602 case 2:
3603 geo->far_set_size = fc * nc;
3604 break;
3605 default:
3606 return -1;
3607 }
3608 geo->chunk_mask = chunk - 1;
3609 geo->chunk_shift = ffz(~chunk);
3610 return nc*fc;
3611}
3612
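/*
 * Allocate and initialise an r10conf for @mddev from its (new) geometry,
 * including the r10bio mempool, the bio split set and the raid10 thread.
 * Returns an ERR_PTR on failure.
 */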
3613static struct r10conf *setup_conf(struct mddev *mddev)
3614{
3615 struct r10conf *conf = NULL;
3616 int err = -EINVAL;
3617 struct geom geo;
3618 int copies;
3619
3620 copies = setup_geo(&geo, mddev, geo_new);
3621
3622 if (copies == -2) {
3623 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3624 mdname(mddev), PAGE_SIZE);
3625 goto out;
3626 }
3627
3628 if (copies < 2 || copies > mddev->raid_disks) {
3629 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3630 mdname(mddev), mddev->new_layout);
3631 goto out;
3632 }
3633
3634 err = -ENOMEM;
3635 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3636 if (!conf)
3637 goto out;
3638
	/* allow room for both the old and new geometry if a reshape is shrinking the array */
3640 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
3641 sizeof(struct raid10_info),
3642 GFP_KERNEL);
3643 if (!conf->mirrors)
3644 goto out;
3645
3646 conf->tmppage = alloc_page(GFP_KERNEL);
3647 if (!conf->tmppage)
3648 goto out;
3649
3650 conf->geo = geo;
3651 conf->copies = copies;
3652 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
3653 rbio_pool_free, conf);
3654 if (err)
3655 goto out;
3656
3657 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
3658 if (err)
3659 goto out;
3660
3661 calc_sectors(conf, mddev->dev_sectors);
3662 if (mddev->reshape_position == MaxSector) {
3663 conf->prev = conf->geo;
3664 conf->reshape_progress = MaxSector;
3665 } else {
3666 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3667 err = -EINVAL;
3668 goto out;
3669 }
3670 conf->reshape_progress = mddev->reshape_position;
3671 if (conf->prev.far_offset)
3672 conf->prev.stride = 1 << conf->prev.chunk_shift;
3673 else
			/* far_copies must be 1 here */
3675 conf->prev.stride = conf->dev_sectors;
3676 }
3677 conf->reshape_safe = conf->reshape_progress;
3678 spin_lock_init(&conf->device_lock);
3679 INIT_LIST_HEAD(&conf->retry_list);
3680 INIT_LIST_HEAD(&conf->bio_end_io_list);
3681
3682 spin_lock_init(&conf->resync_lock);
3683 init_waitqueue_head(&conf->wait_barrier);
3684 atomic_set(&conf->nr_pending, 0);
3685
3686 err = -ENOMEM;
3687 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3688 if (!conf->thread)
3689 goto out;
3690
3691 conf->mddev = mddev;
3692 return conf;
3693
3694 out:
3695 if (conf) {
3696 mempool_exit(&conf->r10bio_pool);
3697 kfree(conf->mirrors);
3698 safe_put_page(conf->tmppage);
3699 bioset_exit(&conf->bio_split);
3700 kfree(conf);
3701 }
3702 return ERR_PTR(err);
3703}
3704
3705static void raid10_set_io_opt(struct r10conf *conf)
3706{
3707 int raid_disks = conf->geo.raid_disks;
3708
3709 if (!(conf->geo.raid_disks % conf->geo.near_copies))
3710 raid_disks /= conf->geo.near_copies;
3711 blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
3712 raid_disks);
3713}
3714
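/*
 * Start the array: validate the configuration, bind each rdev to its slot,
 * set queue limits, compute the degraded count and, if a reshape was in
 * progress, restart the reshape thread.
 */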
3715static int raid10_run(struct mddev *mddev)
3716{
3717 struct r10conf *conf;
3718 int i, disk_idx;
3719 struct raid10_info *disk;
3720 struct md_rdev *rdev;
3721 sector_t size;
3722 sector_t min_offset_diff = 0;
3723 int first = 1;
3724 bool discard_supported = false;
3725
3726 if (mddev_init_writes_pending(mddev) < 0)
3727 return -ENOMEM;
3728
3729 if (mddev->private == NULL) {
3730 conf = setup_conf(mddev);
3731 if (IS_ERR(conf))
3732 return PTR_ERR(conf);
3733 mddev->private = conf;
3734 }
3735 conf = mddev->private;
3736 if (!conf)
3737 goto out;
3738
3739 if (mddev_is_clustered(conf->mddev)) {
3740 int fc, fo;
3741
3742 fc = (mddev->layout >> 8) & 255;
3743 fo = mddev->layout & (1<<16);
3744 if (fc > 1 || fo > 0) {
			pr_err("only near layout is supported by clustered raid10\n");
3747 goto out_free_conf;
3748 }
3749 }
3750
3751 mddev->thread = conf->thread;
3752 conf->thread = NULL;
3753
3754 if (mddev->queue) {
3755 blk_queue_max_discard_sectors(mddev->queue,
3756 mddev->chunk_sectors);
3757 blk_queue_max_write_same_sectors(mddev->queue, 0);
3758 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
3759 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
3760 raid10_set_io_opt(conf);
3761 }
3762
3763 rdev_for_each(rdev, mddev) {
3764 long long diff;
3765
3766 disk_idx = rdev->raid_disk;
3767 if (disk_idx < 0)
3768 continue;
3769 if (disk_idx >= conf->geo.raid_disks &&
3770 disk_idx >= conf->prev.raid_disks)
3771 continue;
3772 disk = conf->mirrors + disk_idx;
3773
3774 if (test_bit(Replacement, &rdev->flags)) {
3775 if (disk->replacement)
3776 goto out_free_conf;
3777 disk->replacement = rdev;
3778 } else {
3779 if (disk->rdev)
3780 goto out_free_conf;
3781 disk->rdev = rdev;
3782 }
3783 diff = (rdev->new_data_offset - rdev->data_offset);
3784 if (!mddev->reshape_backwards)
3785 diff = -diff;
3786 if (diff < 0)
3787 diff = 0;
3788 if (first || diff < min_offset_diff)
3789 min_offset_diff = diff;
3790
3791 if (mddev->gendisk)
3792 disk_stack_limits(mddev->gendisk, rdev->bdev,
3793 rdev->data_offset << 9);
3794
3795 disk->head_position = 0;
3796
3797 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3798 discard_supported = true;
3799 first = 0;
3800 }
3801
3802 if (mddev->queue) {
3803 if (discard_supported)
3804 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
3805 mddev->queue);
3806 else
3807 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
3808 mddev->queue);
3809 }
3810
3811 if (!enough(conf, -1)) {
3812 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3813 mdname(mddev));
3814 goto out_free_conf;
3815 }
3816
3817 if (conf->reshape_progress != MaxSector) {
		/* must ensure that the shape change is supported */
3819 if (conf->geo.far_copies != 1 &&
3820 conf->geo.far_offset == 0)
3821 goto out_free_conf;
3822 if (conf->prev.far_copies != 1 &&
3823 conf->prev.far_offset == 0)
3824 goto out_free_conf;
3825 }
3826
3827 mddev->degraded = 0;
3828 for (i = 0;
3829 i < conf->geo.raid_disks
3830 || i < conf->prev.raid_disks;
3831 i++) {
3832
3833 disk = conf->mirrors + i;
3834
3835 if (!disk->rdev && disk->replacement) {
3836
3837 disk->rdev = disk->replacement;
3838 disk->replacement = NULL;
3839 clear_bit(Replacement, &disk->rdev->flags);
3840 }
3841
3842 if (!disk->rdev ||
3843 !test_bit(In_sync, &disk->rdev->flags)) {
3844 disk->head_position = 0;
3845 mddev->degraded++;
3846 if (disk->rdev &&
3847 disk->rdev->saved_raid_disk < 0)
3848 conf->fullsync = 1;
3849 }
3850
3851 if (disk->replacement &&
3852 !test_bit(In_sync, &disk->replacement->flags) &&
3853 disk->replacement->saved_raid_disk < 0) {
3854 conf->fullsync = 1;
3855 }
3856
3857 disk->recovery_disabled = mddev->recovery_disabled - 1;
3858 }
3859
3860 if (mddev->recovery_cp != MaxSector)
3861 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3862 mdname(mddev));
3863 pr_info("md/raid10:%s: active with %d out of %d devices\n",
3864 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3865 conf->geo.raid_disks);
3866

	/* everything checks out; publish the array and per-device sizes */
3869 mddev->dev_sectors = conf->dev_sectors;
3870 size = raid10_size(mddev, 0, 0);
3871 md_set_array_sectors(mddev, size);
3872 mddev->resync_max_sectors = size;
3873 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3874
3875 if (md_integrity_register(mddev))
3876 goto out_free_conf;
3877
3878 if (conf->reshape_progress != MaxSector) {
3879 unsigned long before_length, after_length;
3880
3881 before_length = ((1 << conf->prev.chunk_shift) *
3882 conf->prev.far_copies);
3883 after_length = ((1 << conf->geo.chunk_shift) *
3884 conf->geo.far_copies);
3885
3886 if (max(before_length, after_length) > min_offset_diff) {
3887
3888 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3889 goto out_free_conf;
3890 }
3891 conf->offset_diff = min_offset_diff;
3892
3893 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3894 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3895 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3896 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3897 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3898 "reshape");
3899 if (!mddev->sync_thread)
3900 goto out_free_conf;
3901 }
3902
3903 return 0;
3904
3905out_free_conf:
3906 md_unregister_thread(&mddev->thread);
3907 mempool_exit(&conf->r10bio_pool);
3908 safe_put_page(conf->tmppage);
3909 kfree(conf->mirrors);
3910 kfree(conf);
3911 mddev->private = NULL;
3912out:
3913 return -EIO;
3914}
3915
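/* Release everything allocated by setup_conf()/raid10_run(). */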
3916static void raid10_free(struct mddev *mddev, void *priv)
3917{
3918 struct r10conf *conf = priv;
3919
3920 mempool_exit(&conf->r10bio_pool);
3921 safe_put_page(conf->tmppage);
3922 kfree(conf->mirrors);
3923 kfree(conf->mirrors_old);
3924 kfree(conf->mirrors_new);
3925 bioset_exit(&conf->bio_split);
3926 kfree(conf);
3927}
3928
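/*
 * Quiesce (or resume) the array by raising (or lowering) the resync
 * barrier, which excludes new normal I/O.
 */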
3929static void raid10_quiesce(struct mddev *mddev, int quiesce)
3930{
3931 struct r10conf *conf = mddev->private;
3932
3933 if (quiesce)
3934 raise_barrier(conf, 0);
3935 else
3936 lower_barrier(conf);
3937}
3938
3939static int raid10_resize(struct mddev *mddev, sector_t sectors)
3940{
	/*
	 * Resize of 'far' arrays is not supported.
	 * For 'near' and 'offset' arrays we can set the number of sectors
	 * used to be an appropriate multiple of the chunk size: for 'offset'
	 * that is far_copies * chunksize, for 'near' it is the LCM of
	 * near_copies and raid_disks times the chunk size.
	 * So: if far_copies > 1 && !far_offset, fail; otherwise
	 * calc_sectors() below rounds the new size to a suitable multiple.
	 */
3953 struct r10conf *conf = mddev->private;
3954 sector_t oldsize, size;
3955
3956 if (mddev->reshape_position != MaxSector)
3957 return -EBUSY;
3958
3959 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3960 return -EINVAL;
3961
3962 oldsize = raid10_size(mddev, 0, 0);
3963 size = raid10_size(mddev, sectors, 0);
3964 if (mddev->external_size &&
3965 mddev->array_sectors > size)
3966 return -EINVAL;
3967 if (mddev->bitmap) {
3968 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
3969 if (ret)
3970 return ret;
3971 }
3972 md_set_array_sectors(mddev, size);
3973 if (sectors > mddev->dev_sectors &&
3974 mddev->recovery_cp > oldsize) {
3975 mddev->recovery_cp = oldsize;
3976 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3977 }
3978 calc_sectors(conf, sectors);
3979 mddev->dev_sectors = conf->dev_sectors;
3980 mddev->resync_max_sectors = size;
3981 return 0;
3982}
3983
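/*
 * Convert a single-zone raid0 array into a degraded 2-copy 'near' raid10
 * by doubling raid_disks and leaving the new slots empty.
 */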
3984static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
3985{
3986 struct md_rdev *rdev;
3987 struct r10conf *conf;
3988
3989 if (mddev->degraded > 0) {
3990 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
3991 mdname(mddev));
3992 return ERR_PTR(-EINVAL);
3993 }
3994 sector_div(size, devs);
3995
	/* set the new geometry: 2 near copies over twice as many devices */
3997 mddev->new_level = 10;
3998
3999 mddev->new_layout = (1<<8) + 2;
4000 mddev->new_chunk_sectors = mddev->chunk_sectors;
4001 mddev->delta_disks = mddev->raid_disks;
4002 mddev->raid_disks *= 2;
4003
4004 mddev->recovery_cp = MaxSector;
4005 mddev->dev_sectors = size;
4006
4007 conf = setup_conf(mddev);
4008 if (!IS_ERR(conf)) {
4009 rdev_for_each(rdev, mddev)
4010 if (rdev->raid_disk >= 0) {
4011 rdev->new_raid_disk = rdev->raid_disk * 2;
4012 rdev->sectors = size;
4013 }
4014 conf->barrier = 1;
4015 }
4016
4017 return conf;
4018}
4019
4020static void *raid10_takeover(struct mddev *mddev)
4021{
4022 struct r0conf *raid0_conf;
4023
	/*
	 * raid10 can take over:
	 *  raid0 - providing it has only one zone
	 */
4027 if (mddev->level == 0) {
		/* for raid0 takeover only one zone is supported */
4029 raid0_conf = mddev->private;
4030 if (raid0_conf->nr_strip_zones > 1) {
4031 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4032 mdname(mddev));
4033 return ERR_PTR(-EINVAL);
4034 }
4035 return raid10_takeover_raid0(mddev,
4036 raid0_conf->strip_zone->zone_end,
4037 raid0_conf->strip_zone->nb_dev);
4038 }
4039 return ERR_PTR(-EINVAL);
4040}
4041
4042static int raid10_check_reshape(struct mddev *mddev)
4043{
	/*
	 * Called when there is a request to change
	 * - layout (to ->new_layout)
	 * - chunk size (to ->new_chunk_sectors)
	 * - raid_disks (by delta_disks)
	 * or when trying to restart a reshape that was ongoing.
	 *
	 * We need to validate the request and possibly allocate
	 * space if that might be an issue later.
	 *
	 * Currently we reject any reshape of a 'far' mode array,
	 * allow the chunk size to change, allow raid_disks to increase,
	 * and allow a switch between 'near' and 'offset' modes.
	 */
4058 struct r10conf *conf = mddev->private;
4059 struct geom geo;
4060
4061 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4062 return -EINVAL;
4063
4064 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
		/* cannot change the number of copies */
4066 return -EINVAL;
4067 if (geo.far_copies > 1 && !geo.far_offset)
		/* cannot switch to 'far' mode without far_offset */
4069 return -EINVAL;
4070
4071 if (mddev->array_sectors & geo.chunk_mask)
		/* new chunk size is not a factor of the current array size */
4073 return -EINVAL;
4074
4075 if (!enough(conf, -1))
4076 return -EINVAL;
4077
4078 kfree(conf->mirrors_new);
4079 conf->mirrors_new = NULL;
4080 if (mddev->delta_disks > 0) {
		/* allocate a new, larger 'mirrors' array */
4082 conf->mirrors_new =
4083 kcalloc(mddev->raid_disks + mddev->delta_disks,
4084 sizeof(struct raid10_info),
4085 GFP_KERNEL);
4086 if (!conf->mirrors_new)
4087 return -ENOMEM;
4088 }
4089 return 0;
4090}
4091
/*
 * Need to check if the array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.  However if
 * there is a reshape, we need to carefully check both the 'previous' and
 * 'new' sections of the array: some failed devices may only affect one of
 * the two, and some non-in_sync devices may be in_sync in the section most
 * affected by failed devices.
 */
4105static int calc_degraded(struct r10conf *conf)
4106{
4107 int degraded, degraded2;
4108 int i;
4109
4110 rcu_read_lock();
4111 degraded = 0;
4112
4113 for (i = 0; i < conf->prev.raid_disks; i++) {
4114 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4115 if (!rdev || test_bit(Faulty, &rdev->flags))
4116 degraded++;
4117 else if (!test_bit(In_sync, &rdev->flags))
			/*
			 * When we can reduce the number of devices in an
			 * array, this might not contribute to 'degraded'.
			 * It does now.
			 */
4122 degraded++;
4123 }
4124 rcu_read_unlock();
4125 if (conf->geo.raid_disks == conf->prev.raid_disks)
4126 return degraded;
4127 rcu_read_lock();
4128 degraded2 = 0;
4129 for (i = 0; i < conf->geo.raid_disks; i++) {
4130 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4131 if (!rdev || test_bit(Faulty, &rdev->flags))
4132 degraded2++;
4133 else if (!test_bit(In_sync, &rdev->flags)) {
			/*
			 * If the reshape is increasing the number of
			 * devices, this section has already been recovered,
			 * so it doesn't contribute to degraded; otherwise
			 * it does.
			 */
4139 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4140 degraded2++;
4141 }
4142 }
4143 rcu_read_unlock();
4144 if (degraded2 > degraded)
4145 return degraded2;
4146 return degraded;
4147}
4148
4149static int raid10_start_reshape(struct mddev *mddev)
4150{
	/*
	 * A 'reshape' has been requested.  This commits the various 'new'
	 * fields and sets MD_RECOVERY_RESHAPE.
	 * This also checks if there are enough spares and adds them to the
	 * array.  We currently require enough spares to make the final array
	 * non-degraded.  We also require that the difference between the old
	 * and new data_offset - on each device - is large enough that we
	 * never risk over-writing live data.
	 */
4161 unsigned long before_length, after_length;
4162 sector_t min_offset_diff = 0;
4163 int first = 1;
4164 struct geom new;
4165 struct r10conf *conf = mddev->private;
4166 struct md_rdev *rdev;
4167 int spares = 0;
4168 int ret;
4169
4170 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4171 return -EBUSY;
4172
4173 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4174 return -EINVAL;
4175
4176 before_length = ((1 << conf->prev.chunk_shift) *
4177 conf->prev.far_copies);
4178 after_length = ((1 << conf->geo.chunk_shift) *
4179 conf->geo.far_copies);
4180
4181 rdev_for_each(rdev, mddev) {
4182 if (!test_bit(In_sync, &rdev->flags)
4183 && !test_bit(Faulty, &rdev->flags))
4184 spares++;
4185 if (rdev->raid_disk >= 0) {
4186 long long diff = (rdev->new_data_offset
4187 - rdev->data_offset);
4188 if (!mddev->reshape_backwards)
4189 diff = -diff;
4190 if (diff < 0)
4191 diff = 0;
4192 if (first || diff < min_offset_diff)
4193 min_offset_diff = diff;
4194 first = 0;
4195 }
4196 }
4197
4198 if (max(before_length, after_length) > min_offset_diff)
4199 return -EINVAL;
4200
4201 if (spares < mddev->delta_disks)
4202 return -EINVAL;
4203
4204 conf->offset_diff = min_offset_diff;
4205 spin_lock_irq(&conf->device_lock);
4206 if (conf->mirrors_new) {
4207 memcpy(conf->mirrors_new, conf->mirrors,
4208 sizeof(struct raid10_info)*conf->prev.raid_disks);
4209 smp_mb();
4210 kfree(conf->mirrors_old);
4211 conf->mirrors_old = conf->mirrors;
4212 conf->mirrors = conf->mirrors_new;
4213 conf->mirrors_new = NULL;
4214 }
4215 setup_geo(&conf->geo, mddev, geo_start);
4216 smp_mb();
4217 if (mddev->reshape_backwards) {
4218 sector_t size = raid10_size(mddev, 0, 0);
4219 if (size < mddev->array_sectors) {
4220 spin_unlock_irq(&conf->device_lock);
			pr_warn("md/raid10:%s: array size must be reduced before the number of disks\n",
4222 mdname(mddev));
4223 return -EINVAL;
4224 }
4225 mddev->resync_max_sectors = size;
4226 conf->reshape_progress = size;
4227 } else
4228 conf->reshape_progress = 0;
4229 conf->reshape_safe = conf->reshape_progress;
4230 spin_unlock_irq(&conf->device_lock);
4231
4232 if (mddev->delta_disks && mddev->bitmap) {
4233 struct mdp_superblock_1 *sb = NULL;
4234 sector_t oldsize, newsize;
4235
4236 oldsize = raid10_size(mddev, 0, 0);
4237 newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
4238
4239 if (!mddev_is_clustered(mddev)) {
4240 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4241 if (ret)
4242 goto abort;
4243 else
4244 goto out;
4245 }
4246
4247 rdev_for_each(rdev, mddev) {
4248 if (rdev->raid_disk > -1 &&
4249 !test_bit(Faulty, &rdev->flags))
4250 sb = page_address(rdev->sb_page);
4251 }
4252
		/*
		 * Some node is already performing a reshape; there is no
		 * need to call md_bitmap_resize again since it will be
		 * invoked when this node receives the BITMAP_RESIZE message.
		 */
4258 if ((sb && (le32_to_cpu(sb->feature_map) &
4259 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
4260 goto out;
4261
4262 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4263 if (ret)
4264 goto abort;
4265
4266 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
4267 if (ret) {
4268 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
4269 goto abort;
4270 }
4271 }
4272out:
4273 if (mddev->delta_disks > 0) {
4274 rdev_for_each(rdev, mddev)
4275 if (rdev->raid_disk < 0 &&
4276 !test_bit(Faulty, &rdev->flags)) {
4277 if (raid10_add_disk(mddev, rdev) == 0) {
4278 if (rdev->raid_disk >=
4279 conf->prev.raid_disks)
4280 set_bit(In_sync, &rdev->flags);
4281 else
4282 rdev->recovery_offset = 0;
4283
4284
4285 sysfs_link_rdev(mddev, rdev);
4286 }
4287 } else if (rdev->raid_disk >= conf->prev.raid_disks
4288 && !test_bit(Faulty, &rdev->flags)) {
4289
4290 set_bit(In_sync, &rdev->flags);
4291 }
4292 }
	/*
	 * When a reshape changes the number of devices, ->degraded is
	 * measured against the larger of the pre and post numbers.
	 */
4297 spin_lock_irq(&conf->device_lock);
4298 mddev->degraded = calc_degraded(conf);
4299 spin_unlock_irq(&conf->device_lock);
4300 mddev->raid_disks = conf->geo.raid_disks;
4301 mddev->reshape_position = conf->reshape_progress;
4302 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4303
4304 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4305 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4306 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4307 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4308 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4309
4310 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4311 "reshape");
4312 if (!mddev->sync_thread) {
4313 ret = -EAGAIN;
4314 goto abort;
4315 }
4316 conf->reshape_checkpoint = jiffies;
4317 md_wakeup_thread(mddev->sync_thread);
4318 md_new_event(mddev);
4319 return 0;
4320
4321abort:
4322 mddev->recovery = 0;
4323 spin_lock_irq(&conf->device_lock);
4324 conf->geo = conf->prev;
4325 mddev->raid_disks = conf->geo.raid_disks;
4326 rdev_for_each(rdev, mddev)
4327 rdev->new_data_offset = rdev->data_offset;
4328 smp_wmb();
4329 conf->reshape_progress = MaxSector;
4330 conf->reshape_safe = MaxSector;
4331 mddev->reshape_position = MaxSector;
4332 spin_unlock_irq(&conf->device_lock);
4333 return ret;
4334}
4335
/*
 * Calculate the last device-address that could contain any block from the
 * chunk that includes the array-address 's', and report the next address.
 * i.e. the returned address is chunk-aligned and after any data that is in
 * the chunk containing 's'.  Used by both reshape_request and
 * reshape_request_write.
 */
4342static sector_t last_dev_address(sector_t s, struct geom *geo)
4343{
4344 s = (s | geo->chunk_mask) + 1;
4345 s >>= geo->chunk_shift;
4346 s *= geo->near_copies;
4347 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4348 s *= geo->far_copies;
4349 s <<= geo->chunk_shift;
4350 return s;
4351}
4352
/*
 * Calculate the first device-address that could contain any block from the
 * chunk that includes the array-address 's'.
 */
4357static sector_t first_dev_address(sector_t s, struct geom *geo)
4358{
4359 s >>= geo->chunk_shift;
4360 s *= geo->near_copies;
4361 sector_div(s, geo->raid_disks);
4362 s *= geo->far_copies;
4363 s <<= geo->chunk_shift;
4364 return s;
4365}
4366
4367static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4368 int *skipped)
4369{
	/*
	 * We copy at most one chunk (smallest of old and new geometry) at a
	 * time, possibly less if that exceeds RESYNC_PAGES.
	 *
	 * For each chunk we:
	 *  - allocate an r10_bio with one bio for the read and one bio per
	 *    target device (and replacement) for the writes,
	 *  - read the chunk from the 'old' layout (read_balance against the
	 *    previous geometry),
	 *  - compute where the data lives in the 'new' layout; the writes
	 *    are submitted later by reshape_request_write() from raid10d
	 *    once the read has completed.
	 *
	 * Before reading we make sure the reshape_position recorded in the
	 * metadata is far enough ahead (or behind, for a backwards reshape)
	 * that a crash cannot leave data needed again only in a region we
	 * are about to overwrite, flushing the metadata when necessary.
	 *
	 * reshape_progress tracks the array address up to which data has
	 * been relocated; reshape_safe is the position last recorded in the
	 * metadata.  "Backwards" reshapes (shrinking arrays) proceed from
	 * the end of the array towards the start.
	 */
4407 struct r10conf *conf = mddev->private;
4408 struct r10bio *r10_bio;
4409 sector_t next, safe, last;
4410 int max_sectors;
4411 int nr_sectors;
4412 int s;
4413 struct md_rdev *rdev;
4414 int need_flush = 0;
4415 struct bio *blist;
4416 struct bio *bio, *read_bio;
4417 int sectors_done = 0;
4418 struct page **pages;
4419
4420 if (sector_nr == 0) {
		/* if restarting in the middle, skip the initial sectors */
4422 if (mddev->reshape_backwards &&
4423 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4424 sector_nr = (raid10_size(mddev, 0, 0)
4425 - conf->reshape_progress);
4426 } else if (!mddev->reshape_backwards &&
4427 conf->reshape_progress > 0)
4428 sector_nr = conf->reshape_progress;
4429 if (sector_nr) {
4430 mddev->curr_resync_completed = sector_nr;
4431 sysfs_notify_dirent_safe(mddev->sysfs_completed);
4432 *skipped = 1;
4433 return sector_nr;
4434 }
4435 }
4436
	/*
	 * We don't use sector_nr to track where we are up to, as that
	 * doesn't work well for ->reshape_backwards; just use
	 * ->reshape_progress instead.
	 */
4441 if (mddev->reshape_backwards) {
		/*
		 * 'next' is the earliest device address that we might write
		 * to for this chunk in the new layout.
		 */
4445 next = first_dev_address(conf->reshape_progress - 1,
4446 &conf->geo);
4447
		/*
		 * 'safe' is the last device address that we might read from
		 * in the old layout after a restart.
		 */
4451 safe = last_dev_address(conf->reshape_safe - 1,
4452 &conf->prev);
4453
4454 if (next + conf->offset_diff < safe)
4455 need_flush = 1;
4456
4457 last = conf->reshape_progress - 1;
4458 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4459 & conf->prev.chunk_mask);
4460 if (sector_nr + RESYNC_SECTORS < last)
4461 sector_nr = last + 1 - RESYNC_SECTORS;
4462 } else {
		/*
		 * 'next' is after the last device address that we might
		 * write to for this chunk in the new layout.
		 */
4466 next = last_dev_address(conf->reshape_progress, &conf->geo);
4467
		/*
		 * 'safe' is the earliest device address that we might read
		 * from in the old layout after a restart.
		 */
4471 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4472
		/*
		 * Need to update the metadata if 'next' might be beyond
		 * 'safe', as that could corrupt data.
		 */
4476 if (next > safe + conf->offset_diff)
4477 need_flush = 1;
4478
4479 sector_nr = conf->reshape_progress;
4480 last = sector_nr | (conf->geo.chunk_mask
4481 & conf->prev.chunk_mask);
4482
4483 if (sector_nr + RESYNC_SECTORS <= last)
4484 last = sector_nr + RESYNC_SECTORS - 1;
4485 }
4486
4487 if (need_flush ||
4488 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
		/* need to update reshape_position in the metadata */
4490 wait_barrier(conf);
4491 mddev->reshape_position = conf->reshape_progress;
4492 if (mddev->reshape_backwards)
4493 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4494 - conf->reshape_progress;
4495 else
4496 mddev->curr_resync_completed = conf->reshape_progress;
4497 conf->reshape_checkpoint = jiffies;
4498 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4499 md_wakeup_thread(mddev->thread);
4500 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4501 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4502 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4503 allow_barrier(conf);
4504 return sectors_done;
4505 }
4506 conf->reshape_safe = mddev->reshape_position;
4507 allow_barrier(conf);
4508 }
4509
4510 raise_barrier(conf, 0);
4511read_more:
4512
4513 r10_bio = raid10_alloc_init_r10buf(conf);
4514 r10_bio->state = 0;
4515 raise_barrier(conf, 1);
4516 atomic_set(&r10_bio->remaining, 0);
4517 r10_bio->mddev = mddev;
4518 r10_bio->sector = sector_nr;
4519 set_bit(R10BIO_IsReshape, &r10_bio->state);
4520 r10_bio->sectors = last - sector_nr + 1;
4521 rdev = read_balance(conf, r10_bio, &max_sectors);
4522 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4523
4524 if (!rdev) {
		/*
		 * Cannot read from anywhere, so abort the reshape; bad
		 * blocks on the target devices are not recorded here.
		 */
4529 mempool_free(r10_bio, &conf->r10buf_pool);
4530 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4531 return sectors_done;
4532 }
4533
4534 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4535
4536 bio_set_dev(read_bio, rdev->bdev);
4537 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4538 + rdev->data_offset);
4539 read_bio->bi_private = r10_bio;
4540 read_bio->bi_end_io = end_reshape_read;
4541 bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4542 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4543 read_bio->bi_status = 0;
4544 read_bio->bi_vcnt = 0;
4545 read_bio->bi_iter.bi_size = 0;
4546 r10_bio->master_bio = read_bio;
4547 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
	/*
	 * Broadcast a RESYNC message to the other nodes so that they will
	 * not write to the region being relocated and cause a conflict.
	 */
4553 if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
4554 struct mdp_superblock_1 *sb = NULL;
4555 int sb_reshape_pos = 0;
4556
4557 conf->cluster_sync_low = sector_nr;
4558 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
4559 sb = page_address(rdev->sb_page);
4560 if (sb) {
4561 sb_reshape_pos = le64_to_cpu(sb->reshape_position);
			/*
			 * Lower cluster_sync_low again if it is above the
			 * reshape_position of another node that is also
			 * performing the reshape.
			 */
4566
4567 if (sb_reshape_pos < conf->cluster_sync_low)
4568 conf->cluster_sync_low = sb_reshape_pos;
4569 }
4570
4571 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
4572 conf->cluster_sync_high);
4573 }
4574
4575
4576 __raid10_find_phys(&conf->geo, r10_bio);
4577
4578 blist = read_bio;
4579 read_bio->bi_next = NULL;
4580
4581 rcu_read_lock();
4582 for (s = 0; s < conf->copies*2; s++) {
4583 struct bio *b;
4584 int d = r10_bio->devs[s/2].devnum;
4585 struct md_rdev *rdev2;
4586 if (s&1) {
4587 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4588 b = r10_bio->devs[s/2].repl_bio;
4589 } else {
4590 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4591 b = r10_bio->devs[s/2].bio;
4592 }
4593 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4594 continue;
4595
4596 bio_set_dev(b, rdev2->bdev);
4597 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4598 rdev2->new_data_offset;
4599 b->bi_end_io = end_reshape_write;
4600 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4601 b->bi_next = blist;
4602 blist = b;
4603 }
4604
4605
4606
4607 nr_sectors = 0;
4608 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4609 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4610 struct page *page = pages[s / (PAGE_SIZE >> 9)];
4611 int len = (max_sectors - s) << 9;
4612 if (len > PAGE_SIZE)
4613 len = PAGE_SIZE;
4614 for (bio = blist; bio ; bio = bio->bi_next) {
			/*
			 * This cannot fail: the vec table is big enough to
			 * hold all of these pages.
			 */
4619 bio_add_page(bio, page, len, 0);
4620 }
4621 sector_nr += len >> 9;
4622 nr_sectors += len >> 9;
4623 }
4624 rcu_read_unlock();
4625 r10_bio->sectors = nr_sectors;
4626
4627
4628 md_sync_acct_bio(read_bio, r10_bio->sectors);
4629 atomic_inc(&r10_bio->remaining);
4630 read_bio->bi_next = NULL;
4631 submit_bio_noacct(read_bio);
4632 sectors_done += nr_sectors;
4633 if (sector_nr <= last)
4634 goto read_more;
4635
4636 lower_barrier(conf);
4637
	/*
	 * Now that we have done the whole section we can update
	 * reshape_progress.
	 */
4641 if (mddev->reshape_backwards)
4642 conf->reshape_progress -= sectors_done;
4643 else
4644 conf->reshape_progress += sectors_done;
4645
4646 return sectors_done;
4647}
4648
4649static void end_reshape_request(struct r10bio *r10_bio);
4650static int handle_reshape_read_error(struct mddev *mddev,
4651 struct r10bio *r10_bio);
4652static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4653{
	/*
	 * The reshape read has completed, so we hopefully have a chunk of
	 * data to write out.  If the read failed, first re-read the data in
	 * page-sized synchronous reads from the old layout.
	 */
4659 struct r10conf *conf = mddev->private;
4660 int s;
4661
4662 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4663 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4664
4665 md_done_sync(mddev, r10_bio->sectors, 0);
4666 return;
4667 }
4668
	/*
	 * We definitely have the data in the pages; schedule the writes to
	 * the new layout.
	 */
4672 atomic_set(&r10_bio->remaining, 1);
4673 for (s = 0; s < conf->copies*2; s++) {
4674 struct bio *b;
4675 int d = r10_bio->devs[s/2].devnum;
4676 struct md_rdev *rdev;
4677 rcu_read_lock();
4678 if (s&1) {
4679 rdev = rcu_dereference(conf->mirrors[d].replacement);
4680 b = r10_bio->devs[s/2].repl_bio;
4681 } else {
4682 rdev = rcu_dereference(conf->mirrors[d].rdev);
4683 b = r10_bio->devs[s/2].bio;
4684 }
4685 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4686 rcu_read_unlock();
4687 continue;
4688 }
4689 atomic_inc(&rdev->nr_pending);
4690 rcu_read_unlock();
4691 md_sync_acct_bio(b, r10_bio->sectors);
4692 atomic_inc(&r10_bio->remaining);
4693 b->bi_next = NULL;
4694 submit_bio_noacct(b);
4695 }
4696 end_reshape_request(r10_bio);
4697}
4698
4699static void end_reshape(struct r10conf *conf)
4700{
4701 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4702 return;
4703
4704 spin_lock_irq(&conf->device_lock);
4705 conf->prev = conf->geo;
4706 md_finish_reshape(conf->mddev);
4707 smp_wmb();
4708 conf->reshape_progress = MaxSector;
4709 conf->reshape_safe = MaxSector;
4710 spin_unlock_irq(&conf->device_lock);
4711
4712 if (conf->mddev->queue)
4713 raid10_set_io_opt(conf);
4714 conf->fullsync = 0;
4715}
4716
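/*
 * Clustered raid10: adopt the reshape position announced by another node,
 * provided it lies inside the resync window that node reported.
 */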
4717static void raid10_update_reshape_pos(struct mddev *mddev)
4718{
4719 struct r10conf *conf = mddev->private;
4720 sector_t lo, hi;
4721
4722 md_cluster_ops->resync_info_get(mddev, &lo, &hi);
4723 if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
4724 || mddev->reshape_position == MaxSector)
4725 conf->reshape_progress = mddev->reshape_position;
4726 else
4727 WARN_ON_ONCE(1);
4728}
4729
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio)
{
	/* Use sync reads to get the blocks from somewhere else */
	int sectors = r10_bio->sectors;
	struct r10conf *conf = mddev->private;
	struct r10bio *r10b;
	int slot = 0;
	int idx = 0;
	struct page **pages;

	r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
	if (!r10b) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return -ENOMEM;
	}

	/* reshape IOs share pages from .devs[0].bio */
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;

	r10b->sector = r10_bio->sector;
	__raid10_find_phys(&conf->prev, r10b);

	while (sectors) {
		int s = sectors;
		int success = 0;
		int first_slot = slot;

		if (s > (PAGE_SIZE >> 9))
			s = PAGE_SIZE >> 9;

		rcu_read_lock();
		while (!success) {
			int d = r10b->devs[slot].devnum;
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			sector_t addr;
			if (rdev == NULL ||
			    test_bit(Faulty, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				goto failed;

			addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			success = sync_page_io(rdev,
					       addr,
					       s << 9,
					       pages[idx],
					       REQ_OP_READ, 0, false);
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
			if (success)
				break;
		failed:
			slot++;
			if (slot >= conf->copies)
				slot = 0;
			if (slot == first_slot)
				break;
		}
		rcu_read_unlock();
		if (!success) {
			/* couldn't read this block, must give up */
			set_bit(MD_RECOVERY_INTR,
				&mddev->recovery);
			kfree(r10b);
			return -EIO;
		}
		sectors -= s;
		idx++;
	}
	kfree(r10b);
	return 0;
}

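/*
 * Completion handler for the writes issued by reshape_request_write().
 * A failed write is reported via md_error(); the r10_bio is completed
 * once all outstanding writes on it have finished.
 */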
static void end_reshape_write(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	if (!rdev) {
		smp_mb();
		rdev = conf->mirrors[d].rdev;
	}

	if (bio->bi_status) {
		/* FIXME should record badblock */
		md_error(mddev, rdev);
	}

	rdev_dec_pending(rdev, mddev);
	end_reshape_request(r10_bio);
}

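/*
 * Drop one reference on a reshape r10_bio; when the last reference is
 * gone, account the sectors as synced and release the bio and buffers.
 */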
static void end_reshape_request(struct r10bio *r10_bio)
{
	if (!atomic_dec_and_test(&r10_bio->remaining))
		return;
	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
	bio_put(r10_bio->master_bio);
	put_buf(r10_bio);
}

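/*
 * Called by the MD core after the reshape has fully completed.  If disks
 * were added, the array may now be larger, so the resync limit is raised
 * and a resync of the new space may be scheduled; if disks were removed,
 * the now-unused devices are marked not-in-sync.  Finally the new layout
 * becomes the official one.
 */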
static void raid10_finish_reshape(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		return;

	if (mddev->delta_disks > 0) {
		if (mddev->recovery_cp > mddev->resync_max_sectors) {
			mddev->recovery_cp = mddev->resync_max_sectors;
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		}
		mddev->resync_max_sectors = mddev->array_sectors;
	} else {
		int d;
		rcu_read_lock();
		for (d = conf->geo.raid_disks ;
		     d < conf->geo.raid_disks - mddev->delta_disks;
		     d++) {
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
		}
		rcu_read_unlock();
	}
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
	mddev->reshape_position = MaxSector;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
}

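/* The md "personality" that hooks RAID10 into the MD core. */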
static struct md_personality raid10_personality =
{
	.name		= "raid10",
	.level		= 10,
	.owner		= THIS_MODULE,
	.make_request	= raid10_make_request,
	.run		= raid10_run,
	.free		= raid10_free,
	.status		= raid10_status,
	.error_handler	= raid10_error,
	.hot_add_disk	= raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= raid10_sync_request,
	.quiesce	= raid10_quiesce,
	.size		= raid10_size,
	.resize		= raid10_resize,
	.takeover	= raid10_takeover,
	.check_reshape	= raid10_check_reshape,
	.start_reshape	= raid10_start_reshape,
	.finish_reshape	= raid10_finish_reshape,
	.update_reshape_pos = raid10_update_reshape_pos,
};

static int __init raid_init(void)
{
	return register_md_personality(&raid10_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");

module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);