/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * RAID-10 support for md.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"

/*
 * RAID-10 management functions.
 *
 * The geometry of a RAID-10 array is described by 'struct geom' (see
 * raid10.h): raid_disks and the chunk size, plus near_copies, far_copies,
 * far_offset and far_set_size, which together determine how many copies of
 * each chunk are kept and whether the extra copies live on adjacent devices
 * ("near"), in a later section of every device ("far"), or at a fixed
 * device offset ("offset").  __raid10_find_phys() below maps a logical
 * array sector to its physical (device, sector) copies.
 */

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
                                int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

#define raid10_log(md, fmt, args...) \
        do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)

#include "raid1-10.c"

/*
 * For resync/recovery bios the r10bio is reachable through the
 * 'struct resync_pages' stashed in bi_private (see raid1-10.c).
 */
static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{
        return get_resync_pages(bio)->raid_bio;
}

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
        struct r10conf *conf = data;
        int size = offsetof(struct r10bio, devs[conf->copies]);

        /* allocate a r10bio with room for conf->copies entries in the devs array */
        return kzalloc(size, gfp_flags);
}

#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

/*
 * When performing a resync we need to read and compare, so we need as many
 * pages as there are copies.  When performing a recovery, we need 2 bios,
 * one for read, one for write (we recover only one drive per r10buf).
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
        struct r10conf *conf = data;
        struct r10bio *r10_bio;
        struct bio *bio;
        int j;
        int nalloc, nalloc_rp;
        struct resync_pages *rps;

        r10_bio = r10bio_pool_alloc(gfp_flags, conf);
        if (!r10_bio)
                return NULL;

        if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
            test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
                nalloc = conf->copies; /* resync */
        else
                nalloc = 2; /* recovery */

        /* allocate the resync_pages array once for all bios */
        if (!conf->have_replacement)
                nalloc_rp = nalloc;
        else
                nalloc_rp = nalloc * 2;
        rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
        if (!rps)
                goto out_free_r10bio;

        /*
         * Allocate bios.
         */
        for (j = nalloc ; j-- ; ) {
                bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
                if (!bio)
                        goto out_free_bio;
                r10_bio->devs[j].bio = bio;
                if (!conf->have_replacement)
                        continue;
                bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
                if (!bio)
                        goto out_free_bio;
                r10_bio->devs[j].repl_bio = bio;
        }
        /*
         * Allocate RESYNC_PAGES data pages: for a resync every bio gets its
         * own pages, otherwise the pages of the first bio are shared.
         */
        for (j = 0; j < nalloc; j++) {
                struct bio *rbio = r10_bio->devs[j].repl_bio;
                struct resync_pages *rp, *rp_repl;

                rp = &rps[j];
                if (rbio)
                        rp_repl = &rps[nalloc + j];

                bio = r10_bio->devs[j].bio;

                if (!j || test_bit(MD_RECOVERY_SYNC,
                                   &conf->mddev->recovery)) {
                        if (resync_alloc_pages(rp, gfp_flags))
                                goto out_free_pages;
                } else {
                        memcpy(rp, &rps[0], sizeof(*rp));
                        resync_get_all_pages(rp);
                }

                rp->raid_bio = r10_bio;
                bio->bi_private = rp;
                if (rbio) {
                        memcpy(rp_repl, rp, sizeof(*rp));
                        rbio->bi_private = rp_repl;
                }
        }

        return r10_bio;

out_free_pages:
        while (--j >= 0)
                resync_free_pages(&rps[j]);

        j = 0;
out_free_bio:
        for ( ; j < nalloc; j++) {
                if (r10_bio->devs[j].bio)
                        bio_put(r10_bio->devs[j].bio);
                if (r10_bio->devs[j].repl_bio)
                        bio_put(r10_bio->devs[j].repl_bio);
        }
        kfree(rps);
out_free_r10bio:
        rbio_pool_free(r10_bio, conf);
        return NULL;
}

static void r10buf_pool_free(void *__r10_bio, void *data)
{
        struct r10conf *conf = data;
        struct r10bio *r10bio = __r10_bio;
        int j;
        struct resync_pages *rp = NULL;

        for (j = conf->copies; j--; ) {
                struct bio *bio = r10bio->devs[j].bio;

                if (bio) {
                        rp = get_resync_pages(bio);
                        resync_free_pages(rp);
                        bio_put(bio);
                }

                bio = r10bio->devs[j].repl_bio;
                if (bio)
                        bio_put(bio);
        }

        /* resync pages array stored in the 1st bio's .bi_private */
        kfree(rp);

        rbio_pool_free(r10bio, conf);
}

static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{
        int i;

        for (i = 0; i < conf->copies; i++) {
                struct bio **bio = & r10_bio->devs[i].bio;
                if (!BIO_SPECIAL(*bio))
                        bio_put(*bio);
                *bio = NULL;
                bio = &r10_bio->devs[i].repl_bio;
                if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
                        bio_put(*bio);
                *bio = NULL;
        }
}

static void free_r10bio(struct r10bio *r10_bio)
{
        struct r10conf *conf = r10_bio->mddev->private;

        put_all_bios(conf, r10_bio);
        mempool_free(r10_bio, &conf->r10bio_pool);
}

static void put_buf(struct r10bio *r10_bio)
{
        struct r10conf *conf = r10_bio->mddev->private;

        mempool_free(r10_bio, &conf->r10buf_pool);

        lower_barrier(conf);
}

static void reschedule_retry(struct r10bio *r10_bio)
{
        unsigned long flags;
        struct mddev *mddev = r10_bio->mddev;
        struct r10conf *conf = mddev->private;

        spin_lock_irqsave(&conf->device_lock, flags);
        list_add(&r10_bio->retry_list, &conf->retry_list);
        conf->nr_queued++;
        spin_unlock_irqrestore(&conf->device_lock, flags);

        /* wake up frozen array... */
        wake_up(&conf->wait_barrier);

        md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(struct r10bio *r10_bio)
{
        struct bio *bio = r10_bio->master_bio;
        struct r10conf *conf = r10_bio->mddev->private;

        if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
                bio->bi_status = BLK_STS_IOERR;

        bio_endio(bio);
        /*
         * Wake up any possible resync thread that waits for the device
         * to go idle.
         */
        allow_barrier(conf);

        free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, struct r10bio *r10_bio)
{
        struct r10conf *conf = r10_bio->mddev->private;

        conf->mirrors[r10_bio->devs[slot].devnum].head_position =
                r10_bio->devs[slot].addr + (r10_bio->sectors);
}

/*
 * Find the disk number which triggered given bio
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
                         struct bio *bio, int *slotp, int *replp)
{
        int slot;
        int repl = 0;

        for (slot = 0; slot < conf->copies; slot++) {
                if (r10_bio->devs[slot].bio == bio)
                        break;
                if (r10_bio->devs[slot].repl_bio == bio) {
                        repl = 1;
                        break;
                }
        }

        BUG_ON(slot == conf->copies);
        update_head_pos(slot, r10_bio);

        if (slotp)
                *slotp = slot;
        if (replp)
                *replp = repl;
        return r10_bio->devs[slot].devnum;
}
348
349static void raid10_end_read_request(struct bio *bio)
350{
351 int uptodate = !bio->bi_status;
352 struct r10bio *r10_bio = bio->bi_private;
353 int slot;
354 struct md_rdev *rdev;
355 struct r10conf *conf = r10_bio->mddev->private;
356
357 slot = r10_bio->read_slot;
358 rdev = r10_bio->devs[slot].rdev;
359
360
361
362 update_head_pos(slot, r10_bio);
363
364 if (uptodate) {
365
366
367
368
369
370
371
372
373
374 set_bit(R10BIO_Uptodate, &r10_bio->state);
375 } else {
376
377
378
379
380
381 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
382 rdev->raid_disk))
383 uptodate = 1;
384 }
385 if (uptodate) {
386 raid_end_bio_io(r10_bio);
387 rdev_dec_pending(rdev, conf->mddev);
388 } else {
389
390
391
392 char b[BDEVNAME_SIZE];
393 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
394 mdname(conf->mddev),
395 bdevname(rdev->bdev, b),
396 (unsigned long long)r10_bio->sector);
397 set_bit(R10BIO_ReadError, &r10_bio->state);
398 reschedule_retry(r10_bio);
399 }
400}
401
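/*
 * close_write() runs once the last write of an r10_bio has completed: it
 * ends the bitmap write for the affected range (noting whether the array
 * was degraded for it) and calls md_write_end() to balance the
 * md_write_start() taken in raid10_make_request().
 */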
402static void close_write(struct r10bio *r10_bio)
403{
404
405 md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
406 r10_bio->sectors,
407 !test_bit(R10BIO_Degraded, &r10_bio->state),
408 0);
409 md_write_end(r10_bio->mddev);
410}
411
412static void one_write_done(struct r10bio *r10_bio)
413{
414 if (atomic_dec_and_test(&r10_bio->remaining)) {
415 if (test_bit(R10BIO_WriteError, &r10_bio->state))
416 reschedule_retry(r10_bio);
417 else {
418 close_write(r10_bio);
419 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
420 reschedule_retry(r10_bio);
421 else
422 raid_end_bio_io(r10_bio);
423 }
424 }
425}
426
427static void raid10_end_write_request(struct bio *bio)
428{
429 struct r10bio *r10_bio = bio->bi_private;
430 int dev;
431 int dec_rdev = 1;
432 struct r10conf *conf = r10_bio->mddev->private;
433 int slot, repl;
434 struct md_rdev *rdev = NULL;
435 struct bio *to_put = NULL;
436 bool discard_error;
437
438 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
439
440 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
441
442 if (repl)
443 rdev = conf->mirrors[dev].replacement;
444 if (!rdev) {
445 smp_rmb();
446 repl = 0;
447 rdev = conf->mirrors[dev].rdev;
448 }
449
450
451
452 if (bio->bi_status && !discard_error) {
453 if (repl)
454
455
456
457 md_error(rdev->mddev, rdev);
458 else {
459 set_bit(WriteErrorSeen, &rdev->flags);
460 if (!test_and_set_bit(WantReplacement, &rdev->flags))
461 set_bit(MD_RECOVERY_NEEDED,
462 &rdev->mddev->recovery);
463
464 dec_rdev = 0;
465 if (test_bit(FailFast, &rdev->flags) &&
466 (bio->bi_opf & MD_FAILFAST)) {
467 md_error(rdev->mddev, rdev);
468 if (!test_bit(Faulty, &rdev->flags))
469
470
471
472
473 set_bit(R10BIO_WriteError, &r10_bio->state);
474 else {
475 r10_bio->devs[slot].bio = NULL;
476 to_put = bio;
477 dec_rdev = 1;
478 }
479 } else
480 set_bit(R10BIO_WriteError, &r10_bio->state);
481 }
482 } else {
483
484
485
486
487
488
489
490
491
492 sector_t first_bad;
493 int bad_sectors;
494
495
496
497
498
499
500
501
502
503 if (test_bit(In_sync, &rdev->flags) &&
504 !test_bit(Faulty, &rdev->flags))
505 set_bit(R10BIO_Uptodate, &r10_bio->state);
506
507
508 if (is_badblock(rdev,
509 r10_bio->devs[slot].addr,
510 r10_bio->sectors,
511 &first_bad, &bad_sectors) && !discard_error) {
512 bio_put(bio);
513 if (repl)
514 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
515 else
516 r10_bio->devs[slot].bio = IO_MADE_GOOD;
517 dec_rdev = 0;
518 set_bit(R10BIO_MadeGood, &r10_bio->state);
519 }
520 }
521
522
523
524
525
526
527 one_write_done(r10_bio);
528 if (dec_rdev)
529 rdev_dec_pending(rdev, conf->mddev);
530 if (to_put)
531 bio_put(to_put);
532}

/*
 * RAID10 layout manager.
 *
 * As well as the chunk size and raid_disks count there are two layout
 * parameters, near_copies and far_copies; near_copies * far_copies must be
 * <= raid_disks.  If both are 1 the layout degenerates to raid0; if
 * near_copies == raid_disks it behaves like raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the first
 * chunk, followed by near_copies copies of the next chunk and so on.  If
 * far_copies > 1, additional copies of the whole array are kept further
 * down the devices, rotated so that a block is never stored on the same
 * device twice.  With far_offset set, each far copy follows immediately
 * after its stripe rather than after the whole array.
 */
559static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
560{
561 int n,f;
562 sector_t sector;
563 sector_t chunk;
564 sector_t stripe;
565 int dev;
566 int slot = 0;
567 int last_far_set_start, last_far_set_size;
568
569 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
570 last_far_set_start *= geo->far_set_size;
571
572 last_far_set_size = geo->far_set_size;
573 last_far_set_size += (geo->raid_disks % geo->far_set_size);
574
575
576 chunk = r10bio->sector >> geo->chunk_shift;
577 sector = r10bio->sector & geo->chunk_mask;
578
579 chunk *= geo->near_copies;
580 stripe = chunk;
581 dev = sector_div(stripe, geo->raid_disks);
582 if (geo->far_offset)
583 stripe *= geo->far_copies;
584
585 sector += stripe << geo->chunk_shift;
586
587
588 for (n = 0; n < geo->near_copies; n++) {
589 int d = dev;
590 int set;
591 sector_t s = sector;
592 r10bio->devs[slot].devnum = d;
593 r10bio->devs[slot].addr = s;
594 slot++;
595
596 for (f = 1; f < geo->far_copies; f++) {
597 set = d / geo->far_set_size;
598 d += geo->near_copies;
599
600 if ((geo->raid_disks % geo->far_set_size) &&
601 (d > last_far_set_start)) {
602 d -= last_far_set_start;
603 d %= last_far_set_size;
604 d += last_far_set_start;
605 } else {
606 d %= geo->far_set_size;
607 d += geo->far_set_size * set;
608 }
609 s += geo->stride;
610 r10bio->devs[slot].devnum = d;
611 r10bio->devs[slot].addr = s;
612 slot++;
613 }
614 dev++;
615 if (dev >= geo->raid_disks) {
616 dev = 0;
617 sector += (geo->chunk_mask + 1);
618 }
619 }
620}
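/*
 * Worked example (hypothetical geometry, for illustration only):
 * raid_disks = 4, near_copies = 2, far_copies = 1, 64KiB chunks
 * (chunk_shift = 7, chunk_mask = 127).  For r10bio->sector = 256:
 *   chunk = 256 >> 7 = 2, in-chunk offset = 256 & 127 = 0;
 *   chunk * near_copies = 4;  stripe = 4 / 4 = 1, dev = 4 % 4 = 0;
 *   device sector = 0 + (1 << 7) = 128.
 * So logical sector 256 lives at sector 128 of disk 0 and disk 1 (the two
 * "near" copies).  With far_copies > 1 the inner loop above would add
 * further copies geo->stride sectors later on rotated devices.
 */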
621
622static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
623{
624 struct geom *geo = &conf->geo;
625
626 if (conf->reshape_progress != MaxSector &&
627 ((r10bio->sector >= conf->reshape_progress) !=
628 conf->mddev->reshape_backwards)) {
629 set_bit(R10BIO_Previous, &r10bio->state);
630 geo = &conf->prev;
631 } else
632 clear_bit(R10BIO_Previous, &r10bio->state);
633
634 __raid10_find_phys(geo, r10bio);
635}
636
637static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
638{
639 sector_t offset, chunk, vchunk;
640
641
642
643 struct geom *geo = &conf->geo;
644 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
645 int far_set_size = geo->far_set_size;
646 int last_far_set_start;
647
648 if (geo->raid_disks % geo->far_set_size) {
649 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
650 last_far_set_start *= geo->far_set_size;
651
652 if (dev >= last_far_set_start) {
653 far_set_size = geo->far_set_size;
654 far_set_size += (geo->raid_disks % geo->far_set_size);
655 far_set_start = last_far_set_start;
656 }
657 }
658
659 offset = sector & geo->chunk_mask;
660 if (geo->far_offset) {
661 int fc;
662 chunk = sector >> geo->chunk_shift;
663 fc = sector_div(chunk, geo->far_copies);
664 dev -= fc * geo->near_copies;
665 if (dev < far_set_start)
666 dev += far_set_size;
667 } else {
668 while (sector >= geo->stride) {
669 sector -= geo->stride;
670 if (dev < (geo->near_copies + far_set_start))
671 dev += far_set_size - geo->near_copies;
672 else
673 dev -= geo->near_copies;
674 }
675 chunk = sector >> geo->chunk_shift;
676 }
677 vchunk = chunk * geo->raid_disks + dev;
678 sector_div(vchunk, geo->near_copies);
679 return (vchunk << geo->chunk_shift) + offset;
680}

/*
 * read_balance() picks the disk from which the requested read should be
 * done.  It chooses among the usable copies, preferring (for rotational
 * disks) the copy whose head position is closest to the request and, when
 * non-rotational disks are present, the copy with the fewest pending
 * requests.  If the array is resyncing over this range no balancing is
 * done, and copies with bad blocks in the range constrain the choice;
 * *max_sectors is reduced to the number of sectors that can safely be read
 * from the returned device.  The chosen rdev has nr_pending elevated; the
 * caller drops it with rdev_dec_pending() when the read completes.
 */
701static struct md_rdev *read_balance(struct r10conf *conf,
702 struct r10bio *r10_bio,
703 int *max_sectors)
704{
705 const sector_t this_sector = r10_bio->sector;
706 int disk, slot;
707 int sectors = r10_bio->sectors;
708 int best_good_sectors;
709 sector_t new_distance, best_dist;
710 struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
711 int do_balance;
712 int best_dist_slot, best_pending_slot;
713 bool has_nonrot_disk = false;
714 unsigned int min_pending;
715 struct geom *geo = &conf->geo;
716
717 raid10_find_phys(conf, r10_bio);
718 rcu_read_lock();
719 best_dist_slot = -1;
720 min_pending = UINT_MAX;
721 best_dist_rdev = NULL;
722 best_pending_rdev = NULL;
723 best_dist = MaxSector;
724 best_good_sectors = 0;
725 do_balance = 1;
726 clear_bit(R10BIO_FailFast, &r10_bio->state);
727
728
729
730
731
732
733 if ((conf->mddev->recovery_cp < MaxSector
734 && (this_sector + sectors >= conf->next_resync)) ||
735 (mddev_is_clustered(conf->mddev) &&
736 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
737 this_sector + sectors)))
738 do_balance = 0;
739
740 for (slot = 0; slot < conf->copies ; slot++) {
741 sector_t first_bad;
742 int bad_sectors;
743 sector_t dev_sector;
744 unsigned int pending;
745 bool nonrot;
746
747 if (r10_bio->devs[slot].bio == IO_BLOCKED)
748 continue;
749 disk = r10_bio->devs[slot].devnum;
750 rdev = rcu_dereference(conf->mirrors[disk].replacement);
751 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
752 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
753 rdev = rcu_dereference(conf->mirrors[disk].rdev);
754 if (rdev == NULL ||
755 test_bit(Faulty, &rdev->flags))
756 continue;
757 if (!test_bit(In_sync, &rdev->flags) &&
758 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
759 continue;
760
761 dev_sector = r10_bio->devs[slot].addr;
762 if (is_badblock(rdev, dev_sector, sectors,
763 &first_bad, &bad_sectors)) {
764 if (best_dist < MaxSector)
765
766 continue;
767 if (first_bad <= dev_sector) {
768
769
770
771
772 bad_sectors -= (dev_sector - first_bad);
773 if (!do_balance && sectors > bad_sectors)
774 sectors = bad_sectors;
775 if (best_good_sectors > sectors)
776 best_good_sectors = sectors;
777 } else {
778 sector_t good_sectors =
779 first_bad - dev_sector;
780 if (good_sectors > best_good_sectors) {
781 best_good_sectors = good_sectors;
782 best_dist_slot = slot;
783 best_dist_rdev = rdev;
784 }
785 if (!do_balance)
786
787 break;
788 }
789 continue;
790 } else
791 best_good_sectors = sectors;
792
793 if (!do_balance)
794 break;
795
796 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
797 has_nonrot_disk |= nonrot;
798 pending = atomic_read(&rdev->nr_pending);
799 if (min_pending > pending && nonrot) {
800 min_pending = pending;
801 best_pending_slot = slot;
802 best_pending_rdev = rdev;
803 }
804
805 if (best_dist_slot >= 0)
806
807 set_bit(R10BIO_FailFast, &r10_bio->state);
808
809
810
811
812 if (geo->near_copies > 1 && !pending)
813 new_distance = 0;
814
815
816 else if (geo->far_copies > 1)
817 new_distance = r10_bio->devs[slot].addr;
818 else
819 new_distance = abs(r10_bio->devs[slot].addr -
820 conf->mirrors[disk].head_position);
821
822 if (new_distance < best_dist) {
823 best_dist = new_distance;
824 best_dist_slot = slot;
825 best_dist_rdev = rdev;
826 }
827 }
828 if (slot >= conf->copies) {
829 if (has_nonrot_disk) {
830 slot = best_pending_slot;
831 rdev = best_pending_rdev;
832 } else {
833 slot = best_dist_slot;
834 rdev = best_dist_rdev;
835 }
836 }
837
838 if (slot >= 0) {
839 atomic_inc(&rdev->nr_pending);
840 r10_bio->read_slot = slot;
841 } else
842 rdev = NULL;
843 rcu_read_unlock();
844 *max_sectors = best_good_sectors;
845
846 return rdev;
847}
848
849static int raid10_congested(struct mddev *mddev, int bits)
850{
851 struct r10conf *conf = mddev->private;
852 int i, ret = 0;
853
854 if ((bits & (1 << WB_async_congested)) &&
855 conf->pending_count >= max_queued_requests)
856 return 1;
857
858 rcu_read_lock();
859 for (i = 0;
860 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
861 && ret == 0;
862 i++) {
863 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
864 if (rdev && !test_bit(Faulty, &rdev->flags)) {
865 struct request_queue *q = bdev_get_queue(rdev->bdev);
866
867 ret |= bdi_congested(q->backing_dev_info, bits);
868 }
869 }
870 rcu_read_unlock();
871 return ret;
872}
873
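/*
 * flush_pending_writes() drains conf->pending_bio_list: pending bitmap
 * updates are flushed first (md_bitmap_unplug) and then every queued write
 * bio is submitted, except that bios aimed at a Faulty rdev are failed and
 * discards to devices without discard support are completed silently.
 */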
874static void flush_pending_writes(struct r10conf *conf)
875{
876
877
878
879 spin_lock_irq(&conf->device_lock);
880
881 if (conf->pending_bio_list.head) {
882 struct blk_plug plug;
883 struct bio *bio;
884
885 bio = bio_list_get(&conf->pending_bio_list);
886 conf->pending_count = 0;
887 spin_unlock_irq(&conf->device_lock);
888
889
890
891
892
893
894
895
896
897
898 __set_current_state(TASK_RUNNING);
899
900 blk_start_plug(&plug);
901
902
903 md_bitmap_unplug(conf->mddev->bitmap);
904 wake_up(&conf->wait_barrier);
905
906 while (bio) {
907 struct bio *next = bio->bi_next;
908 struct md_rdev *rdev = (void*)bio->bi_disk;
909 bio->bi_next = NULL;
910 bio_set_dev(bio, rdev->bdev);
911 if (test_bit(Faulty, &rdev->flags)) {
912 bio_io_error(bio);
913 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
914 !blk_queue_discard(bio->bi_disk->queue)))
915
916 bio_endio(bio);
917 else
918 generic_make_request(bio);
919 bio = next;
920 }
921 blk_finish_plug(&plug);
922 } else
923 spin_unlock_irq(&conf->device_lock);
924}

/*
 * Barriers for resync/recovery vs. normal I/O:
 *
 * Normal I/O brackets each request with wait_barrier()/allow_barrier();
 * resync and recovery bracket each resync window with
 * raise_barrier()/lower_barrier().  raise_barrier() waits until all
 * pending normal I/O has drained (conf->nr_pending reaches zero) and
 * limits the number of concurrently raised barriers to RESYNC_DEPTH,
 * while wait_barrier() blocks new normal I/O for as long as a barrier
 * is raised.
 */
static void raise_barrier(struct r10conf *conf, int force)
{
        BUG_ON(force && !conf->barrier);
        spin_lock_irq(&conf->resync_lock);

        /* Wait until no block IO is waiting (unless 'force') */
        wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
                            conf->resync_lock);

        /* block any new IO from starting */
        conf->barrier++;

        /* Now wait for all pending IO to complete */
        wait_event_lock_irq(conf->wait_barrier,
                            !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
                            conf->resync_lock);

        spin_unlock_irq(&conf->resync_lock);
}

static void lower_barrier(struct r10conf *conf)
{
        unsigned long flags;
        spin_lock_irqsave(&conf->resync_lock, flags);
        conf->barrier--;
        spin_unlock_irqrestore(&conf->resync_lock, flags);
        wake_up(&conf->wait_barrier);
}

static void wait_barrier(struct r10conf *conf)
{
        spin_lock_irq(&conf->resync_lock);
        if (conf->barrier) {
                conf->nr_waiting++;
                /*
                 * Wait for the barrier to drop.  However, if there are
                 * already pending requests (preventing the barrier from
                 * rising) and we are in the middle of submitting bios from
                 * current->bio_list, we must let this request through to
                 * avoid a deadlock.
                 */
                raid10_log(conf->mddev, "wait barrier");
                wait_event_lock_irq(conf->wait_barrier,
                                    !conf->barrier ||
                                    (atomic_read(&conf->nr_pending) &&
                                     current->bio_list &&
                                     (!bio_list_empty(&current->bio_list[0]) ||
                                      !bio_list_empty(&current->bio_list[1]))),
                                    conf->resync_lock);
                conf->nr_waiting--;
                if (!conf->nr_waiting)
                        wake_up(&conf->wait_barrier);
        }
        atomic_inc(&conf->nr_pending);
        spin_unlock_irq(&conf->resync_lock);
}

static void allow_barrier(struct r10conf *conf)
{
        if ((atomic_dec_and_test(&conf->nr_pending)) ||
            (conf->array_freeze_pending))
                wake_up(&conf->wait_barrier);
}

static void freeze_array(struct r10conf *conf, int extra)
{
        /*
         * Stop syncio and normal IO and wait for everything to go quiet.
         * We raise the barrier and bump nr_waiting, then wait until
         * nr_pending matches nr_queued + extra, i.e. every outstanding
         * request is either queued for raid10d to handle or accounted for
         * by the caller ('extra').  flush_pending_writes() is run as the
         * wait condition's command so queued writes cannot block us.
         */
        spin_lock_irq(&conf->resync_lock);
        conf->array_freeze_pending++;
        conf->barrier++;
        conf->nr_waiting++;
        wait_event_lock_irq_cmd(conf->wait_barrier,
                                atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
                                conf->resync_lock,
                                flush_pending_writes(conf));

        conf->array_freeze_pending--;
        spin_unlock_irq(&conf->resync_lock);
}

static void unfreeze_array(struct r10conf *conf)
{
        /* reverse the effect of the freeze */
        spin_lock_irq(&conf->resync_lock);
        conf->barrier--;
        conf->nr_waiting--;
        wake_up(&conf->wait_barrier);
        spin_unlock_irq(&conf->resync_lock);
}
1050
1051static sector_t choose_data_offset(struct r10bio *r10_bio,
1052 struct md_rdev *rdev)
1053{
1054 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1055 test_bit(R10BIO_Previous, &r10_bio->state))
1056 return rdev->data_offset;
1057 else
1058 return rdev->new_data_offset;
1059}
1060
1061struct raid10_plug_cb {
1062 struct blk_plug_cb cb;
1063 struct bio_list pending;
1064 int pending_cnt;
1065};
1066
1067static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1068{
1069 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1070 cb);
1071 struct mddev *mddev = plug->cb.data;
1072 struct r10conf *conf = mddev->private;
1073 struct bio *bio;
1074
1075 if (from_schedule || current->bio_list) {
1076 spin_lock_irq(&conf->device_lock);
1077 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1078 conf->pending_count += plug->pending_cnt;
1079 spin_unlock_irq(&conf->device_lock);
1080 wake_up(&conf->wait_barrier);
1081 md_wakeup_thread(mddev->thread);
1082 kfree(plug);
1083 return;
1084 }
1085
1086
1087 bio = bio_list_get(&plug->pending);
1088 md_bitmap_unplug(mddev->bitmap);
1089 wake_up(&conf->wait_barrier);
1090
1091 while (bio) {
1092 struct bio *next = bio->bi_next;
1093 struct md_rdev *rdev = (void*)bio->bi_disk;
1094 bio->bi_next = NULL;
1095 bio_set_dev(bio, rdev->bdev);
1096 if (test_bit(Faulty, &rdev->flags)) {
1097 bio_io_error(bio);
1098 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1099 !blk_queue_discard(bio->bi_disk->queue)))
1100
1101 bio_endio(bio);
1102 else
1103 generic_make_request(bio);
1104 bio = next;
1105 }
1106 kfree(plug);
1107}
1108
1109
1110
1111
1112
1113
1114
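/*
 * Wait for the regular-I/O barrier and, while a reshape is in progress and
 * this request straddles conf->reshape_progress, drop the barrier and wait
 * until the reshape has moved past the request before taking it again.
 */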
1115static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
1116 struct bio *bio, sector_t sectors)
1117{
1118 wait_barrier(conf);
1119 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1120 bio->bi_iter.bi_sector < conf->reshape_progress &&
1121 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1122 raid10_log(conf->mddev, "wait reshape");
1123 allow_barrier(conf);
1124 wait_event(conf->wait_barrier,
1125 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1126 conf->reshape_progress >= bio->bi_iter.bi_sector +
1127 sectors);
1128 wait_barrier(conf);
1129 }
1130}
1131
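/*
 * raid10_read_request(): pick a mirror with read_balance(), split the bio
 * if only part of it can be read from that device, then clone it and submit
 * the clone to the chosen rdev.  If this is a retry after a failed read
 * (the slot's rdev is still set), allocation may dip into emergency
 * reserves (__GFP_HIGH) and a "redirecting sector" message is logged.
 */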
1132static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1133 struct r10bio *r10_bio)
1134{
1135 struct r10conf *conf = mddev->private;
1136 struct bio *read_bio;
1137 const int op = bio_op(bio);
1138 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1139 int max_sectors;
1140 struct md_rdev *rdev;
1141 char b[BDEVNAME_SIZE];
1142 int slot = r10_bio->read_slot;
1143 struct md_rdev *err_rdev = NULL;
1144 gfp_t gfp = GFP_NOIO;
1145
1146 if (r10_bio->devs[slot].rdev) {
1147
1148
1149
1150
1151
1152
1153
1154 int disk;
1155
1156
1157
1158
1159 gfp = GFP_NOIO | __GFP_HIGH;
1160
1161 rcu_read_lock();
1162 disk = r10_bio->devs[slot].devnum;
1163 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1164 if (err_rdev)
1165 bdevname(err_rdev->bdev, b);
1166 else {
1167 strcpy(b, "???");
1168
1169 err_rdev = r10_bio->devs[slot].rdev;
1170 }
1171 rcu_read_unlock();
1172 }
1173
1174 regular_request_wait(mddev, conf, bio, r10_bio->sectors);
1175 rdev = read_balance(conf, r10_bio, &max_sectors);
1176 if (!rdev) {
1177 if (err_rdev) {
1178 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1179 mdname(mddev), b,
1180 (unsigned long long)r10_bio->sector);
1181 }
1182 raid_end_bio_io(r10_bio);
1183 return;
1184 }
1185 if (err_rdev)
1186 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1187 mdname(mddev),
1188 bdevname(rdev->bdev, b),
1189 (unsigned long long)r10_bio->sector);
1190 if (max_sectors < bio_sectors(bio)) {
1191 struct bio *split = bio_split(bio, max_sectors,
1192 gfp, &conf->bio_split);
1193 bio_chain(split, bio);
1194 allow_barrier(conf);
1195 generic_make_request(bio);
1196 wait_barrier(conf);
1197 bio = split;
1198 r10_bio->master_bio = bio;
1199 r10_bio->sectors = max_sectors;
1200 }
1201 slot = r10_bio->read_slot;
1202
1203 read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
1204
1205 r10_bio->devs[slot].bio = read_bio;
1206 r10_bio->devs[slot].rdev = rdev;
1207
1208 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1209 choose_data_offset(r10_bio, rdev);
1210 bio_set_dev(read_bio, rdev->bdev);
1211 read_bio->bi_end_io = raid10_end_read_request;
1212 bio_set_op_attrs(read_bio, op, do_sync);
1213 if (test_bit(FailFast, &rdev->flags) &&
1214 test_bit(R10BIO_FailFast, &r10_bio->state))
1215 read_bio->bi_opf |= MD_FAILFAST;
1216 read_bio->bi_private = r10_bio;
1217
1218 if (mddev->gendisk)
1219 trace_block_bio_remap(read_bio->bi_disk->queue,
1220 read_bio, disk_devt(mddev->gendisk),
1221 r10_bio->sector);
1222 generic_make_request(read_bio);
1223 return;
1224}
1225
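/*
 * Clone the master bio and submit it to one copy (r10_bio->devs[n_copy]),
 * either the primary rdev or its replacement.  The rdev pointer is
 * temporarily stashed in mbio->bi_disk until the bio is actually issued,
 * and the clone is routed through the plugging callback (raid10_unplug)
 * when possible so bitmap updates can be batched before the writes go out.
 */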
1226static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1227 struct bio *bio, bool replacement,
1228 int n_copy)
1229{
1230 const int op = bio_op(bio);
1231 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1232 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1233 unsigned long flags;
1234 struct blk_plug_cb *cb;
1235 struct raid10_plug_cb *plug = NULL;
1236 struct r10conf *conf = mddev->private;
1237 struct md_rdev *rdev;
1238 int devnum = r10_bio->devs[n_copy].devnum;
1239 struct bio *mbio;
1240
1241 if (replacement) {
1242 rdev = conf->mirrors[devnum].replacement;
1243 if (rdev == NULL) {
1244
1245 smp_mb();
1246 rdev = conf->mirrors[devnum].rdev;
1247 }
1248 } else
1249 rdev = conf->mirrors[devnum].rdev;
1250
1251 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1252 if (replacement)
1253 r10_bio->devs[n_copy].repl_bio = mbio;
1254 else
1255 r10_bio->devs[n_copy].bio = mbio;
1256
1257 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1258 choose_data_offset(r10_bio, rdev));
1259 bio_set_dev(mbio, rdev->bdev);
1260 mbio->bi_end_io = raid10_end_write_request;
1261 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1262 if (!replacement && test_bit(FailFast,
1263 &conf->mirrors[devnum].rdev->flags)
1264 && enough(conf, devnum))
1265 mbio->bi_opf |= MD_FAILFAST;
1266 mbio->bi_private = r10_bio;
1267
1268 if (conf->mddev->gendisk)
1269 trace_block_bio_remap(mbio->bi_disk->queue,
1270 mbio, disk_devt(conf->mddev->gendisk),
1271 r10_bio->sector);
1272
1273 mbio->bi_disk = (void *)rdev;
1274
1275 atomic_inc(&r10_bio->remaining);
1276
1277 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1278 if (cb)
1279 plug = container_of(cb, struct raid10_plug_cb, cb);
1280 else
1281 plug = NULL;
1282 if (plug) {
1283 bio_list_add(&plug->pending, mbio);
1284 plug->pending_cnt++;
1285 } else {
1286 spin_lock_irqsave(&conf->device_lock, flags);
1287 bio_list_add(&conf->pending_bio_list, mbio);
1288 conf->pending_count++;
1289 spin_unlock_irqrestore(&conf->device_lock, flags);
1290 md_wakeup_thread(mddev->thread);
1291 }
1292}
1293
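/*
 * raid10_write_request(): wait out any cluster resync or reshape covering
 * the range, map the logical sectors to all copies, skip copies with no
 * usable device (marking the r10_bio Degraded), honour bad-block lists by
 * shrinking the request, and queue one clone per remaining copy via
 * raid10_write_one_disk().  If any chosen rdev is Blocked, all references
 * are dropped and the whole mapping is retried once the rdev unblocks.
 */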
1294static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1295 struct r10bio *r10_bio)
1296{
1297 struct r10conf *conf = mddev->private;
1298 int i;
1299 struct md_rdev *blocked_rdev;
1300 sector_t sectors;
1301 int max_sectors;
1302
1303 if ((mddev_is_clustered(mddev) &&
1304 md_cluster_ops->area_resyncing(mddev, WRITE,
1305 bio->bi_iter.bi_sector,
1306 bio_end_sector(bio)))) {
1307 DEFINE_WAIT(w);
1308 for (;;) {
1309 prepare_to_wait(&conf->wait_barrier,
1310 &w, TASK_IDLE);
1311 if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1312 bio->bi_iter.bi_sector, bio_end_sector(bio)))
1313 break;
1314 schedule();
1315 }
1316 finish_wait(&conf->wait_barrier, &w);
1317 }
1318
1319 sectors = r10_bio->sectors;
1320 regular_request_wait(mddev, conf, bio, sectors);
1321 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1322 (mddev->reshape_backwards
1323 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1324 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1325 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1326 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1327
1328 mddev->reshape_position = conf->reshape_progress;
1329 set_mask_bits(&mddev->sb_flags, 0,
1330 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1331 md_wakeup_thread(mddev->thread);
1332 raid10_log(conf->mddev, "wait reshape metadata");
1333 wait_event(mddev->sb_wait,
1334 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1335
1336 conf->reshape_safe = mddev->reshape_position;
1337 }
1338
1339 if (conf->pending_count >= max_queued_requests) {
1340 md_wakeup_thread(mddev->thread);
1341 raid10_log(mddev, "wait queued");
1342 wait_event(conf->wait_barrier,
1343 conf->pending_count < max_queued_requests);
1344 }
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355 r10_bio->read_slot = -1;
1356 raid10_find_phys(conf, r10_bio);
1357retry_write:
1358 blocked_rdev = NULL;
1359 rcu_read_lock();
1360 max_sectors = r10_bio->sectors;
1361
1362 for (i = 0; i < conf->copies; i++) {
1363 int d = r10_bio->devs[i].devnum;
1364 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1365 struct md_rdev *rrdev = rcu_dereference(
1366 conf->mirrors[d].replacement);
1367 if (rdev == rrdev)
1368 rrdev = NULL;
1369 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1370 atomic_inc(&rdev->nr_pending);
1371 blocked_rdev = rdev;
1372 break;
1373 }
1374 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1375 atomic_inc(&rrdev->nr_pending);
1376 blocked_rdev = rrdev;
1377 break;
1378 }
1379 if (rdev && (test_bit(Faulty, &rdev->flags)))
1380 rdev = NULL;
1381 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1382 rrdev = NULL;
1383
1384 r10_bio->devs[i].bio = NULL;
1385 r10_bio->devs[i].repl_bio = NULL;
1386
1387 if (!rdev && !rrdev) {
1388 set_bit(R10BIO_Degraded, &r10_bio->state);
1389 continue;
1390 }
1391 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1392 sector_t first_bad;
1393 sector_t dev_sector = r10_bio->devs[i].addr;
1394 int bad_sectors;
1395 int is_bad;
1396
1397 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1398 &first_bad, &bad_sectors);
1399 if (is_bad < 0) {
1400
1401
1402
1403 atomic_inc(&rdev->nr_pending);
1404 set_bit(BlockedBadBlocks, &rdev->flags);
1405 blocked_rdev = rdev;
1406 break;
1407 }
1408 if (is_bad && first_bad <= dev_sector) {
1409
1410 bad_sectors -= (dev_sector - first_bad);
1411 if (bad_sectors < max_sectors)
1412
1413
1414
1415 max_sectors = bad_sectors;
1416
1417
1418
1419
1420
1421
1422
1423
1424 continue;
1425 }
1426 if (is_bad) {
1427 int good_sectors = first_bad - dev_sector;
1428 if (good_sectors < max_sectors)
1429 max_sectors = good_sectors;
1430 }
1431 }
1432 if (rdev) {
1433 r10_bio->devs[i].bio = bio;
1434 atomic_inc(&rdev->nr_pending);
1435 }
1436 if (rrdev) {
1437 r10_bio->devs[i].repl_bio = bio;
1438 atomic_inc(&rrdev->nr_pending);
1439 }
1440 }
1441 rcu_read_unlock();
1442
1443 if (unlikely(blocked_rdev)) {
1444
1445 int j;
1446 int d;
1447
1448 for (j = 0; j < i; j++) {
1449 if (r10_bio->devs[j].bio) {
1450 d = r10_bio->devs[j].devnum;
1451 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1452 }
1453 if (r10_bio->devs[j].repl_bio) {
1454 struct md_rdev *rdev;
1455 d = r10_bio->devs[j].devnum;
1456 rdev = conf->mirrors[d].replacement;
1457 if (!rdev) {
1458
1459 smp_mb();
1460 rdev = conf->mirrors[d].rdev;
1461 }
1462 rdev_dec_pending(rdev, mddev);
1463 }
1464 }
1465 allow_barrier(conf);
1466 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1467 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1468 wait_barrier(conf);
1469 goto retry_write;
1470 }
1471
1472 if (max_sectors < r10_bio->sectors)
1473 r10_bio->sectors = max_sectors;
1474
1475 if (r10_bio->sectors < bio_sectors(bio)) {
1476 struct bio *split = bio_split(bio, r10_bio->sectors,
1477 GFP_NOIO, &conf->bio_split);
1478 bio_chain(split, bio);
1479 allow_barrier(conf);
1480 generic_make_request(bio);
1481 wait_barrier(conf);
1482 bio = split;
1483 r10_bio->master_bio = bio;
1484 }
1485
1486 atomic_set(&r10_bio->remaining, 1);
1487 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1488
1489 for (i = 0; i < conf->copies; i++) {
1490 if (r10_bio->devs[i].bio)
1491 raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1492 if (r10_bio->devs[i].repl_bio)
1493 raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1494 }
1495 one_write_done(r10_bio);
1496}
1497
1498static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1499{
1500 struct r10conf *conf = mddev->private;
1501 struct r10bio *r10_bio;
1502
1503 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1504
1505 r10_bio->master_bio = bio;
1506 r10_bio->sectors = sectors;
1507
1508 r10_bio->mddev = mddev;
1509 r10_bio->sector = bio->bi_iter.bi_sector;
1510 r10_bio->state = 0;
1511 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
1512
1513 if (bio_data_dir(bio) == READ)
1514 raid10_read_request(mddev, bio, r10_bio);
1515 else
1516 raid10_write_request(mddev, bio, r10_bio);
1517}
1518
1519static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1520{
1521 struct r10conf *conf = mddev->private;
1522 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1523 int chunk_sects = chunk_mask + 1;
1524 int sectors = bio_sectors(bio);
1525
1526 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1527 md_flush_request(mddev, bio);
1528 return true;
1529 }
1530
1531 if (!md_write_start(mddev, bio))
1532 return false;
1533
1534
1535
1536
1537
1538 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1539 sectors > chunk_sects
1540 && (conf->geo.near_copies < conf->geo.raid_disks
1541 || conf->prev.near_copies <
1542 conf->prev.raid_disks)))
1543 sectors = chunk_sects -
1544 (bio->bi_iter.bi_sector &
1545 (chunk_sects - 1));
1546 __make_request(mddev, bio, sectors);
1547
1548
1549 wake_up(&conf->wait_barrier);
1550 return true;
1551}
1552
1553static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1554{
1555 struct r10conf *conf = mddev->private;
1556 int i;
1557
1558 if (conf->geo.near_copies < conf->geo.raid_disks)
1559 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1560 if (conf->geo.near_copies > 1)
1561 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1562 if (conf->geo.far_copies > 1) {
1563 if (conf->geo.far_offset)
1564 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1565 else
1566 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1567 if (conf->geo.far_set_size != conf->geo.raid_disks)
1568 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1569 }
1570 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1571 conf->geo.raid_disks - mddev->degraded);
1572 rcu_read_lock();
1573 for (i = 0; i < conf->geo.raid_disks; i++) {
1574 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1575 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1576 }
1577 rcu_read_unlock();
1578 seq_printf(seq, "]");
1579}

/*
 * _enough() reports whether the array would still have at least one working
 * copy of every block if the device in slot 'ignore' were to fail.  It
 * walks the disks in groups of near_copies and requires one In_sync member
 * per group; 'previous' selects the pre-reshape geometry.
 */
1586static int _enough(struct r10conf *conf, int previous, int ignore)
1587{
1588 int first = 0;
1589 int has_enough = 0;
1590 int disks, ncopies;
1591 if (previous) {
1592 disks = conf->prev.raid_disks;
1593 ncopies = conf->prev.near_copies;
1594 } else {
1595 disks = conf->geo.raid_disks;
1596 ncopies = conf->geo.near_copies;
1597 }
1598
1599 rcu_read_lock();
1600 do {
1601 int n = conf->copies;
1602 int cnt = 0;
1603 int this = first;
1604 while (n--) {
1605 struct md_rdev *rdev;
1606 if (this != ignore &&
1607 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1608 test_bit(In_sync, &rdev->flags))
1609 cnt++;
1610 this = (this+1) % disks;
1611 }
1612 if (cnt == 0)
1613 goto out;
1614 first = (first + ncopies) % disks;
1615 } while (first != 0);
1616 has_enough = 1;
1617out:
1618 rcu_read_unlock();
1619 return has_enough;
1620}
1621
1622static int enough(struct r10conf *conf, int ignore)
1623{
1624
1625
1626
1627
1628
1629 return _enough(conf, 0, ignore) &&
1630 _enough(conf, 1, ignore);
1631}
1632
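/*
 * raid10_error() marks an rdev Faulty.  If the device is still In_sync and
 * losing it would leave some block with no usable copy (!enough()), the
 * failure is refused and the device is left alone so the array can keep
 * running; otherwise 'degraded' is bumped, the device is marked Blocked and
 * Faulty, and a superblock update is queued.
 */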
1633static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1634{
1635 char b[BDEVNAME_SIZE];
1636 struct r10conf *conf = mddev->private;
1637 unsigned long flags;
1638
1639
1640
1641
1642
1643
1644
1645 spin_lock_irqsave(&conf->device_lock, flags);
1646 if (test_bit(In_sync, &rdev->flags)
1647 && !enough(conf, rdev->raid_disk)) {
1648
1649
1650
1651 spin_unlock_irqrestore(&conf->device_lock, flags);
1652 return;
1653 }
1654 if (test_and_clear_bit(In_sync, &rdev->flags))
1655 mddev->degraded++;
1656
1657
1658
1659 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1660 set_bit(Blocked, &rdev->flags);
1661 set_bit(Faulty, &rdev->flags);
1662 set_mask_bits(&mddev->sb_flags, 0,
1663 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1664 spin_unlock_irqrestore(&conf->device_lock, flags);
1665 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1666 "md/raid10:%s: Operation continuing on %d devices.\n",
1667 mdname(mddev), bdevname(rdev->bdev, b),
1668 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1669}
1670
1671static void print_conf(struct r10conf *conf)
1672{
1673 int i;
1674 struct md_rdev *rdev;
1675
1676 pr_debug("RAID10 conf printout:\n");
1677 if (!conf) {
1678 pr_debug("(!conf)\n");
1679 return;
1680 }
1681 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1682 conf->geo.raid_disks);
1683
1684
1685
1686 for (i = 0; i < conf->geo.raid_disks; i++) {
1687 char b[BDEVNAME_SIZE];
1688 rdev = conf->mirrors[i].rdev;
1689 if (rdev)
1690 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1691 i, !test_bit(In_sync, &rdev->flags),
1692 !test_bit(Faulty, &rdev->flags),
1693 bdevname(rdev->bdev,b));
1694 }
1695}
1696
1697static void close_sync(struct r10conf *conf)
1698{
1699 wait_barrier(conf);
1700 allow_barrier(conf);
1701
1702 mempool_exit(&conf->r10buf_pool);
1703}
1704
1705static int raid10_spare_active(struct mddev *mddev)
1706{
1707 int i;
1708 struct r10conf *conf = mddev->private;
1709 struct raid10_info *tmp;
1710 int count = 0;
1711 unsigned long flags;
1712
1713
1714
1715
1716
1717 for (i = 0; i < conf->geo.raid_disks; i++) {
1718 tmp = conf->mirrors + i;
1719 if (tmp->replacement
1720 && tmp->replacement->recovery_offset == MaxSector
1721 && !test_bit(Faulty, &tmp->replacement->flags)
1722 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1723
1724 if (!tmp->rdev
1725 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1726 count++;
1727 if (tmp->rdev) {
1728
1729
1730
1731
1732 set_bit(Faulty, &tmp->rdev->flags);
1733 sysfs_notify_dirent_safe(
1734 tmp->rdev->sysfs_state);
1735 }
1736 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1737 } else if (tmp->rdev
1738 && tmp->rdev->recovery_offset == MaxSector
1739 && !test_bit(Faulty, &tmp->rdev->flags)
1740 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1741 count++;
1742 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1743 }
1744 }
1745 spin_lock_irqsave(&conf->device_lock, flags);
1746 mddev->degraded -= count;
1747 spin_unlock_irqrestore(&conf->device_lock, flags);
1748
1749 print_conf(conf);
1750 return count;
1751}
1752
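/*
 * raid10_add_disk(): a spare is refused while the array has not finished
 * its initial sync (recovery_cp < MaxSector).  Otherwise it is placed into
 * a free slot, preferring its saved_raid_disk, or attached as a replacement
 * to a member that has requested one (WantReplacement).
 */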
1753static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1754{
1755 struct r10conf *conf = mddev->private;
1756 int err = -EEXIST;
1757 int mirror;
1758 int first = 0;
1759 int last = conf->geo.raid_disks - 1;
1760
1761 if (mddev->recovery_cp < MaxSector)
1762
1763
1764
1765 return -EBUSY;
1766 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1767 return -EINVAL;
1768
1769 if (md_integrity_add_rdev(rdev, mddev))
1770 return -ENXIO;
1771
1772 if (rdev->raid_disk >= 0)
1773 first = last = rdev->raid_disk;
1774
1775 if (rdev->saved_raid_disk >= first &&
1776 rdev->saved_raid_disk < conf->geo.raid_disks &&
1777 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1778 mirror = rdev->saved_raid_disk;
1779 else
1780 mirror = first;
1781 for ( ; mirror <= last ; mirror++) {
1782 struct raid10_info *p = &conf->mirrors[mirror];
1783 if (p->recovery_disabled == mddev->recovery_disabled)
1784 continue;
1785 if (p->rdev) {
1786 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1787 p->replacement != NULL)
1788 continue;
1789 clear_bit(In_sync, &rdev->flags);
1790 set_bit(Replacement, &rdev->flags);
1791 rdev->raid_disk = mirror;
1792 err = 0;
1793 if (mddev->gendisk)
1794 disk_stack_limits(mddev->gendisk, rdev->bdev,
1795 rdev->data_offset << 9);
1796 conf->fullsync = 1;
1797 rcu_assign_pointer(p->replacement, rdev);
1798 break;
1799 }
1800
1801 if (mddev->gendisk)
1802 disk_stack_limits(mddev->gendisk, rdev->bdev,
1803 rdev->data_offset << 9);
1804
1805 p->head_position = 0;
1806 p->recovery_disabled = mddev->recovery_disabled - 1;
1807 rdev->raid_disk = mirror;
1808 err = 0;
1809 if (rdev->saved_raid_disk != mirror)
1810 conf->fullsync = 1;
1811 rcu_assign_pointer(p->rdev, rdev);
1812 break;
1813 }
1814 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1815 blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
1816
1817 print_conf(conf);
1818 return err;
1819}
1820
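/*
 * raid10_remove_disk(): detach an rdev (or a replacement) from its slot.
 * Removal is refused if the device is still In_sync, has I/O pending, or is
 * not Faulty while recovery of it would still be possible; when a
 * replacement exists it is promoted into the primary slot.
 */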
1821static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1822{
1823 struct r10conf *conf = mddev->private;
1824 int err = 0;
1825 int number = rdev->raid_disk;
1826 struct md_rdev **rdevp;
1827 struct raid10_info *p = conf->mirrors + number;
1828
1829 print_conf(conf);
1830 if (rdev == p->rdev)
1831 rdevp = &p->rdev;
1832 else if (rdev == p->replacement)
1833 rdevp = &p->replacement;
1834 else
1835 return 0;
1836
1837 if (test_bit(In_sync, &rdev->flags) ||
1838 atomic_read(&rdev->nr_pending)) {
1839 err = -EBUSY;
1840 goto abort;
1841 }
1842
1843
1844
1845 if (!test_bit(Faulty, &rdev->flags) &&
1846 mddev->recovery_disabled != p->recovery_disabled &&
1847 (!p->replacement || p->replacement == rdev) &&
1848 number < conf->geo.raid_disks &&
1849 enough(conf, -1)) {
1850 err = -EBUSY;
1851 goto abort;
1852 }
1853 *rdevp = NULL;
1854 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1855 synchronize_rcu();
1856 if (atomic_read(&rdev->nr_pending)) {
1857
1858 err = -EBUSY;
1859 *rdevp = rdev;
1860 goto abort;
1861 }
1862 }
1863 if (p->replacement) {
1864
1865 p->rdev = p->replacement;
1866 clear_bit(Replacement, &p->replacement->flags);
1867 smp_mb();
1868
1869
1870 p->replacement = NULL;
1871 }
1872
1873 clear_bit(WantReplacement, &rdev->flags);
1874 err = md_integrity_register(mddev);
1875
1876abort:
1877
1878 print_conf(conf);
1879 return err;
1880}
1881
1882static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
1883{
1884 struct r10conf *conf = r10_bio->mddev->private;
1885
1886 if (!bio->bi_status)
1887 set_bit(R10BIO_Uptodate, &r10_bio->state);
1888 else
1889
1890
1891
1892 atomic_add(r10_bio->sectors,
1893 &conf->mirrors[d].rdev->corrected_errors);
1894
1895
1896
1897
1898 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1899 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1900 atomic_dec_and_test(&r10_bio->remaining)) {
1901
1902
1903
1904 reschedule_retry(r10_bio);
1905 }
1906}
1907
1908static void end_sync_read(struct bio *bio)
1909{
1910 struct r10bio *r10_bio = get_resync_r10bio(bio);
1911 struct r10conf *conf = r10_bio->mddev->private;
1912 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1913
1914 __end_sync_read(r10_bio, bio, d);
1915}
1916
1917static void end_reshape_read(struct bio *bio)
1918{
1919
1920 struct r10bio *r10_bio = bio->bi_private;
1921
1922 __end_sync_read(r10_bio, bio, r10_bio->read_slot);
1923}
1924
1925static void end_sync_request(struct r10bio *r10_bio)
1926{
1927 struct mddev *mddev = r10_bio->mddev;
1928
1929 while (atomic_dec_and_test(&r10_bio->remaining)) {
1930 if (r10_bio->master_bio == NULL) {
1931
1932 sector_t s = r10_bio->sectors;
1933 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1934 test_bit(R10BIO_WriteError, &r10_bio->state))
1935 reschedule_retry(r10_bio);
1936 else
1937 put_buf(r10_bio);
1938 md_done_sync(mddev, s, 1);
1939 break;
1940 } else {
1941 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1942 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1943 test_bit(R10BIO_WriteError, &r10_bio->state))
1944 reschedule_retry(r10_bio);
1945 else
1946 put_buf(r10_bio);
1947 r10_bio = r10_bio2;
1948 }
1949 }
1950}
1951
1952static void end_sync_write(struct bio *bio)
1953{
1954 struct r10bio *r10_bio = get_resync_r10bio(bio);
1955 struct mddev *mddev = r10_bio->mddev;
1956 struct r10conf *conf = mddev->private;
1957 int d;
1958 sector_t first_bad;
1959 int bad_sectors;
1960 int slot;
1961 int repl;
1962 struct md_rdev *rdev = NULL;
1963
1964 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1965 if (repl)
1966 rdev = conf->mirrors[d].replacement;
1967 else
1968 rdev = conf->mirrors[d].rdev;
1969
1970 if (bio->bi_status) {
1971 if (repl)
1972 md_error(mddev, rdev);
1973 else {
1974 set_bit(WriteErrorSeen, &rdev->flags);
1975 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1976 set_bit(MD_RECOVERY_NEEDED,
1977 &rdev->mddev->recovery);
1978 set_bit(R10BIO_WriteError, &r10_bio->state);
1979 }
1980 } else if (is_badblock(rdev,
1981 r10_bio->devs[slot].addr,
1982 r10_bio->sectors,
1983 &first_bad, &bad_sectors))
1984 set_bit(R10BIO_MadeGood, &r10_bio->state);
1985
1986 rdev_dec_pending(rdev, mddev);
1987
1988 end_sync_request(r10_bio);
1989}

/*
 * sync_request_write() is called (from raid10d) once all resync reads of an
 * r10_bio have completed.  The first successfully read copy is used as the
 * reference: every other copy that read successfully is compared against it
 * (mismatches are counted and, unless this is a 'check' pass, rewritten),
 * copies whose read failed are rewritten (or, for FailFast devices, the
 * device is failed instead), and any replacement devices are written as
 * well.  When all writes complete, md_done_sync() reports the range done.
 */
2007static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2008{
2009 struct r10conf *conf = mddev->private;
2010 int i, first;
2011 struct bio *tbio, *fbio;
2012 int vcnt;
2013 struct page **tpages, **fpages;
2014
2015 atomic_set(&r10_bio->remaining, 1);
2016
2017
2018 for (i=0; i<conf->copies; i++)
2019 if (!r10_bio->devs[i].bio->bi_status)
2020 break;
2021
2022 if (i == conf->copies)
2023 goto done;
2024
2025 first = i;
2026 fbio = r10_bio->devs[i].bio;
2027 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2028 fbio->bi_iter.bi_idx = 0;
2029 fpages = get_resync_pages(fbio)->pages;
2030
2031 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2032
2033 for (i=0 ; i < conf->copies ; i++) {
2034 int j, d;
2035 struct md_rdev *rdev;
2036 struct resync_pages *rp;
2037
2038 tbio = r10_bio->devs[i].bio;
2039
2040 if (tbio->bi_end_io != end_sync_read)
2041 continue;
2042 if (i == first)
2043 continue;
2044
2045 tpages = get_resync_pages(tbio)->pages;
2046 d = r10_bio->devs[i].devnum;
2047 rdev = conf->mirrors[d].rdev;
2048 if (!r10_bio->devs[i].bio->bi_status) {
2049
2050
2051
2052
2053 int sectors = r10_bio->sectors;
2054 for (j = 0; j < vcnt; j++) {
2055 int len = PAGE_SIZE;
2056 if (sectors < (len / 512))
2057 len = sectors * 512;
2058 if (memcmp(page_address(fpages[j]),
2059 page_address(tpages[j]),
2060 len))
2061 break;
2062 sectors -= len/512;
2063 }
2064 if (j == vcnt)
2065 continue;
2066 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2067 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2068
2069 continue;
2070 } else if (test_bit(FailFast, &rdev->flags)) {
2071
2072 md_error(rdev->mddev, rdev);
2073 continue;
2074 }
2075
2076
2077
2078
2079
2080 rp = get_resync_pages(tbio);
2081 bio_reset(tbio);
2082
2083 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2084
2085 rp->raid_bio = r10_bio;
2086 tbio->bi_private = rp;
2087 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2088 tbio->bi_end_io = end_sync_write;
2089 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2090
2091 bio_copy_data(tbio, fbio);
2092
2093 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2094 atomic_inc(&r10_bio->remaining);
2095 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2096
2097 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2098 tbio->bi_opf |= MD_FAILFAST;
2099 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2100 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2101 generic_make_request(tbio);
2102 }
2103
2104
2105
2106
2107 for (i = 0; i < conf->copies; i++) {
2108 int d;
2109
2110 tbio = r10_bio->devs[i].repl_bio;
2111 if (!tbio || !tbio->bi_end_io)
2112 continue;
2113 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2114 && r10_bio->devs[i].bio != fbio)
2115 bio_copy_data(tbio, fbio);
2116 d = r10_bio->devs[i].devnum;
2117 atomic_inc(&r10_bio->remaining);
2118 md_sync_acct(conf->mirrors[d].replacement->bdev,
2119 bio_sectors(tbio));
2120 generic_make_request(tbio);
2121 }
2122
2123done:
2124 if (atomic_dec_and_test(&r10_bio->remaining)) {
2125 md_done_sync(mddev, r10_bio->sectors, 1);
2126 put_buf(r10_bio);
2127 }
2128}

/*
 * Recovery path.  fix_recovery_read_error() handles the case where the
 * initial recovery read (devs[0]) did not complete successfully: the range
 * is retried one page at a time, re-reading from the source device and
 * writing each page to the device being recovered (devs[1]).  Pages that
 * cannot be transferred are recorded as bad blocks on the affected device,
 * and if even that fails the recovery of this device is aborted.
 */
2140static void fix_recovery_read_error(struct r10bio *r10_bio)
2141{
2142
2143
2144
2145
2146
2147
2148
2149 struct mddev *mddev = r10_bio->mddev;
2150 struct r10conf *conf = mddev->private;
2151 struct bio *bio = r10_bio->devs[0].bio;
2152 sector_t sect = 0;
2153 int sectors = r10_bio->sectors;
2154 int idx = 0;
2155 int dr = r10_bio->devs[0].devnum;
2156 int dw = r10_bio->devs[1].devnum;
2157 struct page **pages = get_resync_pages(bio)->pages;
2158
2159 while (sectors) {
2160 int s = sectors;
2161 struct md_rdev *rdev;
2162 sector_t addr;
2163 int ok;
2164
2165 if (s > (PAGE_SIZE>>9))
2166 s = PAGE_SIZE >> 9;
2167
2168 rdev = conf->mirrors[dr].rdev;
2169 addr = r10_bio->devs[0].addr + sect,
2170 ok = sync_page_io(rdev,
2171 addr,
2172 s << 9,
2173 pages[idx],
2174 REQ_OP_READ, 0, false);
2175 if (ok) {
2176 rdev = conf->mirrors[dw].rdev;
2177 addr = r10_bio->devs[1].addr + sect;
2178 ok = sync_page_io(rdev,
2179 addr,
2180 s << 9,
2181 pages[idx],
2182 REQ_OP_WRITE, 0, false);
2183 if (!ok) {
2184 set_bit(WriteErrorSeen, &rdev->flags);
2185 if (!test_and_set_bit(WantReplacement,
2186 &rdev->flags))
2187 set_bit(MD_RECOVERY_NEEDED,
2188 &rdev->mddev->recovery);
2189 }
2190 }
2191 if (!ok) {
2192
2193
2194
2195
2196 rdev_set_badblocks(rdev, addr, s, 0);
2197
2198 if (rdev != conf->mirrors[dw].rdev) {
2199
2200 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2201 addr = r10_bio->devs[1].addr + sect;
2202 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2203 if (!ok) {
2204
2205 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2206 mdname(mddev));
2207
2208 conf->mirrors[dw].recovery_disabled
2209 = mddev->recovery_disabled;
2210 set_bit(MD_RECOVERY_INTR,
2211 &mddev->recovery);
2212 break;
2213 }
2214 }
2215 }
2216
2217 sectors -= s;
2218 sect += s;
2219 idx++;
2220 }
2221}
2222
2223static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2224{
2225 struct r10conf *conf = mddev->private;
2226 int d;
2227 struct bio *wbio, *wbio2;
2228
2229 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2230 fix_recovery_read_error(r10_bio);
2231 end_sync_request(r10_bio);
2232 return;
2233 }
2234
2235
2236
2237
2238
2239 d = r10_bio->devs[1].devnum;
2240 wbio = r10_bio->devs[1].bio;
2241 wbio2 = r10_bio->devs[1].repl_bio;
2242
2243
2244
2245
2246 if (wbio2 && !wbio2->bi_end_io)
2247 wbio2 = NULL;
2248 if (wbio->bi_end_io) {
2249 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2250 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2251 generic_make_request(wbio);
2252 }
2253 if (wbio2) {
2254 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2255 md_sync_acct(conf->mirrors[d].replacement->bdev,
2256 bio_sectors(wbio2));
2257 generic_make_request(wbio2);
2258 }
2259}

/*
 * check_decay_read_errors(): the per-rdev read error count decays by a
 * factor of two for every hour that has passed since the last error, so
 * that old, sporadic errors do not eventually push a device over the
 * max_corr_read_errors limit.
 */
2267static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2268{
2269 long cur_time_mon;
2270 unsigned long hours_since_last;
2271 unsigned int read_errors = atomic_read(&rdev->read_errors);
2272
2273 cur_time_mon = ktime_get_seconds();
2274
2275 if (rdev->last_read_error == 0) {
2276
2277 rdev->last_read_error = cur_time_mon;
2278 return;
2279 }
2280
2281 hours_since_last = (long)(cur_time_mon -
2282 rdev->last_read_error) / 3600;
2283
2284 rdev->last_read_error = cur_time_mon;
2285
2286
2287
2288
2289
2290
2291 if (hours_since_last >= 8 * sizeof(read_errors))
2292 atomic_set(&rdev->read_errors, 0);
2293 else
2294 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2295}
2296
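/*
 * r10_sync_page_io(): synchronous single-range read or write used by the
 * read-error fixup path.  Returns 1 on success, 0 if the I/O failed (the
 * range is then marked bad, possibly failing the device), and -1 if the
 * range already overlaps a known bad block (for reads, or for writes on a
 * device that has already seen write errors) so no I/O was attempted.
 */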
2297static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2298 int sectors, struct page *page, int rw)
2299{
2300 sector_t first_bad;
2301 int bad_sectors;
2302
2303 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2304 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2305 return -1;
2306 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2307
2308 return 1;
2309 if (rw == WRITE) {
2310 set_bit(WriteErrorSeen, &rdev->flags);
2311 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2312 set_bit(MD_RECOVERY_NEEDED,
2313 &rdev->mddev->recovery);
2314 }
2315
2316 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2317 md_error(rdev->mddev, rdev);
2318 return 0;
2319}

/*
 * fix_read_error() runs in raid10d context after a failed read.  It first
 * decays and updates the rdev's read error count (failing the device if the
 * max_corr_read_errors threshold is exceeded), then, PAGE_SIZE at a time,
 * reads the data from another in-sync mirror, writes it back over the other
 * copies and reads it back to verify, recording bad blocks or failing the
 * device when correction is impossible.
 */
2329static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2330{
2331 int sect = 0;
2332 int sectors = r10_bio->sectors;
2333 struct md_rdev *rdev;
2334 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2335 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2336
2337
2338
2339
2340 rdev = conf->mirrors[d].rdev;
2341
2342 if (test_bit(Faulty, &rdev->flags))
2343
2344
2345 return;
2346
2347 check_decay_read_errors(mddev, rdev);
2348 atomic_inc(&rdev->read_errors);
2349 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2350 char b[BDEVNAME_SIZE];
2351 bdevname(rdev->bdev, b);
2352
2353 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2354 mdname(mddev), b,
2355 atomic_read(&rdev->read_errors), max_read_errors);
2356 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2357 mdname(mddev), b);
2358 md_error(mddev, rdev);
2359 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2360 return;
2361 }
2362
2363 while(sectors) {
2364 int s = sectors;
2365 int sl = r10_bio->read_slot;
2366 int success = 0;
2367 int start;
2368
2369 if (s > (PAGE_SIZE>>9))
2370 s = PAGE_SIZE >> 9;
2371
2372 rcu_read_lock();
2373 do {
2374 sector_t first_bad;
2375 int bad_sectors;
2376
2377 d = r10_bio->devs[sl].devnum;
2378 rdev = rcu_dereference(conf->mirrors[d].rdev);
2379 if (rdev &&
2380 test_bit(In_sync, &rdev->flags) &&
2381 !test_bit(Faulty, &rdev->flags) &&
2382 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2383 &first_bad, &bad_sectors) == 0) {
2384 atomic_inc(&rdev->nr_pending);
2385 rcu_read_unlock();
2386 success = sync_page_io(rdev,
2387 r10_bio->devs[sl].addr +
2388 sect,
2389 s<<9,
2390 conf->tmppage,
2391 REQ_OP_READ, 0, false);
2392 rdev_dec_pending(rdev, mddev);
2393 rcu_read_lock();
2394 if (success)
2395 break;
2396 }
2397 sl++;
2398 if (sl == conf->copies)
2399 sl = 0;
2400 } while (!success && sl != r10_bio->read_slot);
2401 rcu_read_unlock();
2402
2403 if (!success) {
2404
2405
2406
2407
2408 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2409 rdev = conf->mirrors[dn].rdev;
2410
2411 if (!rdev_set_badblocks(
2412 rdev,
2413 r10_bio->devs[r10_bio->read_slot].addr
2414 + sect,
2415 s, 0)) {
2416 md_error(mddev, rdev);
2417 r10_bio->devs[r10_bio->read_slot].bio
2418 = IO_BLOCKED;
2419 }
2420 break;
2421 }
2422
2423 start = sl;
2424
2425 rcu_read_lock();
2426 while (sl != r10_bio->read_slot) {
2427 char b[BDEVNAME_SIZE];
2428
2429 if (sl==0)
2430 sl = conf->copies;
2431 sl--;
2432 d = r10_bio->devs[sl].devnum;
2433 rdev = rcu_dereference(conf->mirrors[d].rdev);
2434 if (!rdev ||
2435 test_bit(Faulty, &rdev->flags) ||
2436 !test_bit(In_sync, &rdev->flags))
2437 continue;
2438
2439 atomic_inc(&rdev->nr_pending);
2440 rcu_read_unlock();
2441 if (r10_sync_page_io(rdev,
2442 r10_bio->devs[sl].addr +
2443 sect,
2444 s, conf->tmppage, WRITE)
2445 == 0) {
 /* Well, this device is dead */
2447 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2448 mdname(mddev), s,
2449 (unsigned long long)(
2450 sect +
2451 choose_data_offset(r10_bio,
2452 rdev)),
2453 bdevname(rdev->bdev, b));
2454 pr_notice("md/raid10:%s: %s: failing drive\n",
2455 mdname(mddev),
2456 bdevname(rdev->bdev, b));
2457 }
2458 rdev_dec_pending(rdev, mddev);
2459 rcu_read_lock();
2460 }
2461 sl = start;
2462 while (sl != r10_bio->read_slot) {
2463 char b[BDEVNAME_SIZE];
2464
2465 if (sl==0)
2466 sl = conf->copies;
2467 sl--;
2468 d = r10_bio->devs[sl].devnum;
2469 rdev = rcu_dereference(conf->mirrors[d].rdev);
2470 if (!rdev ||
2471 test_bit(Faulty, &rdev->flags) ||
2472 !test_bit(In_sync, &rdev->flags))
2473 continue;
2474
2475 atomic_inc(&rdev->nr_pending);
2476 rcu_read_unlock();
2477 switch (r10_sync_page_io(rdev,
2478 r10_bio->devs[sl].addr +
2479 sect,
2480 s, conf->tmppage,
2481 READ)) {
2482 case 0:
 /* Well, this device is dead */
2484 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2485 mdname(mddev), s,
2486 (unsigned long long)(
2487 sect +
2488 choose_data_offset(r10_bio, rdev)),
2489 bdevname(rdev->bdev, b));
2490 pr_notice("md/raid10:%s: %s: failing drive\n",
2491 mdname(mddev),
2492 bdevname(rdev->bdev, b));
2493 break;
2494 case 1:
2495 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2496 mdname(mddev), s,
2497 (unsigned long long)(
2498 sect +
2499 choose_data_offset(r10_bio, rdev)),
2500 bdevname(rdev->bdev, b));
2501 atomic_add(s, &rdev->corrected_errors);
2502 }
2503
2504 rdev_dec_pending(rdev, mddev);
2505 rcu_read_lock();
2506 }
2507 rcu_read_unlock();
2508
2509 sectors -= s;
2510 sect += s;
2511 }
2512}
2513
2514static int narrow_write_error(struct r10bio *r10_bio, int i)
2515{
2516 struct bio *bio = r10_bio->master_bio;
2517 struct mddev *mddev = r10_bio->mddev;
2518 struct r10conf *conf = mddev->private;
2519 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2520
 /* bio is the master bio that contains the data to be written to
  * slot 'i' where we just had a write error.
  * We repeatedly clone the bio and trim it down to one block,
  * then try the write.  Where the write fails we record
  * a bad block.
  * It is conceivable that the bio doesn't exactly align with
  * blocks.  We must handle this.
  *
  * We currently own a reference to the rdev.
  */
2531 int block_sectors;
2532 sector_t sector;
2533 int sectors;
2534 int sect_to_write = r10_bio->sectors;
2535 int ok = 1;
2536
2537 if (rdev->badblocks.shift < 0)
2538 return 0;
2539
2540 block_sectors = roundup(1 << rdev->badblocks.shift,
2541 bdev_logical_block_size(rdev->bdev) >> 9);
2542 sector = r10_bio->sector;
2543 sectors = ((r10_bio->sector + block_sectors)
2544 & ~(sector_t)(block_sectors - 1))
2545 - sector;
2546
2547 while (sect_to_write) {
2548 struct bio *wbio;
2549 sector_t wsector;
2550 if (sectors > sect_to_write)
2551 sectors = sect_to_write;
2552
2553 wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
2554 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2555 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2556 wbio->bi_iter.bi_sector = wsector +
2557 choose_data_offset(r10_bio, rdev);
2558 bio_set_dev(wbio, rdev->bdev);
2559 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2560
2561 if (submit_bio_wait(wbio) < 0)
2562
2563 ok = rdev_set_badblocks(rdev, wsector,
2564 sectors, 0)
2565 && ok;
2566
2567 bio_put(wbio);
2568 sect_to_write -= sectors;
2569 sector += sectors;
2570 sectors = block_sectors;
2571 }
2572 return ok;
2573}
2574
2575static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2576{
2577 int slot = r10_bio->read_slot;
2578 struct bio *bio;
2579 struct r10conf *conf = mddev->private;
2580 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2581
 /* We got a read error. Maybe the drive is bad, maybe just
  * the block and we can fix it.
  * We freeze all other IO, and try reading the block from
  * other devices.  When we find one, we re-write
  * and check that this fixes the read error.
  * This is all done synchronously while the array is
  * frozen.
  */
2590 bio = r10_bio->devs[slot].bio;
2591 bio_put(bio);
2592 r10_bio->devs[slot].bio = NULL;
2593
2594 if (mddev->ro)
2595 r10_bio->devs[slot].bio = IO_BLOCKED;
2596 else if (!test_bit(FailFast, &rdev->flags)) {
2597 freeze_array(conf, 1);
2598 fix_read_error(conf, mddev, r10_bio);
2599 unfreeze_array(conf);
2600 } else
2601 md_error(mddev, rdev);
2602
2603 rdev_dec_pending(rdev, mddev);
2604 allow_barrier(conf);
2605 r10_bio->state = 0;
2606 raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2607}
2608
2609static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2610{
 /* Some sort of write request has finished and it
  * succeeded in writing where we thought there was a
  * bad block.  So forget the bad block.
  * Or possibly it failed and we need to record
  * a bad block.
  */
2617 int m;
2618 struct md_rdev *rdev;
2619
2620 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2621 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2622 for (m = 0; m < conf->copies; m++) {
2623 int dev = r10_bio->devs[m].devnum;
2624 rdev = conf->mirrors[dev].rdev;
2625 if (r10_bio->devs[m].bio == NULL ||
2626 r10_bio->devs[m].bio->bi_end_io == NULL)
2627 continue;
2628 if (!r10_bio->devs[m].bio->bi_status) {
2629 rdev_clear_badblocks(
2630 rdev,
2631 r10_bio->devs[m].addr,
2632 r10_bio->sectors, 0);
2633 } else {
2634 if (!rdev_set_badblocks(
2635 rdev,
2636 r10_bio->devs[m].addr,
2637 r10_bio->sectors, 0))
2638 md_error(conf->mddev, rdev);
2639 }
2640 rdev = conf->mirrors[dev].replacement;
2641 if (r10_bio->devs[m].repl_bio == NULL ||
2642 r10_bio->devs[m].repl_bio->bi_end_io == NULL)
2643 continue;
2644
2645 if (!r10_bio->devs[m].repl_bio->bi_status) {
2646 rdev_clear_badblocks(
2647 rdev,
2648 r10_bio->devs[m].addr,
2649 r10_bio->sectors, 0);
2650 } else {
2651 if (!rdev_set_badblocks(
2652 rdev,
2653 r10_bio->devs[m].addr,
2654 r10_bio->sectors, 0))
2655 md_error(conf->mddev, rdev);
2656 }
2657 }
2658 put_buf(r10_bio);
2659 } else {
2660 bool fail = false;
2661 for (m = 0; m < conf->copies; m++) {
2662 int dev = r10_bio->devs[m].devnum;
2663 struct bio *bio = r10_bio->devs[m].bio;
2664 rdev = conf->mirrors[dev].rdev;
2665 if (bio == IO_MADE_GOOD) {
2666 rdev_clear_badblocks(
2667 rdev,
2668 r10_bio->devs[m].addr,
2669 r10_bio->sectors, 0);
2670 rdev_dec_pending(rdev, conf->mddev);
2671 } else if (bio != NULL && bio->bi_status) {
2672 fail = true;
2673 if (!narrow_write_error(r10_bio, m)) {
2674 md_error(conf->mddev, rdev);
2675 set_bit(R10BIO_Degraded,
2676 &r10_bio->state);
2677 }
2678 rdev_dec_pending(rdev, conf->mddev);
2679 }
2680 bio = r10_bio->devs[m].repl_bio;
2681 rdev = conf->mirrors[dev].replacement;
2682 if (rdev && bio == IO_MADE_GOOD) {
2683 rdev_clear_badblocks(
2684 rdev,
2685 r10_bio->devs[m].addr,
2686 r10_bio->sectors, 0);
2687 rdev_dec_pending(rdev, conf->mddev);
2688 }
2689 }
2690 if (fail) {
2691 spin_lock_irq(&conf->device_lock);
2692 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2693 conf->nr_queued++;
2694 spin_unlock_irq(&conf->device_lock);
 /*
  * In case freeze_array() is waiting for the condition
  * nr_pending == nr_queued + extra to become true.
  */
2699 wake_up(&conf->wait_barrier);
2700 md_wakeup_thread(conf->mddev->thread);
2701 } else {
2702 if (test_bit(R10BIO_WriteError,
2703 &r10_bio->state))
2704 close_write(r10_bio);
2705 raid_end_bio_io(r10_bio);
2706 }
2707 }
2708}
2709
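/*
 * raid10d is the per-array management thread: it first retires writes
 * queued on bio_end_io_list, then walks retry_list and dispatches each
 * r10_bio to the appropriate handler (write completion, reshape, resync,
 * recovery or read-error retry).
 */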
2710static void raid10d(struct md_thread *thread)
2711{
2712 struct mddev *mddev = thread->mddev;
2713 struct r10bio *r10_bio;
2714 unsigned long flags;
2715 struct r10conf *conf = mddev->private;
2716 struct list_head *head = &conf->retry_list;
2717 struct blk_plug plug;
2718
2719 md_check_recovery(mddev);
2720
2721 if (!list_empty_careful(&conf->bio_end_io_list) &&
2722 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2723 LIST_HEAD(tmp);
2724 spin_lock_irqsave(&conf->device_lock, flags);
2725 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2726 while (!list_empty(&conf->bio_end_io_list)) {
2727 list_move(conf->bio_end_io_list.prev, &tmp);
2728 conf->nr_queued--;
2729 }
2730 }
2731 spin_unlock_irqrestore(&conf->device_lock, flags);
2732 while (!list_empty(&tmp)) {
2733 r10_bio = list_first_entry(&tmp, struct r10bio,
2734 retry_list);
2735 list_del(&r10_bio->retry_list);
2736 if (mddev->degraded)
2737 set_bit(R10BIO_Degraded, &r10_bio->state);
2738
2739 if (test_bit(R10BIO_WriteError,
2740 &r10_bio->state))
2741 close_write(r10_bio);
2742 raid_end_bio_io(r10_bio);
2743 }
2744 }
2745
2746 blk_start_plug(&plug);
2747 for (;;) {
2748
2749 flush_pending_writes(conf);
2750
2751 spin_lock_irqsave(&conf->device_lock, flags);
2752 if (list_empty(head)) {
2753 spin_unlock_irqrestore(&conf->device_lock, flags);
2754 break;
2755 }
2756 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2757 list_del(head->prev);
2758 conf->nr_queued--;
2759 spin_unlock_irqrestore(&conf->device_lock, flags);
2760
2761 mddev = r10_bio->mddev;
2762 conf = mddev->private;
2763 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2764 test_bit(R10BIO_WriteError, &r10_bio->state))
2765 handle_write_completed(conf, r10_bio);
2766 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2767 reshape_request_write(mddev, r10_bio);
2768 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2769 sync_request_write(mddev, r10_bio);
2770 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2771 recovery_request_write(mddev, r10_bio);
2772 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2773 handle_read_error(mddev, r10_bio);
2774 else
2775 WARN_ON_ONCE(1);
2776
2777 cond_resched();
2778 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2779 md_check_recovery(mddev);
2780 }
2781 blk_finish_plug(&plug);
2782}
2783
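/*
 * Set up the mempool of resync buffers (r10buf_pool) used by
 * raid10_sync_request(); called lazily before the first resync,
 * recovery or reshape pass.
 */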
2784static int init_resync(struct r10conf *conf)
2785{
2786 int ret, buffs, i;
2787
2788 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2789 BUG_ON(mempool_initialized(&conf->r10buf_pool));
2790 conf->have_replacement = 0;
2791 for (i = 0; i < conf->geo.raid_disks; i++)
2792 if (conf->mirrors[i].replacement)
2793 conf->have_replacement = 1;
2794 ret = mempool_init(&conf->r10buf_pool, buffs,
2795 r10buf_pool_alloc, r10buf_pool_free, conf);
2796 if (ret)
2797 return ret;
2798 conf->next_resync = 0;
2799 return 0;
2800}
2801
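/*
 * Take an r10bio from the resync buffer pool and reset its pre-allocated
 * bios for reuse, preserving the bi_private link to the resync pages.
 */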
2802static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
2803{
2804 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
 struct resync_pages *rp;
2806 struct bio *bio;
2807 int nalloc;
2808 int i;
2809
2810 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
2811 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
2812 nalloc = conf->copies;
2813 else
2814 nalloc = 2;
2815
2816 for (i = 0; i < nalloc; i++) {
2817 bio = r10bio->devs[i].bio;
2818 rp = bio->bi_private;
2819 bio_reset(bio);
2820 bio->bi_private = rp;
2821 bio = r10bio->devs[i].repl_bio;
2822 if (bio) {
2823 rp = bio->bi_private;
2824 bio_reset(bio);
2825 bio->bi_private = rp;
2826 }
2827 }
2828 return r10bio;
2829}
2830
/*
 * Set cluster_sync_high since we need other nodes to add the
 * range [cluster_sync_low, cluster_sync_high] to the suspend list.
 */
2835static void raid10_set_cluster_sync_high(struct r10conf *conf)
2836{
2837 sector_t window_size;
2838 int extra_chunk, chunks;
2839
 /*
  * "Stripe" here means one pass across all member devices, so the
  * number of chunks per stripe is raid_disks / near_copies.  If we
  * sized the window in raid_disks chunks instead, it would grow
  * linearly with the number of devices and we would suspend a much
  * larger I/O range than necessary.  When raid_disks is not divisible
  * by near_copies an extra chunk is needed so that the whole stripe
  * is covered.
  */
2852 chunks = conf->geo.raid_disks / conf->geo.near_copies;
2853 if (conf->geo.raid_disks % conf->geo.near_copies == 0)
2854 extra_chunk = 0;
2855 else
2856 extra_chunk = 1;
2857 window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
 /*
  * At least use a 32M window to align with raid1's scheme
  */
2862 window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
2863 CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
2864
2865 conf->cluster_sync_high = conf->cluster_sync_low + window_size;
2866}
2867
/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
 * For recovery, we iterate over physical addresses, read a good
 * value for each non-in_sync drive, and over-write.
 *
 * So, for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by
 * allocating a number of r10_bio structures, one for each out-of-sync
 * device.  As we setup these structures, we collect all bios together
 * into a list which we then process collectively to add pages, and then
 * process again to pass to generic_make_request.
 *
 * The r10_bio structures are linked using a borrowed master_bio pointer.
 * This link is counted in ->remaining.  When the r10_bio that points to
 * NULL has its remaining count decremented to 0, the whole complex
 * operation is complete.
 */
2900static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2901 int *skipped)
2902{
2903 struct r10conf *conf = mddev->private;
2904 struct r10bio *r10_bio;
2905 struct bio *biolist = NULL, *bio;
2906 sector_t max_sector, nr_sectors;
2907 int i;
2908 int max_sync;
2909 sector_t sync_blocks;
2910 sector_t sectors_skipped = 0;
2911 int chunks_skipped = 0;
2912 sector_t chunk_mask = conf->geo.chunk_mask;
2913 int page_idx = 0;
2914
2915 if (!mempool_initialized(&conf->r10buf_pool))
2916 if (init_resync(conf))
2917 return 0;
2918
 /*
  * Allow skipping a full rebuild for incremental assembly
  * of a clean array, like RAID1 does.
  */
2923 if (mddev->bitmap == NULL &&
2924 mddev->recovery_cp == MaxSector &&
2925 mddev->reshape_position == MaxSector &&
2926 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2927 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2928 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2929 conf->fullsync == 0) {
2930 *skipped = 1;
2931 return mddev->dev_sectors - sector_nr;
2932 }
2933
2934 skipped:
2935 max_sector = mddev->dev_sectors;
2936 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2937 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2938 max_sector = mddev->resync_max_sectors;
2939 if (sector_nr >= max_sector) {
2940 conf->cluster_sync_low = 0;
2941 conf->cluster_sync_high = 0;
2942
 /* If we aborted, we need to abort the
  * sync on the 'current' bitmap chunks (there can
  * be several when recovering multiple devices),
  * as we may have started syncing them but not finished.
  * We can find the current address in
  * mddev->curr_resync, but for recovery,
  * we need to convert that to several
  * virtual addresses.
  */
2952 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2953 end_reshape(conf);
2954 close_sync(conf);
2955 return 0;
2956 }
2957
2958 if (mddev->curr_resync < max_sector) {
2959 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2960 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2961 &sync_blocks, 1);
2962 else for (i = 0; i < conf->geo.raid_disks; i++) {
2963 sector_t sect =
2964 raid10_find_virt(conf, mddev->curr_resync, i);
2965 md_bitmap_end_sync(mddev->bitmap, sect,
2966 &sync_blocks, 1);
2967 }
2968 } else {
2969
2970 if ((!mddev->bitmap || conf->fullsync)
2971 && conf->have_replacement
2972 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 /* Completed a full sync, so the replacements
  * are now fully recovered.
  */
2976 rcu_read_lock();
2977 for (i = 0; i < conf->geo.raid_disks; i++) {
2978 struct md_rdev *rdev =
2979 rcu_dereference(conf->mirrors[i].replacement);
2980 if (rdev)
2981 rdev->recovery_offset = MaxSector;
2982 }
2983 rcu_read_unlock();
2984 }
2985 conf->fullsync = 0;
2986 }
2987 md_bitmap_close_sync(mddev->bitmap);
2988 close_sync(conf);
2989 *skipped = 1;
2990 return sectors_skipped;
2991 }
2992
2993 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2994 return reshape_request(mddev, sector_nr, skipped);
2995
2996 if (chunks_skipped >= conf->geo.raid_disks) {
 /* if there has been nothing to do on any drive,
  * then there is nothing to do at all..
  */
3000 *skipped = 1;
3001 return (max_sector - sector_nr) + sectors_skipped;
3002 }
3003
3004 if (max_sector > mddev->resync_max)
3005 max_sector = mddev->resync_max;
3006
 /* make sure whole request will fit in a chunk - if chunks
  * are meaningful
  */
3010 if (conf->geo.near_copies < conf->geo.raid_disks &&
3011 max_sector > (sector_nr | chunk_mask))
3012 max_sector = (sector_nr | chunk_mask) + 1;
3013
 /*
  * If there is non-resync activity waiting for a turn, then let it
  * through before starting on this new sync request.
  */
3018 if (conf->nr_waiting)
3019 schedule_timeout_uninterruptible(1);
3020
 /* Again, very different code for resync and recovery.
  * Both must result in an r10bio with a list of bios that
  * have bi_end_io, bi_sector and the target device set,
  * and bi_private set to the r10bio.
  * For recovery, we may actually create several r10bios
  * with 2 bios in each, that correspond to the bios in the main one.
  * In this case, the subordinate r10bios link back through a
  * borrowed master_bio pointer, and the counter in the master
  * includes a ref from each subordinate.
  *
  * First, we decide what to do and set ->bi_end_io
  * to end_sync_read if we want to read, and
  * end_sync_write if we will want to write.
  */
3036 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3037 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3038
3039 int j;
3040 r10_bio = NULL;
3041
3042 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3043 int still_degraded;
3044 struct r10bio *rb2;
3045 sector_t sect;
3046 int must_sync;
3047 int any_working;
3048 int need_recover = 0;
3049 int need_replace = 0;
3050 struct raid10_info *mirror = &conf->mirrors[i];
3051 struct md_rdev *mrdev, *mreplace;
3052
3053 rcu_read_lock();
3054 mrdev = rcu_dereference(mirror->rdev);
3055 mreplace = rcu_dereference(mirror->replacement);
3056
3057 if (mrdev != NULL &&
3058 !test_bit(Faulty, &mrdev->flags) &&
3059 !test_bit(In_sync, &mrdev->flags))
3060 need_recover = 1;
3061 if (mreplace != NULL &&
3062 !test_bit(Faulty, &mreplace->flags))
3063 need_replace = 1;
3064
3065 if (!need_recover && !need_replace) {
3066 rcu_read_unlock();
3067 continue;
3068 }
3069
3070 still_degraded = 0;
3071
3072 rb2 = r10_bio;
3073 sect = raid10_find_virt(conf, sector_nr, i);
3074 if (sect >= mddev->resync_max_sectors) {
 /* last stripe is not complete - don't
  * try to recover this sector.
  */
3078 rcu_read_unlock();
3079 continue;
3080 }
3081 if (mreplace && test_bit(Faulty, &mreplace->flags))
3082 mreplace = NULL;
 /* Unless we are doing a full sync, or a replacement,
  * we only need to recover the block if it is set in
  * the bitmap.
  */
3087 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3088 &sync_blocks, 1);
3089 if (sync_blocks < max_sync)
3090 max_sync = sync_blocks;
3091 if (!must_sync &&
3092 mreplace == NULL &&
3093 !conf->fullsync) {
 /* yep, skip the sync_blocks here, but don't assume
  * that there will never be anything to do here
  */
3097 chunks_skipped = -1;
3098 rcu_read_unlock();
3099 continue;
3100 }
3101 atomic_inc(&mrdev->nr_pending);
3102 if (mreplace)
3103 atomic_inc(&mreplace->nr_pending);
3104 rcu_read_unlock();
3105
3106 r10_bio = raid10_alloc_init_r10buf(conf);
3107 r10_bio->state = 0;
3108 raise_barrier(conf, rb2 != NULL);
3109 atomic_set(&r10_bio->remaining, 0);
3110
3111 r10_bio->master_bio = (struct bio*)rb2;
3112 if (rb2)
3113 atomic_inc(&rb2->remaining);
3114 r10_bio->mddev = mddev;
3115 set_bit(R10BIO_IsRecover, &r10_bio->state);
3116 r10_bio->sector = sect;
3117
3118 raid10_find_phys(conf, r10_bio);
3119
 /* Need to check if the array will still be
  * degraded
  */
3123 rcu_read_lock();
3124 for (j = 0; j < conf->geo.raid_disks; j++) {
3125 struct md_rdev *rdev = rcu_dereference(
3126 conf->mirrors[j].rdev);
3127 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3128 still_degraded = 1;
3129 break;
3130 }
3131 }
3132
3133 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3134 &sync_blocks, still_degraded);
3135
3136 any_working = 0;
3137 for (j=0; j<conf->copies;j++) {
3138 int k;
3139 int d = r10_bio->devs[j].devnum;
3140 sector_t from_addr, to_addr;
3141 struct md_rdev *rdev =
3142 rcu_dereference(conf->mirrors[d].rdev);
3143 sector_t sector, first_bad;
3144 int bad_sectors;
3145 if (!rdev ||
3146 !test_bit(In_sync, &rdev->flags))
3147 continue;
3148
3149 any_working = 1;
3150 sector = r10_bio->devs[j].addr;
3151
3152 if (is_badblock(rdev, sector, max_sync,
3153 &first_bad, &bad_sectors)) {
3154 if (first_bad > sector)
3155 max_sync = first_bad - sector;
3156 else {
3157 bad_sectors -= (sector
3158 - first_bad);
3159 if (max_sync > bad_sectors)
3160 max_sync = bad_sectors;
3161 continue;
3162 }
3163 }
3164 bio = r10_bio->devs[0].bio;
3165 bio->bi_next = biolist;
3166 biolist = bio;
3167 bio->bi_end_io = end_sync_read;
3168 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3169 if (test_bit(FailFast, &rdev->flags))
3170 bio->bi_opf |= MD_FAILFAST;
3171 from_addr = r10_bio->devs[j].addr;
3172 bio->bi_iter.bi_sector = from_addr +
3173 rdev->data_offset;
3174 bio_set_dev(bio, rdev->bdev);
3175 atomic_inc(&rdev->nr_pending);
3176
3177
3178 for (k=0; k<conf->copies; k++)
3179 if (r10_bio->devs[k].devnum == i)
3180 break;
3181 BUG_ON(k == conf->copies);
3182 to_addr = r10_bio->devs[k].addr;
3183 r10_bio->devs[0].devnum = d;
3184 r10_bio->devs[0].addr = from_addr;
3185 r10_bio->devs[1].devnum = i;
3186 r10_bio->devs[1].addr = to_addr;
3187
3188 if (need_recover) {
3189 bio = r10_bio->devs[1].bio;
3190 bio->bi_next = biolist;
3191 biolist = bio;
3192 bio->bi_end_io = end_sync_write;
3193 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3194 bio->bi_iter.bi_sector = to_addr
3195 + mrdev->data_offset;
3196 bio_set_dev(bio, mrdev->bdev);
3197 atomic_inc(&r10_bio->remaining);
3198 } else
3199 r10_bio->devs[1].bio->bi_end_io = NULL;
3200
3201
3202 bio = r10_bio->devs[1].repl_bio;
3203 if (bio)
3204 bio->bi_end_io = NULL;
 /* Note: if need_replace, then bio
  * cannot be NULL as r10buf_pool_alloc will
  * have allocated it.
  */
3209 if (!need_replace)
3210 break;
3211 bio->bi_next = biolist;
3212 biolist = bio;
3213 bio->bi_end_io = end_sync_write;
3214 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3215 bio->bi_iter.bi_sector = to_addr +
3216 mreplace->data_offset;
3217 bio_set_dev(bio, mreplace->bdev);
3218 atomic_inc(&r10_bio->remaining);
3219 break;
3220 }
3221 rcu_read_unlock();
3222 if (j == conf->copies) {
 /* Cannot recover, so abort the recovery or
  * record a bad block */
3225 if (any_working) {
 /* problem is that there are bad blocks
  * on other device(s)
  */
3229 int k;
3230 for (k = 0; k < conf->copies; k++)
3231 if (r10_bio->devs[k].devnum == i)
3232 break;
3233 if (!test_bit(In_sync,
3234 &mrdev->flags)
3235 && !rdev_set_badblocks(
3236 mrdev,
3237 r10_bio->devs[k].addr,
3238 max_sync, 0))
3239 any_working = 0;
3240 if (mreplace &&
3241 !rdev_set_badblocks(
3242 mreplace,
3243 r10_bio->devs[k].addr,
3244 max_sync, 0))
3245 any_working = 0;
3246 }
3247 if (!any_working) {
3248 if (!test_and_set_bit(MD_RECOVERY_INTR,
3249 &mddev->recovery))
3250 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3251 mdname(mddev));
3252 mirror->recovery_disabled
3253 = mddev->recovery_disabled;
3254 }
3255 put_buf(r10_bio);
3256 if (rb2)
3257 atomic_dec(&rb2->remaining);
3258 r10_bio = rb2;
3259 rdev_dec_pending(mrdev, mddev);
3260 if (mreplace)
3261 rdev_dec_pending(mreplace, mddev);
3262 break;
3263 }
3264 rdev_dec_pending(mrdev, mddev);
3265 if (mreplace)
3266 rdev_dec_pending(mreplace, mddev);
3267 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
 /* only want this if there is elsewhere to
  * read from. 'j' is currently the first
  * readable copy.
  */
3272 int targets = 1;
3273 for (; j < conf->copies; j++) {
3274 int d = r10_bio->devs[j].devnum;
3275 if (conf->mirrors[d].rdev &&
3276 test_bit(In_sync,
3277 &conf->mirrors[d].rdev->flags))
3278 targets++;
3279 }
3280 if (targets == 1)
3281 r10_bio->devs[0].bio->bi_opf
3282 &= ~MD_FAILFAST;
3283 }
3284 }
3285 if (biolist == NULL) {
3286 while (r10_bio) {
3287 struct r10bio *rb2 = r10_bio;
3288 r10_bio = (struct r10bio*) rb2->master_bio;
3289 rb2->master_bio = NULL;
3290 put_buf(rb2);
3291 }
3292 goto giveup;
3293 }
3294 } else {
3295
3296 int count = 0;
3297
 /*
  * Since curr_resync_completed may not be updated in
  * time, and we will set cluster_sync_low based on it,
  * check against "sector_nr + 2 * RESYNC_SECTORS" for
  * safety, which ensures curr_resync_completed is
  * updated in md_bitmap_cond_end_sync.
  */
3305 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
3306 mddev_is_clustered(mddev) &&
3307 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
3308
3309 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
3310 &sync_blocks, mddev->degraded) &&
3311 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3312 &mddev->recovery)) {
3313
3314 *skipped = 1;
3315 return sync_blocks + sectors_skipped;
3316 }
3317 if (sync_blocks < max_sync)
3318 max_sync = sync_blocks;
3319 r10_bio = raid10_alloc_init_r10buf(conf);
3320 r10_bio->state = 0;
3321
3322 r10_bio->mddev = mddev;
3323 atomic_set(&r10_bio->remaining, 0);
3324 raise_barrier(conf, 0);
3325 conf->next_resync = sector_nr;
3326
3327 r10_bio->master_bio = NULL;
3328 r10_bio->sector = sector_nr;
3329 set_bit(R10BIO_IsSync, &r10_bio->state);
3330 raid10_find_phys(conf, r10_bio);
3331 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3332
3333 for (i = 0; i < conf->copies; i++) {
3334 int d = r10_bio->devs[i].devnum;
3335 sector_t first_bad, sector;
3336 int bad_sectors;
3337 struct md_rdev *rdev;
3338
3339 if (r10_bio->devs[i].repl_bio)
3340 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3341
3342 bio = r10_bio->devs[i].bio;
3343 bio->bi_status = BLK_STS_IOERR;
3344 rcu_read_lock();
3345 rdev = rcu_dereference(conf->mirrors[d].rdev);
3346 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3347 rcu_read_unlock();
3348 continue;
3349 }
3350 sector = r10_bio->devs[i].addr;
3351 if (is_badblock(rdev, sector, max_sync,
3352 &first_bad, &bad_sectors)) {
3353 if (first_bad > sector)
3354 max_sync = first_bad - sector;
3355 else {
3356 bad_sectors -= (sector - first_bad);
3357 if (max_sync > bad_sectors)
3358 max_sync = bad_sectors;
3359 rcu_read_unlock();
3360 continue;
3361 }
3362 }
3363 atomic_inc(&rdev->nr_pending);
3364 atomic_inc(&r10_bio->remaining);
3365 bio->bi_next = biolist;
3366 biolist = bio;
3367 bio->bi_end_io = end_sync_read;
3368 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3369 if (test_bit(FailFast, &rdev->flags))
3370 bio->bi_opf |= MD_FAILFAST;
3371 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3372 bio_set_dev(bio, rdev->bdev);
3373 count++;
3374
3375 rdev = rcu_dereference(conf->mirrors[d].replacement);
3376 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3377 rcu_read_unlock();
3378 continue;
3379 }
3380 atomic_inc(&rdev->nr_pending);
3381
3382
3383 bio = r10_bio->devs[i].repl_bio;
3384 bio->bi_status = BLK_STS_IOERR;
3385
3386 sector = r10_bio->devs[i].addr;
3387 bio->bi_next = biolist;
3388 biolist = bio;
3389 bio->bi_end_io = end_sync_write;
3390 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3391 if (test_bit(FailFast, &rdev->flags))
3392 bio->bi_opf |= MD_FAILFAST;
3393 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3394 bio_set_dev(bio, rdev->bdev);
3395 count++;
3396 rcu_read_unlock();
3397 }
3398
3399 if (count < 2) {
3400 for (i=0; i<conf->copies; i++) {
3401 int d = r10_bio->devs[i].devnum;
3402 if (r10_bio->devs[i].bio->bi_end_io)
3403 rdev_dec_pending(conf->mirrors[d].rdev,
3404 mddev);
3405 if (r10_bio->devs[i].repl_bio &&
3406 r10_bio->devs[i].repl_bio->bi_end_io)
3407 rdev_dec_pending(
3408 conf->mirrors[d].replacement,
3409 mddev);
3410 }
3411 put_buf(r10_bio);
3412 biolist = NULL;
3413 goto giveup;
3414 }
3415 }
3416
3417 nr_sectors = 0;
3418 if (sector_nr + max_sync < max_sector)
3419 max_sector = sector_nr + max_sync;
3420 do {
3421 struct page *page;
3422 int len = PAGE_SIZE;
3423 if (sector_nr + (len>>9) > max_sector)
3424 len = (max_sector - sector_nr) << 9;
3425 if (len == 0)
3426 break;
3427 for (bio= biolist ; bio ; bio=bio->bi_next) {
3428 struct resync_pages *rp = get_resync_pages(bio);
3429 page = resync_fetch_page(rp, page_idx);
 /*
  * won't fail because the vec table is big enough
  * to hold all these pages
  */
3434 bio_add_page(bio, page, len, 0);
3435 }
3436 nr_sectors += len>>9;
3437 sector_nr += len>>9;
3438 } while (++page_idx < RESYNC_PAGES);
3439 r10_bio->sectors = nr_sectors;
3440
3441 if (mddev_is_clustered(mddev) &&
3442 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3443
3444 if (conf->cluster_sync_high < sector_nr + nr_sectors) {
3445 conf->cluster_sync_low = mddev->curr_resync_completed;
3446 raid10_set_cluster_sync_high(conf);
3447
3448 md_cluster_ops->resync_info_update(mddev,
3449 conf->cluster_sync_low,
3450 conf->cluster_sync_high);
3451 }
3452 } else if (mddev_is_clustered(mddev)) {
3453
3454 sector_t sect_va1, sect_va2;
3455 bool broadcast_msg = false;
3456
3457 for (i = 0; i < conf->geo.raid_disks; i++) {
 /*
  * sector_nr is a device address for recovery, so we
  * need to translate it to an array address before comparing
  * with cluster_sync_high.
  */
3463 sect_va1 = raid10_find_virt(conf, sector_nr, i);
3464
3465 if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
3466 broadcast_msg = true;
 /*
  * curr_resync_completed is similar to
  * sector_nr, so make the translation too.
  */
3471 sect_va2 = raid10_find_virt(conf,
3472 mddev->curr_resync_completed, i);
3473
3474 if (conf->cluster_sync_low == 0 ||
3475 conf->cluster_sync_low > sect_va2)
3476 conf->cluster_sync_low = sect_va2;
3477 }
3478 }
3479 if (broadcast_msg) {
3480 raid10_set_cluster_sync_high(conf);
3481 md_cluster_ops->resync_info_update(mddev,
3482 conf->cluster_sync_low,
3483 conf->cluster_sync_high);
3484 }
3485 }
3486
3487 while (biolist) {
3488 bio = biolist;
3489 biolist = biolist->bi_next;
3490
3491 bio->bi_next = NULL;
3492 r10_bio = get_resync_r10bio(bio);
3493 r10_bio->sectors = nr_sectors;
3494
3495 if (bio->bi_end_io == end_sync_read) {
3496 md_sync_acct_bio(bio, nr_sectors);
3497 bio->bi_status = 0;
3498 generic_make_request(bio);
3499 }
3500 }
3501
3502 if (sectors_skipped)
 /* pretend they weren't skipped, it makes
  * no important difference in this case
  */
3506 md_done_sync(mddev, sectors_skipped, 1);
3507
3508 return sectors_skipped + nr_sectors;
3509 giveup:
 /* There is nowhere to write, so all non-sync
  * drives must be failed or in resync, all drives
  * have a bad block, so try the next chunk...
  */
3514 if (sector_nr + max_sync < max_sector)
3515 max_sector = sector_nr + max_sync;
3516
3517 sectors_skipped += (max_sector - sector_nr);
3518 chunks_skipped ++;
3519 sector_nr = max_sector;
3520 goto skipped;
3521}
3522
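/*
 * Report the array size (in sectors) that 'raid_disks' devices of
 * 'sectors' each provide for this layout; with 0/0 the current geometry
 * and per-device size are used.
 */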
3523static sector_t
3524raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3525{
3526 sector_t size;
3527 struct r10conf *conf = mddev->private;
3528
3529 if (!raid_disks)
3530 raid_disks = min(conf->geo.raid_disks,
3531 conf->prev.raid_disks);
3532 if (!sectors)
3533 sectors = conf->dev_sectors;
3534
3535 size = sectors >> conf->geo.chunk_shift;
3536 sector_div(size, conf->geo.far_copies);
3537 size = size * raid_disks;
3538 sector_div(size, conf->geo.near_copies);
3539
3540 return size << conf->geo.chunk_shift;
3541}
3542
3543static void calc_sectors(struct r10conf *conf, sector_t size)
3544{
 /* Calculate the number of sectors-per-device that will
  * actually be used, and set conf->dev_sectors and
  * conf->stride
  */
3550 size = size >> conf->geo.chunk_shift;
3551 sector_div(size, conf->geo.far_copies);
3552 size = size * conf->geo.raid_disks;
3553 sector_div(size, conf->geo.near_copies);
 /* 'size' is now the number of chunks in the array */
 /* calculate "used chunks per device" */
3556 size = size * conf->copies;
3557
 /* We need to round up when dividing by raid_disks to
  * get the stride size.
  */
3561 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3562
3563 conf->dev_sectors = size << conf->geo.chunk_shift;
3564
3565 if (conf->geo.far_offset)
3566 conf->geo.stride = 1 << conf->geo.chunk_shift;
3567 else {
3568 sector_div(size, conf->geo.far_copies);
3569 conf->geo.stride = size << conf->geo.chunk_shift;
3570 }
3571}
3572
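/*
 * Decode a raid10 layout value into a struct geom: bits 0-7 are the
 * near-copy count, bits 8-15 the far-copy count, bit 16 selects
 * "far offset" mode and bits 17-18 pick the far-set arrangement.
 * Returns the total number of copies (nc * fc), or a negative value
 * if the layout or chunk size is invalid.
 */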
3573enum geo_type {geo_new, geo_old, geo_start};
3574static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3575{
3576 int nc, fc, fo;
3577 int layout, chunk, disks;
3578 switch (new) {
3579 case geo_old:
3580 layout = mddev->layout;
3581 chunk = mddev->chunk_sectors;
3582 disks = mddev->raid_disks - mddev->delta_disks;
3583 break;
3584 case geo_new:
3585 layout = mddev->new_layout;
3586 chunk = mddev->new_chunk_sectors;
3587 disks = mddev->raid_disks;
3588 break;
 default: /* avoid 'may be unused' warnings */
 case geo_start: /* new geometry when starting reshape - raid_disks not
		  * updated yet. */
3592 layout = mddev->new_layout;
3593 chunk = mddev->new_chunk_sectors;
3594 disks = mddev->raid_disks + mddev->delta_disks;
3595 break;
3596 }
3597 if (layout >> 19)
3598 return -1;
3599 if (chunk < (PAGE_SIZE >> 9) ||
3600 !is_power_of_2(chunk))
3601 return -2;
3602 nc = layout & 255;
3603 fc = (layout >> 8) & 255;
3604 fo = layout & (1<<16);
3605 geo->raid_disks = disks;
3606 geo->near_copies = nc;
3607 geo->far_copies = fc;
3608 geo->far_offset = fo;
3609 switch (layout >> 17) {
3610 case 0:
3611 geo->far_set_size = disks;
3612 break;
 case 1: /* "improved" layout which was buggy.  Hopefully no-one is
	  * actually using this; leave the code here just in case. */
3615 geo->far_set_size = disks/fc;
3616 WARN(geo->far_set_size < fc,
3617 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3618 break;
3619 case 2:
3620 geo->far_set_size = fc * nc;
3621 break;
3622 default:
3623 return -1;
3624 }
3625 geo->chunk_mask = chunk - 1;
3626 geo->chunk_shift = ffz(~chunk);
3627 return nc*fc;
3628}
3629
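/*
 * Allocate and initialise an r10conf for this mddev: validate the
 * requested geometry, allocate the mirrors array, bio pools and the
 * raid10d thread, and work out the previous geometry if a reshape is
 * in progress.
 */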
3630static struct r10conf *setup_conf(struct mddev *mddev)
3631{
3632 struct r10conf *conf = NULL;
3633 int err = -EINVAL;
3634 struct geom geo;
3635 int copies;
3636
3637 copies = setup_geo(&geo, mddev, geo_new);
3638
3639 if (copies == -2) {
3640 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3641 mdname(mddev), PAGE_SIZE);
3642 goto out;
3643 }
3644
3645 if (copies < 2 || copies > mddev->raid_disks) {
3646 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3647 mdname(mddev), mddev->new_layout);
3648 goto out;
3649 }
3650
3651 err = -ENOMEM;
3652 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3653 if (!conf)
3654 goto out;
3655
3656
3657 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
3658 sizeof(struct raid10_info),
3659 GFP_KERNEL);
3660 if (!conf->mirrors)
3661 goto out;
3662
3663 conf->tmppage = alloc_page(GFP_KERNEL);
3664 if (!conf->tmppage)
3665 goto out;
3666
3667 conf->geo = geo;
3668 conf->copies = copies;
3669 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
3670 rbio_pool_free, conf);
3671 if (err)
3672 goto out;
3673
3674 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
3675 if (err)
3676 goto out;
3677
3678 calc_sectors(conf, mddev->dev_sectors);
3679 if (mddev->reshape_position == MaxSector) {
3680 conf->prev = conf->geo;
3681 conf->reshape_progress = MaxSector;
3682 } else {
3683 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3684 err = -EINVAL;
3685 goto out;
3686 }
3687 conf->reshape_progress = mddev->reshape_position;
3688 if (conf->prev.far_offset)
3689 conf->prev.stride = 1 << conf->prev.chunk_shift;
3690 else
3691
3692 conf->prev.stride = conf->dev_sectors;
3693 }
3694 conf->reshape_safe = conf->reshape_progress;
3695 spin_lock_init(&conf->device_lock);
3696 INIT_LIST_HEAD(&conf->retry_list);
3697 INIT_LIST_HEAD(&conf->bio_end_io_list);
3698
3699 spin_lock_init(&conf->resync_lock);
3700 init_waitqueue_head(&conf->wait_barrier);
3701 atomic_set(&conf->nr_pending, 0);
3702
3703 err = -ENOMEM;
3704 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3705 if (!conf->thread)
3706 goto out;
3707
3708 conf->mddev = mddev;
3709 return conf;
3710
3711 out:
3712 if (conf) {
3713 mempool_exit(&conf->r10bio_pool);
3714 kfree(conf->mirrors);
3715 safe_put_page(conf->tmppage);
3716 bioset_exit(&conf->bio_split);
3717 kfree(conf);
3718 }
3719 return ERR_PTR(err);
3720}
3721
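/*
 * Start the array: bind each rdev to its slot in conf->mirrors, set up
 * queue limits, check that enough devices are present, compute the
 * array size and, if a reshape was interrupted, restart it.
 */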
3722static int raid10_run(struct mddev *mddev)
3723{
3724 struct r10conf *conf;
3725 int i, disk_idx, chunk_size;
3726 struct raid10_info *disk;
3727 struct md_rdev *rdev;
3728 sector_t size;
3729 sector_t min_offset_diff = 0;
3730 int first = 1;
3731 bool discard_supported = false;
3732
3733 if (mddev_init_writes_pending(mddev) < 0)
3734 return -ENOMEM;
3735
3736 if (mddev->private == NULL) {
3737 conf = setup_conf(mddev);
3738 if (IS_ERR(conf))
3739 return PTR_ERR(conf);
3740 mddev->private = conf;
3741 }
3742 conf = mddev->private;
3743 if (!conf)
3744 goto out;
3745
3746 if (mddev_is_clustered(conf->mddev)) {
3747 int fc, fo;
3748
3749 fc = (mddev->layout >> 8) & 255;
3750 fo = mddev->layout & (1<<16);
3751 if (fc > 1 || fo > 0) {
 pr_err("only near layout is supported by clustered raid10\n");
3754 goto out_free_conf;
3755 }
3756 }
3757
3758 mddev->thread = conf->thread;
3759 conf->thread = NULL;
3760
3761 chunk_size = mddev->chunk_sectors << 9;
3762 if (mddev->queue) {
3763 blk_queue_max_discard_sectors(mddev->queue,
3764 mddev->chunk_sectors);
3765 blk_queue_max_write_same_sectors(mddev->queue, 0);
3766 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
3767 blk_queue_io_min(mddev->queue, chunk_size);
3768 if (conf->geo.raid_disks % conf->geo.near_copies)
3769 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3770 else
3771 blk_queue_io_opt(mddev->queue, chunk_size *
3772 (conf->geo.raid_disks / conf->geo.near_copies));
3773 }
3774
3775 rdev_for_each(rdev, mddev) {
3776 long long diff;
3777
3778 disk_idx = rdev->raid_disk;
3779 if (disk_idx < 0)
3780 continue;
3781 if (disk_idx >= conf->geo.raid_disks &&
3782 disk_idx >= conf->prev.raid_disks)
3783 continue;
3784 disk = conf->mirrors + disk_idx;
3785
3786 if (test_bit(Replacement, &rdev->flags)) {
3787 if (disk->replacement)
3788 goto out_free_conf;
3789 disk->replacement = rdev;
3790 } else {
3791 if (disk->rdev)
3792 goto out_free_conf;
3793 disk->rdev = rdev;
3794 }
3795 diff = (rdev->new_data_offset - rdev->data_offset);
3796 if (!mddev->reshape_backwards)
3797 diff = -diff;
3798 if (diff < 0)
3799 diff = 0;
3800 if (first || diff < min_offset_diff)
3801 min_offset_diff = diff;
3802
3803 if (mddev->gendisk)
3804 disk_stack_limits(mddev->gendisk, rdev->bdev,
3805 rdev->data_offset << 9);
3806
3807 disk->head_position = 0;
3808
3809 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3810 discard_supported = true;
3811 first = 0;
3812 }
3813
3814 if (mddev->queue) {
3815 if (discard_supported)
3816 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
3817 mddev->queue);
3818 else
3819 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
3820 mddev->queue);
3821 }
3822
3823 if (!enough(conf, -1)) {
3824 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3825 mdname(mddev));
3826 goto out_free_conf;
3827 }
3828
3829 if (conf->reshape_progress != MaxSector) {
3830
3831 if (conf->geo.far_copies != 1 &&
3832 conf->geo.far_offset == 0)
3833 goto out_free_conf;
3834 if (conf->prev.far_copies != 1 &&
3835 conf->prev.far_offset == 0)
3836 goto out_free_conf;
3837 }
3838
3839 mddev->degraded = 0;
3840 for (i = 0;
3841 i < conf->geo.raid_disks
3842 || i < conf->prev.raid_disks;
3843 i++) {
3844
3845 disk = conf->mirrors + i;
3846
3847 if (!disk->rdev && disk->replacement) {
3848
3849 disk->rdev = disk->replacement;
3850 disk->replacement = NULL;
3851 clear_bit(Replacement, &disk->rdev->flags);
3852 }
3853
3854 if (!disk->rdev ||
3855 !test_bit(In_sync, &disk->rdev->flags)) {
3856 disk->head_position = 0;
3857 mddev->degraded++;
3858 if (disk->rdev &&
3859 disk->rdev->saved_raid_disk < 0)
3860 conf->fullsync = 1;
3861 }
3862
3863 if (disk->replacement &&
3864 !test_bit(In_sync, &disk->replacement->flags) &&
3865 disk->replacement->saved_raid_disk < 0) {
3866 conf->fullsync = 1;
3867 }
3868
3869 disk->recovery_disabled = mddev->recovery_disabled - 1;
3870 }
3871
3872 if (mddev->recovery_cp != MaxSector)
3873 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3874 mdname(mddev));
3875 pr_info("md/raid10:%s: active with %d out of %d devices\n",
3876 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3877 conf->geo.raid_disks);
3878
 /*
  * Ok, everything is just fine now
  */
3881 mddev->dev_sectors = conf->dev_sectors;
3882 size = raid10_size(mddev, 0, 0);
3883 md_set_array_sectors(mddev, size);
3884 mddev->resync_max_sectors = size;
3885 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3886
3887 if (mddev->queue) {
3888 int stripe = conf->geo.raid_disks *
3889 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3890
 /* Calculate max read-ahead size.
  * We need to readahead at least twice a whole stripe....
  * maybe...
  */
3895 stripe /= conf->geo.near_copies;
3896 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
3897 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
3898 }
3899
3900 if (md_integrity_register(mddev))
3901 goto out_free_conf;
3902
3903 if (conf->reshape_progress != MaxSector) {
3904 unsigned long before_length, after_length;
3905
3906 before_length = ((1 << conf->prev.chunk_shift) *
3907 conf->prev.far_copies);
3908 after_length = ((1 << conf->geo.chunk_shift) *
3909 conf->geo.far_copies);
3910
3911 if (max(before_length, after_length) > min_offset_diff) {
3912
3913 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3914 goto out_free_conf;
3915 }
3916 conf->offset_diff = min_offset_diff;
3917
3918 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3919 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3920 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3921 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3922 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3923 "reshape");
3924 if (!mddev->sync_thread)
3925 goto out_free_conf;
3926 }
3927
3928 return 0;
3929
3930out_free_conf:
3931 md_unregister_thread(&mddev->thread);
3932 mempool_exit(&conf->r10bio_pool);
3933 safe_put_page(conf->tmppage);
3934 kfree(conf->mirrors);
3935 kfree(conf);
3936 mddev->private = NULL;
3937out:
3938 return -EIO;
3939}
3940
3941static void raid10_free(struct mddev *mddev, void *priv)
3942{
3943 struct r10conf *conf = priv;
3944
3945 mempool_exit(&conf->r10bio_pool);
3946 safe_put_page(conf->tmppage);
3947 kfree(conf->mirrors);
3948 kfree(conf->mirrors_old);
3949 kfree(conf->mirrors_new);
3950 bioset_exit(&conf->bio_split);
3951 kfree(conf);
3952}
3953
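/*
 * Quiesce (or resume) the array by raising/lowering the resync barrier,
 * which blocks all regular I/O while it is held.
 */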
3954static void raid10_quiesce(struct mddev *mddev, int quiesce)
3955{
3956 struct r10conf *conf = mddev->private;
3957
3958 if (quiesce)
3959 raise_barrier(conf, 0);
3960 else
3961 lower_barrier(conf);
3962}
3963
3964static int raid10_resize(struct mddev *mddev, sector_t sectors)
3965{
 /* Resize of 'far' arrays is not supported.
  * For 'near' and 'offset' arrays we can set the
  * number of sectors used to be an appropriate multiple
  * of the chunk size.
  * For 'offset', this is far_copies*chunksize.
  * For 'near' the multiplier is the LCM of
  * near_copies and raid_disks.
  * So if far_copies > 1 && !far_offset, fail.
  * Else find LCM(raid_disks, near_copy)*far_copies and
  * multiply by chunk_size.  Then round to this number.
  * This is mostly done by raid10_size().
  */
3978 struct r10conf *conf = mddev->private;
3979 sector_t oldsize, size;
3980
3981 if (mddev->reshape_position != MaxSector)
3982 return -EBUSY;
3983
3984 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3985 return -EINVAL;
3986
3987 oldsize = raid10_size(mddev, 0, 0);
3988 size = raid10_size(mddev, sectors, 0);
3989 if (mddev->external_size &&
3990 mddev->array_sectors > size)
3991 return -EINVAL;
3992 if (mddev->bitmap) {
3993 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
3994 if (ret)
3995 return ret;
3996 }
3997 md_set_array_sectors(mddev, size);
3998 if (sectors > mddev->dev_sectors &&
3999 mddev->recovery_cp > oldsize) {
4000 mddev->recovery_cp = oldsize;
4001 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4002 }
4003 calc_sectors(conf, sectors);
4004 mddev->dev_sectors = conf->dev_sectors;
4005 mddev->resync_max_sectors = size;
4006 return 0;
4007}
4008
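/*
 * Convert a single-zone raid0 into a degraded 2-copy 'near' raid10:
 * every raid0 member keeps its data and an empty mirror slot is added
 * beside it, to be filled in by a later recovery.
 */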
4009static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
4010{
4011 struct md_rdev *rdev;
4012 struct r10conf *conf;
4013
4014 if (mddev->degraded > 0) {
4015 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
4016 mdname(mddev));
4017 return ERR_PTR(-EINVAL);
4018 }
4019 sector_div(size, devs);
4020
4021
4022 mddev->new_level = 10;
4023
4024 mddev->new_layout = (1<<8) + 2;
4025 mddev->new_chunk_sectors = mddev->chunk_sectors;
4026 mddev->delta_disks = mddev->raid_disks;
4027 mddev->raid_disks *= 2;
4028
4029 mddev->recovery_cp = MaxSector;
4030 mddev->dev_sectors = size;
4031
4032 conf = setup_conf(mddev);
4033 if (!IS_ERR(conf)) {
4034 rdev_for_each(rdev, mddev)
4035 if (rdev->raid_disk >= 0) {
4036 rdev->new_raid_disk = rdev->raid_disk * 2;
4037 rdev->sectors = size;
4038 }
4039 conf->barrier = 1;
4040 }
4041
4042 return conf;
4043}
4044
4045static void *raid10_takeover(struct mddev *mddev)
4046{
4047 struct r0conf *raid0_conf;
4048
 /* raid10 can take over:
  *  raid0 - providing it has only one zone
  */
4052 if (mddev->level == 0) {
4053
4054 raid0_conf = mddev->private;
4055 if (raid0_conf->nr_strip_zones > 1) {
4056 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4057 mdname(mddev));
4058 return ERR_PTR(-EINVAL);
4059 }
4060 return raid10_takeover_raid0(mddev,
4061 raid0_conf->strip_zone->zone_end,
4062 raid0_conf->strip_zone->nb_dev);
4063 }
4064 return ERR_PTR(-EINVAL);
4065}
4066
4067static int raid10_check_reshape(struct mddev *mddev)
4068{
 /* Called when there is a request to change
  * - layout (to ->new_layout)
  * - chunk size (to ->new_chunk_sectors)
  * - raid_disks (by delta_disks)
  * or when trying to restart a reshape that was ongoing.
  *
  * We need to validate the request and possibly allocate
  * space if that might be an issue later.
  *
  * Currently we reject any reshape of a 'far' mode array,
  * allow the chunk size to change if the new value is generally
  * acceptable, allow raid_disks to increase, and allow
  * a switch between 'near' mode and 'offset' mode.
  */
4083 struct r10conf *conf = mddev->private;
4084 struct geom geo;
4085
4086 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4087 return -EINVAL;
4088
4089 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
 /* Cannot change number of copies */
4091 return -EINVAL;
4092 if (geo.far_copies > 1 && !geo.far_offset)
 /* Cannot switch to 'far' mode */
4094 return -EINVAL;
4095
4096 if (mddev->array_sectors & geo.chunk_mask)
 /* not a factor of the array size */
4098 return -EINVAL;
4099
4100 if (!enough(conf, -1))
4101 return -EINVAL;
4102
4103 kfree(conf->mirrors_new);
4104 conf->mirrors_new = NULL;
4105 if (mddev->delta_disks > 0) {
 /* allocate a new 'mirrors' list */
4107 conf->mirrors_new =
4108 kcalloc(mddev->raid_disks + mddev->delta_disks,
4109 sizeof(struct raid10_info),
4110 GFP_KERNEL);
4111 if (!conf->mirrors_new)
4112 return -ENOMEM;
4113 }
4114 return 0;
4115}
4116
/*
 * Need to check if the array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
4130static int calc_degraded(struct r10conf *conf)
4131{
4132 int degraded, degraded2;
4133 int i;
4134
4135 rcu_read_lock();
4136 degraded = 0;
4137
4138 for (i = 0; i < conf->prev.raid_disks; i++) {
4139 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4140 if (!rdev || test_bit(Faulty, &rdev->flags))
4141 degraded++;
4142 else if (!test_bit(In_sync, &rdev->flags))
 /* When we can reduce the number of devices in
  * an array, this might not contribute to
  * 'degraded'.  It does now.
  */
4147 degraded++;
4148 }
4149 rcu_read_unlock();
4150 if (conf->geo.raid_disks == conf->prev.raid_disks)
4151 return degraded;
4152 rcu_read_lock();
4153 degraded2 = 0;
4154 for (i = 0; i < conf->geo.raid_disks; i++) {
4155 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4156 if (!rdev || test_bit(Faulty, &rdev->flags))
4157 degraded2++;
4158 else if (!test_bit(In_sync, &rdev->flags)) {
 /* If reshape is increasing the number of devices,
  * this section has already been recovered, so
  * it doesn't contribute to degraded.
  * Else it does.
  */
4164 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4165 degraded2++;
4166 }
4167 }
4168 rcu_read_unlock();
4169 if (degraded2 > degraded)
4170 return degraded2;
4171 return degraded;
4172}
4173
4174static int raid10_start_reshape(struct mddev *mddev)
4175{
 /* A 'reshape' has been requested. This commits
  * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
  * This also checks if there are enough spares and adds them
  * to the bitmap if required.
  *
  * We currently require enough spares to make the final
  * array non-degraded.  We also require that the difference
  * between old and new data_offset - on each device - is
  * enough that we never risk over-writing.
  */
4186 unsigned long before_length, after_length;
4187 sector_t min_offset_diff = 0;
4188 int first = 1;
4189 struct geom new;
4190 struct r10conf *conf = mddev->private;
4191 struct md_rdev *rdev;
4192 int spares = 0;
4193 int ret;
4194
4195 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4196 return -EBUSY;
4197
4198 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4199 return -EINVAL;
4200
4201 before_length = ((1 << conf->prev.chunk_shift) *
4202 conf->prev.far_copies);
4203 after_length = ((1 << conf->geo.chunk_shift) *
4204 conf->geo.far_copies);
4205
4206 rdev_for_each(rdev, mddev) {
4207 if (!test_bit(In_sync, &rdev->flags)
4208 && !test_bit(Faulty, &rdev->flags))
4209 spares++;
4210 if (rdev->raid_disk >= 0) {
4211 long long diff = (rdev->new_data_offset
4212 - rdev->data_offset);
4213 if (!mddev->reshape_backwards)
4214 diff = -diff;
4215 if (diff < 0)
4216 diff = 0;
4217 if (first || diff < min_offset_diff)
4218 min_offset_diff = diff;
4219 first = 0;
4220 }
4221 }
4222
4223 if (max(before_length, after_length) > min_offset_diff)
4224 return -EINVAL;
4225
4226 if (spares < mddev->delta_disks)
4227 return -EINVAL;
4228
4229 conf->offset_diff = min_offset_diff;
4230 spin_lock_irq(&conf->device_lock);
4231 if (conf->mirrors_new) {
4232 memcpy(conf->mirrors_new, conf->mirrors,
4233 sizeof(struct raid10_info)*conf->prev.raid_disks);
4234 smp_mb();
4235 kfree(conf->mirrors_old);
4236 conf->mirrors_old = conf->mirrors;
4237 conf->mirrors = conf->mirrors_new;
4238 conf->mirrors_new = NULL;
4239 }
4240 setup_geo(&conf->geo, mddev, geo_start);
4241 smp_mb();
4242 if (mddev->reshape_backwards) {
4243 sector_t size = raid10_size(mddev, 0, 0);
4244 if (size < mddev->array_sectors) {
4245 spin_unlock_irq(&conf->device_lock);
 pr_warn("md/raid10:%s: array size must be reduced before number of disks\n",
4247 mdname(mddev));
4248 return -EINVAL;
4249 }
4250 mddev->resync_max_sectors = size;
4251 conf->reshape_progress = size;
4252 } else
4253 conf->reshape_progress = 0;
4254 conf->reshape_safe = conf->reshape_progress;
4255 spin_unlock_irq(&conf->device_lock);
4256
4257 if (mddev->delta_disks && mddev->bitmap) {
4258 struct mdp_superblock_1 *sb = NULL;
4259 sector_t oldsize, newsize;
4260
4261 oldsize = raid10_size(mddev, 0, 0);
4262 newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
4263
4264 if (!mddev_is_clustered(mddev)) {
4265 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4266 if (ret)
4267 goto abort;
4268 else
4269 goto out;
4270 }
4271
4272 rdev_for_each(rdev, mddev) {
4273 if (rdev->raid_disk > -1 &&
4274 !test_bit(Faulty, &rdev->flags))
4275 sb = page_address(rdev->sb_page);
4276 }
4277
 /*
  * Some node is already performing reshape, so there is no need to
  * call md_bitmap_resize again, since it will be called when
  * receiving the BITMAP_RESIZE msg.
  */
4283 if ((sb && (le32_to_cpu(sb->feature_map) &
4284 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
4285 goto out;
4286
4287 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4288 if (ret)
4289 goto abort;
4290
4291 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
4292 if (ret) {
4293 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
4294 goto abort;
4295 }
4296 }
4297out:
4298 if (mddev->delta_disks > 0) {
4299 rdev_for_each(rdev, mddev)
4300 if (rdev->raid_disk < 0 &&
4301 !test_bit(Faulty, &rdev->flags)) {
4302 if (raid10_add_disk(mddev, rdev) == 0) {
4303 if (rdev->raid_disk >=
4304 conf->prev.raid_disks)
4305 set_bit(In_sync, &rdev->flags);
4306 else
4307 rdev->recovery_offset = 0;
4308
4309 if (sysfs_link_rdev(mddev, rdev))
 /* Failure here is OK */;
4311 }
4312 } else if (rdev->raid_disk >= conf->prev.raid_disks
4313 && !test_bit(Faulty, &rdev->flags)) {
 /* This is a spare that was manually added */
4315 set_bit(In_sync, &rdev->flags);
4316 }
4317 }
4318
 /* When a reshape changes the number of devices,
  * ->degraded is measured against the larger of the
  * pre and post numbers.
  */
4322 spin_lock_irq(&conf->device_lock);
4323 mddev->degraded = calc_degraded(conf);
4324 spin_unlock_irq(&conf->device_lock);
4325 mddev->raid_disks = conf->geo.raid_disks;
4326 mddev->reshape_position = conf->reshape_progress;
4327 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4328
4329 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4330 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4331 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4332 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4333 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4334
4335 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4336 "reshape");
4337 if (!mddev->sync_thread) {
4338 ret = -EAGAIN;
4339 goto abort;
4340 }
4341 conf->reshape_checkpoint = jiffies;
4342 md_wakeup_thread(mddev->sync_thread);
4343 md_new_event(mddev);
4344 return 0;
4345
4346abort:
4347 mddev->recovery = 0;
4348 spin_lock_irq(&conf->device_lock);
4349 conf->geo = conf->prev;
4350 mddev->raid_disks = conf->geo.raid_disks;
4351 rdev_for_each(rdev, mddev)
4352 rdev->new_data_offset = rdev->data_offset;
4353 smp_wmb();
4354 conf->reshape_progress = MaxSector;
4355 conf->reshape_safe = MaxSector;
4356 mddev->reshape_position = MaxSector;
4357 spin_unlock_irq(&conf->device_lock);
4358 return ret;
4359}
4360
/* Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address that should contain a different chunk.
 */
4367static sector_t last_dev_address(sector_t s, struct geom *geo)
4368{
4369 s = (s | geo->chunk_mask) + 1;
4370 s >>= geo->chunk_shift;
4371 s *= geo->near_copies;
4372 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4373 s *= geo->far_copies;
4374 s <<= geo->chunk_shift;
4375 return s;
4376}
4377
/* Calculate the first device-address that could contain
 * any block from the chunk that includes the array-address 's'.
 * This too will be the start of a chunk.
 */
4382static sector_t first_dev_address(sector_t s, struct geom *geo)
4383{
4384 s >>= geo->chunk_shift;
4385 s *= geo->near_copies;
4386 sector_div(s, geo->raid_disks);
4387 s *= geo->far_copies;
4388 s <<= geo->chunk_shift;
4389 return s;
4390}
4391
4392static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4393 int *skipped)
4394{
 /* We simply copy at most one chunk (smallest of old and new)
  * at a time, possibly less if that exceeds RESYNC_PAGES,
  * or we hit a bad block or something.
  * This might mean we pause for normal IO in the middle of
  * a chunk, but that is not a problem because mddev->reshape_position
  * can record any location.
  *
  * If we will want to write to a location that isn't
  * yet recorded as 'safe' (i.e. in metadata on disk) then
  * we need to flush all reshape requests and update the metadata.
  *
  * When reshaping forwards (e.g. to more devices), we interpret
  * 'safe' as the earliest block which might not have been copied
  * down yet.  We divide this by the previous stripe size and multiply
  * by the previous stripe length to get the lowest device offset that
  * we cannot write to yet.
  * We interpret 'sector_nr' as an address that we want to write to.
  * From this we use last_dev_address() to find where we might
  * write to, and first_dev_address() on the 'safe' position.
  * If this 'next' write position is after the 'safe' position,
  * we must update the metadata to increase the 'safe' position.
  *
  * When reshaping backwards, we round in the opposite direction
  * and perform the reverse test: the next write position must not be
  * less than the current safe position.
  *
  * In all this the minimum difference in data offsets
  * (conf->offset_diff - always positive) allows a bit of slack,
  * so next can be after 'safe', but not by more than offset_diff.
  *
  * We need to prepare all the bios here before we start any IO
  * to ensure the size we choose is acceptable to all devices.
  * That means one write bio for each copy plus an extra one for
  * the read-in.
  * We store the read-in bio in ->master_bio and the others in
  * ->devs[x].bio and ->devs[x].repl_bio.
  */
4432 struct r10conf *conf = mddev->private;
4433 struct r10bio *r10_bio;
4434 sector_t next, safe, last;
4435 int max_sectors;
4436 int nr_sectors;
4437 int s;
4438 struct md_rdev *rdev;
4439 int need_flush = 0;
4440 struct bio *blist;
4441 struct bio *bio, *read_bio;
4442 int sectors_done = 0;
4443 struct page **pages;
4444
4445 if (sector_nr == 0) {
4446
4447 if (mddev->reshape_backwards &&
4448 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4449 sector_nr = (raid10_size(mddev, 0, 0)
4450 - conf->reshape_progress);
4451 } else if (!mddev->reshape_backwards &&
4452 conf->reshape_progress > 0)
4453 sector_nr = conf->reshape_progress;
4454 if (sector_nr) {
4455 mddev->curr_resync_completed = sector_nr;
4456 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4457 *skipped = 1;
4458 return sector_nr;
4459 }
4460 }
4461
 /* We don't use sector_nr to track where we are up to,
  * as that doesn't work well for ->reshape_backwards.
  * So just use ->reshape_progress.
  */
4466 if (mddev->reshape_backwards) {
 /* 'next' is the earliest device address that we might
  * write to for this chunk in the new layout
  */
4470 next = first_dev_address(conf->reshape_progress - 1,
4471 &conf->geo);
4472
 /* 'safe' is the last device address that we might read from
  * in the old layout after a restart
  */
4476 safe = last_dev_address(conf->reshape_safe - 1,
4477 &conf->prev);
4478
4479 if (next + conf->offset_diff < safe)
4480 need_flush = 1;
4481
4482 last = conf->reshape_progress - 1;
4483 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4484 & conf->prev.chunk_mask);
4485 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4486 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4487 } else {
 /* 'next' is after the last device address that we
  * might write to for this chunk in the new layout
  */
4491 next = last_dev_address(conf->reshape_progress, &conf->geo);
4492
 /* 'safe' is the earliest device address that we might
  * read from in the old layout after a restart
  */
4496 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4497
 /* Need to update metadata if 'next' might be beyond 'safe',
  * as that would possibly corrupt data
  */
4501 if (next > safe + conf->offset_diff)
4502 need_flush = 1;
4503
4504 sector_nr = conf->reshape_progress;
4505 last = sector_nr | (conf->geo.chunk_mask
4506 & conf->prev.chunk_mask);
4507
4508 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4509 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4510 }
4511
4512 if (need_flush ||
4513 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4514
4515 wait_barrier(conf);
4516 mddev->reshape_position = conf->reshape_progress;
4517 if (mddev->reshape_backwards)
4518 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4519 - conf->reshape_progress;
4520 else
4521 mddev->curr_resync_completed = conf->reshape_progress;
4522 conf->reshape_checkpoint = jiffies;
4523 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4524 md_wakeup_thread(mddev->thread);
4525 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4526 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4527 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4528 allow_barrier(conf);
4529 return sectors_done;
4530 }
4531 conf->reshape_safe = mddev->reshape_position;
4532 allow_barrier(conf);
4533 }
4534
4535 raise_barrier(conf, 0);
4536read_more:
4537
4538 r10_bio = raid10_alloc_init_r10buf(conf);
4539 r10_bio->state = 0;
4540 raise_barrier(conf, 1);
4541 atomic_set(&r10_bio->remaining, 0);
4542 r10_bio->mddev = mddev;
4543 r10_bio->sector = sector_nr;
4544 set_bit(R10BIO_IsReshape, &r10_bio->state);
4545 r10_bio->sectors = last - sector_nr + 1;
4546 rdev = read_balance(conf, r10_bio, &max_sectors);
4547 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4548
4549 if (!rdev) {
 /* Cannot read from here, so need to record bad blocks
  * on all the target devices.
  */
 /* FIXME */
4554 mempool_free(r10_bio, &conf->r10buf_pool);
4555 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4556 return sectors_done;
4557 }
4558
4559 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4560
4561 bio_set_dev(read_bio, rdev->bdev);
4562 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4563 + rdev->data_offset);
4564 read_bio->bi_private = r10_bio;
4565 read_bio->bi_end_io = end_reshape_read;
4566 bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4567 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4568 read_bio->bi_status = 0;
4569 read_bio->bi_vcnt = 0;
4570 read_bio->bi_iter.bi_size = 0;
4571 r10_bio->master_bio = read_bio;
4572 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4573
 /*
  * Broadcast a RESYNC message to the other nodes, so that none of
  * them writes to this region while it is being reshaped.
  */
4578 if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
4579 struct mdp_superblock_1 *sb = NULL;
4580 int sb_reshape_pos = 0;
4581
4582 conf->cluster_sync_low = sector_nr;
4583 conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
4584 sb = page_address(rdev->sb_page);
4585 if (sb) {
4586 sb_reshape_pos = le64_to_cpu(sb->reshape_position);
 /*
  * Set cluster_sync_low again if the next address for the array
  * reshape is less than cluster_sync_low, since we can't
  * update cluster_sync_low until that reshape has finished.
  */
4592 if (sb_reshape_pos < conf->cluster_sync_low)
4593 conf->cluster_sync_low = sb_reshape_pos;
4594 }
4595
4596 md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
4597 conf->cluster_sync_high);
4598 }
4599
4600
4601 __raid10_find_phys(&conf->geo, r10_bio);
4602
4603 blist = read_bio;
4604 read_bio->bi_next = NULL;
4605
4606 rcu_read_lock();
4607 for (s = 0; s < conf->copies*2; s++) {
4608 struct bio *b;
4609 int d = r10_bio->devs[s/2].devnum;
4610 struct md_rdev *rdev2;
4611 if (s&1) {
4612 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4613 b = r10_bio->devs[s/2].repl_bio;
4614 } else {
4615 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4616 b = r10_bio->devs[s/2].bio;
4617 }
4618 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4619 continue;
4620
4621 bio_set_dev(b, rdev2->bdev);
4622 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4623 rdev2->new_data_offset;
4624 b->bi_end_io = end_reshape_write;
4625 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4626 b->bi_next = blist;
4627 blist = b;
4628 }
4629
4630
4631
4632 nr_sectors = 0;
4633 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4634 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4635 struct page *page = pages[s / (PAGE_SIZE >> 9)];
4636 int len = (max_sectors - s) << 9;
4637 if (len > PAGE_SIZE)
4638 len = PAGE_SIZE;
4639 for (bio = blist; bio ; bio = bio->bi_next) {
 /*
  * won't fail because the vec table is big enough
  * to hold all these pages
  */
4644 bio_add_page(bio, page, len, 0);
4645 }
4646 sector_nr += len >> 9;
4647 nr_sectors += len >> 9;
4648 }
4649 rcu_read_unlock();
4650 r10_bio->sectors = nr_sectors;
4651
4652
4653 md_sync_acct_bio(read_bio, r10_bio->sectors);
4654 atomic_inc(&r10_bio->remaining);
4655 read_bio->bi_next = NULL;
4656 generic_make_request(read_bio);
4657 sectors_done += nr_sectors;
4658 if (sector_nr <= last)
4659 goto read_more;
4660
4661 lower_barrier(conf);
4662
 /* Now that we have done the whole section we can
  * update reshape_progress
  */
4666 if (mddev->reshape_backwards)
4667 conf->reshape_progress -= sectors_done;
4668 else
4669 conf->reshape_progress += sectors_done;
4670
4671 return sectors_done;
4672}
4673
4674static void end_reshape_request(struct r10bio *r10_bio);
4675static int handle_reshape_read_error(struct mddev *mddev,
4676 struct r10bio *r10_bio);
4677static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4678{
 /* Reshape read completed.  Hopefully we have a block
  * to write out.
  * If we got a read error then we do sync 1-page reads from
  * elsewhere until we find the data - or give up.
  */
4684 struct r10conf *conf = mddev->private;
4685 int s;
4686
4687 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4688 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4689
4690 md_done_sync(mddev, r10_bio->sectors, 0);
4691 return;
4692 }
4693
 /* We definitely have the data in the pages, schedule the
  * writes.
  */
4697 atomic_set(&r10_bio->remaining, 1);
4698 for (s = 0; s < conf->copies*2; s++) {
4699 struct bio *b;
4700 int d = r10_bio->devs[s/2].devnum;
4701 struct md_rdev *rdev;
4702 rcu_read_lock();
4703 if (s&1) {
4704 rdev = rcu_dereference(conf->mirrors[d].replacement);
4705 b = r10_bio->devs[s/2].repl_bio;
4706 } else {
4707 rdev = rcu_dereference(conf->mirrors[d].rdev);
4708 b = r10_bio->devs[s/2].bio;
4709 }
4710 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4711 rcu_read_unlock();
4712 continue;
4713 }
4714 atomic_inc(&rdev->nr_pending);
4715 rcu_read_unlock();
4716 md_sync_acct_bio(b, r10_bio->sectors);
4717 atomic_inc(&r10_bio->remaining);
4718 b->bi_next = NULL;
4719 generic_make_request(b);
4720 }
4721 end_reshape_request(r10_bio);
4722}
4723
static void end_reshape(struct r10conf *conf)
{
	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
		return;

	spin_lock_irq(&conf->device_lock);
	conf->prev = conf->geo;
	md_finish_reshape(conf->mddev);
	smp_wmb();
	conf->reshape_progress = MaxSector;
	conf->reshape_safe = MaxSector;
	spin_unlock_irq(&conf->device_lock);

	/*
	 * Read-ahead size should cover two whole stripes, i.e.
	 * 2 * (data disks) * chunk size.
	 */
	if (conf->mddev->queue) {
		int stripe = conf->geo.raid_disks *
			((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
		stripe /= conf->geo.near_copies;
		if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
			conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
	}
	conf->fullsync = 0;
}

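/*
 * For clustered arrays: adopt the reshape position agreed with the other
 * nodes, provided it lies inside the resync window published via
 * md_cluster_ops (or is MaxSector, i.e. no reshape pending).
 */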
static void raid10_update_reshape_pos(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;
	sector_t lo, hi;

	md_cluster_ops->resync_info_get(mddev, &lo, &hi);
	if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
	    || mddev->reshape_position == MaxSector)
		conf->reshape_progress = mddev->reshape_position;
	else
		WARN_ON_ONCE(1);
}

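/*
 * Recover the data for a reshape unit whose read failed: walk the copies
 * in the old layout and issue synchronous one-page reads until every page
 * is filled in.  Returns 0 on success; on failure the recovery is marked
 * interrupted and -ENOMEM or -EIO is returned.
 */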
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio)
{
	/* Use sync reads to get the blocks from somewhere else */
	int sectors = r10_bio->sectors;
	struct r10conf *conf = mddev->private;
	struct r10bio *r10b;
	int slot = 0;
	int idx = 0;
	struct page **pages;

	r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
	if (!r10b) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return -ENOMEM;
	}

	/* Reshape IOs share pages from .devs[0].bio */
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;

	r10b->sector = r10_bio->sector;
	__raid10_find_phys(&conf->prev, r10b);

	while (sectors) {
		int s = sectors;
		int success = 0;
		int first_slot = slot;

		if (s > (PAGE_SIZE >> 9))
			s = PAGE_SIZE >> 9;

		rcu_read_lock();
		while (!success) {
			int d = r10b->devs[slot].devnum;
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			sector_t addr;
			if (rdev == NULL ||
			    test_bit(Faulty, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				goto failed;

			addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			success = sync_page_io(rdev,
					       addr,
					       s << 9,
					       pages[idx],
					       REQ_OP_READ, 0, false);
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
			if (success)
				break;
		failed:
			slot++;
			if (slot >= conf->copies)
				slot = 0;
			if (slot == first_slot)
				break;
		}
		rcu_read_unlock();
		if (!success) {
			/* couldn't read this block, must give up */
			set_bit(MD_RECOVERY_INTR,
				&mddev->recovery);
			kfree(r10b);
			return -EIO;
		}
		sectors -= s;
		idx++;
	}
	kfree(r10b);
	return 0;
}

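/*
 * Completion handler for the per-device writes submitted by
 * reshape_request_write().  A write failure is reported through
 * md_error(); in all cases the device and r10_bio references are dropped.
 */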
static void end_reshape_write(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	if (!rdev) {
		smp_mb();
		rdev = conf->mirrors[d].rdev;
	}

	if (bio->bi_status) {
		/* FIXME should record badblock */
		md_error(mddev, rdev);
	}

	rdev_dec_pending(rdev, mddev);
	end_reshape_request(r10_bio);
}

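/*
 * Drop one reference on the reshape r10_bio.  The final put accounts the
 * sectors as done and releases the master bio and resync buffer.
 */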
static void end_reshape_request(struct r10bio *r10_bio)
{
	if (!atomic_dec_and_test(&r10_bio->remaining))
		return;
	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
	bio_put(r10_bio->master_bio);
	put_buf(r10_bio);
}

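/*
 * Personality hook run by the md core after a completed reshape: when
 * devices were added, let the resync cover the enlarged array; when devices
 * were removed, mark the now-unused trailing devices as out of sync.  Then
 * reset the reshape bookkeeping in the mddev.
 */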
static void raid10_finish_reshape(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		return;

	if (mddev->delta_disks > 0) {
		if (mddev->recovery_cp > mddev->resync_max_sectors) {
			mddev->recovery_cp = mddev->resync_max_sectors;
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		}
		mddev->resync_max_sectors = mddev->array_sectors;
	} else {
		/* Shrinking: devices beyond the new geometry leave the array */
		int d;
		rcu_read_lock();
		for (d = conf->geo.raid_disks ;
		     d < conf->geo.raid_disks - mddev->delta_disks;
		     d++) {
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
		}
		rcu_read_unlock();
	}
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
	mddev->reshape_position = MaxSector;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
}

static struct md_personality raid10_personality =
{
	.name		= "raid10",
	.level		= 10,
	.owner		= THIS_MODULE,
	.make_request	= raid10_make_request,
	.run		= raid10_run,
	.free		= raid10_free,
	.status		= raid10_status,
	.error_handler	= raid10_error,
	.hot_add_disk	= raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= raid10_sync_request,
	.quiesce	= raid10_quiesce,
	.size		= raid10_size,
	.resize		= raid10_resize,
	.takeover	= raid10_takeover,
	.check_reshape	= raid10_check_reshape,
	.start_reshape	= raid10_start_reshape,
	.finish_reshape	= raid10_finish_reshape,
	.update_reshape_pos = raid10_update_reshape_pos,
	.congested	= raid10_congested,
};

static int __init raid_init(void)
{
	return register_md_personality(&raid10_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9");
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");

module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
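
/*
 * Illustrative only (userspace, not part of this driver): the reshape hooks
 * above are typically exercised with something like
 *
 *   mdadm --create /dev/md0 --level=10 --layout=n2 --raid-devices=4 \
 *         /dev/sd[abcd]1
 *   mdadm --add  /dev/md0 /dev/sde1 /dev/sdf1
 *   mdadm --grow /dev/md0 --raid-devices=6
 *
 * which grows the array and drives raid10_start_reshape(),
 * reshape_request() and raid10_finish_reshape().
 */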