// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"

/*
 * RAID-10 provides a combination of RAID-0 and RAID-1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *    use_far_sets_bugfixed (stored in bit 18 of layout)
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.  In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are
 * stored (each on a different drive).  The starting device for each section
 * is offset by near_copies from the starting device of the previous section;
 * otherwise the starting device is the same as that of the previous section.
 * Thus there are (near_copies * far_copies) copies of each chunk, and each
 * is on a different drive.
 *
 * If far_offset is true, the far copies are handled a bit differently: the
 * copies are still in different stripes, but instead of being very far apart
 * on disk, they are placed in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In that case the array's devices are grouped into
 * sets of (near_copies * far_copies) devices, and the far copied stripes are
 * shifted by 'near_copies' devices within each set.
 */
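
/*
 * For example, with raid_disks=4, near_copies=2, far_copies=1 (the "n2"
 * layout), chunks A, B, C, ... are placed as
 *
 *        dev0  dev1  dev2  dev3
 *         A     A     B     B
 *         C     C     D     D
 *         E     E     F     F
 *
 * i.e. each chunk has two near copies on adjacent devices, and each stripe
 * holds raid_disks/near_copies distinct chunks.
 */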
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
                                int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

#define raid10_log(md, fmt, args...) \
        do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)

#include "raid1-10.c"

/*
 * For a resync bio, the r10bio pointer can be retrieved from the per-bio
 * 'struct resync_pages'.
 */
static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{
        return get_resync_pages(bio)->raid_bio;
}

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
        struct r10conf *conf = data;
        int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);

        /* allocate a r10bio with room for raid_disks entries in the
         * bios array */
        return kzalloc(size, gfp_flags);
}

#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf).
 */
116static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
117{
118 struct r10conf *conf = data;
119 struct r10bio *r10_bio;
120 struct bio *bio;
121 int j;
122 int nalloc, nalloc_rp;
123 struct resync_pages *rps;
124
125 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
126 if (!r10_bio)
127 return NULL;
128
129 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
130 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
131 nalloc = conf->copies;
132 else
133 nalloc = 2;
134
135
136 if (!conf->have_replacement)
137 nalloc_rp = nalloc;
138 else
139 nalloc_rp = nalloc * 2;
140 rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
141 if (!rps)
142 goto out_free_r10bio;
143
144
145
146
147 for (j = nalloc ; j-- ; ) {
148 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
149 if (!bio)
150 goto out_free_bio;
151 r10_bio->devs[j].bio = bio;
152 if (!conf->have_replacement)
153 continue;
154 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
155 if (!bio)
156 goto out_free_bio;
157 r10_bio->devs[j].repl_bio = bio;
158 }
159
160
161
162
163 for (j = 0; j < nalloc; j++) {
164 struct bio *rbio = r10_bio->devs[j].repl_bio;
165 struct resync_pages *rp, *rp_repl;
166
167 rp = &rps[j];
168 if (rbio)
169 rp_repl = &rps[nalloc + j];
170
171 bio = r10_bio->devs[j].bio;
172
173 if (!j || test_bit(MD_RECOVERY_SYNC,
174 &conf->mddev->recovery)) {
175 if (resync_alloc_pages(rp, gfp_flags))
176 goto out_free_pages;
177 } else {
178 memcpy(rp, &rps[0], sizeof(*rp));
179 resync_get_all_pages(rp);
180 }
181
182 rp->raid_bio = r10_bio;
183 bio->bi_private = rp;
184 if (rbio) {
185 memcpy(rp_repl, rp, sizeof(*rp));
186 rbio->bi_private = rp_repl;
187 }
188 }
189
190 return r10_bio;
191
192out_free_pages:
193 while (--j >= 0)
194 resync_free_pages(&rps[j]);
195
196 j = 0;
197out_free_bio:
198 for ( ; j < nalloc; j++) {
199 if (r10_bio->devs[j].bio)
200 bio_put(r10_bio->devs[j].bio);
201 if (r10_bio->devs[j].repl_bio)
202 bio_put(r10_bio->devs[j].repl_bio);
203 }
204 kfree(rps);
205out_free_r10bio:
206 rbio_pool_free(r10_bio, conf);
207 return NULL;
208}
209
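/*
 * Free a buffer allocated by r10buf_pool_alloc: release the resync pages
 * attached to each bio, put the bios themselves, free the shared
 * resync_pages array (stored via the first bio's ->bi_private) and finally
 * the r10bio.
 */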
210static void r10buf_pool_free(void *__r10_bio, void *data)
211{
212 struct r10conf *conf = data;
213 struct r10bio *r10bio = __r10_bio;
214 int j;
215 struct resync_pages *rp = NULL;
216
217 for (j = conf->copies; j--; ) {
218 struct bio *bio = r10bio->devs[j].bio;
219
220 if (bio) {
221 rp = get_resync_pages(bio);
222 resync_free_pages(rp);
223 bio_put(bio);
224 }
225
226 bio = r10bio->devs[j].repl_bio;
227 if (bio)
228 bio_put(bio);
229 }
230
231
232 kfree(rp);
233
234 rbio_pool_free(r10bio, conf);
235}
236
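/*
 * Drop the bios attached to an r10bio, skipping the "special" marker
 * values (such as IO_BLOCKED / IO_MADE_GOOD) that are not real bio
 * pointers.
 */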
237static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
238{
239 int i;
240
241 for (i = 0; i < conf->geo.raid_disks; i++) {
242 struct bio **bio = & r10_bio->devs[i].bio;
243 if (!BIO_SPECIAL(*bio))
244 bio_put(*bio);
245 *bio = NULL;
246 bio = &r10_bio->devs[i].repl_bio;
247 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
248 bio_put(*bio);
249 *bio = NULL;
250 }
251}
252
253static void free_r10bio(struct r10bio *r10_bio)
254{
255 struct r10conf *conf = r10_bio->mddev->private;
256
257 put_all_bios(conf, r10_bio);
258 mempool_free(r10_bio, &conf->r10bio_pool);
259}
260
261static void put_buf(struct r10bio *r10_bio)
262{
263 struct r10conf *conf = r10_bio->mddev->private;
264
265 mempool_free(r10_bio, &conf->r10buf_pool);
266
267 lower_barrier(conf);
268}
269
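/*
 * Queue an r10bio on the retry list and wake the raid10d thread so the
 * request is retried (or error handling performed) in process context.
 */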
270static void reschedule_retry(struct r10bio *r10_bio)
271{
272 unsigned long flags;
273 struct mddev *mddev = r10_bio->mddev;
274 struct r10conf *conf = mddev->private;
275
276 spin_lock_irqsave(&conf->device_lock, flags);
277 list_add(&r10_bio->retry_list, &conf->retry_list);
278 conf->nr_queued ++;
279 spin_unlock_irqrestore(&conf->device_lock, flags);
280
281
282 wake_up(&conf->wait_barrier);
283
284 md_wakeup_thread(mddev->thread);
285}

/*
 * Complete the master bio of an r10bio: propagate any I/O error to the
 * caller, account the I/O if statistics are enabled, drop the barrier
 * reference taken for this request and free the r10bio.
 */
292static void raid_end_bio_io(struct r10bio *r10_bio)
293{
294 struct bio *bio = r10_bio->master_bio;
295 struct r10conf *conf = r10_bio->mddev->private;
296
297 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
298 bio->bi_status = BLK_STS_IOERR;
299
300 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
301 bio_end_io_acct(bio, r10_bio->start_time);
302 bio_endio(bio);
303
304
305
306
307 allow_barrier(conf);
308
309 free_r10bio(r10_bio);
310}
311
312
313
314
315static inline void update_head_pos(int slot, struct r10bio *r10_bio)
316{
317 struct r10conf *conf = r10_bio->mddev->private;
318
319 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
320 r10_bio->devs[slot].addr + (r10_bio->sectors);
321}
322
323
324
325
326static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
327 struct bio *bio, int *slotp, int *replp)
328{
329 int slot;
330 int repl = 0;
331
332 for (slot = 0; slot < conf->geo.raid_disks; slot++) {
333 if (r10_bio->devs[slot].bio == bio)
334 break;
335 if (r10_bio->devs[slot].repl_bio == bio) {
336 repl = 1;
337 break;
338 }
339 }
340
341 update_head_pos(slot, r10_bio);
342
343 if (slotp)
344 *slotp = slot;
345 if (replp)
346 *replp = repl;
347 return r10_bio->devs[slot].devnum;
348}
349
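/*
 * Completion handler for read requests.  On success the master bio is
 * completed; on failure the request is retried from another mirror via
 * raid10d, unless this was the only usable copy, in which case the error
 * has to be returned as is.
 */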
350static void raid10_end_read_request(struct bio *bio)
351{
352 int uptodate = !bio->bi_status;
353 struct r10bio *r10_bio = bio->bi_private;
354 int slot;
355 struct md_rdev *rdev;
356 struct r10conf *conf = r10_bio->mddev->private;
357
358 slot = r10_bio->read_slot;
359 rdev = r10_bio->devs[slot].rdev;
360
361
362
363 update_head_pos(slot, r10_bio);
364
365 if (uptodate) {
366
367
368
369
370
371
372
373
374
375 set_bit(R10BIO_Uptodate, &r10_bio->state);
376 } else {
377
378
379
380
381
382 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
383 rdev->raid_disk))
384 uptodate = 1;
385 }
386 if (uptodate) {
387 raid_end_bio_io(r10_bio);
388 rdev_dec_pending(rdev, conf->mddev);
389 } else {
390
391
392
393 char b[BDEVNAME_SIZE];
394 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
395 mdname(conf->mddev),
396 bdevname(rdev->bdev, b),
397 (unsigned long long)r10_bio->sector);
398 set_bit(R10BIO_ReadError, &r10_bio->state);
399 reschedule_retry(r10_bio);
400 }
401}
402
403static void close_write(struct r10bio *r10_bio)
404{
405
406 md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
407 r10_bio->sectors,
408 !test_bit(R10BIO_Degraded, &r10_bio->state),
409 0);
410 md_write_end(r10_bio->mddev);
411}
412
413static void one_write_done(struct r10bio *r10_bio)
414{
415 if (atomic_dec_and_test(&r10_bio->remaining)) {
416 if (test_bit(R10BIO_WriteError, &r10_bio->state))
417 reschedule_retry(r10_bio);
418 else {
419 close_write(r10_bio);
420 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
421 reschedule_retry(r10_bio);
422 else
423 raid_end_bio_io(r10_bio);
424 }
425 }
426}
427
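/*
 * Completion handler for write requests.  A write error sets WriteErrorSeen
 * (or fails the replacement device); a write that merely covered a known
 * bad block is flagged IO_MADE_GOOD so raid10d can clear the bad block.
 * The last completing copy finishes the master bio.
 */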
428static void raid10_end_write_request(struct bio *bio)
429{
430 struct r10bio *r10_bio = bio->bi_private;
431 int dev;
432 int dec_rdev = 1;
433 struct r10conf *conf = r10_bio->mddev->private;
434 int slot, repl;
435 struct md_rdev *rdev = NULL;
436 struct bio *to_put = NULL;
437 bool discard_error;
438
439 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
440
441 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
442
443 if (repl)
444 rdev = conf->mirrors[dev].replacement;
445 if (!rdev) {
446 smp_rmb();
447 repl = 0;
448 rdev = conf->mirrors[dev].rdev;
449 }
450
451
452
453 if (bio->bi_status && !discard_error) {
454 if (repl)
455
456
457
458 md_error(rdev->mddev, rdev);
459 else {
460 set_bit(WriteErrorSeen, &rdev->flags);
461 if (!test_and_set_bit(WantReplacement, &rdev->flags))
462 set_bit(MD_RECOVERY_NEEDED,
463 &rdev->mddev->recovery);
464
465 dec_rdev = 0;
466 if (test_bit(FailFast, &rdev->flags) &&
467 (bio->bi_opf & MD_FAILFAST)) {
468 md_error(rdev->mddev, rdev);
469 }
470
471
472
473
474
475 if (!test_bit(Faulty, &rdev->flags))
476 set_bit(R10BIO_WriteError, &r10_bio->state);
477 else {
478
479 set_bit(R10BIO_Degraded, &r10_bio->state);
480 r10_bio->devs[slot].bio = NULL;
481 to_put = bio;
482 dec_rdev = 1;
483 }
484 }
485 } else {
486
487
488
489
490
491
492
493
494
495 sector_t first_bad;
496 int bad_sectors;
497
498
499
500
501
502
503
504
505
506 if (test_bit(In_sync, &rdev->flags) &&
507 !test_bit(Faulty, &rdev->flags))
508 set_bit(R10BIO_Uptodate, &r10_bio->state);
509
510
511 if (is_badblock(rdev,
512 r10_bio->devs[slot].addr,
513 r10_bio->sectors,
514 &first_bad, &bad_sectors) && !discard_error) {
515 bio_put(bio);
516 if (repl)
517 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
518 else
519 r10_bio->devs[slot].bio = IO_MADE_GOOD;
520 dec_rdev = 0;
521 set_bit(R10BIO_MadeGood, &r10_bio->state);
522 }
523 }
524
525
526
527
528
529
530 one_write_done(r10_bio);
531 if (dec_rdev)
532 rdev_dec_pending(rdev, conf->mddev);
533 if (to_put)
534 bio_put(to_put);
535}

/*
 * Geometry mapping.
 *
 * As well as the chunksize and raid_disks count, the RAID10 layout is
 * described by near_copies, far_copies and far_offset (see the comment at
 * the top of this file).  near_copies * far_copies must be <= raid_disks.
 *
 * raid10_find_phys finds, for the virtual sector of an r10bio, the device
 * number and device-relative sector of every copy of that data, filling in
 * the r10bio's devs[] array.
 *
 * raid10_find_virt does the reverse mapping: from a device and a sector
 * offset on that device back to a virtual (array) sector.
 */
562static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
563{
564 int n,f;
565 sector_t sector;
566 sector_t chunk;
567 sector_t stripe;
568 int dev;
569 int slot = 0;
570 int last_far_set_start, last_far_set_size;
571
572 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
573 last_far_set_start *= geo->far_set_size;
574
575 last_far_set_size = geo->far_set_size;
576 last_far_set_size += (geo->raid_disks % geo->far_set_size);
577
578
579 chunk = r10bio->sector >> geo->chunk_shift;
580 sector = r10bio->sector & geo->chunk_mask;
581
582 chunk *= geo->near_copies;
583 stripe = chunk;
584 dev = sector_div(stripe, geo->raid_disks);
585 if (geo->far_offset)
586 stripe *= geo->far_copies;
587
588 sector += stripe << geo->chunk_shift;
589
590
591 for (n = 0; n < geo->near_copies; n++) {
592 int d = dev;
593 int set;
594 sector_t s = sector;
595 r10bio->devs[slot].devnum = d;
596 r10bio->devs[slot].addr = s;
597 slot++;
598
599 for (f = 1; f < geo->far_copies; f++) {
600 set = d / geo->far_set_size;
601 d += geo->near_copies;
602
603 if ((geo->raid_disks % geo->far_set_size) &&
604 (d > last_far_set_start)) {
605 d -= last_far_set_start;
606 d %= last_far_set_size;
607 d += last_far_set_start;
608 } else {
609 d %= geo->far_set_size;
610 d += geo->far_set_size * set;
611 }
612 s += geo->stride;
613 r10bio->devs[slot].devnum = d;
614 r10bio->devs[slot].addr = s;
615 slot++;
616 }
617 dev++;
618 if (dev >= geo->raid_disks) {
619 dev = 0;
620 sector += (geo->chunk_mask + 1);
621 }
622 }
623}
624
625static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
626{
627 struct geom *geo = &conf->geo;
628
629 if (conf->reshape_progress != MaxSector &&
630 ((r10bio->sector >= conf->reshape_progress) !=
631 conf->mddev->reshape_backwards)) {
632 set_bit(R10BIO_Previous, &r10bio->state);
633 geo = &conf->prev;
634 } else
635 clear_bit(R10BIO_Previous, &r10bio->state);
636
637 __raid10_find_phys(geo, r10bio);
638}
639
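/*
 * Map a device number and sector offset on that device back to the
 * corresponding virtual (array) sector, taking far/offset copies and far
 * sets into account.
 */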
640static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
641{
642 sector_t offset, chunk, vchunk;
643
644
645
646 struct geom *geo = &conf->geo;
647 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
648 int far_set_size = geo->far_set_size;
649 int last_far_set_start;
650
651 if (geo->raid_disks % geo->far_set_size) {
652 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
653 last_far_set_start *= geo->far_set_size;
654
655 if (dev >= last_far_set_start) {
656 far_set_size = geo->far_set_size;
657 far_set_size += (geo->raid_disks % geo->far_set_size);
658 far_set_start = last_far_set_start;
659 }
660 }
661
662 offset = sector & geo->chunk_mask;
663 if (geo->far_offset) {
664 int fc;
665 chunk = sector >> geo->chunk_shift;
666 fc = sector_div(chunk, geo->far_copies);
667 dev -= fc * geo->near_copies;
668 if (dev < far_set_start)
669 dev += far_set_size;
670 } else {
671 while (sector >= geo->stride) {
672 sector -= geo->stride;
673 if (dev < (geo->near_copies + far_set_start))
674 dev += far_set_size - geo->near_copies;
675 else
676 dev -= geo->near_copies;
677 }
678 chunk = sector >> geo->chunk_shift;
679 }
680 vchunk = chunk * geo->raid_disks + dev;
681 sector_div(vchunk, geo->near_copies);
682 return (vchunk << geo->chunk_shift) + offset;
683}

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
704static struct md_rdev *read_balance(struct r10conf *conf,
705 struct r10bio *r10_bio,
706 int *max_sectors)
707{
708 const sector_t this_sector = r10_bio->sector;
709 int disk, slot;
710 int sectors = r10_bio->sectors;
711 int best_good_sectors;
712 sector_t new_distance, best_dist;
713 struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
714 int do_balance;
715 int best_dist_slot, best_pending_slot;
716 bool has_nonrot_disk = false;
717 unsigned int min_pending;
718 struct geom *geo = &conf->geo;
719
720 raid10_find_phys(conf, r10_bio);
721 rcu_read_lock();
722 best_dist_slot = -1;
723 min_pending = UINT_MAX;
724 best_dist_rdev = NULL;
725 best_pending_rdev = NULL;
726 best_dist = MaxSector;
727 best_good_sectors = 0;
728 do_balance = 1;
729 clear_bit(R10BIO_FailFast, &r10_bio->state);
730
731
732
733
734
735
736 if ((conf->mddev->recovery_cp < MaxSector
737 && (this_sector + sectors >= conf->next_resync)) ||
738 (mddev_is_clustered(conf->mddev) &&
739 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
740 this_sector + sectors)))
741 do_balance = 0;
742
743 for (slot = 0; slot < conf->copies ; slot++) {
744 sector_t first_bad;
745 int bad_sectors;
746 sector_t dev_sector;
747 unsigned int pending;
748 bool nonrot;
749
750 if (r10_bio->devs[slot].bio == IO_BLOCKED)
751 continue;
752 disk = r10_bio->devs[slot].devnum;
753 rdev = rcu_dereference(conf->mirrors[disk].replacement);
754 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
755 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
756 rdev = rcu_dereference(conf->mirrors[disk].rdev);
757 if (rdev == NULL ||
758 test_bit(Faulty, &rdev->flags))
759 continue;
760 if (!test_bit(In_sync, &rdev->flags) &&
761 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
762 continue;
763
764 dev_sector = r10_bio->devs[slot].addr;
765 if (is_badblock(rdev, dev_sector, sectors,
766 &first_bad, &bad_sectors)) {
767 if (best_dist < MaxSector)
768
769 continue;
770 if (first_bad <= dev_sector) {
771
772
773
774
775 bad_sectors -= (dev_sector - first_bad);
776 if (!do_balance && sectors > bad_sectors)
777 sectors = bad_sectors;
778 if (best_good_sectors > sectors)
779 best_good_sectors = sectors;
780 } else {
781 sector_t good_sectors =
782 first_bad - dev_sector;
783 if (good_sectors > best_good_sectors) {
784 best_good_sectors = good_sectors;
785 best_dist_slot = slot;
786 best_dist_rdev = rdev;
787 }
788 if (!do_balance)
789
790 break;
791 }
792 continue;
793 } else
794 best_good_sectors = sectors;
795
796 if (!do_balance)
797 break;
798
799 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
800 has_nonrot_disk |= nonrot;
801 pending = atomic_read(&rdev->nr_pending);
802 if (min_pending > pending && nonrot) {
803 min_pending = pending;
804 best_pending_slot = slot;
805 best_pending_rdev = rdev;
806 }
807
808 if (best_dist_slot >= 0)
809
810 set_bit(R10BIO_FailFast, &r10_bio->state);
811
812
813
814
815 if (geo->near_copies > 1 && !pending)
816 new_distance = 0;
817
818
819 else if (geo->far_copies > 1)
820 new_distance = r10_bio->devs[slot].addr;
821 else
822 new_distance = abs(r10_bio->devs[slot].addr -
823 conf->mirrors[disk].head_position);
824
825 if (new_distance < best_dist) {
826 best_dist = new_distance;
827 best_dist_slot = slot;
828 best_dist_rdev = rdev;
829 }
830 }
831 if (slot >= conf->copies) {
832 if (has_nonrot_disk) {
833 slot = best_pending_slot;
834 rdev = best_pending_rdev;
835 } else {
836 slot = best_dist_slot;
837 rdev = best_dist_rdev;
838 }
839 }
840
841 if (slot >= 0) {
842 atomic_inc(&rdev->nr_pending);
843 r10_bio->read_slot = slot;
844 } else
845 rdev = NULL;
846 rcu_read_unlock();
847 *max_sectors = best_good_sectors;
848
849 return rdev;
850}
851
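/*
 * Submit all bios that were queued on conf->pending_bio_list while waiting
 * for a bitmap update.  Called from raid10d and from freeze_array().
 */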
852static void flush_pending_writes(struct r10conf *conf)
853{
854
855
856
857 spin_lock_irq(&conf->device_lock);
858
859 if (conf->pending_bio_list.head) {
860 struct blk_plug plug;
861 struct bio *bio;
862
863 bio = bio_list_get(&conf->pending_bio_list);
864 conf->pending_count = 0;
865 spin_unlock_irq(&conf->device_lock);
866
867
868
869
870
871
872
873
874
875
876 __set_current_state(TASK_RUNNING);
877
878 blk_start_plug(&plug);
879
880
881 md_bitmap_unplug(conf->mddev->bitmap);
882 wake_up(&conf->wait_barrier);
883
884 while (bio) {
885 struct bio *next = bio->bi_next;
886 struct md_rdev *rdev = (void*)bio->bi_bdev;
887 bio->bi_next = NULL;
888 bio_set_dev(bio, rdev->bdev);
889 if (test_bit(Faulty, &rdev->flags)) {
890 bio_io_error(bio);
891 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
892 !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
893
894 bio_endio(bio);
895 else
896 submit_bio_noacct(bio);
897 bio = next;
898 }
899 blk_finish_plug(&plug);
900 } else
901 spin_unlock_irq(&conf->device_lock);
902}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO,
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
 */
926static void raise_barrier(struct r10conf *conf, int force)
927{
928 BUG_ON(force && !conf->barrier);
929 spin_lock_irq(&conf->resync_lock);
930
931
932 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
933 conf->resync_lock);
934
935
936 conf->barrier++;
937
938
939 wait_event_lock_irq(conf->wait_barrier,
940 !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
941 conf->resync_lock);
942
943 spin_unlock_irq(&conf->resync_lock);
944}
945
946static void lower_barrier(struct r10conf *conf)
947{
948 unsigned long flags;
949 spin_lock_irqsave(&conf->resync_lock, flags);
950 conf->barrier--;
951 spin_unlock_irqrestore(&conf->resync_lock, flags);
952 wake_up(&conf->wait_barrier);
953}
954
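/*
 * Called by normal I/O paths: wait until any raised barrier is released
 * (with exceptions that avoid deadlocking against the raid10d thread and
 * against bios queued on the current plug), then count this request in
 * nr_pending.
 */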
955static void wait_barrier(struct r10conf *conf)
956{
957 spin_lock_irq(&conf->resync_lock);
958 if (conf->barrier) {
959 struct bio_list *bio_list = current->bio_list;
960 conf->nr_waiting++;
961
962
963
964
965
966
967
968
969
970 raid10_log(conf->mddev, "wait barrier");
971 wait_event_lock_irq(conf->wait_barrier,
972 !conf->barrier ||
973 (atomic_read(&conf->nr_pending) &&
974 bio_list &&
975 (!bio_list_empty(&bio_list[0]) ||
976 !bio_list_empty(&bio_list[1]))) ||
977
978
979
980 (conf->mddev->thread->tsk == current &&
981 test_bit(MD_RECOVERY_RUNNING,
982 &conf->mddev->recovery) &&
983 conf->nr_queued > 0),
984 conf->resync_lock);
985 conf->nr_waiting--;
986 if (!conf->nr_waiting)
987 wake_up(&conf->wait_barrier);
988 }
989 atomic_inc(&conf->nr_pending);
990 spin_unlock_irq(&conf->resync_lock);
991}
992
993static void allow_barrier(struct r10conf *conf)
994{
995 if ((atomic_dec_and_test(&conf->nr_pending)) ||
996 (conf->array_freeze_pending))
997 wake_up(&conf->wait_barrier);
998}
999
static void freeze_array(struct r10conf *conf, int extra)
{
        /* Stop sync I/O and normal I/O and wait for everything to
         * go quiet.
         * We increment barrier and nr_waiting, and then
         * wait until nr_pending matches nr_queued+extra.
         * This is called in the context of one normal IO request
         * that has failed. Thus any sync request that might be pending
         * will be blocked by nr_pending, and we need to wait for
         * pending IO requests to complete or be queued for re-try.
         * Thus the number queued (nr_queued) plus this request (extra)
         * must match the number of pending IOs (nr_pending) before
         * we continue.
         */
1014 spin_lock_irq(&conf->resync_lock);
1015 conf->array_freeze_pending++;
1016 conf->barrier++;
1017 conf->nr_waiting++;
1018 wait_event_lock_irq_cmd(conf->wait_barrier,
1019 atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
1020 conf->resync_lock,
1021 flush_pending_writes(conf));
1022
1023 conf->array_freeze_pending--;
1024 spin_unlock_irq(&conf->resync_lock);
1025}
1026
1027static void unfreeze_array(struct r10conf *conf)
1028{
1029
1030 spin_lock_irq(&conf->resync_lock);
1031 conf->barrier--;
1032 conf->nr_waiting--;
1033 wake_up(&conf->wait_barrier);
1034 spin_unlock_irq(&conf->resync_lock);
1035}
1036
1037static sector_t choose_data_offset(struct r10bio *r10_bio,
1038 struct md_rdev *rdev)
1039{
1040 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1041 test_bit(R10BIO_Previous, &r10_bio->state))
1042 return rdev->data_offset;
1043 else
1044 return rdev->new_data_offset;
1045}
1046
1047struct raid10_plug_cb {
1048 struct blk_plug_cb cb;
1049 struct bio_list pending;
1050 int pending_cnt;
1051};
1052
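/*
 * Unplug callback: submit the writes gathered on this plug directly if we
 * are in process context, otherwise hand them to conf->pending_bio_list
 * for raid10d to submit.
 */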
1053static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1054{
1055 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1056 cb);
1057 struct mddev *mddev = plug->cb.data;
1058 struct r10conf *conf = mddev->private;
1059 struct bio *bio;
1060
1061 if (from_schedule || current->bio_list) {
1062 spin_lock_irq(&conf->device_lock);
1063 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1064 conf->pending_count += plug->pending_cnt;
1065 spin_unlock_irq(&conf->device_lock);
1066 wake_up(&conf->wait_barrier);
1067 md_wakeup_thread(mddev->thread);
1068 kfree(plug);
1069 return;
1070 }
1071
1072
1073 bio = bio_list_get(&plug->pending);
1074 md_bitmap_unplug(mddev->bitmap);
1075 wake_up(&conf->wait_barrier);
1076
1077 while (bio) {
1078 struct bio *next = bio->bi_next;
1079 struct md_rdev *rdev = (void*)bio->bi_bdev;
1080 bio->bi_next = NULL;
1081 bio_set_dev(bio, rdev->bdev);
1082 if (test_bit(Faulty, &rdev->flags)) {
1083 bio_io_error(bio);
1084 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1085 !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
1086
1087 bio_endio(bio);
1088 else
1089 submit_bio_noacct(bio);
1090 bio = next;
1091 }
1092 kfree(plug);
1093}

/*
 * Register the new request and wait if the reconstruction thread has put
 * up a barrier for new requests.  Continue immediately if no resync is
 * active.  If the request spans the current reshape position, wait for the
 * reshape to move past it first.
 */
1101static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
1102 struct bio *bio, sector_t sectors)
1103{
1104 wait_barrier(conf);
1105 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1106 bio->bi_iter.bi_sector < conf->reshape_progress &&
1107 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1108 raid10_log(conf->mddev, "wait reshape");
1109 allow_barrier(conf);
1110 wait_event(conf->wait_barrier,
1111 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1112 conf->reshape_progress >= bio->bi_iter.bi_sector +
1113 sectors);
1114 wait_barrier(conf);
1115 }
1116}
1117
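/*
 * Issue a read: pick a mirror with read_balance(), split the bio if the
 * chosen device can only serve part of it, then clone it and submit it to
 * the selected rdev.
 */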
1118static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1119 struct r10bio *r10_bio)
1120{
1121 struct r10conf *conf = mddev->private;
1122 struct bio *read_bio;
1123 const int op = bio_op(bio);
1124 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1125 int max_sectors;
1126 struct md_rdev *rdev;
1127 char b[BDEVNAME_SIZE];
1128 int slot = r10_bio->read_slot;
1129 struct md_rdev *err_rdev = NULL;
1130 gfp_t gfp = GFP_NOIO;
1131
1132 if (slot >= 0 && r10_bio->devs[slot].rdev) {
1133
1134
1135
1136
1137
1138
1139
1140 int disk;
1141
1142
1143
1144
1145 gfp = GFP_NOIO | __GFP_HIGH;
1146
1147 rcu_read_lock();
1148 disk = r10_bio->devs[slot].devnum;
1149 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1150 if (err_rdev)
1151 bdevname(err_rdev->bdev, b);
1152 else {
1153 strcpy(b, "???");
1154
1155 err_rdev = r10_bio->devs[slot].rdev;
1156 }
1157 rcu_read_unlock();
1158 }
1159
1160 regular_request_wait(mddev, conf, bio, r10_bio->sectors);
1161 rdev = read_balance(conf, r10_bio, &max_sectors);
1162 if (!rdev) {
1163 if (err_rdev) {
1164 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1165 mdname(mddev), b,
1166 (unsigned long long)r10_bio->sector);
1167 }
1168 raid_end_bio_io(r10_bio);
1169 return;
1170 }
1171 if (err_rdev)
1172 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1173 mdname(mddev),
1174 bdevname(rdev->bdev, b),
1175 (unsigned long long)r10_bio->sector);
1176 if (max_sectors < bio_sectors(bio)) {
1177 struct bio *split = bio_split(bio, max_sectors,
1178 gfp, &conf->bio_split);
1179 bio_chain(split, bio);
1180 allow_barrier(conf);
1181 submit_bio_noacct(bio);
1182 wait_barrier(conf);
1183 bio = split;
1184 r10_bio->master_bio = bio;
1185 r10_bio->sectors = max_sectors;
1186 }
1187 slot = r10_bio->read_slot;
1188
1189 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
1190 r10_bio->start_time = bio_start_io_acct(bio);
1191 read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
1192
1193 r10_bio->devs[slot].bio = read_bio;
1194 r10_bio->devs[slot].rdev = rdev;
1195
1196 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1197 choose_data_offset(r10_bio, rdev);
1198 bio_set_dev(read_bio, rdev->bdev);
1199 read_bio->bi_end_io = raid10_end_read_request;
1200 bio_set_op_attrs(read_bio, op, do_sync);
1201 if (test_bit(FailFast, &rdev->flags) &&
1202 test_bit(R10BIO_FailFast, &r10_bio->state))
1203 read_bio->bi_opf |= MD_FAILFAST;
1204 read_bio->bi_private = r10_bio;
1205
1206 if (mddev->gendisk)
1207 trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
1208 r10_bio->sector);
1209 submit_bio_noacct(read_bio);
1210 return;
1211}
1212
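/*
 * Clone the master write bio for one copy (or its replacement) and queue
 * it, either on the current plug or on conf->pending_bio_list, so that it
 * is submitted after the bitmap has been updated.
 */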
1213static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1214 struct bio *bio, bool replacement,
1215 int n_copy)
1216{
1217 const int op = bio_op(bio);
1218 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1219 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1220 unsigned long flags;
1221 struct blk_plug_cb *cb;
1222 struct raid10_plug_cb *plug = NULL;
1223 struct r10conf *conf = mddev->private;
1224 struct md_rdev *rdev;
1225 int devnum = r10_bio->devs[n_copy].devnum;
1226 struct bio *mbio;
1227
1228 if (replacement) {
1229 rdev = conf->mirrors[devnum].replacement;
1230 if (rdev == NULL) {
1231
1232 smp_mb();
1233 rdev = conf->mirrors[devnum].rdev;
1234 }
1235 } else
1236 rdev = conf->mirrors[devnum].rdev;
1237
1238 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1239 if (replacement)
1240 r10_bio->devs[n_copy].repl_bio = mbio;
1241 else
1242 r10_bio->devs[n_copy].bio = mbio;
1243
1244 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1245 choose_data_offset(r10_bio, rdev));
1246 bio_set_dev(mbio, rdev->bdev);
1247 mbio->bi_end_io = raid10_end_write_request;
1248 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1249 if (!replacement && test_bit(FailFast,
1250 &conf->mirrors[devnum].rdev->flags)
1251 && enough(conf, devnum))
1252 mbio->bi_opf |= MD_FAILFAST;
1253 mbio->bi_private = r10_bio;
1254
1255 if (conf->mddev->gendisk)
1256 trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
1257 r10_bio->sector);
1258
1259 mbio->bi_bdev = (void *)rdev;
1260
1261 atomic_inc(&r10_bio->remaining);
1262
1263 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1264 if (cb)
1265 plug = container_of(cb, struct raid10_plug_cb, cb);
1266 else
1267 plug = NULL;
1268 if (plug) {
1269 bio_list_add(&plug->pending, mbio);
1270 plug->pending_cnt++;
1271 } else {
1272 spin_lock_irqsave(&conf->device_lock, flags);
1273 bio_list_add(&conf->pending_bio_list, mbio);
1274 conf->pending_count++;
1275 spin_unlock_irqrestore(&conf->device_lock, flags);
1276 md_wakeup_thread(mddev->thread);
1277 }
1278}
1279
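/*
 * Wait for any rdev that is currently Blocked (including devices that need
 * a bad block acknowledged) before issuing the writes of this r10bio.
 */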
1280static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
1281{
1282 int i;
1283 struct r10conf *conf = mddev->private;
1284 struct md_rdev *blocked_rdev;
1285
1286retry_wait:
1287 blocked_rdev = NULL;
1288 rcu_read_lock();
1289 for (i = 0; i < conf->copies; i++) {
1290 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1291 struct md_rdev *rrdev = rcu_dereference(
1292 conf->mirrors[i].replacement);
1293 if (rdev == rrdev)
1294 rrdev = NULL;
1295 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1296 atomic_inc(&rdev->nr_pending);
1297 blocked_rdev = rdev;
1298 break;
1299 }
1300 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1301 atomic_inc(&rrdev->nr_pending);
1302 blocked_rdev = rrdev;
1303 break;
1304 }
1305
1306 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1307 sector_t first_bad;
1308 sector_t dev_sector = r10_bio->devs[i].addr;
1309 int bad_sectors;
1310 int is_bad;
1311
1312
1313
1314
1315
1316 if (!r10_bio->sectors)
1317 continue;
1318
1319 is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
1320 &first_bad, &bad_sectors);
1321 if (is_bad < 0) {
1322
1323
1324
1325
1326 atomic_inc(&rdev->nr_pending);
1327 set_bit(BlockedBadBlocks, &rdev->flags);
1328 blocked_rdev = rdev;
1329 break;
1330 }
1331 }
1332 }
1333 rcu_read_unlock();
1334
1335 if (unlikely(blocked_rdev)) {
1336
1337 allow_barrier(conf);
1338 raid10_log(conf->mddev, "%s wait rdev %d blocked",
1339 __func__, blocked_rdev->raid_disk);
1340 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1341 wait_barrier(conf);
1342 goto retry_wait;
1343 }
1344}
1345
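/*
 * Issue a write: wait for resync/reshape conflicts to clear, select every
 * device (and replacement) that should receive a copy, narrow the request
 * around known bad blocks if needed, and queue one clone per target.
 */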
1346static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1347 struct r10bio *r10_bio)
1348{
1349 struct r10conf *conf = mddev->private;
1350 int i;
1351 sector_t sectors;
1352 int max_sectors;
1353
1354 if ((mddev_is_clustered(mddev) &&
1355 md_cluster_ops->area_resyncing(mddev, WRITE,
1356 bio->bi_iter.bi_sector,
1357 bio_end_sector(bio)))) {
1358 DEFINE_WAIT(w);
1359 for (;;) {
1360 prepare_to_wait(&conf->wait_barrier,
1361 &w, TASK_IDLE);
1362 if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1363 bio->bi_iter.bi_sector, bio_end_sector(bio)))
1364 break;
1365 schedule();
1366 }
1367 finish_wait(&conf->wait_barrier, &w);
1368 }
1369
1370 sectors = r10_bio->sectors;
1371 regular_request_wait(mddev, conf, bio, sectors);
1372 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1373 (mddev->reshape_backwards
1374 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1375 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1376 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1377 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1378
1379 mddev->reshape_position = conf->reshape_progress;
1380 set_mask_bits(&mddev->sb_flags, 0,
1381 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1382 md_wakeup_thread(mddev->thread);
1383 raid10_log(conf->mddev, "wait reshape metadata");
1384 wait_event(mddev->sb_wait,
1385 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1386
1387 conf->reshape_safe = mddev->reshape_position;
1388 }
1389
1390 if (conf->pending_count >= max_queued_requests) {
1391 md_wakeup_thread(mddev->thread);
1392 raid10_log(mddev, "wait queued");
1393 wait_event(conf->wait_barrier,
1394 conf->pending_count < max_queued_requests);
1395 }

        /* first select target devices under rcu_lock and
         * inc refcount on their rdev.  Record them by setting
         * bios[x] to bio.
         * If there are known/acknowledged bad blocks on any device
         * on which we have seen a write error, we want to avoid
         * writing to those blocks.  This potentially requires several
         * writes to write around the bad blocks.  Each set of writes
         * gets its own r10_bio with a set of bios attached.
         */
1406 r10_bio->read_slot = -1;
1407 raid10_find_phys(conf, r10_bio);
1408
1409 wait_blocked_dev(mddev, r10_bio);
1410
1411 rcu_read_lock();
1412 max_sectors = r10_bio->sectors;
1413
1414 for (i = 0; i < conf->copies; i++) {
1415 int d = r10_bio->devs[i].devnum;
1416 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1417 struct md_rdev *rrdev = rcu_dereference(
1418 conf->mirrors[d].replacement);
1419 if (rdev == rrdev)
1420 rrdev = NULL;
1421 if (rdev && (test_bit(Faulty, &rdev->flags)))
1422 rdev = NULL;
1423 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1424 rrdev = NULL;
1425
1426 r10_bio->devs[i].bio = NULL;
1427 r10_bio->devs[i].repl_bio = NULL;
1428
1429 if (!rdev && !rrdev) {
1430 set_bit(R10BIO_Degraded, &r10_bio->state);
1431 continue;
1432 }
1433 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1434 sector_t first_bad;
1435 sector_t dev_sector = r10_bio->devs[i].addr;
1436 int bad_sectors;
1437 int is_bad;
1438
1439 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1440 &first_bad, &bad_sectors);
1441 if (is_bad && first_bad <= dev_sector) {
1442
1443 bad_sectors -= (dev_sector - first_bad);
1444 if (bad_sectors < max_sectors)
1445
1446
1447
1448 max_sectors = bad_sectors;
1449
1450
1451
1452
1453
1454
1455
1456
1457 continue;
1458 }
1459 if (is_bad) {
1460 int good_sectors = first_bad - dev_sector;
1461 if (good_sectors < max_sectors)
1462 max_sectors = good_sectors;
1463 }
1464 }
1465 if (rdev) {
1466 r10_bio->devs[i].bio = bio;
1467 atomic_inc(&rdev->nr_pending);
1468 }
1469 if (rrdev) {
1470 r10_bio->devs[i].repl_bio = bio;
1471 atomic_inc(&rrdev->nr_pending);
1472 }
1473 }
1474 rcu_read_unlock();
1475
1476 if (max_sectors < r10_bio->sectors)
1477 r10_bio->sectors = max_sectors;
1478
1479 if (r10_bio->sectors < bio_sectors(bio)) {
1480 struct bio *split = bio_split(bio, r10_bio->sectors,
1481 GFP_NOIO, &conf->bio_split);
1482 bio_chain(split, bio);
1483 allow_barrier(conf);
1484 submit_bio_noacct(bio);
1485 wait_barrier(conf);
1486 bio = split;
1487 r10_bio->master_bio = bio;
1488 }
1489
1490 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
1491 r10_bio->start_time = bio_start_io_acct(bio);
1492 atomic_set(&r10_bio->remaining, 1);
1493 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1494
1495 for (i = 0; i < conf->copies; i++) {
1496 if (r10_bio->devs[i].bio)
1497 raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1498 if (r10_bio->devs[i].repl_bio)
1499 raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1500 }
1501 one_write_done(r10_bio);
1502}
1503
1504static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1505{
1506 struct r10conf *conf = mddev->private;
1507 struct r10bio *r10_bio;
1508
1509 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1510
1511 r10_bio->master_bio = bio;
1512 r10_bio->sectors = sectors;
1513
1514 r10_bio->mddev = mddev;
1515 r10_bio->sector = bio->bi_iter.bi_sector;
1516 r10_bio->state = 0;
1517 r10_bio->read_slot = -1;
1518 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
1519 conf->geo.raid_disks);
1520
1521 if (bio_data_dir(bio) == READ)
1522 raid10_read_request(mddev, bio, r10_bio);
1523 else
1524 raid10_write_request(mddev, bio, r10_bio);
1525}
1526
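/*
 * Complete one r10bio of a discard request.  A discard may be split into
 * one r10bio per far copy; all of them point at the first r10bio through
 * master_bio, and only when the first one's reference count drops to zero
 * is the original bio completed.
 */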
1527static void raid_end_discard_bio(struct r10bio *r10bio)
1528{
1529 struct r10conf *conf = r10bio->mddev->private;
1530 struct r10bio *first_r10bio;
1531
1532 while (atomic_dec_and_test(&r10bio->remaining)) {
1533
1534 allow_barrier(conf);
1535
1536 if (!test_bit(R10BIO_Discard, &r10bio->state)) {
1537 first_r10bio = (struct r10bio *)r10bio->master_bio;
1538 free_r10bio(r10bio);
1539 r10bio = first_r10bio;
1540 } else {
1541 md_write_end(r10bio->mddev);
1542 bio_endio(r10bio->master_bio);
1543 free_r10bio(r10bio);
1544 break;
1545 }
1546 }
1547}
1548
1549static void raid10_end_discard_request(struct bio *bio)
1550{
1551 struct r10bio *r10_bio = bio->bi_private;
1552 struct r10conf *conf = r10_bio->mddev->private;
1553 struct md_rdev *rdev = NULL;
1554 int dev;
1555 int slot, repl;
1556
1557
1558
1559
1560 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
1561 set_bit(R10BIO_Uptodate, &r10_bio->state);
1562
1563 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1564 if (repl)
1565 rdev = conf->mirrors[dev].replacement;
1566 if (!rdev) {
1567
1568
1569
1570
1571
1572 smp_rmb();
1573 rdev = conf->mirrors[dev].rdev;
1574 }
1575
1576 raid_end_discard_bio(r10_bio);
1577 rdev_dec_pending(rdev, conf->mddev);
1578}

/*
 * There are some limitations to handling a discard bio here:
 * 1st, the discard size must be bigger than stripe_size*2.
 * 2nd, if the discard bio spans the reshape progress, we fall back to the
 * normal (non-discard) request path.
 */
1586static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
1587{
1588 struct r10conf *conf = mddev->private;
1589 struct geom *geo = &conf->geo;
1590 int far_copies = geo->far_copies;
1591 bool first_copy = true;
1592 struct r10bio *r10_bio, *first_r10bio;
1593 struct bio *split;
1594 int disk;
1595 sector_t chunk;
1596 unsigned int stripe_size;
1597 unsigned int stripe_data_disks;
1598 sector_t split_size;
1599 sector_t bio_start, bio_end;
1600 sector_t first_stripe_index, last_stripe_index;
1601 sector_t start_disk_offset;
1602 unsigned int start_disk_index;
1603 sector_t end_disk_offset;
1604 unsigned int end_disk_index;
1605 unsigned int remainder;
1606
1607 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1608 return -EAGAIN;
1609
1610 wait_barrier(conf);
1611
1612
1613
1614
1615
1616 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1617 goto out;
1618
1619 if (geo->near_copies)
1620 stripe_data_disks = geo->raid_disks / geo->near_copies +
1621 geo->raid_disks % geo->near_copies;
1622 else
1623 stripe_data_disks = geo->raid_disks;
1624
1625 stripe_size = stripe_data_disks << geo->chunk_shift;
1626
1627 bio_start = bio->bi_iter.bi_sector;
1628 bio_end = bio_end_sector(bio);
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638 if (bio_sectors(bio) < stripe_size*2)
1639 goto out;
1640
1641
1642
1643
1644 div_u64_rem(bio_start, stripe_size, &remainder);
1645 if (remainder) {
1646 split_size = stripe_size - remainder;
1647 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
1648 bio_chain(split, bio);
1649 allow_barrier(conf);
1650
1651 submit_bio_noacct(split);
1652 wait_barrier(conf);
1653 }
1654 div_u64_rem(bio_end, stripe_size, &remainder);
1655 if (remainder) {
1656 split_size = bio_sectors(bio) - remainder;
1657 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
1658 bio_chain(split, bio);
1659 allow_barrier(conf);
1660
1661 submit_bio_noacct(bio);
1662 bio = split;
1663 wait_barrier(conf);
1664 }
1665
1666 bio_start = bio->bi_iter.bi_sector;
1667 bio_end = bio_end_sector(bio);
1668
1669
1670
1671
1672
1673
1674 chunk = bio_start >> geo->chunk_shift;
1675 chunk *= geo->near_copies;
1676 first_stripe_index = chunk;
1677 start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
1678 if (geo->far_offset)
1679 first_stripe_index *= geo->far_copies;
1680 start_disk_offset = (bio_start & geo->chunk_mask) +
1681 (first_stripe_index << geo->chunk_shift);
1682
1683 chunk = bio_end >> geo->chunk_shift;
1684 chunk *= geo->near_copies;
1685 last_stripe_index = chunk;
1686 end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
1687 if (geo->far_offset)
1688 last_stripe_index *= geo->far_copies;
1689 end_disk_offset = (bio_end & geo->chunk_mask) +
1690 (last_stripe_index << geo->chunk_shift);
1691
1692retry_discard:
1693 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1694 r10_bio->mddev = mddev;
1695 r10_bio->state = 0;
1696 r10_bio->sectors = 0;
1697 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
1698 wait_blocked_dev(mddev, r10_bio);
1699
1700
1701
1702
1703
1704
1705
1706
1707 if (first_copy) {
1708 r10_bio->master_bio = bio;
1709 set_bit(R10BIO_Discard, &r10_bio->state);
1710 first_copy = false;
1711 first_r10bio = r10_bio;
1712 } else
1713 r10_bio->master_bio = (struct bio *)first_r10bio;
1714
1715
1716
1717
1718
1719
1720 rcu_read_lock();
1721 for (disk = 0; disk < geo->raid_disks; disk++) {
1722 struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
1723 struct md_rdev *rrdev = rcu_dereference(
1724 conf->mirrors[disk].replacement);
1725
1726 r10_bio->devs[disk].bio = NULL;
1727 r10_bio->devs[disk].repl_bio = NULL;
1728
1729 if (rdev && (test_bit(Faulty, &rdev->flags)))
1730 rdev = NULL;
1731 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1732 rrdev = NULL;
1733 if (!rdev && !rrdev)
1734 continue;
1735
1736 if (rdev) {
1737 r10_bio->devs[disk].bio = bio;
1738 atomic_inc(&rdev->nr_pending);
1739 }
1740 if (rrdev) {
1741 r10_bio->devs[disk].repl_bio = bio;
1742 atomic_inc(&rrdev->nr_pending);
1743 }
1744 }
1745 rcu_read_unlock();
1746
1747 atomic_set(&r10_bio->remaining, 1);
1748 for (disk = 0; disk < geo->raid_disks; disk++) {
1749 sector_t dev_start, dev_end;
1750 struct bio *mbio, *rbio = NULL;
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764 if (disk < start_disk_index)
1765 dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
1766 else if (disk > start_disk_index)
1767 dev_start = first_stripe_index * mddev->chunk_sectors;
1768 else
1769 dev_start = start_disk_offset;
1770
1771 if (disk < end_disk_index)
1772 dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
1773 else if (disk > end_disk_index)
1774 dev_end = last_stripe_index * mddev->chunk_sectors;
1775 else
1776 dev_end = end_disk_offset;
1777
1778
1779
1780
1781
1782
1783
1784 if (r10_bio->devs[disk].bio) {
1785 struct md_rdev *rdev = conf->mirrors[disk].rdev;
1786 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1787 mbio->bi_end_io = raid10_end_discard_request;
1788 mbio->bi_private = r10_bio;
1789 r10_bio->devs[disk].bio = mbio;
1790 r10_bio->devs[disk].devnum = disk;
1791 atomic_inc(&r10_bio->remaining);
1792 md_submit_discard_bio(mddev, rdev, mbio,
1793 dev_start + choose_data_offset(r10_bio, rdev),
1794 dev_end - dev_start);
1795 bio_endio(mbio);
1796 }
1797 if (r10_bio->devs[disk].repl_bio) {
1798 struct md_rdev *rrdev = conf->mirrors[disk].replacement;
1799 rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1800 rbio->bi_end_io = raid10_end_discard_request;
1801 rbio->bi_private = r10_bio;
1802 r10_bio->devs[disk].repl_bio = rbio;
1803 r10_bio->devs[disk].devnum = disk;
1804 atomic_inc(&r10_bio->remaining);
1805 md_submit_discard_bio(mddev, rrdev, rbio,
1806 dev_start + choose_data_offset(r10_bio, rrdev),
1807 dev_end - dev_start);
1808 bio_endio(rbio);
1809 }
1810 }
1811
1812 if (!geo->far_offset && --far_copies) {
1813 first_stripe_index += geo->stride >> geo->chunk_shift;
1814 start_disk_offset += geo->stride;
1815 last_stripe_index += geo->stride >> geo->chunk_shift;
1816 end_disk_offset += geo->stride;
1817 atomic_inc(&first_r10bio->remaining);
1818 raid_end_discard_bio(r10_bio);
1819 wait_barrier(conf);
1820 goto retry_discard;
1821 }
1822
1823 raid_end_discard_bio(r10_bio);
1824
1825 return 0;
1826out:
1827 allow_barrier(conf);
1828 return -EAGAIN;
1829}
1830
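/*
 * Entry point for all requests on the array.  Flushes and discards are
 * handled specially; other requests are split at chunk boundaries when the
 * layout requires it and passed to __make_request().
 */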
1831static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1832{
1833 struct r10conf *conf = mddev->private;
1834 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1835 int chunk_sects = chunk_mask + 1;
1836 int sectors = bio_sectors(bio);
1837
1838 if (unlikely(bio->bi_opf & REQ_PREFLUSH)
1839 && md_flush_request(mddev, bio))
1840 return true;
1841
1842 if (!md_write_start(mddev, bio))
1843 return false;
1844
1845 if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
1846 if (!raid10_handle_discard(mddev, bio))
1847 return true;
1848
1849
1850
1851
1852
1853 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1854 sectors > chunk_sects
1855 && (conf->geo.near_copies < conf->geo.raid_disks
1856 || conf->prev.near_copies <
1857 conf->prev.raid_disks)))
1858 sectors = chunk_sects -
1859 (bio->bi_iter.bi_sector &
1860 (chunk_sects - 1));
1861 __make_request(mddev, bio, sectors);
1862
1863
1864 wake_up(&conf->wait_barrier);
1865 return true;
1866}
1867
1868static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1869{
1870 struct r10conf *conf = mddev->private;
1871 int i;
1872
1873 if (conf->geo.near_copies < conf->geo.raid_disks)
1874 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1875 if (conf->geo.near_copies > 1)
1876 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1877 if (conf->geo.far_copies > 1) {
1878 if (conf->geo.far_offset)
1879 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1880 else
1881 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1882 if (conf->geo.far_set_size != conf->geo.raid_disks)
1883 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1884 }
1885 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1886 conf->geo.raid_disks - mddev->degraded);
1887 rcu_read_lock();
1888 for (i = 0; i < conf->geo.raid_disks; i++) {
1889 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1890 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1891 }
1892 rcu_read_unlock();
1893 seq_printf(seq, "]");
1894}

/*
 * Check whether, after ignoring the device 'ignore', every chunk still has
 * at least one working in-sync copy.  'previous' selects the pre-reshape
 * geometry instead of the current one.
 */
1901static int _enough(struct r10conf *conf, int previous, int ignore)
1902{
1903 int first = 0;
1904 int has_enough = 0;
1905 int disks, ncopies;
1906 if (previous) {
1907 disks = conf->prev.raid_disks;
1908 ncopies = conf->prev.near_copies;
1909 } else {
1910 disks = conf->geo.raid_disks;
1911 ncopies = conf->geo.near_copies;
1912 }
1913
1914 rcu_read_lock();
1915 do {
1916 int n = conf->copies;
1917 int cnt = 0;
1918 int this = first;
1919 while (n--) {
1920 struct md_rdev *rdev;
1921 if (this != ignore &&
1922 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1923 test_bit(In_sync, &rdev->flags))
1924 cnt++;
1925 this = (this+1) % disks;
1926 }
1927 if (cnt == 0)
1928 goto out;
1929 first = (first + ncopies) % disks;
1930 } while (first != 0);
1931 has_enough = 1;
1932out:
1933 rcu_read_unlock();
1934 return has_enough;
1935}
1936
static int enough(struct r10conf *conf, int ignore)
{
        /* when calling 'enough', both 'prev' and 'geo' must
         * be stable.
         * This is ensured if ->reconfig_mutex or ->device_lock
         * is held.
         */
        return _enough(conf, 0, ignore) &&
                _enough(conf, 1, ignore);
}
1947
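/*
 * Mark a device Faulty after an unrecoverable error, unless it holds the
 * last good copy of some data and fail_last_dev is not set, in which case
 * the error is only reported upwards.
 */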
1948static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1949{
1950 char b[BDEVNAME_SIZE];
1951 struct r10conf *conf = mddev->private;
1952 unsigned long flags;

        /*
         * If it is not operational, then we have already marked it as dead;
         * else if it is the last working disk with fail_last_dev == false,
         * ignore the error and let the next level up know;
         * else mark the drive as failed.
         */
1960 spin_lock_irqsave(&conf->device_lock, flags);
1961 if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
1962 && !enough(conf, rdev->raid_disk)) {
1963
1964
1965
1966 spin_unlock_irqrestore(&conf->device_lock, flags);
1967 return;
1968 }
1969 if (test_and_clear_bit(In_sync, &rdev->flags))
1970 mddev->degraded++;
1971
1972
1973
1974 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1975 set_bit(Blocked, &rdev->flags);
1976 set_bit(Faulty, &rdev->flags);
1977 set_mask_bits(&mddev->sb_flags, 0,
1978 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1979 spin_unlock_irqrestore(&conf->device_lock, flags);
1980 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1981 "md/raid10:%s: Operation continuing on %d devices.\n",
1982 mdname(mddev), bdevname(rdev->bdev, b),
1983 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1984}
1985
1986static void print_conf(struct r10conf *conf)
1987{
1988 int i;
1989 struct md_rdev *rdev;
1990
1991 pr_debug("RAID10 conf printout:\n");
1992 if (!conf) {
1993 pr_debug("(!conf)\n");
1994 return;
1995 }
1996 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1997 conf->geo.raid_disks);
1998
1999
2000
2001 for (i = 0; i < conf->geo.raid_disks; i++) {
2002 char b[BDEVNAME_SIZE];
2003 rdev = conf->mirrors[i].rdev;
2004 if (rdev)
2005 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
2006 i, !test_bit(In_sync, &rdev->flags),
2007 !test_bit(Faulty, &rdev->flags),
2008 bdevname(rdev->bdev,b));
2009 }
2010}
2011
2012static void close_sync(struct r10conf *conf)
2013{
2014 wait_barrier(conf);
2015 allow_barrier(conf);
2016
2017 mempool_exit(&conf->r10buf_pool);
2018}
2019
2020static int raid10_spare_active(struct mddev *mddev)
2021{
2022 int i;
2023 struct r10conf *conf = mddev->private;
2024 struct raid10_info *tmp;
2025 int count = 0;
2026 unsigned long flags;
2027
2028
2029
2030
2031
2032 for (i = 0; i < conf->geo.raid_disks; i++) {
2033 tmp = conf->mirrors + i;
2034 if (tmp->replacement
2035 && tmp->replacement->recovery_offset == MaxSector
2036 && !test_bit(Faulty, &tmp->replacement->flags)
2037 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
2038
2039 if (!tmp->rdev
2040 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
2041 count++;
2042 if (tmp->rdev) {
2043
2044
2045
2046
2047 set_bit(Faulty, &tmp->rdev->flags);
2048 sysfs_notify_dirent_safe(
2049 tmp->rdev->sysfs_state);
2050 }
2051 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
2052 } else if (tmp->rdev
2053 && tmp->rdev->recovery_offset == MaxSector
2054 && !test_bit(Faulty, &tmp->rdev->flags)
2055 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
2056 count++;
2057 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
2058 }
2059 }
2060 spin_lock_irqsave(&conf->device_lock, flags);
2061 mddev->degraded -= count;
2062 spin_unlock_irqrestore(&conf->device_lock, flags);
2063
2064 print_conf(conf);
2065 return count;
2066}
2067
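/*
 * Add a spare to the array, preferring the slot it previously occupied.
 * If a failed device is still present in the chosen slot and wants a
 * replacement, the new device is added as its replacement instead.
 */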
2068static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
2069{
2070 struct r10conf *conf = mddev->private;
2071 int err = -EEXIST;
2072 int mirror;
2073 int first = 0;
2074 int last = conf->geo.raid_disks - 1;
2075
2076 if (mddev->recovery_cp < MaxSector)
2077
2078
2079
2080 return -EBUSY;
2081 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
2082 return -EINVAL;
2083
2084 if (md_integrity_add_rdev(rdev, mddev))
2085 return -ENXIO;
2086
2087 if (rdev->raid_disk >= 0)
2088 first = last = rdev->raid_disk;
2089
2090 if (rdev->saved_raid_disk >= first &&
2091 rdev->saved_raid_disk < conf->geo.raid_disks &&
2092 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
2093 mirror = rdev->saved_raid_disk;
2094 else
2095 mirror = first;
2096 for ( ; mirror <= last ; mirror++) {
2097 struct raid10_info *p = &conf->mirrors[mirror];
2098 if (p->recovery_disabled == mddev->recovery_disabled)
2099 continue;
2100 if (p->rdev) {
2101 if (!test_bit(WantReplacement, &p->rdev->flags) ||
2102 p->replacement != NULL)
2103 continue;
2104 clear_bit(In_sync, &rdev->flags);
2105 set_bit(Replacement, &rdev->flags);
2106 rdev->raid_disk = mirror;
2107 err = 0;
2108 if (mddev->gendisk)
2109 disk_stack_limits(mddev->gendisk, rdev->bdev,
2110 rdev->data_offset << 9);
2111 conf->fullsync = 1;
2112 rcu_assign_pointer(p->replacement, rdev);
2113 break;
2114 }
2115
2116 if (mddev->gendisk)
2117 disk_stack_limits(mddev->gendisk, rdev->bdev,
2118 rdev->data_offset << 9);
2119
2120 p->head_position = 0;
2121 p->recovery_disabled = mddev->recovery_disabled - 1;
2122 rdev->raid_disk = mirror;
2123 err = 0;
2124 if (rdev->saved_raid_disk != mirror)
2125 conf->fullsync = 1;
2126 rcu_assign_pointer(p->rdev, rdev);
2127 break;
2128 }
2129 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
2130 blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
2131
2132 print_conf(conf);
2133 return err;
2134}
2135
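/*
 * Remove a device from the array.  Refused while the device is still
 * in-sync or has pending I/O; non-faulty devices are only removed when
 * recovery onto them is no longer possible.
 */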
2136static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
2137{
2138 struct r10conf *conf = mddev->private;
2139 int err = 0;
2140 int number = rdev->raid_disk;
2141 struct md_rdev **rdevp;
2142 struct raid10_info *p = conf->mirrors + number;
2143
2144 print_conf(conf);
2145 if (rdev == p->rdev)
2146 rdevp = &p->rdev;
2147 else if (rdev == p->replacement)
2148 rdevp = &p->replacement;
2149 else
2150 return 0;
2151
2152 if (test_bit(In_sync, &rdev->flags) ||
2153 atomic_read(&rdev->nr_pending)) {
2154 err = -EBUSY;
2155 goto abort;
2156 }
2157
2158
2159
2160 if (!test_bit(Faulty, &rdev->flags) &&
2161 mddev->recovery_disabled != p->recovery_disabled &&
2162 (!p->replacement || p->replacement == rdev) &&
2163 number < conf->geo.raid_disks &&
2164 enough(conf, -1)) {
2165 err = -EBUSY;
2166 goto abort;
2167 }
2168 *rdevp = NULL;
2169 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
2170 synchronize_rcu();
2171 if (atomic_read(&rdev->nr_pending)) {
2172
2173 err = -EBUSY;
2174 *rdevp = rdev;
2175 goto abort;
2176 }
2177 }
2178 if (p->replacement) {
2179
2180 p->rdev = p->replacement;
2181 clear_bit(Replacement, &p->replacement->flags);
2182 smp_mb();
2183
2184
2185 p->replacement = NULL;
2186 }
2187
2188 clear_bit(WantReplacement, &rdev->flags);
2189 err = md_integrity_register(mddev);
2190
2191abort:
2192
2193 print_conf(conf);
2194 return err;
2195}
2196
2197static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
2198{
2199 struct r10conf *conf = r10_bio->mddev->private;
2200
2201 if (!bio->bi_status)
2202 set_bit(R10BIO_Uptodate, &r10_bio->state);
2203 else
2204
2205
2206
2207 atomic_add(r10_bio->sectors,
2208 &conf->mirrors[d].rdev->corrected_errors);
2209
2210
2211
2212
2213 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
2214 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
2215 atomic_dec_and_test(&r10_bio->remaining)) {
2216
2217
2218
2219 reschedule_retry(r10_bio);
2220 }
2221}
2222
2223static void end_sync_read(struct bio *bio)
2224{
2225 struct r10bio *r10_bio = get_resync_r10bio(bio);
2226 struct r10conf *conf = r10_bio->mddev->private;
2227 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
2228
2229 __end_sync_read(r10_bio, bio, d);
2230}
2231
2232static void end_reshape_read(struct bio *bio)
2233{
2234
2235 struct r10bio *r10_bio = bio->bi_private;
2236
2237 __end_sync_read(r10_bio, bio, r10_bio->read_slot);
2238}
2239
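/*
 * Drop one reference on a sync/recovery r10bio.  When the last reference
 * goes away, either hand the r10bio to raid10d for bad-block / write-error
 * handling or release the buffer and account the completed sectors.
 */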
2240static void end_sync_request(struct r10bio *r10_bio)
2241{
2242 struct mddev *mddev = r10_bio->mddev;
2243
2244 while (atomic_dec_and_test(&r10_bio->remaining)) {
2245 if (r10_bio->master_bio == NULL) {
2246
2247 sector_t s = r10_bio->sectors;
2248 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2249 test_bit(R10BIO_WriteError, &r10_bio->state))
2250 reschedule_retry(r10_bio);
2251 else
2252 put_buf(r10_bio);
2253 md_done_sync(mddev, s, 1);
2254 break;
2255 } else {
2256 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
2257 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2258 test_bit(R10BIO_WriteError, &r10_bio->state))
2259 reschedule_retry(r10_bio);
2260 else
2261 put_buf(r10_bio);
2262 r10_bio = r10_bio2;
2263 }
2264 }
2265}
2266
2267static void end_sync_write(struct bio *bio)
2268{
2269 struct r10bio *r10_bio = get_resync_r10bio(bio);
2270 struct mddev *mddev = r10_bio->mddev;
2271 struct r10conf *conf = mddev->private;
2272 int d;
2273 sector_t first_bad;
2274 int bad_sectors;
2275 int slot;
2276 int repl;
2277 struct md_rdev *rdev = NULL;
2278
2279 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
2280 if (repl)
2281 rdev = conf->mirrors[d].replacement;
2282 else
2283 rdev = conf->mirrors[d].rdev;
2284
2285 if (bio->bi_status) {
2286 if (repl)
2287 md_error(mddev, rdev);
2288 else {
2289 set_bit(WriteErrorSeen, &rdev->flags);
2290 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2291 set_bit(MD_RECOVERY_NEEDED,
2292 &rdev->mddev->recovery);
2293 set_bit(R10BIO_WriteError, &r10_bio->state);
2294 }
2295 } else if (is_badblock(rdev,
2296 r10_bio->devs[slot].addr,
2297 r10_bio->sectors,
2298 &first_bad, &bad_sectors))
2299 set_bit(R10BIO_MadeGood, &r10_bio->state);
2300
2301 rdev_dec_pending(rdev, mddev);
2302
2303 end_sync_request(r10_bio);
2304}

/*
 * Note: sync and recovery are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * Requests arrive for physical addresses, so we need to map them to
 * virtual addresses first; every physical address corresponds to
 * raid_disks/copies virtual addresses, and we use the lowest of them.
 *
 * Once all copies have been read, each of the other copies is compared
 * with the first good one; any copy that differs (or that failed to read)
 * is rewritten from that first good copy.
 */
2322static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2323{
2324 struct r10conf *conf = mddev->private;
2325 int i, first;
2326 struct bio *tbio, *fbio;
2327 int vcnt;
2328 struct page **tpages, **fpages;
2329
2330 atomic_set(&r10_bio->remaining, 1);
2331
2332
2333 for (i=0; i<conf->copies; i++)
2334 if (!r10_bio->devs[i].bio->bi_status)
2335 break;
2336
2337 if (i == conf->copies)
2338 goto done;
2339
2340 first = i;
2341 fbio = r10_bio->devs[i].bio;
2342 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2343 fbio->bi_iter.bi_idx = 0;
2344 fpages = get_resync_pages(fbio)->pages;
2345
2346 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2347
2348 for (i=0 ; i < conf->copies ; i++) {
2349 int j, d;
2350 struct md_rdev *rdev;
2351 struct resync_pages *rp;
2352
2353 tbio = r10_bio->devs[i].bio;
2354
2355 if (tbio->bi_end_io != end_sync_read)
2356 continue;
2357 if (i == first)
2358 continue;
2359
2360 tpages = get_resync_pages(tbio)->pages;
2361 d = r10_bio->devs[i].devnum;
2362 rdev = conf->mirrors[d].rdev;
2363 if (!r10_bio->devs[i].bio->bi_status) {
2364
2365
2366
2367
2368 int sectors = r10_bio->sectors;
2369 for (j = 0; j < vcnt; j++) {
2370 int len = PAGE_SIZE;
2371 if (sectors < (len / 512))
2372 len = sectors * 512;
2373 if (memcmp(page_address(fpages[j]),
2374 page_address(tpages[j]),
2375 len))
2376 break;
2377 sectors -= len/512;
2378 }
2379 if (j == vcnt)
2380 continue;
2381 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2382 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2383
2384 continue;
2385 } else if (test_bit(FailFast, &rdev->flags)) {
2386
2387 md_error(rdev->mddev, rdev);
2388 continue;
2389 }
2390
		/*
		 * This copy needs rewriting, either to fix an inconsistency
		 * or an unreadable block.  Reset the bio, reattach its
		 * resync pages and turn it into a write of the good data.
		 */
2395 rp = get_resync_pages(tbio);
2396 bio_reset(tbio);
2397
2398 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2399
2400 rp->raid_bio = r10_bio;
2401 tbio->bi_private = rp;
2402 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2403 tbio->bi_end_io = end_sync_write;
2404 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2405
2406 bio_copy_data(tbio, fbio);
2407
2408 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2409 atomic_inc(&r10_bio->remaining);
2410 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2411
2412 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2413 tbio->bi_opf |= MD_FAILFAST;
2414 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2415 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2416 submit_bio_noacct(tbio);
2417 }
2418
	/*
	 * Now write out to any replacement devices that are active.
	 */
2422 for (i = 0; i < conf->copies; i++) {
2423 int d;
2424
2425 tbio = r10_bio->devs[i].repl_bio;
2426 if (!tbio || !tbio->bi_end_io)
2427 continue;
2428 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2429 && r10_bio->devs[i].bio != fbio)
2430 bio_copy_data(tbio, fbio);
2431 d = r10_bio->devs[i].devnum;
2432 atomic_inc(&r10_bio->remaining);
2433 md_sync_acct(conf->mirrors[d].replacement->bdev,
2434 bio_sectors(tbio));
2435 submit_bio_noacct(tbio);
2436 }
2437
2438done:
2439 if (atomic_dec_and_test(&r10_bio->remaining)) {
2440 md_done_sync(mddev, r10_bio->sectors, 1);
2441 put_buf(r10_bio);
2442 }
2443}
2444
/*
 * Recovery path.
 *
 * Recovery works across physical (device) addresses: for each block we
 * read from a working copy and write to the device(s) being rebuilt.
 * The helpers below deal with read errors hit while recovering.
 */
2455static void fix_recovery_read_error(struct r10bio *r10_bio)
2456{
	/*
	 * A read failed during recovery.  Retry it in page-sized chunks:
	 * if a retried read succeeds, write the data to the device being
	 * rebuilt; if it still fails, record bad blocks on both the source
	 * and the destination device.
	 */
2464 struct mddev *mddev = r10_bio->mddev;
2465 struct r10conf *conf = mddev->private;
2466 struct bio *bio = r10_bio->devs[0].bio;
2467 sector_t sect = 0;
2468 int sectors = r10_bio->sectors;
2469 int idx = 0;
2470 int dr = r10_bio->devs[0].devnum;
2471 int dw = r10_bio->devs[1].devnum;
2472 struct page **pages = get_resync_pages(bio)->pages;
2473
2474 while (sectors) {
2475 int s = sectors;
2476 struct md_rdev *rdev;
2477 sector_t addr;
2478 int ok;
2479
2480 if (s > (PAGE_SIZE>>9))
2481 s = PAGE_SIZE >> 9;
2482
2483 rdev = conf->mirrors[dr].rdev;
2484		addr = r10_bio->devs[0].addr + sect;
2485 ok = sync_page_io(rdev,
2486 addr,
2487 s << 9,
2488 pages[idx],
2489 REQ_OP_READ, 0, false);
2490 if (ok) {
2491 rdev = conf->mirrors[dw].rdev;
2492 addr = r10_bio->devs[1].addr + sect;
2493 ok = sync_page_io(rdev,
2494 addr,
2495 s << 9,
2496 pages[idx],
2497 REQ_OP_WRITE, 0, false);
2498 if (!ok) {
2499 set_bit(WriteErrorSeen, &rdev->flags);
2500 if (!test_and_set_bit(WantReplacement,
2501 &rdev->flags))
2502 set_bit(MD_RECOVERY_NEEDED,
2503 &rdev->mddev->recovery);
2504 }
2505 }
2506 if (!ok) {
			/*
			 * The read from the source (or the write to the
			 * device being rebuilt) failed: record a bad block
			 * there.  It is not a problem if recording it
			 * fails - the block really is bad.
			 */
2511 rdev_set_badblocks(rdev, addr, s, 0);
2512
2513 if (rdev != conf->mirrors[dw].rdev) {
2514
2515 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2516 addr = r10_bio->devs[1].addr + sect;
2517 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2518 if (!ok) {
2519
2520 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2521 mdname(mddev));
2522
2523 conf->mirrors[dw].recovery_disabled
2524 = mddev->recovery_disabled;
2525 set_bit(MD_RECOVERY_INTR,
2526 &mddev->recovery);
2527 break;
2528 }
2529 }
2530 }
2531
2532 sectors -= s;
2533 sect += s;
2534 idx++;
2535 }
2536}
2537
2538static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2539{
2540 struct r10conf *conf = mddev->private;
2541 int d;
2542 struct bio *wbio, *wbio2;
2543
2544 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2545 fix_recovery_read_error(r10_bio);
2546 end_sync_request(r10_bio);
2547 return;
2548 }
2549
2550
2551
2552
2553
2554 d = r10_bio->devs[1].devnum;
2555 wbio = r10_bio->devs[1].bio;
2556 wbio2 = r10_bio->devs[1].repl_bio;
2557
2558
2559
2560
2561 if (wbio2 && !wbio2->bi_end_io)
2562 wbio2 = NULL;
2563 if (wbio->bi_end_io) {
2564 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2565 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2566 submit_bio_noacct(wbio);
2567 }
2568 if (wbio2) {
2569 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2570 md_sync_acct(conf->mirrors[d].replacement->bdev,
2571 bio_sectors(wbio2));
2572 submit_bio_noacct(wbio2);
2573 }
2574}
2575
/*
 * Used by fix_read_error() to decay the per-rdev read error count.
 * The count is halved for every hour that has passed since the last
 * recorded read error.
 */
2582static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2583{
2584 long cur_time_mon;
2585 unsigned long hours_since_last;
2586 unsigned int read_errors = atomic_read(&rdev->read_errors);
2587
2588 cur_time_mon = ktime_get_seconds();
2589
2590 if (rdev->last_read_error == 0) {
2591
2592 rdev->last_read_error = cur_time_mon;
2593 return;
2594 }
2595
2596 hours_since_last = (long)(cur_time_mon -
2597 rdev->last_read_error) / 3600;
2598
2599 rdev->last_read_error = cur_time_mon;
2600
	/*
	 * If more hours have passed than read_errors has bits, just clear
	 * the count; otherwise the shift by hours_since_last below would
	 * be undefined.
	 */
2606 if (hours_since_last >= 8 * sizeof(read_errors))
2607 atomic_set(&rdev->read_errors, 0);
2608 else
2609 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2610}
2611
2612static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2613 int sectors, struct page *page, int rw)
2614{
2615 sector_t first_bad;
2616 int bad_sectors;
2617
2618 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2619 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2620 return -1;
2621 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2622
2623 return 1;
2624 if (rw == WRITE) {
2625 set_bit(WriteErrorSeen, &rdev->flags);
2626 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2627 set_bit(MD_RECOVERY_NEEDED,
2628 &rdev->mddev->recovery);
2629 }
2630
2631 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2632 md_error(rdev->mddev, rdev);
2633 return 0;
2634}
2635
2636
/*
 * fix_read_error() is called from raid10d when a read against a working
 * device fails.  It retries the read on other mirrors, writes the good
 * data back over the failing sectors and re-reads to verify the fix.
 * Sectors that cannot be corrected get a bad block recorded, or the
 * device is failed.
 */
2644static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2645{
2646 int sect = 0;
2647 int sectors = r10_bio->sectors;
2648 struct md_rdev *rdev;
2649 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2650 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2651
2652
2653
2654
2655 rdev = conf->mirrors[d].rdev;
2656
2657 if (test_bit(Faulty, &rdev->flags))
2658
2659
2660 return;
2661
2662 check_decay_read_errors(mddev, rdev);
2663 atomic_inc(&rdev->read_errors);
2664 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2665 char b[BDEVNAME_SIZE];
2666 bdevname(rdev->bdev, b);
2667
2668 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2669 mdname(mddev), b,
2670 atomic_read(&rdev->read_errors), max_read_errors);
2671 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2672 mdname(mddev), b);
2673 md_error(mddev, rdev);
2674 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2675 return;
2676 }
2677
2678	while (sectors) {
2679 int s = sectors;
2680 int sl = r10_bio->read_slot;
2681 int success = 0;
2682 int start;
2683
2684 if (s > (PAGE_SIZE>>9))
2685 s = PAGE_SIZE >> 9;
2686
2687 rcu_read_lock();
2688 do {
2689 sector_t first_bad;
2690 int bad_sectors;
2691
2692 d = r10_bio->devs[sl].devnum;
2693 rdev = rcu_dereference(conf->mirrors[d].rdev);
2694 if (rdev &&
2695 test_bit(In_sync, &rdev->flags) &&
2696 !test_bit(Faulty, &rdev->flags) &&
2697 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2698 &first_bad, &bad_sectors) == 0) {
2699 atomic_inc(&rdev->nr_pending);
2700 rcu_read_unlock();
2701 success = sync_page_io(rdev,
2702 r10_bio->devs[sl].addr +
2703 sect,
2704 s<<9,
2705 conf->tmppage,
2706 REQ_OP_READ, 0, false);
2707 rdev_dec_pending(rdev, mddev);
2708 rcu_read_lock();
2709 if (success)
2710 break;
2711 }
2712 sl++;
2713 if (sl == conf->copies)
2714 sl = 0;
2715 } while (!success && sl != r10_bio->read_slot);
2716 rcu_read_unlock();
2717
2718 if (!success) {
			/*
			 * Could not read this block from any device: record
			 * a bad block on the device we were reading from,
			 * and fail that device if the bad block cannot be
			 * recorded.
			 */
2723 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2724 rdev = conf->mirrors[dn].rdev;
2725
2726 if (!rdev_set_badblocks(
2727 rdev,
2728 r10_bio->devs[r10_bio->read_slot].addr
2729 + sect,
2730 s, 0)) {
2731 md_error(mddev, rdev);
2732 r10_bio->devs[r10_bio->read_slot].bio
2733 = IO_BLOCKED;
2734 }
2735 break;
2736 }
2737
2738 start = sl;
2739
2740 rcu_read_lock();
2741 while (sl != r10_bio->read_slot) {
2742 char b[BDEVNAME_SIZE];
2743
2744 if (sl==0)
2745 sl = conf->copies;
2746 sl--;
2747 d = r10_bio->devs[sl].devnum;
2748 rdev = rcu_dereference(conf->mirrors[d].rdev);
2749 if (!rdev ||
2750 test_bit(Faulty, &rdev->flags) ||
2751 !test_bit(In_sync, &rdev->flags))
2752 continue;
2753
2754 atomic_inc(&rdev->nr_pending);
2755 rcu_read_unlock();
2756 if (r10_sync_page_io(rdev,
2757 r10_bio->devs[sl].addr +
2758 sect,
2759 s, conf->tmppage, WRITE)
2760 == 0) {
2761
2762 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2763 mdname(mddev), s,
2764 (unsigned long long)(
2765 sect +
2766 choose_data_offset(r10_bio,
2767 rdev)),
2768 bdevname(rdev->bdev, b));
2769 pr_notice("md/raid10:%s: %s: failing drive\n",
2770 mdname(mddev),
2771 bdevname(rdev->bdev, b));
2772 }
2773 rdev_dec_pending(rdev, mddev);
2774 rcu_read_lock();
2775 }
2776 sl = start;
2777 while (sl != r10_bio->read_slot) {
2778 char b[BDEVNAME_SIZE];
2779
2780 if (sl==0)
2781 sl = conf->copies;
2782 sl--;
2783 d = r10_bio->devs[sl].devnum;
2784 rdev = rcu_dereference(conf->mirrors[d].rdev);
2785 if (!rdev ||
2786 test_bit(Faulty, &rdev->flags) ||
2787 !test_bit(In_sync, &rdev->flags))
2788 continue;
2789
2790 atomic_inc(&rdev->nr_pending);
2791 rcu_read_unlock();
2792 switch (r10_sync_page_io(rdev,
2793 r10_bio->devs[sl].addr +
2794 sect,
2795 s, conf->tmppage,
2796 READ)) {
2797 case 0:
2798
2799 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2800 mdname(mddev), s,
2801 (unsigned long long)(
2802 sect +
2803 choose_data_offset(r10_bio, rdev)),
2804 bdevname(rdev->bdev, b));
2805 pr_notice("md/raid10:%s: %s: failing drive\n",
2806 mdname(mddev),
2807 bdevname(rdev->bdev, b));
2808 break;
2809 case 1:
2810 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2811 mdname(mddev), s,
2812 (unsigned long long)(
2813 sect +
2814 choose_data_offset(r10_bio, rdev)),
2815 bdevname(rdev->bdev, b));
2816 atomic_add(s, &rdev->corrected_errors);
2817 }
2818
2819 rdev_dec_pending(rdev, mddev);
2820 rcu_read_lock();
2821 }
2822 rcu_read_unlock();
2823
2824 sectors -= s;
2825 sect += s;
2826 }
2827}
2828
2829static int narrow_write_error(struct r10bio *r10_bio, int i)
2830{
2831 struct bio *bio = r10_bio->master_bio;
2832 struct mddev *mddev = r10_bio->mddev;
2833 struct r10conf *conf = mddev->private;
2834 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2835
	/*
	 * The master bio holds the data that failed to reach slot 'i'.
	 * Retry the write one bad-block-sized chunk at a time by cloning
	 * and trimming the bio; wherever a chunk still fails, record a
	 * bad block.  The bio need not be aligned to bad-block boundaries,
	 * so the first chunk may be shorter than block_sectors.
	 *
	 * We currently own a reference on the rdev.
	 */
2846 int block_sectors;
2847 sector_t sector;
2848 int sectors;
2849 int sect_to_write = r10_bio->sectors;
2850 int ok = 1;
2851
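	/*
	 * A negative badblocks shift means bad-block recording is disabled
	 * for this device, so the failed write cannot be narrowed down.
	 */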
2852 if (rdev->badblocks.shift < 0)
2853 return 0;
2854
2855 block_sectors = roundup(1 << rdev->badblocks.shift,
2856 bdev_logical_block_size(rdev->bdev) >> 9);
2857 sector = r10_bio->sector;
2858 sectors = ((r10_bio->sector + block_sectors)
2859 & ~(sector_t)(block_sectors - 1))
2860 - sector;
2861
2862 while (sect_to_write) {
2863 struct bio *wbio;
2864 sector_t wsector;
2865 if (sectors > sect_to_write)
2866 sectors = sect_to_write;
2867
2868 wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
2869 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2870 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2871 wbio->bi_iter.bi_sector = wsector +
2872 choose_data_offset(r10_bio, rdev);
2873 bio_set_dev(wbio, rdev->bdev);
2874 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2875
2876 if (submit_bio_wait(wbio) < 0)
2877
2878 ok = rdev_set_badblocks(rdev, wsector,
2879 sectors, 0)
2880 && ok;
2881
2882 bio_put(wbio);
2883 sect_to_write -= sectors;
2884 sector += sectors;
2885 sectors = block_sectors;
2886 }
2887 return ok;
2888}
2889
2890static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2891{
2892 int slot = r10_bio->read_slot;
2893 struct bio *bio;
2894 struct r10conf *conf = mddev->private;
2895 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2896
	/*
	 * A normal read failed.  If the array is read-only, just avoid this
	 * device when retrying; if FailFast is set, fail the device;
	 * otherwise freeze the array and try to repair the bad sectors via
	 * fix_read_error().  The read is then re-issued so another mirror
	 * can serve it.
	 */
2905 bio = r10_bio->devs[slot].bio;
2906 bio_put(bio);
2907 r10_bio->devs[slot].bio = NULL;
2908
2909 if (mddev->ro)
2910 r10_bio->devs[slot].bio = IO_BLOCKED;
2911 else if (!test_bit(FailFast, &rdev->flags)) {
2912 freeze_array(conf, 1);
2913 fix_read_error(conf, mddev, r10_bio);
2914 unfreeze_array(conf);
2915 } else
2916 md_error(mddev, rdev);
2917
2918 rdev_dec_pending(rdev, mddev);
2919 allow_barrier(conf);
2920 r10_bio->state = 0;
2921 raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2922}
2923
2924static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2925{
	/*
	 * Some kind of write has completed.  If it succeeded where a bad
	 * block had been recorded, clear the bad block; if it failed,
	 * either narrow the failure down to new bad blocks or fail the
	 * device.
	 */
2932 int m;
2933 struct md_rdev *rdev;
2934
2935 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2936 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2937 for (m = 0; m < conf->copies; m++) {
2938 int dev = r10_bio->devs[m].devnum;
2939 rdev = conf->mirrors[dev].rdev;
2940 if (r10_bio->devs[m].bio == NULL ||
2941 r10_bio->devs[m].bio->bi_end_io == NULL)
2942 continue;
2943 if (!r10_bio->devs[m].bio->bi_status) {
2944 rdev_clear_badblocks(
2945 rdev,
2946 r10_bio->devs[m].addr,
2947 r10_bio->sectors, 0);
2948 } else {
2949 if (!rdev_set_badblocks(
2950 rdev,
2951 r10_bio->devs[m].addr,
2952 r10_bio->sectors, 0))
2953 md_error(conf->mddev, rdev);
2954 }
2955 rdev = conf->mirrors[dev].replacement;
2956 if (r10_bio->devs[m].repl_bio == NULL ||
2957 r10_bio->devs[m].repl_bio->bi_end_io == NULL)
2958 continue;
2959
2960 if (!r10_bio->devs[m].repl_bio->bi_status) {
2961 rdev_clear_badblocks(
2962 rdev,
2963 r10_bio->devs[m].addr,
2964 r10_bio->sectors, 0);
2965 } else {
2966 if (!rdev_set_badblocks(
2967 rdev,
2968 r10_bio->devs[m].addr,
2969 r10_bio->sectors, 0))
2970 md_error(conf->mddev, rdev);
2971 }
2972 }
2973 put_buf(r10_bio);
2974 } else {
2975 bool fail = false;
2976 for (m = 0; m < conf->copies; m++) {
2977 int dev = r10_bio->devs[m].devnum;
2978 struct bio *bio = r10_bio->devs[m].bio;
2979 rdev = conf->mirrors[dev].rdev;
2980 if (bio == IO_MADE_GOOD) {
2981 rdev_clear_badblocks(
2982 rdev,
2983 r10_bio->devs[m].addr,
2984 r10_bio->sectors, 0);
2985 rdev_dec_pending(rdev, conf->mddev);
2986 } else if (bio != NULL && bio->bi_status) {
2987 fail = true;
2988 if (!narrow_write_error(r10_bio, m)) {
2989 md_error(conf->mddev, rdev);
2990 set_bit(R10BIO_Degraded,
2991 &r10_bio->state);
2992 }
2993 rdev_dec_pending(rdev, conf->mddev);
2994 }
2995 bio = r10_bio->devs[m].repl_bio;
2996 rdev = conf->mirrors[dev].replacement;
2997 if (rdev && bio == IO_MADE_GOOD) {
2998 rdev_clear_badblocks(
2999 rdev,
3000 r10_bio->devs[m].addr,
3001 r10_bio->sectors, 0);
3002 rdev_dec_pending(rdev, conf->mddev);
3003 }
3004 }
3005 if (fail) {
3006 spin_lock_irq(&conf->device_lock);
3007 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
3008 conf->nr_queued++;
3009 spin_unlock_irq(&conf->device_lock);
3010
			/*
			 * In case freeze_array() is waiting for
			 * nr_pending == nr_queued + extra to become true.
			 */
3014 wake_up(&conf->wait_barrier);
3015 md_wakeup_thread(conf->mddev->thread);
3016 } else {
3017 if (test_bit(R10BIO_WriteError,
3018 &r10_bio->state))
3019 close_write(r10_bio);
3020 raid_end_bio_io(r10_bio);
3021 }
3022 }
3023}
3024
3025static void raid10d(struct md_thread *thread)
3026{
3027 struct mddev *mddev = thread->mddev;
3028 struct r10bio *r10_bio;
3029 unsigned long flags;
3030 struct r10conf *conf = mddev->private;
3031 struct list_head *head = &conf->retry_list;
3032 struct blk_plug plug;
3033
3034 md_check_recovery(mddev);
3035
3036 if (!list_empty_careful(&conf->bio_end_io_list) &&
3037 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
3038 LIST_HEAD(tmp);
3039 spin_lock_irqsave(&conf->device_lock, flags);
3040 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
3041 while (!list_empty(&conf->bio_end_io_list)) {
3042 list_move(conf->bio_end_io_list.prev, &tmp);
3043 conf->nr_queued--;
3044 }
3045 }
3046 spin_unlock_irqrestore(&conf->device_lock, flags);
3047 while (!list_empty(&tmp)) {
3048 r10_bio = list_first_entry(&tmp, struct r10bio,
3049 retry_list);
3050 list_del(&r10_bio->retry_list);
3051 if (mddev->degraded)
3052 set_bit(R10BIO_Degraded, &r10_bio->state);
3053
3054 if (test_bit(R10BIO_WriteError,
3055 &r10_bio->state))
3056 close_write(r10_bio);
3057 raid_end_bio_io(r10_bio);
3058 }
3059 }
3060
3061 blk_start_plug(&plug);
3062 for (;;) {
3063
3064 flush_pending_writes(conf);
3065
3066 spin_lock_irqsave(&conf->device_lock, flags);
3067 if (list_empty(head)) {
3068 spin_unlock_irqrestore(&conf->device_lock, flags);
3069 break;
3070 }
3071 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
3072 list_del(head->prev);
3073 conf->nr_queued--;
3074 spin_unlock_irqrestore(&conf->device_lock, flags);
3075
3076 mddev = r10_bio->mddev;
3077 conf = mddev->private;
3078 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
3079 test_bit(R10BIO_WriteError, &r10_bio->state))
3080 handle_write_completed(conf, r10_bio);
3081 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
3082 reshape_request_write(mddev, r10_bio);
3083 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
3084 sync_request_write(mddev, r10_bio);
3085 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
3086 recovery_request_write(mddev, r10_bio);
3087 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
3088 handle_read_error(mddev, r10_bio);
3089 else
3090 WARN_ON_ONCE(1);
3091
3092 cond_resched();
3093 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
3094 md_check_recovery(mddev);
3095 }
3096 blk_finish_plug(&plug);
3097}
3098
3099static int init_resync(struct r10conf *conf)
3100{
3101 int ret, buffs, i;
3102
3103 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
3104 BUG_ON(mempool_initialized(&conf->r10buf_pool));
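	/*
	 * Note whether any slot currently has a replacement device;
	 * r10buf_pool_alloc() then allocates bios for the replacements too.
	 */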
3105 conf->have_replacement = 0;
3106 for (i = 0; i < conf->geo.raid_disks; i++)
3107 if (conf->mirrors[i].replacement)
3108 conf->have_replacement = 1;
3109 ret = mempool_init(&conf->r10buf_pool, buffs,
3110 r10buf_pool_alloc, r10buf_pool_free, conf);
3111 if (ret)
3112 return ret;
3113 conf->next_resync = 0;
3114 return 0;
3115}
3116
3117static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
3118{
3119 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
3120	struct resync_pages *rp;
3121 struct bio *bio;
3122 int nalloc;
3123 int i;
3124
3125 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
3126 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
3127 nalloc = conf->copies;
3128 else
3129 nalloc = 2;
3130
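	/*
	 * bio_reset() clears bi_private, so save and restore the
	 * resync_pages pointer across each reset.
	 */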
3131 for (i = 0; i < nalloc; i++) {
3132 bio = r10bio->devs[i].bio;
3133 rp = bio->bi_private;
3134 bio_reset(bio);
3135 bio->bi_private = rp;
3136 bio = r10bio->devs[i].repl_bio;
3137 if (bio) {
3138 rp = bio->bi_private;
3139 bio_reset(bio);
3140 bio->bi_private = rp;
3141 }
3142 }
3143 return r10bio;
3144}
3145
/*
 * Set cluster_sync_high so that other cluster nodes know which range
 * [cluster_sync_low, cluster_sync_high] to add to their suspend list.
 */
3150static void raid10_set_cluster_sync_high(struct r10conf *conf)
3151{
3152 sector_t window_size;
3153 int extra_chunk, chunks;
3154
	/*
	 * A "stripe" here means one chunk on every member device, so the
	 * number of chunks per stripe is raid_disks / near_copies.  Sizing
	 * the window by stripes avoids suspending a larger I/O range than
	 * necessary.  If raid_disks is not divisible by near_copies, one
	 * extra chunk is needed so the whole stripe is covered.
	 */
3167 chunks = conf->geo.raid_disks / conf->geo.near_copies;
3168 if (conf->geo.raid_disks % conf->geo.near_copies == 0)
3169 extra_chunk = 0;
3170 else
3171 extra_chunk = 1;
3172 window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
3173
	/*
	 * Use at least a 32M window, to match the RAID1 resync window.
	 */
3177 window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
3178 CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
3179
3180 conf->cluster_sync_high = conf->cluster_sync_low + window_size;
3181}
3182
/*
 * Perform a "sync" on one "block".
 *
 * Normal I/O, particularly writes, must not conflict with an active sync
 * request; this is arranged via the pending-request counting and the
 * barrier mechanism (raise_barrier()/wait_barrier()).
 *
 * Resync and recovery are handled very differently; they are told apart
 * by MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync we walk virtual (array) addresses, read every in-sync copy
 * and rewrite any copy that differs from the first good one.
 *
 * For recovery we walk physical (device) addresses, read a good copy of
 * each block and write it to every device being rebuilt.  Several
 * r10_bio structures may therefore be in flight for one array address,
 * one per device being recovered; they are chained through the (borrowed)
 * master_bio pointer and accounted for in ->remaining.
 *
 * The bios built here are collected on a list, pages are attached to
 * them, and they are then submitted together via submit_bio_noacct().
 */
3215static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3216 int *skipped)
3217{
3218 struct r10conf *conf = mddev->private;
3219 struct r10bio *r10_bio;
3220 struct bio *biolist = NULL, *bio;
3221 sector_t max_sector, nr_sectors;
3222 int i;
3223 int max_sync;
3224 sector_t sync_blocks;
3225 sector_t sectors_skipped = 0;
3226 int chunks_skipped = 0;
3227 sector_t chunk_mask = conf->geo.chunk_mask;
3228 int page_idx = 0;
3229
3230 if (!mempool_initialized(&conf->r10buf_pool))
3231 if (init_resync(conf))
3232 return 0;
3233
	/*
	 * Allow skipping a full rebuild for the incremental assembly of a
	 * clean array, as RAID1 does.
	 */
3238 if (mddev->bitmap == NULL &&
3239 mddev->recovery_cp == MaxSector &&
3240 mddev->reshape_position == MaxSector &&
3241 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
3242 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
3243 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
3244 conf->fullsync == 0) {
3245 *skipped = 1;
3246 return mddev->dev_sectors - sector_nr;
3247 }
3248
3249 skipped:
3250 max_sector = mddev->dev_sectors;
3251 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
3252 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3253 max_sector = mddev->resync_max_sectors;
3254 if (sector_nr >= max_sector) {
3255 conf->cluster_sync_low = 0;
3256 conf->cluster_sync_high = 0;
		/*
		 * We have reached the end.  If the sync was interrupted we
		 * must still close off the bitmap ranges we had started:
		 * mddev->curr_resync is an array address, so for recovery
		 * it has to be translated to a virtual address on each
		 * device before calling md_bitmap_end_sync().
		 */
3267 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
3268 end_reshape(conf);
3269 close_sync(conf);
3270 return 0;
3271 }
3272
3273 if (mddev->curr_resync < max_sector) {
3274 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3275 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
3276 &sync_blocks, 1);
3277 else for (i = 0; i < conf->geo.raid_disks; i++) {
3278 sector_t sect =
3279 raid10_find_virt(conf, mddev->curr_resync, i);
3280 md_bitmap_end_sync(mddev->bitmap, sect,
3281 &sync_blocks, 1);
3282 }
3283 } else {
3284
3285 if ((!mddev->bitmap || conf->fullsync)
3286 && conf->have_replacement
3287 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3288
3289
3290
3291 rcu_read_lock();
3292 for (i = 0; i < conf->geo.raid_disks; i++) {
3293 struct md_rdev *rdev =
3294 rcu_dereference(conf->mirrors[i].replacement);
3295 if (rdev)
3296 rdev->recovery_offset = MaxSector;
3297 }
3298 rcu_read_unlock();
3299 }
3300 conf->fullsync = 0;
3301 }
3302 md_bitmap_close_sync(mddev->bitmap);
3303 close_sync(conf);
3304 *skipped = 1;
3305 return sectors_skipped;
3306 }
3307
3308 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3309 return reshape_request(mddev, sector_nr, skipped);
3310
3311 if (chunks_skipped >= conf->geo.raid_disks) {
		/*
		 * There has been nothing to do on any drive, so there is
		 * nothing to do at all: report the rest as done.
		 */
3315 *skipped = 1;
3316 return (max_sector - sector_nr) + sectors_skipped;
3317 }
3318
3319 if (max_sector > mddev->resync_max)
3320 max_sector = mddev->resync_max;
3321
	/*
	 * Keep the whole request inside a single chunk, if chunk boundaries
	 * are meaningful for this layout.
	 */
3325 if (conf->geo.near_copies < conf->geo.raid_disks &&
3326 max_sector > (sector_nr | chunk_mask))
3327 max_sector = (sector_nr | chunk_mask) + 1;
3328
	/*
	 * If non-resync activity is waiting for a turn, let it through
	 * before starting on this new sync request.
	 */
3333 if (conf->nr_waiting)
3334 schedule_timeout_uninterruptible(1);
3335
	/*
	 * The resync and recovery cases below each build an r10bio with a
	 * list of bios whose bi_end_io, bi_sector, device and bi_private
	 * are set here; pages are attached afterwards.  For recovery we may
	 * create several r10bios (one per device being rebuilt), linked
	 * back through a borrowed master_bio pointer so that ->remaining of
	 * the master counts each subordinate.
	 *
	 * bi_end_io is set to end_sync_read for bios we will read and to
	 * end_sync_write for bios we will write.
	 */
3351 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3352 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3353
3354 int j;
3355 r10_bio = NULL;
3356
3357 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3358 int still_degraded;
3359 struct r10bio *rb2;
3360 sector_t sect;
3361 int must_sync;
3362 int any_working;
3363 int need_recover = 0;
3364 int need_replace = 0;
3365 struct raid10_info *mirror = &conf->mirrors[i];
3366 struct md_rdev *mrdev, *mreplace;
3367
3368 rcu_read_lock();
3369 mrdev = rcu_dereference(mirror->rdev);
3370 mreplace = rcu_dereference(mirror->replacement);
3371
3372 if (mrdev != NULL &&
3373 !test_bit(Faulty, &mrdev->flags) &&
3374 !test_bit(In_sync, &mrdev->flags))
3375 need_recover = 1;
3376 if (mreplace != NULL &&
3377 !test_bit(Faulty, &mreplace->flags))
3378 need_replace = 1;
3379
3380 if (!need_recover && !need_replace) {
3381 rcu_read_unlock();
3382 continue;
3383 }
3384
3385 still_degraded = 0;
3386
3387 rb2 = r10_bio;
3388 sect = raid10_find_virt(conf, sector_nr, i);
3389 if (sect >= mddev->resync_max_sectors) {
3390
3391
3392
3393 rcu_read_unlock();
3394 continue;
3395 }
3396 if (mreplace && test_bit(Faulty, &mreplace->flags))
3397 mreplace = NULL;
3398
3399
3400
3401
3402 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3403 &sync_blocks, 1);
3404 if (sync_blocks < max_sync)
3405 max_sync = sync_blocks;
3406 if (!must_sync &&
3407 mreplace == NULL &&
3408 !conf->fullsync) {
3409
3410
3411
3412 chunks_skipped = -1;
3413 rcu_read_unlock();
3414 continue;
3415 }
3416 atomic_inc(&mrdev->nr_pending);
3417 if (mreplace)
3418 atomic_inc(&mreplace->nr_pending);
3419 rcu_read_unlock();
3420
3421 r10_bio = raid10_alloc_init_r10buf(conf);
3422 r10_bio->state = 0;
3423 raise_barrier(conf, rb2 != NULL);
3424 atomic_set(&r10_bio->remaining, 0);
3425
3426 r10_bio->master_bio = (struct bio*)rb2;
3427 if (rb2)
3428 atomic_inc(&rb2->remaining);
3429 r10_bio->mddev = mddev;
3430 set_bit(R10BIO_IsRecover, &r10_bio->state);
3431 r10_bio->sector = sect;
3432
3433 raid10_find_phys(conf, r10_bio);
3434
3435
3436
3437
3438 rcu_read_lock();
3439 for (j = 0; j < conf->geo.raid_disks; j++) {
3440 struct md_rdev *rdev = rcu_dereference(
3441 conf->mirrors[j].rdev);
3442 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3443 still_degraded = 1;
3444 break;
3445 }
3446 }
3447
3448 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3449 &sync_blocks, still_degraded);
3450
3451 any_working = 0;
3452 for (j=0; j<conf->copies;j++) {
3453 int k;
3454 int d = r10_bio->devs[j].devnum;
3455 sector_t from_addr, to_addr;
3456 struct md_rdev *rdev =
3457 rcu_dereference(conf->mirrors[d].rdev);
3458 sector_t sector, first_bad;
3459 int bad_sectors;
3460 if (!rdev ||
3461 !test_bit(In_sync, &rdev->flags))
3462 continue;
3463
3464 any_working = 1;
3465 sector = r10_bio->devs[j].addr;
3466
3467 if (is_badblock(rdev, sector, max_sync,
3468 &first_bad, &bad_sectors)) {
3469 if (first_bad > sector)
3470 max_sync = first_bad - sector;
3471 else {
3472 bad_sectors -= (sector
3473 - first_bad);
3474 if (max_sync > bad_sectors)
3475 max_sync = bad_sectors;
3476 continue;
3477 }
3478 }
3479 bio = r10_bio->devs[0].bio;
3480 bio->bi_next = biolist;
3481 biolist = bio;
3482 bio->bi_end_io = end_sync_read;
3483 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3484 if (test_bit(FailFast, &rdev->flags))
3485 bio->bi_opf |= MD_FAILFAST;
3486 from_addr = r10_bio->devs[j].addr;
3487 bio->bi_iter.bi_sector = from_addr +
3488 rdev->data_offset;
3489 bio_set_dev(bio, rdev->bdev);
3490 atomic_inc(&rdev->nr_pending);
3491
3492
3493 for (k=0; k<conf->copies; k++)
3494 if (r10_bio->devs[k].devnum == i)
3495 break;
3496 BUG_ON(k == conf->copies);
3497 to_addr = r10_bio->devs[k].addr;
3498 r10_bio->devs[0].devnum = d;
3499 r10_bio->devs[0].addr = from_addr;
3500 r10_bio->devs[1].devnum = i;
3501 r10_bio->devs[1].addr = to_addr;
3502
3503 if (need_recover) {
3504 bio = r10_bio->devs[1].bio;
3505 bio->bi_next = biolist;
3506 biolist = bio;
3507 bio->bi_end_io = end_sync_write;
3508 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3509 bio->bi_iter.bi_sector = to_addr
3510 + mrdev->data_offset;
3511 bio_set_dev(bio, mrdev->bdev);
3512 atomic_inc(&r10_bio->remaining);
3513 } else
3514 r10_bio->devs[1].bio->bi_end_io = NULL;
3515
3516
3517 bio = r10_bio->devs[1].repl_bio;
3518 if (bio)
3519 bio->bi_end_io = NULL;
3520
3521
3522
3523
3524 if (!need_replace)
3525 break;
3526 bio->bi_next = biolist;
3527 biolist = bio;
3528 bio->bi_end_io = end_sync_write;
3529 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3530 bio->bi_iter.bi_sector = to_addr +
3531 mreplace->data_offset;
3532 bio_set_dev(bio, mreplace->bdev);
3533 atomic_inc(&r10_bio->remaining);
3534 break;
3535 }
3536 rcu_read_unlock();
3537 if (j == conf->copies) {
3538
3539
3540 if (any_working) {
3541
3542
3543
3544 int k;
3545 for (k = 0; k < conf->copies; k++)
3546 if (r10_bio->devs[k].devnum == i)
3547 break;
3548 if (!test_bit(In_sync,
3549 &mrdev->flags)
3550 && !rdev_set_badblocks(
3551 mrdev,
3552 r10_bio->devs[k].addr,
3553 max_sync, 0))
3554 any_working = 0;
3555 if (mreplace &&
3556 !rdev_set_badblocks(
3557 mreplace,
3558 r10_bio->devs[k].addr,
3559 max_sync, 0))
3560 any_working = 0;
3561 }
3562 if (!any_working) {
3563 if (!test_and_set_bit(MD_RECOVERY_INTR,
3564 &mddev->recovery))
3565 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3566 mdname(mddev));
3567 mirror->recovery_disabled
3568 = mddev->recovery_disabled;
3569 }
3570 put_buf(r10_bio);
3571 if (rb2)
3572 atomic_dec(&rb2->remaining);
3573 r10_bio = rb2;
3574 rdev_dec_pending(mrdev, mddev);
3575 if (mreplace)
3576 rdev_dec_pending(mreplace, mddev);
3577 break;
3578 }
3579 rdev_dec_pending(mrdev, mddev);
3580 if (mreplace)
3581 rdev_dec_pending(mreplace, mddev);
3582 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
				/*
				 * Only keep MD_FAILFAST on the read if
				 * there is somewhere else to read from;
				 * 'j' is currently the first readable copy.
				 */
3587 int targets = 1;
3588 for (; j < conf->copies; j++) {
3589 int d = r10_bio->devs[j].devnum;
3590 if (conf->mirrors[d].rdev &&
3591 test_bit(In_sync,
3592 &conf->mirrors[d].rdev->flags))
3593 targets++;
3594 }
3595 if (targets == 1)
3596 r10_bio->devs[0].bio->bi_opf
3597 &= ~MD_FAILFAST;
3598 }
3599 }
3600 if (biolist == NULL) {
3601 while (r10_bio) {
3602 struct r10bio *rb2 = r10_bio;
3603 r10_bio = (struct r10bio*) rb2->master_bio;
3604 rb2->master_bio = NULL;
3605 put_buf(rb2);
3606 }
3607 goto giveup;
3608 }
3609 } else {
3610
3611 int count = 0;
3612
		/*
		 * cluster_sync_low is set from curr_resync_completed, which
		 * may lag behind sector_nr; forcing the update once we are
		 * within 2 * RESYNC_SECTORS of cluster_sync_high makes sure
		 * md_bitmap_cond_end_sync() refreshes it before we use it.
		 */
3620 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
3621 mddev_is_clustered(mddev) &&
3622 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
3623
3624 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
3625 &sync_blocks, mddev->degraded) &&
3626 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3627 &mddev->recovery)) {
3628
3629 *skipped = 1;
3630 return sync_blocks + sectors_skipped;
3631 }
3632 if (sync_blocks < max_sync)
3633 max_sync = sync_blocks;
3634 r10_bio = raid10_alloc_init_r10buf(conf);
3635 r10_bio->state = 0;
3636
3637 r10_bio->mddev = mddev;
3638 atomic_set(&r10_bio->remaining, 0);
3639 raise_barrier(conf, 0);
3640 conf->next_resync = sector_nr;
3641
3642 r10_bio->master_bio = NULL;
3643 r10_bio->sector = sector_nr;
3644 set_bit(R10BIO_IsSync, &r10_bio->state);
3645 raid10_find_phys(conf, r10_bio);
3646 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3647
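		/*
		 * Queue a read of this block from every working copy (the
		 * copies are compared later in sync_request_write()), and a
		 * write to any active replacement device.
		 */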
3648 for (i = 0; i < conf->copies; i++) {
3649 int d = r10_bio->devs[i].devnum;
3650 sector_t first_bad, sector;
3651 int bad_sectors;
3652 struct md_rdev *rdev;
3653
3654 if (r10_bio->devs[i].repl_bio)
3655 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3656
3657 bio = r10_bio->devs[i].bio;
3658 bio->bi_status = BLK_STS_IOERR;
3659 rcu_read_lock();
3660 rdev = rcu_dereference(conf->mirrors[d].rdev);
3661 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3662 rcu_read_unlock();
3663 continue;
3664 }
3665 sector = r10_bio->devs[i].addr;
3666 if (is_badblock(rdev, sector, max_sync,
3667 &first_bad, &bad_sectors)) {
3668 if (first_bad > sector)
3669 max_sync = first_bad - sector;
3670 else {
3671 bad_sectors -= (sector - first_bad);
3672 if (max_sync > bad_sectors)
3673 max_sync = bad_sectors;
3674 rcu_read_unlock();
3675 continue;
3676 }
3677 }
3678 atomic_inc(&rdev->nr_pending);
3679 atomic_inc(&r10_bio->remaining);
3680 bio->bi_next = biolist;
3681 biolist = bio;
3682 bio->bi_end_io = end_sync_read;
3683 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3684 if (test_bit(FailFast, &rdev->flags))
3685 bio->bi_opf |= MD_FAILFAST;
3686 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3687 bio_set_dev(bio, rdev->bdev);
3688 count++;
3689
3690 rdev = rcu_dereference(conf->mirrors[d].replacement);
3691 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3692 rcu_read_unlock();
3693 continue;
3694 }
3695 atomic_inc(&rdev->nr_pending);
3696
3697
3698 bio = r10_bio->devs[i].repl_bio;
3699 bio->bi_status = BLK_STS_IOERR;
3700
3701 sector = r10_bio->devs[i].addr;
3702 bio->bi_next = biolist;
3703 biolist = bio;
3704 bio->bi_end_io = end_sync_write;
3705 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3706 if (test_bit(FailFast, &rdev->flags))
3707 bio->bi_opf |= MD_FAILFAST;
3708 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3709 bio_set_dev(bio, rdev->bdev);
3710 count++;
3711 rcu_read_unlock();
3712 }
3713
3714 if (count < 2) {
3715 for (i=0; i<conf->copies; i++) {
3716 int d = r10_bio->devs[i].devnum;
3717 if (r10_bio->devs[i].bio->bi_end_io)
3718 rdev_dec_pending(conf->mirrors[d].rdev,
3719 mddev);
3720 if (r10_bio->devs[i].repl_bio &&
3721 r10_bio->devs[i].repl_bio->bi_end_io)
3722 rdev_dec_pending(
3723 conf->mirrors[d].replacement,
3724 mddev);
3725 }
3726 put_buf(r10_bio);
3727 biolist = NULL;
3728 goto giveup;
3729 }
3730 }
3731
3732 nr_sectors = 0;
3733 if (sector_nr + max_sync < max_sector)
3734 max_sector = sector_nr + max_sync;
3735 do {
3736 struct page *page;
3737 int len = PAGE_SIZE;
3738 if (sector_nr + (len>>9) > max_sector)
3739 len = (max_sector - sector_nr) << 9;
3740 if (len == 0)
3741 break;
3742 for (bio= biolist ; bio ; bio=bio->bi_next) {
3743 struct resync_pages *rp = get_resync_pages(bio);
3744 page = resync_fetch_page(rp, page_idx);
			/*
			 * This cannot fail: the bio's vec table was sized
			 * (RESYNC_PAGES) to hold all of these pages.
			 */
3749 bio_add_page(bio, page, len, 0);
3750 }
3751 nr_sectors += len>>9;
3752 sector_nr += len>>9;
3753 } while (++page_idx < RESYNC_PAGES);
3754 r10_bio->sectors = nr_sectors;
3755
3756 if (mddev_is_clustered(mddev) &&
3757 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3758
3759 if (conf->cluster_sync_high < sector_nr + nr_sectors) {
3760 conf->cluster_sync_low = mddev->curr_resync_completed;
3761 raid10_set_cluster_sync_high(conf);
3762
3763 md_cluster_ops->resync_info_update(mddev,
3764 conf->cluster_sync_low,
3765 conf->cluster_sync_high);
3766 }
3767 } else if (mddev_is_clustered(mddev)) {
3768
3769 sector_t sect_va1, sect_va2;
3770 bool broadcast_msg = false;
3771
3772 for (i = 0; i < conf->geo.raid_disks; i++) {
			/*
			 * sector_nr is a device address during recovery, so
			 * translate it to an array (virtual) address before
			 * comparing with cluster_sync_high.
			 */
3778 sect_va1 = raid10_find_virt(conf, sector_nr, i);
3779
3780 if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
3781 broadcast_msg = true;
3782
3783
3784
3785
3786 sect_va2 = raid10_find_virt(conf,
3787 mddev->curr_resync_completed, i);
3788
3789 if (conf->cluster_sync_low == 0 ||
3790 conf->cluster_sync_low > sect_va2)
3791 conf->cluster_sync_low = sect_va2;
3792 }
3793 }
3794 if (broadcast_msg) {
3795 raid10_set_cluster_sync_high(conf);
3796 md_cluster_ops->resync_info_update(mddev,
3797 conf->cluster_sync_low,
3798 conf->cluster_sync_high);
3799 }
3800 }
3801
3802 while (biolist) {
3803 bio = biolist;
3804 biolist = biolist->bi_next;
3805
3806 bio->bi_next = NULL;
3807 r10_bio = get_resync_r10bio(bio);
3808 r10_bio->sectors = nr_sectors;
3809
3810 if (bio->bi_end_io == end_sync_read) {
3811 md_sync_acct_bio(bio, nr_sectors);
3812 bio->bi_status = 0;
3813 submit_bio_noacct(bio);
3814 }
3815 }
3816
3817 if (sectors_skipped)
		/*
		 * Pretend the skipped sectors were synced; it makes no
		 * important difference in this case.
		 */
3821 md_done_sync(mddev, sectors_skipped, 1);
3822
3823 return sectors_skipped + nr_sectors;
3824 giveup:
	/*
	 * There is nowhere to write: every non-in_sync drive is failed or
	 * already resyncing, or every drive has a bad block here.  Skip
	 * ahead to the next chunk.
	 */
3829 if (sector_nr + max_sync < max_sector)
3830 max_sector = sector_nr + max_sync;
3831
3832 sectors_skipped += (max_sector - sector_nr);
3833 chunks_skipped ++;
3834 sector_nr = max_sector;
3835 goto skipped;
3836}
3837
3838static sector_t
3839raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3840{
3841 sector_t size;
3842 struct r10conf *conf = mddev->private;
3843
3844 if (!raid_disks)
3845 raid_disks = min(conf->geo.raid_disks,
3846 conf->prev.raid_disks);
3847 if (!sectors)
3848 sectors = conf->dev_sectors;
3849
3850 size = sectors >> conf->geo.chunk_shift;
3851 sector_div(size, conf->geo.far_copies);
3852 size = size * raid_disks;
3853 sector_div(size, conf->geo.near_copies);
3854
3855 return size << conf->geo.chunk_shift;
3856}
3857
3858static void calc_sectors(struct r10conf *conf, sector_t size)
3859{
	/*
	 * Calculate the number of sectors-per-device that will actually be
	 * used, and set conf->dev_sectors and conf->geo.stride.
	 */
3865 size = size >> conf->geo.chunk_shift;
3866 sector_div(size, conf->geo.far_copies);
3867 size = size * conf->geo.raid_disks;
3868 sector_div(size, conf->geo.near_copies);
3869
3870
3871 size = size * conf->copies;
3872
3873
3874
3875
3876 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3877
3878 conf->dev_sectors = size << conf->geo.chunk_shift;
3879
3880 if (conf->geo.far_offset)
3881 conf->geo.stride = 1 << conf->geo.chunk_shift;
3882 else {
3883 sector_div(size, conf->geo.far_copies);
3884 conf->geo.stride = size << conf->geo.chunk_shift;
3885 }
3886}
3887
3888enum geo_type {geo_new, geo_old, geo_start};
3889static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3890{
3891 int nc, fc, fo;
3892 int layout, chunk, disks;
3893 switch (new) {
3894 case geo_old:
3895 layout = mddev->layout;
3896 chunk = mddev->chunk_sectors;
3897 disks = mddev->raid_disks - mddev->delta_disks;
3898 break;
3899 case geo_new:
3900 layout = mddev->new_layout;
3901 chunk = mddev->new_chunk_sectors;
3902 disks = mddev->raid_disks;
3903 break;
3904 default:
3905 case geo_start:
3906
3907 layout = mddev->new_layout;
3908 chunk = mddev->new_chunk_sectors;
3909 disks = mddev->raid_disks + mddev->delta_disks;
3910 break;
3911 }
3912 if (layout >> 19)
3913 return -1;
3914 if (chunk < (PAGE_SIZE >> 9) ||
3915 !is_power_of_2(chunk))
3916 return -2;
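	/*
	 * Layout encoding: bits 0-7 are near_copies, bits 8-15 far_copies,
	 * bit 16 selects far_offset, and bits 17-18 choose how the far set
	 * size is derived (handled in the switch below).
	 */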
3917 nc = layout & 255;
3918 fc = (layout >> 8) & 255;
3919 fo = layout & (1<<16);
3920 geo->raid_disks = disks;
3921 geo->near_copies = nc;
3922 geo->far_copies = fc;
3923 geo->far_offset = fo;
3924 switch (layout >> 17) {
3925 case 0:
3926 geo->far_set_size = disks;
3927 break;
3928 case 1:
3929
3930 geo->far_set_size = disks/fc;
3931 WARN(geo->far_set_size < fc,
3932 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3933 break;
3934 case 2:
3935 geo->far_set_size = fc * nc;
3936 break;
3937 default:
3938 return -1;
3939 }
3940 geo->chunk_mask = chunk - 1;
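	/* chunk is a power of two, so ffz(~chunk) below equals log2(chunk) */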
3941 geo->chunk_shift = ffz(~chunk);
3942 return nc*fc;
3943}
3944
3945static struct r10conf *setup_conf(struct mddev *mddev)
3946{
3947 struct r10conf *conf = NULL;
3948 int err = -EINVAL;
3949 struct geom geo;
3950 int copies;
3951
3952 copies = setup_geo(&geo, mddev, geo_new);
3953
3954 if (copies == -2) {
3955 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3956 mdname(mddev), PAGE_SIZE);
3957 goto out;
3958 }
3959
3960 if (copies < 2 || copies > mddev->raid_disks) {
3961 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3962 mdname(mddev), mddev->new_layout);
3963 goto out;
3964 }
3965
3966 err = -ENOMEM;
3967 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3968 if (!conf)
3969 goto out;
3970
3971
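	/*
	 * Allocate mirrors for the larger of the old and new disk counts:
	 * when a reshape is reducing disks, delta_disks is negative and the
	 * old (larger) count still has to fit.
	 */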
3972 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
3973 sizeof(struct raid10_info),
3974 GFP_KERNEL);
3975 if (!conf->mirrors)
3976 goto out;
3977
3978 conf->tmppage = alloc_page(GFP_KERNEL);
3979 if (!conf->tmppage)
3980 goto out;
3981
3982 conf->geo = geo;
3983 conf->copies = copies;
3984 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
3985 rbio_pool_free, conf);
3986 if (err)
3987 goto out;
3988
3989 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
3990 if (err)
3991 goto out;
3992
3993 calc_sectors(conf, mddev->dev_sectors);
3994 if (mddev->reshape_position == MaxSector) {
3995 conf->prev = conf->geo;
3996 conf->reshape_progress = MaxSector;
3997 } else {
3998 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3999 err = -EINVAL;
4000 goto out;
4001 }
4002 conf->reshape_progress = mddev->reshape_position;
4003 if (conf->prev.far_offset)
4004 conf->prev.stride = 1 << conf->prev.chunk_shift;
4005 else
4006
4007 conf->prev.stride = conf->dev_sectors;
4008 }
4009 conf->reshape_safe = conf->reshape_progress;
4010 spin_lock_init(&conf->device_lock);
4011 INIT_LIST_HEAD(&conf->retry_list);
4012 INIT_LIST_HEAD(&conf->bio_end_io_list);
4013
4014 spin_lock_init(&conf->resync_lock);
4015 init_waitqueue_head(&conf->wait_barrier);
4016 atomic_set(&conf->nr_pending, 0);
4017
4018 err = -ENOMEM;
4019 conf->thread = md_register_thread(raid10d, mddev, "raid10");
4020 if (!conf->thread)
4021 goto out;
4022
4023 conf->mddev = mddev;
4024 return conf;
4025
4026 out:
4027 if (conf) {
4028 mempool_exit(&conf->r10bio_pool);
4029 kfree(conf->mirrors);
4030 safe_put_page(conf->tmppage);
4031 bioset_exit(&conf->bio_split);
4032 kfree(conf);
4033 }
4034 return ERR_PTR(err);
4035}
4036
4037static void raid10_set_io_opt(struct r10conf *conf)
4038{
4039 int raid_disks = conf->geo.raid_disks;
4040
4041 if (!(conf->geo.raid_disks % conf->geo.near_copies))
4042 raid_disks /= conf->geo.near_copies;
4043 blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
4044 raid_disks);
4045}
4046
4047static int raid10_run(struct mddev *mddev)
4048{
4049 struct r10conf *conf;
4050 int i, disk_idx;
4051 struct raid10_info *disk;
4052 struct md_rdev *rdev;
4053 sector_t size;
4054 sector_t min_offset_diff = 0;
4055 int first = 1;
4056 bool discard_supported = false;
4057
4058 if (mddev_init_writes_pending(mddev) < 0)
4059 return -ENOMEM;
4060
4061 if (mddev->private == NULL) {
4062 conf = setup_conf(mddev);
4063 if (IS_ERR(conf))
4064 return PTR_ERR(conf);
4065 mddev->private = conf;
4066 }
4067 conf = mddev->private;
4068 if (!conf)
4069 goto out;
4070
4071 if (mddev_is_clustered(conf->mddev)) {
4072 int fc, fo;
4073
4074 fc = (mddev->layout >> 8) & 255;
4075 fo = mddev->layout & (1<<16);
4076 if (fc > 1 || fo > 0) {
4077 pr_err("only near layout is supported by clustered"
4078 " raid10\n");
4079 goto out_free_conf;
4080 }
4081 }
4082
4083 mddev->thread = conf->thread;
4084 conf->thread = NULL;
4085
4086 if (mddev->queue) {
4087 blk_queue_max_discard_sectors(mddev->queue,
4088 UINT_MAX);
4089 blk_queue_max_write_same_sectors(mddev->queue, 0);
4090 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
4091 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
4092 raid10_set_io_opt(conf);
4093 }
4094
4095 rdev_for_each(rdev, mddev) {
4096 long long diff;
4097
4098 disk_idx = rdev->raid_disk;
4099 if (disk_idx < 0)
4100 continue;
4101 if (disk_idx >= conf->geo.raid_disks &&
4102 disk_idx >= conf->prev.raid_disks)
4103 continue;
4104 disk = conf->mirrors + disk_idx;
4105
4106 if (test_bit(Replacement, &rdev->flags)) {
4107 if (disk->replacement)
4108 goto out_free_conf;
4109 disk->replacement = rdev;
4110 } else {
4111 if (disk->rdev)
4112 goto out_free_conf;
4113 disk->rdev = rdev;
4114 }
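		/*
		 * The smallest gap between old and new data_offset across
		 * all devices bounds how far a reshape can safely progress
		 * between metadata updates.
		 */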
4115 diff = (rdev->new_data_offset - rdev->data_offset);
4116 if (!mddev->reshape_backwards)
4117 diff = -diff;
4118 if (diff < 0)
4119 diff = 0;
4120 if (first || diff < min_offset_diff)
4121 min_offset_diff = diff;
4122
4123 if (mddev->gendisk)
4124 disk_stack_limits(mddev->gendisk, rdev->bdev,
4125 rdev->data_offset << 9);
4126
4127 disk->head_position = 0;
4128
4129 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
4130 discard_supported = true;
4131 first = 0;
4132 }
4133
4134 if (mddev->queue) {
4135 if (discard_supported)
4136 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
4137 mddev->queue);
4138 else
4139 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
4140 mddev->queue);
4141 }
4142
4143 if (!enough(conf, -1)) {
4144 pr_err("md/raid10:%s: not enough operational mirrors.\n",
4145 mdname(mddev));
4146 goto out_free_conf;
4147 }
4148
4149 if (conf->reshape_progress != MaxSector) {
4150
4151 if (conf->geo.far_copies != 1 &&
4152 conf->geo.far_offset == 0)
4153 goto out_free_conf;
4154 if (conf->prev.far_copies != 1 &&
4155 conf->prev.far_offset == 0)
4156 goto out_free_conf;
4157 }
4158
4159 mddev->degraded = 0;
4160 for (i = 0;
4161 i < conf->geo.raid_disks
4162 || i < conf->prev.raid_disks;
4163 i++) {
4164
4165 disk = conf->mirrors + i;
4166
4167 if (!disk->rdev && disk->replacement) {
4168
4169 disk->rdev = disk->replacement;
4170 disk->replacement = NULL;
4171 clear_bit(Replacement, &disk->rdev->flags);
4172 }
4173
4174 if (!disk->rdev ||
4175 !test_bit(In_sync, &disk->rdev->flags)) {
4176 disk->head_position = 0;
4177 mddev->degraded++;
4178 if (disk->rdev &&
4179 disk->rdev->saved_raid_disk < 0)
4180 conf->fullsync = 1;
4181 }
4182
4183 if (disk->replacement &&
4184 !test_bit(In_sync, &disk->replacement->flags) &&
4185 disk->replacement->saved_raid_disk < 0) {
4186 conf->fullsync = 1;
4187 }
4188
4189 disk->recovery_disabled = mddev->recovery_disabled - 1;
4190 }
4191
4192 if (mddev->recovery_cp != MaxSector)
4193 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
4194 mdname(mddev));
4195 pr_info("md/raid10:%s: active with %d out of %d devices\n",
4196 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
4197 conf->geo.raid_disks);
4198
4199
4200
4201 mddev->dev_sectors = conf->dev_sectors;
4202 size = raid10_size(mddev, 0, 0);
4203 md_set_array_sectors(mddev, size);
4204 mddev->resync_max_sectors = size;
4205 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
4206
4207 if (md_integrity_register(mddev))
4208 goto out_free_conf;
4209
4210 if (conf->reshape_progress != MaxSector) {
4211 unsigned long before_length, after_length;
4212
4213 before_length = ((1 << conf->prev.chunk_shift) *
4214 conf->prev.far_copies);
4215 after_length = ((1 << conf->geo.chunk_shift) *
4216 conf->geo.far_copies);
4217
4218 if (max(before_length, after_length) > min_offset_diff) {
4219
4220 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
4221 goto out_free_conf;
4222 }
4223 conf->offset_diff = min_offset_diff;
4224
4225 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4226 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4227 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4228 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4229 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4230 "reshape");
4231 if (!mddev->sync_thread)
4232 goto out_free_conf;
4233 }
4234
4235 return 0;
4236
4237out_free_conf:
4238 md_unregister_thread(&mddev->thread);
4239 mempool_exit(&conf->r10bio_pool);
4240 safe_put_page(conf->tmppage);
4241 kfree(conf->mirrors);
4242 kfree(conf);
4243 mddev->private = NULL;
4244out:
4245 return -EIO;
4246}
4247
4248static void raid10_free(struct mddev *mddev, void *priv)
4249{
4250 struct r10conf *conf = priv;
4251
4252 mempool_exit(&conf->r10bio_pool);
4253 safe_put_page(conf->tmppage);
4254 kfree(conf->mirrors);
4255 kfree(conf->mirrors_old);
4256 kfree(conf->mirrors_new);
4257 bioset_exit(&conf->bio_split);
4258 kfree(conf);
4259}
4260
4261static void raid10_quiesce(struct mddev *mddev, int quiesce)
4262{
4263 struct r10conf *conf = mddev->private;
4264
4265 if (quiesce)
4266 raise_barrier(conf, 0);
4267 else
4268 lower_barrier(conf);
4269}
4270
4271static int raid10_resize(struct mddev *mddev, sector_t sectors)
4272{
	/*
	 * Only 'near' and 'offset' layouts can be resized; for a plain
	 * 'far' layout the data placement depends on the device size, so
	 * resizing is refused.  'sectors' is the new per-device size: the
	 * usable size is recalculated (and rounded) by raid10_size() and
	 * calc_sectors(), the bitmap is resized, and any newly exposed
	 * space is scheduled for resync.
	 */
4285 struct r10conf *conf = mddev->private;
4286 sector_t oldsize, size;
4287
4288 if (mddev->reshape_position != MaxSector)
4289 return -EBUSY;
4290
4291 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
4292 return -EINVAL;
4293
4294 oldsize = raid10_size(mddev, 0, 0);
4295 size = raid10_size(mddev, sectors, 0);
4296 if (mddev->external_size &&
4297 mddev->array_sectors > size)
4298 return -EINVAL;
4299 if (mddev->bitmap) {
4300 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
4301 if (ret)
4302 return ret;
4303 }
4304 md_set_array_sectors(mddev, size);
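	/*
	 * If the array is being grown, schedule a resync of the newly
	 * exposed space.
	 */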
4305 if (sectors > mddev->dev_sectors &&
4306 mddev->recovery_cp > oldsize) {
4307 mddev->recovery_cp = oldsize;
4308 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4309 }
4310 calc_sectors(conf, sectors);
4311 mddev->dev_sectors = conf->dev_sectors;
4312 mddev->resync_max_sectors = size;
4313 return 0;
4314}
4315
4316static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
4317{
4318 struct md_rdev *rdev;
4319 struct r10conf *conf;
4320
4321 if (mddev->degraded > 0) {
4322 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
4323 mdname(mddev));
4324 return ERR_PTR(-EINVAL);
4325 }
4326 sector_div(size, devs);
4327
4328
4329 mddev->new_level = 10;
4330
4331 mddev->new_layout = (1<<8) + 2;
4332 mddev->new_chunk_sectors = mddev->chunk_sectors;
4333 mddev->delta_disks = mddev->raid_disks;
4334 mddev->raid_disks *= 2;
4335
4336 mddev->recovery_cp = MaxSector;
4337 mddev->dev_sectors = size;
4338
4339 conf = setup_conf(mddev);
4340 if (!IS_ERR(conf)) {
4341 rdev_for_each(rdev, mddev)
4342 if (rdev->raid_disk >= 0) {
4343 rdev->new_raid_disk = rdev->raid_disk * 2;
4344 rdev->sectors = size;
4345 }
4346 conf->barrier = 1;
4347 }
4348
4349 return conf;
4350}
4351
4352static void *raid10_takeover(struct mddev *mddev)
4353{
4354 struct r0conf *raid0_conf;
4355
4356
4357
4358
4359 if (mddev->level == 0) {
4360
4361 raid0_conf = mddev->private;
4362 if (raid0_conf->nr_strip_zones > 1) {
4363 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4364 mdname(mddev));
4365 return ERR_PTR(-EINVAL);
4366 }
4367 return raid10_takeover_raid0(mddev,
4368 raid0_conf->strip_zone->zone_end,
4369 raid0_conf->strip_zone->nb_dev);
4370 }
4371 return ERR_PTR(-EINVAL);
4372}
4373
4374static int raid10_check_reshape(struct mddev *mddev)
4375{
	/*
	 * Called when there is a request to change
	 * - layout (to ->new_layout)
	 * - chunk size (to ->new_chunk_sectors)
	 * - raid_disks (by delta_disks),
	 * or when trying to restart a reshape that was interrupted.
	 *
	 * We validate the request and pre-allocate anything that might be
	 * needed later.  Reshaping a 'far' array (without far_offset) is
	 * rejected, the number of copies cannot change, and when disks are
	 * being added a larger 'mirrors' array is allocated up front.
	 */
4390 struct r10conf *conf = mddev->private;
4391 struct geom geo;
4392
4393 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4394 return -EINVAL;
4395
4396 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
4397
4398 return -EINVAL;
4399 if (geo.far_copies > 1 && !geo.far_offset)
4400
4401 return -EINVAL;
4402
4403 if (mddev->array_sectors & geo.chunk_mask)
4404
4405 return -EINVAL;
4406
4407 if (!enough(conf, -1))
4408 return -EINVAL;
4409
4410 kfree(conf->mirrors_new);
4411 conf->mirrors_new = NULL;
4412 if (mddev->delta_disks > 0) {
4413
4414 conf->mirrors_new =
4415 kcalloc(mddev->raid_disks + mddev->delta_disks,
4416 sizeof(struct raid10_info),
4417 GFP_KERNEL);
4418 if (!conf->mirrors_new)
4419 return -ENOMEM;
4420 }
4421 return 0;
4422}
4423
/*
 * The failure state of the array has to be checked when deciding whether
 * to start the array, remove non-faulty devices, add a spare, or allow a
 * reshape.  This is simple when no reshape is in progress; during a
 * reshape both the old ('prev') and the new geometry must be checked,
 * because a failed device may affect only one of the two layouts.
 */
4437static int calc_degraded(struct r10conf *conf)
4438{
4439 int degraded, degraded2;
4440 int i;
4441
4442 rcu_read_lock();
4443 degraded = 0;
4444
4445 for (i = 0; i < conf->prev.raid_disks; i++) {
4446 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4447 if (!rdev || test_bit(Faulty, &rdev->flags))
4448 degraded++;
4449 else if (!test_bit(In_sync, &rdev->flags))
4450
4451
4452
4453
4454 degraded++;
4455 }
4456 rcu_read_unlock();
4457 if (conf->geo.raid_disks == conf->prev.raid_disks)
4458 return degraded;
4459 rcu_read_lock();
4460 degraded2 = 0;
4461 for (i = 0; i < conf->geo.raid_disks; i++) {
4462 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4463 if (!rdev || test_bit(Faulty, &rdev->flags))
4464 degraded2++;
4465 else if (!test_bit(In_sync, &rdev->flags)) {
4466
4467
4468
4469
4470
4471 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4472 degraded2++;
4473 }
4474 }
4475 rcu_read_unlock();
4476 if (degraded2 > degraded)
4477 return degraded2;
4478 return degraded;
4479}
4480
4481static int raid10_start_reshape(struct mddev *mddev)
4482{
	/*
	 * A 'reshape' has been requested.  This commits the various 'new'
	 * fields and sets MD_RECOVERY_RESHAPE.  It also checks that there
	 * are enough spares to make the final array non-degraded, and that
	 * the gap between old and new data_offset on each device is large
	 * enough that live data is never overwritten.
	 */
4493 unsigned long before_length, after_length;
4494 sector_t min_offset_diff = 0;
4495 int first = 1;
4496 struct geom new;
4497 struct r10conf *conf = mddev->private;
4498 struct md_rdev *rdev;
4499 int spares = 0;
4500 int ret;
4501
4502 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4503 return -EBUSY;
4504
4505 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4506 return -EINVAL;
4507
4508 before_length = ((1 << conf->prev.chunk_shift) *
4509 conf->prev.far_copies);
4510 after_length = ((1 << conf->geo.chunk_shift) *
4511 conf->geo.far_copies);
4512
4513 rdev_for_each(rdev, mddev) {
4514 if (!test_bit(In_sync, &rdev->flags)
4515 && !test_bit(Faulty, &rdev->flags))
4516 spares++;
4517 if (rdev->raid_disk >= 0) {
4518 long long diff = (rdev->new_data_offset
4519 - rdev->data_offset);
4520 if (!mddev->reshape_backwards)
4521 diff = -diff;
4522 if (diff < 0)
4523 diff = 0;
4524 if (first || diff < min_offset_diff)
4525 min_offset_diff = diff;
4526 first = 0;
4527 }
4528 }
4529
4530 if (max(before_length, after_length) > min_offset_diff)
4531 return -EINVAL;
4532
4533 if (spares < mddev->delta_disks)
4534 return -EINVAL;
4535
4536 conf->offset_diff = min_offset_diff;
4537 spin_lock_irq(&conf->device_lock);
4538 if (conf->mirrors_new) {
4539 memcpy(conf->mirrors_new, conf->mirrors,
4540 sizeof(struct raid10_info)*conf->prev.raid_disks);
4541 smp_mb();
4542 kfree(conf->mirrors_old);
4543 conf->mirrors_old = conf->mirrors;
4544 conf->mirrors = conf->mirrors_new;
4545 conf->mirrors_new = NULL;
4546 }
4547 setup_geo(&conf->geo, mddev, geo_start);
4548 smp_mb();
4549 if (mddev->reshape_backwards) {
4550 sector_t size = raid10_size(mddev, 0, 0);
4551 if (size < mddev->array_sectors) {
4552 spin_unlock_irq(&conf->device_lock);
4553			pr_warn("md/raid10:%s: array size must be reduced before number of disks\n",
4554 mdname(mddev));
4555 return -EINVAL;
4556 }
4557 mddev->resync_max_sectors = size;
4558 conf->reshape_progress = size;
4559 } else
4560 conf->reshape_progress = 0;
4561 conf->reshape_safe = conf->reshape_progress;
4562 spin_unlock_irq(&conf->device_lock);
4563
4564 if (mddev->delta_disks && mddev->bitmap) {
4565 struct mdp_superblock_1 *sb = NULL;
4566 sector_t oldsize, newsize;
4567
4568 oldsize = raid10_size(mddev, 0, 0);
4569 newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
4570
4571 if (!mddev_is_clustered(mddev)) {
4572 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4573 if (ret)
4574 goto abort;
4575 else
4576 goto out;
4577 }
4578
4579 rdev_for_each(rdev, mddev) {
4580 if (rdev->raid_disk > -1 &&
4581 !test_bit(Faulty, &rdev->flags))
4582 sb = page_address(rdev->sb_page);
4583 }
4584
4585
4586
4587
4588
4589
4590 if ((sb && (le32_to_cpu(sb->feature_map) &
4591 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
4592 goto out;
4593
4594 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4595 if (ret)
4596 goto abort;
4597
4598 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
4599 if (ret) {
4600 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
4601 goto abort;
4602 }
4603 }
4604out:
4605 if (mddev->delta_disks > 0) {
4606 rdev_for_each(rdev, mddev)
4607 if (rdev->raid_disk < 0 &&
4608 !test_bit(Faulty, &rdev->flags)) {
4609 if (raid10_add_disk(mddev, rdev) == 0) {
4610 if (rdev->raid_disk >=
4611 conf->prev.raid_disks)
4612 set_bit(In_sync, &rdev->flags);
4613 else
4614 rdev->recovery_offset = 0;
4615
4616
4617 sysfs_link_rdev(mddev, rdev);
4618 }
4619 } else if (rdev->raid_disk >= conf->prev.raid_disks
4620 && !test_bit(Faulty, &rdev->flags)) {
4621
4622 set_bit(In_sync, &rdev->flags);
4623 }
4624 }
4625
4626
4627
4628
4629 spin_lock_irq(&conf->device_lock);
4630 mddev->degraded = calc_degraded(conf);
4631 spin_unlock_irq(&conf->device_lock);
4632 mddev->raid_disks = conf->geo.raid_disks;
4633 mddev->reshape_position = conf->reshape_progress;
4634 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4635
4636 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4637 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4638 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4639 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4640 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4641
4642 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4643 "reshape");
4644 if (!mddev->sync_thread) {
4645 ret = -EAGAIN;
4646 goto abort;
4647 }
4648 conf->reshape_checkpoint = jiffies;
4649 md_wakeup_thread(mddev->sync_thread);
4650 md_new_event(mddev);
4651 return 0;
4652
4653abort:
4654 mddev->recovery = 0;
4655 spin_lock_irq(&conf->device_lock);
4656 conf->geo = conf->prev;
4657 mddev->raid_disks = conf->geo.raid_disks;
4658 rdev_for_each(rdev, mddev)
4659 rdev->new_data_offset = rdev->data_offset;
4660 smp_wmb();
4661 conf->reshape_progress = MaxSector;
4662 conf->reshape_safe = MaxSector;
4663 mddev->reshape_position = MaxSector;
4664 spin_unlock_irq(&conf->device_lock);
4665 return ret;
4666}
4667
/*
 * Calculate the last device address that could contain any block from
 * the chunk that includes array address 's', and return the next
 * (chunk-aligned) address after it.
 */
4674static sector_t last_dev_address(sector_t s, struct geom *geo)
4675{
4676 s = (s | geo->chunk_mask) + 1;
4677 s >>= geo->chunk_shift;
4678 s *= geo->near_copies;
4679 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4680 s *= geo->far_copies;
4681 s <<= geo->chunk_shift;
4682 return s;
4683}
4684
/*
 * Calculate the first device address that could contain any block from
 * the chunk that includes array address 's'.  This too is chunk-aligned.
 */
4689static sector_t first_dev_address(sector_t s, struct geom *geo)
4690{
4691 s >>= geo->chunk_shift;
4692 s *= geo->near_copies;
4693 sector_div(s, geo->raid_disks);
4694 s *= geo->far_copies;
4695 s <<= geo->chunk_shift;
4696 return s;
4697}
4698
4699static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4700 int *skipped)
4701{
	/*
	 * We copy at most one chunk (the smaller of the old and new chunk
	 * sizes) at a time, possibly less if that exceeds RESYNC_PAGES or
	 * the requested range.
	 *
	 * conf->reshape_progress tracks how far the copy has advanced in
	 * array addresses, while conf->reshape_safe is the point that has
	 * been committed to the metadata.  Writes in the new layout must
	 * never land on device addresses that the old layout might still
	 * need to read after a restart from reshape_safe; when that could
	 * happen, need_flush is set and the metadata is updated (and
	 * pending I/O drained) before this chunk is copied.
	 *
	 * Reads are performed using the old ('prev') geometry and the
	 * resulting data is written out using the new geometry.
	 */
4739 struct r10conf *conf = mddev->private;
4740 struct r10bio *r10_bio;
4741 sector_t next, safe, last;
4742 int max_sectors;
4743 int nr_sectors;
4744 int s;
4745 struct md_rdev *rdev;
4746 int need_flush = 0;
4747 struct bio *blist;
4748 struct bio *bio, *read_bio;
4749 int sectors_done = 0;
4750 struct page **pages;
4751
4752 if (sector_nr == 0) {
4753
4754 if (mddev->reshape_backwards &&
4755 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4756 sector_nr = (raid10_size(mddev, 0, 0)
4757 - conf->reshape_progress);
4758 } else if (!mddev->reshape_backwards &&
4759 conf->reshape_progress > 0)
4760 sector_nr = conf->reshape_progress;
4761 if (sector_nr) {
4762 mddev->curr_resync_completed = sector_nr;
4763 sysfs_notify_dirent_safe(mddev->sysfs_completed);
4764 *skipped = 1;
4765 return sector_nr;
4766 }
4767 }

	/*
	 * conf->reshape_progress, not sector_nr, is what tracks how far the
	 * reshape has got.  Work out which device addresses this window
	 * touches in both the old and the new layout so we can tell whether
	 * the metadata must be updated before any writes are issued.
	 */
	if (mddev->reshape_backwards) {
		/*
		 * 'next' is the lowest device address that this window might
		 * write to in the new layout.
		 */
		next = first_dev_address(conf->reshape_progress - 1,
					 &conf->geo);

		/*
		 * 'safe' is the highest device address from which old-layout
		 * data might still need to be read after a restart.
		 */
		safe = last_dev_address(conf->reshape_safe - 1,
					&conf->prev);

		if (next + conf->offset_diff < safe)
			need_flush = 1;

		last = conf->reshape_progress - 1;
		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
					       & conf->prev.chunk_mask);
		if (sector_nr + RESYNC_SECTORS < last)
			sector_nr = last + 1 - RESYNC_SECTORS;
	} else {
		/*
		 * 'next' is the highest device address that this window might
		 * write to in the new layout.
		 */
		next = last_dev_address(conf->reshape_progress, &conf->geo);

		/*
		 * 'safe' is the lowest device address from which old-layout
		 * data might still need to be read after a restart.
		 */
		safe = first_dev_address(conf->reshape_safe, &conf->prev);

		/*
		 * The metadata must be updated first if the writes could
		 * reach beyond 'safe', otherwise a crash could leave
		 * old-layout data overwritten before it was recorded as
		 * relocated.
		 */
		if (next > safe + conf->offset_diff)
			need_flush = 1;

		sector_nr = conf->reshape_progress;
		last = sector_nr | (conf->geo.chunk_mask
				    & conf->prev.chunk_mask);

		if (sector_nr + RESYNC_SECTORS <= last)
			last = sector_nr + RESYNC_SECTORS - 1;
	}

	if (need_flush ||
	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
		/* Need to update reshape_position in the metadata */
		wait_barrier(conf);
		mddev->reshape_position = conf->reshape_progress;
		if (mddev->reshape_backwards)
			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
				- conf->reshape_progress;
		else
			mddev->curr_resync_completed = conf->reshape_progress;
		conf->reshape_checkpoint = jiffies;
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		md_wakeup_thread(mddev->thread);
		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			allow_barrier(conf);
			return sectors_done;
		}
		conf->reshape_safe = mddev->reshape_position;
		allow_barrier(conf);
	}

	raise_barrier(conf, 0);
read_more:
	/* Now schedule reads for blocks from sector_nr to last */
	r10_bio = raid10_alloc_init_r10buf(conf);
	r10_bio->state = 0;
	raise_barrier(conf, 1);
	atomic_set(&r10_bio->remaining, 0);
	r10_bio->mddev = mddev;
	r10_bio->sector = sector_nr;
	set_bit(R10BIO_IsReshape, &r10_bio->state);
	r10_bio->sectors = last - sector_nr + 1;
	rdev = read_balance(conf, r10_bio, &max_sectors);
	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));

	if (!rdev) {
		/*
		 * No device in the old layout can supply this range, so the
		 * data cannot be relocated; abort the reshape.
		 */
		mempool_free(r10_bio, &conf->r10buf_pool);
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return sectors_done;
	}

	read_bio = bio_alloc_bioset(GFP_KERNEL, RESYNC_PAGES, &mddev->bio_set);

	bio_set_dev(read_bio, rdev->bdev);
	read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
				       + rdev->data_offset);
	read_bio->bi_private = r10_bio;
	read_bio->bi_end_io = end_reshape_read;
	bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
	r10_bio->master_bio = read_bio;
	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;

	/*
	 * In a clustered array, advertise the window being reshaped to the
	 * other nodes so that they do not write to it concurrently.
	 */
	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
		struct mdp_superblock_1 *sb = NULL;
		int sb_reshape_pos = 0;

		conf->cluster_sync_low = sector_nr;
		conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
		sb = page_address(rdev->sb_page);
		if (sb) {
			sb_reshape_pos = le64_to_cpu(sb->reshape_position);
			/*
			 * The reshape position recorded in the superblock may
			 * lag behind sector_nr; start the advertised window
			 * there so the not-yet-committed region is covered
			 * as well.
			 */
			if (sb_reshape_pos < conf->cluster_sync_low)
				conf->cluster_sync_low = sb_reshape_pos;
		}

		md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
						   conf->cluster_sync_high);
	}

	/* Now find the corresponding locations in the new layout */
	__raid10_find_phys(&conf->geo, r10_bio);

	blist = read_bio;
	read_bio->bi_next = NULL;

	rcu_read_lock();
	for (s = 0; s < conf->copies*2; s++) {
		struct bio *b;
		int d = r10_bio->devs[s/2].devnum;
		struct md_rdev *rdev2;
		if (s&1) {
			rdev2 = rcu_dereference(conf->mirrors[d].replacement);
			b = r10_bio->devs[s/2].repl_bio;
		} else {
			rdev2 = rcu_dereference(conf->mirrors[d].rdev);
			b = r10_bio->devs[s/2].bio;
		}
		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
			continue;

		bio_set_dev(b, rdev2->bdev);
		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
			rdev2->new_data_offset;
		b->bi_end_io = end_reshape_write;
		bio_set_op_attrs(b, REQ_OP_WRITE, 0);
		b->bi_next = blist;
		blist = b;
	}

	/* Now add as many pages as possible to all of these devices. */
	nr_sectors = 0;
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
		struct page *page = pages[s / (PAGE_SIZE >> 9)];
		int len = (max_sectors - s) << 9;
		if (len > PAGE_SIZE)
			len = PAGE_SIZE;
		for (bio = blist; bio ; bio = bio->bi_next) {
			/*
			 * Cannot fail: each bio was allocated with enough
			 * vecs to hold all RESYNC_PAGES pages.
			 */
			bio_add_page(bio, page, len, 0);
		}
		sector_nr += len >> 9;
		nr_sectors += len >> 9;
	}
	rcu_read_unlock();
	r10_bio->sectors = nr_sectors;

	/* Now submit the read */
	md_sync_acct_bio(read_bio, r10_bio->sectors);
	atomic_inc(&r10_bio->remaining);
	read_bio->bi_next = NULL;
	submit_bio_noacct(read_bio);
	sectors_done += nr_sectors;
	if (sector_nr <= last)
		goto read_more;

	lower_barrier(conf);

	/*
	 * Now that the whole section has been submitted we can update
	 * reshape_progress.
	 */
	if (mddev->reshape_backwards)
		conf->reshape_progress -= sectors_done;
	else
		conf->reshape_progress += sectors_done;

	return sectors_done;
}

static void end_reshape_request(struct r10bio *r10_bio);
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	/*
	 * The reshape read has completed.  If it succeeded we have the data
	 * for this window and can write it out in the new layout.  If it
	 * failed, first try to reconstruct the data with synchronous reads
	 * from the other copies in the old layout, and give up on the
	 * reshape if that fails as well.
	 */
	struct r10conf *conf = mddev->private;
	int s;

	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		if (handle_reshape_read_error(mddev, r10_bio) < 0) {
			/* Reshape has been aborted */
			md_done_sync(mddev, r10_bio->sectors, 0);
			return;
		}

	/*
	 * The data is definitely in the pages now; schedule the writes to
	 * every copy in the new layout.
	 */
	atomic_set(&r10_bio->remaining, 1);
	for (s = 0; s < conf->copies*2; s++) {
		struct bio *b;
		int d = r10_bio->devs[s/2].devnum;
		struct md_rdev *rdev;
		rcu_read_lock();
		if (s&1) {
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			b = r10_bio->devs[s/2].repl_bio;
		} else {
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			b = r10_bio->devs[s/2].bio;
		}
		if (!rdev || test_bit(Faulty, &rdev->flags)) {
			rcu_read_unlock();
			continue;
		}
		atomic_inc(&rdev->nr_pending);
		rcu_read_unlock();
		md_sync_acct_bio(b, r10_bio->sectors);
		atomic_inc(&r10_bio->remaining);
		b->bi_next = NULL;
		submit_bio_noacct(b);
	}
	end_reshape_request(r10_bio);
}

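/*
 * The reshape has finished processing the array.  Unless it was interrupted,
 * make the new geometry the "previous" one as well, so that normal IO and
 * recovery use only the final layout, and clear the reshape markers.
 */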
static void end_reshape(struct r10conf *conf)
{
	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
		return;

	spin_lock_irq(&conf->device_lock);
	conf->prev = conf->geo;
	md_finish_reshape(conf->mddev);
	smp_wmb();
	conf->reshape_progress = MaxSector;
	conf->reshape_safe = MaxSector;
	spin_unlock_irq(&conf->device_lock);

	if (conf->mddev->queue)
		raid10_set_io_opt(conf);
	conf->fullsync = 0;
}

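/*
 * Clustered arrays only: adopt the reshape position published by another
 * node, provided it falls inside the resync window that node advertised
 * (or equals MaxSector, i.e. the reshape has finished).
 */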
static void raid10_update_reshape_pos(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;
	sector_t lo, hi;

	md_cluster_ops->resync_info_get(mddev, &lo, &hi);
	if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
	    || mddev->reshape_position == MaxSector)
		conf->reshape_progress = mddev->reshape_position;
	else
		WARN_ON_ONCE(1);
}

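/*
 * The reshape read failed.  Reconstruct the data one page at a time with
 * synchronous reads from any in-sync device that holds a copy in the old
 * layout.  On failure the reshape is marked interrupted and an error is
 * returned.
 */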
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio)
{
	/* Re-read the data synchronously from the other copies */
	int sectors = r10_bio->sectors;
	struct r10conf *conf = mddev->private;
	struct r10bio *r10b;
	int slot = 0;
	int idx = 0;
	struct page **pages;

	r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
	if (!r10b) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return -ENOMEM;
	}

	/* All reshape bios share the pages attached to .devs[0].bio */
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;

	r10b->sector = r10_bio->sector;
	__raid10_find_phys(&conf->prev, r10b);

	while (sectors) {
		int s = sectors;
		int success = 0;
		int first_slot = slot;

		if (s > (PAGE_SIZE >> 9))
			s = PAGE_SIZE >> 9;

		rcu_read_lock();
		while (!success) {
			int d = r10b->devs[slot].devnum;
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			sector_t addr;
			if (rdev == NULL ||
			    test_bit(Faulty, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				goto failed;

			addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			success = sync_page_io(rdev,
					       addr,
					       s << 9,
					       pages[idx],
					       REQ_OP_READ, 0, false);
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
			if (success)
				break;
		failed:
			slot++;
			if (slot >= conf->copies)
				slot = 0;
			if (slot == first_slot)
				break;
		}
		rcu_read_unlock();
		if (!success) {
			/* Couldn't read this block from anywhere; give up */
			set_bit(MD_RECOVERY_INTR,
				&mddev->recovery);
			kfree(r10b);
			return -EIO;
		}
		sectors -= s;
		idx++;
	}
	kfree(r10b);
	return 0;
}

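/*
 * Completion handler for the writes that copy a reshape window out in the
 * new layout.
 */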
static void end_reshape_write(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	if (!rdev) {
		smp_mb();
		rdev = conf->mirrors[d].rdev;
	}

	if (bio->bi_status) {
		/* The write to the new layout failed; fail the device */
		md_error(mddev, rdev);
	}

	rdev_dec_pending(rdev, mddev);
	end_reshape_request(r10_bio);
}

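/*
 * Drop one reference on the r10_bio; once the last write (and the initial
 * reference taken in reshape_request_write()) has completed, account the
 * sectors as done and release the read bio and the resync buffer.
 */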
static void end_reshape_request(struct r10bio *r10_bio)
{
	if (!atomic_dec_and_test(&r10_bio->remaining))
		return;
	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
	bio_put(r10_bio->master_bio);
	put_buf(r10_bio);
}

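/*
 * Final fixups after md has committed the reshape: when devices were added,
 * adjust the resync checkpoint and extend the resync range to the new array
 * size; when devices were removed, mark the no-longer-used mirrors out of
 * sync.  Then reset the reshape-related fields in the mddev.
 */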
static void raid10_finish_reshape(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		return;

	if (mddev->delta_disks > 0) {
		if (mddev->recovery_cp > mddev->resync_max_sectors) {
			mddev->recovery_cp = mddev->resync_max_sectors;
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		}
		mddev->resync_max_sectors = mddev->array_sectors;
	} else {
		int d;
		rcu_read_lock();
		for (d = conf->geo.raid_disks ;
		     d < conf->geo.raid_disks - mddev->delta_disks;
		     d++) {
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
		}
		rcu_read_unlock();
	}
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
	mddev->reshape_position = MaxSector;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
}

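/* The md personality registered for RAID level 10 */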
static struct md_personality raid10_personality =
{
	.name		= "raid10",
	.level		= 10,
	.owner		= THIS_MODULE,
	.make_request	= raid10_make_request,
	.run		= raid10_run,
	.free		= raid10_free,
	.status		= raid10_status,
	.error_handler	= raid10_error,
	.hot_add_disk	= raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= raid10_sync_request,
	.quiesce	= raid10_quiesce,
	.size		= raid10_size,
	.resize		= raid10_resize,
	.takeover	= raid10_takeover,
	.check_reshape	= raid10_check_reshape,
	.start_reshape	= raid10_start_reshape,
	.finish_reshape	= raid10_finish_reshape,
	.update_reshape_pos = raid10_update_reshape_pos,
};

static int __init raid_init(void)
{
	return register_md_personality(&raid10_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9");
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");

module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);