// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Base on code in raid1.c.  See raid1.c for further copyright information.
 */
12#include <linux/slab.h>
13#include <linux/delay.h>
14#include <linux/blkdev.h>
15#include <linux/module.h>
16#include <linux/seq_file.h>
17#include <linux/ratelimit.h>
18#include <linux/kthread.h>
19#include <linux/raid/md_p.h>
20#include <trace/events/block.h>
21#include "md.h"
22#include "raid10.h"
23#include "raid0.h"
24#include "md-bitmap.h"
/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *    use_far_sets_bugfixed (stored in bit 18 of layout)
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.  In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are
 * stored (each on a different drive).  The starting device for each section
 * is offset near_copies from the starting device of the previous section.
 * Thus there are (near_copies * far_copies) copies of each chunk, and each
 * is on a different drive.  near_copies and far_copies must be at least one,
 * and their product is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays
 * confined to the set rather than the entire array.  This is done to improve
 * the number of device combinations that can fail without causing the array
 * to fail.
 */
67static void allow_barrier(struct r10conf *conf);
68static void lower_barrier(struct r10conf *conf);
69static int _enough(struct r10conf *conf, int previous, int ignore);
70static int enough(struct r10conf *conf, int ignore);
71static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
72 int *skipped);
73static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
74static void end_reshape_write(struct bio *bio);
75static void end_reshape(struct r10conf *conf);
76
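/*
 * raid10_log() adds a short message to the array's blktrace stream (when the
 * request queue exists); it is used below to annotate waits for barriers,
 * reshape progress and blocked devices.
 */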
77#define raid10_log(md, fmt, args...) \
78 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
79
80#include "raid1-10.c"

/*
 * for resync bio, the owning r10bio can be retrieved from the per-bio
 * 'struct resync_pages'.
 */
86static inline struct r10bio *get_resync_r10bio(struct bio *bio)
87{
88 return get_resync_pages(bio)->raid_bio;
89}
90
91static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
92{
93 struct r10conf *conf = data;
94 int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
	/* allocate a r10bio with room for raid_disks entries in the
	 * bios array */
98 return kzalloc(size, gfp_flags);
99}
100
101#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
102
103#define RESYNC_WINDOW (1024*1024)
104
105#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
106#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
107#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
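/*
 * Illustrative sizing (not in the original source): RESYNC_BLOCK_SIZE comes
 * from raid1-10.c and is normally 64KiB, so RESYNC_SECTORS is 128,
 * RESYNC_DEPTH works out to 32MiB/64KiB = 512 in-flight resync requests,
 * and CLUSTER_RESYNC_WINDOW_SECTORS is 32MiB expressed in 512-byte sectors
 * (65536).
 */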

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf).
 */
116static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
117{
118 struct r10conf *conf = data;
119 struct r10bio *r10_bio;
120 struct bio *bio;
121 int j;
122 int nalloc, nalloc_rp;
123 struct resync_pages *rps;
124
125 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
126 if (!r10_bio)
127 return NULL;
128
129 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
130 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
131 nalloc = conf->copies;
132 else
133 nalloc = 2;

	/* allocate once for all bios */
136 if (!conf->have_replacement)
137 nalloc_rp = nalloc;
138 else
139 nalloc_rp = nalloc * 2;
140 rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
141 if (!rps)
142 goto out_free_r10bio;

	/*
	 * Allocate bios.
	 */
147 for (j = nalloc ; j-- ; ) {
148 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
149 if (!bio)
150 goto out_free_bio;
151 r10_bio->devs[j].bio = bio;
152 if (!conf->have_replacement)
153 continue;
154 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
155 if (!bio)
156 goto out_free_bio;
157 r10_bio->devs[j].repl_bio = bio;
158 }
	/*
	 * Allocate RESYNC_PAGES data pages and attach them to the bios.
	 * For recovery, the extra bios share the pages of the first one.
	 */
163 for (j = 0; j < nalloc; j++) {
164 struct bio *rbio = r10_bio->devs[j].repl_bio;
165 struct resync_pages *rp, *rp_repl;
166
167 rp = &rps[j];
168 if (rbio)
169 rp_repl = &rps[nalloc + j];
170
171 bio = r10_bio->devs[j].bio;
172
173 if (!j || test_bit(MD_RECOVERY_SYNC,
174 &conf->mddev->recovery)) {
175 if (resync_alloc_pages(rp, gfp_flags))
176 goto out_free_pages;
177 } else {
178 memcpy(rp, &rps[0], sizeof(*rp));
179 resync_get_all_pages(rp);
180 }
181
182 rp->raid_bio = r10_bio;
183 bio->bi_private = rp;
184 if (rbio) {
185 memcpy(rp_repl, rp, sizeof(*rp));
186 rbio->bi_private = rp_repl;
187 }
188 }
189
190 return r10_bio;
191
192out_free_pages:
193 while (--j >= 0)
194 resync_free_pages(&rps[j]);
195
196 j = 0;
197out_free_bio:
198 for ( ; j < nalloc; j++) {
199 if (r10_bio->devs[j].bio)
200 bio_put(r10_bio->devs[j].bio);
201 if (r10_bio->devs[j].repl_bio)
202 bio_put(r10_bio->devs[j].repl_bio);
203 }
204 kfree(rps);
205out_free_r10bio:
206 rbio_pool_free(r10_bio, conf);
207 return NULL;
208}
209
210static void r10buf_pool_free(void *__r10_bio, void *data)
211{
212 struct r10conf *conf = data;
213 struct r10bio *r10bio = __r10_bio;
214 int j;
215 struct resync_pages *rp = NULL;
216
217 for (j = conf->copies; j--; ) {
218 struct bio *bio = r10bio->devs[j].bio;
219
220 if (bio) {
221 rp = get_resync_pages(bio);
222 resync_free_pages(rp);
223 bio_put(bio);
224 }
225
226 bio = r10bio->devs[j].repl_bio;
227 if (bio)
228 bio_put(bio);
229 }

	/* resync pages array stored in the 1st bio's .bi_private */
232 kfree(rp);
233
234 rbio_pool_free(r10bio, conf);
235}
236
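/*
 * Note (added for clarity): the BIO_SPECIAL() test below guards against the
 * magic bio pointer values used by this driver (IO_BLOCKED / IO_MADE_GOOD,
 * defined in raid1-10.c); those are markers, not real bios, and must never
 * be passed to bio_put().
 */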
237static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
238{
239 int i;
240
241 for (i = 0; i < conf->geo.raid_disks; i++) {
242 struct bio **bio = & r10_bio->devs[i].bio;
243 if (!BIO_SPECIAL(*bio))
244 bio_put(*bio);
245 *bio = NULL;
246 bio = &r10_bio->devs[i].repl_bio;
247 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
248 bio_put(*bio);
249 *bio = NULL;
250 }
251}
252
253static void free_r10bio(struct r10bio *r10_bio)
254{
255 struct r10conf *conf = r10_bio->mddev->private;
256
257 put_all_bios(conf, r10_bio);
258 mempool_free(r10_bio, &conf->r10bio_pool);
259}
260
261static void put_buf(struct r10bio *r10_bio)
262{
263 struct r10conf *conf = r10_bio->mddev->private;
264
265 mempool_free(r10_bio, &conf->r10buf_pool);
266
267 lower_barrier(conf);
268}
269
270static void reschedule_retry(struct r10bio *r10_bio)
271{
272 unsigned long flags;
273 struct mddev *mddev = r10_bio->mddev;
274 struct r10conf *conf = mddev->private;
275
276 spin_lock_irqsave(&conf->device_lock, flags);
277 list_add(&r10_bio->retry_list, &conf->retry_list);
278 conf->nr_queued ++;
279 spin_unlock_irqrestore(&conf->device_lock, flags);
280
281
282 wake_up(&conf->wait_barrier);
283
284 md_wakeup_thread(mddev->thread);
285}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
292static void raid_end_bio_io(struct r10bio *r10_bio)
293{
294 struct bio *bio = r10_bio->master_bio;
295 struct r10conf *conf = r10_bio->mddev->private;
296
297 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
298 bio->bi_status = BLK_STS_IOERR;
299
300 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
301 bio_end_io_acct(bio, r10_bio->start_time);
302 bio_endio(bio);
	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
307 allow_barrier(conf);
308
309 free_r10bio(r10_bio);
310}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
315static inline void update_head_pos(int slot, struct r10bio *r10_bio)
316{
317 struct r10conf *conf = r10_bio->mddev->private;
318
319 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
320 r10_bio->devs[slot].addr + (r10_bio->sectors);
321}

/*
 * Find the disk number which triggered given bio
 */
326static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
327 struct bio *bio, int *slotp, int *replp)
328{
329 int slot;
330 int repl = 0;
331
332 for (slot = 0; slot < conf->geo.raid_disks; slot++) {
333 if (r10_bio->devs[slot].bio == bio)
334 break;
335 if (r10_bio->devs[slot].repl_bio == bio) {
336 repl = 1;
337 break;
338 }
339 }
340
341 update_head_pos(slot, r10_bio);
342
343 if (slotp)
344 *slotp = slot;
345 if (replp)
346 *replp = repl;
347 return r10_bio->devs[slot].devnum;
348}
349
350static void raid10_end_read_request(struct bio *bio)
351{
352 int uptodate = !bio->bi_status;
353 struct r10bio *r10_bio = bio->bi_private;
354 int slot;
355 struct md_rdev *rdev;
356 struct r10conf *conf = r10_bio->mddev->private;
357
358 slot = r10_bio->read_slot;
359 rdev = r10_bio->devs[slot].rdev;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
363 update_head_pos(slot, r10_bio);
364
365 if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
375 set_bit(R10BIO_Uptodate, &r10_bio->state);
376 } else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry"
		 */
382 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
383 rdev->raid_disk))
384 uptodate = 1;
385 }
386 if (uptodate) {
387 raid_end_bio_io(r10_bio);
388 rdev_dec_pending(rdev, conf->mddev);
389 } else {
		/*
		 * oops, read error - keep the refcount on the rdev
		 */
393 char b[BDEVNAME_SIZE];
394 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
395 mdname(conf->mddev),
396 bdevname(rdev->bdev, b),
397 (unsigned long long)r10_bio->sector);
398 set_bit(R10BIO_ReadError, &r10_bio->state);
399 reschedule_retry(r10_bio);
400 }
401}
402
403static void close_write(struct r10bio *r10_bio)
404{
	/* clear the bitmap if all writes complete successfully */
406 md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
407 r10_bio->sectors,
408 !test_bit(R10BIO_Degraded, &r10_bio->state),
409 0);
410 md_write_end(r10_bio->mddev);
411}
412
413static void one_write_done(struct r10bio *r10_bio)
414{
415 if (atomic_dec_and_test(&r10_bio->remaining)) {
416 if (test_bit(R10BIO_WriteError, &r10_bio->state))
417 reschedule_retry(r10_bio);
418 else {
419 close_write(r10_bio);
420 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
421 reschedule_retry(r10_bio);
422 else
423 raid_end_bio_io(r10_bio);
424 }
425 }
426}
427
428static void raid10_end_write_request(struct bio *bio)
429{
430 struct r10bio *r10_bio = bio->bi_private;
431 int dev;
432 int dec_rdev = 1;
433 struct r10conf *conf = r10_bio->mddev->private;
434 int slot, repl;
435 struct md_rdev *rdev = NULL;
436 struct bio *to_put = NULL;
437 bool discard_error;
438
439 discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
440
441 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
442
443 if (repl)
444 rdev = conf->mirrors[dev].replacement;
445 if (!rdev) {
446 smp_rmb();
447 repl = 0;
448 rdev = conf->mirrors[dev].rdev;
449 }
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
453 if (bio->bi_status && !discard_error) {
454 if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
458 md_error(rdev->mddev, rdev);
459 else {
460 set_bit(WriteErrorSeen, &rdev->flags);
461 if (!test_and_set_bit(WantReplacement, &rdev->flags))
462 set_bit(MD_RECOVERY_NEEDED,
463 &rdev->mddev->recovery);
464
465 dec_rdev = 0;
466 if (test_bit(FailFast, &rdev->flags) &&
467 (bio->bi_opf & MD_FAILFAST)) {
468 md_error(rdev->mddev, rdev);
469 }
			/*
			 * When the device is faulty, it is not necessary to
			 * handle write error.
			 */
475 if (!test_bit(Faulty, &rdev->flags))
476 set_bit(R10BIO_WriteError, &r10_bio->state);
477 else {
				/* Fail the request */
479 set_bit(R10BIO_Degraded, &r10_bio->state);
480 r10_bio->devs[slot].bio = NULL;
481 to_put = bio;
482 dec_rdev = 1;
483 }
484 }
485 } else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
495 sector_t first_bad;
496 int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty. This is because we cannot use
		 * such device for properly reading the data back (we could
		 * potentially have replaced the failed drive with the
		 * spare one and would like to check if a written block
		 * has not been read due to an earlier error) -- in this
		 * case we will report the error.
		 */
506 if (test_bit(In_sync, &rdev->flags) &&
507 !test_bit(Faulty, &rdev->flags))
508 set_bit(R10BIO_Uptodate, &r10_bio->state);

		/* Maybe we can clear some bad blocks. */
511 if (is_badblock(rdev,
512 r10_bio->devs[slot].addr,
513 r10_bio->sectors,
514 &first_bad, &bad_sectors) && !discard_error) {
515 bio_put(bio);
516 if (repl)
517 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
518 else
519 r10_bio->devs[slot].bio = IO_MADE_GOOD;
520 dec_rdev = 0;
521 set_bit(R10BIO_MadeGood, &r10_bio->state);
522 }
523 }

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
530 one_write_done(r10_bio);
531 if (dec_rdev)
532 rdev_dec_pending(rdev, conf->mddev);
533 if (to_put)
534 bio_put(to_put);
535}

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the one device.
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address.
 */
562static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
563{
564 int n,f;
565 sector_t sector;
566 sector_t chunk;
567 sector_t stripe;
568 int dev;
569 int slot = 0;
570 int last_far_set_start, last_far_set_size;
571
572 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
573 last_far_set_start *= geo->far_set_size;
574
575 last_far_set_size = geo->far_set_size;
576 last_far_set_size += (geo->raid_disks % geo->far_set_size);

	/* now calculate first sector/dev */
579 chunk = r10bio->sector >> geo->chunk_shift;
580 sector = r10bio->sector & geo->chunk_mask;
581
582 chunk *= geo->near_copies;
583 stripe = chunk;
584 dev = sector_div(stripe, geo->raid_disks);
585 if (geo->far_offset)
586 stripe *= geo->far_copies;
587
588 sector += stripe << geo->chunk_shift;

	/* and calculate all the others */
591 for (n = 0; n < geo->near_copies; n++) {
592 int d = dev;
593 int set;
594 sector_t s = sector;
595 r10bio->devs[slot].devnum = d;
596 r10bio->devs[slot].addr = s;
597 slot++;
598
599 for (f = 1; f < geo->far_copies; f++) {
600 set = d / geo->far_set_size;
601 d += geo->near_copies;
602
603 if ((geo->raid_disks % geo->far_set_size) &&
604 (d > last_far_set_start)) {
605 d -= last_far_set_start;
606 d %= last_far_set_size;
607 d += last_far_set_start;
608 } else {
609 d %= geo->far_set_size;
610 d += geo->far_set_size * set;
611 }
612 s += geo->stride;
613 r10bio->devs[slot].devnum = d;
614 r10bio->devs[slot].addr = s;
615 slot++;
616 }
617 dev++;
618 if (dev >= geo->raid_disks) {
619 dev = 0;
620 sector += (geo->chunk_mask + 1);
621 }
622 }
623}
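/*
 * Illustrative example (not from the original source): with raid_disks=4,
 * near_copies=2 and far_copies=1, logical chunk 0 is placed on devices 0
 * and 1, chunk 1 on devices 2 and 3, chunk 2 on devices 0 and 1 of the next
 * stripe, and so on - i.e. classic RAID1+0.  With far_copies=2 the whole
 * pattern is repeated once more, geo->stride sectors further into each
 * device and rotated by near_copies devices (within a far set when
 * use_far_sets is enabled), which is what the inner 'f' loop above computes.
 */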
624
625static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
626{
627 struct geom *geo = &conf->geo;
628
629 if (conf->reshape_progress != MaxSector &&
630 ((r10bio->sector >= conf->reshape_progress) !=
631 conf->mddev->reshape_backwards)) {
632 set_bit(R10BIO_Previous, &r10bio->state);
633 geo = &conf->prev;
634 } else
635 clear_bit(R10BIO_Previous, &r10bio->state);
636
637 __raid10_find_phys(geo, r10bio);
638}
639
640static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
641{
642 sector_t offset, chunk, vchunk;
643
644
645
646 struct geom *geo = &conf->geo;
647 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
648 int far_set_size = geo->far_set_size;
649 int last_far_set_start;
650
651 if (geo->raid_disks % geo->far_set_size) {
652 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
653 last_far_set_start *= geo->far_set_size;
654
655 if (dev >= last_far_set_start) {
656 far_set_size = geo->far_set_size;
657 far_set_size += (geo->raid_disks % geo->far_set_size);
658 far_set_start = last_far_set_start;
659 }
660 }
661
662 offset = sector & geo->chunk_mask;
663 if (geo->far_offset) {
664 int fc;
665 chunk = sector >> geo->chunk_shift;
666 fc = sector_div(chunk, geo->far_copies);
667 dev -= fc * geo->near_copies;
668 if (dev < far_set_start)
669 dev += far_set_size;
670 } else {
671 while (sector >= geo->stride) {
672 sector -= geo->stride;
673 if (dev < (geo->near_copies + far_set_start))
674 dev += far_set_size - geo->near_copies;
675 else
676 dev -= geo->near_copies;
677 }
678 chunk = sector >> geo->chunk_shift;
679 }
680 vchunk = chunk * geo->raid_disks + dev;
681 sector_div(vchunk, geo->near_copies);
682 return (vchunk << geo->chunk_shift) + offset;
683}
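/*
 * Note (added for clarity): raid10_find_virt() is the inverse of the mapping
 * above for a single slot - given a device number and a sector on that
 * device it returns the corresponding array-virtual sector, which recovery
 * uses to find which array addresses live on a given device.
 */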

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */

/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies / far_copies geometry.
 */
704static struct md_rdev *read_balance(struct r10conf *conf,
705 struct r10bio *r10_bio,
706 int *max_sectors)
707{
708 const sector_t this_sector = r10_bio->sector;
709 int disk, slot;
710 int sectors = r10_bio->sectors;
711 int best_good_sectors;
712 sector_t new_distance, best_dist;
713 struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
714 int do_balance;
715 int best_dist_slot, best_pending_slot;
716 bool has_nonrot_disk = false;
717 unsigned int min_pending;
718 struct geom *geo = &conf->geo;
719
720 raid10_find_phys(conf, r10_bio);
721 rcu_read_lock();
722 best_dist_slot = -1;
723 min_pending = UINT_MAX;
724 best_dist_rdev = NULL;
725 best_pending_rdev = NULL;
726 best_dist = MaxSector;
727 best_good_sectors = 0;
728 do_balance = 1;
729 clear_bit(R10BIO_FailFast, &r10_bio->state);

	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
736 if ((conf->mddev->recovery_cp < MaxSector
737 && (this_sector + sectors >= conf->next_resync)) ||
738 (mddev_is_clustered(conf->mddev) &&
739 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
740 this_sector + sectors)))
741 do_balance = 0;
742
743 for (slot = 0; slot < conf->copies ; slot++) {
744 sector_t first_bad;
745 int bad_sectors;
746 sector_t dev_sector;
747 unsigned int pending;
748 bool nonrot;
749
750 if (r10_bio->devs[slot].bio == IO_BLOCKED)
751 continue;
752 disk = r10_bio->devs[slot].devnum;
753 rdev = rcu_dereference(conf->mirrors[disk].replacement);
754 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
755 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
756 rdev = rcu_dereference(conf->mirrors[disk].rdev);
757 if (rdev == NULL ||
758 test_bit(Faulty, &rdev->flags))
759 continue;
760 if (!test_bit(In_sync, &rdev->flags) &&
761 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
762 continue;
763
764 dev_sector = r10_bio->devs[slot].addr;
765 if (is_badblock(rdev, dev_sector, sectors,
766 &first_bad, &bad_sectors)) {
767 if (best_dist < MaxSector)
768
769 continue;
770 if (first_bad <= dev_sector) {
				/* Cannot read here.  If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
775 bad_sectors -= (dev_sector - first_bad);
776 if (!do_balance && sectors > bad_sectors)
777 sectors = bad_sectors;
778 if (best_good_sectors > sectors)
779 best_good_sectors = sectors;
780 } else {
781 sector_t good_sectors =
782 first_bad - dev_sector;
783 if (good_sectors > best_good_sectors) {
784 best_good_sectors = good_sectors;
785 best_dist_slot = slot;
786 best_dist_rdev = rdev;
787 }
788 if (!do_balance)
789
790 break;
791 }
792 continue;
793 } else
794 best_good_sectors = sectors;
795
796 if (!do_balance)
797 break;
798
799 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
800 has_nonrot_disk |= nonrot;
801 pending = atomic_read(&rdev->nr_pending);
802 if (min_pending > pending && nonrot) {
803 min_pending = pending;
804 best_pending_slot = slot;
805 best_pending_rdev = rdev;
806 }
807
808 if (best_dist_slot >= 0)
			/* At least 2 disks to choose from so failfast is OK */
810 set_bit(R10BIO_FailFast, &r10_bio->state);

		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
815 if (geo->near_copies > 1 && !pending)
816 new_distance = 0;

		/* for far > 1 always use the lowest address */
819 else if (geo->far_copies > 1)
820 new_distance = r10_bio->devs[slot].addr;
821 else
822 new_distance = abs(r10_bio->devs[slot].addr -
823 conf->mirrors[disk].head_position);
824
825 if (new_distance < best_dist) {
826 best_dist = new_distance;
827 best_dist_slot = slot;
828 best_dist_rdev = rdev;
829 }
830 }
831 if (slot >= conf->copies) {
832 if (has_nonrot_disk) {
833 slot = best_pending_slot;
834 rdev = best_pending_rdev;
835 } else {
836 slot = best_dist_slot;
837 rdev = best_dist_rdev;
838 }
839 }
840
841 if (slot >= 0) {
842 atomic_inc(&rdev->nr_pending);
843 r10_bio->read_slot = slot;
844 } else
845 rdev = NULL;
846 rcu_read_unlock();
847 *max_sectors = best_good_sectors;
848
849 return rdev;
850}
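/*
 * Summary of the selection above (added for clarity): slots with a bad block
 * in the range, or whose device has not recovered far enough, are skipped.
 * If any non-rotational disk is present, the copy with the fewest pending
 * requests wins; otherwise the copy whose head position (or, for 'far'
 * layouts, whose device address) is closest to the request is chosen.
 */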
851
852static void flush_pending_writes(struct r10conf *conf)
{
	/* Any writes that have been queued but are awaiting
	 * bitmap updates get flushed here.
	 */
857 spin_lock_irq(&conf->device_lock);
858
859 if (conf->pending_bio_list.head) {
860 struct blk_plug plug;
861 struct bio *bio;
862
863 bio = bio_list_get(&conf->pending_bio_list);
864 conf->pending_count = 0;
865 spin_unlock_irq(&conf->device_lock);

		/*
		 * As this is called in a wait_event() loop (see freeze_array),
		 * current->state might be TASK_UNINTERRUPTIBLE, which would
		 * trigger a warning when we prepare to wait again.  This
		 * path is rarely taken, so simply mark the task running
		 * before issuing the I/O below.
		 */
876 __set_current_state(TASK_RUNNING);
877
878 blk_start_plug(&plug);

		/* flush any pending bitmap writes to disk before proceeding w/ I/O */
881 md_bitmap_unplug(conf->mddev->bitmap);
882 wake_up(&conf->wait_barrier);
883
884 while (bio) {
885 struct bio *next = bio->bi_next;
886 struct md_rdev *rdev = (void*)bio->bi_bdev;
887 bio->bi_next = NULL;
888 bio_set_dev(bio, rdev->bdev);
889 if (test_bit(Faulty, &rdev->flags)) {
890 bio_io_error(bio);
891 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
892 !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
893
894 bio_endio(bio);
895 else
896 submit_bio_noacct(bio);
897 bio = next;
898 }
899 blk_finish_plug(&plug);
900 } else
901 spin_unlock_irq(&conf->device_lock);
902}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier() when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier() when the IO is complete.
 */
926static void raise_barrier(struct r10conf *conf, int force)
927{
928 BUG_ON(force && !conf->barrier);
929 spin_lock_irq(&conf->resync_lock);
930
931
932 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
933 conf->resync_lock);
934
935
936 conf->barrier++;
937
938
939 wait_event_lock_irq(conf->wait_barrier,
940 !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
941 conf->resync_lock);
942
943 spin_unlock_irq(&conf->resync_lock);
944}
945
946static void lower_barrier(struct r10conf *conf)
947{
948 unsigned long flags;
949 spin_lock_irqsave(&conf->resync_lock, flags);
950 conf->barrier--;
951 spin_unlock_irqrestore(&conf->resync_lock, flags);
952 wake_up(&conf->wait_barrier);
953}
954
955static void wait_barrier(struct r10conf *conf)
956{
957 spin_lock_irq(&conf->resync_lock);
958 if (conf->barrier) {
959 struct bio_list *bio_list = current->bio_list;
960 conf->nr_waiting++;
		/* Wait for the barrier to drop.
		 * However if there are already pending
		 * requests (preventing the barrier from
		 * rising completely), and the
		 * pre-process bio queue isn't empty,
		 * then don't wait, as we need to empty
		 * that queue to get the nr_pending
		 * count down.
		 */
970 raid10_log(conf->mddev, "wait barrier");
971 wait_event_lock_irq(conf->wait_barrier,
972 !conf->barrier ||
973 (atomic_read(&conf->nr_pending) &&
974 bio_list &&
975 (!bio_list_empty(&bio_list[0]) ||
976 !bio_list_empty(&bio_list[1]))) ||
			     /* move on if recovery thread is
			      * blocked by us
			      */
980 (conf->mddev->thread->tsk == current &&
981 test_bit(MD_RECOVERY_RUNNING,
982 &conf->mddev->recovery) &&
983 conf->nr_queued > 0),
984 conf->resync_lock);
985 conf->nr_waiting--;
986 if (!conf->nr_waiting)
987 wake_up(&conf->wait_barrier);
988 }
989 atomic_inc(&conf->nr_pending);
990 spin_unlock_irq(&conf->resync_lock);
991}
992
993static void allow_barrier(struct r10conf *conf)
994{
995 if ((atomic_dec_and_test(&conf->nr_pending)) ||
996 (conf->array_freeze_pending))
997 wake_up(&conf->wait_barrier);
998}
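/*
 * Typical pairing (illustrative):
 *   regular I/O:      wait_barrier(conf);  ... submit ...  allow_barrier(conf);
 *   resync/recovery:  raise_barrier(conf, 0);  ...  lower_barrier(conf);
 *   error handling:   freeze_array(conf, extra);  ...  unfreeze_array(conf);
 */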
999
1000static void freeze_array(struct r10conf *conf, int extra)
1001{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending matches nr_queued+extra.
	 * This is called in the context of one normal IO request
	 * that has failed. Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
1014 spin_lock_irq(&conf->resync_lock);
1015 conf->array_freeze_pending++;
1016 conf->barrier++;
1017 conf->nr_waiting++;
1018 wait_event_lock_irq_cmd(conf->wait_barrier,
1019 atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
1020 conf->resync_lock,
1021 flush_pending_writes(conf));
1022
1023 conf->array_freeze_pending--;
1024 spin_unlock_irq(&conf->resync_lock);
1025}
1026
1027static void unfreeze_array(struct r10conf *conf)
1028{
1029
1030 spin_lock_irq(&conf->resync_lock);
1031 conf->barrier--;
1032 conf->nr_waiting--;
1033 wake_up(&conf->wait_barrier);
1034 spin_unlock_irq(&conf->resync_lock);
1035}
1036
1037static sector_t choose_data_offset(struct r10bio *r10_bio,
1038 struct md_rdev *rdev)
1039{
1040 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1041 test_bit(R10BIO_Previous, &r10_bio->state))
1042 return rdev->data_offset;
1043 else
1044 return rdev->new_data_offset;
1045}
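/*
 * Note (added for clarity): during a reshape each rdev carries two data
 * offsets - data_offset for the old layout and new_data_offset for the new
 * one.  choose_data_offset() picks the right one depending on whether a
 * reshape is running and whether this r10_bio targets the pre-reshape
 * ('Previous') geometry.
 */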
1046
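/*
 * Write plugging (added for clarity): raid10_write_one_disk() queues cloned
 * write bios on a per-task plug callback (blk_check_plugged() with
 * raid10_unplug() below), so a burst of writes is submitted in one batch,
 * after the bitmap has been flushed, when the plug is released.
 */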
1047struct raid10_plug_cb {
1048 struct blk_plug_cb cb;
1049 struct bio_list pending;
1050 int pending_cnt;
1051};
1052
1053static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1054{
1055 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1056 cb);
1057 struct mddev *mddev = plug->cb.data;
1058 struct r10conf *conf = mddev->private;
1059 struct bio *bio;
1060
1061 if (from_schedule || current->bio_list) {
1062 spin_lock_irq(&conf->device_lock);
1063 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1064 conf->pending_count += plug->pending_cnt;
1065 spin_unlock_irq(&conf->device_lock);
1066 wake_up(&conf->wait_barrier);
1067 md_wakeup_thread(mddev->thread);
1068 kfree(plug);
1069 return;
1070 }
1071
1072
1073 bio = bio_list_get(&plug->pending);
1074 md_bitmap_unplug(mddev->bitmap);
1075 wake_up(&conf->wait_barrier);
1076
1077 while (bio) {
1078 struct bio *next = bio->bi_next;
1079 struct md_rdev *rdev = (void*)bio->bi_bdev;
1080 bio->bi_next = NULL;
1081 bio_set_dev(bio, rdev->bdev);
1082 if (test_bit(Faulty, &rdev->flags)) {
1083 bio_io_error(bio);
1084 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1085 !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
1086
1087 bio_endio(bio);
1088 else
1089 submit_bio_noacct(bio);
1090 bio = next;
1091 }
1092 kfree(plug);
1093}
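/*
 * Two paths above (noted for clarity): when called from the scheduler, or
 * while another bio_list is active, the pending writes are handed to the md
 * thread via conf->pending_bio_list and flush_pending_writes() will issue
 * them; otherwise they are submitted directly here after the bitmap unplug.
 */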

/*
 * 1. Register the new request and wait if the reconstruction thread has put
 * up a bar for new requests. Continue immediately if no resync is active
 * currently.
 * 2. If IO spans the reshape position, we need to wait for reshape to pass.
 */
1101static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
1102 struct bio *bio, sector_t sectors)
1103{
1104 wait_barrier(conf);
1105 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1106 bio->bi_iter.bi_sector < conf->reshape_progress &&
1107 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1108 raid10_log(conf->mddev, "wait reshape");
1109 allow_barrier(conf);
1110 wait_event(conf->wait_barrier,
1111 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1112 conf->reshape_progress >= bio->bi_iter.bi_sector +
1113 sectors);
1114 wait_barrier(conf);
1115 }
1116}
1117
1118static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1119 struct r10bio *r10_bio)
1120{
1121 struct r10conf *conf = mddev->private;
1122 struct bio *read_bio;
1123 const int op = bio_op(bio);
1124 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1125 int max_sectors;
1126 struct md_rdev *rdev;
1127 char b[BDEVNAME_SIZE];
1128 int slot = r10_bio->read_slot;
1129 struct md_rdev *err_rdev = NULL;
1130 gfp_t gfp = GFP_NOIO;
1131
1132 if (slot >= 0 && r10_bio->devs[slot].rdev) {
1133
1134
1135
1136
1137
1138
1139
1140 int disk;
1141
1142
1143
1144
1145 gfp = GFP_NOIO | __GFP_HIGH;
1146
1147 rcu_read_lock();
1148 disk = r10_bio->devs[slot].devnum;
1149 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1150 if (err_rdev)
1151 bdevname(err_rdev->bdev, b);
1152 else {
1153 strcpy(b, "???");
1154
1155 err_rdev = r10_bio->devs[slot].rdev;
1156 }
1157 rcu_read_unlock();
1158 }
1159
1160 regular_request_wait(mddev, conf, bio, r10_bio->sectors);
1161 rdev = read_balance(conf, r10_bio, &max_sectors);
1162 if (!rdev) {
1163 if (err_rdev) {
1164 pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1165 mdname(mddev), b,
1166 (unsigned long long)r10_bio->sector);
1167 }
1168 raid_end_bio_io(r10_bio);
1169 return;
1170 }
1171 if (err_rdev)
1172 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1173 mdname(mddev),
1174 bdevname(rdev->bdev, b),
1175 (unsigned long long)r10_bio->sector);
1176 if (max_sectors < bio_sectors(bio)) {
1177 struct bio *split = bio_split(bio, max_sectors,
1178 gfp, &conf->bio_split);
1179 bio_chain(split, bio);
1180 allow_barrier(conf);
1181 submit_bio_noacct(bio);
1182 wait_barrier(conf);
1183 bio = split;
1184 r10_bio->master_bio = bio;
1185 r10_bio->sectors = max_sectors;
1186 }
1187 slot = r10_bio->read_slot;
1188
1189 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
1190 r10_bio->start_time = bio_start_io_acct(bio);
1191 read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
1192
1193 r10_bio->devs[slot].bio = read_bio;
1194 r10_bio->devs[slot].rdev = rdev;
1195
1196 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1197 choose_data_offset(r10_bio, rdev);
1198 bio_set_dev(read_bio, rdev->bdev);
1199 read_bio->bi_end_io = raid10_end_read_request;
1200 bio_set_op_attrs(read_bio, op, do_sync);
1201 if (test_bit(FailFast, &rdev->flags) &&
1202 test_bit(R10BIO_FailFast, &r10_bio->state))
1203 read_bio->bi_opf |= MD_FAILFAST;
1204 read_bio->bi_private = r10_bio;
1205
1206 if (mddev->gendisk)
1207 trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
1208 r10_bio->sector);
1209 submit_bio_noacct(read_bio);
1210 return;
1211}
1212
1213static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1214 struct bio *bio, bool replacement,
1215 int n_copy)
1216{
1217 const int op = bio_op(bio);
1218 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1219 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1220 unsigned long flags;
1221 struct blk_plug_cb *cb;
1222 struct raid10_plug_cb *plug = NULL;
1223 struct r10conf *conf = mddev->private;
1224 struct md_rdev *rdev;
1225 int devnum = r10_bio->devs[n_copy].devnum;
1226 struct bio *mbio;
1227
1228 if (replacement) {
1229 rdev = conf->mirrors[devnum].replacement;
1230 if (rdev == NULL) {
1231
1232 smp_mb();
1233 rdev = conf->mirrors[devnum].rdev;
1234 }
1235 } else
1236 rdev = conf->mirrors[devnum].rdev;
1237
1238 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1239 if (replacement)
1240 r10_bio->devs[n_copy].repl_bio = mbio;
1241 else
1242 r10_bio->devs[n_copy].bio = mbio;
1243
1244 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1245 choose_data_offset(r10_bio, rdev));
1246 bio_set_dev(mbio, rdev->bdev);
1247 mbio->bi_end_io = raid10_end_write_request;
1248 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1249 if (!replacement && test_bit(FailFast,
1250 &conf->mirrors[devnum].rdev->flags)
1251 && enough(conf, devnum))
1252 mbio->bi_opf |= MD_FAILFAST;
1253 mbio->bi_private = r10_bio;
1254
1255 if (conf->mddev->gendisk)
1256 trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
1257 r10_bio->sector);
1258
1259 mbio->bi_bdev = (void *)rdev;
1260
1261 atomic_inc(&r10_bio->remaining);
1262
1263 cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1264 if (cb)
1265 plug = container_of(cb, struct raid10_plug_cb, cb);
1266 else
1267 plug = NULL;
1268 if (plug) {
1269 bio_list_add(&plug->pending, mbio);
1270 plug->pending_cnt++;
1271 } else {
1272 spin_lock_irqsave(&conf->device_lock, flags);
1273 bio_list_add(&conf->pending_bio_list, mbio);
1274 conf->pending_count++;
1275 spin_unlock_irqrestore(&conf->device_lock, flags);
1276 md_wakeup_thread(mddev->thread);
1277 }
1278}
1279
1280static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
1281{
1282 int i;
1283 struct r10conf *conf = mddev->private;
1284 struct md_rdev *blocked_rdev;
1285
1286retry_wait:
1287 blocked_rdev = NULL;
1288 rcu_read_lock();
1289 for (i = 0; i < conf->copies; i++) {
1290 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1291 struct md_rdev *rrdev = rcu_dereference(
1292 conf->mirrors[i].replacement);
1293 if (rdev == rrdev)
1294 rrdev = NULL;
1295 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1296 atomic_inc(&rdev->nr_pending);
1297 blocked_rdev = rdev;
1298 break;
1299 }
1300 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1301 atomic_inc(&rrdev->nr_pending);
1302 blocked_rdev = rrdev;
1303 break;
1304 }
1305
1306 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1307 sector_t first_bad;
1308 sector_t dev_sector = r10_bio->devs[i].addr;
1309 int bad_sectors;
1310 int is_bad;
1311

			/*
			 * Discard requests don't care about the write result,
			 * so they don't need to wait for a blocked disk here.
			 */
1316 if (!r10_bio->sectors)
1317 continue;
1318
1319 is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
1320 &first_bad, &bad_sectors);
1321 if (is_bad < 0) {
				/*
				 * Mustn't write here until the bad block
				 * is acknowledged
				 */
1326 atomic_inc(&rdev->nr_pending);
1327 set_bit(BlockedBadBlocks, &rdev->flags);
1328 blocked_rdev = rdev;
1329 break;
1330 }
1331 }
1332 }
1333 rcu_read_unlock();
1334
1335 if (unlikely(blocked_rdev)) {
1336
1337 allow_barrier(conf);
1338 raid10_log(conf->mddev, "%s wait rdev %d blocked",
1339 __func__, blocked_rdev->raid_disk);
1340 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1341 wait_barrier(conf);
1342 goto retry_wait;
1343 }
1344}
1345
1346static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1347 struct r10bio *r10_bio)
1348{
1349 struct r10conf *conf = mddev->private;
1350 int i;
1351 sector_t sectors;
1352 int max_sectors;
1353
1354 if ((mddev_is_clustered(mddev) &&
1355 md_cluster_ops->area_resyncing(mddev, WRITE,
1356 bio->bi_iter.bi_sector,
1357 bio_end_sector(bio)))) {
1358 DEFINE_WAIT(w);
1359 for (;;) {
1360 prepare_to_wait(&conf->wait_barrier,
1361 &w, TASK_IDLE);
1362 if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1363 bio->bi_iter.bi_sector, bio_end_sector(bio)))
1364 break;
1365 schedule();
1366 }
1367 finish_wait(&conf->wait_barrier, &w);
1368 }
1369
1370 sectors = r10_bio->sectors;
1371 regular_request_wait(mddev, conf, bio, sectors);
1372 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1373 (mddev->reshape_backwards
1374 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1375 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1376 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1377 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1378
1379 mddev->reshape_position = conf->reshape_progress;
1380 set_mask_bits(&mddev->sb_flags, 0,
1381 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1382 md_wakeup_thread(mddev->thread);
1383 raid10_log(conf->mddev, "wait reshape metadata");
1384 wait_event(mddev->sb_wait,
1385 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1386
1387 conf->reshape_safe = mddev->reshape_position;
1388 }
1389
1390 if (conf->pending_count >= max_queued_requests) {
1391 md_wakeup_thread(mddev->thread);
1392 raid10_log(mddev, "wait queued");
1393 wait_event(conf->wait_barrier,
1394 conf->pending_count < max_queued_requests);
1395 }

	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio.
	 * If there are known/acknowledged bad blocks on any device
	 * on which we have seen a write error, we want to avoid
	 * writing to those blocks.  This potentially requires several
	 * writes to write around the bad blocks.  Each set of writes
	 * gets its own r10_bio with a set of bios attached.
	 */
1406 r10_bio->read_slot = -1;
1407 raid10_find_phys(conf, r10_bio);
1408
1409 wait_blocked_dev(mddev, r10_bio);
1410
1411 rcu_read_lock();
1412 max_sectors = r10_bio->sectors;
1413
1414 for (i = 0; i < conf->copies; i++) {
1415 int d = r10_bio->devs[i].devnum;
1416 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1417 struct md_rdev *rrdev = rcu_dereference(
1418 conf->mirrors[d].replacement);
1419 if (rdev == rrdev)
1420 rrdev = NULL;
1421 if (rdev && (test_bit(Faulty, &rdev->flags)))
1422 rdev = NULL;
1423 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1424 rrdev = NULL;
1425
1426 r10_bio->devs[i].bio = NULL;
1427 r10_bio->devs[i].repl_bio = NULL;
1428
1429 if (!rdev && !rrdev) {
1430 set_bit(R10BIO_Degraded, &r10_bio->state);
1431 continue;
1432 }
1433 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1434 sector_t first_bad;
1435 sector_t dev_sector = r10_bio->devs[i].addr;
1436 int bad_sectors;
1437 int is_bad;
1438
1439 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1440 &first_bad, &bad_sectors);
1441 if (is_bad && first_bad <= dev_sector) {
1442
1443 bad_sectors -= (dev_sector - first_bad);
1444 if (bad_sectors < max_sectors)
1445
1446
1447
1448 max_sectors = bad_sectors;
				/* We don't set R10BIO_Degraded as that
				 * only applies if the disk is missing,
				 * so it might be re-added, and we want to
				 * know to recover this chunk.
				 * In this case the device is here, and the
				 * fact that this chunk is not in-sync is
				 * recorded in the bad block log.
				 */
1457 continue;
1458 }
1459 if (is_bad) {
1460 int good_sectors = first_bad - dev_sector;
1461 if (good_sectors < max_sectors)
1462 max_sectors = good_sectors;
1463 }
1464 }
1465 if (rdev) {
1466 r10_bio->devs[i].bio = bio;
1467 atomic_inc(&rdev->nr_pending);
1468 }
1469 if (rrdev) {
1470 r10_bio->devs[i].repl_bio = bio;
1471 atomic_inc(&rrdev->nr_pending);
1472 }
1473 }
1474 rcu_read_unlock();
1475
1476 if (max_sectors < r10_bio->sectors)
1477 r10_bio->sectors = max_sectors;
1478
1479 if (r10_bio->sectors < bio_sectors(bio)) {
1480 struct bio *split = bio_split(bio, r10_bio->sectors,
1481 GFP_NOIO, &conf->bio_split);
1482 bio_chain(split, bio);
1483 allow_barrier(conf);
1484 submit_bio_noacct(bio);
1485 wait_barrier(conf);
1486 bio = split;
1487 r10_bio->master_bio = bio;
1488 }
1489
1490 if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
1491 r10_bio->start_time = bio_start_io_acct(bio);
1492 atomic_set(&r10_bio->remaining, 1);
1493 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1494
1495 for (i = 0; i < conf->copies; i++) {
1496 if (r10_bio->devs[i].bio)
1497 raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1498 if (r10_bio->devs[i].repl_bio)
1499 raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1500 }
1501 one_write_done(r10_bio);
1502}
1503
1504static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1505{
1506 struct r10conf *conf = mddev->private;
1507 struct r10bio *r10_bio;
1508
1509 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1510
1511 r10_bio->master_bio = bio;
1512 r10_bio->sectors = sectors;
1513
1514 r10_bio->mddev = mddev;
1515 r10_bio->sector = bio->bi_iter.bi_sector;
1516 r10_bio->state = 0;
1517 r10_bio->read_slot = -1;
1518 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
1519 conf->geo.raid_disks);
1520
1521 if (bio_data_dir(bio) == READ)
1522 raid10_read_request(mddev, bio, r10_bio);
1523 else
1524 raid10_write_request(mddev, bio, r10_bio);
1525}
1526
1527static void raid_end_discard_bio(struct r10bio *r10bio)
1528{
1529 struct r10conf *conf = r10bio->mddev->private;
1530 struct r10bio *first_r10bio;
1531
1532 while (atomic_dec_and_test(&r10bio->remaining)) {
1533
1534 allow_barrier(conf);
1535
1536 if (!test_bit(R10BIO_Discard, &r10bio->state)) {
1537 first_r10bio = (struct r10bio *)r10bio->master_bio;
1538 free_r10bio(r10bio);
1539 r10bio = first_r10bio;
1540 } else {
1541 md_write_end(r10bio->mddev);
1542 bio_endio(r10bio->master_bio);
1543 free_r10bio(r10bio);
1544 break;
1545 }
1546 }
1547}
1548
1549static void raid10_end_discard_request(struct bio *bio)
1550{
1551 struct r10bio *r10_bio = bio->bi_private;
1552 struct r10conf *conf = r10_bio->mddev->private;
1553 struct md_rdev *rdev = NULL;
1554 int dev;
1555 int slot, repl;

	/*
	 * We don't care about the return value of a discard bio.
	 */
1560 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
1561 set_bit(R10BIO_Uptodate, &r10_bio->state);
1562
1563 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1564 if (repl)
1565 rdev = conf->mirrors[dev].replacement;
1566 if (!rdev) {
		/*
		 * raid10_remove_disk uses smp_mb to make sure rdev is set to
		 * replacement before setting replacement to NULL. It can read
		 * rdev first without barrier protection even if replacement
		 * is NULL.
		 */
1572 smp_rmb();
1573 rdev = conf->mirrors[dev].rdev;
1574 }
1575
1576 raid_end_discard_bio(r10_bio);
1577 rdev_dec_pending(rdev, conf->mddev);
1578}

/*
 * There are some limitations to handling a discard bio:
 * 1st, the discard size must be bigger than stripe_size*2.
 * 2nd, if the discard bio spans reshape progress, we use the old way to
 * handle the discard request.
 */
1586static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
1587{
1588 struct r10conf *conf = mddev->private;
1589 struct geom *geo = &conf->geo;
1590 int far_copies = geo->far_copies;
1591 bool first_copy = true;
1592 struct r10bio *r10_bio, *first_r10bio;
1593 struct bio *split;
1594 int disk;
1595 sector_t chunk;
1596 unsigned int stripe_size;
1597 unsigned int stripe_data_disks;
1598 sector_t split_size;
1599 sector_t bio_start, bio_end;
1600 sector_t first_stripe_index, last_stripe_index;
1601 sector_t start_disk_offset;
1602 unsigned int start_disk_index;
1603 sector_t end_disk_offset;
1604 unsigned int end_disk_index;
1605 unsigned int remainder;
1606
1607 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1608 return -EAGAIN;
1609
1610 wait_barrier(conf);
1611
1612
1613
1614
1615
1616 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1617 goto out;
1618
1619 if (geo->near_copies)
1620 stripe_data_disks = geo->raid_disks / geo->near_copies +
1621 geo->raid_disks % geo->near_copies;
1622 else
1623 stripe_data_disks = geo->raid_disks;
1624
1625 stripe_size = stripe_data_disks << geo->chunk_shift;
1626
1627 bio_start = bio->bi_iter.bi_sector;
1628 bio_end = bio_end_sector(bio);
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638 if (bio_sectors(bio) < stripe_size*2)
1639 goto out;
1640
1641
1642
1643
1644 div_u64_rem(bio_start, stripe_size, &remainder);
1645 if (remainder) {
1646 split_size = stripe_size - remainder;
1647 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
1648 bio_chain(split, bio);
1649 allow_barrier(conf);
1650
1651 submit_bio_noacct(split);
1652 wait_barrier(conf);
1653 }
1654 div_u64_rem(bio_end, stripe_size, &remainder);
1655 if (remainder) {
1656 split_size = bio_sectors(bio) - remainder;
1657 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
1658 bio_chain(split, bio);
1659 allow_barrier(conf);
1660
1661 submit_bio_noacct(bio);
1662 bio = split;
1663 wait_barrier(conf);
1664 }
1665
1666 bio_start = bio->bi_iter.bi_sector;
1667 bio_end = bio_end_sector(bio);
1668
1669
1670
1671
1672
1673
1674 chunk = bio_start >> geo->chunk_shift;
1675 chunk *= geo->near_copies;
1676 first_stripe_index = chunk;
1677 start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
1678 if (geo->far_offset)
1679 first_stripe_index *= geo->far_copies;
1680 start_disk_offset = (bio_start & geo->chunk_mask) +
1681 (first_stripe_index << geo->chunk_shift);
1682
1683 chunk = bio_end >> geo->chunk_shift;
1684 chunk *= geo->near_copies;
1685 last_stripe_index = chunk;
1686 end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
1687 if (geo->far_offset)
1688 last_stripe_index *= geo->far_copies;
1689 end_disk_offset = (bio_end & geo->chunk_mask) +
1690 (last_stripe_index << geo->chunk_shift);
1691
1692retry_discard:
1693 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1694 r10_bio->mddev = mddev;
1695 r10_bio->state = 0;
1696 r10_bio->sectors = 0;
1697 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
1698 wait_blocked_dev(mddev, r10_bio);

	/*
	 * For far layouts more than one r10bio is needed to cover all regions.
	 * As in raid10_sync_request, the first r10bio->master_bio records the
	 * discard bio, and every subsequent r10bio records the first r10bio in
	 * its master_bio.  The first r10bio is released only after all the
	 * others have finished, and only then does the discard bio complete.
	 */
1707 if (first_copy) {
1708 r10_bio->master_bio = bio;
1709 set_bit(R10BIO_Discard, &r10_bio->state);
1710 first_copy = false;
1711 first_r10bio = r10_bio;
1712 } else
1713 r10_bio->master_bio = (struct bio *)first_r10bio;
1714
1715 rcu_read_lock();
1716 for (disk = 0; disk < geo->raid_disks; disk++) {
1717 struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
1718 struct md_rdev *rrdev = rcu_dereference(
1719 conf->mirrors[disk].replacement);
1720
1721 r10_bio->devs[disk].bio = NULL;
1722 r10_bio->devs[disk].repl_bio = NULL;
1723
1724 if (rdev && (test_bit(Faulty, &rdev->flags)))
1725 rdev = NULL;
1726 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1727 rrdev = NULL;
1728 if (!rdev && !rrdev)
1729 continue;
1730
1731 if (rdev) {
1732 r10_bio->devs[disk].bio = bio;
1733 atomic_inc(&rdev->nr_pending);
1734 }
1735 if (rrdev) {
1736 r10_bio->devs[disk].repl_bio = bio;
1737 atomic_inc(&rrdev->nr_pending);
1738 }
1739 }
1740 rcu_read_unlock();
1741
1742 atomic_set(&r10_bio->remaining, 1);
1743 for (disk = 0; disk < geo->raid_disks; disk++) {
1744 sector_t dev_start, dev_end;
1745 struct bio *mbio, *rbio = NULL;
1746 struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
1747 struct md_rdev *rrdev = rcu_dereference(
1748 conf->mirrors[disk].replacement);
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762 if (disk < start_disk_index)
1763 dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
1764 else if (disk > start_disk_index)
1765 dev_start = first_stripe_index * mddev->chunk_sectors;
1766 else
1767 dev_start = start_disk_offset;
1768
1769 if (disk < end_disk_index)
1770 dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
1771 else if (disk > end_disk_index)
1772 dev_end = last_stripe_index * mddev->chunk_sectors;
1773 else
1774 dev_end = end_disk_offset;
1775
1776
1777
1778
1779
1780 if (r10_bio->devs[disk].bio) {
1781 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1782 mbio->bi_end_io = raid10_end_discard_request;
1783 mbio->bi_private = r10_bio;
1784 r10_bio->devs[disk].bio = mbio;
1785 r10_bio->devs[disk].devnum = disk;
1786 atomic_inc(&r10_bio->remaining);
1787 md_submit_discard_bio(mddev, rdev, mbio,
1788 dev_start + choose_data_offset(r10_bio, rdev),
1789 dev_end - dev_start);
1790 bio_endio(mbio);
1791 }
1792 if (r10_bio->devs[disk].repl_bio) {
1793 rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1794 rbio->bi_end_io = raid10_end_discard_request;
1795 rbio->bi_private = r10_bio;
1796 r10_bio->devs[disk].repl_bio = rbio;
1797 r10_bio->devs[disk].devnum = disk;
1798 atomic_inc(&r10_bio->remaining);
1799 md_submit_discard_bio(mddev, rrdev, rbio,
1800 dev_start + choose_data_offset(r10_bio, rrdev),
1801 dev_end - dev_start);
1802 bio_endio(rbio);
1803 }
1804 }
1805
1806 if (!geo->far_offset && --far_copies) {
1807 first_stripe_index += geo->stride >> geo->chunk_shift;
1808 start_disk_offset += geo->stride;
1809 last_stripe_index += geo->stride >> geo->chunk_shift;
1810 end_disk_offset += geo->stride;
1811 atomic_inc(&first_r10bio->remaining);
1812 raid_end_discard_bio(r10_bio);
1813 wait_barrier(conf);
1814 goto retry_discard;
1815 }
1816
1817 raid_end_discard_bio(r10_bio);
1818
1819 return 0;
1820out:
1821 allow_barrier(conf);
1822 return -EAGAIN;
1823}
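/*
 * Design note (added for clarity): for 'far' layouts that are not
 * far_offset, each far copy lives geo->stride sectors further into every
 * member device, so the function above loops 'far_copies' times via
 * retry_discard, shifting the per-disk start/end by the stride and issuing
 * one r10_bio per copy; all of them are chained to the first r10_bio so the
 * original discard bio completes only once every copy has been trimmed.
 */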
1824
1825static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1826{
1827 struct r10conf *conf = mddev->private;
1828 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1829 int chunk_sects = chunk_mask + 1;
1830 int sectors = bio_sectors(bio);
1831
1832 if (unlikely(bio->bi_opf & REQ_PREFLUSH)
1833 && md_flush_request(mddev, bio))
1834 return true;
1835
1836 if (!md_write_start(mddev, bio))
1837 return false;
1838
1839 if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
1840 if (!raid10_handle_discard(mddev, bio))
1841 return true;

	/*
	 * If this request crosses a chunk boundary, we need to split
	 * it.
	 */
1847 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1848 sectors > chunk_sects
1849 && (conf->geo.near_copies < conf->geo.raid_disks
1850 || conf->prev.near_copies <
1851 conf->prev.raid_disks)))
1852 sectors = chunk_sects -
1853 (bio->bi_iter.bi_sector &
1854 (chunk_sects - 1));
1855 __make_request(mddev, bio, sectors);
1856
1857
1858 wake_up(&conf->wait_barrier);
1859 return true;
1860}
1861
1862static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1863{
1864 struct r10conf *conf = mddev->private;
1865 int i;
1866
1867 if (conf->geo.near_copies < conf->geo.raid_disks)
1868 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1869 if (conf->geo.near_copies > 1)
1870 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1871 if (conf->geo.far_copies > 1) {
1872 if (conf->geo.far_offset)
1873 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1874 else
1875 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1876 if (conf->geo.far_set_size != conf->geo.raid_disks)
1877 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1878 }
1879 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1880 conf->geo.raid_disks - mddev->degraded);
1881 rcu_read_lock();
1882 for (i = 0; i < conf->geo.raid_disks; i++) {
1883 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1884 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1885 }
1886 rcu_read_unlock();
1887 seq_printf(seq, "]");
1888}

/*
 * _enough() checks that, for every group of 'ncopies' adjacent slots,
 * at least one member (other than 'ignore') is present and In_sync,
 * i.e. every block in the array is still readable from somewhere.
 */
1895static int _enough(struct r10conf *conf, int previous, int ignore)
1896{
1897 int first = 0;
1898 int has_enough = 0;
1899 int disks, ncopies;
1900 if (previous) {
1901 disks = conf->prev.raid_disks;
1902 ncopies = conf->prev.near_copies;
1903 } else {
1904 disks = conf->geo.raid_disks;
1905 ncopies = conf->geo.near_copies;
1906 }
1907
1908 rcu_read_lock();
1909 do {
1910 int n = conf->copies;
1911 int cnt = 0;
1912 int this = first;
1913 while (n--) {
1914 struct md_rdev *rdev;
1915 if (this != ignore &&
1916 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1917 test_bit(In_sync, &rdev->flags))
1918 cnt++;
1919 this = (this+1) % disks;
1920 }
1921 if (cnt == 0)
1922 goto out;
1923 first = (first + ncopies) % disks;
1924 } while (first != 0);
1925 has_enough = 1;
1926out:
1927 rcu_read_unlock();
1928 return has_enough;
1929}
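/*
 * Worked example (illustrative): with raid_disks=4, near_copies=2 and
 * far_copies=1, the copies of any given block live on devices {0,1} or
 * {2,3}.  _enough() walks those groups and succeeds only if each group
 * still contains at least one working, In_sync member (other than
 * 'ignore'), i.e. no block has lost all of its copies.
 */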
1930
1931static int enough(struct r10conf *conf, int ignore)
1932{
	/* when calling 'enough', both 'prev' and 'geo' must
	 * be stable.
	 * This is ensured if ->reconfig_mutex or ->device_lock
	 * is held.
	 */
1938 return _enough(conf, 0, ignore) &&
1939 _enough(conf, 1, ignore);
1940}
1941
1942static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1943{
1944 char b[BDEVNAME_SIZE];
1945 struct r10conf *conf = mddev->private;
1946 unsigned long flags;
1947
1948
1949
1950
1951
1952
1953
1954 spin_lock_irqsave(&conf->device_lock, flags);
1955 if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
1956 && !enough(conf, rdev->raid_disk)) {
1957
1958
1959
1960 spin_unlock_irqrestore(&conf->device_lock, flags);
1961 return;
1962 }
1963 if (test_and_clear_bit(In_sync, &rdev->flags))
1964 mddev->degraded++;
1965
1966
1967
1968 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1969 set_bit(Blocked, &rdev->flags);
1970 set_bit(Faulty, &rdev->flags);
1971 set_mask_bits(&mddev->sb_flags, 0,
1972 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1973 spin_unlock_irqrestore(&conf->device_lock, flags);
1974 pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1975 "md/raid10:%s: Operation continuing on %d devices.\n",
1976 mdname(mddev), bdevname(rdev->bdev, b),
1977 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1978}
1979
1980static void print_conf(struct r10conf *conf)
1981{
1982 int i;
1983 struct md_rdev *rdev;
1984
1985 pr_debug("RAID10 conf printout:\n");
1986 if (!conf) {
1987 pr_debug("(!conf)\n");
1988 return;
1989 }
1990 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1991 conf->geo.raid_disks);
1992
1993
1994
1995 for (i = 0; i < conf->geo.raid_disks; i++) {
1996 char b[BDEVNAME_SIZE];
1997 rdev = conf->mirrors[i].rdev;
1998 if (rdev)
1999 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
2000 i, !test_bit(In_sync, &rdev->flags),
2001 !test_bit(Faulty, &rdev->flags),
2002 bdevname(rdev->bdev,b));
2003 }
2004}
2005
2006static void close_sync(struct r10conf *conf)
2007{
2008 wait_barrier(conf);
2009 allow_barrier(conf);
2010
2011 mempool_exit(&conf->r10buf_pool);
2012}
2013
2014static int raid10_spare_active(struct mddev *mddev)
2015{
2016 int i;
2017 struct r10conf *conf = mddev->private;
2018 struct raid10_info *tmp;
2019 int count = 0;
2020 unsigned long flags;

	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync
	 */
2026 for (i = 0; i < conf->geo.raid_disks; i++) {
2027 tmp = conf->mirrors + i;
2028 if (tmp->replacement
2029 && tmp->replacement->recovery_offset == MaxSector
2030 && !test_bit(Faulty, &tmp->replacement->flags)
2031 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
2032
2033 if (!tmp->rdev
2034 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
2035 count++;
2036 if (tmp->rdev) {
			/* Replaced device not technically faulty,
			 * but we need to be sure it gets removed
			 * and never re-added.
			 */
2041 set_bit(Faulty, &tmp->rdev->flags);
2042 sysfs_notify_dirent_safe(
2043 tmp->rdev->sysfs_state);
2044 }
2045 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
2046 } else if (tmp->rdev
2047 && tmp->rdev->recovery_offset == MaxSector
2048 && !test_bit(Faulty, &tmp->rdev->flags)
2049 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
2050 count++;
2051 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
2052 }
2053 }
2054 spin_lock_irqsave(&conf->device_lock, flags);
2055 mddev->degraded -= count;
2056 spin_unlock_irqrestore(&conf->device_lock, flags);
2057
2058 print_conf(conf);
2059 return count;
2060}
2061
2062static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
2063{
2064 struct r10conf *conf = mddev->private;
2065 int err = -EEXIST;
2066 int mirror;
2067 int first = 0;
2068 int last = conf->geo.raid_disks - 1;
2069
2070 if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
2074 return -EBUSY;
2075 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
2076 return -EINVAL;
2077
2078 if (md_integrity_add_rdev(rdev, mddev))
2079 return -ENXIO;
2080
2081 if (rdev->raid_disk >= 0)
2082 first = last = rdev->raid_disk;
2083
2084 if (rdev->saved_raid_disk >= first &&
2085 rdev->saved_raid_disk < conf->geo.raid_disks &&
2086 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
2087 mirror = rdev->saved_raid_disk;
2088 else
2089 mirror = first;
2090 for ( ; mirror <= last ; mirror++) {
2091 struct raid10_info *p = &conf->mirrors[mirror];
2092 if (p->recovery_disabled == mddev->recovery_disabled)
2093 continue;
2094 if (p->rdev) {
2095 if (!test_bit(WantReplacement, &p->rdev->flags) ||
2096 p->replacement != NULL)
2097 continue;
2098 clear_bit(In_sync, &rdev->flags);
2099 set_bit(Replacement, &rdev->flags);
2100 rdev->raid_disk = mirror;
2101 err = 0;
2102 if (mddev->gendisk)
2103 disk_stack_limits(mddev->gendisk, rdev->bdev,
2104 rdev->data_offset << 9);
2105 conf->fullsync = 1;
2106 rcu_assign_pointer(p->replacement, rdev);
2107 break;
2108 }
2109
2110 if (mddev->gendisk)
2111 disk_stack_limits(mddev->gendisk, rdev->bdev,
2112 rdev->data_offset << 9);
2113
2114 p->head_position = 0;
2115 p->recovery_disabled = mddev->recovery_disabled - 1;
2116 rdev->raid_disk = mirror;
2117 err = 0;
2118 if (rdev->saved_raid_disk != mirror)
2119 conf->fullsync = 1;
2120 rcu_assign_pointer(p->rdev, rdev);
2121 break;
2122 }
2123 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
2124 blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
2125
2126 print_conf(conf);
2127 return err;
2128}
2129
2130static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
2131{
2132 struct r10conf *conf = mddev->private;
2133 int err = 0;
2134 int number = rdev->raid_disk;
2135 struct md_rdev **rdevp;
2136 struct raid10_info *p = conf->mirrors + number;
2137
2138 print_conf(conf);
2139 if (rdev == p->rdev)
2140 rdevp = &p->rdev;
2141 else if (rdev == p->replacement)
2142 rdevp = &p->replacement;
2143 else
2144 return 0;
2145
2146 if (test_bit(In_sync, &rdev->flags) ||
2147 atomic_read(&rdev->nr_pending)) {
2148 err = -EBUSY;
2149 goto abort;
2150 }
	/* Only remove non-faulty devices if recovery
	 * is not possible.
	 */
2154 if (!test_bit(Faulty, &rdev->flags) &&
2155 mddev->recovery_disabled != p->recovery_disabled &&
2156 (!p->replacement || p->replacement == rdev) &&
2157 number < conf->geo.raid_disks &&
2158 enough(conf, -1)) {
2159 err = -EBUSY;
2160 goto abort;
2161 }
2162 *rdevp = NULL;
2163 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
2164 synchronize_rcu();
2165 if (atomic_read(&rdev->nr_pending)) {
2166
2167 err = -EBUSY;
2168 *rdevp = rdev;
2169 goto abort;
2170 }
2171 }
2172 if (p->replacement) {
2173
2174 p->rdev = p->replacement;
2175 clear_bit(Replacement, &p->replacement->flags);
2176 smp_mb();
2177
2178
2179 p->replacement = NULL;
2180 }
2181
2182 clear_bit(WantReplacement, &rdev->flags);
2183 err = md_integrity_register(mddev);
2184
2185abort:
2186
2187 print_conf(conf);
2188 return err;
2189}
2190
2191static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
2192{
2193 struct r10conf *conf = r10_bio->mddev->private;
2194
2195 if (!bio->bi_status)
2196 set_bit(R10BIO_Uptodate, &r10_bio->state);
2197 else
		/* The write handler will notice the lack of
		 * R10BIO_Uptodate and record any errors etc
		 */
2201 atomic_add(r10_bio->sectors,
2202 &conf->mirrors[d].rdev->corrected_errors);

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
2207 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
2208 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
2209 atomic_dec_and_test(&r10_bio->remaining)) {
2210
2211
2212
2213 reschedule_retry(r10_bio);
2214 }
2215}
2216
2217static void end_sync_read(struct bio *bio)
2218{
2219 struct r10bio *r10_bio = get_resync_r10bio(bio);
2220 struct r10conf *conf = r10_bio->mddev->private;
2221 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
2222
2223 __end_sync_read(r10_bio, bio, d);
2224}
2225
2226static void end_reshape_read(struct bio *bio)
{
	/* reshape read bio isn't allocated from r10buf_pool */
2229 struct r10bio *r10_bio = bio->bi_private;
2230
2231 __end_sync_read(r10_bio, bio, r10_bio->read_slot);
2232}
2233
2234static void end_sync_request(struct r10bio *r10_bio)
2235{
2236 struct mddev *mddev = r10_bio->mddev;
2237
2238 while (atomic_dec_and_test(&r10_bio->remaining)) {
2239 if (r10_bio->master_bio == NULL) {
2240
2241 sector_t s = r10_bio->sectors;
2242 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2243 test_bit(R10BIO_WriteError, &r10_bio->state))
2244 reschedule_retry(r10_bio);
2245 else
2246 put_buf(r10_bio);
2247 md_done_sync(mddev, s, 1);
2248 break;
2249 } else {
2250 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
2251 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2252 test_bit(R10BIO_WriteError, &r10_bio->state))
2253 reschedule_retry(r10_bio);
2254 else
2255 put_buf(r10_bio);
2256 r10_bio = r10_bio2;
2257 }
2258 }
2259}
2260
2261static void end_sync_write(struct bio *bio)
2262{
2263 struct r10bio *r10_bio = get_resync_r10bio(bio);
2264 struct mddev *mddev = r10_bio->mddev;
2265 struct r10conf *conf = mddev->private;
2266 int d;
2267 sector_t first_bad;
2268 int bad_sectors;
2269 int slot;
2270 int repl;
2271 struct md_rdev *rdev = NULL;
2272
2273 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
2274 if (repl)
2275 rdev = conf->mirrors[d].replacement;
2276 else
2277 rdev = conf->mirrors[d].rdev;
2278
2279 if (bio->bi_status) {
2280 if (repl)
2281 md_error(mddev, rdev);
2282 else {
2283 set_bit(WriteErrorSeen, &rdev->flags);
2284 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2285 set_bit(MD_RECOVERY_NEEDED,
2286 &rdev->mddev->recovery);
2287 set_bit(R10BIO_WriteError, &r10_bio->state);
2288 }
2289 } else if (is_badblock(rdev,
2290 r10_bio->devs[slot].addr,
2291 r10_bio->sectors,
2292 &first_bad, &bad_sectors))
2293 set_bit(R10BIO_MadeGood, &r10_bio->state);
2294
2295 rdev_dec_pending(rdev, mddev);
2296
2297 end_sync_request(r10_bio);
2298}

/*
 * Note: sync and recovery are handled very differently for raid10.
 * This code is for resync.
 * For resync we read, through the virtual address space, all in-use copies
 * of every block.  The first copy that read successfully is used as the
 * reference; any copy that failed to read, or whose data differs from the
 * reference, is rewritten from it.  Active replacement devices are always
 * written.
 * (Recovery is handled separately: see recovery_request_write() and
 * fix_recovery_read_error().)
 */
2316static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2317{
2318 struct r10conf *conf = mddev->private;
2319 int i, first;
2320 struct bio *tbio, *fbio;
2321 int vcnt;
2322 struct page **tpages, **fpages;
2323
2324 atomic_set(&r10_bio->remaining, 1);
2325
2326
2327 for (i=0; i<conf->copies; i++)
2328 if (!r10_bio->devs[i].bio->bi_status)
2329 break;
2330
2331 if (i == conf->copies)
2332 goto done;
2333
2334 first = i;
2335 fbio = r10_bio->devs[i].bio;
2336 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2337 fbio->bi_iter.bi_idx = 0;
2338 fpages = get_resync_pages(fbio)->pages;
2339
2340 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
2341
2342 for (i=0 ; i < conf->copies ; i++) {
2343 int j, d;
2344 struct md_rdev *rdev;
2345 struct resync_pages *rp;
2346
2347 tbio = r10_bio->devs[i].bio;
2348
2349 if (tbio->bi_end_io != end_sync_read)
2350 continue;
2351 if (i == first)
2352 continue;
2353
2354 tpages = get_resync_pages(tbio)->pages;
2355 d = r10_bio->devs[i].devnum;
2356 rdev = conf->mirrors[d].rdev;
2357 if (!r10_bio->devs[i].bio->bi_status) {
			/* We know that the bi_io_vec layout is the same for
			 * both 'first' and 'i', so we just compare them.
			 * All vec entries are PAGE_SIZE;
			 */
2362 int sectors = r10_bio->sectors;
2363 for (j = 0; j < vcnt; j++) {
2364 int len = PAGE_SIZE;
2365 if (sectors < (len / 512))
2366 len = sectors * 512;
2367 if (memcmp(page_address(fpages[j]),
2368 page_address(tpages[j]),
2369 len))
2370 break;
2371 sectors -= len/512;
2372 }
2373 if (j == vcnt)
2374 continue;
2375 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2376 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2377
2378 continue;
2379 } else if (test_bit(FailFast, &rdev->flags)) {
			/* Just give up on this device */
2381 md_error(rdev->mddev, rdev);
2382 continue;
2383 }
		/* Ok, we need to write this bio, either to correct an
		 * inconsistency or to correct an unreadable block.
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted data
		 */
2389 rp = get_resync_pages(tbio);
2390 bio_reset(tbio);
2391
2392 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2393
2394 rp->raid_bio = r10_bio;
2395 tbio->bi_private = rp;
2396 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2397 tbio->bi_end_io = end_sync_write;
2398 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2399
2400 bio_copy_data(tbio, fbio);
2401
2402 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2403 atomic_inc(&r10_bio->remaining);
2404 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2405
2406 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2407 tbio->bi_opf |= MD_FAILFAST;
2408 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2409 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2410 submit_bio_noacct(tbio);
2411 }

	/* Now write out to any replacement devices
	 * that are active
	 */
2416 for (i = 0; i < conf->copies; i++) {
2417 int d;
2418
2419 tbio = r10_bio->devs[i].repl_bio;
2420 if (!tbio || !tbio->bi_end_io)
2421 continue;
2422 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2423 && r10_bio->devs[i].bio != fbio)
2424 bio_copy_data(tbio, fbio);
2425 d = r10_bio->devs[i].devnum;
2426 atomic_inc(&r10_bio->remaining);
2427 md_sync_acct(conf->mirrors[d].replacement->bdev,
2428 bio_sectors(tbio));
2429 submit_bio_noacct(tbio);
2430 }
2431
2432done:
2433 if (atomic_dec_and_test(&r10_bio->remaining)) {
2434 md_done_sync(mddev, r10_bio->sectors, 1);
2435 put_buf(r10_bio);
2436 }
2437}
2438

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use. The first for reading,
 * the second for writing.
 */
2449static void fix_recovery_read_error(struct r10bio *r10_bio)
2450{
	/* We got a read error during recovery.
	 * We repeat the read in smaller page-sized chunks.
	 * If a read succeeds, write it to the new device or record
	 * a bad block if we cannot.
	 * If a read fails, record a bad block on both old and
	 * new devices.
	 */
2458 struct mddev *mddev = r10_bio->mddev;
2459 struct r10conf *conf = mddev->private;
2460 struct bio *bio = r10_bio->devs[0].bio;
2461 sector_t sect = 0;
2462 int sectors = r10_bio->sectors;
2463 int idx = 0;
2464 int dr = r10_bio->devs[0].devnum;
2465 int dw = r10_bio->devs[1].devnum;
2466 struct page **pages = get_resync_pages(bio)->pages;
2467
2468 while (sectors) {
2469 int s = sectors;
2470 struct md_rdev *rdev;
2471 sector_t addr;
2472 int ok;
2473
2474 if (s > (PAGE_SIZE>>9))
2475 s = PAGE_SIZE >> 9;
2476
2477 rdev = conf->mirrors[dr].rdev;
		addr = r10_bio->devs[0].addr + sect;
2479 ok = sync_page_io(rdev,
2480 addr,
2481 s << 9,
2482 pages[idx],
2483 REQ_OP_READ, 0, false);
2484 if (ok) {
2485 rdev = conf->mirrors[dw].rdev;
2486 addr = r10_bio->devs[1].addr + sect;
2487 ok = sync_page_io(rdev,
2488 addr,
2489 s << 9,
2490 pages[idx],
2491 REQ_OP_WRITE, 0, false);
2492 if (!ok) {
2493 set_bit(WriteErrorSeen, &rdev->flags);
2494 if (!test_and_set_bit(WantReplacement,
2495 &rdev->flags))
2496 set_bit(MD_RECOVERY_NEEDED,
2497 &rdev->mddev->recovery);
2498 }
2499 }
2500 if (!ok) {
			/* We don't worry if we cannot set a bad block -
			 * it really is bad so there is no loss in not
			 * recording it yet
			 */
2505 rdev_set_badblocks(rdev, addr, s, 0);
2506
2507 if (rdev != conf->mirrors[dw].rdev) {
				/* need bad block on destination too */
2509 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2510 addr = r10_bio->devs[1].addr + sect;
2511 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2512 if (!ok) {
					/* just abort the recovery */
2514 pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2515 mdname(mddev));
2516
2517 conf->mirrors[dw].recovery_disabled
2518 = mddev->recovery_disabled;
2519 set_bit(MD_RECOVERY_INTR,
2520 &mddev->recovery);
2521 break;
2522 }
2523 }
2524 }
2525
2526 sectors -= s;
2527 sect += s;
2528 idx++;
2529 }
2530}
2531
2532static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2533{
2534 struct r10conf *conf = mddev->private;
2535 int d;
2536 struct bio *wbio, *wbio2;
2537
2538 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2539 fix_recovery_read_error(r10_bio);
2540 end_sync_request(r10_bio);
2541 return;
2542 }

	/*
	 * share the pages with the first bio
	 * and submit the write request
	 */
2548 d = r10_bio->devs[1].devnum;
2549 wbio = r10_bio->devs[1].bio;
2550 wbio2 = r10_bio->devs[1].repl_bio;
	/* Need to test wbio2->bi_end_io before we call
	 * submit_bio_noacct as if the former is NULL,
	 * the latter is free to free wbio2.
	 */
2555 if (wbio2 && !wbio2->bi_end_io)
2556 wbio2 = NULL;
2557 if (wbio->bi_end_io) {
2558 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2559 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2560 submit_bio_noacct(wbio);
2561 }
2562 if (wbio2) {
2563 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2564 md_sync_acct(conf->mirrors[d].replacement->bdev,
2565 bio_sectors(wbio2));
2566 submit_bio_noacct(wbio2);
2567 }
2568}
2569
/*
 * Used by fix_read_error() to decay the per rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
2576static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2577{
2578 long cur_time_mon;
2579 unsigned long hours_since_last;
2580 unsigned int read_errors = atomic_read(&rdev->read_errors);
2581
2582 cur_time_mon = ktime_get_seconds();
2583
2584 if (rdev->last_read_error == 0) {
		/* first time we've seen a read error */
2586 rdev->last_read_error = cur_time_mon;
2587 return;
2588 }
2589
2590 hours_since_last = (long)(cur_time_mon -
2591 rdev->last_read_error) / 3600;
2592
2593 rdev->last_read_error = cur_time_mon;
2594
	/*
	 * if hours_since_last is > the number of bits in read_errors
	 * just set read errors to 0. We do this to avoid
	 * overflowing the shift of read_errors by hours_since_last.
	 */
2600 if (hours_since_last >= 8 * sizeof(read_errors))
2601 atomic_set(&rdev->read_errors, 0);
2602 else
2603 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2604}
2605
2606static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2607 int sectors, struct page *page, int rw)
2608{
2609 sector_t first_bad;
2610 int bad_sectors;
2611
2612 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2613 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2614 return -1;
2615 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
		/* success */
2617 return 1;
2618 if (rw == WRITE) {
2619 set_bit(WriteErrorSeen, &rdev->flags);
2620 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2621 set_bit(MD_RECOVERY_NEEDED,
2622 &rdev->mddev->recovery);
2623 }
2624
2625 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2626 md_error(rdev->mddev, rdev);
2627 return 0;
2628}
2629
/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */

2638static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2639{
2640 int sect = 0;
2641 int sectors = r10_bio->sectors;
2642 struct md_rdev *rdev;
2643 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2644 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2645
	/* still own a reference to this rdev, so it cannot
	 * have been cleared recently.
	 */
2649 rdev = conf->mirrors[d].rdev;
2650
2651 if (test_bit(Faulty, &rdev->flags))
		/* drive has already been failed, just ignore any
		   more fix_read_error() attempts */
2654 return;
2655
2656 check_decay_read_errors(mddev, rdev);
2657 atomic_inc(&rdev->read_errors);
2658 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2659 char b[BDEVNAME_SIZE];
2660 bdevname(rdev->bdev, b);
2661
2662 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2663 mdname(mddev), b,
2664 atomic_read(&rdev->read_errors), max_read_errors);
2665 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2666 mdname(mddev), b);
2667 md_error(mddev, rdev);
2668 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2669 return;
2670 }
2671
2672 while(sectors) {
2673 int s = sectors;
2674 int sl = r10_bio->read_slot;
2675 int success = 0;
2676 int start;
2677
2678 if (s > (PAGE_SIZE>>9))
2679 s = PAGE_SIZE >> 9;
2680
2681 rcu_read_lock();
2682 do {
2683 sector_t first_bad;
2684 int bad_sectors;
2685
2686 d = r10_bio->devs[sl].devnum;
2687 rdev = rcu_dereference(conf->mirrors[d].rdev);
2688 if (rdev &&
2689 test_bit(In_sync, &rdev->flags) &&
2690 !test_bit(Faulty, &rdev->flags) &&
2691 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2692 &first_bad, &bad_sectors) == 0) {
2693 atomic_inc(&rdev->nr_pending);
2694 rcu_read_unlock();
2695 success = sync_page_io(rdev,
2696 r10_bio->devs[sl].addr +
2697 sect,
2698 s<<9,
2699 conf->tmppage,
2700 REQ_OP_READ, 0, false);
2701 rdev_dec_pending(rdev, mddev);
2702 rcu_read_lock();
2703 if (success)
2704 break;
2705 }
2706 sl++;
2707 if (sl == conf->copies)
2708 sl = 0;
2709 } while (!success && sl != r10_bio->read_slot);
2710 rcu_read_unlock();
2711
2712 if (!success) {
			/* Cannot read from anywhere, just mark the block
			 * as bad on the first device to discourage future
			 * reads.
			 */
2717 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2718 rdev = conf->mirrors[dn].rdev;
2719
2720 if (!rdev_set_badblocks(
2721 rdev,
2722 r10_bio->devs[r10_bio->read_slot].addr
2723 + sect,
2724 s, 0)) {
2725 md_error(mddev, rdev);
2726 r10_bio->devs[r10_bio->read_slot].bio
2727 = IO_BLOCKED;
2728 }
2729 break;
2730 }
2731
2732 start = sl;
		/* write it back and re-read */
2734 rcu_read_lock();
2735 while (sl != r10_bio->read_slot) {
2736 char b[BDEVNAME_SIZE];
2737
2738 if (sl==0)
2739 sl = conf->copies;
2740 sl--;
2741 d = r10_bio->devs[sl].devnum;
2742 rdev = rcu_dereference(conf->mirrors[d].rdev);
2743 if (!rdev ||
2744 test_bit(Faulty, &rdev->flags) ||
2745 !test_bit(In_sync, &rdev->flags))
2746 continue;
2747
2748 atomic_inc(&rdev->nr_pending);
2749 rcu_read_unlock();
2750 if (r10_sync_page_io(rdev,
2751 r10_bio->devs[sl].addr +
2752 sect,
2753 s, conf->tmppage, WRITE)
2754 == 0) {
				/* Well, this device is dead */
2756 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2757 mdname(mddev), s,
2758 (unsigned long long)(
2759 sect +
2760 choose_data_offset(r10_bio,
2761 rdev)),
2762 bdevname(rdev->bdev, b));
2763 pr_notice("md/raid10:%s: %s: failing drive\n",
2764 mdname(mddev),
2765 bdevname(rdev->bdev, b));
2766 }
2767 rdev_dec_pending(rdev, mddev);
2768 rcu_read_lock();
2769 }
2770 sl = start;
2771 while (sl != r10_bio->read_slot) {
2772 char b[BDEVNAME_SIZE];
2773
2774 if (sl==0)
2775 sl = conf->copies;
2776 sl--;
2777 d = r10_bio->devs[sl].devnum;
2778 rdev = rcu_dereference(conf->mirrors[d].rdev);
2779 if (!rdev ||
2780 test_bit(Faulty, &rdev->flags) ||
2781 !test_bit(In_sync, &rdev->flags))
2782 continue;
2783
2784 atomic_inc(&rdev->nr_pending);
2785 rcu_read_unlock();
2786 switch (r10_sync_page_io(rdev,
2787 r10_bio->devs[sl].addr +
2788 sect,
2789 s, conf->tmppage,
2790 READ)) {
2791 case 0:
				/* Well, this device is dead */
2793 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2794 mdname(mddev), s,
2795 (unsigned long long)(
2796 sect +
2797 choose_data_offset(r10_bio, rdev)),
2798 bdevname(rdev->bdev, b));
2799 pr_notice("md/raid10:%s: %s: failing drive\n",
2800 mdname(mddev),
2801 bdevname(rdev->bdev, b));
2802 break;
2803 case 1:
2804 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2805 mdname(mddev), s,
2806 (unsigned long long)(
2807 sect +
2808 choose_data_offset(r10_bio, rdev)),
2809 bdevname(rdev->bdev, b));
2810 atomic_add(s, &rdev->corrected_errors);
2811 }
2812
2813 rdev_dec_pending(rdev, mddev);
2814 rcu_read_lock();
2815 }
2816 rcu_read_unlock();
2817
2818 sectors -= s;
2819 sect += s;
2820 }
2821}
2822
2823static int narrow_write_error(struct r10bio *r10_bio, int i)
2824{
2825 struct bio *bio = r10_bio->master_bio;
2826 struct mddev *mddev = r10_bio->mddev;
2827 struct r10conf *conf = mddev->private;
2828 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2829
	/* bio has the data to be written to slot 'i' where
	 * we just recently had a write error.
	 * We repeatedly clone the bio and trim down to one block,
	 * then try the write.  Where the write fails we record
	 * a bad block.
	 * It is conceivable that the bio doesn't exactly align with
	 * blocks.  We must handle this.
	 *
	 * We currently own a reference to the rdev.
	 */
2840 int block_sectors;
2841 sector_t sector;
2842 int sectors;
2843 int sect_to_write = r10_bio->sectors;
2844 int ok = 1;
2845
2846 if (rdev->badblocks.shift < 0)
2847 return 0;
2848
2849 block_sectors = roundup(1 << rdev->badblocks.shift,
2850 bdev_logical_block_size(rdev->bdev) >> 9);
2851 sector = r10_bio->sector;
2852 sectors = ((r10_bio->sector + block_sectors)
2853 & ~(sector_t)(block_sectors - 1))
2854 - sector;
2855
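	/*
	 * Retry the write one block_sectors-sized piece at a time; any piece
	 * that still fails is recorded as a bad block on this device.
	 */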
2856 while (sect_to_write) {
2857 struct bio *wbio;
2858 sector_t wsector;
2859 if (sectors > sect_to_write)
2860 sectors = sect_to_write;
2861
2862 wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
2863 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2864 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2865 wbio->bi_iter.bi_sector = wsector +
2866 choose_data_offset(r10_bio, rdev);
2867 bio_set_dev(wbio, rdev->bdev);
2868 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2869
2870 if (submit_bio_wait(wbio) < 0)
			/* Failure! */
2872 ok = rdev_set_badblocks(rdev, wsector,
2873 sectors, 0)
2874 && ok;
2875
2876 bio_put(wbio);
2877 sect_to_write -= sectors;
2878 sector += sectors;
2879 sectors = block_sectors;
2880 }
2881 return ok;
2882}
2883
2884static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2885{
2886 int slot = r10_bio->read_slot;
2887 struct bio *bio;
2888 struct r10conf *conf = mddev->private;
2889 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2890
	/* we got a read error. Maybe the drive is bad.  Maybe just
	 * the block and we can fix it.
	 * We freeze all other IO, and try reading the block from
	 * other devices.  When we find one, we re-write
	 * and check that it fixes the read error.
	 * This is all done synchronously while the array is
	 * frozen.
	 */
2899 bio = r10_bio->devs[slot].bio;
2900 bio_put(bio);
2901 r10_bio->devs[slot].bio = NULL;
2902
2903 if (mddev->ro)
2904 r10_bio->devs[slot].bio = IO_BLOCKED;
2905 else if (!test_bit(FailFast, &rdev->flags)) {
2906 freeze_array(conf, 1);
2907 fix_read_error(conf, mddev, r10_bio);
2908 unfreeze_array(conf);
2909 } else
2910 md_error(mddev, rdev);
2911
2912 rdev_dec_pending(rdev, mddev);
2913 allow_barrier(conf);
2914 r10_bio->state = 0;
2915 raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2916}
2917
2918static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2919{
	/* Some sort of write request has finished and it
	 * succeeded in writing where we thought there was a
	 * bad block.  So forget the bad block.
	 * Or possibly it failed and we need to record
	 * a bad block.
	 */
2926 int m;
2927 struct md_rdev *rdev;
2928
2929 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2930 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2931 for (m = 0; m < conf->copies; m++) {
2932 int dev = r10_bio->devs[m].devnum;
2933 rdev = conf->mirrors[dev].rdev;
2934 if (r10_bio->devs[m].bio == NULL ||
2935 r10_bio->devs[m].bio->bi_end_io == NULL)
2936 continue;
2937 if (!r10_bio->devs[m].bio->bi_status) {
2938 rdev_clear_badblocks(
2939 rdev,
2940 r10_bio->devs[m].addr,
2941 r10_bio->sectors, 0);
2942 } else {
2943 if (!rdev_set_badblocks(
2944 rdev,
2945 r10_bio->devs[m].addr,
2946 r10_bio->sectors, 0))
2947 md_error(conf->mddev, rdev);
2948 }
2949 rdev = conf->mirrors[dev].replacement;
2950 if (r10_bio->devs[m].repl_bio == NULL ||
2951 r10_bio->devs[m].repl_bio->bi_end_io == NULL)
2952 continue;
2953
2954 if (!r10_bio->devs[m].repl_bio->bi_status) {
2955 rdev_clear_badblocks(
2956 rdev,
2957 r10_bio->devs[m].addr,
2958 r10_bio->sectors, 0);
2959 } else {
2960 if (!rdev_set_badblocks(
2961 rdev,
2962 r10_bio->devs[m].addr,
2963 r10_bio->sectors, 0))
2964 md_error(conf->mddev, rdev);
2965 }
2966 }
2967 put_buf(r10_bio);
2968 } else {
2969 bool fail = false;
2970 for (m = 0; m < conf->copies; m++) {
2971 int dev = r10_bio->devs[m].devnum;
2972 struct bio *bio = r10_bio->devs[m].bio;
2973 rdev = conf->mirrors[dev].rdev;
2974 if (bio == IO_MADE_GOOD) {
2975 rdev_clear_badblocks(
2976 rdev,
2977 r10_bio->devs[m].addr,
2978 r10_bio->sectors, 0);
2979 rdev_dec_pending(rdev, conf->mddev);
2980 } else if (bio != NULL && bio->bi_status) {
2981 fail = true;
2982 if (!narrow_write_error(r10_bio, m)) {
2983 md_error(conf->mddev, rdev);
2984 set_bit(R10BIO_Degraded,
2985 &r10_bio->state);
2986 }
2987 rdev_dec_pending(rdev, conf->mddev);
2988 }
2989 bio = r10_bio->devs[m].repl_bio;
2990 rdev = conf->mirrors[dev].replacement;
2991 if (rdev && bio == IO_MADE_GOOD) {
2992 rdev_clear_badblocks(
2993 rdev,
2994 r10_bio->devs[m].addr,
2995 r10_bio->sectors, 0);
2996 rdev_dec_pending(rdev, conf->mddev);
2997 }
2998 }
2999 if (fail) {
3000 spin_lock_irq(&conf->device_lock);
3001 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
3002 conf->nr_queued++;
3003 spin_unlock_irq(&conf->device_lock);
			/*
			 * In case freeze_array() is waiting for condition
			 * nr_pending == nr_queued + extra to be true.
			 */
3008 wake_up(&conf->wait_barrier);
3009 md_wakeup_thread(conf->mddev->thread);
3010 } else {
3011 if (test_bit(R10BIO_WriteError,
3012 &r10_bio->state))
3013 close_write(r10_bio);
3014 raid_end_bio_io(r10_bio);
3015 }
3016 }
3017}
3018
3019static void raid10d(struct md_thread *thread)
3020{
3021 struct mddev *mddev = thread->mddev;
3022 struct r10bio *r10_bio;
3023 unsigned long flags;
3024 struct r10conf *conf = mddev->private;
3025 struct list_head *head = &conf->retry_list;
3026 struct blk_plug plug;
3027
3028 md_check_recovery(mddev);
3029
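	/*
	 * Complete writes that were parked on bio_end_io_list while a
	 * superblock update (MD_SB_CHANGE_PENDING) was in flight.
	 */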
3030 if (!list_empty_careful(&conf->bio_end_io_list) &&
3031 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
3032 LIST_HEAD(tmp);
3033 spin_lock_irqsave(&conf->device_lock, flags);
3034 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
3035 while (!list_empty(&conf->bio_end_io_list)) {
3036 list_move(conf->bio_end_io_list.prev, &tmp);
3037 conf->nr_queued--;
3038 }
3039 }
3040 spin_unlock_irqrestore(&conf->device_lock, flags);
3041 while (!list_empty(&tmp)) {
3042 r10_bio = list_first_entry(&tmp, struct r10bio,
3043 retry_list);
3044 list_del(&r10_bio->retry_list);
3045 if (mddev->degraded)
3046 set_bit(R10BIO_Degraded, &r10_bio->state);
3047
3048 if (test_bit(R10BIO_WriteError,
3049 &r10_bio->state))
3050 close_write(r10_bio);
3051 raid_end_bio_io(r10_bio);
3052 }
3053 }
3054
3055 blk_start_plug(&plug);
3056 for (;;) {
3057
3058 flush_pending_writes(conf);
3059
3060 spin_lock_irqsave(&conf->device_lock, flags);
3061 if (list_empty(head)) {
3062 spin_unlock_irqrestore(&conf->device_lock, flags);
3063 break;
3064 }
3065 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
3066 list_del(head->prev);
3067 conf->nr_queued--;
3068 spin_unlock_irqrestore(&conf->device_lock, flags);
3069
3070 mddev = r10_bio->mddev;
3071 conf = mddev->private;
3072 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
3073 test_bit(R10BIO_WriteError, &r10_bio->state))
3074 handle_write_completed(conf, r10_bio);
3075 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
3076 reshape_request_write(mddev, r10_bio);
3077 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
3078 sync_request_write(mddev, r10_bio);
3079 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
3080 recovery_request_write(mddev, r10_bio);
3081 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
3082 handle_read_error(mddev, r10_bio);
3083 else
3084 WARN_ON_ONCE(1);
3085
3086 cond_resched();
3087 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
3088 md_check_recovery(mddev);
3089 }
3090 blk_finish_plug(&plug);
3091}
3092
3093static int init_resync(struct r10conf *conf)
3094{
3095 int ret, buffs, i;
3096
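	/* enough resync buffers to cover one RESYNC_WINDOW of data */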
3097 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
3098 BUG_ON(mempool_initialized(&conf->r10buf_pool));
3099 conf->have_replacement = 0;
3100 for (i = 0; i < conf->geo.raid_disks; i++)
3101 if (conf->mirrors[i].replacement)
3102 conf->have_replacement = 1;
3103 ret = mempool_init(&conf->r10buf_pool, buffs,
3104 r10buf_pool_alloc, r10buf_pool_free, conf);
3105 if (ret)
3106 return ret;
3107 conf->next_resync = 0;
3108 return 0;
3109}
3110
3111static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
3112{
3113 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
	struct resync_pages *rp;
3115 struct bio *bio;
3116 int nalloc;
3117 int i;
3118
3119 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
3120 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
3121 nalloc = conf->copies;
3122 else
3123 nalloc = 2;
3124
3125 for (i = 0; i < nalloc; i++) {
3126 bio = r10bio->devs[i].bio;
3127 rp = bio->bi_private;
3128 bio_reset(bio);
3129 bio->bi_private = rp;
3130 bio = r10bio->devs[i].repl_bio;
3131 if (bio) {
3132 rp = bio->bi_private;
3133 bio_reset(bio);
3134 bio->bi_private = rp;
3135 }
3136 }
3137 return r10bio;
3138}
3139
/*
 * Set cluster_sync_high since we need other nodes to add the
 * range [cluster_sync_low, cluster_sync_high] to the suspend list.
 */
3144static void raid10_set_cluster_sync_high(struct r10conf *conf)
3145{
3146 sector_t window_size;
3147 int extra_chunk, chunks;
3148
	/*
	 * The window must cover a whole number of stripes: one stripe
	 * spans raid_disks/near_copies chunks of virtual address space,
	 * plus one extra chunk when raid_disks is not an exact multiple
	 * of near_copies, so that [cluster_sync_low, cluster_sync_high]
	 * always maps to whole chunks on every member device.
	 */
3161 chunks = conf->geo.raid_disks / conf->geo.near_copies;
3162 if (conf->geo.raid_disks % conf->geo.near_copies == 0)
3163 extra_chunk = 0;
3164 else
3165 extra_chunk = 1;
3166 window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
3167
	/*
	 * At least use a 32M window to align with raid1's resync window
	 */
3171 window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
3172 CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
3173
3174 conf->cluster_sync_high = conf->cluster_sync_low + window_size;
3175}
3176
/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
 * For recovery, we iterate over physical addresses, read a good
 * value for each non-in_sync drive, and over-write.
 *
 * So, for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by allocating
 * a number of r10_bio structures, one for each out-of-sync device.
 * As we setup these structures, we collect all bio's together into a list
 * which we then process collectively to add pages, and then process again
 * to pass to submit_bio_noacct.
 *
 * The r10_bio structures are linked using a borrowed master_bio pointer.
 * This link is counted in ->remaining.  When the r10_bio that points to NULL
 * has its remaining count decremented to 0, the whole complex operation
 * is complete.
 *
 */
3209static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
3210 int *skipped)
3211{
3212 struct r10conf *conf = mddev->private;
3213 struct r10bio *r10_bio;
3214 struct bio *biolist = NULL, *bio;
3215 sector_t max_sector, nr_sectors;
3216 int i;
3217 int max_sync;
3218 sector_t sync_blocks;
3219 sector_t sectors_skipped = 0;
3220 int chunks_skipped = 0;
3221 sector_t chunk_mask = conf->geo.chunk_mask;
3222 int page_idx = 0;
3223
3224 if (!mempool_initialized(&conf->r10buf_pool))
3225 if (init_resync(conf))
3226 return 0;
3227
	/*
	 * Allow skipping a full rebuild for incremental assembly
	 * of a clean array, like RAID1 does.
	 */
3232 if (mddev->bitmap == NULL &&
3233 mddev->recovery_cp == MaxSector &&
3234 mddev->reshape_position == MaxSector &&
3235 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
3236 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
3237 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
3238 conf->fullsync == 0) {
3239 *skipped = 1;
3240 return mddev->dev_sectors - sector_nr;
3241 }
3242
3243 skipped:
3244 max_sector = mddev->dev_sectors;
3245 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
3246 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3247 max_sector = mddev->resync_max_sectors;
3248 if (sector_nr >= max_sector) {
3249 conf->cluster_sync_low = 0;
3250 conf->cluster_sync_high = 0;
		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunks (there can
		 * be several when recovering multiple devices),
		 * as we may have started syncing them but not finished.
		 * We can find the current address in
		 * mddev->curr_resync, but for recovery
		 * we need to convert that to several
		 * virtual addresses.
		 */
3261 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
3262 end_reshape(conf);
3263 close_sync(conf);
3264 return 0;
3265 }
3266
3267 if (mddev->curr_resync < max_sector) {
3268 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3269 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
3270 &sync_blocks, 1);
3271 else for (i = 0; i < conf->geo.raid_disks; i++) {
3272 sector_t sect =
3273 raid10_find_virt(conf, mddev->curr_resync, i);
3274 md_bitmap_end_sync(mddev->bitmap, sect,
3275 &sync_blocks, 1);
3276 }
3277 } else {
			/* completed sync */
3279 if ((!mddev->bitmap || conf->fullsync)
3280 && conf->have_replacement
3281 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
				/* Completed a full sync so the replacements
				 * are now fully recovered.
				 */
3285 rcu_read_lock();
3286 for (i = 0; i < conf->geo.raid_disks; i++) {
3287 struct md_rdev *rdev =
3288 rcu_dereference(conf->mirrors[i].replacement);
3289 if (rdev)
3290 rdev->recovery_offset = MaxSector;
3291 }
3292 rcu_read_unlock();
3293 }
3294 conf->fullsync = 0;
3295 }
3296 md_bitmap_close_sync(mddev->bitmap);
3297 close_sync(conf);
3298 *skipped = 1;
3299 return sectors_skipped;
3300 }
3301
3302 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3303 return reshape_request(mddev, sector_nr, skipped);
3304
3305 if (chunks_skipped >= conf->geo.raid_disks) {
		/* if there has been nothing to do on any drive,
		 * then there is nothing to do at all.
		 */
3309 *skipped = 1;
3310 return (max_sector - sector_nr) + sectors_skipped;
3311 }
3312
3313 if (max_sector > mddev->resync_max)
3314 max_sector = mddev->resync_max;
3315
	/* make sure whole request will fit in a chunk - if chunks
	 * are meaningful
	 */
3319 if (conf->geo.near_copies < conf->geo.raid_disks &&
3320 max_sector > (sector_nr | chunk_mask))
3321 max_sector = (sector_nr | chunk_mask) + 1;
3322
	/*
	 * If there is non-resync activity waiting for a turn, then let it
	 * through before starting on this new sync request.
	 */
3327 if (conf->nr_waiting)
3328 schedule_timeout_uninterruptible(1);
3329
	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
	 * have bi_end_io, bi_sector, bi_bdev set,
	 * and bi_private set to the r10bio.
	 * For recovery, we may actually create several r10bios
	 * with 2 bios in each, that correspond to the bios in the main one.
	 * In this case, the subordinate r10bios link back through a
	 * borrowed master_bio pointer, and the counter in the master
	 * includes a ref from each subordinate.
	 */
	/* First, we decide what to do and set ->bi_end_io
	 * to end_sync_read if we want to read, and
	 * end_sync_write if we will want to write.
	 */

3345 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
3346 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* recovery... the complicated one */
3348 int j;
3349 r10_bio = NULL;
3350
3351 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3352 int still_degraded;
3353 struct r10bio *rb2;
3354 sector_t sect;
3355 int must_sync;
3356 int any_working;
3357 int need_recover = 0;
3358 int need_replace = 0;
3359 struct raid10_info *mirror = &conf->mirrors[i];
3360 struct md_rdev *mrdev, *mreplace;
3361
3362 rcu_read_lock();
3363 mrdev = rcu_dereference(mirror->rdev);
3364 mreplace = rcu_dereference(mirror->replacement);
3365
3366 if (mrdev != NULL &&
3367 !test_bit(Faulty, &mrdev->flags) &&
3368 !test_bit(In_sync, &mrdev->flags))
3369 need_recover = 1;
3370 if (mreplace != NULL &&
3371 !test_bit(Faulty, &mreplace->flags))
3372 need_replace = 1;
3373
3374 if (!need_recover && !need_replace) {
3375 rcu_read_unlock();
3376 continue;
3377 }
3378
3379 still_degraded = 0;
3380
3381 rb2 = r10_bio;
3382 sect = raid10_find_virt(conf, sector_nr, i);
3383 if (sect >= mddev->resync_max_sectors) {
3384
3385
3386
3387 rcu_read_unlock();
3388 continue;
3389 }
3390 if (mreplace && test_bit(Faulty, &mreplace->flags))
3391 mreplace = NULL;
3392
3393
3394
3395
3396 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3397 &sync_blocks, 1);
3398 if (sync_blocks < max_sync)
3399 max_sync = sync_blocks;
3400 if (!must_sync &&
3401 mreplace == NULL &&
3402 !conf->fullsync) {
3403
3404
3405
3406 chunks_skipped = -1;
3407 rcu_read_unlock();
3408 continue;
3409 }
3410 atomic_inc(&mrdev->nr_pending);
3411 if (mreplace)
3412 atomic_inc(&mreplace->nr_pending);
3413 rcu_read_unlock();
3414
3415 r10_bio = raid10_alloc_init_r10buf(conf);
3416 r10_bio->state = 0;
3417 raise_barrier(conf, rb2 != NULL);
3418 atomic_set(&r10_bio->remaining, 0);
3419
3420 r10_bio->master_bio = (struct bio*)rb2;
3421 if (rb2)
3422 atomic_inc(&rb2->remaining);
3423 r10_bio->mddev = mddev;
3424 set_bit(R10BIO_IsRecover, &r10_bio->state);
3425 r10_bio->sector = sect;
3426
3427 raid10_find_phys(conf, r10_bio);
3428
3429
3430
3431
3432 rcu_read_lock();
3433 for (j = 0; j < conf->geo.raid_disks; j++) {
3434 struct md_rdev *rdev = rcu_dereference(
3435 conf->mirrors[j].rdev);
3436 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3437 still_degraded = 1;
3438 break;
3439 }
3440 }
3441
3442 must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
3443 &sync_blocks, still_degraded);
3444
3445 any_working = 0;
3446 for (j=0; j<conf->copies;j++) {
3447 int k;
3448 int d = r10_bio->devs[j].devnum;
3449 sector_t from_addr, to_addr;
3450 struct md_rdev *rdev =
3451 rcu_dereference(conf->mirrors[d].rdev);
3452 sector_t sector, first_bad;
3453 int bad_sectors;
3454 if (!rdev ||
3455 !test_bit(In_sync, &rdev->flags))
3456 continue;
3457
3458 any_working = 1;
3459 sector = r10_bio->devs[j].addr;
3460
3461 if (is_badblock(rdev, sector, max_sync,
3462 &first_bad, &bad_sectors)) {
3463 if (first_bad > sector)
3464 max_sync = first_bad - sector;
3465 else {
3466 bad_sectors -= (sector
3467 - first_bad);
3468 if (max_sync > bad_sectors)
3469 max_sync = bad_sectors;
3470 continue;
3471 }
3472 }
3473 bio = r10_bio->devs[0].bio;
3474 bio->bi_next = biolist;
3475 biolist = bio;
3476 bio->bi_end_io = end_sync_read;
3477 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3478 if (test_bit(FailFast, &rdev->flags))
3479 bio->bi_opf |= MD_FAILFAST;
3480 from_addr = r10_bio->devs[j].addr;
3481 bio->bi_iter.bi_sector = from_addr +
3482 rdev->data_offset;
3483 bio_set_dev(bio, rdev->bdev);
3484 atomic_inc(&rdev->nr_pending);
3485
3486
3487 for (k=0; k<conf->copies; k++)
3488 if (r10_bio->devs[k].devnum == i)
3489 break;
3490 BUG_ON(k == conf->copies);
3491 to_addr = r10_bio->devs[k].addr;
3492 r10_bio->devs[0].devnum = d;
3493 r10_bio->devs[0].addr = from_addr;
3494 r10_bio->devs[1].devnum = i;
3495 r10_bio->devs[1].addr = to_addr;
3496
3497 if (need_recover) {
3498 bio = r10_bio->devs[1].bio;
3499 bio->bi_next = biolist;
3500 biolist = bio;
3501 bio->bi_end_io = end_sync_write;
3502 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3503 bio->bi_iter.bi_sector = to_addr
3504 + mrdev->data_offset;
3505 bio_set_dev(bio, mrdev->bdev);
3506 atomic_inc(&r10_bio->remaining);
3507 } else
3508 r10_bio->devs[1].bio->bi_end_io = NULL;
3509
3510
3511 bio = r10_bio->devs[1].repl_bio;
3512 if (bio)
3513 bio->bi_end_io = NULL;
3514
3515
3516
3517
3518 if (!need_replace)
3519 break;
3520 bio->bi_next = biolist;
3521 biolist = bio;
3522 bio->bi_end_io = end_sync_write;
3523 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3524 bio->bi_iter.bi_sector = to_addr +
3525 mreplace->data_offset;
3526 bio_set_dev(bio, mreplace->bdev);
3527 atomic_inc(&r10_bio->remaining);
3528 break;
3529 }
3530 rcu_read_unlock();
3531 if (j == conf->copies) {
3532
3533
3534 if (any_working) {
3535
3536
3537
3538 int k;
3539 for (k = 0; k < conf->copies; k++)
3540 if (r10_bio->devs[k].devnum == i)
3541 break;
3542 if (!test_bit(In_sync,
3543 &mrdev->flags)
3544 && !rdev_set_badblocks(
3545 mrdev,
3546 r10_bio->devs[k].addr,
3547 max_sync, 0))
3548 any_working = 0;
3549 if (mreplace &&
3550 !rdev_set_badblocks(
3551 mreplace,
3552 r10_bio->devs[k].addr,
3553 max_sync, 0))
3554 any_working = 0;
3555 }
3556 if (!any_working) {
3557 if (!test_and_set_bit(MD_RECOVERY_INTR,
3558 &mddev->recovery))
3559 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3560 mdname(mddev));
3561 mirror->recovery_disabled
3562 = mddev->recovery_disabled;
3563 }
3564 put_buf(r10_bio);
3565 if (rb2)
3566 atomic_dec(&rb2->remaining);
3567 r10_bio = rb2;
3568 rdev_dec_pending(mrdev, mddev);
3569 if (mreplace)
3570 rdev_dec_pending(mreplace, mddev);
3571 break;
3572 }
3573 rdev_dec_pending(mrdev, mddev);
3574 if (mreplace)
3575 rdev_dec_pending(mreplace, mddev);
3576 if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
3577
3578
3579
3580
3581 int targets = 1;
3582 for (; j < conf->copies; j++) {
3583 int d = r10_bio->devs[j].devnum;
3584 if (conf->mirrors[d].rdev &&
3585 test_bit(In_sync,
3586 &conf->mirrors[d].rdev->flags))
3587 targets++;
3588 }
3589 if (targets == 1)
3590 r10_bio->devs[0].bio->bi_opf
3591 &= ~MD_FAILFAST;
3592 }
3593 }
3594 if (biolist == NULL) {
3595 while (r10_bio) {
3596 struct r10bio *rb2 = r10_bio;
3597 r10_bio = (struct r10bio*) rb2->master_bio;
3598 rb2->master_bio = NULL;
3599 put_buf(rb2);
3600 }
3601 goto giveup;
3602 }
3603 } else {
		/* resync. Schedule a read for every block at this virt offset */
3605 int count = 0;
3606
3607
3608
3609
3610
3611
3612
3613
3614 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
3615 mddev_is_clustered(mddev) &&
3616 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
3617
3618 if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
3619 &sync_blocks, mddev->degraded) &&
3620 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3621 &mddev->recovery)) {
3622
3623 *skipped = 1;
3624 return sync_blocks + sectors_skipped;
3625 }
3626 if (sync_blocks < max_sync)
3627 max_sync = sync_blocks;
3628 r10_bio = raid10_alloc_init_r10buf(conf);
3629 r10_bio->state = 0;
3630
3631 r10_bio->mddev = mddev;
3632 atomic_set(&r10_bio->remaining, 0);
3633 raise_barrier(conf, 0);
3634 conf->next_resync = sector_nr;
3635
3636 r10_bio->master_bio = NULL;
3637 r10_bio->sector = sector_nr;
3638 set_bit(R10BIO_IsSync, &r10_bio->state);
3639 raid10_find_phys(conf, r10_bio);
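		/* resync runs to the end of the chunk containing sector_nr */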
3640 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3641
3642 for (i = 0; i < conf->copies; i++) {
3643 int d = r10_bio->devs[i].devnum;
3644 sector_t first_bad, sector;
3645 int bad_sectors;
3646 struct md_rdev *rdev;
3647
3648 if (r10_bio->devs[i].repl_bio)
3649 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3650
3651 bio = r10_bio->devs[i].bio;
3652 bio->bi_status = BLK_STS_IOERR;
3653 rcu_read_lock();
3654 rdev = rcu_dereference(conf->mirrors[d].rdev);
3655 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3656 rcu_read_unlock();
3657 continue;
3658 }
3659 sector = r10_bio->devs[i].addr;
3660 if (is_badblock(rdev, sector, max_sync,
3661 &first_bad, &bad_sectors)) {
3662 if (first_bad > sector)
3663 max_sync = first_bad - sector;
3664 else {
3665 bad_sectors -= (sector - first_bad);
3666 if (max_sync > bad_sectors)
3667 max_sync = bad_sectors;
3668 rcu_read_unlock();
3669 continue;
3670 }
3671 }
3672 atomic_inc(&rdev->nr_pending);
3673 atomic_inc(&r10_bio->remaining);
3674 bio->bi_next = biolist;
3675 biolist = bio;
3676 bio->bi_end_io = end_sync_read;
3677 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3678 if (test_bit(FailFast, &rdev->flags))
3679 bio->bi_opf |= MD_FAILFAST;
3680 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3681 bio_set_dev(bio, rdev->bdev);
3682 count++;
3683
3684 rdev = rcu_dereference(conf->mirrors[d].replacement);
3685 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3686 rcu_read_unlock();
3687 continue;
3688 }
3689 atomic_inc(&rdev->nr_pending);
3690
3691
3692 bio = r10_bio->devs[i].repl_bio;
3693 bio->bi_status = BLK_STS_IOERR;
3694
3695 sector = r10_bio->devs[i].addr;
3696 bio->bi_next = biolist;
3697 biolist = bio;
3698 bio->bi_end_io = end_sync_write;
3699 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3700 if (test_bit(FailFast, &rdev->flags))
3701 bio->bi_opf |= MD_FAILFAST;
3702 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3703 bio_set_dev(bio, rdev->bdev);
3704 count++;
3705 rcu_read_unlock();
3706 }
3707
3708 if (count < 2) {
3709 for (i=0; i<conf->copies; i++) {
3710 int d = r10_bio->devs[i].devnum;
3711 if (r10_bio->devs[i].bio->bi_end_io)
3712 rdev_dec_pending(conf->mirrors[d].rdev,
3713 mddev);
3714 if (r10_bio->devs[i].repl_bio &&
3715 r10_bio->devs[i].repl_bio->bi_end_io)
3716 rdev_dec_pending(
3717 conf->mirrors[d].replacement,
3718 mddev);
3719 }
3720 put_buf(r10_bio);
3721 biolist = NULL;
3722 goto giveup;
3723 }
3724 }
3725
3726 nr_sectors = 0;
3727 if (sector_nr + max_sync < max_sector)
3728 max_sector = sector_nr + max_sync;
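	/* add pages to every pending bio, one page per iteration, until we
	 * reach max_sector or run out of RESYNC_PAGES */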
3729 do {
3730 struct page *page;
3731 int len = PAGE_SIZE;
3732 if (sector_nr + (len>>9) > max_sector)
3733 len = (max_sector - sector_nr) << 9;
3734 if (len == 0)
3735 break;
3736 for (bio= biolist ; bio ; bio=bio->bi_next) {
3737 struct resync_pages *rp = get_resync_pages(bio);
3738 page = resync_fetch_page(rp, page_idx);
3739
3740
3741
3742
3743 bio_add_page(bio, page, len, 0);
3744 }
3745 nr_sectors += len>>9;
3746 sector_nr += len>>9;
3747 } while (++page_idx < RESYNC_PAGES);
3748 r10_bio->sectors = nr_sectors;
3749
3750 if (mddev_is_clustered(mddev) &&
3751 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3752
3753 if (conf->cluster_sync_high < sector_nr + nr_sectors) {
3754 conf->cluster_sync_low = mddev->curr_resync_completed;
3755 raid10_set_cluster_sync_high(conf);
3756
3757 md_cluster_ops->resync_info_update(mddev,
3758 conf->cluster_sync_low,
3759 conf->cluster_sync_high);
3760 }
3761 } else if (mddev_is_clustered(mddev)) {
3762
3763 sector_t sect_va1, sect_va2;
3764 bool broadcast_msg = false;
3765
3766 for (i = 0; i < conf->geo.raid_disks; i++) {
3767
3768
3769
3770
3771
3772 sect_va1 = raid10_find_virt(conf, sector_nr, i);
3773
3774 if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
3775 broadcast_msg = true;
3776
3777
3778
3779
3780 sect_va2 = raid10_find_virt(conf,
3781 mddev->curr_resync_completed, i);
3782
3783 if (conf->cluster_sync_low == 0 ||
3784 conf->cluster_sync_low > sect_va2)
3785 conf->cluster_sync_low = sect_va2;
3786 }
3787 }
3788 if (broadcast_msg) {
3789 raid10_set_cluster_sync_high(conf);
3790 md_cluster_ops->resync_info_update(mddev,
3791 conf->cluster_sync_low,
3792 conf->cluster_sync_high);
3793 }
3794 }
3795
3796 while (biolist) {
3797 bio = biolist;
3798 biolist = biolist->bi_next;
3799
3800 bio->bi_next = NULL;
3801 r10_bio = get_resync_r10bio(bio);
3802 r10_bio->sectors = nr_sectors;
3803
3804 if (bio->bi_end_io == end_sync_read) {
3805 md_sync_acct_bio(bio, nr_sectors);
3806 bio->bi_status = 0;
3807 submit_bio_noacct(bio);
3808 }
3809 }
3810
3811 if (sectors_skipped)
3812
3813
3814
3815 md_done_sync(mddev, sectors_skipped, 1);
3816
3817 return sectors_skipped + nr_sectors;
3818 giveup:
3819
3820
3821
3822
3823 if (sector_nr + max_sync < max_sector)
3824 max_sector = sector_nr + max_sync;
3825
3826 sectors_skipped += (max_sector - sector_nr);
3827 chunks_skipped ++;
3828 sector_nr = max_sector;
3829 goto skipped;
3830}
3831
3832static sector_t
3833raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3834{
3835 sector_t size;
3836 struct r10conf *conf = mddev->private;
3837
3838 if (!raid_disks)
3839 raid_disks = min(conf->geo.raid_disks,
3840 conf->prev.raid_disks);
3841 if (!sectors)
3842 sectors = conf->dev_sectors;
3843
3844 size = sectors >> conf->geo.chunk_shift;
3845 sector_div(size, conf->geo.far_copies);
3846 size = size * raid_disks;
3847 sector_div(size, conf->geo.near_copies);
3848
3849 return size << conf->geo.chunk_shift;
3850}
3851
3852static void calc_sectors(struct r10conf *conf, sector_t size)
3853{
	/* Calculate the number of sectors-per-device that will
	 * actually be used, and set conf->dev_sectors and
	 * conf->stride
	 */

3859 size = size >> conf->geo.chunk_shift;
3860 sector_div(size, conf->geo.far_copies);
3861 size = size * conf->geo.raid_disks;
3862 sector_div(size, conf->geo.near_copies);
	/* 'size' is now the number of chunks in the array */
	/* calculate "used chunks per device" */
3865 size = size * conf->copies;
3866
	/* We need to round up when dividing by raid_disks to
	 * get the stride size.
	 */
3870 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3871
3872 conf->dev_sectors = size << conf->geo.chunk_shift;
3873
3874 if (conf->geo.far_offset)
3875 conf->geo.stride = 1 << conf->geo.chunk_shift;
3876 else {
3877 sector_div(size, conf->geo.far_copies);
3878 conf->geo.stride = size << conf->geo.chunk_shift;
3879 }
3880}
3881
3882enum geo_type {geo_new, geo_old, geo_start};
3883static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3884{
3885 int nc, fc, fo;
3886 int layout, chunk, disks;
3887 switch (new) {
3888 case geo_old:
3889 layout = mddev->layout;
3890 chunk = mddev->chunk_sectors;
3891 disks = mddev->raid_disks - mddev->delta_disks;
3892 break;
3893 case geo_new:
3894 layout = mddev->new_layout;
3895 chunk = mddev->new_chunk_sectors;
3896 disks = mddev->raid_disks;
3897 break;
3898 default:
3899 case geo_start:
3900
3901 layout = mddev->new_layout;
3902 chunk = mddev->new_chunk_sectors;
3903 disks = mddev->raid_disks + mddev->delta_disks;
3904 break;
3905 }
3906 if (layout >> 19)
3907 return -1;
3908 if (chunk < (PAGE_SIZE >> 9) ||
3909 !is_power_of_2(chunk))
3910 return -2;
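	/* layout: bits 0-7 near copies, bits 8-15 far copies, bit 16 'far
	 * offset', bits 17-18 select how the far sets are arranged */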
3911 nc = layout & 255;
3912 fc = (layout >> 8) & 255;
3913 fo = layout & (1<<16);
3914 geo->raid_disks = disks;
3915 geo->near_copies = nc;
3916 geo->far_copies = fc;
3917 geo->far_offset = fo;
3918 switch (layout >> 17) {
3919 case 0:
3920 geo->far_set_size = disks;
3921 break;
3922 case 1:
3923
3924 geo->far_set_size = disks/fc;
3925 WARN(geo->far_set_size < fc,
3926 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3927 break;
3928 case 2:
3929 geo->far_set_size = fc * nc;
3930 break;
3931 default:
3932 return -1;
3933 }
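	/* chunk is a power of two, so ffz(~chunk) is log2(chunk) */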
3934 geo->chunk_mask = chunk - 1;
3935 geo->chunk_shift = ffz(~chunk);
3936 return nc*fc;
3937}
3938
3939static struct r10conf *setup_conf(struct mddev *mddev)
3940{
3941 struct r10conf *conf = NULL;
3942 int err = -EINVAL;
3943 struct geom geo;
3944 int copies;
3945
3946 copies = setup_geo(&geo, mddev, geo_new);
3947
3948 if (copies == -2) {
3949 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3950 mdname(mddev), PAGE_SIZE);
3951 goto out;
3952 }
3953
3954 if (copies < 2 || copies > mddev->raid_disks) {
3955 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3956 mdname(mddev), mddev->new_layout);
3957 goto out;
3958 }
3959
3960 err = -ENOMEM;
3961 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3962 if (!conf)
3963 goto out;
3964
3965
3966 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
3967 sizeof(struct raid10_info),
3968 GFP_KERNEL);
3969 if (!conf->mirrors)
3970 goto out;
3971
3972 conf->tmppage = alloc_page(GFP_KERNEL);
3973 if (!conf->tmppage)
3974 goto out;
3975
3976 conf->geo = geo;
3977 conf->copies = copies;
3978 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
3979 rbio_pool_free, conf);
3980 if (err)
3981 goto out;
3982
3983 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
3984 if (err)
3985 goto out;
3986
3987 calc_sectors(conf, mddev->dev_sectors);
3988 if (mddev->reshape_position == MaxSector) {
3989 conf->prev = conf->geo;
3990 conf->reshape_progress = MaxSector;
3991 } else {
3992 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3993 err = -EINVAL;
3994 goto out;
3995 }
3996 conf->reshape_progress = mddev->reshape_position;
3997 if (conf->prev.far_offset)
3998 conf->prev.stride = 1 << conf->prev.chunk_shift;
3999 else
4000
4001 conf->prev.stride = conf->dev_sectors;
4002 }
4003 conf->reshape_safe = conf->reshape_progress;
4004 spin_lock_init(&conf->device_lock);
4005 INIT_LIST_HEAD(&conf->retry_list);
4006 INIT_LIST_HEAD(&conf->bio_end_io_list);
4007
4008 spin_lock_init(&conf->resync_lock);
4009 init_waitqueue_head(&conf->wait_barrier);
4010 atomic_set(&conf->nr_pending, 0);
4011
4012 err = -ENOMEM;
4013 conf->thread = md_register_thread(raid10d, mddev, "raid10");
4014 if (!conf->thread)
4015 goto out;
4016
4017 conf->mddev = mddev;
4018 return conf;
4019
4020 out:
4021 if (conf) {
4022 mempool_exit(&conf->r10bio_pool);
4023 kfree(conf->mirrors);
4024 safe_put_page(conf->tmppage);
4025 bioset_exit(&conf->bio_split);
4026 kfree(conf);
4027 }
4028 return ERR_PTR(err);
4029}
4030
4031static void raid10_set_io_opt(struct r10conf *conf)
4032{
4033 int raid_disks = conf->geo.raid_disks;
4034
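	/* expose one full stripe (chunk size times the data disks in a
	 * near-copy set) as the optimal I/O size */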
4035 if (!(conf->geo.raid_disks % conf->geo.near_copies))
4036 raid_disks /= conf->geo.near_copies;
4037 blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
4038 raid_disks);
4039}
4040
4041static int raid10_run(struct mddev *mddev)
4042{
4043 struct r10conf *conf;
4044 int i, disk_idx;
4045 struct raid10_info *disk;
4046 struct md_rdev *rdev;
4047 sector_t size;
4048 sector_t min_offset_diff = 0;
4049 int first = 1;
4050 bool discard_supported = false;
4051
4052 if (mddev_init_writes_pending(mddev) < 0)
4053 return -ENOMEM;
4054
4055 if (mddev->private == NULL) {
4056 conf = setup_conf(mddev);
4057 if (IS_ERR(conf))
4058 return PTR_ERR(conf);
4059 mddev->private = conf;
4060 }
4061 conf = mddev->private;
4062 if (!conf)
4063 goto out;
4064
4065 if (mddev_is_clustered(conf->mddev)) {
4066 int fc, fo;
4067
4068 fc = (mddev->layout >> 8) & 255;
4069 fo = mddev->layout & (1<<16);
4070 if (fc > 1 || fo > 0) {
			pr_err("only near layout is supported by clustered raid10\n");
4073 goto out_free_conf;
4074 }
4075 }
4076
4077 mddev->thread = conf->thread;
4078 conf->thread = NULL;
4079
4080 if (mddev->queue) {
4081 blk_queue_max_discard_sectors(mddev->queue,
4082 UINT_MAX);
4083 blk_queue_max_write_same_sectors(mddev->queue, 0);
4084 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
4085 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
4086 raid10_set_io_opt(conf);
4087 }
4088
4089 rdev_for_each(rdev, mddev) {
4090 long long diff;
4091
4092 disk_idx = rdev->raid_disk;
4093 if (disk_idx < 0)
4094 continue;
4095 if (disk_idx >= conf->geo.raid_disks &&
4096 disk_idx >= conf->prev.raid_disks)
4097 continue;
4098 disk = conf->mirrors + disk_idx;
4099
4100 if (test_bit(Replacement, &rdev->flags)) {
4101 if (disk->replacement)
4102 goto out_free_conf;
4103 disk->replacement = rdev;
4104 } else {
4105 if (disk->rdev)
4106 goto out_free_conf;
4107 disk->rdev = rdev;
4108 }
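		/* track the smallest gap between old and new data_offset, in
		 * the direction this reshape moves; an in-progress reshape
		 * needs at least a full chunk of room (checked below) */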
4109 diff = (rdev->new_data_offset - rdev->data_offset);
4110 if (!mddev->reshape_backwards)
4111 diff = -diff;
4112 if (diff < 0)
4113 diff = 0;
4114 if (first || diff < min_offset_diff)
4115 min_offset_diff = diff;
4116
4117 if (mddev->gendisk)
4118 disk_stack_limits(mddev->gendisk, rdev->bdev,
4119 rdev->data_offset << 9);
4120
4121 disk->head_position = 0;
4122
4123 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
4124 discard_supported = true;
4125 first = 0;
4126 }
4127
4128 if (mddev->queue) {
4129 if (discard_supported)
4130 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
4131 mddev->queue);
4132 else
4133 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
4134 mddev->queue);
4135 }
4136
4137 if (!enough(conf, -1)) {
4138 pr_err("md/raid10:%s: not enough operational mirrors.\n",
4139 mdname(mddev));
4140 goto out_free_conf;
4141 }
4142
4143 if (conf->reshape_progress != MaxSector) {
		/* must ensure that shape change is supported */
4145 if (conf->geo.far_copies != 1 &&
4146 conf->geo.far_offset == 0)
4147 goto out_free_conf;
4148 if (conf->prev.far_copies != 1 &&
4149 conf->prev.far_offset == 0)
4150 goto out_free_conf;
4151 }
4152
4153 mddev->degraded = 0;
4154 for (i = 0;
4155 i < conf->geo.raid_disks
4156 || i < conf->prev.raid_disks;
4157 i++) {
4158
4159 disk = conf->mirrors + i;
4160
4161 if (!disk->rdev && disk->replacement) {
			/* The replacement is all we have - use it */
4163 disk->rdev = disk->replacement;
4164 disk->replacement = NULL;
4165 clear_bit(Replacement, &disk->rdev->flags);
4166 }
4167
4168 if (!disk->rdev ||
4169 !test_bit(In_sync, &disk->rdev->flags)) {
4170 disk->head_position = 0;
4171 mddev->degraded++;
4172 if (disk->rdev &&
4173 disk->rdev->saved_raid_disk < 0)
4174 conf->fullsync = 1;
4175 }
4176
4177 if (disk->replacement &&
4178 !test_bit(In_sync, &disk->replacement->flags) &&
4179 disk->replacement->saved_raid_disk < 0) {
4180 conf->fullsync = 1;
4181 }
4182
4183 disk->recovery_disabled = mddev->recovery_disabled - 1;
4184 }
4185
4186 if (mddev->recovery_cp != MaxSector)
4187 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
4188 mdname(mddev));
4189 pr_info("md/raid10:%s: active with %d out of %d devices\n",
4190 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
4191 conf->geo.raid_disks);
4192
4193
4194
4195 mddev->dev_sectors = conf->dev_sectors;
4196 size = raid10_size(mddev, 0, 0);
4197 md_set_array_sectors(mddev, size);
4198 mddev->resync_max_sectors = size;
4199 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
4200
4201 if (md_integrity_register(mddev))
4202 goto out_free_conf;
4203
4204 if (conf->reshape_progress != MaxSector) {
4205 unsigned long before_length, after_length;
4206
4207 before_length = ((1 << conf->prev.chunk_shift) *
4208 conf->prev.far_copies);
4209 after_length = ((1 << conf->geo.chunk_shift) *
4210 conf->geo.far_copies);
4211
4212 if (max(before_length, after_length) > min_offset_diff) {
			/* This cannot work */
4214 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
4215 goto out_free_conf;
4216 }
4217 conf->offset_diff = min_offset_diff;
4218
4219 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4220 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4221 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4222 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4223 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4224 "reshape");
4225 if (!mddev->sync_thread)
4226 goto out_free_conf;
4227 }
4228
4229 return 0;
4230
4231out_free_conf:
4232 md_unregister_thread(&mddev->thread);
4233 mempool_exit(&conf->r10bio_pool);
4234 safe_put_page(conf->tmppage);
4235 kfree(conf->mirrors);
4236 kfree(conf);
4237 mddev->private = NULL;
4238out:
4239 return -EIO;
4240}
4241
4242static void raid10_free(struct mddev *mddev, void *priv)
4243{
4244 struct r10conf *conf = priv;
4245
4246 mempool_exit(&conf->r10bio_pool);
4247 safe_put_page(conf->tmppage);
4248 kfree(conf->mirrors);
4249 kfree(conf->mirrors_old);
4250 kfree(conf->mirrors_new);
4251 bioset_exit(&conf->bio_split);
4252 kfree(conf);
4253}
4254
4255static void raid10_quiesce(struct mddev *mddev, int quiesce)
4256{
4257 struct r10conf *conf = mddev->private;
4258
4259 if (quiesce)
4260 raise_barrier(conf, 0);
4261 else
4262 lower_barrier(conf);
4263}
4264
4265static int raid10_resize(struct mddev *mddev, sector_t sectors)
4266{
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279 struct r10conf *conf = mddev->private;
4280 sector_t oldsize, size;
4281
4282 if (mddev->reshape_position != MaxSector)
4283 return -EBUSY;
4284
4285 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
4286 return -EINVAL;
4287
4288 oldsize = raid10_size(mddev, 0, 0);
4289 size = raid10_size(mddev, sectors, 0);
4290 if (mddev->external_size &&
4291 mddev->array_sectors > size)
4292 return -EINVAL;
4293 if (mddev->bitmap) {
4294 int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
4295 if (ret)
4296 return ret;
4297 }
4298 md_set_array_sectors(mddev, size);
4299 if (sectors > mddev->dev_sectors &&
4300 mddev->recovery_cp > oldsize) {
4301 mddev->recovery_cp = oldsize;
4302 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4303 }
4304 calc_sectors(conf, sectors);
4305 mddev->dev_sectors = conf->dev_sectors;
4306 mddev->resync_max_sectors = size;
4307 return 0;
4308}
4309
4310static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
4311{
4312 struct md_rdev *rdev;
4313 struct r10conf *conf;
4314
4315 if (mddev->degraded > 0) {
4316 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
4317 mdname(mddev));
4318 return ERR_PTR(-EINVAL);
4319 }
4320 sector_div(size, devs);
4321
4322
4323 mddev->new_level = 10;
4324
4325 mddev->new_layout = (1<<8) + 2;
4326 mddev->new_chunk_sectors = mddev->chunk_sectors;
4327 mddev->delta_disks = mddev->raid_disks;
4328 mddev->raid_disks *= 2;
4329
4330 mddev->recovery_cp = MaxSector;
4331 mddev->dev_sectors = size;
4332
4333 conf = setup_conf(mddev);
4334 if (!IS_ERR(conf)) {
4335 rdev_for_each(rdev, mddev)
4336 if (rdev->raid_disk >= 0) {
4337 rdev->new_raid_disk = rdev->raid_disk * 2;
4338 rdev->sectors = size;
4339 }
4340 conf->barrier = 1;
4341 }
4342
4343 return conf;
4344}
4345
4346static void *raid10_takeover(struct mddev *mddev)
4347{
4348 struct r0conf *raid0_conf;
4349
4350
4351
4352
4353 if (mddev->level == 0) {
4354
4355 raid0_conf = mddev->private;
4356 if (raid0_conf->nr_strip_zones > 1) {
4357 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4358 mdname(mddev));
4359 return ERR_PTR(-EINVAL);
4360 }
4361 return raid10_takeover_raid0(mddev,
4362 raid0_conf->strip_zone->zone_end,
4363 raid0_conf->strip_zone->nb_dev);
4364 }
4365 return ERR_PTR(-EINVAL);
4366}
4367
4368static int raid10_check_reshape(struct mddev *mddev)
4369{
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384 struct r10conf *conf = mddev->private;
4385 struct geom geo;
4386
4387 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4388 return -EINVAL;
4389
4390 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
4391
4392 return -EINVAL;
4393 if (geo.far_copies > 1 && !geo.far_offset)
4394
4395 return -EINVAL;
4396
4397 if (mddev->array_sectors & geo.chunk_mask)
4398
4399 return -EINVAL;
4400
4401 if (!enough(conf, -1))
4402 return -EINVAL;
4403
4404 kfree(conf->mirrors_new);
4405 conf->mirrors_new = NULL;
4406 if (mddev->delta_disks > 0) {
4407
4408 conf->mirrors_new =
4409 kcalloc(mddev->raid_disks + mddev->delta_disks,
4410 sizeof(struct raid10_info),
4411 GFP_KERNEL);
4412 if (!conf->mirrors_new)
4413 return -ENOMEM;
4414 }
4415 return 0;
4416}
4417
/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
4431static int calc_degraded(struct r10conf *conf)
4432{
4433 int degraded, degraded2;
4434 int i;
4435
4436 rcu_read_lock();
4437 degraded = 0;
	/* 'prev' section first */
4439 for (i = 0; i < conf->prev.raid_disks; i++) {
4440 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4441 if (!rdev || test_bit(Faulty, &rdev->flags))
4442 degraded++;
4443 else if (!test_bit(In_sync, &rdev->flags))
			/* When we can reduce the number of devices in
			 * an array, this might not contribute to
			 * 'degraded'.  It does now.
			 */
4448 degraded++;
4449 }
4450 rcu_read_unlock();
4451 if (conf->geo.raid_disks == conf->prev.raid_disks)
4452 return degraded;
4453 rcu_read_lock();
4454 degraded2 = 0;
4455 for (i = 0; i < conf->geo.raid_disks; i++) {
4456 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4457 if (!rdev || test_bit(Faulty, &rdev->flags))
4458 degraded2++;
4459 else if (!test_bit(In_sync, &rdev->flags)) {
			/* If reshape is increasing the number of devices,
			 * this section has already been recovered, so
			 * it doesn't contribute to degraded.
			 * Else it does.
			 */
4465 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4466 degraded2++;
4467 }
4468 }
4469 rcu_read_unlock();
4470 if (degraded2 > degraded)
4471 return degraded2;
4472 return degraded;
4473}
4474
4475static int raid10_start_reshape(struct mddev *mddev)
4476{
	/* A 'reshape' has been requested. This commits
	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
	 * This also checks if there are enough spares and adds them
	 * to the array.
	 * We currently require enough spares to make the final
	 * array non-degraded.  We also require that the difference
	 * between old and new data_offset - on each device - is
	 * enough that we never risk over-writing.
	 */
4487 unsigned long before_length, after_length;
4488 sector_t min_offset_diff = 0;
4489 int first = 1;
4490 struct geom new;
4491 struct r10conf *conf = mddev->private;
4492 struct md_rdev *rdev;
4493 int spares = 0;
4494 int ret;
4495
4496 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4497 return -EBUSY;
4498
4499 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4500 return -EINVAL;
4501
4502 before_length = ((1 << conf->prev.chunk_shift) *
4503 conf->prev.far_copies);
4504 after_length = ((1 << conf->geo.chunk_shift) *
4505 conf->geo.far_copies);
4506
4507 rdev_for_each(rdev, mddev) {
4508 if (!test_bit(In_sync, &rdev->flags)
4509 && !test_bit(Faulty, &rdev->flags))
4510 spares++;
4511 if (rdev->raid_disk >= 0) {
4512 long long diff = (rdev->new_data_offset
4513 - rdev->data_offset);
4514 if (!mddev->reshape_backwards)
4515 diff = -diff;
4516 if (diff < 0)
4517 diff = 0;
4518 if (first || diff < min_offset_diff)
4519 min_offset_diff = diff;
4520 first = 0;
4521 }
4522 }
4523
4524 if (max(before_length, after_length) > min_offset_diff)
4525 return -EINVAL;
4526
4527 if (spares < mddev->delta_disks)
4528 return -EINVAL;
4529
4530 conf->offset_diff = min_offset_diff;
4531 spin_lock_irq(&conf->device_lock);
4532 if (conf->mirrors_new) {
4533 memcpy(conf->mirrors_new, conf->mirrors,
4534 sizeof(struct raid10_info)*conf->prev.raid_disks);
4535 smp_mb();
4536 kfree(conf->mirrors_old);
4537 conf->mirrors_old = conf->mirrors;
4538 conf->mirrors = conf->mirrors_new;
4539 conf->mirrors_new = NULL;
4540 }
4541 setup_geo(&conf->geo, mddev, geo_start);
4542 smp_mb();
4543 if (mddev->reshape_backwards) {
4544 sector_t size = raid10_size(mddev, 0, 0);
4545 if (size < mddev->array_sectors) {
4546 spin_unlock_irq(&conf->device_lock);
			pr_warn("md/raid10:%s: array size must be reduced before reducing the number of disks\n",
4548 mdname(mddev));
4549 return -EINVAL;
4550 }
4551 mddev->resync_max_sectors = size;
4552 conf->reshape_progress = size;
4553 } else
4554 conf->reshape_progress = 0;
4555 conf->reshape_safe = conf->reshape_progress;
4556 spin_unlock_irq(&conf->device_lock);
4557
4558 if (mddev->delta_disks && mddev->bitmap) {
4559 struct mdp_superblock_1 *sb = NULL;
4560 sector_t oldsize, newsize;
4561
4562 oldsize = raid10_size(mddev, 0, 0);
4563 newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
4564
4565 if (!mddev_is_clustered(mddev)) {
4566 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4567 if (ret)
4568 goto abort;
4569 else
4570 goto out;
4571 }
4572
4573 rdev_for_each(rdev, mddev) {
4574 if (rdev->raid_disk > -1 &&
4575 !test_bit(Faulty, &rdev->flags))
4576 sb = page_address(rdev->sb_page);
4577 }
4578
4579
4580
4581
4582
4583
4584 if ((sb && (le32_to_cpu(sb->feature_map) &
4585 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
4586 goto out;
4587
4588 ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
4589 if (ret)
4590 goto abort;
4591
4592 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
4593 if (ret) {
4594 md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
4595 goto abort;
4596 }
4597 }
4598out:
4599 if (mddev->delta_disks > 0) {
4600 rdev_for_each(rdev, mddev)
4601 if (rdev->raid_disk < 0 &&
4602 !test_bit(Faulty, &rdev->flags)) {
4603 if (raid10_add_disk(mddev, rdev) == 0) {
4604 if (rdev->raid_disk >=
4605 conf->prev.raid_disks)
4606 set_bit(In_sync, &rdev->flags);
4607 else
4608 rdev->recovery_offset = 0;
4609
4610
4611 sysfs_link_rdev(mddev, rdev);
4612 }
4613 } else if (rdev->raid_disk >= conf->prev.raid_disks
4614 && !test_bit(Faulty, &rdev->flags)) {
4615
4616 set_bit(In_sync, &rdev->flags);
4617 }
4618 }
4619
4620
4621
4622
4623 spin_lock_irq(&conf->device_lock);
4624 mddev->degraded = calc_degraded(conf);
4625 spin_unlock_irq(&conf->device_lock);
4626 mddev->raid_disks = conf->geo.raid_disks;
4627 mddev->reshape_position = conf->reshape_progress;
4628 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4629
4630 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4631 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4632 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4633 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4634 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4635
4636 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4637 "reshape");
4638 if (!mddev->sync_thread) {
4639 ret = -EAGAIN;
4640 goto abort;
4641 }
4642 conf->reshape_checkpoint = jiffies;
4643 md_wakeup_thread(mddev->sync_thread);
4644 md_new_event(mddev);
4645 return 0;
4646
4647abort:
4648 mddev->recovery = 0;
4649 spin_lock_irq(&conf->device_lock);
4650 conf->geo = conf->prev;
4651 mddev->raid_disks = conf->geo.raid_disks;
4652 rdev_for_each(rdev, mddev)
4653 rdev->new_data_offset = rdev->data_offset;
4654 smp_wmb();
4655 conf->reshape_progress = MaxSector;
4656 conf->reshape_safe = MaxSector;
4657 mddev->reshape_position = MaxSector;
4658 spin_unlock_irq(&conf->device_lock);
4659 return ret;
4660}
4661
/* Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address.
 * i.e. the address returned will be chunk-aligned and after
 * any data that is in the chunk before 's'.
 */
4668static sector_t last_dev_address(sector_t s, struct geom *geo)
4669{
4670 s = (s | geo->chunk_mask) + 1;
4671 s >>= geo->chunk_shift;
4672 s *= geo->near_copies;
4673 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4674 s *= geo->far_copies;
4675 s <<= geo->chunk_shift;
4676 return s;
4677}
4678
/* Calculate the first device-address that could contain
 * any block from the chunk that includes the array-address 's'.
 * This too will be the start of a chunk.
 */
4683static sector_t first_dev_address(sector_t s, struct geom *geo)
4684{
4685 s >>= geo->chunk_shift;
4686 s *= geo->near_copies;
4687 sector_div(s, geo->raid_disks);
4688 s *= geo->far_copies;
4689 s <<= geo->chunk_shift;
4690 return s;
4691}

static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped)
{
	/*
	 * We copy at most one chunk (the smaller of the old and new chunk
	 * sizes) at a time, possibly less if that exceeds RESYNC_PAGES.
	 * Pausing for normal IO in the middle of a chunk is fine, as
	 * mddev->reshape_position can record any location.
	 *
	 * If we want to write to a location that isn't yet recorded as
	 * 'safe' in the on-disk metadata, we must flush outstanding reshape
	 * requests and update the metadata first.
	 *
	 * When reshaping forwards, 'safe' (conf->reshape_safe) is the
	 * earliest array address that might not have been copied yet
	 * according to the on-disk metadata.  last_dev_address() on the
	 * write position in the new layout and first_dev_address() on
	 * 'safe' in the old layout tell us whether the next writes could
	 * overlap data that has not been copied; if so, the metadata must
	 * be updated before proceeding.  Reshaping backwards rounds in the
	 * opposite direction and reverses the test.  In both cases
	 * conf->offset_diff (the minimum difference in data offsets, always
	 * positive) provides a little slack.
	 *
	 * All bios are prepared before any IO is started so that the chosen
	 * size is acceptable to every device: one read-in bio (stored in
	 * ->master_bio) plus one write-out bio per copy (in ->devs[x].bio
	 * and ->devs[x].repl_bio).
	 */
	struct r10conf *conf = mddev->private;
	struct r10bio *r10_bio;
	sector_t next, safe, last;
	int max_sectors;
	int nr_sectors;
	int s;
	struct md_rdev *rdev;
	int need_flush = 0;
	struct bio *blist;
	struct bio *bio, *read_bio;
	int sectors_done = 0;
	struct page **pages;

	if (sector_nr == 0) {
		/* If restarting in the middle, skip the initial sectors */
		if (mddev->reshape_backwards &&
		    conf->reshape_progress < raid10_size(mddev, 0, 0)) {
			sector_nr = (raid10_size(mddev, 0, 0)
				     - conf->reshape_progress);
		} else if (!mddev->reshape_backwards &&
			   conf->reshape_progress > 0)
			sector_nr = conf->reshape_progress;
		if (sector_nr) {
			mddev->curr_resync_completed = sector_nr;
			sysfs_notify_dirent_safe(mddev->sysfs_completed);
			*skipped = 1;
			return sector_nr;
		}
	}

	/* We don't use sector_nr to track where we are up to
	 * as that doesn't work well for ->reshape_backwards.
	 * So just use ->reshape_progress.
	 */
	if (mddev->reshape_backwards) {
		/* 'next' is the earliest device address that we might
		 * write to for this chunk in the new layout
		 */
		next = first_dev_address(conf->reshape_progress - 1,
					 &conf->geo);

		/* 'safe' is the last device address that we might read from
		 * in the old layout after a restart
		 */
		safe = last_dev_address(conf->reshape_safe - 1,
					&conf->prev);

		if (next + conf->offset_diff < safe)
			need_flush = 1;

		last = conf->reshape_progress - 1;
		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
					       & conf->prev.chunk_mask);
		if (sector_nr + RESYNC_SECTORS < last)
			sector_nr = last + 1 - RESYNC_SECTORS;
	} else {
		/* 'next' is after the last device address that we
		 * might write to for this chunk in the new layout
		 */
		next = last_dev_address(conf->reshape_progress, &conf->geo);

		/* 'safe' is the earliest device address that we might
		 * read from in the old layout after a restart
		 */
		safe = first_dev_address(conf->reshape_safe, &conf->prev);

		/* Need to update metadata if 'next' might be beyond 'safe'
		 * as that would possibly corrupt data
		 */
		if (next > safe + conf->offset_diff)
			need_flush = 1;

		sector_nr = conf->reshape_progress;
		last = sector_nr | (conf->geo.chunk_mask
				    & conf->prev.chunk_mask);

		if (sector_nr + RESYNC_SECTORS <= last)
			last = sector_nr + RESYNC_SECTORS - 1;
	}
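	/*
	 * At this point [sector_nr, last] describes the window to copy: at
	 * most RESYNC_SECTORS, contained within a single chunk of the
	 * smaller of the old and new chunk sizes, and need_flush says
	 * whether the metadata must be committed before it may be written
	 * in the new layout.
	 */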

	if (need_flush ||
	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
		/* Need to update reshape_position in metadata */
		wait_barrier(conf);
		mddev->reshape_position = conf->reshape_progress;
		if (mddev->reshape_backwards)
			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
				- conf->reshape_progress;
		else
			mddev->curr_resync_completed = conf->reshape_progress;
		conf->reshape_checkpoint = jiffies;
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		md_wakeup_thread(mddev->thread);
		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			allow_barrier(conf);
			return sectors_done;
		}
		conf->reshape_safe = mddev->reshape_position;
		allow_barrier(conf);
	}

	raise_barrier(conf, 0);
read_more:
	/* Now schedule reads for blocks from sector_nr to last */
	r10_bio = raid10_alloc_init_r10buf(conf);
	r10_bio->state = 0;
	raise_barrier(conf, 1);
	atomic_set(&r10_bio->remaining, 0);
	r10_bio->mddev = mddev;
	r10_bio->sector = sector_nr;
	set_bit(R10BIO_IsReshape, &r10_bio->state);
	r10_bio->sectors = last - sector_nr + 1;
	rdev = read_balance(conf, r10_bio, &max_sectors);
	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));

	if (!rdev) {
		/* Cannot read from anywhere, so just abort the reshape */
		mempool_free(r10_bio, &conf->r10buf_pool);
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return sectors_done;
	}

	read_bio = bio_alloc_bioset(GFP_KERNEL, RESYNC_PAGES, &mddev->bio_set);

	bio_set_dev(read_bio, rdev->bdev);
	read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
			       + rdev->data_offset);
	read_bio->bi_private = r10_bio;
	read_bio->bi_end_io = end_reshape_read;
	bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
	r10_bio->master_bio = read_bio;
	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;

	/*
	 * Broadcast the RESYNC message to the other nodes so that they do
	 * not write to this region and cause a conflict.
	 */
	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
		struct mdp_superblock_1 *sb = NULL;
		int sb_reshape_pos = 0;

		conf->cluster_sync_low = sector_nr;
		conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
		sb = page_address(rdev->sb_page);
		if (sb) {
			sb_reshape_pos = le64_to_cpu(sb->reshape_position);
			/*
			 * Widen the window downwards if the superblock still
			 * records an earlier reshape position, since that part
			 * of the reshape has not been committed yet.
			 */
			if (sb_reshape_pos < conf->cluster_sync_low)
				conf->cluster_sync_low = sb_reshape_pos;
		}

		md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
						   conf->cluster_sync_high);
	}

	/* Now find the locations in the new layout */
	__raid10_find_phys(&conf->geo, r10_bio);

	blist = read_bio;
	read_bio->bi_next = NULL;

	rcu_read_lock();
	for (s = 0; s < conf->copies*2; s++) {
		struct bio *b;
		int d = r10_bio->devs[s/2].devnum;
		struct md_rdev *rdev2;
		if (s&1) {
			rdev2 = rcu_dereference(conf->mirrors[d].replacement);
			b = r10_bio->devs[s/2].repl_bio;
		} else {
			rdev2 = rcu_dereference(conf->mirrors[d].rdev);
			b = r10_bio->devs[s/2].bio;
		}
		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
			continue;

		bio_set_dev(b, rdev2->bdev);
		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
			rdev2->new_data_offset;
		b->bi_end_io = end_reshape_write;
		bio_set_op_attrs(b, REQ_OP_WRITE, 0);
		b->bi_next = blist;
		blist = b;
	}
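	/*
	 * blist now chains the read bio followed by one write bio for each
	 * live rdev (and replacement) in the new layout; all of them are
	 * given the same pages below.
	 */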

	/* Now add as many pages as possible to all of these bios. */
	nr_sectors = 0;
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
		struct page *page = pages[s / (PAGE_SIZE >> 9)];
		int len = (max_sectors - s) << 9;
		if (len > PAGE_SIZE)
			len = PAGE_SIZE;
		for (bio = blist; bio ; bio = bio->bi_next) {
			/*
			 * This cannot fail: the bios were allocated with a
			 * vec table big enough to hold all of these pages.
			 */
			bio_add_page(bio, page, len, 0);
		}
		sector_nr += len >> 9;
		nr_sectors += len >> 9;
	}
	rcu_read_unlock();
	r10_bio->sectors = nr_sectors;

	/* Now submit the read */
	md_sync_acct_bio(read_bio, r10_bio->sectors);
	atomic_inc(&r10_bio->remaining);
	read_bio->bi_next = NULL;
	submit_bio_noacct(read_bio);
	sectors_done += nr_sectors;
	if (sector_nr <= last)
		goto read_more;

	lower_barrier(conf);

	/* Now that we have done the whole section we can
	 * update reshape_progress
	 */
	if (mddev->reshape_backwards)
		conf->reshape_progress -= sectors_done;
	else
		conf->reshape_progress += sectors_done;

	return sectors_done;
}

static void end_reshape_request(struct r10bio *r10_bio);
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio);
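/*
 * The read scheduled by reshape_request() completes in end_reshape_read(),
 * which marks the r10_bio R10BIO_Uptodate on success; the raid10 worker
 * thread then hands the r10_bio to reshape_request_write() below to issue
 * the writes into the new layout, falling back to
 * handle_reshape_read_error() if the read failed.
 */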
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	/* Reshape read completed.  Hopefully we have a block to write out.
	 * If we got a read error then we do sync 1-page reads from
	 * elsewhere until we find the data - or give up.
	 */
	struct r10conf *conf = mddev->private;
	int s;

	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		if (handle_reshape_read_error(mddev, r10_bio) < 0) {
			/* Reshape has been aborted */
			md_done_sync(mddev, r10_bio->sectors, 0);
			return;
		}

	/* We definitely have the data in the pages, schedule the
	 * writes.
	 */
	atomic_set(&r10_bio->remaining, 1);
	for (s = 0; s < conf->copies*2; s++) {
		struct bio *b;
		int d = r10_bio->devs[s/2].devnum;
		struct md_rdev *rdev;
		rcu_read_lock();
		if (s&1) {
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			b = r10_bio->devs[s/2].repl_bio;
		} else {
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			b = r10_bio->devs[s/2].bio;
		}
		if (!rdev || test_bit(Faulty, &rdev->flags)) {
			rcu_read_unlock();
			continue;
		}
		atomic_inc(&rdev->nr_pending);
		rcu_read_unlock();
		md_sync_acct_bio(b, r10_bio->sectors);
		atomic_inc(&r10_bio->remaining);
		b->bi_next = NULL;
		submit_bio_noacct(b);
	}
	end_reshape_request(r10_bio);
}

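/*
 * Switch the array over to the new geometry once the data copy is complete:
 * the old layout in conf->prev is replaced by conf->geo and the reshape
 * markers are cleared (unless the reshape was interrupted).
 */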
static void end_reshape(struct r10conf *conf)
{
	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
		return;

	spin_lock_irq(&conf->device_lock);
	conf->prev = conf->geo;
	md_finish_reshape(conf->mddev);
	smp_wmb();
	conf->reshape_progress = MaxSector;
	conf->reshape_safe = MaxSector;
	spin_unlock_irq(&conf->device_lock);

	if (conf->mddev->queue)
		raid10_set_io_opt(conf);
	conf->fullsync = 0;
}

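/*
 * Clustered arrays only: adopt the reshape position negotiated with the
 * other nodes, provided it falls inside the resync window they reported
 * (or the reshape has finished entirely).
 */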
static void raid10_update_reshape_pos(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;
	sector_t lo, hi;

	md_cluster_ops->resync_info_get(mddev, &lo, &hi);
	if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
	    || mddev->reshape_position == MaxSector)
		conf->reshape_progress = mddev->reshape_position;
	else
		WARN_ON_ONCE(1);
}

static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio)
{
	/* Use sync reads to get the blocks from somewhere else */
	int sectors = r10_bio->sectors;
	struct r10conf *conf = mddev->private;
	struct r10bio *r10b;
	int slot = 0;
	int idx = 0;
	struct page **pages;

	r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
	if (!r10b) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return -ENOMEM;
	}

	/* reshape IOs share pages from .devs[0].bio */
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;

	r10b->sector = r10_bio->sector;
	__raid10_find_phys(&conf->prev, r10b);

	while (sectors) {
		int s = sectors;
		int success = 0;
		int first_slot = slot;

		if (s > (PAGE_SIZE >> 9))
			s = PAGE_SIZE >> 9;

		rcu_read_lock();
		while (!success) {
			int d = r10b->devs[slot].devnum;
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			sector_t addr;
			if (rdev == NULL ||
			    test_bit(Faulty, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				goto failed;

			addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			success = sync_page_io(rdev,
					       addr,
					       s << 9,
					       pages[idx],
					       REQ_OP_READ, 0, false);
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
			if (success)
				break;
		failed:
			slot++;
			if (slot >= conf->copies)
				slot = 0;
			if (slot == first_slot)
				break;
		}
		rcu_read_unlock();
		if (!success) {
			/* couldn't read this block, must give up */
			set_bit(MD_RECOVERY_INTR,
				&mddev->recovery);
			kfree(r10b);
			return -EIO;
		}
		sectors -= s;
		idx++;
	}
	kfree(r10b);
	return 0;
}

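/*
 * Write completion for the per-device reshape writes issued by
 * reshape_request_write(); a failed write marks the device as faulty via
 * md_error().
 */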
static void end_reshape_write(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	if (!rdev) {
		smp_mb();
		rdev = conf->mirrors[d].rdev;
	}

	if (bio->bi_status) {
		/* Treat a failed write as a device failure */
		md_error(mddev, rdev);
	}

	rdev_dec_pending(rdev, mddev);
	end_reshape_request(r10_bio);
}

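/*
 * Drop a reference on a reshape r10_bio; when the last reference goes,
 * account the copied sectors to the sync machinery, release the read bio
 * and return the buffer to the pool.
 */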
static void end_reshape_request(struct r10bio *r10_bio)
{
	if (!atomic_dec_and_test(&r10_bio->remaining))
		return;
	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
	bio_put(r10_bio->master_bio);
	put_buf(r10_bio);
}

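/*
 * md's .finish_reshape callback: make the new layout and chunk size visible
 * in the mddev and, when the array shrank, clear In_sync on the devices
 * that are no longer in use.
 */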
static void raid10_finish_reshape(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		return;

	if (mddev->delta_disks > 0) {
		if (mddev->recovery_cp > mddev->resync_max_sectors) {
			mddev->recovery_cp = mddev->resync_max_sectors;
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		}
		mddev->resync_max_sectors = mddev->array_sectors;
	} else {
		int d;
		rcu_read_lock();
		for (d = conf->geo.raid_disks ;
		     d < conf->geo.raid_disks - mddev->delta_disks;
		     d++) {
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
		}
		rcu_read_unlock();
	}
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
	mddev->reshape_position = MaxSector;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
}

static struct md_personality raid10_personality =
{
	.name		= "raid10",
	.level		= 10,
	.owner		= THIS_MODULE,
	.make_request	= raid10_make_request,
	.run		= raid10_run,
	.free		= raid10_free,
	.status		= raid10_status,
	.error_handler	= raid10_error,
	.hot_add_disk	= raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= raid10_sync_request,
	.quiesce	= raid10_quiesce,
	.size		= raid10_size,
	.resize		= raid10_resize,
	.takeover	= raid10_takeover,
	.check_reshape	= raid10_check_reshape,
	.start_reshape	= raid10_start_reshape,
	.finish_reshape	= raid10_finish_reshape,
	.update_reshape_pos = raid10_update_reshape_pos,
};

static int __init raid_init(void)
{
	return register_md_personality(&raid10_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9");
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");

module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);