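/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 */
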
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <linux/blkdev.h>
24#include <linux/module.h>
25#include <linux/seq_file.h>
26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
28#include "md.h"
29#include "raid10.h"
30#include "raid0.h"
31#include "bitmap.h"
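
/*
 * RAID-10 provides a combination of RAID-0 and RAID-1 functionality.
 * The layout of data across the member devices is described by struct geom:
 *
 *   near_copies   each chunk is replicated on this many adjacent devices
 *                 at the same device offset.
 *   far_copies    this many complete copies of the data are kept, each
 *                 further copy stored one 'stride' deeper into the devices
 *                 and rotated across the devices of a far set.
 *   far_offset    if set, the far copies immediately follow the original
 *                 stripe instead of living in a separate section of each
 *                 device.
 *   far_set_size  the devices are divided into sets of this size for the
 *                 purpose of rotating the far copies.
 *
 * __raid10_find_phys() and raid10_find_virt() below implement the mapping
 * between virtual (array) sectors and (device, sector) pairs.
 */
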
77#define NR_RAID10_BIOS 256
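
/* When we get a read error on a read-only array, we redirect the read to
 * another device without failing the first device, and without trying to
 * over-write to correct the read error.  To keep track of such blocked
 * slots on a per-bio level, we store IO_BLOCKED in the corresponding
 * devs[].bio pointer so that read_balance() will skip them.
 */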
84#define IO_BLOCKED ((struct bio *)1)
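/* When we successfully write over a known bad block, we need to remove the
 * bad-block marking, which must be done from process context.  So we record
 * the success by setting devs[n].bio (or repl_bio) to IO_MADE_GOOD and let
 * the raid10 thread clear the bad block later.
 */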
89#define IO_MADE_GOOD ((struct bio *)2)
90
91#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
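
/* When there are this many write requests queued for the raid10 thread,
 * we become 'congested' to provide back-pressure for writeback.
 */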
97static int max_queued_requests = 1024;
98
99static void allow_barrier(struct r10conf *conf);
100static void lower_barrier(struct r10conf *conf);
101static int _enough(struct r10conf *conf, int previous, int ignore);
102static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
103 int *skipped);
104static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
105static void end_reshape_write(struct bio *bio);
106static void end_reshape(struct r10conf *conf);
107
108static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
109{
110 struct r10conf *conf = data;
111 int size = offsetof(struct r10bio, devs[conf->copies]);
112
113
114
115 return kzalloc(size, gfp_flags);
116}
117
118static void r10bio_pool_free(void *r10_bio, void *data)
119{
120 kfree(r10_bio);
121}
122
123
124#define RESYNC_BLOCK_SIZE (64*1024)
125#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
126
127#define RESYNC_WINDOW (1024*1024)
128
129#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
130
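/*
 * When performing a resync we need to read and compare, so we need as many
 * pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read, one for write
 * (we recover only one drive per r10buf).
 */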
138static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
139{
140 struct r10conf *conf = data;
141 struct page *page;
142 struct r10bio *r10_bio;
143 struct bio *bio;
144 int i, j;
145 int nalloc;
146
147 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
148 if (!r10_bio)
149 return NULL;
150
151 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
152 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
153 nalloc = conf->copies;
154 else
155 nalloc = 2;
156
157
158
159
160 for (j = nalloc ; j-- ; ) {
161 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
162 if (!bio)
163 goto out_free_bio;
164 r10_bio->devs[j].bio = bio;
165 if (!conf->have_replacement)
166 continue;
167 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
168 if (!bio)
169 goto out_free_bio;
170 r10_bio->devs[j].repl_bio = bio;
171 }
172
173
174
175
176 for (j = 0 ; j < nalloc; j++) {
177 struct bio *rbio = r10_bio->devs[j].repl_bio;
178 bio = r10_bio->devs[j].bio;
179 for (i = 0; i < RESYNC_PAGES; i++) {
180 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
181 &conf->mddev->recovery)) {
182
183
184 struct bio *rbio = r10_bio->devs[0].bio;
185 page = rbio->bi_io_vec[i].bv_page;
186 get_page(page);
187 } else
188 page = alloc_page(gfp_flags);
189 if (unlikely(!page))
190 goto out_free_pages;
191
192 bio->bi_io_vec[i].bv_page = page;
193 if (rbio)
194 rbio->bi_io_vec[i].bv_page = page;
195 }
196 }
197
198 return r10_bio;
199
200out_free_pages:
201 for ( ; i > 0 ; i--)
202 safe_put_page(bio->bi_io_vec[i-1].bv_page);
203 while (j--)
204 for (i = 0; i < RESYNC_PAGES ; i++)
205 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
206 j = 0;
207out_free_bio:
208 for ( ; j < nalloc; j++) {
209 if (r10_bio->devs[j].bio)
210 bio_put(r10_bio->devs[j].bio);
211 if (r10_bio->devs[j].repl_bio)
212 bio_put(r10_bio->devs[j].repl_bio);
213 }
214 r10bio_pool_free(r10_bio, conf);
215 return NULL;
216}
217
218static void r10buf_pool_free(void *__r10_bio, void *data)
219{
220 int i;
221 struct r10conf *conf = data;
222 struct r10bio *r10bio = __r10_bio;
223 int j;
224
225 for (j=0; j < conf->copies; j++) {
226 struct bio *bio = r10bio->devs[j].bio;
227 if (bio) {
228 for (i = 0; i < RESYNC_PAGES; i++) {
229 safe_put_page(bio->bi_io_vec[i].bv_page);
230 bio->bi_io_vec[i].bv_page = NULL;
231 }
232 bio_put(bio);
233 }
234 bio = r10bio->devs[j].repl_bio;
235 if (bio)
236 bio_put(bio);
237 }
238 r10bio_pool_free(r10bio, conf);
239}
240
241static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
242{
243 int i;
244
245 for (i = 0; i < conf->copies; i++) {
246 struct bio **bio = & r10_bio->devs[i].bio;
247 if (!BIO_SPECIAL(*bio))
248 bio_put(*bio);
249 *bio = NULL;
250 bio = &r10_bio->devs[i].repl_bio;
251 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
252 bio_put(*bio);
253 *bio = NULL;
254 }
255}
256
257static void free_r10bio(struct r10bio *r10_bio)
258{
259 struct r10conf *conf = r10_bio->mddev->private;
260
261 put_all_bios(conf, r10_bio);
262 mempool_free(r10_bio, conf->r10bio_pool);
263}
264
265static void put_buf(struct r10bio *r10_bio)
266{
267 struct r10conf *conf = r10_bio->mddev->private;
268
269 mempool_free(r10_bio, conf->r10buf_pool);
270
271 lower_barrier(conf);
272}
273
274static void reschedule_retry(struct r10bio *r10_bio)
275{
276 unsigned long flags;
277 struct mddev *mddev = r10_bio->mddev;
278 struct r10conf *conf = mddev->private;
279
280 spin_lock_irqsave(&conf->device_lock, flags);
281 list_add(&r10_bio->retry_list, &conf->retry_list);
282 conf->nr_queued++;
283 spin_unlock_irqrestore(&conf->device_lock, flags);
284
285
286 wake_up(&conf->wait_barrier);
287
288 md_wakeup_thread(mddev->thread);
289}
290
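/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation: once all of its r10bios are done it completes the master bio
 * (with -EIO unless R10BIO_Uptodate is set), drops the barrier reference
 * and frees the r10bio.
 */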
296static void raid_end_bio_io(struct r10bio *r10_bio)
297{
298 struct bio *bio = r10_bio->master_bio;
299 int done;
300 struct r10conf *conf = r10_bio->mddev->private;
301
302 if (bio->bi_phys_segments) {
303 unsigned long flags;
304 spin_lock_irqsave(&conf->device_lock, flags);
305 bio->bi_phys_segments--;
306 done = (bio->bi_phys_segments == 0);
307 spin_unlock_irqrestore(&conf->device_lock, flags);
308 } else
309 done = 1;
310 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
311 bio->bi_error = -EIO;
312 if (done) {
313 bio_endio(bio);
314
315
316
317
318 allow_barrier(conf);
319 }
320 free_r10bio(r10_bio);
321}
322
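/*
 * Update the disk head position estimator based on IRQ completion info.
 */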
326static inline void update_head_pos(int slot, struct r10bio *r10_bio)
327{
328 struct r10conf *conf = r10_bio->mddev->private;
329
330 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
331 r10_bio->devs[slot].addr + (r10_bio->sectors);
332}
333
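/*
 * Find the slot whose bio (or repl_bio) is 'bio', update the head position
 * for that slot and return its device number.  *slotp and *replp, if
 * non-NULL, are set to the slot index and to whether it was the
 * replacement bio.
 */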
337static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
338 struct bio *bio, int *slotp, int *replp)
339{
340 int slot;
341 int repl = 0;
342
343 for (slot = 0; slot < conf->copies; slot++) {
344 if (r10_bio->devs[slot].bio == bio)
345 break;
346 if (r10_bio->devs[slot].repl_bio == bio) {
347 repl = 1;
348 break;
349 }
350 }
351
352 BUG_ON(slot == conf->copies);
353 update_head_pos(slot, r10_bio);
354
355 if (slotp)
356 *slotp = slot;
357 if (replp)
358 *replp = repl;
359 return r10_bio->devs[slot].devnum;
360}
361
362static void raid10_end_read_request(struct bio *bio)
363{
364 int uptodate = !bio->bi_error;
365 struct r10bio *r10_bio = bio->bi_private;
366 int slot, dev;
367 struct md_rdev *rdev;
368 struct r10conf *conf = r10_bio->mddev->private;
369
370 slot = r10_bio->read_slot;
371 dev = r10_bio->devs[slot].devnum;
372 rdev = r10_bio->devs[slot].rdev;
373
374
375
376 update_head_pos(slot, r10_bio);
377
378 if (uptodate) {
379
380
381
382
383
384
385
386
387
388 set_bit(R10BIO_Uptodate, &r10_bio->state);
389 } else {
390
391
392
393
394
395 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
396 rdev->raid_disk))
397 uptodate = 1;
398 }
399 if (uptodate) {
400 raid_end_bio_io(r10_bio);
401 rdev_dec_pending(rdev, conf->mddev);
402 } else {
403
404
405
406 char b[BDEVNAME_SIZE];
407 printk_ratelimited(KERN_ERR
408 "md/raid10:%s: %s: rescheduling sector %llu\n",
409 mdname(conf->mddev),
410 bdevname(rdev->bdev, b),
411 (unsigned long long)r10_bio->sector);
412 set_bit(R10BIO_ReadError, &r10_bio->state);
413 reschedule_retry(r10_bio);
414 }
415}
416
417static void close_write(struct r10bio *r10_bio)
418{
419
420 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
421 r10_bio->sectors,
422 !test_bit(R10BIO_Degraded, &r10_bio->state),
423 0);
424 md_write_end(r10_bio->mddev);
425}
426
427static void one_write_done(struct r10bio *r10_bio)
428{
429 if (atomic_dec_and_test(&r10_bio->remaining)) {
430 if (test_bit(R10BIO_WriteError, &r10_bio->state))
431 reschedule_retry(r10_bio);
432 else {
433 close_write(r10_bio);
434 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
435 reschedule_retry(r10_bio);
436 else
437 raid_end_bio_io(r10_bio);
438 }
439 }
440}
441
442static void raid10_end_write_request(struct bio *bio)
443{
444 struct r10bio *r10_bio = bio->bi_private;
445 int dev;
446 int dec_rdev = 1;
447 struct r10conf *conf = r10_bio->mddev->private;
448 int slot, repl;
449 struct md_rdev *rdev = NULL;
450 bool discard_error;
451
452 discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
453
454 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
455
456 if (repl)
457 rdev = conf->mirrors[dev].replacement;
458 if (!rdev) {
459 smp_rmb();
460 repl = 0;
461 rdev = conf->mirrors[dev].rdev;
462 }
463
464
465
466 if (bio->bi_error && !discard_error) {
467 if (repl)
468
469
470
471 md_error(rdev->mddev, rdev);
472 else {
473 set_bit(WriteErrorSeen, &rdev->flags);
474 if (!test_and_set_bit(WantReplacement, &rdev->flags))
475 set_bit(MD_RECOVERY_NEEDED,
476 &rdev->mddev->recovery);
477 set_bit(R10BIO_WriteError, &r10_bio->state);
478 dec_rdev = 0;
479 }
480 } else {
481
482
483
484
485
486
487
488
489
490 sector_t first_bad;
491 int bad_sectors;
492
493
494
495
496
497
498
499
500
501 if (test_bit(In_sync, &rdev->flags) &&
502 !test_bit(Faulty, &rdev->flags))
503 set_bit(R10BIO_Uptodate, &r10_bio->state);
504
505
506 if (is_badblock(rdev,
507 r10_bio->devs[slot].addr,
508 r10_bio->sectors,
509 &first_bad, &bad_sectors) && !discard_error) {
510 bio_put(bio);
511 if (repl)
512 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
513 else
514 r10_bio->devs[slot].bio = IO_MADE_GOOD;
515 dec_rdev = 0;
516 set_bit(R10BIO_MadeGood, &r10_bio->state);
517 }
518 }
519
520
521
522
523
524
525 one_write_done(r10_bio);
526 if (dec_rdev)
527 rdev_dec_pending(rdev, conf->mddev);
528}
529
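
/*
 * RAID10 layout manager.
 *
 * __raid10_find_phys() computes, for the virtual sector of an r10bio, the
 * (device number, device sector) pair of every copy, honouring the near,
 * far and offset geometries described above.
 *
 * raid10_find_virt() performs the reverse mapping: given a device and a
 * sector offset on that device, it returns the corresponding virtual
 * (array) sector.
 */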
555static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
556{
557 int n,f;
558 sector_t sector;
559 sector_t chunk;
560 sector_t stripe;
561 int dev;
562 int slot = 0;
563 int last_far_set_start, last_far_set_size;
564
565 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
566 last_far_set_start *= geo->far_set_size;
567
568 last_far_set_size = geo->far_set_size;
569 last_far_set_size += (geo->raid_disks % geo->far_set_size);
570
571
572 chunk = r10bio->sector >> geo->chunk_shift;
573 sector = r10bio->sector & geo->chunk_mask;
574
575 chunk *= geo->near_copies;
576 stripe = chunk;
577 dev = sector_div(stripe, geo->raid_disks);
578 if (geo->far_offset)
579 stripe *= geo->far_copies;
580
581 sector += stripe << geo->chunk_shift;
582
583
584 for (n = 0; n < geo->near_copies; n++) {
585 int d = dev;
586 int set;
587 sector_t s = sector;
588 r10bio->devs[slot].devnum = d;
589 r10bio->devs[slot].addr = s;
590 slot++;
591
592 for (f = 1; f < geo->far_copies; f++) {
593 set = d / geo->far_set_size;
594 d += geo->near_copies;
595
596 if ((geo->raid_disks % geo->far_set_size) &&
597 (d > last_far_set_start)) {
598 d -= last_far_set_start;
599 d %= last_far_set_size;
600 d += last_far_set_start;
601 } else {
602 d %= geo->far_set_size;
603 d += geo->far_set_size * set;
604 }
605 s += geo->stride;
606 r10bio->devs[slot].devnum = d;
607 r10bio->devs[slot].addr = s;
608 slot++;
609 }
610 dev++;
611 if (dev >= geo->raid_disks) {
612 dev = 0;
613 sector += (geo->chunk_mask + 1);
614 }
615 }
616}
617
618static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
619{
620 struct geom *geo = &conf->geo;
621
622 if (conf->reshape_progress != MaxSector &&
623 ((r10bio->sector >= conf->reshape_progress) !=
624 conf->mddev->reshape_backwards)) {
625 set_bit(R10BIO_Previous, &r10bio->state);
626 geo = &conf->prev;
627 } else
628 clear_bit(R10BIO_Previous, &r10bio->state);
629
630 __raid10_find_phys(geo, r10bio);
631}
632
633static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
634{
635 sector_t offset, chunk, vchunk;
636
637
638
639 struct geom *geo = &conf->geo;
640 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
641 int far_set_size = geo->far_set_size;
642 int last_far_set_start;
643
644 if (geo->raid_disks % geo->far_set_size) {
645 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
646 last_far_set_start *= geo->far_set_size;
647
648 if (dev >= last_far_set_start) {
649 far_set_size = geo->far_set_size;
650 far_set_size += (geo->raid_disks % geo->far_set_size);
651 far_set_start = last_far_set_start;
652 }
653 }
654
655 offset = sector & geo->chunk_mask;
656 if (geo->far_offset) {
657 int fc;
658 chunk = sector >> geo->chunk_shift;
659 fc = sector_div(chunk, geo->far_copies);
660 dev -= fc * geo->near_copies;
661 if (dev < far_set_start)
662 dev += far_set_size;
663 } else {
664 while (sector >= geo->stride) {
665 sector -= geo->stride;
666 if (dev < (geo->near_copies + far_set_start))
667 dev += far_set_size - geo->near_copies;
668 else
669 dev -= geo->near_copies;
670 }
671 chunk = sector >> geo->chunk_shift;
672 }
673 vchunk = chunk * geo->raid_disks + dev;
674 sector_div(vchunk, geo->near_copies);
675 return (vchunk << geo->chunk_shift) + offset;
676}
677
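/*
 * This routine returns the disk from which the requested read should be
 * done.  A per-disk 'last known head position' is maintained from IRQ
 * context by the completion handlers; if no device is obviously idle we
 * pick the disk whose head is closest to the requested sector (for 'far'
 * layouts the device offset itself is used as the distance metric).
 *
 * While resync is active and the request lies at or beyond next_resync, no
 * balancing is done and the first readable disk is used.  Known bad blocks
 * are avoided; *max_sectors is reduced so that the chosen device can serve
 * at least the start of the request.
 *
 * The rdev returned has had its nr_pending count incremented.
 */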
697static struct md_rdev *read_balance(struct r10conf *conf,
698 struct r10bio *r10_bio,
699 int *max_sectors)
700{
701 const sector_t this_sector = r10_bio->sector;
702 int disk, slot;
703 int sectors = r10_bio->sectors;
704 int best_good_sectors;
705 sector_t new_distance, best_dist;
706 struct md_rdev *best_rdev, *rdev = NULL;
707 int do_balance;
708 int best_slot;
709 struct geom *geo = &conf->geo;
710
711 raid10_find_phys(conf, r10_bio);
712 rcu_read_lock();
713 sectors = r10_bio->sectors;
714 best_slot = -1;
715 best_rdev = NULL;
716 best_dist = MaxSector;
717 best_good_sectors = 0;
718 do_balance = 1;
719
720
721
722
723
724
725 if (conf->mddev->recovery_cp < MaxSector
726 && (this_sector + sectors >= conf->next_resync))
727 do_balance = 0;
728
729 for (slot = 0; slot < conf->copies ; slot++) {
730 sector_t first_bad;
731 int bad_sectors;
732 sector_t dev_sector;
733
734 if (r10_bio->devs[slot].bio == IO_BLOCKED)
735 continue;
736 disk = r10_bio->devs[slot].devnum;
737 rdev = rcu_dereference(conf->mirrors[disk].replacement);
738 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
739 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
740 rdev = rcu_dereference(conf->mirrors[disk].rdev);
741 if (rdev == NULL ||
742 test_bit(Faulty, &rdev->flags))
743 continue;
744 if (!test_bit(In_sync, &rdev->flags) &&
745 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
746 continue;
747
748 dev_sector = r10_bio->devs[slot].addr;
749 if (is_badblock(rdev, dev_sector, sectors,
750 &first_bad, &bad_sectors)) {
751 if (best_dist < MaxSector)
752
753 continue;
754 if (first_bad <= dev_sector) {
755
756
757
758
759 bad_sectors -= (dev_sector - first_bad);
760 if (!do_balance && sectors > bad_sectors)
761 sectors = bad_sectors;
762 if (best_good_sectors > sectors)
763 best_good_sectors = sectors;
764 } else {
765 sector_t good_sectors =
766 first_bad - dev_sector;
767 if (good_sectors > best_good_sectors) {
768 best_good_sectors = good_sectors;
769 best_slot = slot;
770 best_rdev = rdev;
771 }
772 if (!do_balance)
773
774 break;
775 }
776 continue;
777 } else
778 best_good_sectors = sectors;
779
780 if (!do_balance)
781 break;
782
783
784
785
786
787 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
788 break;
789
790
791 if (geo->far_copies > 1)
792 new_distance = r10_bio->devs[slot].addr;
793 else
794 new_distance = abs(r10_bio->devs[slot].addr -
795 conf->mirrors[disk].head_position);
796 if (new_distance < best_dist) {
797 best_dist = new_distance;
798 best_slot = slot;
799 best_rdev = rdev;
800 }
801 }
802 if (slot >= conf->copies) {
803 slot = best_slot;
804 rdev = best_rdev;
805 }
806
807 if (slot >= 0) {
808 atomic_inc(&rdev->nr_pending);
809 r10_bio->read_slot = slot;
810 } else
811 rdev = NULL;
812 rcu_read_unlock();
813 *max_sectors = best_good_sectors;
814
815 return rdev;
816}
817
818static int raid10_congested(struct mddev *mddev, int bits)
819{
820 struct r10conf *conf = mddev->private;
821 int i, ret = 0;
822
823 if ((bits & (1 << WB_async_congested)) &&
824 conf->pending_count >= max_queued_requests)
825 return 1;
826
827 rcu_read_lock();
828 for (i = 0;
829 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
830 && ret == 0;
831 i++) {
832 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
833 if (rdev && !test_bit(Faulty, &rdev->flags)) {
834 struct request_queue *q = bdev_get_queue(rdev->bdev);
835
836 ret |= bdi_congested(&q->backing_dev_info, bits);
837 }
838 }
839 rcu_read_unlock();
840 return ret;
841}
842
843static void flush_pending_writes(struct r10conf *conf)
844{
845
846
847
848 spin_lock_irq(&conf->device_lock);
849
850 if (conf->pending_bio_list.head) {
851 struct bio *bio;
852 bio = bio_list_get(&conf->pending_bio_list);
853 conf->pending_count = 0;
854 spin_unlock_irq(&conf->device_lock);
855
856
857 bitmap_unplug(conf->mddev->bitmap);
858 wake_up(&conf->wait_barrier);
859
860 while (bio) {
861 struct bio *next = bio->bi_next;
862 bio->bi_next = NULL;
863 if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
864 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
865
866 bio_endio(bio);
867 else
868 generic_make_request(bio);
869 bio = next;
870 }
871 } else
872 spin_unlock_irq(&conf->device_lock);
873}
874
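/*
 * Barriers....
 * Sometimes we need to suspend IO while we do something else, either some
 * resync/recovery or a reshape.  To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times to count how
 * many activities are happening which preclude normal IO.
 * We can only raise the barrier if there is no pending IO, and we only
 * raise it if no-one is waiting for the barrier to go down; this means that
 * as soon as an IO request is ready, no other operation which requires a
 * barrier will start until that IO has had a chance.
 *
 * So: regular IO calls wait_barrier() before starting and allow_barrier()
 * when it finishes; resync/recovery/reshape call raise_barrier() and
 * lower_barrier() around each window of work; freeze_array() and
 * unfreeze_array() quiesce the array completely while errors are handled.
 */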
897static void raise_barrier(struct r10conf *conf, int force)
898{
899 BUG_ON(force && !conf->barrier);
900 spin_lock_irq(&conf->resync_lock);
901
902
903 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
904 conf->resync_lock);
905
906
907 conf->barrier++;
908
909
910 wait_event_lock_irq(conf->wait_barrier,
911 !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
912 conf->resync_lock);
913
914 spin_unlock_irq(&conf->resync_lock);
915}
916
917static void lower_barrier(struct r10conf *conf)
918{
919 unsigned long flags;
920 spin_lock_irqsave(&conf->resync_lock, flags);
921 conf->barrier--;
922 spin_unlock_irqrestore(&conf->resync_lock, flags);
923 wake_up(&conf->wait_barrier);
924}
925
926static void wait_barrier(struct r10conf *conf)
927{
928 spin_lock_irq(&conf->resync_lock);
929 if (conf->barrier) {
930 conf->nr_waiting++;
931
932
933
934
935
936
937
938
939
940 wait_event_lock_irq(conf->wait_barrier,
941 !conf->barrier ||
942 (atomic_read(&conf->nr_pending) &&
943 current->bio_list &&
944 !bio_list_empty(current->bio_list)),
945 conf->resync_lock);
946 conf->nr_waiting--;
947 if (!conf->nr_waiting)
948 wake_up(&conf->wait_barrier);
949 }
950 atomic_inc(&conf->nr_pending);
951 spin_unlock_irq(&conf->resync_lock);
952}
953
954static void allow_barrier(struct r10conf *conf)
955{
956 if ((atomic_dec_and_test(&conf->nr_pending)) ||
957 (conf->array_freeze_pending))
958 wake_up(&conf->wait_barrier);
959}
960
961static void freeze_array(struct r10conf *conf, int extra)
962{
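	/* Stop normal and sync IO and wait for everything to go quiet.
	 * We increment barrier and nr_waiting, and then wait until
	 * nr_pending matches nr_queued+extra.
	 * This is called in the context of one normal IO request that has
	 * failed, so any sync request that might be pending will be blocked
	 * by nr_pending; we must wait for the pending requests to complete
	 * or be queued for retry, i.e. until the number queued (nr_queued)
	 * plus this request (extra) matches the number of pending IOs
	 * (nr_pending).
	 */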
975 spin_lock_irq(&conf->resync_lock);
976 conf->array_freeze_pending++;
977 conf->barrier++;
978 conf->nr_waiting++;
979 wait_event_lock_irq_cmd(conf->wait_barrier,
980 atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
981 conf->resync_lock,
982 flush_pending_writes(conf));
983
984 conf->array_freeze_pending--;
985 spin_unlock_irq(&conf->resync_lock);
986}
987
988static void unfreeze_array(struct r10conf *conf)
989{
990
991 spin_lock_irq(&conf->resync_lock);
992 conf->barrier--;
993 conf->nr_waiting--;
994 wake_up(&conf->wait_barrier);
995 spin_unlock_irq(&conf->resync_lock);
996}
997
998static sector_t choose_data_offset(struct r10bio *r10_bio,
999 struct md_rdev *rdev)
1000{
1001 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1002 test_bit(R10BIO_Previous, &r10_bio->state))
1003 return rdev->data_offset;
1004 else
1005 return rdev->new_data_offset;
1006}
1007
1008struct raid10_plug_cb {
1009 struct blk_plug_cb cb;
1010 struct bio_list pending;
1011 int pending_cnt;
1012};
1013
1014static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1015{
1016 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1017 cb);
1018 struct mddev *mddev = plug->cb.data;
1019 struct r10conf *conf = mddev->private;
1020 struct bio *bio;
1021
1022 if (from_schedule || current->bio_list) {
1023 spin_lock_irq(&conf->device_lock);
1024 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1025 conf->pending_count += plug->pending_cnt;
1026 spin_unlock_irq(&conf->device_lock);
1027 wake_up(&conf->wait_barrier);
1028 md_wakeup_thread(mddev->thread);
1029 kfree(plug);
1030 return;
1031 }
1032
1033
1034 bio = bio_list_get(&plug->pending);
1035 bitmap_unplug(mddev->bitmap);
1036 wake_up(&conf->wait_barrier);
1037
1038 while (bio) {
1039 struct bio *next = bio->bi_next;
1040 bio->bi_next = NULL;
1041 if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1042 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
1043
1044 bio_endio(bio);
1045 else
1046 generic_make_request(bio);
1047 bio = next;
1048 }
1049 kfree(plug);
1050}
1051
1052static void __make_request(struct mddev *mddev, struct bio *bio)
1053{
1054 struct r10conf *conf = mddev->private;
1055 struct r10bio *r10_bio;
1056 struct bio *read_bio;
1057 int i;
1058 const int op = bio_op(bio);
1059 const int rw = bio_data_dir(bio);
1060 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1061 const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1062 unsigned long flags;
1063 struct md_rdev *blocked_rdev;
1064 struct blk_plug_cb *cb;
1065 struct raid10_plug_cb *plug = NULL;
1066 int sectors_handled;
1067 int max_sectors;
1068 int sectors;
1069
1070 md_write_start(mddev, bio);
1071
1072
1073
1074
1075
1076
1077 wait_barrier(conf);
1078
1079 sectors = bio_sectors(bio);
1080 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1081 bio->bi_iter.bi_sector < conf->reshape_progress &&
1082 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1083
1084
1085
1086 allow_barrier(conf);
1087 wait_event(conf->wait_barrier,
1088 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1089 conf->reshape_progress >= bio->bi_iter.bi_sector +
1090 sectors);
1091 wait_barrier(conf);
1092 }
1093 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1094 bio_data_dir(bio) == WRITE &&
1095 (mddev->reshape_backwards
1096 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1097 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1098 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1099 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1100
1101 mddev->reshape_position = conf->reshape_progress;
1102 set_mask_bits(&mddev->flags, 0,
1103 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
1104 md_wakeup_thread(mddev->thread);
1105 wait_event(mddev->sb_wait,
1106 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1107
1108 conf->reshape_safe = mddev->reshape_position;
1109 }
1110
1111 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1112
1113 r10_bio->master_bio = bio;
1114 r10_bio->sectors = sectors;
1115
1116 r10_bio->mddev = mddev;
1117 r10_bio->sector = bio->bi_iter.bi_sector;
1118 r10_bio->state = 0;
1119
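	/* We might need to issue multiple reads to different devices if
	 * there are bad blocks around, so we keep track of the number of
	 * pending r10_bios in bio->bi_phys_segments.  If this is 0, there
	 * is only one r10_bio and no locking is needed when the request
	 * completes; otherwise it counts the not-yet-completed r10_bios and
	 * is updated under device_lock.
	 */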
1127 bio->bi_phys_segments = 0;
1128 bio_clear_flag(bio, BIO_SEG_VALID);
1129
1130 if (rw == READ) {
1131
1132
1133
1134 struct md_rdev *rdev;
1135 int slot;
1136
1137read_again:
1138 rdev = read_balance(conf, r10_bio, &max_sectors);
1139 if (!rdev) {
1140 raid_end_bio_io(r10_bio);
1141 return;
1142 }
1143 slot = r10_bio->read_slot;
1144
1145 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1146 bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
1147 max_sectors);
1148
1149 r10_bio->devs[slot].bio = read_bio;
1150 r10_bio->devs[slot].rdev = rdev;
1151
1152 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1153 choose_data_offset(r10_bio, rdev);
1154 read_bio->bi_bdev = rdev->bdev;
1155 read_bio->bi_end_io = raid10_end_read_request;
1156 bio_set_op_attrs(read_bio, op, do_sync);
1157 read_bio->bi_private = r10_bio;
1158
1159 if (max_sectors < r10_bio->sectors) {
1160
1161
1162
1163 sectors_handled = (r10_bio->sector + max_sectors
1164 - bio->bi_iter.bi_sector);
1165 r10_bio->sectors = max_sectors;
1166 spin_lock_irq(&conf->device_lock);
1167 if (bio->bi_phys_segments == 0)
1168 bio->bi_phys_segments = 2;
1169 else
1170 bio->bi_phys_segments++;
1171 spin_unlock_irq(&conf->device_lock);
1172
1173
1174
1175
1176
1177 reschedule_retry(r10_bio);
1178
1179 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1180
1181 r10_bio->master_bio = bio;
1182 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1183 r10_bio->state = 0;
1184 r10_bio->mddev = mddev;
1185 r10_bio->sector = bio->bi_iter.bi_sector +
1186 sectors_handled;
1187 goto read_again;
1188 } else
1189 generic_make_request(read_bio);
1190 return;
1191 }
1192
1193
1194
1195
1196 if (conf->pending_count >= max_queued_requests) {
1197 md_wakeup_thread(mddev->thread);
1198 wait_event(conf->wait_barrier,
1199 conf->pending_count < max_queued_requests);
1200 }
1201
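	/* First select the target devices under rcu_read_lock() and take a
	 * reference on each rdev, recording them by setting devs[i].bio /
	 * repl_bio.  If a device has previously seen a write error and has
	 * known bad blocks in the range, we avoid writing to those blocks;
	 * this may require several passes, each covered by its own r10_bio,
	 * with the count kept in bio->bi_phys_segments just as for reads.
	 */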
1213 r10_bio->read_slot = -1;
1214 raid10_find_phys(conf, r10_bio);
1215retry_write:
1216 blocked_rdev = NULL;
1217 rcu_read_lock();
1218 max_sectors = r10_bio->sectors;
1219
1220 for (i = 0; i < conf->copies; i++) {
1221 int d = r10_bio->devs[i].devnum;
1222 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1223 struct md_rdev *rrdev = rcu_dereference(
1224 conf->mirrors[d].replacement);
1225 if (rdev == rrdev)
1226 rrdev = NULL;
1227 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1228 atomic_inc(&rdev->nr_pending);
1229 blocked_rdev = rdev;
1230 break;
1231 }
1232 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1233 atomic_inc(&rrdev->nr_pending);
1234 blocked_rdev = rrdev;
1235 break;
1236 }
1237 if (rdev && (test_bit(Faulty, &rdev->flags)))
1238 rdev = NULL;
1239 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1240 rrdev = NULL;
1241
1242 r10_bio->devs[i].bio = NULL;
1243 r10_bio->devs[i].repl_bio = NULL;
1244
1245 if (!rdev && !rrdev) {
1246 set_bit(R10BIO_Degraded, &r10_bio->state);
1247 continue;
1248 }
1249 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1250 sector_t first_bad;
1251 sector_t dev_sector = r10_bio->devs[i].addr;
1252 int bad_sectors;
1253 int is_bad;
1254
1255 is_bad = is_badblock(rdev, dev_sector,
1256 max_sectors,
1257 &first_bad, &bad_sectors);
1258 if (is_bad < 0) {
1259
1260
1261
1262 atomic_inc(&rdev->nr_pending);
1263 set_bit(BlockedBadBlocks, &rdev->flags);
1264 blocked_rdev = rdev;
1265 break;
1266 }
1267 if (is_bad && first_bad <= dev_sector) {
1268
1269 bad_sectors -= (dev_sector - first_bad);
1270 if (bad_sectors < max_sectors)
1271
1272
1273
1274 max_sectors = bad_sectors;
1275
1276
1277
1278
1279
1280
1281
1282
1283 continue;
1284 }
1285 if (is_bad) {
1286 int good_sectors = first_bad - dev_sector;
1287 if (good_sectors < max_sectors)
1288 max_sectors = good_sectors;
1289 }
1290 }
1291 if (rdev) {
1292 r10_bio->devs[i].bio = bio;
1293 atomic_inc(&rdev->nr_pending);
1294 }
1295 if (rrdev) {
1296 r10_bio->devs[i].repl_bio = bio;
1297 atomic_inc(&rrdev->nr_pending);
1298 }
1299 }
1300 rcu_read_unlock();
1301
1302 if (unlikely(blocked_rdev)) {
1303
1304 int j;
1305 int d;
1306
1307 for (j = 0; j < i; j++) {
1308 if (r10_bio->devs[j].bio) {
1309 d = r10_bio->devs[j].devnum;
1310 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1311 }
1312 if (r10_bio->devs[j].repl_bio) {
1313 struct md_rdev *rdev;
1314 d = r10_bio->devs[j].devnum;
1315 rdev = conf->mirrors[d].replacement;
1316 if (!rdev) {
1317
1318 smp_mb();
1319 rdev = conf->mirrors[d].rdev;
1320 }
1321 rdev_dec_pending(rdev, mddev);
1322 }
1323 }
1324 allow_barrier(conf);
1325 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1326 wait_barrier(conf);
1327 goto retry_write;
1328 }
1329
1330 if (max_sectors < r10_bio->sectors) {
1331
1332
1333
1334 r10_bio->sectors = max_sectors;
1335 spin_lock_irq(&conf->device_lock);
1336 if (bio->bi_phys_segments == 0)
1337 bio->bi_phys_segments = 2;
1338 else
1339 bio->bi_phys_segments++;
1340 spin_unlock_irq(&conf->device_lock);
1341 }
1342 sectors_handled = r10_bio->sector + max_sectors -
1343 bio->bi_iter.bi_sector;
1344
1345 atomic_set(&r10_bio->remaining, 1);
1346 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1347
1348 for (i = 0; i < conf->copies; i++) {
1349 struct bio *mbio;
1350 int d = r10_bio->devs[i].devnum;
1351 if (r10_bio->devs[i].bio) {
1352 struct md_rdev *rdev = conf->mirrors[d].rdev;
1353 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1354 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1355 max_sectors);
1356 r10_bio->devs[i].bio = mbio;
1357
1358 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
1359 choose_data_offset(r10_bio,
1360 rdev));
1361 mbio->bi_bdev = rdev->bdev;
1362 mbio->bi_end_io = raid10_end_write_request;
1363 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1364 mbio->bi_private = r10_bio;
1365
1366 atomic_inc(&r10_bio->remaining);
1367
1368 cb = blk_check_plugged(raid10_unplug, mddev,
1369 sizeof(*plug));
1370 if (cb)
1371 plug = container_of(cb, struct raid10_plug_cb,
1372 cb);
1373 else
1374 plug = NULL;
1375 spin_lock_irqsave(&conf->device_lock, flags);
1376 if (plug) {
1377 bio_list_add(&plug->pending, mbio);
1378 plug->pending_cnt++;
1379 } else {
1380 bio_list_add(&conf->pending_bio_list, mbio);
1381 conf->pending_count++;
1382 }
1383 spin_unlock_irqrestore(&conf->device_lock, flags);
1384 if (!plug)
1385 md_wakeup_thread(mddev->thread);
1386 }
1387
1388 if (r10_bio->devs[i].repl_bio) {
1389 struct md_rdev *rdev = conf->mirrors[d].replacement;
1390 if (rdev == NULL) {
1391
1392 smp_mb();
1393 rdev = conf->mirrors[d].rdev;
1394 }
1395 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1396 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1397 max_sectors);
1398 r10_bio->devs[i].repl_bio = mbio;
1399
1400 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
1401 choose_data_offset(
1402 r10_bio, rdev));
1403 mbio->bi_bdev = rdev->bdev;
1404 mbio->bi_end_io = raid10_end_write_request;
1405 bio_set_op_attrs(mbio, op, do_sync | do_fua);
1406 mbio->bi_private = r10_bio;
1407
1408 atomic_inc(&r10_bio->remaining);
1409 spin_lock_irqsave(&conf->device_lock, flags);
1410 bio_list_add(&conf->pending_bio_list, mbio);
1411 conf->pending_count++;
1412 spin_unlock_irqrestore(&conf->device_lock, flags);
1413 if (!mddev_check_plugged(mddev))
1414 md_wakeup_thread(mddev->thread);
1415 }
1416 }
1417
1418
1419
1420
1421
1422 if (sectors_handled < bio_sectors(bio)) {
1423 one_write_done(r10_bio);
1424
1425
1426
1427 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1428
1429 r10_bio->master_bio = bio;
1430 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1431
1432 r10_bio->mddev = mddev;
1433 r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1434 r10_bio->state = 0;
1435 goto retry_write;
1436 }
1437 one_write_done(r10_bio);
1438}
1439
1440static void raid10_make_request(struct mddev *mddev, struct bio *bio)
1441{
1442 struct r10conf *conf = mddev->private;
1443 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1444 int chunk_sects = chunk_mask + 1;
1445
1446 struct bio *split;
1447
1448 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1449 md_flush_request(mddev, bio);
1450 return;
1451 }
1452
1453 do {
1454
1455
1456
1457
1458
1459 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1460 bio_sectors(bio) > chunk_sects
1461 && (conf->geo.near_copies < conf->geo.raid_disks
1462 || conf->prev.near_copies <
1463 conf->prev.raid_disks))) {
1464 split = bio_split(bio, chunk_sects -
1465 (bio->bi_iter.bi_sector &
1466 (chunk_sects - 1)),
1467 GFP_NOIO, fs_bio_set);
1468 bio_chain(split, bio);
1469 } else {
1470 split = bio;
1471 }
1472
1473 __make_request(mddev, split);
1474 } while (split != bio);
1475
1476
1477 wake_up(&conf->wait_barrier);
1478}
1479
1480static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1481{
1482 struct r10conf *conf = mddev->private;
1483 int i;
1484
1485 if (conf->geo.near_copies < conf->geo.raid_disks)
1486 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1487 if (conf->geo.near_copies > 1)
1488 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1489 if (conf->geo.far_copies > 1) {
1490 if (conf->geo.far_offset)
1491 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1492 else
1493 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1494 if (conf->geo.far_set_size != conf->geo.raid_disks)
1495 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1496 }
1497 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1498 conf->geo.raid_disks - mddev->degraded);
1499 rcu_read_lock();
1500 for (i = 0; i < conf->geo.raid_disks; i++) {
1501 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1502 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1503 }
1504 rcu_read_unlock();
1505 seq_printf(seq, "]");
1506}
1507
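/*
 * Check whether there are enough working devices for every block to be
 * available on at least one of them.  The device numbered 'ignore' is not
 * counted, as we may be about to remove or fail it.
 */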
1513static int _enough(struct r10conf *conf, int previous, int ignore)
1514{
1515 int first = 0;
1516 int has_enough = 0;
1517 int disks, ncopies;
1518 if (previous) {
1519 disks = conf->prev.raid_disks;
1520 ncopies = conf->prev.near_copies;
1521 } else {
1522 disks = conf->geo.raid_disks;
1523 ncopies = conf->geo.near_copies;
1524 }
1525
1526 rcu_read_lock();
1527 do {
1528 int n = conf->copies;
1529 int cnt = 0;
1530 int this = first;
1531 while (n--) {
1532 struct md_rdev *rdev;
1533 if (this != ignore &&
1534 (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1535 test_bit(In_sync, &rdev->flags))
1536 cnt++;
1537 this = (this+1) % disks;
1538 }
1539 if (cnt == 0)
1540 goto out;
1541 first = (first + ncopies) % disks;
1542 } while (first != 0);
1543 has_enough = 1;
1544out:
1545 rcu_read_unlock();
1546 return has_enough;
1547}
1548
1549static int enough(struct r10conf *conf, int ignore)
1550{
1551
1552
1553
1554
1555
1556 return _enough(conf, 0, ignore) &&
1557 _enough(conf, 1, ignore);
1558}
1559
1560static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1561{
1562 char b[BDEVNAME_SIZE];
1563 struct r10conf *conf = mddev->private;
1564 unsigned long flags;
1565
1566
1567
1568
1569
1570
1571
1572 spin_lock_irqsave(&conf->device_lock, flags);
1573 if (test_bit(In_sync, &rdev->flags)
1574 && !enough(conf, rdev->raid_disk)) {
1575
1576
1577
1578 spin_unlock_irqrestore(&conf->device_lock, flags);
1579 return;
1580 }
1581 if (test_and_clear_bit(In_sync, &rdev->flags))
1582 mddev->degraded++;
1583
1584
1585
1586 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1587 set_bit(Blocked, &rdev->flags);
1588 set_bit(Faulty, &rdev->flags);
1589 set_mask_bits(&mddev->flags, 0,
1590 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
1591 spin_unlock_irqrestore(&conf->device_lock, flags);
1592 printk(KERN_ALERT
1593 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1594 "md/raid10:%s: Operation continuing on %d devices.\n",
1595 mdname(mddev), bdevname(rdev->bdev, b),
1596 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1597}
1598
1599static void print_conf(struct r10conf *conf)
1600{
1601 int i;
1602 struct md_rdev *rdev;
1603
1604 printk(KERN_DEBUG "RAID10 conf printout:\n");
1605 if (!conf) {
1606 printk(KERN_DEBUG "(!conf)\n");
1607 return;
1608 }
1609 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1610 conf->geo.raid_disks);
1611
1612
1613
1614 for (i = 0; i < conf->geo.raid_disks; i++) {
1615 char b[BDEVNAME_SIZE];
1616 rdev = conf->mirrors[i].rdev;
1617 if (rdev)
1618 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1619 i, !test_bit(In_sync, &rdev->flags),
1620 !test_bit(Faulty, &rdev->flags),
1621 bdevname(rdev->bdev,b));
1622 }
1623}
1624
1625static void close_sync(struct r10conf *conf)
1626{
1627 wait_barrier(conf);
1628 allow_barrier(conf);
1629
1630 mempool_destroy(conf->r10buf_pool);
1631 conf->r10buf_pool = NULL;
1632}
1633
1634static int raid10_spare_active(struct mddev *mddev)
1635{
1636 int i;
1637 struct r10conf *conf = mddev->private;
1638 struct raid10_info *tmp;
1639 int count = 0;
1640 unsigned long flags;
1641
1642
1643
1644
1645
1646 for (i = 0; i < conf->geo.raid_disks; i++) {
1647 tmp = conf->mirrors + i;
1648 if (tmp->replacement
1649 && tmp->replacement->recovery_offset == MaxSector
1650 && !test_bit(Faulty, &tmp->replacement->flags)
1651 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1652
1653 if (!tmp->rdev
1654 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1655 count++;
1656 if (tmp->rdev) {
1657
1658
1659
1660
1661 set_bit(Faulty, &tmp->rdev->flags);
1662 sysfs_notify_dirent_safe(
1663 tmp->rdev->sysfs_state);
1664 }
1665 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1666 } else if (tmp->rdev
1667 && tmp->rdev->recovery_offset == MaxSector
1668 && !test_bit(Faulty, &tmp->rdev->flags)
1669 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1670 count++;
1671 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1672 }
1673 }
1674 spin_lock_irqsave(&conf->device_lock, flags);
1675 mddev->degraded -= count;
1676 spin_unlock_irqrestore(&conf->device_lock, flags);
1677
1678 print_conf(conf);
1679 return count;
1680}
1681
1682static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1683{
1684 struct r10conf *conf = mddev->private;
1685 int err = -EEXIST;
1686 int mirror;
1687 int first = 0;
1688 int last = conf->geo.raid_disks - 1;
1689
1690 if (mddev->recovery_cp < MaxSector)
1691
1692
1693
1694 return -EBUSY;
1695 if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1696 return -EINVAL;
1697
1698 if (md_integrity_add_rdev(rdev, mddev))
1699 return -ENXIO;
1700
1701 if (rdev->raid_disk >= 0)
1702 first = last = rdev->raid_disk;
1703
1704 if (rdev->saved_raid_disk >= first &&
1705 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1706 mirror = rdev->saved_raid_disk;
1707 else
1708 mirror = first;
1709 for ( ; mirror <= last ; mirror++) {
1710 struct raid10_info *p = &conf->mirrors[mirror];
1711 if (p->recovery_disabled == mddev->recovery_disabled)
1712 continue;
1713 if (p->rdev) {
1714 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1715 p->replacement != NULL)
1716 continue;
1717 clear_bit(In_sync, &rdev->flags);
1718 set_bit(Replacement, &rdev->flags);
1719 rdev->raid_disk = mirror;
1720 err = 0;
1721 if (mddev->gendisk)
1722 disk_stack_limits(mddev->gendisk, rdev->bdev,
1723 rdev->data_offset << 9);
1724 conf->fullsync = 1;
1725 rcu_assign_pointer(p->replacement, rdev);
1726 break;
1727 }
1728
1729 if (mddev->gendisk)
1730 disk_stack_limits(mddev->gendisk, rdev->bdev,
1731 rdev->data_offset << 9);
1732
1733 p->head_position = 0;
1734 p->recovery_disabled = mddev->recovery_disabled - 1;
1735 rdev->raid_disk = mirror;
1736 err = 0;
1737 if (rdev->saved_raid_disk != mirror)
1738 conf->fullsync = 1;
1739 rcu_assign_pointer(p->rdev, rdev);
1740 break;
1741 }
1742 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1743 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1744
1745 print_conf(conf);
1746 return err;
1747}
1748
1749static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1750{
1751 struct r10conf *conf = mddev->private;
1752 int err = 0;
1753 int number = rdev->raid_disk;
1754 struct md_rdev **rdevp;
1755 struct raid10_info *p = conf->mirrors + number;
1756
1757 print_conf(conf);
1758 if (rdev == p->rdev)
1759 rdevp = &p->rdev;
1760 else if (rdev == p->replacement)
1761 rdevp = &p->replacement;
1762 else
1763 return 0;
1764
1765 if (test_bit(In_sync, &rdev->flags) ||
1766 atomic_read(&rdev->nr_pending)) {
1767 err = -EBUSY;
1768 goto abort;
1769 }
1770
1771
1772
1773 if (!test_bit(Faulty, &rdev->flags) &&
1774 mddev->recovery_disabled != p->recovery_disabled &&
1775 (!p->replacement || p->replacement == rdev) &&
1776 number < conf->geo.raid_disks &&
1777 enough(conf, -1)) {
1778 err = -EBUSY;
1779 goto abort;
1780 }
1781 *rdevp = NULL;
1782 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1783 synchronize_rcu();
1784 if (atomic_read(&rdev->nr_pending)) {
1785
1786 err = -EBUSY;
1787 *rdevp = rdev;
1788 goto abort;
1789 }
1790 }
1791 if (p->replacement) {
1792
1793 p->rdev = p->replacement;
1794 clear_bit(Replacement, &p->replacement->flags);
1795 smp_mb();
1796
1797
1798 p->replacement = NULL;
1799 clear_bit(WantReplacement, &rdev->flags);
1800 } else
1801
1802
1803
1804 clear_bit(WantReplacement, &rdev->flags);
1805
1806 err = md_integrity_register(mddev);
1807
1808abort:
1809
1810 print_conf(conf);
1811 return err;
1812}
1813
1814static void end_sync_read(struct bio *bio)
1815{
1816 struct r10bio *r10_bio = bio->bi_private;
1817 struct r10conf *conf = r10_bio->mddev->private;
1818 int d;
1819
1820 if (bio == r10_bio->master_bio) {
1821
1822 d = r10_bio->read_slot;
1823 } else
1824 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1825
1826 if (!bio->bi_error)
1827 set_bit(R10BIO_Uptodate, &r10_bio->state);
1828 else
1829
1830
1831
1832 atomic_add(r10_bio->sectors,
1833 &conf->mirrors[d].rdev->corrected_errors);
1834
1835
1836
1837
1838 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1839 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1840 atomic_dec_and_test(&r10_bio->remaining)) {
1841
1842
1843
1844 reschedule_retry(r10_bio);
1845 }
1846}
1847
1848static void end_sync_request(struct r10bio *r10_bio)
1849{
1850 struct mddev *mddev = r10_bio->mddev;
1851
1852 while (atomic_dec_and_test(&r10_bio->remaining)) {
1853 if (r10_bio->master_bio == NULL) {
1854
1855 sector_t s = r10_bio->sectors;
1856 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1857 test_bit(R10BIO_WriteError, &r10_bio->state))
1858 reschedule_retry(r10_bio);
1859 else
1860 put_buf(r10_bio);
1861 md_done_sync(mddev, s, 1);
1862 break;
1863 } else {
1864 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1865 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1866 test_bit(R10BIO_WriteError, &r10_bio->state))
1867 reschedule_retry(r10_bio);
1868 else
1869 put_buf(r10_bio);
1870 r10_bio = r10_bio2;
1871 }
1872 }
1873}
1874
1875static void end_sync_write(struct bio *bio)
1876{
1877 struct r10bio *r10_bio = bio->bi_private;
1878 struct mddev *mddev = r10_bio->mddev;
1879 struct r10conf *conf = mddev->private;
1880 int d;
1881 sector_t first_bad;
1882 int bad_sectors;
1883 int slot;
1884 int repl;
1885 struct md_rdev *rdev = NULL;
1886
1887 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1888 if (repl)
1889 rdev = conf->mirrors[d].replacement;
1890 else
1891 rdev = conf->mirrors[d].rdev;
1892
1893 if (bio->bi_error) {
1894 if (repl)
1895 md_error(mddev, rdev);
1896 else {
1897 set_bit(WriteErrorSeen, &rdev->flags);
1898 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1899 set_bit(MD_RECOVERY_NEEDED,
1900 &rdev->mddev->recovery);
1901 set_bit(R10BIO_WriteError, &r10_bio->state);
1902 }
1903 } else if (is_badblock(rdev,
1904 r10_bio->devs[slot].addr,
1905 r10_bio->sectors,
1906 &first_bad, &bad_sectors))
1907 set_bit(R10BIO_MadeGood, &r10_bio->state);
1908
1909 rdev_dec_pending(rdev, mddev);
1910
1911 end_sync_request(r10_bio);
1912}
1913
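/*
 * Note: sync and recovery are handled very differently for raid10.
 *
 * For resync we read the same virtual block from every copy, compare the
 * copies, and re-write any copy that differs from (or could not be read to
 * compare with) the first good copy.  That is what sync_request_write()
 * does.
 *
 * For recovery we read a block from one working device and write it to the
 * device(s) being recovered; see recovery_request_write() below.
 */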
1930static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1931{
1932 struct r10conf *conf = mddev->private;
1933 int i, first;
1934 struct bio *tbio, *fbio;
1935 int vcnt;
1936
1937 atomic_set(&r10_bio->remaining, 1);
1938
1939
1940 for (i=0; i<conf->copies; i++)
1941 if (!r10_bio->devs[i].bio->bi_error)
1942 break;
1943
1944 if (i == conf->copies)
1945 goto done;
1946
1947 first = i;
1948 fbio = r10_bio->devs[i].bio;
1949 fbio->bi_iter.bi_size = r10_bio->sectors << 9;
1950 fbio->bi_iter.bi_idx = 0;
1951
1952 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
1953
1954 for (i=0 ; i < conf->copies ; i++) {
1955 int j, d;
1956
1957 tbio = r10_bio->devs[i].bio;
1958
1959 if (tbio->bi_end_io != end_sync_read)
1960 continue;
1961 if (i == first)
1962 continue;
1963 if (!r10_bio->devs[i].bio->bi_error) {
1964
1965
1966
1967
1968 int sectors = r10_bio->sectors;
1969 for (j = 0; j < vcnt; j++) {
1970 int len = PAGE_SIZE;
1971 if (sectors < (len / 512))
1972 len = sectors * 512;
1973 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1974 page_address(tbio->bi_io_vec[j].bv_page),
1975 len))
1976 break;
1977 sectors -= len/512;
1978 }
1979 if (j == vcnt)
1980 continue;
1981 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
1982 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1983
1984 continue;
1985 }
1986
1987
1988
1989
1990
1991 bio_reset(tbio);
1992
1993 tbio->bi_vcnt = vcnt;
1994 tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
1995 tbio->bi_private = r10_bio;
1996 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
1997 tbio->bi_end_io = end_sync_write;
1998 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
1999
2000 bio_copy_data(tbio, fbio);
2001
2002 d = r10_bio->devs[i].devnum;
2003 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2004 atomic_inc(&r10_bio->remaining);
2005 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2006
2007 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2008 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2009 generic_make_request(tbio);
2010 }
2011
2012
2013
2014
2015 for (i = 0; i < conf->copies; i++) {
2016 int d;
2017
2018 tbio = r10_bio->devs[i].repl_bio;
2019 if (!tbio || !tbio->bi_end_io)
2020 continue;
2021 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2022 && r10_bio->devs[i].bio != fbio)
2023 bio_copy_data(tbio, fbio);
2024 d = r10_bio->devs[i].devnum;
2025 atomic_inc(&r10_bio->remaining);
2026 md_sync_acct(conf->mirrors[d].replacement->bdev,
2027 bio_sectors(tbio));
2028 generic_make_request(tbio);
2029 }
2030
2031done:
2032 if (atomic_dec_and_test(&r10_bio->remaining)) {
2033 md_done_sync(mddev, r10_bio->sectors, 1);
2034 put_buf(r10_bio);
2035 }
2036}
2037
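/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 *
 * We recover all non-in_sync drives by finding the virtual address of each,
 * and then choosing a working drive that also has that virtual address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use: the first for reading, the second
 * for writing.
 */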
2048static void fix_recovery_read_error(struct r10bio *r10_bio)
2049{
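	/* We got a read error during recovery.
	 * We repeat the read in smaller page-sized chunks.
	 * If a read succeeds, write it to the new device or record a bad
	 * block if we cannot.
	 * If a read fails, record a bad block on both the old and the new
	 * devices.
	 */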
2057 struct mddev *mddev = r10_bio->mddev;
2058 struct r10conf *conf = mddev->private;
2059 struct bio *bio = r10_bio->devs[0].bio;
2060 sector_t sect = 0;
2061 int sectors = r10_bio->sectors;
2062 int idx = 0;
2063 int dr = r10_bio->devs[0].devnum;
2064 int dw = r10_bio->devs[1].devnum;
2065
2066 while (sectors) {
2067 int s = sectors;
2068 struct md_rdev *rdev;
2069 sector_t addr;
2070 int ok;
2071
2072 if (s > (PAGE_SIZE>>9))
2073 s = PAGE_SIZE >> 9;
2074
2075 rdev = conf->mirrors[dr].rdev;
2076 addr = r10_bio->devs[0].addr + sect;
2077 ok = sync_page_io(rdev,
2078 addr,
2079 s << 9,
2080 bio->bi_io_vec[idx].bv_page,
2081 REQ_OP_READ, 0, false);
2082 if (ok) {
2083 rdev = conf->mirrors[dw].rdev;
2084 addr = r10_bio->devs[1].addr + sect;
2085 ok = sync_page_io(rdev,
2086 addr,
2087 s << 9,
2088 bio->bi_io_vec[idx].bv_page,
2089 REQ_OP_WRITE, 0, false);
2090 if (!ok) {
2091 set_bit(WriteErrorSeen, &rdev->flags);
2092 if (!test_and_set_bit(WantReplacement,
2093 &rdev->flags))
2094 set_bit(MD_RECOVERY_NEEDED,
2095 &rdev->mddev->recovery);
2096 }
2097 }
2098 if (!ok) {
2099
2100
2101
2102
2103 rdev_set_badblocks(rdev, addr, s, 0);
2104
2105 if (rdev != conf->mirrors[dw].rdev) {
2106
2107 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2108 addr = r10_bio->devs[1].addr + sect;
2109 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2110 if (!ok) {
2111
2112 printk(KERN_NOTICE
2113 "md/raid10:%s: recovery aborted"
2114 " due to read error\n",
2115 mdname(mddev));
2116
2117 conf->mirrors[dw].recovery_disabled
2118 = mddev->recovery_disabled;
2119 set_bit(MD_RECOVERY_INTR,
2120 &mddev->recovery);
2121 break;
2122 }
2123 }
2124 }
2125
2126 sectors -= s;
2127 sect += s;
2128 idx++;
2129 }
2130}
2131
2132static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2133{
2134 struct r10conf *conf = mddev->private;
2135 int d;
2136 struct bio *wbio, *wbio2;
2137
2138 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2139 fix_recovery_read_error(r10_bio);
2140 end_sync_request(r10_bio);
2141 return;
2142 }
2143
2144
2145
2146
2147
2148 d = r10_bio->devs[1].devnum;
2149 wbio = r10_bio->devs[1].bio;
2150 wbio2 = r10_bio->devs[1].repl_bio;
2151
2152
2153
2154
2155 if (wbio2 && !wbio2->bi_end_io)
2156 wbio2 = NULL;
2157 if (wbio->bi_end_io) {
2158 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2159 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2160 generic_make_request(wbio);
2161 }
2162 if (wbio2) {
2163 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2164 md_sync_acct(conf->mirrors[d].replacement->bdev,
2165 bio_sectors(wbio2));
2166 generic_make_request(wbio2);
2167 }
2168}
2169
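/*
 * Used by fix_read_error() to decay the per-rdev read_errors count: the
 * count is halved for every hour that has elapsed since the last recorded
 * read error.
 */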
2176static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2177{
2178 long cur_time_mon;
2179 unsigned long hours_since_last;
2180 unsigned int read_errors = atomic_read(&rdev->read_errors);
2181
2182 cur_time_mon = ktime_get_seconds();
2183
2184 if (rdev->last_read_error == 0) {
2185
2186 rdev->last_read_error = cur_time_mon;
2187 return;
2188 }
2189
2190 hours_since_last = (long)(cur_time_mon -
2191 rdev->last_read_error) / 3600;
2192
2193 rdev->last_read_error = cur_time_mon;
2194
2195
2196
2197
2198
2199
2200 if (hours_since_last >= 8 * sizeof(read_errors))
2201 atomic_set(&rdev->read_errors, 0);
2202 else
2203 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2204}
2205
2206static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2207 int sectors, struct page *page, int rw)
2208{
2209 sector_t first_bad;
2210 int bad_sectors;
2211
2212 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2213 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2214 return -1;
2215 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2216
2217 return 1;
2218 if (rw == WRITE) {
2219 set_bit(WriteErrorSeen, &rdev->flags);
2220 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2221 set_bit(MD_RECOVERY_NEEDED,
2222 &rdev->mddev->recovery);
2223 }
2224
2225 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2226 md_error(rdev->mddev, rdev);
2227 return 0;
2228}
2229
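/*
 * fix_read_error() is called (with the array frozen) when a read has failed
 * on a device that is still in_sync.  It re-reads the failing range in
 * page-sized pieces from another mirror, writes the data back over the
 * other in-sync copies (including the one that returned the error) and
 * reads it back to verify the correction.  A device that cannot be
 * corrected gets a bad-block record or, failing that, is marked Faulty.
 */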
2238static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2239{
2240 int sect = 0;
2241 int sectors = r10_bio->sectors;
2242 struct md_rdev*rdev;
2243 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2244 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2245
2246
2247
2248
2249 rdev = conf->mirrors[d].rdev;
2250
2251 if (test_bit(Faulty, &rdev->flags))
2252
2253
2254 return;
2255
2256 check_decay_read_errors(mddev, rdev);
2257 atomic_inc(&rdev->read_errors);
2258 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2259 char b[BDEVNAME_SIZE];
2260 bdevname(rdev->bdev, b);
2261
2262 printk(KERN_NOTICE
2263 "md/raid10:%s: %s: Raid device exceeded "
2264 "read_error threshold [cur %d:max %d]\n",
2265 mdname(mddev), b,
2266 atomic_read(&rdev->read_errors), max_read_errors);
2267 printk(KERN_NOTICE
2268 "md/raid10:%s: %s: Failing raid device\n",
2269 mdname(mddev), b);
2270 md_error(mddev, rdev);
2271 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2272 return;
2273 }
2274
2275 while(sectors) {
2276 int s = sectors;
2277 int sl = r10_bio->read_slot;
2278 int success = 0;
2279 int start;
2280
2281 if (s > (PAGE_SIZE>>9))
2282 s = PAGE_SIZE >> 9;
2283
2284 rcu_read_lock();
2285 do {
2286 sector_t first_bad;
2287 int bad_sectors;
2288
2289 d = r10_bio->devs[sl].devnum;
2290 rdev = rcu_dereference(conf->mirrors[d].rdev);
2291 if (rdev &&
2292 test_bit(In_sync, &rdev->flags) &&
2293 !test_bit(Faulty, &rdev->flags) &&
2294 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2295 &first_bad, &bad_sectors) == 0) {
2296 atomic_inc(&rdev->nr_pending);
2297 rcu_read_unlock();
2298 success = sync_page_io(rdev,
2299 r10_bio->devs[sl].addr +
2300 sect,
2301 s<<9,
2302 conf->tmppage,
2303 REQ_OP_READ, 0, false);
2304 rdev_dec_pending(rdev, mddev);
2305 rcu_read_lock();
2306 if (success)
2307 break;
2308 }
2309 sl++;
2310 if (sl == conf->copies)
2311 sl = 0;
2312 } while (!success && sl != r10_bio->read_slot);
2313 rcu_read_unlock();
2314
2315 if (!success) {
2316
2317
2318
2319
2320 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2321 rdev = conf->mirrors[dn].rdev;
2322
2323 if (!rdev_set_badblocks(
2324 rdev,
2325 r10_bio->devs[r10_bio->read_slot].addr
2326 + sect,
2327 s, 0)) {
2328 md_error(mddev, rdev);
2329 r10_bio->devs[r10_bio->read_slot].bio
2330 = IO_BLOCKED;
2331 }
2332 break;
2333 }
2334
2335 start = sl;
2336
2337 rcu_read_lock();
2338 while (sl != r10_bio->read_slot) {
2339 char b[BDEVNAME_SIZE];
2340
2341 if (sl==0)
2342 sl = conf->copies;
2343 sl--;
2344 d = r10_bio->devs[sl].devnum;
2345 rdev = rcu_dereference(conf->mirrors[d].rdev);
2346 if (!rdev ||
2347 test_bit(Faulty, &rdev->flags) ||
2348 !test_bit(In_sync, &rdev->flags))
2349 continue;
2350
2351 atomic_inc(&rdev->nr_pending);
2352 rcu_read_unlock();
2353 if (r10_sync_page_io(rdev,
2354 r10_bio->devs[sl].addr +
2355 sect,
2356 s, conf->tmppage, WRITE)
2357 == 0) {
2358
2359 printk(KERN_NOTICE
2360 "md/raid10:%s: read correction "
2361 "write failed"
2362 " (%d sectors at %llu on %s)\n",
2363 mdname(mddev), s,
2364 (unsigned long long)(
2365 sect +
2366 choose_data_offset(r10_bio,
2367 rdev)),
2368 bdevname(rdev->bdev, b));
2369 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2370 "drive\n",
2371 mdname(mddev),
2372 bdevname(rdev->bdev, b));
2373 }
2374 rdev_dec_pending(rdev, mddev);
2375 rcu_read_lock();
2376 }
2377 sl = start;
2378 while (sl != r10_bio->read_slot) {
2379 char b[BDEVNAME_SIZE];
2380
2381 if (sl==0)
2382 sl = conf->copies;
2383 sl--;
2384 d = r10_bio->devs[sl].devnum;
2385 rdev = rcu_dereference(conf->mirrors[d].rdev);
2386 if (!rdev ||
2387 test_bit(Faulty, &rdev->flags) ||
2388 !test_bit(In_sync, &rdev->flags))
2389 continue;
2390
2391 atomic_inc(&rdev->nr_pending);
2392 rcu_read_unlock();
2393 switch (r10_sync_page_io(rdev,
2394 r10_bio->devs[sl].addr +
2395 sect,
2396 s, conf->tmppage,
2397 READ)) {
2398 case 0:
2399
2400 printk(KERN_NOTICE
2401 "md/raid10:%s: unable to read back "
2402 "corrected sectors"
2403 " (%d sectors at %llu on %s)\n",
2404 mdname(mddev), s,
2405 (unsigned long long)(
2406 sect +
2407 choose_data_offset(r10_bio, rdev)),
2408 bdevname(rdev->bdev, b));
2409 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2410 "drive\n",
2411 mdname(mddev),
2412 bdevname(rdev->bdev, b));
2413 break;
2414 case 1:
2415 printk(KERN_INFO
2416 "md/raid10:%s: read error corrected"
2417 " (%d sectors at %llu on %s)\n",
2418 mdname(mddev), s,
2419 (unsigned long long)(
2420 sect +
2421 choose_data_offset(r10_bio, rdev)),
2422 bdevname(rdev->bdev, b));
2423 atomic_add(s, &rdev->corrected_errors);
2424 }
2425
2426 rdev_dec_pending(rdev, mddev);
2427 rcu_read_lock();
2428 }
2429 rcu_read_unlock();
2430
2431 sectors -= s;
2432 sect += s;
2433 }
2434}
2435
2436static int narrow_write_error(struct r10bio *r10_bio, int i)
2437{
2438 struct bio *bio = r10_bio->master_bio;
2439 struct mddev *mddev = r10_bio->mddev;
2440 struct r10conf *conf = mddev->private;
2441 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2442
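	/* 'bio' (the master bio) holds the data that was being written to
	 * slot 'i' when we got a write error.  We repeatedly clone it and
	 * trim the clone down to one badblock-aligned block at a time, then
	 * retry the write.  Any block that still fails is recorded as a bad
	 * block; return 1 if every needed bad block could be recorded,
	 * otherwise 0.
	 */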
2453 int block_sectors;
2454 sector_t sector;
2455 int sectors;
2456 int sect_to_write = r10_bio->sectors;
2457 int ok = 1;
2458
2459 if (rdev->badblocks.shift < 0)
2460 return 0;
2461
2462 block_sectors = roundup(1 << rdev->badblocks.shift,
2463 bdev_logical_block_size(rdev->bdev) >> 9);
2464 sector = r10_bio->sector;
2465 sectors = ((r10_bio->sector + block_sectors)
2466 & ~(sector_t)(block_sectors - 1))
2467 - sector;
2468
2469 while (sect_to_write) {
2470 struct bio *wbio;
2471 sector_t wsector;
2472 if (sectors > sect_to_write)
2473 sectors = sect_to_write;
2474
2475 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2476 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2477 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2478 wbio->bi_iter.bi_sector = wsector +
2479 choose_data_offset(r10_bio, rdev);
2480 wbio->bi_bdev = rdev->bdev;
2481 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2482
2483 if (submit_bio_wait(wbio) < 0)
2484
2485 ok = rdev_set_badblocks(rdev, wsector,
2486 sectors, 0)
2487 && ok;
2488
2489 bio_put(wbio);
2490 sect_to_write -= sectors;
2491 sector += sectors;
2492 sectors = block_sectors;
2493 }
2494 return ok;
2495}
2496
2497static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2498{
2499 int slot = r10_bio->read_slot;
2500 struct bio *bio;
2501 struct r10conf *conf = mddev->private;
2502 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2503 char b[BDEVNAME_SIZE];
2504 unsigned long do_sync;
2505 int max_sectors;
2506
2507 /* We got a read error.  Maybe the drive is bad, maybe just
2508 * this one block: freeze all other IO, try reading the block
2509 * from the other devices and, where that succeeds, re-write
2510 * and re-check the failing device.
2511 * This is all done synchronously while the array is frozen,
2512 * and then the read is retried on (possibly) another mirror.
2513 */
2514
2515 bio = r10_bio->devs[slot].bio;
2516 bdevname(bio->bi_bdev, b);
2517 bio_put(bio);
2518 r10_bio->devs[slot].bio = NULL;
2519
2520 if (mddev->ro == 0) {
2521 freeze_array(conf, 1);
2522 fix_read_error(conf, mddev, r10_bio);
2523 unfreeze_array(conf);
2524 } else
2525 r10_bio->devs[slot].bio = IO_BLOCKED;
2526
2527 rdev_dec_pending(rdev, mddev);
2528
2529read_more:
2530 rdev = read_balance(conf, r10_bio, &max_sectors);
2531 if (rdev == NULL) {
2532 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2533 " read error for block %llu\n",
2534 mdname(mddev), b,
2535 (unsigned long long)r10_bio->sector);
2536 raid_end_bio_io(r10_bio);
2537 return;
2538 }
2539
2540 do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
2541 slot = r10_bio->read_slot;
2542 printk_ratelimited(
2543 KERN_ERR
2544 "md/raid10:%s: %s: redirecting "
2545 "sector %llu to another mirror\n",
2546 mdname(mddev),
2547 bdevname(rdev->bdev, b),
2548 (unsigned long long)r10_bio->sector);
2549 bio = bio_clone_mddev(r10_bio->master_bio,
2550 GFP_NOIO, mddev);
2551 bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
2552 r10_bio->devs[slot].bio = bio;
2553 r10_bio->devs[slot].rdev = rdev;
2554 bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
2555 + choose_data_offset(r10_bio, rdev);
2556 bio->bi_bdev = rdev->bdev;
2557 bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
2558 bio->bi_private = r10_bio;
2559 bio->bi_end_io = raid10_end_read_request;
2560 if (max_sectors < r10_bio->sectors) {
2561
2562 struct bio *mbio = r10_bio->master_bio;
2563 int sectors_handled =
2564 r10_bio->sector + max_sectors
2565 - mbio->bi_iter.bi_sector;
2566 r10_bio->sectors = max_sectors;
2567 spin_lock_irq(&conf->device_lock);
2568 if (mbio->bi_phys_segments == 0)
2569 mbio->bi_phys_segments = 2;
2570 else
2571 mbio->bi_phys_segments++;
2572 spin_unlock_irq(&conf->device_lock);
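 /* Submit the clipped read now; a fresh r10_bio allocated below
  * covers the remainder of the master bio.  bi_phys_segments is
  * (ab)used here as a count of r10_bios still outstanding against
  * the master bio.
  */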
2573 generic_make_request(bio);
2574
2575 r10_bio = mempool_alloc(conf->r10bio_pool,
2576 GFP_NOIO);
2577 r10_bio->master_bio = mbio;
2578 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2579 r10_bio->state = 0;
2580 set_bit(R10BIO_ReadError,
2581 &r10_bio->state);
2582 r10_bio->mddev = mddev;
2583 r10_bio->sector = mbio->bi_iter.bi_sector
2584 + sectors_handled;
2585
2586 goto read_more;
2587 } else
2588 generic_make_request(bio);
2589}
2590
2591static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2592{
2593 /* Some kind of write request has finished and it succeeded
2594 * in writing where we thought there was a bad block - so
2595 * forget the bad block.  Or it failed, in which case we either
2596 * narrow the failure down to a recorded bad block or fail the
2597 * whole device.
2598 */
2599 int m;
2600 struct md_rdev *rdev;
2601
2602 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2603 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2604 for (m = 0; m < conf->copies; m++) {
2605 int dev = r10_bio->devs[m].devnum;
2606 rdev = conf->mirrors[dev].rdev;
2607 if (r10_bio->devs[m].bio == NULL)
2608 continue;
2609 if (!r10_bio->devs[m].bio->bi_error) {
2610 rdev_clear_badblocks(
2611 rdev,
2612 r10_bio->devs[m].addr,
2613 r10_bio->sectors, 0);
2614 } else {
2615 if (!rdev_set_badblocks(
2616 rdev,
2617 r10_bio->devs[m].addr,
2618 r10_bio->sectors, 0))
2619 md_error(conf->mddev, rdev);
2620 }
2621 rdev = conf->mirrors[dev].replacement;
2622 if (r10_bio->devs[m].repl_bio == NULL)
2623 continue;
2624
2625 if (!r10_bio->devs[m].repl_bio->bi_error) {
2626 rdev_clear_badblocks(
2627 rdev,
2628 r10_bio->devs[m].addr,
2629 r10_bio->sectors, 0);
2630 } else {
2631 if (!rdev_set_badblocks(
2632 rdev,
2633 r10_bio->devs[m].addr,
2634 r10_bio->sectors, 0))
2635 md_error(conf->mddev, rdev);
2636 }
2637 }
2638 put_buf(r10_bio);
2639 } else {
2640 bool fail = false;
2641 for (m = 0; m < conf->copies; m++) {
2642 int dev = r10_bio->devs[m].devnum;
2643 struct bio *bio = r10_bio->devs[m].bio;
2644 rdev = conf->mirrors[dev].rdev;
2645 if (bio == IO_MADE_GOOD) {
2646 rdev_clear_badblocks(
2647 rdev,
2648 r10_bio->devs[m].addr,
2649 r10_bio->sectors, 0);
2650 rdev_dec_pending(rdev, conf->mddev);
2651 } else if (bio != NULL && bio->bi_error) {
2652 fail = true;
2653 if (!narrow_write_error(r10_bio, m)) {
2654 md_error(conf->mddev, rdev);
2655 set_bit(R10BIO_Degraded,
2656 &r10_bio->state);
2657 }
2658 rdev_dec_pending(rdev, conf->mddev);
2659 }
2660 bio = r10_bio->devs[m].repl_bio;
2661 rdev = conf->mirrors[dev].replacement;
2662 if (rdev && bio == IO_MADE_GOOD) {
2663 rdev_clear_badblocks(
2664 rdev,
2665 r10_bio->devs[m].addr,
2666 r10_bio->sectors, 0);
2667 rdev_dec_pending(rdev, conf->mddev);
2668 }
2669 }
2670 if (fail) {
2671 spin_lock_irq(&conf->device_lock);
2672 list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2673 conf->nr_queued++;
2674 spin_unlock_irq(&conf->device_lock);
2675 md_wakeup_thread(conf->mddev->thread);
2676 } else {
2677 if (test_bit(R10BIO_WriteError,
2678 &r10_bio->state))
2679 close_write(r10_bio);
2680 raid_end_bio_io(r10_bio);
2681 }
2682 }
2683}
2684
2685static void raid10d(struct md_thread *thread)
2686{
2687 struct mddev *mddev = thread->mddev;
2688 struct r10bio *r10_bio;
2689 unsigned long flags;
2690 struct r10conf *conf = mddev->private;
2691 struct list_head *head = &conf->retry_list;
2692 struct blk_plug plug;
2693
2694 md_check_recovery(mddev);
2695
2696 if (!list_empty_careful(&conf->bio_end_io_list) &&
2697 !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
2698 LIST_HEAD(tmp);
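 /* Move the completed-with-error requests onto a private list while
  * holding the lock, then finish them off outside it.
  */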
2699 spin_lock_irqsave(&conf->device_lock, flags);
2700 if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
2701 while (!list_empty(&conf->bio_end_io_list)) {
2702 list_move(conf->bio_end_io_list.prev, &tmp);
2703 conf->nr_queued--;
2704 }
2705 }
2706 spin_unlock_irqrestore(&conf->device_lock, flags);
2707 while (!list_empty(&tmp)) {
2708 r10_bio = list_first_entry(&tmp, struct r10bio,
2709 retry_list);
2710 list_del(&r10_bio->retry_list);
2711 if (mddev->degraded)
2712 set_bit(R10BIO_Degraded, &r10_bio->state);
2713
2714 if (test_bit(R10BIO_WriteError,
2715 &r10_bio->state))
2716 close_write(r10_bio);
2717 raid_end_bio_io(r10_bio);
2718 }
2719 }
2720
2721 blk_start_plug(&plug);
2722 for (;;) {
2723
2724 flush_pending_writes(conf);
2725
2726 spin_lock_irqsave(&conf->device_lock, flags);
2727 if (list_empty(head)) {
2728 spin_unlock_irqrestore(&conf->device_lock, flags);
2729 break;
2730 }
2731 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2732 list_del(head->prev);
2733 conf->nr_queued--;
2734 spin_unlock_irqrestore(&conf->device_lock, flags);
2735
2736 mddev = r10_bio->mddev;
2737 conf = mddev->private;
2738 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2739 test_bit(R10BIO_WriteError, &r10_bio->state))
2740 handle_write_completed(conf, r10_bio);
2741 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2742 reshape_request_write(mddev, r10_bio);
2743 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2744 sync_request_write(mddev, r10_bio);
2745 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2746 recovery_request_write(mddev, r10_bio);
2747 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2748 handle_read_error(mddev, r10_bio);
2749 else {
2750 /* just a partial read to be scheduled from a
2751 * separate context
2752 */
2753 int slot = r10_bio->read_slot;
2754 generic_make_request(r10_bio->devs[slot].bio);
2755 }
2756
2757 cond_resched();
2758 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2759 md_check_recovery(mddev);
2760 }
2761 blk_finish_plug(&plug);
2762}
2763
2764static int init_resync(struct r10conf *conf)
2765{
2766 int buffs;
2767 int i;
2768
2769 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2770 BUG_ON(conf->r10buf_pool);
2771 conf->have_replacement = 0;
2772 for (i = 0; i < conf->geo.raid_disks; i++)
2773 if (conf->mirrors[i].replacement)
2774 conf->have_replacement = 1;
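 /* have_replacement tells r10buf_pool_alloc() to allocate repl_bio's
  * for the resync buffers as well.
  */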
2775 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2776 if (!conf->r10buf_pool)
2777 return -ENOMEM;
2778 conf->next_resync = 0;
2779 return 0;
2780}
2781
2782
2783
2784
2785
2786
2787/*
2788 * perform a "sync" on one "block"
2789 *
2790 * We need to make sure that no normal I/O request - particularly write
2791 * requests - conflict with active sync requests.  This is achieved by
2792 * tracking pending requests and a 'barrier' concept that can be
2793 * installed to exclude normal IO requests.
2794 *
2795 * Resync and recovery are handled very differently; we distinguish
2796 * them by MD_RECOVERY_SYNC in mddev->recovery.
2797 *
2798 * For resync, we iterate over virtual addresses, read all copies and
2799 * update any that differ.  If only one copy is live, skip it.
2800 * For recovery, we iterate over physical addresses, read a good value
2801 * for each non-in_sync drive and over-write.
2802 *
2803 * So for recovery we may have several outstanding complex requests for
2804 * a given address, one for each out-of-sync device.  We model this by
2805 * allocating one r10_bio structure per out-of-sync device.  As we set
2806 * these up we collect all the bios into a list, process the list once
2807 * to add pages and then again to pass the bios to generic_make_request.
2808 *
2809 * The r10_bio structures are linked using a borrowed master_bio pointer
2810 * and counted in ->remaining.  When the r10_bio that points to NULL has
2811 * its remaining count decremented to 0, the whole complex operation is
2812 * complete.
2813 */
2814static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2815 int *skipped)
2816{
2817 struct r10conf *conf = mddev->private;
2818 struct r10bio *r10_bio;
2819 struct bio *biolist = NULL, *bio;
2820 sector_t max_sector, nr_sectors;
2821 int i;
2822 int max_sync;
2823 sector_t sync_blocks;
2824 sector_t sectors_skipped = 0;
2825 int chunks_skipped = 0;
2826 sector_t chunk_mask = conf->geo.chunk_mask;
2827
2828 if (!conf->r10buf_pool)
2829 if (init_resync(conf))
2830 return 0;
2831
2832 /*
2833 * Allow skipping a full rebuild for incremental assembly
2834 * of a clean array, like RAID1 does.
2835 */
2836 if (mddev->bitmap == NULL &&
2837 mddev->recovery_cp == MaxSector &&
2838 mddev->reshape_position == MaxSector &&
2839 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2840 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2841 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2842 conf->fullsync == 0) {
2843 *skipped = 1;
2844 return mddev->dev_sectors - sector_nr;
2845 }
2846
2847 skipped:
2848 max_sector = mddev->dev_sectors;
2849 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2850 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2851 max_sector = mddev->resync_max_sectors;
2852 if (sector_nr >= max_sector) {
2853 /* If we aborted, we need to abort the sync on the 'current'
2854 * bitmap chunks (there can be several when recovering multiple
2855 * devices), as we may have started syncing them but not finished.
2856 * We can find the current address in mddev->curr_resync, but for
2857 * recovery we need to convert that to several virtual addresses,
2858 * one per raid disk.
2859 */
2860
2861
2862 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2863 end_reshape(conf);
2864 close_sync(conf);
2865 return 0;
2866 }
2867
2868 if (mddev->curr_resync < max_sector) {
2869 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2870 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2871 &sync_blocks, 1);
2872 else for (i = 0; i < conf->geo.raid_disks; i++) {
2873 sector_t sect =
2874 raid10_find_virt(conf, mddev->curr_resync, i);
2875 bitmap_end_sync(mddev->bitmap, sect,
2876 &sync_blocks, 1);
2877 }
2878 } else {
2879
2880 if ((!mddev->bitmap || conf->fullsync)
2881 && conf->have_replacement
2882 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2883 /* Completed a full sync, so the replacements
2884 * are now fully recovered.
2885 */
2886 rcu_read_lock();
2887 for (i = 0; i < conf->geo.raid_disks; i++) {
2888 struct md_rdev *rdev =
2889 rcu_dereference(conf->mirrors[i].replacement);
2890 if (rdev)
2891 rdev->recovery_offset = MaxSector;
2892 }
2893 rcu_read_unlock();
2894 }
2895 conf->fullsync = 0;
2896 }
2897 bitmap_close_sync(mddev->bitmap);
2898 close_sync(conf);
2899 *skipped = 1;
2900 return sectors_skipped;
2901 }
2902
2903 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2904 return reshape_request(mddev, sector_nr, skipped);
2905
2906 if (chunks_skipped >= conf->geo.raid_disks) {
2907 /* If there has been nothing to do on any drive,
2908 * then there is nothing to do at all.
2909 */
2910 *skipped = 1;
2911 return (max_sector - sector_nr) + sectors_skipped;
2912 }
2913
2914 if (max_sector > mddev->resync_max)
2915 max_sector = mddev->resync_max;
2916
2917 /* make sure the whole request will fit in a chunk - if
2918 * chunks are meaningful
2919 */
2920 if (conf->geo.near_copies < conf->geo.raid_disks &&
2921 max_sector > (sector_nr | chunk_mask))
2922 max_sector = (sector_nr | chunk_mask) + 1;
2923
2924 /*
2925 * If there is non-resync activity waiting for a turn, then let it
2926 * through before starting on this new sync request.
2927 */
2928 if (conf->nr_waiting)
2929 schedule_timeout_uninterruptible(1);
2930
2931 /* Again, very different code for resync and recovery.
2932 * Both must result in an r10bio with a list of bios that
2933 * have bi_end_io, bi_sector, bi_bdev set,
2934 * and bi_private set to the r10bio.
2935 * For recovery, we may actually create several r10bios
2936 * with 2 bios in each, that correspond to the bios in the main one.
2937 * In this case, the subordinate r10bios link back through a
2938 * borrowed master_bio pointer, and the counter in the master
2939 * includes a ref from each subordinate.
2940 */
2941 /* First, we decide what to do and set ->bi_end_io
2942 * to end_sync_read if we want to read, and
2943 * end_sync_write if we will want to write.
2944 */
2945
2946 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2947 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2948 /* recovery ... the complicated one */
2949 int j;
2950 r10_bio = NULL;
2951
2952 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2953 int still_degraded;
2954 struct r10bio *rb2;
2955 sector_t sect;
2956 int must_sync;
2957 int any_working;
2958 struct raid10_info *mirror = &conf->mirrors[i];
2959 struct md_rdev *mrdev, *mreplace;
2960
2961 rcu_read_lock();
2962 mrdev = rcu_dereference(mirror->rdev);
2963 mreplace = rcu_dereference(mirror->replacement);
2964
2965 if ((mrdev == NULL ||
2966 test_bit(Faulty, &mrdev->flags) ||
2967 test_bit(In_sync, &mrdev->flags)) &&
2968 (mreplace == NULL ||
2969 test_bit(Faulty, &mreplace->flags))) {
2970 rcu_read_unlock();
2971 continue;
2972 }
2973
2974 still_degraded = 0;
2975
2976 rb2 = r10_bio;
2977 sect = raid10_find_virt(conf, sector_nr, i);
2978 if (sect >= mddev->resync_max_sectors) {
2979 /* last stripe is not complete - don't
2980 * try to recover this sector
2981 */
2982 rcu_read_unlock();
2983 continue;
2984 }
2985 if (mreplace && test_bit(Faulty, &mreplace->flags))
2986 mreplace = NULL;
2987
2988 /* Unless we are doing a full sync or there is a replacement,
2989 * we only need to recover the block if it is set in the bitmap
2990 */
2991 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2992 &sync_blocks, 1);
2993 if (sync_blocks < max_sync)
2994 max_sync = sync_blocks;
2995 if (!must_sync &&
2996 mreplace == NULL &&
2997 !conf->fullsync) {
2998
2999
3000
3001 chunks_skipped = -1;
3002 rcu_read_unlock();
3003 continue;
3004 }
3005 atomic_inc(&mrdev->nr_pending);
3006 if (mreplace)
3007 atomic_inc(&mreplace->nr_pending);
3008 rcu_read_unlock();
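 /* The nr_pending references taken above keep mrdev and mreplace
  * valid now that the RCU read lock has been dropped.
  */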
3009
3010 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3011 r10_bio->state = 0;
3012 raise_barrier(conf, rb2 != NULL);
3013 atomic_set(&r10_bio->remaining, 0);
3014
3015 r10_bio->master_bio = (struct bio*)rb2;
3016 if (rb2)
3017 atomic_inc(&rb2->remaining);
3018 r10_bio->mddev = mddev;
3019 set_bit(R10BIO_IsRecover, &r10_bio->state);
3020 r10_bio->sector = sect;
3021
3022 raid10_find_phys(conf, r10_bio);
3023
3024
3025
3026
3027 rcu_read_lock();
3028 for (j = 0; j < conf->geo.raid_disks; j++) {
3029 struct md_rdev *rdev = rcu_dereference(
3030 conf->mirrors[j].rdev);
3031 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3032 still_degraded = 1;
3033 break;
3034 }
3035 }
3036
3037 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3038 &sync_blocks, still_degraded);
3039
3040 any_working = 0;
3041 for (j = 0; j < conf->copies; j++) {
3042 int k;
3043 int d = r10_bio->devs[j].devnum;
3044 sector_t from_addr, to_addr;
3045 struct md_rdev *rdev =
3046 rcu_dereference(conf->mirrors[d].rdev);
3047 sector_t sector, first_bad;
3048 int bad_sectors;
3049 if (!rdev ||
3050 !test_bit(In_sync, &rdev->flags))
3051 continue;
3052
3053 any_working = 1;
3054 sector = r10_bio->devs[j].addr;
3055
3056 if (is_badblock(rdev, sector, max_sync,
3057 &first_bad, &bad_sectors)) {
3058 if (first_bad > sector)
3059 max_sync = first_bad - sector;
3060 else {
3061 bad_sectors -= (sector
3062 - first_bad);
3063 if (max_sync > bad_sectors)
3064 max_sync = bad_sectors;
3065 continue;
3066 }
3067 }
3068 bio = r10_bio->devs[0].bio;
3069 bio_reset(bio);
3070 bio->bi_next = biolist;
3071 biolist = bio;
3072 bio->bi_private = r10_bio;
3073 bio->bi_end_io = end_sync_read;
3074 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3075 from_addr = r10_bio->devs[j].addr;
3076 bio->bi_iter.bi_sector = from_addr +
3077 rdev->data_offset;
3078 bio->bi_bdev = rdev->bdev;
3079 atomic_inc(&rdev->nr_pending);
3080
3081
3082 for (k=0; k<conf->copies; k++)
3083 if (r10_bio->devs[k].devnum == i)
3084 break;
3085 BUG_ON(k == conf->copies);
3086 to_addr = r10_bio->devs[k].addr;
3087 r10_bio->devs[0].devnum = d;
3088 r10_bio->devs[0].addr = from_addr;
3089 r10_bio->devs[1].devnum = i;
3090 r10_bio->devs[1].addr = to_addr;
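 /* This r10_bio now describes a single recovery copy: slot 0 is the
  * read source, slot 1 is the device being rebuilt.
  */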
3091
3092 if (!test_bit(In_sync, &mrdev->flags)) {
3093 bio = r10_bio->devs[1].bio;
3094 bio_reset(bio);
3095 bio->bi_next = biolist;
3096 biolist = bio;
3097 bio->bi_private = r10_bio;
3098 bio->bi_end_io = end_sync_write;
3099 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3100 bio->bi_iter.bi_sector = to_addr
3101 + mrdev->data_offset;
3102 bio->bi_bdev = mrdev->bdev;
3103 atomic_inc(&r10_bio->remaining);
3104 } else
3105 r10_bio->devs[1].bio->bi_end_io = NULL;
3106
3107
3108 bio = r10_bio->devs[1].repl_bio;
3109 if (bio)
3110 bio->bi_end_io = NULL;
3111
3112 /* Note: if mreplace != NULL then bio is normally not NULL
3113 * either, because r10buf_pool_alloc allocates repl_bio
3114 * whenever conf->have_replacement is set; the NULL test
3115 * below is a cheap defensive check that also keeps static
3116 * checkers happy.  If there is no usable replacement we
3117 * are done with this copy.
3118 */
3119 if (mreplace == NULL || bio == NULL ||
3120 test_bit(Faulty, &mreplace->flags))
3121 break;
3122 bio_reset(bio);
3123 bio->bi_next = biolist;
3124 biolist = bio;
3125 bio->bi_private = r10_bio;
3126 bio->bi_end_io = end_sync_write;
3127 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3128 bio->bi_iter.bi_sector = to_addr +
3129 mreplace->data_offset;
3130 bio->bi_bdev = mreplace->bdev;
3131 atomic_inc(&r10_bio->remaining);
3132 break;
3133 }
3134 rcu_read_unlock();
3135 if (j == conf->copies) {
3136
3137
3138 if (any_working) {
3139
3140
3141
3142 int k;
3143 for (k = 0; k < conf->copies; k++)
3144 if (r10_bio->devs[k].devnum == i)
3145 break;
3146 if (!test_bit(In_sync,
3147 &mrdev->flags)
3148 && !rdev_set_badblocks(
3149 mrdev,
3150 r10_bio->devs[k].addr,
3151 max_sync, 0))
3152 any_working = 0;
3153 if (mreplace &&
3154 !rdev_set_badblocks(
3155 mreplace,
3156 r10_bio->devs[k].addr,
3157 max_sync, 0))
3158 any_working = 0;
3159 }
3160 if (!any_working) {
3161 if (!test_and_set_bit(MD_RECOVERY_INTR,
3162 &mddev->recovery))
3163 printk(KERN_INFO "md/raid10:%s: insufficient "
3164 "working devices for recovery.\n",
3165 mdname(mddev));
3166 mirror->recovery_disabled
3167 = mddev->recovery_disabled;
3168 }
3169 put_buf(r10_bio);
3170 if (rb2)
3171 atomic_dec(&rb2->remaining);
3172 r10_bio = rb2;
3173 rdev_dec_pending(mrdev, mddev);
3174 if (mreplace)
3175 rdev_dec_pending(mreplace, mddev);
3176 break;
3177 }
3178 rdev_dec_pending(mrdev, mddev);
3179 if (mreplace)
3180 rdev_dec_pending(mreplace, mddev);
3181 }
3182 if (biolist == NULL) {
3183 while (r10_bio) {
3184 struct r10bio *rb2 = r10_bio;
3185 r10_bio = (struct r10bio*) rb2->master_bio;
3186 rb2->master_bio = NULL;
3187 put_buf(rb2);
3188 }
3189 goto giveup;
3190 }
3191 } else {
3192 /* resync: schedule a read for every block at this virtual offset */
3193 int count = 0;
3194
3195 bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
3196
3197 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3198 &sync_blocks, mddev->degraded) &&
3199 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3200 &mddev->recovery)) {
3201
3202 *skipped = 1;
3203 return sync_blocks + sectors_skipped;
3204 }
3205 if (sync_blocks < max_sync)
3206 max_sync = sync_blocks;
3207 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3208 r10_bio->state = 0;
3209
3210 r10_bio->mddev = mddev;
3211 atomic_set(&r10_bio->remaining, 0);
3212 raise_barrier(conf, 0);
3213 conf->next_resync = sector_nr;
3214
3215 r10_bio->master_bio = NULL;
3216 r10_bio->sector = sector_nr;
3217 set_bit(R10BIO_IsSync, &r10_bio->state);
3218 raid10_find_phys(conf, r10_bio);
3219 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
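 /* A resync r10_bio never crosses a chunk boundary. */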
3220
3221 for (i = 0; i < conf->copies; i++) {
3222 int d = r10_bio->devs[i].devnum;
3223 sector_t first_bad, sector;
3224 int bad_sectors;
3225 struct md_rdev *rdev;
3226
3227 if (r10_bio->devs[i].repl_bio)
3228 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3229
3230 bio = r10_bio->devs[i].bio;
3231 bio_reset(bio);
3232 bio->bi_error = -EIO;
3233 rcu_read_lock();
3234 rdev = rcu_dereference(conf->mirrors[d].rdev);
3235 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3236 rcu_read_unlock();
3237 continue;
3238 }
3239 sector = r10_bio->devs[i].addr;
3240 if (is_badblock(rdev, sector, max_sync,
3241 &first_bad, &bad_sectors)) {
3242 if (first_bad > sector)
3243 max_sync = first_bad - sector;
3244 else {
3245 bad_sectors -= (sector - first_bad);
3246 if (max_sync > bad_sectors)
3247 max_sync = bad_sectors;
3248 rcu_read_unlock();
3249 continue;
3250 }
3251 }
3252 atomic_inc(&rdev->nr_pending);
3253 atomic_inc(&r10_bio->remaining);
3254 bio->bi_next = biolist;
3255 biolist = bio;
3256 bio->bi_private = r10_bio;
3257 bio->bi_end_io = end_sync_read;
3258 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3259 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3260 bio->bi_bdev = rdev->bdev;
3261 count++;
3262
3263 rdev = rcu_dereference(conf->mirrors[d].replacement);
3264 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3265 rcu_read_unlock();
3266 continue;
3267 }
3268 atomic_inc(&rdev->nr_pending);
3269 rcu_read_unlock();
3270
3271
3272 bio = r10_bio->devs[i].repl_bio;
3273 bio_reset(bio);
3274 bio->bi_error = -EIO;
3275
3276 sector = r10_bio->devs[i].addr;
3277 bio->bi_next = biolist;
3278 biolist = bio;
3279 bio->bi_private = r10_bio;
3280 bio->bi_end_io = end_sync_write;
3281 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3282 bio->bi_iter.bi_sector = sector + rdev->data_offset;
3283 bio->bi_bdev = rdev->bdev;
3284 count++;
3285 }
3286
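 /* With fewer than two bios set up there is nothing useful to
  * compare or write for this range, so drop the device references
  * and move on.
  */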
3287 if (count < 2) {
3288 for (i = 0; i < conf->copies; i++) {
3289 int d = r10_bio->devs[i].devnum;
3290 if (r10_bio->devs[i].bio->bi_end_io)
3291 rdev_dec_pending(conf->mirrors[d].rdev,
3292 mddev);
3293 if (r10_bio->devs[i].repl_bio &&
3294 r10_bio->devs[i].repl_bio->bi_end_io)
3295 rdev_dec_pending(
3296 conf->mirrors[d].replacement,
3297 mddev);
3298 }
3299 put_buf(r10_bio);
3300 biolist = NULL;
3301 goto giveup;
3302 }
3303 }
3304
3305 nr_sectors = 0;
3306 if (sector_nr + max_sync < max_sector)
3307 max_sector = sector_nr + max_sync;
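 /* Add pages one at a time to every bio in the list, keeping them
  * all the same length.  If any bio cannot take another page, strip
  * that page from the bios that already received it and stop.
  */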
3308 do {
3309 struct page *page;
3310 int len = PAGE_SIZE;
3311 if (sector_nr + (len>>9) > max_sector)
3312 len = (max_sector - sector_nr) << 9;
3313 if (len == 0)
3314 break;
3315 for (bio = biolist; bio; bio = bio->bi_next) {
3316 struct bio *bio2;
3317 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3318 if (bio_add_page(bio, page, len, 0))
3319 continue;
3320
3321
3322 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3323 for (bio2 = biolist;
3324 bio2 && bio2 != bio;
3325 bio2 = bio2->bi_next) {
3326
3327 bio2->bi_vcnt--;
3328 bio2->bi_iter.bi_size -= len;
3329 bio_clear_flag(bio2, BIO_SEG_VALID);
3330 }
3331 goto bio_full;
3332 }
3333 nr_sectors += len>>9;
3334 sector_nr += len>>9;
3335 } while (biolist->bi_vcnt < RESYNC_PAGES);
3336 bio_full:
3337 r10_bio->sectors = nr_sectors;
3338
3339 while (biolist) {
3340 bio = biolist;
3341 biolist = biolist->bi_next;
3342
3343 bio->bi_next = NULL;
3344 r10_bio = bio->bi_private;
3345 r10_bio->sectors = nr_sectors;
3346
3347 if (bio->bi_end_io == end_sync_read) {
3348 md_sync_acct(bio->bi_bdev, nr_sectors);
3349 bio->bi_error = 0;
3350 generic_make_request(bio);
3351 }
3352 }
3353
3354 if (sectors_skipped)
3355 /* pretend they weren't skipped - it makes no
3356 * important difference to this point
3357 */
3358 md_done_sync(mddev, sectors_skipped, 1);
3359
3360 return sectors_skipped + nr_sectors;
3361 giveup:
3362 /* There is nowhere to write, so all non-sync
3363 * drives must be failed or in resync, or all drives
3364 * have a bad block, so try the next chunk...
3365 */
3366 if (sector_nr + max_sync < max_sector)
3367 max_sector = sector_nr + max_sync;
3368
3369 sectors_skipped += (max_sector - sector_nr);
3370 chunks_skipped ++;
3371 sector_nr = max_sector;
3372 goto skipped;
3373}
3374
3375static sector_t
3376raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3377{
3378 sector_t size;
3379 struct r10conf *conf = mddev->private;
3380
3381 if (!raid_disks)
3382 raid_disks = min(conf->geo.raid_disks,
3383 conf->prev.raid_disks);
3384 if (!sectors)
3385 sectors = conf->dev_sectors;
3386
3387 size = sectors >> conf->geo.chunk_shift;
3388 sector_div(size, conf->geo.far_copies);
3389 size = size * raid_disks;
3390 sector_div(size, conf->geo.near_copies);
3391
3392 return size << conf->geo.chunk_shift;
3393}
3394
3395static void calc_sectors(struct r10conf *conf, sector_t size)
3396{
3397 /* Calculate the number of sectors-per-device that will
3398 * actually be used, and set conf->dev_sectors and
3399 * conf->stride
3400 */
3401
3402 size = size >> conf->geo.chunk_shift;
3403 sector_div(size, conf->geo.far_copies);
3404 size = size * conf->geo.raid_disks;
3405 sector_div(size, conf->geo.near_copies);
3406 /* 'size' is now the number of chunks in the array */
3407 /* calculate "used chunks per device" */
3408 size = size * conf->copies;
3409
3410 /* We need to round up when dividing by raid_disks to
3411 * get the stride size.
3412 */
3413 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3414
3415 conf->dev_sectors = size << conf->geo.chunk_shift;
3416
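 /* ->stride is the device-space distance between successive 'far'
  * copies: one chunk when far_offset is set, otherwise the used
  * device size divided by far_copies.
  */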
3417 if (conf->geo.far_offset)
3418 conf->geo.stride = 1 << conf->geo.chunk_shift;
3419 else {
3420 sector_div(size, conf->geo.far_copies);
3421 conf->geo.stride = size << conf->geo.chunk_shift;
3422 }
3423}
3424
3425enum geo_type {geo_new, geo_old, geo_start};
3426static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3427{
3428 int nc, fc, fo;
3429 int layout, chunk, disks;
3430 switch (new) {
3431 case geo_old:
3432 layout = mddev->layout;
3433 chunk = mddev->chunk_sectors;
3434 disks = mddev->raid_disks - mddev->delta_disks;
3435 break;
3436 case geo_new:
3437 layout = mddev->new_layout;
3438 chunk = mddev->new_chunk_sectors;
3439 disks = mddev->raid_disks;
3440 break;
3441 default:
3442 case geo_start:
3443
3444 layout = mddev->new_layout;
3445 chunk = mddev->new_chunk_sectors;
3446 disks = mddev->raid_disks + mddev->delta_disks;
3447 break;
3448 }
3449 if (layout >> 19)
3450 return -1;
3451 if (chunk < (PAGE_SIZE >> 9) ||
3452 !is_power_of_2(chunk))
3453 return -2;
3454 nc = layout & 255;
3455 fc = (layout >> 8) & 255;
3456 fo = layout & (1<<16);
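 /* Decode of the layout word: bits 0-7 are near_copies, bits 8-15
  * far_copies, bit 16 the far-offset flag, and bits 17-18 select
  * the far-set variant handled below.
  */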
3457 geo->raid_disks = disks;
3458 geo->near_copies = nc;
3459 geo->far_copies = fc;
3460 geo->far_offset = fo;
3461 switch (layout >> 17) {
3462 case 0:
3463 geo->far_set_size = disks;
3464 break;
3465 case 1:
3466 /* "enhanced" far layout - sets of (disks / far_copies) devices */
3467 geo->far_set_size = disks/fc;
3468 WARN(geo->far_set_size < fc,
3469 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3470 break;
3471 case 2:
3472 geo->far_set_size = fc * nc;
3473 break;
3474 default:
3475 return -1;
3476 }
3477 geo->chunk_mask = chunk - 1;
3478 geo->chunk_shift = ffz(~chunk);
3479 return nc*fc;
3480}
3481
3482static struct r10conf *setup_conf(struct mddev *mddev)
3483{
3484 struct r10conf *conf = NULL;
3485 int err = -EINVAL;
3486 struct geom geo;
3487 int copies;
3488
3489 copies = setup_geo(&geo, mddev, geo_new);
3490
3491 if (copies == -2) {
3492 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3493 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3494 mdname(mddev), PAGE_SIZE);
3495 goto out;
3496 }
3497
3498 if (copies < 2 || copies > mddev->raid_disks) {
3499 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3500 mdname(mddev), mddev->new_layout);
3501 goto out;
3502 }
3503
3504 err = -ENOMEM;
3505 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3506 if (!conf)
3507 goto out;
3508
3509
3510 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3511 max(0,-mddev->delta_disks)),
3512 GFP_KERNEL);
3513 if (!conf->mirrors)
3514 goto out;
3515
3516 conf->tmppage = alloc_page(GFP_KERNEL);
3517 if (!conf->tmppage)
3518 goto out;
3519
3520 conf->geo = geo;
3521 conf->copies = copies;
3522 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3523 r10bio_pool_free, conf);
3524 if (!conf->r10bio_pool)
3525 goto out;
3526
3527 calc_sectors(conf, mddev->dev_sectors);
3528 if (mddev->reshape_position == MaxSector) {
3529 conf->prev = conf->geo;
3530 conf->reshape_progress = MaxSector;
3531 } else {
3532 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3533 err = -EINVAL;
3534 goto out;
3535 }
3536 conf->reshape_progress = mddev->reshape_position;
3537 if (conf->prev.far_offset)
3538 conf->prev.stride = 1 << conf->prev.chunk_shift;
3539 else
3540
3541 conf->prev.stride = conf->dev_sectors;
3542 }
3543 conf->reshape_safe = conf->reshape_progress;
3544 spin_lock_init(&conf->device_lock);
3545 INIT_LIST_HEAD(&conf->retry_list);
3546 INIT_LIST_HEAD(&conf->bio_end_io_list);
3547
3548 spin_lock_init(&conf->resync_lock);
3549 init_waitqueue_head(&conf->wait_barrier);
3550 atomic_set(&conf->nr_pending, 0);
3551
3552 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3553 if (!conf->thread)
3554 goto out;
3555
3556 conf->mddev = mddev;
3557 return conf;
3558
3559 out:
3560 if (err == -ENOMEM)
3561 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3562 mdname(mddev));
3563 if (conf) {
3564 mempool_destroy(conf->r10bio_pool);
3565 kfree(conf->mirrors);
3566 safe_put_page(conf->tmppage);
3567 kfree(conf);
3568 }
3569 return ERR_PTR(err);
3570}
3571
3572static int raid10_run(struct mddev *mddev)
3573{
3574 struct r10conf *conf;
3575 int i, disk_idx, chunk_size;
3576 struct raid10_info *disk;
3577 struct md_rdev *rdev;
3578 sector_t size;
3579 sector_t min_offset_diff = 0;
3580 int first = 1;
3581 bool discard_supported = false;
3582
3583 if (mddev->private == NULL) {
3584 conf = setup_conf(mddev);
3585 if (IS_ERR(conf))
3586 return PTR_ERR(conf);
3587 mddev->private = conf;
3588 }
3589 conf = mddev->private;
3590 if (!conf)
3591 goto out;
3592
3593 mddev->thread = conf->thread;
3594 conf->thread = NULL;
3595
3596 chunk_size = mddev->chunk_sectors << 9;
3597 if (mddev->queue) {
3598 blk_queue_max_discard_sectors(mddev->queue,
3599 mddev->chunk_sectors);
3600 blk_queue_max_write_same_sectors(mddev->queue, 0);
3601 blk_queue_io_min(mddev->queue, chunk_size);
3602 if (conf->geo.raid_disks % conf->geo.near_copies)
3603 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3604 else
3605 blk_queue_io_opt(mddev->queue, chunk_size *
3606 (conf->geo.raid_disks / conf->geo.near_copies));
3607 }
3608
3609 rdev_for_each(rdev, mddev) {
3610 long long diff;
3611 struct request_queue *q;
3612
3613 disk_idx = rdev->raid_disk;
3614 if (disk_idx < 0)
3615 continue;
3616 if (disk_idx >= conf->geo.raid_disks &&
3617 disk_idx >= conf->prev.raid_disks)
3618 continue;
3619 disk = conf->mirrors + disk_idx;
3620
3621 if (test_bit(Replacement, &rdev->flags)) {
3622 if (disk->replacement)
3623 goto out_free_conf;
3624 disk->replacement = rdev;
3625 } else {
3626 if (disk->rdev)
3627 goto out_free_conf;
3628 disk->rdev = rdev;
3629 }
3630 q = bdev_get_queue(rdev->bdev);
3631 diff = (rdev->new_data_offset - rdev->data_offset);
3632 if (!mddev->reshape_backwards)
3633 diff = -diff;
3634 if (diff < 0)
3635 diff = 0;
3636 if (first || diff < min_offset_diff)
3637 min_offset_diff = diff;
3638
3639 if (mddev->gendisk)
3640 disk_stack_limits(mddev->gendisk, rdev->bdev,
3641 rdev->data_offset << 9);
3642
3643 disk->head_position = 0;
3644
3645 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3646 discard_supported = true;
3647 }
3648
3649 if (mddev->queue) {
3650 if (discard_supported)
3651 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3652 mddev->queue);
3653 else
3654 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3655 mddev->queue);
3656 }
3657
3658 if (!enough(conf, -1)) {
3659 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3660 mdname(mddev));
3661 goto out_free_conf;
3662 }
3663
3664 if (conf->reshape_progress != MaxSector) {
3665
3666 if (conf->geo.far_copies != 1 &&
3667 conf->geo.far_offset == 0)
3668 goto out_free_conf;
3669 if (conf->prev.far_copies != 1 &&
3670 conf->prev.far_offset == 0)
3671 goto out_free_conf;
3672 }
3673
3674 mddev->degraded = 0;
3675 for (i = 0;
3676 i < conf->geo.raid_disks
3677 || i < conf->prev.raid_disks;
3678 i++) {
3679
3680 disk = conf->mirrors + i;
3681
3682 if (!disk->rdev && disk->replacement) {
3683
3684 disk->rdev = disk->replacement;
3685 disk->replacement = NULL;
3686 clear_bit(Replacement, &disk->rdev->flags);
3687 }
3688
3689 if (!disk->rdev ||
3690 !test_bit(In_sync, &disk->rdev->flags)) {
3691 disk->head_position = 0;
3692 mddev->degraded++;
3693 if (disk->rdev &&
3694 disk->rdev->saved_raid_disk < 0)
3695 conf->fullsync = 1;
3696 }
3697 disk->recovery_disabled = mddev->recovery_disabled - 1;
3698 }
3699
3700 if (mddev->recovery_cp != MaxSector)
3701 printk(KERN_NOTICE "md/raid10:%s: not clean"
3702 " -- starting background reconstruction\n",
3703 mdname(mddev));
3704 printk(KERN_INFO
3705 "md/raid10:%s: active with %d out of %d devices\n",
3706 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3707 conf->geo.raid_disks);
3708
3709
3710
3711 mddev->dev_sectors = conf->dev_sectors;
3712 size = raid10_size(mddev, 0, 0);
3713 md_set_array_sectors(mddev, size);
3714 mddev->resync_max_sectors = size;
3715
3716 if (mddev->queue) {
3717 int stripe = conf->geo.raid_disks *
3718 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3719
3720 /* Calculate the max read-ahead size.
3721 * We want read-ahead to cover at least two whole
3722 * stripes.
3723 */
3724 stripe /= conf->geo.near_copies;
3725 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3726 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3727 }
3728
3729 if (md_integrity_register(mddev))
3730 goto out_free_conf;
3731
3732 if (conf->reshape_progress != MaxSector) {
3733 unsigned long before_length, after_length;
3734
3735 before_length = ((1 << conf->prev.chunk_shift) *
3736 conf->prev.far_copies);
3737 after_length = ((1 << conf->geo.chunk_shift) *
3738 conf->geo.far_copies);
3739
3740 if (max(before_length, after_length) > min_offset_diff) {
3741
3742 printk(KERN_ERR "md/raid10: offset difference not enough to continue reshape\n");
3743 goto out_free_conf;
3744 }
3745 conf->offset_diff = min_offset_diff;
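 /* min_offset_diff is the smallest gap seen between old and new
  * data_offset across all devices; the reshape relies on it to
  * guarantee it never overwrites data it has not yet copied.
  */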
3746
3747 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3748 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3749 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3750 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3751 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3752 "reshape");
3753 }
3754
3755 return 0;
3756
3757out_free_conf:
3758 md_unregister_thread(&mddev->thread);
3759 mempool_destroy(conf->r10bio_pool);
3760 safe_put_page(conf->tmppage);
3761 kfree(conf->mirrors);
3762 kfree(conf);
3763 mddev->private = NULL;
3764out:
3765 return -EIO;
3766}
3767
3768static void raid10_free(struct mddev *mddev, void *priv)
3769{
3770 struct r10conf *conf = priv;
3771
3772 mempool_destroy(conf->r10bio_pool);
3773 safe_put_page(conf->tmppage);
3774 kfree(conf->mirrors);
3775 kfree(conf->mirrors_old);
3776 kfree(conf->mirrors_new);
3777 kfree(conf);
3778}
3779
3780static void raid10_quiesce(struct mddev *mddev, int state)
3781{
3782 struct r10conf *conf = mddev->private;
3783
3784 switch(state) {
3785 case 1:
3786 raise_barrier(conf, 0);
3787 break;
3788 case 0:
3789 lower_barrier(conf);
3790 break;
3791 }
3792}
3793
3794static int raid10_resize(struct mddev *mddev, sector_t sectors)
3795{
3796 /* Resize of 'conf' is needed when resizing an array.
3797 * Resizing is only supported while no reshape is in progress,
3798 * and a 'far' layout can only be resized if far_offset is set,
3799 * because otherwise the data layout depends on the device size.
3800 * We compute the new per-device size, resize the bitmap and the
3801 * array capacity and, if the array grew, pull recovery_cp back
3802 * so that the newly exposed space gets resynced.
3803 */
3804
3805
3806
3807
3808 struct r10conf *conf = mddev->private;
3809 sector_t oldsize, size;
3810
3811 if (mddev->reshape_position != MaxSector)
3812 return -EBUSY;
3813
3814 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3815 return -EINVAL;
3816
3817 oldsize = raid10_size(mddev, 0, 0);
3818 size = raid10_size(mddev, sectors, 0);
3819 if (mddev->external_size &&
3820 mddev->array_sectors > size)
3821 return -EINVAL;
3822 if (mddev->bitmap) {
3823 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3824 if (ret)
3825 return ret;
3826 }
3827 md_set_array_sectors(mddev, size);
3828 if (mddev->queue) {
3829 set_capacity(mddev->gendisk, mddev->array_sectors);
3830 revalidate_disk(mddev->gendisk);
3831 }
3832 if (sectors > mddev->dev_sectors &&
3833 mddev->recovery_cp > oldsize) {
3834 mddev->recovery_cp = oldsize;
3835 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3836 }
3837 calc_sectors(conf, sectors);
3838 mddev->dev_sectors = conf->dev_sectors;
3839 mddev->resync_max_sectors = size;
3840 return 0;
3841}
3842
3843static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
3844{
3845 struct md_rdev *rdev;
3846 struct r10conf *conf;
3847
3848 if (mddev->degraded > 0) {
3849 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3850 mdname(mddev));
3851 return ERR_PTR(-EINVAL);
3852 }
3853 sector_div(size, devs);
3854
3855
3856 mddev->new_level = 10;
3857
3858 mddev->new_layout = (1<<8) + 2;
3859 mddev->new_chunk_sectors = mddev->chunk_sectors;
3860 mddev->delta_disks = mddev->raid_disks;
3861 mddev->raid_disks *= 2;
3862
3863 mddev->recovery_cp = MaxSector;
3864 mddev->dev_sectors = size;
3865
3866 conf = setup_conf(mddev);
3867 if (!IS_ERR(conf)) {
3868 rdev_for_each(rdev, mddev)
3869 if (rdev->raid_disk >= 0) {
3870 rdev->new_raid_disk = rdev->raid_disk * 2;
3871 rdev->sectors = size;
3872 }
3873 conf->barrier = 1;
3874 }
3875
3876 return conf;
3877}
3878
3879static void *raid10_takeover(struct mddev *mddev)
3880{
3881 struct r0conf *raid0_conf;
3882
3883 /* raid10 can take over:
3884 * raid0 - providing it has only one zone
3885 */
3886 if (mddev->level == 0) {
3887
3888 raid0_conf = mddev->private;
3889 if (raid0_conf->nr_strip_zones > 1) {
3890 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3891 " with more than one zone.\n",
3892 mdname(mddev));
3893 return ERR_PTR(-EINVAL);
3894 }
3895 return raid10_takeover_raid0(mddev,
3896 raid0_conf->strip_zone->zone_end,
3897 raid0_conf->strip_zone->nb_dev);
3898 }
3899 return ERR_PTR(-EINVAL);
3900}
3901
3902static int raid10_check_reshape(struct mddev *mddev)
3903{
3904 /* Called when there is a request to change
3905 * - layout (to ->new_layout)
3906 * - chunk size (to ->new_chunk_sectors)
3907 * - raid_disks (by delta_disks)
3908 * or when trying to restart a reshape that was ongoing.
3909 *
3910 * We need to validate the request and possibly allocate
3911 * space if that might be an issue later.
3912 *
3913 * Currently we reject any reshape of a 'far' mode array that
3914 * does not use far_offset, allow the chunk size to change if the
3915 * new value is generally acceptable, allow raid_disks to
3916 * increase, and allow a switch between 'near' and 'offset' mode.
3917 */
3918 struct r10conf *conf = mddev->private;
3919 struct geom geo;
3920
3921 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3922 return -EINVAL;
3923
3924 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3925 /* Cannot change number of copies */
3926 return -EINVAL;
3927 if (geo.far_copies > 1 && !geo.far_offset)
3928 /* Cannot switch to 'far' mode */
3929 return -EINVAL;
3930
3931 if (mddev->array_sectors & geo.chunk_mask)
3932 /* not a factor of the array size */
3933 return -EINVAL;
3934
3935 if (!enough(conf, -1))
3936 return -EINVAL;
3937
3938 kfree(conf->mirrors_new);
3939 conf->mirrors_new = NULL;
3940 if (mddev->delta_disks > 0) {
3941
3942 conf->mirrors_new = kzalloc(
3943 sizeof(struct raid10_info)
3944 *(mddev->raid_disks +
3945 mddev->delta_disks),
3946 GFP_KERNEL);
3947 if (!conf->mirrors_new)
3948 return -ENOMEM;
3949 }
3950 return 0;
3951}
3952
3953
3954/*
3955 * Need to check if the array has failed when deciding whether to:
3956 *  - start an array
3957 *  - remove non-faulty devices
3958 *  - add a spare
3959 *  - allow a reshape
3960 * This determination is simple when no reshape is happening.
3961 * However, if there is a reshape, we need to carefully check both
3962 * the 'previous' and 'new' sections.  Some failed devices may only
3963 * affect one of the two sections, and some non-in_sync devices may
3964 * be in_sync in the section most affected by the failed devices.
3965 */
3966static int calc_degraded(struct r10conf *conf)
3967{
3968 int degraded, degraded2;
3969 int i;
3970
3971 rcu_read_lock();
3972 degraded = 0;
3973
3974 for (i = 0; i < conf->prev.raid_disks; i++) {
3975 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3976 if (!rdev || test_bit(Faulty, &rdev->flags))
3977 degraded++;
3978 else if (!test_bit(In_sync, &rdev->flags))
3979 /* When we can reduce the number of devices in
3980 * an array, this might not contribute to
3981 * 'degraded'.  It does now.
3982 */
3983 degraded++;
3984 }
3985 rcu_read_unlock();
3986 if (conf->geo.raid_disks == conf->prev.raid_disks)
3987 return degraded;
3988 rcu_read_lock();
3989 degraded2 = 0;
3990 for (i = 0; i < conf->geo.raid_disks; i++) {
3991 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3992 if (!rdev || test_bit(Faulty, &rdev->flags))
3993 degraded2++;
3994 else if (!test_bit(In_sync, &rdev->flags)) {
3995 /* If reshape increases the number of devices,
3996 * this section has already been recovered, so
3997 * it doesn't contribute to degraded.
3998 * Otherwise it does.
3999 */
4000 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4001 degraded2++;
4002 }
4003 }
4004 rcu_read_unlock();
4005 if (degraded2 > degraded)
4006 return degraded2;
4007 return degraded;
4008}
4009
4010static int raid10_start_reshape(struct mddev *mddev)
4011{
4012 /* A 'reshape' has been requested.  This commits
4013 * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
4014 * This also checks if there are enough spares and adds them
4015 * to the array.
4016 * We currently require enough spares to make the final array
4017 * non-degraded.  We also require that the difference between
4018 * the old and the new data_offset - on each device - is big
4019 * enough that we never risk over-writing.
4020 */
4021
4022 unsigned long before_length, after_length;
4023 sector_t min_offset_diff = 0;
4024 int first = 1;
4025 struct geom new;
4026 struct r10conf *conf = mddev->private;
4027 struct md_rdev *rdev;
4028 int spares = 0;
4029 int ret;
4030
4031 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4032 return -EBUSY;
4033
4034 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4035 return -EINVAL;
4036
4037 before_length = ((1 << conf->prev.chunk_shift) *
4038 conf->prev.far_copies);
4039 after_length = ((1 << conf->geo.chunk_shift) *
4040 conf->geo.far_copies);
4041
4042 rdev_for_each(rdev, mddev) {
4043 if (!test_bit(In_sync, &rdev->flags)
4044 && !test_bit(Faulty, &rdev->flags))
4045 spares++;
4046 if (rdev->raid_disk >= 0) {
4047 long long diff = (rdev->new_data_offset
4048 - rdev->data_offset);
4049 if (!mddev->reshape_backwards)
4050 diff = -diff;
4051 if (diff < 0)
4052 diff = 0;
4053 if (first || diff < min_offset_diff)
4054 min_offset_diff = diff;
4055 }
4056 }
4057
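 /* The reshape must never overwrite data it has not yet read, so
  * require the per-device offset change to cover at least one full
  * far-copy set in both the old and the new geometry.
  */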
4058 if (max(before_length, after_length) > min_offset_diff)
4059 return -EINVAL;
4060
4061 if (spares < mddev->delta_disks)
4062 return -EINVAL;
4063
4064 conf->offset_diff = min_offset_diff;
4065 spin_lock_irq(&conf->device_lock);
4066 if (conf->mirrors_new) {
4067 memcpy(conf->mirrors_new, conf->mirrors,
4068 sizeof(struct raid10_info)*conf->prev.raid_disks);
4069 smp_mb();
4070 kfree(conf->mirrors_old);
4071 conf->mirrors_old = conf->mirrors;
4072 conf->mirrors = conf->mirrors_new;
4073 conf->mirrors_new = NULL;
4074 }
4075 setup_geo(&conf->geo, mddev, geo_start);
4076 smp_mb();
4077 if (mddev->reshape_backwards) {
4078 sector_t size = raid10_size(mddev, 0, 0);
4079 if (size < mddev->array_sectors) {
4080 spin_unlock_irq(&conf->device_lock);
4081 printk(KERN_ERR "md/raid10:%s: array size must be reduced before the number of disks\n",
4082 mdname(mddev));
4083 return -EINVAL;
4084 }
4085 mddev->resync_max_sectors = size;
4086 conf->reshape_progress = size;
4087 } else
4088 conf->reshape_progress = 0;
4089 conf->reshape_safe = conf->reshape_progress;
4090 spin_unlock_irq(&conf->device_lock);
4091
4092 if (mddev->delta_disks && mddev->bitmap) {
4093 ret = bitmap_resize(mddev->bitmap,
4094 raid10_size(mddev, 0,
4095 conf->geo.raid_disks),
4096 0, 0);
4097 if (ret)
4098 goto abort;
4099 }
4100 if (mddev->delta_disks > 0) {
4101 rdev_for_each(rdev, mddev)
4102 if (rdev->raid_disk < 0 &&
4103 !test_bit(Faulty, &rdev->flags)) {
4104 if (raid10_add_disk(mddev, rdev) == 0) {
4105 if (rdev->raid_disk >=
4106 conf->prev.raid_disks)
4107 set_bit(In_sync, &rdev->flags);
4108 else
4109 rdev->recovery_offset = 0;
4110
4111 if (sysfs_link_rdev(mddev, rdev))
4112 /* failure here is OK */;
4113 }
4114 } else if (rdev->raid_disk >= conf->prev.raid_disks
4115 && !test_bit(Faulty, &rdev->flags)) {
4116
4117 set_bit(In_sync, &rdev->flags);
4118 }
4119 }
4120
4121 /* When a reshape changes the number of devices, ->degraded is
4122 * measured against the larger of the old and new disk counts.
4123 */
4124 spin_lock_irq(&conf->device_lock);
4125 mddev->degraded = calc_degraded(conf);
4126 spin_unlock_irq(&conf->device_lock);
4127 mddev->raid_disks = conf->geo.raid_disks;
4128 mddev->reshape_position = conf->reshape_progress;
4129 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4130
4131 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4132 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4133 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4134 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4135 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4136
4137 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4138 "reshape");
4139 if (!mddev->sync_thread) {
4140 ret = -EAGAIN;
4141 goto abort;
4142 }
4143 conf->reshape_checkpoint = jiffies;
4144 md_wakeup_thread(mddev->sync_thread);
4145 md_new_event(mddev);
4146 return 0;
4147
4148abort:
4149 mddev->recovery = 0;
4150 spin_lock_irq(&conf->device_lock);
4151 conf->geo = conf->prev;
4152 mddev->raid_disks = conf->geo.raid_disks;
4153 rdev_for_each(rdev, mddev)
4154 rdev->new_data_offset = rdev->data_offset;
4155 smp_wmb();
4156 conf->reshape_progress = MaxSector;
4157 conf->reshape_safe = MaxSector;
4158 mddev->reshape_position = MaxSector;
4159 spin_unlock_irq(&conf->device_lock);
4160 return ret;
4161}
4162
4163/* Calculate the last device-address that could contain any
4164 * block from the chunk that includes the array-address 's',
4165 * and report the next address: the value returned is
4166 * chunk-aligned and comes after any data that is in the
4167 * chunk containing 's'.
4168 */
4169static sector_t last_dev_address(sector_t s, struct geom *geo)
4170{
4171 s = (s | geo->chunk_mask) + 1;
4172 s >>= geo->chunk_shift;
4173 s *= geo->near_copies;
4174 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4175 s *= geo->far_copies;
4176 s <<= geo->chunk_shift;
4177 return s;
4178}
4179
4180/* Calculate the first device-address that could contain
4181 * any block from the chunk that includes the array-address 's'.
4182 * This too will be the start of a chunk.
4183 */
4184static sector_t first_dev_address(sector_t s, struct geom *geo)
4185{
4186 s >>= geo->chunk_shift;
4187 s *= geo->near_copies;
4188 sector_div(s, geo->raid_disks);
4189 s *= geo->far_copies;
4190 s <<= geo->chunk_shift;
4191 return s;
4192}
4193
4194static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4195 int *skipped)
4196{
4197 /* Reshape is handled quite separately from resync/recovery:
4198 * on each call we process one range of array addresses, reading
4199 * the data through the 'previous' geometry and writing it back
4200 * out to every copy in the 'new' geometry.
4201 *
4202 * We must never overwrite data that a restarted reshape would
4203 * still need to read, so before moving into a device region that
4204 * is not yet protected by the recorded mddev->reshape_position we
4205 * update the metadata and wait for it to be written ('need_flush').
4206 * We also checkpoint the metadata at least every 10 seconds.
4207 *
4208 * Depending on mddev->reshape_backwards the array is processed
4209 * from the start towards the end or from the end towards the
4210 * start, so the boundary arithmetic below comes in two mirrored
4211 * branches.
4212 *
4213 * The read is issued through read_balance() against the previous
4214 * geometry; the write bios (one per copy in the new geometry)
4215 * share the read's pages and are only submitted later, from
4216 * reshape_request_write(), once the read has completed.
4217 */
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234 struct r10conf *conf = mddev->private;
4235 struct r10bio *r10_bio;
4236 sector_t next, safe, last;
4237 int max_sectors;
4238 int nr_sectors;
4239 int s;
4240 struct md_rdev *rdev;
4241 int need_flush = 0;
4242 struct bio *blist;
4243 struct bio *bio, *read_bio;
4244 int sectors_done = 0;
4245
4246 if (sector_nr == 0) {
4247 /* If restarting in the middle, skip the initial sectors */
4248 if (mddev->reshape_backwards &&
4249 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4250 sector_nr = (raid10_size(mddev, 0, 0)
4251 - conf->reshape_progress);
4252 } else if (!mddev->reshape_backwards &&
4253 conf->reshape_progress > 0)
4254 sector_nr = conf->reshape_progress;
4255 if (sector_nr) {
4256 mddev->curr_resync_completed = sector_nr;
4257 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4258 *skipped = 1;
4259 return sector_nr;
4260 }
4261 }
4262
4263 /* We don't use sector_nr to track where we are up to
4264 * as that doesn't work well for ->reshape_backwards.
4265 * So just use ->reshape_progress.
4266 */
4267 if (mddev->reshape_backwards) {
4268 /* 'next' is after the last device address that we
4269 * might write to for this chunk in the new layout
4270 */
4271 next = first_dev_address(conf->reshape_progress - 1,
4272 &conf->geo);
4273 /* 'safe' is the earliest device address that we
4274 * might read from in the old layout after a restart
4275 * of the reshape
4276 */
4277 safe = last_dev_address(conf->reshape_safe - 1,
4278 &conf->prev);
4279
4280 if (next + conf->offset_diff < safe)
4281 need_flush = 1;
4282
4283 last = conf->reshape_progress - 1;
4284 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4285 & conf->prev.chunk_mask);
4286 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4287 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4288 } else {
4289 /* 'next' is after the last device address that we
4290 * might write to for this chunk in the new layout
4291 */
4292 next = last_dev_address(conf->reshape_progress, &conf->geo);
4293
4294
4295
4296
4297 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4298
4299 /* Need to update metadata if 'next' might be beyond
4300 * 'safe', as that could corrupt data after a crash
4301 */
4302 if (next > safe + conf->offset_diff)
4303 need_flush = 1;
4304
4305 sector_nr = conf->reshape_progress;
4306 last = sector_nr | (conf->geo.chunk_mask
4307 & conf->prev.chunk_mask);
4308
4309 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4310 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4311 }
4312
4313 if (need_flush ||
4314 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4315
4316 wait_barrier(conf);
4317 mddev->reshape_position = conf->reshape_progress;
4318 if (mddev->reshape_backwards)
4319 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4320 - conf->reshape_progress;
4321 else
4322 mddev->curr_resync_completed = conf->reshape_progress;
4323 conf->reshape_checkpoint = jiffies;
4324 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4325 md_wakeup_thread(mddev->thread);
4326 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4327 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4328 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4329 allow_barrier(conf);
4330 return sectors_done;
4331 }
4332 conf->reshape_safe = mddev->reshape_position;
4333 allow_barrier(conf);
4334 }
4335
4336read_more:
4337
4338 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4339 r10_bio->state = 0;
4340 raise_barrier(conf, sectors_done != 0);
4341 atomic_set(&r10_bio->remaining, 0);
4342 r10_bio->mddev = mddev;
4343 r10_bio->sector = sector_nr;
4344 set_bit(R10BIO_IsReshape, &r10_bio->state);
4345 r10_bio->sectors = last - sector_nr + 1;
4346 rdev = read_balance(conf, r10_bio, &max_sectors);
4347 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4348
4349 if (!rdev) {
4350 /* Cannot read from here, so we would need to record
4351 * bad blocks on all the target devices.
4352 * For now just give up on the reshape.
4353 */
4354 mempool_free(r10_bio, conf->r10buf_pool);
4355 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4356 return sectors_done;
4357 }
4358
4359 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4360
4361 read_bio->bi_bdev = rdev->bdev;
4362 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4363 + rdev->data_offset);
4364 read_bio->bi_private = r10_bio;
4365 read_bio->bi_end_io = end_sync_read;
4366 bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4367 read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4368 read_bio->bi_error = 0;
4369 read_bio->bi_vcnt = 0;
4370 read_bio->bi_iter.bi_size = 0;
4371 r10_bio->master_bio = read_bio;
4372 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4373
4374 /* Now find the locations in the new layout */
4375 __raid10_find_phys(&conf->geo, r10_bio);
4376
4377 blist = read_bio;
4378 read_bio->bi_next = NULL;
4379
4380 rcu_read_lock();
4381 for (s = 0; s < conf->copies*2; s++) {
4382 struct bio *b;
4383 int d = r10_bio->devs[s/2].devnum;
4384 struct md_rdev *rdev2;
4385 if (s&1) {
4386 rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4387 b = r10_bio->devs[s/2].repl_bio;
4388 } else {
4389 rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4390 b = r10_bio->devs[s/2].bio;
4391 }
4392 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4393 continue;
4394
4395 bio_reset(b);
4396 b->bi_bdev = rdev2->bdev;
4397 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4398 rdev2->new_data_offset;
4399 b->bi_private = r10_bio;
4400 b->bi_end_io = end_reshape_write;
4401 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4402 b->bi_next = blist;
4403 blist = b;
4404 }
4405
4406
4407
4408 nr_sectors = 0;
4409 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4410 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4411 int len = (max_sectors - s) << 9;
4412 if (len > PAGE_SIZE)
4413 len = PAGE_SIZE;
4414 for (bio = blist; bio ; bio = bio->bi_next) {
4415 struct bio *bio2;
4416 if (bio_add_page(bio, page, len, 0))
4417 continue;
4418
4419
4420 for (bio2 = blist;
4421 bio2 && bio2 != bio;
4422 bio2 = bio2->bi_next) {
4423
4424 bio2->bi_vcnt--;
4425 bio2->bi_iter.bi_size -= len;
4426 bio_clear_flag(bio2, BIO_SEG_VALID);
4427 }
4428 goto bio_full;
4429 }
4430 sector_nr += len >> 9;
4431 nr_sectors += len >> 9;
4432 }
4433bio_full:
4434 rcu_read_unlock();
4435 r10_bio->sectors = nr_sectors;
4436
4437
4438 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4439 atomic_inc(&r10_bio->remaining);
4440 read_bio->bi_next = NULL;
4441 generic_make_request(read_bio);
4442 sector_nr += nr_sectors;
4443 sectors_done += nr_sectors;
4444 if (sector_nr <= last)
4445 goto read_more;
4446
4447 /* Now that we have done the whole section we can
4448 * update reshape_progress
4449 */
4450 if (mddev->reshape_backwards)
4451 conf->reshape_progress -= sectors_done;
4452 else
4453 conf->reshape_progress += sectors_done;
4454
4455 return sectors_done;
4456}
4457
4458static void end_reshape_request(struct r10bio *r10_bio);
4459static int handle_reshape_read_error(struct mddev *mddev,
4460 struct r10bio *r10_bio);
4461static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4462{
4463 /* The reshape read has completed successfully (or has been
4464 * repaired from another copy).  Now issue the writes for this
4465 * range to every device in the new layout; ->remaining counts
4466 * the outstanding writes plus one reference dropped below.
4467 */
4468 struct r10conf *conf = mddev->private;
4469 int s;
4470
4471 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4472 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4473
4474 md_done_sync(mddev, r10_bio->sectors, 0);
4475 return;
4476 }
4477
4478
4479
4480
4481 atomic_set(&r10_bio->remaining, 1);
4482 for (s = 0; s < conf->copies*2; s++) {
4483 struct bio *b;
4484 int d = r10_bio->devs[s/2].devnum;
4485 struct md_rdev *rdev;
4486 rcu_read_lock();
4487 if (s&1) {
4488 rdev = rcu_dereference(conf->mirrors[d].replacement);
4489 b = r10_bio->devs[s/2].repl_bio;
4490 } else {
4491 rdev = rcu_dereference(conf->mirrors[d].rdev);
4492 b = r10_bio->devs[s/2].bio;
4493 }
4494 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4495 rcu_read_unlock();
4496 continue;
4497 }
4498 atomic_inc(&rdev->nr_pending);
4499 rcu_read_unlock();
4500 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4501 atomic_inc(&r10_bio->remaining);
4502 b->bi_next = NULL;
4503 generic_make_request(b);
4504 }
4505 end_reshape_request(r10_bio);
4506}
4507
4508static void end_reshape(struct r10conf *conf)
4509{
4510 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4511 return;
4512
4513 spin_lock_irq(&conf->device_lock);
4514 conf->prev = conf->geo;
4515 md_finish_reshape(conf->mddev);
4516 smp_wmb();
4517 conf->reshape_progress = MaxSector;
4518 conf->reshape_safe = MaxSector;
4519 spin_unlock_irq(&conf->device_lock);
4520
4521
4522
4523
4524 if (conf->mddev->queue) {
4525 int stripe = conf->geo.raid_disks *
4526 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4527 stripe /= conf->geo.near_copies;
4528 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4529 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4530 }
4531 conf->fullsync = 0;
4532}
4533
4534static int handle_reshape_read_error(struct mddev *mddev,
4535 struct r10bio *r10_bio)
4536{
4537 /* Use sync reads to get the blocks from somewhere else */
4538 int sectors = r10_bio->sectors;
4539 struct r10conf *conf = mddev->private;
4540 struct {
4541 struct r10bio r10_bio;
4542 struct r10dev devs[conf->copies];
4543 } on_stack;
4544 struct r10bio *r10b = &on_stack.r10_bio;
4545 int slot = 0;
4546 int idx = 0;
4547 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4548
4549 r10b->sector = r10_bio->sector;
4550 __raid10_find_phys(&conf->prev, r10b);
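 /* Map the failed range through the previous geometry so the data
  * can be re-read, one page at a time, from any old-layout copy.
  */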
4551
4552 while (sectors) {
4553 int s = sectors;
4554 int success = 0;
4555 int first_slot = slot;
4556
4557 if (s > (PAGE_SIZE >> 9))
4558 s = PAGE_SIZE >> 9;
4559
4560 rcu_read_lock();
4561 while (!success) {
4562 int d = r10b->devs[slot].devnum;
4563 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4564 sector_t addr;
4565 if (rdev == NULL ||
4566 test_bit(Faulty, &rdev->flags) ||
4567 !test_bit(In_sync, &rdev->flags))
4568 goto failed;
4569
4570 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4571 atomic_inc(&rdev->nr_pending);
4572 rcu_read_unlock();
4573 success = sync_page_io(rdev,
4574 addr,
4575 s << 9,
4576 bvec[idx].bv_page,
4577 REQ_OP_READ, 0, false);
4578 rdev_dec_pending(rdev, mddev);
4579 rcu_read_lock();
4580 if (success)
4581 break;
4582 failed:
4583 slot++;
4584 if (slot >= conf->copies)
4585 slot = 0;
4586 if (slot == first_slot)
4587 break;
4588 }
4589 rcu_read_unlock();
4590 if (!success) {
4591
4592 set_bit(MD_RECOVERY_INTR,
4593 &mddev->recovery);
4594 return -EIO;
4595 }
4596 sectors -= s;
4597 idx++;
4598 }
4599 return 0;
4600}
4601
4602static void end_reshape_write(struct bio *bio)
4603{
4604 struct r10bio *r10_bio = bio->bi_private;
4605 struct mddev *mddev = r10_bio->mddev;
4606 struct r10conf *conf = mddev->private;
4607 int d;
4608 int slot;
4609 int repl;
4610 struct md_rdev *rdev = NULL;
4611
4612 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4613 if (repl)
4614 rdev = conf->mirrors[d].replacement;
4615 if (!rdev) {
4616 smp_mb();
4617 rdev = conf->mirrors[d].rdev;
4618 }
4619
4620 if (bio->bi_error) {
4621
4622 md_error(mddev, rdev);
4623 }
4624
4625 rdev_dec_pending(rdev, mddev);
4626 end_reshape_request(r10_bio);
4627}
4628
4629static void end_reshape_request(struct r10bio *r10_bio)
4630{
4631 if (!atomic_dec_and_test(&r10_bio->remaining))
4632 return;
4633 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4634 bio_put(r10_bio->master_bio);
4635 put_buf(r10_bio);
4636}
4637
4638static void raid10_finish_reshape(struct mddev *mddev)
4639{
4640 struct r10conf *conf = mddev->private;
4641
4642 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4643 return;
4644
4645 if (mddev->delta_disks > 0) {
4646 sector_t size = raid10_size(mddev, 0, 0);
4647 md_set_array_sectors(mddev, size);
4648 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4649 mddev->recovery_cp = mddev->resync_max_sectors;
4650 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4651 }
4652 mddev->resync_max_sectors = size;
4653 if (mddev->queue) {
4654 set_capacity(mddev->gendisk, mddev->array_sectors);
4655 revalidate_disk(mddev->gendisk);
4656 }
4657 } else {
4658 int d;
4659 rcu_read_lock();
4660 for (d = conf->geo.raid_disks ;
4661 d < conf->geo.raid_disks - mddev->delta_disks;
4662 d++) {
4663 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4664 if (rdev)
4665 clear_bit(In_sync, &rdev->flags);
4666 rdev = rcu_dereference(conf->mirrors[d].replacement);
4667 if (rdev)
4668 clear_bit(In_sync, &rdev->flags);
4669 }
4670 rcu_read_unlock();
4671 }
4672 mddev->layout = mddev->new_layout;
4673 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4674 mddev->reshape_position = MaxSector;
4675 mddev->delta_disks = 0;
4676 mddev->reshape_backwards = 0;
4677}
4678
4679static struct md_personality raid10_personality =
4680{
4681 .name = "raid10",
4682 .level = 10,
4683 .owner = THIS_MODULE,
4684 .make_request = raid10_make_request,
4685 .run = raid10_run,
4686 .free = raid10_free,
4687 .status = raid10_status,
4688 .error_handler = raid10_error,
4689 .hot_add_disk = raid10_add_disk,
4690 .hot_remove_disk= raid10_remove_disk,
4691 .spare_active = raid10_spare_active,
4692 .sync_request = raid10_sync_request,
4693 .quiesce = raid10_quiesce,
4694 .size = raid10_size,
4695 .resize = raid10_resize,
4696 .takeover = raid10_takeover,
4697 .check_reshape = raid10_check_reshape,
4698 .start_reshape = raid10_start_reshape,
4699 .finish_reshape = raid10_finish_reshape,
4700 .congested = raid10_congested,
4701};
4702
4703static int __init raid_init(void)
4704{
4705 return register_md_personality(&raid10_personality);
4706}
4707
4708static void raid_exit(void)
4709{
4710 unregister_md_personality(&raid10_personality);
4711}
4712
4713module_init(raid_init);
4714module_exit(raid_exit);
4715MODULE_LICENSE("GPL");
4716MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4717MODULE_ALIAS("md-personality-9");
4718MODULE_ALIAS("md-raid10");
4719MODULE_ALIAS("md-level-10");
4720
4721module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4722