/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 */
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"
32
/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *
 * The data to be stored is divided into chunks of size chunk_size, and each
 * device is divided into far_copies sections.
 *
 * In each section, chunks are laid out in a style similar to raid0, but
 * near_copies copies of each chunk are stored (each on a different drive).
 * The starting device for each section is offset near_copies from the
 * starting device of the previous section.
 * Thus there are (near_copies * far_copies) copies of each chunk, and each
 * is on a different drive.
 * near_copies and far_copies must each be at least one, and their product
 * is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are adjacent stripes.
 */
#define NR_RAID10_BIOS 256

/* When there are this many requests queued to be written by
 * the raid10 thread, we become 'congested' to provide back-pressure
 * for writeback.
 */
static int max_queued_requests = 1024;
68
69static void allow_barrier(struct r10conf *conf);
70static void lower_barrier(struct r10conf *conf);
71static int enough(struct r10conf *conf, int ignore);
72static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
73 int *skipped);
74static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
75static void end_reshape_write(struct bio *bio, int error);
76static void end_reshape(struct r10conf *conf);
77
78static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
79{
80 struct r10conf *conf = data;
81 int size = offsetof(struct r10bio, devs[conf->copies]);
82
	/* allocate a r10bio with room for raid_disks entries
	 * in the devs array */
85 return kzalloc(size, gfp_flags);
86}
87
88static void r10bio_pool_free(void *r10_bio, void *data)
89{
90 kfree(r10_bio);
91}
92

/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

#define RESYNC_WINDOW (1024*1024)

/* maximum number of concurrent resync requests (bounds conf->barrier) */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
100
/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf).
 */
108static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
109{
110 struct r10conf *conf = data;
111 struct page *page;
112 struct r10bio *r10_bio;
113 struct bio *bio;
114 int i, j;
115 int nalloc;
116
117 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
118 if (!r10_bio)
119 return NULL;
120
121 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
122 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
123 nalloc = conf->copies;
124 else
125 nalloc = 2;
126
	/*
	 * Allocate bios.
	 */
130 for (j = nalloc ; j-- ; ) {
131 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
132 if (!bio)
133 goto out_free_bio;
134 r10_bio->devs[j].bio = bio;
135 if (!conf->have_replacement)
136 continue;
137 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
138 if (!bio)
139 goto out_free_bio;
140 r10_bio->devs[j].repl_bio = bio;
141 }
142
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * one at a time to each bio.
	 */
146 for (j = 0 ; j < nalloc; j++) {
147 struct bio *rbio = r10_bio->devs[j].repl_bio;
148 bio = r10_bio->devs[j].bio;
149 for (i = 0; i < RESYNC_PAGES; i++) {
150 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
151 &conf->mddev->recovery)) {
				/* we want to share the pages
				 * with the first bio */
154 struct bio *rbio = r10_bio->devs[0].bio;
155 page = rbio->bi_io_vec[i].bv_page;
156 get_page(page);
157 } else
158 page = alloc_page(gfp_flags);
159 if (unlikely(!page))
160 goto out_free_pages;
161
162 bio->bi_io_vec[i].bv_page = page;
163 if (rbio)
164 rbio->bi_io_vec[i].bv_page = page;
165 }
166 }
167
168 return r10_bio;
169
170out_free_pages:
171 for ( ; i > 0 ; i--)
172 safe_put_page(bio->bi_io_vec[i-1].bv_page);
173 while (j--)
174 for (i = 0; i < RESYNC_PAGES ; i++)
175 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
176 j = 0;
177out_free_bio:
178 for ( ; j < nalloc; j++) {
179 if (r10_bio->devs[j].bio)
180 bio_put(r10_bio->devs[j].bio);
181 if (r10_bio->devs[j].repl_bio)
182 bio_put(r10_bio->devs[j].repl_bio);
183 }
184 r10bio_pool_free(r10_bio, conf);
185 return NULL;
186}
187
188static void r10buf_pool_free(void *__r10_bio, void *data)
189{
190 int i;
191 struct r10conf *conf = data;
192 struct r10bio *r10bio = __r10_bio;
193 int j;
194
195 for (j=0; j < conf->copies; j++) {
196 struct bio *bio = r10bio->devs[j].bio;
197 if (bio) {
198 for (i = 0; i < RESYNC_PAGES; i++) {
199 safe_put_page(bio->bi_io_vec[i].bv_page);
200 bio->bi_io_vec[i].bv_page = NULL;
201 }
202 bio_put(bio);
203 }
204 bio = r10bio->devs[j].repl_bio;
205 if (bio)
206 bio_put(bio);
207 }
208 r10bio_pool_free(r10bio, conf);
209}
210
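/*
 * Drop the bio references held in an r10bio.  Slots holding the special
 * marker values used by the error-handling paths are skipped.
 */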
211static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
212{
213 int i;
214
215 for (i = 0; i < conf->copies; i++) {
216 struct bio **bio = & r10_bio->devs[i].bio;
217 if (!BIO_SPECIAL(*bio))
218 bio_put(*bio);
219 *bio = NULL;
220 bio = &r10_bio->devs[i].repl_bio;
221 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
222 bio_put(*bio);
223 *bio = NULL;
224 }
225}
226
227static void free_r10bio(struct r10bio *r10_bio)
228{
229 struct r10conf *conf = r10_bio->mddev->private;
230
231 put_all_bios(conf, r10_bio);
232 mempool_free(r10_bio, conf->r10bio_pool);
233}
234
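/*
 * Return a resync/recovery buffer to its mempool and drop the barrier
 * reference taken when it was allocated.
 */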
235static void put_buf(struct r10bio *r10_bio)
236{
237 struct r10conf *conf = r10_bio->mddev->private;
238
239 mempool_free(r10_bio, conf->r10buf_pool);
240
241 lower_barrier(conf);
242}
243
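/*
 * Queue an r10bio on the retry list for the raid10 thread and wake
 * anyone waiting on the barrier.
 */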
244static void reschedule_retry(struct r10bio *r10_bio)
245{
246 unsigned long flags;
247 struct mddev *mddev = r10_bio->mddev;
248 struct r10conf *conf = mddev->private;
249
250 spin_lock_irqsave(&conf->device_lock, flags);
251 list_add(&r10_bio->retry_list, &conf->retry_list);
252 conf->nr_queued ++;
253 spin_unlock_irqrestore(&conf->device_lock, flags);
254
255
256 wake_up(&conf->wait_barrier);
257
258 md_wakeup_thread(mddev->thread);
259}
260
/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
266static void raid_end_bio_io(struct r10bio *r10_bio)
267{
268 struct bio *bio = r10_bio->master_bio;
269 int done;
270 struct r10conf *conf = r10_bio->mddev->private;
271
272 if (bio->bi_phys_segments) {
273 unsigned long flags;
274 spin_lock_irqsave(&conf->device_lock, flags);
275 bio->bi_phys_segments--;
276 done = (bio->bi_phys_segments == 0);
277 spin_unlock_irqrestore(&conf->device_lock, flags);
278 } else
279 done = 1;
280 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
281 clear_bit(BIO_UPTODATE, &bio->bi_flags);
282 if (done) {
283 bio_endio(bio, 0);
		/*
		 * Wake up any possible resync thread that waits for the device
		 * to go idle.
		 */
288 allow_barrier(conf);
289 }
290 free_r10bio(r10_bio);
291}
292
/*
 * Update disk head position estimator based on IRQ completion info.
 */
296static inline void update_head_pos(int slot, struct r10bio *r10_bio)
297{
298 struct r10conf *conf = r10_bio->mddev->private;
299
300 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
301 r10_bio->devs[slot].addr + (r10_bio->sectors);
302}
303
/*
 * Find the disk number which triggered given bio
 */
307static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
308 struct bio *bio, int *slotp, int *replp)
309{
310 int slot;
311 int repl = 0;
312
313 for (slot = 0; slot < conf->copies; slot++) {
314 if (r10_bio->devs[slot].bio == bio)
315 break;
316 if (r10_bio->devs[slot].repl_bio == bio) {
317 repl = 1;
318 break;
319 }
320 }
321
322 BUG_ON(slot == conf->copies);
323 update_head_pos(slot, r10_bio);
324
325 if (slotp)
326 *slotp = slot;
327 if (replp)
328 *replp = repl;
329 return r10_bio->devs[slot].devnum;
330}
331
332static void raid10_end_read_request(struct bio *bio, int error)
333{
334 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
335 struct r10bio *r10_bio = bio->bi_private;
336 int slot, dev;
337 struct md_rdev *rdev;
338 struct r10conf *conf = r10_bio->mddev->private;
339
340
341 slot = r10_bio->read_slot;
342 dev = r10_bio->devs[slot].devnum;
343 rdev = r10_bio->devs[slot].rdev;
344
345
346
347 update_head_pos(slot, r10_bio);
348
349 if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-space, so we can never risk returning a good error
		 * code here if the IO failed for some other reason.
		 */
359 set_bit(R10BIO_Uptodate, &r10_bio->state);
360 } else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry".
		 */
366 unsigned long flags;
367 spin_lock_irqsave(&conf->device_lock, flags);
368 if (!enough(conf, rdev->raid_disk))
369 uptodate = 1;
370 spin_unlock_irqrestore(&conf->device_lock, flags);
371 }
372 if (uptodate) {
373 raid_end_bio_io(r10_bio);
374 rdev_dec_pending(rdev, conf->mddev);
375 } else {
		/*
		 * Oops, read error - keep the refcount on the rdev.
		 */
379 char b[BDEVNAME_SIZE];
380 printk_ratelimited(KERN_ERR
381 "md/raid10:%s: %s: rescheduling sector %llu\n",
382 mdname(conf->mddev),
383 bdevname(rdev->bdev, b),
384 (unsigned long long)r10_bio->sector);
385 set_bit(R10BIO_ReadError, &r10_bio->state);
386 reschedule_retry(r10_bio);
387 }
388}
389
390static void close_write(struct r10bio *r10_bio)
391{
392
393 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
394 r10_bio->sectors,
395 !test_bit(R10BIO_Degraded, &r10_bio->state),
396 0);
397 md_write_end(r10_bio->mddev);
398}
399
400static void one_write_done(struct r10bio *r10_bio)
401{
402 if (atomic_dec_and_test(&r10_bio->remaining)) {
403 if (test_bit(R10BIO_WriteError, &r10_bio->state))
404 reschedule_retry(r10_bio);
405 else {
406 close_write(r10_bio);
407 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
408 reschedule_retry(r10_bio);
409 else
410 raid_end_bio_io(r10_bio);
411 }
412 }
413}
414
415static void raid10_end_write_request(struct bio *bio, int error)
416{
417 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
418 struct r10bio *r10_bio = bio->bi_private;
419 int dev;
420 int dec_rdev = 1;
421 struct r10conf *conf = r10_bio->mddev->private;
422 int slot, repl;
423 struct md_rdev *rdev = NULL;
424
425 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
426
427 if (repl)
428 rdev = conf->mirrors[dev].replacement;
429 if (!rdev) {
430 smp_rmb();
431 repl = 0;
432 rdev = conf->mirrors[dev].rdev;
433 }
434
435
436
437 if (!uptodate) {
438 if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
442 md_error(rdev->mddev, rdev);
443 else {
444 set_bit(WriteErrorSeen, &rdev->flags);
445 if (!test_and_set_bit(WantReplacement, &rdev->flags))
446 set_bit(MD_RECOVERY_NEEDED,
447 &rdev->mddev->recovery);
448 set_bit(R10BIO_WriteError, &r10_bio->state);
449 dec_rdev = 0;
450 }
451 } else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that we will
		 * return a good error code to the higher levels even if
		 * IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-space, so we can never risk returning a good error
		 * code here if the IO failed for some other reason.
		 */
461 sector_t first_bad;
462 int bad_sectors;
463
464 set_bit(R10BIO_Uptodate, &r10_bio->state);
465
466
467 if (is_badblock(rdev,
468 r10_bio->devs[slot].addr,
469 r10_bio->sectors,
470 &first_bad, &bad_sectors)) {
471 bio_put(bio);
472 if (repl)
473 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
474 else
475 r10_bio->devs[slot].bio = IO_MADE_GOOD;
476 dec_rdev = 0;
477 set_bit(R10BIO_MadeGood, &r10_bio->state);
478 }
479 }

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
486 one_write_done(r10_bio);
487 if (dec_rdev)
488 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
489}
490
/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been
 * assigned as described above, we start again with a device offset
 * of near_copies, so we effectively have another copy of the whole
 * array further down all the drives, but with blocks on different
 * drives from the first copy.
 * With this layout, a block is never stored twice on the same drive.
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address.
 */
516static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
517{
518 int n,f;
519 sector_t sector;
520 sector_t chunk;
521 sector_t stripe;
522 int dev;
523 int slot = 0;
524
525
526 chunk = r10bio->sector >> geo->chunk_shift;
527 sector = r10bio->sector & geo->chunk_mask;
528
529 chunk *= geo->near_copies;
530 stripe = chunk;
531 dev = sector_div(stripe, geo->raid_disks);
532 if (geo->far_offset)
533 stripe *= geo->far_copies;
534
535 sector += stripe << geo->chunk_shift;
536
537
538 for (n = 0; n < geo->near_copies; n++) {
539 int d = dev;
540 sector_t s = sector;
541 r10bio->devs[slot].addr = sector;
542 r10bio->devs[slot].devnum = d;
543 slot++;
544
545 for (f = 1; f < geo->far_copies; f++) {
546 d += geo->near_copies;
547 if (d >= geo->raid_disks)
548 d -= geo->raid_disks;
549 s += geo->stride;
550 r10bio->devs[slot].devnum = d;
551 r10bio->devs[slot].addr = s;
552 slot++;
553 }
554 dev++;
555 if (dev >= geo->raid_disks) {
556 dev = 0;
557 sector += (geo->chunk_mask + 1);
558 }
559 }
560}
561
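/*
 * Map a virtual sector to its physical (device, sector) locations.
 * While a reshape is in progress, sectors in the region the reshape has
 * not yet reached are mapped with the previous geometry and flagged
 * R10BIO_Previous; everything else uses the current geometry.
 */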
562static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
563{
564 struct geom *geo = &conf->geo;
565
566 if (conf->reshape_progress != MaxSector &&
567 ((r10bio->sector >= conf->reshape_progress) !=
568 conf->mddev->reshape_backwards)) {
569 set_bit(R10BIO_Previous, &r10bio->state);
570 geo = &conf->prev;
571 } else
572 clear_bit(R10BIO_Previous, &r10bio->state);
573
574 __raid10_find_phys(geo, r10bio);
575}
576
577static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
578{
579 sector_t offset, chunk, vchunk;
	/* Never use conf->prev as this is only called during resync
	 * or recovery, so reshape isn't happening.
	 */
583 struct geom *geo = &conf->geo;
584
585 offset = sector & geo->chunk_mask;
586 if (geo->far_offset) {
587 int fc;
588 chunk = sector >> geo->chunk_shift;
589 fc = sector_div(chunk, geo->far_copies);
590 dev -= fc * geo->near_copies;
591 if (dev < 0)
592 dev += geo->raid_disks;
593 } else {
594 while (sector >= geo->stride) {
595 sector -= geo->stride;
596 if (dev < geo->near_copies)
597 dev += geo->raid_disks - geo->near_copies;
598 else
599 dev -= geo->near_copies;
600 }
601 chunk = sector >> geo->chunk_shift;
602 }
603 vchunk = chunk * geo->raid_disks + dev;
604 sector_div(vchunk, geo->near_copies);
605 return (vchunk << geo->chunk_shift) + offset;
606}
607
/**
 *	raid10_mergeable_bvec -- tell bio layer if two requests can be merged
 *	@q: request queue
 *	@bvm: properties of new bio
 *	@biovec: the request that could be merged to it.
 *
 *	Return amount of bytes we can accept at this offset
 *	This requires checking for end-of-chunk if near_copies != raid_disks,
 *	and for subordinate merge_bvec_fns if merge_check_needed.
 */
618static int raid10_mergeable_bvec(struct request_queue *q,
619 struct bvec_merge_data *bvm,
620 struct bio_vec *biovec)
621{
622 struct mddev *mddev = q->queuedata;
623 struct r10conf *conf = mddev->private;
624 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
625 int max;
626 unsigned int chunk_sectors;
627 unsigned int bio_sectors = bvm->bi_size >> 9;
628 struct geom *geo = &conf->geo;
629
630 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
631 if (conf->reshape_progress != MaxSector &&
632 ((sector >= conf->reshape_progress) !=
633 conf->mddev->reshape_backwards))
634 geo = &conf->prev;
635
636 if (geo->near_copies < geo->raid_disks) {
637 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
638 + bio_sectors)) << 9;
639 if (max < 0)
640
641 max = 0;
642 if (max <= biovec->bv_len && bio_sectors == 0)
643 return biovec->bv_len;
644 } else
645 max = biovec->bv_len;
646
647 if (mddev->merge_check_needed) {
648 struct r10bio r10_bio;
649 int s;
650 if (conf->reshape_progress != MaxSector) {
651
652 if (max <= biovec->bv_len && bio_sectors == 0)
653 return biovec->bv_len;
654 return 0;
655 }
656 r10_bio.sector = sector;
657 raid10_find_phys(conf, &r10_bio);
658 rcu_read_lock();
659 for (s = 0; s < conf->copies; s++) {
660 int disk = r10_bio.devs[s].devnum;
661 struct md_rdev *rdev = rcu_dereference(
662 conf->mirrors[disk].rdev);
663 if (rdev && !test_bit(Faulty, &rdev->flags)) {
664 struct request_queue *q =
665 bdev_get_queue(rdev->bdev);
666 if (q->merge_bvec_fn) {
667 bvm->bi_sector = r10_bio.devs[s].addr
668 + rdev->data_offset;
669 bvm->bi_bdev = rdev->bdev;
670 max = min(max, q->merge_bvec_fn(
671 q, bvm, biovec));
672 }
673 }
674 rdev = rcu_dereference(conf->mirrors[disk].replacement);
675 if (rdev && !test_bit(Faulty, &rdev->flags)) {
676 struct request_queue *q =
677 bdev_get_queue(rdev->bdev);
678 if (q->merge_bvec_fn) {
679 bvm->bi_sector = r10_bio.devs[s].addr
680 + rdev->data_offset;
681 bvm->bi_bdev = rdev->bdev;
682 max = min(max, q->merge_bvec_fn(
683 q, bvm, biovec));
684 }
685 }
686 }
687 rcu_read_unlock();
688 }
689 return max;
690}
691
/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */

/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies or far_copies geometry.
 */
711static struct md_rdev *read_balance(struct r10conf *conf,
712 struct r10bio *r10_bio,
713 int *max_sectors)
714{
715 const sector_t this_sector = r10_bio->sector;
716 int disk, slot;
717 int sectors = r10_bio->sectors;
718 int best_good_sectors;
719 sector_t new_distance, best_dist;
720 struct md_rdev *rdev, *best_rdev;
721 int do_balance;
722 int best_slot;
723 struct geom *geo = &conf->geo;
724
725 raid10_find_phys(conf, r10_bio);
726 rcu_read_lock();
727retry:
728 sectors = r10_bio->sectors;
729 best_slot = -1;
730 best_rdev = NULL;
731 best_dist = MaxSector;
732 best_good_sectors = 0;
733 do_balance = 1;

	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
740 if (conf->mddev->recovery_cp < MaxSector
741 && (this_sector + sectors >= conf->next_resync))
742 do_balance = 0;
743
744 for (slot = 0; slot < conf->copies ; slot++) {
745 sector_t first_bad;
746 int bad_sectors;
747 sector_t dev_sector;
748
749 if (r10_bio->devs[slot].bio == IO_BLOCKED)
750 continue;
751 disk = r10_bio->devs[slot].devnum;
752 rdev = rcu_dereference(conf->mirrors[disk].replacement);
753 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
754 test_bit(Unmerged, &rdev->flags) ||
755 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
756 rdev = rcu_dereference(conf->mirrors[disk].rdev);
757 if (rdev == NULL ||
758 test_bit(Faulty, &rdev->flags) ||
759 test_bit(Unmerged, &rdev->flags))
760 continue;
761 if (!test_bit(In_sync, &rdev->flags) &&
762 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
763 continue;
764
765 dev_sector = r10_bio->devs[slot].addr;
766 if (is_badblock(rdev, dev_sector, sectors,
767 &first_bad, &bad_sectors)) {
768 if (best_dist < MaxSector)
769
770 continue;
771 if (first_bad <= dev_sector) {
				/* Cannot read here.  If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
776 bad_sectors -= (dev_sector - first_bad);
777 if (!do_balance && sectors > bad_sectors)
778 sectors = bad_sectors;
779 if (best_good_sectors > sectors)
780 best_good_sectors = sectors;
781 } else {
782 sector_t good_sectors =
783 first_bad - dev_sector;
784 if (good_sectors > best_good_sectors) {
785 best_good_sectors = good_sectors;
786 best_slot = slot;
787 best_rdev = rdev;
788 }
789 if (!do_balance)
790
791 break;
792 }
793 continue;
794 } else
795 best_good_sectors = sectors;
796
797 if (!do_balance)
798 break;
799
		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
804 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
805 break;
806
807
808 if (geo->far_copies > 1)
809 new_distance = r10_bio->devs[slot].addr;
810 else
811 new_distance = abs(r10_bio->devs[slot].addr -
812 conf->mirrors[disk].head_position);
813 if (new_distance < best_dist) {
814 best_dist = new_distance;
815 best_slot = slot;
816 best_rdev = rdev;
817 }
818 }
819 if (slot >= conf->copies) {
820 slot = best_slot;
821 rdev = best_rdev;
822 }
823
824 if (slot >= 0) {
825 atomic_inc(&rdev->nr_pending);
826 if (test_bit(Faulty, &rdev->flags)) {
			/* Cannot risk returning a device that failed
			 * before we inc'ed nr_pending
			 */
830 rdev_dec_pending(rdev, conf->mddev);
831 goto retry;
832 }
833 r10_bio->read_slot = slot;
834 } else
835 rdev = NULL;
836 rcu_read_unlock();
837 *max_sectors = best_good_sectors;
838
839 return rdev;
840}
841
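/*
 * Tell the block layer whether this array is congested: either too many
 * writes are queued internally, or one of the member devices' queues
 * is itself congested.
 */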
842static int raid10_congested(void *data, int bits)
843{
844 struct mddev *mddev = data;
845 struct r10conf *conf = mddev->private;
846 int i, ret = 0;
847
848 if ((bits & (1 << BDI_async_congested)) &&
849 conf->pending_count >= max_queued_requests)
850 return 1;
851
852 if (mddev_congested(mddev, bits))
853 return 1;
854 rcu_read_lock();
855 for (i = 0;
856 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
857 && ret == 0;
858 i++) {
859 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
860 if (rdev && !test_bit(Faulty, &rdev->flags)) {
861 struct request_queue *q = bdev_get_queue(rdev->bdev);
862
863 ret |= bdi_congested(&q->backing_dev_info, bits);
864 }
865 }
866 rcu_read_unlock();
867 return ret;
868}
869
870static void flush_pending_writes(struct r10conf *conf)
871{
	/* Any writes that have been queued but are awaiting
	 * bitmap updates get flushed here.
	 */
875 spin_lock_irq(&conf->device_lock);
876
877 if (conf->pending_bio_list.head) {
878 struct bio *bio;
879 bio = bio_list_get(&conf->pending_bio_list);
880 conf->pending_count = 0;
881 spin_unlock_irq(&conf->device_lock);
882
883
884 bitmap_unplug(conf->mddev->bitmap);
885 wake_up(&conf->wait_barrier);
886
887 while (bio) {
888 struct bio *next = bio->bi_next;
889 bio->bi_next = NULL;
890 generic_make_request(bio);
891 bio = next;
892 }
893 } else
894 spin_unlock_irq(&conf->device_lock);
895}
896
/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
 */
919static void raise_barrier(struct r10conf *conf, int force)
920{
921 BUG_ON(force && !conf->barrier);
922 spin_lock_irq(&conf->resync_lock);
923
924
925 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
926 conf->resync_lock, );
927
928
929 conf->barrier++;
930
931
932 wait_event_lock_irq(conf->wait_barrier,
933 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
934 conf->resync_lock, );
935
936 spin_unlock_irq(&conf->resync_lock);
937}
938
939static void lower_barrier(struct r10conf *conf)
940{
941 unsigned long flags;
942 spin_lock_irqsave(&conf->resync_lock, flags);
943 conf->barrier--;
944 spin_unlock_irqrestore(&conf->resync_lock, flags);
945 wake_up(&conf->wait_barrier);
946}
947
948static void wait_barrier(struct r10conf *conf)
949{
950 spin_lock_irq(&conf->resync_lock);
951 if (conf->barrier) {
952 conf->nr_waiting++;
		/* Wait for the barrier to drop.
		 * However if there are already pending
		 * requests (preventing the barrier from
		 * rising completely), and the
		 * pre-process bio queue isn't empty,
		 * then don't wait, as we need to empty
		 * that queue to get the nr_pending
		 * count down.
		 */
962 wait_event_lock_irq(conf->wait_barrier,
963 !conf->barrier ||
964 (conf->nr_pending &&
965 current->bio_list &&
966 !bio_list_empty(current->bio_list)),
967 conf->resync_lock,
968 );
969 conf->nr_waiting--;
970 }
971 conf->nr_pending++;
972 spin_unlock_irq(&conf->resync_lock);
973}
974
975static void allow_barrier(struct r10conf *conf)
976{
977 unsigned long flags;
978 spin_lock_irqsave(&conf->resync_lock, flags);
979 conf->nr_pending--;
980 spin_unlock_irqrestore(&conf->resync_lock, flags);
981 wake_up(&conf->wait_barrier);
982}
983
984static void freeze_array(struct r10conf *conf)
985{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending match nr_queued+1.
	 * This is called in the context of one normal IO request
	 * that has failed. Thus any sync request that might be pending
	 * will be to a device that cannot be active.
	 * We wait until every other pending request is either queued
	 * for the raid10 thread (nr_queued) or blocked in wait_barrier(),
	 * and we flush pending writes while waiting so that queued
	 * writes cannot deadlock against us.
	 */
998 spin_lock_irq(&conf->resync_lock);
999 conf->barrier++;
1000 conf->nr_waiting++;
1001 wait_event_lock_irq(conf->wait_barrier,
1002 conf->nr_pending == conf->nr_queued+1,
1003 conf->resync_lock,
1004 flush_pending_writes(conf));
1005
1006 spin_unlock_irq(&conf->resync_lock);
1007}
1008
1009static void unfreeze_array(struct r10conf *conf)
1010{
1011
1012 spin_lock_irq(&conf->resync_lock);
1013 conf->barrier--;
1014 conf->nr_waiting--;
1015 wake_up(&conf->wait_barrier);
1016 spin_unlock_irq(&conf->resync_lock);
1017}
1018
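/*
 * During a reshape the data may be at either the old or the new data
 * offset on 'rdev'.  Bios that target the previous layout (or any bio
 * when no reshape is running) use the old offset; the rest use the
 * new one.
 */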
1019static sector_t choose_data_offset(struct r10bio *r10_bio,
1020 struct md_rdev *rdev)
1021{
1022 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1023 test_bit(R10BIO_Previous, &r10_bio->state))
1024 return rdev->data_offset;
1025 else
1026 return rdev->new_data_offset;
1027}
1028
1029static void make_request(struct mddev *mddev, struct bio * bio)
1030{
1031 struct r10conf *conf = mddev->private;
1032 struct r10bio *r10_bio;
1033 struct bio *read_bio;
1034 int i;
1035 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1036 int chunk_sects = chunk_mask + 1;
1037 const int rw = bio_data_dir(bio);
1038 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1039 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1040 unsigned long flags;
1041 struct md_rdev *blocked_rdev;
1042 int sectors_handled;
1043 int max_sectors;
1044 int sectors;
1045
1046 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1047 md_flush_request(mddev, bio);
1048 return;
1049 }
1050
	/* If this request crosses a chunk boundary, we need to
	 * split it.  This will only happen for 1 PAGE (or less) requests.
	 */
1054 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
1055 > chunk_sects
1056 && (conf->geo.near_copies < conf->geo.raid_disks
1057 || conf->prev.near_copies < conf->prev.raid_disks))) {
1058 struct bio_pair *bp;
1059
1060 if (bio->bi_vcnt != 1 ||
1061 bio->bi_idx != 0)
1062 goto bad_map;
1063
1064
1065
1066 bp = bio_split(bio,
1067 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );

		/* Each of these 'make_request' calls will call 'wait_barrier'.
		 * If the first succeeds but the second blocks due to the resync
		 * thread raising the barrier, we will deadlock because the
		 * IO to the underlying device will be queued in generic_make_request
		 * and will never complete, so will never reduce nr_pending.
		 * So increment nr_waiting here so no new raise_barriers will
		 * succeed, and so the second wait_barrier cannot block.
		 */
1077 spin_lock_irq(&conf->resync_lock);
1078 conf->nr_waiting++;
1079 spin_unlock_irq(&conf->resync_lock);
1080
1081 make_request(mddev, &bp->bio1);
1082 make_request(mddev, &bp->bio2);
1083
1084 spin_lock_irq(&conf->resync_lock);
1085 conf->nr_waiting--;
1086 wake_up(&conf->wait_barrier);
1087 spin_unlock_irq(&conf->resync_lock);
1088
1089 bio_pair_release(bp);
1090 return;
1091 bad_map:
1092 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1093 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1094 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
1095
1096 bio_io_error(bio);
1097 return;
1098 }
1099
1100 md_write_start(mddev, bio);
1101
	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
1107 wait_barrier(conf);
1108
1109 sectors = bio->bi_size >> 9;
1110 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1111 bio->bi_sector < conf->reshape_progress &&
1112 bio->bi_sector + sectors > conf->reshape_progress) {
1113
1114
1115
1116 allow_barrier(conf);
1117 wait_event(conf->wait_barrier,
1118 conf->reshape_progress <= bio->bi_sector ||
1119 conf->reshape_progress >= bio->bi_sector + sectors);
1120 wait_barrier(conf);
1121 }
1122 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1123 bio_data_dir(bio) == WRITE &&
1124 (mddev->reshape_backwards
1125 ? (bio->bi_sector < conf->reshape_safe &&
1126 bio->bi_sector + sectors > conf->reshape_progress)
1127 : (bio->bi_sector + sectors > conf->reshape_safe &&
1128 bio->bi_sector < conf->reshape_progress))) {
1129
1130 mddev->reshape_position = conf->reshape_progress;
1131 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1132 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1133 md_wakeup_thread(mddev->thread);
1134 wait_event(mddev->sb_wait,
1135 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1136
1137 conf->reshape_safe = mddev->reshape_position;
1138 }
1139
1140 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1141
1142 r10_bio->master_bio = bio;
1143 r10_bio->sectors = sectors;
1144
1145 r10_bio->mddev = mddev;
1146 r10_bio->sector = bio->bi_sector;
1147 r10_bio->state = 0;
1148
	/* We might need to issue multiple reads to different
	 * devices if there are bad blocks around, so we keep
	 * track of the number of reads in bio->bi_phys_segments.
	 * If this is 0, there is only one r10_bio and no locking
	 * will be needed when the request completes.  If it is
	 * non-zero, then it is the number of not-completed requests.
	 */
1156 bio->bi_phys_segments = 0;
1157 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1158
1159 if (rw == READ) {
1160
1161
1162
1163 struct md_rdev *rdev;
1164 int slot;
1165
1166read_again:
1167 rdev = read_balance(conf, r10_bio, &max_sectors);
1168 if (!rdev) {
1169 raid_end_bio_io(r10_bio);
1170 return;
1171 }
1172 slot = r10_bio->read_slot;
1173
1174 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1175 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1176 max_sectors);
1177
1178 r10_bio->devs[slot].bio = read_bio;
1179 r10_bio->devs[slot].rdev = rdev;
1180
1181 read_bio->bi_sector = r10_bio->devs[slot].addr +
1182 choose_data_offset(r10_bio, rdev);
1183 read_bio->bi_bdev = rdev->bdev;
1184 read_bio->bi_end_io = raid10_end_read_request;
1185 read_bio->bi_rw = READ | do_sync;
1186 read_bio->bi_private = r10_bio;
1187
1188 if (max_sectors < r10_bio->sectors) {
			/* Could not read all from this device, so we will
			 * need another r10_bio.
			 */
			sectors_handled = (r10_bio->sector + max_sectors
					   - bio->bi_sector);
1194 r10_bio->sectors = max_sectors;
1195 spin_lock_irq(&conf->device_lock);
1196 if (bio->bi_phys_segments == 0)
1197 bio->bi_phys_segments = 2;
1198 else
1199 bio->bi_phys_segments++;
			spin_unlock_irq(&conf->device_lock);
			/* Cannot call generic_make_request directly
			 * as that will be queued in __generic_make_request
			 * and subsequent mempool_alloc might block
			 * waiting for it.  So hand the bio over to raid10d.
			 */
1206 reschedule_retry(r10_bio);
1207
1208 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1209
1210 r10_bio->master_bio = bio;
1211 r10_bio->sectors = ((bio->bi_size >> 9)
1212 - sectors_handled);
1213 r10_bio->state = 0;
1214 r10_bio->mddev = mddev;
1215 r10_bio->sector = bio->bi_sector + sectors_handled;
1216 goto read_again;
1217 } else
1218 generic_make_request(read_bio);
1219 return;
1220 }
1221
	/*
	 * WRITE:
	 */
1225 if (conf->pending_count >= max_queued_requests) {
1226 md_wakeup_thread(mddev->thread);
1227 wait_event(conf->wait_barrier,
1228 conf->pending_count < max_queued_requests);
1229 }
	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 * If there are known/acknowledged bad blocks on any device
	 * on which we have seen a write error, we want to avoid
	 * writing to those blocks.  This potentially requires several
	 * writes to write around the bad blocks.  Each set of writes
	 * gets its own r10_bio with a set of bios attached.  The number
	 * of r10_bios is recorded in bio->bi_phys_segments just as with
	 * the read case.
	 */
1242 r10_bio->read_slot = -1;
1243 raid10_find_phys(conf, r10_bio);
1244retry_write:
1245 blocked_rdev = NULL;
1246 rcu_read_lock();
1247 max_sectors = r10_bio->sectors;
1248
1249 for (i = 0; i < conf->copies; i++) {
1250 int d = r10_bio->devs[i].devnum;
1251 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1252 struct md_rdev *rrdev = rcu_dereference(
1253 conf->mirrors[d].replacement);
1254 if (rdev == rrdev)
1255 rrdev = NULL;
1256 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1257 atomic_inc(&rdev->nr_pending);
1258 blocked_rdev = rdev;
1259 break;
1260 }
1261 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1262 atomic_inc(&rrdev->nr_pending);
1263 blocked_rdev = rrdev;
1264 break;
1265 }
1266 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1267 || test_bit(Unmerged, &rrdev->flags)))
1268 rrdev = NULL;
1269
1270 r10_bio->devs[i].bio = NULL;
1271 r10_bio->devs[i].repl_bio = NULL;
1272 if (!rdev || test_bit(Faulty, &rdev->flags) ||
1273 test_bit(Unmerged, &rdev->flags)) {
1274 set_bit(R10BIO_Degraded, &r10_bio->state);
1275 continue;
1276 }
1277 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1278 sector_t first_bad;
1279 sector_t dev_sector = r10_bio->devs[i].addr;
1280 int bad_sectors;
1281 int is_bad;
1282
1283 is_bad = is_badblock(rdev, dev_sector,
1284 max_sectors,
1285 &first_bad, &bad_sectors);
1286 if (is_bad < 0) {
				/* Mustn't write here until the bad block
				 * is acknowledged.
				 */
1290 atomic_inc(&rdev->nr_pending);
1291 set_bit(BlockedBadBlocks, &rdev->flags);
1292 blocked_rdev = rdev;
1293 break;
1294 }
1295 if (is_bad && first_bad <= dev_sector) {
1296
1297 bad_sectors -= (dev_sector - first_bad);
1298 if (bad_sectors < max_sectors)
1299
1300
1301
1302 max_sectors = bad_sectors;
				/* We don't set R10BIO_Degraded as that
				 * only applies if the disk is missing,
				 * so it might be re-added, and we want to
				 * know to recover this chunk.
				 * In this case the device is here, and the
				 * fact that this chunk is not in-sync is
				 * recorded in the bad block log.
				 */
1311 continue;
1312 }
1313 if (is_bad) {
1314 int good_sectors = first_bad - dev_sector;
1315 if (good_sectors < max_sectors)
1316 max_sectors = good_sectors;
1317 }
1318 }
1319 r10_bio->devs[i].bio = bio;
1320 atomic_inc(&rdev->nr_pending);
1321 if (rrdev) {
1322 r10_bio->devs[i].repl_bio = bio;
1323 atomic_inc(&rrdev->nr_pending);
1324 }
1325 }
1326 rcu_read_unlock();
1327
1328 if (unlikely(blocked_rdev)) {
1329
1330 int j;
1331 int d;
1332
1333 for (j = 0; j < i; j++) {
1334 if (r10_bio->devs[j].bio) {
1335 d = r10_bio->devs[j].devnum;
1336 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1337 }
1338 if (r10_bio->devs[j].repl_bio) {
1339 struct md_rdev *rdev;
1340 d = r10_bio->devs[j].devnum;
1341 rdev = conf->mirrors[d].replacement;
1342 if (!rdev) {
1343
1344 smp_mb();
1345 rdev = conf->mirrors[d].rdev;
1346 }
1347 rdev_dec_pending(rdev, mddev);
1348 }
1349 }
1350 allow_barrier(conf);
1351 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1352 wait_barrier(conf);
1353 goto retry_write;
1354 }
1355
1356 if (max_sectors < r10_bio->sectors) {
1357
1358
1359
1360 r10_bio->sectors = max_sectors;
1361 spin_lock_irq(&conf->device_lock);
1362 if (bio->bi_phys_segments == 0)
1363 bio->bi_phys_segments = 2;
1364 else
1365 bio->bi_phys_segments++;
1366 spin_unlock_irq(&conf->device_lock);
1367 }
1368 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1369
1370 atomic_set(&r10_bio->remaining, 1);
1371 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1372
1373 for (i = 0; i < conf->copies; i++) {
1374 struct bio *mbio;
1375 int d = r10_bio->devs[i].devnum;
1376 if (!r10_bio->devs[i].bio)
1377 continue;
1378
1379 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1380 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1381 max_sectors);
1382 r10_bio->devs[i].bio = mbio;
1383
1384 mbio->bi_sector = (r10_bio->devs[i].addr+
1385 choose_data_offset(r10_bio,
1386 conf->mirrors[d].rdev));
1387 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1388 mbio->bi_end_io = raid10_end_write_request;
1389 mbio->bi_rw = WRITE | do_sync | do_fua;
1390 mbio->bi_private = r10_bio;
1391
1392 atomic_inc(&r10_bio->remaining);
1393 spin_lock_irqsave(&conf->device_lock, flags);
1394 bio_list_add(&conf->pending_bio_list, mbio);
1395 conf->pending_count++;
1396 spin_unlock_irqrestore(&conf->device_lock, flags);
1397 if (!mddev_check_plugged(mddev))
1398 md_wakeup_thread(mddev->thread);
1399
1400 if (!r10_bio->devs[i].repl_bio)
1401 continue;
1402
1403 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1404 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1405 max_sectors);
1406 r10_bio->devs[i].repl_bio = mbio;
1407
		/* We are actively writing to the original device
		 * so it cannot disappear, so the replacement cannot
		 * become NULL here.
		 */
1412 mbio->bi_sector = (r10_bio->devs[i].addr +
1413 choose_data_offset(
1414 r10_bio,
1415 conf->mirrors[d].replacement));
1416 mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
1417 mbio->bi_end_io = raid10_end_write_request;
1418 mbio->bi_rw = WRITE | do_sync | do_fua;
1419 mbio->bi_private = r10_bio;
1420
1421 atomic_inc(&r10_bio->remaining);
1422 spin_lock_irqsave(&conf->device_lock, flags);
1423 bio_list_add(&conf->pending_bio_list, mbio);
1424 conf->pending_count++;
1425 spin_unlock_irqrestore(&conf->device_lock, flags);
1426 if (!mddev_check_plugged(mddev))
1427 md_wakeup_thread(mddev->thread);
1428 }
1429
1430
	/* Don't remove the bias on 'remaining' (one_write_done) until
	 * after checking if we need to go around again.
	 */
1434 if (sectors_handled < (bio->bi_size >> 9)) {
1435 one_write_done(r10_bio);
1436
1437
1438
1439 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1440
1441 r10_bio->master_bio = bio;
1442 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1443
1444 r10_bio->mddev = mddev;
1445 r10_bio->sector = bio->bi_sector + sectors_handled;
1446 r10_bio->state = 0;
1447 goto retry_write;
1448 }
1449 one_write_done(r10_bio);
1450
1451
1452 wake_up(&conf->wait_barrier);
1453}
1454
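/*
 * Emit the raid10 specific part of the /proc/mdstat status line:
 * chunk size, layout (near/far/offset copies) and per-device state.
 */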
1455static void status(struct seq_file *seq, struct mddev *mddev)
1456{
1457 struct r10conf *conf = mddev->private;
1458 int i;
1459
1460 if (conf->geo.near_copies < conf->geo.raid_disks)
1461 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1462 if (conf->geo.near_copies > 1)
1463 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1464 if (conf->geo.far_copies > 1) {
1465 if (conf->geo.far_offset)
1466 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1467 else
1468 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1469 }
1470 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1471 conf->geo.raid_disks - mddev->degraded);
1472 for (i = 0; i < conf->geo.raid_disks; i++)
1473 seq_printf(seq, "%s",
1474 conf->mirrors[i].rdev &&
1475 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1476 seq_printf(seq, "]");
1477}
1478
/* check if there are enough drives for
 * every block to appear on at least one.
 * Don't consider the device numbered 'ignore'
 * as we might be about to remove it.
 */
1484static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
1485{
1486 int first = 0;
1487
1488 do {
1489 int n = conf->copies;
1490 int cnt = 0;
1491 while (n--) {
1492 if (conf->mirrors[first].rdev &&
1493 first != ignore)
1494 cnt++;
1495 first = (first+1) % geo->raid_disks;
1496 }
1497 if (cnt == 0)
1498 return 0;
1499 } while (first != 0);
1500 return 1;
1501}
1502
1503static int enough(struct r10conf *conf, int ignore)
1504{
1505 return _enough(conf, &conf->geo, ignore) &&
1506 _enough(conf, &conf->prev, ignore);
1507}
1508
1509static void error(struct mddev *mddev, struct md_rdev *rdev)
1510{
1511 char b[BDEVNAME_SIZE];
1512 struct r10conf *conf = mddev->private;
1513
	/*
	 * If this device is still In_sync but failing it would leave some
	 * data blocks with no remaining mirror, don't mark it Faulty here;
	 * just let the IO error propagate.
	 */
1519
1520 if (test_bit(In_sync, &rdev->flags)
1521 && !enough(conf, rdev->raid_disk))
1522
1523
1524
1525 return;
1526 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1527 unsigned long flags;
1528 spin_lock_irqsave(&conf->device_lock, flags);
1529 mddev->degraded++;
1530 spin_unlock_irqrestore(&conf->device_lock, flags);
1531
1532
1533
1534 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1535 }
1536 set_bit(Blocked, &rdev->flags);
1537 set_bit(Faulty, &rdev->flags);
1538 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1539 printk(KERN_ALERT
1540 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1541 "md/raid10:%s: Operation continuing on %d devices.\n",
1542 mdname(mddev), bdevname(rdev->bdev, b),
1543 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1544}
1545
1546static void print_conf(struct r10conf *conf)
1547{
1548 int i;
1549 struct mirror_info *tmp;
1550
1551 printk(KERN_DEBUG "RAID10 conf printout:\n");
1552 if (!conf) {
1553 printk(KERN_DEBUG "(!conf)\n");
1554 return;
1555 }
1556 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1557 conf->geo.raid_disks);
1558
1559 for (i = 0; i < conf->geo.raid_disks; i++) {
1560 char b[BDEVNAME_SIZE];
1561 tmp = conf->mirrors + i;
1562 if (tmp->rdev)
1563 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1564 i, !test_bit(In_sync, &tmp->rdev->flags),
1565 !test_bit(Faulty, &tmp->rdev->flags),
1566 bdevname(tmp->rdev->bdev,b));
1567 }
1568}
1569
1570static void close_sync(struct r10conf *conf)
1571{
1572 wait_barrier(conf);
1573 allow_barrier(conf);
1574
1575 mempool_destroy(conf->r10buf_pool);
1576 conf->r10buf_pool = NULL;
1577}
1578
1579static int raid10_spare_active(struct mddev *mddev)
1580{
1581 int i;
1582 struct r10conf *conf = mddev->private;
1583 struct mirror_info *tmp;
1584 int count = 0;
1585 unsigned long flags;
1586
	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync
	 */
1591 for (i = 0; i < conf->geo.raid_disks; i++) {
1592 tmp = conf->mirrors + i;
1593 if (tmp->replacement
1594 && tmp->replacement->recovery_offset == MaxSector
1595 && !test_bit(Faulty, &tmp->replacement->flags)
1596 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1597
1598 if (!tmp->rdev
1599 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1600 count++;
1601 if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
1606 set_bit(Faulty, &tmp->rdev->flags);
1607 sysfs_notify_dirent_safe(
1608 tmp->rdev->sysfs_state);
1609 }
1610 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1611 } else if (tmp->rdev
1612 && !test_bit(Faulty, &tmp->rdev->flags)
1613 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1614 count++;
1615 sysfs_notify_dirent(tmp->rdev->sysfs_state);
1616 }
1617 }
1618 spin_lock_irqsave(&conf->device_lock, flags);
1619 mddev->degraded -= count;
1620 spin_unlock_irqrestore(&conf->device_lock, flags);
1621
1622 print_conf(conf);
1623 return count;
1624}
1625
1626
1627static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1628{
1629 struct r10conf *conf = mddev->private;
1630 int err = -EEXIST;
1631 int mirror;
1632 int first = 0;
1633 int last = conf->geo.raid_disks - 1;
1634 struct request_queue *q = bdev_get_queue(rdev->bdev);
1635
1636 if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
1640 return -EBUSY;
1641 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
1642 return -EINVAL;
1643
1644 if (rdev->raid_disk >= 0)
1645 first = last = rdev->raid_disk;
1646
1647 if (q->merge_bvec_fn) {
1648 set_bit(Unmerged, &rdev->flags);
1649 mddev->merge_check_needed = 1;
1650 }
1651
1652 if (rdev->saved_raid_disk >= first &&
1653 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1654 mirror = rdev->saved_raid_disk;
1655 else
1656 mirror = first;
1657 for ( ; mirror <= last ; mirror++) {
1658 struct mirror_info *p = &conf->mirrors[mirror];
1659 if (p->recovery_disabled == mddev->recovery_disabled)
1660 continue;
1661 if (p->rdev) {
1662 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1663 p->replacement != NULL)
1664 continue;
1665 clear_bit(In_sync, &rdev->flags);
1666 set_bit(Replacement, &rdev->flags);
1667 rdev->raid_disk = mirror;
1668 err = 0;
1669 disk_stack_limits(mddev->gendisk, rdev->bdev,
1670 rdev->data_offset << 9);
1671 conf->fullsync = 1;
1672 rcu_assign_pointer(p->replacement, rdev);
1673 break;
1674 }
1675
1676 disk_stack_limits(mddev->gendisk, rdev->bdev,
1677 rdev->data_offset << 9);
1678
1679 p->head_position = 0;
1680 p->recovery_disabled = mddev->recovery_disabled - 1;
1681 rdev->raid_disk = mirror;
1682 err = 0;
1683 if (rdev->saved_raid_disk != mirror)
1684 conf->fullsync = 1;
1685 rcu_assign_pointer(p->rdev, rdev);
1686 break;
1687 }
1688 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
		/* Some requests might not have seen this new
		 * merge_bvec_fn.  We must wait for them to complete
		 * before merging the device fully.
		 * First we make sure any code which has tested
		 * our function has submitted the request, then
		 * we wait for all outstanding requests to complete.
		 */
1696 synchronize_sched();
1697 raise_barrier(conf, 0);
1698 lower_barrier(conf);
1699 clear_bit(Unmerged, &rdev->flags);
1700 }
1701 md_integrity_add_rdev(rdev, mddev);
1702 print_conf(conf);
1703 return err;
1704}
1705
1706static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1707{
1708 struct r10conf *conf = mddev->private;
1709 int err = 0;
1710 int number = rdev->raid_disk;
1711 struct md_rdev **rdevp;
1712 struct mirror_info *p = conf->mirrors + number;
1713
1714 print_conf(conf);
1715 if (rdev == p->rdev)
1716 rdevp = &p->rdev;
1717 else if (rdev == p->replacement)
1718 rdevp = &p->replacement;
1719 else
1720 return 0;
1721
1722 if (test_bit(In_sync, &rdev->flags) ||
1723 atomic_read(&rdev->nr_pending)) {
1724 err = -EBUSY;
1725 goto abort;
1726 }
	/* Only remove non-faulty devices if recovery
	 * is not possible.
	 */
1730 if (!test_bit(Faulty, &rdev->flags) &&
1731 mddev->recovery_disabled != p->recovery_disabled &&
1732 (!p->replacement || p->replacement == rdev) &&
1733 number < conf->geo.raid_disks &&
1734 enough(conf, -1)) {
1735 err = -EBUSY;
1736 goto abort;
1737 }
1738 *rdevp = NULL;
1739 synchronize_rcu();
1740 if (atomic_read(&rdev->nr_pending)) {
1741
1742 err = -EBUSY;
1743 *rdevp = rdev;
1744 goto abort;
1745 } else if (p->replacement) {
1746
1747 p->rdev = p->replacement;
1748 clear_bit(Replacement, &p->replacement->flags);
1749 smp_mb();
1750
1751
1752 p->replacement = NULL;
1753 clear_bit(WantReplacement, &rdev->flags);
1754 } else
		/* We might have just removed the Replacement as faulty -
		 * clear the bit just in case.
		 */
1758 clear_bit(WantReplacement, &rdev->flags);
1759
1760 err = md_integrity_register(mddev);
1761
1762abort:
1763
1764 print_conf(conf);
1765 return err;
1766}
1767
1768
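/*
 * Completion handler for resync/recovery reads: record whether the read
 * succeeded and hand the r10bio to raid10d once all reads are done.
 */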
1769static void end_sync_read(struct bio *bio, int error)
1770{
1771 struct r10bio *r10_bio = bio->bi_private;
1772 struct r10conf *conf = r10_bio->mddev->private;
1773 int d;
1774
1775 if (bio == r10_bio->master_bio) {
1776
1777 d = r10_bio->read_slot;
1778 } else
1779 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1780
1781 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1782 set_bit(R10BIO_Uptodate, &r10_bio->state);
1783 else
1784
1785
1786
1787 atomic_add(r10_bio->sectors,
1788 &conf->mirrors[d].rdev->corrected_errors);
1789
	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
1793 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1794 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1795 atomic_dec_and_test(&r10_bio->remaining)) {
1796
1797
1798
1799 reschedule_retry(r10_bio);
1800 }
1801}
1802
1803static void end_sync_request(struct r10bio *r10_bio)
1804{
1805 struct mddev *mddev = r10_bio->mddev;
1806
1807 while (atomic_dec_and_test(&r10_bio->remaining)) {
1808 if (r10_bio->master_bio == NULL) {
1809
1810 sector_t s = r10_bio->sectors;
1811 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1812 test_bit(R10BIO_WriteError, &r10_bio->state))
1813 reschedule_retry(r10_bio);
1814 else
1815 put_buf(r10_bio);
1816 md_done_sync(mddev, s, 1);
1817 break;
1818 } else {
1819 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1820 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1821 test_bit(R10BIO_WriteError, &r10_bio->state))
1822 reschedule_retry(r10_bio);
1823 else
1824 put_buf(r10_bio);
1825 r10_bio = r10_bio2;
1826 }
1827 }
1828}
1829
1830static void end_sync_write(struct bio *bio, int error)
1831{
1832 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1833 struct r10bio *r10_bio = bio->bi_private;
1834 struct mddev *mddev = r10_bio->mddev;
1835 struct r10conf *conf = mddev->private;
1836 int d;
1837 sector_t first_bad;
1838 int bad_sectors;
1839 int slot;
1840 int repl;
1841 struct md_rdev *rdev = NULL;
1842
1843 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1844 if (repl)
1845 rdev = conf->mirrors[d].replacement;
1846 else
1847 rdev = conf->mirrors[d].rdev;
1848
1849 if (!uptodate) {
1850 if (repl)
1851 md_error(mddev, rdev);
1852 else {
1853 set_bit(WriteErrorSeen, &rdev->flags);
1854 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1855 set_bit(MD_RECOVERY_NEEDED,
1856 &rdev->mddev->recovery);
1857 set_bit(R10BIO_WriteError, &r10_bio->state);
1858 }
1859 } else if (is_badblock(rdev,
1860 r10_bio->devs[slot].addr,
1861 r10_bio->sectors,
1862 &first_bad, &bad_sectors))
1863 set_bit(R10BIO_MadeGood, &r10_bio->state);
1864
1865 rdev_dec_pending(rdev, mddev);
1866
1867 end_sync_request(r10_bio);
1868}
1869
/*
 * Note: sync and recover are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical addresses, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */

/*
 * We check if all blocks are in-sync and only write to blocks that
 * aren't in sync
 */
1886static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1887{
1888 struct r10conf *conf = mddev->private;
1889 int i, first;
1890 struct bio *tbio, *fbio;
1891 int vcnt;
1892
1893 atomic_set(&r10_bio->remaining, 1);
1894
1895
1896 for (i=0; i<conf->copies; i++)
1897 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1898 break;
1899
1900 if (i == conf->copies)
1901 goto done;
1902
1903 first = i;
1904 fbio = r10_bio->devs[i].bio;
1905
1906 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
1907
1908 for (i=0 ; i < conf->copies ; i++) {
1909 int j, d;
1910
1911 tbio = r10_bio->devs[i].bio;
1912
1913 if (tbio->bi_end_io != end_sync_read)
1914 continue;
1915 if (i == first)
1916 continue;
1917 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
			/* We know that the bi_io_vec layout is the same for
			 * both 'first' and 'i', so we just compare them.
			 * All vec entries are PAGE_SIZE;
			 */
1922 for (j = 0; j < vcnt; j++)
1923 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1924 page_address(tbio->bi_io_vec[j].bv_page),
1925 fbio->bi_io_vec[j].bv_len))
1926 break;
1927 if (j == vcnt)
1928 continue;
1929 mddev->resync_mismatches += r10_bio->sectors;
1930 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1931
1932 continue;
1933 }
		/* Ok, we need to write this bio, either to correct an
		 * inconsistency or to correct an unreadable block.
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted these.
		 */
1939 tbio->bi_vcnt = vcnt;
1940 tbio->bi_size = r10_bio->sectors << 9;
1941 tbio->bi_idx = 0;
1942 tbio->bi_phys_segments = 0;
1943 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1944 tbio->bi_flags |= 1 << BIO_UPTODATE;
1945 tbio->bi_next = NULL;
1946 tbio->bi_rw = WRITE;
1947 tbio->bi_private = r10_bio;
1948 tbio->bi_sector = r10_bio->devs[i].addr;
1949
1950 for (j=0; j < vcnt ; j++) {
1951 tbio->bi_io_vec[j].bv_offset = 0;
1952 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1953
1954 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1955 page_address(fbio->bi_io_vec[j].bv_page),
1956 PAGE_SIZE);
1957 }
1958 tbio->bi_end_io = end_sync_write;
1959
1960 d = r10_bio->devs[i].devnum;
1961 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1962 atomic_inc(&r10_bio->remaining);
1963 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1964
1965 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1966 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1967 generic_make_request(tbio);
1968 }
1969
1970
1971
1972
1973 for (i = 0; i < conf->copies; i++) {
1974 int j, d;
1975
1976 tbio = r10_bio->devs[i].repl_bio;
1977 if (!tbio || !tbio->bi_end_io)
1978 continue;
1979 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
1980 && r10_bio->devs[i].bio != fbio)
1981 for (j = 0; j < vcnt; j++)
1982 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1983 page_address(fbio->bi_io_vec[j].bv_page),
1984 PAGE_SIZE);
1985 d = r10_bio->devs[i].devnum;
1986 atomic_inc(&r10_bio->remaining);
1987 md_sync_acct(conf->mirrors[d].replacement->bdev,
1988 tbio->bi_size >> 9);
1989 generic_make_request(tbio);
1990 }
1991
1992done:
1993 if (atomic_dec_and_test(&r10_bio->remaining)) {
1994 md_done_sync(mddev, r10_bio->sectors, 1);
1995 put_buf(r10_bio);
1996 }
1997}
1998
/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 *
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virtual
 * address.  There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use: the first for reading,
 * the second for writing.
 */
2009static void fix_recovery_read_error(struct r10bio *r10_bio)
2010{
	/* We got a read error during recovery.
	 * We repeat the read in smaller page-sized chunks.
	 * If a read succeeds, write it to the new device or record
	 * a bad block if we cannot.
	 * If a read fails, record a bad block on both old and
	 * new devices.
	 */
2018 struct mddev *mddev = r10_bio->mddev;
2019 struct r10conf *conf = mddev->private;
2020 struct bio *bio = r10_bio->devs[0].bio;
2021 sector_t sect = 0;
2022 int sectors = r10_bio->sectors;
2023 int idx = 0;
2024 int dr = r10_bio->devs[0].devnum;
2025 int dw = r10_bio->devs[1].devnum;
2026
2027 while (sectors) {
2028 int s = sectors;
2029 struct md_rdev *rdev;
2030 sector_t addr;
2031 int ok;
2032
2033 if (s > (PAGE_SIZE>>9))
2034 s = PAGE_SIZE >> 9;
2035
2036 rdev = conf->mirrors[dr].rdev;
2037 addr = r10_bio->devs[0].addr + sect,
2038 ok = sync_page_io(rdev,
2039 addr,
2040 s << 9,
2041 bio->bi_io_vec[idx].bv_page,
2042 READ, false);
2043 if (ok) {
2044 rdev = conf->mirrors[dw].rdev;
2045 addr = r10_bio->devs[1].addr + sect;
2046 ok = sync_page_io(rdev,
2047 addr,
2048 s << 9,
2049 bio->bi_io_vec[idx].bv_page,
2050 WRITE, false);
2051 if (!ok) {
2052 set_bit(WriteErrorSeen, &rdev->flags);
2053 if (!test_and_set_bit(WantReplacement,
2054 &rdev->flags))
2055 set_bit(MD_RECOVERY_NEEDED,
2056 &rdev->mddev->recovery);
2057 }
2058 }
2059 if (!ok) {
			/* We don't worry if we cannot set a bad block -
			 * it really is bad so there is no loss in not
			 * recording it yet.
			 */
2064 rdev_set_badblocks(rdev, addr, s, 0);
2065
2066 if (rdev != conf->mirrors[dw].rdev) {
2067
2068 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2069 addr = r10_bio->devs[1].addr + sect;
2070 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2071 if (!ok) {
2072
2073 printk(KERN_NOTICE
2074 "md/raid10:%s: recovery aborted"
2075 " due to read error\n",
2076 mdname(mddev));
2077
2078 conf->mirrors[dw].recovery_disabled
2079 = mddev->recovery_disabled;
2080 set_bit(MD_RECOVERY_INTR,
2081 &mddev->recovery);
2082 break;
2083 }
2084 }
2085 }
2086
2087 sectors -= s;
2088 sect += s;
2089 idx++;
2090 }
2091}
2092
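/*
 * Write the data that was read during recovery out to the device being
 * recovered, and to its replacement if one is active.  If the read
 * failed, try to repair it in page-sized pieces first.
 */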
2093static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2094{
2095 struct r10conf *conf = mddev->private;
2096 int d;
2097 struct bio *wbio, *wbio2;
2098
2099 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2100 fix_recovery_read_error(r10_bio);
2101 end_sync_request(r10_bio);
2102 return;
2103 }
2104
	/*
	 * share the pages with the first bio
	 * and submit the write request
	 */
2109 d = r10_bio->devs[1].devnum;
2110 wbio = r10_bio->devs[1].bio;
2111 wbio2 = r10_bio->devs[1].repl_bio;
2112 if (wbio->bi_end_io) {
2113 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2114 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
2115 generic_make_request(wbio);
2116 }
2117 if (wbio2 && wbio2->bi_end_io) {
2118 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2119 md_sync_acct(conf->mirrors[d].replacement->bdev,
2120 wbio2->bi_size >> 9);
2121 generic_make_request(wbio2);
2122 }
2123}
2124
/*
 * Used by fix_read_error() to decay the per rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
2132static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2133{
2134 struct timespec cur_time_mon;
2135 unsigned long hours_since_last;
2136 unsigned int read_errors = atomic_read(&rdev->read_errors);
2137
2138 ktime_get_ts(&cur_time_mon);
2139
2140 if (rdev->last_read_error.tv_sec == 0 &&
2141 rdev->last_read_error.tv_nsec == 0) {
2142
2143 rdev->last_read_error = cur_time_mon;
2144 return;
2145 }
2146
2147 hours_since_last = (cur_time_mon.tv_sec -
2148 rdev->last_read_error.tv_sec) / 3600;
2149
2150 rdev->last_read_error = cur_time_mon;
2151
	/*
	 * If hours_since_last is > the number of bits in read_errors
	 * just set read errors to 0. We do this to avoid
	 * overflowing the shift of read_errors by hours_since_last.
	 */
2157 if (hours_since_last >= 8 * sizeof(read_errors))
2158 atomic_set(&rdev->read_errors, 0);
2159 else
2160 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2161}
2162
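/*
 * Synchronously read or write one small range, respecting the bad block
 * list.  Returns 1 on success, 0 on I/O error (after recording a bad
 * block or failing the device), and -1 if the range hits a known bad
 * block that we must not touch.
 */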
2163static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2164 int sectors, struct page *page, int rw)
2165{
2166 sector_t first_bad;
2167 int bad_sectors;
2168
2169 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2170 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2171 return -1;
2172 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2173
2174 return 1;
2175 if (rw == WRITE) {
2176 set_bit(WriteErrorSeen, &rdev->flags);
2177 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2178 set_bit(MD_RECOVERY_NEEDED,
2179 &rdev->mddev->recovery);
2180 }
2181
2182 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2183 md_error(rdev->mddev, rdev);
2184 return 0;
2185}
2186
/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
2195static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2196{
2197 int sect = 0;
2198 int sectors = r10_bio->sectors;
2199 struct md_rdev*rdev;
2200 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2201 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2202
2203
2204
2205
2206 rdev = conf->mirrors[d].rdev;
2207
2208 if (test_bit(Faulty, &rdev->flags))
2209
2210
2211 return;
2212
2213 check_decay_read_errors(mddev, rdev);
2214 atomic_inc(&rdev->read_errors);
2215 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2216 char b[BDEVNAME_SIZE];
2217 bdevname(rdev->bdev, b);
2218
2219 printk(KERN_NOTICE
2220 "md/raid10:%s: %s: Raid device exceeded "
2221 "read_error threshold [cur %d:max %d]\n",
2222 mdname(mddev), b,
2223 atomic_read(&rdev->read_errors), max_read_errors);
2224 printk(KERN_NOTICE
2225 "md/raid10:%s: %s: Failing raid device\n",
2226 mdname(mddev), b);
2227 md_error(mddev, conf->mirrors[d].rdev);
2228 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2229 return;
2230 }
2231
2232 while(sectors) {
2233 int s = sectors;
2234 int sl = r10_bio->read_slot;
2235 int success = 0;
2236 int start;
2237
2238 if (s > (PAGE_SIZE>>9))
2239 s = PAGE_SIZE >> 9;
2240
2241 rcu_read_lock();
2242 do {
2243 sector_t first_bad;
2244 int bad_sectors;
2245
2246 d = r10_bio->devs[sl].devnum;
2247 rdev = rcu_dereference(conf->mirrors[d].rdev);
2248 if (rdev &&
2249 !test_bit(Unmerged, &rdev->flags) &&
2250 test_bit(In_sync, &rdev->flags) &&
2251 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2252 &first_bad, &bad_sectors) == 0) {
2253 atomic_inc(&rdev->nr_pending);
2254 rcu_read_unlock();
2255 success = sync_page_io(rdev,
2256 r10_bio->devs[sl].addr +
2257 sect,
2258 s<<9,
2259 conf->tmppage, READ, false);
2260 rdev_dec_pending(rdev, mddev);
2261 rcu_read_lock();
2262 if (success)
2263 break;
2264 }
2265 sl++;
2266 if (sl == conf->copies)
2267 sl = 0;
2268 } while (!success && sl != r10_bio->read_slot);
2269 rcu_read_unlock();
2270
2271 if (!success) {
			/* Cannot read from anywhere, just mark the block
			 * as bad on the first device to discourage future
			 * reads.
			 */
2276 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2277 rdev = conf->mirrors[dn].rdev;
2278
2279 if (!rdev_set_badblocks(
2280 rdev,
2281 r10_bio->devs[r10_bio->read_slot].addr
2282 + sect,
2283 s, 0)) {
2284 md_error(mddev, rdev);
2285 r10_bio->devs[r10_bio->read_slot].bio
2286 = IO_BLOCKED;
2287 }
2288 break;
2289 }
2290
2291 start = sl;
2292
2293 rcu_read_lock();
2294 while (sl != r10_bio->read_slot) {
2295 char b[BDEVNAME_SIZE];
2296
2297 if (sl==0)
2298 sl = conf->copies;
2299 sl--;
2300 d = r10_bio->devs[sl].devnum;
2301 rdev = rcu_dereference(conf->mirrors[d].rdev);
2302 if (!rdev ||
2303 test_bit(Unmerged, &rdev->flags) ||
2304 !test_bit(In_sync, &rdev->flags))
2305 continue;
2306
2307 atomic_inc(&rdev->nr_pending);
2308 rcu_read_unlock();
2309 if (r10_sync_page_io(rdev,
2310 r10_bio->devs[sl].addr +
2311 sect,
2312 s, conf->tmppage, WRITE)
2313 == 0) {
2314
2315 printk(KERN_NOTICE
2316 "md/raid10:%s: read correction "
2317 "write failed"
2318 " (%d sectors at %llu on %s)\n",
2319 mdname(mddev), s,
2320 (unsigned long long)(
2321 sect +
2322 choose_data_offset(r10_bio,
2323 rdev)),
2324 bdevname(rdev->bdev, b));
2325 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2326 "drive\n",
2327 mdname(mddev),
2328 bdevname(rdev->bdev, b));
2329 }
2330 rdev_dec_pending(rdev, mddev);
2331 rcu_read_lock();
2332 }
2333 sl = start;
2334 while (sl != r10_bio->read_slot) {
2335 char b[BDEVNAME_SIZE];
2336
2337 if (sl==0)
2338 sl = conf->copies;
2339 sl--;
2340 d = r10_bio->devs[sl].devnum;
2341 rdev = rcu_dereference(conf->mirrors[d].rdev);
2342 if (!rdev ||
2343 !test_bit(In_sync, &rdev->flags))
2344 continue;
2345
2346 atomic_inc(&rdev->nr_pending);
2347 rcu_read_unlock();
2348 switch (r10_sync_page_io(rdev,
2349 r10_bio->devs[sl].addr +
2350 sect,
2351 s, conf->tmppage,
2352 READ)) {
2353 case 0:
2354
2355 printk(KERN_NOTICE
2356 "md/raid10:%s: unable to read back "
2357 "corrected sectors"
2358 " (%d sectors at %llu on %s)\n",
2359 mdname(mddev), s,
2360 (unsigned long long)(
2361 sect +
2362 choose_data_offset(r10_bio, rdev)),
2363 bdevname(rdev->bdev, b));
2364 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2365 "drive\n",
2366 mdname(mddev),
2367 bdevname(rdev->bdev, b));
2368 break;
2369 case 1:
2370 printk(KERN_INFO
2371 "md/raid10:%s: read error corrected"
2372 " (%d sectors at %llu on %s)\n",
2373 mdname(mddev), s,
2374 (unsigned long long)(
2375 sect +
2376 choose_data_offset(r10_bio, rdev)),
2377 bdevname(rdev->bdev, b));
2378 atomic_add(s, &rdev->corrected_errors);
2379 }
2380
2381 rdev_dec_pending(rdev, mddev);
2382 rcu_read_lock();
2383 }
2384 rcu_read_unlock();
2385
2386 sectors -= s;
2387 sect += s;
2388 }
2389}
2390
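/*
 * Minimal synchronous bio submission used by narrow_write_error():
 * submit the bio, wait for completion, and return its BIO_UPTODATE state.
 */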
2391static void bi_complete(struct bio *bio, int error)
2392{
2393 complete((struct completion *)bio->bi_private);
2394}
2395
2396static int submit_bio_wait(int rw, struct bio *bio)
2397{
2398 struct completion event;
2399 rw |= REQ_SYNC;
2400
2401 init_completion(&event);
2402 bio->bi_private = &event;
2403 bio->bi_end_io = bi_complete;
2404 submit_bio(rw, bio);
2405 wait_for_completion(&event);
2406
2407 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2408}
2409
2410static int narrow_write_error(struct r10bio *r10_bio, int i)
2411{
2412 struct bio *bio = r10_bio->master_bio;
2413 struct mddev *mddev = r10_bio->mddev;
2414 struct r10conf *conf = mddev->private;
2415 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2416
	/* bio has the data to be written to slot 'i' where
	 * we just recently had a write error.
	 * We repeatedly clone the bio and trim down to one block,
	 * then try the write.  Where the write fails we record
	 * a bad block.
	 * It is conceivable that the bio doesn't exactly align with
	 * blocks.  We must handle this.
	 *
	 * We currently own a reference to the rdev.
	 */
2427 int block_sectors;
2428 sector_t sector;
2429 int sectors;
2430 int sect_to_write = r10_bio->sectors;
2431 int ok = 1;
2432
2433 if (rdev->badblocks.shift < 0)
2434 return 0;
2435
2436 block_sectors = 1 << rdev->badblocks.shift;
2437 sector = r10_bio->sector;
2438 sectors = ((r10_bio->sector + block_sectors)
2439 & ~(sector_t)(block_sectors - 1))
2440 - sector;
2441
2442 while (sect_to_write) {
2443 struct bio *wbio;
2444 if (sectors > sect_to_write)
2445 sectors = sect_to_write;
2446
2447 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2448 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2449 wbio->bi_sector = (r10_bio->devs[i].addr+
2450 choose_data_offset(r10_bio, rdev) +
2451 (sector - r10_bio->sector));
2452 wbio->bi_bdev = rdev->bdev;
2453 if (submit_bio_wait(WRITE, wbio) == 0)
2454
2455 ok = rdev_set_badblocks(rdev, sector,
2456 sectors, 0)
2457 && ok;
2458
2459 bio_put(wbio);
2460 sect_to_write -= sectors;
2461 sector += sectors;
2462 sectors = block_sectors;
2463 }
2464 return ok;
2465}
2466
2467static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2468{
2469 int slot = r10_bio->read_slot;
2470 struct bio *bio;
2471 struct r10conf *conf = mddev->private;
2472 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2473 char b[BDEVNAME_SIZE];
2474 unsigned long do_sync;
2475 int max_sectors;
2476
2477 /* We got a read error during normal I/O.  Maybe the drive is
2478  * failing, or maybe just this block is bad and we can fix it.
2479  * Freeze all other I/O and try re-reading the block from the
2480  * other devices; when a good copy is found, write it back over
2481  * the failing area and check that this cures the read error.
2482  * All of this happens synchronously while the array is frozen.
2483  */
2485 bio = r10_bio->devs[slot].bio;
2486 bdevname(bio->bi_bdev, b);
2487 bio_put(bio);
2488 r10_bio->devs[slot].bio = NULL;
2489
2490 if (mddev->ro == 0) {
2491 freeze_array(conf);
2492 fix_read_error(conf, mddev, r10_bio);
2493 unfreeze_array(conf);
2494 } else
2495 r10_bio->devs[slot].bio = IO_BLOCKED;
2496
2497 rdev_dec_pending(rdev, mddev);
2498
2499read_more:
2500 rdev = read_balance(conf, r10_bio, &max_sectors);
2501 if (rdev == NULL) {
2502 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2503 " read error for block %llu\n",
2504 mdname(mddev), b,
2505 (unsigned long long)r10_bio->sector);
2506 raid_end_bio_io(r10_bio);
2507 return;
2508 }
2509
2510 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2511 slot = r10_bio->read_slot;
2512 printk_ratelimited(
2513 KERN_ERR
2514 "md/raid10:%s: %s: redirecting "
2515 "sector %llu to another mirror\n",
2516 mdname(mddev),
2517 bdevname(rdev->bdev, b),
2518 (unsigned long long)r10_bio->sector);
2519 bio = bio_clone_mddev(r10_bio->master_bio,
2520 GFP_NOIO, mddev);
2521 md_trim_bio(bio,
2522 r10_bio->sector - bio->bi_sector,
2523 max_sectors);
2524 r10_bio->devs[slot].bio = bio;
2525 r10_bio->devs[slot].rdev = rdev;
2526 bio->bi_sector = r10_bio->devs[slot].addr
2527 + choose_data_offset(r10_bio, rdev);
2528 bio->bi_bdev = rdev->bdev;
2529 bio->bi_rw = READ | do_sync;
2530 bio->bi_private = r10_bio;
2531 bio->bi_end_io = raid10_end_read_request;
2532 if (max_sectors < r10_bio->sectors) {
2533
2534 struct bio *mbio = r10_bio->master_bio;
2535 int sectors_handled =
2536 r10_bio->sector + max_sectors
2537 - mbio->bi_sector;
2538 r10_bio->sectors = max_sectors;
2539 spin_lock_irq(&conf->device_lock);
2540 if (mbio->bi_phys_segments == 0)
2541 mbio->bi_phys_segments = 2;
2542 else
2543 mbio->bi_phys_segments++;
2544 spin_unlock_irq(&conf->device_lock);
2545 generic_make_request(bio);
2546
2547 r10_bio = mempool_alloc(conf->r10bio_pool,
2548 GFP_NOIO);
2549 r10_bio->master_bio = mbio;
2550 r10_bio->sectors = (mbio->bi_size >> 9)
2551 - sectors_handled;
2552 r10_bio->state = 0;
2553 set_bit(R10BIO_ReadError,
2554 &r10_bio->state);
2555 r10_bio->mddev = mddev;
2556 r10_bio->sector = mbio->bi_sector
2557 + sectors_handled;
2558
2559 goto read_more;
2560 } else
2561 generic_make_request(bio);
2562}
2563
2564static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2565{
2566 /* Some sort of write request has finished and it
2567  * succeeded in writing where we thought there was a
2568  * bad block.  So forget the bad block.
2569  * Or possibly it failed and we need to record
2570  * a bad block instead.
2571  */
2572 int m;
2573 struct md_rdev *rdev;
2574
2575 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2576 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2577 for (m = 0; m < conf->copies; m++) {
2578 int dev = r10_bio->devs[m].devnum;
2579 rdev = conf->mirrors[dev].rdev;
2580 if (r10_bio->devs[m].bio == NULL)
2581 continue;
2582 if (test_bit(BIO_UPTODATE,
2583 &r10_bio->devs[m].bio->bi_flags)) {
2584 rdev_clear_badblocks(
2585 rdev,
2586 r10_bio->devs[m].addr,
2587 r10_bio->sectors, 0);
2588 } else {
2589 if (!rdev_set_badblocks(
2590 rdev,
2591 r10_bio->devs[m].addr,
2592 r10_bio->sectors, 0))
2593 md_error(conf->mddev, rdev);
2594 }
2595 rdev = conf->mirrors[dev].replacement;
2596 if (r10_bio->devs[m].repl_bio == NULL)
2597 continue;
2598 if (test_bit(BIO_UPTODATE,
2599 &r10_bio->devs[m].repl_bio->bi_flags)) {
2600 rdev_clear_badblocks(
2601 rdev,
2602 r10_bio->devs[m].addr,
2603 r10_bio->sectors, 0);
2604 } else {
2605 if (!rdev_set_badblocks(
2606 rdev,
2607 r10_bio->devs[m].addr,
2608 r10_bio->sectors, 0))
2609 md_error(conf->mddev, rdev);
2610 }
2611 }
2612 put_buf(r10_bio);
2613 } else {
2614 for (m = 0; m < conf->copies; m++) {
2615 int dev = r10_bio->devs[m].devnum;
2616 struct bio *bio = r10_bio->devs[m].bio;
2617 rdev = conf->mirrors[dev].rdev;
2618 if (bio == IO_MADE_GOOD) {
2619 rdev_clear_badblocks(
2620 rdev,
2621 r10_bio->devs[m].addr,
2622 r10_bio->sectors, 0);
2623 rdev_dec_pending(rdev, conf->mddev);
2624 } else if (bio != NULL &&
2625 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2626 if (!narrow_write_error(r10_bio, m)) {
2627 md_error(conf->mddev, rdev);
2628 set_bit(R10BIO_Degraded,
2629 &r10_bio->state);
2630 }
2631 rdev_dec_pending(rdev, conf->mddev);
2632 }
2633 bio = r10_bio->devs[m].repl_bio;
2634 rdev = conf->mirrors[dev].replacement;
2635 if (rdev && bio == IO_MADE_GOOD) {
2636 rdev_clear_badblocks(
2637 rdev,
2638 r10_bio->devs[m].addr,
2639 r10_bio->sectors, 0);
2640 rdev_dec_pending(rdev, conf->mddev);
2641 }
2642 }
2643 if (test_bit(R10BIO_WriteError,
2644 &r10_bio->state))
2645 close_write(r10_bio);
2646 raid_end_bio_io(r10_bio);
2647 }
2648}
2649
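/*
 * raid10d is the per-array worker thread.  It flushes pending writes and
 * then walks conf->retry_list, handing each queued r10bio to the right
 * handler: write-completion cleanup, reshape/resync/recovery writes, or
 * read-error recovery.
 */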
2650static void raid10d(struct mddev *mddev)
2651{
2652 struct r10bio *r10_bio;
2653 unsigned long flags;
2654 struct r10conf *conf = mddev->private;
2655 struct list_head *head = &conf->retry_list;
2656 struct blk_plug plug;
2657
2658 md_check_recovery(mddev);
2659
2660 blk_start_plug(&plug);
2661 for (;;) {
2662
2663 if (atomic_read(&mddev->plug_cnt) == 0)
2664 flush_pending_writes(conf);
2665
2666 spin_lock_irqsave(&conf->device_lock, flags);
2667 if (list_empty(head)) {
2668 spin_unlock_irqrestore(&conf->device_lock, flags);
2669 break;
2670 }
2671 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2672 list_del(head->prev);
2673 conf->nr_queued--;
2674 spin_unlock_irqrestore(&conf->device_lock, flags);
2675
2676 mddev = r10_bio->mddev;
2677 conf = mddev->private;
2678 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2679 test_bit(R10BIO_WriteError, &r10_bio->state))
2680 handle_write_completed(conf, r10_bio);
2681 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2682 reshape_request_write(mddev, r10_bio);
2683 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2684 sync_request_write(mddev, r10_bio);
2685 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2686 recovery_request_write(mddev, r10_bio);
2687 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2688 handle_read_error(mddev, r10_bio);
2689 else {
2690 /* just a partial read to be scheduled from a
2691  * separate context
2692  */
2693 int slot = r10_bio->read_slot;
2694 generic_make_request(r10_bio->devs[slot].bio);
2695 }
2696
2697 cond_resched();
2698 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2699 md_check_recovery(mddev);
2700 }
2701 blk_finish_plug(&plug);
2702}
2703
2704
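/*
 * Allocate the pool of r10buf structures (with attached pages) that
 * sync_request() and reshape_request() draw from, and note whether any
 * replacement devices are present.
 */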
2705static int init_resync(struct r10conf *conf)
2706{
2707 int buffs;
2708 int i;
2709
2710 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2711 BUG_ON(conf->r10buf_pool);
2712 conf->have_replacement = 0;
2713 for (i = 0; i < conf->geo.raid_disks; i++)
2714 if (conf->mirrors[i].replacement)
2715 conf->have_replacement = 1;
2716 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2717 if (!conf->r10buf_pool)
2718 return -ENOMEM;
2719 conf->next_resync = 0;
2720 return 0;
2721}
2722
2723/*
2724 * perform a "sync" on one "block"
2725 *
2726 * We need to make sure that no normal I/O request - particularly write
2727 * requests - conflict with active sync requests.  This is achieved by
2728 * tracking pending requests and a 'barrier' concept that can be raised
2729 * to exclude normal I/O while a sync is in progress.
2730 *
2731 * Resync and recovery are handled very differently; we tell them apart
2732 * by looking at MD_RECOVERY_SYNC in mddev->recovery.
2733 *
2734 * For resync we iterate over virtual addresses, read every copy, and
2735 * later fix up any differences (sync_request_write).
2736 * For recovery we iterate over physical addresses, read a good copy of
2737 * each block, and write it to every device that is being rebuilt,
2738 * recording a bad block where no usable source can be found.
2739 *
2740 * A reshape is handed off to reshape_request() instead.
2741 */
2755static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2756 int *skipped, int go_faster)
2757{
2758 struct r10conf *conf = mddev->private;
2759 struct r10bio *r10_bio;
2760 struct bio *biolist = NULL, *bio;
2761 sector_t max_sector, nr_sectors;
2762 int i;
2763 int max_sync;
2764 sector_t sync_blocks;
2765 sector_t sectors_skipped = 0;
2766 int chunks_skipped = 0;
2767 sector_t chunk_mask = conf->geo.chunk_mask;
2768
2769 if (!conf->r10buf_pool)
2770 if (init_resync(conf))
2771 return 0;
2772
2773 skipped:
2774 max_sector = mddev->dev_sectors;
2775 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2776 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2777 max_sector = mddev->resync_max_sectors;
2778 if (sector_nr >= max_sector) {
2779 /* We have reached the end.  If we were aborted, we need to
2780  * finish the bitmap sync on the 'current' chunks (there can
2781  * be several when recovering multiple devices), as we may
2782  * have started syncing them but not finished.
2783  * The current position is in mddev->curr_resync; for recovery
2784  * that has to be converted to a virtual address per disk.
2785  */
2788 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2789 end_reshape(conf);
2790 return 0;
2791 }
2792
2793 if (mddev->curr_resync < max_sector) {
2794 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2795 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2796 &sync_blocks, 1);
2797 else for (i = 0; i < conf->geo.raid_disks; i++) {
2798 sector_t sect =
2799 raid10_find_virt(conf, mddev->curr_resync, i);
2800 bitmap_end_sync(mddev->bitmap, sect,
2801 &sync_blocks, 1);
2802 }
2803 } else {
2804 /* completed sync */
2805 if ((!mddev->bitmap || conf->fullsync)
2806 && conf->have_replacement
2807 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2808 /* Completed a full sync, so the replacements
2809  * are now fully recovered.
2810  */
2811 for (i = 0; i < conf->geo.raid_disks; i++)
2812 if (conf->mirrors[i].replacement)
2813 conf->mirrors[i].replacement
2814 ->recovery_offset
2815 = MaxSector;
2816 }
2817 conf->fullsync = 0;
2818 }
2819 bitmap_close_sync(mddev->bitmap);
2820 close_sync(conf);
2821 *skipped = 1;
2822 return sectors_skipped;
2823 }
2824
2825 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2826 return reshape_request(mddev, sector_nr, skipped);
2827
2828 if (chunks_skipped >= conf->geo.raid_disks) {
2829 /* if there has been nothing to do on any drive,
2830  * then there is nothing to do at all
2831  */
2832 *skipped = 1;
2833 return (max_sector - sector_nr) + sectors_skipped;
2834 }
2835
2836 if (max_sector > mddev->resync_max)
2837 max_sector = mddev->resync_max;
2838
2839 /* make sure the whole request will fit in a chunk,
2840  * if chunks are meaningful here
2841  */
2842 if (conf->geo.near_copies < conf->geo.raid_disks &&
2843 max_sector > (sector_nr | chunk_mask))
2844 max_sector = (sector_nr | chunk_mask) + 1;
2845
2846 /* If there is non-resync activity waiting for us then
2847  * put in a delay to throttle resync.
2848  */
2849 if (!go_faster && conf->nr_waiting)
2850 msleep_interruptible(1000);
2851
2852 /* Again, very different code for resync and recovery.
2853  * Both must result in an r10bio with a list of bios that
2854  * have bi_end_io, bi_sector, bi_bdev set,
2855  * and bi_private set to the r10bio.
2856  * For recovery, we may actually create several r10bios
2857  * with 2 bios in each, that correspond to the bios in the main one.
2858  * In this case, the subordinate r10bios link back through a
2859  * borrowed master_bio pointer, and the counter in the master
2860  * includes a ref from each subordinate.
2861  *
2862  * First, we decide what to do and set ->bi_end_io
2863  * to end_sync_read if we want to read, and
2864  * end_sync_write if we will want to write.
2865  */
2867 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2868 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2869
2870 int j;
2871 r10_bio = NULL;
2872
2873 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2874 int still_degraded;
2875 struct r10bio *rb2;
2876 sector_t sect;
2877 int must_sync;
2878 int any_working;
2879 struct mirror_info *mirror = &conf->mirrors[i];
2880
2881 if ((mirror->rdev == NULL ||
2882 test_bit(In_sync, &mirror->rdev->flags))
2883 &&
2884 (mirror->replacement == NULL ||
2885 test_bit(Faulty,
2886 &mirror->replacement->flags)))
2887 continue;
2888
2889 still_degraded = 0;
2890
2891 rb2 = r10_bio;
2892 sect = raid10_find_virt(conf, sector_nr, i);
2893 if (sect >= mddev->resync_max_sectors) {
2894
2895
2896
2897 continue;
2898 }
2899
2900 /* Unless we are doing a full sync, or recovering a
2901  * replacement, we only need blocks that the bitmap says
2902  * are out of sync */
2903 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2904 &sync_blocks, 1);
2905 if (sync_blocks < max_sync)
2906 max_sync = sync_blocks;
2907 if (!must_sync &&
2908 mirror->replacement == NULL &&
2909 !conf->fullsync) {
2910 /* nothing to recover for this block, but reset
2911  * chunks_skipped so we don't give up on the array
2912  */
2913 chunks_skipped = -1;
2914 continue;
2915 }
2916
2917 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2918 raise_barrier(conf, rb2 != NULL);
2919 atomic_set(&r10_bio->remaining, 0);
2920
2921 r10_bio->master_bio = (struct bio*)rb2;
2922 if (rb2)
2923 atomic_inc(&rb2->remaining);
2924 r10_bio->mddev = mddev;
2925 set_bit(R10BIO_IsRecover, &r10_bio->state);
2926 r10_bio->sector = sect;
2927
2928 raid10_find_phys(conf, r10_bio);
2929
2930 /* Need to check whether the array will still be
2931  * degraded after this device is recovered
2932  */
2933 for (j = 0; j < conf->geo.raid_disks; j++)
2934 if (conf->mirrors[j].rdev == NULL ||
2935 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
2936 still_degraded = 1;
2937 break;
2938 }
2939
2940 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2941 &sync_blocks, still_degraded);
2942
2943 any_working = 0;
2944 for (j=0; j<conf->copies;j++) {
2945 int k;
2946 int d = r10_bio->devs[j].devnum;
2947 sector_t from_addr, to_addr;
2948 struct md_rdev *rdev;
2949 sector_t sector, first_bad;
2950 int bad_sectors;
2951 if (!conf->mirrors[d].rdev ||
2952 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
2953 continue;
2954
2955 any_working = 1;
2956 rdev = conf->mirrors[d].rdev;
2957 sector = r10_bio->devs[j].addr;
2958
2959 if (is_badblock(rdev, sector, max_sync,
2960 &first_bad, &bad_sectors)) {
2961 if (first_bad > sector)
2962 max_sync = first_bad - sector;
2963 else {
2964 bad_sectors -= (sector
2965 - first_bad);
2966 if (max_sync > bad_sectors)
2967 max_sync = bad_sectors;
2968 continue;
2969 }
2970 }
2971 bio = r10_bio->devs[0].bio;
2972 bio->bi_next = biolist;
2973 biolist = bio;
2974 bio->bi_private = r10_bio;
2975 bio->bi_end_io = end_sync_read;
2976 bio->bi_rw = READ;
2977 from_addr = r10_bio->devs[j].addr;
2978 bio->bi_sector = from_addr + rdev->data_offset;
2979 bio->bi_bdev = rdev->bdev;
2980 atomic_inc(&rdev->nr_pending);
2981
2982
2983 for (k=0; k<conf->copies; k++)
2984 if (r10_bio->devs[k].devnum == i)
2985 break;
2986 BUG_ON(k == conf->copies);
2987 to_addr = r10_bio->devs[k].addr;
2988 r10_bio->devs[0].devnum = d;
2989 r10_bio->devs[0].addr = from_addr;
2990 r10_bio->devs[1].devnum = i;
2991 r10_bio->devs[1].addr = to_addr;
2992
2993 rdev = mirror->rdev;
2994 if (!test_bit(In_sync, &rdev->flags)) {
2995 bio = r10_bio->devs[1].bio;
2996 bio->bi_next = biolist;
2997 biolist = bio;
2998 bio->bi_private = r10_bio;
2999 bio->bi_end_io = end_sync_write;
3000 bio->bi_rw = WRITE;
3001 bio->bi_sector = to_addr
3002 + rdev->data_offset;
3003 bio->bi_bdev = rdev->bdev;
3004 atomic_inc(&r10_bio->remaining);
3005 } else
3006 r10_bio->devs[1].bio->bi_end_io = NULL;
3007
3008
3009 bio = r10_bio->devs[1].repl_bio;
3010 if (bio)
3011 bio->bi_end_io = NULL;
3012 rdev = mirror->replacement;
3013 /* Note: if the replacement rdev is not NULL then the
3014  * repl_bio should not be NULL either, as r10buf_pool_alloc
3015  * allocates it whenever replacements are present.  The
3016  * extra NULL test just guards against a crash (and keeps
3017  * static checkers happy) if that assumption ever breaks.
3018  */
3021 if (rdev == NULL || bio == NULL ||
3022 test_bit(Faulty, &rdev->flags))
3023 break;
3024 bio->bi_next = biolist;
3025 biolist = bio;
3026 bio->bi_private = r10_bio;
3027 bio->bi_end_io = end_sync_write;
3028 bio->bi_rw = WRITE;
3029 bio->bi_sector = to_addr + rdev->data_offset;
3030 bio->bi_bdev = rdev->bdev;
3031 atomic_inc(&r10_bio->remaining);
3032 break;
3033 }
3034 if (j == conf->copies) {
3035 /* Cannot recover from here - either record bad
3036  * blocks or give up on this device */
3037 put_buf(r10_bio);
3038 if (rb2)
3039 atomic_dec(&rb2->remaining);
3040 r10_bio = rb2;
3041 if (any_working) {
3042 /* a working device exists, but every copy of this
3043  * block is bad, so mark it bad on the device(s)
3044  * being rebuilt */
3045 int k;
3046 for (k = 0; k < conf->copies; k++)
3047 if (r10_bio->devs[k].devnum == i)
3048 break;
3049 if (!test_bit(In_sync,
3050 &mirror->rdev->flags)
3051 && !rdev_set_badblocks(
3052 mirror->rdev,
3053 r10_bio->devs[k].addr,
3054 max_sync, 0))
3055 any_working = 0;
3056 if (mirror->replacement &&
3057 !rdev_set_badblocks(
3058 mirror->replacement,
3059 r10_bio->devs[k].addr,
3060 max_sync, 0))
3061 any_working = 0;
3062 }
3063 if (!any_working) {
3064 if (!test_and_set_bit(MD_RECOVERY_INTR,
3065 &mddev->recovery))
3066 printk(KERN_INFO "md/raid10:%s: insufficient "
3067 "working devices for recovery.\n",
3068 mdname(mddev));
3069 mirror->recovery_disabled
3070 = mddev->recovery_disabled;
3071 }
3072 break;
3073 }
3074 }
3075 if (biolist == NULL) {
3076 while (r10_bio) {
3077 struct r10bio *rb2 = r10_bio;
3078 r10_bio = (struct r10bio*) rb2->master_bio;
3079 rb2->master_bio = NULL;
3080 put_buf(rb2);
3081 }
3082 goto giveup;
3083 }
3084 } else {
3085 /* resync: read every copy of each block at this virtual address */
3086 int count = 0;
3087
3088 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3089
3090 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3091 &sync_blocks, mddev->degraded) &&
3092 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3093 &mddev->recovery)) {
3094
3095 *skipped = 1;
3096 return sync_blocks + sectors_skipped;
3097 }
3098 if (sync_blocks < max_sync)
3099 max_sync = sync_blocks;
3100 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3101
3102 r10_bio->mddev = mddev;
3103 atomic_set(&r10_bio->remaining, 0);
3104 raise_barrier(conf, 0);
3105 conf->next_resync = sector_nr;
3106
3107 r10_bio->master_bio = NULL;
3108 r10_bio->sector = sector_nr;
3109 set_bit(R10BIO_IsSync, &r10_bio->state);
3110 raid10_find_phys(conf, r10_bio);
3111 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3112
3113 for (i = 0; i < conf->copies; i++) {
3114 int d = r10_bio->devs[i].devnum;
3115 sector_t first_bad, sector;
3116 int bad_sectors;
3117
3118 if (r10_bio->devs[i].repl_bio)
3119 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3120
3121 bio = r10_bio->devs[i].bio;
3122 bio->bi_end_io = NULL;
3123 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3124 if (conf->mirrors[d].rdev == NULL ||
3125 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3126 continue;
3127 sector = r10_bio->devs[i].addr;
3128 if (is_badblock(conf->mirrors[d].rdev,
3129 sector, max_sync,
3130 &first_bad, &bad_sectors)) {
3131 if (first_bad > sector)
3132 max_sync = first_bad - sector;
3133 else {
3134 bad_sectors -= (sector - first_bad);
3135 if (max_sync > bad_sectors)
3136 max_sync = bad_sectors;
3137 continue;
3138 }
3139 }
3140 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3141 atomic_inc(&r10_bio->remaining);
3142 bio->bi_next = biolist;
3143 biolist = bio;
3144 bio->bi_private = r10_bio;
3145 bio->bi_end_io = end_sync_read;
3146 bio->bi_rw = READ;
3147 bio->bi_sector = sector +
3148 conf->mirrors[d].rdev->data_offset;
3149 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3150 count++;
3151
3152 if (conf->mirrors[d].replacement == NULL ||
3153 test_bit(Faulty,
3154 &conf->mirrors[d].replacement->flags))
3155 continue;
3156
3157 /* Need to set up a write to the replacement device */
3158 bio = r10_bio->devs[i].repl_bio;
3159 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3160
3161 sector = r10_bio->devs[i].addr;
3162 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3163 bio->bi_next = biolist;
3164 biolist = bio;
3165 bio->bi_private = r10_bio;
3166 bio->bi_end_io = end_sync_write;
3167 bio->bi_rw = WRITE;
3168 bio->bi_sector = sector +
3169 conf->mirrors[d].replacement->data_offset;
3170 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3171 count++;
3172 }
3173
3174 if (count < 2) {
3175 for (i=0; i<conf->copies; i++) {
3176 int d = r10_bio->devs[i].devnum;
3177 if (r10_bio->devs[i].bio->bi_end_io)
3178 rdev_dec_pending(conf->mirrors[d].rdev,
3179 mddev);
3180 if (r10_bio->devs[i].repl_bio &&
3181 r10_bio->devs[i].repl_bio->bi_end_io)
3182 rdev_dec_pending(
3183 conf->mirrors[d].replacement,
3184 mddev);
3185 }
3186 put_buf(r10_bio);
3187 biolist = NULL;
3188 goto giveup;
3189 }
3190 }
3191
3192 for (bio = biolist; bio ; bio=bio->bi_next) {
3193
3194 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
3195 if (bio->bi_end_io)
3196 bio->bi_flags |= 1 << BIO_UPTODATE;
3197 bio->bi_vcnt = 0;
3198 bio->bi_idx = 0;
3199 bio->bi_phys_segments = 0;
3200 bio->bi_size = 0;
3201 }
3202
3203 nr_sectors = 0;
3204 if (sector_nr + max_sync < max_sector)
3205 max_sector = sector_nr + max_sync;
3206 do {
3207 struct page *page;
3208 int len = PAGE_SIZE;
3209 if (sector_nr + (len>>9) > max_sector)
3210 len = (max_sector - sector_nr) << 9;
3211 if (len == 0)
3212 break;
3213 for (bio= biolist ; bio ; bio=bio->bi_next) {
3214 struct bio *bio2;
3215 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3216 if (bio_add_page(bio, page, len, 0))
3217 continue;
3218
3219
3220 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3221 for (bio2 = biolist;
3222 bio2 && bio2 != bio;
3223 bio2 = bio2->bi_next) {
3224
3225 bio2->bi_vcnt--;
3226 bio2->bi_size -= len;
3227 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3228 }
3229 goto bio_full;
3230 }
3231 nr_sectors += len>>9;
3232 sector_nr += len>>9;
3233 } while (biolist->bi_vcnt < RESYNC_PAGES);
3234 bio_full:
3235 r10_bio->sectors = nr_sectors;
3236
3237 while (biolist) {
3238 bio = biolist;
3239 biolist = biolist->bi_next;
3240
3241 bio->bi_next = NULL;
3242 r10_bio = bio->bi_private;
3243 r10_bio->sectors = nr_sectors;
3244
3245 if (bio->bi_end_io == end_sync_read) {
3246 md_sync_acct(bio->bi_bdev, nr_sectors);
3247 generic_make_request(bio);
3248 }
3249 }
3250
3251 if (sectors_skipped)
3252 /* pretend the skipped sectors were synced; it makes
3253  * no important difference in this case
3254  */
3255 md_done_sync(mddev, sectors_skipped, 1);
3256
3257 return sectors_skipped + nr_sectors;
3258 giveup:
3259 /* There is nowhere to write, so all non-sync
3260  * drives must be failed or in resync, or all drives
3261  * have a bad block here - move on to the next chunk
3262  */
3263 if (sector_nr + max_sync < max_sector)
3264 max_sector = sector_nr + max_sync;
3265
3266 sectors_skipped += (max_sector - sector_nr);
3267 chunks_skipped ++;
3268 sector_nr = max_sector;
3269 goto skipped;
3270}
3271
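/*
 * Return the array size (in sectors) implied by a per-device size of
 * 'sectors' and 'raid_disks' devices.  A zero argument means "use the
 * current value from conf".
 */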
3272static sector_t
3273raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3274{
3275 sector_t size;
3276 struct r10conf *conf = mddev->private;
3277
3278 if (!raid_disks)
3279 raid_disks = min(conf->geo.raid_disks,
3280 conf->prev.raid_disks);
3281 if (!sectors)
3282 sectors = conf->dev_sectors;
3283
3284 size = sectors >> conf->geo.chunk_shift;
3285 sector_div(size, conf->geo.far_copies);
3286 size = size * raid_disks;
3287 sector_div(size, conf->geo.near_copies);
3288
3289 return size << conf->geo.chunk_shift;
3290}
3291
3292static void calc_sectors(struct r10conf *conf, sector_t size)
3293{
3294 /* Calculate the number of sectors-per-device that will
3295  * actually be used, and set conf->dev_sectors and
3296  * conf->geo.stride accordingly.
3297  */
3299 size = size >> conf->geo.chunk_shift;
3300 sector_div(size, conf->geo.far_copies);
3301 size = size * conf->geo.raid_disks;
3302 sector_div(size, conf->geo.near_copies);
3303
3304
3305 size = size * conf->copies;
3306
3307 /* We need to round up when dividing by raid_disks to
3308  * get the stride size.
3309  */
3310 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3311
3312 conf->dev_sectors = size << conf->geo.chunk_shift;
3313
3314 if (conf->geo.far_offset)
3315 conf->geo.stride = 1 << conf->geo.chunk_shift;
3316 else {
3317 sector_div(size, conf->geo.far_copies);
3318 conf->geo.stride = size << conf->geo.chunk_shift;
3319 }
3320}
3321
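/*
 * Decode the raid10 layout word (near/far/offset copies) and chunk size
 * for the old, new, or about-to-start geometry into *geo.  Returns the
 * total number of copies, or a negative value for an invalid layout or
 * chunk size.
 */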
3322enum geo_type {geo_new, geo_old, geo_start};
3323static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3324{
3325 int nc, fc, fo;
3326 int layout, chunk, disks;
3327 switch (new) {
3328 case geo_old:
3329 layout = mddev->layout;
3330 chunk = mddev->chunk_sectors;
3331 disks = mddev->raid_disks - mddev->delta_disks;
3332 break;
3333 case geo_new:
3334 layout = mddev->new_layout;
3335 chunk = mddev->new_chunk_sectors;
3336 disks = mddev->raid_disks;
3337 break;
3338 default:
3339 case geo_start:
3340
3341 layout = mddev->new_layout;
3342 chunk = mddev->new_chunk_sectors;
3343 disks = mddev->raid_disks + mddev->delta_disks;
3344 break;
3345 }
3346 if (layout >> 17)
3347 return -1;
3348 if (chunk < (PAGE_SIZE >> 9) ||
3349 !is_power_of_2(chunk))
3350 return -2;
3351 nc = layout & 255;
3352 fc = (layout >> 8) & 255;
3353 fo = layout & (1<<16);
3354 geo->raid_disks = disks;
3355 geo->near_copies = nc;
3356 geo->far_copies = fc;
3357 geo->far_offset = fo;
3358 geo->chunk_mask = chunk - 1;
3359 geo->chunk_shift = ffz(~chunk);
3360 return nc*fc;
3361}
3362
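/*
 * Validate the requested geometry and allocate the r10conf, mirror array,
 * r10bio mempool and worker thread for this array.
 */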
3363static struct r10conf *setup_conf(struct mddev *mddev)
3364{
3365 struct r10conf *conf = NULL;
3366 int err = -EINVAL;
3367 struct geom geo;
3368 int copies;
3369
3370 copies = setup_geo(&geo, mddev, geo_new);
3371
3372 if (copies == -2) {
3373 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3374 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3375 mdname(mddev), PAGE_SIZE);
3376 goto out;
3377 }
3378
3379 if (copies < 2 || copies > mddev->raid_disks) {
3380 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3381 mdname(mddev), mddev->new_layout);
3382 goto out;
3383 }
3384
3385 err = -ENOMEM;
3386 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3387 if (!conf)
3388 goto out;
3389
3390
3391 conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
3392 max(0,mddev->delta_disks)),
3393 GFP_KERNEL);
3394 if (!conf->mirrors)
3395 goto out;
3396
3397 conf->tmppage = alloc_page(GFP_KERNEL);
3398 if (!conf->tmppage)
3399 goto out;
3400
3401 conf->geo = geo;
3402 conf->copies = copies;
3403 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3404 r10bio_pool_free, conf);
3405 if (!conf->r10bio_pool)
3406 goto out;
3407
3408 calc_sectors(conf, mddev->dev_sectors);
3409 if (mddev->reshape_position == MaxSector) {
3410 conf->prev = conf->geo;
3411 conf->reshape_progress = MaxSector;
3412 } else {
3413 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3414 err = -EINVAL;
3415 goto out;
3416 }
3417 conf->reshape_progress = mddev->reshape_position;
3418 if (conf->prev.far_offset)
3419 conf->prev.stride = 1 << conf->prev.chunk_shift;
3420 else
3421
3422 conf->prev.stride = conf->dev_sectors;
3423 }
3424 spin_lock_init(&conf->device_lock);
3425 INIT_LIST_HEAD(&conf->retry_list);
3426
3427 spin_lock_init(&conf->resync_lock);
3428 init_waitqueue_head(&conf->wait_barrier);
3429
3430 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3431 if (!conf->thread)
3432 goto out;
3433
3434 conf->mddev = mddev;
3435 return conf;
3436
3437 out:
3438 if (err == -ENOMEM)
3439 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3440 mdname(mddev));
3441 if (conf) {
3442 if (conf->r10bio_pool)
3443 mempool_destroy(conf->r10bio_pool);
3444 kfree(conf->mirrors);
3445 safe_put_page(conf->tmppage);
3446 kfree(conf);
3447 }
3448 return ERR_PTR(err);
3449}
3450
3451static int run(struct mddev *mddev)
3452{
3453 struct r10conf *conf;
3454 int i, disk_idx, chunk_size;
3455 struct mirror_info *disk;
3456 struct md_rdev *rdev;
3457 sector_t size;
3458 sector_t min_offset_diff = 0;
3459 int first = 1;
3460
3461 if (mddev->private == NULL) {
3462 conf = setup_conf(mddev);
3463 if (IS_ERR(conf))
3464 return PTR_ERR(conf);
3465 mddev->private = conf;
3466 }
3467 conf = mddev->private;
3468 if (!conf)
3469 goto out;
3470
3471 mddev->thread = conf->thread;
3472 conf->thread = NULL;
3473
3474 chunk_size = mddev->chunk_sectors << 9;
3475 blk_queue_io_min(mddev->queue, chunk_size);
3476 if (conf->geo.raid_disks % conf->geo.near_copies)
3477 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3478 else
3479 blk_queue_io_opt(mddev->queue, chunk_size *
3480 (conf->geo.raid_disks / conf->geo.near_copies));
3481
3482 rdev_for_each(rdev, mddev) {
3483 long long diff;
3484 struct request_queue *q;
3485
3486 disk_idx = rdev->raid_disk;
3487 if (disk_idx < 0)
3488 continue;
3489 if (disk_idx >= conf->geo.raid_disks &&
3490 disk_idx >= conf->prev.raid_disks)
3491 continue;
3492 disk = conf->mirrors + disk_idx;
3493
3494 if (test_bit(Replacement, &rdev->flags)) {
3495 if (disk->replacement)
3496 goto out_free_conf;
3497 disk->replacement = rdev;
3498 } else {
3499 if (disk->rdev)
3500 goto out_free_conf;
3501 disk->rdev = rdev;
3502 }
3503 q = bdev_get_queue(rdev->bdev);
3504 if (q->merge_bvec_fn)
3505 mddev->merge_check_needed = 1;
3506 diff = (rdev->new_data_offset - rdev->data_offset);
3507 if (!mddev->reshape_backwards)
3508 diff = -diff;
3509 if (diff < 0)
3510 diff = 0;
3511 if (first || diff < min_offset_diff)
3512 min_offset_diff = diff;
3513
3514 disk_stack_limits(mddev->gendisk, rdev->bdev,
3515 rdev->data_offset << 9);
3516
3517 disk->head_position = 0;
3518 }
3519
3520 /* need to check that every block has at least one working mirror */
3521 if (!enough(conf, -1)) {
3522 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3523 mdname(mddev));
3524 goto out_free_conf;
3525 }
3526
3527 if (conf->reshape_progress != MaxSector) {
3528
3529 if (conf->geo.far_copies != 1 &&
3530 conf->geo.far_offset == 0)
3531 goto out_free_conf;
3532 if (conf->prev.far_copies != 1 &&
3533 conf->geo.far_offset == 0)
3534 goto out_free_conf;
3535 }
3536
3537 mddev->degraded = 0;
3538 for (i = 0;
3539 i < conf->geo.raid_disks
3540 || i < conf->prev.raid_disks;
3541 i++) {
3542
3543 disk = conf->mirrors + i;
3544
3545 if (!disk->rdev && disk->replacement) {
3546 /* The replacement is all we have - use it */
3547 disk->rdev = disk->replacement;
3548 disk->replacement = NULL;
3549 clear_bit(Replacement, &disk->rdev->flags);
3550 }
3551
3552 if (!disk->rdev ||
3553 !test_bit(In_sync, &disk->rdev->flags)) {
3554 disk->head_position = 0;
3555 mddev->degraded++;
3556 if (disk->rdev)
3557 conf->fullsync = 1;
3558 }
3559 disk->recovery_disabled = mddev->recovery_disabled - 1;
3560 }
3561
3562 if (mddev->recovery_cp != MaxSector)
3563 printk(KERN_NOTICE "md/raid10:%s: not clean"
3564 " -- starting background reconstruction\n",
3565 mdname(mddev));
3566 printk(KERN_INFO
3567 "md/raid10:%s: active with %d out of %d devices\n",
3568 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3569 conf->geo.raid_disks);
3570
3571
3572
3573 mddev->dev_sectors = conf->dev_sectors;
3574 size = raid10_size(mddev, 0, 0);
3575 md_set_array_sectors(mddev, size);
3576 mddev->resync_max_sectors = size;
3577
3578 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3579 mddev->queue->backing_dev_info.congested_data = mddev;
3580
3581 /* Calculate max read-ahead size.
3582  * We want read-ahead to cover at least
3583  * two whole stripes.
3584  */
3585 {
3586 int stripe = conf->geo.raid_disks *
3587 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3588 stripe /= conf->geo.near_copies;
3589 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3590 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3591 }
3592
3593 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3594
3595 if (md_integrity_register(mddev))
3596 goto out_free_conf;
3597
3598 if (conf->reshape_progress != MaxSector) {
3599 unsigned long before_length, after_length;
3600
3601 before_length = ((1 << conf->prev.chunk_shift) *
3602 conf->prev.far_copies);
3603 after_length = ((1 << conf->geo.chunk_shift) *
3604 conf->geo.far_copies);
3605
3606 if (max(before_length, after_length) > min_offset_diff) {
3607
3608 printk("md/raid10: offset difference not enough to continue reshape\n");
3609 goto out_free_conf;
3610 }
3611 conf->offset_diff = min_offset_diff;
3612
3613 conf->reshape_safe = conf->reshape_progress;
3614 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3615 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3616 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3617 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3618 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3619 "reshape");
3620 }
3621
3622 return 0;
3623
3624out_free_conf:
3625 md_unregister_thread(&mddev->thread);
3626 if (conf->r10bio_pool)
3627 mempool_destroy(conf->r10bio_pool);
3628 safe_put_page(conf->tmppage);
3629 kfree(conf->mirrors);
3630 kfree(conf);
3631 mddev->private = NULL;
3632out:
3633 return -EIO;
3634}
3635
3636static int stop(struct mddev *mddev)
3637{
3638 struct r10conf *conf = mddev->private;
3639
3640 raise_barrier(conf, 0);
3641 lower_barrier(conf);
3642
3643 md_unregister_thread(&mddev->thread);
3644 blk_sync_queue(mddev->queue);
3645 if (conf->r10bio_pool)
3646 mempool_destroy(conf->r10bio_pool);
3647 kfree(conf->mirrors);
3648 kfree(conf);
3649 mddev->private = NULL;
3650 return 0;
3651}
3652
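/*
 * Quiesce the array: state 1 raises the resync barrier so no new normal
 * I/O is started; state 0 lowers it again.
 */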
3653static void raid10_quiesce(struct mddev *mddev, int state)
3654{
3655 struct r10conf *conf = mddev->private;
3656
3657 switch(state) {
3658 case 1:
3659 raise_barrier(conf, 0);
3660 break;
3661 case 0:
3662 lower_barrier(conf);
3663 break;
3664 }
3665}
3666
3667static int raid10_resize(struct mddev *mddev, sector_t sectors)
3668{
3669 /* Resize of 'far' arrays is not supported.
3670  * For 'near' and 'offset' arrays we can set the
3671  * number of sectors used to be an appropriate multiple
3672  * of the chunk size: raid10_size() and calc_sectors()
3673  * do the rounding for us.
3674  * So if far_copies > 1 && !far_offset, fail.
3675  */
3681 struct r10conf *conf = mddev->private;
3682 sector_t oldsize, size;
3683
3684 if (mddev->reshape_position != MaxSector)
3685 return -EBUSY;
3686
3687 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3688 return -EINVAL;
3689
3690 oldsize = raid10_size(mddev, 0, 0);
3691 size = raid10_size(mddev, sectors, 0);
3692 if (mddev->external_size &&
3693 mddev->array_sectors > size)
3694 return -EINVAL;
3695 if (mddev->bitmap) {
3696 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3697 if (ret)
3698 return ret;
3699 }
3700 md_set_array_sectors(mddev, size);
3701 set_capacity(mddev->gendisk, mddev->array_sectors);
3702 revalidate_disk(mddev->gendisk);
3703 if (sectors > mddev->dev_sectors &&
3704 mddev->recovery_cp > oldsize) {
3705 mddev->recovery_cp = oldsize;
3706 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3707 }
3708 calc_sectors(conf, sectors);
3709 mddev->dev_sectors = conf->dev_sectors;
3710 mddev->resync_max_sectors = size;
3711 return 0;
3712}
3713
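/*
 * Convert a raid0 array into a raid10 with two 'near' copies: raid_disks
 * is doubled and the existing devices take the even slots, so the result
 * starts out degraded until the new mirrors are added and recovered.
 */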
3714static void *raid10_takeover_raid0(struct mddev *mddev)
3715{
3716 struct md_rdev *rdev;
3717 struct r10conf *conf;
3718
3719 if (mddev->degraded > 0) {
3720 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3721 mdname(mddev));
3722 return ERR_PTR(-EINVAL);
3723 }
3724
3725
3726 mddev->new_level = 10;
3727
3728 mddev->new_layout = (1<<8) + 2;
3729 mddev->new_chunk_sectors = mddev->chunk_sectors;
3730 mddev->delta_disks = mddev->raid_disks;
3731 mddev->raid_disks *= 2;
3732
3733 mddev->recovery_cp = MaxSector;
3734
3735 conf = setup_conf(mddev);
3736 if (!IS_ERR(conf)) {
3737 rdev_for_each(rdev, mddev)
3738 if (rdev->raid_disk >= 0)
3739 rdev->new_raid_disk = rdev->raid_disk * 2;
3740 conf->barrier = 1;
3741 }
3742
3743 return conf;
3744}
3745
3746static void *raid10_takeover(struct mddev *mddev)
3747{
3748 struct r0conf *raid0_conf;
3749
3750 /* raid10 can take over:
3751  *  raid0 - provided it has only one zone
3752  */
3753 if (mddev->level == 0) {
3754
3755 raid0_conf = mddev->private;
3756 if (raid0_conf->nr_strip_zones > 1) {
3757 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3758 " with more than one zone.\n",
3759 mdname(mddev));
3760 return ERR_PTR(-EINVAL);
3761 }
3762 return raid10_takeover_raid0(mddev);
3763 }
3764 return ERR_PTR(-EINVAL);
3765}
3766
3767static int raid10_check_reshape(struct mddev *mddev)
3768{
3769 /* Called when there is a request to change
3770  * - layout (to ->new_layout)
3771  * - chunk size (to ->new_chunk_sectors)
3772  * - raid_disks (by delta_disks).
3773  * We need to validate the request and possibly allocate
3774  * space (conf->mirrors_new) that we will need later.
3775  * We currently reject any reshape of a 'far' mode array
3776  * that does not use far_offset, reject layouts with a
3777  * different number of copies, and require the array size
3778  * to be a multiple of the new chunk size.
3779  */
3783 struct r10conf *conf = mddev->private;
3784 struct geom geo;
3785
3786 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3787 return -EINVAL;
3788
3789 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3790
3791 return -EINVAL;
3792 if (geo.far_copies > 1 && !geo.far_offset)
3793
3794 return -EINVAL;
3795
3796 if (mddev->array_sectors & geo.chunk_mask)
3797
3798 return -EINVAL;
3799
3800 if (!enough(conf, -1))
3801 return -EINVAL;
3802
3803 kfree(conf->mirrors_new);
3804 conf->mirrors_new = NULL;
3805 if (mddev->delta_disks > 0) {
3806
3807 conf->mirrors_new = kzalloc(
3808 sizeof(struct mirror_info)
3809 *(mddev->raid_disks +
3810 mddev->delta_disks),
3811 GFP_KERNEL);
3812 if (!conf->mirrors_new)
3813 return -ENOMEM;
3814 }
3815 return 0;
3816}
3817
3818/*
3819 * Need to check if the array has failed when deciding whether to:
3820 *  - start an array
3821 *  - remove non-faulty devices
3822 *  - add a spare
3823 *  - allow a reshape
3824 * This determination is simple when no reshape is happening.
3825 * However if there is a reshape, we need to carefully check
3826 * both the before and after sections: some failed devices may
3827 * only affect one of the two, and some non-In_sync devices may
3828 * be in_sync in the section most affected by failed devices.
3829 */
3831static int calc_degraded(struct r10conf *conf)
3832{
3833 int degraded, degraded2;
3834 int i;
3835
3836 rcu_read_lock();
3837 degraded = 0;
3838
3839 for (i = 0; i < conf->prev.raid_disks; i++) {
3840 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3841 if (!rdev || test_bit(Faulty, &rdev->flags))
3842 degraded++;
3843 else if (!test_bit(In_sync, &rdev->flags))
3844 /* not fully in sync, so the 'prev' geometry has to
3845  * treat this device as degraded
3846  */
3848 degraded++;
3849 }
3850 rcu_read_unlock();
3851 if (conf->geo.raid_disks == conf->prev.raid_disks)
3852 return degraded;
3853 rcu_read_lock();
3854 degraded2 = 0;
3855 for (i = 0; i < conf->geo.raid_disks; i++) {
3856 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3857 if (!rdev || test_bit(Faulty, &rdev->flags))
3858 degraded2++;
3859 else if (!test_bit(In_sync, &rdev->flags)) {
3860 /* If the reshape increases the number of devices,
3861  * this section has already been recovered, so it
3862  * does not count as degraded; otherwise it does.
3863  */
3865 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3866 degraded2++;
3867 }
3868 }
3869 rcu_read_unlock();
3870 if (degraded2 > degraded)
3871 return degraded2;
3872 return degraded;
3873}
3874
3875static int raid10_start_reshape(struct mddev *mddev)
3876{
3877 /* A 'reshape' has been requested.  This commits
3878  * the various 'new' fields and sets MD_RECOVERY_RESHAPE
3879  * so the md core starts the reshape thread.
3880  * We require enough spares that the final array will not
3881  * be degraded, and that the difference between the old and
3882  * new data_offset on every device is large enough that we
3883  * never risk overwriting live data.
3884  */
3887 unsigned long before_length, after_length;
3888 sector_t min_offset_diff = 0;
3889 int first = 1;
3890 struct geom new;
3891 struct r10conf *conf = mddev->private;
3892 struct md_rdev *rdev;
3893 int spares = 0;
3894 int ret;
3895
3896 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3897 return -EBUSY;
3898
3899 if (setup_geo(&new, mddev, geo_start) != conf->copies)
3900 return -EINVAL;
3901
3902 before_length = ((1 << conf->prev.chunk_shift) *
3903 conf->prev.far_copies);
3904 after_length = ((1 << conf->geo.chunk_shift) *
3905 conf->geo.far_copies);
3906
3907 rdev_for_each(rdev, mddev) {
3908 if (!test_bit(In_sync, &rdev->flags)
3909 && !test_bit(Faulty, &rdev->flags))
3910 spares++;
3911 if (rdev->raid_disk >= 0) {
3912 long long diff = (rdev->new_data_offset
3913 - rdev->data_offset);
3914 if (!mddev->reshape_backwards)
3915 diff = -diff;
3916 if (diff < 0)
3917 diff = 0;
3918 if (first || diff < min_offset_diff)
3919 min_offset_diff = diff;
3920 }
3921 }
3922
3923 if (max(before_length, after_length) > min_offset_diff)
3924 return -EINVAL;
3925
3926 if (spares < mddev->delta_disks)
3927 return -EINVAL;
3928
3929 conf->offset_diff = min_offset_diff;
3930 spin_lock_irq(&conf->device_lock);
3931 if (conf->mirrors_new) {
3932 memcpy(conf->mirrors_new, conf->mirrors,
3933 sizeof(struct mirror_info)*conf->prev.raid_disks);
3934 smp_mb();
3935 kfree(conf->mirrors_old);
3936 conf->mirrors_old = conf->mirrors;
3937 conf->mirrors = conf->mirrors_new;
3938 conf->mirrors_new = NULL;
3939 }
3940 setup_geo(&conf->geo, mddev, geo_start);
3941 smp_mb();
3942 if (mddev->reshape_backwards) {
3943 sector_t size = raid10_size(mddev, 0, 0);
3944 if (size < mddev->array_sectors) {
3945 spin_unlock_irq(&conf->device_lock);
3946 printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n",
3947 mdname(mddev));
3948 return -EINVAL;
3949 }
3950 mddev->resync_max_sectors = size;
3951 conf->reshape_progress = size;
3952 } else
3953 conf->reshape_progress = 0;
3954 spin_unlock_irq(&conf->device_lock);
3955
3956 if (mddev->delta_disks && mddev->bitmap) {
3957 ret = bitmap_resize(mddev->bitmap,
3958 raid10_size(mddev, 0,
3959 conf->geo.raid_disks),
3960 0, 0);
3961 if (ret)
3962 goto abort;
3963 }
3964 if (mddev->delta_disks > 0) {
3965 rdev_for_each(rdev, mddev)
3966 if (rdev->raid_disk < 0 &&
3967 !test_bit(Faulty, &rdev->flags)) {
3968 if (raid10_add_disk(mddev, rdev) == 0) {
3969 if (rdev->raid_disk >=
3970 conf->prev.raid_disks)
3971 set_bit(In_sync, &rdev->flags);
3972 else
3973 rdev->recovery_offset = 0;
3974
3975 if (sysfs_link_rdev(mddev, rdev))
3976 /* failure to create the sysfs link is not fatal */;
3977 }
3978 } else if (rdev->raid_disk >= conf->prev.raid_disks
3979 && !test_bit(Faulty, &rdev->flags)) {
3980
3981 set_bit(In_sync, &rdev->flags);
3982 }
3983 }
3984
3985 /* ->degraded is measured against the larger of the
3986  * pre- and post-reshape number of devices
3987  */
3988 spin_lock_irq(&conf->device_lock);
3989 mddev->degraded = calc_degraded(conf);
3990 spin_unlock_irq(&conf->device_lock);
3991 mddev->raid_disks = conf->geo.raid_disks;
3992 mddev->reshape_position = conf->reshape_progress;
3993 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3994
3995 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3996 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3997 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3998 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3999
4000 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4001 "reshape");
4002 if (!mddev->sync_thread) {
4003 ret = -EAGAIN;
4004 goto abort;
4005 }
4006 conf->reshape_checkpoint = jiffies;
4007 md_wakeup_thread(mddev->sync_thread);
4008 md_new_event(mddev);
4009 return 0;
4010
4011abort:
4012 mddev->recovery = 0;
4013 spin_lock_irq(&conf->device_lock);
4014 conf->geo = conf->prev;
4015 mddev->raid_disks = conf->geo.raid_disks;
4016 rdev_for_each(rdev, mddev)
4017 rdev->new_data_offset = rdev->data_offset;
4018 smp_wmb();
4019 conf->reshape_progress = MaxSector;
4020 mddev->reshape_position = MaxSector;
4021 spin_unlock_irq(&conf->device_lock);
4022 return ret;
4023}
4024
4025/*
4026 * Calculate the last device-address that could contain
4027 * any block from the chunk that includes the array-address 's'.
4028 * The address returned is chunk-aligned and strictly after any
4029 * data that is in the chunk containing 's'.
4030 */
4031static sector_t last_dev_address(sector_t s, struct geom *geo)
4032{
4033 s = (s | geo->chunk_mask) + 1;
4034 s >>= geo->chunk_shift;
4035 s *= geo->near_copies;
4036 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4037 s *= geo->far_copies;
4038 s <<= geo->chunk_shift;
4039 return s;
4040}
4041
4042/*
4043 * Calculate the first device-address that could contain
4044 * any block from the chunk that includes the array-address 's'.
4045 */
4046static sector_t first_dev_address(sector_t s, struct geom *geo)
4047{
4048 s >>= geo->chunk_shift;
4049 s *= geo->near_copies;
4050 sector_div(s, geo->raid_disks);
4051 s *= geo->far_copies;
4052 s <<= geo->chunk_shift;
4053 return s;
4054}
4055
4056static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4057 int *skipped)
4058{
4059 /* We simply copy at most one chunk (smallest of old and new)
4060  * at a time, possibly less if that exceeds RESYNC_BLOCK_SIZE,
4061  * or if we hit a bad block or something.
4062  * This might mean we pause for normal IO in the middle of
4063  * a chunk, but that is not a problem as mddev->reshape_position
4064  * can record any location.
4065  *
4066  * If we will want to write to a location that isn't
4067  * yet recorded as 'safe' (i.e. in metadata on disk) then
4068  * we need to flush all reshape requests and update the metadata.
4069  *
4070  * When reshaping forwards (e.g. to more devices), we interpret
4071  * 'safe' as the earliest block which might not have been copied
4072  * down yet.  Using last_dev_address() on the position we want to
4073  * write to, and first_dev_address() on the 'safe' position, tells
4074  * us whether the next write could land beyond 'safe'; if so we
4075  * must update the metadata to advance the 'safe' position first.
4076  *
4077  * When reshaping backwards we round in the opposite direction
4078  * and perform the reverse test: the next write position must not
4079  * be before the current safe position.
4080  *
4081  * In all this the minimum difference in data offsets
4082  * (conf->offset_diff - always positive) allows a bit of slack,
4083  * so next can be after 'safe', but not by more than offset_diff.
4084  *
4085  * We prepare all the bios before starting any IO so that the
4086  * size we choose is known to be acceptable; everything can be
4087  * freed again on a failure or if we choose to skip this block.
4088  */
4096 struct r10conf *conf = mddev->private;
4097 struct r10bio *r10_bio;
4098 sector_t next, safe, last;
4099 int max_sectors;
4100 int nr_sectors;
4101 int s;
4102 struct md_rdev *rdev;
4103 int need_flush = 0;
4104 struct bio *blist;
4105 struct bio *bio, *read_bio;
4106 int sectors_done = 0;
4107
4108 if (sector_nr == 0) {
4109 /* If restarting in the middle, skip what has already been done */
4110 if (mddev->reshape_backwards &&
4111 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4112 sector_nr = (raid10_size(mddev, 0, 0)
4113 - conf->reshape_progress);
4114 } else if (!mddev->reshape_backwards &&
4115 conf->reshape_progress > 0)
4116 sector_nr = conf->reshape_progress;
4117 if (sector_nr) {
4118 mddev->curr_resync_completed = sector_nr;
4119 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4120 *skipped = 1;
4121 return sector_nr;
4122 }
4123 }
4124
4125 /* We don't use sector_nr to track where we are up to
4126  * as that doesn't work well for ->reshape_backwards.
4127  * So just use ->reshape_progress.
4128  */
4129 if (mddev->reshape_backwards) {
4130 /* 'next' is the earliest device address that we might
4131  * write to for this chunk in the new layout
4132  */
4133 next = first_dev_address(conf->reshape_progress - 1,
4134 &conf->geo);
4135
4136 /* 'safe' is the last device address that we might read
4137  * from in the old layout after a restart
4138  */
4139 safe = last_dev_address(conf->reshape_safe - 1,
4140 &conf->prev);
4141
4142 if (next + conf->offset_diff < safe)
4143 need_flush = 1;
4144
4145 last = conf->reshape_progress - 1;
4146 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4147 & conf->prev.chunk_mask);
4148 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4149 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4150 } else {
4151 /* 'next' is after the last device address that we
4152  * might write to for this chunk in the new layout
4153  */
4154 next = last_dev_address(conf->reshape_progress, &conf->geo);
4155
4156 /* 'safe' is the earliest device address that we might
4157  * read from in the old layout after a restart
4158  */
4159 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4160
4161 /* Need to update the metadata if 'next' might be beyond
4162  * 'safe', as that could otherwise corrupt data
4163  */
4164 if (next > safe + conf->offset_diff)
4165 need_flush = 1;
4166
4167 sector_nr = conf->reshape_progress;
4168 last = sector_nr | (conf->geo.chunk_mask
4169 & conf->prev.chunk_mask);
4170
4171 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4172 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4173 }
4174
4175 if (need_flush ||
4176 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4177 /* Need to update reshape_position in the metadata */
4178 wait_barrier(conf);
4179 mddev->reshape_position = conf->reshape_progress;
4180 if (mddev->reshape_backwards)
4181 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4182 - conf->reshape_progress;
4183 else
4184 mddev->curr_resync_completed = conf->reshape_progress;
4185 conf->reshape_checkpoint = jiffies;
4186 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4187 md_wakeup_thread(mddev->thread);
4188 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4189 kthread_should_stop());
4190 conf->reshape_safe = mddev->reshape_position;
4191 allow_barrier(conf);
4192 }
4193
4194read_more:
4195 /* Now schedule reads for blocks from sector_nr to last */
4196 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4197 raise_barrier(conf, sectors_done != 0);
4198 atomic_set(&r10_bio->remaining, 0);
4199 r10_bio->mddev = mddev;
4200 r10_bio->sector = sector_nr;
4201 set_bit(R10BIO_IsReshape, &r10_bio->state);
4202 r10_bio->sectors = last - sector_nr + 1;
4203 rdev = read_balance(conf, r10_bio, &max_sectors);
4204 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4205
4206 if (!rdev) {
4207 /* Cannot read this section from any device, so give
4208  * up and abort the reshape
4209  */
4211 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4212 return sectors_done;
4213 }
4214
4215 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4216
4217 read_bio->bi_bdev = rdev->bdev;
4218 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4219 + rdev->data_offset);
4220 read_bio->bi_private = r10_bio;
4221 read_bio->bi_end_io = end_sync_read;
4222 read_bio->bi_rw = READ;
4223 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4224 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4225 read_bio->bi_vcnt = 0;
4226 read_bio->bi_idx = 0;
4227 read_bio->bi_size = 0;
4228 r10_bio->master_bio = read_bio;
4229 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4230
4231 /* Now find where these blocks live in the new layout */
4232 __raid10_find_phys(&conf->geo, r10_bio);
4233
4234 blist = read_bio;
4235 read_bio->bi_next = NULL;
4236
4237 for (s = 0; s < conf->copies*2; s++) {
4238 struct bio *b;
4239 int d = r10_bio->devs[s/2].devnum;
4240 struct md_rdev *rdev2;
4241 if (s&1) {
4242 rdev2 = conf->mirrors[d].replacement;
4243 b = r10_bio->devs[s/2].repl_bio;
4244 } else {
4245 rdev2 = conf->mirrors[d].rdev;
4246 b = r10_bio->devs[s/2].bio;
4247 }
4248 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4249 continue;
4250 b->bi_bdev = rdev2->bdev;
4251 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4252 b->bi_private = r10_bio;
4253 b->bi_end_io = end_reshape_write;
4254 b->bi_rw = WRITE;
4255 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4256 b->bi_flags |= 1 << BIO_UPTODATE;
4257 b->bi_next = blist;
4258 b->bi_vcnt = 0;
4259 b->bi_idx = 0;
4260 b->bi_size = 0;
4261 blist = b;
4262 }
4263
4264
4265
4266 nr_sectors = 0;
4267 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4268 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4269 int len = (max_sectors - s) << 9;
4270 if (len > PAGE_SIZE)
4271 len = PAGE_SIZE;
4272 for (bio = blist; bio ; bio = bio->bi_next) {
4273 struct bio *bio2;
4274 if (bio_add_page(bio, page, len, 0))
4275 continue;
4276
4277
4278 for (bio2 = blist;
4279 bio2 && bio2 != bio;
4280 bio2 = bio2->bi_next) {
4281
4282 bio2->bi_vcnt--;
4283 bio2->bi_size -= len;
4284 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4285 }
4286 goto bio_full;
4287 }
4288 sector_nr += len >> 9;
4289 nr_sectors += len >> 9;
4290 }
4291bio_full:
4292 r10_bio->sectors = nr_sectors;
4293
4294 /* Now submit the read */
4295 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4296 atomic_inc(&r10_bio->remaining);
4297 read_bio->bi_next = NULL;
4298 generic_make_request(read_bio);
4299 sector_nr += nr_sectors;
4300 sectors_done += nr_sectors;
4301 if (sector_nr <= last)
4302 goto read_more;
4303
4304 /* Now that we have done the whole section we can
4305  * update reshape_progress
4306  */
4307 if (mddev->reshape_backwards)
4308 conf->reshape_progress -= sectors_done;
4309 else
4310 conf->reshape_progress += sectors_done;
4311
4312 return sectors_done;
4313}
4314
4315static void end_reshape_request(struct r10bio *r10_bio);
4316static int handle_reshape_read_error(struct mddev *mddev,
4317 struct r10bio *r10_bio);
4318static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4319{
4320 /* The reshape read has completed.  Hopefully we have a
4321  * block to write out.
4322  * If we got a read error then we do sync 1-page reads from
4323  * elsewhere and update the bios in r10_bio ready for writing.
4324  */
4325 struct r10conf *conf = mddev->private;
4326 int s;
4327
4328 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4329 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4330 /* Reshape has been aborted */
4331 md_done_sync(mddev, r10_bio->sectors, 0);
4332 return;
4333 }
4334
4335 /* The data is in the attached pages - schedule the
4336  * writes to every device in the new layout.
4337  */
4338 atomic_set(&r10_bio->remaining, 1);
4339 for (s = 0; s < conf->copies*2; s++) {
4340 struct bio *b;
4341 int d = r10_bio->devs[s/2].devnum;
4342 struct md_rdev *rdev;
4343 if (s&1) {
4344 rdev = conf->mirrors[d].replacement;
4345 b = r10_bio->devs[s/2].repl_bio;
4346 } else {
4347 rdev = conf->mirrors[d].rdev;
4348 b = r10_bio->devs[s/2].bio;
4349 }
4350 if (!rdev || test_bit(Faulty, &rdev->flags))
4351 continue;
4352 atomic_inc(&rdev->nr_pending);
4353 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4354 atomic_inc(&r10_bio->remaining);
4355 b->bi_next = NULL;
4356 generic_make_request(b);
4357 }
4358 end_reshape_request(r10_bio);
4359}
4360
4361static void end_reshape(struct r10conf *conf)
4362{
4363 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4364 return;
4365
4366 spin_lock_irq(&conf->device_lock);
4367 conf->prev = conf->geo;
4368 md_finish_reshape(conf->mddev);
4369 smp_wmb();
4370 conf->reshape_progress = MaxSector;
4371 spin_unlock_irq(&conf->device_lock);
4372
4373 /* read-ahead size must cover at least two whole stripes
4374  * in the new geometry
4375  */
4376 if (conf->mddev->queue) {
4377 int stripe = conf->geo.raid_disks *
4378 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4379 stripe /= conf->geo.near_copies;
4380 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4381 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4382 }
4383 conf->fullsync = 0;
4384}
4385
4386
4387static int handle_reshape_read_error(struct mddev *mddev,
4388 struct r10bio *r10_bio)
4389{
4390 /* Use synchronous reads from the old layout to fetch the data from another copy */
4391 int sectors = r10_bio->sectors;
4392 struct r10bio r10b;
4393 struct r10conf *conf = mddev->private;
4394 int slot = 0;
4395 int idx = 0;
4396 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4397
4398 r10b.sector = r10_bio->sector;
4399 __raid10_find_phys(&conf->prev, &r10b);
4400
4401 while (sectors) {
4402 int s = sectors;
4403 int success = 0;
4404 int first_slot = slot;
4405
4406 if (s > (PAGE_SIZE >> 9))
4407 s = PAGE_SIZE >> 9;
4408
4409 while (!success) {
4410 int d = r10b.devs[slot].devnum;
4411 struct md_rdev *rdev = conf->mirrors[d].rdev;
4412 sector_t addr;
4413 if (rdev == NULL ||
4414 test_bit(Faulty, &rdev->flags) ||
4415 !test_bit(In_sync, &rdev->flags))
4416 goto failed;
4417
4418 addr = r10b.devs[slot].addr + idx * PAGE_SIZE;
4419 success = sync_page_io(rdev,
4420 addr,
4421 s << 9,
4422 bvec[idx].bv_page,
4423 READ, false);
4424 if (success)
4425 break;
4426 failed:
4427 slot++;
4428 if (slot >= conf->copies)
4429 slot = 0;
4430 if (slot == first_slot)
4431 break;
4432 }
4433 if (!success) {
4434 /* couldn't read this block from any copy - must give up */
4435 set_bit(MD_RECOVERY_INTR,
4436 &mddev->recovery);
4437 return -EIO;
4438 }
4439 sectors -= s;
4440 idx++;
4441 }
4442 return 0;
4443}
4444
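/*
 * Completion handler for the writes issued during a reshape.  A failed
 * write fails the target device via md_error().
 */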
4445static void end_reshape_write(struct bio *bio, int error)
4446{
4447 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4448 struct r10bio *r10_bio = bio->bi_private;
4449 struct mddev *mddev = r10_bio->mddev;
4450 struct r10conf *conf = mddev->private;
4451 int d;
4452 int slot;
4453 int repl;
4454 struct md_rdev *rdev = NULL;
4455
4456 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4457 if (repl)
4458 rdev = conf->mirrors[d].replacement;
4459 if (!rdev) {
4460 smp_mb();
4461 rdev = conf->mirrors[d].rdev;
4462 }
4463
4464 if (!uptodate) {
4465
4466 md_error(mddev, rdev);
4467 }
4468
4469 rdev_dec_pending(rdev, mddev);
4470 end_reshape_request(r10_bio);
4471}
4472
4473static void end_reshape_request(struct r10bio *r10_bio)
4474{
4475 if (!atomic_dec_and_test(&r10_bio->remaining))
4476 return;
4477 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4478 bio_put(r10_bio->master_bio);
4479 put_buf(r10_bio);
4480}
4481
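/*
 * Called by the md core once a reshape has finished: grow the array size
 * if devices were added, or clear In_sync on the slots being dropped if
 * devices were removed, then commit the new layout and chunk size.
 */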
4482static void raid10_finish_reshape(struct mddev *mddev)
4483{
4484 struct r10conf *conf = mddev->private;
4485
4486 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4487 return;
4488
4489 if (mddev->delta_disks > 0) {
4490 sector_t size = raid10_size(mddev, 0, 0);
4491 md_set_array_sectors(mddev, size);
4492 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4493 mddev->recovery_cp = mddev->resync_max_sectors;
4494 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4495 }
4496 mddev->resync_max_sectors = size;
4497 set_capacity(mddev->gendisk, mddev->array_sectors);
4498 revalidate_disk(mddev->gendisk);
4499 } else {
4500 int d;
4501 for (d = conf->geo.raid_disks ;
4502 d < conf->geo.raid_disks - mddev->delta_disks;
4503 d++) {
4504 struct md_rdev *rdev = conf->mirrors[d].rdev;
4505 if (rdev)
4506 clear_bit(In_sync, &rdev->flags);
4507 rdev = conf->mirrors[d].replacement;
4508 if (rdev)
4509 clear_bit(In_sync, &rdev->flags);
4510 }
4511 }
4512 mddev->layout = mddev->new_layout;
4513 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4514 mddev->reshape_position = MaxSector;
4515 mddev->delta_disks = 0;
4516 mddev->reshape_backwards = 0;
4517}
4518
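/* The md personality hooks that plug raid10 into the md core. */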
4519static struct md_personality raid10_personality =
4520{
4521 .name = "raid10",
4522 .level = 10,
4523 .owner = THIS_MODULE,
4524 .make_request = make_request,
4525 .run = run,
4526 .stop = stop,
4527 .status = status,
4528 .error_handler = error,
4529 .hot_add_disk = raid10_add_disk,
4530 .hot_remove_disk= raid10_remove_disk,
4531 .spare_active = raid10_spare_active,
4532 .sync_request = sync_request,
4533 .quiesce = raid10_quiesce,
4534 .size = raid10_size,
4535 .resize = raid10_resize,
4536 .takeover = raid10_takeover,
4537 .check_reshape = raid10_check_reshape,
4538 .start_reshape = raid10_start_reshape,
4539 .finish_reshape = raid10_finish_reshape,
4540};
4541
4542static int __init raid_init(void)
4543{
4544 return register_md_personality(&raid10_personality);
4545}
4546
4547static void raid_exit(void)
4548{
4549 unregister_md_personality(&raid10_personality);
4550}
4551
4552module_init(raid_init);
4553module_exit(raid_exit);
4554MODULE_LICENSE("GPL");
4555MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4556MODULE_ALIAS("md-personality-9");
4557MODULE_ALIAS("md-raid10");
4558MODULE_ALIAS("md-level-10");
4559
4560module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4561