#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"
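/*
 * RAID10 combines striping with mirroring.  The logical address space is
 * divided into chunks; each chunk is stored on 'near_copies' adjacent
 * devices, and that whole layout is repeated 'far_copies' times further
 * along the devices (rotated by near_copies devices each time), so every
 * block exists near_copies * far_copies times, each copy on a different
 * device.  With 'far_offset' the far copies are placed in the immediately
 * following stripe rather than in a separate section of each device.
 * See raid10_find_phys() below for the exact sector mapping.
 */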
#define NR_RAID10_BIOS 256

static int max_queued_requests = 1024;

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int enough(struct r10conf *conf, int ignore);

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
        struct r10conf *conf = data;
        int size = offsetof(struct r10bio, devs[conf->copies]);

        /* allocate an r10bio with room for 'copies' entries in the devs array */
        return kzalloc(size, gfp_flags);
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
        kfree(r10_bio);
}

/* size of each resync request, and the number of pages needed to hold it */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

#define RESYNC_WINDOW (1024*1024)

#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
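/*
 * r10buf_pool_alloc() allocates the buffers used for resync and recovery:
 * an r10bio plus one bio per copy that will take part (all copies for a
 * resync, two for a recovery), each backed by RESYNC_PAGES pages.  During
 * recovery the second bio shares the pages of the first, since the data
 * read from one device is simply written to the other; replacement bios
 * likewise share pages with their primary bio.
 */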
102static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
103{
104 struct r10conf *conf = data;
105 struct page *page;
106 struct r10bio *r10_bio;
107 struct bio *bio;
108 int i, j;
109 int nalloc;
110
111 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
112 if (!r10_bio)
113 return NULL;
114
115 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
116 nalloc = conf->copies;
117 else
118 nalloc = 2;
119
120
121
122
123 for (j = nalloc ; j-- ; ) {
124 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
125 if (!bio)
126 goto out_free_bio;
127 r10_bio->devs[j].bio = bio;
128 if (!conf->have_replacement)
129 continue;
130 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
131 if (!bio)
132 goto out_free_bio;
133 r10_bio->devs[j].repl_bio = bio;
134 }
135
136
137
138
139 for (j = 0 ; j < nalloc; j++) {
140 struct bio *rbio = r10_bio->devs[j].repl_bio;
141 bio = r10_bio->devs[j].bio;
142 for (i = 0; i < RESYNC_PAGES; i++) {
143 if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
144 &conf->mddev->recovery)) {
145
146 struct bio *rbio = r10_bio->devs[0].bio;
147 page = rbio->bi_io_vec[i].bv_page;
148 get_page(page);
149 } else
150 page = alloc_page(gfp_flags);
151 if (unlikely(!page))
152 goto out_free_pages;
153
154 bio->bi_io_vec[i].bv_page = page;
155 if (rbio)
156 rbio->bi_io_vec[i].bv_page = page;
157 }
158 }
159
160 return r10_bio;
161
162out_free_pages:
163 for ( ; i > 0 ; i--)
164 safe_put_page(bio->bi_io_vec[i-1].bv_page);
165 while (j--)
166 for (i = 0; i < RESYNC_PAGES ; i++)
167 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
168 j = -1;
169out_free_bio:
170 while (++j < nalloc) {
171 bio_put(r10_bio->devs[j].bio);
172 if (r10_bio->devs[j].repl_bio)
173 bio_put(r10_bio->devs[j].repl_bio);
174 }
175 r10bio_pool_free(r10_bio, conf);
176 return NULL;
177}
178
179static void r10buf_pool_free(void *__r10_bio, void *data)
180{
181 int i;
182 struct r10conf *conf = data;
183 struct r10bio *r10bio = __r10_bio;
184 int j;
185
186 for (j=0; j < conf->copies; j++) {
187 struct bio *bio = r10bio->devs[j].bio;
188 if (bio) {
189 for (i = 0; i < RESYNC_PAGES; i++) {
190 safe_put_page(bio->bi_io_vec[i].bv_page);
191 bio->bi_io_vec[i].bv_page = NULL;
192 }
193 bio_put(bio);
194 }
195 bio = r10bio->devs[j].repl_bio;
196 if (bio)
197 bio_put(bio);
198 }
199 r10bio_pool_free(r10bio, conf);
200}
201
202static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
203{
204 int i;
205
206 for (i = 0; i < conf->copies; i++) {
207 struct bio **bio = & r10_bio->devs[i].bio;
208 if (!BIO_SPECIAL(*bio))
209 bio_put(*bio);
210 *bio = NULL;
211 bio = &r10_bio->devs[i].repl_bio;
212 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
213 bio_put(*bio);
214 *bio = NULL;
215 }
216}
217
218static void free_r10bio(struct r10bio *r10_bio)
219{
220 struct r10conf *conf = r10_bio->mddev->private;
221
222 put_all_bios(conf, r10_bio);
223 mempool_free(r10_bio, conf->r10bio_pool);
224}
225
226static void put_buf(struct r10bio *r10_bio)
227{
228 struct r10conf *conf = r10_bio->mddev->private;
229
230 mempool_free(r10_bio, conf->r10buf_pool);
231
232 lower_barrier(conf);
233}
234
235static void reschedule_retry(struct r10bio *r10_bio)
236{
237 unsigned long flags;
238 struct mddev *mddev = r10_bio->mddev;
239 struct r10conf *conf = mddev->private;
240
241 spin_lock_irqsave(&conf->device_lock, flags);
242 list_add(&r10_bio->retry_list, &conf->retry_list);
243 conf->nr_queued ++;
244 spin_unlock_irqrestore(&conf->device_lock, flags);
245
246
247 wake_up(&conf->wait_barrier);
248
249 md_wakeup_thread(mddev->thread);
250}
251
/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * request and are ready to return a success/failure status to the owner
 * of the original bio.
 */
257static void raid_end_bio_io(struct r10bio *r10_bio)
258{
259 struct bio *bio = r10_bio->master_bio;
260 int done;
261 struct r10conf *conf = r10_bio->mddev->private;
262
263 if (bio->bi_phys_segments) {
264 unsigned long flags;
265 spin_lock_irqsave(&conf->device_lock, flags);
266 bio->bi_phys_segments--;
267 done = (bio->bi_phys_segments == 0);
268 spin_unlock_irqrestore(&conf->device_lock, flags);
269 } else
270 done = 1;
271 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
272 clear_bit(BIO_UPTODATE, &bio->bi_flags);
273 if (done) {
274 bio_endio(bio, 0);
275
276
277
278
279 allow_barrier(conf);
280 }
281 free_r10bio(r10_bio);
282}
283
284
285
286
287static inline void update_head_pos(int slot, struct r10bio *r10_bio)
288{
289 struct r10conf *conf = r10_bio->mddev->private;
290
291 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
292 r10_bio->devs[slot].addr + (r10_bio->sectors);
293}
294
295
296
297
298static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
299 struct bio *bio, int *slotp, int *replp)
300{
301 int slot;
302 int repl = 0;
303
304 for (slot = 0; slot < conf->copies; slot++) {
305 if (r10_bio->devs[slot].bio == bio)
306 break;
307 if (r10_bio->devs[slot].repl_bio == bio) {
308 repl = 1;
309 break;
310 }
311 }
312
313 BUG_ON(slot == conf->copies);
314 update_head_pos(slot, r10_bio);
315
316 if (slotp)
317 *slotp = slot;
318 if (replp)
319 *replp = repl;
320 return r10_bio->devs[slot].devnum;
321}
322
323static void raid10_end_read_request(struct bio *bio, int error)
324{
325 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
326 struct r10bio *r10_bio = bio->bi_private;
327 int slot, dev;
328 struct md_rdev *rdev;
329 struct r10conf *conf = r10_bio->mddev->private;
330
331
332 slot = r10_bio->read_slot;
333 dev = r10_bio->devs[slot].devnum;
334 rdev = r10_bio->devs[slot].rdev;
335
336
337
338 update_head_pos(slot, r10_bio);
339
340 if (uptodate) {
                /*
                 * Set R10BIO_Uptodate in our master bio, so that a good
                 * status is reported to the caller even if I/O to some
                 * other mirror fails later.
                 */
350 set_bit(R10BIO_Uptodate, &r10_bio->state);
351 } else {
                /*
                 * The read failed.  If no other copy of this block is
                 * available (the array does not have enough working
                 * devices without this one), return the error rather than
                 * failing the last remaining device; "uptodate" is
                 * redefined here to mean "don't retry".
                 */
357 unsigned long flags;
358 spin_lock_irqsave(&conf->device_lock, flags);
359 if (!enough(conf, rdev->raid_disk))
360 uptodate = 1;
361 spin_unlock_irqrestore(&conf->device_lock, flags);
362 }
363 if (uptodate) {
364 raid_end_bio_io(r10_bio);
365 rdev_dec_pending(rdev, conf->mddev);
366 } else {
367
368
369
370 char b[BDEVNAME_SIZE];
371 printk_ratelimited(KERN_ERR
372 "md/raid10:%s: %s: rescheduling sector %llu\n",
373 mdname(conf->mddev),
374 bdevname(rdev->bdev, b),
375 (unsigned long long)r10_bio->sector);
376 set_bit(R10BIO_ReadError, &r10_bio->state);
377 reschedule_retry(r10_bio);
378 }
379}
380
381static void close_write(struct r10bio *r10_bio)
382{
383
384 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
385 r10_bio->sectors,
386 !test_bit(R10BIO_Degraded, &r10_bio->state),
387 0);
388 md_write_end(r10_bio->mddev);
389}
390
391static void one_write_done(struct r10bio *r10_bio)
392{
393 if (atomic_dec_and_test(&r10_bio->remaining)) {
394 if (test_bit(R10BIO_WriteError, &r10_bio->state))
395 reschedule_retry(r10_bio);
396 else {
397 close_write(r10_bio);
398 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
399 reschedule_retry(r10_bio);
400 else
401 raid_end_bio_io(r10_bio);
402 }
403 }
404}
405
406static void raid10_end_write_request(struct bio *bio, int error)
407{
408 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
409 struct r10bio *r10_bio = bio->bi_private;
410 int dev;
411 int dec_rdev = 1;
412 struct r10conf *conf = r10_bio->mddev->private;
413 int slot, repl;
414 struct md_rdev *rdev = NULL;
415
416 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
417
418 if (repl)
419 rdev = conf->mirrors[dev].replacement;
420 if (!rdev) {
421 smp_rmb();
422 repl = 0;
423 rdev = conf->mirrors[dev].rdev;
424 }
425
426
427
428 if (!uptodate) {
429 if (repl)
430
431
432
433 md_error(rdev->mddev, rdev);
434 else {
435 set_bit(WriteErrorSeen, &rdev->flags);
436 if (!test_and_set_bit(WantReplacement, &rdev->flags))
437 set_bit(MD_RECOVERY_NEEDED,
438 &rdev->mddev->recovery);
439 set_bit(R10BIO_WriteError, &r10_bio->state);
440 dec_rdev = 0;
441 }
442 } else {
                /*
                 * The write succeeded.  Set R10BIO_Uptodate in the master
                 * bio so a good status is returned to the caller even if
                 * I/O to another mirror fails, and check whether this
                 * write covered a known bad block that can now be cleared.
                 */
452 sector_t first_bad;
453 int bad_sectors;
454
455 set_bit(R10BIO_Uptodate, &r10_bio->state);
456
457
458 if (is_badblock(rdev,
459 r10_bio->devs[slot].addr,
460 r10_bio->sectors,
461 &first_bad, &bad_sectors)) {
462 bio_put(bio);
463 if (repl)
464 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
465 else
466 r10_bio->devs[slot].bio = IO_MADE_GOOD;
467 dec_rdev = 0;
468 set_bit(R10BIO_MadeGood, &r10_bio->state);
469 }
470 }
471
472
473
474
475
476
477 one_write_done(r10_bio);
478 if (dec_rdev)
479 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
480}
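/*
 * raid10_find_phys() maps the logical sector in r10bio->sector to a
 * (device, sector) pair for every copy of that block and records them in
 * r10bio->devs[].  Each chunk lands on 'near_copies' adjacent devices;
 * that layout is repeated 'far_copies' times, each repetition shifted by
 * 'stride' sectors on disk (with 'far_offset', into the next stripe) and
 * rotated by near_copies devices.
 */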
507static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
508{
509 int n,f;
510 sector_t sector;
511 sector_t chunk;
512 sector_t stripe;
513 int dev;
514
515 int slot = 0;
516
517
518 chunk = r10bio->sector >> conf->chunk_shift;
519 sector = r10bio->sector & conf->chunk_mask;
520
521 chunk *= conf->near_copies;
522 stripe = chunk;
523 dev = sector_div(stripe, conf->raid_disks);
524 if (conf->far_offset)
525 stripe *= conf->far_copies;
526
527 sector += stripe << conf->chunk_shift;
528
529
530 for (n=0; n < conf->near_copies; n++) {
531 int d = dev;
532 sector_t s = sector;
533 r10bio->devs[slot].addr = sector;
534 r10bio->devs[slot].devnum = d;
535 slot++;
536
537 for (f = 1; f < conf->far_copies; f++) {
538 d += conf->near_copies;
539 if (d >= conf->raid_disks)
540 d -= conf->raid_disks;
541 s += conf->stride;
542 r10bio->devs[slot].devnum = d;
543 r10bio->devs[slot].addr = s;
544 slot++;
545 }
546 dev++;
547 if (dev >= conf->raid_disks) {
548 dev = 0;
549 sector += (conf->chunk_mask + 1);
550 }
551 }
552 BUG_ON(slot != conf->copies);
553}
554
555static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
556{
557 sector_t offset, chunk, vchunk;
558
559 offset = sector & conf->chunk_mask;
560 if (conf->far_offset) {
561 int fc;
562 chunk = sector >> conf->chunk_shift;
563 fc = sector_div(chunk, conf->far_copies);
564 dev -= fc * conf->near_copies;
565 if (dev < 0)
566 dev += conf->raid_disks;
567 } else {
568 while (sector >= conf->stride) {
569 sector -= conf->stride;
570 if (dev < conf->near_copies)
571 dev += conf->raid_disks - conf->near_copies;
572 else
573 dev -= conf->near_copies;
574 }
575 chunk = sector >> conf->chunk_shift;
576 }
577 vchunk = chunk * conf->raid_disks + dev;
578 sector_div(vchunk, conf->near_copies);
579 return (vchunk << conf->chunk_shift) + offset;
580}
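/*
 * raid10_mergeable_bvec() tells the block layer how many bytes may still
 * be added to a bio ending at the given sector: requests must not cross a
 * chunk boundary, so return the room left in the current chunk (but never
 * reject the first bvec of a bio outright).
 */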
592static int raid10_mergeable_bvec(struct request_queue *q,
593 struct bvec_merge_data *bvm,
594 struct bio_vec *biovec)
595{
596 struct mddev *mddev = q->queuedata;
597 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
598 int max;
599 unsigned int chunk_sectors = mddev->chunk_sectors;
600 unsigned int bio_sectors = bvm->bi_size >> 9;
601
602 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
603 if (max < 0) max = 0;
604 if (max <= biovec->bv_len && bio_sectors == 0)
605 return biovec->bv_len;
606 else
607 return max;
608}
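/*
 * read_balance() chooses the device to read from: among the working,
 * in-sync copies it prefers an idle device when there are several near
 * copies, otherwise the copy whose head position is closest to the target
 * sector.  While a resync is active at or beyond the target, the first
 * usable copy is taken instead.  Bad blocks may shrink the readable range,
 * which is returned in *max_sectors.  The chosen rdev has nr_pending
 * incremented before it is returned.
 */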
629static struct md_rdev *read_balance(struct r10conf *conf,
630 struct r10bio *r10_bio,
631 int *max_sectors)
632{
633 const sector_t this_sector = r10_bio->sector;
634 int disk, slot;
635 int sectors = r10_bio->sectors;
636 int best_good_sectors;
637 sector_t new_distance, best_dist;
638 struct md_rdev *rdev, *best_rdev;
639 int do_balance;
640 int best_slot;
641
642 raid10_find_phys(conf, r10_bio);
643 rcu_read_lock();
644retry:
645 sectors = r10_bio->sectors;
646 best_slot = -1;
647 best_rdev = NULL;
648 best_dist = MaxSector;
649 best_good_sectors = 0;
650 do_balance = 1;
        /*
         * Don't balance among copies while a resync is in progress and
         * this request lies at or beyond the resync point; just use the
         * first readable copy.
         */
657 if (conf->mddev->recovery_cp < MaxSector
658 && (this_sector + sectors >= conf->next_resync))
659 do_balance = 0;
660
661 for (slot = 0; slot < conf->copies ; slot++) {
662 sector_t first_bad;
663 int bad_sectors;
664 sector_t dev_sector;
665
666 if (r10_bio->devs[slot].bio == IO_BLOCKED)
667 continue;
668 disk = r10_bio->devs[slot].devnum;
669 rdev = rcu_dereference(conf->mirrors[disk].replacement);
670 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
671 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
672 rdev = rcu_dereference(conf->mirrors[disk].rdev);
673 if (rdev == NULL)
674 continue;
675 if (test_bit(Faulty, &rdev->flags))
676 continue;
677 if (!test_bit(In_sync, &rdev->flags) &&
678 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
679 continue;
680
681 dev_sector = r10_bio->devs[slot].addr;
682 if (is_badblock(rdev, dev_sector, sectors,
683 &first_bad, &bad_sectors)) {
684 if (best_dist < MaxSector)
685
686 continue;
687 if (first_bad <= dev_sector) {
688
689
690
691
692 bad_sectors -= (dev_sector - first_bad);
693 if (!do_balance && sectors > bad_sectors)
694 sectors = bad_sectors;
695 if (best_good_sectors > sectors)
696 best_good_sectors = sectors;
697 } else {
698 sector_t good_sectors =
699 first_bad - dev_sector;
700 if (good_sectors > best_good_sectors) {
701 best_good_sectors = good_sectors;
702 best_slot = slot;
703 best_rdev = rdev;
704 }
705 if (!do_balance)
706
707 break;
708 }
709 continue;
710 } else
711 best_good_sectors = sectors;
712
713 if (!do_balance)
714 break;
715
716
717
718
719
720 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
721 break;
722
723
724 if (conf->far_copies > 1)
725 new_distance = r10_bio->devs[slot].addr;
726 else
727 new_distance = abs(r10_bio->devs[slot].addr -
728 conf->mirrors[disk].head_position);
729 if (new_distance < best_dist) {
730 best_dist = new_distance;
731 best_slot = slot;
732 best_rdev = rdev;
733 }
734 }
735 if (slot >= conf->copies) {
736 slot = best_slot;
737 rdev = best_rdev;
738 }
739
740 if (slot >= 0) {
741 atomic_inc(&rdev->nr_pending);
742 if (test_bit(Faulty, &rdev->flags)) {
743
744
745
746 rdev_dec_pending(rdev, conf->mddev);
747 goto retry;
748 }
749 r10_bio->read_slot = slot;
750 } else
751 rdev = NULL;
752 rcu_read_unlock();
753 *max_sectors = best_good_sectors;
754
755 return rdev;
756}
757
758static int raid10_congested(void *data, int bits)
759{
760 struct mddev *mddev = data;
761 struct r10conf *conf = mddev->private;
762 int i, ret = 0;
763
764 if ((bits & (1 << BDI_async_congested)) &&
765 conf->pending_count >= max_queued_requests)
766 return 1;
767
768 if (mddev_congested(mddev, bits))
769 return 1;
770 rcu_read_lock();
771 for (i = 0; i < conf->raid_disks && ret == 0; i++) {
772 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
773 if (rdev && !test_bit(Faulty, &rdev->flags)) {
774 struct request_queue *q = bdev_get_queue(rdev->bdev);
775
776 ret |= bdi_congested(&q->backing_dev_info, bits);
777 }
778 }
779 rcu_read_unlock();
780 return ret;
781}
782
783static void flush_pending_writes(struct r10conf *conf)
784{
785
786
787
788 spin_lock_irq(&conf->device_lock);
789
790 if (conf->pending_bio_list.head) {
791 struct bio *bio;
792 bio = bio_list_get(&conf->pending_bio_list);
793 conf->pending_count = 0;
794 spin_unlock_irq(&conf->device_lock);
795
796
797 bitmap_unplug(conf->mddev->bitmap);
798 wake_up(&conf->wait_barrier);
799
800 while (bio) {
801 struct bio *next = bio->bi_next;
802 bio->bi_next = NULL;
803 generic_make_request(bio);
804 bio = next;
805 }
806 } else
807 spin_unlock_irq(&conf->device_lock);
808}
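/*
 * Barriers: resync/recovery and normal I/O exclude each other.  The resync
 * thread calls raise_barrier(), which waits for any barrier waiters to get
 * in first and for all pending normal I/O to drain, and lower_barrier()
 * when a resync request completes.  Normal I/O brackets itself with
 * wait_barrier()/allow_barrier(): wait_barrier() blocks while the barrier
 * is raised and then counts the request in nr_pending.  freeze_array() and
 * unfreeze_array() additionally wait for in-flight requests to finish or
 * be queued for retry, so the array can be worked on safely.
 */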
832static void raise_barrier(struct r10conf *conf, int force)
833{
834 BUG_ON(force && !conf->barrier);
835 spin_lock_irq(&conf->resync_lock);
836
837
838 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
839 conf->resync_lock, );
840
841
842 conf->barrier++;
843
844
845 wait_event_lock_irq(conf->wait_barrier,
846 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
847 conf->resync_lock, );
848
849 spin_unlock_irq(&conf->resync_lock);
850}
851
852static void lower_barrier(struct r10conf *conf)
853{
854 unsigned long flags;
855 spin_lock_irqsave(&conf->resync_lock, flags);
856 conf->barrier--;
857 spin_unlock_irqrestore(&conf->resync_lock, flags);
858 wake_up(&conf->wait_barrier);
859}
860
861static void wait_barrier(struct r10conf *conf)
862{
863 spin_lock_irq(&conf->resync_lock);
864 if (conf->barrier) {
865 conf->nr_waiting++;
866 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
867 conf->resync_lock,
868 );
869 conf->nr_waiting--;
870 }
871 conf->nr_pending++;
872 spin_unlock_irq(&conf->resync_lock);
873}
874
875static void allow_barrier(struct r10conf *conf)
876{
877 unsigned long flags;
878 spin_lock_irqsave(&conf->resync_lock, flags);
879 conf->nr_pending--;
880 spin_unlock_irqrestore(&conf->resync_lock, flags);
881 wake_up(&conf->wait_barrier);
882}
883
884static void freeze_array(struct r10conf *conf)
885{
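        /*
         * Block any new requests and wait until every request currently in
         * flight has completed or been queued for raid10d, i.e. until
         * nr_pending == nr_queued + 1 (the +1 is the failed request on
         * whose behalf we are freezing).  flush_pending_writes() is run
         * while waiting so that queued writes cannot hold us up.
         */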
898 spin_lock_irq(&conf->resync_lock);
899 conf->barrier++;
900 conf->nr_waiting++;
901 wait_event_lock_irq(conf->wait_barrier,
902 conf->nr_pending == conf->nr_queued+1,
903 conf->resync_lock,
904 flush_pending_writes(conf));
905
906 spin_unlock_irq(&conf->resync_lock);
907}
908
909static void unfreeze_array(struct r10conf *conf)
910{
911
912 spin_lock_irq(&conf->resync_lock);
913 conf->barrier--;
914 conf->nr_waiting--;
915 wake_up(&conf->wait_barrier);
916 spin_unlock_irq(&conf->resync_lock);
917}
918
919static void make_request(struct mddev *mddev, struct bio * bio)
920{
921 struct r10conf *conf = mddev->private;
922 struct r10bio *r10_bio;
923 struct bio *read_bio;
924 int i;
925 int chunk_sects = conf->chunk_mask + 1;
926 const int rw = bio_data_dir(bio);
927 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
928 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
929 unsigned long flags;
930 struct md_rdev *blocked_rdev;
931 int plugged;
932 int sectors_handled;
933 int max_sectors;
934
935 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
936 md_flush_request(mddev, bio);
937 return;
938 }
939
940
941
942
943 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
944 > chunk_sects &&
945 conf->near_copies < conf->raid_disks)) {
946 struct bio_pair *bp;
947
948 if (bio->bi_vcnt != 1 ||
949 bio->bi_idx != 0)
950 goto bad_map;
951
952
953
954 bp = bio_split(bio,
955 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
                /*
                 * Each half of the split calls wait_barrier().  If the
                 * first succeeds but the second then blocks because a
                 * resync has raised the barrier, we deadlock: the first
                 * half is still queued on this thread and can never
                 * complete.  Holding nr_waiting raised across both calls
                 * stops raise_barrier() from getting in between them.
                 */
965 spin_lock_irq(&conf->resync_lock);
966 conf->nr_waiting++;
967 spin_unlock_irq(&conf->resync_lock);
968
969 make_request(mddev, &bp->bio1);
970 make_request(mddev, &bp->bio2);
971
972 spin_lock_irq(&conf->resync_lock);
973 conf->nr_waiting--;
974 wake_up(&conf->wait_barrier);
975 spin_unlock_irq(&conf->resync_lock);
976
977 bio_pair_release(bp);
978 return;
979 bad_map:
980 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
981 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
982 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
983
984 bio_io_error(bio);
985 return;
986 }
987
988 md_write_start(mddev, bio);
989
990
991
992
993
994
995 wait_barrier(conf);
996
997 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
998
999 r10_bio->master_bio = bio;
1000 r10_bio->sectors = bio->bi_size >> 9;
1001
1002 r10_bio->mddev = mddev;
1003 r10_bio->sector = bio->bi_sector;
1004 r10_bio->state = 0;
        /*
         * A request may need to be split into several r10_bios if it hits
         * bad blocks on some devices.  bio->bi_phys_segments counts the
         * r10_bios still outstanding for this bio: 0 means a single
         * r10_bio and no locking is needed on completion.
         */
1013 bio->bi_phys_segments = 0;
1014 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1015
1016 if (rw == READ) {
1017
1018
1019
1020 struct md_rdev *rdev;
1021 int slot;
1022
1023read_again:
1024 rdev = read_balance(conf, r10_bio, &max_sectors);
1025 if (!rdev) {
1026 raid_end_bio_io(r10_bio);
1027 return;
1028 }
1029 slot = r10_bio->read_slot;
1030
1031 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1032 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1033 max_sectors);
1034
1035 r10_bio->devs[slot].bio = read_bio;
1036 r10_bio->devs[slot].rdev = rdev;
1037
1038 read_bio->bi_sector = r10_bio->devs[slot].addr +
1039 rdev->data_offset;
1040 read_bio->bi_bdev = rdev->bdev;
1041 read_bio->bi_end_io = raid10_end_read_request;
1042 read_bio->bi_rw = READ | do_sync;
1043 read_bio->bi_private = r10_bio;
1044
1045 if (max_sectors < r10_bio->sectors) {
1046
1047
1048
1049 sectors_handled = (r10_bio->sectors + max_sectors
1050 - bio->bi_sector);
1051 r10_bio->sectors = max_sectors;
1052 spin_lock_irq(&conf->device_lock);
1053 if (bio->bi_phys_segments == 0)
1054 bio->bi_phys_segments = 2;
1055 else
1056 bio->bi_phys_segments++;
                        spin_unlock_irq(&conf->device_lock);
                        /*
                         * Cannot call generic_make_request() directly here:
                         * it would sit on the current thread's bio list and
                         * a later mempool_alloc() could block waiting for
                         * it, so hand the r10_bio to raid10d instead.
                         */
1063 reschedule_retry(r10_bio);
1064
1065 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1066
1067 r10_bio->master_bio = bio;
1068 r10_bio->sectors = ((bio->bi_size >> 9)
1069 - sectors_handled);
1070 r10_bio->state = 0;
1071 r10_bio->mddev = mddev;
1072 r10_bio->sector = bio->bi_sector + sectors_handled;
1073 goto read_again;
1074 } else
1075 generic_make_request(read_bio);
1076 return;
1077 }
1078
1079
1080
1081
1082 if (conf->pending_count >= max_queued_requests) {
1083 md_wakeup_thread(mddev->thread);
1084 wait_event(conf->wait_barrier,
1085 conf->pending_count < max_queued_requests);
1086 }
        /*
         * WRITE: first, under rcu_read_lock, select a target rdev (and
         * replacement, if any) for every copy and take a reference on it.
         * Blocked devices and unacknowledged bad blocks make us back off
         * and retry; known bad blocks may shrink the request so the write
         * never touches them.
         */
1098 plugged = mddev_check_plugged(mddev);
1099
1100 r10_bio->read_slot = -1;
1101 raid10_find_phys(conf, r10_bio);
1102retry_write:
1103 blocked_rdev = NULL;
1104 rcu_read_lock();
1105 max_sectors = r10_bio->sectors;
1106
1107 for (i = 0; i < conf->copies; i++) {
1108 int d = r10_bio->devs[i].devnum;
1109 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1110 struct md_rdev *rrdev = rcu_dereference(
1111 conf->mirrors[d].replacement);
1112 if (rdev == rrdev)
1113 rrdev = NULL;
1114 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1115 atomic_inc(&rdev->nr_pending);
1116 blocked_rdev = rdev;
1117 break;
1118 }
1119 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1120 atomic_inc(&rrdev->nr_pending);
1121 blocked_rdev = rrdev;
1122 break;
1123 }
1124 if (rrdev && test_bit(Faulty, &rrdev->flags))
1125 rrdev = NULL;
1126
1127 r10_bio->devs[i].bio = NULL;
1128 r10_bio->devs[i].repl_bio = NULL;
1129 if (!rdev || test_bit(Faulty, &rdev->flags)) {
1130 set_bit(R10BIO_Degraded, &r10_bio->state);
1131 continue;
1132 }
1133 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1134 sector_t first_bad;
1135 sector_t dev_sector = r10_bio->devs[i].addr;
1136 int bad_sectors;
1137 int is_bad;
1138
1139 is_bad = is_badblock(rdev, dev_sector,
1140 max_sectors,
1141 &first_bad, &bad_sectors);
1142 if (is_bad < 0) {
1143
1144
1145
1146 atomic_inc(&rdev->nr_pending);
1147 set_bit(BlockedBadBlocks, &rdev->flags);
1148 blocked_rdev = rdev;
1149 break;
1150 }
1151 if (is_bad && first_bad <= dev_sector) {
1152
1153 bad_sectors -= (dev_sector - first_bad);
1154 if (bad_sectors < max_sectors)
1155
1156
1157
1158 max_sectors = bad_sectors;
1167 continue;
1168 }
1169 if (is_bad) {
1170 int good_sectors = first_bad - dev_sector;
1171 if (good_sectors < max_sectors)
1172 max_sectors = good_sectors;
1173 }
1174 }
1175 r10_bio->devs[i].bio = bio;
1176 atomic_inc(&rdev->nr_pending);
1177 if (rrdev) {
1178 r10_bio->devs[i].repl_bio = bio;
1179 atomic_inc(&rrdev->nr_pending);
1180 }
1181 }
1182 rcu_read_unlock();
1183
1184 if (unlikely(blocked_rdev)) {
1185
1186 int j;
1187 int d;
1188
1189 for (j = 0; j < i; j++) {
1190 if (r10_bio->devs[j].bio) {
1191 d = r10_bio->devs[j].devnum;
1192 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1193 }
1194 if (r10_bio->devs[j].repl_bio) {
1195 struct md_rdev *rdev;
1196 d = r10_bio->devs[j].devnum;
1197 rdev = conf->mirrors[d].replacement;
1198 if (!rdev) {
1199
1200 smp_mb();
1201 rdev = conf->mirrors[d].rdev;
1202 }
1203 rdev_dec_pending(rdev, mddev);
1204 }
1205 }
1206 allow_barrier(conf);
1207 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1208 wait_barrier(conf);
1209 goto retry_write;
1210 }
1211
1212 if (max_sectors < r10_bio->sectors) {
1213
1214
1215
1216 r10_bio->sectors = max_sectors;
1217 spin_lock_irq(&conf->device_lock);
1218 if (bio->bi_phys_segments == 0)
1219 bio->bi_phys_segments = 2;
1220 else
1221 bio->bi_phys_segments++;
1222 spin_unlock_irq(&conf->device_lock);
1223 }
1224 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1225
1226 atomic_set(&r10_bio->remaining, 1);
1227 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1228
1229 for (i = 0; i < conf->copies; i++) {
1230 struct bio *mbio;
1231 int d = r10_bio->devs[i].devnum;
1232 if (!r10_bio->devs[i].bio)
1233 continue;
1234
1235 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1236 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1237 max_sectors);
1238 r10_bio->devs[i].bio = mbio;
1239
1240 mbio->bi_sector = (r10_bio->devs[i].addr+
1241 conf->mirrors[d].rdev->data_offset);
1242 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1243 mbio->bi_end_io = raid10_end_write_request;
1244 mbio->bi_rw = WRITE | do_sync | do_fua;
1245 mbio->bi_private = r10_bio;
1246
1247 atomic_inc(&r10_bio->remaining);
1248 spin_lock_irqsave(&conf->device_lock, flags);
1249 bio_list_add(&conf->pending_bio_list, mbio);
1250 conf->pending_count++;
1251 spin_unlock_irqrestore(&conf->device_lock, flags);
1252
1253 if (!r10_bio->devs[i].repl_bio)
1254 continue;
1255
1256 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1257 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1258 max_sectors);
1259 r10_bio->devs[i].repl_bio = mbio;
1260
1261
1262
1263
1264
1265 mbio->bi_sector = (r10_bio->devs[i].addr+
1266 conf->mirrors[d].replacement->data_offset);
1267 mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
1268 mbio->bi_end_io = raid10_end_write_request;
1269 mbio->bi_rw = WRITE | do_sync | do_fua;
1270 mbio->bi_private = r10_bio;
1271
1272 atomic_inc(&r10_bio->remaining);
1273 spin_lock_irqsave(&conf->device_lock, flags);
1274 bio_list_add(&conf->pending_bio_list, mbio);
1275 conf->pending_count++;
1276 spin_unlock_irqrestore(&conf->device_lock, flags);
1277 }
1278
1279
1280
1281
1282
1283 if (sectors_handled < (bio->bi_size >> 9)) {
1284 one_write_done(r10_bio);
1285
1286
1287
1288 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1289
1290 r10_bio->master_bio = bio;
1291 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1292
1293 r10_bio->mddev = mddev;
1294 r10_bio->sector = bio->bi_sector + sectors_handled;
1295 r10_bio->state = 0;
1296 goto retry_write;
1297 }
1298 one_write_done(r10_bio);
1299
1300
1301 wake_up(&conf->wait_barrier);
1302
1303 if (do_sync || !mddev->bitmap || !plugged)
1304 md_wakeup_thread(mddev->thread);
1305}
1306
1307static void status(struct seq_file *seq, struct mddev *mddev)
1308{
1309 struct r10conf *conf = mddev->private;
1310 int i;
1311
1312 if (conf->near_copies < conf->raid_disks)
1313 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1314 if (conf->near_copies > 1)
1315 seq_printf(seq, " %d near-copies", conf->near_copies);
1316 if (conf->far_copies > 1) {
1317 if (conf->far_offset)
1318 seq_printf(seq, " %d offset-copies", conf->far_copies);
1319 else
1320 seq_printf(seq, " %d far-copies", conf->far_copies);
1321 }
1322 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1323 conf->raid_disks - mddev->degraded);
1324 for (i = 0; i < conf->raid_disks; i++)
1325 seq_printf(seq, "%s",
1326 conf->mirrors[i].rdev &&
1327 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1328 seq_printf(seq, "]");
1329}
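/*
 * enough() returns true if the array still holds at least one working copy
 * of every block, treating device 'ignore' (if >= 0) as failed: every
 * group of 'copies' consecutive raid disks must contain at least one
 * present device.
 */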
1336static int enough(struct r10conf *conf, int ignore)
1337{
1338 int first = 0;
1339
1340 do {
1341 int n = conf->copies;
1342 int cnt = 0;
1343 while (n--) {
1344 if (conf->mirrors[first].rdev &&
1345 first != ignore)
1346 cnt++;
1347 first = (first+1) % conf->raid_disks;
1348 }
1349 if (cnt == 0)
1350 return 0;
1351 } while (first != 0);
1352 return 1;
1353}
1354
1355static void error(struct mddev *mddev, struct md_rdev *rdev)
1356{
1357 char b[BDEVNAME_SIZE];
1358 struct r10conf *conf = mddev->private;
1359
1360
1361
1362
1363
1364
1365
1366 if (test_bit(In_sync, &rdev->flags)
1367 && !enough(conf, rdev->raid_disk))
1368
1369
1370
1371 return;
1372 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1373 unsigned long flags;
1374 spin_lock_irqsave(&conf->device_lock, flags);
1375 mddev->degraded++;
1376 spin_unlock_irqrestore(&conf->device_lock, flags);
1377
1378
1379
1380 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1381 }
1382 set_bit(Blocked, &rdev->flags);
1383 set_bit(Faulty, &rdev->flags);
1384 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1385 printk(KERN_ALERT
1386 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1387 "md/raid10:%s: Operation continuing on %d devices.\n",
1388 mdname(mddev), bdevname(rdev->bdev, b),
1389 mdname(mddev), conf->raid_disks - mddev->degraded);
1390}
1391
1392static void print_conf(struct r10conf *conf)
1393{
1394 int i;
1395 struct mirror_info *tmp;
1396
1397 printk(KERN_DEBUG "RAID10 conf printout:\n");
1398 if (!conf) {
1399 printk(KERN_DEBUG "(!conf)\n");
1400 return;
1401 }
1402 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1403 conf->raid_disks);
1404
1405 for (i = 0; i < conf->raid_disks; i++) {
1406 char b[BDEVNAME_SIZE];
1407 tmp = conf->mirrors + i;
1408 if (tmp->rdev)
1409 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1410 i, !test_bit(In_sync, &tmp->rdev->flags),
1411 !test_bit(Faulty, &tmp->rdev->flags),
1412 bdevname(tmp->rdev->bdev,b));
1413 }
1414}
1415
1416static void close_sync(struct r10conf *conf)
1417{
1418 wait_barrier(conf);
1419 allow_barrier(conf);
1420
1421 mempool_destroy(conf->r10buf_pool);
1422 conf->r10buf_pool = NULL;
1423}
1424
1425static int raid10_spare_active(struct mddev *mddev)
1426{
1427 int i;
1428 struct r10conf *conf = mddev->private;
1429 struct mirror_info *tmp;
1430 int count = 0;
1431 unsigned long flags;
1432
1433
1434
1435
1436
1437 for (i = 0; i < conf->raid_disks; i++) {
1438 tmp = conf->mirrors + i;
1439 if (tmp->replacement
1440 && tmp->replacement->recovery_offset == MaxSector
1441 && !test_bit(Faulty, &tmp->replacement->flags)
1442 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1443
1444 if (!tmp->rdev
1445 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1446 count++;
1447 if (tmp->rdev) {
1448
1449
1450
1451
1452 set_bit(Faulty, &tmp->rdev->flags);
1453 sysfs_notify_dirent_safe(
1454 tmp->rdev->sysfs_state);
1455 }
1456 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1457 } else if (tmp->rdev
1458 && !test_bit(Faulty, &tmp->rdev->flags)
1459 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1460 count++;
1461 sysfs_notify_dirent(tmp->rdev->sysfs_state);
1462 }
1463 }
1464 spin_lock_irqsave(&conf->device_lock, flags);
1465 mddev->degraded -= count;
1466 spin_unlock_irqrestore(&conf->device_lock, flags);
1467
1468 print_conf(conf);
1469 return count;
1470}
1471
1472
1473static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1474{
1475 struct r10conf *conf = mddev->private;
1476 int err = -EEXIST;
1477 int mirror;
1478 int first = 0;
1479 int last = conf->raid_disks - 1;
1480
1481 if (mddev->recovery_cp < MaxSector)
1482
1483
1484
1485 return -EBUSY;
1486 if (!enough(conf, -1))
1487 return -EINVAL;
1488
1489 if (rdev->raid_disk >= 0)
1490 first = last = rdev->raid_disk;
1491
1492 if (rdev->saved_raid_disk >= first &&
1493 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1494 mirror = rdev->saved_raid_disk;
1495 else
1496 mirror = first;
1497 for ( ; mirror <= last ; mirror++) {
1498 struct mirror_info *p = &conf->mirrors[mirror];
1499 if (p->recovery_disabled == mddev->recovery_disabled)
1500 continue;
1501 if (p->rdev) {
1502 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1503 p->replacement != NULL)
1504 continue;
1505 clear_bit(In_sync, &rdev->flags);
1506 set_bit(Replacement, &rdev->flags);
1507 rdev->raid_disk = mirror;
1508 err = 0;
1509 disk_stack_limits(mddev->gendisk, rdev->bdev,
1510 rdev->data_offset << 9);
1511 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1512 blk_queue_max_segments(mddev->queue, 1);
1513 blk_queue_segment_boundary(mddev->queue,
1514 PAGE_CACHE_SIZE - 1);
1515 }
1516 conf->fullsync = 1;
1517 rcu_assign_pointer(p->replacement, rdev);
1518 break;
1519 }
1520
1521 disk_stack_limits(mddev->gendisk, rdev->bdev,
1522 rdev->data_offset << 9);
1523
1524
1525
1526
1527
1528
1529 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1530 blk_queue_max_segments(mddev->queue, 1);
1531 blk_queue_segment_boundary(mddev->queue,
1532 PAGE_CACHE_SIZE - 1);
1533 }
1534
1535 p->head_position = 0;
1536 p->recovery_disabled = mddev->recovery_disabled - 1;
1537 rdev->raid_disk = mirror;
1538 err = 0;
1539 if (rdev->saved_raid_disk != mirror)
1540 conf->fullsync = 1;
1541 rcu_assign_pointer(p->rdev, rdev);
1542 break;
1543 }
1544
1545 md_integrity_add_rdev(rdev, mddev);
1546 print_conf(conf);
1547 return err;
1548}
1549
1550static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1551{
1552 struct r10conf *conf = mddev->private;
1553 int err = 0;
1554 int number = rdev->raid_disk;
1555 struct md_rdev **rdevp;
1556 struct mirror_info *p = conf->mirrors + number;
1557
1558 print_conf(conf);
1559 if (rdev == p->rdev)
1560 rdevp = &p->rdev;
1561 else if (rdev == p->replacement)
1562 rdevp = &p->replacement;
1563 else
1564 return 0;
1565
1566 if (test_bit(In_sync, &rdev->flags) ||
1567 atomic_read(&rdev->nr_pending)) {
1568 err = -EBUSY;
1569 goto abort;
1570 }
1571
1572
1573
1574 if (!test_bit(Faulty, &rdev->flags) &&
1575 mddev->recovery_disabled != p->recovery_disabled &&
1576 (!p->replacement || p->replacement == rdev) &&
1577 enough(conf, -1)) {
1578 err = -EBUSY;
1579 goto abort;
1580 }
1581 *rdevp = NULL;
1582 synchronize_rcu();
1583 if (atomic_read(&rdev->nr_pending)) {
1584
1585 err = -EBUSY;
1586 *rdevp = rdev;
1587 goto abort;
1588 } else if (p->replacement) {
1589
1590 p->rdev = p->replacement;
1591 clear_bit(Replacement, &p->replacement->flags);
1592 smp_mb();
1593
1594
1595 p->replacement = NULL;
1596 clear_bit(WantReplacement, &rdev->flags);
1597 } else
1598
1599
1600
1601 clear_bit(WantReplacement, &rdev->flags);
1602
1603 err = md_integrity_register(mddev);
1604
1605abort:
1606
1607 print_conf(conf);
1608 return err;
1609}
1610
1611
1612static void end_sync_read(struct bio *bio, int error)
1613{
1614 struct r10bio *r10_bio = bio->bi_private;
1615 struct r10conf *conf = r10_bio->mddev->private;
1616 int d;
1617
1618 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1619
1620 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1621 set_bit(R10BIO_Uptodate, &r10_bio->state);
1622 else
1623
1624
1625
1626 atomic_add(r10_bio->sectors,
1627 &conf->mirrors[d].rdev->corrected_errors);
1628
1629
1630
1631
1632 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1633 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1634 atomic_dec_and_test(&r10_bio->remaining)) {
1635
1636
1637
1638 reschedule_retry(r10_bio);
1639 }
1640}
1641
1642static void end_sync_request(struct r10bio *r10_bio)
1643{
1644 struct mddev *mddev = r10_bio->mddev;
1645
1646 while (atomic_dec_and_test(&r10_bio->remaining)) {
1647 if (r10_bio->master_bio == NULL) {
1648
1649 sector_t s = r10_bio->sectors;
1650 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1651 test_bit(R10BIO_WriteError, &r10_bio->state))
1652 reschedule_retry(r10_bio);
1653 else
1654 put_buf(r10_bio);
1655 md_done_sync(mddev, s, 1);
1656 break;
1657 } else {
1658 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1659 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1660 test_bit(R10BIO_WriteError, &r10_bio->state))
1661 reschedule_retry(r10_bio);
1662 else
1663 put_buf(r10_bio);
1664 r10_bio = r10_bio2;
1665 }
1666 }
1667}
1668
1669static void end_sync_write(struct bio *bio, int error)
1670{
1671 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1672 struct r10bio *r10_bio = bio->bi_private;
1673 struct mddev *mddev = r10_bio->mddev;
1674 struct r10conf *conf = mddev->private;
1675 int d;
1676 sector_t first_bad;
1677 int bad_sectors;
1678 int slot;
1679 int repl;
1680 struct md_rdev *rdev = NULL;
1681
1682 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1683 if (repl)
1684 rdev = conf->mirrors[d].replacement;
1685 if (!rdev) {
1686 smp_mb();
1687 rdev = conf->mirrors[d].rdev;
1688 }
1689
1690 if (!uptodate) {
1691 if (repl)
1692 md_error(mddev, rdev);
1693 else {
1694 set_bit(WriteErrorSeen, &rdev->flags);
1695 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1696 set_bit(MD_RECOVERY_NEEDED,
1697 &rdev->mddev->recovery);
1698 set_bit(R10BIO_WriteError, &r10_bio->state);
1699 }
1700 } else if (is_badblock(rdev,
1701 r10_bio->devs[slot].addr,
1702 r10_bio->sectors,
1703 &first_bad, &bad_sectors))
1704 set_bit(R10BIO_MadeGood, &r10_bio->state);
1705
1706 rdev_dec_pending(rdev, mddev);
1707
1708 end_sync_request(r10_bio);
1709}
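/*
 * sync_request_write() runs once the resync reads for a block range have
 * completed.  The first successfully-read copy is taken as the reference;
 * every other copy that failed to read or whose contents differ is
 * rewritten from it (for a 'check' pass, mismatches are only counted), and
 * any prepared replacement bios are written out as well.
 */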
1727static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1728{
1729 struct r10conf *conf = mddev->private;
1730 int i, first;
1731 struct bio *tbio, *fbio;
1732
1733 atomic_set(&r10_bio->remaining, 1);
1734
1735
1736 for (i=0; i<conf->copies; i++)
1737 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1738 break;
1739
1740 if (i == conf->copies)
1741 goto done;
1742
1743 first = i;
1744 fbio = r10_bio->devs[i].bio;
1745
1746
1747 for (i=0 ; i < conf->copies ; i++) {
1748 int j, d;
1749 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1750
1751 tbio = r10_bio->devs[i].bio;
1752
1753 if (tbio->bi_end_io != end_sync_read)
1754 continue;
1755 if (i == first)
1756 continue;
1757 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1758
1759
1760
1761
1762 for (j = 0; j < vcnt; j++)
1763 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1764 page_address(tbio->bi_io_vec[j].bv_page),
1765 PAGE_SIZE))
1766 break;
1767 if (j == vcnt)
1768 continue;
1769 mddev->resync_mismatches += r10_bio->sectors;
1770 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1771
1772 continue;
1773 }
1774
1775
1776
1777
1778
1779 tbio->bi_vcnt = vcnt;
1780 tbio->bi_size = r10_bio->sectors << 9;
1781 tbio->bi_idx = 0;
1782 tbio->bi_phys_segments = 0;
1783 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1784 tbio->bi_flags |= 1 << BIO_UPTODATE;
1785 tbio->bi_next = NULL;
1786 tbio->bi_rw = WRITE;
1787 tbio->bi_private = r10_bio;
1788 tbio->bi_sector = r10_bio->devs[i].addr;
1789
1790 for (j=0; j < vcnt ; j++) {
1791 tbio->bi_io_vec[j].bv_offset = 0;
1792 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1793
1794 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1795 page_address(fbio->bi_io_vec[j].bv_page),
1796 PAGE_SIZE);
1797 }
1798 tbio->bi_end_io = end_sync_write;
1799
1800 d = r10_bio->devs[i].devnum;
1801 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1802 atomic_inc(&r10_bio->remaining);
1803 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1804
1805 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1806 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1807 generic_make_request(tbio);
1808 }
1809
1810
1811
1812
1813 for (i = 0; i < conf->copies; i++) {
1814 int j, d;
1815 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1816
1817 tbio = r10_bio->devs[i].repl_bio;
1818 if (!tbio || !tbio->bi_end_io)
1819 continue;
1820 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
1821 && r10_bio->devs[i].bio != fbio)
1822 for (j = 0; j < vcnt; j++)
1823 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1824 page_address(fbio->bi_io_vec[j].bv_page),
1825 PAGE_SIZE);
1826 d = r10_bio->devs[i].devnum;
1827 atomic_inc(&r10_bio->remaining);
1828 md_sync_acct(conf->mirrors[d].replacement->bdev,
1829 tbio->bi_size >> 9);
1830 generic_make_request(tbio);
1831 }
1832
1833done:
1834 if (atomic_dec_and_test(&r10_bio->remaining)) {
1835 md_done_sync(mddev, r10_bio->sectors, 1);
1836 put_buf(r10_bio);
1837 }
1838}
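/*
 * fix_recovery_read_error() is called when a read fails during recovery.
 * The range is retried in page-sized pieces: each piece is read from the
 * source device and written to the device being recovered.  Failed sectors
 * are recorded as bad blocks; if a bad block cannot be recorded on the
 * device being recovered, recovery of that device is aborted.
 */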
1850static void fix_recovery_read_error(struct r10bio *r10_bio)
1851{
1859 struct mddev *mddev = r10_bio->mddev;
1860 struct r10conf *conf = mddev->private;
1861 struct bio *bio = r10_bio->devs[0].bio;
1862 sector_t sect = 0;
1863 int sectors = r10_bio->sectors;
1864 int idx = 0;
1865 int dr = r10_bio->devs[0].devnum;
1866 int dw = r10_bio->devs[1].devnum;
1867
1868 while (sectors) {
1869 int s = sectors;
1870 struct md_rdev *rdev;
1871 sector_t addr;
1872 int ok;
1873
1874 if (s > (PAGE_SIZE>>9))
1875 s = PAGE_SIZE >> 9;
1876
1877 rdev = conf->mirrors[dr].rdev;
                addr = r10_bio->devs[0].addr + sect;
1879 ok = sync_page_io(rdev,
1880 addr,
1881 s << 9,
1882 bio->bi_io_vec[idx].bv_page,
1883 READ, false);
1884 if (ok) {
1885 rdev = conf->mirrors[dw].rdev;
1886 addr = r10_bio->devs[1].addr + sect;
1887 ok = sync_page_io(rdev,
1888 addr,
1889 s << 9,
1890 bio->bi_io_vec[idx].bv_page,
1891 WRITE, false);
1892 if (!ok) {
1893 set_bit(WriteErrorSeen, &rdev->flags);
1894 if (!test_and_set_bit(WantReplacement,
1895 &rdev->flags))
1896 set_bit(MD_RECOVERY_NEEDED,
1897 &rdev->mddev->recovery);
1898 }
1899 }
1900 if (!ok) {
1901
1902
1903
1904
1905 rdev_set_badblocks(rdev, addr, s, 0);
1906
1907 if (rdev != conf->mirrors[dw].rdev) {
1908
1909 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
1910 addr = r10_bio->devs[1].addr + sect;
1911 ok = rdev_set_badblocks(rdev2, addr, s, 0);
1912 if (!ok) {
1913
1914 printk(KERN_NOTICE
1915 "md/raid10:%s: recovery aborted"
1916 " due to read error\n",
1917 mdname(mddev));
1918
1919 conf->mirrors[dw].recovery_disabled
1920 = mddev->recovery_disabled;
1921 set_bit(MD_RECOVERY_INTR,
1922 &mddev->recovery);
1923 break;
1924 }
1925 }
1926 }
1927
1928 sectors -= s;
1929 sect += s;
1930 idx++;
1931 }
1932}
1933
1934static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1935{
1936 struct r10conf *conf = mddev->private;
1937 int d;
1938 struct bio *wbio, *wbio2;
1939
1940 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
1941 fix_recovery_read_error(r10_bio);
1942 end_sync_request(r10_bio);
1943 return;
1944 }
1945
1946
1947
1948
1949
1950 d = r10_bio->devs[1].devnum;
1951 wbio = r10_bio->devs[1].bio;
1952 wbio2 = r10_bio->devs[1].repl_bio;
1953 if (wbio->bi_end_io) {
1954 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1955 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1956 generic_make_request(wbio);
1957 }
1958 if (wbio2 && wbio2->bi_end_io) {
1959 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
1960 md_sync_acct(conf->mirrors[d].replacement->bdev,
1961 wbio2->bi_size >> 9);
1962 generic_make_request(wbio2);
1963 }
1964}
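/*
 * check_decay_read_errors() ages the persistent read-error count for an
 * rdev: the count is halved for every hour that has passed since the last
 * recorded read error, so that only a sustained error rate can push a
 * device over the max_corr_read_errors limit.
 */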
1973static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
1974{
1975 struct timespec cur_time_mon;
1976 unsigned long hours_since_last;
1977 unsigned int read_errors = atomic_read(&rdev->read_errors);
1978
1979 ktime_get_ts(&cur_time_mon);
1980
1981 if (rdev->last_read_error.tv_sec == 0 &&
1982 rdev->last_read_error.tv_nsec == 0) {
1983
1984 rdev->last_read_error = cur_time_mon;
1985 return;
1986 }
1987
1988 hours_since_last = (cur_time_mon.tv_sec -
1989 rdev->last_read_error.tv_sec) / 3600;
1990
1991 rdev->last_read_error = cur_time_mon;
        /*
         * If enough hours have passed to shift the count down to zero,
         * just clear it; otherwise halve it once per elapsed hour.
         */
1998 if (hours_since_last >= 8 * sizeof(read_errors))
1999 atomic_set(&rdev->read_errors, 0);
2000 else
2001 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2002}
2003
2004static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2005 int sectors, struct page *page, int rw)
2006{
2007 sector_t first_bad;
2008 int bad_sectors;
2009
2010 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2011 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2012 return -1;
2013 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2014
2015 return 1;
2016 if (rw == WRITE) {
2017 set_bit(WriteErrorSeen, &rdev->flags);
2018 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2019 set_bit(MD_RECOVERY_NEEDED,
2020 &rdev->mddev->recovery);
2021 }
2022
2023 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2024 md_error(rdev->mddev, rdev);
2025 return 0;
2026}
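/*
 * fix_read_error() repairs the area around a failed read on a normal
 * request.  Working in page-sized pieces, it reads the data from another
 * in-sync copy, writes it back over the other copies, and re-reads them to
 * confirm the correction.  Sectors that cannot be corrected are recorded
 * as bad blocks or, failing that, the device is failed; a device that
 * exceeds the read-error threshold is failed outright.
 */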
2036static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2037{
2038 int sect = 0;
2039 int sectors = r10_bio->sectors;
2040 struct md_rdev*rdev;
2041 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2042 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2043
2044
2045
2046
2047 rdev = conf->mirrors[d].rdev;
2048
2049 if (test_bit(Faulty, &rdev->flags))
2050
2051
2052 return;
2053
2054 check_decay_read_errors(mddev, rdev);
2055 atomic_inc(&rdev->read_errors);
2056 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2057 char b[BDEVNAME_SIZE];
2058 bdevname(rdev->bdev, b);
2059
2060 printk(KERN_NOTICE
2061 "md/raid10:%s: %s: Raid device exceeded "
2062 "read_error threshold [cur %d:max %d]\n",
2063 mdname(mddev), b,
2064 atomic_read(&rdev->read_errors), max_read_errors);
2065 printk(KERN_NOTICE
2066 "md/raid10:%s: %s: Failing raid device\n",
2067 mdname(mddev), b);
2068 md_error(mddev, conf->mirrors[d].rdev);
2069 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2070 return;
2071 }
2072
2073 while(sectors) {
2074 int s = sectors;
2075 int sl = r10_bio->read_slot;
2076 int success = 0;
2077 int start;
2078
2079 if (s > (PAGE_SIZE>>9))
2080 s = PAGE_SIZE >> 9;
2081
2082 rcu_read_lock();
2083 do {
2084 sector_t first_bad;
2085 int bad_sectors;
2086
2087 d = r10_bio->devs[sl].devnum;
2088 rdev = rcu_dereference(conf->mirrors[d].rdev);
2089 if (rdev &&
2090 test_bit(In_sync, &rdev->flags) &&
2091 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2092 &first_bad, &bad_sectors) == 0) {
2093 atomic_inc(&rdev->nr_pending);
2094 rcu_read_unlock();
2095 success = sync_page_io(rdev,
2096 r10_bio->devs[sl].addr +
2097 sect,
2098 s<<9,
2099 conf->tmppage, READ, false);
2100 rdev_dec_pending(rdev, mddev);
2101 rcu_read_lock();
2102 if (success)
2103 break;
2104 }
2105 sl++;
2106 if (sl == conf->copies)
2107 sl = 0;
2108 } while (!success && sl != r10_bio->read_slot);
2109 rcu_read_unlock();
2110
2111 if (!success) {
2112
2113
2114
2115
2116 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2117 rdev = conf->mirrors[dn].rdev;
2118
2119 if (!rdev_set_badblocks(
2120 rdev,
2121 r10_bio->devs[r10_bio->read_slot].addr
2122 + sect,
2123 s, 0)) {
2124 md_error(mddev, rdev);
2125 r10_bio->devs[r10_bio->read_slot].bio
2126 = IO_BLOCKED;
2127 }
2128 break;
2129 }
2130
2131 start = sl;
2132
2133 rcu_read_lock();
2134 while (sl != r10_bio->read_slot) {
2135 char b[BDEVNAME_SIZE];
2136
2137 if (sl==0)
2138 sl = conf->copies;
2139 sl--;
2140 d = r10_bio->devs[sl].devnum;
2141 rdev = rcu_dereference(conf->mirrors[d].rdev);
2142 if (!rdev ||
2143 !test_bit(In_sync, &rdev->flags))
2144 continue;
2145
2146 atomic_inc(&rdev->nr_pending);
2147 rcu_read_unlock();
2148 if (r10_sync_page_io(rdev,
2149 r10_bio->devs[sl].addr +
2150 sect,
2151 s<<9, conf->tmppage, WRITE)
2152 == 0) {
2153
2154 printk(KERN_NOTICE
2155 "md/raid10:%s: read correction "
2156 "write failed"
2157 " (%d sectors at %llu on %s)\n",
2158 mdname(mddev), s,
2159 (unsigned long long)(
2160 sect + rdev->data_offset),
2161 bdevname(rdev->bdev, b));
2162 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2163 "drive\n",
2164 mdname(mddev),
2165 bdevname(rdev->bdev, b));
2166 }
2167 rdev_dec_pending(rdev, mddev);
2168 rcu_read_lock();
2169 }
2170 sl = start;
2171 while (sl != r10_bio->read_slot) {
2172 char b[BDEVNAME_SIZE];
2173
2174 if (sl==0)
2175 sl = conf->copies;
2176 sl--;
2177 d = r10_bio->devs[sl].devnum;
2178 rdev = rcu_dereference(conf->mirrors[d].rdev);
2179 if (!rdev ||
2180 !test_bit(In_sync, &rdev->flags))
2181 continue;
2182
2183 atomic_inc(&rdev->nr_pending);
2184 rcu_read_unlock();
2185 switch (r10_sync_page_io(rdev,
2186 r10_bio->devs[sl].addr +
2187 sect,
2188 s<<9, conf->tmppage,
2189 READ)) {
2190 case 0:
2191
2192 printk(KERN_NOTICE
2193 "md/raid10:%s: unable to read back "
2194 "corrected sectors"
2195 " (%d sectors at %llu on %s)\n",
2196 mdname(mddev), s,
2197 (unsigned long long)(
2198 sect + rdev->data_offset),
2199 bdevname(rdev->bdev, b));
2200 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2201 "drive\n",
2202 mdname(mddev),
2203 bdevname(rdev->bdev, b));
2204 break;
2205 case 1:
2206 printk(KERN_INFO
2207 "md/raid10:%s: read error corrected"
2208 " (%d sectors at %llu on %s)\n",
2209 mdname(mddev), s,
2210 (unsigned long long)(
2211 sect + rdev->data_offset),
2212 bdevname(rdev->bdev, b));
2213 atomic_add(s, &rdev->corrected_errors);
2214 }
2215
2216 rdev_dec_pending(rdev, mddev);
2217 rcu_read_lock();
2218 }
2219 rcu_read_unlock();
2220
2221 sectors -= s;
2222 sect += s;
2223 }
2224}
2225
2226static void bi_complete(struct bio *bio, int error)
2227{
2228 complete((struct completion *)bio->bi_private);
2229}
2230
2231static int submit_bio_wait(int rw, struct bio *bio)
2232{
2233 struct completion event;
2234 rw |= REQ_SYNC;
2235
2236 init_completion(&event);
2237 bio->bi_private = &event;
2238 bio->bi_end_io = bi_complete;
2239 submit_bio(rw, bio);
2240 wait_for_completion(&event);
2241
2242 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2243}
2244
2245static int narrow_write_error(struct r10bio *r10_bio, int i)
2246{
2247 struct bio *bio = r10_bio->master_bio;
2248 struct mddev *mddev = r10_bio->mddev;
2249 struct r10conf *conf = mddev->private;
2250 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
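        /*
         * A write to this device failed.  Rather than failing the whole
         * device, retry the write one bad-block-sized piece at a time and
         * record each piece that still fails as a bad block.  Returns 0 if
         * a bad block could not be recorded, in which case the caller must
         * fail the device.
         */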
2262 int block_sectors;
2263 sector_t sector;
2264 int sectors;
2265 int sect_to_write = r10_bio->sectors;
2266 int ok = 1;
2267
2268 if (rdev->badblocks.shift < 0)
2269 return 0;
2270
2271 block_sectors = 1 << rdev->badblocks.shift;
2272 sector = r10_bio->sector;
2273 sectors = ((r10_bio->sector + block_sectors)
2274 & ~(sector_t)(block_sectors - 1))
2275 - sector;
2276
2277 while (sect_to_write) {
2278 struct bio *wbio;
2279 if (sectors > sect_to_write)
2280 sectors = sect_to_write;
2281
2282 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2283 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2284 wbio->bi_sector = (r10_bio->devs[i].addr+
2285 rdev->data_offset+
2286 (sector - r10_bio->sector));
2287 wbio->bi_bdev = rdev->bdev;
2288 if (submit_bio_wait(WRITE, wbio) == 0)
2289
2290 ok = rdev_set_badblocks(rdev, sector,
2291 sectors, 0)
2292 && ok;
2293
2294 bio_put(wbio);
2295 sect_to_write -= sectors;
2296 sector += sectors;
2297 sectors = block_sectors;
2298 }
2299 return ok;
2300}
2301
2302static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2303{
2304 int slot = r10_bio->read_slot;
2305 struct bio *bio;
2306 struct r10conf *conf = mddev->private;
2307 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2308 char b[BDEVNAME_SIZE];
2309 unsigned long do_sync;
2310 int max_sectors;
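        /*
         * The read failed.  If the array is not read-only, freeze it and
         * try to repair the bad area with fix_read_error(), then redirect
         * the read (possibly in pieces) to another mirror chosen by
         * read_balance().  If no mirror can serve it, fail the request.
         */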
2320 bio = r10_bio->devs[slot].bio;
2321 bdevname(bio->bi_bdev, b);
2322 bio_put(bio);
2323 r10_bio->devs[slot].bio = NULL;
2324
2325 if (mddev->ro == 0) {
2326 freeze_array(conf);
2327 fix_read_error(conf, mddev, r10_bio);
2328 unfreeze_array(conf);
2329 } else
2330 r10_bio->devs[slot].bio = IO_BLOCKED;
2331
2332 rdev_dec_pending(rdev, mddev);
2333
2334read_more:
2335 rdev = read_balance(conf, r10_bio, &max_sectors);
2336 if (rdev == NULL) {
2337 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2338 " read error for block %llu\n",
2339 mdname(mddev), b,
2340 (unsigned long long)r10_bio->sector);
2341 raid_end_bio_io(r10_bio);
2342 return;
2343 }
2344
2345 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2346 slot = r10_bio->read_slot;
2347 printk_ratelimited(
2348 KERN_ERR
                "md/raid10:%s: %s: redirecting "
                "sector %llu to another mirror\n",
2351 mdname(mddev),
2352 bdevname(rdev->bdev, b),
2353 (unsigned long long)r10_bio->sector);
2354 bio = bio_clone_mddev(r10_bio->master_bio,
2355 GFP_NOIO, mddev);
2356 md_trim_bio(bio,
2357 r10_bio->sector - bio->bi_sector,
2358 max_sectors);
2359 r10_bio->devs[slot].bio = bio;
2360 r10_bio->devs[slot].rdev = rdev;
2361 bio->bi_sector = r10_bio->devs[slot].addr
2362 + rdev->data_offset;
2363 bio->bi_bdev = rdev->bdev;
2364 bio->bi_rw = READ | do_sync;
2365 bio->bi_private = r10_bio;
2366 bio->bi_end_io = raid10_end_read_request;
2367 if (max_sectors < r10_bio->sectors) {
2368
2369 struct bio *mbio = r10_bio->master_bio;
2370 int sectors_handled =
2371 r10_bio->sector + max_sectors
2372 - mbio->bi_sector;
2373 r10_bio->sectors = max_sectors;
2374 spin_lock_irq(&conf->device_lock);
2375 if (mbio->bi_phys_segments == 0)
2376 mbio->bi_phys_segments = 2;
2377 else
2378 mbio->bi_phys_segments++;
2379 spin_unlock_irq(&conf->device_lock);
2380 generic_make_request(bio);
2381
2382 r10_bio = mempool_alloc(conf->r10bio_pool,
2383 GFP_NOIO);
2384 r10_bio->master_bio = mbio;
2385 r10_bio->sectors = (mbio->bi_size >> 9)
2386 - sectors_handled;
2387 r10_bio->state = 0;
2388 set_bit(R10BIO_ReadError,
2389 &r10_bio->state);
2390 r10_bio->mddev = mddev;
2391 r10_bio->sector = mbio->bi_sector
2392 + sectors_handled;
2393
2394 goto read_more;
2395 } else
2396 generic_make_request(bio);
2397}
2398
2399static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2400{
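        /*
         * Some kind of write error was noticed while the array was active.
         * For sync/recovery writes, record or clear bad blocks per copy and
         * release the buffer.  For normal writes, handle IO_MADE_GOOD by
         * clearing the bad block, and failed writes via narrow_write_error()
         * or by failing the device, then complete the original request.
         */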
2407 int m;
2408 struct md_rdev *rdev;
2409
2410 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2411 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2412 for (m = 0; m < conf->copies; m++) {
2413 int dev = r10_bio->devs[m].devnum;
2414 rdev = conf->mirrors[dev].rdev;
2415 if (r10_bio->devs[m].bio == NULL)
2416 continue;
2417 if (test_bit(BIO_UPTODATE,
2418 &r10_bio->devs[m].bio->bi_flags)) {
2419 rdev_clear_badblocks(
2420 rdev,
2421 r10_bio->devs[m].addr,
2422 r10_bio->sectors);
2423 } else {
2424 if (!rdev_set_badblocks(
2425 rdev,
2426 r10_bio->devs[m].addr,
2427 r10_bio->sectors, 0))
2428 md_error(conf->mddev, rdev);
2429 }
2430 rdev = conf->mirrors[dev].replacement;
2431 if (r10_bio->devs[m].repl_bio == NULL)
2432 continue;
2433 if (test_bit(BIO_UPTODATE,
2434 &r10_bio->devs[m].repl_bio->bi_flags)) {
2435 rdev_clear_badblocks(
2436 rdev,
2437 r10_bio->devs[m].addr,
2438 r10_bio->sectors);
2439 } else {
2440 if (!rdev_set_badblocks(
2441 rdev,
2442 r10_bio->devs[m].addr,
2443 r10_bio->sectors, 0))
2444 md_error(conf->mddev, rdev);
2445 }
2446 }
2447 put_buf(r10_bio);
2448 } else {
2449 for (m = 0; m < conf->copies; m++) {
2450 int dev = r10_bio->devs[m].devnum;
2451 struct bio *bio = r10_bio->devs[m].bio;
2452 rdev = conf->mirrors[dev].rdev;
2453 if (bio == IO_MADE_GOOD) {
2454 rdev_clear_badblocks(
2455 rdev,
2456 r10_bio->devs[m].addr,
2457 r10_bio->sectors);
2458 rdev_dec_pending(rdev, conf->mddev);
2459 } else if (bio != NULL &&
2460 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2461 if (!narrow_write_error(r10_bio, m)) {
2462 md_error(conf->mddev, rdev);
2463 set_bit(R10BIO_Degraded,
2464 &r10_bio->state);
2465 }
2466 rdev_dec_pending(rdev, conf->mddev);
2467 }
2468 bio = r10_bio->devs[m].repl_bio;
2469 rdev = conf->mirrors[dev].replacement;
2470 if (rdev && bio == IO_MADE_GOOD) {
2471 rdev_clear_badblocks(
2472 rdev,
2473 r10_bio->devs[m].addr,
2474 r10_bio->sectors);
2475 rdev_dec_pending(rdev, conf->mddev);
2476 }
2477 }
2478 if (test_bit(R10BIO_WriteError,
2479 &r10_bio->state))
2480 close_write(r10_bio);
2481 raid_end_bio_io(r10_bio);
2482 }
2483}
2484
2485static void raid10d(struct mddev *mddev)
2486{
2487 struct r10bio *r10_bio;
2488 unsigned long flags;
2489 struct r10conf *conf = mddev->private;
2490 struct list_head *head = &conf->retry_list;
2491 struct blk_plug plug;
2492
2493 md_check_recovery(mddev);
2494
2495 blk_start_plug(&plug);
2496 for (;;) {
2497
2498 flush_pending_writes(conf);
2499
2500 spin_lock_irqsave(&conf->device_lock, flags);
2501 if (list_empty(head)) {
2502 spin_unlock_irqrestore(&conf->device_lock, flags);
2503 break;
2504 }
2505 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2506 list_del(head->prev);
2507 conf->nr_queued--;
2508 spin_unlock_irqrestore(&conf->device_lock, flags);
2509
2510 mddev = r10_bio->mddev;
2511 conf = mddev->private;
2512 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2513 test_bit(R10BIO_WriteError, &r10_bio->state))
2514 handle_write_completed(conf, r10_bio);
2515 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2516 sync_request_write(mddev, r10_bio);
2517 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2518 recovery_request_write(mddev, r10_bio);
2519 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2520 handle_read_error(mddev, r10_bio);
2521 else {
2522 /* just a partial read to be scheduled from a
2523 * separate context
2524 */
2525 int slot = r10_bio->read_slot;
2526 generic_make_request(r10_bio->devs[slot].bio);
2527 }
2528
2529 cond_resched();
2530 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2531 md_check_recovery(mddev);
2532 }
2533 blk_finish_plug(&plug);
2534}
2535
2536
2537static int init_resync(struct r10conf *conf)
2538{
2539 int buffs;
2540 int i;
2541
2542 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2543 BUG_ON(conf->r10buf_pool);
2544 conf->have_replacement = 0;
2545 for (i = 0; i < conf->raid_disks; i++)
2546 if (conf->mirrors[i].replacement)
2547 conf->have_replacement = 1;
2548 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2549 if (!conf->r10buf_pool)
2550 return -ENOMEM;
2551 conf->next_resync = 0;
2552 return 0;
2553}
2554
2555/*
2556 * perform a "sync" on one "block"
2557 *
2558 * We need to make sure that no normal I/O request - particularly write
2559 * requests - conflict with active sync requests.  This is achieved by
2560 * tracking pending requests and a 'barrier' concept that can be installed
2561 * to exclude normal IO requests.
2562 *
2563 * Resync and recovery are handled very differently; we differentiate by
2564 * looking at MD_RECOVERY_SYNC in mddev->recovery.
2565 *
2566 * For resync, we iterate over virtual addresses, read all copies
2567 * and update if there are differences.  If only one copy is live,
2568 * skip it.
2569 * For recovery, we iterate over physical addresses, read a good
2570 * value for each non-in_sync drive and over-write.
2571 *
2572 * So, for recovery we may have several outstanding complex requests for a
2573 * given address, one for each out-of-sync device.  We model this by
2574 * allocating one r10_bio structure per out-of-sync device.
2575 * As we set up these structures we collect all the bios together into a
2576 * list, which we then process collectively to add pages and then again
2577 * to pass to generic_make_request.
2578 *
2579 * The subordinate r10_bios are linked back to the first one through a
2580 * borrowed master_bio pointer, and each such link is counted in the
2581 * ->remaining of the r10_bio it points to.
2582 *
2583 */
2584
2585
2586
2587static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2588 int *skipped, int go_faster)
2589{
2590 struct r10conf *conf = mddev->private;
2591 struct r10bio *r10_bio;
2592 struct bio *biolist = NULL, *bio;
2593 sector_t max_sector, nr_sectors;
2594 int i;
2595 int max_sync;
2596 sector_t sync_blocks;
2597 sector_t sectors_skipped = 0;
2598 int chunks_skipped = 0;
2599
2600 if (!conf->r10buf_pool)
2601 if (init_resync(conf))
2602 return 0;
2603
2604 skipped:
2605 max_sector = mddev->dev_sectors;
2606 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2607 max_sector = mddev->resync_max_sectors;
2608 if (sector_nr >= max_sector) {
2609 /* If we aborted, we need to abort the
2610 * sync on the 'current' bitmap chunks (there can
2611 * be several when recovering multiple devices),
2612 * as we may have started syncing them but not finished.
2613 * We can find the current address in
2614 * mddev->curr_resync, but for recovery
2615 * we need to convert that to several
2616 * virtual addresses.
2617 */
2618 if (mddev->curr_resync < max_sector) {
2619 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2620 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2621 &sync_blocks, 1);
2622 else for (i=0; i<conf->raid_disks; i++) {
2623 sector_t sect =
2624 raid10_find_virt(conf, mddev->curr_resync, i);
2625 bitmap_end_sync(mddev->bitmap, sect,
2626 &sync_blocks, 1);
2627 }
2628 } else {
2629 /* completed sync */
2630 if ((!mddev->bitmap || conf->fullsync)
2631 && conf->have_replacement
2632 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2633 /* Completed a full sync, so the replacements
2634 * are now fully recovered.
2635 */
2636 for (i = 0; i < conf->raid_disks; i++)
2637 if (conf->mirrors[i].replacement)
2638 conf->mirrors[i].replacement
2639 ->recovery_offset
2640 = MaxSector;
2641 }
2642 conf->fullsync = 0;
2643 }
2644 bitmap_close_sync(mddev->bitmap);
2645 close_sync(conf);
2646 *skipped = 1;
2647 return sectors_skipped;
2648 }
2649 if (chunks_skipped >= conf->raid_disks) {
2650 /* if there has been nothing to do on any drive,
2651 * then there is nothing to do at all.
2652 */
2653 *skipped = 1;
2654 return (max_sector - sector_nr) + sectors_skipped;
2655 }
2656
2657 if (max_sector > mddev->resync_max)
2658 max_sector = mddev->resync_max;
2659
2660 /* make sure the whole request will fit in a chunk - if
2661 * chunks are meaningful
2662 */
2663 if (conf->near_copies < conf->raid_disks &&
2664 max_sector > (sector_nr | conf->chunk_mask))
2665 max_sector = (sector_nr | conf->chunk_mask) + 1;
2666
2667 /* If there is non-resync activity waiting for us then
2668 * put in a delay to throttle resync.
2669 */
2670 if (!go_faster && conf->nr_waiting)
2671 msleep_interruptible(1000);
2672
2673 /* Again, very different code for resync and recovery.
2674 * Both must result in an r10bio with a list of bios that
2675 * have bi_end_io, bi_sector, bi_bdev set,
2676 * and bi_private set to the r10bio.
2677 * For recovery, we may actually create several r10bios
2678 * with 2 bios in each, that correspond to the bios in the main one.
2679 * In this case, the subordinate r10bios link back through a
2680 * borrowed master_bio pointer, and the counter in the master
2681 * includes a ref from each subordinate.
2682 */
2683 /* First, we decide what to do and set ->bi_end_io
2684 * to end_sync_read if we want to read, and
2685 * end_sync_write if we will want to write.
2686 */
2687
2688 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2689 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2690 /* recovery... the complicated one */
2691 int j;
2692 r10_bio = NULL;
2693
2694 for (i=0 ; i<conf->raid_disks; i++) {
2695 int still_degraded;
2696 struct r10bio *rb2;
2697 sector_t sect;
2698 int must_sync;
2699 int any_working;
2700 struct mirror_info *mirror = &conf->mirrors[i];
2701
2702 if ((mirror->rdev == NULL ||
2703 test_bit(In_sync, &mirror->rdev->flags))
2704 &&
2705 (mirror->replacement == NULL ||
2706 test_bit(Faulty,
2707 &mirror->replacement->flags)))
2708 continue;
2709
2710 still_degraded = 0;
2711 /* want to reconstruct this device */
2712 rb2 = r10_bio;
2713 sect = raid10_find_virt(conf, sector_nr, i);
2714
2715 /* Unless we are doing a full sync, or a replacement,
2716 * we only need to recover the block if it is set in
2717 * the bitmap */
2718 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2719 &sync_blocks, 1);
2720 if (sync_blocks < max_sync)
2721 max_sync = sync_blocks;
2722 if (!must_sync &&
2723 mirror->replacement == NULL &&
2724 !conf->fullsync) {
2725 /* yep, skip the sync_blocks here, but don't assume
2726 * that there will never be anything to do here
2727 */
2728 chunks_skipped = -1;
2729 continue;
2730 }
2731
2732 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2733 raise_barrier(conf, rb2 != NULL);
2734 atomic_set(&r10_bio->remaining, 0);
2735
2736 r10_bio->master_bio = (struct bio*)rb2;
2737 if (rb2)
2738 atomic_inc(&rb2->remaining);
2739 r10_bio->mddev = mddev;
2740 set_bit(R10BIO_IsRecover, &r10_bio->state);
2741 r10_bio->sector = sect;
2742
2743 raid10_find_phys(conf, r10_bio);
2744
2745 /* Need to check if the array will still be
2746 * degraded
2747 */
2748 for (j=0; j<conf->raid_disks; j++)
2749 if (conf->mirrors[j].rdev == NULL ||
2750 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
2751 still_degraded = 1;
2752 break;
2753 }
2754
2755 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2756 &sync_blocks, still_degraded);
2757
2758 any_working = 0;
2759 for (j=0; j<conf->copies;j++) {
2760 int k;
2761 int d = r10_bio->devs[j].devnum;
2762 sector_t from_addr, to_addr;
2763 struct md_rdev *rdev;
2764 sector_t sector, first_bad;
2765 int bad_sectors;
2766 if (!conf->mirrors[d].rdev ||
2767 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
2768 continue;
2769
2770 any_working = 1;
2771 rdev = conf->mirrors[d].rdev;
2772 sector = r10_bio->devs[j].addr;
2773
2774 if (is_badblock(rdev, sector, max_sync,
2775 &first_bad, &bad_sectors)) {
2776 if (first_bad > sector)
2777 max_sync = first_bad - sector;
2778 else {
2779 bad_sectors -= (sector
2780 - first_bad);
2781 if (max_sync > bad_sectors)
2782 max_sync = bad_sectors;
2783 continue;
2784 }
2785 }
2786 bio = r10_bio->devs[0].bio;
2787 bio->bi_next = biolist;
2788 biolist = bio;
2789 bio->bi_private = r10_bio;
2790 bio->bi_end_io = end_sync_read;
2791 bio->bi_rw = READ;
2792 from_addr = r10_bio->devs[j].addr;
2793 bio->bi_sector = from_addr + rdev->data_offset;
2794 bio->bi_bdev = rdev->bdev;
2795 atomic_inc(&rdev->nr_pending);
2796 /* and we write to 'i' (if it is not in_sync) */
2797
2798 for (k=0; k<conf->copies; k++)
2799 if (r10_bio->devs[k].devnum == i)
2800 break;
2801 BUG_ON(k == conf->copies);
2802 to_addr = r10_bio->devs[k].addr;
2803 r10_bio->devs[0].devnum = d;
2804 r10_bio->devs[0].addr = from_addr;
2805 r10_bio->devs[1].devnum = i;
2806 r10_bio->devs[1].addr = to_addr;
2807
2808 rdev = mirror->rdev;
2809 if (!test_bit(In_sync, &rdev->flags)) {
2810 bio = r10_bio->devs[1].bio;
2811 bio->bi_next = biolist;
2812 biolist = bio;
2813 bio->bi_private = r10_bio;
2814 bio->bi_end_io = end_sync_write;
2815 bio->bi_rw = WRITE;
2816 bio->bi_sector = to_addr
2817 + rdev->data_offset;
2818 bio->bi_bdev = rdev->bdev;
2819 atomic_inc(&r10_bio->remaining);
2820 } else
2821 r10_bio->devs[1].bio->bi_end_io = NULL;
2822
2823 /* and maybe write to the replacement */
2824 bio = r10_bio->devs[1].repl_bio;
2825 if (bio)
2826 bio->bi_end_io = NULL;
2827 rdev = mirror->replacement;
2828 /* Note: if rdev (the replacement) is not NULL here,
2829 * then r10buf_pool_alloc() should also have allocated
2830 * repl_bio, so the bio == NULL test below is only a
2831 * defensive check.
2832 */
2833
2834
2835
2836 if (rdev == NULL || bio == NULL ||
2837 test_bit(Faulty, &rdev->flags))
2838 break;
2839 bio->bi_next = biolist;
2840 biolist = bio;
2841 bio->bi_private = r10_bio;
2842 bio->bi_end_io = end_sync_write;
2843 bio->bi_rw = WRITE;
2844 bio->bi_sector = to_addr + rdev->data_offset;
2845 bio->bi_bdev = rdev->bdev;
2846 atomic_inc(&r10_bio->remaining);
2847 break;
2848 }
2849 if (j == conf->copies) {
2850 /* Cannot recover, so abort the recovery or
2851 * record a bad block */
2852 put_buf(r10_bio);
2853 if (rb2)
2854 atomic_dec(&rb2->remaining);
2855 r10_bio = rb2;
2856 if (any_working) {
2857 /* problem is that there are bad blocks
2858 * on other device(s)
2859 */
2860 int k;
2861 for (k = 0; k < conf->copies; k++)
2862 if (r10_bio->devs[k].devnum == i)
2863 break;
2864 if (!test_bit(In_sync,
2865 &mirror->rdev->flags)
2866 && !rdev_set_badblocks(
2867 mirror->rdev,
2868 r10_bio->devs[k].addr,
2869 max_sync, 0))
2870 any_working = 0;
2871 if (mirror->replacement &&
2872 !rdev_set_badblocks(
2873 mirror->replacement,
2874 r10_bio->devs[k].addr,
2875 max_sync, 0))
2876 any_working = 0;
2877 }
2878 if (!any_working) {
2879 if (!test_and_set_bit(MD_RECOVERY_INTR,
2880 &mddev->recovery))
2881 printk(KERN_INFO "md/raid10:%s: insufficient "
2882 "working devices for recovery.\n",
2883 mdname(mddev));
2884 mirror->recovery_disabled
2885 = mddev->recovery_disabled;
2886 }
2887 break;
2888 }
2889 }
2890 if (biolist == NULL) {
2891 while (r10_bio) {
2892 struct r10bio *rb2 = r10_bio;
2893 r10_bio = (struct r10bio*) rb2->master_bio;
2894 rb2->master_bio = NULL;
2895 put_buf(rb2);
2896 }
2897 goto giveup;
2898 }
2899 } else {
2900 /* resync: schedule a read from every in-sync copy of this virtual address */
2901 int count = 0;
2902
2903 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
2904
2905 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
2906 &sync_blocks, mddev->degraded) &&
2907 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
2908 &mddev->recovery)) {
2909 /* We can skip this block */
2910 *skipped = 1;
2911 return sync_blocks + sectors_skipped;
2912 }
2913 if (sync_blocks < max_sync)
2914 max_sync = sync_blocks;
2915 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2916
2917 r10_bio->mddev = mddev;
2918 atomic_set(&r10_bio->remaining, 0);
2919 raise_barrier(conf, 0);
2920 conf->next_resync = sector_nr;
2921
2922 r10_bio->master_bio = NULL;
2923 r10_bio->sector = sector_nr;
2924 set_bit(R10BIO_IsSync, &r10_bio->state);
2925 raid10_find_phys(conf, r10_bio);
2926 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
2927
2928 for (i=0; i<conf->copies; i++) {
2929 int d = r10_bio->devs[i].devnum;
2930 sector_t first_bad, sector;
2931 int bad_sectors;
2932
2933 if (r10_bio->devs[i].repl_bio)
2934 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
2935
2936 bio = r10_bio->devs[i].bio;
2937 bio->bi_end_io = NULL;
2938 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2939 if (conf->mirrors[d].rdev == NULL ||
2940 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
2941 continue;
2942 sector = r10_bio->devs[i].addr;
2943 if (is_badblock(conf->mirrors[d].rdev,
2944 sector, max_sync,
2945 &first_bad, &bad_sectors)) {
2946 if (first_bad > sector)
2947 max_sync = first_bad - sector;
2948 else {
2949 bad_sectors -= (sector - first_bad);
2950 if (max_sync > bad_sectors)
2951 max_sync = bad_sectors;
2952 continue;
2953 }
2954 }
2955 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2956 atomic_inc(&r10_bio->remaining);
2957 bio->bi_next = biolist;
2958 biolist = bio;
2959 bio->bi_private = r10_bio;
2960 bio->bi_end_io = end_sync_read;
2961 bio->bi_rw = READ;
2962 bio->bi_sector = sector +
2963 conf->mirrors[d].rdev->data_offset;
2964 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
2965 count++;
2966
2967 if (conf->mirrors[d].replacement == NULL ||
2968 test_bit(Faulty,
2969 &conf->mirrors[d].replacement->flags))
2970 continue;
2971
2972 /* Need to set up for writing to the replacement */
2973 bio = r10_bio->devs[i].repl_bio;
2974 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2975
2976 sector = r10_bio->devs[i].addr;
2977 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2978 bio->bi_next = biolist;
2979 biolist = bio;
2980 bio->bi_private = r10_bio;
2981 bio->bi_end_io = end_sync_write;
2982 bio->bi_rw = WRITE;
2983 bio->bi_sector = sector +
2984 conf->mirrors[d].replacement->data_offset;
2985 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
2986 count++;
2987 }
2988
2989 if (count < 2) {
2990 for (i=0; i<conf->copies; i++) {
2991 int d = r10_bio->devs[i].devnum;
2992 if (r10_bio->devs[i].bio->bi_end_io)
2993 rdev_dec_pending(conf->mirrors[d].rdev,
2994 mddev);
2995 if (r10_bio->devs[i].repl_bio &&
2996 r10_bio->devs[i].repl_bio->bi_end_io)
2997 rdev_dec_pending(
2998 conf->mirrors[d].replacement,
2999 mddev);
3000 }
3001 put_buf(r10_bio);
3002 biolist = NULL;
3003 goto giveup;
3004 }
3005 }
3006
3007 for (bio = biolist; bio ; bio=bio->bi_next) {
3008
3009 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
3010 if (bio->bi_end_io)
3011 bio->bi_flags |= 1 << BIO_UPTODATE;
3012 bio->bi_vcnt = 0;
3013 bio->bi_idx = 0;
3014 bio->bi_phys_segments = 0;
3015 bio->bi_size = 0;
3016 }
3017
3018 nr_sectors = 0;
3019 if (sector_nr + max_sync < max_sector)
3020 max_sector = sector_nr + max_sync;
3021 do {
3022 struct page *page;
3023 int len = PAGE_SIZE;
3024 if (sector_nr + (len>>9) > max_sector)
3025 len = (max_sector - sector_nr) << 9;
3026 if (len == 0)
3027 break;
3028 for (bio= biolist ; bio ; bio=bio->bi_next) {
3029 struct bio *bio2;
3030 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3031 if (bio_add_page(bio, page, len, 0))
3032 continue;
3033
3034 /* could not fit this page - stop here */
3035 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3036 for (bio2 = biolist;
3037 bio2 && bio2 != bio;
3038 bio2 = bio2->bi_next) {
3039 /* remove the last page from this bio */
3040 bio2->bi_vcnt--;
3041 bio2->bi_size -= len;
3042 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3043 }
3044 goto bio_full;
3045 }
3046 nr_sectors += len>>9;
3047 sector_nr += len>>9;
3048 } while (biolist->bi_vcnt < RESYNC_PAGES);
3049 bio_full:
3050 r10_bio->sectors = nr_sectors;
3051
3052 while (biolist) {
3053 bio = biolist;
3054 biolist = biolist->bi_next;
3055
3056 bio->bi_next = NULL;
3057 r10_bio = bio->bi_private;
3058 r10_bio->sectors = nr_sectors;
3059
3060 if (bio->bi_end_io == end_sync_read) {
3061 md_sync_acct(bio->bi_bdev, nr_sectors);
3062 generic_make_request(bio);
3063 }
3064 }
3065
3066 if (sectors_skipped)
3067 /* Report the sectors we skipped as completed so the
3068 * resync accounting still adds up.
3069 */
3070 md_done_sync(mddev, sectors_skipped, 1);
3071
3072 return sectors_skipped + nr_sectors;
3073 giveup:
3074 /* There is nowhere to write, so all non-sync
3075 * drives must be failed or in resync, or all drives
3076 * have a bad block, so try the next chunk...
3077 */
3078 if (sector_nr + max_sync < max_sector)
3079 max_sector = sector_nr + max_sync;
3080
3081 sectors_skipped += (max_sector - sector_nr);
3082 chunks_skipped ++;
3083 sector_nr = max_sector;
3084 goto skipped;
3085}
3086
3087static sector_t
3088raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3089{
3090 sector_t size;
3091 struct r10conf *conf = mddev->private;
3092
3093 if (!raid_disks)
3094 raid_disks = conf->raid_disks;
3095 if (!sectors)
3096 sectors = conf->dev_sectors;
3097
3098 size = sectors >> conf->chunk_shift;
3099 sector_div(size, conf->far_copies);
3100 size = size * raid_disks;
3101 sector_div(size, conf->near_copies);
3102
3103 return size << conf->chunk_shift;
3104}
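/*
 * Worked example for raid10_size() above (illustrative only, not part of
 * the original source): with raid_disks = 4, near_copies = 2, far_copies = 1
 * and a 64KiB chunk (chunk_shift = 7, i.e. 128 sectors), dev_sectors of
 * 131072 sectors (64MiB) per device gives:
 *
 *   131072 >> 7            = 1024 chunks per device
 *   1024 / far_copies (1)  = 1024
 *   1024 * raid_disks (4)  = 4096
 *   4096 / near_copies (2) = 2048 chunks of array capacity
 *   2048 << 7              = 262144 sectors (128MiB)
 *
 * which matches the expectation that 4 x 64MiB devices holding 2 copies of
 * every block export 128MiB of usable space.
 */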
3105
3106
3107static struct r10conf *setup_conf(struct mddev *mddev)
3108{
3109 struct r10conf *conf = NULL;
3110 int nc, fc, fo;
3111 sector_t stride, size;
3112 int err = -EINVAL;
3113
3114 if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
3115 !is_power_of_2(mddev->new_chunk_sectors)) {
3116 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3117 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3118 mdname(mddev), PAGE_SIZE);
3119 goto out;
3120 }
3121
3122 nc = mddev->new_layout & 255;
3123 fc = (mddev->new_layout >> 8) & 255;
3124 fo = mddev->new_layout & (1<<16);
3125
3126 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
3127 (mddev->new_layout >> 17)) {
3128 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3129 mdname(mddev), mddev->new_layout);
3130 goto out;
3131 }
3132
3133 err = -ENOMEM;
3134 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3135 if (!conf)
3136 goto out;
3137
3138 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
3139 GFP_KERNEL);
3140 if (!conf->mirrors)
3141 goto out;
3142
3143 conf->tmppage = alloc_page(GFP_KERNEL);
3144 if (!conf->tmppage)
3145 goto out;
3146
3147
3148 conf->raid_disks = mddev->raid_disks;
3149 conf->near_copies = nc;
3150 conf->far_copies = fc;
3151 conf->copies = nc*fc;
3152 conf->far_offset = fo;
3153 conf->chunk_mask = mddev->new_chunk_sectors - 1;
3154 conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
3155
3156 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3157 r10bio_pool_free, conf);
3158 if (!conf->r10bio_pool)
3159 goto out;
3160
3161 size = mddev->dev_sectors >> conf->chunk_shift;
3162 sector_div(size, fc);
3163 size = size * conf->raid_disks;
3164 sector_div(size, nc);
3165 /* 'size' is now the number of chunks in the array */
3166 /* calculate "used chunks per device" in 'stride' */
3167 stride = size * conf->copies;
3168
3169 /* We need to round up when dividing by raid_disks
3170 * to get the stride size.
3171 */
3172 stride += conf->raid_disks - 1;
3173 sector_div(stride, conf->raid_disks);
3174
3175 conf->dev_sectors = stride << conf->chunk_shift;
3176
3177 if (fo)
3178 stride = 1;
3179 else
3180 sector_div(stride, fc);
3181 conf->stride = stride << conf->chunk_shift;
3182
3183
3184 spin_lock_init(&conf->device_lock);
3185 INIT_LIST_HEAD(&conf->retry_list);
3186
3187 spin_lock_init(&conf->resync_lock);
3188 init_waitqueue_head(&conf->wait_barrier);
3189
3190 conf->thread = md_register_thread(raid10d, mddev, NULL);
3191 if (!conf->thread)
3192 goto out;
3193
3194 conf->mddev = mddev;
3195 return conf;
3196
3197 out:
3198 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3199 mdname(mddev));
3200 if (conf) {
3201 if (conf->r10bio_pool)
3202 mempool_destroy(conf->r10bio_pool);
3203 kfree(conf->mirrors);
3204 safe_put_page(conf->tmppage);
3205 kfree(conf);
3206 }
3207 return ERR_PTR(err);
3208}
3209
3210static int run(struct mddev *mddev)
3211{
3212 struct r10conf *conf;
3213 int i, disk_idx, chunk_size;
3214 struct mirror_info *disk;
3215 struct md_rdev *rdev;
3216 sector_t size;
3217
3218 /*
3219 * copy the already verified devices into our private RAID10
3220 * bookkeeping area. [whatever we allocate in run(),
3221 * should be freed in stop()]
3222 */
3223
3224 if (mddev->private == NULL) {
3225 conf = setup_conf(mddev);
3226 if (IS_ERR(conf))
3227 return PTR_ERR(conf);
3228 mddev->private = conf;
3229 }
3230 conf = mddev->private;
3231 if (!conf)
3232 goto out;
3233
3234 mddev->thread = conf->thread;
3235 conf->thread = NULL;
3236
3237 chunk_size = mddev->chunk_sectors << 9;
3238 blk_queue_io_min(mddev->queue, chunk_size);
3239 if (conf->raid_disks % conf->near_copies)
3240 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
3241 else
3242 blk_queue_io_opt(mddev->queue, chunk_size *
3243 (conf->raid_disks / conf->near_copies));
3244
3245 list_for_each_entry(rdev, &mddev->disks, same_set) {
3246
3247 disk_idx = rdev->raid_disk;
3248 if (disk_idx >= conf->raid_disks
3249 || disk_idx < 0)
3250 continue;
3251 disk = conf->mirrors + disk_idx;
3252
3253 if (test_bit(Replacement, &rdev->flags)) {
3254 if (disk->replacement)
3255 goto out_free_conf;
3256 disk->replacement = rdev;
3257 } else {
3258 if (disk->rdev)
3259 goto out_free_conf;
3260 disk->rdev = rdev;
3261 }
3262
3263 disk_stack_limits(mddev->gendisk, rdev->bdev,
3264 rdev->data_offset << 9);
3265 /* as we don't honour merge_bvec_fn, we must never risk
3266 * violating it, so limit max_segments to 1, lying
3267 * within a single page.
3268 */
3269 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
3270 blk_queue_max_segments(mddev->queue, 1);
3271 blk_queue_segment_boundary(mddev->queue,
3272 PAGE_CACHE_SIZE - 1);
3273 }
3274
3275 disk->head_position = 0;
3276 }
3277
3278 if (!enough(conf, -1)) {
3279 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3280 mdname(mddev));
3281 goto out_free_conf;
3282 }
3283
3284 mddev->degraded = 0;
3285 for (i = 0; i < conf->raid_disks; i++) {
3286
3287 disk = conf->mirrors + i;
3288
3289 if (!disk->rdev && disk->replacement) {
3290 /* The replacement is all we have - use it */
3291 disk->rdev = disk->replacement;
3292 disk->replacement = NULL;
3293 clear_bit(Replacement, &disk->rdev->flags);
3294 }
3295
3296 if (!disk->rdev ||
3297 !test_bit(In_sync, &disk->rdev->flags)) {
3298 disk->head_position = 0;
3299 mddev->degraded++;
3300 if (disk->rdev)
3301 conf->fullsync = 1;
3302 }
3303 disk->recovery_disabled = mddev->recovery_disabled - 1;
3304 }
3305
3306 if (mddev->recovery_cp != MaxSector)
3307 printk(KERN_NOTICE "md/raid10:%s: not clean"
3308 " -- starting background reconstruction\n",
3309 mdname(mddev));
3310 printk(KERN_INFO
3311 "md/raid10:%s: active with %d out of %d devices\n",
3312 mdname(mddev), conf->raid_disks - mddev->degraded,
3313 conf->raid_disks);
3314
3315
3316
3317 mddev->dev_sectors = conf->dev_sectors;
3318 size = raid10_size(mddev, 0, 0);
3319 md_set_array_sectors(mddev, size);
3320 mddev->resync_max_sectors = size;
3321
3322 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3323 mddev->queue->backing_dev_info.congested_data = mddev;
3324
3325 /* Calculate the max read-ahead size.
3326 * We want read-ahead to cover at least
3327 * two whole stripes.
3328 */
3329 {
3330 int stripe = conf->raid_disks *
3331 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3332 stripe /= conf->near_copies;
3333 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
3334 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
3335 }
3336
3337 if (conf->near_copies < conf->raid_disks)
3338 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3339
3340 if (md_integrity_register(mddev))
3341 goto out_free_conf;
3342
3343 return 0;
3344
3345out_free_conf:
3346 md_unregister_thread(&mddev->thread);
3347 if (conf->r10bio_pool)
3348 mempool_destroy(conf->r10bio_pool);
3349 safe_put_page(conf->tmppage);
3350 kfree(conf->mirrors);
3351 kfree(conf);
3352 mddev->private = NULL;
3353out:
3354 return -EIO;
3355}
3356
3357static int stop(struct mddev *mddev)
3358{
3359 struct r10conf *conf = mddev->private;
3360
3361 raise_barrier(conf, 0);
3362 lower_barrier(conf);
3363
3364 md_unregister_thread(&mddev->thread);
3365 blk_sync_queue(mddev->queue);
3366 if (conf->r10bio_pool)
3367 mempool_destroy(conf->r10bio_pool);
3368 kfree(conf->mirrors);
3369 kfree(conf);
3370 mddev->private = NULL;
3371 return 0;
3372}
3373
3374static void raid10_quiesce(struct mddev *mddev, int state)
3375{
3376 struct r10conf *conf = mddev->private;
3377
3378 switch(state) {
3379 case 1:
3380 raise_barrier(conf, 0);
3381 break;
3382 case 0:
3383 lower_barrier(conf);
3384 break;
3385 }
3386}
3387
3388static void *raid10_takeover_raid0(struct mddev *mddev)
3389{
3390 struct md_rdev *rdev;
3391 struct r10conf *conf;
3392
3393 if (mddev->degraded > 0) {
3394 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3395 mdname(mddev));
3396 return ERR_PTR(-EINVAL);
3397 }
3398
3399 /* Set new parameters */
3400 mddev->new_level = 10;
3401 /* new layout: far_copies = 1, near_copies = 2 */
3402 mddev->new_layout = (1<<8) + 2;
3403 mddev->new_chunk_sectors = mddev->chunk_sectors;
3404 mddev->delta_disks = mddev->raid_disks;
3405 mddev->raid_disks *= 2;
3406 /* make sure the array will not be marked as dirty */
3407 mddev->recovery_cp = MaxSector;
3408
3409 conf = setup_conf(mddev);
3410 if (!IS_ERR(conf)) {
3411 list_for_each_entry(rdev, &mddev->disks, same_set)
3412 if (rdev->raid_disk >= 0)
3413 rdev->new_raid_disk = rdev->raid_disk * 2;
3414 conf->barrier = 1;
3415 }
3416
3417 return conf;
3418}
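/*
 * Illustrative note (not part of the original source): the layout word set
 * in raid10_takeover_raid0() above decodes in setup_conf() as
 *
 *   near_copies = layout & 255
 *   far_copies  = (layout >> 8) & 255
 *   far_offset  = layout & (1 << 16)
 *
 * so the value (1 << 8) + 2 == 0x102 used for a raid0 takeover means
 * near_copies = 2, far_copies = 1 and far_offset disabled - a plain
 * two-way mirrored stripe over the doubled set of devices.
 */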
3419
3420static void *raid10_takeover(struct mddev *mddev)
3421{
3422 struct r0conf *raid0_conf;
3423
3424 /* raid10 can take over:
3425 * raid0 - provided it has only a single zone
3426 */
3427 if (mddev->level == 0) {
3428 /* for raid0 takeover only one zone is supported */
3429 raid0_conf = mddev->private;
3430 if (raid0_conf->nr_strip_zones > 1) {
3431 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3432 " with more than one zone.\n",
3433 mdname(mddev));
3434 return ERR_PTR(-EINVAL);
3435 }
3436 return raid10_takeover_raid0(mddev);
3437 }
3438 return ERR_PTR(-EINVAL);
3439}
3440
3441static struct md_personality raid10_personality =
3442{
3443 .name = "raid10",
3444 .level = 10,
3445 .owner = THIS_MODULE,
3446 .make_request = make_request,
3447 .run = run,
3448 .stop = stop,
3449 .status = status,
3450 .error_handler = error,
3451 .hot_add_disk = raid10_add_disk,
3452 .hot_remove_disk= raid10_remove_disk,
3453 .spare_active = raid10_spare_active,
3454 .sync_request = sync_request,
3455 .quiesce = raid10_quiesce,
3456 .size = raid10_size,
3457 .takeover = raid10_takeover,
3458};
3459
3460static int __init raid_init(void)
3461{
3462 return register_md_personality(&raid10_personality);
3463}
3464
3465static void raid_exit(void)
3466{
3467 unregister_md_personality(&raid10_personality);
3468}
3469
3470module_init(raid_init);
3471module_exit(raid_exit);
3472MODULE_LICENSE("GPL");
3473MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
3474MODULE_ALIAS("md-personality-9");
3475MODULE_ALIAS("md-raid10");
3476MODULE_ALIAS("md-level-10");
3477
3478module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
3479