/*
 * md.c : Multiple Devices driver for Linux
 */
35#include <linux/module.h>
36#include <linux/kernel.h>
37#include <linux/kthread.h>
38#include <linux/linkage.h>
39#include <linux/raid/md.h>
40#include <linux/raid/bitmap.h>
41#include <linux/sysctl.h>
42#include <linux/buffer_head.h>
43#include <linux/poll.h>
44#include <linux/mutex.h>
45#include <linux/ctype.h>
46#include <linux/freezer.h>
47
48#include <linux/init.h>
49
50#include <linux/file.h>
51
52#ifdef CONFIG_KMOD
53#include <linux/kmod.h>
54#endif
55
56#include <asm/unaligned.h>
57
58#define MAJOR_NR MD_MAJOR
59#define MD_DRIVER
60
61
62#define MdpMinorShift 6
63
64#define DEBUG 0
65#define dprintk(x...) ((void)(DEBUG && printk(x)))
66
67
68#ifndef MODULE
69static void autostart_arrays (int part);
70#endif
71
72static LIST_HEAD(pers_list);
73static DEFINE_SPINLOCK(pers_lock);
74
75static void md_print_devices(void);
76
77#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
78
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or via /sys/block/mdX/md/sync_speed_{min,max}
 */
92static int sysctl_speed_limit_min = 1000;
93static int sysctl_speed_limit_max = 200000;
94static inline int speed_min(mddev_t *mddev)
95{
96 return mddev->sync_speed_min ?
97 mddev->sync_speed_min : sysctl_speed_limit_min;
98}
99
100static inline int speed_max(mddev_t *mddev)
101{
102 return mddev->sync_speed_max ?
103 mddev->sync_speed_max : sysctl_speed_limit_max;
104}
105
106static struct ctl_table_header *raid_table_header;
107
108static ctl_table raid_table[] = {
109 {
110 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
111 .procname = "speed_limit_min",
112 .data = &sysctl_speed_limit_min,
113 .maxlen = sizeof(int),
114 .mode = S_IRUGO|S_IWUSR,
115 .proc_handler = &proc_dointvec,
116 },
117 {
118 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
119 .procname = "speed_limit_max",
120 .data = &sysctl_speed_limit_max,
121 .maxlen = sizeof(int),
122 .mode = S_IRUGO|S_IWUSR,
123 .proc_handler = &proc_dointvec,
124 },
125 { .ctl_name = 0 }
126};
127
128static ctl_table raid_dir_table[] = {
129 {
130 .ctl_name = DEV_RAID,
131 .procname = "raid",
132 .maxlen = 0,
133 .mode = S_IRUGO|S_IXUGO,
134 .child = raid_table,
135 },
136 { .ctl_name = 0 }
137};
138
139static ctl_table raid_root_table[] = {
140 {
141 .ctl_name = CTL_DEV,
142 .procname = "dev",
143 .maxlen = 0,
144 .mode = 0555,
145 .child = raid_dir_table,
146 },
147 { .ctl_name = 0 }
148};
149
150static struct block_device_operations md_fops;
151
152static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on all interesting events, and from which you can tell if
 * anything interesting has happened.
 * The events are:
 *   start array, stop array, error, add device, remove device,
 *   start build, activate spare
 */
164static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
165static atomic_t md_event_count;
166void md_new_event(mddev_t *mddev)
167{
168 atomic_inc(&md_event_count);
169 wake_up(&md_event_waiters);
170 sysfs_notify(&mddev->kobj, NULL, "sync_action");
171}
172EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
177static void md_new_event_inintr(mddev_t *mddev)
178{
179 atomic_inc(&md_event_count);
180 wake_up(&md_event_waiters);
181}
182
/*
 * Enables to iterate over all existing md arrays
 * all_mddevs_lock protects this list.
 */
187static LIST_HEAD(all_mddevs);
188static DEFINE_SPINLOCK(all_mddevs_lock);
189

/*
 * iterates through all used mddevs in the system.
 * We take care to hold the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while still holding
 * a reference to the current mddev must call mddev_put().
 */
198#define ITERATE_MDDEV(mddev,tmp) \
199 \
200 for (({ spin_lock(&all_mddevs_lock); \
201 tmp = all_mddevs.next; \
202 mddev = NULL;}); \
203 ({ if (tmp != &all_mddevs) \
204 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
205 spin_unlock(&all_mddevs_lock); \
206 if (mddev) mddev_put(mddev); \
207 mddev = list_entry(tmp, mddev_t, all_mddevs); \
208 tmp != &all_mddevs;}); \
209 ({ spin_lock(&all_mddevs_lock); \
210 tmp = tmp->next;}) \
211 )
212
213
214static int md_fail_request (struct request_queue *q, struct bio *bio)
215{
216 bio_io_error(bio);
217 return 0;
218}
219
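/* Reference counting for mddev structures: mddev_get() takes a reference,
 * mddev_put() drops it and, once the array has no raid_disks and no member
 * devices left, tears down the request queue and the kobject.
 */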
220static inline mddev_t *mddev_get(mddev_t *mddev)
221{
222 atomic_inc(&mddev->active);
223 return mddev;
224}
225
226static void mddev_put(mddev_t *mddev)
227{
228 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
229 return;
230 if (!mddev->raid_disks && list_empty(&mddev->disks)) {
231 list_del(&mddev->all_mddevs);
232 spin_unlock(&all_mddevs_lock);
233 blk_cleanup_queue(mddev->queue);
234 kobject_unregister(&mddev->kobj);
235 } else
236 spin_unlock(&all_mddevs_lock);
237}
238
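/* Find the mddev for a given device number, allocating and registering a
 * new one if none exists yet.  The allocation happens outside
 * all_mddevs_lock, so on a race the lookup is simply retried.
 */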
239static mddev_t * mddev_find(dev_t unit)
240{
241 mddev_t *mddev, *new = NULL;
242
243 retry:
244 spin_lock(&all_mddevs_lock);
245 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
246 if (mddev->unit == unit) {
247 mddev_get(mddev);
248 spin_unlock(&all_mddevs_lock);
249 kfree(new);
250 return mddev;
251 }
252
253 if (new) {
254 list_add(&new->all_mddevs, &all_mddevs);
255 spin_unlock(&all_mddevs_lock);
256 return new;
257 }
258 spin_unlock(&all_mddevs_lock);
259
260 new = kzalloc(sizeof(*new), GFP_KERNEL);
261 if (!new)
262 return NULL;
263
264 new->unit = unit;
265 if (MAJOR(unit) == MD_MAJOR)
266 new->md_minor = MINOR(unit);
267 else
268 new->md_minor = MINOR(unit) >> MdpMinorShift;
269
270 mutex_init(&new->reconfig_mutex);
271 INIT_LIST_HEAD(&new->disks);
272 INIT_LIST_HEAD(&new->all_mddevs);
273 init_timer(&new->safemode_timer);
274 atomic_set(&new->active, 1);
275 spin_lock_init(&new->write_lock);
276 init_waitqueue_head(&new->sb_wait);
277 new->reshape_position = MaxSector;
278
279 new->queue = blk_alloc_queue(GFP_KERNEL);
280 if (!new->queue) {
281 kfree(new);
282 return NULL;
283 }
284 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
285
286 blk_queue_make_request(new->queue, md_fail_request);
287
288 goto retry;
289}
290
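/* All reconfiguration of an array is serialised by mddev->reconfig_mutex.
 * mddev_unlock() also wakes the array's management thread so that it can
 * act on whatever changed while the lock was held.
 */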
291static inline int mddev_lock(mddev_t * mddev)
292{
293 return mutex_lock_interruptible(&mddev->reconfig_mutex);
294}
295
296static inline int mddev_trylock(mddev_t * mddev)
297{
298 return mutex_trylock(&mddev->reconfig_mutex);
299}
300
301static inline void mddev_unlock(mddev_t * mddev)
302{
303 mutex_unlock(&mddev->reconfig_mutex);
304
305 md_wakeup_thread(mddev->thread);
306}
307
308static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
309{
310 mdk_rdev_t * rdev;
311 struct list_head *tmp;
312
313 ITERATE_RDEV(mddev,rdev,tmp) {
314 if (rdev->desc_nr == nr)
315 return rdev;
316 }
317 return NULL;
318}
319
320static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
321{
322 struct list_head *tmp;
323 mdk_rdev_t *rdev;
324
325 ITERATE_RDEV(mddev,rdev,tmp) {
326 if (rdev->bdev->bd_dev == dev)
327 return rdev;
328 }
329 return NULL;
330}
331
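/* Look up a registered personality (RAID level driver) in pers_list,
 * either by numeric level (when level is not LEVEL_NONE) or by name.
 */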
332static struct mdk_personality *find_pers(int level, char *clevel)
333{
334 struct mdk_personality *pers;
335 list_for_each_entry(pers, &pers_list, list) {
336 if (level != LEVEL_NONE && pers->level == level)
337 return pers;
338 if (strcmp(pers->name, clevel)==0)
339 return pers;
340 }
341 return NULL;
342}
343
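/* For 0.90 metadata the superblock is stored near the end of the device;
 * calc_dev_sboffset() returns that location in 1K blocks, as computed by
 * MD_NEW_SIZE_BLOCKS() from the device size.
 */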
344static inline sector_t calc_dev_sboffset(struct block_device *bdev)
345{
346 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
347 return MD_NEW_SIZE_BLOCKS(size);
348}
349
350static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
351{
352 sector_t size;
353
354 size = rdev->sb_offset;
355
356 if (chunk_size)
357 size &= ~((sector_t)chunk_size/1024 - 1);
358 return size;
359}
360
361static int alloc_disk_sb(mdk_rdev_t * rdev)
362{
363 if (rdev->sb_page)
364 MD_BUG();
365
366 rdev->sb_page = alloc_page(GFP_KERNEL);
367 if (!rdev->sb_page) {
368 printk(KERN_ALERT "md: out of memory.\n");
369 return -EINVAL;
370 }
371
372 return 0;
373}
374
375static void free_disk_sb(mdk_rdev_t * rdev)
376{
377 if (rdev->sb_page) {
378 put_page(rdev->sb_page);
379 rdev->sb_loaded = 0;
380 rdev->sb_page = NULL;
381 rdev->sb_offset = 0;
382 rdev->size = 0;
383 }
384}
385
386
387static void super_written(struct bio *bio, int error)
388{
389 mdk_rdev_t *rdev = bio->bi_private;
390 mddev_t *mddev = rdev->mddev;
391
392 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
393 printk("md: super_written gets error=%d, uptodate=%d\n",
394 error, test_bit(BIO_UPTODATE, &bio->bi_flags));
395 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
396 md_error(mddev, rdev);
397 }
398
399 if (atomic_dec_and_test(&mddev->pending_writes))
400 wake_up(&mddev->sb_wait);
401 bio_put(bio);
402}
403
404static void super_written_barrier(struct bio *bio, int error)
405{
406 struct bio *bio2 = bio->bi_private;
407 mdk_rdev_t *rdev = bio2->bi_private;
408 mddev_t *mddev = rdev->mddev;
409
410 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
411 error == -EOPNOTSUPP) {
412 unsigned long flags;
413
414 set_bit(BarriersNotsupp, &rdev->flags);
415 mddev->barriers_work = 0;
416 spin_lock_irqsave(&mddev->write_lock, flags);
417 bio2->bi_next = mddev->biolist;
418 mddev->biolist = bio2;
419 spin_unlock_irqrestore(&mddev->write_lock, flags);
420 wake_up(&mddev->sb_wait);
421 bio_put(bio);
422 } else {
423 bio_put(bio2);
424 bio->bi_private = rdev;
425 super_written(bio, error);
426 }
427}
428
429void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
430 sector_t sector, int size, struct page *page)
431{
 /* write first size bytes of page to sector of rdev
  * Increment mddev->pending_writes before returning
  * and decrement it on completion, waking up sb_wait
  * if zero is reached.
  * If an error occurred, call md_error
  *
  * As we might need to resubmit the request if BIO_RW_BARRIER
  * causes ENOTSUPP, we allocate a spare bio for that case.
  */
441 struct bio *bio = bio_alloc(GFP_NOIO, 1);
442 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
443
444 bio->bi_bdev = rdev->bdev;
445 bio->bi_sector = sector;
446 bio_add_page(bio, page, size, 0);
447 bio->bi_private = rdev;
448 bio->bi_end_io = super_written;
449 bio->bi_rw = rw;
450
451 atomic_inc(&mddev->pending_writes);
452 if (!test_bit(BarriersNotsupp, &rdev->flags)) {
453 struct bio *rbio;
454 rw |= (1<<BIO_RW_BARRIER);
455 rbio = bio_clone(bio, GFP_NOIO);
456 rbio->bi_private = bio;
457 rbio->bi_end_io = super_written_barrier;
458 submit_bio(rw, rbio);
459 } else
460 submit_bio(rw, bio);
461}
462
463void md_super_wait(mddev_t *mddev)
464{
 /* wait for all superblock writes that were scheduled to complete.
  * if any had to be retried (due to BARRIER problems), retry them.
  */
468 DEFINE_WAIT(wq);
469 for(;;) {
470 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
471 if (atomic_read(&mddev->pending_writes)==0)
472 break;
473 while (mddev->biolist) {
474 struct bio *bio;
475 spin_lock_irq(&mddev->write_lock);
476 bio = mddev->biolist;
477 mddev->biolist = bio->bi_next ;
478 bio->bi_next = NULL;
479 spin_unlock_irq(&mddev->write_lock);
480 submit_bio(bio->bi_rw, bio);
481 }
482 schedule();
483 }
484 finish_wait(&mddev->sb_wait, &wq);
485}
486
487static void bi_complete(struct bio *bio, int error)
488{
489 complete((struct completion*)bio->bi_private);
490}
491
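/* Synchronously read or write one page at 'sector' of 'bdev'.  Returns 1 on
 * success (BIO_UPTODATE was set), 0 on failure.
 */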
492int sync_page_io(struct block_device *bdev, sector_t sector, int size,
493 struct page *page, int rw)
494{
495 struct bio *bio = bio_alloc(GFP_NOIO, 1);
496 struct completion event;
497 int ret;
498
499 rw |= (1 << BIO_RW_SYNC);
500
501 bio->bi_bdev = bdev;
502 bio->bi_sector = sector;
503 bio_add_page(bio, page, size, 0);
504 init_completion(&event);
505 bio->bi_private = &event;
506 bio->bi_end_io = bi_complete;
507 submit_bio(rw, bio);
508 wait_for_completion(&event);
509
510 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
511 bio_put(bio);
512 return ret;
513}
514EXPORT_SYMBOL_GPL(sync_page_io);
515
516static int read_disk_sb(mdk_rdev_t * rdev, int size)
517{
518 char b[BDEVNAME_SIZE];
519 if (!rdev->sb_page) {
520 MD_BUG();
521 return -EINVAL;
522 }
523 if (rdev->sb_loaded)
524 return 0;
525
526
527 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
528 goto fail;
529 rdev->sb_loaded = 1;
530 return 0;
531
532fail:
533 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
534 bdevname(rdev->bdev,b));
535 return -EINVAL;
536}
537
538static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
539{
540 if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
541 (sb1->set_uuid1 == sb2->set_uuid1) &&
542 (sb1->set_uuid2 == sb2->set_uuid2) &&
543 (sb1->set_uuid3 == sb2->set_uuid3))
544
545 return 1;
546
547 return 0;
548}
549
550
551static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
552{
553 int ret;
554 mdp_super_t *tmp1, *tmp2;
555
556 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
557 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
558
559 if (!tmp1 || !tmp2) {
560 ret = 0;
561 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
562 goto abort;
563 }
564
565 *tmp1 = *sb1;
566 *tmp2 = *sb2;

 /*
  * nr_disks is not constant
  */
571 tmp1->nr_disks = 0;
572 tmp2->nr_disks = 0;
573
574 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
575 ret = 0;
576 else
577 ret = 1;
578
579abort:
580 kfree(tmp1);
581 kfree(tmp2);
582 return ret;
583}
584
585
586static u32 md_csum_fold(u32 csum)
587{
588 csum = (csum & 0xffff) + (csum >> 16);
589 return (csum & 0xffff) + (csum >> 16);
590}
591
592static unsigned int calc_sb_csum(mdp_super_t * sb)
593{
594 u64 newcsum = 0;
595 u32 *sb32 = (u32*)sb;
596 int i;
597 unsigned int disk_csum, csum;
598
599 disk_csum = sb->sb_csum;
600 sb->sb_csum = 0;
601
602 for (i = 0; i < MD_SB_BYTES/4 ; i++)
603 newcsum += sb32[i];
604 csum = (newcsum & 0xffffffff) + (newcsum>>32);
605
606
607#ifdef CONFIG_ALPHA
 /* This used to use csum_partial, which was wrong for several
  * reasons including that different results are returned on
  * different architectures.  It isn't critical that we get exactly
  * the same return value as before (we always csum_fold before
  * testing, and that removes any differences).  However as we
  * know that csum_partial always returned a 16bit value on
  * alphas, do a fold to maximise conformity to previous behaviour.
  */
616 sb->sb_csum = md_csum_fold(disk_csum);
617#else
618 sb->sb_csum = disk_csum;
619#endif
620 return csum;
621}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *      The first time, mddev->raid_disks will be 0, and data from
 *      dev should be merged in.  Subsequent calls check that dev
 *      is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Update the superblock for rdev with data in mddev.
 *      This does not write to disc.
 */
654struct super_type {
655 char *name;
656 struct module *owner;
657 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
658 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
659 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
660};
661
/*
 * load_super for 0.90.0
 */
665static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
666{
667 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
668 mdp_super_t *sb;
669 int ret;
670 sector_t sb_offset;
671
 /*
  * Calculate the position of the superblock,
  * it's at the end of the disk.
  *
  * It also happens to be a multiple of 4Kb.
  */
678 sb_offset = calc_dev_sboffset(rdev->bdev);
679 rdev->sb_offset = sb_offset;
680
681 ret = read_disk_sb(rdev, MD_SB_BYTES);
682 if (ret) return ret;
683
684 ret = -EINVAL;
685
686 bdevname(rdev->bdev, b);
687 sb = (mdp_super_t*)page_address(rdev->sb_page);
688
689 if (sb->md_magic != MD_SB_MAGIC) {
690 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
691 b);
692 goto abort;
693 }
694
695 if (sb->major_version != 0 ||
696 sb->minor_version < 90 ||
697 sb->minor_version > 91) {
698 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
699 sb->major_version, sb->minor_version,
700 b);
701 goto abort;
702 }
703
704 if (sb->raid_disks <= 0)
705 goto abort;
706
707 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
708 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
709 b);
710 goto abort;
711 }
712
713 rdev->preferred_minor = sb->md_minor;
714 rdev->data_offset = 0;
715 rdev->sb_size = MD_SB_BYTES;
716
717 if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
718 if (sb->level != 1 && sb->level != 4
719 && sb->level != 5 && sb->level != 6
720 && sb->level != 10) {
721
722 printk(KERN_WARNING
723 "md: bitmaps not supported for this level.\n");
724 goto abort;
725 }
726 }
727
728 if (sb->level == LEVEL_MULTIPATH)
729 rdev->desc_nr = -1;
730 else
731 rdev->desc_nr = sb->this_disk.number;
732
733 if (refdev == 0)
734 ret = 1;
735 else {
736 __u64 ev1, ev2;
737 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
738 if (!uuid_equal(refsb, sb)) {
739 printk(KERN_WARNING "md: %s has different UUID to %s\n",
740 b, bdevname(refdev->bdev,b2));
741 goto abort;
742 }
743 if (!sb_equal(refsb, sb)) {
744 printk(KERN_WARNING "md: %s has same UUID"
745 " but different superblock to %s\n",
746 b, bdevname(refdev->bdev, b2));
747 goto abort;
748 }
749 ev1 = md_event(sb);
750 ev2 = md_event(refsb);
751 if (ev1 > ev2)
752 ret = 1;
753 else
754 ret = 0;
755 }
756 rdev->size = calc_dev_size(rdev, sb->chunk_size);
757
758 if (rdev->size < sb->size && sb->level > 1)
759
760 ret = -EINVAL;
761
762 abort:
763 return ret;
764}
765
/*
 * validate_super for 0.90.0
 */
769static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
770{
771 mdp_disk_t *desc;
772 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
773 __u64 ev1 = md_event(sb);
774
775 rdev->raid_disk = -1;
776 rdev->flags = 0;
777 if (mddev->raid_disks == 0) {
778 mddev->major_version = 0;
779 mddev->minor_version = sb->minor_version;
780 mddev->patch_version = sb->patch_version;
781 mddev->persistent = ! sb->not_persistent;
782 mddev->chunk_size = sb->chunk_size;
783 mddev->ctime = sb->ctime;
784 mddev->utime = sb->utime;
785 mddev->level = sb->level;
786 mddev->clevel[0] = 0;
787 mddev->layout = sb->layout;
788 mddev->raid_disks = sb->raid_disks;
789 mddev->size = sb->size;
790 mddev->events = ev1;
791 mddev->bitmap_offset = 0;
792 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
793
794 if (mddev->minor_version >= 91) {
795 mddev->reshape_position = sb->reshape_position;
796 mddev->delta_disks = sb->delta_disks;
797 mddev->new_level = sb->new_level;
798 mddev->new_layout = sb->new_layout;
799 mddev->new_chunk = sb->new_chunk;
800 } else {
801 mddev->reshape_position = MaxSector;
802 mddev->delta_disks = 0;
803 mddev->new_level = mddev->level;
804 mddev->new_layout = mddev->layout;
805 mddev->new_chunk = mddev->chunk_size;
806 }
807
808 if (sb->state & (1<<MD_SB_CLEAN))
809 mddev->recovery_cp = MaxSector;
810 else {
811 if (sb->events_hi == sb->cp_events_hi &&
812 sb->events_lo == sb->cp_events_lo) {
813 mddev->recovery_cp = sb->recovery_cp;
814 } else
815 mddev->recovery_cp = 0;
816 }
817
818 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
819 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
820 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
821 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
822
823 mddev->max_disks = MD_SB_DISKS;
824
825 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
826 mddev->bitmap_file == NULL)
827 mddev->bitmap_offset = mddev->default_bitmap_offset;
828
829 } else if (mddev->pers == NULL) {
830
831 ++ev1;
832 if (ev1 < mddev->events)
833 return -EINVAL;
834 } else if (mddev->bitmap) {
 /* if adding to array with a bitmap, then we can accept an
  * older device, but not too old.
  */
838 if (ev1 < mddev->bitmap->events_cleared)
839 return 0;
840 } else {
841 if (ev1 < mddev->events)
842
843 return 0;
844 }
845
846 if (mddev->level != LEVEL_MULTIPATH) {
847 desc = sb->disks + rdev->desc_nr;
848
849 if (desc->state & (1<<MD_DISK_FAULTY))
850 set_bit(Faulty, &rdev->flags);
 else if (desc->state & (1<<MD_DISK_SYNC)) {
853 set_bit(In_sync, &rdev->flags);
854 rdev->raid_disk = desc->raid_disk;
855 }
856 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
857 set_bit(WriteMostly, &rdev->flags);
858 } else
859 set_bit(In_sync, &rdev->flags);
860 return 0;
861}
862
/*
 * sync_super for 0.90.0
 */
866static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
867{
868 mdp_super_t *sb;
869 struct list_head *tmp;
870 mdk_rdev_t *rdev2;
871 int next_spare = mddev->raid_disks;

 /* make rdev->sb match mddev data..
  *
  * 1/ zero out disks
  * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
  * 3/ any empty disks < next_spare become removed
  *
  * disks[0] gets initialised to REMOVED because
  * we cannot be sure from other fields if it has
  * been initialised or not.
  */
884 int i;
885 int active=0, working=0,failed=0,spare=0,nr_disks=0;
886
887 rdev->sb_size = MD_SB_BYTES;
888
889 sb = (mdp_super_t*)page_address(rdev->sb_page);
890
891 memset(sb, 0, sizeof(*sb));
892
893 sb->md_magic = MD_SB_MAGIC;
894 sb->major_version = mddev->major_version;
895 sb->patch_version = mddev->patch_version;
896 sb->gvalid_words = 0;
897 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
898 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
899 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
900 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
901
902 sb->ctime = mddev->ctime;
903 sb->level = mddev->level;
904 sb->size = mddev->size;
905 sb->raid_disks = mddev->raid_disks;
906 sb->md_minor = mddev->md_minor;
907 sb->not_persistent = !mddev->persistent;
908 sb->utime = mddev->utime;
909 sb->state = 0;
910 sb->events_hi = (mddev->events>>32);
911 sb->events_lo = (u32)mddev->events;
912
913 if (mddev->reshape_position == MaxSector)
914 sb->minor_version = 90;
915 else {
916 sb->minor_version = 91;
917 sb->reshape_position = mddev->reshape_position;
918 sb->new_level = mddev->new_level;
919 sb->delta_disks = mddev->delta_disks;
920 sb->new_layout = mddev->new_layout;
921 sb->new_chunk = mddev->new_chunk;
922 }
923 mddev->minor_version = sb->minor_version;
924 if (mddev->in_sync)
925 {
926 sb->recovery_cp = mddev->recovery_cp;
927 sb->cp_events_hi = (mddev->events>>32);
928 sb->cp_events_lo = (u32)mddev->events;
929 if (mddev->recovery_cp == MaxSector)
930 sb->state = (1<< MD_SB_CLEAN);
931 } else
932 sb->recovery_cp = 0;
933
934 sb->layout = mddev->layout;
935 sb->chunk_size = mddev->chunk_size;
936
937 if (mddev->bitmap && mddev->bitmap_file == NULL)
938 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
939
940 sb->disks[0].state = (1<<MD_DISK_REMOVED);
941 ITERATE_RDEV(mddev,rdev2,tmp) {
942 mdp_disk_t *d;
943 int desc_nr;
944 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
945 && !test_bit(Faulty, &rdev2->flags))
946 desc_nr = rdev2->raid_disk;
947 else
948 desc_nr = next_spare++;
949 rdev2->desc_nr = desc_nr;
950 d = &sb->disks[rdev2->desc_nr];
951 nr_disks++;
952 d->number = rdev2->desc_nr;
953 d->major = MAJOR(rdev2->bdev->bd_dev);
954 d->minor = MINOR(rdev2->bdev->bd_dev);
955 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
956 && !test_bit(Faulty, &rdev2->flags))
957 d->raid_disk = rdev2->raid_disk;
958 else
959 d->raid_disk = rdev2->desc_nr;
960 if (test_bit(Faulty, &rdev2->flags))
961 d->state = (1<<MD_DISK_FAULTY);
962 else if (test_bit(In_sync, &rdev2->flags)) {
963 d->state = (1<<MD_DISK_ACTIVE);
964 d->state |= (1<<MD_DISK_SYNC);
965 active++;
966 working++;
967 } else {
968 d->state = 0;
969 spare++;
970 working++;
971 }
972 if (test_bit(WriteMostly, &rdev2->flags))
973 d->state |= (1<<MD_DISK_WRITEMOSTLY);
974 }
975
976 for (i=0 ; i < mddev->raid_disks ; i++) {
977 mdp_disk_t *d = &sb->disks[i];
978 if (d->state == 0 && d->number == 0) {
979 d->number = i;
980 d->raid_disk = i;
981 d->state = (1<<MD_DISK_REMOVED);
982 d->state |= (1<<MD_DISK_FAULTY);
983 failed++;
984 }
985 }
986 sb->nr_disks = nr_disks;
987 sb->active_disks = active;
988 sb->working_disks = working;
989 sb->failed_disks = failed;
990 sb->spare_disks = spare;
991
992 sb->this_disk = sb->disks[rdev->desc_nr];
993 sb->sb_csum = calc_sb_csum(sb);
994}
995

/*
 * version 1 superblock
 */
1000static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1001{
1002 __le32 disk_csum;
1003 u32 csum;
1004 unsigned long long newcsum;
1005 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1006 __le32 *isuper = (__le32*)sb;
1007 int i;
1008
1009 disk_csum = sb->sb_csum;
1010 sb->sb_csum = 0;
1011 newcsum = 0;
1012 for (i=0; size>=4; size -= 4 )
1013 newcsum += le32_to_cpu(*isuper++);
1014
1015 if (size == 2)
1016 newcsum += le16_to_cpu(*(__le16*) isuper);
1017
1018 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1019 sb->sb_csum = disk_csum;
1020 return cpu_to_le32(csum);
1021}
1022
1023static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1024{
1025 struct mdp_superblock_1 *sb;
1026 int ret;
1027 sector_t sb_offset;
1028 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1029 int bmask;
1030
 /*
  * Calculate the position of the superblock in 512byte sectors.
  * It is always aligned to a 4K boundary and
  * depending on minor_version, it can be:
  * 0: At least 8K, but less than 12K, from end of device
  * 1: At start of device
  * 2: 4K from start of device.
  */
1039 switch(minor_version) {
1040 case 0:
1041 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
1042 sb_offset -= 8*2;
1043 sb_offset &= ~(sector_t)(4*2-1);
1044
1045 sb_offset /= 2;
1046 break;
1047 case 1:
1048 sb_offset = 0;
1049 break;
1050 case 2:
1051 sb_offset = 4;
1052 break;
1053 default:
1054 return -EINVAL;
1055 }
1056 rdev->sb_offset = sb_offset;
1057
 /* superblock is rarely larger than 1K, but it can be larger,
  * and it is safe to read 4k, so we do that.
  */
1061 ret = read_disk_sb(rdev, 4096);
1062 if (ret) return ret;
1063
1064
1065 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1066
1067 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1068 sb->major_version != cpu_to_le32(1) ||
1069 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1070 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
1071 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1072 return -EINVAL;
1073
1074 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1075 printk("md: invalid superblock checksum on %s\n",
1076 bdevname(rdev->bdev,b));
1077 return -EINVAL;
1078 }
1079 if (le64_to_cpu(sb->data_size) < 10) {
1080 printk("md: data_size too small on %s\n",
1081 bdevname(rdev->bdev,b));
1082 return -EINVAL;
1083 }
1084 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
1085 if (sb->level != cpu_to_le32(1) &&
1086 sb->level != cpu_to_le32(4) &&
1087 sb->level != cpu_to_le32(5) &&
1088 sb->level != cpu_to_le32(6) &&
1089 sb->level != cpu_to_le32(10)) {
1090 printk(KERN_WARNING
1091 "md: bitmaps not supported for this level.\n");
1092 return -EINVAL;
1093 }
1094 }
1095
1096 rdev->preferred_minor = 0xffff;
1097 rdev->data_offset = le64_to_cpu(sb->data_offset);
1098 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1099
1100 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1101 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1102 if (rdev->sb_size & bmask)
1103 rdev-> sb_size = (rdev->sb_size | bmask)+1;
1104
1105 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1106 rdev->desc_nr = -1;
1107 else
1108 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1109
1110 if (refdev == 0)
1111 ret = 1;
1112 else {
1113 __u64 ev1, ev2;
1114 struct mdp_superblock_1 *refsb =
1115 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1116
1117 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1118 sb->level != refsb->level ||
1119 sb->layout != refsb->layout ||
1120 sb->chunksize != refsb->chunksize) {
1121 printk(KERN_WARNING "md: %s has strangely different"
1122 " superblock to %s\n",
1123 bdevname(rdev->bdev,b),
1124 bdevname(refdev->bdev,b2));
1125 return -EINVAL;
1126 }
1127 ev1 = le64_to_cpu(sb->events);
1128 ev2 = le64_to_cpu(refsb->events);
1129
1130 if (ev1 > ev2)
1131 ret = 1;
1132 else
1133 ret = 0;
1134 }
1135 if (minor_version)
1136 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1137 else
1138 rdev->size = rdev->sb_offset;
1139 if (rdev->size < le64_to_cpu(sb->data_size)/2)
1140 return -EINVAL;
1141 rdev->size = le64_to_cpu(sb->data_size)/2;
1142 if (le32_to_cpu(sb->chunksize))
1143 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1144
1145 if (le64_to_cpu(sb->size) > rdev->size*2)
1146 return -EINVAL;
1147 return ret;
1148}
1149
1150static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1151{
1152 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1153 __u64 ev1 = le64_to_cpu(sb->events);
1154
1155 rdev->raid_disk = -1;
1156 rdev->flags = 0;
1157 if (mddev->raid_disks == 0) {
1158 mddev->major_version = 1;
1159 mddev->patch_version = 0;
1160 mddev->persistent = 1;
1161 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
1162 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1163 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1164 mddev->level = le32_to_cpu(sb->level);
1165 mddev->clevel[0] = 0;
1166 mddev->layout = le32_to_cpu(sb->layout);
1167 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1168 mddev->size = le64_to_cpu(sb->size)/2;
1169 mddev->events = ev1;
1170 mddev->bitmap_offset = 0;
1171 mddev->default_bitmap_offset = 1024 >> 9;
1172
1173 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1174 memcpy(mddev->uuid, sb->set_uuid, 16);
1175
1176 mddev->max_disks = (4096-256)/2;
1177
1178 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1179 mddev->bitmap_file == NULL )
1180 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1181
1182 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1183 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1184 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1185 mddev->new_level = le32_to_cpu(sb->new_level);
1186 mddev->new_layout = le32_to_cpu(sb->new_layout);
1187 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1188 } else {
1189 mddev->reshape_position = MaxSector;
1190 mddev->delta_disks = 0;
1191 mddev->new_level = mddev->level;
1192 mddev->new_layout = mddev->layout;
1193 mddev->new_chunk = mddev->chunk_size;
1194 }
1195
1196 } else if (mddev->pers == NULL) {
1197
1198 ++ev1;
1199 if (ev1 < mddev->events)
1200 return -EINVAL;
1201 } else if (mddev->bitmap) {
 /* if adding to array with a bitmap, then we can accept an
  * older device, but not too old.
  */
1205 if (ev1 < mddev->bitmap->events_cleared)
1206 return 0;
1207 } else {
1208 if (ev1 < mddev->events)
1209
1210 return 0;
1211 }
1212 if (mddev->level != LEVEL_MULTIPATH) {
1213 int role;
1214 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1215 switch(role) {
1216 case 0xffff:
1217 break;
1218 case 0xfffe:
1219 set_bit(Faulty, &rdev->flags);
1220 break;
1221 default:
1222 if ((le32_to_cpu(sb->feature_map) &
1223 MD_FEATURE_RECOVERY_OFFSET))
1224 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1225 else
1226 set_bit(In_sync, &rdev->flags);
1227 rdev->raid_disk = role;
1228 break;
1229 }
1230 if (sb->devflags & WriteMostly1)
1231 set_bit(WriteMostly, &rdev->flags);
1232 } else
1233 set_bit(In_sync, &rdev->flags);
1234
1235 return 0;
1236}
1237
1238static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1239{
1240 struct mdp_superblock_1 *sb;
1241 struct list_head *tmp;
1242 mdk_rdev_t *rdev2;
1243 int max_dev, i;
 /* make rdev->sb match mddev and rdev data. */

1246 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1247
1248 sb->feature_map = 0;
1249 sb->pad0 = 0;
1250 sb->recovery_offset = cpu_to_le64(0);
1251 memset(sb->pad1, 0, sizeof(sb->pad1));
1252 memset(sb->pad2, 0, sizeof(sb->pad2));
1253 memset(sb->pad3, 0, sizeof(sb->pad3));
1254
1255 sb->utime = cpu_to_le64((__u64)mddev->utime);
1256 sb->events = cpu_to_le64(mddev->events);
1257 if (mddev->in_sync)
1258 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1259 else
1260 sb->resync_offset = cpu_to_le64(0);
1261
1262 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1263
1264 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1265 sb->size = cpu_to_le64(mddev->size<<1);
1266
1267 if (mddev->bitmap && mddev->bitmap_file == NULL) {
1268 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1269 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1270 }
1271
1272 if (rdev->raid_disk >= 0 &&
1273 !test_bit(In_sync, &rdev->flags) &&
1274 rdev->recovery_offset > 0) {
1275 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1276 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1277 }
1278
1279 if (mddev->reshape_position != MaxSector) {
1280 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1281 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1282 sb->new_layout = cpu_to_le32(mddev->new_layout);
1283 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1284 sb->new_level = cpu_to_le32(mddev->new_level);
1285 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1286 }
1287
1288 max_dev = 0;
1289 ITERATE_RDEV(mddev,rdev2,tmp)
1290 if (rdev2->desc_nr+1 > max_dev)
1291 max_dev = rdev2->desc_nr+1;
1292
1293 if (max_dev > le32_to_cpu(sb->max_dev))
1294 sb->max_dev = cpu_to_le32(max_dev);
1295 for (i=0; i<max_dev;i++)
1296 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1297
1298 ITERATE_RDEV(mddev,rdev2,tmp) {
1299 i = rdev2->desc_nr;
1300 if (test_bit(Faulty, &rdev2->flags))
1301 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1302 else if (test_bit(In_sync, &rdev2->flags))
1303 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1304 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1305 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1306 else
1307 sb->dev_roles[i] = cpu_to_le16(0xffff);
1308 }
1309
1310 sb->sb_csum = calc_sb_1_csum(sb);
1311}
1312
1313
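/* Table of supported on-disk metadata formats, indexed by major version
 * (0.90 and 1.x).  analyze_sbs() and the superblock update paths dispatch
 * through this table.
 */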
1314static struct super_type super_types[] = {
1315 [0] = {
1316 .name = "0.90.0",
1317 .owner = THIS_MODULE,
1318 .load_super = super_90_load,
1319 .validate_super = super_90_validate,
1320 .sync_super = super_90_sync,
1321 },
1322 [1] = {
1323 .name = "md-1",
1324 .owner = THIS_MODULE,
1325 .load_super = super_1_load,
1326 .validate_super = super_1_validate,
1327 .sync_super = super_1_sync,
1328 },
1329};
1330
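/* Return 1 if the two arrays have component devices that live on the same
 * underlying disk (same bd_contains), 0 otherwise.
 */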
1331static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1332{
1333 struct list_head *tmp, *tmp2;
1334 mdk_rdev_t *rdev, *rdev2;
1335
1336 ITERATE_RDEV(mddev1,rdev,tmp)
1337 ITERATE_RDEV(mddev2, rdev2, tmp2)
1338 if (rdev->bdev->bd_contains ==
1339 rdev2->bdev->bd_contains)
1340 return 1;
1341
1342 return 0;
1343}
1344
1345static LIST_HEAD(pending_raid_disks);
1346
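/* Attach 'rdev' to 'mddev': choose a unique desc_nr, register the rdev
 * kobject below the array's kobject, create the "block" sysfs link and add
 * the device to mddev->disks.
 */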
1347static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1348{
1349 char b[BDEVNAME_SIZE];
1350 struct kobject *ko;
1351 char *s;
1352 int err;
1353
1354 if (rdev->mddev) {
1355 MD_BUG();
1356 return -EINVAL;
1357 }
1358
1359 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1360 if (mddev->pers) {
 /* Cannot change size, so fail.
  * If mddev->level <= 0, then we don't care
  * about aligning sizes (e.g. linear).
  */
1365 if (mddev->level > 0)
1366 return -ENOSPC;
1367 } else
1368 mddev->size = rdev->size;
1369 }
1370
 /* Verify rdev->desc_nr is unique.
  * If it is -1, assign a free number, else
  * check that the number is not already in use.
  */
1375 if (rdev->desc_nr < 0) {
1376 int choice = 0;
1377 if (mddev->pers) choice = mddev->raid_disks;
1378 while (find_rdev_nr(mddev, choice))
1379 choice++;
1380 rdev->desc_nr = choice;
1381 } else {
1382 if (find_rdev_nr(mddev, rdev->desc_nr))
1383 return -EBUSY;
1384 }
1385 bdevname(rdev->bdev,b);
1386 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
1387 return -ENOMEM;
1388 while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
1389 *s = '!';
1390
1391 rdev->mddev = mddev;
1392 printk(KERN_INFO "md: bind<%s>\n", b);
1393
1394 rdev->kobj.parent = &mddev->kobj;
1395 if ((err = kobject_add(&rdev->kobj)))
1396 goto fail;
1397
1398 if (rdev->bdev->bd_part)
1399 ko = &rdev->bdev->bd_part->kobj;
1400 else
1401 ko = &rdev->bdev->bd_disk->kobj;
1402 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1403 kobject_del(&rdev->kobj);
1404 goto fail;
1405 }
1406 list_add(&rdev->same_set, &mddev->disks);
1407 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
1408 return 0;
1409
1410 fail:
1411 printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1412 b, mdname(mddev));
1413 return err;
1414}
1415
1416static void delayed_delete(struct work_struct *ws)
1417{
1418 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1419 kobject_del(&rdev->kobj);
1420}
1421
1422static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1423{
1424 char b[BDEVNAME_SIZE];
1425 if (!rdev->mddev) {
1426 MD_BUG();
1427 return;
1428 }
1429 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1430 list_del_init(&rdev->same_set);
1431 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1432 rdev->mddev = NULL;
1433 sysfs_remove_link(&rdev->kobj, "block");
1434
 /* The final kobject_del is deferred to a workqueue (delayed_delete
  * above) rather than being done from this context.
  */
1438 INIT_WORK(&rdev->del_work, delayed_delete);
1439 schedule_work(&rdev->del_work);
1440}
1441
/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
1447static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1448{
1449 int err = 0;
1450 struct block_device *bdev;
1451 char b[BDEVNAME_SIZE];
1452
1453 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1454 if (IS_ERR(bdev)) {
1455 printk(KERN_ERR "md: could not open %s.\n",
1456 __bdevname(dev, b));
1457 return PTR_ERR(bdev);
1458 }
1459 err = bd_claim(bdev, rdev);
1460 if (err) {
1461 printk(KERN_ERR "md: could not bd_claim %s.\n",
1462 bdevname(bdev, b));
1463 blkdev_put(bdev);
1464 return err;
1465 }
1466 rdev->bdev = bdev;
1467 return err;
1468}
1469
1470static void unlock_rdev(mdk_rdev_t *rdev)
1471{
1472 struct block_device *bdev = rdev->bdev;
1473 rdev->bdev = NULL;
1474 if (!bdev)
1475 MD_BUG();
1476 bd_release(bdev);
1477 blkdev_put(bdev);
1478}
1479
1480void md_autodetect_dev(dev_t dev);
1481
1482static void export_rdev(mdk_rdev_t * rdev)
1483{
1484 char b[BDEVNAME_SIZE];
1485 printk(KERN_INFO "md: export_rdev(%s)\n",
1486 bdevname(rdev->bdev,b));
1487 if (rdev->mddev)
1488 MD_BUG();
1489 free_disk_sb(rdev);
1490 list_del_init(&rdev->same_set);
1491#ifndef MODULE
1492 md_autodetect_dev(rdev->bdev->bd_dev);
1493#endif
1494 unlock_rdev(rdev);
1495 kobject_put(&rdev->kobj);
1496}
1497
1498static void kick_rdev_from_array(mdk_rdev_t * rdev)
1499{
1500 unbind_rdev_from_array(rdev);
1501 export_rdev(rdev);
1502}
1503
1504static void export_array(mddev_t *mddev)
1505{
1506 struct list_head *tmp;
1507 mdk_rdev_t *rdev;
1508
1509 ITERATE_RDEV(mddev,rdev,tmp) {
1510 if (!rdev->mddev) {
1511 MD_BUG();
1512 continue;
1513 }
1514 kick_rdev_from_array(rdev);
1515 }
1516 if (!list_empty(&mddev->disks))
1517 MD_BUG();
1518 mddev->raid_disks = 0;
1519 mddev->major_version = 0;
1520}
1521
1522static void print_desc(mdp_disk_t *desc)
1523{
1524 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1525 desc->major,desc->minor,desc->raid_disk,desc->state);
1526}
1527
1528static void print_sb(mdp_super_t *sb)
1529{
1530 int i;
1531
1532 printk(KERN_INFO
1533 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1534 sb->major_version, sb->minor_version, sb->patch_version,
1535 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1536 sb->ctime);
1537 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1538 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1539 sb->md_minor, sb->layout, sb->chunk_size);
1540 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
1541 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1542 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1543 sb->failed_disks, sb->spare_disks,
1544 sb->sb_csum, (unsigned long)sb->events_lo);
1545
1546 printk(KERN_INFO);
1547 for (i = 0; i < MD_SB_DISKS; i++) {
1548 mdp_disk_t *desc;
1549
1550 desc = sb->disks + i;
1551 if (desc->number || desc->major || desc->minor ||
1552 desc->raid_disk || (desc->state && (desc->state != 4))) {
1553 printk(" D %2d: ", i);
1554 print_desc(desc);
1555 }
1556 }
1557 printk(KERN_INFO "md: THIS: ");
1558 print_desc(&sb->this_disk);
1559
1560}
1561
1562static void print_rdev(mdk_rdev_t *rdev)
1563{
1564 char b[BDEVNAME_SIZE];
1565 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1566 bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1567 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1568 rdev->desc_nr);
1569 if (rdev->sb_loaded) {
1570 printk(KERN_INFO "md: rdev superblock:\n");
1571 print_sb((mdp_super_t*)page_address(rdev->sb_page));
1572 } else
1573 printk(KERN_INFO "md: no rdev superblock!\n");
1574}
1575
1576static void md_print_devices(void)
1577{
1578 struct list_head *tmp, *tmp2;
1579 mdk_rdev_t *rdev;
1580 mddev_t *mddev;
1581 char b[BDEVNAME_SIZE];
1582
1583 printk("\n");
1584 printk("md: **********************************\n");
1585 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1586 printk("md: **********************************\n");
1587 ITERATE_MDDEV(mddev,tmp) {
1588
1589 if (mddev->bitmap)
1590 bitmap_print_sb(mddev->bitmap);
1591 else
1592 printk("%s: ", mdname(mddev));
1593 ITERATE_RDEV(mddev,rdev,tmp2)
1594 printk("<%s>", bdevname(rdev->bdev,b));
1595 printk("\n");
1596
1597 ITERATE_RDEV(mddev,rdev,tmp2)
1598 print_rdev(rdev);
1599 }
1600 printk("md: **********************************\n");
1601 printk("\n");
1602}
1603
1604
1605static void sync_sbs(mddev_t * mddev, int nospares)
1606{
 /* Update each superblock (in-memory image), but
  * if we are allowed to, skip spares which already
  * have the right event counter, or have one earlier
  * (which would mean they aren't being marked as dirty
  * with the rest of the array).
  */
1613 mdk_rdev_t *rdev;
1614 struct list_head *tmp;
1615
1616 ITERATE_RDEV(mddev,rdev,tmp) {
1617 if (rdev->sb_events == mddev->events ||
1618 (nospares &&
1619 rdev->raid_disk < 0 &&
1620 (rdev->sb_events&1)==0 &&
1621 rdev->sb_events+1 == mddev->events)) {
1622
1623 rdev->sb_loaded = 2;
1624 } else {
1625 super_types[mddev->major_version].
1626 sync_super(mddev, rdev);
1627 rdev->sb_loaded = 1;
1628 }
1629 }
1630}
1631
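/* Write the updated superblocks (and the bitmap superblock) of all member
 * devices out to disk.  The event count is bumped (or, for a clean<->dirty
 * transition, possibly rolled back), sync_sbs() refreshes the in-memory
 * images, and the whole sequence repeats if anything changed while the
 * writes were in flight.
 */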
1632static void md_update_sb(mddev_t * mddev, int force_change)
1633{
1634 struct list_head *tmp;
1635 mdk_rdev_t *rdev;
1636 int sync_req;
1637 int nospares = 0;
1638
1639repeat:
1640 spin_lock_irq(&mddev->write_lock);
1641
1642 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1643 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1644 force_change = 1;
1645 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
 /* just a clean<->dirty transition, possibly leave spares alone,
  * though if events isn't the right even/odd, we will have to do
  * spares after all.
  */
1650 nospares = 1;
1651 if (force_change)
1652 nospares = 0;
1653 if (mddev->degraded)
 /* If the array is degraded, then skipping spares is both
  * dangerous and fairly pointless.
  * Dangerous because a device that was removed from the array
  * might have an event count that still looks up-to-date,
  * so it can be re-added without a resync.
  * Pointless because if there are any spares to skip,
  * then a recovery will happen and soon that array won't
  * be degraded any more and the spare can go back to sleep then.
  */
1663 nospares = 0;
1664
1665 sync_req = mddev->in_sync;
1666 mddev->utime = get_seconds();

 /* If this is just a dirty<->clean transition, and the array is clean
  * and 'events' is odd, we can roll back to the previous clean state. */
1670 if (nospares
1671 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1672 && (mddev->events & 1)
1673 && mddev->events != 1)
1674 mddev->events--;
1675 else {
1676
1677 mddev->events ++;
1678 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) {
1679
1680 if ((mddev->events&1)==0) {
1681 mddev->events++;
1682 nospares = 0;
1683 }
1684 } else {
1685
1686 if ((mddev->events&1)) {
1687 mddev->events++;
1688 nospares = 0;
1689 }
1690 }
1691 }
1692
1693 if (!mddev->events) {
 /*
  * oops, this 64-bit counter should never wrap.
  * Either we are in around ~1 trillion A.C., assuming
  * 1 reboot per second, or we have a bug:
  */
1699 MD_BUG();
1700 mddev->events --;
1701 }
1702 sync_sbs(mddev, nospares);

 /*
  * do not write anything to disk if using
  * nonpersistent superblocks
  */
1708 if (!mddev->persistent) {
1709 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1710 spin_unlock_irq(&mddev->write_lock);
1711 wake_up(&mddev->sb_wait);
1712 return;
1713 }
1714 spin_unlock_irq(&mddev->write_lock);
1715
1716 dprintk(KERN_INFO
1717 "md: updating %s RAID superblock on device (in sync %d)\n",
1718 mdname(mddev),mddev->in_sync);
1719
1720 bitmap_update_sb(mddev->bitmap);
1721 ITERATE_RDEV(mddev,rdev,tmp) {
1722 char b[BDEVNAME_SIZE];
1723 dprintk(KERN_INFO "md: ");
1724 if (rdev->sb_loaded != 1)
1725 continue;
1726 if (test_bit(Faulty, &rdev->flags))
1727 dprintk("(skipping faulty ");
1728
1729 dprintk("%s ", bdevname(rdev->bdev,b));
1730 if (!test_bit(Faulty, &rdev->flags)) {
1731 md_super_write(mddev,rdev,
1732 rdev->sb_offset<<1, rdev->sb_size,
1733 rdev->sb_page);
1734 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1735 bdevname(rdev->bdev,b),
1736 (unsigned long long)rdev->sb_offset);
1737 rdev->sb_events = mddev->events;
1738
1739 } else
1740 dprintk(")\n");
1741 if (mddev->level == LEVEL_MULTIPATH)
1742
1743 break;
1744 }
1745 md_super_wait(mddev);
1746
1747
1748 spin_lock_irq(&mddev->write_lock);
1749 if (mddev->in_sync != sync_req ||
1750 test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
1751
1752 spin_unlock_irq(&mddev->write_lock);
1753 goto repeat;
1754 }
1755 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1756 spin_unlock_irq(&mddev->write_lock);
1757 wake_up(&mddev->sb_wait);
1758
1759}
1760
/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept them either way. For this we use cmd_match.
 */
1764static int cmd_match(const char *cmd, const char *str)
1765{
 /* See if cmd, written into a sysfs file, matches
  * str.  They must either be the same, or cmd can
  * have a trailing newline.
  */
1770 while (*cmd && *str && *cmd == *str) {
1771 cmd++;
1772 str++;
1773 }
1774 if (*cmd == '\n')
1775 cmd++;
1776 if (*str || *cmd)
1777 return 0;
1778 return 1;
1779}
1780
1781struct rdev_sysfs_entry {
1782 struct attribute attr;
1783 ssize_t (*show)(mdk_rdev_t *, char *);
1784 ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1785};
1786
1787static ssize_t
1788state_show(mdk_rdev_t *rdev, char *page)
1789{
1790 char *sep = "";
1791 int len=0;
1792
1793 if (test_bit(Faulty, &rdev->flags)) {
1794 len+= sprintf(page+len, "%sfaulty",sep);
1795 sep = ",";
1796 }
1797 if (test_bit(In_sync, &rdev->flags)) {
1798 len += sprintf(page+len, "%sin_sync",sep);
1799 sep = ",";
1800 }
1801 if (test_bit(WriteMostly, &rdev->flags)) {
1802 len += sprintf(page+len, "%swrite_mostly",sep);
1803 sep = ",";
1804 }
1805 if (!test_bit(Faulty, &rdev->flags) &&
1806 !test_bit(In_sync, &rdev->flags)) {
1807 len += sprintf(page+len, "%sspare", sep);
1808 sep = ",";
1809 }
1810 return len+sprintf(page+len, "\n");
1811}
1812
1813static ssize_t
1814state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1815{
 /* can write
  *  faulty  - simulates an error
  *  remove  - disconnects the device
  *  writemostly - sets write_mostly
  *  -writemostly - clears write_mostly
  */
1822 int err = -EINVAL;
1823 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1824 md_error(rdev->mddev, rdev);
1825 err = 0;
1826 } else if (cmd_match(buf, "remove")) {
1827 if (rdev->raid_disk >= 0)
1828 err = -EBUSY;
1829 else {
1830 mddev_t *mddev = rdev->mddev;
1831 kick_rdev_from_array(rdev);
1832 if (mddev->pers)
1833 md_update_sb(mddev, 1);
1834 md_new_event(mddev);
1835 err = 0;
1836 }
1837 } else if (cmd_match(buf, "writemostly")) {
1838 set_bit(WriteMostly, &rdev->flags);
1839 err = 0;
1840 } else if (cmd_match(buf, "-writemostly")) {
1841 clear_bit(WriteMostly, &rdev->flags);
1842 err = 0;
1843 }
1844 return err ? err : len;
1845}
1846static struct rdev_sysfs_entry rdev_state =
1847__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1848
1849static ssize_t
1850super_show(mdk_rdev_t *rdev, char *page)
1851{
1852 if (rdev->sb_loaded && rdev->sb_size) {
1853 memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
1854 return rdev->sb_size;
1855 } else
1856 return 0;
1857}
1858static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1859
1860static ssize_t
1861errors_show(mdk_rdev_t *rdev, char *page)
1862{
1863 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1864}
1865
1866static ssize_t
1867errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1868{
1869 char *e;
1870 unsigned long n = simple_strtoul(buf, &e, 10);
1871 if (*buf && (*e == 0 || *e == '\n')) {
1872 atomic_set(&rdev->corrected_errors, n);
1873 return len;
1874 }
1875 return -EINVAL;
1876}
1877static struct rdev_sysfs_entry rdev_errors =
1878__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1879
1880static ssize_t
1881slot_show(mdk_rdev_t *rdev, char *page)
1882{
1883 if (rdev->raid_disk < 0)
1884 return sprintf(page, "none\n");
1885 else
1886 return sprintf(page, "%d\n", rdev->raid_disk);
1887}
1888
1889static ssize_t
1890slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1891{
1892 char *e;
1893 int slot = simple_strtoul(buf, &e, 10);
1894 if (strncmp(buf, "none", 4)==0)
1895 slot = -1;
1896 else if (e==buf || (*e && *e!= '\n'))
1897 return -EINVAL;
1898 if (rdev->mddev->pers)
1899
1900 return -EBUSY;
1901 if (slot >= rdev->mddev->raid_disks)
1902 return -ENOSPC;
1903 rdev->raid_disk = slot;
1904
1905 rdev->flags = 0;
1906 set_bit(In_sync, &rdev->flags);
1907 return len;
1908}
1909
1910
1911static struct rdev_sysfs_entry rdev_slot =
1912__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
1913
1914static ssize_t
1915offset_show(mdk_rdev_t *rdev, char *page)
1916{
1917 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1918}
1919
1920static ssize_t
1921offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1922{
1923 char *e;
1924 unsigned long long offset = simple_strtoull(buf, &e, 10);
1925 if (e==buf || (*e && *e != '\n'))
1926 return -EINVAL;
1927 if (rdev->mddev->pers)
1928 return -EBUSY;
1929 rdev->data_offset = offset;
1930 return len;
1931}
1932
1933static struct rdev_sysfs_entry rdev_offset =
1934__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
1935
1936static ssize_t
1937rdev_size_show(mdk_rdev_t *rdev, char *page)
1938{
1939 return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1940}
1941
1942static ssize_t
1943rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1944{
1945 char *e;
1946 unsigned long long size = simple_strtoull(buf, &e, 10);
1947 if (e==buf || (*e && *e != '\n'))
1948 return -EINVAL;
1949 if (rdev->mddev->pers)
1950 return -EBUSY;
1951 rdev->size = size;
1952 if (size < rdev->mddev->size || rdev->mddev->size == 0)
1953 rdev->mddev->size = size;
1954 return len;
1955}
1956
1957static struct rdev_sysfs_entry rdev_size =
1958__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
1959
1960static struct attribute *rdev_default_attrs[] = {
1961 &rdev_state.attr,
1962 &rdev_super.attr,
1963 &rdev_errors.attr,
1964 &rdev_slot.attr,
1965 &rdev_offset.attr,
1966 &rdev_size.attr,
1967 NULL,
1968};
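
/* sysfs glue: route show/store on an rdev kobject to the matching
 * rdev_sysfs_entry handler declared above.
 */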
1969static ssize_t
1970rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1971{
1972 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1973 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1974
1975 if (!entry->show)
1976 return -EIO;
1977 return entry->show(rdev, page);
1978}
1979
1980static ssize_t
1981rdev_attr_store(struct kobject *kobj, struct attribute *attr,
1982 const char *page, size_t length)
1983{
1984 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1985 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1986
1987 if (!entry->store)
1988 return -EIO;
1989 if (!capable(CAP_SYS_ADMIN))
1990 return -EACCES;
1991 return entry->store(rdev, page, length);
1992}
1993
1994static void rdev_free(struct kobject *ko)
1995{
1996 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
1997 kfree(rdev);
1998}
1999static struct sysfs_ops rdev_sysfs_ops = {
2000 .show = rdev_attr_show,
2001 .store = rdev_attr_store,
2002};
2003static struct kobj_type rdev_ktype = {
2004 .release = rdev_free,
2005 .sysfs_ops = &rdev_sysfs_ops,
2006 .default_attrs = rdev_default_attrs,
2007};
2008
/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
2019static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2020{
2021 char b[BDEVNAME_SIZE];
2022 int err;
2023 mdk_rdev_t *rdev;
2024 sector_t size;
2025
2026 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2027 if (!rdev) {
2028 printk(KERN_ERR "md: could not alloc mem for new device!\n");
2029 return ERR_PTR(-ENOMEM);
2030 }
2031
2032 if ((err = alloc_disk_sb(rdev)))
2033 goto abort_free;
2034
2035 err = lock_rdev(rdev, newdev);
2036 if (err)
2037 goto abort_free;
2038
2039 rdev->kobj.parent = NULL;
2040 rdev->kobj.ktype = &rdev_ktype;
2041 kobject_init(&rdev->kobj);
2042
2043 rdev->desc_nr = -1;
2044 rdev->saved_raid_disk = -1;
2045 rdev->raid_disk = -1;
2046 rdev->flags = 0;
2047 rdev->data_offset = 0;
2048 rdev->sb_events = 0;
2049 atomic_set(&rdev->nr_pending, 0);
2050 atomic_set(&rdev->read_errors, 0);
2051 atomic_set(&rdev->corrected_errors, 0);
2052
2053 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2054 if (!size) {
2055 printk(KERN_WARNING
2056 "md: %s has zero or unknown size, marking faulty!\n",
2057 bdevname(rdev->bdev,b));
2058 err = -EINVAL;
2059 goto abort_free;
2060 }
2061
2062 if (super_format >= 0) {
2063 err = super_types[super_format].
2064 load_super(rdev, NULL, super_minor);
2065 if (err == -EINVAL) {
2066 printk(KERN_WARNING
2067 "md: %s does not have a valid v%d.%d "
2068 "superblock, not importing!\n",
2069 bdevname(rdev->bdev,b),
2070 super_format, super_minor);
2071 goto abort_free;
2072 }
2073 if (err < 0) {
2074 printk(KERN_WARNING
2075 "md: could not read %s's sb, not importing!\n",
2076 bdevname(rdev->bdev,b));
2077 goto abort_free;
2078 }
2079 }
2080 INIT_LIST_HEAD(&rdev->same_set);
2081
2082 return rdev;
2083
2084abort_free:
2085 if (rdev->sb_page) {
2086 if (rdev->bdev)
2087 unlock_rdev(rdev);
2088 free_disk_sb(rdev);
2089 }
2090 kfree(rdev);
2091 return ERR_PTR(err);
2092}
2093
/*
 * Check a full RAID array for plausibility
 */

2099static void analyze_sbs(mddev_t * mddev)
2100{
2101 int i;
2102 struct list_head *tmp;
2103 mdk_rdev_t *rdev, *freshest;
2104 char b[BDEVNAME_SIZE];
2105
2106 freshest = NULL;
2107 ITERATE_RDEV(mddev,rdev,tmp)
2108 switch (super_types[mddev->major_version].
2109 load_super(rdev, freshest, mddev->minor_version)) {
2110 case 1:
2111 freshest = rdev;
2112 break;
2113 case 0:
2114 break;
2115 default:
2116 printk( KERN_ERR \
2117 "md: fatal superblock inconsistency in %s"
2118 " -- removing from array\n",
2119 bdevname(rdev->bdev,b));
2120 kick_rdev_from_array(rdev);
2121 }
2122
2123
2124 super_types[mddev->major_version].
2125 validate_super(mddev, freshest);
2126
2127 i = 0;
2128 ITERATE_RDEV(mddev,rdev,tmp) {
2129 if (rdev != freshest)
2130 if (super_types[mddev->major_version].
2131 validate_super(mddev, rdev)) {
2132 printk(KERN_WARNING "md: kicking non-fresh %s"
2133 " from array!\n",
2134 bdevname(rdev->bdev,b));
2135 kick_rdev_from_array(rdev);
2136 continue;
2137 }
2138 if (mddev->level == LEVEL_MULTIPATH) {
2139 rdev->desc_nr = i++;
2140 rdev->raid_disk = rdev->desc_nr;
2141 set_bit(In_sync, &rdev->flags);
2142 } else if (rdev->raid_disk >= mddev->raid_disks) {
2143 rdev->raid_disk = -1;
2144 clear_bit(In_sync, &rdev->flags);
2145 }
2146 }
2147
2148
2149
2150 if (mddev->recovery_cp != MaxSector &&
2151 mddev->level >= 1)
2152 printk(KERN_ERR "md: %s: raid array is not clean"
2153 " -- starting background reconstruction\n",
2154 mdname(mddev));
2155
2156}
2157
2158static ssize_t
2159safe_delay_show(mddev_t *mddev, char *page)
2160{
2161 int msec = (mddev->safemode_delay*1000)/HZ;
2162 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2163}
2164static ssize_t
2165safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2166{
2167 int scale=1;
2168 int dot=0;
2169 int i;
2170 unsigned long msec;
2171 char buf[30];
2172 char *e;
2173
2174 if (len >= sizeof(buf))
2175 return -EINVAL;
2176 strlcpy(buf, cbuf, len);
2177 buf[len] = 0;
2178 for (i=0; i<len; i++) {
2179 if (dot) {
2180 if (isdigit(buf[i])) {
2181 buf[i-1] = buf[i];
2182 scale *= 10;
2183 }
2184 buf[i] = 0;
2185 } else if (buf[i] == '.') {
2186 dot=1;
2187 buf[i] = 0;
2188 }
2189 }
2190 msec = simple_strtoul(buf, &e, 10);
2191 if (e == buf || (*e && *e != '\n'))
2192 return -EINVAL;
2193 msec = (msec * 1000) / scale;
2194 if (msec == 0)
2195 mddev->safemode_delay = 0;
2196 else {
2197 mddev->safemode_delay = (msec*HZ)/1000;
2198 if (mddev->safemode_delay == 0)
2199 mddev->safemode_delay = 1;
2200 }
2201 return len;
2202}
2203static struct md_sysfs_entry md_safe_delay =
2204__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2205
2206static ssize_t
2207level_show(mddev_t *mddev, char *page)
2208{
2209 struct mdk_personality *p = mddev->pers;
2210 if (p)
2211 return sprintf(page, "%s\n", p->name);
2212 else if (mddev->clevel[0])
2213 return sprintf(page, "%s\n", mddev->clevel);
2214 else if (mddev->level != LEVEL_NONE)
2215 return sprintf(page, "%d\n", mddev->level);
2216 else
2217 return 0;
2218}
2219
2220static ssize_t
2221level_store(mddev_t *mddev, const char *buf, size_t len)
2222{
2223 int rv = len;
2224 if (mddev->pers)
2225 return -EBUSY;
2226 if (len == 0)
2227 return 0;
2228 if (len >= sizeof(mddev->clevel))
2229 return -ENOSPC;
2230 strncpy(mddev->clevel, buf, len);
2231 if (mddev->clevel[len-1] == '\n')
2232 len--;
2233 mddev->clevel[len] = 0;
2234 mddev->level = LEVEL_NONE;
2235 return rv;
2236}
2237
2238static struct md_sysfs_entry md_level =
2239__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2240
2241
2242static ssize_t
2243layout_show(mddev_t *mddev, char *page)
2244{
 /* just a number, not meaningful for all levels */
2246 if (mddev->reshape_position != MaxSector &&
2247 mddev->layout != mddev->new_layout)
2248 return sprintf(page, "%d (%d)\n",
2249 mddev->new_layout, mddev->layout);
2250 return sprintf(page, "%d\n", mddev->layout);
2251}
2252
2253static ssize_t
2254layout_store(mddev_t *mddev, const char *buf, size_t len)
2255{
2256 char *e;
2257 unsigned long n = simple_strtoul(buf, &e, 10);
2258
2259 if (!*buf || (*e && *e != '\n'))
2260 return -EINVAL;
2261
2262 if (mddev->pers)
2263 return -EBUSY;
2264 if (mddev->reshape_position != MaxSector)
2265 mddev->new_layout = n;
2266 else
2267 mddev->layout = n;
2268 return len;
2269}
2270static struct md_sysfs_entry md_layout =
2271__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2272
2273
2274static ssize_t
2275raid_disks_show(mddev_t *mddev, char *page)
2276{
2277 if (mddev->raid_disks == 0)
2278 return 0;
2279 if (mddev->reshape_position != MaxSector &&
2280 mddev->delta_disks != 0)
2281 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2282 mddev->raid_disks - mddev->delta_disks);
2283 return sprintf(page, "%d\n", mddev->raid_disks);
2284}
2285
2286static int update_raid_disks(mddev_t *mddev, int raid_disks);
2287
2288static ssize_t
2289raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2290{
2291 char *e;
2292 int rv = 0;
2293 unsigned long n = simple_strtoul(buf, &e, 10);
2294
2295 if (!*buf || (*e && *e != '\n'))
2296 return -EINVAL;
2297
2298 if (mddev->pers)
2299 rv = update_raid_disks(mddev, n);
2300 else if (mddev->reshape_position != MaxSector) {
2301 int olddisks = mddev->raid_disks - mddev->delta_disks;
2302 mddev->delta_disks = n - olddisks;
2303 mddev->raid_disks = n;
2304 } else
2305 mddev->raid_disks = n;
2306 return rv ? rv : len;
2307}
2308static struct md_sysfs_entry md_raid_disks =
2309__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2310
2311static ssize_t
2312chunk_size_show(mddev_t *mddev, char *page)
2313{
2314 if (mddev->reshape_position != MaxSector &&
2315 mddev->chunk_size != mddev->new_chunk)
2316 return sprintf(page, "%d (%d)\n", mddev->new_chunk,
2317 mddev->chunk_size);
2318 return sprintf(page, "%d\n", mddev->chunk_size);
2319}
2320
2321static ssize_t
2322chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2323{
2324
2325 char *e;
2326 unsigned long n = simple_strtoul(buf, &e, 10);
2327
2328 if (!*buf || (*e && *e != '\n'))
2329 return -EINVAL;
2330
2331 if (mddev->pers)
2332 return -EBUSY;
2333 else if (mddev->reshape_position != MaxSector)
2334 mddev->new_chunk = n;
2335 else
2336 mddev->chunk_size = n;
2337 return len;
2338}
2339static struct md_sysfs_entry md_chunk_size =
2340__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2341
2342static ssize_t
2343resync_start_show(mddev_t *mddev, char *page)
2344{
2345 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2346}
2347
2348static ssize_t
2349resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2350{
2351
2352 char *e;
2353 unsigned long long n = simple_strtoull(buf, &e, 10);
2354
2355 if (mddev->pers)
2356 return -EBUSY;
2357 if (!*buf || (*e && *e != '\n'))
2358 return -EINVAL;
2359
2360 mddev->recovery_cp = n;
2361 return len;
2362}
2363static struct md_sysfs_entry md_resync_start =
2364__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
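/*
 * Array states, as reported and (partly) settable through "array_state":
 *  clear         no devices attached, no size, no raid_disks;
 *                writing it stops and clears the array completely
 *  inactive      array is stopped; writing it stops an active array
 *                without disassembling it
 *  suspended     listed in the enum but cannot be set here
 *  readonly      assembled read-only (mddev->ro == 1)
 *  read-auto     read-only until the first write arrives (mddev->ro == 2)
 *  clean         active and in_sync; writing it marks a quiescent array clean
 *  active        active with writes outstanding or not in_sync
 *  write-pending, active-idle    reported only; writing them is rejected
 */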
2402enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2403 write_pending, active_idle, bad_word};
2404static char *array_states[] = {
2405 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2406 "write-pending", "active-idle", NULL };
2407
2408static int match_word(const char *word, char **list)
2409{
2410 int n;
2411 for (n=0; list[n]; n++)
2412 if (cmd_match(word, list[n]))
2413 break;
2414 return n;
2415}
2416
2417static ssize_t
2418array_state_show(mddev_t *mddev, char *page)
2419{
2420 enum array_state st = inactive;
2421
2422 if (mddev->pers)
2423 switch(mddev->ro) {
2424 case 1:
2425 st = readonly;
2426 break;
2427 case 2:
2428 st = read_auto;
2429 break;
2430 case 0:
2431 if (mddev->in_sync)
2432 st = clean;
2433 else if (mddev->safemode)
2434 st = active_idle;
2435 else
2436 st = active;
2437 }
2438 else {
2439 if (list_empty(&mddev->disks) &&
2440 mddev->raid_disks == 0 &&
2441 mddev->size == 0)
2442 st = clear;
2443 else
2444 st = inactive;
2445 }
2446 return sprintf(page, "%s\n", array_states[st]);
2447}
2448
2449static int do_md_stop(mddev_t * mddev, int ro);
2450static int do_md_run(mddev_t * mddev);
2451static int restart_array(mddev_t *mddev);
2452
2453static ssize_t
2454array_state_store(mddev_t *mddev, const char *buf, size_t len)
2455{
2456 int err = -EINVAL;
2457 enum array_state st = match_word(buf, array_states);
2458 switch(st) {
2459 case bad_word:
2460 break;
2461 case clear:
2462
2463 if (mddev->pers) {
2464 if (atomic_read(&mddev->active) > 1)
2465 return -EBUSY;
2466 err = do_md_stop(mddev, 0);
2467 }
2468 break;
2469 case inactive:
2470
2471 if (mddev->pers) {
2472 if (atomic_read(&mddev->active) > 1)
2473 return -EBUSY;
2474 err = do_md_stop(mddev, 2);
2475 }
2476 break;
2477 case suspended:
2478 break;
2479 case readonly:
2480 if (mddev->pers)
2481 err = do_md_stop(mddev, 1);
2482 else {
2483 mddev->ro = 1;
2484 err = do_md_run(mddev);
2485 }
2486 break;
2487 case read_auto:
2488
2489 if (mddev->pers) {
2490 err = do_md_stop(mddev, 1);
2491 if (err == 0)
2492 mddev->ro = 2;
2493 } else {
2494 mddev->ro = 2;
2495 err = do_md_run(mddev);
2496 }
2497 break;
2498 case clean:
2499 if (mddev->pers) {
2500 restart_array(mddev);
2501 spin_lock_irq(&mddev->write_lock);
2502 if (atomic_read(&mddev->writes_pending) == 0) {
2503 mddev->in_sync = 1;
2504 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
2505 }
2506 spin_unlock_irq(&mddev->write_lock);
2507 } else {
2508 mddev->ro = 0;
2509 mddev->recovery_cp = MaxSector;
2510 err = do_md_run(mddev);
2511 }
2512 break;
2513 case active:
2514 if (mddev->pers) {
2515 restart_array(mddev);
2516 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2517 wake_up(&mddev->sb_wait);
2518 err = 0;
2519 } else {
2520 mddev->ro = 0;
2521 err = do_md_run(mddev);
2522 }
2523 break;
2524 case write_pending:
2525 case active_idle:
2526
2527 break;
2528 }
2529 if (err)
2530 return err;
2531 else
2532 return len;
2533}
2534static struct md_sysfs_entry md_array_state =
2535__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2536
2537static ssize_t
2538null_show(mddev_t *mddev, char *page)
2539{
2540 return -EINVAL;
2541}
2542
2543static ssize_t
2544new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2545{
2546
2547
2548
2549
2550
2551
2552
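 /* buf holds "major:minor" (optionally newline terminated).  The device is
 * imported and bound to the array; for persistent metadata its superblock
 * is loaded and checked against the first existing member. */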
2553 char *e;
2554 int major = simple_strtoul(buf, &e, 10);
2555 int minor;
2556 dev_t dev;
2557 mdk_rdev_t *rdev;
2558 int err;
2559
2560 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2561 return -EINVAL;
2562 minor = simple_strtoul(e+1, &e, 10);
2563 if (*e && *e != '\n')
2564 return -EINVAL;
2565 dev = MKDEV(major, minor);
2566 if (major != MAJOR(dev) ||
2567 minor != MINOR(dev))
2568 return -EOVERFLOW;
2569
2570
2571 if (mddev->persistent) {
2572 rdev = md_import_device(dev, mddev->major_version,
2573 mddev->minor_version);
2574 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2575 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2576 mdk_rdev_t, same_set);
2577 err = super_types[mddev->major_version]
2578 .load_super(rdev, rdev0, mddev->minor_version);
2579 if (err < 0)
2580 goto out;
2581 }
2582 } else
2583 rdev = md_import_device(dev, -1, -1);
2584
2585 if (IS_ERR(rdev))
2586 return PTR_ERR(rdev);
2587 err = bind_rdev_to_array(rdev, mddev);
2588 out:
2589 if (err)
2590 export_rdev(rdev);
2591 return err ? err : len;
2592}
2593
2594static struct md_sysfs_entry md_new_device =
2595__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
2596
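/*
 * bitmap_set_bits: buf is a whitespace-separated list of chunk numbers or
 * "first-last" ranges; each named chunk is marked dirty in the write-intent
 * bitmap.
 */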
2597static ssize_t
2598bitmap_store(mddev_t *mddev, const char *buf, size_t len)
2599{
2600 char *end;
2601 unsigned long chunk, end_chunk;
2602
2603 if (!mddev->bitmap)
2604 goto out;
2605
2606 while (*buf) {
2607 chunk = end_chunk = simple_strtoul(buf, &end, 0);
2608 if (buf == end) break;
2609 if (*end == '-') {
2610 buf = end + 1;
2611 end_chunk = simple_strtoul(buf, &end, 0);
2612 if (buf == end) break;
2613 }
2614 if (*end && !isspace(*end)) break;
2615 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
2616 buf = end;
2617 while (isspace(*buf)) buf++;
2618 }
2619 bitmap_unplug(mddev->bitmap);
2620out:
2621 return len;
2622}
2623
2624static struct md_sysfs_entry md_bitmap =
2625__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
2626
2627static ssize_t
2628size_show(mddev_t *mddev, char *page)
2629{
2630 return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2631}
2632
2633static int update_size(mddev_t *mddev, unsigned long size);
2634
2635static ssize_t
2636size_store(mddev_t *mddev, const char *buf, size_t len)
2637{
2638
2639
2640
2641
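 /* If the array is active, attempt an on-line resize through the
 * personality.  Otherwise just record the component size; before the
 * array is started it may only be set once or reduced. */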
2642 char *e;
2643 int err = 0;
2644 unsigned long long size = simple_strtoull(buf, &e, 10);
2645 if (!*buf || *buf == '\n' ||
2646 (*e && *e != '\n'))
2647 return -EINVAL;
2648
2649 if (mddev->pers) {
2650 err = update_size(mddev, size);
2651 md_update_sb(mddev, 1);
2652 } else {
2653 if (mddev->size == 0 ||
2654 mddev->size > size)
2655 mddev->size = size;
2656 else
2657 err = -ENOSPC;
2658 }
2659 return err ? err : len;
2660}
2661
2662static struct md_sysfs_entry md_size =
2663__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
2664
2665
2666
2667
2668
2669
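/*
 * Metadata version: "none" for arrays without persistent superblocks,
 * or major.minor for the known in-kernel superblock formats.
 */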
2670static ssize_t
2671metadata_show(mddev_t *mddev, char *page)
2672{
2673 if (mddev->persistent)
2674 return sprintf(page, "%d.%d\n",
2675 mddev->major_version, mddev->minor_version);
2676 else
2677 return sprintf(page, "none\n");
2678}
2679
2680static ssize_t
2681metadata_store(mddev_t *mddev, const char *buf, size_t len)
2682{
2683 int major, minor;
2684 char *e;
2685 if (!list_empty(&mddev->disks))
2686 return -EBUSY;
2687
2688 if (cmd_match(buf, "none")) {
2689 mddev->persistent = 0;
2690 mddev->major_version = 0;
2691 mddev->minor_version = 90;
2692 return len;
2693 }
2694 major = simple_strtoul(buf, &e, 10);
2695 if (e==buf || *e != '.')
2696 return -EINVAL;
2697 buf = e+1;
2698 minor = simple_strtoul(buf, &e, 10);
2699 if (e==buf || (*e && *e != '\n') )
2700 return -EINVAL;
2701 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
2702 return -ENOENT;
2703 mddev->major_version = major;
2704 mddev->minor_version = minor;
2705 mddev->persistent = 1;
2706 return len;
2707}
2708
2709static struct md_sysfs_entry md_metadata =
2710__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2711
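/*
 * sync_action reports the current sync/recovery activity (idle, resync,
 * recover, check, repair or reshape) and accepts the same words to request
 * or abort such an action.
 */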
2712static ssize_t
2713action_show(mddev_t *mddev, char *page)
2714{
2715 char *type = "idle";
2716 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2717 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
2718 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2719 type = "reshape";
2720 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2721 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2722 type = "resync";
2723 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2724 type = "check";
2725 else
2726 type = "repair";
2727 } else
2728 type = "recover";
2729 }
2730 return sprintf(page, "%s\n", type);
2731}
2732
2733static ssize_t
2734action_store(mddev_t *mddev, const char *page, size_t len)
2735{
2736 if (!mddev->pers || !mddev->pers->sync_request)
2737 return -EINVAL;
2738
2739 if (cmd_match(page, "idle")) {
2740 if (mddev->sync_thread) {
2741 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2742 md_unregister_thread(mddev->sync_thread);
2743 mddev->sync_thread = NULL;
2744 mddev->recovery = 0;
2745 }
2746 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2747 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2748 return -EBUSY;
2749 else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2750 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2751 else if (cmd_match(page, "reshape")) {
2752 int err;
2753 if (mddev->pers->start_reshape == NULL)
2754 return -EINVAL;
2755 err = mddev->pers->start_reshape(mddev);
2756 if (err)
2757 return err;
2758 } else {
2759 if (cmd_match(page, "check"))
2760 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2761 else if (!cmd_match(page, "repair"))
2762 return -EINVAL;
2763 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2764 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2765 }
2766 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2767 md_wakeup_thread(mddev->thread);
2768 return len;
2769}
2770
2771static ssize_t
2772mismatch_cnt_show(mddev_t *mddev, char *page)
2773{
2774 return sprintf(page, "%llu\n",
2775 (unsigned long long) mddev->resync_mismatches);
2776}
2777
2778static struct md_sysfs_entry md_scan_mode =
2779__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2780
2781
2782static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
2783
2784static ssize_t
2785sync_min_show(mddev_t *mddev, char *page)
2786{
2787 return sprintf(page, "%d (%s)\n", speed_min(mddev),
2788 mddev->sync_speed_min ? "local": "system");
2789}
2790
2791static ssize_t
2792sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2793{
2794 int min;
2795 char *e;
2796 if (strncmp(buf, "system", 6)==0) {
2797 mddev->sync_speed_min = 0;
2798 return len;
2799 }
2800 min = simple_strtoul(buf, &e, 10);
2801 if (buf == e || (*e && *e != '\n') || min <= 0)
2802 return -EINVAL;
2803 mddev->sync_speed_min = min;
2804 return len;
2805}
2806
2807static struct md_sysfs_entry md_sync_min =
2808__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2809
2810static ssize_t
2811sync_max_show(mddev_t *mddev, char *page)
2812{
2813 return sprintf(page, "%d (%s)\n", speed_max(mddev),
2814 mddev->sync_speed_max ? "local": "system");
2815}
2816
2817static ssize_t
2818sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2819{
2820 int max;
2821 char *e;
2822 if (strncmp(buf, "system", 6)==0) {
2823 mddev->sync_speed_max = 0;
2824 return len;
2825 }
2826 max = simple_strtoul(buf, &e, 10);
2827 if (buf == e || (*e && *e != '\n') || max <= 0)
2828 return -EINVAL;
2829 mddev->sync_speed_max = max;
2830 return len;
2831}
2832
2833static struct md_sysfs_entry md_sync_max =
2834__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
2835
2836static ssize_t
2837degraded_show(mddev_t *mddev, char *page)
2838{
2839 return sprintf(page, "%d\n", mddev->degraded);
2840}
2841static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
2842
2843static ssize_t
2844sync_speed_show(mddev_t *mddev, char *page)
2845{
2846 unsigned long resync, dt, db;
2847 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
2848 dt = ((jiffies - mddev->resync_mark) / HZ);
2849 if (!dt) dt++;
2850 db = resync - (mddev->resync_mark_cnt);
2851 return sprintf(page, "%lu\n", db/dt/2);
2852}
2853
2854static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
2855
2856static ssize_t
2857sync_completed_show(mddev_t *mddev, char *page)
2858{
2859 unsigned long max_blocks, resync;
2860
2861 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2862 max_blocks = mddev->resync_max_sectors;
2863 else
2864 max_blocks = mddev->size << 1;
2865
2866 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2867 return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2868}
2869
2870static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
2871
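/*
 * suspend_lo and suspend_hi delimit a region of the array in which I/O is
 * temporarily held off via the personality's quiesce method.
 */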
2872static ssize_t
2873suspend_lo_show(mddev_t *mddev, char *page)
2874{
2875 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
2876}
2877
2878static ssize_t
2879suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
2880{
2881 char *e;
2882 unsigned long long new = simple_strtoull(buf, &e, 10);
2883
2884 if (mddev->pers->quiesce == NULL)
2885 return -EINVAL;
2886 if (buf == e || (*e && *e != '\n'))
2887 return -EINVAL;
2888 if (new >= mddev->suspend_hi ||
2889 (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
2890 mddev->suspend_lo = new;
2891 mddev->pers->quiesce(mddev, 2);
2892 return len;
2893 } else
2894 return -EINVAL;
2895}
2896static struct md_sysfs_entry md_suspend_lo =
2897__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
2898
2899
2900static ssize_t
2901suspend_hi_show(mddev_t *mddev, char *page)
2902{
2903 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
2904}
2905
2906static ssize_t
2907suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
2908{
2909 char *e;
2910 unsigned long long new = simple_strtoull(buf, &e, 10);
2911
2912 if (mddev->pers->quiesce == NULL)
2913 return -EINVAL;
2914 if (buf == e || (*e && *e != '\n'))
2915 return -EINVAL;
2916 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
2917 (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
2918 mddev->suspend_hi = new;
2919 mddev->pers->quiesce(mddev, 1);
2920 mddev->pers->quiesce(mddev, 0);
2921 return len;
2922 } else
2923 return -EINVAL;
2924}
2925static struct md_sysfs_entry md_suspend_hi =
2926__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2927
2928static ssize_t
2929reshape_position_show(mddev_t *mddev, char *page)
2930{
2931 if (mddev->reshape_position != MaxSector)
2932 return sprintf(page, "%llu\n",
2933 (unsigned long long)mddev->reshape_position);
2934 strcpy(page, "none\n");
2935 return 5;
2936}
2937
2938static ssize_t
2939reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
2940{
2941 char *e;
2942 unsigned long long new = simple_strtoull(buf, &e, 10);
2943 if (mddev->pers)
2944 return -EBUSY;
2945 if (buf == e || (*e && *e != '\n'))
2946 return -EINVAL;
2947 mddev->reshape_position = new;
2948 mddev->delta_disks = 0;
2949 mddev->new_level = mddev->level;
2950 mddev->new_layout = mddev->layout;
2951 mddev->new_chunk = mddev->chunk_size;
2952 return len;
2953}
2954
2955static struct md_sysfs_entry md_reshape_position =
2956__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
2957 reshape_position_store);
2958
2959
2960static struct attribute *md_default_attrs[] = {
2961 &md_level.attr,
2962 &md_layout.attr,
2963 &md_raid_disks.attr,
2964 &md_chunk_size.attr,
2965 &md_size.attr,
2966 &md_resync_start.attr,
2967 &md_metadata.attr,
2968 &md_new_device.attr,
2969 &md_safe_delay.attr,
2970 &md_array_state.attr,
2971 &md_reshape_position.attr,
2972 NULL,
2973};
2974
2975static struct attribute *md_redundancy_attrs[] = {
2976 &md_scan_mode.attr,
2977 &md_mismatches.attr,
2978 &md_sync_min.attr,
2979 &md_sync_max.attr,
2980 &md_sync_speed.attr,
2981 &md_sync_completed.attr,
2982 &md_suspend_lo.attr,
2983 &md_suspend_hi.attr,
2984 &md_bitmap.attr,
2985 &md_degraded.attr,
2986 NULL,
2987};
2988static struct attribute_group md_redundancy_group = {
2989 .name = NULL,
2990 .attrs = md_redundancy_attrs,
2991};
2992
2993
2994static ssize_t
2995md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2996{
2997 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2998 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2999 ssize_t rv;
3000
3001 if (!entry->show)
3002 return -EIO;
3003 rv = mddev_lock(mddev);
3004 if (!rv) {
3005 rv = entry->show(mddev, page);
3006 mddev_unlock(mddev);
3007 }
3008 return rv;
3009}
3010
3011static ssize_t
3012md_attr_store(struct kobject *kobj, struct attribute *attr,
3013 const char *page, size_t length)
3014{
3015 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3016 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3017 ssize_t rv;
3018
3019 if (!entry->store)
3020 return -EIO;
3021 if (!capable(CAP_SYS_ADMIN))
3022 return -EACCES;
3023 rv = mddev_lock(mddev);
3024 if (!rv) {
3025 rv = entry->store(mddev, page, length);
3026 mddev_unlock(mddev);
3027 }
3028 return rv;
3029}
3030
3031static void md_free(struct kobject *ko)
3032{
3033 mddev_t *mddev = container_of(ko, mddev_t, kobj);
3034 kfree(mddev);
3035}
3036
3037static struct sysfs_ops md_sysfs_ops = {
3038 .show = md_attr_show,
3039 .store = md_attr_store,
3040};
3041static struct kobj_type md_ktype = {
3042 .release = md_free,
3043 .sysfs_ops = &md_sysfs_ops,
3044 .default_attrs = md_default_attrs,
3045};
3046
3047int mdp_major = 0;
3048
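/*
 * md_probe creates the gendisk for an md device the first time it is
 * needed and registers the per-array "md" kobject beneath it.
 */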
3049static struct kobject *md_probe(dev_t dev, int *part, void *data)
3050{
3051 static DEFINE_MUTEX(disks_mutex);
3052 mddev_t *mddev = mddev_find(dev);
3053 struct gendisk *disk;
3054 int partitioned = (MAJOR(dev) != MD_MAJOR);
3055 int shift = partitioned ? MdpMinorShift : 0;
3056 int unit = MINOR(dev) >> shift;
3057
3058 if (!mddev)
3059 return NULL;
3060
3061 mutex_lock(&disks_mutex);
3062 if (mddev->gendisk) {
3063 mutex_unlock(&disks_mutex);
3064 mddev_put(mddev);
3065 return NULL;
3066 }
3067 disk = alloc_disk(1 << shift);
3068 if (!disk) {
3069 mutex_unlock(&disks_mutex);
3070 mddev_put(mddev);
3071 return NULL;
3072 }
3073 disk->major = MAJOR(dev);
3074 disk->first_minor = unit << shift;
3075 if (partitioned)
3076 sprintf(disk->disk_name, "md_d%d", unit);
3077 else
3078 sprintf(disk->disk_name, "md%d", unit);
3079 disk->fops = &md_fops;
3080 disk->private_data = mddev;
3081 disk->queue = mddev->queue;
3082 add_disk(disk);
3083 mddev->gendisk = disk;
3084 mutex_unlock(&disks_mutex);
3085 mddev->kobj.parent = &disk->kobj;
3086 kobject_set_name(&mddev->kobj, "%s", "md");
3087 mddev->kobj.ktype = &md_ktype;
3088 if (kobject_register(&mddev->kobj))
3089 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3090 disk->disk_name);
3091 return NULL;
3092}
3093
3094static void md_safemode_timeout(unsigned long data)
3095{
3096 mddev_t *mddev = (mddev_t *) data;
3097
3098 mddev->safemode = 1;
3099 md_wakeup_thread(mddev->thread);
3100}
3101
3102static int start_dirty_degraded;
3103
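/*
 * do_md_run: validate the component devices, load the requested
 * personality and start the array.
 */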
3104static int do_md_run(mddev_t * mddev)
3105{
3106 int err;
3107 int chunk_size;
3108 struct list_head *tmp;
3109 mdk_rdev_t *rdev;
3110 struct gendisk *disk;
3111 struct mdk_personality *pers;
3112 char b[BDEVNAME_SIZE];
3113
3114 if (list_empty(&mddev->disks))
3116 return -EINVAL;
3117
3118 if (mddev->pers)
3119 return -EBUSY;
3120
3121
3122
3123
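 /* Work out the array geometry from the superblocks if it has not
 * been configured yet. */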
3124 if (!mddev->raid_disks)
3125 analyze_sbs(mddev);
3126
3127 chunk_size = mddev->chunk_size;
3128
3129 if (chunk_size) {
3130 if (chunk_size > MAX_CHUNK_SIZE) {
3131 printk(KERN_ERR "too big chunk_size: %d > %d\n",
3132 chunk_size, MAX_CHUNK_SIZE);
3133 return -EINVAL;
3134 }
3135
3136
3137
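 /* the chunk size must be a power of two */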
3138 if ( (1 << ffz(~chunk_size)) != chunk_size) {
3139 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
3140 return -EINVAL;
3141 }
3142 if (chunk_size < PAGE_SIZE) {
3143 printk(KERN_ERR "too small chunk_size: %d < %ld\n",
3144 chunk_size, PAGE_SIZE);
3145 return -EINVAL;
3146 }
3147
3148
3149 ITERATE_RDEV(mddev,rdev,tmp) {
3150 if (test_bit(Faulty, &rdev->flags))
3151 continue;
3152 if (rdev->size < chunk_size / 1024) {
3153 printk(KERN_WARNING
3154 "md: Dev %s smaller than chunk_size:"
3155 " %lluk < %dk\n",
3156 bdevname(rdev->bdev,b),
3157 (unsigned long long)rdev->size,
3158 chunk_size / 1024);
3159 return -EINVAL;
3160 }
3161 }
3162 }
3163
3164#ifdef CONFIG_KMOD
3165 if (mddev->level != LEVEL_NONE)
3166 request_module("md-level-%d", mddev->level);
3167 else if (mddev->clevel[0])
3168 request_module("md-%s", mddev->clevel);
3169#endif
3170
3171
3172
3173
3174
3175
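 /* Flush and drop any buffered data on the component devices, and check
 * that the data area does not overlap the superblock on any of them. */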
3176 ITERATE_RDEV(mddev,rdev,tmp) {
3177 if (test_bit(Faulty, &rdev->flags))
3178 continue;
3179 sync_blockdev(rdev->bdev);
3180 invalidate_bdev(rdev->bdev);
3181
3182
3183
3184
3185
3186 if (rdev->data_offset < rdev->sb_offset) {
3187 if (mddev->size &&
3188 rdev->data_offset + mddev->size*2
3189 > rdev->sb_offset*2) {
3190 printk("md: %s: data overlaps metadata\n",
3191 mdname(mddev));
3192 return -EINVAL;
3193 }
3194 } else {
3195 if (rdev->sb_offset*2 + rdev->sb_size/512
3196 > rdev->data_offset) {
3197 printk("md: %s: metadata overlaps data\n",
3198 mdname(mddev));
3199 return -EINVAL;
3200 }
3201 }
3202 }
3203
3204 md_probe(mddev->unit, NULL, NULL);
3205 disk = mddev->gendisk;
3206 if (!disk)
3207 return -ENOMEM;
3208
3209 spin_lock(&pers_lock);
3210 pers = find_pers(mddev->level, mddev->clevel);
3211 if (!pers || !try_module_get(pers->owner)) {
3212 spin_unlock(&pers_lock);
3213 if (mddev->level != LEVEL_NONE)
3214 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3215 mddev->level);
3216 else
3217 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3218 mddev->clevel);
3219 return -EINVAL;
3220 }
3221 mddev->pers = pers;
3222 spin_unlock(&pers_lock);
3223 mddev->level = pers->level;
3224 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3225
3226 if (mddev->reshape_position != MaxSector &&
3227 pers->start_reshape == NULL) {
3228
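 /* This personality cannot handle the pending reshape */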
3229 mddev->pers = NULL;
3230 module_put(pers->owner);
3231 return -EINVAL;
3232 }
3233
3234 if (pers->sync_request) {
3235
3236
3237
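 /* Warn if two components share the same underlying physical disk;
 * protection against single-disk failure would be compromised. */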
3238 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3239 mdk_rdev_t *rdev2;
3240 struct list_head *tmp2;
3241 int warned = 0;
3242 ITERATE_RDEV(mddev, rdev, tmp) {
3243 ITERATE_RDEV(mddev, rdev2, tmp2) {
3244 if (rdev < rdev2 &&
3245 rdev->bdev->bd_contains ==
3246 rdev2->bdev->bd_contains) {
3247 printk(KERN_WARNING
3248 "%s: WARNING: %s appears to be"
3249 " on the same physical disk as"
3250 " %s.\n",
3251 mdname(mddev),
3252 bdevname(rdev->bdev,b),
3253 bdevname(rdev2->bdev,b2));
3254 warned = 1;
3255 }
3256 }
3257 }
3258 if (warned)
3259 printk(KERN_WARNING
3260 "True protection against single-disk"
3261 " failure might be compromised.\n");
3262 }
3263
3264 mddev->recovery = 0;
3265 mddev->resync_max_sectors = mddev->size << 1;
3266 mddev->barriers_work = 1;
3267 mddev->ok_start_degraded = start_dirty_degraded;
3268
3269 if (start_readonly)
3270 mddev->ro = 2;
3271
3272 err = mddev->pers->run(mddev);
3273 if (!err && mddev->pers->sync_request) {
3274 err = bitmap_create(mddev);
3275 if (err) {
3276 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3277 mdname(mddev), err);
3278 mddev->pers->stop(mddev);
3279 }
3280 }
3281 if (err) {
3282 printk(KERN_ERR "md: pers->run() failed ...\n");
3283 module_put(mddev->pers->owner);
3284 mddev->pers = NULL;
3285 bitmap_destroy(mddev);
3286 return err;
3287 }
3288 if (mddev->pers->sync_request) {
3289 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3290 printk(KERN_WARNING
3291 "md: cannot register extra attributes for %s\n",
3292 mdname(mddev));
3293 } else if (mddev->ro == 2)
3294 mddev->ro = 0;
3295
3296 atomic_set(&mddev->writes_pending,0);
3297 mddev->safemode = 0;
3298 mddev->safemode_timer.function = md_safemode_timeout;
3299 mddev->safemode_timer.data = (unsigned long) mddev;
3300 mddev->safemode_delay = (200 * HZ)/1000 +1;
3301 mddev->in_sync = 1;
3302
3303 ITERATE_RDEV(mddev,rdev,tmp)
3304 if (rdev->raid_disk >= 0) {
3305 char nm[20];
3306 sprintf(nm, "rd%d", rdev->raid_disk);
3307 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
3308 printk("md: cannot register %s for %s\n",
3309 nm, mdname(mddev));
3310 }
3311
3312 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3313
3314 if (mddev->flags)
3315 md_update_sb(mddev, 0);
3316
3317 set_capacity(disk, mddev->array_size<<1);
3318
3319
3320
3321
3322
3323
3324
3325
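 /* Route bios for this array through the personality's make_request
 * routine. */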
3326 mddev->queue->queuedata = mddev;
3327 mddev->queue->make_request_fn = mddev->pers->make_request;
3328
3329
3330
3331
3332
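 /* If the array is degraded but spare devices are already assigned,
 * start a recovery thread right away. */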
3333 if (mddev->degraded && !mddev->sync_thread) {
3334 struct list_head *rtmp;
3335 int spares = 0;
3336 ITERATE_RDEV(mddev,rdev,rtmp)
3337 if (rdev->raid_disk >= 0 &&
3338 !test_bit(In_sync, &rdev->flags) &&
3339 !test_bit(Faulty, &rdev->flags))
3341 spares++;
3342 if (spares && mddev->pers->sync_request) {
3343 mddev->recovery = 0;
3344 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3345 mddev->sync_thread = md_register_thread(md_do_sync,
3346 mddev,
3347 "%s_resync");
3348 if (!mddev->sync_thread) {
3349 printk(KERN_ERR "%s: could not start resync"
3350 " thread...\n",
3351 mdname(mddev));
3352
3353 mddev->recovery = 0;
3354 }
3355 }
3356 }
3357 md_wakeup_thread(mddev->thread);
3358 md_wakeup_thread(mddev->sync_thread);
3359
3360 mddev->changed = 1;
3361 md_new_event(mddev);
3362 kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
3363 return 0;
3364}
3365
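/*
 * restart_array: switch an assembled read-only array back to read-write
 * and kick the recovery thread.
 */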
3366static int restart_array(mddev_t *mddev)
3367{
3368 struct gendisk *disk = mddev->gendisk;
3369 int err;
3370
3371
3372
3373
3374 err = -ENXIO;
3375 if (list_empty(&mddev->disks))
3376 goto out;
3377
3378 if (mddev->pers) {
3379 err = -EBUSY;
3380 if (!mddev->ro)
3381 goto out;
3382
3383 mddev->safemode = 0;
3384 mddev->ro = 0;
3385 set_disk_ro(disk, 0);
3386
3387 printk(KERN_INFO "md: %s switched to read-write mode.\n",
3388 mdname(mddev));
3389
3390
3391
3392 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3393 md_wakeup_thread(mddev->thread);
3394 md_wakeup_thread(mddev->sync_thread);
3395 err = 0;
3396 } else
3397 err = -EINVAL;
3398
3399out:
3400 return err;
3401}
3402
3403
3404
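/*
 * Deny (and later restore) write access to the bitmap file so that nothing
 * else can modify it while md is using it.
 */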
3405static int deny_bitmap_write_access(struct file * file)
3406{
3407 struct inode *inode = file->f_mapping->host;
3408
3409 spin_lock(&inode->i_lock);
3410 if (atomic_read(&inode->i_writecount) > 1) {
3411 spin_unlock(&inode->i_lock);
3412 return -ETXTBSY;
3413 }
3414 atomic_set(&inode->i_writecount, -1);
3415 spin_unlock(&inode->i_lock);
3416
3417 return 0;
3418}
3419
3420static void restore_bitmap_write_access(struct file *file)
3421{
3422 struct inode *inode = file->f_mapping->host;
3423
3424 spin_lock(&inode->i_lock);
3425 atomic_set(&inode->i_writecount, 1);
3426 spin_unlock(&inode->i_lock);
3427}
3428
3429
3430
3431
3432
3433
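/*
 * do_md_stop: mode 0 - fully stop and disassemble the array;
 *             mode 1 - switch it to read-only;
 *             mode 2 - stop the array but keep it assembled.
 */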
3434static int do_md_stop(mddev_t * mddev, int mode)
3435{
3436 int err = 0;
3437 struct gendisk *disk = mddev->gendisk;
3438
3439 if (mddev->pers) {
3440 if (atomic_read(&mddev->active)>2) {
3441 printk("md: %s still in use.\n",mdname(mddev));
3442 return -EBUSY;
3443 }
3444
3445 if (mddev->sync_thread) {
3446 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3447 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3448 md_unregister_thread(mddev->sync_thread);
3449 mddev->sync_thread = NULL;
3450 }
3451
3452 del_timer_sync(&mddev->safemode_timer);
3453
3454 invalidate_partition(disk, 0);
3455
3456 switch(mode) {
3457 case 1:
3458 err = -ENXIO;
3459 if (mddev->ro==1)
3460 goto out;
3461 mddev->ro = 1;
3462 break;
3463 case 0:
3464 case 2:
3465 bitmap_flush(mddev);
3466 md_super_wait(mddev);
3467 if (mddev->ro)
3468 set_disk_ro(disk, 0);
3469 blk_queue_make_request(mddev->queue, md_fail_request);
3470 mddev->pers->stop(mddev);
3471 mddev->queue->merge_bvec_fn = NULL;
3472 mddev->queue->unplug_fn = NULL;
3473 mddev->queue->backing_dev_info.congested_fn = NULL;
3474 if (mddev->pers->sync_request)
3475 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3476
3477 module_put(mddev->pers->owner);
3478 mddev->pers = NULL;
3479
3480 set_capacity(disk, 0);
3481 mddev->changed = 1;
3482
3483 if (mddev->ro)
3484 mddev->ro = 0;
3485 }
3486 if (!mddev->in_sync || mddev->flags) {
3487
3488 mddev->in_sync = 1;
3489 md_update_sb(mddev, 1);
3490 }
3491 if (mode == 1)
3492 set_disk_ro(disk, 1);
3493 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3494 }
3495
3496
3497
3498
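 /* A full stop also disassembles the array: release the bitmap, drop the
 * per-device sysfs links and export all component devices. */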
3499 if (mode == 0) {
3500 mdk_rdev_t *rdev;
3501 struct list_head *tmp;
3502
3503 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3504
3505 bitmap_destroy(mddev);
3506 if (mddev->bitmap_file) {
3507 restore_bitmap_write_access(mddev->bitmap_file);
3508 fput(mddev->bitmap_file);
3509 mddev->bitmap_file = NULL;
3510 }
3511 mddev->bitmap_offset = 0;
3512
3513 ITERATE_RDEV(mddev,rdev,tmp)
3514 if (rdev->raid_disk >= 0) {
3515 char nm[20];
3516 sprintf(nm, "rd%d", rdev->raid_disk);
3517 sysfs_remove_link(&mddev->kobj, nm);
3518 }
3519
3520
3521 flush_scheduled_work();
3522
3523 export_array(mddev);
3524
3525 mddev->array_size = 0;
3526 mddev->size = 0;
3527 mddev->raid_disks = 0;
3528 mddev->recovery_cp = 0;
3529 mddev->reshape_position = MaxSector;
3530
3531 } else if (mddev->pers)
3532 printk(KERN_INFO "md: %s switched to read-only mode.\n",
3533 mdname(mddev));
3534 err = 0;
3535 md_new_event(mddev);
3536out:
3537 return err;
3538}
3539
3540#ifndef MODULE
3541static void autorun_array(mddev_t *mddev)
3542{
3543 mdk_rdev_t *rdev;
3544 struct list_head *tmp;
3545 int err;
3546
3547 if (list_empty(&mddev->disks))
3548 return;
3549
3550 printk(KERN_INFO "md: running: ");
3551
3552 ITERATE_RDEV(mddev,rdev,tmp) {
3553 char b[BDEVNAME_SIZE];
3554 printk("<%s>", bdevname(rdev->bdev,b));
3555 }
3556 printk("\n");
3557
3558 err = do_md_run (mddev);
3559 if (err) {
3560 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3561 do_md_stop (mddev, 0);
3562 }
3563}
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
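/*
 * autorun_devices: for each device still on the pending list, gather every
 * other pending device whose 0.90 superblock says it belongs to the same
 * array, create the corresponding md device and try to run it.
 */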
3577static void autorun_devices(int part)
3578{
3579 struct list_head *tmp;
3580 mdk_rdev_t *rdev0, *rdev;
3581 mddev_t *mddev;
3582 char b[BDEVNAME_SIZE];
3583
3584 printk(KERN_INFO "md: autorun ...\n");
3585 while (!list_empty(&pending_raid_disks)) {
3586 int unit;
3587 dev_t dev;
3588 LIST_HEAD(candidates);
3589 rdev0 = list_entry(pending_raid_disks.next,
3590 mdk_rdev_t, same_set);
3591
3592 printk(KERN_INFO "md: considering %s ...\n",
3593 bdevname(rdev0->bdev,b));
3594 INIT_LIST_HEAD(&candidates);
3595 ITERATE_RDEV_PENDING(rdev,tmp)
3596 if (super_90_load(rdev, rdev0, 0) >= 0) {
3597 printk(KERN_INFO "md: adding %s ...\n",
3598 bdevname(rdev->bdev,b));
3599 list_move(&rdev->same_set, &candidates);
3600 }
3601
3602
3603
3604
3605
3606 if (part) {
3607 dev = MKDEV(mdp_major,
3608 rdev0->preferred_minor << MdpMinorShift);
3609 unit = MINOR(dev) >> MdpMinorShift;
3610 } else {
3611 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
3612 unit = MINOR(dev);
3613 }
3614 if (rdev0->preferred_minor != unit) {
3615 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
3616 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
3617 break;
3618 }
3619
3620 md_probe(dev, NULL, NULL);
3621 mddev = mddev_find(dev);
3622 if (!mddev) {
3623 printk(KERN_ERR
3624 "md: cannot allocate memory for md drive.\n");
3625 break;
3626 }
3627 if (mddev_lock(mddev))
3628 printk(KERN_WARNING "md: %s locked, cannot run\n",
3629 mdname(mddev));
3630 else if (mddev->raid_disks || mddev->major_version
3631 || !list_empty(&mddev->disks)) {
3632 printk(KERN_WARNING
3633 "md: %s already running, cannot run %s\n",
3634 mdname(mddev), bdevname(rdev0->bdev,b));
3635 mddev_unlock(mddev);
3636 } else {
3637 printk(KERN_INFO "md: created %s\n", mdname(mddev));
3638 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
3639 list_del_init(&rdev->same_set);
3640 if (bind_rdev_to_array(rdev, mddev))
3641 export_rdev(rdev);
3642 }
3643 autorun_array(mddev);
3644 mddev_unlock(mddev);
3645 }
3646
3647
3648
3649 ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
3650 export_rdev(rdev);
3651 mddev_put(mddev);
3652 }
3653 printk(KERN_INFO "md: ... autorun DONE.\n");
3654}
3655#endif
3656
3657static int get_version(void __user * arg)
3658{
3659 mdu_version_t ver;
3660
3661 ver.major = MD_MAJOR_VERSION;
3662 ver.minor = MD_MINOR_VERSION;
3663 ver.patchlevel = MD_PATCHLEVEL_VERSION;
3664
3665 if (copy_to_user(arg, &ver, sizeof(ver)))
3666 return -EFAULT;
3667
3668 return 0;
3669}
3670
3671static int get_array_info(mddev_t * mddev, void __user * arg)
3672{
3673 mdu_array_info_t info;
3674 int nr,working,active,failed,spare;
3675 mdk_rdev_t *rdev;
3676 struct list_head *tmp;
3677
3678 nr=working=active=failed=spare=0;
3679 ITERATE_RDEV(mddev,rdev,tmp) {
3680 nr++;
3681 if (test_bit(Faulty, &rdev->flags))
3682 failed++;
3683 else {
3684 working++;
3685 if (test_bit(In_sync, &rdev->flags))
3686 active++;
3687 else
3688 spare++;
3689 }
3690 }
3691
3692 info.major_version = mddev->major_version;
3693 info.minor_version = mddev->minor_version;
3694 info.patch_version = MD_PATCHLEVEL_VERSION;
3695 info.ctime = mddev->ctime;
3696 info.level = mddev->level;
3697 info.size = mddev->size;
3698 if (info.size != mddev->size)
3699 info.size = -1;
3700 info.nr_disks = nr;
3701 info.raid_disks = mddev->raid_disks;
3702 info.md_minor = mddev->md_minor;
3703 info.not_persistent= !mddev->persistent;
3704
3705 info.utime = mddev->utime;
3706 info.state = 0;
3707 if (mddev->in_sync)
3708 info.state = (1<<MD_SB_CLEAN);
3709 if (mddev->bitmap && mddev->bitmap_offset)
3710 info.state |= (1<<MD_SB_BITMAP_PRESENT);
3711 info.active_disks = active;
3712 info.working_disks = working;
3713 info.failed_disks = failed;
3714 info.spare_disks = spare;
3715
3716 info.layout = mddev->layout;
3717 info.chunk_size = mddev->chunk_size;
3718
3719 if (copy_to_user(arg, &info, sizeof(info)))
3720 return -EFAULT;
3721
3722 return 0;
3723}
3724
3725static int get_bitmap_file(mddev_t * mddev, void __user * arg)
3726{
3727 mdu_bitmap_file_t *file = NULL;
3728 char *ptr, *buf = NULL;
3729 int err = -ENOMEM;
3730
3731 md_allow_write(mddev);
3732
3733 file = kmalloc(sizeof(*file), GFP_KERNEL);
3734 if (!file)
3735 goto out;
3736
3737
3738 if (!mddev->bitmap || !mddev->bitmap->file) {
3739 file->pathname[0] = '\0';
3740 goto copy_out;
3741 }
3742
3743 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
3744 if (!buf)
3745 goto out;
3746
3747 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
3748 if (!ptr)
3749 goto out;
3750
3751 strcpy(file->pathname, ptr);
3752
3753copy_out:
3754 err = 0;
3755 if (copy_to_user(arg, file, sizeof(*file)))
3756 err = -EFAULT;
3757out:
3758 kfree(buf);
3759 kfree(file);
3760 return err;
3761}
3762
3763static int get_disk_info(mddev_t * mddev, void __user * arg)
3764{
3765 mdu_disk_info_t info;
3766 unsigned int nr;
3767 mdk_rdev_t *rdev;
3768
3769 if (copy_from_user(&info, arg, sizeof(info)))
3770 return -EFAULT;
3771
3772 nr = info.number;
3773
3774 rdev = find_rdev_nr(mddev, nr);
3775 if (rdev) {
3776 info.major = MAJOR(rdev->bdev->bd_dev);
3777 info.minor = MINOR(rdev->bdev->bd_dev);
3778 info.raid_disk = rdev->raid_disk;
3779 info.state = 0;
3780 if (test_bit(Faulty, &rdev->flags))
3781 info.state |= (1<<MD_DISK_FAULTY);
3782 else if (test_bit(In_sync, &rdev->flags)) {
3783 info.state |= (1<<MD_DISK_ACTIVE);
3784 info.state |= (1<<MD_DISK_SYNC);
3785 }
3786 if (test_bit(WriteMostly, &rdev->flags))
3787 info.state |= (1<<MD_DISK_WRITEMOSTLY);
3788 } else {
3789 info.major = info.minor = 0;
3790 info.raid_disk = -1;
3791 info.state = (1<<MD_DISK_REMOVED);
3792 }
3793
3794 if (copy_to_user(arg, &info, sizeof(info)))
3795 return -EFAULT;
3796
3797 return 0;
3798}
3799
3800static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3801{
3802 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3803 mdk_rdev_t *rdev;
3804 dev_t dev = MKDEV(info->major,info->minor);
3805
3806 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
3807 return -EOVERFLOW;
3808
3809 if (!mddev->raid_disks) {
3810 int err;
3811
3812 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
3813 if (IS_ERR(rdev)) {
3814 printk(KERN_WARNING
3815 "md: md_import_device returned %ld\n",
3816 PTR_ERR(rdev));
3817 return PTR_ERR(rdev);
3818 }
3819 if (!list_empty(&mddev->disks)) {
3820 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3821 mdk_rdev_t, same_set);
3822 int err = super_types[mddev->major_version]
3823 .load_super(rdev, rdev0, mddev->minor_version);
3824 if (err < 0) {
3825 printk(KERN_WARNING
3826 "md: %s has different UUID to %s\n",
3827 bdevname(rdev->bdev,b),
3828 bdevname(rdev0->bdev,b2));
3829 export_rdev(rdev);
3830 return -EINVAL;
3831 }
3832 }
3833 err = bind_rdev_to_array(rdev, mddev);
3834 if (err)
3835 export_rdev(rdev);
3836 return err;
3837 }
3838
3839
3840
3841
3842
3843
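 /* The array is already running: hot-add the device.  For persistent
 * metadata its superblock is loaded and validated first. */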
3844 if (mddev->pers) {
3845 int err;
3846 if (!mddev->pers->hot_add_disk) {
3847 printk(KERN_WARNING
3848 "%s: personality does not support diskops!\n",
3849 mdname(mddev));
3850 return -EINVAL;
3851 }
3852 if (mddev->persistent)
3853 rdev = md_import_device(dev, mddev->major_version,
3854 mddev->minor_version);
3855 else
3856 rdev = md_import_device(dev, -1, -1);
3857 if (IS_ERR(rdev)) {
3858 printk(KERN_WARNING
3859 "md: md_import_device returned %ld\n",
3860 PTR_ERR(rdev));
3861 return PTR_ERR(rdev);
3862 }
3863
3864 if (!mddev->persistent) {
3865 if (info->state & (1<<MD_DISK_SYNC) &&
3866 info->raid_disk < mddev->raid_disks)
3867 rdev->raid_disk = info->raid_disk;
3868 else
3869 rdev->raid_disk = -1;
3870 } else
3871 super_types[mddev->major_version].
3872 validate_super(mddev, rdev);
3873 rdev->saved_raid_disk = rdev->raid_disk;
3874
3875 clear_bit(In_sync, &rdev->flags);
3876 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3877 set_bit(WriteMostly, &rdev->flags);
3878
3879 rdev->raid_disk = -1;
3880 err = bind_rdev_to_array(rdev, mddev);
3881 if (!err && !mddev->pers->hot_remove_disk) {
3882
3883
3884
3885
3886 super_types[mddev->major_version].
3887 validate_super(mddev, rdev);
3888 err = mddev->pers->hot_add_disk(mddev, rdev);
3889 if (err)
3890 unbind_rdev_from_array(rdev);
3891 }
3892 if (err)
3893 export_rdev(rdev);
3894
3895 md_update_sb(mddev, 1);
3896 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3897 md_wakeup_thread(mddev->thread);
3898 return err;
3899 }
3900
3901
3902
3903
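 /* The array has been configured but is not running: building it up this
 * way is only supported for 0.90 superblocks. */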
3904 if (mddev->major_version != 0) {
3905 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
3906 mdname(mddev));
3907 return -EINVAL;
3908 }
3909
3910 if (!(info->state & (1<<MD_DISK_FAULTY))) {
3911 int err;
3912 rdev = md_import_device (dev, -1, 0);
3913 if (IS_ERR(rdev)) {
3914 printk(KERN_WARNING
3915 "md: error, md_import_device() returned %ld\n",
3916 PTR_ERR(rdev));
3917 return PTR_ERR(rdev);
3918 }
3919 rdev->desc_nr = info->number;
3920 if (info->raid_disk < mddev->raid_disks)
3921 rdev->raid_disk = info->raid_disk;
3922 else
3923 rdev->raid_disk = -1;
3924
3925 rdev->flags = 0;
3926
3927 if (rdev->raid_disk < mddev->raid_disks)
3928 if (info->state & (1<<MD_DISK_SYNC))
3929 set_bit(In_sync, &rdev->flags);
3930
3931 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3932 set_bit(WriteMostly, &rdev->flags);
3933
3934 if (!mddev->persistent) {
3935 printk(KERN_INFO "md: nonpersistent superblock ...\n");
3936 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3937 } else
3938 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3939 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
3940
3941 err = bind_rdev_to_array(rdev, mddev);
3942 if (err) {
3943 export_rdev(rdev);
3944 return err;
3945 }
3946 }
3947
3948 return 0;
3949}
3950
3951static int hot_remove_disk(mddev_t * mddev, dev_t dev)
3952{
3953 char b[BDEVNAME_SIZE];
3954 mdk_rdev_t *rdev;
3955
3956 if (!mddev->pers)
3957 return -ENODEV;
3958
3959 rdev = find_rdev(mddev, dev);
3960 if (!rdev)
3961 return -ENXIO;
3962
3963 if (rdev->raid_disk >= 0)
3964 goto busy;
3965
3966 kick_rdev_from_array(rdev);
3967 md_update_sb(mddev, 1);
3968 md_new_event(mddev);
3969
3970 return 0;
3971busy:
3972 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
3973 bdevname(rdev->bdev,b), mdname(mddev));
3974 return -EBUSY;
3975}
3976
3977static int hot_add_disk(mddev_t * mddev, dev_t dev)
3978{
3979 char b[BDEVNAME_SIZE];
3980 int err;
3981 unsigned int size;
3982 mdk_rdev_t *rdev;
3983
3984 if (!mddev->pers)
3985 return -ENODEV;
3986
3987 if (mddev->major_version != 0) {
3988 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
3989 " version-0 superblocks.\n",
3990 mdname(mddev));
3991 return -EINVAL;
3992 }
3993 if (!mddev->pers->hot_add_disk) {
3994 printk(KERN_WARNING
3995 "%s: personality does not support diskops!\n",
3996 mdname(mddev));
3997 return -EINVAL;
3998 }
3999
4000 rdev = md_import_device (dev, -1, 0);
4001 if (IS_ERR(rdev)) {
4002 printk(KERN_WARNING
4003 "md: error, md_import_device() returned %ld\n",
4004 PTR_ERR(rdev));
4005 return -EINVAL;
4006 }
4007
4008 if (mddev->persistent)
4009 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
4010 else
4011 rdev->sb_offset =
4012 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4013
4014 size = calc_dev_size(rdev, mddev->chunk_size);
4015 rdev->size = size;
4016
4017 if (test_bit(Faulty, &rdev->flags)) {
4018 printk(KERN_WARNING
4019 "md: can not hot-add faulty %s disk to %s!\n",
4020 bdevname(rdev->bdev,b), mdname(mddev));
4021 err = -EINVAL;
4022 goto abort_export;
4023 }
4024 clear_bit(In_sync, &rdev->flags);
4025 rdev->desc_nr = -1;
4026 rdev->saved_raid_disk = -1;
4027 err = bind_rdev_to_array(rdev, mddev);
4028 if (err)
4029 goto abort_export;
4030
4031
4032
4033
4034
4035
4036 if (rdev->desc_nr == mddev->max_disks) {
4037 printk(KERN_WARNING "%s: can not hot-add to full array!\n",
4038 mdname(mddev));
4039 err = -EBUSY;
4040 goto abort_unbind_export;
4041 }
4042
4043 rdev->raid_disk = -1;
4044
4045 md_update_sb(mddev, 1);
4046
4047
4048
4049
4050
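 /* Kick the recovery thread: the new spare may need to be rebuilt into
 * the array immediately. */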
4051 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4052 md_wakeup_thread(mddev->thread);
4053 md_new_event(mddev);
4054 return 0;
4055
4056abort_unbind_export:
4057 unbind_rdev_from_array(rdev);
4058
4059abort_export:
4060 export_rdev(rdev);
4061 return err;
4062}
4063
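/*
 * set_bitmap_file: fd >= 0 attaches the given file as a write-intent bitmap,
 * fd < 0 removes the current file-backed bitmap.
 */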
4064static int set_bitmap_file(mddev_t *mddev, int fd)
4065{
4066 int err;
4067
4068 if (mddev->pers) {
4069 if (!mddev->pers->quiesce)
4070 return -EBUSY;
4071 if (mddev->recovery || mddev->sync_thread)
4072 return -EBUSY;
4073
4074 }
4075
4076
4077 if (fd >= 0) {
4078 if (mddev->bitmap)
4079 return -EEXIST;
4080 mddev->bitmap_file = fget(fd);
4081
4082 if (mddev->bitmap_file == NULL) {
4083 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4084 mdname(mddev));
4085 return -EBADF;
4086 }
4087
4088 err = deny_bitmap_write_access(mddev->bitmap_file);
4089 if (err) {
4090 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4091 mdname(mddev));
4092 fput(mddev->bitmap_file);
4093 mddev->bitmap_file = NULL;
4094 return err;
4095 }
4096 mddev->bitmap_offset = 0;
4097 } else if (mddev->bitmap == NULL)
4098 return -ENOENT;
4099 err = 0;
4100 if (mddev->pers) {
4101 mddev->pers->quiesce(mddev, 1);
4102 if (fd >= 0)
4103 err = bitmap_create(mddev);
4104 if (fd < 0 || err) {
4105 bitmap_destroy(mddev);
4106 fd = -1;
4107 }
4108 mddev->pers->quiesce(mddev, 0);
4109 }
4110 if (fd < 0) {
4111 if (mddev->bitmap_file) {
4112 restore_bitmap_write_access(mddev->bitmap_file);
4113 fput(mddev->bitmap_file);
4114 }
4115 mddev->bitmap_file = NULL;
4116 }
4117
4118 return err;
4119}
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
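/*
 * set_array_info is used in two ways.  With raid_disks == 0 it only records
 * which superblock version to use while the array is assembled via
 * ADD_NEW_DISK.  Otherwise it creates a new array: level, size, raid_disks,
 * layout and chunk_size describe its shape and 0.90 metadata is assumed.
 */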
4134static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4135{
4136
4137 if (info->raid_disks == 0) {
4138
4139 if (info->major_version < 0 ||
4140 info->major_version >= ARRAY_SIZE(super_types) ||
4141 super_types[info->major_version].name == NULL) {
4142
4143 printk(KERN_INFO
4144 "md: superblock version %d not known\n",
4145 info->major_version);
4146 return -EINVAL;
4147 }
4148 mddev->major_version = info->major_version;
4149 mddev->minor_version = info->minor_version;
4150 mddev->patch_version = info->patch_version;
4151 mddev->persistent = !info->not_persistent;
4152 return 0;
4153 }
4154 mddev->major_version = MD_MAJOR_VERSION;
4155 mddev->minor_version = MD_MINOR_VERSION;
4156 mddev->patch_version = MD_PATCHLEVEL_VERSION;
4157 mddev->ctime = get_seconds();
4158
4159 mddev->level = info->level;
4160 mddev->clevel[0] = 0;
4161 mddev->size = info->size;
4162 mddev->raid_disks = info->raid_disks;
4163
4164
4165
4166 if (info->state & (1<<MD_SB_CLEAN))
4167 mddev->recovery_cp = MaxSector;
4168 else
4169 mddev->recovery_cp = 0;
4170 mddev->persistent = ! info->not_persistent;
4171
4172 mddev->layout = info->layout;
4173 mddev->chunk_size = info->chunk_size;
4174
4175 mddev->max_disks = MD_SB_DISKS;
4176
4177 mddev->flags = 0;
4178 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4179
4180 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4181 mddev->bitmap_offset = 0;
4182
4183 mddev->reshape_position = MaxSector;
4184
4185
4186
4187
4188 get_random_bytes(mddev->uuid, 16);
4189
4190 mddev->new_level = mddev->level;
4191 mddev->new_chunk = mddev->chunk_size;
4192 mddev->new_layout = mddev->layout;
4193 mddev->delta_disks = 0;
4194
4195 return 0;
4196}
4197
4198static int update_size(mddev_t *mddev, unsigned long size)
4199{
4200 mdk_rdev_t * rdev;
4201 int rv;
4202 struct list_head *tmp;
4203 int fit = (size == 0);
4204
4205 if (mddev->pers->resize == NULL)
4206 return -EINVAL;
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
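 /* Resizing is refused while a resync/reshape thread is running.  Every
 * component must be large enough for the new size; size == 0 means "use
 * the largest size that fits on all devices". */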
4217 if (mddev->sync_thread)
4218 return -EBUSY;
4219 ITERATE_RDEV(mddev,rdev,tmp) {
4220 sector_t avail;
4221 avail = rdev->size * 2;
4222
4223 if (fit && (size == 0 || size > avail/2))
4224 size = avail/2;
4225 if (avail < ((sector_t)size << 1))
4226 return -ENOSPC;
4227 }
4228 rv = mddev->pers->resize(mddev, (sector_t)size *2);
4229 if (!rv) {
4230 struct block_device *bdev;
4231
4232 bdev = bdget_disk(mddev->gendisk, 0);
4233 if (bdev) {
4234 mutex_lock(&bdev->bd_inode->i_mutex);
4235 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
4236 mutex_unlock(&bdev->bd_inode->i_mutex);
4237 bdput(bdev);
4238 }
4239 }
4240 return rv;
4241}
4242
4243static int update_raid_disks(mddev_t *mddev, int raid_disks)
4244{
4245 int rv;
4246
4247 if (mddev->pers->check_reshape == NULL)
4248 return -EINVAL;
4249 if (raid_disks <= 0 ||
4250 raid_disks >= mddev->max_disks)
4251 return -EINVAL;
4252 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4253 return -EBUSY;
4254 mddev->delta_disks = raid_disks - mddev->raid_disks;
4255
4256 rv = mddev->pers->check_reshape(mddev);
4257 return rv;
4258}
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
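/*
 * update_array_info reconfigures an active array.  Version, ctime, level,
 * persistence and chunk_size must match the current values; of size,
 * raid_disks, layout and bitmap-presence, only one may change per call.
 */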
4269static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4270{
4271 int rv = 0;
4272 int cnt = 0;
4273 int state = 0;
4274
4275
4276 if (mddev->bitmap && mddev->bitmap_offset)
4277 state |= (1 << MD_SB_BITMAP_PRESENT);
4278
4279 if (mddev->major_version != info->major_version ||
4280 mddev->minor_version != info->minor_version ||
4281
4282 mddev->ctime != info->ctime ||
4283 mddev->level != info->level ||
4284
4285 !mddev->persistent != info->not_persistent||
4286 mddev->chunk_size != info->chunk_size ||
4287
4288 ((state^info->state) & 0xfffffe00)
4289 )
4290 return -EINVAL;
4291
4292 if (info->size >= 0 && mddev->size != info->size) cnt++;
4293 if (mddev->raid_disks != info->raid_disks) cnt++;
4294 if (mddev->layout != info->layout) cnt++;
4295 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
4296 if (cnt == 0) return 0;
4297 if (cnt > 1) return -EINVAL;
4298
4299 if (mddev->layout != info->layout) {
4300
4301
4302
4303
4304 if (mddev->pers->reconfig == NULL)
4305 return -EINVAL;
4306 else
4307 return mddev->pers->reconfig(mddev, info->layout, -1);
4308 }
4309 if (info->size >= 0 && mddev->size != info->size)
4310 rv = update_size(mddev, info->size);
4311
4312 if (mddev->raid_disks != info->raid_disks)
4313 rv = update_raid_disks(mddev, info->raid_disks);
4314
4315 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
4316 if (mddev->pers->quiesce == NULL)
4317 return -EINVAL;
4318 if (mddev->recovery || mddev->sync_thread)
4319 return -EBUSY;
4320 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
4321
4322 if (mddev->bitmap)
4323 return -EEXIST;
4324 if (mddev->default_bitmap_offset == 0)
4325 return -EINVAL;
4326 mddev->bitmap_offset = mddev->default_bitmap_offset;
4327 mddev->pers->quiesce(mddev, 1);
4328 rv = bitmap_create(mddev);
4329 if (rv)
4330 bitmap_destroy(mddev);
4331 mddev->pers->quiesce(mddev, 0);
4332 } else {
4333
4334 if (!mddev->bitmap)
4335 return -ENOENT;
4336 if (mddev->bitmap->file)
4337 return -EINVAL;
4338 mddev->pers->quiesce(mddev, 1);
4339 bitmap_destroy(mddev);
4340 mddev->pers->quiesce(mddev, 0);
4341 mddev->bitmap_offset = 0;
4342 }
4343 }
4344 md_update_sb(mddev, 1);
4345 return rv;
4346}
4347
4348static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4349{
4350 mdk_rdev_t *rdev;
4351
4352 if (mddev->pers == NULL)
4353 return -ENODEV;
4354
4355 rdev = find_rdev(mddev, dev);
4356 if (!rdev)
4357 return -ENODEV;
4358
4359 md_error(mddev, rdev);
4360 return 0;
4361}
4362
4363static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4364{
4365 mddev_t *mddev = bdev->bd_disk->private_data;
4366
4367 geo->heads = 2;
4368 geo->sectors = 4;
4369 geo->cylinders = get_capacity(mddev->gendisk) / 8;
4370 return 0;
4371}
4372
4373static int md_ioctl(struct inode *inode, struct file *file,
4374 unsigned int cmd, unsigned long arg)
4375{
4376 int err = 0;
4377 void __user *argp = (void __user *)arg;
4378 mddev_t *mddev = NULL;
4379
4380 if (!capable(CAP_SYS_ADMIN))
4381 return -EACCES;
4382
4383
4384
4385
4386
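 /* Commands that do not need a particular array: */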
4387 switch (cmd)
4388 {
4389 case RAID_VERSION:
4390 err = get_version(argp);
4391 goto done;
4392
4393 case PRINT_RAID_DEBUG:
4394 err = 0;
4395 md_print_devices();
4396 goto done;
4397
4398#ifndef MODULE
4399 case RAID_AUTORUN:
4400 err = 0;
4401 autostart_arrays(arg);
4402 goto done;
4403#endif
4404 default:;
4405 }
4406
4407
4408
4409
4410
4411 mddev = inode->i_bdev->bd_disk->private_data;
4412
4413 if (!mddev) {
4414 BUG();
4415 goto abort;
4416 }
4417
4418 err = mddev_lock(mddev);
4419 if (err) {
4420 printk(KERN_INFO
4421 "md: ioctl lock interrupted, reason %d, cmd %d\n",
4422 err, cmd);
4423 goto abort;
4424 }
4425
4426 switch (cmd)
4427 {
4428 case SET_ARRAY_INFO:
4429 {
4430 mdu_array_info_t info;
4431 if (!arg)
4432 memset(&info, 0, sizeof(info));
4433 else if (copy_from_user(&info, argp, sizeof(info))) {
4434 err = -EFAULT;
4435 goto abort_unlock;
4436 }
4437 if (mddev->pers) {
4438 err = update_array_info(mddev, &info);
4439 if (err) {
4440 printk(KERN_WARNING "md: couldn't update"
4441 " array info. %d\n", err);
4442 goto abort_unlock;
4443 }
4444 goto done_unlock;
4445 }
4446 if (!list_empty(&mddev->disks)) {
4447 printk(KERN_WARNING
4448 "md: array %s already has disks!\n",
4449 mdname(mddev));
4450 err = -EBUSY;
4451 goto abort_unlock;
4452 }
4453 if (mddev->raid_disks) {
4454 printk(KERN_WARNING
4455 "md: array %s already initialised!\n",
4456 mdname(mddev));
4457 err = -EBUSY;
4458 goto abort_unlock;
4459 }
4460 err = set_array_info(mddev, &info);
4461 if (err) {
4462 printk(KERN_WARNING "md: couldn't set"
4463 " array info. %d\n", err);
4464 goto abort_unlock;
4465 }
4466 }
4467 goto done_unlock;
4468
4469 default:;
4470 }
4471
4472
4473
4474
4475
4476
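 /* Until the array is initialised, only the commands needed to assemble
 * or stop it are allowed. */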
4477 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
4478 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
4479 && cmd != GET_BITMAP_FILE) {
4480 err = -ENODEV;
4481 goto abort_unlock;
4482 }
4483
4484
4485
4486
4487 switch (cmd)
4488 {
4489 case GET_ARRAY_INFO:
4490 err = get_array_info(mddev, argp);
4491 goto done_unlock;
4492
4493 case GET_BITMAP_FILE:
4494 err = get_bitmap_file(mddev, argp);
4495 goto done_unlock;
4496
4497 case GET_DISK_INFO:
4498 err = get_disk_info(mddev, argp);
4499 goto done_unlock;
4500
4501 case RESTART_ARRAY_RW:
4502 err = restart_array(mddev);
4503 goto done_unlock;
4504
4505 case STOP_ARRAY:
4506 err = do_md_stop (mddev, 0);
4507 goto done_unlock;
4508
4509 case STOP_ARRAY_RO:
4510 err = do_md_stop (mddev, 1);
4511 goto done_unlock;
4512
4513
4514
4515
4516
4517
4518
4519 }
4520
4521
4522
4523
4524
4525
4526
4527
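 /* The remaining commands modify the array, so a read-only array must
 * either switch to read-write (read-auto) or reject them. */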
4528 if (_IOC_TYPE(cmd) == MD_MAJOR &&
4529 mddev->ro && mddev->pers) {
4530 if (mddev->ro == 2) {
4531 mddev->ro = 0;
4532 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4533 md_wakeup_thread(mddev->thread);
4534
4535 } else {
4536 err = -EROFS;
4537 goto abort_unlock;
4538 }
4539 }
4540
4541 switch (cmd)
4542 {
4543 case ADD_NEW_DISK:
4544 {
4545 mdu_disk_info_t info;
4546 if (copy_from_user(&info, argp, sizeof(info)))
4547 err = -EFAULT;
4548 else
4549 err = add_new_disk(mddev, &info);
4550 goto done_unlock;
4551 }
4552
4553 case HOT_REMOVE_DISK:
4554 err = hot_remove_disk(mddev, new_decode_dev(arg));
4555 goto done_unlock;
4556
4557 case HOT_ADD_DISK:
4558 err = hot_add_disk(mddev, new_decode_dev(arg));
4559 goto done_unlock;
4560
4561 case SET_DISK_FAULTY:
4562 err = set_disk_faulty(mddev, new_decode_dev(arg));
4563 goto done_unlock;
4564
4565 case RUN_ARRAY:
4566 err = do_md_run (mddev);
4567 goto done_unlock;
4568
4569 case SET_BITMAP_FILE:
4570 err = set_bitmap_file(mddev, (int)arg);
4571 goto done_unlock;
4572
4573 default:
4574 err = -EINVAL;
4575 goto abort_unlock;
4576 }
4577
4578done_unlock:
4579abort_unlock:
4580 mddev_unlock(mddev);
4581
4582 return err;
4583done:
4584 if (err)
4585 MD_BUG();
4586abort:
4587 return err;
4588}
4589
4590static int md_open(struct inode *inode, struct file *file)
4591{
4592
4593
4594
4595
4596 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4597 int err;
4598
4599 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
4600 goto out;
4601
4602 err = 0;
4603 mddev_get(mddev);
4604 mddev_unlock(mddev);
4605
4606 check_disk_change(inode->i_bdev);
4607 out:
4608 return err;
4609}
4610
4611static int md_release(struct inode *inode, struct file * file)
4612{
4613 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4614
4615 BUG_ON(!mddev);
4616 mddev_put(mddev);
4617
4618 return 0;
4619}
4620
4621static int md_media_changed(struct gendisk *disk)
4622{
4623 mddev_t *mddev = disk->private_data;
4624
4625 return mddev->changed;
4626}
4627
4628static int md_revalidate(struct gendisk *disk)
4629{
4630 mddev_t *mddev = disk->private_data;
4631
4632 mddev->changed = 0;
4633 return 0;
4634}
4635static struct block_device_operations md_fops =
4636{
4637 .owner = THIS_MODULE,
4638 .open = md_open,
4639 .release = md_release,
4640 .ioctl = md_ioctl,
4641 .getgeo = md_getgeo,
4642 .media_changed = md_media_changed,
4643 .revalidate_disk= md_revalidate,
4644};
4645
4646static int md_thread(void * arg)
4647{
4648 mdk_thread_t *thread = arg;
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
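 /* md threads sleep until THREAD_WAKEUP is set or the thread is asked to
 * stop, then call their handler.  SIGKILL is allowed so that a signal can
 * interrupt a long-running sync. */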
4662 allow_signal(SIGKILL);
4663 while (!kthread_should_stop()) {
4664
4665
4666
4667
4668
4669
4670 if (signal_pending(current))
4671 flush_signals(current);
4672
4673 wait_event_interruptible_timeout
4674 (thread->wqueue,
4675 test_bit(THREAD_WAKEUP, &thread->flags)
4676 || kthread_should_stop(),
4677 thread->timeout);
4678
4679 clear_bit(THREAD_WAKEUP, &thread->flags);
4680
4681 thread->run(thread->mddev);
4682 }
4683
4684 return 0;
4685}
4686
4687void md_wakeup_thread(mdk_thread_t *thread)
4688{
4689 if (thread) {
4690 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
4691 set_bit(THREAD_WAKEUP, &thread->flags);
4692 wake_up(&thread->wqueue);
4693 }
4694}
4695
4696mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
4697 const char *name)
4698{
4699 mdk_thread_t *thread;
4700
4701 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
4702 if (!thread)
4703 return NULL;
4704
4705 init_waitqueue_head(&thread->wqueue);
4706
4707 thread->run = run;
4708 thread->mddev = mddev;
4709 thread->timeout = MAX_SCHEDULE_TIMEOUT;
4710 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
4711 if (IS_ERR(thread->tsk)) {
4712 kfree(thread);
4713 return NULL;
4714 }
4715 return thread;
4716}
4717
4718void md_unregister_thread(mdk_thread_t *thread)
4719{
4720 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
4721
4722 kthread_stop(thread->tsk);
4723 kfree(thread);
4724}
4725
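/*
 * md_error: report a failure on a component device to the personality's
 * error handler and schedule recovery.
 */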
4726void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4727{
4728 if (!mddev) {
4729 MD_BUG();
4730 return;
4731 }
4732
4733 if (!rdev || test_bit(Faulty, &rdev->flags))
4734 return;
4735
4736
4737
4738
4739
4740
4741
4742 if (!mddev->pers)
4743 return;
4744 if (!mddev->pers->error_handler)
4745 return;
4746 mddev->pers->error_handler(mddev,rdev);
4747 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4748 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4749 md_wakeup_thread(mddev->thread);
4750 md_new_event_inintr(mddev);
4751}
4752
4753
4754
4755static void status_unused(struct seq_file *seq)
4756{
4757 int i = 0;
4758 mdk_rdev_t *rdev;
4759 struct list_head *tmp;
4760
4761 seq_printf(seq, "unused devices: ");
4762
4763 ITERATE_RDEV_PENDING(rdev,tmp) {
4764 char b[BDEVNAME_SIZE];
4765 i++;
4766 seq_printf(seq, "%s ",
4767 bdevname(rdev->bdev,b));
4768 }
4769 if (!i)
4770 seq_printf(seq, "<none>");
4771
4772 seq_printf(seq, "\n");
4773}
4774
4775
4776static void status_resync(struct seq_file *seq, mddev_t * mddev)
4777{
4778 sector_t max_blocks, resync, res;
4779 unsigned long dt, db, rt;
4780 int scale;
4781 unsigned int per_milli;
4782
4783 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
4784
4785 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4786 max_blocks = mddev->resync_max_sectors >> 1;
4787 else
4788 max_blocks = mddev->size;
4789
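	/* Should not happen. */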
4793 if (!max_blocks) {
4794 MD_BUG();
4795 return;
4796 }
4797
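	/* Pick 'scale' so that (resync>>scale)*1000 cannot overflow a
	 * sector_t and (max_blocks>>scale) fits in the u32 taken by
	 * sector_div() below.
	 */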
4802 scale = 10;
4803 if (sizeof(sector_t) > sizeof(unsigned long)) {
4804 while ( max_blocks/2 > (1ULL<<(scale+32)))
4805 scale++;
4806 }
4807 res = (resync>>scale)*1000;
4808 sector_div(res, (u32)((max_blocks>>scale)+1));
4809
4810 per_milli = res;
4811 {
4812 int i, x = per_milli/50, y = 20-x;
4813 seq_printf(seq, "[");
4814 for (i = 0; i < x; i++)
4815 seq_printf(seq, "=");
4816 seq_printf(seq, ">");
4817 for (i = 0; i < y; i++)
4818 seq_printf(seq, ".");
4819 seq_printf(seq, "] ");
4820 }
4821 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
4822 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
4823 "reshape" :
4824 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
4825 "check" :
4826 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
4827 "resync" : "recovery"))),
4828 per_milli/10, per_milli % 10,
4829 (unsigned long long) resync,
4830 (unsigned long long) max_blocks);
4831
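	/*
	 * dt: seconds since the last rate mark
	 * db: sectors completed since that mark (in-flight resync excluded)
	 * rt: estimated time remaining
	 * The operand ordering (and the split /100 factors) avoids overflow;
	 * the +1 guards against division by zero.
	 */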
4841 dt = ((jiffies - mddev->resync_mark) / HZ);
4842 if (!dt) dt++;
4843 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
4844 - mddev->resync_mark_cnt;
4845 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
4846
4847 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4848
4849 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
4850}
4851
4852static void *md_seq_start(struct seq_file *seq, loff_t *pos)
4853{
4854 struct list_head *tmp;
4855 loff_t l = *pos;
4856 mddev_t *mddev;
4857
4858 if (l >= 0x10000)
4859 return NULL;
4860 if (!l--)
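		/* the first virtual entry is the header */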
4862 return (void*)1;
4863
4864 spin_lock(&all_mddevs_lock);
4865 list_for_each(tmp,&all_mddevs)
4866 if (!l--) {
4867 mddev = list_entry(tmp, mddev_t, all_mddevs);
4868 mddev_get(mddev);
4869 spin_unlock(&all_mddevs_lock);
4870 return mddev;
4871 }
4872 spin_unlock(&all_mddevs_lock);
4873 if (!l--)
4874 return (void*)2;
4875 return NULL;
4876}
4877
4878static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4879{
4880 struct list_head *tmp;
4881 mddev_t *next_mddev, *mddev = v;
4882
4883 ++*pos;
4884 if (v == (void*)2)
4885 return NULL;
4886
4887 spin_lock(&all_mddevs_lock);
4888 if (v == (void*)1)
4889 tmp = all_mddevs.next;
4890 else
4891 tmp = mddev->all_mddevs.next;
4892 if (tmp != &all_mddevs)
4893 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
4894 else {
4895 next_mddev = (void*)2;
4896 *pos = 0x10000;
4897 }
4898 spin_unlock(&all_mddevs_lock);
4899
4900 if (v != (void*)1)
4901 mddev_put(mddev);
4902 return next_mddev;
4903
4904}
4905
4906static void md_seq_stop(struct seq_file *seq, void *v)
4907{
4908 mddev_t *mddev = v;
4909
4910 if (mddev && v != (void*)1 && v != (void*)2)
4911 mddev_put(mddev);
4912}
4913
4914struct mdstat_info {
4915 int event;
4916};
4917
4918static int md_seq_show(struct seq_file *seq, void *v)
4919{
4920 mddev_t *mddev = v;
4921 sector_t size;
4922 struct list_head *tmp2;
4923 mdk_rdev_t *rdev;
4924 struct mdstat_info *mi = seq->private;
4925 struct bitmap *bitmap;
4926
4927 if (v == (void*)1) {
4928 struct mdk_personality *pers;
4929 seq_printf(seq, "Personalities : ");
4930 spin_lock(&pers_lock);
4931 list_for_each_entry(pers, &pers_list, list)
4932 seq_printf(seq, "[%s] ", pers->name);
4933
4934 spin_unlock(&pers_lock);
4935 seq_printf(seq, "\n");
4936 mi->event = atomic_read(&md_event_count);
4937 return 0;
4938 }
4939 if (v == (void*)2) {
4940 status_unused(seq);
4941 return 0;
4942 }
4943
4944 if (mddev_lock(mddev) < 0)
4945 return -EINTR;
4946
4947 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
4948 seq_printf(seq, "%s : %sactive", mdname(mddev),
4949 mddev->pers ? "" : "in");
4950 if (mddev->pers) {
4951 if (mddev->ro==1)
4952 seq_printf(seq, " (read-only)");
4953 if (mddev->ro==2)
4954 seq_printf(seq, "(auto-read-only)");
4955 seq_printf(seq, " %s", mddev->pers->name);
4956 }
4957
4958 size = 0;
4959 ITERATE_RDEV(mddev,rdev,tmp2) {
4960 char b[BDEVNAME_SIZE];
4961 seq_printf(seq, " %s[%d]",
4962 bdevname(rdev->bdev,b), rdev->desc_nr);
4963 if (test_bit(WriteMostly, &rdev->flags))
4964 seq_printf(seq, "(W)");
4965 if (test_bit(Faulty, &rdev->flags)) {
4966 seq_printf(seq, "(F)");
4967 continue;
4968 } else if (rdev->raid_disk < 0)
4969 seq_printf(seq, "(S)");
4970 size += rdev->size;
4971 }
4972
4973 if (!list_empty(&mddev->disks)) {
4974 if (mddev->pers)
4975 seq_printf(seq, "\n %llu blocks",
4976 (unsigned long long)mddev->array_size);
4977 else
4978 seq_printf(seq, "\n %llu blocks",
4979 (unsigned long long)size);
4980 }
4981 if (mddev->persistent) {
4982 if (mddev->major_version != 0 ||
4983 mddev->minor_version != 90) {
4984 seq_printf(seq," super %d.%d",
4985 mddev->major_version,
4986 mddev->minor_version);
4987 }
4988 } else
4989 seq_printf(seq, " super non-persistent");
4990
4991 if (mddev->pers) {
4992 mddev->pers->status(seq, mddev);
4993 seq_printf(seq, "\n ");
4994 if (mddev->pers->sync_request) {
4995 if (mddev->curr_resync > 2) {
4996 status_resync(seq, mddev);
4997 seq_printf(seq, "\n ");
4998 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
4999 seq_printf(seq, "\tresync=DELAYED\n ");
5000 else if (mddev->recovery_cp < MaxSector)
5001 seq_printf(seq, "\tresync=PENDING\n ");
5002 }
5003 } else
5004 seq_printf(seq, "\n ");
5005
5006 if ((bitmap = mddev->bitmap)) {
5007 unsigned long chunk_kb;
5008 unsigned long flags;
5009 spin_lock_irqsave(&bitmap->lock, flags);
5010 chunk_kb = bitmap->chunksize >> 10;
5011 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5012 "%lu%s chunk",
5013 bitmap->pages - bitmap->missing_pages,
5014 bitmap->pages,
5015 (bitmap->pages - bitmap->missing_pages)
5016 << (PAGE_SHIFT - 10),
5017 chunk_kb ? chunk_kb : bitmap->chunksize,
5018 chunk_kb ? "KB" : "B");
5019 if (bitmap->file) {
5020 seq_printf(seq, ", file: ");
5021 seq_path(seq, bitmap->file->f_path.mnt,
5022 bitmap->file->f_path.dentry," \t\n");
5023 }
5024
5025 seq_printf(seq, "\n");
5026 spin_unlock_irqrestore(&bitmap->lock, flags);
5027 }
5028
5029 seq_printf(seq, "\n");
5030 }
5031 mddev_unlock(mddev);
5032
5033 return 0;
5034}
5035
5036static struct seq_operations md_seq_ops = {
5037 .start = md_seq_start,
5038 .next = md_seq_next,
5039 .stop = md_seq_stop,
5040 .show = md_seq_show,
5041};
5042
5043static int md_seq_open(struct inode *inode, struct file *file)
5044{
5045 int error;
5046 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
5047 if (mi == NULL)
5048 return -ENOMEM;
5049
5050 error = seq_open(file, &md_seq_ops);
5051 if (error)
5052 kfree(mi);
5053 else {
5054 struct seq_file *p = file->private_data;
5055 p->private = mi;
5056 mi->event = atomic_read(&md_event_count);
5057 }
5058 return error;
5059}
5060
5061static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
5062{
5063 struct seq_file *m = filp->private_data;
5064 struct mdstat_info *mi = m->private;
5065 int mask;
5066
5067 poll_wait(filp, &md_event_waiters, wait);
5068
5069
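	/* always allow read */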
5070 mask = POLLIN | POLLRDNORM;
5071
5072 if (mi->event != atomic_read(&md_event_count))
5073 mask |= POLLERR | POLLPRI;
5074 return mask;
5075}
5076
5077static const struct file_operations md_seq_fops = {
5078 .owner = THIS_MODULE,
5079 .open = md_seq_open,
5080 .read = seq_read,
5081 .llseek = seq_lseek,
5082 .release = seq_release_private,
5083 .poll = mdstat_poll,
5084};
5085
5086int register_md_personality(struct mdk_personality *p)
5087{
5088 spin_lock(&pers_lock);
5089 list_add_tail(&p->list, &pers_list);
5090 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
5091 spin_unlock(&pers_lock);
5092 return 0;
5093}
5094
5095int unregister_md_personality(struct mdk_personality *p)
5096{
5097 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
5098 spin_lock(&pers_lock);
5099 list_del_init(&p->list);
5100 spin_unlock(&pers_lock);
5101 return 0;
5102}
5103
5104static int is_mddev_idle(mddev_t *mddev)
5105{
5106 mdk_rdev_t * rdev;
5107 struct list_head *tmp;
5108 int idle;
5109 long curr_events;
5110
5111 idle = 1;
5112 ITERATE_RDEV(mddev,rdev,tmp) {
5113 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5114 curr_events = disk_stat_read(disk, sectors[0]) +
5115 disk_stat_read(disk, sectors[1]) -
5116 atomic_read(&disk->sync_io);
5117
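		/* sync_io is counted when a resync request is issued while
		 * disk_stats counts sectors on completion, so outstanding
		 * resync IO can make curr_events dip below the recorded
		 * baseline.  Only a clear rise (more than 4096 sectors over
		 * rdev->last_events) is treated as real non-sync activity;
		 * the baseline is then advanced and the array is considered
		 * busy so that resync gets throttled.
		 */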
5139 if (curr_events - rdev->last_events > 4096) {
5140 rdev->last_events = curr_events;
5141 idle = 0;
5142 }
5143 }
5144 return idle;
5145}
5146
5147void md_done_sync(mddev_t *mddev, int blocks, int ok)
5148{
5149
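	/* another 'blocks' sectors (512-byte blocks) have been synced */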
5150 atomic_sub(blocks, &mddev->recovery_active);
5151 wake_up(&mddev->recovery_wait);
5152 if (!ok) {
5153 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5154 md_wakeup_thread(mddev->thread);
5155
5156 }
5157}
5158
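/* md_write_start(mddev, bi)
 * If the array is currently marked clean, mark it active, schedule a
 * superblock update and wait for that update to complete before the
 * write is allowed to proceed.
 */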
5165void md_write_start(mddev_t *mddev, struct bio *bi)
5166{
5167 if (bio_data_dir(bi) != WRITE)
5168 return;
5169
5170 BUG_ON(mddev->ro == 1);
5171 if (mddev->ro == 2) {
5172
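		/* transition from auto-read-only back to read-write */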
5173 mddev->ro = 0;
5174 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5175 md_wakeup_thread(mddev->thread);
5176 }
5177 atomic_inc(&mddev->writes_pending);
5178 if (mddev->in_sync) {
5179 spin_lock_irq(&mddev->write_lock);
5180 if (mddev->in_sync) {
5181 mddev->in_sync = 0;
5182 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5183 md_wakeup_thread(mddev->thread);
5184 }
5185 spin_unlock_irq(&mddev->write_lock);
5186 }
5187 wait_event(mddev->sb_wait, mddev->flags==0);
5188}
5189
5190void md_write_end(mddev_t *mddev)
5191{
5192 if (atomic_dec_and_test(&mddev->writes_pending)) {
5193 if (mddev->safemode == 2)
5194 md_wakeup_thread(mddev->thread);
5195 else if (mddev->safemode_delay)
5196 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5197 }
5198}
5199
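/* md_allow_write(mddev)
 * Ensure the array is marked 'active' so that writes may proceed without
 * first blocking on a superblock update.  This is useful, for example,
 * before a memory allocation that might have to wait for writeback while
 * the reconfiguration lock is held.
 */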
5206void md_allow_write(mddev_t *mddev)
5207{
5208 if (!mddev->pers)
5209 return;
5210 if (mddev->ro)
5211 return;
5212
5213 spin_lock_irq(&mddev->write_lock);
5214 if (mddev->in_sync) {
5215 mddev->in_sync = 0;
5216 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5217 if (mddev->safemode_delay &&
5218 mddev->safemode == 0)
5219 mddev->safemode = 1;
5220 spin_unlock_irq(&mddev->write_lock);
5221 md_update_sb(mddev, 0);
5222 } else
5223 spin_unlock_irq(&mddev->write_lock);
5224}
5225EXPORT_SYMBOL_GPL(md_allow_write);
5226
5227static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
5228
5229#define SYNC_MARKS 10
5230#define SYNC_MARK_STEP (3*HZ)
5231void md_do_sync(mddev_t *mddev)
5232{
5233 mddev_t *mddev2;
5234 unsigned int currspeed = 0,
5235 window;
5236 sector_t max_sectors,j, io_sectors;
5237 unsigned long mark[SYNC_MARKS];
5238 sector_t mark_cnt[SYNC_MARKS];
5239 int last_mark,m;
5240 struct list_head *tmp;
5241 sector_t last_check;
5242 int skipped = 0;
5243 struct list_head *rtmp;
5244 mdk_rdev_t *rdev;
5245 char *desc;
5246
5247
5248 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5249 return;
5250 if (mddev->ro)
5251 return;
5252
5253 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5254 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
5255 desc = "data-check";
5256 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5257 desc = "requested-resync";
5258 else
5259 desc = "resync";
5260 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5261 desc = "reshape";
5262 else
5263 desc = "recovery";
5264
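	/* mddev->curr_resync is overloaded while we negotiate with other
	 * arrays that share physical devices:
	 *   0     - not engaged in resync at all
	 *   2     - checking that no conflicting resync is running elsewhere
	 *   1     - like 2, but we have yielded to let a conflicting resync
	 *           go first
	 *   other - actively resyncing; the value is the current position
	 * curr_resync is first set to 2, and the loop below waits until no
	 * conflicting array is ahead of us before the real work starts.
	 */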
5281 do {
5282 mddev->curr_resync = 2;
5283
5284 try_again:
5285 if (kthread_should_stop()) {
5286 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5287 goto skip;
5288 }
5289 ITERATE_MDDEV(mddev2,tmp) {
5290 if (mddev2 == mddev)
5291 continue;
5292 if (mddev2->curr_resync &&
5293 match_mddev_units(mddev,mddev2)) {
5294 DEFINE_WAIT(wq);
5295 if (mddev < mddev2 && mddev->curr_resync == 2) {
5296
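				/* arbitrarily yield: the lower-addressed array defers */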
5297 mddev->curr_resync = 1;
5298 wake_up(&resync_wait);
5299 }
5300 if (mddev > mddev2 && mddev->curr_resync == 1)
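					/* no need to wait here; we can wait the
					 * next time round, when curr_resync == 2
					 */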
5304 continue;
5305 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
5306 if (!kthread_should_stop() &&
5307 mddev2->curr_resync >= mddev->curr_resync) {
5308 printk(KERN_INFO "md: delaying %s of %s"
5309 " until %s has finished (they"
5310 " share one or more physical units)\n",
5311 desc, mdname(mddev), mdname(mddev2));
5312 mddev_put(mddev2);
5313 schedule();
5314 finish_wait(&resync_wait, &wq);
5315 goto try_again;
5316 }
5317 finish_wait(&resync_wait, &wq);
5318 }
5319 }
5320 } while (mddev->curr_resync < 2);
5321
5322 j = 0;
5323 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
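		/* resync follows the size requested by the personality,
		 * which defaults to the physical size but may differ
		 * (resync_max_sectors).
		 */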
5327 max_sectors = mddev->resync_max_sectors;
5328 mddev->resync_mismatches = 0;
5329
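		/* the recovery checkpoint is only used when there is no
		 * bitmap and this is not a user-requested check/repair
		 */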
5330 if (!mddev->bitmap &&
5331 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5332 j = mddev->recovery_cp;
5333 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5334 max_sectors = mddev->size << 1;
5335 else {
5336
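		/* recovery follows the physical size of the devices */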
5337 max_sectors = mddev->size << 1;
5338 j = MaxSector;
5339 ITERATE_RDEV(mddev,rdev,rtmp)
5340 if (rdev->raid_disk >= 0 &&
5341 !test_bit(Faulty, &rdev->flags) &&
5342 !test_bit(In_sync, &rdev->flags) &&
5343 rdev->recovery_offset < j)
5344 j = rdev->recovery_offset;
5345 }
5346
5347 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
5348 printk(KERN_INFO "md: minimum _guaranteed_ speed:"
5349 " %d KB/sec/disk.\n", speed_min(mddev));
5350 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
5351 "(but not more than %d KB/sec) for %s.\n",
5352 speed_max(mddev), desc);
5353
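	/* this also primes the per-device IO event counters used for idle detection */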
5354 is_mddev_idle(mddev);
5355
5356 io_sectors = 0;
5357 for (m = 0; m < SYNC_MARKS; m++) {
5358 mark[m] = jiffies;
5359 mark_cnt[m] = io_sectors;
5360 }
5361 last_mark = 0;
5362 mddev->resync_mark = mark[last_mark];
5363 mddev->resync_mark_cnt = mark_cnt[last_mark];
5364
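	/*
	 * Tune reconstruction: the bookkeeping and throttling below only
	 * run once every 'window' sectors (32 pages worth).
	 */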
5368 window = 32*(PAGE_SIZE/512);
5369 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
5370 window/2,(unsigned long long) max_sectors/2);
5371
5372 atomic_set(&mddev->recovery_active, 0);
5373 init_waitqueue_head(&mddev->recovery_wait);
5374 last_check = 0;
5375
5376 if (j>2) {
5377 printk(KERN_INFO
5378 "md: resuming %s of %s from checkpoint.\n",
5379 desc, mdname(mddev));
5380 mddev->curr_resync = j;
5381 }
5382
5383 while (j < max_sectors) {
5384 sector_t sectors;
5385
5386 skipped = 0;
5387 sectors = mddev->pers->sync_request(mddev, j, &skipped,
5388 currspeed < speed_min(mddev));
5389 if (sectors == 0) {
5390 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5391 goto out;
5392 }
5393
5394 if (!skipped) {
5395 io_sectors += sectors;
5396 atomic_add(sectors, &mddev->recovery_active);
5397 }
5398
5399 j += sectors;
5400 if (j>1) mddev->curr_resync = j;
5401 mddev->curr_mark_cnt = io_sectors;
5402 if (last_check == 0)
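			/* this is the earliest that the rebuild will be
			 * visible in /proc/mdstat
			 */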
5406 md_new_event(mddev);
5407
5408 if (last_check + window > io_sectors || j == max_sectors)
5409 continue;
5410
5411 last_check = io_sectors;
5412
5413 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
5414 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
5415 break;
5416
5417 repeat:
5418 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
5419
5420 int next = (last_mark+1) % SYNC_MARKS;
5421
5422 mddev->resync_mark = mark[next];
5423 mddev->resync_mark_cnt = mark_cnt[next];
5424 mark[next] = jiffies;
5425 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
5426 last_mark = next;
5427 }
5428
5429
5430 if (kthread_should_stop()) {
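			/* we were asked to stop: record the interruption and bail out */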
5434 printk(KERN_INFO
5435 "md: md_do_sync() got signal ... exiting\n");
5436 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5437 goto out;
5438 }
5439
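		/*
		 * Throttle: once we are above the guaranteed minimum speed,
		 * back off (sleep and re-check) whenever we exceed the
		 * maximum speed or the member devices are busy with other IO.
		 */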
5448 blk_unplug(mddev->queue);
5449 cond_resched();
5450
5451 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
5452 /((jiffies-mddev->resync_mark)/HZ +1) +1;
5453
5454 if (currspeed > speed_min(mddev)) {
5455 if ((currspeed > speed_max(mddev)) ||
5456 !is_mddev_idle(mddev)) {
5457 msleep(500);
5458 goto repeat;
5459 }
5460 }
5461 }
5462 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
5463
5464
5465
5466 out:
5467 blk_unplug(mddev->queue);
5468
5469 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
5470
5471
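	/* tell the personality that we are finished */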
5472 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5473
5474 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5475 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5476 mddev->curr_resync > 2) {
5477 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5478 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5479 if (mddev->curr_resync >= mddev->recovery_cp) {
5480 printk(KERN_INFO
5481 "md: checkpointing %s of %s.\n",
5482 desc, mdname(mddev));
5483 mddev->recovery_cp = mddev->curr_resync;
5484 }
5485 } else
5486 mddev->recovery_cp = MaxSector;
5487 } else {
5488 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5489 mddev->curr_resync = MaxSector;
5490 ITERATE_RDEV(mddev,rdev,rtmp)
5491 if (rdev->raid_disk >= 0 &&
5492 !test_bit(Faulty, &rdev->flags) &&
5493 !test_bit(In_sync, &rdev->flags) &&
5494 rdev->recovery_offset < mddev->curr_resync)
5495 rdev->recovery_offset = mddev->curr_resync;
5496 }
5497 }
5498 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5499
5500 skip:
5501 mddev->curr_resync = 0;
5502 wake_up(&resync_wait);
5503 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5504 md_wakeup_thread(mddev->thread);
5505}
5506EXPORT_SYMBOL_GPL(md_do_sync);
5507
5508
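/* Remove any failed or out-of-sync devices that have no pending IO, then,
 * if the array is degraded, offer unused devices to the personality as
 * spares.  Returns the number of spares added.
 */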
5509static int remove_and_add_spares(mddev_t *mddev)
5510{
5511 mdk_rdev_t *rdev;
5512 struct list_head *rtmp;
5513 int spares = 0;
5514
5515 ITERATE_RDEV(mddev,rdev,rtmp)
5516 if (rdev->raid_disk >= 0 &&
5517 (test_bit(Faulty, &rdev->flags) ||
5518 ! test_bit(In_sync, &rdev->flags)) &&
5519 atomic_read(&rdev->nr_pending)==0) {
5520 if (mddev->pers->hot_remove_disk(
5521 mddev, rdev->raid_disk)==0) {
5522 char nm[20];
5523 sprintf(nm,"rd%d", rdev->raid_disk);
5524 sysfs_remove_link(&mddev->kobj, nm);
5525 rdev->raid_disk = -1;
5526 }
5527 }
5528
5529 if (mddev->degraded) {
5530 ITERATE_RDEV(mddev,rdev,rtmp)
5531 if (rdev->raid_disk < 0
5532 && !test_bit(Faulty, &rdev->flags)) {
5533 rdev->recovery_offset = 0;
5534 if (mddev->pers->hot_add_disk(mddev,rdev)) {
5535 char nm[20];
5536 sprintf(nm, "rd%d", rdev->raid_disk);
5537 if (sysfs_create_link(&mddev->kobj,
5538 &rdev->kobj, nm))
5539 printk(KERN_WARNING
5540 "md: cannot register "
5541 "%s for %s\n",
5542 nm, mdname(mddev));
5543 spares++;
5544 md_new_event(mddev);
5545 } else
5546 break;
5547 }
5548 }
5549 return spares;
5550}
5551
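/*
 * This routine is regularly called by all per-raid-array threads to deal
 * with generic issues like resync and superblock updates.  Personalities
 * that do not have a thread (linear/raid0) never need it, as they never do
 * any recovery or update the superblock.
 *
 * It does not do any resync itself; when a resync or recovery is needed it
 * sets MD_RECOVERY_RUNNING and starts a separate thread at ->sync_thread.
 * When that thread finishes it sets MD_RECOVERY_DONE and wakes this thread,
 * which then reaps it and finishes up.  Faulty devices with no pending IO
 * are also removed here.
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it;
 *  2/ if a recovery thread is running, do nothing else;
 *  3/ if recovery has finished, clean up and possibly mark spares active;
 *  4/ otherwise, if spares exist or a resync is needed, start a new thread.
 */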
5573void md_check_recovery(mddev_t *mddev)
5574{
5575 mdk_rdev_t *rdev;
5576 struct list_head *rtmp;
5577
5578
5579 if (mddev->bitmap)
5580 bitmap_daemon_work(mddev->bitmap);
5581
5582 if (mddev->ro)
5583 return;
5584
5585 if (signal_pending(current)) {
5586 if (mddev->pers->sync_request) {
5587 printk(KERN_INFO "md: %s in immediate safe mode\n",
5588 mdname(mddev));
5589 mddev->safemode = 2;
5590 }
5591 flush_signals(current);
5592 }
5593
5594 if ( ! (
5595 mddev->flags ||
5596 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5597 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5598 (mddev->safemode == 1) ||
5599 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5600 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5601 ))
5602 return;
5603
5604 if (mddev_trylock(mddev)) {
5605 int spares = 0;
5606
5607 spin_lock_irq(&mddev->write_lock);
5608 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5609 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5610 mddev->in_sync = 1;
5611 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5612 }
5613 if (mddev->safemode == 1)
5614 mddev->safemode = 0;
5615 spin_unlock_irq(&mddev->write_lock);
5616
5617 if (mddev->flags)
5618 md_update_sb(mddev, 0);
5619
5620
5621 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5622 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5623
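			/* resync/recovery still happening */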
5624 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5625 goto unlock;
5626 }
5627 if (mddev->sync_thread) {
5628
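			/* the resync thread has finished: reap it and collect the result */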
5629 md_unregister_thread(mddev->sync_thread);
5630 mddev->sync_thread = NULL;
5631 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5632 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5633
5634
5635 mddev->pers->spare_active(mddev);
5636 }
5637 md_update_sb(mddev, 1);
5638
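			/* if the array is no longer degraded, any
			 * saved_raid_disk information is stale and must be
			 * cleared
			 */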
5642 if (!mddev->degraded)
5643 ITERATE_RDEV(mddev,rdev,rtmp)
5644 rdev->saved_raid_disk = -1;
5645
5646 mddev->recovery = 0;
5647
5648 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5649 md_new_event(mddev);
5650 goto unlock;
5651 }
5652
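		/* Clear status bits that no longer mean anything here but
		 * might have been left set.
		 */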
5655 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5656 clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5657 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5658 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5659
5660 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5661 goto unlock;
5662
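		/* No recovery is running.  Remove any failed drives, then add
		 * spares if possible; an in-progress reshape takes priority
		 * over starting a new resync or recovery.
		 */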
5669 if (mddev->reshape_position != MaxSector) {
5670 if (mddev->pers->check_reshape(mddev) != 0)
5671
5672 goto unlock;
5673 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5674 } else if ((spares = remove_and_add_spares(mddev))) {
5675 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5676 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5677 } else if (mddev->recovery_cp < MaxSector) {
5678 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5679 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5680
5681 goto unlock;
5682
5683 if (mddev->pers->sync_request) {
5684 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5685 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
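				/* We are adding devices to an array which has
				 * a bitmap stored on all members, so make sure
				 * all bitmap pages get written out.
				 */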
5690 bitmap_write_all(mddev->bitmap);
5691 }
5692 mddev->sync_thread = md_register_thread(md_do_sync,
5693 mddev,
5694 "%s_resync");
5695 if (!mddev->sync_thread) {
5696 printk(KERN_ERR "%s: could not start resync"
5697 " thread...\n",
5698 mdname(mddev));
5699
5700 mddev->recovery = 0;
5701 } else
5702 md_wakeup_thread(mddev->sync_thread);
5703 md_new_event(mddev);
5704 }
5705 unlock:
5706 mddev_unlock(mddev);
5707 }
5708}
5709
5710static int md_notify_reboot(struct notifier_block *this,
5711 unsigned long code, void *x)
5712{
5713 struct list_head *tmp;
5714 mddev_t *mddev;
5715
5716 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
5717
5718 printk(KERN_INFO "md: stopping all md devices.\n");
5719
5720 ITERATE_MDDEV(mddev,tmp)
5721 if (mddev_trylock(mddev)) {
5722 do_md_stop(mddev, 1);
5723 mddev_unlock(mddev);
5724 }
5725
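		/*
		 * Give the devices a moment to settle after the arrays have
		 * been stopped; some drives are known to lose writes issued
		 * immediately before a reboot.
		 */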
5731 mdelay(1000*1);
5732 }
5733 return NOTIFY_DONE;
5734}
5735
5736static struct notifier_block md_notifier = {
5737 .notifier_call = md_notify_reboot,
5738 .next = NULL,
5739 .priority = INT_MAX,
5740};
5741
5742static void md_geninit(void)
5743{
5744 struct proc_dir_entry *p;
5745
5746 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
5747
5748 p = create_proc_entry("mdstat", S_IRUGO, NULL);
5749 if (p)
5750 p->proc_fops = &md_seq_fops;
5751}
5752
5753static int __init md_init(void)
5754{
5755 if (register_blkdev(MAJOR_NR, "md"))
5756 return -1;
5757 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
5758 unregister_blkdev(MAJOR_NR, "md");
5759 return -1;
5760 }
5761 blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
5762 md_probe, NULL, NULL);
5763 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
5764 md_probe, NULL, NULL);
5765
5766 register_reboot_notifier(&md_notifier);
5767 raid_table_header = register_sysctl_table(raid_root_table);
5768
5769 md_geninit();
5770 return (0);
5771}
5772
5773
5774#ifndef MODULE
5775
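/*
 * When md is built in, the partition code queues RAID-autodetect partitions
 * found during boot on this list; autostart_arrays() later imports them and
 * tries to assemble arrays from them.
 */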
5781static LIST_HEAD(all_detected_devices);
5782struct detected_devices_node {
5783 struct list_head list;
5784 dev_t dev;
5785};
5786
5787void md_autodetect_dev(dev_t dev)
5788{
5789 struct detected_devices_node *node_detected_dev;
5790
5791 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
5792 if (node_detected_dev) {
5793 node_detected_dev->dev = dev;
5794 list_add_tail(&node_detected_dev->list, &all_detected_devices);
5795 } else {
5796 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
5797 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
5798 }
5799}
5800
5801
5802static void autostart_arrays(int part)
5803{
5804 mdk_rdev_t *rdev;
5805 struct detected_devices_node *node_detected_dev;
5806 dev_t dev;
5807 int i_scanned, i_passed;
5808
5809 i_scanned = 0;
5810 i_passed = 0;
5811
5812 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
5813
5814 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
5815 i_scanned++;
5816 node_detected_dev = list_entry(all_detected_devices.next,
5817 struct detected_devices_node, list);
5818 list_del(&node_detected_dev->list);
5819 dev = node_detected_dev->dev;
5820 kfree(node_detected_dev);
5821 rdev = md_import_device(dev, 0, 90);
5822 if (IS_ERR(rdev))
5823 continue;
5824
5825 if (test_bit(Faulty, &rdev->flags)) {
5826 MD_BUG();
5827 continue;
5828 }
5829 list_add(&rdev->same_set, &pending_raid_disks);
5830 i_passed++;
5831 }
5832
5833 printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
5834 i_scanned, i_passed);
5835
5836 autorun_devices(part);
5837}
5838
5839#endif
5840
5841static __exit void md_exit(void)
5842{
5843 mddev_t *mddev;
5844 struct list_head *tmp;
5845
5846 blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
5847 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
5848
5849 unregister_blkdev(MAJOR_NR,"md");
5850 unregister_blkdev(mdp_major, "mdp");
5851 unregister_reboot_notifier(&md_notifier);
5852 unregister_sysctl_table(raid_table_header);
5853 remove_proc_entry("mdstat", NULL);
5854 ITERATE_MDDEV(mddev,tmp) {
5855 struct gendisk *disk = mddev->gendisk;
5856 if (!disk)
5857 continue;
5858 export_array(mddev);
5859 del_gendisk(disk);
5860 put_disk(disk);
5861 mddev->gendisk = NULL;
5862 mddev_put(mddev);
5863 }
5864}
5865
5866subsys_initcall(md_init);
5867module_exit(md_exit)
5868
5869static int get_ro(char *buffer, struct kernel_param *kp)
5870{
5871 return sprintf(buffer, "%d", start_readonly);
5872}
5873static int set_ro(const char *val, struct kernel_param *kp)
5874{
5875 char *e;
5876 int num = simple_strtoul(val, &e, 10);
5877 if (*val && (*e == '\0' || *e == '\n')) {
5878 start_readonly = num;
5879 return 0;
5880 }
5881 return -EINVAL;
5882}
5883
5884module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
5885module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
5886
5887
5888EXPORT_SYMBOL(register_md_personality);
5889EXPORT_SYMBOL(unregister_md_personality);
5890EXPORT_SYMBOL(md_error);
5891EXPORT_SYMBOL(md_done_sync);
5892EXPORT_SYMBOL(md_write_start);
5893EXPORT_SYMBOL(md_write_end);
5894EXPORT_SYMBOL(md_register_thread);
5895EXPORT_SYMBOL(md_unregister_thread);
5896EXPORT_SYMBOL(md_wakeup_thread);
5897EXPORT_SYMBOL(md_check_recovery);
5898MODULE_LICENSE("GPL");
5899MODULE_ALIAS("md");
5900MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
5901