/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

	  completely rewritten, based on the MD driver code from Marc Zyngier
*/
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */
119static int sysctl_speed_limit_min = 1000;
120static int sysctl_speed_limit_max = 200000;
121static inline int speed_min(struct mddev *mddev)
122{
123 return mddev->sync_speed_min ?
124 mddev->sync_speed_min : sysctl_speed_limit_min;
125}
126
127static inline int speed_max(struct mddev *mddev)
128{
129 return mddev->sync_speed_max ?
130 mddev->sync_speed_max : sysctl_speed_limit_max;
131}
132
133static struct ctl_table_header *raid_table_header;
134
135static struct ctl_table raid_table[] = {
136 {
137 .procname = "speed_limit_min",
138 .data = &sysctl_speed_limit_min,
139 .maxlen = sizeof(int),
140 .mode = S_IRUGO|S_IWUSR,
141 .proc_handler = proc_dointvec,
142 },
143 {
144 .procname = "speed_limit_max",
145 .data = &sysctl_speed_limit_max,
146 .maxlen = sizeof(int),
147 .mode = S_IRUGO|S_IWUSR,
148 .proc_handler = proc_dointvec,
149 },
150 { }
151};
152
153static struct ctl_table raid_dir_table[] = {
154 {
155 .procname = "raid",
156 .maxlen = 0,
157 .mode = S_IRUGO|S_IXUGO,
158 .child = raid_table,
159 },
160 { }
161};
162
163static struct ctl_table raid_root_table[] = {
164 {
165 .procname = "dev",
166 .maxlen = 0,
167 .mode = 0555,
168 .child = raid_dir_table,
169 },
170 { }
171};
172
173static const struct block_device_operations md_fops;
174
175static int start_readonly;

/*
 * Historically, opening a bare /dev/mdX device node implicitly created
 * the array.  create_on_open preserves that behaviour; it can be turned
 * off via the "create_on_open" module parameter, in which case arrays
 * must be created explicitly (for example through the "new_array"
 * module parameter) before the node is opened.
 */
static bool create_on_open = true;

/*
 * Allocate a bio from the per-array bio_set when one exists, so md I/O
 * does not depend on the global bio pool; fall back to bio_alloc()
 * otherwise.
 */
191struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
192 struct mddev *mddev)
193{
194 struct bio *b;
195
196 if (!mddev || !mddev->bio_set)
197 return bio_alloc(gfp_mask, nr_iovecs);
198
199 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
200 if (!b)
201 return NULL;
202 return b;
203}
204EXPORT_SYMBOL_GPL(bio_alloc_mddev);
205
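/*
 * Single-segment GFP_NOIO allocation for superblock and other metadata
 * I/O.  It uses the separate ->sync_set bioset so metadata writes do not
 * have to share a mempool with regular data bios.
 */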
206static struct bio *md_bio_alloc_sync(struct mddev *mddev)
207{
208 if (!mddev || !mddev->sync_set)
209 return bio_alloc(GFP_NOIO, 1);
210
211 return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
212}
213
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * will respond to increments by wakeup.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
224static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
225static atomic_t md_event_count;
226void md_new_event(struct mddev *mddev)
227{
228 atomic_inc(&md_event_count);
229 wake_up(&md_event_waiters);
230}
231EXPORT_SYMBOL_GPL(md_new_event);
232
/*
 * Enables to iterate over all existing md arrays
 * all_mddevs_lock protects this list.
 */
237static LIST_HEAD(all_mddevs);
238static DEFINE_SPINLOCK(all_mddevs_lock);
/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop still owns a reference
 * to the current mddev and must mddev_put() it.
 */
247#define for_each_mddev(_mddev,_tmp) \
248 \
249 for (({ spin_lock(&all_mddevs_lock); \
250 _tmp = all_mddevs.next; \
251 _mddev = NULL;}); \
252 ({ if (_tmp != &all_mddevs) \
253 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
254 spin_unlock(&all_mddevs_lock); \
255 if (_mddev) mddev_put(_mddev); \
256 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
257 _tmp != &all_mddevs;}); \
258 ({ spin_lock(&all_mddevs_lock); \
259 _tmp = _tmp->next;}) \
260 )
261
/*
 * Check whether a bio must wait before being handled: either the whole
 * array is suspended, or it is a write overlapping the suspended range
 * [suspend_lo, suspend_hi).
 */
269static bool is_suspended(struct mddev *mddev, struct bio *bio)
270{
271 if (mddev->suspended)
272 return true;
273 if (bio_data_dir(bio) != WRITE)
274 return false;
275 if (mddev->suspend_lo >= mddev->suspend_hi)
276 return false;
277 if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
278 return false;
279 if (bio_end_sector(bio) < mddev->suspend_lo)
280 return false;
281 return true;
282}
283
284void md_handle_request(struct mddev *mddev, struct bio *bio)
285{
286check_suspended:
287 rcu_read_lock();
288 if (is_suspended(mddev, bio)) {
289 DEFINE_WAIT(__wait);
290 for (;;) {
291 prepare_to_wait(&mddev->sb_wait, &__wait,
292 TASK_UNINTERRUPTIBLE);
293 if (!is_suspended(mddev, bio))
294 break;
295 rcu_read_unlock();
296 schedule();
297 rcu_read_lock();
298 }
299 finish_wait(&mddev->sb_wait, &__wait);
300 }
301 atomic_inc(&mddev->active_io);
302 rcu_read_unlock();
303
304 if (!mddev->pers->make_request(mddev, bio)) {
305 atomic_dec(&mddev->active_io);
306 wake_up(&mddev->sb_wait);
307 goto check_suspended;
308 }
309
310 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
311 wake_up(&mddev->sb_wait);
312}
313EXPORT_SYMBOL(md_handle_request);
314
315static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
316{
317 const int rw = bio_data_dir(bio);
318 struct mddev *mddev = q->queuedata;
319 unsigned int sectors;
320 int cpu;
321
322 blk_queue_split(q, &bio);
323
324 if (mddev == NULL || mddev->pers == NULL) {
325 bio_io_error(bio);
326 return BLK_QC_T_NONE;
327 }
328 if (mddev->ro == 1 && unlikely(rw == WRITE)) {
329 if (bio_sectors(bio) != 0)
330 bio->bi_status = BLK_STS_IOERR;
331 bio_endio(bio);
332 return BLK_QC_T_NONE;
333 }
	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;
342
343 md_handle_request(mddev, bio);
344
345 cpu = part_stat_lock();
346 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
347 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
348 part_stat_unlock();
349
350 return BLK_QC_T_NONE;
351}
352
/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
359void mddev_suspend(struct mddev *mddev)
360{
361 WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
362 lockdep_assert_held(&mddev->reconfig_mutex);
363 if (mddev->suspended++)
364 return;
365 synchronize_rcu();
366 wake_up(&mddev->sb_wait);
367 set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
368 smp_mb__after_atomic();
369 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
370 mddev->pers->quiesce(mddev, 1);
371 clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
372 wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
373
374 del_timer_sync(&mddev->safemode_timer);
375}
376EXPORT_SYMBOL_GPL(mddev_suspend);
377
378void mddev_resume(struct mddev *mddev)
379{
380 lockdep_assert_held(&mddev->reconfig_mutex);
381 if (--mddev->suspended)
382 return;
383 wake_up(&mddev->sb_wait);
384 mddev->pers->quiesce(mddev, 0);
385
386 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
387 md_wakeup_thread(mddev->thread);
388 md_wakeup_thread(mddev->sync_thread);
389}
390EXPORT_SYMBOL_GPL(mddev_resume);
391
392int mddev_congested(struct mddev *mddev, int bits)
393{
394 struct md_personality *pers = mddev->pers;
395 int ret = 0;
396
397 rcu_read_lock();
398 if (mddev->suspended)
399 ret = 1;
400 else if (pers && pers->congested)
401 ret = pers->congested(mddev, bits);
402 rcu_read_unlock();
403 return ret;
404}
405EXPORT_SYMBOL_GPL(mddev_congested);
406static int md_congested(void *data, int bits)
407{
408 struct mddev *mddev = data;
409 return mddev_congested(mddev, bits);
410}
411
/*
 * Generic flush handling for md
 */
416static void md_end_flush(struct bio *bio)
417{
418 struct md_rdev *rdev = bio->bi_private;
419 struct mddev *mddev = rdev->mddev;
420
421 rdev_dec_pending(rdev, mddev);
422
423 if (atomic_dec_and_test(&mddev->flush_pending)) {
424
425 queue_work(md_wq, &mddev->flush_work);
426 }
427 bio_put(bio);
428}
429
430static void md_submit_flush_data(struct work_struct *ws);
431
432static void submit_flushes(struct work_struct *ws)
433{
434 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
435 struct md_rdev *rdev;
436
437 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
438 atomic_set(&mddev->flush_pending, 1);
439 rcu_read_lock();
440 rdev_for_each_rcu(rdev, mddev)
441 if (rdev->raid_disk >= 0 &&
442 !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
448 atomic_inc(&rdev->nr_pending);
449 atomic_inc(&rdev->nr_pending);
450 rcu_read_unlock();
451 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
452 bi->bi_end_io = md_end_flush;
453 bi->bi_private = rdev;
454 bio_set_dev(bi, rdev->bdev);
455 bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
456 atomic_inc(&mddev->flush_pending);
457 submit_bio(bi);
458 rcu_read_lock();
459 rdev_dec_pending(rdev, mddev);
460 }
461 rcu_read_unlock();
462 if (atomic_dec_and_test(&mddev->flush_pending))
463 queue_work(md_wq, &mddev->flush_work);
464}
465
466static void md_submit_flush_data(struct work_struct *ws)
467{
468 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
469 struct bio *bio = mddev->flush_bio;
	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock, because other bios passed md_handle_request suspend check
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
	mddev->flush_bio = NULL;
478 wake_up(&mddev->sb_wait);
479
480 if (bio->bi_iter.bi_size == 0)
		/* an empty barrier - all done */
		bio_endio(bio);
483 else {
484 bio->bi_opf &= ~REQ_PREFLUSH;
485 md_handle_request(mddev, bio);
486 }
487}
488
489void md_flush_request(struct mddev *mddev, struct bio *bio)
490{
491 spin_lock_irq(&mddev->lock);
492 wait_event_lock_irq(mddev->sb_wait,
493 !mddev->flush_bio,
494 mddev->lock);
495 mddev->flush_bio = bio;
496 spin_unlock_irq(&mddev->lock);
497
498 INIT_WORK(&mddev->flush_work, submit_flushes);
499 queue_work(md_wq, &mddev->flush_work);
500}
501EXPORT_SYMBOL(md_flush_request);
502
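/*
 * mddev reference counting: mddev_get() takes a reference, mddev_put()
 * drops it and tears the mddev down once it is unconfigured, unlisted
 * and no longer held active.
 */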
503static inline struct mddev *mddev_get(struct mddev *mddev)
504{
505 atomic_inc(&mddev->active);
506 return mddev;
507}
508
509static void mddev_delayed_delete(struct work_struct *ws);
510
511static void mddev_put(struct mddev *mddev)
512{
513 struct bio_set *bs = NULL, *sync_bs = NULL;
514
515 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
516 return;
517 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
518 mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);
522 bs = mddev->bio_set;
523 sync_bs = mddev->sync_set;
524 mddev->bio_set = NULL;
525 mddev->sync_set = NULL;
526 if (mddev->gendisk) {
			/* We did a probe so need to clean up.  Call
			 * queue_work inside the spinlock so that
			 * flush_workqueue() after mddev_find will
			 * succeed in waiting for the work to be done.
			 */
			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
533 queue_work(md_misc_wq, &mddev->del_work);
534 } else
535 kfree(mddev);
536 }
537 spin_unlock(&all_mddevs_lock);
538 if (bs)
539 bioset_free(bs);
540 if (sync_bs)
541 bioset_free(sync_bs);
542}
543
544static void md_safemode_timeout(struct timer_list *t);
545
546void mddev_init(struct mddev *mddev)
547{
548 mutex_init(&mddev->open_mutex);
549 mutex_init(&mddev->reconfig_mutex);
550 mutex_init(&mddev->bitmap_info.mutex);
551 INIT_LIST_HEAD(&mddev->disks);
552 INIT_LIST_HEAD(&mddev->all_mddevs);
553 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
554 atomic_set(&mddev->active, 1);
555 atomic_set(&mddev->openers, 0);
556 atomic_set(&mddev->active_io, 0);
557 spin_lock_init(&mddev->lock);
558 atomic_set(&mddev->flush_pending, 0);
559 init_waitqueue_head(&mddev->sb_wait);
560 init_waitqueue_head(&mddev->recovery_wait);
561 mddev->reshape_position = MaxSector;
562 mddev->reshape_backwards = 0;
563 mddev->last_sync_action = "none";
564 mddev->resync_min = 0;
565 mddev->resync_max = MaxSector;
566 mddev->level = LEVEL_NONE;
567}
568EXPORT_SYMBOL_GPL(mddev_init);
569
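/*
 * Find the mddev for a unit number, allocating and registering a new one
 * if needed; unit == 0 requests a brand-new array on an unused minor.
 * The returned mddev carries a reference for the caller.
 */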
570static struct mddev *mddev_find(dev_t unit)
571{
572 struct mddev *mddev, *new = NULL;
573
574 if (unit && MAJOR(unit) != MD_MAJOR)
575 unit &= ~((1<<MdpMinorShift)-1);
576
577 retry:
578 spin_lock(&all_mddevs_lock);
579
580 if (unit) {
581 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
582 if (mddev->unit == unit) {
583 mddev_get(mddev);
584 spin_unlock(&all_mddevs_lock);
585 kfree(new);
586 return mddev;
587 }
588
589 if (new) {
590 list_add(&new->all_mddevs, &all_mddevs);
591 spin_unlock(&all_mddevs_lock);
592 new->hold_active = UNTIL_IOCTL;
593 return new;
594 }
595 } else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
598 int start = next_minor;
599 int is_free = 0;
600 int dev = 0;
601 while (!is_free) {
602 dev = MKDEV(MD_MAJOR, next_minor);
603 next_minor++;
604 if (next_minor > MINORMASK)
605 next_minor = 0;
606 if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
609 kfree(new);
610 return NULL;
611 }
612
613 is_free = 1;
614 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
615 if (mddev->unit == dev) {
616 is_free = 0;
617 break;
618 }
619 }
620 new->unit = dev;
621 new->md_minor = MINOR(dev);
622 new->hold_active = UNTIL_STOP;
623 list_add(&new->all_mddevs, &all_mddevs);
624 spin_unlock(&all_mddevs_lock);
625 return new;
626 }
627 spin_unlock(&all_mddevs_lock);
628
629 new = kzalloc(sizeof(*new), GFP_KERNEL);
630 if (!new)
631 return NULL;
632
633 new->unit = unit;
634 if (MAJOR(unit) == MD_MAJOR)
635 new->md_minor = MINOR(unit);
636 else
637 new->md_minor = MINOR(unit) >> MdpMinorShift;
638
639 mddev_init(new);
640
641 goto retry;
642}
643
644static struct attribute_group md_redundancy_group;
645
646void mddev_unlock(struct mddev *mddev)
647{
648 if (mddev->to_remove) {
		/* These sysfs groups cannot be removed while we hold
		 * reconfig_mutex: removing them can deadlock against
		 * sysfs accesses that themselves take the mutex.  So
		 * note the group in ->to_remove, set sysfs_active to
		 * keep concurrent reconfiguration out, drop the mutex,
		 * and only then remove the group.
		 * md_redundancy_group is special-cased: it is removed
		 * whenever the personality has no sync_request method.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
662 mddev->to_remove = NULL;
663 mddev->sysfs_active = 1;
664 mutex_unlock(&mddev->reconfig_mutex);
665
666 if (mddev->kobj.sd) {
667 if (to_remove != &md_redundancy_group)
668 sysfs_remove_group(&mddev->kobj, to_remove);
669 if (mddev->pers == NULL ||
670 mddev->pers->sync_request == NULL) {
671 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
672 if (mddev->sysfs_action)
673 sysfs_put(mddev->sysfs_action);
674 mddev->sysfs_action = NULL;
675 }
676 }
677 mddev->sysfs_active = 0;
678 } else
679 mutex_unlock(&mddev->reconfig_mutex);
680
	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
685 md_wakeup_thread(mddev->thread);
686 wake_up(&mddev->sb_wait);
687 spin_unlock(&pers_lock);
688}
689EXPORT_SYMBOL_GPL(mddev_unlock);
690
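/* Find a member device by descriptor number; caller must hold rcu_read_lock(). */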
691struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
692{
693 struct md_rdev *rdev;
694
695 rdev_for_each_rcu(rdev, mddev)
696 if (rdev->desc_nr == nr)
697 return rdev;
698
699 return NULL;
700}
701EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
702
703static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
704{
705 struct md_rdev *rdev;
706
707 rdev_for_each(rdev, mddev)
708 if (rdev->bdev->bd_dev == dev)
709 return rdev;
710
711 return NULL;
712}
713
714static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
715{
716 struct md_rdev *rdev;
717
718 rdev_for_each_rcu(rdev, mddev)
719 if (rdev->bdev->bd_dev == dev)
720 return rdev;
721
722 return NULL;
723}
724
725static struct md_personality *find_pers(int level, char *clevel)
726{
727 struct md_personality *pers;
728 list_for_each_entry(pers, &pers_list, list) {
729 if (level != LEVEL_NONE && pers->level == level)
730 return pers;
731 if (strcmp(pers->name, clevel)==0)
732 return pers;
733 }
734 return NULL;
735}
736
/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
739{
740 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
741 return MD_NEW_SIZE_SECTORS(num_sectors);
742}
743
744static int alloc_disk_sb(struct md_rdev *rdev)
745{
746 rdev->sb_page = alloc_page(GFP_KERNEL);
747 if (!rdev->sb_page)
748 return -ENOMEM;
749 return 0;
750}
751
752void md_rdev_clear(struct md_rdev *rdev)
753{
754 if (rdev->sb_page) {
755 put_page(rdev->sb_page);
756 rdev->sb_loaded = 0;
757 rdev->sb_page = NULL;
758 rdev->sb_start = 0;
759 rdev->sectors = 0;
760 }
761 if (rdev->bb_page) {
762 put_page(rdev->bb_page);
763 rdev->bb_page = NULL;
764 }
765 badblocks_exit(&rdev->badblocks);
766}
767EXPORT_SYMBOL_GPL(md_rdev_clear);
768
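/*
 * Completion handler for superblock writes issued by md_super_write():
 * on error, report it via md_error() (possibly requesting a rewrite of a
 * failfast attempt), then drop pending_writes and wake md_super_wait().
 */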
769static void super_written(struct bio *bio)
770{
771 struct md_rdev *rdev = bio->bi_private;
772 struct mddev *mddev = rdev->mddev;
773
774 if (bio->bi_status) {
775 pr_err("md: super_written gets error=%d\n", bio->bi_status);
776 md_error(mddev, rdev);
777 if (!test_bit(Faulty, &rdev->flags)
778 && (bio->bi_opf & MD_FAILFAST)) {
779 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
780 set_bit(LastDev, &rdev->flags);
781 }
782 } else
783 clear_bit(LastDev, &rdev->flags);
784
785 if (atomic_dec_and_test(&mddev->pending_writes))
786 wake_up(&mddev->sb_wait);
787 rdev_dec_pending(rdev, mddev);
788 bio_put(bio);
789}
790
791void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
792 sector_t sector, int size, struct page *page)
793{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;
801 int ff = 0;
802
803 if (test_bit(Faulty, &rdev->flags))
804 return;
805
806 bio = md_bio_alloc_sync(mddev);
807
808 atomic_inc(&rdev->nr_pending);
809
810 bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
811 bio->bi_iter.bi_sector = sector;
812 bio_add_page(bio, page, size, 0);
813 bio->bi_private = rdev;
814 bio->bi_end_io = super_written;
815
816 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
817 test_bit(FailFast, &rdev->flags) &&
818 !test_bit(LastDev, &rdev->flags))
819 ff = MD_FAILFAST;
820 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
821
822 atomic_inc(&mddev->pending_writes);
823 submit_bio(bio);
824}
825
826int md_super_wait(struct mddev *mddev)
827{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
830 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
831 return -EAGAIN;
832 return 0;
833}
834
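/*
 * Synchronously read or write a single page of (meta)data on an rdev.
 * Returns 1 on success, 0 on failure.
 */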
835int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
836 struct page *page, int op, int op_flags, bool metadata_op)
837{
838 struct bio *bio = md_bio_alloc_sync(rdev->mddev);
839 int ret;
840
841 if (metadata_op && rdev->meta_bdev)
842 bio_set_dev(bio, rdev->meta_bdev);
843 else
844 bio_set_dev(bio, rdev->bdev);
845 bio_set_op_attrs(bio, op, op_flags);
846 if (metadata_op)
847 bio->bi_iter.bi_sector = sector + rdev->sb_start;
848 else if (rdev->mddev->reshape_position != MaxSector &&
849 (rdev->mddev->reshape_backwards ==
850 (sector >= rdev->mddev->reshape_position)))
851 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
852 else
853 bio->bi_iter.bi_sector = sector + rdev->data_offset;
854 bio_add_page(bio, page, size, 0);
855
856 submit_bio_wait(bio);
857
858 ret = !bio->bi_status;
859 bio_put(bio);
860 return ret;
861}
862EXPORT_SYMBOL_GPL(sync_page_io);
863
864static int read_disk_sb(struct md_rdev *rdev, int size)
865{
866 char b[BDEVNAME_SIZE];
867
868 if (rdev->sb_loaded)
869 return 0;
870
871 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
872 goto fail;
873 rdev->sb_loaded = 1;
874 return 0;
875
876fail:
877 pr_err("md: disabled device %s, could not read superblock.\n",
878 bdevname(rdev->bdev,b));
879 return -EINVAL;
880}
881
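/* Compare the array UUIDs recorded in two 0.90 superblocks. */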
882static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
883{
884 return sb1->set_uuid0 == sb2->set_uuid0 &&
885 sb1->set_uuid1 == sb2->set_uuid1 &&
886 sb1->set_uuid2 == sb2->set_uuid2 &&
887 sb1->set_uuid3 == sb2->set_uuid3;
888}
889
890static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
891{
892 int ret;
893 mdp_super_t *tmp1, *tmp2;
894
895 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
896 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
897
898 if (!tmp1 || !tmp2) {
899 ret = 0;
900 goto abort;
901 }
902
903 *tmp1 = *sb1;
904 *tmp2 = *sb2;
905
	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;
911
912 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
913abort:
914 kfree(tmp1);
915 kfree(tmp2);
916 return ret;
917}
918
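/*
 * Fold a 32-bit checksum down to 16 bits so superblock checksums written
 * by different architectures can be compared consistently.
 */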
919static u32 md_csum_fold(u32 csum)
920{
921 csum = (csum & 0xffff) + (csum >> 16);
922 return (csum & 0xffff) + (csum >> 16);
923}
924
925static unsigned int calc_sb_csum(mdp_super_t *sb)
926{
927 u64 newcsum = 0;
928 u32 *sb32 = (u32*)sb;
929 int i;
930 unsigned int disk_csum, csum;
931
932 disk_csum = sb->sb_csum;
933 sb->sb_csum = 0;
934
935 for (i = 0; i < MD_SB_BYTES/4 ; i++)
936 newcsum += sb32[i];
937 csum = (newcsum & 0xffffffff) + (newcsum>>32);
938
#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * using), but we must try to keep the same value on disk, so
	 * existing arrays stay compatible.  On alpha the old code stored
	 * the folded value, so keep doing that here.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
952 return csum;
953}
954
/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a handler for each, and attempt to load all.
 * In the end, the most recent superblock is validated and
 * used.
 *
 * Methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};
1000
/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	pr_warn("%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);
1018
/*
 * load_super for 0.90.0
 */
1022static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1023{
1024 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1025 mdp_super_t *sb;
1026 int ret;
1027
	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
1034 rdev->sb_start = calc_dev_sboffset(rdev);
1035
1036 ret = read_disk_sb(rdev, MD_SB_BYTES);
1037 if (ret)
1038 return ret;
1039
1040 ret = -EINVAL;
1041
1042 bdevname(rdev->bdev, b);
1043 sb = page_address(rdev->sb_page);
1044
1045 if (sb->md_magic != MD_SB_MAGIC) {
1046 pr_warn("md: invalid raid superblock magic on %s\n", b);
1047 goto abort;
1048 }
1049
1050 if (sb->major_version != 0 ||
1051 sb->minor_version < 90 ||
1052 sb->minor_version > 91) {
1053 pr_warn("Bad version number %d.%d on %s\n",
1054 sb->major_version, sb->minor_version, b);
1055 goto abort;
1056 }
1057
1058 if (sb->raid_disks <= 0)
1059 goto abort;
1060
1061 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1062 pr_warn("md: invalid superblock checksum on %s\n", b);
1063 goto abort;
1064 }
1065
1066 rdev->preferred_minor = sb->md_minor;
1067 rdev->data_offset = 0;
1068 rdev->new_data_offset = 0;
1069 rdev->sb_size = MD_SB_BYTES;
1070 rdev->badblocks.shift = -1;
1071
1072 if (sb->level == LEVEL_MULTIPATH)
1073 rdev->desc_nr = -1;
1074 else
1075 rdev->desc_nr = sb->this_disk.number;
1076
1077 if (!refdev) {
1078 ret = 1;
1079 } else {
1080 __u64 ev1, ev2;
1081 mdp_super_t *refsb = page_address(refdev->sb_page);
1082 if (!md_uuid_equal(refsb, sb)) {
1083 pr_warn("md: %s has different UUID to %s\n",
1084 b, bdevname(refdev->bdev,b2));
1085 goto abort;
1086 }
1087 if (!md_sb_equal(refsb, sb)) {
1088 pr_warn("md: %s has same UUID but different superblock to %s\n",
1089 b, bdevname(refdev->bdev, b2));
1090 goto abort;
1091 }
1092 ev1 = md_event(sb);
1093 ev2 = md_event(refsb);
1094 if (ev1 > ev2)
1095 ret = 1;
1096 else
1097 ret = 0;
1098 }
1099 rdev->sectors = rdev->sb_start;

	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
	    sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;
1111
1112 abort:
1113 return ret;
1114}
1115
/*
 * validate_super for 0.90.0
 */
1119static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1120{
1121 mdp_disk_t *desc;
1122 mdp_super_t *sb = page_address(rdev->sb_page);
1123 __u64 ev1 = md_event(sb);
1124
1125 rdev->raid_disk = -1;
1126 clear_bit(Faulty, &rdev->flags);
1127 clear_bit(In_sync, &rdev->flags);
1128 clear_bit(Bitmap_sync, &rdev->flags);
1129 clear_bit(WriteMostly, &rdev->flags);
1130
1131 if (mddev->raid_disks == 0) {
1132 mddev->major_version = 0;
1133 mddev->minor_version = sb->minor_version;
1134 mddev->patch_version = sb->patch_version;
1135 mddev->external = 0;
1136 mddev->chunk_sectors = sb->chunk_size >> 9;
1137 mddev->ctime = sb->ctime;
1138 mddev->utime = sb->utime;
1139 mddev->level = sb->level;
1140 mddev->clevel[0] = 0;
1141 mddev->layout = sb->layout;
1142 mddev->raid_disks = sb->raid_disks;
1143 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1144 mddev->events = ev1;
1145 mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1150 mddev->reshape_backwards = 0;
1151
1152 if (mddev->minor_version >= 91) {
1153 mddev->reshape_position = sb->reshape_position;
1154 mddev->delta_disks = sb->delta_disks;
1155 mddev->new_level = sb->new_level;
1156 mddev->new_layout = sb->new_layout;
1157 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1158 if (mddev->delta_disks < 0)
1159 mddev->reshape_backwards = 1;
1160 } else {
1161 mddev->reshape_position = MaxSector;
1162 mddev->delta_disks = 0;
1163 mddev->new_level = mddev->level;
1164 mddev->new_layout = mddev->layout;
1165 mddev->new_chunk_sectors = mddev->chunk_sectors;
1166 }
1167
1168 if (sb->state & (1<<MD_SB_CLEAN))
1169 mddev->recovery_cp = MaxSector;
1170 else {
1171 if (sb->events_hi == sb->cp_events_hi &&
1172 sb->events_lo == sb->cp_events_lo) {
1173 mddev->recovery_cp = sb->recovery_cp;
1174 } else
1175 mddev->recovery_cp = 0;
1176 }
1177
1178 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1179 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1180 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1181 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1182
1183 mddev->max_disks = MD_SB_DISKS;
1184
1185 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1186 mddev->bitmap_info.file == NULL) {
1187 mddev->bitmap_info.offset =
1188 mddev->bitmap_info.default_offset;
1189 mddev->bitmap_info.space =
1190 mddev->bitmap_info.default_space;
1191 }
1192
	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't have an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
1214
	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
1240 return 0;
1241}
1242
/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero it out
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1264
1265 rdev->sb_size = MD_SB_BYTES;
1266
1267 sb = page_address(rdev->sb_page);
1268
1269 memset(sb, 0, sizeof(*sb));
1270
1271 sb->md_magic = MD_SB_MAGIC;
1272 sb->major_version = mddev->major_version;
1273 sb->patch_version = mddev->patch_version;
1274 sb->gvalid_words = 0;
1275 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1276 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1277 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1278 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1279
1280 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1281 sb->level = mddev->level;
1282 sb->size = mddev->dev_sectors / 2;
1283 sb->raid_disks = mddev->raid_disks;
1284 sb->md_minor = mddev->md_minor;
1285 sb->not_persistent = 0;
1286 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1287 sb->state = 0;
1288 sb->events_hi = (mddev->events>>32);
1289 sb->events_lo = (u32)mddev->events;
1290
1291 if (mddev->reshape_position == MaxSector)
1292 sb->minor_version = 90;
1293 else {
1294 sb->minor_version = 91;
1295 sb->reshape_position = mddev->reshape_position;
1296 sb->new_level = mddev->new_level;
1297 sb->delta_disks = mddev->delta_disks;
1298 sb->new_layout = mddev->new_layout;
1299 sb->new_chunk = mddev->new_chunk_sectors << 9;
1300 }
1301 mddev->minor_version = sb->minor_version;
1302 if (mddev->in_sync)
1303 {
1304 sb->recovery_cp = mddev->recovery_cp;
1305 sb->cp_events_hi = (mddev->events>>32);
1306 sb->cp_events_lo = (u32)mddev->events;
1307 if (mddev->recovery_cp == MaxSector)
1308 sb->state = (1<< MD_SB_CLEAN);
1309 } else
1310 sb->recovery_cp = 0;
1311
1312 sb->layout = mddev->layout;
1313 sb->chunk_size = mddev->chunk_sectors << 9;
1314
1315 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1316 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1317
1318 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1319 rdev_for_each(rdev2, mddev) {
1320 mdp_disk_t *d;
1321 int desc_nr;
1322 int is_active = test_bit(In_sync, &rdev2->flags);
1323
		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
1331 if (rdev2->raid_disk < 0 ||
1332 test_bit(Faulty, &rdev2->flags))
1333 is_active = 0;
1334 if (is_active)
1335 desc_nr = rdev2->raid_disk;
1336 else
1337 desc_nr = next_spare++;
1338 rdev2->desc_nr = desc_nr;
1339 d = &sb->disks[rdev2->desc_nr];
1340 nr_disks++;
1341 d->number = rdev2->desc_nr;
1342 d->major = MAJOR(rdev2->bdev->bd_dev);
1343 d->minor = MINOR(rdev2->bdev->bd_dev);
1344 if (is_active)
1345 d->raid_disk = rdev2->raid_disk;
1346 else
1347 d->raid_disk = rdev2->desc_nr;
1348 if (test_bit(Faulty, &rdev2->flags))
1349 d->state = (1<<MD_DISK_FAULTY);
1350 else if (is_active) {
1351 d->state = (1<<MD_DISK_ACTIVE);
1352 if (test_bit(In_sync, &rdev2->flags))
1353 d->state |= (1<<MD_DISK_SYNC);
1354 active++;
1355 working++;
1356 } else {
1357 d->state = 0;
1358 spare++;
1359 working++;
1360 }
1361 if (test_bit(WriteMostly, &rdev2->flags))
1362 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1363 if (test_bit(FailFast, &rdev2->flags))
1364 d->state |= (1<<MD_DISK_FAILFAST);
1365 }
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
1368 mdp_disk_t *d = &sb->disks[i];
1369 if (d->state == 0 && d->number == 0) {
1370 d->number = i;
1371 d->raid_disk = i;
1372 d->state = (1<<MD_DISK_REMOVED);
1373 d->state |= (1<<MD_DISK_FAULTY);
1374 failed++;
1375 }
1376 }
1377 sb->nr_disks = nr_disks;
1378 sb->active_disks = active;
1379 sb->working_disks = working;
1380 sb->failed_disks = failed;
1381 sb->spare_disks = spare;
1382
1383 sb->this_disk = sb->disks[rdev->desc_nr];
1384 sb->sb_csum = calc_sb_csum(sb);
1385}
1386
/*
 * rdev_size_change for 0.90.0
 */
1390static unsigned long long
1391super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1392{
1393 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1394 return 0;
1395 if (rdev->mddev->bitmap_info.offset)
1396 return 0;
1397 rdev->sb_start = calc_dev_sboffset(rdev);
1398 if (!num_sectors || num_sectors > rdev->sb_start)
1399 num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
1403 if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
1404 rdev->mddev->level >= 1)
1405 num_sectors = (sector_t)(2ULL << 32) - 2;
1406 do {
1407 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1408 rdev->sb_page);
1409 } while (md_super_wait(rdev->mddev) < 0);
1410 return num_sectors;
1411}
1412
1413static int
1414super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1415{
1416
1417 return new_offset == 0;
1418}
1419
/*
 * version 1 superblock
 */

1424static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1425{
1426 __le32 disk_csum;
1427 u32 csum;
1428 unsigned long long newcsum;
1429 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1430 __le32 *isuper = (__le32*)sb;
1431
1432 disk_csum = sb->sb_csum;
1433 sb->sb_csum = 0;
1434 newcsum = 0;
1435 for (; size >= 4; size -= 4)
1436 newcsum += le32_to_cpu(*isuper++);
1437
1438 if (size == 2)
1439 newcsum += le16_to_cpu(*(__le16*) isuper);
1440
1441 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1442 sb->sb_csum = disk_csum;
1443 return cpu_to_le32(csum);
1444}
1445
1446static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1447{
1448 struct mdp_superblock_1 *sb;
1449 int ret;
1450 sector_t sb_start;
1451 sector_t sectors;
1452 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1453 int bmask;
1454
	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
1463 switch(minor_version) {
1464 case 0:
1465 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1466 sb_start -= 8*2;
1467 sb_start &= ~(sector_t)(4*2-1);
1468 break;
1469 case 1:
1470 sb_start = 0;
1471 break;
1472 case 2:
1473 sb_start = 8;
1474 break;
1475 default:
1476 return -EINVAL;
1477 }
1478 rdev->sb_start = sb_start;
1479
	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
1484 if (ret) return ret;
1485
1486 sb = page_address(rdev->sb_page);
1487
1488 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1489 sb->major_version != cpu_to_le32(1) ||
1490 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1491 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1492 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1493 return -EINVAL;
1494
1495 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1496 pr_warn("md: invalid superblock checksum on %s\n",
1497 bdevname(rdev->bdev,b));
1498 return -EINVAL;
1499 }
1500 if (le64_to_cpu(sb->data_size) < 10) {
1501 pr_warn("md: data_size too small on %s\n",
1502 bdevname(rdev->bdev,b));
1503 return -EINVAL;
1504 }
1505 if (sb->pad0 ||
1506 sb->pad3[0] ||
1507 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1508
1509 return -EINVAL;
1510
1511 rdev->preferred_minor = 0xffff;
1512 rdev->data_offset = le64_to_cpu(sb->data_offset);
1513 rdev->new_data_offset = rdev->data_offset;
1514 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1515 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1516 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1517 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1518
1519 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1520 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1521 if (rdev->sb_size & bmask)
1522 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1523
1524 if (minor_version
1525 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1526 return -EINVAL;
1527 if (minor_version
1528 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1529 return -EINVAL;
1530
1531 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1532 rdev->desc_nr = -1;
1533 else
1534 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1535
1536 if (!rdev->bb_page) {
1537 rdev->bb_page = alloc_page(GFP_KERNEL);
1538 if (!rdev->bb_page)
1539 return -ENOMEM;
1540 }
1541 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1542 rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
1546 s32 offset;
1547 sector_t bb_sector;
1548 u64 *bbp;
1549 int i;
1550 int sectors = le16_to_cpu(sb->bblog_size);
1551 if (sectors > (PAGE_SIZE / 512))
1552 return -EINVAL;
1553 offset = le32_to_cpu(sb->bblog_offset);
1554 if (offset == 0)
1555 return -EINVAL;
1556 bb_sector = (long long)offset;
1557 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1558 rdev->bb_page, REQ_OP_READ, 0, true))
1559 return -EIO;
1560 bbp = (u64 *)page_address(rdev->bb_page);
1561 rdev->badblocks.shift = sb->bblog_shift;
1562 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1563 u64 bb = le64_to_cpu(*bbp);
1564 int count = bb & (0x3ff);
1565 u64 sector = bb >> 10;
1566 sector <<= sb->bblog_shift;
1567 count <<= sb->bblog_shift;
1568 if (bb + 1 == 0)
1569 break;
1570 if (badblocks_set(&rdev->badblocks, sector, count, 1))
1571 return -EINVAL;
1572 }
1573 } else if (sb->bblog_offset != 0)
1574 rdev->badblocks.shift = 0;
1575
1576 if ((le32_to_cpu(sb->feature_map) &
1577 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1578 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1579 rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1580 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1581 }
1582
1583 if (!refdev) {
1584 ret = 1;
1585 } else {
1586 __u64 ev1, ev2;
1587 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1588
1589 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1590 sb->level != refsb->level ||
1591 sb->layout != refsb->layout ||
1592 sb->chunksize != refsb->chunksize) {
1593 pr_warn("md: %s has strangely different superblock to %s\n",
1594 bdevname(rdev->bdev,b),
1595 bdevname(refdev->bdev,b2));
1596 return -EINVAL;
1597 }
1598 ev1 = le64_to_cpu(sb->events);
1599 ev2 = le64_to_cpu(refsb->events);
1600
1601 if (ev1 > ev2)
1602 ret = 1;
1603 else
1604 ret = 0;
1605 }
1606 if (minor_version) {
1607 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1608 sectors -= rdev->data_offset;
1609 } else
1610 sectors = rdev->sb_start;
1611 if (sectors < le64_to_cpu(sb->data_size))
1612 return -EINVAL;
1613 rdev->sectors = le64_to_cpu(sb->data_size);
1614 return ret;
1615}
1616
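/*
 * validate_super for 1.x metadata: merge the superblock into a freshly
 * assembled mddev, or check that this device is recent enough to join an
 * already-assembled array.
 */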
1617static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1618{
1619 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1620 __u64 ev1 = le64_to_cpu(sb->events);
1621
1622 rdev->raid_disk = -1;
1623 clear_bit(Faulty, &rdev->flags);
1624 clear_bit(In_sync, &rdev->flags);
1625 clear_bit(Bitmap_sync, &rdev->flags);
1626 clear_bit(WriteMostly, &rdev->flags);
1627
1628 if (mddev->raid_disks == 0) {
1629 mddev->major_version = 1;
1630 mddev->patch_version = 0;
1631 mddev->external = 0;
1632 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1633 mddev->ctime = le64_to_cpu(sb->ctime);
1634 mddev->utime = le64_to_cpu(sb->utime);
1635 mddev->level = le32_to_cpu(sb->level);
1636 mddev->clevel[0] = 0;
1637 mddev->layout = le32_to_cpu(sb->layout);
1638 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1639 mddev->dev_sectors = le64_to_cpu(sb->size);
1640 mddev->events = ev1;
1641 mddev->bitmap_info.offset = 0;
1642 mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1648 mddev->reshape_backwards = 0;
1649
1650 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1651 memcpy(mddev->uuid, sb->set_uuid, 16);
1652
1653 mddev->max_disks = (4096-256)/2;
1654
1655 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1656 mddev->bitmap_info.file == NULL) {
1657 mddev->bitmap_info.offset =
1658 (__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
1664 if (mddev->minor_version > 0)
1665 mddev->bitmap_info.space = 0;
1666 else if (mddev->bitmap_info.offset > 0)
1667 mddev->bitmap_info.space =
1668 8 - mddev->bitmap_info.offset;
1669 else
1670 mddev->bitmap_info.space =
1671 -mddev->bitmap_info.offset;
1672 }
1673
1674 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1675 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1676 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1677 mddev->new_level = le32_to_cpu(sb->new_level);
1678 mddev->new_layout = le32_to_cpu(sb->new_layout);
1679 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1680 if (mddev->delta_disks < 0 ||
1681 (mddev->delta_disks == 0 &&
1682 (le32_to_cpu(sb->feature_map)
1683 & MD_FEATURE_RESHAPE_BACKWARDS)))
1684 mddev->reshape_backwards = 1;
1685 } else {
1686 mddev->reshape_position = MaxSector;
1687 mddev->delta_disks = 0;
1688 mddev->new_level = mddev->level;
1689 mddev->new_layout = mddev->layout;
1690 mddev->new_chunk_sectors = mddev->chunk_sectors;
1691 }
1692
1693 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1694 set_bit(MD_HAS_JOURNAL, &mddev->flags);
1695
1696 if (le32_to_cpu(sb->feature_map) &
1697 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1698 if (le32_to_cpu(sb->feature_map) &
1699 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1700 return -EINVAL;
1701 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1702 (le32_to_cpu(sb->feature_map) &
1703 MD_FEATURE_MULTIPLE_PPLS))
1704 return -EINVAL;
1705 set_bit(MD_HAS_PPL, &mddev->flags);
1706 }
1707 } else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
1711 if (rdev->desc_nr >= 0 &&
1712 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1713 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1714 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1715 if (ev1 < mddev->events)
1716 return -EINVAL;
1717 } else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
1721 if (ev1 < mddev->bitmap->events_cleared)
1722 return 0;
1723 if (ev1 < mddev->events)
1724 set_bit(Bitmap_sync, &rdev->flags);
1725 } else {
1726 if (ev1 < mddev->events)
1727
1728 return 0;
1729 }
1730 if (mddev->level != LEVEL_MULTIPATH) {
1731 int role;
1732 if (rdev->desc_nr < 0 ||
1733 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1734 role = MD_DISK_ROLE_SPARE;
1735 rdev->desc_nr = -1;
1736 } else
1737 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1738 switch(role) {
1739 case MD_DISK_ROLE_SPARE:
1740 break;
1741 case MD_DISK_ROLE_FAULTY:
1742 set_bit(Faulty, &rdev->flags);
1743 break;
1744 case MD_DISK_ROLE_JOURNAL:
1745 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1746
1747 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1748 return -EINVAL;
1749 }
1750 set_bit(Journal, &rdev->flags);
1751 rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1752 rdev->raid_disk = 0;
1753 break;
1754 default:
1755 rdev->saved_raid_disk = role;
1756 if ((le32_to_cpu(sb->feature_map) &
1757 MD_FEATURE_RECOVERY_OFFSET)) {
1758 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1759 if (!(le32_to_cpu(sb->feature_map) &
1760 MD_FEATURE_RECOVERY_BITMAP))
1761 rdev->saved_raid_disk = -1;
1762 } else
1763 set_bit(In_sync, &rdev->flags);
1764 rdev->raid_disk = role;
1765 break;
1766 }
1767 if (sb->devflags & WriteMostly1)
1768 set_bit(WriteMostly, &rdev->flags);
1769 if (sb->devflags & FailFast1)
1770 set_bit(FailFast, &rdev->flags);
1771 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1772 set_bit(Replacement, &rdev->flags);
1773 } else
1774 set_bit(In_sync, &rdev->flags);
1775
1776 return 0;
1777}
1778
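/*
 * sync_super for 1.x metadata: refresh the in-memory superblock image of
 * this rdev from current mddev state.  Nothing is written to disk here.
 */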
1779static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1780{
1781 struct mdp_superblock_1 *sb;
1782 struct md_rdev *rdev2;
1783 int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);
1787
1788 sb->feature_map = 0;
1789 sb->pad0 = 0;
1790 sb->recovery_offset = cpu_to_le64(0);
1791 memset(sb->pad3, 0, sizeof(sb->pad3));
1792
1793 sb->utime = cpu_to_le64((__u64)mddev->utime);
1794 sb->events = cpu_to_le64(mddev->events);
1795 if (mddev->in_sync)
1796 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1797 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1798 sb->resync_offset = cpu_to_le64(MaxSector);
1799 else
1800 sb->resync_offset = cpu_to_le64(0);
1801
1802 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1803
1804 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1805 sb->size = cpu_to_le64(mddev->dev_sectors);
1806 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1807 sb->level = cpu_to_le32(mddev->level);
1808 sb->layout = cpu_to_le32(mddev->layout);
1809 if (test_bit(FailFast, &rdev->flags))
1810 sb->devflags |= FailFast1;
1811 else
1812 sb->devflags &= ~FailFast1;
1813
1814 if (test_bit(WriteMostly, &rdev->flags))
1815 sb->devflags |= WriteMostly1;
1816 else
1817 sb->devflags &= ~WriteMostly1;
1818 sb->data_offset = cpu_to_le64(rdev->data_offset);
1819 sb->data_size = cpu_to_le64(rdev->sectors);
1820
1821 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1822 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1823 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1824 }
1825
1826 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1827 !test_bit(In_sync, &rdev->flags)) {
1828 sb->feature_map |=
1829 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1830 sb->recovery_offset =
1831 cpu_to_le64(rdev->recovery_offset);
1832 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1833 sb->feature_map |=
1834 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1835 }
1836
1837 if (test_bit(Journal, &rdev->flags))
1838 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1839 if (test_bit(Replacement, &rdev->flags))
1840 sb->feature_map |=
1841 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1842
1843 if (mddev->reshape_position != MaxSector) {
1844 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1845 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1846 sb->new_layout = cpu_to_le32(mddev->new_layout);
1847 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1848 sb->new_level = cpu_to_le32(mddev->new_level);
1849 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1850 if (mddev->delta_disks == 0 &&
1851 mddev->reshape_backwards)
1852 sb->feature_map
1853 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1854 if (rdev->new_data_offset != rdev->data_offset) {
1855 sb->feature_map
1856 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1857 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1858 - rdev->data_offset));
1859 }
1860 }
1861
1862 if (mddev_is_clustered(mddev))
1863 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1864
	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
1870 else {
1871 struct badblocks *bb = &rdev->badblocks;
1872 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1873 u64 *p = bb->page;
1874 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1875 if (bb->changed) {
1876 unsigned seq;
1877
1878retry:
1879 seq = read_seqbegin(&bb->lock);
1880
1881 memset(bbp, 0xff, PAGE_SIZE);
1882
1883 for (i = 0 ; i < bb->count ; i++) {
1884 u64 internal_bb = p[i];
1885 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1886 | BB_LEN(internal_bb));
1887 bbp[i] = cpu_to_le64(store_bb);
1888 }
1889 bb->changed = 0;
1890 if (read_seqretry(&bb->lock, seq))
1891 goto retry;
1892
1893 bb->sector = (rdev->sb_start +
1894 (int)le32_to_cpu(sb->bblog_offset));
1895 bb->size = le16_to_cpu(sb->bblog_size);
1896 }
1897 }
1898
1899 max_dev = 0;
1900 rdev_for_each(rdev2, mddev)
1901 if (rdev2->desc_nr+1 > max_dev)
1902 max_dev = rdev2->desc_nr+1;
1903
1904 if (max_dev > le32_to_cpu(sb->max_dev)) {
1905 int bmask;
1906 sb->max_dev = cpu_to_le32(max_dev);
1907 rdev->sb_size = max_dev * 2 + 256;
1908 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1909 if (rdev->sb_size & bmask)
1910 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1911 } else
1912 max_dev = le32_to_cpu(sb->max_dev);
1913
1914 for (i=0; i<max_dev;i++)
1915 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1916
1917 if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
1918 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
1919
1920 if (test_bit(MD_HAS_PPL, &mddev->flags)) {
1921 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
1922 sb->feature_map |=
1923 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
1924 else
1925 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
1926 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
1927 sb->ppl.size = cpu_to_le16(rdev->ppl.size);
1928 }
1929
1930 rdev_for_each(rdev2, mddev) {
1931 i = rdev2->desc_nr;
1932 if (test_bit(Faulty, &rdev2->flags))
1933 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1934 else if (test_bit(In_sync, &rdev2->flags))
1935 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1936 else if (test_bit(Journal, &rdev2->flags))
1937 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
1938 else if (rdev2->raid_disk >= 0)
1939 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1940 else
1941 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1942 }
1943
1944 sb->sb_csum = calc_sb_1_csum(sb);
1945}
1946
1947static unsigned long long
1948super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1949{
1950 struct mdp_superblock_1 *sb;
1951 sector_t max_sectors;
1952 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1953 return 0;
1954 if (rdev->data_offset != rdev->new_data_offset)
1955 return 0;
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
1968 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1969 sb_start &= ~(sector_t)(4*2 - 1);
1970 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1971 if (!num_sectors || num_sectors > max_sectors)
1972 num_sectors = max_sectors;
1973 rdev->sb_start = sb_start;
1974 }
1975 sb = page_address(rdev->sb_page);
1976 sb->data_size = cpu_to_le64(num_sectors);
1977 sb->super_offset = cpu_to_le64(rdev->sb_start);
1978 sb->sb_csum = calc_sb_1_csum(sb);
1979 do {
1980 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1981 rdev->sb_page);
1982 } while (md_super_wait(rdev->mddev) < 0);
1983 return num_sectors;
1984
1985}
1986
1987static int
1988super_1_allow_new_offset(struct md_rdev *rdev,
1989 unsigned long long new_offset)
1990{
	/* All necessary checks on new >= old have been done */
	struct bitmap *bitmap;
	if (new_offset >= rdev->data_offset)
		return 1;

	/* with 1.0 metadata, there is no metadata to tread on
	 * so we can always move back */
	if (rdev->mddev->minor_version == 0)
		return 1;

	/* otherwise we must be sure not to step on
	 * any metadata, so stay:
	 * 36K beyond start of superblock
	 * beyond end of badblocks
	 * beyond write-intent bitmap
	 */
	if (rdev->sb_start + (32+4)*2 > new_offset)
2008 return 0;
2009 bitmap = rdev->mddev->bitmap;
2010 if (bitmap && !rdev->mddev->bitmap_info.file &&
2011 rdev->sb_start + rdev->mddev->bitmap_info.offset +
2012 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2013 return 0;
2014 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2015 return 0;
2016
2017 return 1;
2018}
2019
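/*
 * Table of supported superblock formats, indexed by metadata major
 * version: 0.90.0 and 1.x.  The methods are described in the comment
 * above struct super_type.
 */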
2020static struct super_type super_types[] = {
2021 [0] = {
2022 .name = "0.90.0",
2023 .owner = THIS_MODULE,
2024 .load_super = super_90_load,
2025 .validate_super = super_90_validate,
2026 .sync_super = super_90_sync,
2027 .rdev_size_change = super_90_rdev_size_change,
2028 .allow_new_offset = super_90_allow_new_offset,
2029 },
2030 [1] = {
2031 .name = "md-1",
2032 .owner = THIS_MODULE,
2033 .load_super = super_1_load,
2034 .validate_super = super_1_validate,
2035 .sync_super = super_1_sync,
2036 .rdev_size_change = super_1_rdev_size_change,
2037 .allow_new_offset = super_1_allow_new_offset,
2038 },
2039};
2040
2041static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2042{
2043 if (mddev->sync_super) {
2044 mddev->sync_super(mddev, rdev);
2045 return;
2046 }
2047
2048 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2049
2050 super_types[mddev->major_version].sync_super(mddev, rdev);
2051}
2052
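/*
 * Return 1 if the two arrays share an underlying block device, i.e.
 * some member of mddev1 sits on the same disk as some member of mddev2.
 */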
2053static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2054{
2055 struct md_rdev *rdev, *rdev2;
2056
2057 rcu_read_lock();
2058 rdev_for_each_rcu(rdev, mddev1) {
2059 if (test_bit(Faulty, &rdev->flags) ||
2060 test_bit(Journal, &rdev->flags) ||
2061 rdev->raid_disk == -1)
2062 continue;
2063 rdev_for_each_rcu(rdev2, mddev2) {
2064 if (test_bit(Faulty, &rdev2->flags) ||
2065 test_bit(Journal, &rdev2->flags) ||
2066 rdev2->raid_disk == -1)
2067 continue;
2068 if (rdev->bdev->bd_contains ==
2069 rdev2->bdev->bd_contains) {
2070 rcu_read_unlock();
2071 return 1;
2072 }
2073 }
2074 }
2075 rcu_read_unlock();
2076 return 0;
2077}
2078
2079static LIST_HEAD(pending_raid_disks);
2080
/*
 * Try to register a data integrity profile for an mddev.
 *
 * All in-use component devices must expose compatible integrity
 * profiles; otherwise integrity support is not enabled for the array.
 */
2088int md_integrity_register(struct mddev *mddev)
2089{
2090 struct md_rdev *rdev, *reference = NULL;
2091
2092 if (list_empty(&mddev->disks))
2093 return 0;
2094 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2095 return 0;
2096 rdev_for_each(rdev, mddev) {
		/* skip spares and non-functional disks */
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (rdev->raid_disk < 0)
			continue;
		if (!reference) {
			/* Use the first rdev as the reference */
			reference = rdev;
			continue;
		}
		/* does this rdev's profile match the reference profile? */
2108 if (blk_integrity_compare(reference->bdev->bd_disk,
2109 rdev->bdev->bd_disk) < 0)
2110 return -EINVAL;
2111 }
2112 if (!reference || !bdev_get_integrity(reference->bdev))
2113 return 0;
2114
	/*
	 * All component devices are integrity capable and have matching
	 * profiles, register the common profile for the md device.
	 */
	blk_integrity_register(mddev->gendisk,
			       bdev_get_integrity(reference->bdev));
2120
2121 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2122 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
2123 pr_err("md: failed to create integrity pool for %s\n",
2124 mdname(mddev));
2125 return -EINVAL;
2126 }
2127 return 0;
2128}
2129EXPORT_SYMBOL(md_integrity_register);
2130
/*
 * Attempt to add an rdev, but only if it is consistent with the current
 * integrity profile
 */
2135int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2136{
2137 struct blk_integrity *bi_rdev;
2138 struct blk_integrity *bi_mddev;
2139 char name[BDEVNAME_SIZE];
2140
2141 if (!mddev->gendisk)
2142 return 0;
2143
2144 bi_rdev = bdev_get_integrity(rdev->bdev);
2145 bi_mddev = blk_get_integrity(mddev->gendisk);
2146
2147 if (!bi_mddev)
2148 return 0;
2149
2150 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2151 pr_err("%s: incompatible integrity profile for %s\n",
2152 mdname(mddev), bdevname(rdev->bdev, name));
2153 return -ENXIO;
2154 }
2155
2156 return 0;
2157}
2158EXPORT_SYMBOL(md_integrity_add_rdev);
2159
2160static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2161{
2162 char b[BDEVNAME_SIZE];
2163 struct kobject *ko;
2164 int err;
2165
2166
2167 if (find_rdev(mddev, rdev->bdev->bd_dev))
2168 return -EEXIST;
2169
2170 if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
2171 mddev->pers)
2172 return -EROFS;
2173
	/* make sure rdev->sectors exceeds mddev->dev_sectors */
	if (!test_bit(Journal, &rdev->flags) &&
	    rdev->sectors &&
	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear)
			 */
			if (mddev->level > 0)
				return -ENOSPC;
		} else
			mddev->dev_sectors = rdev->sectors;
	}

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	rcu_read_lock();
2194 if (rdev->desc_nr < 0) {
2195 int choice = 0;
2196 if (mddev->pers)
2197 choice = mddev->raid_disks;
2198 while (md_find_rdev_nr_rcu(mddev, choice))
2199 choice++;
2200 rdev->desc_nr = choice;
2201 } else {
2202 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2203 rcu_read_unlock();
2204 return -EBUSY;
2205 }
2206 }
2207 rcu_read_unlock();
2208 if (!test_bit(Journal, &rdev->flags) &&
2209 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2210 pr_warn("md: %s: array is limited to %d devices\n",
2211 mdname(mddev), mddev->max_disks);
2212 return -EBUSY;
2213 }
2214 bdevname(rdev->bdev,b);
2215 strreplace(b, '/', '!');
2216
2217 rdev->mddev = mddev;
2218 pr_debug("md: bind<%s>\n", b);
2219
2220 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2221 goto fail;
2222
2223 ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2224 if (sysfs_create_link(&rdev->kobj, ko, "block"))
2225 ;
2226 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2227
2228 list_add_rcu(&rdev->same_set, &mddev->disks);
2229 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2230
2231
2232 mddev->recovery_disabled++;
2233
2234 return 0;
2235
2236 fail:
2237 pr_warn("md: failed to register dev-%s for %s\n",
2238 b, mdname(mddev));
2239 return err;
2240}
2241
2242static void md_delayed_delete(struct work_struct *ws)
2243{
2244 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2245 kobject_del(&rdev->kobj);
2246 kobject_put(&rdev->kobj);
2247}
2248
2249static void unbind_rdev_from_array(struct md_rdev *rdev)
2250{
2251 char b[BDEVNAME_SIZE];
2252
2253 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2254 list_del_rcu(&rdev->same_set);
2255 pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
2256 rdev->mddev = NULL;
2257 sysfs_remove_link(&rdev->kobj, "block");
2258 sysfs_put(rdev->sysfs_state);
2259 rdev->sysfs_state = NULL;
2260 rdev->badblocks.count = 0;
2261
	/* We need to delay this, otherwise we can deadlock when
	 * writing to 'remove' to "dev/state".  We also need
	 * to delay it due to rcu usage.
	 */
	synchronize_rcu();
2266 INIT_WORK(&rdev->del_work, md_delayed_delete);
2267 kobject_get(&rdev->kobj);
2268 queue_work(md_misc_wq, &rdev->del_work);
2269}
2270
/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
2276static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2277{
2278 int err = 0;
2279 struct block_device *bdev;
2280 char b[BDEVNAME_SIZE];
2281
2282 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2283 shared ? (struct md_rdev *)lock_rdev : rdev);
2284 if (IS_ERR(bdev)) {
2285 pr_warn("md: could not open %s.\n", __bdevname(dev, b));
2286 return PTR_ERR(bdev);
2287 }
2288 rdev->bdev = bdev;
2289 return err;
2290}
2291
2292static void unlock_rdev(struct md_rdev *rdev)
2293{
2294 struct block_device *bdev = rdev->bdev;
2295 rdev->bdev = NULL;
2296 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2297}
2298
2299void md_autodetect_dev(dev_t dev);
2300
2301static void export_rdev(struct md_rdev *rdev)
2302{
2303 char b[BDEVNAME_SIZE];
2304
2305 pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
2306 md_rdev_clear(rdev);
2307#ifndef MODULE
2308 if (test_bit(AutoDetected, &rdev->flags))
2309 md_autodetect_dev(rdev->bdev->bd_dev);
2310#endif
2311 unlock_rdev(rdev);
2312 kobject_put(&rdev->kobj);
2313}
2314
2315void md_kick_rdev_from_array(struct md_rdev *rdev)
2316{
2317 unbind_rdev_from_array(rdev);
2318 export_rdev(rdev);
2319}
2320EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2321
2322static void export_array(struct mddev *mddev)
2323{
2324 struct md_rdev *rdev;
2325
2326 while (!list_empty(&mddev->disks)) {
2327 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2328 same_set);
2329 md_kick_rdev_from_array(rdev);
2330 }
2331 mddev->raid_disks = 0;
2332 mddev->major_version = 0;
2333}
2334
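/*
 * Try to mark the array clean.  Called with mddev->lock held; briefly
 * drops it to switch writes_pending into atomic mode so the counter can
 * be read reliably.  If no writes are pending, set in_sync, request a
 * "clean" superblock update and notify sysfs.  Returns the resulting
 * in_sync value.
 */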
2335static bool set_in_sync(struct mddev *mddev)
2336{
2337 lockdep_assert_held(&mddev->lock);
2338 if (!mddev->in_sync) {
2339 mddev->sync_checkers++;
2340 spin_unlock(&mddev->lock);
2341 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2342 spin_lock(&mddev->lock);
2343 if (!mddev->in_sync &&
2344 percpu_ref_is_zero(&mddev->writes_pending)) {
2345 mddev->in_sync = 1;
2346
2347
2348
2349
2350 smp_mb();
2351 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2352 sysfs_notify_dirent_safe(mddev->sysfs_state);
2353 }
2354 if (--mddev->sync_checkers == 0)
2355 percpu_ref_switch_to_percpu(&mddev->writes_pending);
2356 }
2357 if (mddev->safemode == 1)
2358 mddev->safemode = 0;
2359 return mddev->in_sync;
2360}
2361
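/*
 * Refresh the in-memory superblock image of every member device.
 * Devices whose recorded event count already matches (or, with
 * "nospares", spares that are exactly one event behind) are skipped and
 * marked sb_loaded = 2 so they will not be rewritten.
 */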
2362static void sync_sbs(struct mddev *mddev, int nospares)
2363{
2364
2365
2366
2367
2368
2369
2370 struct md_rdev *rdev;
2371 rdev_for_each(rdev, mddev) {
2372 if (rdev->sb_events == mddev->events ||
2373 (nospares &&
2374 rdev->raid_disk < 0 &&
2375 rdev->sb_events+1 == mddev->events)) {
2376
2377 rdev->sb_loaded = 2;
2378 } else {
2379 sync_super(mddev, rdev);
2380 rdev->sb_loaded = 1;
2381 }
2382 }
2383}
2384
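/*
 * Decide whether an on-disk superblock update is actually required by
 * comparing the superblock of the first working device against the
 * current in-memory state: newly activated or newly faulty devices, or
 * a change of size, reshape position, layout, disk count or chunk size
 * all force an update.  Used in the clustered-MD path to avoid needless
 * metadata transactions.
 */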
2385static bool does_sb_need_changing(struct mddev *mddev)
2386{
2387 struct md_rdev *rdev;
2388 struct mdp_superblock_1 *sb;
2389 int role;
2390
2391
2392 rdev_for_each(rdev, mddev)
2393 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2394 break;
2395
2396
2397 if (!rdev)
2398 return false;
2399
2400 sb = page_address(rdev->sb_page);
2401
2402 rdev_for_each(rdev, mddev) {
2403 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2404
		if (role == 0xffff && rdev->raid_disk >= 0 &&
2406 !test_bit(Faulty, &rdev->flags))
2407 return true;
2408
2409 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2410 return true;
2411 }
2412
2413
2414 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2415 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2416 (mddev->layout != le32_to_cpu(sb->layout)) ||
2417 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2418 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2419 return true;
2420
2421 return false;
2422}
2423
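/*
 * Write the in-memory metadata out to every member device's superblock.
 * On a read-only array the request is only recorded in sb_flags; on a
 * non-persistent array the change flags are simply cleared.  Otherwise
 * the event count is advanced (or, when only spare state changed and it
 * is safe, decremented again), per-device superblock images are rebuilt
 * by sync_sbs() and written out, pending bad-block lists are flushed,
 * and the whole sequence repeats if further changes arrived meanwhile.
 */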
2424void md_update_sb(struct mddev *mddev, int force_change)
2425{
2426 struct md_rdev *rdev;
2427 int sync_req;
2428 int nospares = 0;
2429 int any_badblocks_changed = 0;
2430 int ret = -1;
2431
2432 if (mddev->ro) {
2433 if (force_change)
2434 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2435 return;
2436 }
2437
2438repeat:
2439 if (mddev_is_clustered(mddev)) {
2440 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2441 force_change = 1;
2442 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2443 nospares = 1;
2444 ret = md_cluster_ops->metadata_update_start(mddev);
2445
2446 if (!does_sb_need_changing(mddev)) {
2447 if (ret == 0)
2448 md_cluster_ops->metadata_update_cancel(mddev);
2449 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2450 BIT(MD_SB_CHANGE_DEVS) |
2451 BIT(MD_SB_CHANGE_CLEAN));
2452 return;
2453 }
2454 }
2455
2456
2457
2458
2459
2460
2461
2462 rdev_for_each(rdev, mddev) {
2463 if (rdev->raid_disk >= 0 &&
2464 mddev->delta_disks >= 0 &&
2465 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2466 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2467 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2468 !test_bit(Journal, &rdev->flags) &&
2469 !test_bit(In_sync, &rdev->flags) &&
2470 mddev->curr_resync_completed > rdev->recovery_offset)
2471 rdev->recovery_offset = mddev->curr_resync_completed;
2472
2473 }
2474 if (!mddev->persistent) {
2475 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2476 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2477 if (!mddev->external) {
2478 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2479 rdev_for_each(rdev, mddev) {
2480 if (rdev->badblocks.changed) {
2481 rdev->badblocks.changed = 0;
2482 ack_all_badblocks(&rdev->badblocks);
2483 md_error(mddev, rdev);
2484 }
2485 clear_bit(Blocked, &rdev->flags);
2486 clear_bit(BlockedBadBlocks, &rdev->flags);
2487 wake_up(&rdev->blocked_wait);
2488 }
2489 }
2490 wake_up(&mddev->sb_wait);
2491 return;
2492 }
2493
2494 spin_lock(&mddev->lock);
2495
2496 mddev->utime = ktime_get_real_seconds();
2497
2498 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2499 force_change = 1;
2500 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2501
2502
2503
2504
2505 nospares = 1;
2506 if (force_change)
2507 nospares = 0;
2508 if (mddev->degraded)
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518 nospares = 0;
2519
2520 sync_req = mddev->in_sync;
2521
2522
2523
2524 if (nospares
2525 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2526 && mddev->can_decrease_events
2527 && mddev->events != 1) {
2528 mddev->events--;
2529 mddev->can_decrease_events = 0;
2530 } else {
2531
		mddev->events++;
2533 mddev->can_decrease_events = nospares;
2534 }
2535
2536
2537
2538
2539
2540
2541 WARN_ON(mddev->events == 0);
2542
2543 rdev_for_each(rdev, mddev) {
2544 if (rdev->badblocks.changed)
2545 any_badblocks_changed++;
2546 if (test_bit(Faulty, &rdev->flags))
2547 set_bit(FaultRecorded, &rdev->flags);
2548 }
2549
2550 sync_sbs(mddev, nospares);
2551 spin_unlock(&mddev->lock);
2552
2553 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2554 mdname(mddev), mddev->in_sync);
2555
2556 if (mddev->queue)
2557 blk_add_trace_msg(mddev->queue, "md md_update_sb");
2558rewrite:
2559 bitmap_update_sb(mddev->bitmap);
2560 rdev_for_each(rdev, mddev) {
2561 char b[BDEVNAME_SIZE];
2562
2563 if (rdev->sb_loaded != 1)
2564 continue;
2565
2566 if (!test_bit(Faulty, &rdev->flags)) {
2567 md_super_write(mddev,rdev,
2568 rdev->sb_start, rdev->sb_size,
2569 rdev->sb_page);
2570 pr_debug("md: (write) %s's sb offset: %llu\n",
2571 bdevname(rdev->bdev, b),
2572 (unsigned long long)rdev->sb_start);
2573 rdev->sb_events = mddev->events;
2574 if (rdev->badblocks.size) {
2575 md_super_write(mddev, rdev,
2576 rdev->badblocks.sector,
2577 rdev->badblocks.size << 9,
2578 rdev->bb_page);
2579 rdev->badblocks.size = 0;
2580 }
2581
2582 } else
2583 pr_debug("md: %s (skipping faulty)\n",
2584 bdevname(rdev->bdev, b));
2585
2586 if (mddev->level == LEVEL_MULTIPATH)
2587
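			/* only one superblock needs to be written */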
2588 break;
2589 }
2590 if (md_super_wait(mddev) < 0)
2591 goto rewrite;
2592
2593
2594 if (mddev_is_clustered(mddev) && ret == 0)
2595 md_cluster_ops->metadata_update_finish(mddev);
2596
2597 if (mddev->in_sync != sync_req ||
2598 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2599 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2600
2601 goto repeat;
2602 wake_up(&mddev->sb_wait);
2603 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2604 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2605
2606 rdev_for_each(rdev, mddev) {
2607 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2608 clear_bit(Blocked, &rdev->flags);
2609
2610 if (any_badblocks_changed)
2611 ack_all_badblocks(&rdev->badblocks);
2612 clear_bit(BlockedBadBlocks, &rdev->flags);
2613 wake_up(&rdev->blocked_wait);
2614 }
2615}
2616EXPORT_SYMBOL(md_update_sb);
2617
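/*
 * Hot-add an already bound rdev to a running array.  If the personality
 * cannot hot-remove disks (or the new device is a journal) the device is
 * validated against the superblock and handed to ->hot_add_disk()
 * directly; on failure it is kicked out again.  In all cases the sysfs
 * state is refreshed and a superblock update plus recovery pass are
 * scheduled.
 */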
2618static int add_bound_rdev(struct md_rdev *rdev)
2619{
2620 struct mddev *mddev = rdev->mddev;
2621 int err = 0;
2622 bool add_journal = test_bit(Journal, &rdev->flags);
2623
2624 if (!mddev->pers->hot_remove_disk || add_journal) {
2625
2626
2627
2628
2629 super_types[mddev->major_version].
2630 validate_super(mddev, rdev);
2631 if (add_journal)
2632 mddev_suspend(mddev);
2633 err = mddev->pers->hot_add_disk(mddev, rdev);
2634 if (add_journal)
2635 mddev_resume(mddev);
2636 if (err) {
2637 md_kick_rdev_from_array(rdev);
2638 return err;
2639 }
2640 }
2641 sysfs_notify_dirent_safe(rdev->sysfs_state);
2642
2643 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2644 if (mddev->degraded)
2645 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2646 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2647 md_new_event(mddev);
2648 md_wakeup_thread(mddev->thread);
2649 return 0;
2650}
2651
2652
2653
2654
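/*
 * Compare a command written to a sysfs file with an expected keyword.
 * The strings must match exactly except that cmd may carry one trailing
 * newline, e.g. cmd_match("frozen\n", "frozen") returns 1 while
 * cmd_match("frozen", "freeze") returns 0.
 */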
2655static int cmd_match(const char *cmd, const char *str)
2656{
2657
2658
2659
2660
2661 while (*cmd && *str && *cmd == *str) {
2662 cmd++;
2663 str++;
2664 }
2665 if (*cmd == '\n')
2666 cmd++;
2667 if (*str || *cmd)
2668 return 0;
2669 return 1;
2670}
2671
2672struct rdev_sysfs_entry {
2673 struct attribute attr;
2674 ssize_t (*show)(struct md_rdev *, char *);
2675 ssize_t (*store)(struct md_rdev *, const char *, size_t);
2676};
2677
2678static ssize_t
2679state_show(struct md_rdev *rdev, char *page)
2680{
2681 char *sep = ",";
2682 size_t len = 0;
2683 unsigned long flags = READ_ONCE(rdev->flags);
2684
2685 if (test_bit(Faulty, &flags) ||
2686 (!test_bit(ExternalBbl, &flags) &&
2687 rdev->badblocks.unacked_exist))
2688 len += sprintf(page+len, "faulty%s", sep);
2689 if (test_bit(In_sync, &flags))
2690 len += sprintf(page+len, "in_sync%s", sep);
2691 if (test_bit(Journal, &flags))
2692 len += sprintf(page+len, "journal%s", sep);
2693 if (test_bit(WriteMostly, &flags))
2694 len += sprintf(page+len, "write_mostly%s", sep);
2695 if (test_bit(Blocked, &flags) ||
2696 (rdev->badblocks.unacked_exist
2697 && !test_bit(Faulty, &flags)))
2698 len += sprintf(page+len, "blocked%s", sep);
2699 if (!test_bit(Faulty, &flags) &&
2700 !test_bit(Journal, &flags) &&
2701 !test_bit(In_sync, &flags))
2702 len += sprintf(page+len, "spare%s", sep);
2703 if (test_bit(WriteErrorSeen, &flags))
2704 len += sprintf(page+len, "write_error%s", sep);
2705 if (test_bit(WantReplacement, &flags))
2706 len += sprintf(page+len, "want_replacement%s", sep);
2707 if (test_bit(Replacement, &flags))
2708 len += sprintf(page+len, "replacement%s", sep);
2709 if (test_bit(ExternalBbl, &flags))
2710 len += sprintf(page+len, "external_bbl%s", sep);
2711 if (test_bit(FailFast, &flags))
2712 len += sprintf(page+len, "failfast%s", sep);
2713
2714 if (len)
2715 len -= strlen(sep);
2716
2717 return len+sprintf(page+len, "\n");
2718}
2719
2720static ssize_t
2721state_store(struct md_rdev *rdev, const char *buf, size_t len)
2722{
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
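	/*
	 * Accepted keywords: faulty, remove, writemostly / -writemostly,
	 * blocked / -blocked, insync / -insync, failfast / -failfast,
	 * write_error / -write_error, want_replacement / -want_replacement,
	 * replacement / -replacement, re-add, and (for external metadata)
	 * external_bbl / -external_bbl.  A "-" prefix clears the
	 * corresponding state.
	 */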
2737 int err = -EINVAL;
2738 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2739 md_error(rdev->mddev, rdev);
2740 if (test_bit(Faulty, &rdev->flags))
2741 err = 0;
2742 else
2743 err = -EBUSY;
2744 } else if (cmd_match(buf, "remove")) {
2745 if (rdev->mddev->pers) {
2746 clear_bit(Blocked, &rdev->flags);
2747 remove_and_add_spares(rdev->mddev, rdev);
2748 }
2749 if (rdev->raid_disk >= 0)
2750 err = -EBUSY;
2751 else {
2752 struct mddev *mddev = rdev->mddev;
2753 err = 0;
2754 if (mddev_is_clustered(mddev))
2755 err = md_cluster_ops->remove_disk(mddev, rdev);
2756
2757 if (err == 0) {
2758 md_kick_rdev_from_array(rdev);
2759 if (mddev->pers) {
2760 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2761 md_wakeup_thread(mddev->thread);
2762 }
2763 md_new_event(mddev);
2764 }
2765 }
2766 } else if (cmd_match(buf, "writemostly")) {
2767 set_bit(WriteMostly, &rdev->flags);
2768 err = 0;
2769 } else if (cmd_match(buf, "-writemostly")) {
2770 clear_bit(WriteMostly, &rdev->flags);
2771 err = 0;
2772 } else if (cmd_match(buf, "blocked")) {
2773 set_bit(Blocked, &rdev->flags);
2774 err = 0;
2775 } else if (cmd_match(buf, "-blocked")) {
2776 if (!test_bit(Faulty, &rdev->flags) &&
2777 !test_bit(ExternalBbl, &rdev->flags) &&
2778 rdev->badblocks.unacked_exist) {
2779
2780
2781
2782 md_error(rdev->mddev, rdev);
2783 }
2784 clear_bit(Blocked, &rdev->flags);
2785 clear_bit(BlockedBadBlocks, &rdev->flags);
2786 wake_up(&rdev->blocked_wait);
2787 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2788 md_wakeup_thread(rdev->mddev->thread);
2789
2790 err = 0;
2791 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2792 set_bit(In_sync, &rdev->flags);
2793 err = 0;
2794 } else if (cmd_match(buf, "failfast")) {
2795 set_bit(FailFast, &rdev->flags);
2796 err = 0;
2797 } else if (cmd_match(buf, "-failfast")) {
2798 clear_bit(FailFast, &rdev->flags);
2799 err = 0;
2800 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
2801 !test_bit(Journal, &rdev->flags)) {
2802 if (rdev->mddev->pers == NULL) {
2803 clear_bit(In_sync, &rdev->flags);
2804 rdev->saved_raid_disk = rdev->raid_disk;
2805 rdev->raid_disk = -1;
2806 err = 0;
2807 }
2808 } else if (cmd_match(buf, "write_error")) {
2809 set_bit(WriteErrorSeen, &rdev->flags);
2810 err = 0;
2811 } else if (cmd_match(buf, "-write_error")) {
2812 clear_bit(WriteErrorSeen, &rdev->flags);
2813 err = 0;
2814 } else if (cmd_match(buf, "want_replacement")) {
2815
2816
2817
2818
2819 if (rdev->raid_disk >= 0 &&
2820 !test_bit(Journal, &rdev->flags) &&
2821 !test_bit(Replacement, &rdev->flags))
2822 set_bit(WantReplacement, &rdev->flags);
2823 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2824 md_wakeup_thread(rdev->mddev->thread);
2825 err = 0;
2826 } else if (cmd_match(buf, "-want_replacement")) {
2827
2828
2829
2830 err = 0;
2831 clear_bit(WantReplacement, &rdev->flags);
2832 } else if (cmd_match(buf, "replacement")) {
2833
2834
2835
2836
2837 if (rdev->mddev->pers)
2838 err = -EBUSY;
2839 else {
2840 set_bit(Replacement, &rdev->flags);
2841 err = 0;
2842 }
2843 } else if (cmd_match(buf, "-replacement")) {
2844
2845 if (rdev->mddev->pers)
2846 err = -EBUSY;
2847 else {
2848 clear_bit(Replacement, &rdev->flags);
2849 err = 0;
2850 }
2851 } else if (cmd_match(buf, "re-add")) {
2852 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
2853
2854
2855
2856
2857
2858
2859 if (!mddev_is_clustered(rdev->mddev) ||
2860 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
2861 clear_bit(Faulty, &rdev->flags);
2862 err = add_bound_rdev(rdev);
2863 }
2864 } else
2865 err = -EBUSY;
2866 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
2867 set_bit(ExternalBbl, &rdev->flags);
2868 rdev->badblocks.shift = 0;
2869 err = 0;
2870 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
2871 clear_bit(ExternalBbl, &rdev->flags);
2872 err = 0;
2873 }
2874 if (!err)
2875 sysfs_notify_dirent_safe(rdev->sysfs_state);
2876 return err ? err : len;
2877}
2878static struct rdev_sysfs_entry rdev_state =
2879__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2880
2881static ssize_t
2882errors_show(struct md_rdev *rdev, char *page)
2883{
2884 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2885}
2886
2887static ssize_t
2888errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2889{
2890 unsigned int n;
2891 int rv;
2892
2893 rv = kstrtouint(buf, 10, &n);
2894 if (rv < 0)
2895 return rv;
2896 atomic_set(&rdev->corrected_errors, n);
2897 return len;
2898}
2899static struct rdev_sysfs_entry rdev_errors =
2900__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2901
2902static ssize_t
2903slot_show(struct md_rdev *rdev, char *page)
2904{
2905 if (test_bit(Journal, &rdev->flags))
2906 return sprintf(page, "journal\n");
2907 else if (rdev->raid_disk < 0)
2908 return sprintf(page, "none\n");
2909 else
2910 return sprintf(page, "%d\n", rdev->raid_disk);
2911}
2912
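/*
 * Set or clear the slot (raid_disk) of a device through sysfs.  Writing
 * "none" to a running array removes the device from active service (it
 * must not still be in use); writing a number to a running array
 * hot-adds the device into that slot; on an inactive array the slot is
 * simply recorded and the device marked In_sync.
 */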
2913static ssize_t
2914slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2915{
2916 int slot;
2917 int err;
2918
2919 if (test_bit(Journal, &rdev->flags))
2920 return -EBUSY;
2921 if (strncmp(buf, "none", 4)==0)
2922 slot = -1;
2923 else {
2924 err = kstrtouint(buf, 10, (unsigned int *)&slot);
2925 if (err < 0)
2926 return err;
2927 }
2928 if (rdev->mddev->pers && slot == -1) {
2929
2930
2931
2932
2933
2934
2935
2936 if (rdev->raid_disk == -1)
2937 return -EEXIST;
2938
2939 if (rdev->mddev->pers->hot_remove_disk == NULL)
2940 return -EINVAL;
2941 clear_bit(Blocked, &rdev->flags);
2942 remove_and_add_spares(rdev->mddev, rdev);
2943 if (rdev->raid_disk >= 0)
2944 return -EBUSY;
2945 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2946 md_wakeup_thread(rdev->mddev->thread);
2947 } else if (rdev->mddev->pers) {
2948
2949
2950
2951 int err;
2952
2953 if (rdev->raid_disk != -1)
2954 return -EBUSY;
2955
2956 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2957 return -EBUSY;
2958
2959 if (rdev->mddev->pers->hot_add_disk == NULL)
2960 return -EINVAL;
2961
2962 if (slot >= rdev->mddev->raid_disks &&
2963 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2964 return -ENOSPC;
2965
2966 rdev->raid_disk = slot;
2967 if (test_bit(In_sync, &rdev->flags))
2968 rdev->saved_raid_disk = slot;
2969 else
2970 rdev->saved_raid_disk = -1;
2971 clear_bit(In_sync, &rdev->flags);
2972 clear_bit(Bitmap_sync, &rdev->flags);
2973 err = rdev->mddev->pers->
2974 hot_add_disk(rdev->mddev, rdev);
2975 if (err) {
2976 rdev->raid_disk = -1;
2977 return err;
2978 } else
2979 sysfs_notify_dirent_safe(rdev->sysfs_state);
		if (sysfs_link_rdev(rdev->mddev, rdev))
			/* failure here is OK */;
2982
2983 } else {
2984 if (slot >= rdev->mddev->raid_disks &&
2985 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2986 return -ENOSPC;
2987 rdev->raid_disk = slot;
2988
2989 clear_bit(Faulty, &rdev->flags);
2990 clear_bit(WriteMostly, &rdev->flags);
2991 set_bit(In_sync, &rdev->flags);
2992 sysfs_notify_dirent_safe(rdev->sysfs_state);
2993 }
2994 return len;
2995}
2996
2997static struct rdev_sysfs_entry rdev_slot =
2998__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2999
3000static ssize_t
3001offset_show(struct md_rdev *rdev, char *page)
3002{
3003 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3004}
3005
3006static ssize_t
3007offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3008{
3009 unsigned long long offset;
3010 if (kstrtoull(buf, 10, &offset) < 0)
3011 return -EINVAL;
3012 if (rdev->mddev->pers && rdev->raid_disk >= 0)
3013 return -EBUSY;
3014 if (rdev->sectors && rdev->mddev->external)
3015
3016
3017 return -EBUSY;
3018 rdev->data_offset = offset;
3019 rdev->new_data_offset = offset;
3020 return len;
3021}
3022
3023static struct rdev_sysfs_entry rdev_offset =
3024__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3025
3026static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3027{
3028 return sprintf(page, "%llu\n",
3029 (unsigned long long)rdev->new_data_offset);
3030}
3031
3032static ssize_t new_offset_store(struct md_rdev *rdev,
3033 const char *buf, size_t len)
3034{
3035 unsigned long long new_offset;
3036 struct mddev *mddev = rdev->mddev;
3037
3038 if (kstrtoull(buf, 10, &new_offset) < 0)
3039 return -EINVAL;
3040
3041 if (mddev->sync_thread ||
3042 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3043 return -EBUSY;
3044 if (new_offset == rdev->data_offset)
		/* reset is always permitted */
		;
3047 else if (new_offset > rdev->data_offset) {
3048
3049 if (new_offset - rdev->data_offset
3050 + mddev->dev_sectors > rdev->sectors)
3051 return -E2BIG;
3052 }
3053
3054
3055
3056
3057
3058 if (new_offset < rdev->data_offset &&
3059 mddev->reshape_backwards)
3060 return -EINVAL;
3061
3062
3063
3064
3065 if (new_offset > rdev->data_offset &&
3066 !mddev->reshape_backwards)
3067 return -EINVAL;
3068
3069 if (mddev->pers && mddev->persistent &&
3070 !super_types[mddev->major_version]
3071 .allow_new_offset(rdev, new_offset))
3072 return -E2BIG;
3073 rdev->new_data_offset = new_offset;
3074 if (new_offset > rdev->data_offset)
3075 mddev->reshape_backwards = 1;
3076 else if (new_offset < rdev->data_offset)
3077 mddev->reshape_backwards = 0;
3078
3079 return len;
3080}
3081static struct rdev_sysfs_entry rdev_new_offset =
3082__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3083
3084static ssize_t
3085rdev_size_show(struct md_rdev *rdev, char *page)
3086{
3087 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3088}
3089
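/* Do the extents [s1, s1+l1) and [s2, s2+l2) overlap? */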
3090static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
3091{
3092
3093 if (s1+l1 <= s2)
3094 return 0;
3095 if (s2+l2 <= s1)
3096 return 0;
3097 return 1;
3098}
3099
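/*
 * Parse a size given in 1K blocks and return it in 512-byte sectors,
 * e.g. "1024" becomes 2048 sectors.  Values whose doubling would not
 * fit in sector_t are rejected.
 */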
3100static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3101{
3102 unsigned long long blocks;
3103 sector_t new;
3104
3105 if (kstrtoull(buf, 10, &blocks) < 0)
3106 return -EINVAL;
3107
3108 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3109 return -EINVAL;
3110
3111 new = blocks * 2;
3112 if (new != blocks * 2)
3113 return -EINVAL;
3114
3115 *sectors = new;
3116 return 0;
3117}
3118
3119static ssize_t
3120rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3121{
3122 struct mddev *my_mddev = rdev->mddev;
3123 sector_t oldsectors = rdev->sectors;
3124 sector_t sectors;
3125
3126 if (test_bit(Journal, &rdev->flags))
3127 return -EBUSY;
	if (strict_blocks_to_sectors(buf, &sectors) < 0)
3129 return -EINVAL;
3130 if (rdev->data_offset != rdev->new_data_offset)
3131 return -EINVAL;
3132 if (my_mddev->pers && rdev->raid_disk >= 0) {
3133 if (my_mddev->persistent) {
3134 sectors = super_types[my_mddev->major_version].
3135 rdev_size_change(rdev, sectors);
3136 if (!sectors)
3137 return -EBUSY;
3138 } else if (!sectors)
3139 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3140 rdev->data_offset;
3141 if (!my_mddev->pers->resize)
3142
3143 return -EINVAL;
3144 }
3145 if (sectors < my_mddev->dev_sectors)
3146 return -EINVAL;
3147
3148 rdev->sectors = sectors;
3149 if (sectors > oldsectors && my_mddev->external) {
3150
3151
3152
3153
3154
3155
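		/*
		 * The metadata is managed externally, so md cannot rely on
		 * it to keep arrays apart: scan every other array and make
		 * sure the grown rdev does not now overlap a device that
		 * shares the same underlying block device.
		 */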
3156 struct mddev *mddev;
3157 int overlap = 0;
3158 struct list_head *tmp;
3159
3160 rcu_read_lock();
3161 for_each_mddev(mddev, tmp) {
3162 struct md_rdev *rdev2;
3163
3164 rdev_for_each(rdev2, mddev)
3165 if (rdev->bdev == rdev2->bdev &&
3166 rdev != rdev2 &&
3167 overlaps(rdev->data_offset, rdev->sectors,
3168 rdev2->data_offset,
3169 rdev2->sectors)) {
3170 overlap = 1;
3171 break;
3172 }
3173 if (overlap) {
3174 mddev_put(mddev);
3175 break;
3176 }
3177 }
3178 rcu_read_unlock();
3179 if (overlap) {
3180
3181
3182
3183
3184
3185
3186 rdev->sectors = oldsectors;
3187 return -EBUSY;
3188 }
3189 }
3190 return len;
3191}
3192
3193static struct rdev_sysfs_entry rdev_size =
3194__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3195
3196static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3197{
3198 unsigned long long recovery_start = rdev->recovery_offset;
3199
3200 if (test_bit(In_sync, &rdev->flags) ||
3201 recovery_start == MaxSector)
3202 return sprintf(page, "none\n");
3203
3204 return sprintf(page, "%llu\n", recovery_start);
3205}
3206
3207static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3208{
3209 unsigned long long recovery_start;
3210
3211 if (cmd_match(buf, "none"))
3212 recovery_start = MaxSector;
3213 else if (kstrtoull(buf, 10, &recovery_start))
3214 return -EINVAL;
3215
3216 if (rdev->mddev->pers &&
3217 rdev->raid_disk >= 0)
3218 return -EBUSY;
3219
3220 rdev->recovery_offset = recovery_start;
3221 if (recovery_start == MaxSector)
3222 set_bit(In_sync, &rdev->flags);
3223 else
3224 clear_bit(In_sync, &rdev->flags);
3225 return len;
3226}
3227
3228static struct rdev_sysfs_entry rdev_recovery_start =
3229__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
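/*
 * sysfs access to the bad-block list: "bad_blocks" lists all recorded
 * bad ranges and writing "sector length" adds an acknowledged range;
 * "unacknowledged_bad_blocks" lists only ranges not yet acknowledged in
 * the metadata, and writing adds a range without acknowledging it.
 * Storing to bad_blocks also wakes anyone waiting on BlockedBadBlocks.
 */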
3242static ssize_t bb_show(struct md_rdev *rdev, char *page)
3243{
3244 return badblocks_show(&rdev->badblocks, page, 0);
3245}
3246static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3247{
3248 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3249
3250 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3251 wake_up(&rdev->blocked_wait);
3252 return rv;
3253}
3254static struct rdev_sysfs_entry rdev_bad_blocks =
3255__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3256
3257static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3258{
3259 return badblocks_show(&rdev->badblocks, page, 1);
3260}
3261static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3262{
3263 return badblocks_store(&rdev->badblocks, page, len, 1);
3264}
3265static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3266__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3267
3268static ssize_t
3269ppl_sector_show(struct md_rdev *rdev, char *page)
3270{
3271 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3272}
3273
3274static ssize_t
3275ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3276{
3277 unsigned long long sector;
3278
	if (kstrtoull(buf, 10, &sector) < 0)
3280 return -EINVAL;
3281 if (sector != (sector_t)sector)
3282 return -EINVAL;
3283
3284 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3285 rdev->raid_disk >= 0)
3286 return -EBUSY;
3287
3288 if (rdev->mddev->persistent) {
3289 if (rdev->mddev->major_version == 0)
3290 return -EINVAL;
3291 if ((sector > rdev->sb_start &&
3292 sector - rdev->sb_start > S16_MAX) ||
3293 (sector < rdev->sb_start &&
3294 rdev->sb_start - sector > -S16_MIN))
3295 return -EINVAL;
3296 rdev->ppl.offset = sector - rdev->sb_start;
3297 } else if (!rdev->mddev->external) {
3298 return -EBUSY;
3299 }
3300 rdev->ppl.sector = sector;
3301 return len;
3302}
3303
3304static struct rdev_sysfs_entry rdev_ppl_sector =
3305__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3306
3307static ssize_t
3308ppl_size_show(struct md_rdev *rdev, char *page)
3309{
3310 return sprintf(page, "%u\n", rdev->ppl.size);
3311}
3312
3313static ssize_t
3314ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3315{
3316 unsigned int size;
3317
3318 if (kstrtouint(buf, 10, &size) < 0)
3319 return -EINVAL;
3320
3321 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3322 rdev->raid_disk >= 0)
3323 return -EBUSY;
3324
3325 if (rdev->mddev->persistent) {
3326 if (rdev->mddev->major_version == 0)
3327 return -EINVAL;
3328 if (size > U16_MAX)
3329 return -EINVAL;
3330 } else if (!rdev->mddev->external) {
3331 return -EBUSY;
3332 }
3333 rdev->ppl.size = size;
3334 return len;
3335}
3336
3337static struct rdev_sysfs_entry rdev_ppl_size =
3338__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3339
3340static struct attribute *rdev_default_attrs[] = {
3341 &rdev_state.attr,
3342 &rdev_errors.attr,
3343 &rdev_slot.attr,
3344 &rdev_offset.attr,
3345 &rdev_new_offset.attr,
3346 &rdev_size.attr,
3347 &rdev_recovery_start.attr,
3348 &rdev_bad_blocks.attr,
3349 &rdev_unack_bad_blocks.attr,
3350 &rdev_ppl_sector.attr,
3351 &rdev_ppl_size.attr,
3352 NULL,
3353};
3354static ssize_t
3355rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3356{
3357 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3358 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3359
3360 if (!entry->show)
3361 return -EIO;
3362 if (!rdev->mddev)
3363 return -EBUSY;
3364 return entry->show(rdev, page);
3365}
3366
3367static ssize_t
3368rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3369 const char *page, size_t length)
3370{
3371 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3372 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3373 ssize_t rv;
3374 struct mddev *mddev = rdev->mddev;
3375
3376 if (!entry->store)
3377 return -EIO;
3378 if (!capable(CAP_SYS_ADMIN))
3379 return -EACCES;
	rv = mddev ? mddev_lock(mddev) : -EBUSY;
3381 if (!rv) {
3382 if (rdev->mddev == NULL)
3383 rv = -EBUSY;
3384 else
3385 rv = entry->store(rdev, page, length);
3386 mddev_unlock(mddev);
3387 }
3388 return rv;
3389}
3390
3391static void rdev_free(struct kobject *ko)
3392{
3393 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3394 kfree(rdev);
3395}
3396static const struct sysfs_ops rdev_sysfs_ops = {
3397 .show = rdev_attr_show,
3398 .store = rdev_attr_store,
3399};
3400static struct kobj_type rdev_ktype = {
3401 .release = rdev_free,
3402 .sysfs_ops = &rdev_sysfs_ops,
3403 .default_attrs = rdev_default_attrs,
3404};
3405
3406int md_rdev_init(struct md_rdev *rdev)
3407{
3408 rdev->desc_nr = -1;
3409 rdev->saved_raid_disk = -1;
3410 rdev->raid_disk = -1;
3411 rdev->flags = 0;
3412 rdev->data_offset = 0;
3413 rdev->new_data_offset = 0;
3414 rdev->sb_events = 0;
3415 rdev->last_read_error = 0;
3416 rdev->sb_loaded = 0;
3417 rdev->bb_page = NULL;
3418 atomic_set(&rdev->nr_pending, 0);
3419 atomic_set(&rdev->read_errors, 0);
3420 atomic_set(&rdev->corrected_errors, 0);
3421
3422 INIT_LIST_HEAD(&rdev->same_set);
3423 init_waitqueue_head(&rdev->blocked_wait);
3424
3425
3426
3427
3428
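	/*
	 * Reserve storage for the bad-block list now; it is allocated even
	 * for arrays that will never record a bad block.
	 */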
3429 return badblocks_init(&rdev->badblocks, 0);
3430}
3431EXPORT_SYMBOL_GPL(md_rdev_init);
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
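/*
 * Import a device: allocate the rdev, claim the block device and, when
 * super_format >= 0, load and sanity-check the on-disk superblock
 * (super_format == -2 means externally managed metadata, -1 means none).
 * Returns the new rdev or an ERR_PTR on failure.
 */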
3442static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3443{
3444 char b[BDEVNAME_SIZE];
3445 int err;
3446 struct md_rdev *rdev;
3447 sector_t size;
3448
3449 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3450 if (!rdev)
3451 return ERR_PTR(-ENOMEM);
3452
3453 err = md_rdev_init(rdev);
3454 if (err)
3455 goto abort_free;
3456 err = alloc_disk_sb(rdev);
3457 if (err)
3458 goto abort_free;
3459
3460 err = lock_rdev(rdev, newdev, super_format == -2);
3461 if (err)
3462 goto abort_free;
3463
3464 kobject_init(&rdev->kobj, &rdev_ktype);
3465
3466 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3467 if (!size) {
3468 pr_warn("md: %s has zero or unknown size, marking faulty!\n",
3469 bdevname(rdev->bdev,b));
3470 err = -EINVAL;
3471 goto abort_free;
3472 }
3473
3474 if (super_format >= 0) {
3475 err = super_types[super_format].
3476 load_super(rdev, NULL, super_minor);
3477 if (err == -EINVAL) {
3478 pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
3479 bdevname(rdev->bdev,b),
3480 super_format, super_minor);
3481 goto abort_free;
3482 }
3483 if (err < 0) {
3484 pr_warn("md: could not read %s's sb, not importing!\n",
3485 bdevname(rdev->bdev,b));
3486 goto abort_free;
3487 }
3488 }
3489
3490 return rdev;
3491
3492abort_free:
3493 if (rdev->bdev)
3494 unlock_rdev(rdev);
3495 md_rdev_clear(rdev);
3496 kfree(rdev);
3497 return ERR_PTR(err);
3498}
3499
3500
3501
3502
3503
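/*
 * Check a freshly assembled array for plausibility: load every member's
 * superblock, pick the freshest one to validate the array against, and
 * kick out devices whose superblocks are stale, inconsistent or beyond
 * max_disks.
 */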
3504static void analyze_sbs(struct mddev *mddev)
3505{
3506 int i;
3507 struct md_rdev *rdev, *freshest, *tmp;
3508 char b[BDEVNAME_SIZE];
3509
3510 freshest = NULL;
3511 rdev_for_each_safe(rdev, tmp, mddev)
3512 switch (super_types[mddev->major_version].
3513 load_super(rdev, freshest, mddev->minor_version)) {
3514 case 1:
3515 freshest = rdev;
3516 break;
3517 case 0:
3518 break;
3519 default:
3520 pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
3521 bdevname(rdev->bdev,b));
3522 md_kick_rdev_from_array(rdev);
3523 }
3524
3525 super_types[mddev->major_version].
3526 validate_super(mddev, freshest);
3527
3528 i = 0;
3529 rdev_for_each_safe(rdev, tmp, mddev) {
3530 if (mddev->max_disks &&
3531 (rdev->desc_nr >= mddev->max_disks ||
3532 i > mddev->max_disks)) {
3533 pr_warn("md: %s: %s: only %d devices permitted\n",
3534 mdname(mddev), bdevname(rdev->bdev, b),
3535 mddev->max_disks);
3536 md_kick_rdev_from_array(rdev);
3537 continue;
3538 }
3539 if (rdev != freshest) {
3540 if (super_types[mddev->major_version].
3541 validate_super(mddev, rdev)) {
3542 pr_warn("md: kicking non-fresh %s from array!\n",
3543 bdevname(rdev->bdev,b));
3544 md_kick_rdev_from_array(rdev);
3545 continue;
3546 }
3547 }
3548 if (mddev->level == LEVEL_MULTIPATH) {
3549 rdev->desc_nr = i++;
3550 rdev->raid_disk = rdev->desc_nr;
3551 set_bit(In_sync, &rdev->flags);
3552 } else if (rdev->raid_disk >=
3553 (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3554 !test_bit(Journal, &rdev->flags)) {
3555 rdev->raid_disk = -1;
3556 clear_bit(In_sync, &rdev->flags);
3557 }
3558 }
3559}
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
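/*
 * Parse a decimal number with an optional fractional part and return it
 * multiplied by 10^scale, e.g. "1.5" with scale 3 yields 1500 and "20"
 * with scale 3 yields 20000.  Excess fractional digits are ignored.
 */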
3571int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3572{
3573 unsigned long result = 0;
3574 long decimals = -1;
3575 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3576 if (*cp == '.')
3577 decimals = 0;
3578 else if (decimals < scale) {
3579 unsigned int value;
3580 value = *cp - '0';
3581 result = result * 10 + value;
3582 if (decimals >= 0)
3583 decimals++;
3584 }
3585 cp++;
3586 }
3587 if (*cp == '\n')
3588 cp++;
3589 if (*cp)
3590 return -EINVAL;
3591 if (decimals < 0)
3592 decimals = 0;
3593 while (decimals < scale) {
3594 result *= 10;
		decimals++;
3596 }
3597 *res = result;
3598 return 0;
3599}
3600
3601static ssize_t
3602safe_delay_show(struct mddev *mddev, char *page)
3603{
3604 int msec = (mddev->safemode_delay*1000)/HZ;
3605 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3606}
3607static ssize_t
3608safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3609{
3610 unsigned long msec;
3611
3612 if (mddev_is_clustered(mddev)) {
3613 pr_warn("md: Safemode is disabled for clustered mode\n");
3614 return -EINVAL;
3615 }
3616
3617 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3618 return -EINVAL;
3619 if (msec == 0)
3620 mddev->safemode_delay = 0;
3621 else {
3622 unsigned long old_delay = mddev->safemode_delay;
3623 unsigned long new_delay = (msec*HZ)/1000;
3624
3625 if (new_delay == 0)
3626 new_delay = 1;
3627 mddev->safemode_delay = new_delay;
3628 if (new_delay < old_delay || old_delay == 0)
3629 mod_timer(&mddev->safemode_timer, jiffies+1);
3630 }
3631 return len;
3632}
3633static struct md_sysfs_entry md_safe_delay =
3634__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3635
3636static ssize_t
3637level_show(struct mddev *mddev, char *page)
3638{
3639 struct md_personality *p;
3640 int ret;
3641 spin_lock(&mddev->lock);
3642 p = mddev->pers;
3643 if (p)
3644 ret = sprintf(page, "%s\n", p->name);
3645 else if (mddev->clevel[0])
3646 ret = sprintf(page, "%s\n", mddev->clevel);
3647 else if (mddev->level != LEVEL_NONE)
3648 ret = sprintf(page, "%d\n", mddev->level);
3649 else
3650 ret = 0;
3651 spin_unlock(&mddev->lock);
3652 return ret;
3653}
3654
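/*
 * Change the personality (RAID level) of an array.  On an inactive array
 * the request is only recorded; on a running array the new personality
 * must provide ->takeover(), the array must not be resyncing or
 * reshaping, and the switch is done under suspend: the old personality
 * is detached and freed, sysfs links are moved to the new disk roles and
 * the new personality is started.
 */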
3655static ssize_t
3656level_store(struct mddev *mddev, const char *buf, size_t len)
3657{
3658 char clevel[16];
3659 ssize_t rv;
3660 size_t slen = len;
3661 struct md_personality *pers, *oldpers;
3662 long level;
3663 void *priv, *oldpriv;
3664 struct md_rdev *rdev;
3665
3666 if (slen == 0 || slen >= sizeof(clevel))
3667 return -EINVAL;
3668
3669 rv = mddev_lock(mddev);
3670 if (rv)
3671 return rv;
3672
3673 if (mddev->pers == NULL) {
3674 strncpy(mddev->clevel, buf, slen);
3675 if (mddev->clevel[slen-1] == '\n')
3676 slen--;
3677 mddev->clevel[slen] = 0;
3678 mddev->level = LEVEL_NONE;
3679 rv = len;
3680 goto out_unlock;
3681 }
3682 rv = -EROFS;
3683 if (mddev->ro)
3684 goto out_unlock;
3685
3686
3687
3688
3689
3690
3691
3692 rv = -EBUSY;
3693 if (mddev->sync_thread ||
3694 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3695 mddev->reshape_position != MaxSector ||
3696 mddev->sysfs_active)
3697 goto out_unlock;
3698
3699 rv = -EINVAL;
3700 if (!mddev->pers->quiesce) {
3701 pr_warn("md: %s: %s does not support online personality change\n",
3702 mdname(mddev), mddev->pers->name);
3703 goto out_unlock;
3704 }
3705
3706
3707 strncpy(clevel, buf, slen);
3708 if (clevel[slen-1] == '\n')
3709 slen--;
3710 clevel[slen] = 0;
3711 if (kstrtol(clevel, 10, &level))
3712 level = LEVEL_NONE;
3713
3714 if (request_module("md-%s", clevel) != 0)
3715 request_module("md-level-%s", clevel);
3716 spin_lock(&pers_lock);
3717 pers = find_pers(level, clevel);
3718 if (!pers || !try_module_get(pers->owner)) {
3719 spin_unlock(&pers_lock);
3720 pr_warn("md: personality %s not loaded\n", clevel);
3721 rv = -EINVAL;
3722 goto out_unlock;
3723 }
3724 spin_unlock(&pers_lock);
3725
3726 if (pers == mddev->pers) {
3727
3728 module_put(pers->owner);
3729 rv = len;
3730 goto out_unlock;
3731 }
3732 if (!pers->takeover) {
3733 module_put(pers->owner);
3734 pr_warn("md: %s: %s does not support personality takeover\n",
3735 mdname(mddev), clevel);
3736 rv = -EINVAL;
3737 goto out_unlock;
3738 }
3739
3740 rdev_for_each(rdev, mddev)
3741 rdev->new_raid_disk = rdev->raid_disk;
3742
3743
3744
3745
3746 priv = pers->takeover(mddev);
3747 if (IS_ERR(priv)) {
3748 mddev->new_level = mddev->level;
3749 mddev->new_layout = mddev->layout;
3750 mddev->new_chunk_sectors = mddev->chunk_sectors;
3751 mddev->raid_disks -= mddev->delta_disks;
3752 mddev->delta_disks = 0;
3753 mddev->reshape_backwards = 0;
3754 module_put(pers->owner);
3755 pr_warn("md: %s: %s would not accept array\n",
3756 mdname(mddev), clevel);
3757 rv = PTR_ERR(priv);
3758 goto out_unlock;
3759 }
3760
3761
3762 mddev_suspend(mddev);
3763 mddev_detach(mddev);
3764
3765 spin_lock(&mddev->lock);
3766 oldpers = mddev->pers;
3767 oldpriv = mddev->private;
3768 mddev->pers = pers;
3769 mddev->private = priv;
3770 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3771 mddev->level = mddev->new_level;
3772 mddev->layout = mddev->new_layout;
3773 mddev->chunk_sectors = mddev->new_chunk_sectors;
3774 mddev->delta_disks = 0;
3775 mddev->reshape_backwards = 0;
3776 mddev->degraded = 0;
3777 spin_unlock(&mddev->lock);
3778
3779 if (oldpers->sync_request == NULL &&
3780 mddev->external) {
3781
3782
3783
3784
3785
3786
3787
3788 mddev->in_sync = 0;
3789 mddev->safemode_delay = 0;
3790 mddev->safemode = 0;
3791 }
3792
3793 oldpers->free(mddev, oldpriv);
3794
3795 if (oldpers->sync_request == NULL &&
3796 pers->sync_request != NULL) {
3797
3798 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3799 pr_warn("md: cannot register extra attributes for %s\n",
3800 mdname(mddev));
3801 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3802 }
3803 if (oldpers->sync_request != NULL &&
3804 pers->sync_request == NULL) {
3805
3806 if (mddev->to_remove == NULL)
3807 mddev->to_remove = &md_redundancy_group;
3808 }
3809
3810 module_put(oldpers->owner);
3811
3812 rdev_for_each(rdev, mddev) {
3813 if (rdev->raid_disk < 0)
3814 continue;
3815 if (rdev->new_raid_disk >= mddev->raid_disks)
3816 rdev->new_raid_disk = -1;
3817 if (rdev->new_raid_disk == rdev->raid_disk)
3818 continue;
3819 sysfs_unlink_rdev(mddev, rdev);
3820 }
3821 rdev_for_each(rdev, mddev) {
3822 if (rdev->raid_disk < 0)
3823 continue;
3824 if (rdev->new_raid_disk == rdev->raid_disk)
3825 continue;
3826 rdev->raid_disk = rdev->new_raid_disk;
3827 if (rdev->raid_disk < 0)
3828 clear_bit(In_sync, &rdev->flags);
3829 else {
3830 if (sysfs_link_rdev(mddev, rdev))
3831 pr_warn("md: cannot register rd%d for %s after level change\n",
3832 rdev->raid_disk, mdname(mddev));
3833 }
3834 }
3835
3836 if (pers->sync_request == NULL) {
3837
3838
3839
3840 mddev->in_sync = 1;
3841 del_timer_sync(&mddev->safemode_timer);
3842 }
3843 blk_set_stacking_limits(&mddev->queue->limits);
3844 pers->run(mddev);
3845 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3846 mddev_resume(mddev);
3847 if (!mddev->thread)
3848 md_update_sb(mddev, 1);
3849 sysfs_notify(&mddev->kobj, NULL, "level");
3850 md_new_event(mddev);
3851 rv = len;
3852out_unlock:
3853 mddev_unlock(mddev);
3854 return rv;
3855}
3856
3857static struct md_sysfs_entry md_level =
3858__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3859
3860static ssize_t
3861layout_show(struct mddev *mddev, char *page)
3862{
3863
3864 if (mddev->reshape_position != MaxSector &&
3865 mddev->layout != mddev->new_layout)
3866 return sprintf(page, "%d (%d)\n",
3867 mddev->new_layout, mddev->layout);
3868 return sprintf(page, "%d\n", mddev->layout);
3869}
3870
3871static ssize_t
3872layout_store(struct mddev *mddev, const char *buf, size_t len)
3873{
3874 unsigned int n;
3875 int err;
3876
3877 err = kstrtouint(buf, 10, &n);
3878 if (err < 0)
3879 return err;
3880 err = mddev_lock(mddev);
3881 if (err)
3882 return err;
3883
3884 if (mddev->pers) {
3885 if (mddev->pers->check_reshape == NULL)
3886 err = -EBUSY;
3887 else if (mddev->ro)
3888 err = -EROFS;
3889 else {
3890 mddev->new_layout = n;
3891 err = mddev->pers->check_reshape(mddev);
3892 if (err)
3893 mddev->new_layout = mddev->layout;
3894 }
3895 } else {
3896 mddev->new_layout = n;
3897 if (mddev->reshape_position == MaxSector)
3898 mddev->layout = n;
3899 }
3900 mddev_unlock(mddev);
3901 return err ?: len;
3902}
3903static struct md_sysfs_entry md_layout =
3904__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3905
3906static ssize_t
3907raid_disks_show(struct mddev *mddev, char *page)
3908{
3909 if (mddev->raid_disks == 0)
3910 return 0;
3911 if (mddev->reshape_position != MaxSector &&
3912 mddev->delta_disks != 0)
3913 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3914 mddev->raid_disks - mddev->delta_disks);
3915 return sprintf(page, "%d\n", mddev->raid_disks);
3916}
3917
3918static int update_raid_disks(struct mddev *mddev, int raid_disks);
3919
3920static ssize_t
3921raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3922{
3923 unsigned int n;
3924 int err;
3925
3926 err = kstrtouint(buf, 10, &n);
3927 if (err < 0)
3928 return err;
3929
3930 err = mddev_lock(mddev);
3931 if (err)
3932 return err;
3933 if (mddev->pers)
3934 err = update_raid_disks(mddev, n);
3935 else if (mddev->reshape_position != MaxSector) {
3936 struct md_rdev *rdev;
3937 int olddisks = mddev->raid_disks - mddev->delta_disks;
3938
3939 err = -EINVAL;
3940 rdev_for_each(rdev, mddev) {
3941 if (olddisks < n &&
3942 rdev->data_offset < rdev->new_data_offset)
3943 goto out_unlock;
3944 if (olddisks > n &&
3945 rdev->data_offset > rdev->new_data_offset)
3946 goto out_unlock;
3947 }
3948 err = 0;
3949 mddev->delta_disks = n - olddisks;
3950 mddev->raid_disks = n;
3951 mddev->reshape_backwards = (mddev->delta_disks < 0);
3952 } else
3953 mddev->raid_disks = n;
3954out_unlock:
3955 mddev_unlock(mddev);
3956 return err ? err : len;
3957}
3958static struct md_sysfs_entry md_raid_disks =
3959__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3960
3961static ssize_t
3962chunk_size_show(struct mddev *mddev, char *page)
3963{
3964 if (mddev->reshape_position != MaxSector &&
3965 mddev->chunk_sectors != mddev->new_chunk_sectors)
3966 return sprintf(page, "%d (%d)\n",
3967 mddev->new_chunk_sectors << 9,
3968 mddev->chunk_sectors << 9);
3969 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3970}
3971
3972static ssize_t
3973chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3974{
3975 unsigned long n;
3976 int err;
3977
3978 err = kstrtoul(buf, 10, &n);
3979 if (err < 0)
3980 return err;
3981
3982 err = mddev_lock(mddev);
3983 if (err)
3984 return err;
3985 if (mddev->pers) {
3986 if (mddev->pers->check_reshape == NULL)
3987 err = -EBUSY;
3988 else if (mddev->ro)
3989 err = -EROFS;
3990 else {
3991 mddev->new_chunk_sectors = n >> 9;
3992 err = mddev->pers->check_reshape(mddev);
3993 if (err)
3994 mddev->new_chunk_sectors = mddev->chunk_sectors;
3995 }
3996 } else {
3997 mddev->new_chunk_sectors = n >> 9;
3998 if (mddev->reshape_position == MaxSector)
3999 mddev->chunk_sectors = n >> 9;
4000 }
4001 mddev_unlock(mddev);
4002 return err ?: len;
4003}
4004static struct md_sysfs_entry md_chunk_size =
4005__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4006
4007static ssize_t
4008resync_start_show(struct mddev *mddev, char *page)
4009{
4010 if (mddev->recovery_cp == MaxSector)
4011 return sprintf(page, "none\n");
4012 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4013}
4014
4015static ssize_t
4016resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4017{
4018 unsigned long long n;
4019 int err;
4020
4021 if (cmd_match(buf, "none"))
4022 n = MaxSector;
4023 else {
4024 err = kstrtoull(buf, 10, &n);
4025 if (err < 0)
4026 return err;
4027 if (n != (sector_t)n)
4028 return -EINVAL;
4029 }
4030
4031 err = mddev_lock(mddev);
4032 if (err)
4033 return err;
4034 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4035 err = -EBUSY;
4036
4037 if (!err) {
4038 mddev->recovery_cp = n;
4039 if (mddev->pers)
4040 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4041 }
4042 mddev_unlock(mddev);
4043 return err ?: len;
4044}
4045static struct md_sysfs_entry md_resync_start =
4046__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4047 resync_start_show, resync_start_store);
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
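/*
 * Array states reported and accepted through the "array_state" file:
 *   clear          no devices, no size, no level
 *   inactive       devices may be assembled but the array is not started
 *   suspended      not settable through this file
 *   readonly       started with ro == 1; no writes and no resync
 *   read-auto      ro == 2; switches to read-write on the first write
 *   clean          active with no pending writes (in_sync is set)
 *   active         fully active; writes and resync may be in flight
 *   write-pending  a superblock update must complete before writes proceed
 *   active-idle    active, but safemode has seen no writes for a while
 * Writing a state attempts the matching transition (stop the array, set
 * it read-only, restart it, mark it clean, ...); write-pending,
 * active-idle and suspended cannot be set directly.
 */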
4085enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4086 write_pending, active_idle, bad_word};
4087static char *array_states[] = {
4088 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4089 "write-pending", "active-idle", NULL };
4090
4091static int match_word(const char *word, char **list)
4092{
4093 int n;
4094 for (n=0; list[n]; n++)
4095 if (cmd_match(word, list[n]))
4096 break;
4097 return n;
4098}
4099
4100static ssize_t
4101array_state_show(struct mddev *mddev, char *page)
4102{
4103 enum array_state st = inactive;
4104
4105 if (mddev->pers)
4106 switch(mddev->ro) {
4107 case 1:
4108 st = readonly;
4109 break;
4110 case 2:
4111 st = read_auto;
4112 break;
4113 case 0:
4114 spin_lock(&mddev->lock);
4115 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4116 st = write_pending;
4117 else if (mddev->in_sync)
4118 st = clean;
4119 else if (mddev->safemode)
4120 st = active_idle;
4121 else
4122 st = active;
4123 spin_unlock(&mddev->lock);
4124 }
4125 else {
4126 if (list_empty(&mddev->disks) &&
4127 mddev->raid_disks == 0 &&
4128 mddev->dev_sectors == 0)
4129 st = clear;
4130 else
4131 st = inactive;
4132 }
4133 return sprintf(page, "%s\n", array_states[st]);
4134}
4135
4136static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4137static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4138static int do_md_run(struct mddev *mddev);
4139static int restart_array(struct mddev *mddev);
4140
4141static ssize_t
4142array_state_store(struct mddev *mddev, const char *buf, size_t len)
4143{
4144 int err = 0;
4145 enum array_state st = match_word(buf, array_states);
4146
4147 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
4148
4149
4150
4151 spin_lock(&mddev->lock);
4152 if (st == active) {
4153 restart_array(mddev);
4154 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4155 md_wakeup_thread(mddev->thread);
4156 wake_up(&mddev->sb_wait);
4157 } else {
4158 restart_array(mddev);
4159 if (!set_in_sync(mddev))
4160 err = -EBUSY;
4161 }
4162 if (!err)
4163 sysfs_notify_dirent_safe(mddev->sysfs_state);
4164 spin_unlock(&mddev->lock);
4165 return err ?: len;
4166 }
4167 err = mddev_lock(mddev);
4168 if (err)
4169 return err;
4170 err = -EINVAL;
4171 switch(st) {
4172 case bad_word:
4173 break;
4174 case clear:
4175
4176 err = do_md_stop(mddev, 0, NULL);
4177 break;
4178 case inactive:
4179
4180 if (mddev->pers)
4181 err = do_md_stop(mddev, 2, NULL);
4182 else
4183 err = 0;
4184 break;
4185 case suspended:
4186 break;
4187 case readonly:
4188 if (mddev->pers)
4189 err = md_set_readonly(mddev, NULL);
4190 else {
4191 mddev->ro = 1;
4192 set_disk_ro(mddev->gendisk, 1);
4193 err = do_md_run(mddev);
4194 }
4195 break;
4196 case read_auto:
4197 if (mddev->pers) {
4198 if (mddev->ro == 0)
4199 err = md_set_readonly(mddev, NULL);
4200 else if (mddev->ro == 1)
4201 err = restart_array(mddev);
4202 if (err == 0) {
4203 mddev->ro = 2;
4204 set_disk_ro(mddev->gendisk, 0);
4205 }
4206 } else {
4207 mddev->ro = 2;
4208 err = do_md_run(mddev);
4209 }
4210 break;
4211 case clean:
4212 if (mddev->pers) {
4213 err = restart_array(mddev);
4214 if (err)
4215 break;
4216 spin_lock(&mddev->lock);
4217 if (!set_in_sync(mddev))
4218 err = -EBUSY;
4219 spin_unlock(&mddev->lock);
4220 } else
4221 err = -EINVAL;
4222 break;
4223 case active:
4224 if (mddev->pers) {
4225 err = restart_array(mddev);
4226 if (err)
4227 break;
4228 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4229 wake_up(&mddev->sb_wait);
4230 err = 0;
4231 } else {
4232 mddev->ro = 0;
4233 set_disk_ro(mddev->gendisk, 0);
4234 err = do_md_run(mddev);
4235 }
4236 break;
4237 case write_pending:
4238 case active_idle:
4239
4240 break;
4241 }
4242
4243 if (!err) {
4244 if (mddev->hold_active == UNTIL_IOCTL)
4245 mddev->hold_active = 0;
4246 sysfs_notify_dirent_safe(mddev->sysfs_state);
4247 }
4248 mddev_unlock(mddev);
4249 return err ?: len;
4250}
4251static struct md_sysfs_entry md_array_state =
4252__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4253
4254static ssize_t
max_corrected_read_errors_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d\n",
4257 atomic_read(&mddev->max_corr_read_errors));
4258}
4259
4260static ssize_t
4261max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4262{
4263 unsigned int n;
4264 int rv;
4265
4266 rv = kstrtouint(buf, 10, &n);
4267 if (rv < 0)
4268 return rv;
4269 atomic_set(&mddev->max_corr_read_errors, n);
4270 return len;
4271}
4272
4273static struct md_sysfs_entry max_corr_read_errors =
4274__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4275 max_corrected_read_errors_store);
4276
4277static ssize_t
4278null_show(struct mddev *mddev, char *page)
4279{
4280 return -EINVAL;
4281}
4282
4283static ssize_t
4284new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4285{
4286
4287
4288
4289
4290
4291
4292
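	/*
	 * The input must be "<major>:<minor>" (optionally newline
	 * terminated).  The named device is imported, using the array's
	 * superblock format when the metadata is persistent, and then
	 * bound to the array.
	 */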
4293 char *e;
4294 int major = simple_strtoul(buf, &e, 10);
4295 int minor;
4296 dev_t dev;
4297 struct md_rdev *rdev;
4298 int err;
4299
4300 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4301 return -EINVAL;
4302 minor = simple_strtoul(e+1, &e, 10);
4303 if (*e && *e != '\n')
4304 return -EINVAL;
4305 dev = MKDEV(major, minor);
4306 if (major != MAJOR(dev) ||
4307 minor != MINOR(dev))
4308 return -EOVERFLOW;
4309
4310 flush_workqueue(md_misc_wq);
4311
4312 err = mddev_lock(mddev);
4313 if (err)
4314 return err;
4315 if (mddev->persistent) {
4316 rdev = md_import_device(dev, mddev->major_version,
4317 mddev->minor_version);
4318 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4319 struct md_rdev *rdev0
4320 = list_entry(mddev->disks.next,
4321 struct md_rdev, same_set);
4322 err = super_types[mddev->major_version]
4323 .load_super(rdev, rdev0, mddev->minor_version);
4324 if (err < 0)
4325 goto out;
4326 }
4327 } else if (mddev->external)
4328 rdev = md_import_device(dev, -2, -1);
4329 else
4330 rdev = md_import_device(dev, -1, -1);
4331
4332 if (IS_ERR(rdev)) {
4333 mddev_unlock(mddev);
4334 return PTR_ERR(rdev);
4335 }
4336 err = bind_rdev_to_array(rdev, mddev);
4337 out:
4338 if (err)
4339 export_rdev(rdev);
4340 mddev_unlock(mddev);
4341 if (!err)
4342 md_new_event(mddev);
4343 return err ? err : len;
4344}
4345
4346static struct md_sysfs_entry md_new_device =
4347__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4348
4349static ssize_t
4350bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4351{
4352 char *end;
4353 unsigned long chunk, end_chunk;
4354 int err;
4355
4356 err = mddev_lock(mddev);
4357 if (err)
4358 return err;
4359 if (!mddev->bitmap)
4360 goto out;
4361
4362 while (*buf) {
4363 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4364 if (buf == end) break;
4365 if (*end == '-') {
4366 buf = end + 1;
4367 end_chunk = simple_strtoul(buf, &end, 0);
4368 if (buf == end) break;
4369 }
4370 if (*end && !isspace(*end)) break;
4371 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4372 buf = skip_spaces(end);
4373 }
4374 bitmap_unplug(mddev->bitmap);
4375out:
4376 mddev_unlock(mddev);
4377 return len;
4378}
4379
4380static struct md_sysfs_entry md_bitmap =
4381__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4382
4383static ssize_t
4384size_show(struct mddev *mddev, char *page)
4385{
4386 return sprintf(page, "%llu\n",
4387 (unsigned long long)mddev->dev_sectors / 2);
4388}
4389
4390static int update_size(struct mddev *mddev, sector_t num_sectors);
4391
4392static ssize_t
4393size_store(struct mddev *mddev, const char *buf, size_t len)
4394{
4395
4396
4397
4398
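	/*
	 * "component_size" is the amount of each device used by the array
	 * (in 1K blocks).  On a running array this attempts an online
	 * resize; on an inactive array the size may only be reduced (or
	 * set from zero).
	 */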
4399 sector_t sectors;
	int err = strict_blocks_to_sectors(buf, &sectors);
4401
4402 if (err < 0)
4403 return err;
4404 err = mddev_lock(mddev);
4405 if (err)
4406 return err;
4407 if (mddev->pers) {
4408 err = update_size(mddev, sectors);
4409 if (err == 0)
4410 md_update_sb(mddev, 1);
4411 } else {
4412 if (mddev->dev_sectors == 0 ||
4413 mddev->dev_sectors > sectors)
4414 mddev->dev_sectors = sectors;
4415 else
4416 err = -ENOSPC;
4417 }
4418 mddev_unlock(mddev);
4419 return err ? err : len;
4420}
4421
4422static struct md_sysfs_entry md_size =
4423__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4424
4425
4426
4427
4428
4429
4430
4431static ssize_t
4432metadata_show(struct mddev *mddev, char *page)
4433{
4434 if (mddev->persistent)
4435 return sprintf(page, "%d.%d\n",
4436 mddev->major_version, mddev->minor_version);
4437 else if (mddev->external)
4438 return sprintf(page, "external:%s\n", mddev->metadata_type);
4439 else
4440 return sprintf(page, "none\n");
4441}
4442
4443static ssize_t
4444metadata_store(struct mddev *mddev, const char *buf, size_t len)
4445{
4446 int major, minor;
4447 char *e;
4448 int err;
4449
4450
4451
4452
4453
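	/*
	 * The metadata format may only be changed while no devices are
	 * attached, except that the name of "external:" metadata can always
	 * be updated.  Accepted values are "none", "external:<name>" or a
	 * "<major>.<minor>" superblock version.
	 */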
4454 err = mddev_lock(mddev);
4455 if (err)
4456 return err;
4457 err = -EBUSY;
4458 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4459 ;
4460 else if (!list_empty(&mddev->disks))
4461 goto out_unlock;
4462
4463 err = 0;
4464 if (cmd_match(buf, "none")) {
4465 mddev->persistent = 0;
4466 mddev->external = 0;
4467 mddev->major_version = 0;
4468 mddev->minor_version = 90;
4469 goto out_unlock;
4470 }
4471 if (strncmp(buf, "external:", 9) == 0) {
4472 size_t namelen = len-9;
4473 if (namelen >= sizeof(mddev->metadata_type))
4474 namelen = sizeof(mddev->metadata_type)-1;
4475 strncpy(mddev->metadata_type, buf+9, namelen);
4476 mddev->metadata_type[namelen] = 0;
4477 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4478 mddev->metadata_type[--namelen] = 0;
4479 mddev->persistent = 0;
4480 mddev->external = 1;
4481 mddev->major_version = 0;
4482 mddev->minor_version = 90;
4483 goto out_unlock;
4484 }
4485 major = simple_strtoul(buf, &e, 10);
4486 err = -EINVAL;
4487 if (e==buf || *e != '.')
4488 goto out_unlock;
4489 buf = e+1;
4490 minor = simple_strtoul(buf, &e, 10);
4491 if (e==buf || (*e && *e != '\n') )
4492 goto out_unlock;
4493 err = -ENOENT;
4494 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4495 goto out_unlock;
4496 mddev->major_version = major;
4497 mddev->minor_version = minor;
4498 mddev->persistent = 1;
4499 mddev->external = 0;
4500 err = 0;
4501out_unlock:
4502 mddev_unlock(mddev);
4503 return err ?: len;
4504}
4505
4506static struct md_sysfs_entry md_metadata =
4507__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4508
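/*
 * "sync_action" reports and controls the current sync operation: idle,
 * frozen, resync, recover, check, repair or reshape.  Writing "idle" or
 * "frozen" interrupts a running operation; the other keywords schedule
 * the corresponding action and wake the md thread.
 */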
4509static ssize_t
4510action_show(struct mddev *mddev, char *page)
4511{
4512 char *type = "idle";
4513 unsigned long recovery = mddev->recovery;
4514 if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4515 type = "frozen";
4516 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4517 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4518 if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4519 type = "reshape";
4520 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4521 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4522 type = "resync";
4523 else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4524 type = "check";
4525 else
4526 type = "repair";
4527 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4528 type = "recover";
4529 else if (mddev->reshape_position != MaxSector)
4530 type = "reshape";
4531 }
4532 return sprintf(page, "%s\n", type);
4533}
4534
4535static ssize_t
4536action_store(struct mddev *mddev, const char *page, size_t len)
4537{
4538 if (!mddev->pers || !mddev->pers->sync_request)
4539 return -EINVAL;
4540
4541
4542 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4543 if (cmd_match(page, "frozen"))
4544 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4545 else
4546 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4547 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4548 mddev_lock(mddev) == 0) {
4549 flush_workqueue(md_misc_wq);
4550 if (mddev->sync_thread) {
4551 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4552 md_reap_sync_thread(mddev);
4553 }
4554 mddev_unlock(mddev);
4555 }
4556 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4557 return -EBUSY;
4558 else if (cmd_match(page, "resync"))
4559 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4560 else if (cmd_match(page, "recover")) {
4561 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4562 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4563 } else if (cmd_match(page, "reshape")) {
4564 int err;
4565 if (mddev->pers->start_reshape == NULL)
4566 return -EINVAL;
4567 err = mddev_lock(mddev);
4568 if (!err) {
4569 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4570 err = -EBUSY;
4571 else {
4572 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4573 err = mddev->pers->start_reshape(mddev);
4574 }
4575 mddev_unlock(mddev);
4576 }
4577 if (err)
4578 return err;
4579 sysfs_notify(&mddev->kobj, NULL, "degraded");
4580 } else {
4581 if (cmd_match(page, "check"))
4582 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4583 else if (!cmd_match(page, "repair"))
4584 return -EINVAL;
4585 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4586 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4587 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4588 }
4589 if (mddev->ro == 2) {
		/* A write to sync_action is enough to justify clearing
		 * read-auto mode.
		 */
4593 mddev->ro = 0;
4594 md_wakeup_thread(mddev->sync_thread);
4595 }
4596 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4597 md_wakeup_thread(mddev->thread);
4598 sysfs_notify_dirent_safe(mddev->sysfs_action);
4599 return len;
4600}
4601
4602static struct md_sysfs_entry md_scan_mode =
4603__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4604
4605static ssize_t
4606last_sync_action_show(struct mddev *mddev, char *page)
4607{
4608 return sprintf(page, "%s\n", mddev->last_sync_action);
4609}
4610
4611static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4612
4613static ssize_t
4614mismatch_cnt_show(struct mddev *mddev, char *page)
4615{
4616 return sprintf(page, "%llu\n",
4617 (unsigned long long)
4618 atomic64_read(&mddev->resync_mismatches));
4619}
4620
4621static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4622
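/*
 * "sync_speed_min" and "sync_speed_max" are per-array resync speed
 * limits in KB/sec.  Writing "system" (stored as 0) makes the array
 * follow the global speed_limit_min/speed_limit_max settings instead,
 * which is what speed_min()/speed_max() report as "(system)".
 */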
4623static ssize_t
4624sync_min_show(struct mddev *mddev, char *page)
4625{
4626 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4627 mddev->sync_speed_min ? "local": "system");
4628}
4629
4630static ssize_t
4631sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4632{
4633 unsigned int min;
4634 int rv;
4635
4636 if (strncmp(buf, "system", 6)==0) {
4637 min = 0;
4638 } else {
4639 rv = kstrtouint(buf, 10, &min);
4640 if (rv < 0)
4641 return rv;
4642 if (min == 0)
4643 return -EINVAL;
4644 }
4645 mddev->sync_speed_min = min;
4646 return len;
4647}
4648
4649static struct md_sysfs_entry md_sync_min =
4650__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4651
4652static ssize_t
4653sync_max_show(struct mddev *mddev, char *page)
4654{
4655 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4656 mddev->sync_speed_max ? "local": "system");
4657}
4658
4659static ssize_t
4660sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4661{
4662 unsigned int max;
4663 int rv;
4664
4665 if (strncmp(buf, "system", 6)==0) {
4666 max = 0;
4667 } else {
4668 rv = kstrtouint(buf, 10, &max);
4669 if (rv < 0)
4670 return rv;
4671 if (max == 0)
4672 return -EINVAL;
4673 }
4674 mddev->sync_speed_max = max;
4675 return len;
4676}
4677
4678static struct md_sysfs_entry md_sync_max =
4679__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4680
4681static ssize_t
4682degraded_show(struct mddev *mddev, char *page)
4683{
4684 return sprintf(page, "%d\n", mddev->degraded);
4685}
4686static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4687
4688static ssize_t
4689sync_force_parallel_show(struct mddev *mddev, char *page)
4690{
4691 return sprintf(page, "%d\n", mddev->parallel_resync);
4692}
4693
4694static ssize_t
4695sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4696{
4697 long n;
4698
4699 if (kstrtol(buf, 10, &n))
4700 return -EINVAL;
4701
4702 if (n != 0 && n != 1)
4703 return -EINVAL;
4704
4705 mddev->parallel_resync = n;
4706
4707 if (mddev->sync_thread)
4708 wake_up(&resync_wait);
4709
4710 return len;
4711}
4712
4713
4714static struct md_sysfs_entry md_sync_force_parallel =
4715__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4716 sync_force_parallel_show, sync_force_parallel_store);
4717
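/*
 * "sync_speed" is the current resync rate in KB/sec, derived from the
 * sectors completed since the last rate mark: sectors / seconds / 2.
 */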
4718static ssize_t
4719sync_speed_show(struct mddev *mddev, char *page)
4720{
4721 unsigned long resync, dt, db;
4722 if (mddev->curr_resync == 0)
4723 return sprintf(page, "none\n");
4724 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4725 dt = (jiffies - mddev->resync_mark) / HZ;
4726 if (!dt) dt++;
4727 db = resync - mddev->resync_mark_cnt;
4728 return sprintf(page, "%lu\n", db/dt/2);
4729}
4730
4731static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4732
4733static ssize_t
4734sync_completed_show(struct mddev *mddev, char *page)
4735{
4736 unsigned long long max_sectors, resync;
4737
4738 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4739 return sprintf(page, "none\n");
4740
4741 if (mddev->curr_resync == 1 ||
4742 mddev->curr_resync == 2)
4743 return sprintf(page, "delayed\n");
4744
4745 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4746 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4747 max_sectors = mddev->resync_max_sectors;
4748 else
4749 max_sectors = mddev->dev_sectors;
4750
4751 resync = mddev->curr_resync_completed;
4752 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4753}
4754
4755static struct md_sysfs_entry md_sync_completed =
4756 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
4757
4758static ssize_t
4759min_sync_show(struct mddev *mddev, char *page)
4760{
4761 return sprintf(page, "%llu\n",
4762 (unsigned long long)mddev->resync_min);
4763}
4764static ssize_t
4765min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4766{
4767 unsigned long long min;
4768 int err;
4769
4770 if (kstrtoull(buf, 10, &min))
4771 return -EINVAL;
4772
4773 spin_lock(&mddev->lock);
4774 err = -EINVAL;
4775 if (min > mddev->resync_max)
4776 goto out_unlock;
4777
4778 err = -EBUSY;
4779 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4780 goto out_unlock;
4781
	/* Round down to a multiple of 8 sectors (4K) for safety */
4783 mddev->resync_min = round_down(min, 8);
4784 err = 0;
4785
4786out_unlock:
4787 spin_unlock(&mddev->lock);
4788 return err ?: len;
4789}
4790
4791static struct md_sysfs_entry md_min_sync =
4792__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4793
4794static ssize_t
4795max_sync_show(struct mddev *mddev, char *page)
4796{
4797 if (mddev->resync_max == MaxSector)
4798 return sprintf(page, "max\n");
4799 else
4800 return sprintf(page, "%llu\n",
4801 (unsigned long long)mddev->resync_max);
4802}
4803static ssize_t
4804max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4805{
4806 int err;
4807 spin_lock(&mddev->lock);
4808 if (strncmp(buf, "max", 3) == 0)
4809 mddev->resync_max = MaxSector;
4810 else {
4811 unsigned long long max;
4812 int chunk;
4813
4814 err = -EINVAL;
4815 if (kstrtoull(buf, 10, &max))
4816 goto out_unlock;
4817 if (max < mddev->resync_min)
4818 goto out_unlock;
4819
4820 err = -EBUSY;
4821 if (max < mddev->resync_max &&
4822 mddev->ro == 0 &&
4823 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4824 goto out_unlock;
4825
		/* must be a multiple of chunk_sectors */
4827 chunk = mddev->chunk_sectors;
4828 if (chunk) {
4829 sector_t temp = max;
4830
4831 err = -EINVAL;
4832 if (sector_div(temp, chunk))
4833 goto out_unlock;
4834 }
4835 mddev->resync_max = max;
4836 }
4837 wake_up(&mddev->recovery_wait);
4838 err = 0;
4839out_unlock:
4840 spin_unlock(&mddev->lock);
4841 return err ?: len;
4842}
4843
4844static struct md_sysfs_entry md_max_sync =
4845__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4846
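/*
 * "suspend_lo" and "suspend_hi" bound a range of sectors in which I/O
 * is suspended.  Updating either bound quiesces the array around the
 * change (mddev_suspend()/mddev_resume()) so the personality always
 * sees a consistent window.
 */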
4847static ssize_t
4848suspend_lo_show(struct mddev *mddev, char *page)
4849{
4850 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4851}
4852
4853static ssize_t
4854suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4855{
4856 unsigned long long new;
4857 int err;
4858
4859 err = kstrtoull(buf, 10, &new);
4860 if (err < 0)
4861 return err;
4862 if (new != (sector_t)new)
4863 return -EINVAL;
4864
4865 err = mddev_lock(mddev);
4866 if (err)
4867 return err;
4868 err = -EINVAL;
4869 if (mddev->pers == NULL ||
4870 mddev->pers->quiesce == NULL)
4871 goto unlock;
4872 mddev_suspend(mddev);
4873 mddev->suspend_lo = new;
4874 mddev_resume(mddev);
4875
4876 err = 0;
4877unlock:
4878 mddev_unlock(mddev);
4879 return err ?: len;
4880}
4881static struct md_sysfs_entry md_suspend_lo =
4882__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4883
4884static ssize_t
4885suspend_hi_show(struct mddev *mddev, char *page)
4886{
4887 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4888}
4889
4890static ssize_t
4891suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4892{
4893 unsigned long long new;
4894 int err;
4895
4896 err = kstrtoull(buf, 10, &new);
4897 if (err < 0)
4898 return err;
4899 if (new != (sector_t)new)
4900 return -EINVAL;
4901
4902 err = mddev_lock(mddev);
4903 if (err)
4904 return err;
4905 err = -EINVAL;
4906 if (mddev->pers == NULL)
4907 goto unlock;
4908
4909 mddev_suspend(mddev);
4910 mddev->suspend_hi = new;
4911 mddev_resume(mddev);
4912
4913 err = 0;
4914unlock:
4915 mddev_unlock(mddev);
4916 return err ?: len;
4917}
4918static struct md_sysfs_entry md_suspend_hi =
4919__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4920
4921static ssize_t
4922reshape_position_show(struct mddev *mddev, char *page)
4923{
4924 if (mddev->reshape_position != MaxSector)
4925 return sprintf(page, "%llu\n",
4926 (unsigned long long)mddev->reshape_position);
4927 strcpy(page, "none\n");
4928 return 5;
4929}
4930
4931static ssize_t
4932reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4933{
4934 struct md_rdev *rdev;
4935 unsigned long long new;
4936 int err;
4937
4938 err = kstrtoull(buf, 10, &new);
4939 if (err < 0)
4940 return err;
4941 if (new != (sector_t)new)
4942 return -EINVAL;
4943 err = mddev_lock(mddev);
4944 if (err)
4945 return err;
4946 err = -EBUSY;
4947 if (mddev->pers)
4948 goto unlock;
4949 mddev->reshape_position = new;
4950 mddev->delta_disks = 0;
4951 mddev->reshape_backwards = 0;
4952 mddev->new_level = mddev->level;
4953 mddev->new_layout = mddev->layout;
4954 mddev->new_chunk_sectors = mddev->chunk_sectors;
4955 rdev_for_each(rdev, mddev)
4956 rdev->new_data_offset = rdev->data_offset;
4957 err = 0;
4958unlock:
4959 mddev_unlock(mddev);
4960 return err ?: len;
4961}
4962
4963static struct md_sysfs_entry md_reshape_position =
4964__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4965 reshape_position_store);
4966
4967static ssize_t
4968reshape_direction_show(struct mddev *mddev, char *page)
4969{
4970 return sprintf(page, "%s\n",
4971 mddev->reshape_backwards ? "backwards" : "forwards");
4972}
4973
4974static ssize_t
4975reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4976{
4977 int backwards = 0;
4978 int err;
4979
4980 if (cmd_match(buf, "forwards"))
4981 backwards = 0;
4982 else if (cmd_match(buf, "backwards"))
4983 backwards = 1;
4984 else
4985 return -EINVAL;
4986 if (mddev->reshape_backwards == backwards)
4987 return len;
4988
4989 err = mddev_lock(mddev);
4990 if (err)
4991 return err;
4992
4993 if (mddev->delta_disks)
4994 err = -EBUSY;
4995 else if (mddev->persistent &&
4996 mddev->major_version == 0)
4997 err = -EINVAL;
4998 else
4999 mddev->reshape_backwards = backwards;
5000 mddev_unlock(mddev);
5001 return err ?: len;
5002}
5003
5004static struct md_sysfs_entry md_reshape_direction =
5005__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5006 reshape_direction_store);
5007
5008static ssize_t
5009array_size_show(struct mddev *mddev, char *page)
5010{
5011 if (mddev->external_size)
5012 return sprintf(page, "%llu\n",
5013 (unsigned long long)mddev->array_sectors/2);
5014 else
5015 return sprintf(page, "default\n");
5016}
5017
5018static ssize_t
5019array_size_store(struct mddev *mddev, const char *buf, size_t len)
5020{
5021 sector_t sectors;
5022 int err;
5023
5024 err = mddev_lock(mddev);
5025 if (err)
5026 return err;
5027
5028
5029 if (mddev_is_clustered(mddev)) {
5030 mddev_unlock(mddev);
5031 return -EINVAL;
5032 }
5033
5034 if (strncmp(buf, "default", 7) == 0) {
5035 if (mddev->pers)
5036 sectors = mddev->pers->size(mddev, 0, 0);
5037 else
5038 sectors = mddev->array_sectors;
5039
5040 mddev->external_size = 0;
5041 } else {
		if (strict_blocks_to_sectors(buf, &sectors) < 0)
5043 err = -EINVAL;
5044 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5045 err = -E2BIG;
5046 else
5047 mddev->external_size = 1;
5048 }
5049
5050 if (!err) {
5051 mddev->array_sectors = sectors;
5052 if (mddev->pers) {
5053 set_capacity(mddev->gendisk, mddev->array_sectors);
5054 revalidate_disk(mddev->gendisk);
5055 }
5056 }
5057 mddev_unlock(mddev);
5058 return err ?: len;
5059}
5060
5061static struct md_sysfs_entry md_array_size =
5062__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5063 array_size_store);
5064
5065static ssize_t
5066consistency_policy_show(struct mddev *mddev, char *page)
5067{
5068 int ret;
5069
5070 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5071 ret = sprintf(page, "journal\n");
5072 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5073 ret = sprintf(page, "ppl\n");
5074 } else if (mddev->bitmap) {
5075 ret = sprintf(page, "bitmap\n");
5076 } else if (mddev->pers) {
5077 if (mddev->pers->sync_request)
5078 ret = sprintf(page, "resync\n");
5079 else
5080 ret = sprintf(page, "none\n");
5081 } else {
5082 ret = sprintf(page, "unknown\n");
5083 }
5084
5085 return ret;
5086}
5087
5088static ssize_t
5089consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5090{
5091 int err = 0;
5092
5093 if (mddev->pers) {
5094 if (mddev->pers->change_consistency_policy)
5095 err = mddev->pers->change_consistency_policy(mddev, buf);
5096 else
5097 err = -EBUSY;
5098 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5099 set_bit(MD_HAS_PPL, &mddev->flags);
5100 } else {
5101 err = -EINVAL;
5102 }
5103
5104 return err ? err : len;
5105}
5106
5107static struct md_sysfs_entry md_consistency_policy =
5108__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5109 consistency_policy_store);
5110
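/*
 * md_default_attrs are always present in the array's "md" sysfs
 * directory; md_redundancy_attrs are added only for personalities that
 * implement sync_request (see md_run() below).
 */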
5111static struct attribute *md_default_attrs[] = {
5112 &md_level.attr,
5113 &md_layout.attr,
5114 &md_raid_disks.attr,
5115 &md_chunk_size.attr,
5116 &md_size.attr,
5117 &md_resync_start.attr,
5118 &md_metadata.attr,
5119 &md_new_device.attr,
5120 &md_safe_delay.attr,
5121 &md_array_state.attr,
5122 &md_reshape_position.attr,
5123 &md_reshape_direction.attr,
5124 &md_array_size.attr,
5125 &max_corr_read_errors.attr,
5126 &md_consistency_policy.attr,
5127 NULL,
5128};
5129
5130static struct attribute *md_redundancy_attrs[] = {
5131 &md_scan_mode.attr,
5132 &md_last_scan_mode.attr,
5133 &md_mismatches.attr,
5134 &md_sync_min.attr,
5135 &md_sync_max.attr,
5136 &md_sync_speed.attr,
5137 &md_sync_force_parallel.attr,
5138 &md_sync_completed.attr,
5139 &md_min_sync.attr,
5140 &md_max_sync.attr,
5141 &md_suspend_lo.attr,
5142 &md_suspend_hi.attr,
5143 &md_bitmap.attr,
5144 &md_degraded.attr,
5145 NULL,
5146};
5147static struct attribute_group md_redundancy_group = {
5148 .name = NULL,
5149 .attrs = md_redundancy_attrs,
5150};
5151
5152static ssize_t
5153md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5154{
5155 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5156 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5157 ssize_t rv;
5158
5159 if (!entry->show)
5160 return -EIO;
5161 spin_lock(&all_mddevs_lock);
5162 if (list_empty(&mddev->all_mddevs)) {
5163 spin_unlock(&all_mddevs_lock);
5164 return -EBUSY;
5165 }
5166 mddev_get(mddev);
5167 spin_unlock(&all_mddevs_lock);
5168
5169 rv = entry->show(mddev, page);
5170 mddev_put(mddev);
5171 return rv;
5172}
5173
5174static ssize_t
5175md_attr_store(struct kobject *kobj, struct attribute *attr,
5176 const char *page, size_t length)
5177{
5178 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5179 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5180 ssize_t rv;
5181
5182 if (!entry->store)
5183 return -EIO;
5184 if (!capable(CAP_SYS_ADMIN))
5185 return -EACCES;
5186 spin_lock(&all_mddevs_lock);
5187 if (list_empty(&mddev->all_mddevs)) {
5188 spin_unlock(&all_mddevs_lock);
5189 return -EBUSY;
5190 }
5191 mddev_get(mddev);
5192 spin_unlock(&all_mddevs_lock);
5193 rv = entry->store(mddev, page, length);
5194 mddev_put(mddev);
5195 return rv;
5196}
5197
5198static void md_free(struct kobject *ko)
5199{
5200 struct mddev *mddev = container_of(ko, struct mddev, kobj);
5201
5202 if (mddev->sysfs_state)
5203 sysfs_put(mddev->sysfs_state);
5204
5205 if (mddev->queue)
5206 blk_cleanup_queue(mddev->queue);
5207 if (mddev->gendisk) {
5208 del_gendisk(mddev->gendisk);
5209 put_disk(mddev->gendisk);
5210 }
5211 percpu_ref_exit(&mddev->writes_pending);
5212
5213 kfree(mddev);
5214}
5215
5216static const struct sysfs_ops md_sysfs_ops = {
5217 .show = md_attr_show,
5218 .store = md_attr_store,
5219};
5220static struct kobj_type md_ktype = {
5221 .release = md_free,
5222 .sysfs_ops = &md_sysfs_ops,
5223 .default_attrs = md_default_attrs,
5224};
5225
5226int mdp_major = 0;
5227
5228static void mddev_delayed_delete(struct work_struct *ws)
5229{
5230 struct mddev *mddev = container_of(ws, struct mddev, del_work);
5231
5232 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5233 kobject_del(&mddev->kobj);
5234 kobject_put(&mddev->kobj);
5235}
5236
5237static void no_op(struct percpu_ref *r) {}
5238
5239int mddev_init_writes_pending(struct mddev *mddev)
5240{
5241 if (mddev->writes_pending.percpu_count_ptr)
5242 return 0;
5243 if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
5244 return -ENOMEM;
5245
5246 percpu_ref_put(&mddev->writes_pending);
5247 return 0;
5248}
5249EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
5250
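/*
 * Allocate the request queue, gendisk and "md" sysfs directory for an
 * array device.  Called from md_probe() when a device node is first
 * opened, or from add_named_array() when an array is created by name.
 */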
5251static int md_alloc(dev_t dev, char *name)
5252{
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262 static DEFINE_MUTEX(disks_mutex);
5263 struct mddev *mddev = mddev_find(dev);
5264 struct gendisk *disk;
5265 int partitioned;
5266 int shift;
5267 int unit;
5268 int error;
5269
5270 if (!mddev)
5271 return -ENODEV;
5272
5273 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5274 shift = partitioned ? MdpMinorShift : 0;
5275 unit = MINOR(mddev->unit) >> shift;
5276
	/* Wait for any previous instance of this device to be completely
	 * removed (see mddev_delayed_delete() above).
	 */
5280 flush_workqueue(md_misc_wq);
5281
5282 mutex_lock(&disks_mutex);
5283 error = -EEXIST;
5284 if (mddev->gendisk)
5285 goto abort;
5286
5287 if (name && !dev) {
		/* Need to ensure that 'name' is not a duplicate.
		 */
5290 struct mddev *mddev2;
5291 spin_lock(&all_mddevs_lock);
5292
5293 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5294 if (mddev2->gendisk &&
5295 strcmp(mddev2->gendisk->disk_name, name) == 0) {
5296 spin_unlock(&all_mddevs_lock);
5297 goto abort;
5298 }
5299 spin_unlock(&all_mddevs_lock);
5300 }
5301 if (name && dev)
		/* An array created with an explicit name and device number is
		 * kept allocated until it is explicitly stopped.
		 */
5305 mddev->hold_active = UNTIL_STOP;
5306
5307 error = -ENOMEM;
5308 mddev->queue = blk_alloc_queue(GFP_KERNEL);
5309 if (!mddev->queue)
5310 goto abort;
5311 mddev->queue->queuedata = mddev;
5312
5313 blk_queue_make_request(mddev->queue, md_make_request);
5314 blk_set_stacking_limits(&mddev->queue->limits);
5315
5316 disk = alloc_disk(1 << shift);
5317 if (!disk) {
5318 blk_cleanup_queue(mddev->queue);
5319 mddev->queue = NULL;
5320 goto abort;
5321 }
5322 disk->major = MAJOR(mddev->unit);
5323 disk->first_minor = unit << shift;
5324 if (name)
5325 strcpy(disk->disk_name, name);
5326 else if (partitioned)
5327 sprintf(disk->disk_name, "md_d%d", unit);
5328 else
5329 sprintf(disk->disk_name, "md%d", unit);
5330 disk->fops = &md_fops;
5331 disk->private_data = mddev;
5332 disk->queue = mddev->queue;
5333 blk_queue_write_cache(mddev->queue, true, true);
5334
5335
5336
5337
5338 disk->flags |= GENHD_FL_EXT_DEVT;
5339 mddev->gendisk = disk;
5340
	/* Hold open_mutex across add_disk() and the sysfs setup so a
	 * concurrent open cannot see a half-initialised device.
	 */
5343 mutex_lock(&mddev->open_mutex);
5344 add_disk(disk);
5345
5346 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
5347 &disk_to_dev(disk)->kobj, "%s", "md");
5348 if (error) {
5349
5350
5351
5352 pr_debug("md: cannot register %s/md - name in use\n",
5353 disk->disk_name);
5354 error = 0;
5355 }
5356 if (mddev->kobj.sd &&
5357 sysfs_create_group(&mddev->kobj, &md_bitmap_group))
5358 pr_debug("pointless warning\n");
5359 mutex_unlock(&mddev->open_mutex);
5360 abort:
5361 mutex_unlock(&disks_mutex);
5362 if (!error && mddev->kobj.sd) {
5363 kobject_uevent(&mddev->kobj, KOBJ_ADD);
5364 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5365 }
5366 mddev_put(mddev);
5367 return error;
5368}
5369
5370static struct kobject *md_probe(dev_t dev, int *part, void *data)
5371{
5372 if (create_on_open)
5373 md_alloc(dev, NULL);
5374 return NULL;
5375}
5376
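/*
 * add_named_array() lets user space create an array by writing a name
 * to the module parameter this handler is bound to.  Accepted values
 * are "md_<name>" or "md<number>"; anything else is rejected.  For
 * example (parameter path assumed to be the usual md_mod location):
 *
 *	echo md_home > /sys/module/md_mod/parameters/new_array
 */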
5377static int add_named_array(const char *val, const struct kernel_param *kp)
5378{
5379
5380
5381
5382
5383
5384
5385
5386 int len = strlen(val);
5387 char buf[DISK_NAME_LEN];
5388 unsigned long devnum;
5389
5390 while (len && val[len-1] == '\n')
5391 len--;
5392 if (len >= DISK_NAME_LEN)
5393 return -E2BIG;
5394 strlcpy(buf, val, len+1);
5395 if (strncmp(buf, "md_", 3) == 0)
5396 return md_alloc(0, buf);
5397 if (strncmp(buf, "md", 2) == 0 &&
5398 isdigit(buf[2]) &&
5399 kstrtoul(buf+2, 10, &devnum) == 0 &&
5400 devnum <= MINORMASK)
5401 return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
5402
5403 return -EINVAL;
5404}
5405
5406static void md_safemode_timeout(struct timer_list *t)
5407{
5408 struct mddev *mddev = from_timer(mddev, t, safemode_timer);
5409
5410 mddev->safemode = 1;
5411 if (mddev->external)
5412 sysfs_notify_dirent_safe(mddev->sysfs_state);
5413
5414 md_wakeup_thread(mddev->thread);
5415}
5416
5417static int start_dirty_degraded;
5418
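/*
 * md_run() starts a fully-populated mddev: it loads the personality
 * module, sanity-checks the member devices, creates the bio sets,
 * calls pers->run(), and for personalities with sync_request registers
 * the redundancy sysfs attributes and an optional bitmap.
 */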
5419int md_run(struct mddev *mddev)
5420{
5421 int err;
5422 struct md_rdev *rdev;
5423 struct md_personality *pers;
5424
5425 if (list_empty(&mddev->disks))
		/* cannot run an array with no devices.. */
5427 return -EINVAL;
5428
5429 if (mddev->pers)
5430 return -EBUSY;
5431
5432 if (mddev->sysfs_active)
5433 return -EBUSY;
5434
	/*
	 * Analyze the superblocks of all member devices if the array
	 * geometry is not yet known.
	 */
5438 if (!mddev->raid_disks) {
5439 if (!mddev->persistent)
5440 return -EINVAL;
5441 analyze_sbs(mddev);
5442 }
5443
5444 if (mddev->level != LEVEL_NONE)
5445 request_module("md-level-%d", mddev->level);
5446 else if (mddev->clevel[0])
5447 request_module("md-%s", mddev->clevel);
5448
	/*
	 * Flush and drop the page cache of every member device; from now on
	 * the only valid access to them is through the md device.
	 */
5454 rdev_for_each(rdev, mddev) {
5455 if (test_bit(Faulty, &rdev->flags))
5456 continue;
5457 sync_blockdev(rdev->bdev);
5458 invalidate_bdev(rdev->bdev);
5459 if (mddev->ro != 1 &&
5460 (bdev_read_only(rdev->bdev) ||
5461 bdev_read_only(rdev->meta_bdev))) {
5462 mddev->ro = 1;
5463 if (mddev->gendisk)
5464 set_disk_ro(mddev->gendisk, 1);
5465 }
		/* Sanity-check the layout: the data area must not overlap the
		 * superblock (in either direction).
		 */
5471 if (rdev->meta_bdev) {
5472 ;
5473 } else if (rdev->data_offset < rdev->sb_start) {
5474 if (mddev->dev_sectors &&
5475 rdev->data_offset + mddev->dev_sectors
5476 > rdev->sb_start) {
5477 pr_warn("md: %s: data overlaps metadata\n",
5478 mdname(mddev));
5479 return -EINVAL;
5480 }
5481 } else {
5482 if (rdev->sb_start + rdev->sb_size/512
5483 > rdev->data_offset) {
5484 pr_warn("md: %s: metadata overlaps data\n",
5485 mdname(mddev));
5486 return -EINVAL;
5487 }
5488 }
5489 sysfs_notify_dirent_safe(rdev->sysfs_state);
5490 }
5491
5492 if (mddev->bio_set == NULL) {
5493 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5494 if (!mddev->bio_set)
5495 return -ENOMEM;
5496 }
5497 if (mddev->sync_set == NULL) {
5498 mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5499 if (!mddev->sync_set)
5500 return -ENOMEM;
5501 }
5502
5503 spin_lock(&pers_lock);
5504 pers = find_pers(mddev->level, mddev->clevel);
5505 if (!pers || !try_module_get(pers->owner)) {
5506 spin_unlock(&pers_lock);
5507 if (mddev->level != LEVEL_NONE)
5508 pr_warn("md: personality for level %d is not loaded!\n",
5509 mddev->level);
5510 else
5511 pr_warn("md: personality for level %s is not loaded!\n",
5512 mddev->clevel);
5513 return -EINVAL;
5514 }
5515 spin_unlock(&pers_lock);
5516 if (mddev->level != pers->level) {
5517 mddev->level = pers->level;
5518 mddev->new_level = pers->level;
5519 }
5520 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5521
5522 if (mddev->reshape_position != MaxSector &&
5523 pers->start_reshape == NULL) {
		/* This personality cannot handle reshaping... */
5525 module_put(pers->owner);
5526 return -EINVAL;
5527 }
5528
5529 if (pers->sync_request) {
		/* Warn if any two member devices share the same underlying
		 * physical disk - a potentially silly configuration.
		 */
5533 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5534 struct md_rdev *rdev2;
5535 int warned = 0;
5536
5537 rdev_for_each(rdev, mddev)
5538 rdev_for_each(rdev2, mddev) {
5539 if (rdev < rdev2 &&
5540 rdev->bdev->bd_contains ==
5541 rdev2->bdev->bd_contains) {
5542 pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
5543 mdname(mddev),
5544 bdevname(rdev->bdev,b),
5545 bdevname(rdev2->bdev,b2));
5546 warned = 1;
5547 }
5548 }
5549
5550 if (warned)
5551 pr_warn("True protection against single-disk failure might be compromised.\n");
5552 }
5553
5554 mddev->recovery = 0;
5555
5556 mddev->resync_max_sectors = mddev->dev_sectors;
5557
5558 mddev->ok_start_degraded = start_dirty_degraded;
5559
5560 if (start_readonly && mddev->ro == 0)
		mddev->ro = 2; /* read-only, switch to read-write on first write */
5562
5563
5564
5565
5566
5567
5568 err = pers->run(mddev);
5569 if (err)
5570 pr_warn("md: pers->run() failed ...\n");
5571 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5572 WARN_ONCE(!mddev->external_size,
5573 "%s: default size too small, but 'external_size' not in effect?\n",
5574 __func__);
5575 pr_warn("md: invalid array_size %llu > default size %llu\n",
5576 (unsigned long long)mddev->array_sectors / 2,
5577 (unsigned long long)pers->size(mddev, 0, 0) / 2);
5578 err = -EINVAL;
5579 }
5580 if (err == 0 && pers->sync_request &&
5581 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5582 struct bitmap *bitmap;
5583
5584 bitmap = bitmap_create(mddev, -1);
5585 if (IS_ERR(bitmap)) {
5586 err = PTR_ERR(bitmap);
5587 pr_warn("%s: failed to create bitmap (%d)\n",
5588 mdname(mddev), err);
5589 } else
5590 mddev->bitmap = bitmap;
5591
5592 }
5593 if (err) {
5594 mddev_detach(mddev);
5595 if (mddev->private)
5596 pers->free(mddev, mddev->private);
5597 mddev->private = NULL;
5598 module_put(pers->owner);
5599 bitmap_destroy(mddev);
5600 return err;
5601 }
5602 if (mddev->queue) {
5603 bool nonrot = true;
5604
5605 rdev_for_each(rdev, mddev) {
5606 if (rdev->raid_disk >= 0 &&
5607 !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
5608 nonrot = false;
5609 break;
5610 }
5611 }
5612 if (mddev->degraded)
5613 nonrot = false;
5614 if (nonrot)
5615 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
5616 else
5617 queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
5618 mddev->queue->backing_dev_info->congested_data = mddev;
5619 mddev->queue->backing_dev_info->congested_fn = md_congested;
5620 }
5621 if (pers->sync_request) {
5622 if (mddev->kobj.sd &&
5623 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5624 pr_warn("md: cannot register extra attributes for %s\n",
5625 mdname(mddev));
5626 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5627 } else if (mddev->ro == 2)
5628 mddev->ro = 0;
5629
5630 atomic_set(&mddev->max_corr_read_errors,
5631 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5632 mddev->safemode = 0;
5633 if (mddev_is_clustered(mddev))
5634 mddev->safemode_delay = 0;
5635 else
5636 mddev->safemode_delay = (200 * HZ)/1000 +1;
5637 mddev->in_sync = 1;
5638 smp_wmb();
5639 spin_lock(&mddev->lock);
5640 mddev->pers = pers;
5641 spin_unlock(&mddev->lock);
5642 rdev_for_each(rdev, mddev)
5643 if (rdev->raid_disk >= 0)
5644 if (sysfs_link_rdev(mddev, rdev))
5645 ;
5646
5647 if (mddev->degraded && !mddev->ro)
		/* This ensures that recovering status is reported immediately
		 * via sysfs - until it is known whether spares exist.
		 */
5651 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5652 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5653
5654 if (mddev->sb_flags)
5655 md_update_sb(mddev, 0);
5656
5657 md_new_event(mddev);
5658 sysfs_notify_dirent_safe(mddev->sysfs_state);
5659 sysfs_notify_dirent_safe(mddev->sysfs_action);
5660 sysfs_notify(&mddev->kobj, NULL, "degraded");
5661 return 0;
5662}
5663EXPORT_SYMBOL_GPL(md_run);
5664
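/*
 * do_md_run() wraps md_run() and additionally loads the bitmap,
 * publishes the array capacity to the block layer and emits a change
 * uevent.
 */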
5665static int do_md_run(struct mddev *mddev)
5666{
5667 int err;
5668
5669 err = md_run(mddev);
5670 if (err)
5671 goto out;
5672 err = bitmap_load(mddev);
5673 if (err) {
5674 bitmap_destroy(mddev);
5675 goto out;
5676 }
5677
5678 if (mddev_is_clustered(mddev))
5679 md_allow_write(mddev);
5680
5681 md_wakeup_thread(mddev->thread);
5682 md_wakeup_thread(mddev->sync_thread);
5683
5684 set_capacity(mddev->gendisk, mddev->array_sectors);
5685 revalidate_disk(mddev->gendisk);
5686 mddev->changed = 1;
5687 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5688out:
5689 return err;
5690}
5691
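/*
 * restart_array(): switch a running read-only array back to read-write,
 * provided no member device is read-only and any required journal
 * device is still present.
 */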
5692static int restart_array(struct mddev *mddev)
5693{
5694 struct gendisk *disk = mddev->gendisk;
5695 struct md_rdev *rdev;
5696 bool has_journal = false;
5697 bool has_readonly = false;
5698
5699
5700 if (list_empty(&mddev->disks))
5701 return -ENXIO;
5702 if (!mddev->pers)
5703 return -EINVAL;
5704 if (!mddev->ro)
5705 return -EBUSY;
5706
5707 rcu_read_lock();
5708 rdev_for_each_rcu(rdev, mddev) {
5709 if (test_bit(Journal, &rdev->flags) &&
5710 !test_bit(Faulty, &rdev->flags))
5711 has_journal = true;
5712 if (bdev_read_only(rdev->bdev))
5713 has_readonly = true;
5714 }
5715 rcu_read_unlock();
5716 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
5717
5718 return -EINVAL;
5719 if (has_readonly)
5720 return -EROFS;
5721
5722 mddev->safemode = 0;
5723 mddev->ro = 0;
5724 set_disk_ro(disk, 0);
5725 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
5726
5727 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5728 md_wakeup_thread(mddev->thread);
5729 md_wakeup_thread(mddev->sync_thread);
5730 sysfs_notify_dirent_safe(mddev->sysfs_state);
5731 return 0;
5732}
5733
5734static void md_clean(struct mddev *mddev)
5735{
5736 mddev->array_sectors = 0;
5737 mddev->external_size = 0;
5738 mddev->dev_sectors = 0;
5739 mddev->raid_disks = 0;
5740 mddev->recovery_cp = 0;
5741 mddev->resync_min = 0;
5742 mddev->resync_max = MaxSector;
5743 mddev->reshape_position = MaxSector;
5744 mddev->external = 0;
5745 mddev->persistent = 0;
5746 mddev->level = LEVEL_NONE;
5747 mddev->clevel[0] = 0;
5748 mddev->flags = 0;
5749 mddev->sb_flags = 0;
5750 mddev->ro = 0;
5751 mddev->metadata_type[0] = 0;
5752 mddev->chunk_sectors = 0;
5753 mddev->ctime = mddev->utime = 0;
5754 mddev->layout = 0;
5755 mddev->max_disks = 0;
5756 mddev->events = 0;
5757 mddev->can_decrease_events = 0;
5758 mddev->delta_disks = 0;
5759 mddev->reshape_backwards = 0;
5760 mddev->new_level = LEVEL_NONE;
5761 mddev->new_layout = 0;
5762 mddev->new_chunk_sectors = 0;
5763 mddev->curr_resync = 0;
5764 atomic64_set(&mddev->resync_mismatches, 0);
5765 mddev->suspend_lo = mddev->suspend_hi = 0;
5766 mddev->sync_speed_min = mddev->sync_speed_max = 0;
5767 mddev->recovery = 0;
5768 mddev->in_sync = 0;
5769 mddev->changed = 0;
5770 mddev->degraded = 0;
5771 mddev->safemode = 0;
5772 mddev->private = NULL;
5773 mddev->cluster_info = NULL;
5774 mddev->bitmap_info.offset = 0;
5775 mddev->bitmap_info.default_offset = 0;
5776 mddev->bitmap_info.default_space = 0;
5777 mddev->bitmap_info.chunksize = 0;
5778 mddev->bitmap_info.daemon_sleep = 0;
5779 mddev->bitmap_info.max_write_behind = 0;
5780 mddev->bitmap_info.nodes = 0;
5781}
5782
5783static void __md_stop_writes(struct mddev *mddev)
5784{
5785 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5786 flush_workqueue(md_misc_wq);
5787 if (mddev->sync_thread) {
5788 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5789 md_reap_sync_thread(mddev);
5790 }
5791
5792 del_timer_sync(&mddev->safemode_timer);
5793
5794 if (mddev->pers && mddev->pers->quiesce) {
5795 mddev->pers->quiesce(mddev, 1);
5796 mddev->pers->quiesce(mddev, 0);
5797 }
5798 bitmap_flush(mddev);
5799
5800 if (mddev->ro == 0 &&
5801 ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
5802 mddev->sb_flags)) {
5803
5804 if (!mddev_is_clustered(mddev))
5805 mddev->in_sync = 1;
5806 md_update_sb(mddev, 1);
5807 }
5808}
5809
5810void md_stop_writes(struct mddev *mddev)
5811{
5812 mddev_lock_nointr(mddev);
5813 __md_stop_writes(mddev);
5814 mddev_unlock(mddev);
5815}
5816EXPORT_SYMBOL_GPL(md_stop_writes);
5817
5818static void mddev_detach(struct mddev *mddev)
5819{
5820 bitmap_wait_behind_writes(mddev);
5821 if (mddev->pers && mddev->pers->quiesce) {
5822 mddev->pers->quiesce(mddev, 1);
5823 mddev->pers->quiesce(mddev, 0);
5824 }
5825 md_unregister_thread(&mddev->thread);
5826 if (mddev->queue)
5827 blk_sync_queue(mddev->queue);
5828}
5829
5830static void __md_stop(struct mddev *mddev)
5831{
5832 struct md_personality *pers = mddev->pers;
5833 bitmap_destroy(mddev);
5834 mddev_detach(mddev);
5835
5836 flush_workqueue(md_misc_wq);
5837 spin_lock(&mddev->lock);
5838 mddev->pers = NULL;
5839 spin_unlock(&mddev->lock);
5840 pers->free(mddev, mddev->private);
5841 mddev->private = NULL;
5842 if (pers->sync_request && mddev->to_remove == NULL)
5843 mddev->to_remove = &md_redundancy_group;
5844 module_put(pers->owner);
5845 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5846}
5847
5848void md_stop(struct mddev *mddev)
5849{
	/* stop the array and free any attached data structures */
5853 __md_stop(mddev);
5854 if (mddev->bio_set) {
5855 bioset_free(mddev->bio_set);
5856 mddev->bio_set = NULL;
5857 }
5858 if (mddev->sync_set) {
5859 bioset_free(mddev->sync_set);
5860 mddev->sync_set = NULL;
5861 }
5862}
5863
5864EXPORT_SYMBOL_GPL(md_stop);
5865
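/*
 * md_set_readonly(): stop writes and mark the array read-only without
 * tearing it down.  Fails with -EBUSY if the device is open elsewhere
 * or recovery is still running.
 */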
5866static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5867{
5868 int err = 0;
5869 int did_freeze = 0;
5870
5871 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5872 did_freeze = 1;
5873 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5874 md_wakeup_thread(mddev->thread);
5875 }
5876 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5877 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5878 if (mddev->sync_thread)
		/* The thread might be blocked waiting for a metadata update
		 * which will now never happen.
		 */
5881 wake_up_process(mddev->sync_thread->tsk);
5882
5883 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
5884 return -EBUSY;
5885 mddev_unlock(mddev);
5886 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
5887 &mddev->recovery));
5888 wait_event(mddev->sb_wait,
5889 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
5890 mddev_lock_nointr(mddev);
5891
5892 mutex_lock(&mddev->open_mutex);
5893 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5894 mddev->sync_thread ||
5895 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5896 pr_warn("md: %s still in use.\n",mdname(mddev));
5897 if (did_freeze) {
5898 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5899 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5900 md_wakeup_thread(mddev->thread);
5901 }
5902 err = -EBUSY;
5903 goto out;
5904 }
5905 if (mddev->pers) {
5906 __md_stop_writes(mddev);
5907
5908 err = -ENXIO;
5909 if (mddev->ro==1)
5910 goto out;
5911 mddev->ro = 1;
5912 set_disk_ro(mddev->gendisk, 1);
5913 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5914 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5915 md_wakeup_thread(mddev->thread);
5916 sysfs_notify_dirent_safe(mddev->sysfs_state);
5917 err = 0;
5918 }
5919out:
5920 mutex_unlock(&mddev->open_mutex);
5921 return err;
5922}
5923
/* mode:
 *   0 - completely stop and disassemble the array, exporting all members
 *   otherwise - stop the array but leave the member devices bound
 */
5928static int do_md_stop(struct mddev *mddev, int mode,
5929 struct block_device *bdev)
5930{
5931 struct gendisk *disk = mddev->gendisk;
5932 struct md_rdev *rdev;
5933 int did_freeze = 0;
5934
5935 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5936 did_freeze = 1;
5937 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5938 md_wakeup_thread(mddev->thread);
5939 }
5940 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5941 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5942 if (mddev->sync_thread)
		/* The thread might be blocked waiting for a metadata update
		 * which will now never happen.
		 */
5945 wake_up_process(mddev->sync_thread->tsk);
5946
5947 mddev_unlock(mddev);
5948 wait_event(resync_wait, (mddev->sync_thread == NULL &&
5949 !test_bit(MD_RECOVERY_RUNNING,
5950 &mddev->recovery)));
5951 mddev_lock_nointr(mddev);
5952
5953 mutex_lock(&mddev->open_mutex);
5954 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5955 mddev->sysfs_active ||
5956 mddev->sync_thread ||
5957 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5958 pr_warn("md: %s still in use.\n",mdname(mddev));
5959 mutex_unlock(&mddev->open_mutex);
5960 if (did_freeze) {
5961 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5962 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5963 md_wakeup_thread(mddev->thread);
5964 }
5965 return -EBUSY;
5966 }
5967 if (mddev->pers) {
5968 if (mddev->ro)
5969 set_disk_ro(disk, 0);
5970
5971 __md_stop_writes(mddev);
5972 __md_stop(mddev);
5973 mddev->queue->backing_dev_info->congested_fn = NULL;
5974
5975
5976 sysfs_notify_dirent_safe(mddev->sysfs_state);
5977
5978 rdev_for_each(rdev, mddev)
5979 if (rdev->raid_disk >= 0)
5980 sysfs_unlink_rdev(mddev, rdev);
5981
5982 set_capacity(disk, 0);
5983 mutex_unlock(&mddev->open_mutex);
5984 mddev->changed = 1;
5985 revalidate_disk(disk);
5986
5987 if (mddev->ro)
5988 mddev->ro = 0;
5989 } else
5990 mutex_unlock(&mddev->open_mutex);
5991
5992
5993
5994 if (mode == 0) {
5995 pr_info("md: %s stopped.\n", mdname(mddev));
5996
5997 if (mddev->bitmap_info.file) {
5998 struct file *f = mddev->bitmap_info.file;
5999 spin_lock(&mddev->lock);
6000 mddev->bitmap_info.file = NULL;
6001 spin_unlock(&mddev->lock);
6002 fput(f);
6003 }
6004 mddev->bitmap_info.offset = 0;
6005
6006 export_array(mddev);
6007
6008 md_clean(mddev);
6009 if (mddev->hold_active == UNTIL_STOP)
6010 mddev->hold_active = 0;
6011 }
6012 md_new_event(mddev);
6013 sysfs_notify_dirent_safe(mddev->sysfs_state);
6014 return 0;
6015}
6016
6017#ifndef MODULE
6018static void autorun_array(struct mddev *mddev)
6019{
6020 struct md_rdev *rdev;
6021 int err;
6022
6023 if (list_empty(&mddev->disks))
6024 return;
6025
6026 pr_info("md: running: ");
6027
6028 rdev_for_each(rdev, mddev) {
6029 char b[BDEVNAME_SIZE];
6030 pr_cont("<%s>", bdevname(rdev->bdev,b));
6031 }
6032 pr_cont("\n");
6033
6034 err = do_md_run(mddev);
6035 if (err) {
6036 pr_warn("md: do_md_run() returned %d\n", err);
6037 do_md_stop(mddev, 0, NULL);
6038 }
6039}
6040
/*
 * Try to assemble and run arrays from all the disks collected on
 * pending_raid_disks so far: take the first pending disk, gather every
 * other pending disk whose 0.90 superblock matches it, create (or find)
 * the corresponding md device, bind the candidates to it and start the
 * array.  Anything that could not be bound is exported again.
 */
6053static void autorun_devices(int part)
6054{
6055 struct md_rdev *rdev0, *rdev, *tmp;
6056 struct mddev *mddev;
6057 char b[BDEVNAME_SIZE];
6058
6059 pr_info("md: autorun ...\n");
6060 while (!list_empty(&pending_raid_disks)) {
6061 int unit;
6062 dev_t dev;
6063 LIST_HEAD(candidates);
6064 rdev0 = list_entry(pending_raid_disks.next,
6065 struct md_rdev, same_set);
6066
6067 pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
6068 INIT_LIST_HEAD(&candidates);
6069 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6070 if (super_90_load(rdev, rdev0, 0) >= 0) {
6071 pr_debug("md: adding %s ...\n",
6072 bdevname(rdev->bdev,b));
6073 list_move(&rdev->same_set, &candidates);
6074 }
6075
6076
6077
6078
6079
6080 if (part) {
6081 dev = MKDEV(mdp_major,
6082 rdev0->preferred_minor << MdpMinorShift);
6083 unit = MINOR(dev) >> MdpMinorShift;
6084 } else {
6085 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6086 unit = MINOR(dev);
6087 }
6088 if (rdev0->preferred_minor != unit) {
6089 pr_warn("md: unit number in %s is bad: %d\n",
6090 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
6091 break;
6092 }
6093
6094 md_probe(dev, NULL, NULL);
6095 mddev = mddev_find(dev);
6096 if (!mddev || !mddev->gendisk) {
6097 if (mddev)
6098 mddev_put(mddev);
6099 break;
6100 }
6101 if (mddev_lock(mddev))
6102 pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6103 else if (mddev->raid_disks || mddev->major_version
6104 || !list_empty(&mddev->disks)) {
6105 pr_warn("md: %s already running, cannot run %s\n",
6106 mdname(mddev), bdevname(rdev0->bdev,b));
6107 mddev_unlock(mddev);
6108 } else {
6109 pr_debug("md: created %s\n", mdname(mddev));
6110 mddev->persistent = 1;
6111 rdev_for_each_list(rdev, tmp, &candidates) {
6112 list_del_init(&rdev->same_set);
6113 if (bind_rdev_to_array(rdev, mddev))
6114 export_rdev(rdev);
6115 }
6116 autorun_array(mddev);
6117 mddev_unlock(mddev);
6118 }
6119
6120
6121
6122 rdev_for_each_list(rdev, tmp, &candidates) {
6123 list_del_init(&rdev->same_set);
6124 export_rdev(rdev);
6125 }
6126 mddev_put(mddev);
6127 }
6128 pr_info("md: ... autorun DONE.\n");
6129}
6130#endif
6131
6132static int get_version(void __user *arg)
6133{
6134 mdu_version_t ver;
6135
6136 ver.major = MD_MAJOR_VERSION;
6137 ver.minor = MD_MINOR_VERSION;
6138 ver.patchlevel = MD_PATCHLEVEL_VERSION;
6139
6140 if (copy_to_user(arg, &ver, sizeof(ver)))
6141 return -EFAULT;
6142
6143 return 0;
6144}
6145
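/*
 * GET_ARRAY_INFO ioctl: fill an mdu_array_info_t with a summary of the
 * array geometry and per-device counts (working, in-sync, failed,
 * spare).
 */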
6146static int get_array_info(struct mddev *mddev, void __user *arg)
6147{
6148 mdu_array_info_t info;
6149 int nr,working,insync,failed,spare;
6150 struct md_rdev *rdev;
6151
6152 nr = working = insync = failed = spare = 0;
6153 rcu_read_lock();
6154 rdev_for_each_rcu(rdev, mddev) {
6155 nr++;
6156 if (test_bit(Faulty, &rdev->flags))
6157 failed++;
6158 else {
6159 working++;
6160 if (test_bit(In_sync, &rdev->flags))
6161 insync++;
6162 else if (test_bit(Journal, &rdev->flags))
				; /* journal devices count as working, not as spares */
6165 else
6166 spare++;
6167 }
6168 }
6169 rcu_read_unlock();
6170
6171 info.major_version = mddev->major_version;
6172 info.minor_version = mddev->minor_version;
6173 info.patch_version = MD_PATCHLEVEL_VERSION;
6174 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6175 info.level = mddev->level;
6176 info.size = mddev->dev_sectors / 2;
6177 if (info.size != mddev->dev_sectors / 2)
6178 info.size = -1;
6179 info.nr_disks = nr;
6180 info.raid_disks = mddev->raid_disks;
6181 info.md_minor = mddev->md_minor;
6182 info.not_persistent= !mddev->persistent;
6183
6184 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6185 info.state = 0;
6186 if (mddev->in_sync)
6187 info.state = (1<<MD_SB_CLEAN);
6188 if (mddev->bitmap && mddev->bitmap_info.offset)
6189 info.state |= (1<<MD_SB_BITMAP_PRESENT);
6190 if (mddev_is_clustered(mddev))
6191 info.state |= (1<<MD_SB_CLUSTERED);
6192 info.active_disks = insync;
6193 info.working_disks = working;
6194 info.failed_disks = failed;
6195 info.spare_disks = spare;
6196
6197 info.layout = mddev->layout;
6198 info.chunk_size = mddev->chunk_sectors << 9;
6199
6200 if (copy_to_user(arg, &info, sizeof(info)))
6201 return -EFAULT;
6202
6203 return 0;
6204}
6205
6206static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6207{
6208 mdu_bitmap_file_t *file = NULL;
6209 char *ptr;
6210 int err;
6211
6212 file = kzalloc(sizeof(*file), GFP_NOIO);
6213 if (!file)
6214 return -ENOMEM;
6215
6216 err = 0;
6217 spin_lock(&mddev->lock);
6218
6219 if (mddev->bitmap_info.file) {
6220 ptr = file_path(mddev->bitmap_info.file, file->pathname,
6221 sizeof(file->pathname));
6222 if (IS_ERR(ptr))
6223 err = PTR_ERR(ptr);
6224 else
6225 memmove(file->pathname, ptr,
6226 sizeof(file->pathname)-(ptr-file->pathname));
6227 }
6228 spin_unlock(&mddev->lock);
6229
6230 if (err == 0 &&
6231 copy_to_user(arg, file, sizeof(*file)))
6232 err = -EFAULT;
6233
6234 kfree(file);
6235 return err;
6236}
6237
6238static int get_disk_info(struct mddev *mddev, void __user * arg)
6239{
6240 mdu_disk_info_t info;
6241 struct md_rdev *rdev;
6242
6243 if (copy_from_user(&info, arg, sizeof(info)))
6244 return -EFAULT;
6245
6246 rcu_read_lock();
6247 rdev = md_find_rdev_nr_rcu(mddev, info.number);
6248 if (rdev) {
6249 info.major = MAJOR(rdev->bdev->bd_dev);
6250 info.minor = MINOR(rdev->bdev->bd_dev);
6251 info.raid_disk = rdev->raid_disk;
6252 info.state = 0;
6253 if (test_bit(Faulty, &rdev->flags))
6254 info.state |= (1<<MD_DISK_FAULTY);
6255 else if (test_bit(In_sync, &rdev->flags)) {
6256 info.state |= (1<<MD_DISK_ACTIVE);
6257 info.state |= (1<<MD_DISK_SYNC);
6258 }
6259 if (test_bit(Journal, &rdev->flags))
6260 info.state |= (1<<MD_DISK_JOURNAL);
6261 if (test_bit(WriteMostly, &rdev->flags))
6262 info.state |= (1<<MD_DISK_WRITEMOSTLY);
6263 if (test_bit(FailFast, &rdev->flags))
6264 info.state |= (1<<MD_DISK_FAILFAST);
6265 } else {
6266 info.major = info.minor = 0;
6267 info.raid_disk = -1;
6268 info.state = (1<<MD_DISK_REMOVED);
6269 }
6270 rcu_read_unlock();
6271
6272 if (copy_to_user(arg, &info, sizeof(info)))
6273 return -EFAULT;
6274
6275 return 0;
6276}
6277
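/*
 * ADD_NEW_DISK ioctl.  Three cases are handled below: adding a device
 * to an array that is still being assembled (its superblock is loaded
 * and compared against the first member), hot-adding a device to a
 * running array, and building a non-persistent array where the caller
 * supplies all the information.
 */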
6278static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6279{
6280 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
6281 struct md_rdev *rdev;
6282 dev_t dev = MKDEV(info->major,info->minor);
6283
6284 if (mddev_is_clustered(mddev) &&
6285 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6286 pr_warn("%s: Cannot add to clustered mddev.\n",
6287 mdname(mddev));
6288 return -EINVAL;
6289 }
6290
6291 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6292 return -EOVERFLOW;
6293
6294 if (!mddev->raid_disks) {
6295 int err;
6296
6297 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6298 if (IS_ERR(rdev)) {
6299 pr_warn("md: md_import_device returned %ld\n",
6300 PTR_ERR(rdev));
6301 return PTR_ERR(rdev);
6302 }
6303 if (!list_empty(&mddev->disks)) {
6304 struct md_rdev *rdev0
6305 = list_entry(mddev->disks.next,
6306 struct md_rdev, same_set);
6307 err = super_types[mddev->major_version]
6308 .load_super(rdev, rdev0, mddev->minor_version);
6309 if (err < 0) {
6310 pr_warn("md: %s has different UUID to %s\n",
6311 bdevname(rdev->bdev,b),
6312 bdevname(rdev0->bdev,b2));
6313 export_rdev(rdev);
6314 return -EINVAL;
6315 }
6316 }
6317 err = bind_rdev_to_array(rdev, mddev);
6318 if (err)
6319 export_rdev(rdev);
6320 return err;
6321 }
6322
	/*
	 * add_new_disk can be used once the array is assembled to add
	 * "hot spares".  They must already have a superblock written.
	 */
6328 if (mddev->pers) {
6329 int err;
6330 if (!mddev->pers->hot_add_disk) {
6331 pr_warn("%s: personality does not support diskops!\n",
6332 mdname(mddev));
6333 return -EINVAL;
6334 }
6335 if (mddev->persistent)
6336 rdev = md_import_device(dev, mddev->major_version,
6337 mddev->minor_version);
6338 else
6339 rdev = md_import_device(dev, -1, -1);
6340 if (IS_ERR(rdev)) {
6341 pr_warn("md: md_import_device returned %ld\n",
6342 PTR_ERR(rdev));
6343 return PTR_ERR(rdev);
6344 }
6345
6346 if (!mddev->persistent) {
6347 if (info->state & (1<<MD_DISK_SYNC) &&
6348 info->raid_disk < mddev->raid_disks) {
6349 rdev->raid_disk = info->raid_disk;
6350 set_bit(In_sync, &rdev->flags);
6351 clear_bit(Bitmap_sync, &rdev->flags);
6352 } else
6353 rdev->raid_disk = -1;
6354 rdev->saved_raid_disk = rdev->raid_disk;
6355 } else
6356 super_types[mddev->major_version].
6357 validate_super(mddev, rdev);
6358 if ((info->state & (1<<MD_DISK_SYNC)) &&
6359 rdev->raid_disk != info->raid_disk) {
			/* This was a hot-add request, but the slot recorded in
			 * the superblock does not match, so reject it.
			 */
6363 export_rdev(rdev);
6364 return -EINVAL;
6365 }
6366
6367 clear_bit(In_sync, &rdev->flags);
6368 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6369 set_bit(WriteMostly, &rdev->flags);
6370 else
6371 clear_bit(WriteMostly, &rdev->flags);
6372 if (info->state & (1<<MD_DISK_FAILFAST))
6373 set_bit(FailFast, &rdev->flags);
6374 else
6375 clear_bit(FailFast, &rdev->flags);
6376
6377 if (info->state & (1<<MD_DISK_JOURNAL)) {
6378 struct md_rdev *rdev2;
6379 bool has_journal = false;
6380
6381
6382 rdev_for_each(rdev2, mddev) {
6383 if (test_bit(Journal, &rdev2->flags)) {
6384 has_journal = true;
6385 break;
6386 }
6387 }
6388 if (has_journal || mddev->bitmap) {
6389 export_rdev(rdev);
6390 return -EBUSY;
6391 }
6392 set_bit(Journal, &rdev->flags);
6393 }
6394
6395
6396
6397 if (mddev_is_clustered(mddev)) {
6398 if (info->state & (1 << MD_DISK_CANDIDATE))
6399 set_bit(Candidate, &rdev->flags);
6400 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6401
6402 err = md_cluster_ops->add_new_disk(mddev, rdev);
6403 if (err) {
6404 export_rdev(rdev);
6405 return err;
6406 }
6407 }
6408 }
6409
6410 rdev->raid_disk = -1;
6411 err = bind_rdev_to_array(rdev, mddev);
6412
6413 if (err)
6414 export_rdev(rdev);
6415
6416 if (mddev_is_clustered(mddev)) {
6417 if (info->state & (1 << MD_DISK_CANDIDATE)) {
6418 if (!err) {
6419 err = md_cluster_ops->new_disk_ack(mddev,
6420 err == 0);
6421 if (err)
6422 md_kick_rdev_from_array(rdev);
6423 }
6424 } else {
6425 if (err)
6426 md_cluster_ops->add_new_disk_cancel(mddev);
6427 else
6428 err = add_bound_rdev(rdev);
6429 }
6430
6431 } else if (!err)
6432 err = add_bound_rdev(rdev);
6433
6434 return err;
6435 }
6436
	/* otherwise, add_new_disk is only allowed for major_version==0
	 * superblocks
	 */
6440 if (mddev->major_version != 0) {
6441 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
6442 return -EINVAL;
6443 }
6444
6445 if (!(info->state & (1<<MD_DISK_FAULTY))) {
6446 int err;
6447 rdev = md_import_device(dev, -1, 0);
6448 if (IS_ERR(rdev)) {
6449 pr_warn("md: error, md_import_device() returned %ld\n",
6450 PTR_ERR(rdev));
6451 return PTR_ERR(rdev);
6452 }
6453 rdev->desc_nr = info->number;
6454 if (info->raid_disk < mddev->raid_disks)
6455 rdev->raid_disk = info->raid_disk;
6456 else
6457 rdev->raid_disk = -1;
6458
6459 if (rdev->raid_disk < mddev->raid_disks)
6460 if (info->state & (1<<MD_DISK_SYNC))
6461 set_bit(In_sync, &rdev->flags);
6462
6463 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6464 set_bit(WriteMostly, &rdev->flags);
6465 if (info->state & (1<<MD_DISK_FAILFAST))
6466 set_bit(FailFast, &rdev->flags);
6467
6468 if (!mddev->persistent) {
6469 pr_debug("md: nonpersistent superblock ...\n");
6470 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6471 } else
6472 rdev->sb_start = calc_dev_sboffset(rdev);
6473 rdev->sectors = rdev->sb_start;
6474
6475 err = bind_rdev_to_array(rdev, mddev);
6476 if (err) {
6477 export_rdev(rdev);
6478 return err;
6479 }
6480 }
6481
6482 return 0;
6483}
6484
6485static int hot_remove_disk(struct mddev *mddev, dev_t dev)
6486{
6487 char b[BDEVNAME_SIZE];
6488 struct md_rdev *rdev;
6489
6490 rdev = find_rdev(mddev, dev);
6491 if (!rdev)
6492 return -ENXIO;
6493
6494 if (rdev->raid_disk < 0)
6495 goto kick_rdev;
6496
6497 clear_bit(Blocked, &rdev->flags);
6498 remove_and_add_spares(mddev, rdev);
6499
6500 if (rdev->raid_disk >= 0)
6501 goto busy;
6502
6503kick_rdev:
6504 if (mddev_is_clustered(mddev))
6505 md_cluster_ops->remove_disk(mddev, rdev);
6506
6507 md_kick_rdev_from_array(rdev);
6508 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6509 if (mddev->thread)
6510 md_wakeup_thread(mddev->thread);
6511 else
6512 md_update_sb(mddev, 1);
6513 md_new_event(mddev);
6514
6515 return 0;
6516busy:
6517 pr_debug("md: cannot remove active disk %s from %s ...\n",
6518 bdevname(rdev->bdev,b), mdname(mddev));
6519 return -EBUSY;
6520}
6521
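/*
 * HOT_ADD_DISK ioctl (version-0 superblocks only): import the device,
 * bind it to the array as a spare and kick the recovery thread.
 */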
6522static int hot_add_disk(struct mddev *mddev, dev_t dev)
6523{
6524 char b[BDEVNAME_SIZE];
6525 int err;
6526 struct md_rdev *rdev;
6527
6528 if (!mddev->pers)
6529 return -ENODEV;
6530
6531 if (mddev->major_version != 0) {
6532 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
6533 mdname(mddev));
6534 return -EINVAL;
6535 }
6536 if (!mddev->pers->hot_add_disk) {
6537 pr_warn("%s: personality does not support diskops!\n",
6538 mdname(mddev));
6539 return -EINVAL;
6540 }
6541
6542 rdev = md_import_device(dev, -1, 0);
6543 if (IS_ERR(rdev)) {
6544 pr_warn("md: error, md_import_device() returned %ld\n",
6545 PTR_ERR(rdev));
6546 return -EINVAL;
6547 }
6548
6549 if (mddev->persistent)
6550 rdev->sb_start = calc_dev_sboffset(rdev);
6551 else
6552 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6553
6554 rdev->sectors = rdev->sb_start;
6555
6556 if (test_bit(Faulty, &rdev->flags)) {
6557 pr_warn("md: can not hot-add faulty %s disk to %s!\n",
6558 bdevname(rdev->bdev,b), mdname(mddev));
6559 err = -EINVAL;
6560 goto abort_export;
6561 }
6562
6563 clear_bit(In_sync, &rdev->flags);
6564 rdev->desc_nr = -1;
6565 rdev->saved_raid_disk = -1;
6566 err = bind_rdev_to_array(rdev, mddev);
6567 if (err)
6568 goto abort_export;
6569
6570
6571
6572
6573
6574
6575 rdev->raid_disk = -1;
6576
6577 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6578 if (!mddev->thread)
6579 md_update_sb(mddev, 1);
6580
	/*
	 * Kick recovery: this spare may need to be added to the array
	 * immediately.
	 */
6584 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6585 md_wakeup_thread(mddev->thread);
6586 md_new_event(mddev);
6587 return 0;
6588
6589abort_export:
6590 export_rdev(rdev);
6591 return err;
6592}
6593
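/*
 * SET_BITMAP_FILE ioctl: with fd >= 0, attach a regular, writable and
 * otherwise unused file as write-intent bitmap storage; with fd < 0,
 * remove any file-backed bitmap.
 */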
6594static int set_bitmap_file(struct mddev *mddev, int fd)
6595{
6596 int err = 0;
6597
6598 if (mddev->pers) {
6599 if (!mddev->pers->quiesce || !mddev->thread)
6600 return -EBUSY;
6601 if (mddev->recovery || mddev->sync_thread)
6602 return -EBUSY;
6603
6604 }
6605
6606 if (fd >= 0) {
6607 struct inode *inode;
6608 struct file *f;
6609
6610 if (mddev->bitmap || mddev->bitmap_info.file)
6611 return -EEXIST;
6612 f = fget(fd);
6613
6614 if (f == NULL) {
6615 pr_warn("%s: error: failed to get bitmap file\n",
6616 mdname(mddev));
6617 return -EBADF;
6618 }
6619
6620 inode = f->f_mapping->host;
6621 if (!S_ISREG(inode->i_mode)) {
6622 pr_warn("%s: error: bitmap file must be a regular file\n",
6623 mdname(mddev));
6624 err = -EBADF;
6625 } else if (!(f->f_mode & FMODE_WRITE)) {
6626 pr_warn("%s: error: bitmap file must open for write\n",
6627 mdname(mddev));
6628 err = -EBADF;
6629 } else if (atomic_read(&inode->i_writecount) != 1) {
6630 pr_warn("%s: error: bitmap file is already in use\n",
6631 mdname(mddev));
6632 err = -EBUSY;
6633 }
6634 if (err) {
6635 fput(f);
6636 return err;
6637 }
6638 mddev->bitmap_info.file = f;
6639 mddev->bitmap_info.offset = 0;
6640 } else if (mddev->bitmap == NULL)
6641 return -ENOENT;
6642 err = 0;
6643 if (mddev->pers) {
6644 if (fd >= 0) {
6645 struct bitmap *bitmap;
6646
6647 bitmap = bitmap_create(mddev, -1);
6648 mddev_suspend(mddev);
6649 if (!IS_ERR(bitmap)) {
6650 mddev->bitmap = bitmap;
6651 err = bitmap_load(mddev);
6652 } else
6653 err = PTR_ERR(bitmap);
6654 if (err) {
6655 bitmap_destroy(mddev);
6656 fd = -1;
6657 }
6658 mddev_resume(mddev);
6659 } else if (fd < 0) {
6660 mddev_suspend(mddev);
6661 bitmap_destroy(mddev);
6662 mddev_resume(mddev);
6663 }
6664 }
6665 if (fd < 0) {
6666 struct file *f = mddev->bitmap_info.file;
6667 if (f) {
6668 spin_lock(&mddev->lock);
6669 mddev->bitmap_info.file = NULL;
6670 spin_unlock(&mddev->lock);
6671 fput(f);
6672 }
6673 }
6674
6675 return err;
6676}
6677
/*
 * SET_ARRAY_INFO has two usages.  When assembling an existing array,
 * raid_disks is 0 and only the superblock version information is
 * recorded so that the member superblocks can be interpreted.  When
 * creating a new array, raid_disks is non-zero and level, size, layout,
 * chunk_size etc. describe the array to be built, using version-0.90
 * superblock defaults.
 */
6691static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
6692{
6693
6694 if (info->raid_disks == 0) {
6695
6696 if (info->major_version < 0 ||
6697 info->major_version >= ARRAY_SIZE(super_types) ||
6698 super_types[info->major_version].name == NULL) {
6699
6700 pr_warn("md: superblock version %d not known\n",
6701 info->major_version);
6702 return -EINVAL;
6703 }
6704 mddev->major_version = info->major_version;
6705 mddev->minor_version = info->minor_version;
6706 mddev->patch_version = info->patch_version;
6707 mddev->persistent = !info->not_persistent;
6708
6709
6710
6711 mddev->ctime = ktime_get_real_seconds();
6712 return 0;
6713 }
6714 mddev->major_version = MD_MAJOR_VERSION;
6715 mddev->minor_version = MD_MINOR_VERSION;
6716 mddev->patch_version = MD_PATCHLEVEL_VERSION;
6717 mddev->ctime = ktime_get_real_seconds();
6718
6719 mddev->level = info->level;
6720 mddev->clevel[0] = 0;
6721 mddev->dev_sectors = 2 * (sector_t)info->size;
6722 mddev->raid_disks = info->raid_disks;
6723
6724
6725
6726 if (info->state & (1<<MD_SB_CLEAN))
6727 mddev->recovery_cp = MaxSector;
6728 else
6729 mddev->recovery_cp = 0;
6730 mddev->persistent = ! info->not_persistent;
6731 mddev->external = 0;
6732
6733 mddev->layout = info->layout;
6734 mddev->chunk_sectors = info->chunk_size >> 9;
6735
6736 if (mddev->persistent) {
6737 mddev->max_disks = MD_SB_DISKS;
6738 mddev->flags = 0;
6739 mddev->sb_flags = 0;
6740 }
6741 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6742
6743 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6744 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6745 mddev->bitmap_info.offset = 0;
6746
6747 mddev->reshape_position = MaxSector;
6748
6749
6750
6751
6752 get_random_bytes(mddev->uuid, 16);
6753
6754 mddev->new_level = mddev->level;
6755 mddev->new_chunk_sectors = mddev->chunk_sectors;
6756 mddev->new_layout = mddev->layout;
6757 mddev->delta_disks = 0;
6758 mddev->reshape_backwards = 0;
6759
6760 return 0;
6761}
6762
6763void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6764{
6765 lockdep_assert_held(&mddev->reconfig_mutex);
6766
6767 if (mddev->external_size)
6768 return;
6769
6770 mddev->array_sectors = array_sectors;
6771}
6772EXPORT_SYMBOL(md_set_array_sectors);
6773
6774static int update_size(struct mddev *mddev, sector_t num_sectors)
6775{
6776 struct md_rdev *rdev;
6777 int rv;
6778 int fit = (num_sectors == 0);
6779 sector_t old_dev_sectors = mddev->dev_sectors;
6780
6781 if (mddev->pers->resize == NULL)
6782 return -EINVAL;
6783
6784
6785
6786
6787
6788
6789
6790
6791
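 /*
 * num_sectors is the number of sectors of each component device to use;
 * 0 means "the largest size that fits on every device".  Resizing is
 * refused while a resync/recovery is running or while the array is
 * read-only, and -ENOSPC is returned if any device is too small.
 */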
6792 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6793 mddev->sync_thread)
6794 return -EBUSY;
6795 if (mddev->ro)
6796 return -EROFS;
6797
6798 rdev_for_each(rdev, mddev) {
6799 sector_t avail = rdev->sectors;
6800
6801 if (fit && (num_sectors == 0 || num_sectors > avail))
6802 num_sectors = avail;
6803 if (avail < num_sectors)
6804 return -ENOSPC;
6805 }
6806 rv = mddev->pers->resize(mddev, num_sectors);
6807 if (!rv) {
6808 if (mddev_is_clustered(mddev))
6809 md_cluster_ops->update_size(mddev, old_dev_sectors);
6810 else if (mddev->queue) {
6811 set_capacity(mddev->gendisk, mddev->array_sectors);
6812 revalidate_disk(mddev->gendisk);
6813 }
6814 }
6815 return rv;
6816}
6817
6818static int update_raid_disks(struct mddev *mddev, int raid_disks)
6819{
6820 int rv;
6821 struct md_rdev *rdev;
6822
6823 if (mddev->pers->check_reshape == NULL)
6824 return -EINVAL;
6825 if (mddev->ro)
6826 return -EROFS;
6827 if (raid_disks <= 0 ||
6828 (mddev->max_disks && raid_disks >= mddev->max_disks))
6829 return -EINVAL;
6830 if (mddev->sync_thread ||
6831 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6832 mddev->reshape_position != MaxSector)
6833 return -EBUSY;
6834
6835 rdev_for_each(rdev, mddev) {
6836 if (mddev->raid_disks < raid_disks &&
6837 rdev->data_offset < rdev->new_data_offset)
6838 return -EINVAL;
6839 if (mddev->raid_disks > raid_disks &&
6840 rdev->data_offset > rdev->new_data_offset)
6841 return -EINVAL;
6842 }
6843
6844 mddev->delta_disks = raid_disks - mddev->raid_disks;
6845 if (mddev->delta_disks < 0)
6846 mddev->reshape_backwards = 1;
6847 else if (mddev->delta_disks > 0)
6848 mddev->reshape_backwards = 0;
6849
6850 rv = mddev->pers->check_reshape(mddev);
6851 if (rv < 0) {
6852 mddev->delta_disks = 0;
6853 mddev->reshape_backwards = 0;
6854 }
6855 return rv;
6856}
6857
6858
6859
6860
6861
6862
6863
6864
6865
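/*
 * update_array_info() allows limited reconfiguration of an active array.
 * The superblock version, ctime, level, persistence and chunk size must
 * all match the current values, and at most one of size, raid_disks,
 * layout or bitmap-presence may be changed per call (see the cnt logic
 * below).
 */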
6866static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6867{
6868 int rv = 0;
6869 int cnt = 0;
6870 int state = 0;
6871
6872
6873 if (mddev->bitmap && mddev->bitmap_info.offset)
6874 state |= (1 << MD_SB_BITMAP_PRESENT);
6875
6876 if (mddev->major_version != info->major_version ||
6877 mddev->minor_version != info->minor_version ||
6878
6879 mddev->ctime != info->ctime ||
6880 mddev->level != info->level ||
6881
6882 mddev->persistent != !info->not_persistent ||
6883 mddev->chunk_sectors != info->chunk_size >> 9 ||
6884
6885 ((state^info->state) & 0xfffffe00)
6886 )
6887 return -EINVAL;
6888
6889 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6890 cnt++;
6891 if (mddev->raid_disks != info->raid_disks)
6892 cnt++;
6893 if (mddev->layout != info->layout)
6894 cnt++;
6895 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
6896 cnt++;
6897 if (cnt == 0)
6898 return 0;
6899 if (cnt > 1)
6900 return -EINVAL;
6901
6902 if (mddev->layout != info->layout) {
6903
6904
6905
6906
6907 if (mddev->pers->check_reshape == NULL)
6908 return -EINVAL;
6909 else {
6910 mddev->new_layout = info->layout;
6911 rv = mddev->pers->check_reshape(mddev);
6912 if (rv)
6913 mddev->new_layout = mddev->layout;
6914 return rv;
6915 }
6916 }
6917 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6918 rv = update_size(mddev, (sector_t)info->size * 2);
6919
6920 if (mddev->raid_disks != info->raid_disks)
6921 rv = update_raid_disks(mddev, info->raid_disks);
6922
6923 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
6924 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
6925 rv = -EINVAL;
6926 goto err;
6927 }
6928 if (mddev->recovery || mddev->sync_thread) {
6929 rv = -EBUSY;
6930 goto err;
6931 }
6932 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
6933 struct bitmap *bitmap;
6934
6935 if (mddev->bitmap) {
6936 rv = -EEXIST;
6937 goto err;
6938 }
6939 if (mddev->bitmap_info.default_offset == 0) {
6940 rv = -EINVAL;
6941 goto err;
6942 }
6943 mddev->bitmap_info.offset =
6944 mddev->bitmap_info.default_offset;
6945 mddev->bitmap_info.space =
6946 mddev->bitmap_info.default_space;
6947 bitmap = bitmap_create(mddev, -1);
6948 mddev_suspend(mddev);
6949 if (!IS_ERR(bitmap)) {
6950 mddev->bitmap = bitmap;
6951 rv = bitmap_load(mddev);
6952 } else
6953 rv = PTR_ERR(bitmap);
6954 if (rv)
6955 bitmap_destroy(mddev);
6956 mddev_resume(mddev);
6957 } else {
6958
6959 if (!mddev->bitmap) {
6960 rv = -ENOENT;
6961 goto err;
6962 }
6963 if (mddev->bitmap->storage.file) {
6964 rv = -EINVAL;
6965 goto err;
6966 }
6967 if (mddev->bitmap_info.nodes) {
6968
6969 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
6970 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
6971 rv = -EPERM;
6972 md_cluster_ops->unlock_all_bitmaps(mddev);
6973 goto err;
6974 }
6975
6976 mddev->bitmap_info.nodes = 0;
6977 md_cluster_ops->leave(mddev);
6978 }
6979 mddev_suspend(mddev);
6980 bitmap_destroy(mddev);
6981 mddev_resume(mddev);
6982 mddev->bitmap_info.offset = 0;
6983 }
6984 }
6985 md_update_sb(mddev, 1);
6986 return rv;
6987err:
6988 return rv;
6989}
6990
6991static int set_disk_faulty(struct mddev *mddev, dev_t dev)
6992{
6993 struct md_rdev *rdev;
6994 int err = 0;
6995
6996 if (mddev->pers == NULL)
6997 return -ENODEV;
6998
6999 rcu_read_lock();
7000 rdev = find_rdev_rcu(mddev, dev);
7001 if (!rdev)
7002 err = -ENODEV;
7003 else {
7004 md_error(mddev, rdev);
7005 if (!test_bit(Faulty, &rdev->flags))
7006 err = -EBUSY;
7007 }
7008 rcu_read_unlock();
7009 return err;
7010}
7011
7012
7013
7014
7015
7016
7017
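/*
 * An md device has no meaningful CHS geometry, so report a fixed
 * 2 heads / 4 sectors layout; the cylinder count is then simply the
 * capacity in sectors divided by 8.
 */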
7018static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7019{
7020 struct mddev *mddev = bdev->bd_disk->private_data;
7021
7022 geo->heads = 2;
7023 geo->sectors = 4;
7024 geo->cylinders = mddev->array_sectors / 8;
7025 return 0;
7026}
7027
7028static inline bool md_ioctl_valid(unsigned int cmd)
7029{
7030 switch (cmd) {
7031 case ADD_NEW_DISK:
7032 case BLKROSET:
7033 case GET_ARRAY_INFO:
7034 case GET_BITMAP_FILE:
7035 case GET_DISK_INFO:
7036 case HOT_ADD_DISK:
7037 case HOT_REMOVE_DISK:
7038 case RAID_AUTORUN:
7039 case RAID_VERSION:
7040 case RESTART_ARRAY_RW:
7041 case RUN_ARRAY:
7042 case SET_ARRAY_INFO:
7043 case SET_BITMAP_FILE:
7044 case SET_DISK_FAULTY:
7045 case STOP_ARRAY:
7046 case STOP_ARRAY_RO:
7047 case CLUSTERED_DISK_NACK:
7048 return true;
7049 default:
7050 return false;
7051 }
7052}
7053
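/*
 * Illustrative userspace sketch (not part of this driver): management
 * tools such as mdadm drive an array through the ioctls accepted above,
 * using the structures from <linux/raid/md_u.h>.  For example (device
 * node name purely hypothetical):
 *
 *	int fd = open("/dev/md0", O_RDWR);
 *	mdu_array_info_t info;
 *
 *	if (fd >= 0 && ioctl(fd, GET_ARRAY_INFO, &info) == 0)
 *		printf("level %d, %d raid disks\n", info.level, info.raid_disks);
 */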
7054static int md_ioctl(struct block_device *bdev, fmode_t mode,
7055 unsigned int cmd, unsigned long arg)
7056{
7057 int err = 0;
7058 void __user *argp = (void __user *)arg;
7059 struct mddev *mddev = NULL;
7060 int ro;
7061 bool did_set_md_closing = false;
7062
7063 if (!md_ioctl_valid(cmd))
7064 return -ENOTTY;
7065
7066 switch (cmd) {
7067 case RAID_VERSION:
7068 case GET_ARRAY_INFO:
7069 case GET_DISK_INFO:
7070 break;
7071 default:
7072 if (!capable(CAP_SYS_ADMIN))
7073 return -EACCES;
7074 }
7075
7076
7077
7078
7079
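 /*
 * Commands dealing with the RAID driver itself, rather than any
 * particular array, are handled before the device is even looked up.
 */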
7080 switch (cmd) {
7081 case RAID_VERSION:
7082 err = get_version(argp);
7083 goto out;
7084
7085#ifndef MODULE
7086 case RAID_AUTORUN:
7087 err = 0;
7088 autostart_arrays(arg);
7089 goto out;
7090#endif
7091 default:;
7092 }
7093
7094
7095
7096
7097
7098 mddev = bdev->bd_disk->private_data;
7099
7100 if (!mddev) {
7101 BUG();
7102 goto out;
7103 }
7104
7105
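 /*
 * These commands are handled without taking the reconfig mutex.
 */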
7106 switch (cmd) {
7107 case GET_ARRAY_INFO:
7108 if (!mddev->raid_disks && !mddev->external)
7109 err = -ENODEV;
7110 else
7111 err = get_array_info(mddev, argp);
7112 goto out;
7113
7114 case GET_DISK_INFO:
7115 if (!mddev->raid_disks && !mddev->external)
7116 err = -ENODEV;
7117 else
7118 err = get_disk_info(mddev, argp);
7119 goto out;
7120
7121 case SET_DISK_FAULTY:
7122 err = set_disk_faulty(mddev, new_decode_dev(arg));
7123 goto out;
7124
7125 case GET_BITMAP_FILE:
7126 err = get_bitmap_file(mddev, argp);
7127 goto out;
7128
7129 }
7130
7131 if (cmd == ADD_NEW_DISK)
7132
7133 flush_workqueue(md_misc_wq);
7134
7135 if (cmd == HOT_REMOVE_DISK)
7136
7137 wait_event_interruptible_timeout(mddev->sb_wait,
7138 !test_bit(MD_RECOVERY_NEEDED,
7139 &mddev->recovery),
7140 msecs_to_jiffies(5000));
7141 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7142
7143
7144
7145 mutex_lock(&mddev->open_mutex);
7146 if (mddev->pers && atomic_read(&mddev->openers) > 1) {
7147 mutex_unlock(&mddev->open_mutex);
7148 err = -EBUSY;
7149 goto out;
7150 }
7151 WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
7152 set_bit(MD_CLOSING, &mddev->flags);
7153 did_set_md_closing = true;
7154 mutex_unlock(&mddev->open_mutex);
7155 sync_blockdev(bdev);
7156 }
7157 err = mddev_lock(mddev);
7158 if (err) {
7159 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7160 err, cmd);
7161 goto out;
7162 }
7163
7164 if (cmd == SET_ARRAY_INFO) {
7165 mdu_array_info_t info;
7166 if (!arg)
7167 memset(&info, 0, sizeof(info));
7168 else if (copy_from_user(&info, argp, sizeof(info))) {
7169 err = -EFAULT;
7170 goto unlock;
7171 }
7172 if (mddev->pers) {
7173 err = update_array_info(mddev, &info);
7174 if (err) {
7175 pr_warn("md: couldn't update array info. %d\n", err);
7176 goto unlock;
7177 }
7178 goto unlock;
7179 }
7180 if (!list_empty(&mddev->disks)) {
7181 pr_warn("md: array %s already has disks!\n", mdname(mddev));
7182 err = -EBUSY;
7183 goto unlock;
7184 }
7185 if (mddev->raid_disks) {
7186 pr_warn("md: array %s already initialised!\n", mdname(mddev));
7187 err = -EBUSY;
7188 goto unlock;
7189 }
7190 err = set_array_info(mddev, &info);
7191 if (err) {
7192 pr_warn("md: couldn't set array info. %d\n", err);
7193 goto unlock;
7194 }
7195 goto unlock;
7196 }
7197
7198
7199
7200
7201
7202
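 /*
 * If the array is not yet set up, only the commands that can create,
 * start or stop it (plus bitmap-file handling) make sense.
 */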
7203 if ((!mddev->raid_disks && !mddev->external)
7204 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7205 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7206 && cmd != GET_BITMAP_FILE) {
7207 err = -ENODEV;
7208 goto unlock;
7209 }
7210
7211
7212
7213
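 /*
 * Commands serviced before the read-only check below, so they also
 * work on a read-only array.
 */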
7214 switch (cmd) {
7215 case RESTART_ARRAY_RW:
7216 err = restart_array(mddev);
7217 goto unlock;
7218
7219 case STOP_ARRAY:
7220 err = do_md_stop(mddev, 0, bdev);
7221 goto unlock;
7222
7223 case STOP_ARRAY_RO:
7224 err = md_set_readonly(mddev, bdev);
7225 goto unlock;
7226
7227 case HOT_REMOVE_DISK:
7228 err = hot_remove_disk(mddev, new_decode_dev(arg));
7229 goto unlock;
7230
7231 case ADD_NEW_DISK:
7232
7233
7234
7235
7236 if (mddev->pers) {
7237 mdu_disk_info_t info;
7238 if (copy_from_user(&info, argp, sizeof(info)))
7239 err = -EFAULT;
7240 else if (!(info.state & (1<<MD_DISK_SYNC)))
7241
7242 break;
7243 else
7244 err = add_new_disk(mddev, &info);
7245 goto unlock;
7246 }
7247 break;
7248
7249 case BLKROSET:
7250 if (get_user(ro, (int __user *)(arg))) {
7251 err = -EFAULT;
7252 goto unlock;
7253 }
7254 err = -EINVAL;
7255
7256
7257
7258
7259 if (ro)
7260 goto unlock;
7261
7262
7263 if (mddev->ro != 1)
7264 goto unlock;
7265
7266
7267
7268
7269 if (mddev->pers) {
7270 err = restart_array(mddev);
7271 if (err == 0) {
7272 mddev->ro = 2;
7273 set_disk_ro(mddev->gendisk, 0);
7274 }
7275 }
7276 goto unlock;
7277 }
7278
7279
7280
7281
7282
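 /*
 * The remaining commands modify the array.  An auto-read-only array
 * (ro == 2) is switched back to read-write first, waiting for any
 * pending superblock update; a genuinely read-only array gets -EROFS.
 */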
7283 if (mddev->ro && mddev->pers) {
7284 if (mddev->ro == 2) {
7285 mddev->ro = 0;
7286 sysfs_notify_dirent_safe(mddev->sysfs_state);
7287 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7288
7289
7290
7291
7292 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7293 mddev_unlock(mddev);
7294 wait_event(mddev->sb_wait,
7295 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7296 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7297 mddev_lock_nointr(mddev);
7298 }
7299 } else {
7300 err = -EROFS;
7301 goto unlock;
7302 }
7303 }
7304
7305 switch (cmd) {
7306 case ADD_NEW_DISK:
7307 {
7308 mdu_disk_info_t info;
7309 if (copy_from_user(&info, argp, sizeof(info)))
7310 err = -EFAULT;
7311 else
7312 err = add_new_disk(mddev, &info);
7313 goto unlock;
7314 }
7315
7316 case CLUSTERED_DISK_NACK:
7317 if (mddev_is_clustered(mddev))
7318 md_cluster_ops->new_disk_ack(mddev, false);
7319 else
7320 err = -EINVAL;
7321 goto unlock;
7322
7323 case HOT_ADD_DISK:
7324 err = hot_add_disk(mddev, new_decode_dev(arg));
7325 goto unlock;
7326
7327 case RUN_ARRAY:
7328 err = do_md_run(mddev);
7329 goto unlock;
7330
7331 case SET_BITMAP_FILE:
7332 err = set_bitmap_file(mddev, (int)arg);
7333 goto unlock;
7334
7335 default:
7336 err = -EINVAL;
7337 goto unlock;
7338 }
7339
7340unlock:
7341 if (mddev->hold_active == UNTIL_IOCTL &&
7342 err != -EINVAL)
7343 mddev->hold_active = 0;
7344 mddev_unlock(mddev);
7345out:
7346 if (did_set_md_closing)
7347 clear_bit(MD_CLOSING, &mddev->flags);
7348 return err;
7349}
7350#ifdef CONFIG_COMPAT
7351static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
7352 unsigned int cmd, unsigned long arg)
7353{
7354 switch (cmd) {
7355 case HOT_REMOVE_DISK:
7356 case HOT_ADD_DISK:
7357 case SET_DISK_FAULTY:
7358 case SET_BITMAP_FILE:
7359
7360 break;
7361 default:
7362 arg = (unsigned long)compat_ptr(arg);
7363 break;
7364 }
7365
7366 return md_ioctl(bdev, mode, cmd, arg);
7367}
7368#endif
7369
7370static int md_open(struct block_device *bdev, fmode_t mode)
7371{
7372
7373
7374
7375
7376 struct mddev *mddev = mddev_find(bdev->bd_dev);
7377 int err;
7378
7379 if (!mddev)
7380 return -ENODEV;
7381
7382 if (mddev->gendisk != bdev->bd_disk) {
7383
7384
7385
7386 mddev_put(mddev);
7387
7388 flush_workqueue(md_misc_wq);
7389
7390 return -ERESTARTSYS;
7391 }
7392 BUG_ON(mddev != bdev->bd_disk->private_data);
7393
7394 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
7395 goto out;
7396
7397 if (test_bit(MD_CLOSING, &mddev->flags)) {
7398 mutex_unlock(&mddev->open_mutex);
7399 err = -ENODEV;
7400 goto out;
7401 }
7402
7403 err = 0;
7404 atomic_inc(&mddev->openers);
7405 mutex_unlock(&mddev->open_mutex);
7406
7407 check_disk_change(bdev);
7408 out:
7409 if (err)
7410 mddev_put(mddev);
7411 return err;
7412}
7413
7414static void md_release(struct gendisk *disk, fmode_t mode)
7415{
7416 struct mddev *mddev = disk->private_data;
7417
7418 BUG_ON(!mddev);
7419 atomic_dec(&mddev->openers);
7420 mddev_put(mddev);
7421}
7422
7423static int md_media_changed(struct gendisk *disk)
7424{
7425 struct mddev *mddev = disk->private_data;
7426
7427 return mddev->changed;
7428}
7429
7430static int md_revalidate(struct gendisk *disk)
7431{
7432 struct mddev *mddev = disk->private_data;
7433
7434 mddev->changed = 0;
7435 return 0;
7436}
7437static const struct block_device_operations md_fops =
7438{
7439 .owner = THIS_MODULE,
7440 .open = md_open,
7441 .release = md_release,
7442 .ioctl = md_ioctl,
7443#ifdef CONFIG_COMPAT
7444 .compat_ioctl = md_compat_ioctl,
7445#endif
7446 .getgeo = md_getgeo,
7447 .media_changed = md_media_changed,
7448 .revalidate_disk= md_revalidate,
7449};
7450
7451static int md_thread(void *arg)
7452{
7453 struct md_thread *thread = arg;
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
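 /*
 * An md thread sleeps until THREAD_WAKEUP is set (or its optional
 * timeout expires) and then calls the personality-supplied ->run
 * handler.  SIGKILL is allowed only so the interruptible sleep can be
 * woken early; any signal that does arrive is simply flushed.
 */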
7467 allow_signal(SIGKILL);
7468 while (!kthread_should_stop()) {
7469
7470
7471
7472
7473
7474
7475 if (signal_pending(current))
7476 flush_signals(current);
7477
7478 wait_event_interruptible_timeout
7479 (thread->wqueue,
7480 test_bit(THREAD_WAKEUP, &thread->flags)
7481 || kthread_should_stop() || kthread_should_park(),
7482 thread->timeout);
7483
7484 clear_bit(THREAD_WAKEUP, &thread->flags);
7485 if (kthread_should_park())
7486 kthread_parkme();
7487 if (!kthread_should_stop())
7488 thread->run(thread);
7489 }
7490
7491 return 0;
7492}
7493
7494void md_wakeup_thread(struct md_thread *thread)
7495{
7496 if (thread) {
7497 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7498 set_bit(THREAD_WAKEUP, &thread->flags);
7499 wake_up(&thread->wqueue);
7500 }
7501}
7502EXPORT_SYMBOL(md_wakeup_thread);
7503
7504struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7505 struct mddev *mddev, const char *name)
7506{
7507 struct md_thread *thread;
7508
7509 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
7510 if (!thread)
7511 return NULL;
7512
7513 init_waitqueue_head(&thread->wqueue);
7514
7515 thread->run = run;
7516 thread->mddev = mddev;
7517 thread->timeout = MAX_SCHEDULE_TIMEOUT;
7518 thread->tsk = kthread_run(md_thread, thread,
7519 "%s_%s",
7520 mdname(thread->mddev),
7521 name);
7522 if (IS_ERR(thread->tsk)) {
7523 kfree(thread);
7524 return NULL;
7525 }
7526 return thread;
7527}
7528EXPORT_SYMBOL(md_register_thread);
7529
7530void md_unregister_thread(struct md_thread **threadp)
7531{
7532 struct md_thread *thread = *threadp;
7533 if (!thread)
7534 return;
7535 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7536
7537
7538
7539 spin_lock(&pers_lock);
7540 *threadp = NULL;
7541 spin_unlock(&pers_lock);
7542
7543 kthread_stop(thread->tsk);
7544 kfree(thread);
7545}
7546EXPORT_SYMBOL(md_unregister_thread);
7547
7548void md_error(struct mddev *mddev, struct md_rdev *rdev)
7549{
7550 if (!rdev || test_bit(Faulty, &rdev->flags))
7551 return;
7552
7553 if (!mddev->pers || !mddev->pers->error_handler)
7554 return;
7555 mddev->pers->error_handler(mddev, rdev);
7556 if (mddev->degraded)
7557 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7558 sysfs_notify_dirent_safe(rdev->sysfs_state);
7559 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7560 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7561 md_wakeup_thread(mddev->thread);
7562 if (mddev->event_work.func)
7563 queue_work(md_misc_wq, &mddev->event_work);
7564 md_new_event(mddev);
7565}
7566EXPORT_SYMBOL(md_error);
7567
7568
7569
7570static void status_unused(struct seq_file *seq)
7571{
7572 int i = 0;
7573 struct md_rdev *rdev;
7574
7575 seq_printf(seq, "unused devices: ");
7576
7577 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
7578 char b[BDEVNAME_SIZE];
7579 i++;
7580 seq_printf(seq, "%s ",
7581 bdevname(rdev->bdev,b));
7582 }
7583 if (!i)
7584 seq_printf(seq, "<none>");
7585
7586 seq_printf(seq, "\n");
7587}
7588
7589static int status_resync(struct seq_file *seq, struct mddev *mddev)
7590{
7591 sector_t max_sectors, resync, res;
7592 unsigned long dt, db;
7593 sector_t rt;
7594 int scale;
7595 unsigned int per_milli;
7596
7597 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
7598 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7599 max_sectors = mddev->resync_max_sectors;
7600 else
7601 max_sectors = mddev->dev_sectors;
7602
7603 resync = mddev->curr_resync;
7604 if (resync <= 3) {
7605 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7606
7607 resync = max_sectors;
7608 } else if (resync > max_sectors)
7609 resync = max_sectors;
7610 else
7611 resync -= atomic_read(&mddev->recovery_active);
7612
7613 if (resync == 0) {
7614 if (mddev->recovery_cp < MaxSector) {
7615 seq_printf(seq, "\tresync=PENDING");
7616 return 1;
7617 }
7618 return 0;
7619 }
7620 if (resync < 3) {
7621 seq_printf(seq, "\tresync=DELAYED");
7622 return 1;
7623 }
7624
7625 WARN_ON(max_sectors == 0);
7626
7627
7628
7629
7630
7631 scale = 10;
7632 if (sizeof(sector_t) > sizeof(unsigned long)) {
7633 while ( max_sectors/2 > (1ULL<<(scale+32)))
7634 scale++;
7635 }
7636 res = (resync>>scale)*1000;
7637 sector_div(res, (u32)((max_sectors>>scale)+1));
7638
7639 per_milli = res;
7640 {
7641 int i, x = per_milli/50, y = 20-x;
7642 seq_printf(seq, "[");
7643 for (i = 0; i < x; i++)
7644 seq_printf(seq, "=");
7645 seq_printf(seq, ">");
7646 for (i = 0; i < y; i++)
7647 seq_printf(seq, ".");
7648 seq_printf(seq, "] ");
7649 }
7650 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
7651 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
7652 "reshape" :
7653 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
7654 "check" :
7655 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
7656 "resync" : "recovery"))),
7657 per_milli/10, per_milli % 10,
7658 (unsigned long long) resync/2,
7659 (unsigned long long) max_sectors/2);
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
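 /*
 * Estimate the time to finish: dt is the seconds since the last mark,
 * db the sectors transferred in that window.  The remaining sectors
 * are divided by db/32 before being multiplied by dt (and shifted back
 * by 5 afterwards) so the intermediate result cannot overflow even when
 * sector_t is 32 bits.
 */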
7675 dt = ((jiffies - mddev->resync_mark) / HZ);
7676 if (!dt) dt++;
7677 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
7678 - mddev->resync_mark_cnt;
7679
7680 rt = max_sectors - resync;
7681 sector_div(rt, db/32+1);
7682 rt *= dt;
7683 rt >>= 5;
7684
7685 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
7686 ((unsigned long)rt % 60)/6);
7687
7688 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
7689 return 1;
7690}
7691
7692static void *md_seq_start(struct seq_file *seq, loff_t *pos)
7693{
7694 struct list_head *tmp;
7695 loff_t l = *pos;
7696 struct mddev *mddev;
7697
7698 if (l >= 0x10000)
7699 return NULL;
7700 if (!l--)
7701
7702 return (void*)1;
7703
7704 spin_lock(&all_mddevs_lock);
7705 list_for_each(tmp,&all_mddevs)
7706 if (!l--) {
7707 mddev = list_entry(tmp, struct mddev, all_mddevs);
7708 mddev_get(mddev);
7709 spin_unlock(&all_mddevs_lock);
7710 return mddev;
7711 }
7712 spin_unlock(&all_mddevs_lock);
7713 if (!l--)
7714 return (void*)2;
7715 return NULL;
7716}
7717
7718static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
7719{
7720 struct list_head *tmp;
7721 struct mddev *next_mddev, *mddev = v;
7722
7723 ++*pos;
7724 if (v == (void*)2)
7725 return NULL;
7726
7727 spin_lock(&all_mddevs_lock);
7728 if (v == (void*)1)
7729 tmp = all_mddevs.next;
7730 else
7731 tmp = mddev->all_mddevs.next;
7732 if (tmp != &all_mddevs)
7733 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
7734 else {
7735 next_mddev = (void*)2;
7736 *pos = 0x10000;
7737 }
7738 spin_unlock(&all_mddevs_lock);
7739
7740 if (v != (void*)1)
7741 mddev_put(mddev);
7742 return next_mddev;
7743
7744}
7745
7746static void md_seq_stop(struct seq_file *seq, void *v)
7747{
7748 struct mddev *mddev = v;
7749
7750 if (mddev && v != (void*)1 && v != (void*)2)
7751 mddev_put(mddev);
7752}
7753
7754static int md_seq_show(struct seq_file *seq, void *v)
7755{
7756 struct mddev *mddev = v;
7757 sector_t sectors;
7758 struct md_rdev *rdev;
7759
7760 if (v == (void*)1) {
7761 struct md_personality *pers;
7762 seq_printf(seq, "Personalities : ");
7763 spin_lock(&pers_lock);
7764 list_for_each_entry(pers, &pers_list, list)
7765 seq_printf(seq, "[%s] ", pers->name);
7766
7767 spin_unlock(&pers_lock);
7768 seq_printf(seq, "\n");
7769 seq->poll_event = atomic_read(&md_event_count);
7770 return 0;
7771 }
7772 if (v == (void*)2) {
7773 status_unused(seq);
7774 return 0;
7775 }
7776
7777 spin_lock(&mddev->lock);
7778 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7779 seq_printf(seq, "%s : %sactive", mdname(mddev),
7780 mddev->pers ? "" : "in");
7781 if (mddev->pers) {
7782 if (mddev->ro==1)
7783 seq_printf(seq, " (read-only)");
7784 if (mddev->ro==2)
7785 seq_printf(seq, " (auto-read-only)");
7786 seq_printf(seq, " %s", mddev->pers->name);
7787 }
7788
7789 sectors = 0;
7790 rcu_read_lock();
7791 rdev_for_each_rcu(rdev, mddev) {
7792 char b[BDEVNAME_SIZE];
7793 seq_printf(seq, " %s[%d]",
7794 bdevname(rdev->bdev,b), rdev->desc_nr);
7795 if (test_bit(WriteMostly, &rdev->flags))
7796 seq_printf(seq, "(W)");
7797 if (test_bit(Journal, &rdev->flags))
7798 seq_printf(seq, "(J)");
7799 if (test_bit(Faulty, &rdev->flags)) {
7800 seq_printf(seq, "(F)");
7801 continue;
7802 }
7803 if (rdev->raid_disk < 0)
7804 seq_printf(seq, "(S)");
7805 if (test_bit(Replacement, &rdev->flags))
7806 seq_printf(seq, "(R)");
7807 sectors += rdev->sectors;
7808 }
7809 rcu_read_unlock();
7810
7811 if (!list_empty(&mddev->disks)) {
7812 if (mddev->pers)
7813 seq_printf(seq, "\n %llu blocks",
7814 (unsigned long long)
7815 mddev->array_sectors / 2);
7816 else
7817 seq_printf(seq, "\n %llu blocks",
7818 (unsigned long long)sectors / 2);
7819 }
7820 if (mddev->persistent) {
7821 if (mddev->major_version != 0 ||
7822 mddev->minor_version != 90) {
7823 seq_printf(seq," super %d.%d",
7824 mddev->major_version,
7825 mddev->minor_version);
7826 }
7827 } else if (mddev->external)
7828 seq_printf(seq, " super external:%s",
7829 mddev->metadata_type);
7830 else
7831 seq_printf(seq, " super non-persistent");
7832
7833 if (mddev->pers) {
7834 mddev->pers->status(seq, mddev);
7835 seq_printf(seq, "\n ");
7836 if (mddev->pers->sync_request) {
7837 if (status_resync(seq, mddev))
7838 seq_printf(seq, "\n ");
7839 }
7840 } else
7841 seq_printf(seq, "\n ");
7842
7843 bitmap_status(seq, mddev->bitmap);
7844
7845 seq_printf(seq, "\n");
7846 }
7847 spin_unlock(&mddev->lock);
7848
7849 return 0;
7850}
7851
7852static const struct seq_operations md_seq_ops = {
7853 .start = md_seq_start,
7854 .next = md_seq_next,
7855 .stop = md_seq_stop,
7856 .show = md_seq_show,
7857};
7858
7859static int md_seq_open(struct inode *inode, struct file *file)
7860{
7861 struct seq_file *seq;
7862 int error;
7863
7864 error = seq_open(file, &md_seq_ops);
7865 if (error)
7866 return error;
7867
7868 seq = file->private_data;
7869 seq->poll_event = atomic_read(&md_event_count);
7870 return error;
7871}
7872
7873static int md_unloading;
7874static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
7875{
7876 struct seq_file *seq = filp->private_data;
7877 int mask;
7878
7879 if (md_unloading)
7880 return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
7881 poll_wait(filp, &md_event_waiters, wait);
7882
7883
7884 mask = POLLIN | POLLRDNORM;
7885
7886 if (seq->poll_event != atomic_read(&md_event_count))
7887 mask |= POLLERR | POLLPRI;
7888 return mask;
7889}
7890
7891static const struct file_operations md_seq_fops = {
7892 .owner = THIS_MODULE,
7893 .open = md_seq_open,
7894 .read = seq_read,
7895 .llseek = seq_lseek,
7896 .release = seq_release,
7897 .poll = mdstat_poll,
7898};
7899
7900int register_md_personality(struct md_personality *p)
7901{
7902 pr_debug("md: %s personality registered for level %d\n",
7903 p->name, p->level);
7904 spin_lock(&pers_lock);
7905 list_add_tail(&p->list, &pers_list);
7906 spin_unlock(&pers_lock);
7907 return 0;
7908}
7909EXPORT_SYMBOL(register_md_personality);
7910
7911int unregister_md_personality(struct md_personality *p)
7912{
7913 pr_debug("md: %s personality unregistered\n", p->name);
7914 spin_lock(&pers_lock);
7915 list_del_init(&p->list);
7916 spin_unlock(&pers_lock);
7917 return 0;
7918}
7919EXPORT_SYMBOL(unregister_md_personality);
7920
7921int register_md_cluster_operations(struct md_cluster_operations *ops,
7922 struct module *module)
7923{
7924 int ret = 0;
7925 spin_lock(&pers_lock);
7926 if (md_cluster_ops != NULL)
7927 ret = -EALREADY;
7928 else {
7929 md_cluster_ops = ops;
7930 md_cluster_mod = module;
7931 }
7932 spin_unlock(&pers_lock);
7933 return ret;
7934}
7935EXPORT_SYMBOL(register_md_cluster_operations);
7936
7937int unregister_md_cluster_operations(void)
7938{
7939 spin_lock(&pers_lock);
7940 md_cluster_ops = NULL;
7941 spin_unlock(&pers_lock);
7942 return 0;
7943}
7944EXPORT_SYMBOL(unregister_md_cluster_operations);
7945
7946int md_setup_cluster(struct mddev *mddev, int nodes)
7947{
7948 if (!md_cluster_ops)
7949 request_module("md-cluster");
7950 spin_lock(&pers_lock);
7951
7952 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
7953 pr_warn("can't find md-cluster module or get it's reference.\n");
7954 spin_unlock(&pers_lock);
7955 return -ENOENT;
7956 }
7957 spin_unlock(&pers_lock);
7958
7959 return md_cluster_ops->join(mddev, nodes);
7960}
7961
7962void md_cluster_stop(struct mddev *mddev)
7963{
7964 if (!md_cluster_ops)
7965 return;
7966 md_cluster_ops->leave(mddev);
7967 module_put(md_cluster_mod);
7968}
7969
7970static int is_mddev_idle(struct mddev *mddev, int init)
7971{
7972 struct md_rdev *rdev;
7973 int idle;
7974 int curr_events;
7975
7976 idle = 1;
7977 rcu_read_lock();
7978 rdev_for_each_rcu(rdev, mddev) {
7979 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
7980 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7981 (int)part_stat_read(&disk->part0, sectors[1]) -
7982 atomic_read(&disk->sync_io);
7983
8004
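 /*
 * Compare the sectors transferred on this device against the I/O that
 * the resync itself issued (disk->sync_io).  The two counters are not
 * updated atomically, so a small discrepancy (up to 64 sectors) is
 * tolerated; anything larger means other I/O is active and the array is
 * not idle.  A call with 'init' set records a new baseline (and reports
 * not idle).
 */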
8005 if (init || curr_events - rdev->last_events > 64) {
8006 rdev->last_events = curr_events;
8007 idle = 0;
8008 }
8009 }
8010 rcu_read_unlock();
8011 return idle;
8012}
8013
8014void md_done_sync(struct mddev *mddev, int blocks, int ok)
8015{
8016
8017 atomic_sub(blocks, &mddev->recovery_active);
8018 wake_up(&mddev->recovery_wait);
8019 if (!ok) {
8020 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8021 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8022 md_wakeup_thread(mddev->thread);
8023
8024 }
8025}
8026EXPORT_SYMBOL(md_done_sync);
8027
8028
8029
8030
8031
8032
8033
8034
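/*
 * md_write_start() is called at the start of every write request.  If
 * necessary it switches an auto-read-only array back to read-write,
 * clears in_sync, requests a superblock update and waits for it to
 * complete.  A false return (which also drops the writes_pending
 * reference) means the array was suspended before the metadata could be
 * written and the write must not proceed.
 */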
8035bool md_write_start(struct mddev *mddev, struct bio *bi)
8036{
8037 int did_change = 0;
8038 if (bio_data_dir(bi) != WRITE)
8039 return true;
8040
8041 BUG_ON(mddev->ro == 1);
8042 if (mddev->ro == 2) {
8043
8044 mddev->ro = 0;
8045 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8046 md_wakeup_thread(mddev->thread);
8047 md_wakeup_thread(mddev->sync_thread);
8048 did_change = 1;
8049 }
8050 rcu_read_lock();
8051 percpu_ref_get(&mddev->writes_pending);
8052 smp_mb();
8053 if (mddev->safemode == 1)
8054 mddev->safemode = 0;
8055
8056 if (mddev->in_sync || mddev->sync_checkers) {
8057 spin_lock(&mddev->lock);
8058 if (mddev->in_sync) {
8059 mddev->in_sync = 0;
8060 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8061 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8062 md_wakeup_thread(mddev->thread);
8063 did_change = 1;
8064 }
8065 spin_unlock(&mddev->lock);
8066 }
8067 rcu_read_unlock();
8068 if (did_change)
8069 sysfs_notify_dirent_safe(mddev->sysfs_state);
8070 wait_event(mddev->sb_wait,
8071 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
8072 mddev->suspended);
8073 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
8074 percpu_ref_put(&mddev->writes_pending);
8075 return false;
8076 }
8077 return true;
8078}
8079EXPORT_SYMBOL(md_write_start);
8080
8081
8082
8083
8084
8085
8086
8087
8088
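/*
 * md_write_inc() takes an additional writes_pending reference for a
 * request that has already passed md_write_start(), e.g. when a single
 * write is split into several parts.
 */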
8089void md_write_inc(struct mddev *mddev, struct bio *bi)
8090{
8091 if (bio_data_dir(bi) != WRITE)
8092 return;
8093 WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8094 percpu_ref_get(&mddev->writes_pending);
8095}
8096EXPORT_SYMBOL(md_write_inc);
8097
8098void md_write_end(struct mddev *mddev)
8099{
8100 percpu_ref_put(&mddev->writes_pending);
8101
8102 if (mddev->safemode == 2)
8103 md_wakeup_thread(mddev->thread);
8104 else if (mddev->safemode_delay)
8105
8106
8107
8108 mod_timer(&mddev->safemode_timer,
8109 roundup(jiffies, mddev->safemode_delay) +
8110 mddev->safemode_delay);
8111}
8112
8113EXPORT_SYMBOL(md_write_end);
8114
8115
8116
8117
8118
8119
8120
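/*
 * md_allow_write() marks the array 'active' (clearing in_sync) and
 * writes the superblock out synchronously, so that subsequent writes
 * will not have to block on a metadata update.
 */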
8121void md_allow_write(struct mddev *mddev)
8122{
8123 if (!mddev->pers)
8124 return;
8125 if (mddev->ro)
8126 return;
8127 if (!mddev->pers->sync_request)
8128 return;
8129
8130 spin_lock(&mddev->lock);
8131 if (mddev->in_sync) {
8132 mddev->in_sync = 0;
8133 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8134 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8135 if (mddev->safemode_delay &&
8136 mddev->safemode == 0)
8137 mddev->safemode = 1;
8138 spin_unlock(&mddev->lock);
8139 md_update_sb(mddev, 0);
8140 sysfs_notify_dirent_safe(mddev->sysfs_state);
8141
8142 wait_event(mddev->sb_wait,
8143 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8144 } else
8145 spin_unlock(&mddev->lock);
8146}
8147EXPORT_SYMBOL_GPL(md_allow_write);
8148
8149#define SYNC_MARKS 10
8150#define SYNC_MARK_STEP (3*HZ)
8151#define UPDATE_FREQUENCY (5*60*HZ)
8152void md_do_sync(struct md_thread *thread)
8153{
8154 struct mddev *mddev = thread->mddev;
8155 struct mddev *mddev2;
8156 unsigned int currspeed = 0,
8157 window;
8158 sector_t max_sectors,j, io_sectors, recovery_done;
8159 unsigned long mark[SYNC_MARKS];
8160 unsigned long update_time;
8161 sector_t mark_cnt[SYNC_MARKS];
8162 int last_mark,m;
8163 struct list_head *tmp;
8164 sector_t last_check;
8165 int skipped = 0;
8166 struct md_rdev *rdev;
8167 char *desc, *action = NULL;
8168 struct blk_plug plug;
8169 int ret;
8170
8171
8172 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8173 return;
8174 if (mddev->ro) {
8175 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8176 return;
8177 }
8178
8179 if (mddev_is_clustered(mddev)) {
8180 ret = md_cluster_ops->resync_start(mddev);
8181 if (ret)
8182 goto skip;
8183
8184 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8185 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8186 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8187 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8188 && ((unsigned long long)mddev->curr_resync_completed
8189 < (unsigned long long)mddev->resync_max_sectors))
8190 goto skip;
8191 }
8192
8193 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8194 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8195 desc = "data-check";
8196 action = "check";
8197 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8198 desc = "requested-resync";
8199 action = "repair";
8200 } else
8201 desc = "resync";
8202 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8203 desc = "reshape";
8204 else
8205 desc = "recovery";
8206
8207 mddev->last_sync_action = action ?: desc;
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
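 /*
 * curr_resync is overloaded while we arbitrate with other arrays that
 * share physical devices: 2 means we have just started and are checking
 * for conflicts, 1 means we have yielded to a conflicting resync, and
 * values of 3 and above are the actual resync position.  We loop here
 * until no conflicting resync is ahead of us.
 */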
8225 do {
8226 int mddev2_minor = -1;
8227 mddev->curr_resync = 2;
8228
8229 try_again:
8230 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8231 goto skip;
8232 for_each_mddev(mddev2, tmp) {
8233 if (mddev2 == mddev)
8234 continue;
8235 if (!mddev->parallel_resync
8236 && mddev2->curr_resync
8237 && match_mddev_units(mddev, mddev2)) {
8238 DEFINE_WAIT(wq);
8239 if (mddev < mddev2 && mddev->curr_resync == 2) {
8240
8241 mddev->curr_resync = 1;
8242 wake_up(&resync_wait);
8243 }
8244 if (mddev > mddev2 && mddev->curr_resync == 1)
8245
8246
8247
8248 continue;
8249
8250
8251
8252
8253 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8254 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8255 mddev2->curr_resync >= mddev->curr_resync) {
8256 if (mddev2_minor != mddev2->md_minor) {
8257 mddev2_minor = mddev2->md_minor;
8258 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
8259 desc, mdname(mddev),
8260 mdname(mddev2));
8261 }
8262 mddev_put(mddev2);
8263 if (signal_pending(current))
8264 flush_signals(current);
8265 schedule();
8266 finish_wait(&resync_wait, &wq);
8267 goto try_again;
8268 }
8269 finish_wait(&resync_wait, &wq);
8270 }
8271 }
8272 } while (mddev->curr_resync < 2);
8273
8274 j = 0;
8275 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8276
8277
8278
8279 max_sectors = mddev->resync_max_sectors;
8280 atomic64_set(&mddev->resync_mismatches, 0);
8281
8282 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8283 j = mddev->resync_min;
8284 else if (!mddev->bitmap)
8285 j = mddev->recovery_cp;
8286
8287 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8288 max_sectors = mddev->resync_max_sectors;
8289 else {
8290
8291 max_sectors = mddev->dev_sectors;
8292 j = MaxSector;
8293 rcu_read_lock();
8294 rdev_for_each_rcu(rdev, mddev)
8295 if (rdev->raid_disk >= 0 &&
8296 !test_bit(Journal, &rdev->flags) &&
8297 !test_bit(Faulty, &rdev->flags) &&
8298 !test_bit(In_sync, &rdev->flags) &&
8299 rdev->recovery_offset < j)
8300 j = rdev->recovery_offset;
8301 rcu_read_unlock();
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311 if (mddev->bitmap) {
8312 mddev->pers->quiesce(mddev, 1);
8313 mddev->pers->quiesce(mddev, 0);
8314 }
8315 }
8316
8317 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
8318 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
8319 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
8320 speed_max(mddev), desc);
8321
8322 is_mddev_idle(mddev, 1);
8323
8324 io_sectors = 0;
8325 for (m = 0; m < SYNC_MARKS; m++) {
8326 mark[m] = jiffies;
8327 mark_cnt[m] = io_sectors;
8328 }
8329 last_mark = 0;
8330 mddev->resync_mark = mark[last_mark];
8331 mddev->resync_mark_cnt = mark_cnt[last_mark];
8332
8333
8334
8335
8336 window = 32*(PAGE_SIZE/512);
8337 pr_debug("md: using %dk window, over a total of %lluk.\n",
8338 window/2, (unsigned long long)max_sectors/2);
8339
8340 atomic_set(&mddev->recovery_active, 0);
8341 last_check = 0;
8342
8343 if (j>2) {
8344 pr_debug("md: resuming %s of %s from checkpoint.\n",
8345 desc, mdname(mddev));
8346 mddev->curr_resync = j;
8347 } else
8348 mddev->curr_resync = 3;
8349 mddev->curr_resync_completed = j;
8350 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8351 md_new_event(mddev);
8352 update_time = jiffies;
8353
8354 blk_start_plug(&plug);
8355 while (j < max_sectors) {
8356 sector_t sectors;
8357
8358 skipped = 0;
8359
8360 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8361 ((mddev->curr_resync > mddev->curr_resync_completed &&
8362 (mddev->curr_resync - mddev->curr_resync_completed)
8363 > (max_sectors >> 4)) ||
8364 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
8365 (j - mddev->curr_resync_completed)*2
8366 >= mddev->resync_max - mddev->curr_resync_completed ||
8367 mddev->curr_resync_completed > mddev->resync_max
8368 )) {
8369
8370 wait_event(mddev->recovery_wait,
8371 atomic_read(&mddev->recovery_active) == 0);
8372 mddev->curr_resync_completed = j;
8373 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
8374 j > mddev->recovery_cp)
8375 mddev->recovery_cp = j;
8376 update_time = jiffies;
8377 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8378 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8379 }
8380
8381 while (j >= mddev->resync_max &&
8382 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8383
8384
8385
8386
8387 flush_signals(current);
8388 wait_event_interruptible(mddev->recovery_wait,
8389 mddev->resync_max > j
8390 || test_bit(MD_RECOVERY_INTR,
8391 &mddev->recovery));
8392 }
8393
8394 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8395 break;
8396
8397 sectors = mddev->pers->sync_request(mddev, j, &skipped);
8398 if (sectors == 0) {
8399 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8400 break;
8401 }
8402
8403 if (!skipped) {
8404 io_sectors += sectors;
8405 atomic_add(sectors, &mddev->recovery_active);
8406 }
8407
8408 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8409 break;
8410
8411 j += sectors;
8412 if (j > max_sectors)
8413
8414 j = max_sectors;
8415 if (j > 2)
8416 mddev->curr_resync = j;
8417 mddev->curr_mark_cnt = io_sectors;
8418 if (last_check == 0)
8419
8420
8421
8422 md_new_event(mddev);
8423
8424 if (last_check + window > io_sectors || j == max_sectors)
8425 continue;
8426
8427 last_check = io_sectors;
8428 repeat:
8429 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
8430
8431 int next = (last_mark+1) % SYNC_MARKS;
8432
8433 mddev->resync_mark = mark[next];
8434 mddev->resync_mark_cnt = mark_cnt[next];
8435 mark[next] = jiffies;
8436 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
8437 last_mark = next;
8438 }
8439
8440 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8441 break;
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451 cond_resched();
8452
8453 recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
8454 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
8455 /((jiffies-mddev->resync_mark)/HZ +1) +1;
8456
8457 if (currspeed > speed_min(mddev)) {
8458 if (currspeed > speed_max(mddev)) {
8459 msleep(500);
8460 goto repeat;
8461 }
8462 if (!is_mddev_idle(mddev, 0)) {
8463
8464
8465
8466
8467 wait_event(mddev->recovery_wait,
8468 !atomic_read(&mddev->recovery_active));
8469 }
8470 }
8471 }
8472 pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
8473 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
8474 ? "interrupted" : "done");
8475
8476
8477
8478 blk_finish_plug(&plug);
8479 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
8480
8481 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8482 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8483 mddev->curr_resync > 3) {
8484 mddev->curr_resync_completed = mddev->curr_resync;
8485 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8486 }
8487 mddev->pers->sync_request(mddev, max_sectors, &skipped);
8488
8489 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
8490 mddev->curr_resync > 3) {
8491 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8492 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8493 if (mddev->curr_resync >= mddev->recovery_cp) {
8494 pr_debug("md: checkpointing %s of %s.\n",
8495 desc, mdname(mddev));
8496 if (test_bit(MD_RECOVERY_ERROR,
8497 &mddev->recovery))
8498 mddev->recovery_cp =
8499 mddev->curr_resync_completed;
8500 else
8501 mddev->recovery_cp =
8502 mddev->curr_resync;
8503 }
8504 } else
8505 mddev->recovery_cp = MaxSector;
8506 } else {
8507 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8508 mddev->curr_resync = MaxSector;
8509 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8510 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
8511 rcu_read_lock();
8512 rdev_for_each_rcu(rdev, mddev)
8513 if (rdev->raid_disk >= 0 &&
8514 mddev->delta_disks >= 0 &&
8515 !test_bit(Journal, &rdev->flags) &&
8516 !test_bit(Faulty, &rdev->flags) &&
8517 !test_bit(In_sync, &rdev->flags) &&
8518 rdev->recovery_offset < mddev->curr_resync)
8519 rdev->recovery_offset = mddev->curr_resync;
8520 rcu_read_unlock();
8521 }
8522 }
8523 }
8524 skip:
8525
8526
8527
8528 set_mask_bits(&mddev->sb_flags, 0,
8529 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
8530
8531 spin_lock(&mddev->lock);
8532 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8533
8534 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8535 mddev->resync_min = 0;
8536 mddev->resync_max = MaxSector;
8537 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8538 mddev->resync_min = mddev->curr_resync_completed;
8539 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
8540 mddev->curr_resync = 0;
8541 spin_unlock(&mddev->lock);
8542
8543 wake_up(&resync_wait);
8544 md_wakeup_thread(mddev->thread);
8545 return;
8546}
8547EXPORT_SYMBOL_GPL(md_do_sync);
8548
8549static int remove_and_add_spares(struct mddev *mddev,
8550 struct md_rdev *this)
8551{
8552 struct md_rdev *rdev;
8553 int spares = 0;
8554 int removed = 0;
8555 bool remove_some = false;
8556
8557 rdev_for_each(rdev, mddev) {
8558 if ((this == NULL || rdev == this) &&
8559 rdev->raid_disk >= 0 &&
8560 !test_bit(Blocked, &rdev->flags) &&
8561 test_bit(Faulty, &rdev->flags) &&
8562 atomic_read(&rdev->nr_pending)==0) {
8563
8564
8565
8566
8567
8568 remove_some = true;
8569 set_bit(RemoveSynchronized, &rdev->flags);
8570 }
8571 }
8572
8573 if (remove_some)
8574 synchronize_rcu();
8575 rdev_for_each(rdev, mddev) {
8576 if ((this == NULL || rdev == this) &&
8577 rdev->raid_disk >= 0 &&
8578 !test_bit(Blocked, &rdev->flags) &&
8579 ((test_bit(RemoveSynchronized, &rdev->flags) ||
8580 (!test_bit(In_sync, &rdev->flags) &&
8581 !test_bit(Journal, &rdev->flags))) &&
8582 atomic_read(&rdev->nr_pending)==0)) {
8583 if (mddev->pers->hot_remove_disk(
8584 mddev, rdev) == 0) {
8585 sysfs_unlink_rdev(mddev, rdev);
8586 rdev->raid_disk = -1;
8587 removed++;
8588 }
8589 }
8590 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
8591 clear_bit(RemoveSynchronized, &rdev->flags);
8592 }
8593
8594 if (removed && mddev->kobj.sd)
8595 sysfs_notify(&mddev->kobj, NULL, "degraded");
8596
8597 if (this && removed)
8598 goto no_add;
8599
8600 rdev_for_each(rdev, mddev) {
8601 if (this && this != rdev)
8602 continue;
8603 if (test_bit(Candidate, &rdev->flags))
8604 continue;
8605 if (rdev->raid_disk >= 0 &&
8606 !test_bit(In_sync, &rdev->flags) &&
8607 !test_bit(Journal, &rdev->flags) &&
8608 !test_bit(Faulty, &rdev->flags))
8609 spares++;
8610 if (rdev->raid_disk >= 0)
8611 continue;
8612 if (test_bit(Faulty, &rdev->flags))
8613 continue;
8614 if (!test_bit(Journal, &rdev->flags)) {
8615 if (mddev->ro &&
8616 ! (rdev->saved_raid_disk >= 0 &&
8617 !test_bit(Bitmap_sync, &rdev->flags)))
8618 continue;
8619
8620 rdev->recovery_offset = 0;
8621 }
8622 if (mddev->pers->
8623 hot_add_disk(mddev, rdev) == 0) {
8624 if (sysfs_link_rdev(mddev, rdev))
8625 ; /* failure to create the sysfs link is not fatal */
8626 if (!test_bit(Journal, &rdev->flags))
8627 spares++;
8628 md_new_event(mddev);
8629 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8630 }
8631 }
8632no_add:
8633 if (removed)
8634 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8635 return spares;
8636}
8637
8638static void md_start_sync(struct work_struct *ws)
8639{
8640 struct mddev *mddev = container_of(ws, struct mddev, del_work);
8641
8642 mddev->sync_thread = md_register_thread(md_do_sync,
8643 mddev,
8644 "resync");
8645 if (!mddev->sync_thread) {
8646 pr_warn("%s: could not start resync thread...\n",
8647 mdname(mddev));
8648
8649 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8650 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8651 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8652 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8653 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8654 wake_up(&resync_wait);
8655 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8656 &mddev->recovery))
8657 if (mddev->sysfs_action)
8658 sysfs_notify_dirent_safe(mddev->sysfs_action);
8659 } else
8660 md_wakeup_thread(mddev->sync_thread);
8661 sysfs_notify_dirent_safe(mddev->sysfs_action);
8662 md_new_event(mddev);
8663}
8664
8665
8686
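/*
 * md_check_recovery() is called regularly (typically from each
 * personality's main thread) to handle the generic housekeeping visible
 * in the code below: run the bitmap daemon work, honour "immediate safe
 * mode" on signals, write out a dirty superblock, reap a finished sync
 * thread, remove failed devices and add spares, and, when recovery or
 * resync is needed, queue md_start_sync() to fork off the sync thread.
 */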
8687void md_check_recovery(struct mddev *mddev)
8688{
8689 if (mddev->suspended)
8690 return;
8691
8692 if (mddev->bitmap)
8693 bitmap_daemon_work(mddev);
8694
8695 if (signal_pending(current)) {
8696 if (mddev->pers->sync_request && !mddev->external) {
8697 pr_debug("md: %s in immediate safe mode\n",
8698 mdname(mddev));
8699 mddev->safemode = 2;
8700 }
8701 flush_signals(current);
8702 }
8703
8704 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
8705 return;
8706 if ( ! (
8707 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
8708 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8709 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8710 (mddev->external == 0 && mddev->safemode == 1) ||
8711 (mddev->safemode == 2
8712 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
8713 ))
8714 return;
8715
8716 if (mddev_trylock(mddev)) {
8717 int spares = 0;
8718
8719 if (!mddev->external && mddev->safemode == 1)
8720 mddev->safemode = 0;
8721
8722 if (mddev->ro) {
8723 struct md_rdev *rdev;
8724 if (!mddev->external && mddev->in_sync)
8725
8726
8727
8728
8729
8730 rdev_for_each(rdev, mddev)
8731 clear_bit(Blocked, &rdev->flags);
8732
8733
8734
8735
8736
8737
8738
8739 remove_and_add_spares(mddev, NULL);
8740
8741
8742
8743 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8744 md_reap_sync_thread(mddev);
8745 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8746 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8747 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8748 goto unlock;
8749 }
8750
8751 if (mddev_is_clustered(mddev)) {
8752 struct md_rdev *rdev;
8753
8754
8755
8756 rdev_for_each(rdev, mddev) {
8757 if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
8758 rdev->raid_disk < 0)
8759 md_kick_rdev_from_array(rdev);
8760 }
8761 }
8762
8763 if (!mddev->external && !mddev->in_sync) {
8764 spin_lock(&mddev->lock);
8765 set_in_sync(mddev);
8766 spin_unlock(&mddev->lock);
8767 }
8768
8769 if (mddev->sb_flags)
8770 md_update_sb(mddev, 0);
8771
8772 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
8773 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
8774
8775 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8776 goto unlock;
8777 }
8778 if (mddev->sync_thread) {
8779 md_reap_sync_thread(mddev);
8780 goto unlock;
8781 }
8782
8783
8784
8785 mddev->curr_resync_completed = 0;
8786 spin_lock(&mddev->lock);
8787 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8788 spin_unlock(&mddev->lock);
8789
8790
8791
8792 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
8793 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8794
8795 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8796 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
8797 goto not_running;
8798
8799
8800
8801
8802
8803
8804
8805 if (mddev->reshape_position != MaxSector) {
8806 if (mddev->pers->check_reshape == NULL ||
8807 mddev->pers->check_reshape(mddev) != 0)
8808
8809 goto not_running;
8810 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8811 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8812 } else if ((spares = remove_and_add_spares(mddev, NULL))) {
8813 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8814 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8815 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8816 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8817 } else if (mddev->recovery_cp < MaxSector) {
8818 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8819 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8820 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
8821
8822 goto not_running;
8823
8824 if (mddev->pers->sync_request) {
8825 if (spares) {
8826
8827
8828
8829
8830 bitmap_write_all(mddev->bitmap);
8831 }
8832 INIT_WORK(&mddev->del_work, md_start_sync);
8833 queue_work(md_misc_wq, &mddev->del_work);
8834 goto unlock;
8835 }
8836 not_running:
8837 if (!mddev->sync_thread) {
8838 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8839 wake_up(&resync_wait);
8840 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8841 &mddev->recovery))
8842 if (mddev->sysfs_action)
8843 sysfs_notify_dirent_safe(mddev->sysfs_action);
8844 }
8845 unlock:
8846 wake_up(&mddev->sb_wait);
8847 mddev_unlock(mddev);
8848 } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
8849
8850
8851
8852 set_bit(MD_UPDATING_SB, &mddev->flags);
8853 smp_mb__after_atomic();
8854 if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
8855 md_update_sb(mddev, 0);
8856 clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
8857 wake_up(&mddev->sb_wait);
8858 }
8859}
8860EXPORT_SYMBOL(md_check_recovery);
8861
8862void md_reap_sync_thread(struct mddev *mddev)
8863{
8864 struct md_rdev *rdev;
8865
8866
8867 md_unregister_thread(&mddev->sync_thread);
8868 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8869 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8870
8871
8872 if (mddev->pers->spare_active(mddev)) {
8873 sysfs_notify(&mddev->kobj, NULL,
8874 "degraded");
8875 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8876 }
8877 }
8878 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8879 mddev->pers->finish_reshape)
8880 mddev->pers->finish_reshape(mddev);
8881
8882
8883
8884
8885 if (!mddev->degraded)
8886 rdev_for_each(rdev, mddev)
8887 rdev->saved_raid_disk = -1;
8888
8889 md_update_sb(mddev, 1);
8890
8891
8892
8893 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
8894 md_cluster_ops->resync_finish(mddev);
8895 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8896 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8897 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8898 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8899 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8900 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8901 wake_up(&resync_wait);
8902
8903 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8904 sysfs_notify_dirent_safe(mddev->sysfs_action);
8905 md_new_event(mddev);
8906 if (mddev->event_work.func)
8907 queue_work(md_misc_wq, &mddev->event_work);
8908}
8909EXPORT_SYMBOL(md_reap_sync_thread);
8910
8911void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
8912{
8913 sysfs_notify_dirent_safe(rdev->sysfs_state);
8914 wait_event_timeout(rdev->blocked_wait,
8915 !test_bit(Blocked, &rdev->flags) &&
8916 !test_bit(BlockedBadBlocks, &rdev->flags),
8917 msecs_to_jiffies(5000));
8918 rdev_dec_pending(rdev, mddev);
8919}
8920EXPORT_SYMBOL(md_wait_for_blocked_rdev);
8921
8922void md_finish_reshape(struct mddev *mddev)
8923{
8924
8925 struct md_rdev *rdev;
8926
8927 rdev_for_each(rdev, mddev) {
8928 if (rdev->data_offset > rdev->new_data_offset)
8929 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
8930 else
8931 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
8932 rdev->data_offset = rdev->new_data_offset;
8933 }
8934}
8935EXPORT_SYMBOL(md_finish_reshape);
8936
8937
8938
8939
8940int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8941 int is_new)
8942{
8943 struct mddev *mddev = rdev->mddev;
8944 int rv;
8945 if (is_new)
8946 s += rdev->new_data_offset;
8947 else
8948 s += rdev->data_offset;
8949 rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
8950 if (rv == 0) {
8951
8952 if (test_bit(ExternalBbl, &rdev->flags))
8953 sysfs_notify(&rdev->kobj, NULL,
8954 "unacknowledged_bad_blocks");
8955 sysfs_notify_dirent_safe(rdev->sysfs_state);
8956 set_mask_bits(&mddev->sb_flags, 0,
8957 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
8958 md_wakeup_thread(rdev->mddev->thread);
8959 return 1;
8960 } else
8961 return 0;
8962}
8963EXPORT_SYMBOL_GPL(rdev_set_badblocks);
8964
8965int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8966 int is_new)
8967{
8968 int rv;
8969 if (is_new)
8970 s += rdev->new_data_offset;
8971 else
8972 s += rdev->data_offset;
8973 rv = badblocks_clear(&rdev->badblocks, s, sectors);
8974 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
8975 sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
8976 return rv;
8977}
8978EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8979
8980static int md_notify_reboot(struct notifier_block *this,
8981 unsigned long code, void *x)
8982{
8983 struct list_head *tmp;
8984 struct mddev *mddev;
8985 int need_delay = 0;
8986
8987 for_each_mddev(mddev, tmp) {
8988 if (mddev_trylock(mddev)) {
8989 if (mddev->pers)
8990 __md_stop_writes(mddev);
8991 if (mddev->persistent)
8992 mddev->safemode = 2;
8993 mddev_unlock(mddev);
8994 }
8995 need_delay = 1;
8996 }
8997
8998
8999
9000
9001
9002
9003 if (need_delay)
9004 mdelay(1000*1);
9005
9006 return NOTIFY_DONE;
9007}
9008
9009static struct notifier_block md_notifier = {
9010 .notifier_call = md_notify_reboot,
9011 .next = NULL,
9012 .priority = INT_MAX,
9013};
9014
9015static void md_geninit(void)
9016{
9017 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9018
9019 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
9020}
9021
9022static int __init md_init(void)
9023{
9024 int ret = -ENOMEM;
9025
9026 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9027 if (!md_wq)
9028 goto err_wq;
9029
9030 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9031 if (!md_misc_wq)
9032 goto err_misc_wq;
9033
9034 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
9035 goto err_md;
9036
9037 if ((ret = register_blkdev(0, "mdp")) < 0)
9038 goto err_mdp;
9039 mdp_major = ret;
9040
9041 blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
9042 md_probe, NULL, NULL);
9043 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
9044 md_probe, NULL, NULL);
9045
9046 register_reboot_notifier(&md_notifier);
9047 raid_table_header = register_sysctl_table(raid_root_table);
9048
9049 md_geninit();
9050 return 0;
9051
9052err_mdp:
9053 unregister_blkdev(MD_MAJOR, "md");
9054err_md:
9055 destroy_workqueue(md_misc_wq);
9056err_misc_wq:
9057 destroy_workqueue(md_wq);
9058err_wq:
9059 return ret;
9060}
9061
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2;
	int role, ret;
	char b[BDEVNAME_SIZE];

	/*
	 * If size is changed in another node then we need to
	 * do resize as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			bitmap_update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each(rdev2, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == 0xfffe) {
				pr_info("md: Removing Candidate device %s because add failed\n",
					bdevname(rdev2->bdev, b));
				md_kick_rdev_from_array(rdev2);
				continue;
			}
			else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/* got activated */
			if (rdev2->raid_disk == -1 && role != 0xffff) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %s\n",
					bdevname(rdev2->bdev, b));
				/* wakeup mddev->thread here, so array could
				 * perform resync with the new activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* device faulty: do the minimum needed to mark the
			 * disk as faulty.  The recovery itself is performed
			 * by the node that initiated the error.
			 */
			if ((role == 0xfffe) || (role == 0xfffd)) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
		update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));

	/* Finally set the event to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

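/*
 * Re-read the superblock of a single member device after another node
 * updated it.  On failure the previously cached superblock page is put
 * back so the rdev is left unchanged.
 */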
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/*
	 * Pick up the recovery offset recorded by the other node,
	 * if the superblock carries one.
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * device In_sync and mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	put_page(swapout);
	return 0;
}

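/*
 * Re-read superblocks after another node rewrote the metadata: reload
 * the device identified by @nr first, apply whatever changed, then
 * refresh every other member.
 */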
void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(rdev, mddev) {
		if (rdev->desc_nr == nr)
			break;
	}

	if (!rdev || rdev->desc_nr != nr) {
		pr_warn("%s: %d Could not find rdev with nr %d\n",
			__func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdev's to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev)
		read_rdev(mddev, rdev);
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

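/*
 * Remember a partition detected at boot so autostart_arrays() can try
 * to assemble it later.  If the allocation fails the device is simply
 * not autodetected.
 */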
void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

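/*
 * Drain the list built by md_autodetect_dev(): import each device,
 * queue the usable ones on pending_raid_disks and let autorun_devices()
 * assemble them into arrays.
 */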
static void autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		/* md_import_device() can block while opening the device,
		 * so drop the mutex around the call */
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

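/*
 * Module unload: unregister the block majors, the reboot notifier, the
 * sysctls and /proc/mdstat, wake up anyone still polling for md events,
 * then tear down all remaining arrays and the workqueues.
 */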
static __exit void md_exit(void)
{
	struct mddev *mddev;
	struct list_head *tmp;
	int delay = 1;

	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);

	unregister_blkdev(MD_MAJOR,"md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the module while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	for_each_mddev(mddev, tmp) {
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * for_each_mddev() calls mddev_put() at the end of each
		 * iteration.  As the mddev is now fully clear, this
		 * schedules the mddev for destruction on a workqueue, and
		 * the destroy_workqueue() below waits for that to finish.
		 */
	}
	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

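/*
 * Module parameters.  start_ro makes newly started arrays come up
 * "auto-read-only" until the first write; create_on_open controls
 * whether opening a not-yet-existing md node creates it.
 */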
static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);