// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid5.c : Multiple Devices driver for Linux
 *
 * RAID-4/5/6 management functions.
 *
 * BITMAP UNPLUGGING:
 * Bitmap updates are grouped into numbered batches.  conf->seq_write is the
 * last batch successfully written and conf->seq_flush the last batch closed
 * to new additions.  A stripe that needs a bitmap update records its batch
 * number in sh->bm_seq and is held on conf->bitmap_list until that batch has
 * been written out.
 */
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>

#include <trace/events/block.h>
#include <linux/list_sort.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "md-bitmap.h"
#include "raid5-log.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

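/* Map a stripe's start sector to its bucket in conf->stripe_hashtbl */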
70static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
71{
72 int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
73 return &conf->stripe_hashtbl[hash];
74}
75
76static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
77{
78 return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
79}
80
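/* Take one stripe hash lock together with the device lock (IRQs disabled) */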
81static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
82{
83 spin_lock_irq(conf->hash_locks + hash);
84 spin_lock(&conf->device_lock);
85}
86
87static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
88{
89 spin_unlock(&conf->device_lock);
90 spin_unlock_irq(conf->hash_locks + hash);
91}
92
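/* Take all stripe hash locks (nested under the first) plus the device lock */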
93static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
94{
95 int i;
96 spin_lock_irq(conf->hash_locks);
97 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
98 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
99 spin_lock(&conf->device_lock);
100}
101
102static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
103{
104 int i;
105 spin_unlock(&conf->device_lock);
106 for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
107 spin_unlock(conf->hash_locks + i);
108 spin_unlock_irq(conf->hash_locks);
109}

/* Find the first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from the first device */
		return 0;
	/* md starts just after the Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
123static inline int raid6_next_disk(int disk, int raid_disks)
124{
125 disk++;
126 return (disk < raid_disks) ? disk : 0;
127}

/*
 * When walking through the disks in a raid5/6 stripe, starting at raid6_d0,
 * each disk must be mapped to a 'slot': data disks occupy slots
 * 0 .. syndrome_disks-1, the parity disk is slot syndrome_disks and the Q
 * disk is slot syndrome_disks+1.  With the DDF layout *count advances for
 * every disk, so data slots follow the physical order; with the md layout
 * *count advances only for data disks, packing them densely.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}
149
150static void print_raid5_conf (struct r5conf *conf);
151
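/* True while any asynchronous stripe operation (check, reconstruct, biofill or compute) is still running */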
152static int stripe_operations_active(struct stripe_head *sh)
153{
154 return sh->check_state || sh->reconstruct_state ||
155 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
156 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
157}
158
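/* Stripes being written back out of the r5c cache (not currently caching) are handled at low priority */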
159static bool stripe_is_lowprio(struct stripe_head *sh)
160{
161 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
162 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
163 !test_bit(STRIPE_R5C_CACHING, &sh->state);
164}
165
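/* Queue the stripe for its NUMA group's worker threads, waking extra workers if the backlog is large */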
166static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
167{
168 struct r5conf *conf = sh->raid_conf;
169 struct r5worker_group *group;
170 int thread_cnt;
171 int i, cpu = sh->cpu;
172
173 if (!cpu_online(cpu)) {
174 cpu = cpumask_any(cpu_online_mask);
175 sh->cpu = cpu;
176 }
177
178 if (list_empty(&sh->lru)) {
179 struct r5worker_group *group;
180 group = conf->worker_groups + cpu_to_group(cpu);
181 if (stripe_is_lowprio(sh))
182 list_add_tail(&sh->lru, &group->loprio_list);
183 else
184 list_add_tail(&sh->lru, &group->handle_list);
185 group->stripes_cnt++;
186 sh->group = group;
187 }
188
189 if (conf->worker_cnt_per_group == 0) {
190 md_wakeup_thread(conf->mddev->thread);
191 return;
192 }
193
194 group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* wakeup worker thread */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
202 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
203 if (group->workers[i].working == false) {
204 group->workers[i].working = true;
205 queue_work_on(sh->cpu, raid5_wq,
206 &group->workers[i].work);
207 thread_cnt--;
208 }
209 }
210}
211
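/* Called with conf->device_lock held: requeue the stripe for handling, or move it to an inactive or r5c cached list */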
212static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
213 struct list_head *temp_inactive_list)
214{
215 int i;
216 int injournal = 0;
217
218 BUG_ON(!list_empty(&sh->lru));
219 BUG_ON(atomic_read(&conf->active_stripes)==0);
220
221 if (r5c_is_writeback(conf->log))
222 for (i = sh->disks; i--; )
223 if (test_bit(R5_InJournal, &sh->dev[i].flags))
224 injournal++;

	/*
	 * In the following cases, the stripe cannot be released to cached
	 * lists.  Therefore, we make the stripe write out and set
	 * STRIPE_HANDLE:
	 *
	 *   1. when quiesce in r5c write back;
	 *   2. when resync is requested for the stripe.
	 */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
	    (conf->quiesce && r5c_is_writeback(conf->log) &&
	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}
239
240 if (test_bit(STRIPE_HANDLE, &sh->state)) {
241 if (test_bit(STRIPE_DELAYED, &sh->state) &&
242 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
243 list_add_tail(&sh->lru, &conf->delayed_list);
244 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
245 sh->bm_seq - conf->seq_write > 0)
246 list_add_tail(&sh->lru, &conf->bitmap_list);
247 else {
248 clear_bit(STRIPE_DELAYED, &sh->state);
249 clear_bit(STRIPE_BIT_DELAY, &sh->state);
250 if (conf->worker_cnt_per_group == 0) {
251 if (stripe_is_lowprio(sh))
252 list_add_tail(&sh->lru,
253 &conf->loprio_list);
254 else
255 list_add_tail(&sh->lru,
256 &conf->handle_list);
257 } else {
258 raid5_wakeup_stripe_thread(sh);
259 return;
260 }
261 }
262 md_wakeup_thread(conf->mddev->thread);
263 } else {
264 BUG_ON(stripe_operations_active(sh));
265 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
266 if (atomic_dec_return(&conf->preread_active_stripes)
267 < IO_THRESHOLD)
268 md_wakeup_thread(conf->mddev->thread);
269 atomic_dec(&conf->active_stripes);
270 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
271 if (!r5c_is_writeback(conf->log))
272 list_add_tail(&sh->lru, temp_inactive_list);
273 else {
274 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
275 if (injournal == 0)
276 list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
293 }
294 }
295}
296
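/* Drop one reference; the final put is handed to do_release_stripe() */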
297static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
298 struct list_head *temp_inactive_list)
299{
300 if (atomic_dec_and_test(&sh->count))
301 do_release_stripe(conf, sh, temp_inactive_list);
302}

/*
 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
 *
 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
 * the same time.  Otherwise, temp_inactive_list would be a no-lock list.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
314{
315 int size;
316 bool do_wakeup = false;
317 unsigned long flags;
318
319 if (hash == NR_STRIPE_HASH_LOCKS) {
320 size = NR_STRIPE_HASH_LOCKS;
321 hash = NR_STRIPE_HASH_LOCKS - 1;
322 } else
323 size = 1;
324 while (size) {
325 struct list_head *list = &temp_inactive_list[size - 1];
		/*
		 * We don't hold any lock here yet, raid5_get_active_stripe()
		 * might remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
332 spin_lock_irqsave(conf->hash_locks + hash, flags);
333 if (list_empty(conf->inactive_list + hash) &&
334 !list_empty(list))
335 atomic_dec(&conf->empty_inactive_list_nr);
336 list_splice_tail_init(list, conf->inactive_list + hash);
337 do_wakeup = true;
338 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
339 }
340 size--;
341 hash--;
342 }
343
344 if (do_wakeup) {
345 wake_up(&conf->wait_for_stripe);
346 if (atomic_read(&conf->active_stripes) == 0)
347 wake_up(&conf->wait_for_quiescent);
348 if (conf->retry_read_aligned)
349 md_wakeup_thread(conf->mddev->thread);
350 }
351}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
357 struct stripe_head *sh, *t;
358 int count = 0;
359 struct llist_node *head;
360
361 head = llist_del_all(&conf->released_stripes);
362 head = llist_reverse_order(head);
363 llist_for_each_entry_safe(sh, t, head, release_list) {
364 int hash;

		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry that the bit is set here, because if it is set
		 * again, the count is always > 1.  This is true for
		 * the STRIPE_ON_BATCH_LIST bit too.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
376 count++;
377 }
378
379 return count;
380}
381
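/* Fast path: push the stripe onto the lockless released_stripes list and wake raid5d; otherwise take device_lock for the final put */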
382void raid5_release_stripe(struct stripe_head *sh)
383{
384 struct r5conf *conf = sh->raid_conf;
385 unsigned long flags;
386 struct list_head list;
387 int hash;
388 bool wakeup;

	/* Avoid release_list until the last reference */
	if (atomic_add_unless(&sh->count, -1, 1))
		return;
394
395 if (unlikely(!conf->mddev->thread) ||
396 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
397 goto slow_path;
398 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
399 if (wakeup)
400 md_wakeup_thread(conf->mddev->thread);
401 return;
slow_path:
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
405 INIT_LIST_HEAD(&list);
406 hash = sh->hash_lock_index;
407 do_release_stripe(conf, sh, &list);
408 spin_unlock_irqrestore(&conf->device_lock, flags);
409 release_inactive_stripe_list(conf, &list, hash);
410 }
411}
412
413static inline void remove_hash(struct stripe_head *sh)
414{
415 pr_debug("remove_hash(), stripe %llu\n",
416 (unsigned long long)sh->sector);
417
418 hlist_del_init(&sh->hash);
419}
420
421static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
422{
423 struct hlist_head *hp = stripe_hash(conf, sh->sector);
424
425 pr_debug("insert_hash(), stripe %llu\n",
426 (unsigned long long)sh->sector);
427
428 hlist_add_head(&sh->hash, hp);
429}

/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
434 struct stripe_head *sh = NULL;
435 struct list_head *first;
436
437 if (list_empty(conf->inactive_list + hash))
438 goto out;
439 first = (conf->inactive_list + hash)->next;
440 sh = list_entry(first, struct stripe_head, lru);
441 list_del_init(first);
442 remove_hash(sh);
443 atomic_inc(&conf->active_stripes);
444 BUG_ON(hash != sh->hash_lock_index);
445 if (list_empty(conf->inactive_list + hash))
446 atomic_inc(&conf->empty_inactive_list_nr);
447out:
448 return sh;
449}
450
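/* With PAGE_SIZE larger than the stripe size, several stripe units share each page */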
451#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
452static void free_stripe_pages(struct stripe_head *sh)
453{
454 int i;
455 struct page *p;

	/* The page pool has not been allocated yet */
	if (!sh->pages)
		return;
460
461 for (i = 0; i < sh->nr_pages; i++) {
462 p = sh->pages[i];
463 if (p)
464 put_page(p);
465 sh->pages[i] = NULL;
466 }
467}
468
469static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
470{
471 int i;
472 struct page *p;
473
	for (i = 0; i < sh->nr_pages; i++) {
		/* The page has already been allocated */
		if (sh->pages[i])
			continue;
478
479 p = alloc_page(gfp);
480 if (!p) {
481 free_stripe_pages(sh);
482 return -ENOMEM;
483 }
484 sh->pages[i] = p;
485 }
486 return 0;
487}
488
489static int
490init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
491{
492 int nr_pages, cnt;
493
494 if (sh->pages)
495 return 0;

	/* Each sh->dev[i] needs conf->stripe_size bytes, so cnt of them share one page */
	cnt = PAGE_SIZE / conf->stripe_size;
	nr_pages = (disks + cnt - 1) / cnt;
500
501 sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
502 if (!sh->pages)
503 return -ENOMEM;
504 sh->nr_pages = nr_pages;
505 sh->stripes_per_page = cnt;
506 return 0;
507}
508#endif
509
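/* Release the pages backing a stripe_head's devices */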
510static void shrink_buffers(struct stripe_head *sh)
511{
512 int i;
513 int num = sh->raid_conf->pool_size;
514
515#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
516 for (i = 0; i < num ; i++) {
517 struct page *p;
518
519 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
520 p = sh->dev[i].page;
521 if (!p)
522 continue;
523 sh->dev[i].page = NULL;
524 put_page(p);
525 }
526#else
527 for (i = 0; i < num; i++)
528 sh->dev[i].page = NULL;
529 free_stripe_pages(sh);
530#endif
531}
532
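/* Allocate (or map from the shared pool) one page per device of the stripe */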
533static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
534{
535 int i;
536 int num = sh->raid_conf->pool_size;
537
538#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
539 for (i = 0; i < num; i++) {
540 struct page *page;
541
542 if (!(page = alloc_page(gfp))) {
543 return 1;
544 }
545 sh->dev[i].page = page;
546 sh->dev[i].orig_page = page;
547 sh->dev[i].offset = 0;
548 }
549#else
550 if (alloc_stripe_pages(sh, gfp))
551 return -ENOMEM;
552
553 for (i = 0; i < num; i++) {
554 sh->dev[i].page = raid5_get_dev_page(sh, i);
555 sh->dev[i].orig_page = sh->dev[i].page;
556 sh->dev[i].offset = raid5_get_page_offset(sh, i);
557 }
558#endif
559 return 0;
560}
561
562static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
563 struct stripe_head *sh);
564
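/* (Re)initialise a stripe_head for @sector; the gen_lock seqcount guards against a concurrent reshape changing the geometry */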
565static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
566{
567 struct r5conf *conf = sh->raid_conf;
568 int i, seq;
569
570 BUG_ON(atomic_read(&sh->count) != 0);
571 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
572 BUG_ON(stripe_operations_active(sh));
573 BUG_ON(sh->batch_head);
574
575 pr_debug("init_stripe called, stripe %llu\n",
576 (unsigned long long)sector);
577retry:
578 seq = read_seqcount_begin(&conf->gen_lock);
579 sh->generation = conf->generation - previous;
580 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
581 sh->sector = sector;
582 stripe_set_idx(sector, conf, previous, sh);
583 sh->state = 0;
584
585 for (i = sh->disks; i--; ) {
586 struct r5dev *dev = &sh->dev[i];
587
588 if (dev->toread || dev->read || dev->towrite || dev->written ||
589 test_bit(R5_LOCKED, &dev->flags)) {
590 pr_err("sector=%llx i=%d %p %p %p %p %d\n",
591 (unsigned long long)sh->sector, i, dev->toread,
592 dev->read, dev->towrite, dev->written,
593 test_bit(R5_LOCKED, &dev->flags));
594 WARN_ON(1);
595 }
596 dev->flags = 0;
597 dev->sector = raid5_compute_blocknr(sh, i, previous);
598 }
599 if (read_seqcount_retry(&conf->gen_lock, seq))
600 goto retry;
601 sh->overwrite_disks = 0;
602 insert_hash(conf, sh);
603 sh->cpu = smp_processor_id();
604 set_bit(STRIPE_BATCH_READY, &sh->state);
605}
606
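/* Look up a stripe in the hash table; the caller holds the corresponding hash lock */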
607static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
608 short generation)
609{
610 struct stripe_head *sh;
611
612 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
613 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
614 if (sh->sector == sector && sh->generation == generation)
615 return sh;
616 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
617 return NULL;
618}

/*
 * Need to check whether the array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.  However if
 * there is a reshape, we need to carefully check both the before and after
 * sections: some failed devices may only affect one of the two sections,
 * and some non-in_sync devices may be in_sync in the section most affected
 * by failed devices.
 */
int raid5_calc_degraded(struct r5conf *conf)
634{
635 int degraded, degraded2;
636 int i;
637
638 rcu_read_lock();
639 degraded = 0;
640 for (i = 0; i < conf->previous_raid_disks; i++) {
641 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
642 if (rdev && test_bit(Faulty, &rdev->flags))
643 rdev = rcu_dereference(conf->disks[i].replacement);
644 if (!rdev || test_bit(Faulty, &rdev->flags))
645 degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in_sync or faulty.
			 * If the reshape increases the number of devices,
			 * this section is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in_sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
660 }
661 rcu_read_unlock();
662 if (conf->raid_disks == conf->previous_raid_disks)
663 return degraded;
664 rcu_read_lock();
665 degraded2 = 0;
666 for (i = 0; i < conf->raid_disks; i++) {
667 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
668 if (rdev && test_bit(Faulty, &rdev->flags))
669 rdev = rcu_dereference(conf->disks[i].replacement);
670 if (!rdev || test_bit(Faulty, &rdev->flags))
671 degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in_sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
682 }
683 rcu_read_unlock();
684 if (degraded2 > degraded)
685 return degraded2;
686 return degraded;
687}
688
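/* Have more devices failed than the array can tolerate? */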
689static int has_failed(struct r5conf *conf)
690{
691 int degraded;
692
693 if (conf->mddev->reshape_position == MaxSector)
694 return conf->mddev->degraded > conf->max_degraded;
695
696 degraded = raid5_calc_degraded(conf);
697 if (degraded > conf->max_degraded)
698 return 1;
699 return 0;
700}
701
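/* Get the stripe_head covering @sector, waiting for an inactive stripe to become available unless @noblock is set */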
702struct stripe_head *
703raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
704 int previous, int noblock, int noquiesce)
705{
706 struct stripe_head *sh;
707 int hash = stripe_hash_locks_hash(conf, sector);
708 int inc_empty_inactive_list_flag;
709
710 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
711
712 spin_lock_irq(conf->hash_locks + hash);
713
714 do {
715 wait_event_lock_irq(conf->wait_for_quiescent,
716 conf->quiesce == 0 || noquiesce,
717 *(conf->hash_locks + hash));
718 sh = __find_stripe(conf, sector, conf->generation - previous);
719 if (!sh) {
720 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
721 sh = get_free_stripe(conf, hash);
722 if (!sh && !test_bit(R5_DID_ALLOC,
723 &conf->cache_state))
724 set_bit(R5_ALLOC_MORE,
725 &conf->cache_state);
726 }
727 if (noblock && sh == NULL)
728 break;
729
730 r5c_check_stripe_cache_usage(conf);
731 if (!sh) {
732 set_bit(R5_INACTIVE_BLOCKED,
733 &conf->cache_state);
734 r5l_wake_reclaim(conf->log, 0);
735 wait_event_lock_irq(
736 conf->wait_for_stripe,
737 !list_empty(conf->inactive_list + hash) &&
738 (atomic_read(&conf->active_stripes)
739 < (conf->max_nr_stripes * 3 / 4)
740 || !test_bit(R5_INACTIVE_BLOCKED,
741 &conf->cache_state)),
742 *(conf->hash_locks + hash));
743 clear_bit(R5_INACTIVE_BLOCKED,
744 &conf->cache_state);
745 } else {
746 init_stripe(sh, sector, previous);
747 atomic_inc(&sh->count);
748 }
749 } else if (!atomic_inc_not_zero(&sh->count)) {
750 spin_lock(&conf->device_lock);
751 if (!atomic_read(&sh->count)) {
752 if (!test_bit(STRIPE_HANDLE, &sh->state))
753 atomic_inc(&conf->active_stripes);
754 BUG_ON(list_empty(&sh->lru) &&
755 !test_bit(STRIPE_EXPANDING, &sh->state));
756 inc_empty_inactive_list_flag = 0;
757 if (!list_empty(conf->inactive_list + hash))
758 inc_empty_inactive_list_flag = 1;
759 list_del_init(&sh->lru);
760 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
761 atomic_inc(&conf->empty_inactive_list_nr);
762 if (sh->group) {
763 sh->group->stripes_cnt--;
764 sh->group = NULL;
765 }
766 }
767 atomic_inc(&sh->count);
768 spin_unlock(&conf->device_lock);
769 }
770 } while (sh == NULL);
771
772 spin_unlock_irq(conf->hash_locks + hash);
773 return sh;
774}
775
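/* True when every data block of the stripe is being overwritten */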
776static bool is_full_stripe_write(struct stripe_head *sh)
777{
778 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
779 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
780}
781
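/* Lock two stripes in address order to avoid ABBA deadlocks */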
782static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
783 __acquires(&sh1->stripe_lock)
784 __acquires(&sh2->stripe_lock)
785{
786 if (sh1 > sh2) {
787 spin_lock_irq(&sh2->stripe_lock);
788 spin_lock_nested(&sh1->stripe_lock, 1);
789 } else {
790 spin_lock_irq(&sh1->stripe_lock);
791 spin_lock_nested(&sh2->stripe_lock, 1);
792 }
793}
794
795static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
796 __releases(&sh1->stripe_lock)
797 __releases(&sh2->stripe_lock)
798{
799 spin_unlock(&sh1->stripe_lock);
800 spin_unlock_irq(&sh2->stripe_lock);
801}

/* Only a freshly created, full-stripe, normal write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
807
808 if (raid5_has_log(conf) || raid5_has_ppl(conf))
809 return false;
810 return test_bit(STRIPE_BATCH_READY, &sh->state) &&
811 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
812 is_full_stripe_write(sh);

/* We only do back search */
static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
{
818 struct stripe_head *head;
819 sector_t head_sector, tmp_sec;
820 int hash;
821 int dd_idx;
822 int inc_empty_inactive_list_flag;

	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
	tmp_sec = sh->sector;
	if (!sector_div(tmp_sec, conf->chunk_sectors))
		return;
	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
829
830 hash = stripe_hash_locks_hash(conf, head_sector);
831 spin_lock_irq(conf->hash_locks + hash);
832 head = __find_stripe(conf, head_sector, conf->generation);
833 if (head && !atomic_inc_not_zero(&head->count)) {
834 spin_lock(&conf->device_lock);
835 if (!atomic_read(&head->count)) {
836 if (!test_bit(STRIPE_HANDLE, &head->state))
837 atomic_inc(&conf->active_stripes);
838 BUG_ON(list_empty(&head->lru) &&
839 !test_bit(STRIPE_EXPANDING, &head->state));
840 inc_empty_inactive_list_flag = 0;
841 if (!list_empty(conf->inactive_list + hash))
842 inc_empty_inactive_list_flag = 1;
843 list_del_init(&head->lru);
844 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
845 atomic_inc(&conf->empty_inactive_list_nr);
846 if (head->group) {
847 head->group->stripes_cnt--;
848 head->group = NULL;
849 }
850 }
851 atomic_inc(&head->count);
852 spin_unlock(&conf->device_lock);
853 }
854 spin_unlock_irq(conf->hash_locks + hash);
855
856 if (!head)
857 return;
858 if (!stripe_can_batch(head))
859 goto out;
860
861 lock_two_stripes(head, sh);
862
863 if (!stripe_can_batch(head) || !stripe_can_batch(sh))
864 goto unlock_out;
865
866 if (sh->batch_head)
867 goto unlock_out;
868
869 dd_idx = 0;
870 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
871 dd_idx++;
872 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
873 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
874 goto unlock_out;
875
	if (head->batch_head) {
		spin_lock(&head->batch_head->batch_lock);
		/* This batch list is already running */
		if (!stripe_can_batch(head)) {
			spin_unlock(&head->batch_head->batch_lock);
			goto unlock_out;
		}
		/*
		 * We must assign batch_head of this stripe within the
		 * batch_lock, otherwise clear_batch_ready of the batch head
		 * stripe could clear the BATCH_READY bit of this stripe if
		 * this stripe is added to a batch list before
		 * get_active_stripe() finishes.
		 */
		sh->batch_head = head->batch_head;

		/*
		 * At this point, head's BATCH_READY could be cleared, but we
		 * can still add the stripe to the batch list.
		 */
		list_add(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_head->batch_lock);
898 } else {
899 head->batch_head = head;
900 sh->batch_head = head->batch_head;
901 spin_lock(&head->batch_lock);
902 list_add_tail(&sh->batch_list, &head->batch_list);
903 spin_unlock(&head->batch_lock);
904 }
905
906 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
907 if (atomic_dec_return(&conf->preread_active_stripes)
908 < IO_THRESHOLD)
909 md_wakeup_thread(conf->mddev->thread);
910
911 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
912 int seq = sh->bm_seq;
913 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
914 sh->batch_head->bm_seq > seq)
915 seq = sh->batch_head->bm_seq;
916 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
917 sh->batch_head->bm_seq = seq;
918 }
919
920 atomic_inc(&sh->count);
921unlock_out:
922 unlock_two_stripes(head, sh);
923out:
924 raid5_release_stripe(head);
925}

/*
 * Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset, that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape, and this stripe uses the current (post-reshape)
	 * geometry, so new_data_offset applies.
	 */
	return 1;
}
947
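/* Submit every bio collected on the list */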
948static void dispatch_bio_list(struct bio_list *tmp)
949{
950 struct bio *bio;
951
952 while ((bio = bio_list_pop(tmp)))
953 submit_bio_noacct(bio);
954}
955
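/* list_sort() comparator: order deferred stripe data by starting sector */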
956static int cmp_stripe(void *priv, const struct list_head *a,
957 const struct list_head *b)
958{
959 const struct r5pending_data *da = list_entry(a,
960 struct r5pending_data, sibling);
961 const struct r5pending_data *db = list_entry(b,
962 struct r5pending_data, sibling);
963 if (da->sector > db->sector)
964 return 1;
965 if (da->sector < db->sector)
966 return -1;
967 return 0;
968}
969
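/* Move up to @target deferred entries (sorted by sector) onto @list; called with pending_bios_lock held */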
970static void dispatch_defer_bios(struct r5conf *conf, int target,
971 struct bio_list *list)
972{
973 struct r5pending_data *data;
974 struct list_head *first, *next = NULL;
975 int cnt = 0;
976
977 if (conf->pending_data_cnt == 0)
978 return;
979
980 list_sort(NULL, &conf->pending_list, cmp_stripe);
981
982 first = conf->pending_list.next;

	/* temporarily move the head */
	if (conf->next_pending_data)
		list_move_tail(&conf->pending_list,
				&conf->next_pending_data->sibling);
988
989 while (!list_empty(&conf->pending_list)) {
990 data = list_first_entry(&conf->pending_list,
991 struct r5pending_data, sibling);
992 if (&data->sibling == first)
993 first = data->sibling.next;
994 next = data->sibling.next;
995
996 bio_list_merge(list, &data->bios);
997 list_move(&data->sibling, &conf->free_list);
998 cnt++;
999 if (cnt >= target)
1000 break;
1001 }
1002 conf->pending_data_cnt -= cnt;
1003 BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
1004
1005 if (next != &conf->pending_list)
1006 conf->next_pending_data = list_entry(next,
1007 struct r5pending_data, sibling);
1008 else
1009 conf->next_pending_data = NULL;
1010
1011 if (first != &conf->pending_list)
1012 list_move_tail(&conf->pending_list, first);
1013}
1014
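/* Dispatch all deferred bios */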
1015static void flush_deferred_bios(struct r5conf *conf)
1016{
1017 struct bio_list tmp = BIO_EMPTY_LIST;
1018
1019 if (conf->pending_data_cnt == 0)
1020 return;
1021
1022 spin_lock(&conf->pending_bios_lock);
1023 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
1024 BUG_ON(conf->pending_data_cnt != 0);
1025 spin_unlock(&conf->pending_bios_lock);
1026
1027 dispatch_bio_list(&tmp);
1028}
1029
1030static void defer_issue_bios(struct r5conf *conf, sector_t sector,
1031 struct bio_list *bios)
1032{
1033 struct bio_list tmp = BIO_EMPTY_LIST;
1034 struct r5pending_data *ent;
1035
1036 spin_lock(&conf->pending_bios_lock);
1037 ent = list_first_entry(&conf->free_list, struct r5pending_data,
1038 sibling);
1039 list_move_tail(&ent->sibling, &conf->pending_list);
1040 ent->sector = sector;
1041 bio_list_init(&ent->bios);
1042 bio_list_merge(&ent->bios, bios);
1043 conf->pending_data_cnt++;
1044 if (conf->pending_data_cnt >= PENDING_IO_MAX)
1045 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
1046
1047 spin_unlock(&conf->pending_bios_lock);
1048
1049 dispatch_bio_list(&tmp);
1050}
1051
1052static void
1053raid5_end_read_request(struct bio *bi);
1054static void
1055raid5_end_write_request(struct bio *bi);
1056
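/* Issue the reads and writes decided by the stripe state machine to the rdevs and any replacements */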
1057static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
1058{
1059 struct r5conf *conf = sh->raid_conf;
1060 int i, disks = sh->disks;
1061 struct stripe_head *head_sh = sh;
1062 struct bio_list pending_bios = BIO_EMPTY_LIST;
1063 bool should_defer;
1064
1065 might_sleep();
1066
1067 if (log_stripe(sh, s) == 0)
1068 return;
1069
1070 should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1071
1072 for (i = disks; i--; ) {
1073 int op, op_flags = 0;
1074 int replace_only = 0;
1075 struct bio *bi, *rbi;
1076 struct md_rdev *rdev, *rrdev = NULL;
1077
1078 sh = head_sh;
1079 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1080 op = REQ_OP_WRITE;
1081 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1082 op_flags = REQ_FUA;
1083 if (test_bit(R5_Discard, &sh->dev[i].flags))
1084 op = REQ_OP_DISCARD;
1085 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1086 op = REQ_OP_READ;
1087 else if (test_and_clear_bit(R5_WantReplace,
1088 &sh->dev[i].flags)) {
1089 op = REQ_OP_WRITE;
1090 replace_only = 1;
1091 } else
1092 continue;
1093 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1094 op_flags |= REQ_SYNC;
1095
1096again:
1097 bi = &sh->dev[i].req;
1098 rbi = &sh->dev[i].rreq;
1099
1100 rcu_read_lock();
1101 rrdev = rcu_dereference(conf->disks[i].replacement);
1102 smp_mb();
1103 rdev = rcu_dereference(conf->disks[i].rdev);
1104 if (!rdev) {
1105 rdev = rrdev;
1106 rrdev = NULL;
1107 }
1108 if (op_is_write(op)) {
1109 if (replace_only)
1110 rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
1114 } else {
1115 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1116 rdev = rrdev;
1117 rrdev = NULL;
1118 }
1119
1120 if (rdev && test_bit(Faulty, &rdev->flags))
1121 rdev = NULL;
1122 if (rdev)
1123 atomic_inc(&rdev->nr_pending);
1124 if (rrdev && test_bit(Faulty, &rrdev->flags))
1125 rrdev = NULL;
1126 if (rrdev)
1127 atomic_inc(&rrdev->nr_pending);
1128 rcu_read_unlock();
1129
		/*
		 * We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while (op_is_write(op) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
1136 sector_t first_bad;
1137 int bad_sectors;
1138 int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
1139 &first_bad, &bad_sectors);
1140 if (!bad)
1141 break;
1142
			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->sb_flags) {
					/*
					 * It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance.
					 */
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}
1166
1167 if (rdev) {
1168 if (s->syncing || s->expanding || s->expanded
1169 || s->replacing)
1170 md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
1171
1172 set_bit(STRIPE_IO_STARTED, &sh->state);
1173
1174 bio_set_dev(bi, rdev->bdev);
1175 bio_set_op_attrs(bi, op, op_flags);
1176 bi->bi_end_io = op_is_write(op)
1177 ? raid5_end_write_request
1178 : raid5_end_read_request;
1179 bi->bi_private = sh;
1180
1181 pr_debug("%s: for %llu schedule op %d on disc %d\n",
1182 __func__, (unsigned long long)sh->sector,
1183 bi->bi_opf, i);
1184 atomic_inc(&sh->count);
1185 if (sh != head_sh)
1186 atomic_inc(&head_sh->count);
1187 if (use_new_offset(conf, sh))
1188 bi->bi_iter.bi_sector = (sh->sector
1189 + rdev->new_data_offset);
1190 else
1191 bi->bi_iter.bi_sector = (sh->sector
1192 + rdev->data_offset);
1193 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1194 bi->bi_opf |= REQ_NOMERGE;
1195
1196 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1197 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1198
			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * Issuing a read for a page in the journal:
				 * this must be preparing for prexor in rmw,
				 * so read the data into orig_page.
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
1209 bi->bi_vcnt = 1;
1210 bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1211 bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1212 bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1213 bi->bi_write_hint = sh->dev[i].write_hint;
1214 if (!rrdev)
1215 sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
			/*
			 * If this is a discard request, set bi_vcnt to 0.  We
			 * don't want to confuse SCSI because SCSI will replace
			 * the payload.
			 */
			if (op == REQ_OP_DISCARD)
				bi->bi_vcnt = 0;
1222 if (rrdev)
1223 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1224
1225 if (conf->mddev->gendisk)
1226 trace_block_bio_remap(bi,
1227 disk_devt(conf->mddev->gendisk),
1228 sh->dev[i].sector);
1229 if (should_defer && op_is_write(op))
1230 bio_list_add(&pending_bios, bi);
1231 else
1232 submit_bio_noacct(bi);
1233 }
1234 if (rrdev) {
1235 if (s->syncing || s->expanding || s->expanded
1236 || s->replacing)
1237 md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
1238
1239 set_bit(STRIPE_IO_STARTED, &sh->state);
1240
1241 bio_set_dev(rbi, rrdev->bdev);
1242 bio_set_op_attrs(rbi, op, op_flags);
1243 BUG_ON(!op_is_write(op));
1244 rbi->bi_end_io = raid5_end_write_request;
1245 rbi->bi_private = sh;
1246
1247 pr_debug("%s: for %llu schedule op %d on "
1248 "replacement disc %d\n",
1249 __func__, (unsigned long long)sh->sector,
1250 rbi->bi_opf, i);
1251 atomic_inc(&sh->count);
1252 if (sh != head_sh)
1253 atomic_inc(&head_sh->count);
1254 if (use_new_offset(conf, sh))
1255 rbi->bi_iter.bi_sector = (sh->sector
1256 + rrdev->new_data_offset);
1257 else
1258 rbi->bi_iter.bi_sector = (sh->sector
1259 + rrdev->data_offset);
1260 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1261 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1262 sh->dev[i].rvec.bv_page = sh->dev[i].page;
1263 rbi->bi_vcnt = 1;
1264 rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1265 rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1266 rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1267 rbi->bi_write_hint = sh->dev[i].write_hint;
1268 sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
			/*
			 * If this is a discard request, set bi_vcnt to 0.  We
			 * don't want to confuse SCSI because SCSI will replace
			 * the payload.
			 */
			if (op == REQ_OP_DISCARD)
				rbi->bi_vcnt = 0;
1275 if (conf->mddev->gendisk)
1276 trace_block_bio_remap(rbi,
1277 disk_devt(conf->mddev->gendisk),
1278 sh->dev[i].sector);
1279 if (should_defer && op_is_write(op))
1280 bio_list_add(&pending_bios, rbi);
1281 else
1282 submit_bio_noacct(rbi);
1283 }
1284 if (!rdev && !rrdev) {
1285 if (op_is_write(op))
1286 set_bit(STRIPE_DEGRADED, &sh->state);
1287 pr_debug("skip op %d on disc %d for sector %llu\n",
1288 bi->bi_opf, i, (unsigned long long)sh->sector);
1289 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1290 set_bit(STRIPE_HANDLE, &sh->state);
1291 }
1292
1293 if (!head_sh->batch_head)
1294 continue;
1295 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1296 batch_list);
1297 if (sh != head_sh)
1298 goto again;
1299 }
1300
1301 if (should_defer && !bio_list_empty(&pending_bios))
1302 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1303}
1304
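/* Copy between a bio and a stripe page using the async_tx API, optionally borrowing the bio page when skip_copy allows it */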
1305static struct dma_async_tx_descriptor *
1306async_copy_data(int frombio, struct bio *bio, struct page **page,
1307 unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
1308 struct stripe_head *sh, int no_skipcopy)
1309{
1310 struct bio_vec bvl;
1311 struct bvec_iter iter;
1312 struct page *bio_page;
1313 int page_offset;
1314 struct async_submit_ctl submit;
1315 enum async_tx_flags flags = 0;
1316 struct r5conf *conf = sh->raid_conf;
1317
1318 if (bio->bi_iter.bi_sector >= sector)
1319 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1320 else
1321 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1322
1323 if (frombio)
1324 flags |= ASYNC_TX_FENCE;
1325 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1326
1327 bio_for_each_segment(bvl, bio, iter) {
1328 int len = bvl.bv_len;
1329 int clen;
1330 int b_offset = 0;
1331
1332 if (page_offset < 0) {
1333 b_offset = -page_offset;
1334 page_offset += b_offset;
1335 len -= b_offset;
1336 }
1337
1338 if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1339 clen = RAID5_STRIPE_SIZE(conf) - page_offset;
1340 else
1341 clen = len;
1342
1343 if (clen > 0) {
1344 b_offset += bvl.bv_offset;
1345 bio_page = bvl.bv_page;
1346 if (frombio) {
1347 if (conf->skip_copy &&
1348 b_offset == 0 && page_offset == 0 &&
1349 clen == RAID5_STRIPE_SIZE(conf) &&
1350 !no_skipcopy)
1351 *page = bio_page;
1352 else
1353 tx = async_memcpy(*page, bio_page, page_offset + poff,
1354 b_offset, clen, &submit);
1355 } else
1356 tx = async_memcpy(bio_page, *page, b_offset,
1357 page_offset + poff, clen, &submit);
1358 }
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
1364 page_offset += len;
1365 }
1366
1367 return tx;
1368}
1369
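/* Completion of biofill: hand the read data back to the waiting bios */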
1370static void ops_complete_biofill(void *stripe_head_ref)
1371{
1372 struct stripe_head *sh = stripe_head_ref;
1373 int i;
1374 struct r5conf *conf = sh->raid_conf;
1375
1376 pr_debug("%s: stripe %llu\n", __func__,
1377 (unsigned long long)sh->sector);
1378
	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1389 struct bio *rbi, *rbi2;
1390
1391 BUG_ON(!dev->read);
1392 rbi = dev->read;
1393 dev->read = NULL;
1394 while (rbi && rbi->bi_iter.bi_sector <
1395 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1396 rbi2 = r5_next_bio(conf, rbi, dev->sector);
1397 bio_endio(rbi);
1398 rbi = rbi2;
1399 }
1400 }
1401 }
1402 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1403
1404 set_bit(STRIPE_HANDLE, &sh->state);
1405 raid5_release_stripe(sh);
1406}
1407
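/* Start async copies that satisfy queued reads from up-to-date stripe pages */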
1408static void ops_run_biofill(struct stripe_head *sh)
1409{
1410 struct dma_async_tx_descriptor *tx = NULL;
1411 struct async_submit_ctl submit;
1412 int i;
1413 struct r5conf *conf = sh->raid_conf;
1414
1415 BUG_ON(sh->batch_head);
1416 pr_debug("%s: stripe %llu\n", __func__,
1417 (unsigned long long)sh->sector);
1418
1419 for (i = sh->disks; i--; ) {
1420 struct r5dev *dev = &sh->dev[i];
1421 if (test_bit(R5_Wantfill, &dev->flags)) {
1422 struct bio *rbi;
1423 spin_lock_irq(&sh->stripe_lock);
1424 dev->read = rbi = dev->toread;
1425 dev->toread = NULL;
1426 spin_unlock_irq(&sh->stripe_lock);
1427 while (rbi && rbi->bi_iter.bi_sector <
1428 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1429 tx = async_copy_data(0, rbi, &dev->page,
1430 dev->offset,
1431 dev->sector, tx, sh, 0);
1432 rbi = r5_next_bio(conf, rbi, dev->sector);
1433 }
1434 }
1435 }
1436
1437 atomic_inc(&sh->count);
1438 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1439 async_trigger_callback(&submit);
1440}
1441
1442static void mark_target_uptodate(struct stripe_head *sh, int target)
1443{
1444 struct r5dev *tgt;
1445
1446 if (target < 0)
1447 return;
1448
1449 tgt = &sh->dev[target];
1450 set_bit(R5_UPTODATE, &tgt->flags);
1451 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1452 clear_bit(R5_Wantcompute, &tgt->flags);
1453}
1454
1455static void ops_complete_compute(void *stripe_head_ref)
1456{
1457 struct stripe_head *sh = stripe_head_ref;
1458
1459 pr_debug("%s: stripe %llu\n", __func__,
1460 (unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
1464 mark_target_uptodate(sh, sh->ops.target2);
1465
1466 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1467 if (sh->check_state == check_state_compute_run)
1468 sh->check_state = check_state_compute_result;
1469 set_bit(STRIPE_HANDLE, &sh->state);
1470 raid5_release_stripe(sh);
1471}
1472
/* return a pointer to the i-th per-cpu scribble buffer */
1474static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1475{
1476 return percpu->scribble + i * percpu->scribble_obj_size;
1477}
1478
/* return a pointer to the address conversion region of the scribble buffer */
1480static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1481 struct raid5_percpu *percpu, int i)
1482{
1483 return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1484}
1485
/*
 * Return a pointer to record offset address.
 */
1489static unsigned int *
1490to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
1491{
1492 return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
1493}
1494
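/* Rebuild one block of a RAID5 stripe by XORing all of the other blocks */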
1495static struct dma_async_tx_descriptor *
1496ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1497{
1498 int disks = sh->disks;
1499 struct page **xor_srcs = to_addr_page(percpu, 0);
1500 unsigned int *off_srcs = to_addr_offs(sh, percpu);
1501 int target = sh->ops.target;
1502 struct r5dev *tgt = &sh->dev[target];
1503 struct page *xor_dest = tgt->page;
1504 unsigned int off_dest = tgt->offset;
1505 int count = 0;
1506 struct dma_async_tx_descriptor *tx;
1507 struct async_submit_ctl submit;
1508 int i;
1509
1510 BUG_ON(sh->batch_head);
1511
1512 pr_debug("%s: stripe %llu block: %d\n",
1513 __func__, (unsigned long long)sh->sector, target);
1514 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1515
1516 for (i = disks; i--; ) {
1517 if (i != target) {
1518 off_srcs[count] = sh->dev[i].offset;
1519 xor_srcs[count++] = sh->dev[i].page;
1520 }
1521 }
1522
1523 atomic_inc(&sh->count);
1524
1525 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1526 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1527 if (unlikely(count == 1))
1528 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1529 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1530 else
1531 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1532 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1533
1534 return tx;
1535}
1536
/*
 * set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs: (struct page *) array of size sh->disks
 * @offs: (unsigned int) array of offsets for each page
 * @sh: stripe_head to parse
 * @srctype: SYNDROME_SRC_* selector for which devices to include
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
1547static int set_syndrome_sources(struct page **srcs,
1548 unsigned int *offs,
1549 struct stripe_head *sh,
1550 int srctype)
1551{
1552 int disks = sh->disks;
1553 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1554 int d0_idx = raid6_d0(sh);
1555 int count;
1556 int i;
1557
1558 for (i = 0; i < disks; i++)
1559 srcs[i] = NULL;
1560
1561 count = 0;
1562 i = d0_idx;
1563 do {
1564 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1565 struct r5dev *dev = &sh->dev[i];
1566
1567 if (i == sh->qd_idx || i == sh->pd_idx ||
1568 (srctype == SYNDROME_SRC_ALL) ||
1569 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1570 (test_bit(R5_Wantdrain, &dev->flags) ||
1571 test_bit(R5_InJournal, &dev->flags))) ||
1572 (srctype == SYNDROME_SRC_WRITTEN &&
1573 (dev->written ||
1574 test_bit(R5_InJournal, &dev->flags)))) {
1575 if (test_bit(R5_InJournal, &dev->flags))
1576 srcs[slot] = sh->dev[i].orig_page;
1577 else
1578 srcs[slot] = sh->dev[i].page;
			/*
			 * For R5_InJournal pages, PAGE_SIZE must be 4KB and
			 * the page is not shared, so dev[i].offset is 0.
			 */
			offs[slot] = sh->dev[i].offset;
1585 }
1586 i = raid6_next_disk(i, disks);
1587 } while (i != d0_idx);
1588
1589 return syndrome_disks;
1590}
1591
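/* Recompute a single missing RAID6 block: regenerate Q from the data, or rebuild a data/P block by XOR */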
1592static struct dma_async_tx_descriptor *
1593ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1594{
1595 int disks = sh->disks;
1596 struct page **blocks = to_addr_page(percpu, 0);
1597 unsigned int *offs = to_addr_offs(sh, percpu);
1598 int target;
1599 int qd_idx = sh->qd_idx;
1600 struct dma_async_tx_descriptor *tx;
1601 struct async_submit_ctl submit;
1602 struct r5dev *tgt;
1603 struct page *dest;
1604 unsigned int dest_off;
1605 int i;
1606 int count;
1607
1608 BUG_ON(sh->batch_head);
1609 if (sh->ops.target < 0)
1610 target = sh->ops.target2;
1611 else if (sh->ops.target2 < 0)
1612 target = sh->ops.target;
	else
		/* We should only have one valid target */
		BUG();
1616 BUG_ON(target < 0);
1617 pr_debug("%s: stripe %llu block: %d\n",
1618 __func__, (unsigned long long)sh->sector, target);
1619
1620 tgt = &sh->dev[target];
1621 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1622 dest = tgt->page;
1623 dest_off = tgt->offset;
1624
1625 atomic_inc(&sh->count);
1626
1627 if (target == qd_idx) {
1628 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1629 blocks[count] = NULL;
1630 BUG_ON(blocks[count+1] != dest);
1631 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1632 ops_complete_compute, sh,
1633 to_addr_conv(sh, percpu, 0));
1634 tx = async_gen_syndrome(blocks, offs, count+2,
1635 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	} else {
		/* Compute any data- or P-drive block using XOR */
		count = 0;
1639 for (i = disks; i-- ; ) {
1640 if (i == target || i == qd_idx)
1641 continue;
1642 offs[count] = sh->dev[i].offset;
1643 blocks[count++] = sh->dev[i].page;
1644 }
1645
1646 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1647 NULL, ops_complete_compute, sh,
1648 to_addr_conv(sh, percpu, 0));
1649 tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1650 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1651 }
1652
1653 return tx;
1654}
1655
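/* Recompute two missing RAID6 blocks using the raid6 recovery helpers */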
1656static struct dma_async_tx_descriptor *
1657ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1658{
1659 int i, count, disks = sh->disks;
1660 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1661 int d0_idx = raid6_d0(sh);
1662 int faila = -1, failb = -1;
1663 int target = sh->ops.target;
1664 int target2 = sh->ops.target2;
1665 struct r5dev *tgt = &sh->dev[target];
1666 struct r5dev *tgt2 = &sh->dev[target2];
1667 struct dma_async_tx_descriptor *tx;
1668 struct page **blocks = to_addr_page(percpu, 0);
1669 unsigned int *offs = to_addr_offs(sh, percpu);
1670 struct async_submit_ctl submit;
1671
1672 BUG_ON(sh->batch_head);
1673 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1674 __func__, (unsigned long long)sh->sector, target, target2);
1675 BUG_ON(target < 0 || target2 < 0);
1676 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1677 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1678
	/* We need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++) {
1683 offs[i] = 0;
1684 blocks[i] = NULL;
1685 }
1686 count = 0;
1687 i = d0_idx;
1688 do {
1689 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1690
1691 offs[slot] = sh->dev[i].offset;
1692 blocks[slot] = sh->dev[i].page;
1693
1694 if (i == target)
1695 faila = slot;
1696 if (i == target2)
1697 failb = slot;
1698 i = raid6_next_disk(i, disks);
1699 } while (i != d0_idx);
1700
1701 BUG_ON(faila == failb);
1702 if (failb < faila)
1703 swap(faila, failb);
1704 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1705 __func__, (unsigned long long)sh->sector, faila, failb);
1706
1707 atomic_inc(&sh->count);
1708
	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1714 ops_complete_compute, sh,
1715 to_addr_conv(sh, percpu, 0));
1716 return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1717 RAID5_STRIPE_SIZE(sh->raid_conf),
1718 &submit);
1719 } else {
1720 struct page *dest;
1721 unsigned int dest_off;
1722 int data_target;
1723 int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
1727 data_target = target2;
1728 else
1729 data_target = target;
1730
1731 count = 0;
1732 for (i = disks; i-- ; ) {
1733 if (i == data_target || i == qd_idx)
1734 continue;
1735 offs[count] = sh->dev[i].offset;
1736 blocks[count++] = sh->dev[i].page;
1737 }
1738 dest = sh->dev[data_target].page;
1739 dest_off = sh->dev[data_target].offset;
1740 init_async_submit(&submit,
1741 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1742 NULL, NULL, NULL,
1743 to_addr_conv(sh, percpu, 0));
1744 tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1745 RAID5_STRIPE_SIZE(sh->raid_conf),
1746 &submit);
1747
1748 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1749 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1750 ops_complete_compute, sh,
1751 to_addr_conv(sh, percpu, 0));
1752 return async_gen_syndrome(blocks, offs, count+2,
1753 RAID5_STRIPE_SIZE(sh->raid_conf),
1754 &submit);
1755 }
1756 } else {
1757 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1758 ops_complete_compute, sh,
1759 to_addr_conv(sh, percpu, 0));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
1763 RAID5_STRIPE_SIZE(sh->raid_conf),
1764 faila,
1765 blocks, offs, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
1769 RAID5_STRIPE_SIZE(sh->raid_conf),
1770 faila, failb,
1771 blocks, offs, &submit);
1772 }
1773 }
1774}
1775
1776static void ops_complete_prexor(void *stripe_head_ref)
1777{
1778 struct stripe_head *sh = stripe_head_ref;
1779
1780 pr_debug("%s: stripe %llu\n", __func__,
1781 (unsigned long long)sh->sector);
1782
	if (r5c_is_writeback(sh->raid_conf->log))
		/*
		 * raid5-cache write back uses orig_page during prexor.
		 * After prexor, it is time to free orig_page.
		 */
		r5c_release_extra_page(sh);
1789}
1790
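/* "Prexor": XOR the old data (or journal copies) out of the parity block ahead of a read-modify-write */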
1791static struct dma_async_tx_descriptor *
1792ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1793 struct dma_async_tx_descriptor *tx)
1794{
1795 int disks = sh->disks;
1796 struct page **xor_srcs = to_addr_page(percpu, 0);
1797 unsigned int *off_srcs = to_addr_offs(sh, percpu);
1798 int count = 0, pd_idx = sh->pd_idx, i;
1799 struct async_submit_ctl submit;

	/* existing parity data is subtracted */
	unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1804
1805 BUG_ON(sh->batch_head);
1806 pr_debug("%s: stripe %llu\n", __func__,
1807 (unsigned long long)sh->sector);
1808
1809 for (i = disks; i--; ) {
1810 struct r5dev *dev = &sh->dev[i];
1811
1812 if (test_bit(R5_InJournal, &dev->flags)) {
			/*
			 * For this case, PAGE_SIZE must be equal to 4KB and
			 * the page offset is zero.
			 */
			off_srcs[count] = dev->offset;
1818 xor_srcs[count++] = dev->orig_page;
1819 } else if (test_bit(R5_Wantdrain, &dev->flags)) {
1820 off_srcs[count] = dev->offset;
1821 xor_srcs[count++] = dev->page;
1822 }
1823 }
1824
1825 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1826 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1827 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1828 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1829
1830 return tx;
1831}
1832
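/* RAID6 prexor: XOR the old contents of the blocks about to be rewritten into P/Q via a syndrome pass with ASYNC_TX_PQ_XOR_DST */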
1833static struct dma_async_tx_descriptor *
1834ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1835 struct dma_async_tx_descriptor *tx)
1836{
1837 struct page **blocks = to_addr_page(percpu, 0);
1838 unsigned int *offs = to_addr_offs(sh, percpu);
1839 int count;
1840 struct async_submit_ctl submit;
1841
1842 pr_debug("%s: stripe %llu\n", __func__,
1843 (unsigned long long)sh->sector);
1844
1845 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
1846
1847 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1848 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1849 tx = async_gen_syndrome(blocks, offs, count+2,
1850 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1851
1852 return tx;
1853}
1854
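/* Drain: copy the queued write bios into the stripe pages, walking the batch list if present */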
1855static struct dma_async_tx_descriptor *
1856ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1857{
1858 struct r5conf *conf = sh->raid_conf;
1859 int disks = sh->disks;
1860 int i;
1861 struct stripe_head *head_sh = sh;
1862
1863 pr_debug("%s: stripe %llu\n", __func__,
1864 (unsigned long long)sh->sector);
1865
1866 for (i = disks; i--; ) {
1867 struct r5dev *dev;
1868 struct bio *chosen;
1869
1870 sh = head_sh;
1871 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1872 struct bio *wbi;
1873
1874again:
1875 dev = &sh->dev[i];
			/*
			 * Clear R5_InJournal, so when rewriting a page in
			 * the journal, it is not skipped by r5l_log_stripe().
			 */
			clear_bit(R5_InJournal, &dev->flags);
1881 spin_lock_irq(&sh->stripe_lock);
1882 chosen = dev->towrite;
1883 dev->towrite = NULL;
1884 sh->overwrite_disks = 0;
1885 BUG_ON(dev->written);
1886 wbi = dev->written = chosen;
1887 spin_unlock_irq(&sh->stripe_lock);
1888 WARN_ON(dev->page != dev->orig_page);
1889
1890 while (wbi && wbi->bi_iter.bi_sector <
1891 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1892 if (wbi->bi_opf & REQ_FUA)
1893 set_bit(R5_WantFUA, &dev->flags);
1894 if (wbi->bi_opf & REQ_SYNC)
1895 set_bit(R5_SyncIO, &dev->flags);
1896 if (bio_op(wbi) == REQ_OP_DISCARD)
1897 set_bit(R5_Discard, &dev->flags);
1898 else {
1899 tx = async_copy_data(1, wbi, &dev->page,
1900 dev->offset,
1901 dev->sector, tx, sh,
1902 r5c_is_writeback(conf->log));
1903 if (dev->page != dev->orig_page &&
1904 !r5c_is_writeback(conf->log)) {
1905 set_bit(R5_SkipCopy, &dev->flags);
1906 clear_bit(R5_UPTODATE, &dev->flags);
1907 clear_bit(R5_OVERWRITE, &dev->flags);
1908 }
1909 }
1910 wbi = r5_next_bio(conf, wbi, dev->sector);
1911 }
1912
1913 if (head_sh->batch_head) {
1914 sh = list_first_entry(&sh->batch_list,
1915 struct stripe_head,
1916 batch_list);
1917 if (sh == head_sh)
1918 continue;
1919 goto again;
1920 }
1921 }
1922 }
1923
1924 return tx;
1925}
1926
1927static void ops_complete_reconstruct(void *stripe_head_ref)
1928{
1929 struct stripe_head *sh = stripe_head_ref;
1930 int disks = sh->disks;
1931 int pd_idx = sh->pd_idx;
1932 int qd_idx = sh->qd_idx;
1933 int i;
1934 bool fua = false, sync = false, discard = false;
1935
1936 pr_debug("%s: stripe %llu\n", __func__,
1937 (unsigned long long)sh->sector);
1938
1939 for (i = disks; i--; ) {
1940 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1941 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1942 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1943 }
1944
1945 for (i = disks; i--; ) {
1946 struct r5dev *dev = &sh->dev[i];
1947
1948 if (dev->written || i == pd_idx || i == qd_idx) {
1949 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
1950 set_bit(R5_UPTODATE, &dev->flags);
1951 if (test_bit(STRIPE_EXPAND_READY, &sh->state))
1952 set_bit(R5_Expanded, &dev->flags);
1953 }
1954 if (fua)
1955 set_bit(R5_WantFUA, &dev->flags);
1956 if (sync)
1957 set_bit(R5_SyncIO, &dev->flags);
1958 }
1959 }
1960
1961 if (sh->reconstruct_state == reconstruct_state_drain_run)
1962 sh->reconstruct_state = reconstruct_state_drain_result;
1963 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1964 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1965 else {
1966 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1967 sh->reconstruct_state = reconstruct_state_result;
1968 }
1969
1970 set_bit(STRIPE_HANDLE, &sh->state);
1971 raid5_release_stripe(sh);
1972}
1973
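/* Compute new RAID5 parity, either from scratch or on top of a prexor'd read-modify-write, for the whole batch */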
1974static void
1975ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1976 struct dma_async_tx_descriptor *tx)
1977{
1978 int disks = sh->disks;
1979 struct page **xor_srcs;
1980 unsigned int *off_srcs;
1981 struct async_submit_ctl submit;
1982 int count, pd_idx = sh->pd_idx, i;
1983 struct page *xor_dest;
1984 unsigned int off_dest;
1985 int prexor = 0;
1986 unsigned long flags;
1987 int j = 0;
1988 struct stripe_head *head_sh = sh;
1989 int last_stripe;
1990
1991 pr_debug("%s: stripe %llu\n", __func__,
1992 (unsigned long long)sh->sector);
1993
1994 for (i = 0; i < sh->disks; i++) {
1995 if (pd_idx == i)
1996 continue;
1997 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1998 break;
1999 }
2000 if (i >= sh->disks) {
2001 atomic_inc(&sh->count);
2002 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
2003 ops_complete_reconstruct(sh);
2004 return;
2005 }
2006again:
2007 count = 0;
2008 xor_srcs = to_addr_page(percpu, j);
2009 off_srcs = to_addr_offs(sh, percpu);
2010

	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2014 prexor = 1;
2015 off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
2016 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
2017 for (i = disks; i--; ) {
2018 struct r5dev *dev = &sh->dev[i];
2019 if (head_sh->dev[i].written ||
2020 test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2021 off_srcs[count] = dev->offset;
2022 xor_srcs[count++] = dev->page;
2023 }
2024 }
2025 } else {
2026 xor_dest = sh->dev[pd_idx].page;
2027 off_dest = sh->dev[pd_idx].offset;
2028 for (i = disks; i--; ) {
2029 struct r5dev *dev = &sh->dev[i];
2030 if (i != pd_idx) {
2031 off_srcs[count] = dev->offset;
2032 xor_srcs[count++] = dev->page;
2033 }
2034 }
2035 }
2036
	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	last_stripe = !head_sh->batch_head ||
2043 list_first_entry(&sh->batch_list,
2044 struct stripe_head, batch_list) == head_sh;
2045 if (last_stripe) {
2046 flags = ASYNC_TX_ACK |
2047 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
2048
2049 atomic_inc(&head_sh->count);
2050 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
2051 to_addr_conv(sh, percpu, j));
2052 } else {
2053 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
2054 init_async_submit(&submit, flags, tx, NULL, NULL,
2055 to_addr_conv(sh, percpu, j));
2056 }
2057
2058 if (unlikely(count == 1))
2059 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2060 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2061 else
2062 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2063 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2064 if (!last_stripe) {
2065 j++;
2066 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2067 batch_list);
2068 goto again;
2069 }
2070}
2071
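/* Compute new P and Q for a RAID6 stripe, walking the batch list if present */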
2072static void
2073ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
2074 struct dma_async_tx_descriptor *tx)
2075{
2076 struct async_submit_ctl submit;
2077 struct page **blocks;
2078 unsigned int *offs;
2079 int count, i, j = 0;
2080 struct stripe_head *head_sh = sh;
2081 int last_stripe;
2082 int synflags;
2083 unsigned long txflags;
2084
2085 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
2086
2087 for (i = 0; i < sh->disks; i++) {
2088 if (sh->pd_idx == i || sh->qd_idx == i)
2089 continue;
2090 if (!test_bit(R5_Discard, &sh->dev[i].flags))
2091 break;
2092 }
2093 if (i >= sh->disks) {
2094 atomic_inc(&sh->count);
2095 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
2096 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
2097 ops_complete_reconstruct(sh);
2098 return;
2099 }
2100
2101again:
2102 blocks = to_addr_page(percpu, j);
2103 offs = to_addr_offs(sh, percpu);
2104
2105 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2106 synflags = SYNDROME_SRC_WRITTEN;
2107 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
2108 } else {
2109 synflags = SYNDROME_SRC_ALL;
2110 txflags = ASYNC_TX_ACK;
2111 }
2112
2113 count = set_syndrome_sources(blocks, offs, sh, synflags);
2114 last_stripe = !head_sh->batch_head ||
2115 list_first_entry(&sh->batch_list,
2116 struct stripe_head, batch_list) == head_sh;
2117
2118 if (last_stripe) {
2119 atomic_inc(&head_sh->count);
2120 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
2121 head_sh, to_addr_conv(sh, percpu, j));
2122 } else
2123 init_async_submit(&submit, 0, tx, NULL, NULL,
2124 to_addr_conv(sh, percpu, j));
2125 tx = async_gen_syndrome(blocks, offs, count+2,
2126 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2127 if (!last_stripe) {
2128 j++;
2129 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2130 batch_list);
2131 goto again;
2132 }
2133}
2134
2135static void ops_complete_check(void *stripe_head_ref)
2136{
2137 struct stripe_head *sh = stripe_head_ref;
2138
2139 pr_debug("%s: stripe %llu\n", __func__,
2140 (unsigned long long)sh->sector);
2141
2142 sh->check_state = check_state_check_result;
2143 set_bit(STRIPE_HANDLE, &sh->state);
2144 raid5_release_stripe(sh);
2145}
2146
2147static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
2148{
2149 int disks = sh->disks;
2150 int pd_idx = sh->pd_idx;
2151 int qd_idx = sh->qd_idx;
2152 struct page *xor_dest;
2153 unsigned int off_dest;
2154 struct page **xor_srcs = to_addr_page(percpu, 0);
2155 unsigned int *off_srcs = to_addr_offs(sh, percpu);
2156 struct dma_async_tx_descriptor *tx;
2157 struct async_submit_ctl submit;
2158 int count;
2159 int i;
2160
2161 pr_debug("%s: stripe %llu\n", __func__,
2162 (unsigned long long)sh->sector);
2163
2164 BUG_ON(sh->batch_head);
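	/*
	 * XOR the parity block together with every data block; a consistent
	 * stripe must give an all-zero result, anything else is flagged in
	 * sh->ops.zero_sum_result.
	 */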
2165 count = 0;
2166 xor_dest = sh->dev[pd_idx].page;
2167 off_dest = sh->dev[pd_idx].offset;
2168 off_srcs[count] = off_dest;
2169 xor_srcs[count++] = xor_dest;
2170 for (i = disks; i--; ) {
2171 if (i == pd_idx || i == qd_idx)
2172 continue;
2173 off_srcs[count] = sh->dev[i].offset;
2174 xor_srcs[count++] = sh->dev[i].page;
2175 }
2176
2177 init_async_submit(&submit, 0, NULL, NULL, NULL,
2178 to_addr_conv(sh, percpu, 0));
2179 tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2180 RAID5_STRIPE_SIZE(sh->raid_conf),
2181 &sh->ops.zero_sum_result, &submit);
2182
2183 atomic_inc(&sh->count);
2184 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2185 tx = async_trigger_callback(&submit);
2186}
2187
2188static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2189{
2190 struct page **srcs = to_addr_page(percpu, 0);
2191 unsigned int *offs = to_addr_offs(sh, percpu);
2192 struct async_submit_ctl submit;
2193 int count;
2194
2195 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2196 (unsigned long long)sh->sector, checkp);
2197
2198 BUG_ON(sh->batch_head);
2199 count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
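	/*
	 * When only Q is being checked (checkp == 0), pass a NULL P block so
	 * that async_syndrome_val() skips the P comparison.
	 */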
2200 if (!checkp)
2201 srcs[count] = NULL;
2202
2203 atomic_inc(&sh->count);
2204 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2205 sh, to_addr_conv(sh, percpu, 0));
2206 async_syndrome_val(srcs, offs, count+2,
2207 RAID5_STRIPE_SIZE(sh->raid_conf),
2208 &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
2209}
2210
2211static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2212{
2213 int overlap_clear = 0, i, disks = sh->disks;
2214 struct dma_async_tx_descriptor *tx = NULL;
2215 struct r5conf *conf = sh->raid_conf;
2216 int level = conf->level;
2217 struct raid5_percpu *percpu;
2218 unsigned long cpu;
2219
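	/*
	 * get_cpu() disables preemption so this CPU's percpu scribble and
	 * addr_conv buffers remain ours while the operation chain is built.
	 */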
2220 cpu = get_cpu();
2221 percpu = per_cpu_ptr(conf->percpu, cpu);
2222 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2223 ops_run_biofill(sh);
2224 overlap_clear++;
2225 }
2226
2227 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2228 if (level < 6)
2229 tx = ops_run_compute5(sh, percpu);
2230 else {
2231 if (sh->ops.target2 < 0 || sh->ops.target < 0)
2232 tx = ops_run_compute6_1(sh, percpu);
2233 else
2234 tx = ops_run_compute6_2(sh, percpu);
2235 }
2236
2237 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2238 async_tx_ack(tx);
2239 }
2240
2241 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2242 if (level < 6)
2243 tx = ops_run_prexor5(sh, percpu, tx);
2244 else
2245 tx = ops_run_prexor6(sh, percpu, tx);
2246 }
2247
2248 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2249 tx = ops_run_partial_parity(sh, percpu, tx);
2250
2251 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2252 tx = ops_run_biodrain(sh, tx);
2253 overlap_clear++;
2254 }
2255
2256 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2257 if (level < 6)
2258 ops_run_reconstruct5(sh, percpu, tx);
2259 else
2260 ops_run_reconstruct6(sh, percpu, tx);
2261 }
2262
2263 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2264 if (sh->check_state == check_state_run)
2265 ops_run_check_p(sh, percpu);
2266 else if (sh->check_state == check_state_run_q)
2267 ops_run_check_pq(sh, percpu, 0);
2268 else if (sh->check_state == check_state_run_pq)
2269 ops_run_check_pq(sh, percpu, 1);
2270 else
2271 BUG();
2272 }
2273
2274 if (overlap_clear && !sh->batch_head)
2275 for (i = disks; i--; ) {
2276 struct r5dev *dev = &sh->dev[i];
2277 if (test_and_clear_bit(R5_Overlap, &dev->flags))
2278 wake_up(&sh->raid_conf->wait_for_overlap);
2279 }
2280 put_cpu();
2281}
2282
2283static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2284{
2285#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2286 kfree(sh->pages);
2287#endif
2288 if (sh->ppl_page)
2289 __free_page(sh->ppl_page);
2290 kmem_cache_free(sc, sh);
2291}
2292
2293static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2294 int disks, struct r5conf *conf)
2295{
2296 struct stripe_head *sh;
2297 int i;
2298
2299 sh = kmem_cache_zalloc(sc, gfp);
2300 if (sh) {
2301 spin_lock_init(&sh->stripe_lock);
2302 spin_lock_init(&sh->batch_lock);
2303 INIT_LIST_HEAD(&sh->batch_list);
2304 INIT_LIST_HEAD(&sh->lru);
2305 INIT_LIST_HEAD(&sh->r5c);
2306 INIT_LIST_HEAD(&sh->log_list);
2307 atomic_set(&sh->count, 1);
2308 sh->raid_conf = conf;
2309 sh->log_start = MaxSector;
2310 for (i = 0; i < disks; i++) {
2311 struct r5dev *dev = &sh->dev[i];
2312
2313 bio_init(&dev->req, &dev->vec, 1);
2314 bio_init(&dev->rreq, &dev->rvec, 1);
2315 }
2316
2317 if (raid5_has_ppl(conf)) {
2318 sh->ppl_page = alloc_page(gfp);
2319 if (!sh->ppl_page) {
2320 free_stripe(sc, sh);
2321 return NULL;
2322 }
2323 }
2324#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2325 if (init_stripe_shared_pages(sh, conf, disks)) {
2326 free_stripe(sc, sh);
2327 return NULL;
2328 }
2329#endif
2330 }
2331 return sh;
2332}
2333static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2334{
2335 struct stripe_head *sh;
2336
2337 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2338 if (!sh)
2339 return 0;
2340
2341 if (grow_buffers(sh, gfp)) {
2342 shrink_buffers(sh);
2343 free_stripe(conf->slab_cache, sh);
2344 return 0;
2345 }
2346 sh->hash_lock_index =
2347 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2348
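	/*
	 * The new stripe is born active (count == 1), so account for it
	 * before raid5_release_stripe() moves it onto an inactive list.
	 */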
2349 atomic_inc(&conf->active_stripes);
2350
2351 raid5_release_stripe(sh);
2352 conf->max_nr_stripes++;
2353 return 1;
2354}
2355
2356static int grow_stripes(struct r5conf *conf, int num)
2357{
2358 struct kmem_cache *sc;
2359 size_t namelen = sizeof(conf->cache_name[0]);
2360 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2361
2362 if (conf->mddev->gendisk)
2363 snprintf(conf->cache_name[0], namelen,
2364 "raid%d-%s", conf->level, mdname(conf->mddev));
2365 else
2366 snprintf(conf->cache_name[0], namelen,
2367 "raid%d-%p", conf->level, conf->mddev);
2368 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2369
2370 conf->active_name = 0;
2371 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2372 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2373 0, 0, NULL);
2374 if (!sc)
2375 return 1;
2376 conf->slab_cache = sc;
2377 conf->pool_size = devs;
2378 while (num--)
2379 if (!grow_one_stripe(conf, GFP_KERNEL))
2380 return 1;
2381
2382 return 0;
2383}
2384
/*
 * scribble_alloc - (re)allocate the per-cpu scribble buffer
 * @percpu: per-cpu state being resized
 * @num:    number of disks in the array
 * @cnt:    number of scribble objects needed (stripe pages per chunk)
 *
 * Each of the @cnt objects must hold, for @num + 2 devices (data plus P
 * and Q): a struct page pointer, an addr_conv_t for async_tx, and a page
 * offset.
 */
2401static int scribble_alloc(struct raid5_percpu *percpu,
2402 int num, int cnt)
2403{
2404 size_t obj_size =
2405 sizeof(struct page *) * (num + 2) +
2406 sizeof(addr_conv_t) * (num + 2) +
2407 sizeof(unsigned int) * (num + 2);
2408 void *scribble;
2409
	/*
	 * This is called while the array is suspended, which also puts us in
	 * memalloc noio context, so the GFP_KERNEL allocation cannot recurse
	 * into raid I/O.
	 */
2415 scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2416 if (!scribble)
2417 return -ENOMEM;
2418
2419 kvfree(percpu->scribble);
2420
2421 percpu->scribble = scribble;
2422 percpu->scribble_obj_size = obj_size;
2423 return 0;
2424}
2425
2426static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2427{
2428 unsigned long cpu;
2429 int err = 0;
2430
	/*
	 * Never shrink.  And mddev_suspend() could deadlock if this is called
	 * from raid5d.  In that case, scribble_disks and scribble_sectors
	 * should already equal new_disks and new_sectors.
	 */
2436 if (conf->scribble_disks >= new_disks &&
2437 conf->scribble_sectors >= new_sectors)
2438 return 0;
2439 mddev_suspend(conf->mddev);
2440 get_online_cpus();
2441
2442 for_each_present_cpu(cpu) {
2443 struct raid5_percpu *percpu;
2444
2445 percpu = per_cpu_ptr(conf->percpu, cpu);
2446 err = scribble_alloc(percpu, new_disks,
2447 new_sectors / RAID5_STRIPE_SECTORS(conf));
2448 if (err)
2449 break;
2450 }
2451
2452 put_online_cpus();
2453 mddev_resume(conf->mddev);
2454 if (!err) {
2455 conf->scribble_disks = new_disks;
2456 conf->scribble_sectors = new_sectors;
2457 }
2458 return err;
2459}
2460
2461static int resize_stripes(struct r5conf *conf, int newsize)
2462{
	/*
	 * Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
2486 struct stripe_head *osh, *nsh;
2487 LIST_HEAD(newstripes);
2488 struct disk_info *ndisks;
2489 int err = 0;
2490 struct kmem_cache *sc;
2491 int i;
2492 int hash, cnt;
2493
2494 md_allow_write(conf->mddev);
2495
	/* Step 1: create the new, larger kmem cache */
2497 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2498 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2499 0, 0, NULL);
2500 if (!sc)
2501 return -ENOMEM;
2502
	/* Need to ensure auto-resizing doesn't interfere */
2504 mutex_lock(&conf->cache_size_mutex);
2505
2506 for (i = conf->max_nr_stripes; i; i--) {
2507 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2508 if (!nsh)
2509 break;
2510
2511 list_add(&nsh->lru, &newstripes);
2512 }
2513 if (i) {
		/* didn't get enough, give up */
2515 while (!list_empty(&newstripes)) {
2516 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2517 list_del(&nsh->lru);
2518 free_stripe(sc, nsh);
2519 }
2520 kmem_cache_destroy(sc);
2521 mutex_unlock(&conf->cache_size_mutex);
2522 return -ENOMEM;
2523 }
2524
	/*
	 * Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
2528 hash = 0;
2529 cnt = 0;
2530 list_for_each_entry(nsh, &newstripes, lru) {
2531 lock_device_hash_lock(conf, hash);
2532 wait_event_cmd(conf->wait_for_stripe,
2533 !list_empty(conf->inactive_list + hash),
2534 unlock_device_hash_lock(conf, hash),
2535 lock_device_hash_lock(conf, hash));
2536 osh = get_free_stripe(conf, hash);
2537 unlock_device_hash_lock(conf, hash);
2538
2539#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2540 for (i = 0; i < osh->nr_pages; i++) {
2541 nsh->pages[i] = osh->pages[i];
2542 osh->pages[i] = NULL;
2543 }
2544#endif
2545 for(i=0; i<conf->pool_size; i++) {
2546 nsh->dev[i].page = osh->dev[i].page;
2547 nsh->dev[i].orig_page = osh->dev[i].page;
2548 nsh->dev[i].offset = osh->dev[i].offset;
2549 }
2550 nsh->hash_lock_index = hash;
2551 free_stripe(conf->slab_cache, osh);
2552 cnt++;
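		/*
		 * Spread the new stripes evenly across the hash buckets:
		 * advance to the next bucket once this one has received its
		 * share (the first max_nr_stripes % NR_STRIPE_HASH_LOCKS
		 * buckets take one extra stripe).
		 */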
2553 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2554 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2555 hash++;
2556 cnt = 0;
2557 }
2558 }
2559 kmem_cache_destroy(conf->slab_cache);
2560
	/*
	 * Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks and the scribble region.
	 */
2566 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2567 if (ndisks) {
2568 for (i = 0; i < conf->pool_size; i++)
2569 ndisks[i] = conf->disks[i];
2570
2571 for (i = conf->pool_size; i < newsize; i++) {
2572 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2573 if (!ndisks[i].extra_page)
2574 err = -ENOMEM;
2575 }
2576
2577 if (err) {
2578 for (i = conf->pool_size; i < newsize; i++)
2579 if (ndisks[i].extra_page)
2580 put_page(ndisks[i].extra_page);
2581 kfree(ndisks);
2582 } else {
2583 kfree(conf->disks);
2584 conf->disks = ndisks;
2585 }
2586 } else
2587 err = -ENOMEM;
2588
2589 conf->slab_cache = sc;
2590 conf->active_name = 1-conf->active_name;
2591
	/* Step 4, return new stripes to service */
2593 while(!list_empty(&newstripes)) {
2594 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2595 list_del_init(&nsh->lru);
2596
2597#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2598 for (i = 0; i < nsh->nr_pages; i++) {
2599 if (nsh->pages[i])
2600 continue;
2601 nsh->pages[i] = alloc_page(GFP_NOIO);
2602 if (!nsh->pages[i])
2603 err = -ENOMEM;
2604 }
2605
2606 for (i = conf->raid_disks; i < newsize; i++) {
2607 if (nsh->dev[i].page)
2608 continue;
2609 nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2610 nsh->dev[i].orig_page = nsh->dev[i].page;
2611 nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2612 }
2613#else
2614 for (i=conf->raid_disks; i < newsize; i++)
2615 if (nsh->dev[i].page == NULL) {
2616 struct page *p = alloc_page(GFP_NOIO);
2617 nsh->dev[i].page = p;
2618 nsh->dev[i].orig_page = p;
2619 nsh->dev[i].offset = 0;
2620 if (!p)
2621 err = -ENOMEM;
2622 }
2623#endif
2624 raid5_release_stripe(nsh);
2625 }
2626
	/* critical section passed, GFP_NOIO no longer needed */
2628 if (!err)
2629 conf->pool_size = newsize;
2630 mutex_unlock(&conf->cache_size_mutex);
2631
2632 return err;
2633}
2634
2635static int drop_one_stripe(struct r5conf *conf)
2636{
2637 struct stripe_head *sh;
2638 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2639
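	/* mirror of grow_one_stripe(): shrink from the hash bucket that the
	 * most recently grown stripe was assigned to
	 */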
2640 spin_lock_irq(conf->hash_locks + hash);
2641 sh = get_free_stripe(conf, hash);
2642 spin_unlock_irq(conf->hash_locks + hash);
2643 if (!sh)
2644 return 0;
2645 BUG_ON(atomic_read(&sh->count));
2646 shrink_buffers(sh);
2647 free_stripe(conf->slab_cache, sh);
2648 atomic_dec(&conf->active_stripes);
2649 conf->max_nr_stripes--;
2650 return 1;
2651}
2652
2653static void shrink_stripes(struct r5conf *conf)
2654{
2655 while (conf->max_nr_stripes &&
2656 drop_one_stripe(conf))
2657 ;
2658
2659 kmem_cache_destroy(conf->slab_cache);
2660 conf->slab_cache = NULL;
2661}
2662
2663static void raid5_end_read_request(struct bio * bi)
2664{
2665 struct stripe_head *sh = bi->bi_private;
2666 struct r5conf *conf = sh->raid_conf;
2667 int disks = sh->disks, i;
2668 char b[BDEVNAME_SIZE];
2669 struct md_rdev *rdev = NULL;
2670 sector_t s;
2671
2672 for (i=0 ; i<disks; i++)
2673 if (bi == &sh->dev[i].req)
2674 break;
2675
2676 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2677 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2678 bi->bi_status);
2679 if (i == disks) {
2680 bio_reset(bi);
2681 BUG();
2682 return;
2683 }
2684 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
		/*
		 * If the replacement finished while this request was
		 * outstanding, 'replacement' might already be NULL; in that
		 * case it moved down to 'rdev'.  rdev is not removed until
		 * all requests are finished.
		 */
2690 rdev = conf->disks[i].replacement;
2691 if (!rdev)
2692 rdev = conf->disks[i].rdev;
2693
2694 if (use_new_offset(conf, sh))
2695 s = sh->sector + rdev->new_data_offset;
2696 else
2697 s = sh->sector + rdev->data_offset;
2698 if (!bi->bi_status) {
2699 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2700 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			/*
			 * Note that this cannot happen on a replacement
			 * device.  Those are simply failed on any error.
			 */
2705 pr_info_ratelimited(
2706 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2707 mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2708 (unsigned long long)s,
2709 bdevname(rdev->bdev, b));
2710 atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
2711 clear_bit(R5_ReadError, &sh->dev[i].flags);
2712 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2713 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2714 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2715
2716 if (test_bit(R5_InJournal, &sh->dev[i].flags))
			/*
			 * End of a read that filled orig_page from the
			 * journal device, in preparation for a prexor in
			 * read-modify-write.
			 */
2721 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2722
2723 if (atomic_read(&rdev->read_errors))
2724 atomic_set(&rdev->read_errors, 0);
2725 } else {
2726 const char *bdn = bdevname(rdev->bdev, b);
2727 int retry = 0;
2728 int set_bad = 0;
2729
2730 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2731 if (!(bi->bi_status == BLK_STS_PROTECTION))
2732 atomic_inc(&rdev->read_errors);
2733 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2734 pr_warn_ratelimited(
2735 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2736 mdname(conf->mddev),
2737 (unsigned long long)s,
2738 bdn);
2739 else if (conf->mddev->degraded >= conf->max_degraded) {
2740 set_bad = 1;
2741 pr_warn_ratelimited(
2742 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2743 mdname(conf->mddev),
2744 (unsigned long long)s,
2745 bdn);
2746 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
			/* the re-write also failed, give up on this block */
2748 set_bad = 1;
2749 pr_warn_ratelimited(
2750 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2751 mdname(conf->mddev),
2752 (unsigned long long)s,
2753 bdn);
2754 } else if (atomic_read(&rdev->read_errors)
2755 > conf->max_nr_stripes) {
2756 if (!test_bit(Faulty, &rdev->flags)) {
2757 pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2758 mdname(conf->mddev),
2759 atomic_read(&rdev->read_errors),
2760 conf->max_nr_stripes);
2761 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2762 mdname(conf->mddev), bdn);
2763 }
2764 } else
2765 retry = 1;
2766 if (set_bad && test_bit(In_sync, &rdev->flags)
2767 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2768 retry = 1;
2769 if (retry)
2770 if (sh->qd_idx >= 0 && sh->pd_idx == i)
2771 set_bit(R5_ReadError, &sh->dev[i].flags);
2772 else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2773 set_bit(R5_ReadError, &sh->dev[i].flags);
2774 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2775 } else
2776 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2777 else {
2778 clear_bit(R5_ReadError, &sh->dev[i].flags);
2779 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2780 if (!(set_bad
2781 && test_bit(In_sync, &rdev->flags)
2782 && rdev_set_badblocks(
2783 rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
2784 md_error(conf->mddev, rdev);
2785 }
2786 }
2787 rdev_dec_pending(rdev, conf->mddev);
2788 bio_reset(bi);
2789 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2790 set_bit(STRIPE_HANDLE, &sh->state);
2791 raid5_release_stripe(sh);
2792}
2793
2794static void raid5_end_write_request(struct bio *bi)
2795{
2796 struct stripe_head *sh = bi->bi_private;
2797 struct r5conf *conf = sh->raid_conf;
2798 int disks = sh->disks, i;
2799 struct md_rdev *rdev;
2800 sector_t first_bad;
2801 int bad_sectors;
2802 int replacement = 0;
2803
2804 for (i = 0 ; i < disks; i++) {
2805 if (bi == &sh->dev[i].req) {
2806 rdev = conf->disks[i].rdev;
2807 break;
2808 }
2809 if (bi == &sh->dev[i].rreq) {
2810 rdev = conf->disks[i].replacement;
2811 if (rdev)
2812 replacement = 1;
2813 else
				/*
				 * rdev was removed and 'replacement'
				 * replaced it.  rdev is not removed
				 * until all requests are finished.
				 */
2818 rdev = conf->disks[i].rdev;
2819 break;
2820 }
2821 }
2822 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2823 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2824 bi->bi_status);
2825 if (i == disks) {
2826 bio_reset(bi);
2827 BUG();
2828 return;
2829 }
2830
2831 if (replacement) {
2832 if (bi->bi_status)
2833 md_error(conf->mddev, rdev);
2834 else if (is_badblock(rdev, sh->sector,
2835 RAID5_STRIPE_SECTORS(conf),
2836 &first_bad, &bad_sectors))
2837 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2838 } else {
2839 if (bi->bi_status) {
2840 set_bit(STRIPE_DEGRADED, &sh->state);
2841 set_bit(WriteErrorSeen, &rdev->flags);
2842 set_bit(R5_WriteError, &sh->dev[i].flags);
2843 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2844 set_bit(MD_RECOVERY_NEEDED,
2845 &rdev->mddev->recovery);
2846 } else if (is_badblock(rdev, sh->sector,
2847 RAID5_STRIPE_SECTORS(conf),
2848 &first_bad, &bad_sectors)) {
2849 set_bit(R5_MadeGood, &sh->dev[i].flags);
2850 if (test_bit(R5_ReadError, &sh->dev[i].flags))
				/*
				 * That was a successful write so make
				 * sure it looks like we already did
				 * a re-write.
				 */
2855 set_bit(R5_ReWrite, &sh->dev[i].flags);
2856 }
2857 }
2858 rdev_dec_pending(rdev, conf->mddev);
2859
2860 if (sh->batch_head && bi->bi_status && !replacement)
2861 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2862
2863 bio_reset(bi);
2864 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2865 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2866 set_bit(STRIPE_HANDLE, &sh->state);
2867 raid5_release_stripe(sh);
2868
2869 if (sh->batch_head && sh != sh->batch_head)
2870 raid5_release_stripe(sh->batch_head);
2871}
2872
2873static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2874{
2875 char b[BDEVNAME_SIZE];
2876 struct r5conf *conf = mddev->private;
2877 unsigned long flags;
2878 pr_debug("raid456: error called\n");
2879
2880 spin_lock_irqsave(&conf->device_lock, flags);
2881
2882 if (test_bit(In_sync, &rdev->flags) &&
2883 mddev->degraded == conf->max_degraded) {
		/*
		 * Don't allow the array to reach the failed state:
		 * don't try to recover this device.
		 */
2888 conf->recovery_disabled = mddev->recovery_disabled;
2889 spin_unlock_irqrestore(&conf->device_lock, flags);
2890 return;
2891 }
2892
2893 set_bit(Faulty, &rdev->flags);
2894 clear_bit(In_sync, &rdev->flags);
2895 mddev->degraded = raid5_calc_degraded(conf);
2896 spin_unlock_irqrestore(&conf->device_lock, flags);
2897 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2898
2899 set_bit(Blocked, &rdev->flags);
2900 set_mask_bits(&mddev->sb_flags, 0,
2901 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2902 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2903 "md/raid:%s: Operation continuing on %d devices.\n",
2904 mdname(mddev),
2905 bdevname(rdev->bdev, b),
2906 mdname(mddev),
2907 conf->raid_disks - mddev->degraded);
2908 r5c_update_on_rdev_error(mddev, rdev);
2909}
2910
/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
2915sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2916 int previous, int *dd_idx,
2917 struct stripe_head *sh)
2918{
2919 sector_t stripe, stripe2;
2920 sector_t chunk_number;
2921 unsigned int chunk_offset;
2922 int pd_idx, qd_idx;
2923 int ddf_layout = 0;
2924 sector_t new_sector;
2925 int algorithm = previous ? conf->prev_algo
2926 : conf->algorithm;
2927 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2928 : conf->chunk_sectors;
2929 int raid_disks = previous ? conf->previous_raid_disks
2930 : conf->raid_disks;
2931 int data_disks = raid_disks - conf->max_degraded;
2932
	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
2938 chunk_offset = sector_div(r_sector, sectors_per_chunk);
2939 chunk_number = r_sector;
2940
	/*
	 * Compute the stripe number
	 */
2944 stripe = chunk_number;
2945 *dd_idx = sector_div(stripe, data_disks);
2946 stripe2 = stripe;
2947
	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
2950 pd_idx = qd_idx = -1;
2951 switch(conf->level) {
2952 case 4:
2953 pd_idx = data_disks;
2954 break;
2955 case 5:
2956 switch (algorithm) {
2957 case ALGORITHM_LEFT_ASYMMETRIC:
2958 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2959 if (*dd_idx >= pd_idx)
2960 (*dd_idx)++;
2961 break;
2962 case ALGORITHM_RIGHT_ASYMMETRIC:
2963 pd_idx = sector_div(stripe2, raid_disks);
2964 if (*dd_idx >= pd_idx)
2965 (*dd_idx)++;
2966 break;
2967 case ALGORITHM_LEFT_SYMMETRIC:
2968 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2969 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2970 break;
2971 case ALGORITHM_RIGHT_SYMMETRIC:
2972 pd_idx = sector_div(stripe2, raid_disks);
2973 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2974 break;
2975 case ALGORITHM_PARITY_0:
2976 pd_idx = 0;
2977 (*dd_idx)++;
2978 break;
2979 case ALGORITHM_PARITY_N:
2980 pd_idx = data_disks;
2981 break;
2982 default:
2983 BUG();
2984 }
2985 break;
2986 case 6:
2987
2988 switch (algorithm) {
2989 case ALGORITHM_LEFT_ASYMMETRIC:
2990 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2991 qd_idx = pd_idx + 1;
2992 if (pd_idx == raid_disks-1) {
2993 (*dd_idx)++;
2994 qd_idx = 0;
2995 } else if (*dd_idx >= pd_idx)
2996 (*dd_idx) += 2;
2997 break;
2998 case ALGORITHM_RIGHT_ASYMMETRIC:
2999 pd_idx = sector_div(stripe2, raid_disks);
3000 qd_idx = pd_idx + 1;
3001 if (pd_idx == raid_disks-1) {
3002 (*dd_idx)++;
3003 qd_idx = 0;
3004 } else if (*dd_idx >= pd_idx)
3005 (*dd_idx) += 2;
3006 break;
3007 case ALGORITHM_LEFT_SYMMETRIC:
3008 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3009 qd_idx = (pd_idx + 1) % raid_disks;
3010 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3011 break;
3012 case ALGORITHM_RIGHT_SYMMETRIC:
3013 pd_idx = sector_div(stripe2, raid_disks);
3014 qd_idx = (pd_idx + 1) % raid_disks;
3015 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3016 break;
3017
3018 case ALGORITHM_PARITY_0:
3019 pd_idx = 0;
3020 qd_idx = 1;
3021 (*dd_idx) += 2;
3022 break;
3023 case ALGORITHM_PARITY_N:
3024 pd_idx = data_disks;
3025 qd_idx = data_disks + 1;
3026 break;
3027
3028 case ALGORITHM_ROTATING_ZERO_RESTART:
			/*
			 * Exactly the same as RIGHT_ASYMMETRIC, but the
			 * order of blocks for computing Q is different.
			 */
3032 pd_idx = sector_div(stripe2, raid_disks);
3033 qd_idx = pd_idx + 1;
3034 if (pd_idx == raid_disks-1) {
3035 (*dd_idx)++;
3036 qd_idx = 0;
3037 } else if (*dd_idx >= pd_idx)
3038 (*dd_idx) += 2;
3039 ddf_layout = 1;
3040 break;
3041
3042 case ALGORITHM_ROTATING_N_RESTART:
			/*
			 * Same as left_asymmetric, but the first stripe is
			 * D D D P Q  rather than
			 * Q D D D P
			 */
3047 stripe2 += 1;
3048 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3049 qd_idx = pd_idx + 1;
3050 if (pd_idx == raid_disks-1) {
3051 (*dd_idx)++;
3052 qd_idx = 0;
3053 } else if (*dd_idx >= pd_idx)
3054 (*dd_idx) += 2;
3055 ddf_layout = 1;
3056 break;
3057
3058 case ALGORITHM_ROTATING_N_CONTINUE:
			/* Same as left_symmetric but Q is before P */
3060 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3061 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
3062 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3063 ddf_layout = 1;
3064 break;
3065
3066 case ALGORITHM_LEFT_ASYMMETRIC_6:
			/* RAID5 left_asymmetric, with Q on the last device */
3068 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3069 if (*dd_idx >= pd_idx)
3070 (*dd_idx)++;
3071 qd_idx = raid_disks - 1;
3072 break;
3073
3074 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3075 pd_idx = sector_div(stripe2, raid_disks-1);
3076 if (*dd_idx >= pd_idx)
3077 (*dd_idx)++;
3078 qd_idx = raid_disks - 1;
3079 break;
3080
3081 case ALGORITHM_LEFT_SYMMETRIC_6:
3082 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3083 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3084 qd_idx = raid_disks - 1;
3085 break;
3086
3087 case ALGORITHM_RIGHT_SYMMETRIC_6:
3088 pd_idx = sector_div(stripe2, raid_disks-1);
3089 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3090 qd_idx = raid_disks - 1;
3091 break;
3092
3093 case ALGORITHM_PARITY_0_6:
3094 pd_idx = 0;
3095 (*dd_idx)++;
3096 qd_idx = raid_disks - 1;
3097 break;
3098
3099 default:
3100 BUG();
3101 }
3102 break;
3103 }
3104
3105 if (sh) {
3106 sh->pd_idx = pd_idx;
3107 sh->qd_idx = qd_idx;
3108 sh->ddf_layout = ddf_layout;
3109 }
3110
	/*
	 * Finally, compute the new sector number
	 */
3113 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3114 return new_sector;
3115}
3116
3117sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
3118{
3119 struct r5conf *conf = sh->raid_conf;
3120 int raid_disks = sh->disks;
3121 int data_disks = raid_disks - conf->max_degraded;
3122 sector_t new_sector = sh->sector, check;
3123 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3124 : conf->chunk_sectors;
3125 int algorithm = previous ? conf->prev_algo
3126 : conf->algorithm;
3127 sector_t stripe;
3128 int chunk_offset;
3129 sector_t chunk_number;
3130 int dummy1, dd_idx = i;
3131 sector_t r_sector;
3132 struct stripe_head sh2;
3133
3134 chunk_offset = sector_div(new_sector, sectors_per_chunk);
3135 stripe = new_sector;
3136
3137 if (i == sh->pd_idx)
3138 return 0;
3139 switch(conf->level) {
3140 case 4: break;
3141 case 5:
3142 switch (algorithm) {
3143 case ALGORITHM_LEFT_ASYMMETRIC:
3144 case ALGORITHM_RIGHT_ASYMMETRIC:
3145 if (i > sh->pd_idx)
3146 i--;
3147 break;
3148 case ALGORITHM_LEFT_SYMMETRIC:
3149 case ALGORITHM_RIGHT_SYMMETRIC:
3150 if (i < sh->pd_idx)
3151 i += raid_disks;
3152 i -= (sh->pd_idx + 1);
3153 break;
3154 case ALGORITHM_PARITY_0:
3155 i -= 1;
3156 break;
3157 case ALGORITHM_PARITY_N:
3158 break;
3159 default:
3160 BUG();
3161 }
3162 break;
3163 case 6:
3164 if (i == sh->qd_idx)
3165 return 0;
3166 switch (algorithm) {
3167 case ALGORITHM_LEFT_ASYMMETRIC:
3168 case ALGORITHM_RIGHT_ASYMMETRIC:
3169 case ALGORITHM_ROTATING_ZERO_RESTART:
3170 case ALGORITHM_ROTATING_N_RESTART:
3171 if (sh->pd_idx == raid_disks-1)
3172 i--;
3173 else if (i > sh->pd_idx)
3174 i -= 2;
3175 break;
3176 case ALGORITHM_LEFT_SYMMETRIC:
3177 case ALGORITHM_RIGHT_SYMMETRIC:
3178 if (sh->pd_idx == raid_disks-1)
3179 i--;
3180 else {
				/* D D P Q D */
3182 if (i < sh->pd_idx)
3183 i += raid_disks;
3184 i -= (sh->pd_idx + 2);
3185 }
3186 break;
3187 case ALGORITHM_PARITY_0:
3188 i -= 2;
3189 break;
3190 case ALGORITHM_PARITY_N:
3191 break;
3192 case ALGORITHM_ROTATING_N_CONTINUE:
			/* Like left_symmetric, but P is before Q */
3194 if (sh->pd_idx == 0)
3195 i--;
3196 else {
				/* D D Q P D */
3198 if (i < sh->pd_idx)
3199 i += raid_disks;
3200 i -= (sh->pd_idx + 1);
3201 }
3202 break;
3203 case ALGORITHM_LEFT_ASYMMETRIC_6:
3204 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3205 if (i > sh->pd_idx)
3206 i--;
3207 break;
3208 case ALGORITHM_LEFT_SYMMETRIC_6:
3209 case ALGORITHM_RIGHT_SYMMETRIC_6:
3210 if (i < sh->pd_idx)
3211 i += data_disks + 1;
3212 i -= (sh->pd_idx + 1);
3213 break;
3214 case ALGORITHM_PARITY_0_6:
3215 i -= 1;
3216 break;
3217 default:
3218 BUG();
3219 }
3220 break;
3221 }
3222
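	/*
	 * Reassemble the array sector from the stripe number and the data
	 * disk index, then recompute the forward mapping as a sanity check.
	 */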
3223 chunk_number = stripe * data_disks + i;
3224 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3225
3226 check = raid5_compute_sector(conf, r_sector,
3227 previous, &dummy1, &sh2);
3228 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3229 || sh2.qd_idx != sh->qd_idx) {
3230 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3231 mdname(conf->mddev));
3232 return 0;
3233 }
3234 return r_sector;
3235}
3236
/*
 * There are cases where we want handle_stripe_dirtying() and
 * schedule_reconstruction() to delay towrite to some dev of a stripe.
 *
 * This function checks whether we want to delay the towrite. Specifically,
 * we delay the towrite when:
 *
 *   1. a degraded stripe has a non-overwrite to the missing dev, AND this
 *      stripe has data in the journal (for other devices).
 *
 *      In this case, reading data for the non-overwrite dev would require
 *      complex rmw handling of the write back cache (prexor with orig_page,
 *      then xor with page).  To keep the read path simple, we flush data in
 *      the journal to the RAID disks first, so the complex rmw is handled
 *      in the write path (handle_stripe_dirtying).
 *
 *   2. journal space is critical (R5C_LOG_CRITICAL is set) and the stripe
 *      already has data in the journal (injournal > 0).
 *
 *      It must remain possible to flush all stripes in raid5-cache, which
 *      needs reserved space on the journal device.  Excluding pending
 *      writes from the flush reduces the space needed per stripe, so
 *      pending writes are delayed while data for the stripe is already in
 *      the journal.
 *
 *   3. during journal failure
 *
 *      On journal failure we try to flush all cached data to the raid
 *      disks based on data in the stripe cache.  The array is read-only to
 *      upper layers, so all pending writes are skipped.
 */
3275static inline bool delay_towrite(struct r5conf *conf,
3276 struct r5dev *dev,
3277 struct stripe_head_state *s)
3278{
	/* case 1 above */
3280 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3281 !test_bit(R5_Insync, &dev->flags) && s->injournal)
3282 return true;
	/* case 2 above */
3284 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3285 s->injournal > 0)
3286 return true;
	/* case 3 above */
3288 if (s->log_failed && s->injournal)
3289 return true;
3290 return false;
3291}
3292
3293static void
3294schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3295 int rcw, int expand)
3296{
3297 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3298 struct r5conf *conf = sh->raid_conf;
3299 int level = conf->level;
3300
3301 if (rcw) {
		/*
		 * In some cases, handle_stripe_dirtying initially decided to
		 * run rmw and allocated an extra page for prexor.  However,
		 * rcw turned out to be cheaper.  Free the extra page now,
		 * because we won't be able to do that in
		 * ops_complete_prexor().
		 */
3308 r5c_release_extra_page(sh);
3309
3310 for (i = disks; i--; ) {
3311 struct r5dev *dev = &sh->dev[i];
3312
3313 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3314 set_bit(R5_LOCKED, &dev->flags);
3315 set_bit(R5_Wantdrain, &dev->flags);
3316 if (!expand)
3317 clear_bit(R5_UPTODATE, &dev->flags);
3318 s->locked++;
3319 } else if (test_bit(R5_InJournal, &dev->flags)) {
3320 set_bit(R5_LOCKED, &dev->flags);
3321 s->locked++;
3322 }
3323 }
3324
		/*
		 * if we are not expanding this is a proper write request, and
		 * there will be bios with new data to be drained into the
		 * stripe cache
		 */
3328 if (!expand) {
3329 if (!s->locked)
				/* False alarm, nothing to do */
3331 return;
3332 sh->reconstruct_state = reconstruct_state_drain_run;
3333 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3334 } else
3335 sh->reconstruct_state = reconstruct_state_run;
3336
3337 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3338
3339 if (s->locked + conf->max_degraded == disks)
3340 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3341 atomic_inc(&conf->pending_full_writes);
3342 } else {
3343 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3344 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3345 BUG_ON(level == 6 &&
3346 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3347 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3348
3349 for (i = disks; i--; ) {
3350 struct r5dev *dev = &sh->dev[i];
3351 if (i == pd_idx || i == qd_idx)
3352 continue;
3353
3354 if (dev->towrite &&
3355 (test_bit(R5_UPTODATE, &dev->flags) ||
3356 test_bit(R5_Wantcompute, &dev->flags))) {
3357 set_bit(R5_Wantdrain, &dev->flags);
3358 set_bit(R5_LOCKED, &dev->flags);
3359 clear_bit(R5_UPTODATE, &dev->flags);
3360 s->locked++;
3361 } else if (test_bit(R5_InJournal, &dev->flags)) {
3362 set_bit(R5_LOCKED, &dev->flags);
3363 s->locked++;
3364 }
3365 }
3366 if (!s->locked)
			/* False alarm - nothing to do */
3368 return;
3369 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3370 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3371 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3372 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3373 }
3374
	/*
	 * keep the parity disk(s) locked while asynchronous operations
	 * are in flight
	 */
3378 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3379 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3380 s->locked++;
3381
3382 if (level == 6) {
3383 int qd_idx = sh->qd_idx;
3384 struct r5dev *dev = &sh->dev[qd_idx];
3385
3386 set_bit(R5_LOCKED, &dev->flags);
3387 clear_bit(R5_UPTODATE, &dev->flags);
3388 s->locked++;
3389 }
3390
3391 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3392 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3393 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3394 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3395 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3396
3397 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3398 __func__, (unsigned long long)sh->sector,
3399 s->locked, s->ops_request);
3400}
3401
/*
 * Each stripe/dev can have one or more bios attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be in order.
 */
3407static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3408 int forwrite, int previous)
3409{
3410 struct bio **bip;
3411 struct r5conf *conf = sh->raid_conf;
3412 int firstwrite=0;
3413
3414 pr_debug("adding bi b#%llu to stripe s#%llu\n",
3415 (unsigned long long)bi->bi_iter.bi_sector,
3416 (unsigned long long)sh->sector);
3417
3418 spin_lock_irq(&sh->stripe_lock);
3419 sh->dev[dd_idx].write_hint = bi->bi_write_hint;
3420
3421 if (sh->batch_head)
3422 goto overlap;
3423 if (forwrite) {
3424 bip = &sh->dev[dd_idx].towrite;
3425 if (*bip == NULL)
3426 firstwrite = 1;
3427 } else
3428 bip = &sh->dev[dd_idx].toread;
3429 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3430 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3431 goto overlap;
3432 bip = & (*bip)->bi_next;
3433 }
3434 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3435 goto overlap;
3436
3437 if (forwrite && raid5_has_ppl(conf)) {
		/*
		 * With PPL only writes to consecutive data chunks within a
		 * stripe are allowed because for a single stripe_head we can
		 * only have one PPL entry at a time, which describes one data
		 * range.  Not really an overlap, but wait_for_overlap can be
		 * used to handle this.
		 */
3445 sector_t sector;
3446 sector_t first = 0;
3447 sector_t last = 0;
3448 int count = 0;
3449 int i;
3450
3451 for (i = 0; i < sh->disks; i++) {
3452 if (i != sh->pd_idx &&
3453 (i == dd_idx || sh->dev[i].towrite)) {
3454 sector = sh->dev[i].sector;
3455 if (count == 0 || sector < first)
3456 first = sector;
3457 if (sector > last)
3458 last = sector;
3459 count++;
3460 }
3461 }
3462
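		/* reject the write unless the chunks written in this stripe
		 * form one contiguous run of data
		 */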
3463 if (first + conf->chunk_sectors * (count - 1) != last)
3464 goto overlap;
3465 }
3466
3467 if (!forwrite || previous)
3468 clear_bit(STRIPE_BATCH_READY, &sh->state);
3469
3470 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3471 if (*bip)
3472 bi->bi_next = *bip;
3473 *bip = bi;
3474 bio_inc_remaining(bi);
3475 md_write_inc(conf->mddev, bi);
3476
3477 if (forwrite) {
		/* check if the page is fully covered by writes */
3479 sector_t sector = sh->dev[dd_idx].sector;
3480 for (bi=sh->dev[dd_idx].towrite;
3481 sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3482 bi && bi->bi_iter.bi_sector <= sector;
3483 bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
3484 if (bio_end_sector(bi) >= sector)
3485 sector = bio_end_sector(bi);
3486 }
3487 if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3488 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3489 sh->overwrite_disks++;
3490 }
3491
3492 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3493 (unsigned long long)(*bip)->bi_iter.bi_sector,
3494 (unsigned long long)sh->sector, dd_idx);
3495
3496 if (conf->mddev->bitmap && firstwrite) {
		/*
		 * Cannot hold spinlock over bitmap_startwrite,
		 * but must ensure this isn't added to a batch until
		 * we have added to the bitmap and set bm_seq.
		 * So set STRIPE_BITMAP_PENDING to prevent
		 * batching.
		 * If multiple add_stripe_bio() calls race here they
		 * must all set STRIPE_BITMAP_PENDING.  So only the first one
		 * to complete "bitmap_startwrite" gets to set
		 * STRIPE_BIT_DELAY.  This is important as once a stripe
		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
		 * any more.
		 */
3509 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3510 spin_unlock_irq(&sh->stripe_lock);
3511 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3512 RAID5_STRIPE_SECTORS(conf), 0);
3513 spin_lock_irq(&sh->stripe_lock);
3514 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3515 if (!sh->batch_head) {
3516 sh->bm_seq = conf->seq_flush+1;
3517 set_bit(STRIPE_BIT_DELAY, &sh->state);
3518 }
3519 }
3520 spin_unlock_irq(&sh->stripe_lock);
3521
3522 if (stripe_can_batch(sh))
3523 stripe_add_to_batch_list(conf, sh);
3524 return 1;
3525
3526 overlap:
3527 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3528 spin_unlock_irq(&sh->stripe_lock);
3529 return 0;
3530}
3531
3532static void end_reshape(struct r5conf *conf);
3533
3534static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3535 struct stripe_head *sh)
3536{
3537 int sectors_per_chunk =
3538 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3539 int dd_idx;
3540 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3541 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3542
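	/*
	 * Map the first data block of this stripe through
	 * raid5_compute_sector() purely for its side effect of filling in
	 * sh->pd_idx, sh->qd_idx and sh->ddf_layout; dd_idx is discarded.
	 */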
3543 raid5_compute_sector(conf,
3544 stripe * (disks - conf->max_degraded)
3545 *sectors_per_chunk + chunk_offset,
3546 previous,
3547 &dd_idx, sh);
3548}
3549
3550static void
3551handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3552 struct stripe_head_state *s, int disks)
3553{
3554 int i;
3555 BUG_ON(sh->batch_head);
3556 for (i = disks; i--; ) {
3557 struct bio *bi;
3558 int bitmap_end = 0;
3559
3560 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3561 struct md_rdev *rdev;
3562 rcu_read_lock();
3563 rdev = rcu_dereference(conf->disks[i].rdev);
3564 if (rdev && test_bit(In_sync, &rdev->flags) &&
3565 !test_bit(Faulty, &rdev->flags))
3566 atomic_inc(&rdev->nr_pending);
3567 else
3568 rdev = NULL;
3569 rcu_read_unlock();
3570 if (rdev) {
3571 if (!rdev_set_badblocks(
3572 rdev,
3573 sh->sector,
3574 RAID5_STRIPE_SECTORS(conf), 0))
3575 md_error(conf->mddev, rdev);
3576 rdev_dec_pending(rdev, conf->mddev);
3577 }
3578 }
3579 spin_lock_irq(&sh->stripe_lock);
		/* fail all writes first */
3581 bi = sh->dev[i].towrite;
3582 sh->dev[i].towrite = NULL;
3583 sh->overwrite_disks = 0;
3584 spin_unlock_irq(&sh->stripe_lock);
3585 if (bi)
3586 bitmap_end = 1;
3587
3588 log_stripe_write_finished(sh);
3589
3590 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3591 wake_up(&conf->wait_for_overlap);
3592
3593 while (bi && bi->bi_iter.bi_sector <
3594 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3595 struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
3596
3597 md_write_end(conf->mddev);
3598 bio_io_error(bi);
3599 bi = nextbi;
3600 }
3601 if (bitmap_end)
3602 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3603 RAID5_STRIPE_SECTORS(conf), 0, 0);
3604 bitmap_end = 0;
		/* and fail all 'written' */
3606 bi = sh->dev[i].written;
3607 sh->dev[i].written = NULL;
3608 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3609 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3610 sh->dev[i].page = sh->dev[i].orig_page;
3611 }
3612
3613 if (bi) bitmap_end = 1;
3614 while (bi && bi->bi_iter.bi_sector <
3615 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3616 struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
3617
3618 md_write_end(conf->mddev);
3619 bio_io_error(bi);
3620 bi = bi2;
3621 }
3622
		/*
		 * fail any reads if this device is non-operational and
		 * the data has not reached the cache yet.
		 */
3626 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3627 s->failed > conf->max_degraded &&
3628 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3629 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3630 spin_lock_irq(&sh->stripe_lock);
3631 bi = sh->dev[i].toread;
3632 sh->dev[i].toread = NULL;
3633 spin_unlock_irq(&sh->stripe_lock);
3634 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3635 wake_up(&conf->wait_for_overlap);
3636 if (bi)
3637 s->to_read--;
3638 while (bi && bi->bi_iter.bi_sector <
3639 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3640 struct bio *nextbi =
3641 r5_next_bio(conf, bi, sh->dev[i].sector);
3642
3643 bio_io_error(bi);
3644 bi = nextbi;
3645 }
3646 }
3647 if (bitmap_end)
3648 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3649 RAID5_STRIPE_SECTORS(conf), 0, 0);
3650
		/*
		 * If we were in the middle of a write the parity block might
		 * still be locked - so just clear all R5_LOCKED flags
		 */
3653 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3654 }
3655 s->to_write = 0;
3656 s->written = 0;
3657
3658 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3659 if (atomic_dec_and_test(&conf->pending_full_writes))
3660 md_wakeup_thread(conf->mddev->thread);
3661}
3662
3663static void
3664handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3665 struct stripe_head_state *s)
3666{
3667 int abort = 0;
3668 int i;
3669
3670 BUG_ON(sh->batch_head);
3671 clear_bit(STRIPE_SYNCING, &sh->state);
3672 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3673 wake_up(&conf->wait_for_overlap);
3674 s->syncing = 0;
3675 s->replacing = 0;
3676
	/*
	 * There is nothing more to do for sync/check/repair.
	 * Don't even need to abort as that is handled elsewhere
	 * if needed, and not always wanted e.g. if there is a known
	 * bad block here.
	 * For recover/replace we need to record a bad block on all
	 * non-sync devices, or abort the recovery
	 */
3683 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
		/*
		 * During recovery devices cannot be removed, so
		 * locking and refcounting of rdevs is not needed
		 */
3687 rcu_read_lock();
3688 for (i = 0; i < conf->raid_disks; i++) {
3689 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3690 if (rdev
3691 && !test_bit(Faulty, &rdev->flags)
3692 && !test_bit(In_sync, &rdev->flags)
3693 && !rdev_set_badblocks(rdev, sh->sector,
3694 RAID5_STRIPE_SECTORS(conf), 0))
3695 abort = 1;
3696 rdev = rcu_dereference(conf->disks[i].replacement);
3697 if (rdev
3698 && !test_bit(Faulty, &rdev->flags)
3699 && !test_bit(In_sync, &rdev->flags)
3700 && !rdev_set_badblocks(rdev, sh->sector,
3701 RAID5_STRIPE_SECTORS(conf), 0))
3702 abort = 1;
3703 }
3704 rcu_read_unlock();
3705 if (abort)
3706 conf->recovery_disabled =
3707 conf->mddev->recovery_disabled;
3708 }
3709 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
3710}
3711
3712static int want_replace(struct stripe_head *sh, int disk_idx)
3713{
3714 struct md_rdev *rdev;
3715 int rv = 0;
3716
3717 rcu_read_lock();
3718 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3719 if (rdev
3720 && !test_bit(Faulty, &rdev->flags)
3721 && !test_bit(In_sync, &rdev->flags)
3722 && (rdev->recovery_offset <= sh->sector
3723 || rdev->mddev->recovery_cp <= sh->sector))
3724 rv = 1;
3725 rcu_read_unlock();
3726 return rv;
3727}
3728
3729static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3730 int disk_idx, int disks)
3731{
3732 struct r5dev *dev = &sh->dev[disk_idx];
3733 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3734 &sh->dev[s->failed_num[1]] };
3735 int i;
3736 bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3737
3738
3739 if (test_bit(R5_LOCKED, &dev->flags) ||
3740 test_bit(R5_UPTODATE, &dev->flags))
		/*
		 * No point reading this as we already have it or have
		 * decided to get it.
		 */
3744 return 0;
3745
3746 if (dev->toread ||
3747 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
		/* We need this block to directly satisfy a request */
3749 return 1;
3750
3751 if (s->syncing || s->expanding ||
3752 (s->replacing && want_replace(sh, disk_idx)))
		/*
		 * When syncing, or expanding, we read everything.
		 * When replacing, we need the replaced block.
		 */
3756 return 1;
3757
3758 if ((s->failed >= 1 && fdev[0]->toread) ||
3759 (s->failed >= 2 && fdev[1]->toread))
		/*
		 * If we want to read from a failed device, then
		 * we need to actually read every other device.
		 */
3763 return 1;
3764
	/*
	 * Sometimes neither read-modify-write nor reconstruct-write
	 * cycles can work.  In those cases we read every block we
	 * can.  Then the parity-update is certain to have enough to
	 * work with.
	 * This can only be a problem when we need to write something,
	 * and some device has failed.  If either of those tests
	 * fail we need look no further.
	 */
3773 if (!s->failed || !s->to_write)
3774 return 0;
3775
3776 if (test_bit(R5_Insync, &dev->flags) &&
3777 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		/*
		 * Pre-reads are not permitted until after a short delay
		 * to gather multiple requests.  However if this
		 * device is not Insync, the block could only be computed
		 * and there is no need to delay that.
		 */
3783 return 0;
3784
3785 for (i = 0; i < s->failed && i < 2; i++) {
3786 if (fdev[i]->towrite &&
3787 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3788 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
			/*
			 * If we have a partial write to a failed
			 * device, then we will need to reconstruct
			 * the content of that device, so all other
			 * devices must be read.
			 */
3794 return 1;
3795
3796 if (s->failed >= 2 &&
3797 (fdev[i]->towrite ||
3798 s->failed_num[i] == sh->pd_idx ||
3799 s->failed_num[i] == sh->qd_idx) &&
3800 !test_bit(R5_UPTODATE, &fdev[i]->flags))
			/*
			 * In max degraded raid6, if the failed disk is P, Q,
			 * or we want to read the failed disk, we need to do
			 * reconstruct-write.
			 */
3805 force_rcw = true;
3806 }
3807
	/*
	 * If we are forced to do a reconstruct-write, because parity
	 * cannot be trusted and we are currently recovering it, there
	 * is extra need to be careful.
	 * If one of the devices that we would need to read, because
	 * it is not being overwritten (and maybe not written at all)
	 * is missing/faulty, then we need to read everything we can.
	 */
3815 if (!force_rcw &&
3816 sh->sector < sh->raid_conf->mddev->recovery_cp)
		/* reconstruct-write isn't being forced */
3818 return 0;
3819 for (i = 0; i < s->failed && i < 2; i++) {
3820 if (s->failed_num[i] != sh->pd_idx &&
3821 s->failed_num[i] != sh->qd_idx &&
3822 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3823 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3824 return 1;
3825 }
3826
3827 return 0;
3828}
3829
3830
3831
3832
3833
3834
3835
3836static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3837 int disk_idx, int disks)
3838{
3839 struct r5dev *dev = &sh->dev[disk_idx];
3840
	/* is the data in this block needed, and can we get it? */
3842 if (need_this_block(sh, s, disk_idx, disks)) {
		/*
		 * we would like to get this block, possibly by computing it,
		 * otherwise read it if the backing disk is insync
		 */
3846 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3847 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3848 BUG_ON(sh->batch_head);
3849
		/*
		 * In the raid6 case if the only non-uptodate disk is P
		 * then we already trusted P to compute the other failed
		 * drives.  It is safe to compute rather than re-read P.
		 * In other cases we only compute blocks from failed
		 * devices, otherwise check/repair might fail to detect
		 * a real inconsistency.
		 */
3859 if ((s->uptodate == disks - 1) &&
3860 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3861 (s->failed && (disk_idx == s->failed_num[0] ||
3862 disk_idx == s->failed_num[1])))) {
			/*
			 * a disk has failed and we've been asked to fetch
			 * its block, so compute it
			 */
3866 pr_debug("Computing stripe %llu block %d\n",
3867 (unsigned long long)sh->sector, disk_idx);
3868 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3869 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3870 set_bit(R5_Wantcompute, &dev->flags);
3871 sh->ops.target = disk_idx;
3872 sh->ops.target2 = -1;
3873 s->req_compute = 1;
3874
			/*
			 * Careful: from this point on 'uptodate' is in the eye
			 * of raid_run_ops which services 'compute' operations
			 * before writes.  R5_Wantcompute flags a block that
			 * will be R5_UPTODATE by the time it is needed for a
			 * subsequent operation.
			 */
3880 s->uptodate++;
3881 return 1;
3882 } else if (s->uptodate == disks-2 && s->failed >= 2) {
			/*
			 * Computing 2-failure is *very* expensive; only
			 * do it if failed >= 2
			 */
3886 int other;
3887 for (other = disks; other--; ) {
3888 if (other == disk_idx)
3889 continue;
3890 if (!test_bit(R5_UPTODATE,
3891 &sh->dev[other].flags))
3892 break;
3893 }
3894 BUG_ON(other < 0);
3895 pr_debug("Computing stripe %llu blocks %d,%d\n",
3896 (unsigned long long)sh->sector,
3897 disk_idx, other);
3898 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3899 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3900 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3901 set_bit(R5_Wantcompute, &sh->dev[other].flags);
3902 sh->ops.target = disk_idx;
3903 sh->ops.target2 = other;
3904 s->uptodate += 2;
3905 s->req_compute = 1;
3906 return 1;
3907 } else if (test_bit(R5_Insync, &dev->flags)) {
3908 set_bit(R5_LOCKED, &dev->flags);
3909 set_bit(R5_Wantread, &dev->flags);
3910 s->locked++;
3911 pr_debug("Reading block %d (sync=%d)\n",
3912 disk_idx, s->syncing);
3913 }
3914 }
3915
3916 return 0;
3917}
3918
/*
 * handle_stripe_fill - read or compute data to satisfy pending requests.
 */
3922static void handle_stripe_fill(struct stripe_head *sh,
3923 struct stripe_head_state *s,
3924 int disks)
3925{
3926 int i;
3927
	/*
	 * look for blocks to read/compute, skip this if a compute
	 * is already in flight, or if the stripe contents are in the
	 * midst of changing due to a write
	 */
3932 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3933 !sh->reconstruct_state) {
3934
		/*
		 * For a degraded stripe with data in the journal, do not
		 * handle read requests yet; instead, flush the stripe to the
		 * raid disks first.  This avoids handling complex rmw of the
		 * write back cache (prexor with orig_page, then xor with
		 * page) in the read path.
		 */
3942 if (s->injournal && s->failed) {
3943 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3944 r5c_make_stripe_write_out(sh);
3945 goto out;
3946 }
3947
3948 for (i = disks; i--; )
3949 if (fetch_block(sh, s, i, disks))
3950 break;
3951 }
3952out:
3953 set_bit(STRIPE_HANDLE, &sh->state);
3954}
3955
3956static void break_stripe_batch_list(struct stripe_head *head_sh,
3957 unsigned long handle_flags);
3958
/*
 * handle_stripe_clean_event
 * any written block on an uptodate or failed drive can be returned.
 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
 * never LOCKED, so we don't need to test 'failed' directly.
 */
3963static void handle_stripe_clean_event(struct r5conf *conf,
3964 struct stripe_head *sh, int disks)
3965{
3966 int i;
3967 struct r5dev *dev;
3968 int discard_pending = 0;
3969 struct stripe_head *head_sh = sh;
3970 bool do_endio = false;
3971
3972 for (i = disks; i--; )
3973 if (sh->dev[i].written) {
3974 dev = &sh->dev[i];
3975 if (!test_bit(R5_LOCKED, &dev->flags) &&
3976 (test_bit(R5_UPTODATE, &dev->flags) ||
3977 test_bit(R5_Discard, &dev->flags) ||
3978 test_bit(R5_SkipCopy, &dev->flags))) {
				/* We can return any write requests */
3980 struct bio *wbi, *wbi2;
3981 pr_debug("Return write for disc %d\n", i);
3982 if (test_and_clear_bit(R5_Discard, &dev->flags))
3983 clear_bit(R5_UPTODATE, &dev->flags);
3984 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3985 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3986 }
3987 do_endio = true;
3988
3989returnbi:
3990 dev->page = dev->orig_page;
3991 wbi = dev->written;
3992 dev->written = NULL;
3993 while (wbi && wbi->bi_iter.bi_sector <
3994 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
3995 wbi2 = r5_next_bio(conf, wbi, dev->sector);
3996 md_write_end(conf->mddev);
3997 bio_endio(wbi);
3998 wbi = wbi2;
3999 }
4000 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
4001 RAID5_STRIPE_SECTORS(conf),
4002 !test_bit(STRIPE_DEGRADED, &sh->state),
4003 0);
4004 if (head_sh->batch_head) {
4005 sh = list_first_entry(&sh->batch_list,
4006 struct stripe_head,
4007 batch_list);
4008 if (sh != head_sh) {
4009 dev = &sh->dev[i];
4010 goto returnbi;
4011 }
4012 }
4013 sh = head_sh;
4014 dev = &sh->dev[i];
4015 } else if (test_bit(R5_Discard, &dev->flags))
4016 discard_pending = 1;
4017 }
4018
4019 log_stripe_write_finished(sh);
4020
4021 if (!discard_pending &&
4022 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4023 int hash;
4024 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
4025 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4026 if (sh->qd_idx >= 0) {
4027 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
4028 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
4029 }
		/* now that discard is done we can proceed with any sync */
4031 clear_bit(STRIPE_DISCARD, &sh->state);
		/*
		 * SCSI discard will change some bio fields and the stripe has
		 * no updated data, so remove it from the hash list and the
		 * stripe will be reinitialized.
		 */
4037unhash:
4038 hash = sh->hash_lock_index;
4039 spin_lock_irq(conf->hash_locks + hash);
4040 remove_hash(sh);
4041 spin_unlock_irq(conf->hash_locks + hash);
4042 if (head_sh->batch_head) {
4043 sh = list_first_entry(&sh->batch_list,
4044 struct stripe_head, batch_list);
4045 if (sh != head_sh)
4046 goto unhash;
4047 }
4048 sh = head_sh;
4049
4050 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4051 set_bit(STRIPE_HANDLE, &sh->state);
4052
4053 }
4054
4055 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
4056 if (atomic_dec_and_test(&conf->pending_full_writes))
4057 md_wakeup_thread(conf->mddev->thread);
4058
4059 if (head_sh->batch_head && do_endio)
4060 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4061}
4062
/*
 * For RMW in write back cache, we need an extra page in prexor to store the
 * old data.  This page is stored in dev->orig_page.
 *
 * This function checks whether we have data for prexor.  The exact logic
 * is:
 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
 */
4071static inline bool uptodate_for_rmw(struct r5dev *dev)
4072{
4073 return (test_bit(R5_UPTODATE, &dev->flags)) &&
4074 (!test_bit(R5_InJournal, &dev->flags) ||
4075 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4076}
4077
4078static int handle_stripe_dirtying(struct r5conf *conf,
4079 struct stripe_head *sh,
4080 struct stripe_head_state *s,
4081 int disks)
4082{
4083 int rmw = 0, rcw = 0, i;
4084 sector_t recovery_cp = conf->mddev->recovery_cp;
4085
	/*
	 * Check whether resync is now happening or should start.
	 * If yes, then the array is dirty (after unclean shutdown or
	 * initial creation), so parity in some stripes might be inconsistent.
	 * In this case, we need to always do reconstruct-write, to ensure
	 * that in case of drive failure or read-error correction, we
	 * generate correct data from the parity.
	 */
4093 if (conf->rmw_level == PARITY_DISABLE_RMW ||
4094 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
4095 s->failed == 0)) {
		/*
		 * Calculate the real rcw later - for now make it
		 * look like rcw is cheaper
		 */
4099 rcw = 1; rmw = 2;
4100 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
4101 conf->rmw_level, (unsigned long long)recovery_cp,
4102 (unsigned long long)sh->sector);
4103 } else for (i = disks; i--; ) {
		/* would I have to read this buffer for read_modify_write */
4105 struct r5dev *dev = &sh->dev[i];
4106 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4107 i == sh->pd_idx || i == sh->qd_idx ||
4108 test_bit(R5_InJournal, &dev->flags)) &&
4109 !test_bit(R5_LOCKED, &dev->flags) &&
4110 !(uptodate_for_rmw(dev) ||
4111 test_bit(R5_Wantcompute, &dev->flags))) {
4112 if (test_bit(R5_Insync, &dev->flags))
4113 rmw++;
4114 else
4115 rmw += 2*disks;
4116 }
		/* Would I have to read this buffer for reconstruct_write */
4118 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4119 i != sh->pd_idx && i != sh->qd_idx &&
4120 !test_bit(R5_LOCKED, &dev->flags) &&
4121 !(test_bit(R5_UPTODATE, &dev->flags) ||
4122 test_bit(R5_Wantcompute, &dev->flags))) {
4123 if (test_bit(R5_Insync, &dev->flags))
4124 rcw++;
4125 else
4126 rcw += 2*disks;
4127 }
4128 }
4129
4130 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4131 (unsigned long long)sh->sector, sh->state, rmw, rcw);
4132 set_bit(STRIPE_HANDLE, &sh->state);
4133 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
		/* prefer read-modify-write, but need to get some data */
4135 if (conf->mddev->queue)
4136 blk_add_trace_msg(conf->mddev->queue,
4137 "raid5 rmw %llu %d",
4138 (unsigned long long)sh->sector, rmw);
4139 for (i = disks; i--; ) {
4140 struct r5dev *dev = &sh->dev[i];
4141 if (test_bit(R5_InJournal, &dev->flags) &&
4142 dev->page == dev->orig_page &&
4143 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
				/* alloc page for prexor */
4145 struct page *p = alloc_page(GFP_NOIO);
4146
4147 if (p) {
4148 dev->orig_page = p;
4149 continue;
4150 }
				/*
				 * alloc_page() failed, try to use
				 * disk_info->extra_page
				 */
4156 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
4157 &conf->cache_state)) {
4158 r5c_use_extra_page(sh);
4159 break;
4160 }
				/* extra_page in use, add to delayed_list */
4163 set_bit(STRIPE_DELAYED, &sh->state);
4164 s->waiting_extra_page = 1;
4165 return -EAGAIN;
4166 }
4167 }
4168
4169 for (i = disks; i--; ) {
4170 struct r5dev *dev = &sh->dev[i];
4171 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4172 i == sh->pd_idx || i == sh->qd_idx ||
4173 test_bit(R5_InJournal, &dev->flags)) &&
4174 !test_bit(R5_LOCKED, &dev->flags) &&
4175 !(uptodate_for_rmw(dev) ||
4176 test_bit(R5_Wantcompute, &dev->flags)) &&
4177 test_bit(R5_Insync, &dev->flags)) {
4178 if (test_bit(STRIPE_PREREAD_ACTIVE,
4179 &sh->state)) {
4180 pr_debug("Read_old block %d for r-m-w\n",
4181 i);
4182 set_bit(R5_LOCKED, &dev->flags);
4183 set_bit(R5_Wantread, &dev->flags);
4184 s->locked++;
4185 } else
4186 set_bit(STRIPE_DELAYED, &sh->state);
4187 }
4188 }
4189 }
4190 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
		/* want reconstruct write, but need to get some data */
		int qread = 0;
4193 rcw = 0;
4194 for (i = disks; i--; ) {
4195 struct r5dev *dev = &sh->dev[i];
4196 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4197 i != sh->pd_idx && i != sh->qd_idx &&
4198 !test_bit(R5_LOCKED, &dev->flags) &&
4199 !(test_bit(R5_UPTODATE, &dev->flags) ||
4200 test_bit(R5_Wantcompute, &dev->flags))) {
4201 rcw++;
4202 if (test_bit(R5_Insync, &dev->flags) &&
4203 test_bit(STRIPE_PREREAD_ACTIVE,
4204 &sh->state)) {
4205 pr_debug("Read_old block "
4206 "%d for Reconstruct\n", i);
4207 set_bit(R5_LOCKED, &dev->flags);
4208 set_bit(R5_Wantread, &dev->flags);
4209 s->locked++;
4210 qread++;
4211 } else
4212 set_bit(STRIPE_DELAYED, &sh->state);
4213 }
4214 }
4215 if (rcw && conf->mddev->queue)
4216 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4217 (unsigned long long)sh->sector,
4218 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4219 }
4220
4221 if (rcw > disks && rmw > disks &&
4222 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4223 set_bit(STRIPE_DELAYED, &sh->state);
4224
	/*
	 * now if nothing is locked, and if we have enough data,
	 * we can start a write request
	 */
	/*
	 * since handle_stripe can be called at any time we need to handle the
	 * case where a compute block operation has been submitted and then a
	 * subsequent call wants to start a write request.  raid_run_ops only
	 * handles the case where compute block and reconstruct are requested
	 * simultaneously.  If this is not the case then new writes need to be
	 * held off until the compute completes.
	 */
4235 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4236 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4237 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4238 schedule_reconstruction(sh, s, rcw == 0, 0);
4239 return 0;
4240}
4241
4242static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4243 struct stripe_head_state *s, int disks)
4244{
4245 struct r5dev *dev = NULL;
4246
4247 BUG_ON(sh->batch_head);
4248 set_bit(STRIPE_HANDLE, &sh->state);
4249
4250 switch (sh->check_state) {
4251 case check_state_idle:
		/* start a new check operation if there are no failures */
4253 if (s->failed == 0) {
4254 BUG_ON(s->uptodate != disks);
4255 sh->check_state = check_state_run;
4256 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4257 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4258 s->uptodate--;
4259 break;
4260 }
4261 dev = &sh->dev[s->failed_num[0]];
4262 fallthrough;
4263 case check_state_compute_result:
4264 sh->check_state = check_state_idle;
4265 if (!dev)
4266 dev = &sh->dev[sh->pd_idx];
4267
		/* check that a write has not made the stripe insync */
4269 if (test_bit(STRIPE_INSYNC, &sh->state))
4270 break;
4271
		/* either failed parity check, or recovery is happening */
4273 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4274 BUG_ON(s->uptodate != disks);
4275
4276 set_bit(R5_LOCKED, &dev->flags);
4277 s->locked++;
4278 set_bit(R5_Wantwrite, &dev->flags);
4279
4280 clear_bit(STRIPE_DEGRADED, &sh->state);
4281 set_bit(STRIPE_INSYNC, &sh->state);
4282 break;
4283 case check_state_run:
4284 break;
4285 case check_state_check_result:
4286 sh->check_state = check_state_idle;
		/*
		 * if a failure occurred during the check operation, leave
		 * STRIPE_INSYNC not set and let the stripe be handled again
		 */
4291 if (s->failed)
4292 break;
4293
		/*
		 * handle a successful check operation, if parity is correct
		 * we are done.  Otherwise update the mismatch count and repair
		 * parity if !MD_RECOVERY_CHECK
		 */
4298 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
			/*
			 * parity is correct (on disc,
			 * not in buffer any more)
			 */
4302 set_bit(STRIPE_INSYNC, &sh->state);
4303 else {
4304 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4305 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
			/* don't try to repair!! */
4307 set_bit(STRIPE_INSYNC, &sh->state);
4308 pr_warn_ratelimited("%s: mismatch sector in range "
4309 "%llu-%llu\n", mdname(conf->mddev),
4310 (unsigned long long) sh->sector,
4311 (unsigned long long) sh->sector +
4312 RAID5_STRIPE_SECTORS(conf));
4313 } else {
4314 sh->check_state = check_state_compute_run;
4315 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4316 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4317 set_bit(R5_Wantcompute,
4318 &sh->dev[sh->pd_idx].flags);
4319 sh->ops.target = sh->pd_idx;
4320 sh->ops.target2 = -1;
4321 s->uptodate++;
4322 }
4323 }
4324 break;
4325 case check_state_compute_run:
4326 break;
4327 default:
4328 pr_err("%s: unknown check_state: %d sector: %llu\n",
4329 __func__, sh->check_state,
4330 (unsigned long long) sh->sector);
4331 BUG();
4332 }
4333}
4334
4335static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4336 struct stripe_head_state *s,
4337 int disks)
4338{
4339 int pd_idx = sh->pd_idx;
4340 int qd_idx = sh->qd_idx;
4341 struct r5dev *dev;
4342
4343 BUG_ON(sh->batch_head);
4344 set_bit(STRIPE_HANDLE, &sh->state);
4345
4346 BUG_ON(s->failed > 2);
4347
	/*
	 * Want to check and possibly repair P and Q.
	 * However there could be one 'failed' device, in which
	 * case we can only check one of them, possibly using the
	 * other to generate missing data
	 */
4354 switch (sh->check_state) {
4355 case check_state_idle:
		/* start a new check operation if there are < 2 failures */
4357 if (s->failed == s->q_failed) {
			/*
			 * The only possible failed device holds Q, so it
			 * makes sense to check P (If anything else were
			 * failed, we would have used P to recreate it).
			 */
4362 sh->check_state = check_state_run;
4363 }
4364 if (!s->q_failed && s->failed < 2) {
			/*
			 * Q is not failed, and we didn't use it to generate
			 * anything, so it makes sense to check it
			 */
4368 if (sh->check_state == check_state_run)
4369 sh->check_state = check_state_run_pq;
4370 else
4371 sh->check_state = check_state_run_q;
4372 }
4373
		/* discard potentially stale zero_sum_result */
4375 sh->ops.zero_sum_result = 0;
4376
4377 if (sh->check_state == check_state_run) {
			/* the zero-sum xor check may clobber P, so it is no
			 * longer considered up to date
			 */
4379 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4380 s->uptodate--;
4381 }
4382 if (sh->check_state >= check_state_run &&
4383 sh->check_state <= check_state_run_pq) {
			/*
			 * the syndrome check preserves P and Q, so there is
			 * no need to mark them !uptodate here
			 */
4387 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4388 break;
4389 }
4390
4391
4392 BUG_ON(s->failed != 2);
4393 fallthrough;
4394 case check_state_compute_result:
4395 sh->check_state = check_state_idle;
4396
4397
4398 if (test_bit(STRIPE_INSYNC, &sh->state))
4399 break;
4400
4401
4402
4403
4404 dev = NULL;
4405 if (s->failed == 2) {
4406 dev = &sh->dev[s->failed_num[1]];
4407 s->locked++;
4408 set_bit(R5_LOCKED, &dev->flags);
4409 set_bit(R5_Wantwrite, &dev->flags);
4410 }
4411 if (s->failed >= 1) {
4412 dev = &sh->dev[s->failed_num[0]];
4413 s->locked++;
4414 set_bit(R5_LOCKED, &dev->flags);
4415 set_bit(R5_Wantwrite, &dev->flags);
4416 }
4417 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4418 dev = &sh->dev[pd_idx];
4419 s->locked++;
4420 set_bit(R5_LOCKED, &dev->flags);
4421 set_bit(R5_Wantwrite, &dev->flags);
4422 }
4423 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4424 dev = &sh->dev[qd_idx];
4425 s->locked++;
4426 set_bit(R5_LOCKED, &dev->flags);
4427 set_bit(R5_Wantwrite, &dev->flags);
4428 }
4429 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4430 "%s: disk%td not up to date\n",
4431 mdname(conf->mddev),
4432 dev - (struct r5dev *) &sh->dev)) {
4433 clear_bit(R5_LOCKED, &dev->flags);
4434 clear_bit(R5_Wantwrite, &dev->flags);
4435 s->locked--;
4436 }
4437 clear_bit(STRIPE_DEGRADED, &sh->state);
4438
4439 set_bit(STRIPE_INSYNC, &sh->state);
4440 break;
4441 case check_state_run:
4442 case check_state_run_q:
4443 case check_state_run_pq:
4444 break;
4445 case check_state_check_result:
4446 sh->check_state = check_state_idle;
4447
4448
4449
4450
4451
4452 if (sh->ops.zero_sum_result == 0) {
	/* both P and Q are correct */
4454 if (!s->failed)
4455 set_bit(STRIPE_INSYNC, &sh->state);
4456 else {
	/*
	 * P and Q verify ok, but a failed device still needs its
	 * block written back.
	 */
4461 sh->check_state = check_state_compute_result;
4462
4463
4464
4465
4466
4467 }
4468 } else {
4469 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4470 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
	/* "check" run: report the mismatch but do not repair */
4472 set_bit(STRIPE_INSYNC, &sh->state);
4473 pr_warn_ratelimited("%s: mismatch sector in range "
4474 "%llu-%llu\n", mdname(conf->mddev),
4475 (unsigned long long) sh->sector,
4476 (unsigned long long) sh->sector +
4477 RAID5_STRIPE_SECTORS(conf));
4478 } else {
4479 int *target = &sh->ops.target;
4480
4481 sh->ops.target = -1;
4482 sh->ops.target2 = -1;
4483 sh->check_state = check_state_compute_run;
4484 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4485 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4486 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4487 set_bit(R5_Wantcompute,
4488 &sh->dev[pd_idx].flags);
4489 *target = pd_idx;
4490 target = &sh->ops.target2;
4491 s->uptodate++;
4492 }
4493 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4494 set_bit(R5_Wantcompute,
4495 &sh->dev[qd_idx].flags);
4496 *target = qd_idx;
4497 s->uptodate++;
4498 }
4499 }
4500 }
4501 break;
4502 case check_state_compute_run:
4503 break;
4504 default:
4505 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4506 __func__, sh->check_state,
4507 (unsigned long long) sh->sector);
4508 BUG();
4509 }
4510}
4511
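/*
 * During a reshape, copy the data blocks of this expand-source stripe into
 * the stripes of the new layout and mark them up to date; once every block
 * of a destination stripe has arrived it is queued for handling.
 */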
4512static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4513{
4514 int i;
4515
4516
4517
4518
4519 struct dma_async_tx_descriptor *tx = NULL;
4520 BUG_ON(sh->batch_head);
4521 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4522 for (i = 0; i < sh->disks; i++)
4523 if (i != sh->pd_idx && i != sh->qd_idx) {
4524 int dd_idx, j;
4525 struct stripe_head *sh2;
4526 struct async_submit_ctl submit;
4527
4528 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4529 sector_t s = raid5_compute_sector(conf, bn, 0,
4530 &dd_idx, NULL);
4531 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4532 if (sh2 == NULL)
4533
4534
4535
4536
4537 continue;
4538 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4539 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4540
4541 raid5_release_stripe(sh2);
4542 continue;
4543 }
4544
4545
4546 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4547 tx = async_memcpy(sh2->dev[dd_idx].page,
4548 sh->dev[i].page, sh2->dev[dd_idx].offset,
4549 sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4550 &submit);
4551
4552 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4553 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4554 for (j = 0; j < conf->raid_disks; j++)
4555 if (j != sh2->pd_idx &&
4556 j != sh2->qd_idx &&
4557 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4558 break;
4559 if (j == conf->raid_disks) {
4560 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4561 set_bit(STRIPE_HANDLE, &sh2->state);
4562 }
4563 raid5_release_stripe(sh2);
4564
4565 }
4566
4567 async_tx_quiesce(&tx);
4568}
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
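/*
 * Summarise the state of every device in the stripe into *s: counts of
 * locked/uptodate/failed devices, pending reads and writes, bad blocks and
 * replacement activity, so handle_stripe() can decide what work to schedule.
 */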
4584static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4585{
4586 struct r5conf *conf = sh->raid_conf;
4587 int disks = sh->disks;
4588 struct r5dev *dev;
4589 int i;
4590 int do_recovery = 0;
4591
4592 memset(s, 0, sizeof(*s));
4593
4594 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4595 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4596 s->failed_num[0] = -1;
4597 s->failed_num[1] = -1;
4598 s->log_failed = r5l_log_disk_error(conf);
4599
4600
4601 rcu_read_lock();
	for (i = disks; i--; ) {
4603 struct md_rdev *rdev;
4604 sector_t first_bad;
4605 int bad_sectors;
4606 int is_bad = 0;
4607
4608 dev = &sh->dev[i];
4609
4610 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4611 i, dev->flags,
4612 dev->toread, dev->towrite, dev->written);
4613
4614
4615
4616
4617
4618 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4619 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4620 set_bit(R5_Wantfill, &dev->flags);
4621
4622
4623 if (test_bit(R5_LOCKED, &dev->flags))
4624 s->locked++;
4625 if (test_bit(R5_UPTODATE, &dev->flags))
4626 s->uptodate++;
4627 if (test_bit(R5_Wantcompute, &dev->flags)) {
4628 s->compute++;
4629 BUG_ON(s->compute > 2);
4630 }
4631
4632 if (test_bit(R5_Wantfill, &dev->flags))
4633 s->to_fill++;
4634 else if (dev->toread)
4635 s->to_read++;
4636 if (dev->towrite) {
4637 s->to_write++;
4638 if (!test_bit(R5_OVERWRITE, &dev->flags))
4639 s->non_overwrite++;
4640 }
4641 if (dev->written)
4642 s->written++;
4643
4644
4645
4646 rdev = rcu_dereference(conf->disks[i].replacement);
4647 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4648 rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4649 !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4650 &first_bad, &bad_sectors))
4651 set_bit(R5_ReadRepl, &dev->flags);
4652 else {
4653 if (rdev && !test_bit(Faulty, &rdev->flags))
4654 set_bit(R5_NeedReplace, &dev->flags);
4655 else
4656 clear_bit(R5_NeedReplace, &dev->flags);
4657 rdev = rcu_dereference(conf->disks[i].rdev);
4658 clear_bit(R5_ReadRepl, &dev->flags);
4659 }
4660 if (rdev && test_bit(Faulty, &rdev->flags))
4661 rdev = NULL;
4662 if (rdev) {
4663 is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4664 &first_bad, &bad_sectors);
4665 if (s->blocked_rdev == NULL
4666 && (test_bit(Blocked, &rdev->flags)
4667 || is_bad < 0)) {
4668 if (is_bad < 0)
4669 set_bit(BlockedBadBlocks,
4670 &rdev->flags);
4671 s->blocked_rdev = rdev;
4672 atomic_inc(&rdev->nr_pending);
4673 }
4674 }
4675 clear_bit(R5_Insync, &dev->flags);
4676 if (!rdev)
4677 ;
4678 else if (is_bad) {
4679
4680 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4681 test_bit(R5_UPTODATE, &dev->flags)) {
4682
4683
4684
4685 set_bit(R5_Insync, &dev->flags);
4686 set_bit(R5_ReadError, &dev->flags);
4687 }
4688 } else if (test_bit(In_sync, &rdev->flags))
4689 set_bit(R5_Insync, &dev->flags);
4690 else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4691
4692 set_bit(R5_Insync, &dev->flags);
4693 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4694 test_bit(R5_Expanded, &dev->flags))
4695
4696
4697
4698
4699 set_bit(R5_Insync, &dev->flags);
4700
4701 if (test_bit(R5_WriteError, &dev->flags)) {
4702
4703
4704 struct md_rdev *rdev2 = rcu_dereference(
4705 conf->disks[i].rdev);
4706 if (rdev2 == rdev)
4707 clear_bit(R5_Insync, &dev->flags);
4708 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4709 s->handle_bad_blocks = 1;
4710 atomic_inc(&rdev2->nr_pending);
4711 } else
4712 clear_bit(R5_WriteError, &dev->flags);
4713 }
4714 if (test_bit(R5_MadeGood, &dev->flags)) {
4715
4716
4717 struct md_rdev *rdev2 = rcu_dereference(
4718 conf->disks[i].rdev);
4719 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4720 s->handle_bad_blocks = 1;
4721 atomic_inc(&rdev2->nr_pending);
4722 } else
4723 clear_bit(R5_MadeGood, &dev->flags);
4724 }
4725 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4726 struct md_rdev *rdev2 = rcu_dereference(
4727 conf->disks[i].replacement);
4728 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4729 s->handle_bad_blocks = 1;
4730 atomic_inc(&rdev2->nr_pending);
4731 } else
4732 clear_bit(R5_MadeGoodRepl, &dev->flags);
4733 }
4734 if (!test_bit(R5_Insync, &dev->flags)) {
4735
4736 clear_bit(R5_ReadError, &dev->flags);
4737 clear_bit(R5_ReWrite, &dev->flags);
4738 }
4739 if (test_bit(R5_ReadError, &dev->flags))
4740 clear_bit(R5_Insync, &dev->flags);
4741 if (!test_bit(R5_Insync, &dev->flags)) {
4742 if (s->failed < 2)
4743 s->failed_num[s->failed] = i;
4744 s->failed++;
4745 if (rdev && !test_bit(Faulty, &rdev->flags))
4746 do_recovery = 1;
4747 else if (!rdev) {
4748 rdev = rcu_dereference(
4749 conf->disks[i].replacement);
4750 if (rdev && !test_bit(Faulty, &rdev->flags))
4751 do_recovery = 1;
4752 }
4753 }
4754
4755 if (test_bit(R5_InJournal, &dev->flags))
4756 s->injournal++;
4757 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4758 s->just_cached++;
4759 }
4760 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4761
4762
4763
4764
4765
4766
4767
4768
4769 if (do_recovery ||
4770 sh->sector >= conf->mddev->recovery_cp ||
4771 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4772 s->syncing = 1;
4773 else
4774 s->replacing = 1;
4775 }
4776 rcu_read_unlock();
4777}
4778
4779
4780
4781
4782
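/*
 * Clear STRIPE_BATCH_READY; return non-zero if this stripe is a member (not
 * the head) of a batch, in which case the caller must not handle it directly.
 */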
4783static int clear_batch_ready(struct stripe_head *sh)
4784{
4785 struct stripe_head *tmp;
4786 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4787 return (sh->batch_head && sh->batch_head != sh);
4788 spin_lock(&sh->stripe_lock);
4789 if (!sh->batch_head) {
4790 spin_unlock(&sh->stripe_lock);
4791 return 0;
4792 }
4793
4794
4795
4796
4797
4798 if (sh->batch_head != sh) {
4799 spin_unlock(&sh->stripe_lock);
4800 return 1;
4801 }
4802 spin_lock(&sh->batch_lock);
4803 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4804 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4805 spin_unlock(&sh->batch_lock);
4806 spin_unlock(&sh->stripe_lock);
4807
4808
4809
4810
4811
4812 return 0;
4813}
4814
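/*
 * Detach every stripe from the batch headed by head_sh, propagate the
 * relevant state from the head to each member, and queue the members for
 * individual handling where appropriate.
 */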
4815static void break_stripe_batch_list(struct stripe_head *head_sh,
4816 unsigned long handle_flags)
4817{
4818 struct stripe_head *sh, *next;
4819 int i;
4820 int do_wakeup = 0;
4821
4822 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4823
4824 list_del_init(&sh->batch_list);
4825
4826 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4827 (1 << STRIPE_SYNCING) |
4828 (1 << STRIPE_REPLACED) |
4829 (1 << STRIPE_DELAYED) |
4830 (1 << STRIPE_BIT_DELAY) |
4831 (1 << STRIPE_FULL_WRITE) |
4832 (1 << STRIPE_BIOFILL_RUN) |
4833 (1 << STRIPE_COMPUTE_RUN) |
4834 (1 << STRIPE_DISCARD) |
4835 (1 << STRIPE_BATCH_READY) |
4836 (1 << STRIPE_BATCH_ERR) |
4837 (1 << STRIPE_BITMAP_PENDING)),
4838 "stripe state: %lx\n", sh->state);
4839 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4840 (1 << STRIPE_REPLACED)),
4841 "head stripe state: %lx\n", head_sh->state);
4842
4843 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4844 (1 << STRIPE_PREREAD_ACTIVE) |
4845 (1 << STRIPE_DEGRADED) |
4846 (1 << STRIPE_ON_UNPLUG_LIST)),
4847 head_sh->state & (1 << STRIPE_INSYNC));
4848
4849 sh->check_state = head_sh->check_state;
4850 sh->reconstruct_state = head_sh->reconstruct_state;
4851 spin_lock_irq(&sh->stripe_lock);
4852 sh->batch_head = NULL;
4853 spin_unlock_irq(&sh->stripe_lock);
4854 for (i = 0; i < sh->disks; i++) {
4855 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4856 do_wakeup = 1;
4857 sh->dev[i].flags = head_sh->dev[i].flags &
4858 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4859 }
4860 if (handle_flags == 0 ||
4861 sh->state & handle_flags)
4862 set_bit(STRIPE_HANDLE, &sh->state);
4863 raid5_release_stripe(sh);
4864 }
4865 spin_lock_irq(&head_sh->stripe_lock);
4866 head_sh->batch_head = NULL;
4867 spin_unlock_irq(&head_sh->stripe_lock);
4868 for (i = 0; i < head_sh->disks; i++)
4869 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4870 do_wakeup = 1;
4871 if (head_sh->state & handle_flags)
4872 set_bit(STRIPE_HANDLE, &head_sh->state);
4873
4874 if (do_wakeup)
4875 wake_up(&head_sh->raid_conf->wait_for_overlap);
4876}
4877
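/*
 * Main per-stripe state machine: analyse the stripe, then schedule any
 * reads, writes, parity computations, recovery or reshape work it needs
 * before the queued operations are finally issued via ops_run_io().
 */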
4878static void handle_stripe(struct stripe_head *sh)
4879{
4880 struct stripe_head_state s;
4881 struct r5conf *conf = sh->raid_conf;
4882 int i;
4883 int prexor;
4884 int disks = sh->disks;
4885 struct r5dev *pdev, *qdev;
4886
4887 clear_bit(STRIPE_HANDLE, &sh->state);
4888
4889
4890
4891
4892
4893
4894
4895 if (clear_batch_ready(sh))
4896 return;
4897
4898 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4899
4900
4901 set_bit(STRIPE_HANDLE, &sh->state);
4902 return;
4903 }
4904
4905 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4906 break_stripe_batch_list(sh, 0);
4907
4908 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4909 spin_lock(&sh->stripe_lock);
4910
4911
4912
4913
4914 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4915 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4916 !test_bit(STRIPE_DISCARD, &sh->state) &&
4917 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4918 set_bit(STRIPE_SYNCING, &sh->state);
4919 clear_bit(STRIPE_INSYNC, &sh->state);
4920 clear_bit(STRIPE_REPLACED, &sh->state);
4921 }
4922 spin_unlock(&sh->stripe_lock);
4923 }
4924 clear_bit(STRIPE_DELAYED, &sh->state);
4925
	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
		 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
4928 (unsigned long long)sh->sector, sh->state,
4929 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4930 sh->check_state, sh->reconstruct_state);
4931
4932 analyse_stripe(sh, &s);
4933
4934 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4935 goto finish;
4936
4937 if (s.handle_bad_blocks ||
4938 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4939 set_bit(STRIPE_HANDLE, &sh->state);
4940 goto finish;
4941 }
4942
4943 if (unlikely(s.blocked_rdev)) {
4944 if (s.syncing || s.expanding || s.expanded ||
4945 s.replacing || s.to_write || s.written) {
4946 set_bit(STRIPE_HANDLE, &sh->state);
4947 goto finish;
4948 }
4949
4950 rdev_dec_pending(s.blocked_rdev, conf->mddev);
4951 s.blocked_rdev = NULL;
4952 }
4953
4954 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4955 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4956 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4957 }
4958
4959 pr_debug("locked=%d uptodate=%d to_read=%d"
4960 " to_write=%d failed=%d failed_num=%d,%d\n",
4961 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4962 s.failed_num[0], s.failed_num[1]);
4963
4964
4965
4966
4967
4968
4969
4970 if (s.failed > conf->max_degraded ||
4971 (s.log_failed && s.injournal == 0)) {
4972 sh->check_state = 0;
4973 sh->reconstruct_state = 0;
4974 break_stripe_batch_list(sh, 0);
4975 if (s.to_read+s.to_write+s.written)
4976 handle_failed_stripe(conf, sh, &s, disks);
4977 if (s.syncing + s.replacing)
4978 handle_failed_sync(conf, sh, &s);
4979 }
4980
4981
4982
4983
4984 prexor = 0;
4985 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4986 prexor = 1;
4987 if (sh->reconstruct_state == reconstruct_state_drain_result ||
4988 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4989 sh->reconstruct_state = reconstruct_state_idle;
4990
4991
4992
4993
4994 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4995 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4996 BUG_ON(sh->qd_idx >= 0 &&
4997 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4998 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4999 for (i = disks; i--; ) {
5000 struct r5dev *dev = &sh->dev[i];
5001 if (test_bit(R5_LOCKED, &dev->flags) &&
5002 (i == sh->pd_idx || i == sh->qd_idx ||
5003 dev->written || test_bit(R5_InJournal,
5004 &dev->flags))) {
5005 pr_debug("Writing block %d\n", i);
5006 set_bit(R5_Wantwrite, &dev->flags);
5007 if (prexor)
5008 continue;
5009 if (s.failed > 1)
5010 continue;
5011 if (!test_bit(R5_Insync, &dev->flags) ||
5012 ((i == sh->pd_idx || i == sh->qd_idx) &&
5013 s.failed == 0))
5014 set_bit(STRIPE_INSYNC, &sh->state);
5015 }
5016 }
5017 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5018 s.dec_preread_active = 1;
5019 }
5020
5021
5022
5023
5024
5025 pdev = &sh->dev[sh->pd_idx];
5026 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
5027 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
5028 qdev = &sh->dev[sh->qd_idx];
5029 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
5030 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
5031 || conf->level < 6;
5032
5033 if (s.written &&
5034 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
5035 && !test_bit(R5_LOCKED, &pdev->flags)
5036 && (test_bit(R5_UPTODATE, &pdev->flags) ||
5037 test_bit(R5_Discard, &pdev->flags))))) &&
5038 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
5039 && !test_bit(R5_LOCKED, &qdev->flags)
5040 && (test_bit(R5_UPTODATE, &qdev->flags) ||
5041 test_bit(R5_Discard, &qdev->flags))))))
5042 handle_stripe_clean_event(conf, sh, disks);
5043
5044 if (s.just_cached)
5045 r5c_handle_cached_data_endio(conf, sh, disks);
5046 log_stripe_write_finished(sh);
5047
5048
5049
5050
5051
5052 if (s.to_read || s.non_overwrite
5053 || (s.to_write && s.failed)
5054 || (s.syncing && (s.uptodate + s.compute < disks))
5055 || s.replacing
5056 || s.expanding)
5057 handle_stripe_fill(sh, &s, disks);
5058
5059
5060
5061
5062
5063
5064 r5c_finish_stripe_write_out(conf, sh, &s);
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5076 if (!r5c_is_writeback(conf->log)) {
5077 if (s.to_write)
5078 handle_stripe_dirtying(conf, sh, &s, disks);
5079 } else {
5080 int ret = 0;
5081
5082
5083 if (s.to_write)
5084 ret = r5c_try_caching_write(conf, sh, &s,
5085 disks);
5086
5087
5088
5089
5090
5091
5092
5093 if (ret == -EAGAIN ||
5094
5095 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5096 s.injournal > 0)) {
5097 ret = handle_stripe_dirtying(conf, sh, &s,
5098 disks);
5099 if (ret == -EAGAIN)
5100 goto finish;
5101 }
5102 }
5103 }
5104
5105
5106
5107
5108
5109
5110 if (sh->check_state ||
5111 (s.syncing && s.locked == 0 &&
5112 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5113 !test_bit(STRIPE_INSYNC, &sh->state))) {
5114 if (conf->level == 6)
5115 handle_parity_checks6(conf, sh, &s, disks);
5116 else
5117 handle_parity_checks5(conf, sh, &s, disks);
5118 }
5119
5120 if ((s.replacing || s.syncing) && s.locked == 0
5121 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5122 && !test_bit(STRIPE_REPLACED, &sh->state)) {
5123
5124 for (i = 0; i < conf->raid_disks; i++)
5125 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5126 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5127 set_bit(R5_WantReplace, &sh->dev[i].flags);
5128 set_bit(R5_LOCKED, &sh->dev[i].flags);
5129 s.locked++;
5130 }
5131 if (s.replacing)
5132 set_bit(STRIPE_INSYNC, &sh->state);
5133 set_bit(STRIPE_REPLACED, &sh->state);
5134 }
5135 if ((s.syncing || s.replacing) && s.locked == 0 &&
5136 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5137 test_bit(STRIPE_INSYNC, &sh->state)) {
5138 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5139 clear_bit(STRIPE_SYNCING, &sh->state);
5140 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
5141 wake_up(&conf->wait_for_overlap);
5142 }
5143
5144
5145
5146
5147 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5148 for (i = 0; i < s.failed; i++) {
5149 struct r5dev *dev = &sh->dev[s.failed_num[i]];
5150 if (test_bit(R5_ReadError, &dev->flags)
5151 && !test_bit(R5_LOCKED, &dev->flags)
5152 && test_bit(R5_UPTODATE, &dev->flags)
5153 ) {
5154 if (!test_bit(R5_ReWrite, &dev->flags)) {
5155 set_bit(R5_Wantwrite, &dev->flags);
5156 set_bit(R5_ReWrite, &dev->flags);
5157 } else
5158
5159 set_bit(R5_Wantread, &dev->flags);
5160 set_bit(R5_LOCKED, &dev->flags);
5161 s.locked++;
5162 }
5163 }
5164
5165
5166 if (sh->reconstruct_state == reconstruct_state_result) {
5167 struct stripe_head *sh_src
5168 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
5169 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5170
5171
5172
5173 set_bit(STRIPE_DELAYED, &sh->state);
5174 set_bit(STRIPE_HANDLE, &sh->state);
5175 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
5176 &sh_src->state))
5177 atomic_inc(&conf->preread_active_stripes);
5178 raid5_release_stripe(sh_src);
5179 goto finish;
5180 }
5181 if (sh_src)
5182 raid5_release_stripe(sh_src);
5183
5184 sh->reconstruct_state = reconstruct_state_idle;
5185 clear_bit(STRIPE_EXPANDING, &sh->state);
5186 for (i = conf->raid_disks; i--; ) {
5187 set_bit(R5_Wantwrite, &sh->dev[i].flags);
5188 set_bit(R5_LOCKED, &sh->dev[i].flags);
5189 s.locked++;
5190 }
5191 }
5192
5193 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5194 !sh->reconstruct_state) {
5195
5196 sh->disks = conf->raid_disks;
5197 stripe_set_idx(sh->sector, conf, 0, sh);
5198 schedule_reconstruction(sh, &s, 1, 1);
5199 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
5200 clear_bit(STRIPE_EXPAND_READY, &sh->state);
5201 atomic_dec(&conf->reshape_stripes);
5202 wake_up(&conf->wait_for_overlap);
5203 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5204 }
5205
5206 if (s.expanding && s.locked == 0 &&
5207 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5208 handle_stripe_expansion(conf, sh);
5209
5210finish:
5211
5212 if (unlikely(s.blocked_rdev)) {
5213 if (conf->mddev->external)
5214 md_wait_for_blocked_rdev(s.blocked_rdev,
5215 conf->mddev);
5216 else
5217
5218
5219
5220
5221 rdev_dec_pending(s.blocked_rdev,
5222 conf->mddev);
5223 }
5224
5225 if (s.handle_bad_blocks)
5226 for (i = disks; i--; ) {
5227 struct md_rdev *rdev;
5228 struct r5dev *dev = &sh->dev[i];
5229 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5230
5231 rdev = conf->disks[i].rdev;
5232 if (!rdev_set_badblocks(rdev, sh->sector,
5233 RAID5_STRIPE_SECTORS(conf), 0))
5234 md_error(conf->mddev, rdev);
5235 rdev_dec_pending(rdev, conf->mddev);
5236 }
5237 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5238 rdev = conf->disks[i].rdev;
5239 rdev_clear_badblocks(rdev, sh->sector,
5240 RAID5_STRIPE_SECTORS(conf), 0);
5241 rdev_dec_pending(rdev, conf->mddev);
5242 }
5243 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5244 rdev = conf->disks[i].replacement;
5245 if (!rdev)
5246
5247 rdev = conf->disks[i].rdev;
5248 rdev_clear_badblocks(rdev, sh->sector,
5249 RAID5_STRIPE_SECTORS(conf), 0);
5250 rdev_dec_pending(rdev, conf->mddev);
5251 }
5252 }
5253
5254 if (s.ops_request)
5255 raid_run_ops(sh, s.ops_request);
5256
5257 ops_run_io(sh, &s);
5258
5259 if (s.dec_preread_active) {
5260
5261
5262
5263
5264 atomic_dec(&conf->preread_active_stripes);
5265 if (atomic_read(&conf->preread_active_stripes) <
5266 IO_THRESHOLD)
5267 md_wakeup_thread(conf->mddev->thread);
5268 }
5269
5270 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5271}
5272
5273static void raid5_activate_delayed(struct r5conf *conf)
5274{
5275 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5276 while (!list_empty(&conf->delayed_list)) {
5277 struct list_head *l = conf->delayed_list.next;
5278 struct stripe_head *sh;
5279 sh = list_entry(l, struct stripe_head, lru);
5280 list_del_init(l);
5281 clear_bit(STRIPE_DELAYED, &sh->state);
5282 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5283 atomic_inc(&conf->preread_active_stripes);
5284 list_add_tail(&sh->lru, &conf->hold_list);
5285 raid5_wakeup_stripe_thread(sh);
5286 }
5287 }
5288}
5289
5290static void activate_bit_delay(struct r5conf *conf,
5291 struct list_head *temp_inactive_list)
5292{
5293
5294 struct list_head head;
5295 list_add(&head, &conf->bitmap_list);
5296 list_del_init(&conf->bitmap_list);
5297 while (!list_empty(&head)) {
5298 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5299 int hash;
5300 list_del_init(&sh->lru);
5301 atomic_inc(&sh->count);
5302 hash = sh->hash_lock_index;
5303 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5304 }
5305}
5306
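/*
 * Return true if the bio lies entirely within one chunk (using the smaller
 * of the old and new chunk size while a reshape is in progress).
 */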
5307static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5308{
5309 struct r5conf *conf = mddev->private;
5310 sector_t sector = bio->bi_iter.bi_sector;
5311 unsigned int chunk_sectors;
5312 unsigned int bio_sectors = bio_sectors(bio);
5313
5314 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5315 return chunk_sectors >=
5316 ((sector & (chunk_sectors - 1)) + bio_sectors);
5317}
5318
5319
5320
5321
5322
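/*
 * Add a failed chunk-aligned read to the retry list; raid5d will pick it up
 * and resubmit it through the normal stripe-cache path.
 */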
static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
5324{
5325 unsigned long flags;
5326
5327 spin_lock_irqsave(&conf->device_lock, flags);
5328
5329 bi->bi_next = conf->retry_read_aligned_list;
5330 conf->retry_read_aligned_list = bi;
5331
5332 spin_unlock_irqrestore(&conf->device_lock, flags);
5333 md_wakeup_thread(conf->mddev->thread);
5334}
5335
5336static struct bio *remove_bio_from_retry(struct r5conf *conf,
5337 unsigned int *offset)
5338{
5339 struct bio *bi;
5340
5341 bi = conf->retry_read_aligned;
5342 if (bi) {
5343 *offset = conf->retry_read_offset;
5344 conf->retry_read_aligned = NULL;
5345 return bi;
5346 }
5347 bi = conf->retry_read_aligned_list;
	if (bi) {
5349 conf->retry_read_aligned_list = bi->bi_next;
5350 bi->bi_next = NULL;
5351 *offset = 0;
5352 }
5353
5354 return bi;
5355}
5356
5357
5358
5359
5360
5361
5362
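/*
 * Completion handler for a chunk-aligned read that bypassed the stripe
 * cache: on success complete the original bio, on error queue it for a
 * retry through the stripe-cache path.
 */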
5363static void raid5_align_endio(struct bio *bi)
5364{
	struct bio *raid_bi = bi->bi_private;
5366 struct mddev *mddev;
5367 struct r5conf *conf;
5368 struct md_rdev *rdev;
5369 blk_status_t error = bi->bi_status;
5370
5371 bio_put(bi);
5372
	rdev = (void *)raid_bi->bi_next;
5374 raid_bi->bi_next = NULL;
5375 mddev = rdev->mddev;
5376 conf = mddev->private;
5377
5378 rdev_dec_pending(rdev, conf->mddev);
5379
5380 if (!error) {
5381 bio_endio(raid_bi);
5382 if (atomic_dec_and_test(&conf->active_aligned_reads))
5383 wake_up(&conf->wait_for_quiescent);
5384 return;
5385 }
5386
	pr_debug("raid5_align_endio: io error, handing IO off for a retry\n");
5388
5389 add_bio_to_retry(raid_bi, conf);
5390}
5391
5392static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5393{
5394 struct r5conf *conf = mddev->private;
5395 struct bio *align_bio;
5396 struct md_rdev *rdev;
5397 sector_t sector, end_sector, first_bad;
5398 int bad_sectors, dd_idx;
5399
5400 if (!in_chunk_boundary(mddev, raid_bio)) {
5401 pr_debug("%s: non aligned\n", __func__);
5402 return 0;
5403 }
5404
5405 sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0,
5406 &dd_idx, NULL);
5407 end_sector = bio_end_sector(raid_bio);
5408
5409 rcu_read_lock();
5410 if (r5c_big_stripe_cached(conf, sector))
5411 goto out_rcu_unlock;
5412
5413 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5414 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5415 rdev->recovery_offset < end_sector) {
5416 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5417 if (!rdev)
5418 goto out_rcu_unlock;
5419 if (test_bit(Faulty, &rdev->flags) ||
5420 !(test_bit(In_sync, &rdev->flags) ||
5421 rdev->recovery_offset >= end_sector))
5422 goto out_rcu_unlock;
5423 }
5424
5425 atomic_inc(&rdev->nr_pending);
5426 rcu_read_unlock();
5427
5428 align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
5429 bio_set_dev(align_bio, rdev->bdev);
5430 align_bio->bi_end_io = raid5_align_endio;
5431 align_bio->bi_private = raid_bio;
5432 align_bio->bi_iter.bi_sector = sector;
5433
5434 raid_bio->bi_next = (void *)rdev;
5435
5436 if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad,
5437 &bad_sectors)) {
5438 bio_put(align_bio);
5439 rdev_dec_pending(rdev, mddev);
5440 return 0;
5441 }
5442
5443
5444 align_bio->bi_iter.bi_sector += rdev->data_offset;
5445
5446 spin_lock_irq(&conf->device_lock);
5447 wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
5448 conf->device_lock);
5449 atomic_inc(&conf->active_aligned_reads);
5450 spin_unlock_irq(&conf->device_lock);
5451
5452 if (mddev->gendisk)
5453 trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
5454 raid_bio->bi_iter.bi_sector);
5455 submit_bio_noacct(align_bio);
5456 return 1;
5457
5458out_rcu_unlock:
5459 rcu_read_unlock();
5460 return 0;
5461}
5462
5463static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5464{
5465 struct bio *split;
5466 sector_t sector = raid_bio->bi_iter.bi_sector;
5467 unsigned chunk_sects = mddev->chunk_sectors;
5468 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5469
5470 if (sectors < bio_sectors(raid_bio)) {
5471 struct r5conf *conf = mddev->private;
5472 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5473 bio_chain(split, raid_bio);
5474 submit_bio_noacct(raid_bio);
5475 raid_bio = split;
5476 }
5477
5478 if (!raid5_read_one_chunk(mddev, raid_bio))
5479 return raid_bio;
5480
5481 return NULL;
5482}
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
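/*
 * Pick the next stripe to handle.  Prefer the handle_list (or loprio_list
 * while the log is under pressure); stripes parked on the hold_list are
 * only taken once bypass_count exceeds bypass_threshold or when no full
 * stripe writes are pending.
 */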
5494static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5495{
5496 struct stripe_head *sh, *tmp;
5497 struct list_head *handle_list = NULL;
5498 struct r5worker_group *wg;
5499 bool second_try = !r5c_is_writeback(conf->log) &&
5500 !r5l_log_disk_error(conf);
5501 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5502 r5l_log_disk_error(conf);
5503
5504again:
5505 wg = NULL;
5506 sh = NULL;
5507 if (conf->worker_cnt_per_group == 0) {
5508 handle_list = try_loprio ? &conf->loprio_list :
5509 &conf->handle_list;
5510 } else if (group != ANY_GROUP) {
5511 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5512 &conf->worker_groups[group].handle_list;
5513 wg = &conf->worker_groups[group];
5514 } else {
5515 int i;
5516 for (i = 0; i < conf->group_cnt; i++) {
5517 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5518 &conf->worker_groups[i].handle_list;
5519 wg = &conf->worker_groups[i];
5520 if (!list_empty(handle_list))
5521 break;
5522 }
5523 }
5524
5525 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5526 __func__,
5527 list_empty(handle_list) ? "empty" : "busy",
5528 list_empty(&conf->hold_list) ? "empty" : "busy",
5529 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5530
5531 if (!list_empty(handle_list)) {
5532 sh = list_entry(handle_list->next, typeof(*sh), lru);
5533
5534 if (list_empty(&conf->hold_list))
5535 conf->bypass_count = 0;
5536 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5537 if (conf->hold_list.next == conf->last_hold)
5538 conf->bypass_count++;
5539 else {
5540 conf->last_hold = conf->hold_list.next;
5541 conf->bypass_count -= conf->bypass_threshold;
5542 if (conf->bypass_count < 0)
5543 conf->bypass_count = 0;
5544 }
5545 }
5546 } else if (!list_empty(&conf->hold_list) &&
5547 ((conf->bypass_threshold &&
5548 conf->bypass_count > conf->bypass_threshold) ||
5549 atomic_read(&conf->pending_full_writes) == 0)) {
5550
5551 list_for_each_entry(tmp, &conf->hold_list, lru) {
5552 if (conf->worker_cnt_per_group == 0 ||
5553 group == ANY_GROUP ||
5554 !cpu_online(tmp->cpu) ||
5555 cpu_to_group(tmp->cpu) == group) {
5556 sh = tmp;
5557 break;
5558 }
5559 }
5560
5561 if (sh) {
5562 conf->bypass_count -= conf->bypass_threshold;
5563 if (conf->bypass_count < 0)
5564 conf->bypass_count = 0;
5565 }
5566 wg = NULL;
5567 }
5568
5569 if (!sh) {
5570 if (second_try)
5571 return NULL;
5572 second_try = true;
5573 try_loprio = !try_loprio;
5574 goto again;
5575 }
5576
5577 if (wg) {
5578 wg->stripes_cnt--;
5579 sh->group = NULL;
5580 }
5581 list_del_init(&sh->lru);
5582 BUG_ON(atomic_inc_return(&sh->count) != 1);
5583 return sh;
5584}
5585
5586struct raid5_plug_cb {
5587 struct blk_plug_cb cb;
5588 struct list_head list;
5589 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5590};
5591
5592static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5593{
5594 struct raid5_plug_cb *cb = container_of(
5595 blk_cb, struct raid5_plug_cb, cb);
5596 struct stripe_head *sh;
5597 struct mddev *mddev = cb->cb.data;
5598 struct r5conf *conf = mddev->private;
5599 int cnt = 0;
5600 int hash;
5601
5602 if (cb->list.next && !list_empty(&cb->list)) {
5603 spin_lock_irq(&conf->device_lock);
5604 while (!list_empty(&cb->list)) {
5605 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5606 list_del_init(&sh->lru);
5607
5608
5609
5610
5611
5612 smp_mb__before_atomic();
5613 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5614
5615
5616
5617
5618 hash = sh->hash_lock_index;
5619 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5620 cnt++;
5621 }
5622 spin_unlock_irq(&conf->device_lock);
5623 }
5624 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5625 NR_STRIPE_HASH_LOCKS);
5626 if (mddev->queue)
5627 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5628 kfree(cb);
5629}
5630
5631static void release_stripe_plug(struct mddev *mddev,
5632 struct stripe_head *sh)
5633{
5634 struct blk_plug_cb *blk_cb = blk_check_plugged(
5635 raid5_unplug, mddev,
5636 sizeof(struct raid5_plug_cb));
5637 struct raid5_plug_cb *cb;
5638
5639 if (!blk_cb) {
5640 raid5_release_stripe(sh);
5641 return;
5642 }
5643
5644 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5645
5646 if (cb->list.next == NULL) {
5647 int i;
5648 INIT_LIST_HEAD(&cb->list);
5649 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5650 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5651 }
5652
5653 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5654 list_add_tail(&sh->lru, &cb->list);
5655 else
5656 raid5_release_stripe(sh);
5657}
5658
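/*
 * Handle a REQ_OP_DISCARD bio: round it down to whole stripes and attach it
 * as an overwrite of every data device in each affected stripe.
 */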
5659static void make_discard_request(struct mddev *mddev, struct bio *bi)
5660{
5661 struct r5conf *conf = mddev->private;
5662 sector_t logical_sector, last_sector;
5663 struct stripe_head *sh;
5664 int stripe_sectors;
5665
5666 if (mddev->reshape_position != MaxSector)
5667
5668 return;
5669
5670 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5671 last_sector = bio_end_sector(bi);
5672
5673 bi->bi_next = NULL;
5674
5675 stripe_sectors = conf->chunk_sectors *
5676 (conf->raid_disks - conf->max_degraded);
5677 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5678 stripe_sectors);
5679 sector_div(last_sector, stripe_sectors);
5680
5681 logical_sector *= conf->chunk_sectors;
5682 last_sector *= conf->chunk_sectors;
5683
5684 for (; logical_sector < last_sector;
5685 logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5686 DEFINE_WAIT(w);
5687 int d;
5688 again:
5689 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5690 prepare_to_wait(&conf->wait_for_overlap, &w,
5691 TASK_UNINTERRUPTIBLE);
5692 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5693 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5694 raid5_release_stripe(sh);
5695 schedule();
5696 goto again;
5697 }
5698 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5699 spin_lock_irq(&sh->stripe_lock);
5700 for (d = 0; d < conf->raid_disks; d++) {
5701 if (d == sh->pd_idx || d == sh->qd_idx)
5702 continue;
5703 if (sh->dev[d].towrite || sh->dev[d].toread) {
5704 set_bit(R5_Overlap, &sh->dev[d].flags);
5705 spin_unlock_irq(&sh->stripe_lock);
5706 raid5_release_stripe(sh);
5707 schedule();
5708 goto again;
5709 }
5710 }
5711 set_bit(STRIPE_DISCARD, &sh->state);
5712 finish_wait(&conf->wait_for_overlap, &w);
5713 sh->overwrite_disks = 0;
5714 for (d = 0; d < conf->raid_disks; d++) {
5715 if (d == sh->pd_idx || d == sh->qd_idx)
5716 continue;
5717 sh->dev[d].towrite = bi;
5718 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5719 bio_inc_remaining(bi);
5720 md_write_inc(mddev, bi);
5721 sh->overwrite_disks++;
5722 }
5723 spin_unlock_irq(&sh->stripe_lock);
5724 if (conf->mddev->bitmap) {
5725 for (d = 0;
5726 d < conf->raid_disks - conf->max_degraded;
5727 d++)
5728 md_bitmap_startwrite(mddev->bitmap,
5729 sh->sector,
5730 RAID5_STRIPE_SECTORS(conf),
5731 0);
5732 sh->bm_seq = conf->seq_flush + 1;
5733 set_bit(STRIPE_BIT_DELAY, &sh->state);
5734 }
5735
5736 set_bit(STRIPE_HANDLE, &sh->state);
5737 clear_bit(STRIPE_DELAYED, &sh->state);
5738 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5739 atomic_inc(&conf->preread_active_stripes);
5740 release_stripe_plug(mddev, sh);
5741 }
5742
5743 bio_endio(bi);
5744}
5745
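/*
 * Entry point for normal I/O: split the bio into RAID5_STRIPE_SECTORS sized
 * pieces, attach each piece to the corresponding stripe_head and queue the
 * stripes for handling, taking care around an active reshape.
 */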
5746static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
5747{
5748 struct r5conf *conf = mddev->private;
5749 int dd_idx;
5750 sector_t new_sector;
5751 sector_t logical_sector, last_sector;
5752 struct stripe_head *sh;
5753 const int rw = bio_data_dir(bi);
5754 DEFINE_WAIT(w);
5755 bool do_prepare;
5756 bool do_flush = false;
5757
5758 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5759 int ret = log_handle_flush_request(conf, bi);
5760
5761 if (ret == 0)
5762 return true;
5763 if (ret == -ENODEV) {
5764 if (md_flush_request(mddev, bi))
5765 return true;
5766 }
5767
5768
5769
5770
5771
5772 do_flush = bi->bi_opf & REQ_PREFLUSH;
5773 }
5774
5775 if (!md_write_start(mddev, bi))
5776 return false;
5777
5778
5779
5780
5781
5782 if (rw == READ && mddev->degraded == 0 &&
5783 mddev->reshape_position == MaxSector) {
5784 bi = chunk_aligned_read(mddev, bi);
5785 if (!bi)
5786 return true;
5787 }
5788
5789 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5790 make_discard_request(mddev, bi);
5791 md_write_end(mddev);
5792 return true;
5793 }
5794
5795 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5796 last_sector = bio_end_sector(bi);
5797 bi->bi_next = NULL;
5798
5799 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5800 for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5801 int previous;
5802 int seq;
5803
5804 do_prepare = false;
5805 retry:
5806 seq = read_seqcount_begin(&conf->gen_lock);
5807 previous = 0;
5808 if (do_prepare)
5809 prepare_to_wait(&conf->wait_for_overlap, &w,
5810 TASK_UNINTERRUPTIBLE);
5811 if (unlikely(conf->reshape_progress != MaxSector)) {
5812
5813
5814
5815
5816
5817
5818
5819
5820 spin_lock_irq(&conf->device_lock);
5821 if (mddev->reshape_backwards
5822 ? logical_sector < conf->reshape_progress
5823 : logical_sector >= conf->reshape_progress) {
5824 previous = 1;
5825 } else {
5826 if (mddev->reshape_backwards
5827 ? logical_sector < conf->reshape_safe
5828 : logical_sector >= conf->reshape_safe) {
5829 spin_unlock_irq(&conf->device_lock);
5830 schedule();
5831 do_prepare = true;
5832 goto retry;
5833 }
5834 }
5835 spin_unlock_irq(&conf->device_lock);
5836 }
5837
5838 new_sector = raid5_compute_sector(conf, logical_sector,
5839 previous,
5840 &dd_idx, NULL);
5841 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5842 (unsigned long long)new_sector,
5843 (unsigned long long)logical_sector);
5844
5845 sh = raid5_get_active_stripe(conf, new_sector, previous,
5846 (bi->bi_opf & REQ_RAHEAD), 0);
5847 if (sh) {
5848 if (unlikely(previous)) {
5849
5850
5851
5852
5853
5854
5855
5856
5857 int must_retry = 0;
5858 spin_lock_irq(&conf->device_lock);
5859 if (mddev->reshape_backwards
5860 ? logical_sector >= conf->reshape_progress
5861 : logical_sector < conf->reshape_progress)
5862
5863 must_retry = 1;
5864 spin_unlock_irq(&conf->device_lock);
5865 if (must_retry) {
5866 raid5_release_stripe(sh);
5867 schedule();
5868 do_prepare = true;
5869 goto retry;
5870 }
5871 }
5872 if (read_seqcount_retry(&conf->gen_lock, seq)) {
5873
5874
5875
5876 raid5_release_stripe(sh);
5877 goto retry;
5878 }
5879
5880 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5881 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5882
5883
5884
5885
5886 md_wakeup_thread(mddev->thread);
5887 raid5_release_stripe(sh);
5888 schedule();
5889 do_prepare = true;
5890 goto retry;
5891 }
5892 if (do_flush) {
5893 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5894
5895 do_flush = false;
5896 }
5897
5898 set_bit(STRIPE_HANDLE, &sh->state);
5899 clear_bit(STRIPE_DELAYED, &sh->state);
5900 if ((!sh->batch_head || sh == sh->batch_head) &&
5901 (bi->bi_opf & REQ_SYNC) &&
5902 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5903 atomic_inc(&conf->preread_active_stripes);
5904 release_stripe_plug(mddev, sh);
5905 } else {
5906
5907 bi->bi_status = BLK_STS_IOERR;
5908 break;
5909 }
5910 }
5911 finish_wait(&conf->wait_for_overlap, &w);
5912
5913 if (rw == WRITE)
5914 md_write_end(mddev);
5915 bio_endio(bi);
5916 return true;
5917}
5918
5919static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5920
5921static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5922{
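	/*
	 * Reshape moves data one 'reshape_sectors' range at a time:
	 * allocate stripes in the new layout, read the corresponding
	 * blocks from the old layout so they can be copied across, and
	 * checkpoint progress in the superblock so an interrupted
	 * reshape can restart safely.
	 */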
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932 struct r5conf *conf = mddev->private;
5933 struct stripe_head *sh;
5934 struct md_rdev *rdev;
5935 sector_t first_sector, last_sector;
5936 int raid_disks = conf->previous_raid_disks;
5937 int data_disks = raid_disks - conf->max_degraded;
5938 int new_data_disks = conf->raid_disks - conf->max_degraded;
5939 int i;
5940 int dd_idx;
5941 sector_t writepos, readpos, safepos;
5942 sector_t stripe_addr;
5943 int reshape_sectors;
5944 struct list_head stripes;
5945 sector_t retn;
5946
5947 if (sector_nr == 0) {
5948
5949 if (mddev->reshape_backwards &&
5950 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5951 sector_nr = raid5_size(mddev, 0, 0)
5952 - conf->reshape_progress;
5953 } else if (mddev->reshape_backwards &&
5954 conf->reshape_progress == MaxSector) {
5955
5956 sector_nr = MaxSector;
5957 } else if (!mddev->reshape_backwards &&
5958 conf->reshape_progress > 0)
5959 sector_nr = conf->reshape_progress;
5960 sector_div(sector_nr, new_data_disks);
5961 if (sector_nr) {
5962 mddev->curr_resync_completed = sector_nr;
5963 sysfs_notify_dirent_safe(mddev->sysfs_completed);
5964 *skipped = 1;
5965 retn = sector_nr;
5966 goto finish;
5967 }
5968 }
5969
5970
5971
5972
5973
5974
5975 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5976
5977
5978
5979
5980
5981
5982
5983 writepos = conf->reshape_progress;
5984 sector_div(writepos, new_data_disks);
5985 readpos = conf->reshape_progress;
5986 sector_div(readpos, data_disks);
5987 safepos = conf->reshape_safe;
5988 sector_div(safepos, data_disks);
5989 if (mddev->reshape_backwards) {
5990 BUG_ON(writepos < reshape_sectors);
5991 writepos -= reshape_sectors;
5992 readpos += reshape_sectors;
5993 safepos += reshape_sectors;
5994 } else {
5995 writepos += reshape_sectors;
5996
5997
5998
5999
6000 readpos -= min_t(sector_t, reshape_sectors, readpos);
6001 safepos -= min_t(sector_t, reshape_sectors, safepos);
6002 }
6003
6004
6005
6006
6007 if (mddev->reshape_backwards) {
6008 BUG_ON(conf->reshape_progress == 0);
6009 stripe_addr = writepos;
6010 BUG_ON((mddev->dev_sectors &
6011 ~((sector_t)reshape_sectors - 1))
6012 - reshape_sectors - stripe_addr
6013 != sector_nr);
6014 } else {
6015 BUG_ON(writepos != sector_nr + reshape_sectors);
6016 stripe_addr = sector_nr;
6017 }
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039 if (conf->min_offset_diff < 0) {
6040 safepos += -conf->min_offset_diff;
6041 readpos += -conf->min_offset_diff;
6042 } else
6043 writepos += conf->min_offset_diff;
6044
6045 if ((mddev->reshape_backwards
6046 ? (safepos > writepos && readpos < writepos)
6047 : (safepos < writepos && readpos > writepos)) ||
6048 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
6049
6050 wait_event(conf->wait_for_overlap,
6051 atomic_read(&conf->reshape_stripes)==0
6052 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6053 if (atomic_read(&conf->reshape_stripes) != 0)
6054 return 0;
6055 mddev->reshape_position = conf->reshape_progress;
6056 mddev->curr_resync_completed = sector_nr;
6057 if (!mddev->reshape_backwards)
6058
6059 rdev_for_each(rdev, mddev)
6060 if (rdev->raid_disk >= 0 &&
6061 !test_bit(Journal, &rdev->flags) &&
6062 !test_bit(In_sync, &rdev->flags) &&
6063 rdev->recovery_offset < sector_nr)
6064 rdev->recovery_offset = sector_nr;
6065
6066 conf->reshape_checkpoint = jiffies;
6067 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6068 md_wakeup_thread(mddev->thread);
6069 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
6070 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6071 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6072 return 0;
6073 spin_lock_irq(&conf->device_lock);
6074 conf->reshape_safe = mddev->reshape_position;
6075 spin_unlock_irq(&conf->device_lock);
6076 wake_up(&conf->wait_for_overlap);
6077 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6078 }
6079
6080 INIT_LIST_HEAD(&stripes);
6081 for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6082 int j;
6083 int skipped_disk = 0;
6084 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
6085 set_bit(STRIPE_EXPANDING, &sh->state);
6086 atomic_inc(&conf->reshape_stripes);
6087
6088
6089
		for (j = sh->disks; j--; ) {
6091 sector_t s;
6092 if (j == sh->pd_idx)
6093 continue;
6094 if (conf->level == 6 &&
6095 j == sh->qd_idx)
6096 continue;
6097 s = raid5_compute_blocknr(sh, j, 0);
6098 if (s < raid5_size(mddev, 0, 0)) {
6099 skipped_disk = 1;
6100 continue;
6101 }
6102 memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
6103 set_bit(R5_Expanded, &sh->dev[j].flags);
6104 set_bit(R5_UPTODATE, &sh->dev[j].flags);
6105 }
6106 if (!skipped_disk) {
6107 set_bit(STRIPE_EXPAND_READY, &sh->state);
6108 set_bit(STRIPE_HANDLE, &sh->state);
6109 }
6110 list_add(&sh->lru, &stripes);
6111 }
6112 spin_lock_irq(&conf->device_lock);
6113 if (mddev->reshape_backwards)
6114 conf->reshape_progress -= reshape_sectors * new_data_disks;
6115 else
6116 conf->reshape_progress += reshape_sectors * new_data_disks;
6117 spin_unlock_irq(&conf->device_lock);
6118
6119
6120
6121
6122
6123 first_sector =
6124 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
6125 1, &dd_idx, NULL);
6126 last_sector =
6127 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
6128 * new_data_disks - 1),
6129 1, &dd_idx, NULL);
6130 if (last_sector >= mddev->dev_sectors)
6131 last_sector = mddev->dev_sectors - 1;
6132 while (first_sector <= last_sector) {
6133 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
6134 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
6135 set_bit(STRIPE_HANDLE, &sh->state);
6136 raid5_release_stripe(sh);
6137 first_sector += RAID5_STRIPE_SECTORS(conf);
6138 }
6139
6140
6141
6142 while (!list_empty(&stripes)) {
6143 sh = list_entry(stripes.next, struct stripe_head, lru);
6144 list_del_init(&sh->lru);
6145 raid5_release_stripe(sh);
6146 }
6147
6148
6149
6150 sector_nr += reshape_sectors;
6151 retn = reshape_sectors;
6152finish:
6153 if (mddev->curr_resync_completed > mddev->resync_max ||
6154 (sector_nr - mddev->curr_resync_completed) * 2
6155 >= mddev->resync_max - mddev->curr_resync_completed) {
6156
6157 wait_event(conf->wait_for_overlap,
6158 atomic_read(&conf->reshape_stripes) == 0
6159 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6160 if (atomic_read(&conf->reshape_stripes) != 0)
6161 goto ret;
6162 mddev->reshape_position = conf->reshape_progress;
6163 mddev->curr_resync_completed = sector_nr;
6164 if (!mddev->reshape_backwards)
6165
6166 rdev_for_each(rdev, mddev)
6167 if (rdev->raid_disk >= 0 &&
6168 !test_bit(Journal, &rdev->flags) &&
6169 !test_bit(In_sync, &rdev->flags) &&
6170 rdev->recovery_offset < sector_nr)
6171 rdev->recovery_offset = sector_nr;
6172 conf->reshape_checkpoint = jiffies;
6173 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6174 md_wakeup_thread(mddev->thread);
6175 wait_event(mddev->sb_wait,
6176 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6177 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6178 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6179 goto ret;
6180 spin_lock_irq(&conf->device_lock);
6181 conf->reshape_safe = mddev->reshape_position;
6182 spin_unlock_irq(&conf->device_lock);
6183 wake_up(&conf->wait_for_overlap);
6184 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6185 }
6186ret:
6187 return retn;
6188}
6189
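/*
 * Resync/recovery entry point: sync one stripe at a time, delegating to
 * reshape_request() while a reshape is in progress and using the bitmap to
 * skip ranges that are known to be clean.
 */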
6190static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6191 int *skipped)
6192{
6193 struct r5conf *conf = mddev->private;
6194 struct stripe_head *sh;
6195 sector_t max_sector = mddev->dev_sectors;
6196 sector_t sync_blocks;
6197 int still_degraded = 0;
6198 int i;
6199
6200 if (sector_nr >= max_sector) {
6201
6202
6203 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6204 end_reshape(conf);
6205 return 0;
6206 }
6207
6208 if (mddev->curr_resync < max_sector)
6209 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6210 &sync_blocks, 1);
6211 else
6212 conf->fullsync = 0;
6213 md_bitmap_close_sync(mddev->bitmap);
6214
6215 return 0;
6216 }
6217
6218
6219 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6220
6221 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6222 return reshape_request(mddev, sector_nr, skipped);
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234 if (mddev->degraded >= conf->max_degraded &&
6235 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6236 sector_t rv = mddev->dev_sectors - sector_nr;
6237 *skipped = 1;
6238 return rv;
6239 }
6240 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6241 !conf->fullsync &&
6242 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6243 sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6244
6245 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6246 *skipped = 1;
6247
6248 return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6249 }
6250
6251 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6252
6253 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6254 if (sh == NULL) {
6255 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6256
6257
6258
6259 schedule_timeout_uninterruptible(1);
6260 }
6261
6262
6263
6264
6265 rcu_read_lock();
6266 for (i = 0; i < conf->raid_disks; i++) {
6267 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
6268
6269 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6270 still_degraded = 1;
6271 }
6272 rcu_read_unlock();
6273
6274 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6275
6276 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6277 set_bit(STRIPE_HANDLE, &sh->state);
6278
6279 raid5_release_stripe(sh);
6280
6281 return RAID5_STRIPE_SECTORS(conf);
6282}
6283
6284static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6285 unsigned int offset)
6286{
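	/*
	 * Resubmit a failed chunk-aligned read through the stripe cache,
	 * one stripe at a time.  If we run out of stripe_heads, remember
	 * how far we got in conf->retry_read_offset and try again later.
	 */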
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297 struct stripe_head *sh;
6298 int dd_idx;
6299 sector_t sector, logical_sector, last_sector;
6300 int scnt = 0;
6301 int handled = 0;
6302
6303 logical_sector = raid_bio->bi_iter.bi_sector &
6304 ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6305 sector = raid5_compute_sector(conf, logical_sector,
6306 0, &dd_idx, NULL);
6307 last_sector = bio_end_sector(raid_bio);
6308
6309 for (; logical_sector < last_sector;
6310 logical_sector += RAID5_STRIPE_SECTORS(conf),
6311 sector += RAID5_STRIPE_SECTORS(conf),
6312 scnt++) {
6313
6314 if (scnt < offset)
6315
6316 continue;
6317
6318 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6319
6320 if (!sh) {
6321
6322 conf->retry_read_aligned = raid_bio;
6323 conf->retry_read_offset = scnt;
6324 return handled;
6325 }
6326
6327 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6328 raid5_release_stripe(sh);
6329 conf->retry_read_aligned = raid_bio;
6330 conf->retry_read_offset = scnt;
6331 return handled;
6332 }
6333
6334 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6335 handle_stripe(sh);
6336 raid5_release_stripe(sh);
6337 handled++;
6338 }
6339
6340 bio_endio(raid_bio);
6341
6342 if (atomic_dec_and_test(&conf->active_aligned_reads))
6343 wake_up(&conf->wait_for_quiescent);
6344 return handled;
6345}
6346
6347static int handle_active_stripes(struct r5conf *conf, int group,
6348 struct r5worker *worker,
6349 struct list_head *temp_inactive_list)
6350 __releases(&conf->device_lock)
6351 __acquires(&conf->device_lock)
6352{
6353 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6354 int i, batch_size = 0, hash;
6355 bool release_inactive = false;
6356
6357 while (batch_size < MAX_STRIPE_BATCH &&
6358 (sh = __get_priority_stripe(conf, group)) != NULL)
6359 batch[batch_size++] = sh;
6360
6361 if (batch_size == 0) {
6362 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6363 if (!list_empty(temp_inactive_list + i))
6364 break;
6365 if (i == NR_STRIPE_HASH_LOCKS) {
6366 spin_unlock_irq(&conf->device_lock);
6367 log_flush_stripe_to_raid(conf);
6368 spin_lock_irq(&conf->device_lock);
6369 return batch_size;
6370 }
6371 release_inactive = true;
6372 }
6373 spin_unlock_irq(&conf->device_lock);
6374
6375 release_inactive_stripe_list(conf, temp_inactive_list,
6376 NR_STRIPE_HASH_LOCKS);
6377
6378 r5l_flush_stripe_to_raid(conf->log);
6379 if (release_inactive) {
6380 spin_lock_irq(&conf->device_lock);
6381 return 0;
6382 }
6383
6384 for (i = 0; i < batch_size; i++)
6385 handle_stripe(batch[i]);
6386 log_write_stripe_run(conf);
6387
6388 cond_resched();
6389
6390 spin_lock_irq(&conf->device_lock);
6391 for (i = 0; i < batch_size; i++) {
6392 hash = batch[i]->hash_lock_index;
6393 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6394 }
6395 return batch_size;
6396}
6397
6398static void raid5_do_work(struct work_struct *work)
6399{
6400 struct r5worker *worker = container_of(work, struct r5worker, work);
6401 struct r5worker_group *group = worker->group;
6402 struct r5conf *conf = group->conf;
6403 struct mddev *mddev = conf->mddev;
6404 int group_id = group - conf->worker_groups;
6405 int handled;
6406 struct blk_plug plug;
6407
6408 pr_debug("+++ raid5worker active\n");
6409
6410 blk_start_plug(&plug);
6411 handled = 0;
6412 spin_lock_irq(&conf->device_lock);
6413 while (1) {
6414 int batch_size, released;
6415
6416 released = release_stripe_list(conf, worker->temp_inactive_list);
6417
6418 batch_size = handle_active_stripes(conf, group_id, worker,
6419 worker->temp_inactive_list);
6420 worker->working = false;
6421 if (!batch_size && !released)
6422 break;
6423 handled += batch_size;
6424 wait_event_lock_irq(mddev->sb_wait,
6425 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6426 conf->device_lock);
6427 }
6428 pr_debug("%d stripes handled\n", handled);
6429
6430 spin_unlock_irq(&conf->device_lock);
6431
6432 flush_deferred_bios(conf);
6433
6434 r5l_flush_stripe_to_raid(conf->log);
6435
6436 async_tx_issue_pending_all();
6437 blk_finish_plug(&plug);
6438
6439 pr_debug("--- raid5worker inactive\n");
6440}
6441
6442
6443
6444
6445
6446
6447
6448
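/*
 * Main raid5 kernel thread: release completed stripes, flush bitmap
 * updates, retry deferred aligned reads and handle batches of active
 * stripes until there is no more work to do.
 */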
6449static void raid5d(struct md_thread *thread)
6450{
6451 struct mddev *mddev = thread->mddev;
6452 struct r5conf *conf = mddev->private;
6453 int handled;
6454 struct blk_plug plug;
6455
6456 pr_debug("+++ raid5d active\n");
6457
6458 md_check_recovery(mddev);
6459
6460 blk_start_plug(&plug);
6461 handled = 0;
6462 spin_lock_irq(&conf->device_lock);
6463 while (1) {
6464 struct bio *bio;
6465 int batch_size, released;
6466 unsigned int offset;
6467
6468 released = release_stripe_list(conf, conf->temp_inactive_list);
6469 if (released)
6470 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6471
		if (!list_empty(&conf->bitmap_list)) {
			/* now is a good time to flush pending bitmap updates */
6475 conf->seq_flush++;
6476 spin_unlock_irq(&conf->device_lock);
6477 md_bitmap_unplug(mddev->bitmap);
6478 spin_lock_irq(&conf->device_lock);
6479 conf->seq_write = conf->seq_flush;
6480 activate_bit_delay(conf, conf->temp_inactive_list);
6481 }
6482 raid5_activate_delayed(conf);
6483
6484 while ((bio = remove_bio_from_retry(conf, &offset))) {
6485 int ok;
6486 spin_unlock_irq(&conf->device_lock);
6487 ok = retry_aligned_read(conf, bio, offset);
6488 spin_lock_irq(&conf->device_lock);
6489 if (!ok)
6490 break;
6491 handled++;
6492 }
6493
6494 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6495 conf->temp_inactive_list);
6496 if (!batch_size && !released)
6497 break;
6498 handled += batch_size;
6499
6500 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6501 spin_unlock_irq(&conf->device_lock);
6502 md_check_recovery(mddev);
6503 spin_lock_irq(&conf->device_lock);
6504 }
6505 }
6506 pr_debug("%d stripes handled\n", handled);
6507
6508 spin_unlock_irq(&conf->device_lock);
6509 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6510 mutex_trylock(&conf->cache_size_mutex)) {
6511 grow_one_stripe(conf, __GFP_NOWARN);
6512
6513
6514
6515 set_bit(R5_DID_ALLOC, &conf->cache_state);
6516 mutex_unlock(&conf->cache_size_mutex);
6517 }
6518
6519 flush_deferred_bios(conf);
6520
6521 r5l_flush_stripe_to_raid(conf->log);
6522
6523 async_tx_issue_pending_all();
6524 blk_finish_plug(&plug);
6525
6526 pr_debug("--- raid5d inactive\n");
6527}
6528
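/*
 * sysfs md/stripe_cache_size: number of stripe_head entries kept in
 * the stripe cache (valid range 17..32768).  Illustrative usage, with
 * md0 standing in for the real array:
 *   echo 4096 > /sys/block/md0/md/stripe_cache_size
 */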
6529static ssize_t
6530raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6531{
6532 struct r5conf *conf;
6533 int ret = 0;
6534 spin_lock(&mddev->lock);
6535 conf = mddev->private;
6536 if (conf)
6537 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6538 spin_unlock(&mddev->lock);
6539 return ret;
6540}
6541
6542int
6543raid5_set_cache_size(struct mddev *mddev, int size)
6544{
6545 int result = 0;
6546 struct r5conf *conf = mddev->private;
6547
6548 if (size <= 16 || size > 32768)
6549 return -EINVAL;
6550
6551 conf->min_nr_stripes = size;
6552 mutex_lock(&conf->cache_size_mutex);
6553 while (size < conf->max_nr_stripes &&
6554 drop_one_stripe(conf))
6555 ;
6556 mutex_unlock(&conf->cache_size_mutex);
6557
6558 md_allow_write(mddev);
6559
6560 mutex_lock(&conf->cache_size_mutex);
6561 while (size > conf->max_nr_stripes)
6562 if (!grow_one_stripe(conf, GFP_KERNEL)) {
6563 conf->min_nr_stripes = conf->max_nr_stripes;
6564 result = -ENOMEM;
6565 break;
6566 }
6567 mutex_unlock(&conf->cache_size_mutex);
6568
6569 return result;
6570}
6571EXPORT_SYMBOL(raid5_set_cache_size);
6572
6573static ssize_t
6574raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6575{
6576 struct r5conf *conf;
6577 unsigned long new;
6578 int err;
6579
6580 if (len >= PAGE_SIZE)
6581 return -EINVAL;
6582 if (kstrtoul(page, 10, &new))
6583 return -EINVAL;
6584 err = mddev_lock(mddev);
6585 if (err)
6586 return err;
6587 conf = mddev->private;
6588 if (!conf)
6589 err = -ENODEV;
6590 else
6591 err = raid5_set_cache_size(mddev, new);
6592 mddev_unlock(mddev);
6593
6594 return err ?: len;
6595}
6596
6597static struct md_sysfs_entry
6598raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6599 raid5_show_stripe_cache_size,
6600 raid5_store_stripe_cache_size);
6601
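/*
 * sysfs md/rmw_level: selects read-modify-write vs. reconstruct-write
 * for partial stripe writes (PARITY_DISABLE_RMW, PARITY_ENABLE_RMW or
 * PARITY_PREFER_RMW).  Anything other than PARITY_DISABLE_RMW requires
 * an xor_syndrome implementation in raid6_call.
 */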
6602static ssize_t
6603raid5_show_rmw_level(struct mddev *mddev, char *page)
6604{
6605 struct r5conf *conf = mddev->private;
6606 if (conf)
6607 return sprintf(page, "%d\n", conf->rmw_level);
6608 else
6609 return 0;
6610}
6611
6612static ssize_t
6613raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6614{
6615 struct r5conf *conf = mddev->private;
6616 unsigned long new;
6617
6618 if (!conf)
6619 return -ENODEV;
6620
6621 if (len >= PAGE_SIZE)
6622 return -EINVAL;
6623
6624 if (kstrtoul(page, 10, &new))
6625 return -EINVAL;
6626
6627 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6628 return -EINVAL;
6629
6630 if (new != PARITY_DISABLE_RMW &&
6631 new != PARITY_ENABLE_RMW &&
6632 new != PARITY_PREFER_RMW)
6633 return -EINVAL;
6634
6635 conf->rmw_level = new;
6636 return len;
6637}
6638
6639static struct md_sysfs_entry
6640raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6641 raid5_show_rmw_level,
6642 raid5_store_rmw_level);
6643
6644static ssize_t
6645raid5_show_stripe_size(struct mddev *mddev, char *page)
6646{
6647 struct r5conf *conf;
6648 int ret = 0;
6649
6650 spin_lock(&mddev->lock);
6651 conf = mddev->private;
6652 if (conf)
6653 ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6654 spin_unlock(&mddev->lock);
6655 return ret;
6656}
6657
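/*
 * sysfs md/stripe_size: the stripe unit used by the cache.  It is only
 * writable when PAGE_SIZE differs from DEFAULT_STRIPE_SIZE; a new value
 * must be a power of two, a multiple of DEFAULT_STRIPE_SIZE and no
 * larger than PAGE_SIZE.
 */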
6658#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6659static ssize_t
6660raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
6661{
6662 struct r5conf *conf;
6663 unsigned long new;
6664 int err;
6665 int size;
6666
6667 if (len >= PAGE_SIZE)
6668 return -EINVAL;
6669 if (kstrtoul(page, 10, &new))
6670 return -EINVAL;
6671
6672
6673
6674
6675
6676
6677 if (new % DEFAULT_STRIPE_SIZE != 0 ||
6678 new > PAGE_SIZE || new == 0 ||
6679 new != roundup_pow_of_two(new))
6680 return -EINVAL;
6681
6682 err = mddev_lock(mddev);
6683 if (err)
6684 return err;
6685
6686 conf = mddev->private;
6687 if (!conf) {
6688 err = -ENODEV;
6689 goto out_unlock;
6690 }
6691
6692 if (new == conf->stripe_size)
6693 goto out_unlock;
6694
6695 pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6696 conf->stripe_size, new);
6697
6698 if (mddev->sync_thread ||
6699 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6700 mddev->reshape_position != MaxSector ||
6701 mddev->sysfs_active) {
6702 err = -EBUSY;
6703 goto out_unlock;
6704 }
6705
6706 mddev_suspend(mddev);
6707 mutex_lock(&conf->cache_size_mutex);
6708 size = conf->max_nr_stripes;
6709
6710 shrink_stripes(conf);
6711
6712 conf->stripe_size = new;
6713 conf->stripe_shift = ilog2(new) - 9;
6714 conf->stripe_sectors = new >> 9;
6715 if (grow_stripes(conf, size)) {
6716 pr_warn("md/raid:%s: couldn't allocate buffers\n",
6717 mdname(mddev));
6718 err = -ENOMEM;
6719 }
6720 mutex_unlock(&conf->cache_size_mutex);
6721 mddev_resume(mddev);
6722
6723out_unlock:
6724 mddev_unlock(mddev);
6725 return err ?: len;
6726}
6727
6728static struct md_sysfs_entry
6729raid5_stripe_size = __ATTR(stripe_size, 0644,
6730 raid5_show_stripe_size,
6731 raid5_store_stripe_size);
6732#else
6733static struct md_sysfs_entry
6734raid5_stripe_size = __ATTR(stripe_size, 0444,
6735 raid5_show_stripe_size,
6736 NULL);
6737#endif
6738
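/*
 * sysfs md/preread_bypass_threshold: how many times a stripe needing
 * pre-reading may be bypassed in favour of fuller stripes; must not
 * exceed stripe_cache_size.
 */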
6739static ssize_t
6740raid5_show_preread_threshold(struct mddev *mddev, char *page)
6741{
6742 struct r5conf *conf;
6743 int ret = 0;
6744 spin_lock(&mddev->lock);
6745 conf = mddev->private;
6746 if (conf)
6747 ret = sprintf(page, "%d\n", conf->bypass_threshold);
6748 spin_unlock(&mddev->lock);
6749 return ret;
6750}
6751
6752static ssize_t
6753raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6754{
6755 struct r5conf *conf;
6756 unsigned long new;
6757 int err;
6758
6759 if (len >= PAGE_SIZE)
6760 return -EINVAL;
6761 if (kstrtoul(page, 10, &new))
6762 return -EINVAL;
6763
6764 err = mddev_lock(mddev);
6765 if (err)
6766 return err;
6767 conf = mddev->private;
6768 if (!conf)
6769 err = -ENODEV;
6770 else if (new > conf->min_nr_stripes)
6771 err = -EINVAL;
6772 else
6773 conf->bypass_threshold = new;
6774 mddev_unlock(mddev);
6775 return err ?: len;
6776}
6777
6778static struct md_sysfs_entry
6779raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6780 S_IRUGO | S_IWUSR,
6781 raid5_show_preread_threshold,
6782 raid5_store_preread_threshold);
6783
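/*
 * sysfs md/skip_copy: when set, write bio pages are used directly
 * instead of being copied into the stripe cache, so the queue is
 * flagged for stable writes (pages must not change while under I/O).
 */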
6784static ssize_t
6785raid5_show_skip_copy(struct mddev *mddev, char *page)
6786{
6787 struct r5conf *conf;
6788 int ret = 0;
6789 spin_lock(&mddev->lock);
6790 conf = mddev->private;
6791 if (conf)
6792 ret = sprintf(page, "%d\n", conf->skip_copy);
6793 spin_unlock(&mddev->lock);
6794 return ret;
6795}
6796
6797static ssize_t
6798raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6799{
6800 struct r5conf *conf;
6801 unsigned long new;
6802 int err;
6803
6804 if (len >= PAGE_SIZE)
6805 return -EINVAL;
6806 if (kstrtoul(page, 10, &new))
6807 return -EINVAL;
6808 new = !!new;
6809
6810 err = mddev_lock(mddev);
6811 if (err)
6812 return err;
6813 conf = mddev->private;
6814 if (!conf)
6815 err = -ENODEV;
6816 else if (new != conf->skip_copy) {
6817 struct request_queue *q = mddev->queue;
6818
6819 mddev_suspend(mddev);
6820 conf->skip_copy = new;
6821 if (new)
6822 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
6823 else
6824 blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
6825 mddev_resume(mddev);
6826 }
6827 mddev_unlock(mddev);
6828 return err ?: len;
6829}
6830
6831static struct md_sysfs_entry
6832raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6833 raid5_show_skip_copy,
6834 raid5_store_skip_copy);
6835
6836static ssize_t
6837stripe_cache_active_show(struct mddev *mddev, char *page)
6838{
6839 struct r5conf *conf = mddev->private;
6840 if (conf)
6841 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6842 else
6843 return 0;
6844}
6845
6846static struct md_sysfs_entry
6847raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6848
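/*
 * sysfs md/group_thread_cnt: number of auxiliary raid5 worker threads
 * per NUMA-node group (0 disables them, at most 8192).  For example
 * (illustrative array name):
 *   echo 4 > /sys/block/md0/md/group_thread_cnt
 */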
6849static ssize_t
6850raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6851{
6852 struct r5conf *conf;
6853 int ret = 0;
6854 spin_lock(&mddev->lock);
6855 conf = mddev->private;
6856 if (conf)
6857 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6858 spin_unlock(&mddev->lock);
6859 return ret;
6860}
6861
6862static int alloc_thread_groups(struct r5conf *conf, int cnt,
6863 int *group_cnt,
6864 struct r5worker_group **worker_groups);
6865static ssize_t
6866raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6867{
6868 struct r5conf *conf;
6869 unsigned int new;
6870 int err;
6871 struct r5worker_group *new_groups, *old_groups;
6872 int group_cnt;
6873
6874 if (len >= PAGE_SIZE)
6875 return -EINVAL;
6876 if (kstrtouint(page, 10, &new))
6877 return -EINVAL;
6878
6879 if (new > 8192)
6880 return -EINVAL;
6881
6882 err = mddev_lock(mddev);
6883 if (err)
6884 return err;
6885 conf = mddev->private;
6886 if (!conf)
6887 err = -ENODEV;
6888 else if (new != conf->worker_cnt_per_group) {
6889 mddev_suspend(mddev);
6890
6891 old_groups = conf->worker_groups;
6892 if (old_groups)
6893 flush_workqueue(raid5_wq);
6894
6895 err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
6896 if (!err) {
6897 spin_lock_irq(&conf->device_lock);
6898 conf->group_cnt = group_cnt;
6899 conf->worker_cnt_per_group = new;
6900 conf->worker_groups = new_groups;
6901 spin_unlock_irq(&conf->device_lock);
6902
6903 if (old_groups)
6904 kfree(old_groups[0].workers);
6905 kfree(old_groups);
6906 }
6907 mddev_resume(mddev);
6908 }
6909 mddev_unlock(mddev);
6910
6911 return err ?: len;
6912}
6913
6914static struct md_sysfs_entry
6915raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6916 raid5_show_group_thread_cnt,
6917 raid5_store_group_thread_cnt);
6918
6919static struct attribute *raid5_attrs[] = {
6920 &raid5_stripecache_size.attr,
6921 &raid5_stripecache_active.attr,
6922 &raid5_preread_bypass_threshold.attr,
6923 &raid5_group_thread_cnt.attr,
6924 &raid5_skip_copy.attr,
6925 &raid5_rmw_level.attr,
6926 &raid5_stripe_size.attr,
6927 &r5c_journal_mode.attr,
6928 &ppl_write_hint.attr,
6929 NULL,
6930};
6931static struct attribute_group raid5_attrs_group = {
6932 .name = NULL,
6933 .attrs = raid5_attrs,
6934};
6935
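/*
 * Allocate one r5worker_group per possible NUMA node, each holding
 * 'cnt' workers.  cnt == 0 means no auxiliary worker threads, leaving
 * all stripe handling to raid5d.
 */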
6936static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
6937 struct r5worker_group **worker_groups)
6938{
6939 int i, j, k;
6940 ssize_t size;
6941 struct r5worker *workers;
6942
6943 if (cnt == 0) {
6944 *group_cnt = 0;
6945 *worker_groups = NULL;
6946 return 0;
6947 }
6948 *group_cnt = num_possible_nodes();
6949 size = sizeof(struct r5worker) * cnt;
6950 workers = kcalloc(size, *group_cnt, GFP_NOIO);
6951 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
6952 GFP_NOIO);
6953 if (!*worker_groups || !workers) {
6954 kfree(workers);
6955 kfree(*worker_groups);
6956 return -ENOMEM;
6957 }
6958
6959 for (i = 0; i < *group_cnt; i++) {
6960 struct r5worker_group *group;
6961
6962 group = &(*worker_groups)[i];
6963 INIT_LIST_HEAD(&group->handle_list);
6964 INIT_LIST_HEAD(&group->loprio_list);
6965 group->conf = conf;
6966 group->workers = workers + i * cnt;
6967
6968 for (j = 0; j < cnt; j++) {
6969 struct r5worker *worker = group->workers + j;
6970 worker->group = group;
6971 INIT_WORK(&worker->work, raid5_do_work);
6972
6973 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6974 INIT_LIST_HEAD(worker->temp_inactive_list + k);
6975 }
6976 }
6977
6978 return 0;
6979}
6980
6981static void free_thread_groups(struct r5conf *conf)
6982{
6983 if (conf->worker_groups)
6984 kfree(conf->worker_groups[0].workers);
6985 kfree(conf->worker_groups);
6986 conf->worker_groups = NULL;
6987}
6988
6989static sector_t
6990raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
6991{
6992 struct r5conf *conf = mddev->private;
6993
6994 if (!sectors)
6995 sectors = mddev->dev_sectors;
6996 if (!raid_disks)
6997
6998 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
6999
7000 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7001 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
7002 return sectors * (raid_disks - conf->max_degraded);
7003}
7004
7005static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7006{
7007 safe_put_page(percpu->spare_page);
7008 percpu->spare_page = NULL;
7009 kvfree(percpu->scribble);
7010 percpu->scribble = NULL;
7011}
7012
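/*
 * Per-cpu scratch state: a spare page (RAID6 only) and a scribble
 * buffer sized for the larger of the old and new geometry, used by the
 * xor/syndrome computations.
 */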
7013static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7014{
7015 if (conf->level == 6 && !percpu->spare_page) {
7016 percpu->spare_page = alloc_page(GFP_KERNEL);
7017 if (!percpu->spare_page)
7018 return -ENOMEM;
7019 }
7020
7021 if (scribble_alloc(percpu,
7022 max(conf->raid_disks,
7023 conf->previous_raid_disks),
7024 max(conf->chunk_sectors,
7025 conf->prev_chunk_sectors)
7026 / RAID5_STRIPE_SECTORS(conf))) {
7027 free_scratch_buffer(conf, percpu);
7028 return -ENOMEM;
7029 }
7030
7031 return 0;
7032}
7033
7034static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7035{
7036 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7037
7038 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7039 return 0;
7040}
7041
7042static void raid5_free_percpu(struct r5conf *conf)
7043{
7044 if (!conf->percpu)
7045 return;
7046
7047 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7048 free_percpu(conf->percpu);
7049}
7050
7051static void free_conf(struct r5conf *conf)
7052{
7053 int i;
7054
7055 log_exit(conf);
7056
7057 unregister_shrinker(&conf->shrinker);
7058 free_thread_groups(conf);
7059 shrink_stripes(conf);
7060 raid5_free_percpu(conf);
7061 for (i = 0; i < conf->pool_size; i++)
7062 if (conf->disks[i].extra_page)
7063 put_page(conf->disks[i].extra_page);
7064 kfree(conf->disks);
7065 bioset_exit(&conf->bio_split);
7066 kfree(conf->stripe_hashtbl);
7067 kfree(conf->pending_data);
7068 kfree(conf);
7069}
7070
7071static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7072{
7073 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7074 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7075
7076 if (alloc_scratch_buffer(conf, percpu)) {
7077 pr_warn("%s: failed memory allocation for cpu%u\n",
7078 __func__, cpu);
7079 return -ENOMEM;
7080 }
7081 return 0;
7082}
7083
7084static int raid5_alloc_percpu(struct r5conf *conf)
7085{
7086 int err = 0;
7087
7088 conf->percpu = alloc_percpu(struct raid5_percpu);
7089 if (!conf->percpu)
7090 return -ENOMEM;
7091
7092 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7093 if (!err) {
7094 conf->scribble_disks = max(conf->raid_disks,
7095 conf->previous_raid_disks);
7096 conf->scribble_sectors = max(conf->chunk_sectors,
7097 conf->prev_chunk_sectors);
7098 }
7099 return err;
7100}
7101
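/*
 * Shrinker callbacks: under memory pressure, drop stripes until the
 * cache is back down to min_nr_stripes.  SHRINK_STOP is returned when
 * the cache mutex is contended or nothing more can be freed.
 */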
7102static unsigned long raid5_cache_scan(struct shrinker *shrink,
7103 struct shrink_control *sc)
7104{
7105 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7106 unsigned long ret = SHRINK_STOP;
7107
7108 if (mutex_trylock(&conf->cache_size_mutex)) {
ret = 0;
7110 while (ret < sc->nr_to_scan &&
7111 conf->max_nr_stripes > conf->min_nr_stripes) {
7112 if (drop_one_stripe(conf) == 0) {
7113 ret = SHRINK_STOP;
7114 break;
7115 }
7116 ret++;
7117 }
7118 mutex_unlock(&conf->cache_size_mutex);
7119 }
7120 return ret;
7121}
7122
7123static unsigned long raid5_cache_count(struct shrinker *shrink,
7124 struct shrink_control *sc)
7125{
7126 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7127
7128 if (conf->max_nr_stripes < conf->min_nr_stripes)
7129
7130 return 0;
7131 return conf->max_nr_stripes - conf->min_nr_stripes;
7132}
7133
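/*
 * Build the r5conf for this array: validate level/layout/chunk size,
 * allocate the stripe hash table, per-cpu buffers, worker groups and
 * the initial stripe cache, record the (possibly mid-reshape)
 * geometry, register the cache shrinker and create the raid5d thread.
 */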
7134static struct r5conf *setup_conf(struct mddev *mddev)
7135{
7136 struct r5conf *conf;
7137 int raid_disk, memory, max_disks;
7138 struct md_rdev *rdev;
7139 struct disk_info *disk;
7140 char pers_name[6];
7141 int i;
7142 int group_cnt;
7143 struct r5worker_group *new_group;
7144 int ret;
7145
7146 if (mddev->new_level != 5
7147 && mddev->new_level != 4
7148 && mddev->new_level != 6) {
7149 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7150 mdname(mddev), mddev->new_level);
7151 return ERR_PTR(-EIO);
7152 }
7153 if ((mddev->new_level == 5
7154 && !algorithm_valid_raid5(mddev->new_layout)) ||
7155 (mddev->new_level == 6
7156 && !algorithm_valid_raid6(mddev->new_layout))) {
7157 pr_warn("md/raid:%s: layout %d not supported\n",
7158 mdname(mddev), mddev->new_layout);
7159 return ERR_PTR(-EIO);
7160 }
7161 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
7162 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7163 mdname(mddev), mddev->raid_disks);
7164 return ERR_PTR(-EINVAL);
7165 }
7166
7167 if (!mddev->new_chunk_sectors ||
7168 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
7169 !is_power_of_2(mddev->new_chunk_sectors)) {
7170 pr_warn("md/raid:%s: invalid chunk size %d\n",
7171 mdname(mddev), mddev->new_chunk_sectors << 9);
7172 return ERR_PTR(-EINVAL);
7173 }
7174
7175 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
7176 if (conf == NULL)
7177 goto abort;
7178
7179#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7180 conf->stripe_size = DEFAULT_STRIPE_SIZE;
7181 conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7182 conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7183#endif
7184 INIT_LIST_HEAD(&conf->free_list);
7185 INIT_LIST_HEAD(&conf->pending_list);
7186 conf->pending_data = kcalloc(PENDING_IO_MAX,
7187 sizeof(struct r5pending_data),
7188 GFP_KERNEL);
7189 if (!conf->pending_data)
7190 goto abort;
7191 for (i = 0; i < PENDING_IO_MAX; i++)
7192 list_add(&conf->pending_data[i].sibling, &conf->free_list);
7193
7194 if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
7195 conf->group_cnt = group_cnt;
7196 conf->worker_cnt_per_group = 0;
7197 conf->worker_groups = new_group;
7198 } else
7199 goto abort;
7200 spin_lock_init(&conf->device_lock);
7201 seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7202 mutex_init(&conf->cache_size_mutex);
7203 init_waitqueue_head(&conf->wait_for_quiescent);
7204 init_waitqueue_head(&conf->wait_for_stripe);
7205 init_waitqueue_head(&conf->wait_for_overlap);
7206 INIT_LIST_HEAD(&conf->handle_list);
7207 INIT_LIST_HEAD(&conf->loprio_list);
7208 INIT_LIST_HEAD(&conf->hold_list);
7209 INIT_LIST_HEAD(&conf->delayed_list);
7210 INIT_LIST_HEAD(&conf->bitmap_list);
7211 init_llist_head(&conf->released_stripes);
7212 atomic_set(&conf->active_stripes, 0);
7213 atomic_set(&conf->preread_active_stripes, 0);
7214 atomic_set(&conf->active_aligned_reads, 0);
7215 spin_lock_init(&conf->pending_bios_lock);
7216 conf->batch_bio_dispatch = true;
7217 rdev_for_each(rdev, mddev) {
7218 if (test_bit(Journal, &rdev->flags))
7219 continue;
7220 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
7221 conf->batch_bio_dispatch = false;
7222 break;
7223 }
7224 }
7225
7226 conf->bypass_threshold = BYPASS_THRESHOLD;
7227 conf->recovery_disabled = mddev->recovery_disabled - 1;
7228
7229 conf->raid_disks = mddev->raid_disks;
7230 if (mddev->reshape_position == MaxSector)
7231 conf->previous_raid_disks = mddev->raid_disks;
7232 else
7233 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7234 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7235
7236 conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
7237 GFP_KERNEL);
7238
7239 if (!conf->disks)
7240 goto abort;
7241
7242 for (i = 0; i < max_disks; i++) {
7243 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7244 if (!conf->disks[i].extra_page)
7245 goto abort;
7246 }
7247
7248 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
7249 if (ret)
7250 goto abort;
7251 conf->mddev = mddev;
7252
7253 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
7254 goto abort;
7255
7256
7257
7258
7259
7260
7261 spin_lock_init(conf->hash_locks);
7262 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
7263 spin_lock_init(conf->hash_locks + i);
7264
7265 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7266 INIT_LIST_HEAD(conf->inactive_list + i);
7267
7268 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7269 INIT_LIST_HEAD(conf->temp_inactive_list + i);
7270
7271 atomic_set(&conf->r5c_cached_full_stripes, 0);
7272 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7273 atomic_set(&conf->r5c_cached_partial_stripes, 0);
7274 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7275 atomic_set(&conf->r5c_flushing_full_stripes, 0);
7276 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7277
7278 conf->level = mddev->new_level;
7279 conf->chunk_sectors = mddev->new_chunk_sectors;
7280 if (raid5_alloc_percpu(conf) != 0)
7281 goto abort;
7282
7283 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7284
7285 rdev_for_each(rdev, mddev) {
7286 raid_disk = rdev->raid_disk;
7287 if (raid_disk >= max_disks
7288 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7289 continue;
7290 disk = conf->disks + raid_disk;
7291
7292 if (test_bit(Replacement, &rdev->flags)) {
7293 if (disk->replacement)
7294 goto abort;
7295 disk->replacement = rdev;
7296 } else {
7297 if (disk->rdev)
7298 goto abort;
7299 disk->rdev = rdev;
7300 }
7301
7302 if (test_bit(In_sync, &rdev->flags)) {
7303 char b[BDEVNAME_SIZE];
7304 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
7305 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
7306 } else if (rdev->saved_raid_disk != raid_disk)
7307
7308 conf->fullsync = 1;
7309 }
7310
7311 conf->level = mddev->new_level;
7312 if (conf->level == 6) {
7313 conf->max_degraded = 2;
7314 if (raid6_call.xor_syndrome)
7315 conf->rmw_level = PARITY_ENABLE_RMW;
7316 else
7317 conf->rmw_level = PARITY_DISABLE_RMW;
7318 } else {
7319 conf->max_degraded = 1;
7320 conf->rmw_level = PARITY_ENABLE_RMW;
7321 }
7322 conf->algorithm = mddev->new_layout;
7323 conf->reshape_progress = mddev->reshape_position;
7324 if (conf->reshape_progress != MaxSector) {
7325 conf->prev_chunk_sectors = mddev->chunk_sectors;
7326 conf->prev_algo = mddev->layout;
7327 } else {
7328 conf->prev_chunk_sectors = conf->chunk_sectors;
7329 conf->prev_algo = conf->algorithm;
7330 }
7331
7332 conf->min_nr_stripes = NR_STRIPES;
7333 if (mddev->reshape_position != MaxSector) {
7334 int stripes = max_t(int,
7335 ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7336 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
7337 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7338 if (conf->min_nr_stripes != NR_STRIPES)
7339 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7340 mdname(mddev), conf->min_nr_stripes);
7341 }
7342 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7343 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7344 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7345 if (grow_stripes(conf, conf->min_nr_stripes)) {
7346 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7347 mdname(mddev), memory);
7348 goto abort;
7349 } else
7350 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7351
7352
7353
7354
7355
7356 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7357 conf->shrinker.scan_objects = raid5_cache_scan;
7358 conf->shrinker.count_objects = raid5_cache_count;
7359 conf->shrinker.batch = 128;
7360 conf->shrinker.flags = 0;
7361 if (register_shrinker(&conf->shrinker)) {
7362 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7363 mdname(mddev));
7364 goto abort;
7365 }
7366
7367 sprintf(pers_name, "raid%d", mddev->new_level);
7368 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7369 if (!conf->thread) {
7370 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7371 mdname(mddev));
7372 goto abort;
7373 }
7374
7375 return conf;
7376
7377 abort:
7378 if (conf) {
7379 free_conf(conf);
7380 return ERR_PTR(-EIO);
7381 } else
7382 return ERR_PTR(-ENOMEM);
7383}
7384
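/*
 * Return 1 if, for the given layout, the device in slot 'raid_disk'
 * only ever holds parity blocks.
 */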
7385static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7386{
7387 switch (algo) {
7388 case ALGORITHM_PARITY_0:
7389 if (raid_disk < max_degraded)
7390 return 1;
7391 break;
7392 case ALGORITHM_PARITY_N:
7393 if (raid_disk >= raid_disks - max_degraded)
7394 return 1;
7395 break;
7396 case ALGORITHM_PARITY_0_6:
7397 if (raid_disk == 0 ||
7398 raid_disk == raid_disks - 1)
7399 return 1;
7400 break;
7401 case ALGORITHM_LEFT_ASYMMETRIC_6:
7402 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7403 case ALGORITHM_LEFT_SYMMETRIC_6:
7404 case ALGORITHM_RIGHT_SYMMETRIC_6:
7405 if (raid_disk == raid_disks - 1)
7406 return 1;
7407 }
7408 return 0;
7409}
7410
7411static void raid5_set_io_opt(struct r5conf *conf)
7412{
7413 blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7414 (conf->raid_disks - conf->max_degraded));
7415}
7416
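/*
 * Start the array: sanity-check any in-progress reshape, set up (or
 * reuse) the conf, count working and dirty-parity devices, refuse
 * unsafe dirty/degraded starts, configure the request queue limits and
 * initialise the journal/PPL log if one is configured.
 */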
7417static int raid5_run(struct mddev *mddev)
7418{
7419 struct r5conf *conf;
7420 int working_disks = 0;
7421 int dirty_parity_disks = 0;
7422 struct md_rdev *rdev;
7423 struct md_rdev *journal_dev = NULL;
7424 sector_t reshape_offset = 0;
7425 int i;
7426 long long min_offset_diff = 0;
7427 int first = 1;
7428
7429 if (mddev_init_writes_pending(mddev) < 0)
7430 return -ENOMEM;
7431
7432 if (mddev->recovery_cp != MaxSector)
7433 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7434 mdname(mddev));
7435
7436 rdev_for_each(rdev, mddev) {
7437 long long diff;
7438
7439 if (test_bit(Journal, &rdev->flags)) {
7440 journal_dev = rdev;
7441 continue;
7442 }
7443 if (rdev->raid_disk < 0)
7444 continue;
7445 diff = (rdev->new_data_offset - rdev->data_offset);
7446 if (first) {
7447 min_offset_diff = diff;
7448 first = 0;
7449 } else if (mddev->reshape_backwards &&
7450 diff < min_offset_diff)
7451 min_offset_diff = diff;
7452 else if (!mddev->reshape_backwards &&
7453 diff > min_offset_diff)
7454 min_offset_diff = diff;
7455 }
7456
7457 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7458 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7459 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7460 mdname(mddev));
7461 return -EINVAL;
7462 }
7463
7464 if (mddev->reshape_position != MaxSector) {
/*
 * Check that we can continue the reshape.
 * Difficulty arises if the stripe we would write to
 * next is at or after the stripe we would read from next.
 * For a reshape that changes the number of devices, this
 * is only possible for a very short time, and mdadm makes
 * sure that time appears to have passed before assembling
 * the array.  So we fail if that time hasn't passed.
 * For a reshape that keeps the number of devices the same
 * there is no difficulty.
 */
7477 sector_t here_new, here_old;
7478 int old_disks;
7479 int max_degraded = (mddev->level == 6 ? 2 : 1);
7480 int chunk_sectors;
7481 int new_data_disks;
7482
7483 if (journal_dev) {
7484 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7485 mdname(mddev));
7486 return -EINVAL;
7487 }
7488
7489 if (mddev->new_level != mddev->level) {
7490 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7491 mdname(mddev));
7492 return -EINVAL;
7493 }
7494 old_disks = mddev->raid_disks - mddev->delta_disks;
/*
 * reshape_position must be on a new-stripe boundary, and one
 * further up in the new geometry must map after here in the
 * old geometry.
 * If the chunk sizes are different, then as we perform reshape
 * in units of the largest of the two, reshape_position needs
 * to be on a chunk boundary of both the old and new
 * geometries, so we use the larger of these.
 */
7502 here_new = mddev->reshape_position;
7503 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7504 new_data_disks = mddev->raid_disks - max_degraded;
7505 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7506 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7507 mdname(mddev));
7508 return -EINVAL;
7509 }
7510 reshape_offset = here_new * chunk_sectors;
7511
7512 here_old = mddev->reshape_position;
7513 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
/* here_old is the first stripe that we might need to read from */
7516 if (mddev->delta_disks == 0) {
/*
 * We cannot be sure it is safe to start an in-place
 * reshape.  It is only safe if user-space is monitoring
 * and taking constant backups; mdadm starts such an array
 * read-only and only makes it writable once the backup is
 * in place.  The exception, checked below, is when the data
 * offsets have moved by at least a full chunk in both the
 * old and the new geometry.
 */
7524 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7525 abs(min_offset_diff) >= mddev->new_chunk_sectors)
7526 ;
7527 else if (mddev->ro == 0) {
7528 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7529 mdname(mddev));
7530 return -EINVAL;
7531 }
7532 } else if (mddev->reshape_backwards
7533 ? (here_new * chunk_sectors + min_offset_diff <=
7534 here_old * chunk_sectors)
7535 : (here_new * chunk_sectors >=
7536 here_old * chunk_sectors + (-min_offset_diff))) {
7537
7538 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7539 mdname(mddev));
7540 return -EINVAL;
7541 }
7542 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7543
7544 } else {
7545 BUG_ON(mddev->level != mddev->new_level);
7546 BUG_ON(mddev->layout != mddev->new_layout);
7547 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7548 BUG_ON(mddev->delta_disks != 0);
7549 }
7550
7551 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7552 test_bit(MD_HAS_PPL, &mddev->flags)) {
7553 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7554 mdname(mddev));
7555 clear_bit(MD_HAS_PPL, &mddev->flags);
7556 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7557 }
7558
7559 if (mddev->private == NULL)
7560 conf = setup_conf(mddev);
7561 else
7562 conf = mddev->private;
7563
7564 if (IS_ERR(conf))
7565 return PTR_ERR(conf);
7566
7567 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7568 if (!journal_dev) {
7569 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7570 mdname(mddev));
7571 mddev->ro = 1;
7572 set_disk_ro(mddev->gendisk, 1);
7573 } else if (mddev->recovery_cp == MaxSector)
7574 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7575 }
7576
7577 conf->min_offset_diff = min_offset_diff;
7578 mddev->thread = conf->thread;
7579 conf->thread = NULL;
7580 mddev->private = conf;
7581
7582 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7583 i++) {
7584 rdev = conf->disks[i].rdev;
7585 if (!rdev && conf->disks[i].replacement) {
7586
7587 rdev = conf->disks[i].replacement;
7588 conf->disks[i].replacement = NULL;
7589 clear_bit(Replacement, &rdev->flags);
7590 conf->disks[i].rdev = rdev;
7591 }
7592 if (!rdev)
7593 continue;
7594 if (conf->disks[i].replacement &&
7595 conf->reshape_progress != MaxSector) {
7596
7597 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7598 goto abort;
7599 }
7600 if (test_bit(In_sync, &rdev->flags)) {
7601 working_disks++;
7602 continue;
7603 }
/*
 * This device is not fully in-sync.  If it only ever held
 * parity (see only_parity() below) the stale region cannot
 * hide user data, so it is counted in dirty_parity_disks and
 * tolerated when starting a dirty array.
 * 0.90-style metadata kept no per-device recovery_offset, so
 * such devices are treated as recovered up to the reshape
 * point.
 */
7613 if (mddev->major_version == 0 &&
7614 mddev->minor_version > 90)
7615 rdev->recovery_offset = reshape_offset;
7616
7617 if (rdev->recovery_offset < reshape_offset) {
7618
7619 if (!only_parity(rdev->raid_disk,
7620 conf->algorithm,
7621 conf->raid_disks,
7622 conf->max_degraded))
7623 continue;
7624 }
7625 if (!only_parity(rdev->raid_disk,
7626 conf->prev_algo,
7627 conf->previous_raid_disks,
7628 conf->max_degraded))
7629 continue;
7630 dirty_parity_disks++;
7631 }

/*
 * 0 for a fully functional array, 1 or 2 for a degraded array.
 */
7636 mddev->degraded = raid5_calc_degraded(conf);
7637
7638 if (has_failed(conf)) {
7639 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7640 mdname(mddev), mddev->degraded, conf->raid_disks);
7641 goto abort;
7642 }
7643
7644
7645 mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
7646 mddev->resync_max_sectors = mddev->dev_sectors;
7647
7648 if (mddev->degraded > dirty_parity_disks &&
7649 mddev->recovery_cp != MaxSector) {
7650 if (test_bit(MD_HAS_PPL, &mddev->flags))
7651 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7652 mdname(mddev));
7653 else if (mddev->ok_start_degraded)
7654 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7655 mdname(mddev));
7656 else {
7657 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7658 mdname(mddev));
7659 goto abort;
7660 }
7661 }
7662
7663 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7664 mdname(mddev), conf->level,
7665 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7666 mddev->new_layout);
7667
7668 print_raid5_conf(conf);
7669
7670 if (conf->reshape_progress != MaxSector) {
7671 conf->reshape_safe = conf->reshape_progress;
7672 atomic_set(&conf->reshape_stripes, 0);
7673 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7674 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7675 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7676 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7677 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7678 "reshape");
7679 if (!mddev->sync_thread)
7680 goto abort;
7681 }
7682
7683
7684 if (mddev->to_remove == &raid5_attrs_group)
7685 mddev->to_remove = NULL;
7686 else if (mddev->kobj.sd &&
7687 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7688 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7689 mdname(mddev));
7690 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7691
7692 if (mddev->queue) {
7693 int chunk_size;
7694
7695
7696
7697
7698 int data_disks = conf->previous_raid_disks - conf->max_degraded;
7699 int stripe = data_disks *
7700 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7701
7702 chunk_size = mddev->chunk_sectors << 9;
7703 blk_queue_io_min(mddev->queue, chunk_size);
7704 raid5_set_io_opt(conf);
7705 mddev->queue->limits.raid_partial_stripes_expensive = 1;
7706
7707
7708
7709
7710 stripe = stripe * PAGE_SIZE;
7711
7712
7713 while ((stripe-1) & stripe)
7714 stripe = (stripe | (stripe-1)) + 1;
7715 mddev->queue->limits.discard_alignment = stripe;
7716 mddev->queue->limits.discard_granularity = stripe;
7717
7718 blk_queue_max_write_same_sectors(mddev->queue, 0);
7719 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7720
7721 rdev_for_each(rdev, mddev) {
7722 disk_stack_limits(mddev->gendisk, rdev->bdev,
7723 rdev->data_offset << 9);
7724 disk_stack_limits(mddev->gendisk, rdev->bdev,
7725 rdev->new_data_offset << 9);
7726 }
7727
/*
 * A discarded stripe is only safe if every device in the
 * array reliably returns zeroes for discarded regions;
 * otherwise parity can stop matching the data, and a later
 * partial-stripe write plus a device failure can lose data.
 * So only enable REQ_OP_DISCARD when the administrator has
 * asserted this via the devices_handle_discard_safely module
 * parameter, and when discards are large and aligned enough
 * to cover whole stripes.
 */
7743 if (devices_handle_discard_safely &&
7744 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7745 mddev->queue->limits.discard_granularity >= stripe)
7746 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
7747 mddev->queue);
7748 else
7749 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
7750 mddev->queue);
7751
7752 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7753 }
7754
7755 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7756 goto abort;
7757
7758 return 0;
7759abort:
7760 md_unregister_thread(&mddev->thread);
7761 print_raid5_conf(conf);
7762 free_conf(conf);
7763 mddev->private = NULL;
7764 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7765 return -EIO;
7766}
7767
7768static void raid5_free(struct mddev *mddev, void *priv)
7769{
7770 struct r5conf *conf = priv;
7771
7772 free_conf(conf);
7773 mddev->to_remove = &raid5_attrs_group;
7774}
7775
7776static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7777{
7778 struct r5conf *conf = mddev->private;
7779 int i;
7780
7781 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7782 conf->chunk_sectors / 2, mddev->layout);
7783 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7784 rcu_read_lock();
7785 for (i = 0; i < conf->raid_disks; i++) {
7786 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
7787 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7788 }
7789 rcu_read_unlock();
7790 seq_printf (seq, "]");
7791}
7792
7793static void print_raid5_conf (struct r5conf *conf)
7794{
7795 int i;
7796 struct disk_info *tmp;
7797
7798 pr_debug("RAID conf printout:\n");
7799 if (!conf) {
7800 pr_debug("(conf==NULL)\n");
7801 return;
7802 }
7803 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7804 conf->raid_disks,
7805 conf->raid_disks - conf->mddev->degraded);
7806
7807 for (i = 0; i < conf->raid_disks; i++) {
7808 char b[BDEVNAME_SIZE];
7809 tmp = conf->disks + i;
7810 if (tmp->rdev)
7811 pr_debug(" disk %d, o:%d, dev:%s\n",
7812 i, !test_bit(Faulty, &tmp->rdev->flags),
7813 bdevname(tmp->rdev->bdev, b));
7814 }
7815}
7816
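/*
 * Promote spares and replacements that have finished recovery to
 * In_sync, recompute ->degraded, and return how many devices were
 * newly activated.
 */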
7817static int raid5_spare_active(struct mddev *mddev)
7818{
7819 int i;
7820 struct r5conf *conf = mddev->private;
7821 struct disk_info *tmp;
7822 int count = 0;
7823 unsigned long flags;
7824
7825 for (i = 0; i < conf->raid_disks; i++) {
7826 tmp = conf->disks + i;
7827 if (tmp->replacement
7828 && tmp->replacement->recovery_offset == MaxSector
7829 && !test_bit(Faulty, &tmp->replacement->flags)
7830 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
7831
7832 if (!tmp->rdev
7833 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7834 count++;
7835 if (tmp->rdev) {
7836
7837
7838
7839
7840 set_bit(Faulty, &tmp->rdev->flags);
7841 sysfs_notify_dirent_safe(
7842 tmp->rdev->sysfs_state);
7843 }
7844 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7845 } else if (tmp->rdev
7846 && tmp->rdev->recovery_offset == MaxSector
7847 && !test_bit(Faulty, &tmp->rdev->flags)
7848 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7849 count++;
7850 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7851 }
7852 }
7853 spin_lock_irqsave(&conf->device_lock, flags);
7854 mddev->degraded = raid5_calc_degraded(conf);
7855 spin_unlock_irqrestore(&conf->device_lock, flags);
7856 print_raid5_conf(conf);
7857 return count;
7858}
7859
7860static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7861{
7862 struct r5conf *conf = mddev->private;
7863 int err = 0;
7864 int number = rdev->raid_disk;
7865 struct md_rdev **rdevp;
7866 struct disk_info *p = conf->disks + number;
7867
7868 print_raid5_conf(conf);
7869 if (test_bit(Journal, &rdev->flags) && conf->log) {
7870
7871
7872
7873
7874
7875
7876 if (atomic_read(&conf->active_stripes) ||
7877 atomic_read(&conf->r5c_cached_full_stripes) ||
7878 atomic_read(&conf->r5c_cached_partial_stripes)) {
7879 return -EBUSY;
7880 }
7881 log_exit(conf);
7882 return 0;
7883 }
7884 if (rdev == p->rdev)
7885 rdevp = &p->rdev;
7886 else if (rdev == p->replacement)
7887 rdevp = &p->replacement;
7888 else
7889 return 0;
7890
7891 if (number >= conf->raid_disks &&
7892 conf->reshape_progress == MaxSector)
7893 clear_bit(In_sync, &rdev->flags);
7894
7895 if (test_bit(In_sync, &rdev->flags) ||
7896 atomic_read(&rdev->nr_pending)) {
7897 err = -EBUSY;
7898 goto abort;
7899 }
7900
/*
 * Only remove non-faulty devices if recovery is not possible.
 */
7903 if (!test_bit(Faulty, &rdev->flags) &&
7904 mddev->recovery_disabled != conf->recovery_disabled &&
7905 !has_failed(conf) &&
7906 (!p->replacement || p->replacement == rdev) &&
7907 number < conf->raid_disks) {
7908 err = -EBUSY;
7909 goto abort;
7910 }
7911 *rdevp = NULL;
7912 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7913 synchronize_rcu();
7914 if (atomic_read(&rdev->nr_pending)) {
7915
7916 err = -EBUSY;
7917 *rdevp = rdev;
7918 }
7919 }
7920 if (!err) {
7921 err = log_modify(conf, rdev, false);
7922 if (err)
7923 goto abort;
7924 }
7925 if (p->replacement) {
7926
7927 p->rdev = p->replacement;
7928 clear_bit(Replacement, &p->replacement->flags);
7929 smp_mb();
7930
7931
7932 p->replacement = NULL;
7933
7934 if (!err)
7935 err = log_modify(conf, p->rdev, true);
7936 }
7937
7938 clear_bit(WantReplacement, &rdev->flags);
7939abort:
7940
7941 print_raid5_conf(conf);
7942 return err;
7943}
7944
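/*
 * Hot-add a device: either attach it as the journal, place it in an
 * empty slot (preferring its previous slot), or make it the
 * replacement for a device that has requested one.
 */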
7945static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7946{
7947 struct r5conf *conf = mddev->private;
7948 int ret, err = -EEXIST;
7949 int disk;
7950 struct disk_info *p;
7951 int first = 0;
7952 int last = conf->raid_disks - 1;
7953
7954 if (test_bit(Journal, &rdev->flags)) {
7955 if (conf->log)
7956 return -EBUSY;
7957
7958 rdev->raid_disk = 0;
7959
7960
7961
7962
7963 ret = log_init(conf, rdev, false);
7964 if (ret)
7965 return ret;
7966
7967 ret = r5l_start(conf->log);
7968 if (ret)
7969 return ret;
7970
7971 return 0;
7972 }
7973 if (mddev->recovery_disabled == conf->recovery_disabled)
7974 return -EBUSY;
7975
7976 if (rdev->saved_raid_disk < 0 && has_failed(conf))
7977
7978 return -EINVAL;
7979
7980 if (rdev->raid_disk >= 0)
7981 first = last = rdev->raid_disk;
7982
7983
7984
7985
7986
7987 if (rdev->saved_raid_disk >= 0 &&
7988 rdev->saved_raid_disk >= first &&
7989 conf->disks[rdev->saved_raid_disk].rdev == NULL)
7990 first = rdev->saved_raid_disk;
7991
7992 for (disk = first; disk <= last; disk++) {
7993 p = conf->disks + disk;
7994 if (p->rdev == NULL) {
7995 clear_bit(In_sync, &rdev->flags);
7996 rdev->raid_disk = disk;
7997 if (rdev->saved_raid_disk != disk)
7998 conf->fullsync = 1;
7999 rcu_assign_pointer(p->rdev, rdev);
8000
8001 err = log_modify(conf, rdev, true);
8002
8003 goto out;
8004 }
8005 }
8006 for (disk = first; disk <= last; disk++) {
8007 p = conf->disks + disk;
8008 if (test_bit(WantReplacement, &p->rdev->flags) &&
8009 p->replacement == NULL) {
8010 clear_bit(In_sync, &rdev->flags);
8011 set_bit(Replacement, &rdev->flags);
8012 rdev->raid_disk = disk;
8013 err = 0;
8014 conf->fullsync = 1;
8015 rcu_assign_pointer(p->replacement, rdev);
8016 break;
8017 }
8018 }
8019out:
8020 print_raid5_conf(conf);
8021 return err;
8022}
8023
8024static int raid5_resize(struct mddev *mddev, sector_t sectors)
8025{
/*
 * No resync is happening, and there is enough space on all
 * devices, so we can resize.  We need to make sure resync
 * covers any new space.  If the array is shrinking we should
 * possibly wait until any io in the removed space completes,
 * but it hardly seems worth it.
 */
8033 sector_t newsize;
8034 struct r5conf *conf = mddev->private;
8035
8036 if (raid5_has_log(conf) || raid5_has_ppl(conf))
8037 return -EINVAL;
8038 sectors &= ~((sector_t)conf->chunk_sectors - 1);
8039 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
8040 if (mddev->external_size &&
8041 mddev->array_sectors > newsize)
8042 return -EINVAL;
8043 if (mddev->bitmap) {
8044 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
8045 if (ret)
8046 return ret;
8047 }
8048 md_set_array_sectors(mddev, newsize);
8049 if (sectors > mddev->dev_sectors &&
8050 mddev->recovery_cp > mddev->dev_sectors) {
8051 mddev->recovery_cp = mddev->dev_sectors;
8052 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8053 }
8054 mddev->dev_sectors = sectors;
8055 mddev->resync_max_sectors = sectors;
8056 return 0;
8057}
8058
8059static int check_stripe_cache(struct mddev *mddev)
8060{
/*
 * A reshape can only proceed if there are plenty of
 * stripe_heads: about four chunks' worth for both the old
 * and the new chunk size.  If not, refuse and report how
 * many stripes are needed.
 */
8069 struct r5conf *conf = mddev->private;
8070 if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8071 > conf->min_nr_stripes ||
8072 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8073 > conf->min_nr_stripes) {
8074 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
8075 mdname(mddev),
8076 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
8077 / RAID5_STRIPE_SIZE(conf))*4);
8078 return 0;
8079 }
8080 return 1;
8081}
8082
8083static int check_reshape(struct mddev *mddev)
8084{
8085 struct r5conf *conf = mddev->private;
8086
8087 if (raid5_has_log(conf) || raid5_has_ppl(conf))
8088 return -EINVAL;
8089 if (mddev->delta_disks == 0 &&
8090 mddev->new_layout == mddev->layout &&
8091 mddev->new_chunk_sectors == mddev->chunk_sectors)
8092 return 0;
8093 if (has_failed(conf))
8094 return -EINVAL;
8095 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
/*
 * Shrinking the array: after the change raid4/5 still
 * needs at least 2 devices and raid6 at least 4.
 */
8101 int min = 2;
8102 if (mddev->level == 6)
8103 min = 4;
8104 if (mddev->raid_disks + mddev->delta_disks < min)
8105 return -EINVAL;
8106 }
8107
8108 if (!check_stripe_cache(mddev))
8109 return -ENOSPC;
8110
8111 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
8112 mddev->delta_disks > 0)
8113 if (resize_chunks(conf,
8114 conf->previous_raid_disks
8115 + max(0, mddev->delta_disks),
8116 max(mddev->new_chunk_sectors,
8117 mddev->chunk_sectors)
8118 ) < 0)
8119 return -ENOMEM;
8120
8121 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8122 return 0;
8123 return resize_stripes(conf, (conf->previous_raid_disks
8124 + mddev->delta_disks));
8125}
8126
8127static int raid5_start_reshape(struct mddev *mddev)
8128{
8129 struct r5conf *conf = mddev->private;
8130 struct md_rdev *rdev;
8131 int spares = 0;
8132 unsigned long flags;
8133
8134 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8135 return -EBUSY;
8136
8137 if (!check_stripe_cache(mddev))
8138 return -ENOSPC;
8139
8140 if (has_failed(conf))
8141 return -EINVAL;
8142
8143 rdev_for_each(rdev, mddev) {
8144 if (!test_bit(In_sync, &rdev->flags)
8145 && !test_bit(Faulty, &rdev->flags))
8146 spares++;
8147 }
8148
8149 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
/*
 * Not enough devices even to make a degraded array
 * of that size.
 */
8153 return -EINVAL;
8154
/*
 * Refuse to shrink below the current array size: the array
 * size must be reduced explicitly before the number of
 * devices can be.
 */
8159 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
8160 < mddev->array_sectors) {
8161 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8162 mdname(mddev));
8163 return -EINVAL;
8164 }
8165
8166 atomic_set(&conf->reshape_stripes, 0);
8167 spin_lock_irq(&conf->device_lock);
8168 write_seqcount_begin(&conf->gen_lock);
8169 conf->previous_raid_disks = conf->raid_disks;
8170 conf->raid_disks += mddev->delta_disks;
8171 conf->prev_chunk_sectors = conf->chunk_sectors;
8172 conf->chunk_sectors = mddev->new_chunk_sectors;
8173 conf->prev_algo = conf->algorithm;
8174 conf->algorithm = mddev->new_layout;
8175 conf->generation++;
8176
8177
8178
8179 smp_mb();
8180 if (mddev->reshape_backwards)
8181 conf->reshape_progress = raid5_size(mddev, 0, 0);
8182 else
8183 conf->reshape_progress = 0;
8184 conf->reshape_safe = conf->reshape_progress;
8185 write_seqcount_end(&conf->gen_lock);
8186 spin_unlock_irq(&conf->device_lock);
8187
/*
 * Make sure any request that proceeded on the assumption
 * that reshape was not running has completed; a
 * suspend/resume cycle drains all in-flight I/O.
 */
8192 mddev_suspend(mddev);
8193 mddev_resume(mddev);
8194
/*
 * Add the new drives, as many as will fit.  We know there
 * are enough to make the newly-sized array work.
 * Don't add devices if we are reducing the number of
 * devices in the array: it is not possible to correctly
 * record the "partially reconstructed" state of such
 * devices during the reshape, and confusion could result.
 */
8202 if (mddev->delta_disks >= 0) {
8203 rdev_for_each(rdev, mddev)
8204 if (rdev->raid_disk < 0 &&
8205 !test_bit(Faulty, &rdev->flags)) {
8206 if (raid5_add_disk(mddev, rdev) == 0) {
8207 if (rdev->raid_disk
8208 >= conf->previous_raid_disks)
8209 set_bit(In_sync, &rdev->flags);
8210 else
8211 rdev->recovery_offset = 0;
8212
8213
8214 sysfs_link_rdev(mddev, rdev);
8215 }
8216 } else if (rdev->raid_disk >= conf->previous_raid_disks
8217 && !test_bit(Faulty, &rdev->flags)) {
8218
8219 set_bit(In_sync, &rdev->flags);
8220 }
8221
/*
 * When a reshape changes the number of devices,
 * ->degraded is measured against the larger of the
 * pre and post numbers of devices.
 */
8226 spin_lock_irqsave(&conf->device_lock, flags);
8227 mddev->degraded = raid5_calc_degraded(conf);
8228 spin_unlock_irqrestore(&conf->device_lock, flags);
8229 }
8230 mddev->raid_disks = conf->raid_disks;
8231 mddev->reshape_position = conf->reshape_progress;
8232 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8233
8234 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8235 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8236 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8237 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8238 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8239 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
8240 "reshape");
8241 if (!mddev->sync_thread) {
8242 mddev->recovery = 0;
8243 spin_lock_irq(&conf->device_lock);
8244 write_seqcount_begin(&conf->gen_lock);
8245 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8246 mddev->new_chunk_sectors =
8247 conf->chunk_sectors = conf->prev_chunk_sectors;
8248 mddev->new_layout = conf->algorithm = conf->prev_algo;
8249 rdev_for_each(rdev, mddev)
8250 rdev->new_data_offset = rdev->data_offset;
8251 smp_wmb();
conf->generation--;
8253 conf->reshape_progress = MaxSector;
8254 mddev->reshape_position = MaxSector;
8255 write_seqcount_end(&conf->gen_lock);
8256 spin_unlock_irq(&conf->device_lock);
8257 return -EAGAIN;
8258 }
8259 conf->reshape_checkpoint = jiffies;
8260 md_wakeup_thread(mddev->sync_thread);
8261 md_new_event(mddev);
8262 return 0;
8263}
8264
/*
 * Called from the resync/reshape thread when the reshape has
 * finished: publish the final geometry and wake any waiters.
 */
8268static void end_reshape(struct r5conf *conf)
8269{
8270
8271 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8272 struct md_rdev *rdev;
8273
8274 spin_lock_irq(&conf->device_lock);
8275 conf->previous_raid_disks = conf->raid_disks;
8276 md_finish_reshape(conf->mddev);
8277 smp_wmb();
8278 conf->reshape_progress = MaxSector;
8279 conf->mddev->reshape_position = MaxSector;
8280 rdev_for_each(rdev, conf->mddev)
8281 if (rdev->raid_disk >= 0 &&
8282 !test_bit(Journal, &rdev->flags) &&
8283 !test_bit(In_sync, &rdev->flags))
8284 rdev->recovery_offset = MaxSector;
8285 spin_unlock_irq(&conf->device_lock);
8286 wake_up(&conf->wait_for_overlap);
8287
8288 if (conf->mddev->queue)
8289 raid5_set_io_opt(conf);
8290 }
8291}
8292
/*
 * Called once the reshape recovery has been reaped: commits the
 * post-reshape layout, chunk size and degraded count to the mddev.
 */
8296static void raid5_finish_reshape(struct mddev *mddev)
8297{
8298 struct r5conf *conf = mddev->private;
8299
8300 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8301
8302 if (mddev->delta_disks <= 0) {
8303 int d;
8304 spin_lock_irq(&conf->device_lock);
8305 mddev->degraded = raid5_calc_degraded(conf);
8306 spin_unlock_irq(&conf->device_lock);
8307 for (d = conf->raid_disks ;
8308 d < conf->raid_disks - mddev->delta_disks;
8309 d++) {
8310 struct md_rdev *rdev = conf->disks[d].rdev;
8311 if (rdev)
8312 clear_bit(In_sync, &rdev->flags);
8313 rdev = conf->disks[d].replacement;
8314 if (rdev)
8315 clear_bit(In_sync, &rdev->flags);
8316 }
8317 }
8318 mddev->layout = conf->algorithm;
8319 mddev->chunk_sectors = conf->chunk_sectors;
8320 mddev->reshape_position = MaxSector;
8321 mddev->delta_disks = 0;
8322 mddev->reshape_backwards = 0;
8323 }
8324}
8325
8326static void raid5_quiesce(struct mddev *mddev, int quiesce)
8327{
8328 struct r5conf *conf = mddev->private;
8329
8330 if (quiesce) {
/* stop all writes */
8332 lock_all_device_hash_locks_irq(conf);
/*
 * '2' tells resync/reshape to pause so that all
 * active stripes can drain.
 */
8336 r5c_flush_cache(conf, INT_MAX);
8337 conf->quiesce = 2;
8338 wait_event_cmd(conf->wait_for_quiescent,
8339 atomic_read(&conf->active_stripes) == 0 &&
8340 atomic_read(&conf->active_aligned_reads) == 0,
8341 unlock_all_device_hash_locks_irq(conf),
8342 lock_all_device_hash_locks_irq(conf));
8343 conf->quiesce = 1;
8344 unlock_all_device_hash_locks_irq(conf);
8345
8346 wake_up(&conf->wait_for_overlap);
8347 } else {
/* re-enable writes */
8349 lock_all_device_hash_locks_irq(conf);
8350 conf->quiesce = 0;
8351 wake_up(&conf->wait_for_quiescent);
8352 wake_up(&conf->wait_for_overlap);
8353 unlock_all_device_hash_locks_irq(conf);
8354 }
8355 log_quiesce(conf, quiesce);
8356}
8357
8358static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8359{
8360 struct r0conf *raid0_conf = mddev->private;
8361 sector_t sectors;
8362
8363
8364 if (raid0_conf->nr_strip_zones > 1) {
8365 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8366 mdname(mddev));
8367 return ERR_PTR(-EINVAL);
8368 }
8369
8370 sectors = raid0_conf->strip_zone[0].zone_end;
8371 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8372 mddev->dev_sectors = sectors;
8373 mddev->new_level = level;
8374 mddev->new_layout = ALGORITHM_PARITY_N;
8375 mddev->new_chunk_sectors = mddev->chunk_sectors;
8376 mddev->raid_disks += 1;
8377 mddev->delta_disks = 1;
8378
8379 mddev->recovery_cp = MaxSector;
8380
8381 return setup_conf(mddev);
8382}
8383
8384static void *raid5_takeover_raid1(struct mddev *mddev)
8385{
8386 int chunksect;
8387 void *ret;
8388
8389 if (mddev->raid_disks != 2 ||
8390 mddev->degraded > 1)
8391 return ERR_PTR(-EINVAL);
8392
8393
8394
8395 chunksect = 64*2;
8396
8397
8398 while (chunksect && (mddev->array_sectors & (chunksect-1)))
8399 chunksect >>= 1;
8400
8401 if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
8402
8403 return ERR_PTR(-EINVAL);
8404
8405 mddev->new_level = 5;
8406 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8407 mddev->new_chunk_sectors = chunksect;
8408
8409 ret = setup_conf(mddev);
8410 if (!IS_ERR(ret))
8411 mddev_clear_unsupported_flags(mddev,
8412 UNSUPPORTED_MDDEV_FLAGS);
8413 return ret;
8414}
8415
8416static void *raid5_takeover_raid6(struct mddev *mddev)
8417{
8418 int new_layout;
8419
8420 switch (mddev->layout) {
8421 case ALGORITHM_LEFT_ASYMMETRIC_6:
8422 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8423 break;
8424 case ALGORITHM_RIGHT_ASYMMETRIC_6:
8425 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8426 break;
8427 case ALGORITHM_LEFT_SYMMETRIC_6:
8428 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8429 break;
8430 case ALGORITHM_RIGHT_SYMMETRIC_6:
8431 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8432 break;
8433 case ALGORITHM_PARITY_0_6:
8434 new_layout = ALGORITHM_PARITY_0;
8435 break;
8436 case ALGORITHM_PARITY_N:
8437 new_layout = ALGORITHM_PARITY_N;
8438 break;
8439 default:
8440 return ERR_PTR(-EINVAL);
8441 }
8442 mddev->new_level = 5;
8443 mddev->new_layout = new_layout;
8444 mddev->delta_disks = -1;
8445 mddev->raid_disks -= 1;
8446 return setup_conf(mddev);
8447}
8448
8449static int raid5_check_reshape(struct mddev *mddev)
8450{
8451
8452
8453
8454
8455
8456 struct r5conf *conf = mddev->private;
8457 int new_chunk = mddev->new_chunk_sectors;
8458
8459 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8460 return -EINVAL;
8461 if (new_chunk > 0) {
8462 if (!is_power_of_2(new_chunk))
8463 return -EINVAL;
8464 if (new_chunk < (PAGE_SIZE>>9))
8465 return -EINVAL;
8466 if (mddev->array_sectors & (new_chunk-1))
8467
8468 return -EINVAL;
8469 }
8470
8471
8472
8473 if (mddev->raid_disks == 2) {
8474
8475 if (mddev->new_layout >= 0) {
8476 conf->algorithm = mddev->new_layout;
8477 mddev->layout = mddev->new_layout;
8478 }
8479 if (new_chunk > 0) {
conf->chunk_sectors = new_chunk;
8481 mddev->chunk_sectors = new_chunk;
8482 }
8483 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8484 md_wakeup_thread(mddev->thread);
8485 }
8486 return check_reshape(mddev);
8487}
8488
8489static int raid6_check_reshape(struct mddev *mddev)
8490{
8491 int new_chunk = mddev->new_chunk_sectors;
8492
8493 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8494 return -EINVAL;
8495 if (new_chunk > 0) {
8496 if (!is_power_of_2(new_chunk))
8497 return -EINVAL;
8498 if (new_chunk < (PAGE_SIZE >> 9))
8499 return -EINVAL;
8500 if (mddev->array_sectors & (new_chunk-1))
8501
8502 return -EINVAL;
8503 }
8504
8505
8506 return check_reshape(mddev);
8507}
8508
8509static void *raid5_takeover(struct mddev *mddev)
8510{
/*
 * raid5 can take over:
 *  raid0 - if there is only one strip zone - make it a raid4 layout
 *  raid1 - if there are exactly two drives
 *  raid4 - trivial - just use a raid4 layout
 *  raid6 - provided it is a *_6 layout
 */
8517 if (mddev->level == 0)
8518 return raid45_takeover_raid0(mddev, 5);
8519 if (mddev->level == 1)
8520 return raid5_takeover_raid1(mddev);
8521 if (mddev->level == 4) {
8522 mddev->new_layout = ALGORITHM_PARITY_N;
8523 mddev->new_level = 5;
8524 return setup_conf(mddev);
8525 }
8526 if (mddev->level == 6)
8527 return raid5_takeover_raid6(mddev);
8528
8529 return ERR_PTR(-EINVAL);
8530}
8531
8532static void *raid4_takeover(struct mddev *mddev)
8533{
/*
 * raid4 can take over:
 *  raid0 - if there is only one strip zone
 *  raid5 - if layout is ALGORITHM_PARITY_N
 */
8538 if (mddev->level == 0)
8539 return raid45_takeover_raid0(mddev, 4);
8540 if (mddev->level == 5 &&
8541 mddev->layout == ALGORITHM_PARITY_N) {
8542 mddev->new_layout = 0;
8543 mddev->new_level = 4;
8544 return setup_conf(mddev);
8545 }
8546 return ERR_PTR(-EINVAL);
8547}
8548
8549static struct md_personality raid5_personality;
8550
8551static void *raid6_takeover(struct mddev *mddev)
8552{
/*
 * Currently can only take over a raid5 array.  We map the
 * personality to an equivalent raid6 personality with the
 * Q block at the end.
 */
8557 int new_layout;
8558
8559 if (mddev->pers != &raid5_personality)
8560 return ERR_PTR(-EINVAL);
8561 if (mddev->degraded > 1)
8562 return ERR_PTR(-EINVAL);
8563 if (mddev->raid_disks > 253)
8564 return ERR_PTR(-EINVAL);
8565 if (mddev->raid_disks < 3)
8566 return ERR_PTR(-EINVAL);
8567
8568 switch (mddev->layout) {
8569 case ALGORITHM_LEFT_ASYMMETRIC:
8570 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8571 break;
8572 case ALGORITHM_RIGHT_ASYMMETRIC:
8573 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8574 break;
8575 case ALGORITHM_LEFT_SYMMETRIC:
8576 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8577 break;
8578 case ALGORITHM_RIGHT_SYMMETRIC:
8579 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8580 break;
8581 case ALGORITHM_PARITY_0:
8582 new_layout = ALGORITHM_PARITY_0_6;
8583 break;
8584 case ALGORITHM_PARITY_N:
8585 new_layout = ALGORITHM_PARITY_N;
8586 break;
8587 default:
8588 return ERR_PTR(-EINVAL);
8589 }
8590 mddev->new_level = 6;
8591 mddev->new_layout = new_layout;
8592 mddev->delta_disks = 1;
8593 mddev->raid_disks += 1;
8594 return setup_conf(mddev);
8595}
8596
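/*
 * Handle writes to md/consistency_policy: "ppl" enables the partial
 * parity log on a plain raid5 array, while "resync" tears PPL down or
 * drops a failed journal so the array falls back to resync-based
 * consistency.
 */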
8597static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8598{
8599 struct r5conf *conf;
8600 int err;
8601
8602 err = mddev_lock(mddev);
8603 if (err)
8604 return err;
8605 conf = mddev->private;
8606 if (!conf) {
8607 mddev_unlock(mddev);
8608 return -ENODEV;
8609 }
8610
8611 if (strncmp(buf, "ppl", 3) == 0) {
8612
8613 if (!raid5_has_ppl(conf) && conf->level == 5) {
8614 err = log_init(conf, NULL, true);
8615 if (!err) {
8616 err = resize_stripes(conf, conf->pool_size);
8617 if (err)
8618 log_exit(conf);
8619 }
8620 } else
8621 err = -EINVAL;
8622 } else if (strncmp(buf, "resync", 6) == 0) {
8623 if (raid5_has_ppl(conf)) {
8624 mddev_suspend(mddev);
8625 log_exit(conf);
8626 mddev_resume(mddev);
8627 err = resize_stripes(conf, conf->pool_size);
8628 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8629 r5l_log_disk_error(conf)) {
8630 bool journal_dev_exists = false;
8631 struct md_rdev *rdev;
8632
8633 rdev_for_each(rdev, mddev)
8634 if (test_bit(Journal, &rdev->flags)) {
8635 journal_dev_exists = true;
8636 break;
8637 }
8638
8639 if (!journal_dev_exists) {
8640 mddev_suspend(mddev);
8641 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8642 mddev_resume(mddev);
8643 } else
8644 err = -EBUSY;
8645 } else
8646 err = -EINVAL;
8647 } else {
8648 err = -EINVAL;
8649 }
8650
8651 if (!err)
8652 md_update_sb(mddev, 1);
8653
8654 mddev_unlock(mddev);
8655
8656 return err;
8657}
8658
static int raid5_start(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	return r5l_start(conf->log);
}

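/*
 * The raid4, raid5 and raid6 personalities are all provided by this one
 * module and share almost every method; they differ only in .level,
 * .takeover and, for raid6, .check_reshape.
 */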
static struct md_personality raid6_personality =
{
	.name = "raid6",
	.level = 6,
	.owner = THIS_MODULE,
	.make_request = raid5_make_request,
	.run = raid5_run,
	.start = raid5_start,
	.free = raid5_free,
	.status = raid5_status,
	.error_handler = raid5_error,
	.hot_add_disk = raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active = raid5_spare_active,
	.sync_request = raid5_sync_request,
	.resize = raid5_resize,
	.size = raid5_size,
	.check_reshape = raid6_check_reshape,
	.start_reshape = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce = raid5_quiesce,
	.takeover = raid6_takeover,
	.change_consistency_policy = raid5_change_consistency_policy,
};

static struct md_personality raid5_personality =
{
	.name = "raid5",
	.level = 5,
	.owner = THIS_MODULE,
	.make_request = raid5_make_request,
	.run = raid5_run,
	.start = raid5_start,
	.free = raid5_free,
	.status = raid5_status,
	.error_handler = raid5_error,
	.hot_add_disk = raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active = raid5_spare_active,
	.sync_request = raid5_sync_request,
	.resize = raid5_resize,
	.size = raid5_size,
	.check_reshape = raid5_check_reshape,
	.start_reshape = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce = raid5_quiesce,
	.takeover = raid5_takeover,
	.change_consistency_policy = raid5_change_consistency_policy,
};

static struct md_personality raid4_personality =
{
	.name = "raid4",
	.level = 4,
	.owner = THIS_MODULE,
	.make_request = raid5_make_request,
	.run = raid5_run,
	.start = raid5_start,
	.free = raid5_free,
	.status = raid5_status,
	.error_handler = raid5_error,
	.hot_add_disk = raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active = raid5_spare_active,
	.sync_request = raid5_sync_request,
	.resize = raid5_resize,
	.size = raid5_size,
	.check_reshape = raid5_check_reshape,
	.start_reshape = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce = raid5_quiesce,
	.takeover = raid4_takeover,
	.change_consistency_policy = raid5_change_consistency_policy,
};

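/*
 * Module init: allocate the shared raid5 workqueue, register the CPU
 * hotplug callbacks that manage the per-CPU working buffers, then
 * register the three personalities.  raid5_exit() undoes these steps in
 * reverse order.
 */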
static int __init raid5_init(void)
{
	int ret;

	raid5_wq = alloc_workqueue("raid5wq",
		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
	if (!raid5_wq)
		return -ENOMEM;

	ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
				      "md/raid5:prepare",
				      raid456_cpu_up_prepare,
				      raid456_cpu_dead);
	if (ret) {
		destroy_workqueue(raid5_wq);
		return ret;
	}
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
	cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
	destroy_workqueue(raid5_wq);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* These bare aliases match the names of the formerly separate raid5 and raid6 modules. */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");