// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 *	   RAID-4/5/6 management functions.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle, so it deserves some explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in
 * sh->bm_seq the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment bm_flush, thus closing the current
 *   batch.
 * When we notice that bm_flush > bm_write, we write out all pending updates
 * to the bitmap, and advance bm_write to where bm_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * have forgotten a bit.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>

#include <trace/events/block.h>
#include <linux/list_sort.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "md-bitmap.h"
#include "raid5-log.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
{
	return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_lock_irq(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
		spin_unlock(conf->hash_locks + i);
	spin_unlock_irq(conf->hash_locks);
}
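
/*
 * Lock ordering note (derived from the helpers above): a stripe hash
 * lock is always taken before conf->device_lock, and when all hash
 * locks are needed they are taken in index order, nested under the
 * first one, so lockdep sees a single consistent hierarchy.
 */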

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void print_raid5_conf(struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static bool stripe_is_lowprio(struct stripe_head *sh)
{
	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		if (stripe_is_lowprio(sh))
			list_add_tail(&sh->lru, &group->loprio_list);
		else
			list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* Don't try to start multiple works due to lock */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wake up more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

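/*
 * Requeue a stripe whose last reference has been dropped: depending on
 * its state it goes back to the handle/delayed/bitmap lists for more
 * work, or to an inactive or r5c caching list. Both callers below hold
 * conf->device_lock when this runs.
 */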
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes) == 0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * In the following cases, the stripe cannot be released to cached
	 * lists. Therefore, we make the stripe write out and set
	 * STRIPE_HANDLE:
	 *   1. when quiesce in r5c write back;
	 *   2. when resync is requested fail the stripe.
	 */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
	    (conf->quiesce && r5c_is_writeback(conf->log) &&
	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				if (stripe_is_lowprio(sh))
					list_add_tail(&sh->lru,
							&conf->loprio_list);
				else
					list_add_tail(&sh->lru,
							&conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}

/*
 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
 *
 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
 * the same time. Must be called with uninitialized list entry.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	bool do_wakeup = false;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, raid5_get_active_stripe()
		 * might remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup = true;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	if (do_wakeup) {
		wake_up(&conf->wait_for_stripe);
		if (atomic_read(&conf->active_stripes) == 0)
			wake_up(&conf->wait_for_quiescent);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh, *t;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	llist_for_each_entry_safe(sh, t, head, release_list) {
		int hash;

		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry the bit is set here, because if the bit is set
		 * again, the count is always > 1. This is true for
		 * STRIPE_ON_BATCH_LIST too.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
		count++;
	}

	return count;
}

void raid5_release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	/* Avoid release_list until the last reference.
	 */
	if (atomic_add_unless(&sh->count, -1, 1))
		return;

	if (unlikely(!conf->mddev->thread) ||
	    test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);
		spin_unlock_irqrestore(&conf->device_lock, flags);
		release_inactive_stripe_list(conf, &list, hash);
	}
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}

/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(conf->inactive_list + hash))
		goto out;
	first = (conf->inactive_list + hash)->next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
	BUG_ON(hash != sh->hash_lock_index);
	if (list_empty(conf->inactive_list + hash))
		atomic_inc(&conf->empty_inactive_list_nr);
out:
	return sh;
}

#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
static void free_stripe_pages(struct stripe_head *sh)
{
	int i;
	struct page *p;

	/* The page pool has not been allocated yet */
	if (!sh->pages)
		return;

	for (i = 0; i < sh->nr_pages; i++) {
		p = sh->pages[i];
		if (p)
			put_page(p);
		sh->pages[i] = NULL;
	}
}

static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
{
	int i;
	struct page *p;

	for (i = 0; i < sh->nr_pages; i++) {
		/* The page has already been allocated. */
		if (sh->pages[i])
			continue;

		p = alloc_page(gfp);
		if (!p) {
			free_stripe_pages(sh);
			return -ENOMEM;
		}
		sh->pages[i] = p;
	}
	return 0;
}

static int
init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
{
	int nr_pages, cnt;

	if (sh->pages)
		return 0;

	/* Each of the sh->dev[i] need one conf->stripe_size */
	cnt = PAGE_SIZE / conf->stripe_size;
	nr_pages = (disks + cnt - 1) / cnt;

	sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!sh->pages)
		return -ENOMEM;
	sh->nr_pages = nr_pages;
	sh->stripes_per_page = cnt;
	return 0;
}
#endif

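/*
 * shrink_buffers()/grow_buffers() below come in two flavours: when
 * PAGE_SIZE equals DEFAULT_STRIPE_SIZE each r5dev owns its own page;
 * otherwise several stripe units share the pages allocated in
 * sh->pages by alloc_stripe_pages() above.
 */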
static void shrink_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
	for (i = 0; i < num; i++) {
		struct page *p;

		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
#else
	for (i = 0; i < num; i++)
		sh->dev[i].page = NULL;
	free_stripe_pages(sh);
#endif
}

static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
{
	int i;
	int num = sh->raid_conf->pool_size;

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(gfp))) {
			return 1;
		}
		sh->dev[i].page = page;
		sh->dev[i].orig_page = page;
		sh->dev[i].offset = 0;
	}
#else
	if (alloc_stripe_pages(sh, gfp))
		return -ENOMEM;

	for (i = 0; i < num; i++) {
		sh->dev[i].page = raid5_get_dev_page(sh, i);
		sh->dev[i].orig_page = sh->dev[i].page;
		sh->dev[i].offset = raid5_get_page_offset(sh, i);
	}
#endif
	return 0;
}

static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			   struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i, seq;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));
	BUG_ON(sh->batch_head);

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sector);
retry:
	seq = read_seqcount_begin(&conf->gen_lock);
	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			pr_err("sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		dev->sector = raid5_compute_blocknr(sh, i, previous);
	}
	if (read_seqcount_retry(&conf->gen_lock, seq))
		goto retry;
	sh->overwrite_disks = 0;
	insert_hash(conf, sh);
	sh->cpu = smp_processor_id();
	set_bit(STRIPE_BATCH_READY, &sh->state);
}

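/*
 * Look a stripe up in the hash table; callers hold the hash lock for
 * this sector (see raid5_get_active_stripe() below).
 */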
static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
int raid5_calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = raid5_calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

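/*
 * Grab an active reference on the stripe covering @sector, allocating
 * or recycling an inactive stripe if needed; may block unless
 * @noblock is set, and respects array quiesce unless @noquiesce.
 */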
struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
			int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;
	int hash = stripe_hash_locks_hash(conf, sector);
	int inc_empty_inactive_list_flag;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(conf->hash_locks + hash);

	do {
		wait_event_lock_irq(conf->wait_for_quiescent,
				    conf->quiesce == 0 || noquiesce,
				    *(conf->hash_locks + hash));
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
				sh = get_free_stripe(conf, hash);
				if (!sh && !test_bit(R5_DID_ALLOC,
						     &conf->cache_state))
					set_bit(R5_ALLOC_MORE,
						&conf->cache_state);
			}
			if (noblock && sh == NULL)
				break;

			r5c_check_stripe_cache_usage(conf);
			if (!sh) {
				set_bit(R5_INACTIVE_BLOCKED,
					&conf->cache_state);
				r5l_wake_reclaim(conf->log, 0);
				wait_event_lock_irq(
					conf->wait_for_stripe,
					!list_empty(conf->inactive_list + hash) &&
					(atomic_read(&conf->active_stripes)
					 < (conf->max_nr_stripes * 3 / 4)
					 || !test_bit(R5_INACTIVE_BLOCKED,
						      &conf->cache_state)),
					*(conf->hash_locks + hash));
				clear_bit(R5_INACTIVE_BLOCKED,
					  &conf->cache_state);
			} else {
				init_stripe(sh, sector, previous);
				atomic_inc(&sh->count);
			}
		} else if (!atomic_inc_not_zero(&sh->count)) {
			spin_lock(&conf->device_lock);
			if (!atomic_read(&sh->count)) {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				BUG_ON(list_empty(&sh->lru) &&
				       !test_bit(STRIPE_EXPANDING, &sh->state));
				inc_empty_inactive_list_flag = 0;
				if (!list_empty(conf->inactive_list + hash))
					inc_empty_inactive_list_flag = 1;
				list_del_init(&sh->lru);
				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
					atomic_inc(&conf->empty_inactive_list_nr);
				if (sh->group) {
					sh->group->stripes_cnt--;
					sh->group = NULL;
				}
			}
			atomic_inc(&sh->count);
			spin_unlock(&conf->device_lock);
		}
	} while (sh == NULL);

	spin_unlock_irq(conf->hash_locks + hash);
	return sh;
}

static bool is_full_stripe_write(struct stripe_head *sh)
{
	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
}

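/*
 * Take both stripe locks in a fixed (address) order so that two
 * concurrent callers locking the same pair cannot deadlock.
 */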
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
		__acquires(&sh1->stripe_lock)
		__acquires(&sh2->stripe_lock)
{
	if (sh1 > sh2) {
		spin_lock_irq(&sh2->stripe_lock);
		spin_lock_nested(&sh1->stripe_lock, 1);
	} else {
		spin_lock_irq(&sh1->stripe_lock);
		spin_lock_nested(&sh2->stripe_lock, 1);
	}
}

static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
		__releases(&sh1->stripe_lock)
		__releases(&sh2->stripe_lock)
{
	spin_unlock(&sh1->stripe_lock);
	spin_unlock_irq(&sh2->stripe_lock);
}

/* Only freshly new full stripe normal write stripes can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;

	if (raid5_has_log(conf) || raid5_has_ppl(conf))
		return false;
	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
		is_full_stripe_write(sh);
}

/* we only do back search */
static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
{
	struct stripe_head *head;
	sector_t head_sector, tmp_sec;
	int hash;
	int dd_idx;
	int inc_empty_inactive_list_flag;

	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
	tmp_sec = sh->sector;
	if (!sector_div(tmp_sec, conf->chunk_sectors))
		return;
	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);

	hash = stripe_hash_locks_hash(conf, head_sector);
	spin_lock_irq(conf->hash_locks + hash);
	head = __find_stripe(conf, head_sector, conf->generation);
	if (head && !atomic_inc_not_zero(&head->count)) {
		spin_lock(&conf->device_lock);
		if (!atomic_read(&head->count)) {
			if (!test_bit(STRIPE_HANDLE, &head->state))
				atomic_inc(&conf->active_stripes);
			BUG_ON(list_empty(&head->lru) &&
			       !test_bit(STRIPE_EXPANDING, &head->state));
			inc_empty_inactive_list_flag = 0;
			if (!list_empty(conf->inactive_list + hash))
				inc_empty_inactive_list_flag = 1;
			list_del_init(&head->lru);
			if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
				atomic_inc(&conf->empty_inactive_list_nr);
			if (head->group) {
				head->group->stripes_cnt--;
				head->group = NULL;
			}
		}
		atomic_inc(&head->count);
		spin_unlock(&conf->device_lock);
	}
	spin_unlock_irq(conf->hash_locks + hash);

	if (!head)
		return;
	if (!stripe_can_batch(head))
		goto out;

	lock_two_stripes(head, sh);
	/* clear_batch_ready clears the flag */
	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
		goto unlock_out;

	if (sh->batch_head)
		goto unlock_out;

	dd_idx = 0;
	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
		dd_idx++;
	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
	    bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
		goto unlock_out;

	if (head->batch_head) {
		spin_lock(&head->batch_head->batch_lock);
		/* This batch list is already running */
		if (!stripe_can_batch(head)) {
			spin_unlock(&head->batch_head->batch_lock);
			goto unlock_out;
		}
		/*
		 * We must assign batch_head of this stripe within the
		 * batch_lock, otherwise clear_batch_ready of batch head
		 * stripe could clear BATCH_READY bit of this stripe if
		 * this stripe is added to a batch list before we handle it.
		 */
		sh->batch_head = head->batch_head;

		/*
		 * at this point, head's BATCH_READY could be cleared, but we
		 * can still add the stripe to batch list
		 */
		list_add(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_head->batch_lock);
	} else {
		head->batch_head = head;
		sh->batch_head = head->batch_head;
		spin_lock(&head->batch_lock);
		list_add_tail(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_lock);
	}

	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		if (atomic_dec_return(&conf->preread_active_stripes)
		    < IO_THRESHOLD)
			md_wakeup_thread(conf->mddev->thread);

	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
		int seq = sh->bm_seq;
		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
		    sh->batch_head->bm_seq > seq)
			seq = sh->batch_head->bm_seq;
		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
		sh->batch_head->bm_seq = seq;
	}

	atomic_inc(&sh->count);
unlock_out:
	unlock_two_stripes(head, sh);
out:
	raid5_release_stripe(head);
}

/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape and this stripe is not in the 'previous'
	 * generation, so the new data_offset applies.
	 */
	return 1;
}

static void dispatch_bio_list(struct bio_list *tmp)
{
	struct bio *bio;

	while ((bio = bio_list_pop(tmp)))
		submit_bio_noacct(bio);
}

static int cmp_stripe(void *priv, const struct list_head *a,
		      const struct list_head *b)
{
	const struct r5pending_data *da = list_entry(a,
				struct r5pending_data, sibling);
	const struct r5pending_data *db = list_entry(b,
				struct r5pending_data, sibling);
	if (da->sector > db->sector)
		return 1;
	if (da->sector < db->sector)
		return -1;
	return 0;
}

static void dispatch_defer_bios(struct r5conf *conf, int target,
				struct bio_list *list)
{
	struct r5pending_data *data;
	struct list_head *first, *next = NULL;
	int cnt = 0;

	if (conf->pending_data_cnt == 0)
		return;

	list_sort(NULL, &conf->pending_list, cmp_stripe);

	first = conf->pending_list.next;

	/* temporarily move the head */
	if (conf->next_pending_data)
		list_move_tail(&conf->pending_list,
				&conf->next_pending_data->sibling);

	while (!list_empty(&conf->pending_list)) {
		data = list_first_entry(&conf->pending_list,
				struct r5pending_data, sibling);
		if (&data->sibling == first)
			first = data->sibling.next;
		next = data->sibling.next;

		bio_list_merge(list, &data->bios);
		list_move(&data->sibling, &conf->free_list);
		cnt++;
		if (cnt >= target)
			break;
	}
	conf->pending_data_cnt -= cnt;
	BUG_ON(conf->pending_data_cnt < 0 || cnt < target);

	if (next != &conf->pending_list)
		conf->next_pending_data = list_entry(next,
				struct r5pending_data, sibling);
	else
		conf->next_pending_data = NULL;
	/* list isn't empty */
	if (first != &conf->pending_list)
		list_move_tail(&conf->pending_list, first);
}

static void flush_deferred_bios(struct r5conf *conf)
{
	struct bio_list tmp = BIO_EMPTY_LIST;

	if (conf->pending_data_cnt == 0)
		return;

	spin_lock(&conf->pending_bios_lock);
	dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
	BUG_ON(conf->pending_data_cnt != 0);
	spin_unlock(&conf->pending_bios_lock);

	dispatch_bio_list(&tmp);
}

static void defer_issue_bios(struct r5conf *conf, sector_t sector,
			     struct bio_list *bios)
{
	struct bio_list tmp = BIO_EMPTY_LIST;
	struct r5pending_data *ent;

	spin_lock(&conf->pending_bios_lock);
	ent = list_first_entry(&conf->free_list, struct r5pending_data,
			       sibling);
	list_move_tail(&ent->sibling, &conf->pending_list);
	ent->sector = sector;
	bio_list_init(&ent->bios);
	bio_list_merge(&ent->bios, bios);
	conf->pending_data_cnt++;
	if (conf->pending_data_cnt >= PENDING_IO_MAX)
		dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);

	spin_unlock(&conf->pending_bios_lock);

	dispatch_bio_list(&tmp);
}

static void raid5_end_read_request(struct bio *bi);
static void raid5_end_write_request(struct bio *bi);

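/*
 * Issue the reads and writes that a stripe needs: pick the rdev (or
 * replacement) for each device slot, honour bad-block ranges, and
 * either submit the bios directly or queue them on the deferred list.
 */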
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;
	struct stripe_head *head_sh = sh;
	struct bio_list pending_bios = BIO_EMPTY_LIST;
	bool should_defer;

	might_sleep();

	if (log_stripe(sh, s) == 0)
		return;

	should_defer = conf->batch_bio_dispatch && conf->group_cnt;

	for (i = disks; i--; ) {
		int op, op_flags = 0;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;

		sh = head_sh;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			op = REQ_OP_WRITE;
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				op_flags = REQ_FUA;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				op = REQ_OP_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			op = REQ_OP_READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			op = REQ_OP_WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			op_flags |= REQ_SYNC;

again:
		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev is valid */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (op_is_write(op)) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while (op_is_write(op) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->sb_flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance
					 */
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_set_dev(bi, rdev->bdev);
			bio_set_op_attrs(bi, op, op_flags);
			bi->bi_end_io = op_is_write(op)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on disc %d\n",
				 __func__, (unsigned long long)sh->sector,
				 bi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
				bi->bi_opf |= REQ_NOMERGE;

			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));

			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * issuing read for a page in journal, this
				 * must be preparing for prexor in rmw; read
				 * the data into orig_page
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
			bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
			bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
			bi->bi_write_hint = sh->dev[i].write_hint;
			if (!rrdev)
				sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (op == REQ_OP_DISCARD)
				bi->bi_vcnt = 0;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);

			if (conf->mddev->gendisk)
				trace_block_bio_remap(bi,
						disk_devt(conf->mddev->gendisk),
						sh->dev[i].sector);
			if (should_defer && op_is_write(op))
				bio_list_add(&pending_bios, bi);
			else
				submit_bio_noacct(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_set_dev(rbi, rrdev->bdev);
			bio_set_op_attrs(rbi, op, op_flags);
			BUG_ON(!op_is_write(op));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on "
				 "replacement disc %d\n",
				 __func__, (unsigned long long)sh->sector,
				 rbi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->data_offset);
			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
			sh->dev[i].rvec.bv_page = sh->dev[i].page;
			rbi->bi_vcnt = 1;
			rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
			rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
			rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
			rbi->bi_write_hint = sh->dev[i].write_hint;
			sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (op == REQ_OP_DISCARD)
				rbi->bi_vcnt = 0;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(rbi,
						disk_devt(conf->mddev->gendisk),
						sh->dev[i].sector);
			if (should_defer && op_is_write(op))
				bio_list_add(&pending_bios, rbi);
			else
				submit_bio_noacct(rbi);
		}
		if (!rdev && !rrdev) {
			if (op_is_write(op))
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %d on disc %d for sector %llu\n",
				 bi->bi_opf, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}

		if (!head_sh->batch_head)
			continue;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		if (sh != head_sh)
			goto again;
	}

	if (should_defer && !bio_list_empty(&pending_bios))
		defer_issue_bios(conf, head_sh->sector, &pending_bios);
}

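/*
 * Copy data between a bio and a stripe cache page. The bio may start
 * in the middle of the page or cover only part of it; clipping against
 * RAID5_STRIPE_SIZE() happens below, and with conf->skip_copy a
 * full-page write can adopt the bio page instead of copying.
 */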
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page **page,
	unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
	struct stripe_head *sh, int no_skipcopy)
{
	struct bio_vec bvl;
	struct bvec_iter iter;
	struct page *bio_page;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;
	struct r5conf *conf = sh->raid_conf;

	if (bio->bi_iter.bi_sector >= sector)
		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, iter) {
		int len = bvl.bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
			clen = RAID5_STRIPE_SIZE(conf) - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl.bv_offset;
			bio_page = bvl.bv_page;
			if (frombio) {
				if (conf->skip_copy &&
				    b_offset == 0 && page_offset == 0 &&
				    clen == RAID5_STRIPE_SIZE(conf) &&
				    !no_skipcopy)
					*page = bio_page;
				else
					tx = async_memcpy(*page, bio_page, page_offset + poff,
						  b_offset, clen, &submit);
			} else
				tx = async_memcpy(bio_page, *page, b_offset,
						  page_offset + poff, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset += len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int i;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_iter.bi_sector <
				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
				rbi2 = r5_next_bio(conf, rbi, dev->sector);
				bio_endio(rbi);
				rbi = rbi2;
			}
		}
	}
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;
	int i;
	struct r5conf *conf = sh->raid_conf;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&sh->stripe_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			while (rbi && rbi->bi_iter.bi_sector <
				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
				tx = async_copy_data(0, rbi, &dev->page,
						     dev->offset,
						     dev->sector, tx, sh, 0);
				rbi = r5_next_bio(conf, rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

/* return a pointer to the scribble buffer for this stripe operation */
static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
{
	return percpu->scribble + i * percpu->scribble_obj_size;
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu, int i)
{
	return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
}

/*
 * Return a pointer to record offset address.
 */
static unsigned int *
to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
}

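/*
 * Rebuild the block at sh->ops.target by XOR-summing every other
 * device in the stripe (RAID-5 single-failure compute).
 */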
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	unsigned int *off_srcs = to_addr_offs(sh, percpu);
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	unsigned int off_dest = tgt->offset;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	BUG_ON(sh->batch_head);

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; ) {
		if (i != target) {
			off_srcs[count] = sh->dev[i].offset;
			xor_srcs[count++] = sh->dev[i].page;
		}
	}

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	else
		tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @offs - (unsigned int) array of offset for each page
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs,
				unsigned int *offs,
				struct stripe_head *sh,
				int srctype)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
		struct r5dev *dev = &sh->dev[i];

		if (i == sh->qd_idx || i == sh->pd_idx ||
		    (srctype == SYNDROME_SRC_ALL) ||
		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
		     (test_bit(R5_Wantdrain, &dev->flags) ||
		      test_bit(R5_InJournal, &dev->flags))) ||
		    (srctype == SYNDROME_SRC_WRITTEN &&
		     (dev->written ||
		      test_bit(R5_InJournal, &dev->flags)))) {
			if (test_bit(R5_InJournal, &dev->flags))
				srcs[slot] = sh->dev[i].orig_page;
			else
				srcs[slot] = sh->dev[i].page;
			/*
			 * For R5_InJournal, PAGE_SIZE must be 4KB and will
			 * not share page. In that case, dev[i].offset
			 * is 0.
			 */
			offs[slot] = sh->dev[i].offset;
		}
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}

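/*
 * Compute one missing block in a RAID-6 stripe: regenerate Q with
 * async_gen_syndrome() when the Q device is the target, otherwise
 * rebuild a data or P block by XOR of the remaining devices.
 */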
static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = to_addr_page(percpu, 0);
	unsigned int *offs = to_addr_offs(sh, percpu);
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	unsigned int dest_off;
	int i;
	int count;

	BUG_ON(sh->batch_head);
	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;
	dest_off = tgt->offset;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		tx = async_gen_syndrome(blocks, offs, count+2,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			offs[count] = sh->dev[i].offset;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		tx = async_xor_offs(dest, dest_off, blocks, offs, count,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	}

	return tx;
}

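/*
 * Recover two missing blocks in a RAID-6 stripe. Depending on which
 * two slots failed this is a P+Q recompute, a D+Q recompute, or one
 * of the async_raid6 datap/2data recovery paths.
 */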
static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = to_addr_page(percpu, 0);
	unsigned int *offs = to_addr_offs(sh, percpu);
	struct async_submit_ctl submit;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++) {
		offs[i] = 0;
		blocks[i] = NULL;
	}
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		offs[slot] = sh->dev[i].offset;
		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu, 0));
			return async_gen_syndrome(blocks, offs, syndrome_disks+2,
						  RAID5_STRIPE_SIZE(sh->raid_conf),
						  &submit);
		} else {
			struct page *dest;
			unsigned int dest_off;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				offs[count] = sh->dev[i].offset;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			dest_off = sh->dev[data_target].offset;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu, 0));
			tx = async_xor_offs(dest, dest_off, blocks, offs, count,
				       RAID5_STRIPE_SIZE(sh->raid_conf),
				       &submit);

			count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu, 0));
			return async_gen_syndrome(blocks, offs, count+2,
						  RAID5_STRIPE_SIZE(sh->raid_conf),
						  &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						RAID5_STRIPE_SIZE(sh->raid_conf),
						faila,
						blocks, offs, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						RAID5_STRIPE_SIZE(sh->raid_conf),
						faila, failb,
						blocks, offs, &submit);
		}
	}
}

static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	if (r5c_is_writeback(sh->raid_conf->log))
		/*
		 * raid5-cache write back uses orig_page during prexor.
		 * After prexor, it is time to free orig_page
		 */
		r5c_release_extra_page(sh);
}

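/*
 * Read-modify-write prexor: XOR the old contents of the data blocks
 * being overwritten into the parity block, so the subsequent
 * reconstruct step only has to add the new data back in.
 */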
static struct dma_async_tx_descriptor *
ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	unsigned int *off_srcs = to_addr_offs(sh, percpu);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_InJournal, &dev->flags)) {
			/*
			 * For this case, PAGE_SIZE must be equal to 4KB and
			 * page offset is zero.
			 */
			off_srcs[count] = dev->offset;
			xor_srcs[count++] = dev->orig_page;
		} else if (test_bit(R5_Wantdrain, &dev->flags)) {
			off_srcs[count] = dev->offset;
			xor_srcs[count++] = dev->page;
		}
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
	tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	struct page **blocks = to_addr_page(percpu, 0);
	unsigned int *offs = to_addr_offs(sh, percpu);
	int count;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
	tx = async_gen_syndrome(blocks, offs, count+2,
			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);

	return tx;
}

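/*
 * Drain the bios queued on each device's towrite list into the stripe
 * cache pages (or flag R5_SkipCopy/R5_Discard instead of copying),
 * walking every member of a batch.
 */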
static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks;
	int i;
	struct stripe_head *head_sh = sh;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev;
		struct bio *chosen;

		sh = head_sh;
		if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
			struct bio *wbi;

again:
			dev = &sh->dev[i];
			/*
			 * clear R5_InJournal, so when rewriting a page in
			 * journal, it is not skipped by r5l_log_stripe()
			 */
			clear_bit(R5_InJournal, &dev->flags);
			spin_lock_irq(&sh->stripe_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			sh->overwrite_disks = 0;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->stripe_lock);
			WARN_ON(dev->page != dev->orig_page);

			while (wbi && wbi->bi_iter.bi_sector <
				dev->sector + RAID5_STRIPE_SECTORS(conf)) {
				if (wbi->bi_opf & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				if (wbi->bi_opf & REQ_SYNC)
					set_bit(R5_SyncIO, &dev->flags);
				if (bio_op(wbi) == REQ_OP_DISCARD)
					set_bit(R5_Discard, &dev->flags);
				else {
					tx = async_copy_data(1, wbi, &dev->page,
							     dev->offset,
							     dev->sector, tx, sh,
							     r5c_is_writeback(conf->log));
					if (dev->page != dev->orig_page &&
					    !r5c_is_writeback(conf->log)) {
						set_bit(R5_SkipCopy, &dev->flags);
						clear_bit(R5_UPTODATE, &dev->flags);
						clear_bit(R5_OVERWRITE, &dev->flags);
					}
				}
				wbi = r5_next_bio(conf, wbi, dev->sector);
			}

			if (head_sh->batch_head) {
				sh = list_first_entry(&sh->batch_list,
						      struct stripe_head,
						      batch_list);
				if (sh == head_sh)
					continue;
				goto again;
			}
		}
	}

	return tx;
}

static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false, sync = false, discard = false;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
	}

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
				set_bit(R5_UPTODATE, &dev->flags);
				if (test_bit(STRIPE_EXPAND_READY, &sh->state))
					set_bit(R5_Expanded, &dev->flags);
			}
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
			if (sync)
				set_bit(R5_SyncIO, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs;
	unsigned int *off_srcs;
	struct async_submit_ctl submit;
	int count, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	unsigned int off_dest;
	int prexor = 0;
	unsigned long flags;
	int j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (pd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}
again:
	count = 0;
	xor_srcs = to_addr_page(percpu, j);
	off_srcs = to_addr_offs(sh, percpu);
	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (head_sh->dev[i].written ||
			    test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
				off_srcs[count] = dev->offset;
				xor_srcs[count++] = dev->page;
			}
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		off_dest = sh->dev[pd_idx].offset;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx) {
				off_srcs[count] = dev->offset;
				xor_srcs[count++] = dev->page;
			}
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;
	if (last_stripe) {
		flags = ASYNC_TX_ACK |
			(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

		atomic_inc(&head_sh->count);
		init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
				  to_addr_conv(sh, percpu, j));
	} else {
		flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
		init_async_submit(&submit, flags, tx, NULL, NULL,
				  to_addr_conv(sh, percpu, j));
	}

	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	else
		tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
				RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	if (!last_stripe) {
		j++;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		goto again;
	}
}

static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks;
	unsigned int *offs;
	int count, i, j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;
	int synflags;
	unsigned long txflags;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (sh->pd_idx == i || sh->qd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}

again:
	blocks = to_addr_page(percpu, j);
	offs = to_addr_offs(sh, percpu);

	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		synflags = SYNDROME_SRC_WRITTEN;
		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
	} else {
		synflags = SYNDROME_SRC_ALL;
		txflags = ASYNC_TX_ACK;
	}

	count = set_syndrome_sources(blocks, offs, sh, synflags);
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;

	if (last_stripe) {
		atomic_inc(&head_sh->count);
		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
				  head_sh, to_addr_conv(sh, percpu, j));
	} else
		init_async_submit(&submit, 0, tx, NULL, NULL,
				  to_addr_conv(sh, percpu, j));
	tx = async_gen_syndrome(blocks, offs, count+2,
			RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
	if (!last_stripe) {
		j++;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		goto again;
	}
}

2135static void ops_complete_check(void *stripe_head_ref)
2136{
2137 struct stripe_head *sh = stripe_head_ref;
2138
2139 pr_debug("%s: stripe %llu\n", __func__,
2140 (unsigned long long)sh->sector);
2141
2142 sh->check_state = check_state_check_result;
2143 set_bit(STRIPE_HANDLE, &sh->state);
2144 raid5_release_stripe(sh);
2145}
2146
2147static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
2148{
2149 int disks = sh->disks;
2150 int pd_idx = sh->pd_idx;
2151 int qd_idx = sh->qd_idx;
2152 struct page *xor_dest;
2153 unsigned int off_dest;
2154 struct page **xor_srcs = to_addr_page(percpu, 0);
2155 unsigned int *off_srcs = to_addr_offs(sh, percpu);
2156 struct dma_async_tx_descriptor *tx;
2157 struct async_submit_ctl submit;
2158 int count;
2159 int i;
2160
2161 pr_debug("%s: stripe %llu\n", __func__,
2162 (unsigned long long)sh->sector);
2163
2164 BUG_ON(sh->batch_head);
2165 count = 0;
2166 xor_dest = sh->dev[pd_idx].page;
2167 off_dest = sh->dev[pd_idx].offset;
2168 off_srcs[count] = off_dest;
2169 xor_srcs[count++] = xor_dest;
2170 for (i = disks; i--; ) {
2171 if (i == pd_idx || i == qd_idx)
2172 continue;
2173 off_srcs[count] = sh->dev[i].offset;
2174 xor_srcs[count++] = sh->dev[i].page;
2175 }
2176
2177 init_async_submit(&submit, 0, NULL, NULL, NULL,
2178 to_addr_conv(sh, percpu, 0));
2179 tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2180 RAID5_STRIPE_SIZE(sh->raid_conf),
2181 &sh->ops.zero_sum_result, &submit);
2182
2183 atomic_inc(&sh->count);
2184 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2185 tx = async_trigger_callback(&submit);
2186}
2187
2188static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2189{
2190 struct page **srcs = to_addr_page(percpu, 0);
2191 unsigned int *offs = to_addr_offs(sh, percpu);
2192 struct async_submit_ctl submit;
2193 int count;
2194
2195 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2196 (unsigned long long)sh->sector, checkp);
2197
2198 BUG_ON(sh->batch_head);
2199 count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
2200 if (!checkp)
2201 srcs[count] = NULL;
2202
2203 atomic_inc(&sh->count);
2204 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2205 sh, to_addr_conv(sh, percpu, 0));
2206 async_syndrome_val(srcs, offs, count+2,
2207 RAID5_STRIPE_SIZE(sh->raid_conf),
2208 &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
2209}
2210
2211static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2212{
2213 int overlap_clear = 0, i, disks = sh->disks;
2214 struct dma_async_tx_descriptor *tx = NULL;
2215 struct r5conf *conf = sh->raid_conf;
2216 int level = conf->level;
2217 struct raid5_percpu *percpu;
2218 unsigned long cpu;
2219
2220 cpu = get_cpu();
2221 percpu = per_cpu_ptr(conf->percpu, cpu);
2222 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2223 ops_run_biofill(sh);
2224 overlap_clear++;
2225 }
2226
2227 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2228 if (level < 6)
2229 tx = ops_run_compute5(sh, percpu);
2230 else {
2231 if (sh->ops.target2 < 0 || sh->ops.target < 0)
2232 tx = ops_run_compute6_1(sh, percpu);
2233 else
2234 tx = ops_run_compute6_2(sh, percpu);
2235 }
2236
2237 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2238 async_tx_ack(tx);
2239 }
2240
2241 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2242 if (level < 6)
2243 tx = ops_run_prexor5(sh, percpu, tx);
2244 else
2245 tx = ops_run_prexor6(sh, percpu, tx);
2246 }
2247
2248 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2249 tx = ops_run_partial_parity(sh, percpu, tx);
2250
2251 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2252 tx = ops_run_biodrain(sh, tx);
2253 overlap_clear++;
2254 }
2255
2256 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2257 if (level < 6)
2258 ops_run_reconstruct5(sh, percpu, tx);
2259 else
2260 ops_run_reconstruct6(sh, percpu, tx);
2261 }
2262
2263 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2264 if (sh->check_state == check_state_run)
2265 ops_run_check_p(sh, percpu);
2266 else if (sh->check_state == check_state_run_q)
2267 ops_run_check_pq(sh, percpu, 0);
2268 else if (sh->check_state == check_state_run_pq)
2269 ops_run_check_pq(sh, percpu, 1);
2270 else
2271 BUG();
2272 }
2273
2274 if (overlap_clear && !sh->batch_head)
2275 for (i = disks; i--; ) {
2276 struct r5dev *dev = &sh->dev[i];
2277 if (test_and_clear_bit(R5_Overlap, &dev->flags))
2278 wake_up(&sh->raid_conf->wait_for_overlap);
2279 }
2280 put_cpu();
2281}
2282
2283static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2284{
2285#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2286 kfree(sh->pages);
2287#endif
2288 if (sh->ppl_page)
2289 __free_page(sh->ppl_page);
2290 kmem_cache_free(sc, sh);
2291}
2292
2293static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2294 int disks, struct r5conf *conf)
2295{
2296 struct stripe_head *sh;
2297 int i;
2298
2299 sh = kmem_cache_zalloc(sc, gfp);
2300 if (sh) {
2301 spin_lock_init(&sh->stripe_lock);
2302 spin_lock_init(&sh->batch_lock);
2303 INIT_LIST_HEAD(&sh->batch_list);
2304 INIT_LIST_HEAD(&sh->lru);
2305 INIT_LIST_HEAD(&sh->r5c);
2306 INIT_LIST_HEAD(&sh->log_list);
2307 atomic_set(&sh->count, 1);
2308 sh->raid_conf = conf;
2309 sh->log_start = MaxSector;
2310 for (i = 0; i < disks; i++) {
2311 struct r5dev *dev = &sh->dev[i];
2312
2313 bio_init(&dev->req, &dev->vec, 1);
2314 bio_init(&dev->rreq, &dev->rvec, 1);
2315 }
2316
2317 if (raid5_has_ppl(conf)) {
2318 sh->ppl_page = alloc_page(gfp);
2319 if (!sh->ppl_page) {
2320 free_stripe(sc, sh);
2321 return NULL;
2322 }
2323 }
2324#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2325 if (init_stripe_shared_pages(sh, conf, disks)) {
2326 free_stripe(sc, sh);
2327 return NULL;
2328 }
2329#endif
2330 }
2331 return sh;
2332}
2333static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2334{
2335 struct stripe_head *sh;
2336
2337 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2338 if (!sh)
2339 return 0;
2340
2341 if (grow_buffers(sh, gfp)) {
2342 shrink_buffers(sh);
2343 free_stripe(conf->slab_cache, sh);
2344 return 0;
2345 }
2346 sh->hash_lock_index =
2347 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2348
2349 atomic_inc(&conf->active_stripes);
2350
2351 raid5_release_stripe(sh);
2352 conf->max_nr_stripes++;
2353 return 1;
2354}
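
/*
 * Illustrative example (hypothetical numbers, not from the original
 * source): with NR_STRIPE_HASH_LOCKS == 8, successive grow_one_stripe()
 * calls assign hash_lock_index 0, 1, ..., 7, 0, 1, ... because
 * max_nr_stripes is incremented once per stripe, so new inactive stripes
 * are spread evenly across the hash-lock lists.
 */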
2355
2356static int grow_stripes(struct r5conf *conf, int num)
2357{
2358 struct kmem_cache *sc;
2359 size_t namelen = sizeof(conf->cache_name[0]);
2360 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2361
2362 if (conf->mddev->gendisk)
2363 snprintf(conf->cache_name[0], namelen,
2364 "raid%d-%s", conf->level, mdname(conf->mddev));
2365 else
2366 snprintf(conf->cache_name[0], namelen,
2367 "raid%d-%p", conf->level, conf->mddev);
2368 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2369
2370 conf->active_name = 0;
2371 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2372 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2373 0, 0, NULL);
2374 if (!sc)
2375 return 1;
2376 conf->slab_cache = sc;
2377 conf->pool_size = devs;
2378 while (num--)
2379 if (!grow_one_stripe(conf, GFP_KERNEL))
2380 return 1;
2381
2382 return 0;
2383}
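
/*
 * A worked size example for the slab object created above (hypothetical
 * devs == 8): struct stripe_head ends in a one-element dev[] array, so
 * the cache object is sizeof(struct stripe_head) plus room for devs - 1
 * further struct r5dev entries, giving each stripe_head an inline
 * dev[0..7].
 */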
2384

/**
 * scribble_alloc - allocate percpu scribble buffer for required size
 *		    of the scribble region
 * @percpu: from for_each_present_cpu() of the caller
 * @num: total number of disks in the array
 * @cnt: scribble objs count for required size of the scribble region
 *
 * The scribble buffer size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case
 * where we calculate over all devices (not just the data blocks),
 * using zeros in place of the P and Q blocks.
 */
2401static int scribble_alloc(struct raid5_percpu *percpu,
2402 int num, int cnt)
2403{
2404 size_t obj_size =
2405 sizeof(struct page *) * (num + 2) +
2406 sizeof(addr_conv_t) * (num + 2) +
2407 sizeof(unsigned int) * (num + 2);
2408 void *scribble;
2409
	/*
	 * When called from the raid array suspend path we are already in a
	 * memalloc noio context, so a GFP_KERNEL allocation here cannot
	 * recurse into memory-reclaim I/O.
	 */
2415 scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2416 if (!scribble)
2417 return -ENOMEM;
2418
2419 kvfree(percpu->scribble);
2420
2421 percpu->scribble = scribble;
2422 percpu->scribble_obj_size = obj_size;
2423 return 0;
2424}
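
/*
 * A worked size example for the allocation above (hypothetical num == 8,
 * i.e. 10 slots including the two extra P/Q destinations):
 *   obj_size = 10 * sizeof(struct page *)
 *            + 10 * sizeof(addr_conv_t)
 *            + 10 * sizeof(unsigned int)
 * and kvmalloc_array() checks that cnt * obj_size does not overflow.
 */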
2425
2426static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2427{
2428 unsigned long cpu;
2429 int err = 0;
2430
	/*
	 * Never shrink. And mddev_suspend() could deadlock if this is called
	 * from raid5d. In that case, scribble_disks and scribble_sectors
	 * should already equal new_disks and new_sectors.
	 */
2436 if (conf->scribble_disks >= new_disks &&
2437 conf->scribble_sectors >= new_sectors)
2438 return 0;
2439 mddev_suspend(conf->mddev);
2440 get_online_cpus();
2441
2442 for_each_present_cpu(cpu) {
2443 struct raid5_percpu *percpu;
2444
2445 percpu = per_cpu_ptr(conf->percpu, cpu);
2446 err = scribble_alloc(percpu, new_disks,
2447 new_sectors / RAID5_STRIPE_SECTORS(conf));
2448 if (err)
2449 break;
2450 }
2451
2452 put_online_cpus();
2453 mddev_resume(conf->mddev);
2454 if (!err) {
2455 conf->scribble_disks = new_disks;
2456 conf->scribble_sectors = new_sectors;
2457 }
2458 return err;
2459}
2460
2461static int resize_stripes(struct r5conf *conf, int newsize)
2462{
	/*
	 * Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Then we finish reusing the old
	 *    stripe_heads and discard the old kmem_cache.
	 * 3/ at this point all stripes are held, so the array is effectively
	 *    stalled; resize conf->disks and the scribble region as needed.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
2486 struct stripe_head *osh, *nsh;
2487 LIST_HEAD(newstripes);
2488 struct disk_info *ndisks;
2489 int err = 0;
2490 struct kmem_cache *sc;
2491 int i;
2492 int hash, cnt;
2493
2494 md_allow_write(conf->mddev);
2495
	/* Step 1 */
2497 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2498 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2499 0, 0, NULL);
2500 if (!sc)
2501 return -ENOMEM;
2502
	/* Need to ensure auto-resizing doesn't interfere */
2504 mutex_lock(&conf->cache_size_mutex);
2505
2506 for (i = conf->max_nr_stripes; i; i--) {
2507 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2508 if (!nsh)
2509 break;
2510
2511 list_add(&nsh->lru, &newstripes);
2512 }
2513 if (i) {
		/* didn't get enough, give up */
2515 while (!list_empty(&newstripes)) {
2516 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2517 list_del(&nsh->lru);
2518 free_stripe(sc, nsh);
2519 }
2520 kmem_cache_destroy(sc);
2521 mutex_unlock(&conf->cache_size_mutex);
2522 return -ENOMEM;
2523 }
2524
	/*
	 * Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
2528 hash = 0;
2529 cnt = 0;
2530 list_for_each_entry(nsh, &newstripes, lru) {
2531 lock_device_hash_lock(conf, hash);
2532 wait_event_cmd(conf->wait_for_stripe,
2533 !list_empty(conf->inactive_list + hash),
2534 unlock_device_hash_lock(conf, hash),
2535 lock_device_hash_lock(conf, hash));
2536 osh = get_free_stripe(conf, hash);
2537 unlock_device_hash_lock(conf, hash);
2538
2539#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2540 for (i = 0; i < osh->nr_pages; i++) {
2541 nsh->pages[i] = osh->pages[i];
2542 osh->pages[i] = NULL;
2543 }
2544#endif
		for (i = 0; i < conf->pool_size; i++) {
2546 nsh->dev[i].page = osh->dev[i].page;
2547 nsh->dev[i].orig_page = osh->dev[i].page;
2548 nsh->dev[i].offset = osh->dev[i].offset;
2549 }
2550 nsh->hash_lock_index = hash;
2551 free_stripe(conf->slab_cache, osh);
2552 cnt++;
2553 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2554 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2555 hash++;
2556 cnt = 0;
2557 }
2558 }
2559 kmem_cache_destroy(conf->slab_cache);
2560
	/*
	 * Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks and the scribble region.
	 */
2566 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2567 if (ndisks) {
2568 for (i = 0; i < conf->pool_size; i++)
2569 ndisks[i] = conf->disks[i];
2570
2571 for (i = conf->pool_size; i < newsize; i++) {
2572 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2573 if (!ndisks[i].extra_page)
2574 err = -ENOMEM;
2575 }
2576
2577 if (err) {
2578 for (i = conf->pool_size; i < newsize; i++)
2579 if (ndisks[i].extra_page)
2580 put_page(ndisks[i].extra_page);
2581 kfree(ndisks);
2582 } else {
2583 kfree(conf->disks);
2584 conf->disks = ndisks;
2585 }
2586 } else
2587 err = -ENOMEM;
2588
2589 conf->slab_cache = sc;
2590 conf->active_name = 1-conf->active_name;
2591
	/* Step 4, return new stripes to service */
	while (!list_empty(&newstripes)) {
2594 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2595 list_del_init(&nsh->lru);
2596
2597#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2598 for (i = 0; i < nsh->nr_pages; i++) {
2599 if (nsh->pages[i])
2600 continue;
2601 nsh->pages[i] = alloc_page(GFP_NOIO);
2602 if (!nsh->pages[i])
2603 err = -ENOMEM;
2604 }
2605
2606 for (i = conf->raid_disks; i < newsize; i++) {
2607 if (nsh->dev[i].page)
2608 continue;
2609 nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2610 nsh->dev[i].orig_page = nsh->dev[i].page;
2611 nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2612 }
2613#else
		for (i = conf->raid_disks; i < newsize; i++)
2615 if (nsh->dev[i].page == NULL) {
2616 struct page *p = alloc_page(GFP_NOIO);
2617 nsh->dev[i].page = p;
2618 nsh->dev[i].orig_page = p;
2619 nsh->dev[i].offset = 0;
2620 if (!p)
2621 err = -ENOMEM;
2622 }
2623#endif
2624 raid5_release_stripe(nsh);
2625 }
2626
2627
2628 if (!err)
2629 conf->pool_size = newsize;
2630 mutex_unlock(&conf->cache_size_mutex);
2631
2632 return err;
2633}
2634
2635static int drop_one_stripe(struct r5conf *conf)
2636{
2637 struct stripe_head *sh;
2638 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2639
2640 spin_lock_irq(conf->hash_locks + hash);
2641 sh = get_free_stripe(conf, hash);
2642 spin_unlock_irq(conf->hash_locks + hash);
2643 if (!sh)
2644 return 0;
2645 BUG_ON(atomic_read(&sh->count));
2646 shrink_buffers(sh);
2647 free_stripe(conf->slab_cache, sh);
2648 atomic_dec(&conf->active_stripes);
2649 conf->max_nr_stripes--;
2650 return 1;
2651}
2652
2653static void shrink_stripes(struct r5conf *conf)
2654{
2655 while (conf->max_nr_stripes &&
2656 drop_one_stripe(conf))
2657 ;
2658
2659 kmem_cache_destroy(conf->slab_cache);
2660 conf->slab_cache = NULL;
2661}
2662
static void raid5_end_read_request(struct bio *bi)
2664{
2665 struct stripe_head *sh = bi->bi_private;
2666 struct r5conf *conf = sh->raid_conf;
2667 int disks = sh->disks, i;
2668 char b[BDEVNAME_SIZE];
2669 struct md_rdev *rdev = NULL;
2670 sector_t s;
2671
	for (i = 0; i < disks; i++)
2673 if (bi == &sh->dev[i].req)
2674 break;
2675
2676 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2677 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2678 bi->bi_status);
2679 if (i == disks) {
2680 bio_reset(bi);
2681 BUG();
2682 return;
2683 }
2684 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
		/*
		 * If the replacement finished while this request was
		 * outstanding, 'replacement' might be NULL already.
		 * In that case it moved down to 'rdev'.
		 * rdev is not removed until all requests are finished.
		 */
2690 rdev = conf->disks[i].replacement;
2691 if (!rdev)
2692 rdev = conf->disks[i].rdev;
2693
2694 if (use_new_offset(conf, sh))
2695 s = sh->sector + rdev->new_data_offset;
2696 else
2697 s = sh->sector + rdev->data_offset;
2698 if (!bi->bi_status) {
2699 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2700 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			/*
			 * Note that this cannot happen on a
			 * replacement device.  We just fail those on
			 * any error.
			 */
2705 pr_info_ratelimited(
2706 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2707 mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2708 (unsigned long long)s,
2709 bdevname(rdev->bdev, b));
2710 atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
2711 clear_bit(R5_ReadError, &sh->dev[i].flags);
2712 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2713 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2714 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2715
2716 if (test_bit(R5_InJournal, &sh->dev[i].flags))
			/*
			 * end read for a page in journal, this
			 * must be preparing for prexor in rmw
			 */
2721 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2722
2723 if (atomic_read(&rdev->read_errors))
2724 atomic_set(&rdev->read_errors, 0);
2725 } else {
2726 const char *bdn = bdevname(rdev->bdev, b);
2727 int retry = 0;
2728 int set_bad = 0;
2729
2730 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
		if (bi->bi_status != BLK_STS_PROTECTION)
2732 atomic_inc(&rdev->read_errors);
2733 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2734 pr_warn_ratelimited(
2735 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2736 mdname(conf->mddev),
2737 (unsigned long long)s,
2738 bdn);
2739 else if (conf->mddev->degraded >= conf->max_degraded) {
2740 set_bad = 1;
2741 pr_warn_ratelimited(
2742 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2743 mdname(conf->mddev),
2744 (unsigned long long)s,
2745 bdn);
2746 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
			/* the earlier re-write did not fix the read error */
2748 set_bad = 1;
2749 pr_warn_ratelimited(
2750 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2751 mdname(conf->mddev),
2752 (unsigned long long)s,
2753 bdn);
2754 } else if (atomic_read(&rdev->read_errors)
2755 > conf->max_nr_stripes) {
2756 if (!test_bit(Faulty, &rdev->flags)) {
2757 pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2758 mdname(conf->mddev),
2759 atomic_read(&rdev->read_errors),
2760 conf->max_nr_stripes);
2761 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2762 mdname(conf->mddev), bdn);
2763 }
2764 } else
2765 retry = 1;
2766 if (set_bad && test_bit(In_sync, &rdev->flags)
2767 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2768 retry = 1;
		if (retry) {
			if (sh->qd_idx >= 0 && sh->pd_idx == i)
				set_bit(R5_ReadError, &sh->dev[i].flags);
			else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
				set_bit(R5_ReadError, &sh->dev[i].flags);
				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
			} else
				set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
		} else {
2778 clear_bit(R5_ReadError, &sh->dev[i].flags);
2779 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2780 if (!(set_bad
2781 && test_bit(In_sync, &rdev->flags)
2782 && rdev_set_badblocks(
2783 rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
2784 md_error(conf->mddev, rdev);
2785 }
2786 }
2787 rdev_dec_pending(rdev, conf->mddev);
2788 bio_reset(bi);
2789 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2790 set_bit(STRIPE_HANDLE, &sh->state);
2791 raid5_release_stripe(sh);
2792}
2793
2794static void raid5_end_write_request(struct bio *bi)
2795{
2796 struct stripe_head *sh = bi->bi_private;
2797 struct r5conf *conf = sh->raid_conf;
2798 int disks = sh->disks, i;
2799 struct md_rdev *rdev;
2800 sector_t first_bad;
2801 int bad_sectors;
2802 int replacement = 0;
2803
2804 for (i = 0 ; i < disks; i++) {
2805 if (bi == &sh->dev[i].req) {
2806 rdev = conf->disks[i].rdev;
2807 break;
2808 }
2809 if (bi == &sh->dev[i].rreq) {
2810 rdev = conf->disks[i].replacement;
2811 if (rdev)
2812 replacement = 1;
2813 else
				/*
				 * rdev was removed and 'replacement'
				 * replaced it.  rdev is not removed
				 * until all requests are finished.
				 */
2818 rdev = conf->disks[i].rdev;
2819 break;
2820 }
2821 }
2822 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2823 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2824 bi->bi_status);
2825 if (i == disks) {
2826 bio_reset(bi);
2827 BUG();
2828 return;
2829 }
2830
2831 if (replacement) {
2832 if (bi->bi_status)
2833 md_error(conf->mddev, rdev);
2834 else if (is_badblock(rdev, sh->sector,
2835 RAID5_STRIPE_SECTORS(conf),
2836 &first_bad, &bad_sectors))
2837 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2838 } else {
2839 if (bi->bi_status) {
2840 set_bit(STRIPE_DEGRADED, &sh->state);
2841 set_bit(WriteErrorSeen, &rdev->flags);
2842 set_bit(R5_WriteError, &sh->dev[i].flags);
2843 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2844 set_bit(MD_RECOVERY_NEEDED,
2845 &rdev->mddev->recovery);
2846 } else if (is_badblock(rdev, sh->sector,
2847 RAID5_STRIPE_SECTORS(conf),
2848 &first_bad, &bad_sectors)) {
2849 set_bit(R5_MadeGood, &sh->dev[i].flags);
2850 if (test_bit(R5_ReadError, &sh->dev[i].flags))
				/*
				 * That was a successful write so make
				 * sure it looks like we already did
				 * a re-write.
				 */
2855 set_bit(R5_ReWrite, &sh->dev[i].flags);
2856 }
2857 }
2858 rdev_dec_pending(rdev, conf->mddev);
2859
2860 if (sh->batch_head && bi->bi_status && !replacement)
2861 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2862
2863 bio_reset(bi);
2864 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2865 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2866 set_bit(STRIPE_HANDLE, &sh->state);
2867 raid5_release_stripe(sh);
2868
2869 if (sh->batch_head && sh != sh->batch_head)
2870 raid5_release_stripe(sh->batch_head);
2871}
2872
2873static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2874{
2875 char b[BDEVNAME_SIZE];
2876 struct r5conf *conf = mddev->private;
2877 unsigned long flags;
2878 pr_debug("raid456: error called\n");
2879
2880 spin_lock_irqsave(&conf->device_lock, flags);
2881
2882 if (test_bit(In_sync, &rdev->flags) &&
2883 mddev->degraded == conf->max_degraded) {
		/*
		 * Don't allow the array to reach a failed state:
		 * don't mark this device Faulty, and don't try to
		 * recover it.
		 */
2888 conf->recovery_disabled = mddev->recovery_disabled;
2889 spin_unlock_irqrestore(&conf->device_lock, flags);
2890 return;
2891 }
2892
2893 set_bit(Faulty, &rdev->flags);
2894 clear_bit(In_sync, &rdev->flags);
2895 mddev->degraded = raid5_calc_degraded(conf);
2896 spin_unlock_irqrestore(&conf->device_lock, flags);
2897 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2898
2899 set_bit(Blocked, &rdev->flags);
2900 set_mask_bits(&mddev->sb_flags, 0,
2901 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2902 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2903 "md/raid:%s: Operation continuing on %d devices.\n",
2904 mdname(mddev),
2905 bdevname(rdev->bdev, b),
2906 mdname(mddev),
2907 conf->raid_disks - mddev->degraded);
2908 r5c_update_on_rdev_error(mddev, rdev);
2909}
2910
/*
 * Input: a 'big' sector number.
 * Output: index of the data and parity disk, and the sector # in them.
 */
2915sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2916 int previous, int *dd_idx,
2917 struct stripe_head *sh)
2918{
2919 sector_t stripe, stripe2;
2920 sector_t chunk_number;
2921 unsigned int chunk_offset;
2922 int pd_idx, qd_idx;
2923 int ddf_layout = 0;
2924 sector_t new_sector;
2925 int algorithm = previous ? conf->prev_algo
2926 : conf->algorithm;
2927 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2928 : conf->chunk_sectors;
2929 int raid_disks = previous ? conf->previous_raid_disks
2930 : conf->raid_disks;
2931 int data_disks = raid_disks - conf->max_degraded;
2932
	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
2938 chunk_offset = sector_div(r_sector, sectors_per_chunk);
2939 chunk_number = r_sector;
2940
	/*
	 * Compute the stripe number
	 */
2944 stripe = chunk_number;
2945 *dd_idx = sector_div(stripe, data_disks);
2946 stripe2 = stripe;
2947
	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
2950 pd_idx = qd_idx = -1;
2951 switch(conf->level) {
2952 case 4:
2953 pd_idx = data_disks;
2954 break;
2955 case 5:
2956 switch (algorithm) {
2957 case ALGORITHM_LEFT_ASYMMETRIC:
2958 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2959 if (*dd_idx >= pd_idx)
2960 (*dd_idx)++;
2961 break;
2962 case ALGORITHM_RIGHT_ASYMMETRIC:
2963 pd_idx = sector_div(stripe2, raid_disks);
2964 if (*dd_idx >= pd_idx)
2965 (*dd_idx)++;
2966 break;
2967 case ALGORITHM_LEFT_SYMMETRIC:
2968 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2969 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2970 break;
2971 case ALGORITHM_RIGHT_SYMMETRIC:
2972 pd_idx = sector_div(stripe2, raid_disks);
2973 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2974 break;
2975 case ALGORITHM_PARITY_0:
2976 pd_idx = 0;
2977 (*dd_idx)++;
2978 break;
2979 case ALGORITHM_PARITY_N:
2980 pd_idx = data_disks;
2981 break;
2982 default:
2983 BUG();
2984 }
2985 break;
2986 case 6:
2987
2988 switch (algorithm) {
2989 case ALGORITHM_LEFT_ASYMMETRIC:
2990 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2991 qd_idx = pd_idx + 1;
2992 if (pd_idx == raid_disks-1) {
2993 (*dd_idx)++;
2994 qd_idx = 0;
2995 } else if (*dd_idx >= pd_idx)
2996 (*dd_idx) += 2;
2997 break;
2998 case ALGORITHM_RIGHT_ASYMMETRIC:
2999 pd_idx = sector_div(stripe2, raid_disks);
3000 qd_idx = pd_idx + 1;
3001 if (pd_idx == raid_disks-1) {
3002 (*dd_idx)++;
3003 qd_idx = 0;
3004 } else if (*dd_idx >= pd_idx)
3005 (*dd_idx) += 2;
3006 break;
3007 case ALGORITHM_LEFT_SYMMETRIC:
3008 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3009 qd_idx = (pd_idx + 1) % raid_disks;
3010 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3011 break;
3012 case ALGORITHM_RIGHT_SYMMETRIC:
3013 pd_idx = sector_div(stripe2, raid_disks);
3014 qd_idx = (pd_idx + 1) % raid_disks;
3015 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3016 break;
3017
3018 case ALGORITHM_PARITY_0:
3019 pd_idx = 0;
3020 qd_idx = 1;
3021 (*dd_idx) += 2;
3022 break;
3023 case ALGORITHM_PARITY_N:
3024 pd_idx = data_disks;
3025 qd_idx = data_disks + 1;
3026 break;
3027
3028 case ALGORITHM_ROTATING_ZERO_RESTART:
			/*
			 * Exactly the same as RIGHT_ASYMMETRIC, but the
			 * order of blocks for computing Q is different.
			 */
3032 pd_idx = sector_div(stripe2, raid_disks);
3033 qd_idx = pd_idx + 1;
3034 if (pd_idx == raid_disks-1) {
3035 (*dd_idx)++;
3036 qd_idx = 0;
3037 } else if (*dd_idx >= pd_idx)
3038 (*dd_idx) += 2;
3039 ddf_layout = 1;
3040 break;
3041
3042 case ALGORITHM_ROTATING_N_RESTART:
			/*
			 * Same as left_asymmetric, except the first
			 * stripe is D D D P Q rather than Q D D D P.
			 */
3047 stripe2 += 1;
3048 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3049 qd_idx = pd_idx + 1;
3050 if (pd_idx == raid_disks-1) {
3051 (*dd_idx)++;
3052 qd_idx = 0;
3053 } else if (*dd_idx >= pd_idx)
3054 (*dd_idx) += 2;
3055 ddf_layout = 1;
3056 break;
3057
3058 case ALGORITHM_ROTATING_N_CONTINUE:
			/* Same as left_symmetric but Q is before P */
3060 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3061 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
3062 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3063 ddf_layout = 1;
3064 break;
3065
3066 case ALGORITHM_LEFT_ASYMMETRIC_6:
			/* RAID5 left_asymmetric, with Q on the last device */
3068 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3069 if (*dd_idx >= pd_idx)
3070 (*dd_idx)++;
3071 qd_idx = raid_disks - 1;
3072 break;
3073
3074 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3075 pd_idx = sector_div(stripe2, raid_disks-1);
3076 if (*dd_idx >= pd_idx)
3077 (*dd_idx)++;
3078 qd_idx = raid_disks - 1;
3079 break;
3080
3081 case ALGORITHM_LEFT_SYMMETRIC_6:
3082 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3083 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3084 qd_idx = raid_disks - 1;
3085 break;
3086
3087 case ALGORITHM_RIGHT_SYMMETRIC_6:
3088 pd_idx = sector_div(stripe2, raid_disks-1);
3089 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3090 qd_idx = raid_disks - 1;
3091 break;
3092
3093 case ALGORITHM_PARITY_0_6:
3094 pd_idx = 0;
3095 (*dd_idx)++;
3096 qd_idx = raid_disks - 1;
3097 break;
3098
3099 default:
3100 BUG();
3101 }
3102 break;
3103 }
3104
3105 if (sh) {
3106 sh->pd_idx = pd_idx;
3107 sh->qd_idx = qd_idx;
3108 sh->ddf_layout = ddf_layout;
3109 }
3110
	/*
	 * Finally, compute the new sector number
	 */
3113 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3114 return new_sector;
3115}
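
/*
 * Worked example of the mapping above (hypothetical array, not from the
 * original source): RAID5, raid_disks = 4, 128-sector chunks,
 * ALGORITHM_LEFT_SYMMETRIC, r_sector = 1000:
 *   chunk_offset = 1000 % 128 = 104, chunk_number = 1000 / 128 = 7
 *   dd_idx = 7 % 3 = 1, stripe = 7 / 3 = 2          (data_disks = 3)
 *   pd_idx = 3 - (2 % 4) = 1
 *   dd_idx = (pd_idx + 1 + 1) % 4 = 3
 *   new_sector = 2 * 128 + 104 = 360
 * so the request lands on disk 3 at device sector 360, with parity on
 * disk 1 for that stripe.
 */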
3116
3117sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
3118{
3119 struct r5conf *conf = sh->raid_conf;
3120 int raid_disks = sh->disks;
3121 int data_disks = raid_disks - conf->max_degraded;
3122 sector_t new_sector = sh->sector, check;
3123 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3124 : conf->chunk_sectors;
3125 int algorithm = previous ? conf->prev_algo
3126 : conf->algorithm;
3127 sector_t stripe;
3128 int chunk_offset;
3129 sector_t chunk_number;
3130 int dummy1, dd_idx = i;
3131 sector_t r_sector;
3132 struct stripe_head sh2;
3133
3134 chunk_offset = sector_div(new_sector, sectors_per_chunk);
3135 stripe = new_sector;
3136
3137 if (i == sh->pd_idx)
3138 return 0;
3139 switch(conf->level) {
3140 case 4: break;
3141 case 5:
3142 switch (algorithm) {
3143 case ALGORITHM_LEFT_ASYMMETRIC:
3144 case ALGORITHM_RIGHT_ASYMMETRIC:
3145 if (i > sh->pd_idx)
3146 i--;
3147 break;
3148 case ALGORITHM_LEFT_SYMMETRIC:
3149 case ALGORITHM_RIGHT_SYMMETRIC:
3150 if (i < sh->pd_idx)
3151 i += raid_disks;
3152 i -= (sh->pd_idx + 1);
3153 break;
3154 case ALGORITHM_PARITY_0:
3155 i -= 1;
3156 break;
3157 case ALGORITHM_PARITY_N:
3158 break;
3159 default:
3160 BUG();
3161 }
3162 break;
3163 case 6:
3164 if (i == sh->qd_idx)
3165 return 0;
3166 switch (algorithm) {
3167 case ALGORITHM_LEFT_ASYMMETRIC:
3168 case ALGORITHM_RIGHT_ASYMMETRIC:
3169 case ALGORITHM_ROTATING_ZERO_RESTART:
3170 case ALGORITHM_ROTATING_N_RESTART:
3171 if (sh->pd_idx == raid_disks-1)
3172 i--;
3173 else if (i > sh->pd_idx)
3174 i -= 2;
3175 break;
3176 case ALGORITHM_LEFT_SYMMETRIC:
3177 case ALGORITHM_RIGHT_SYMMETRIC:
3178 if (sh->pd_idx == raid_disks-1)
3179 i--;
3180 else {
				/* D D P Q D */
3182 if (i < sh->pd_idx)
3183 i += raid_disks;
3184 i -= (sh->pd_idx + 2);
3185 }
3186 break;
3187 case ALGORITHM_PARITY_0:
3188 i -= 2;
3189 break;
3190 case ALGORITHM_PARITY_N:
3191 break;
3192 case ALGORITHM_ROTATING_N_CONTINUE:
			/* Like left_symmetric, but P is before Q */
3194 if (sh->pd_idx == 0)
3195 i--;
3196 else {
				/* D D Q P D */
3198 if (i < sh->pd_idx)
3199 i += raid_disks;
3200 i -= (sh->pd_idx + 1);
3201 }
3202 break;
3203 case ALGORITHM_LEFT_ASYMMETRIC_6:
3204 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3205 if (i > sh->pd_idx)
3206 i--;
3207 break;
3208 case ALGORITHM_LEFT_SYMMETRIC_6:
3209 case ALGORITHM_RIGHT_SYMMETRIC_6:
3210 if (i < sh->pd_idx)
3211 i += data_disks + 1;
3212 i -= (sh->pd_idx + 1);
3213 break;
3214 case ALGORITHM_PARITY_0_6:
3215 i -= 1;
3216 break;
3217 default:
3218 BUG();
3219 }
3220 break;
3221 }
3222
3223 chunk_number = stripe * data_disks + i;
3224 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3225
3226 check = raid5_compute_sector(conf, r_sector,
3227 previous, &dummy1, &sh2);
3228 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3229 || sh2.qd_idx != sh->qd_idx) {
3230 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3231 mdname(conf->mddev));
3232 return 0;
3233 }
3234 return r_sector;
3235}
3236
/*
 * There are cases where we want handle_stripe_dirtying() and
 * schedule_reconstruction() to delay towrite to some dev of a stripe.
 *
 * This function checks whether we want to delay the towrite. Specifically,
 * we delay the towrite when:
 *
 *   1. degraded stripe has a non-overwrite dev that has InJournal, AND
 *      stripe has data in journal (for other devices).
 *
 *      In this case, when reading data for the non-overwrite dev, it is
 *      necessary to handle complex rmw of write back cache (prexor with
 *      orig_page, and xor with page). To keep the read path simple, we
 *      would like to flush data in journal to RAID disks first, so complex
 *      rmw is handled in the write path (handle_stripe_dirtying).
 *
 *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
 *
 *      It is important to be able to flush all stripes in raid5-cache.
 *      Therefore, we need to reserve some space on the journal device for
 *      these flushes. If a flush operation includes pending writes to the
 *      stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
 *      for the flush out. To keep this reservation small, we do not allow
 *      writes to stripes with data cached in journal when R5C_LOG_CRITICAL
 *      is set.
 *
 *   3. during journal failure
 *
 *      In journal failure, we try to flush all cached data to raid disks
 *      based on data in the stripe cache. The array is read-only to upper
 *      layers, so we would skip all pending writes.
 */
3275static inline bool delay_towrite(struct r5conf *conf,
3276 struct r5dev *dev,
3277 struct stripe_head_state *s)
3278{
	/* case 1 above */
	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
	    !test_bit(R5_Insync, &dev->flags) && s->injournal)
		return true;
	/* case 2 above */
	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    s->injournal > 0)
		return true;
	/* case 3 above */
	if (s->log_failed && s->injournal)
		return true;
3290 return false;
3291}
3292
3293static void
3294schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3295 int rcw, int expand)
3296{
3297 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3298 struct r5conf *conf = sh->raid_conf;
3299 int level = conf->level;
3300
3301 if (rcw) {
		/*
		 * In some cases, handle_stripe_dirtying initially decided
		 * to run rmw and allocated the extra page for prexor.
		 * However, rcw turned out to be cheaper. Free the extra
		 * page now, as we won't be able to do so later.
		 */
3308 r5c_release_extra_page(sh);
3309
3310 for (i = disks; i--; ) {
3311 struct r5dev *dev = &sh->dev[i];
3312
3313 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3314 set_bit(R5_LOCKED, &dev->flags);
3315 set_bit(R5_Wantdrain, &dev->flags);
3316 if (!expand)
3317 clear_bit(R5_UPTODATE, &dev->flags);
3318 s->locked++;
3319 } else if (test_bit(R5_InJournal, &dev->flags)) {
3320 set_bit(R5_LOCKED, &dev->flags);
3321 s->locked++;
3322 }
3323 }
3324
		/*
		 * if we are not expanding this is a proper write request,
		 * and there will be bios with new data to be drained into
		 * the stripe cache
		 */
3328 if (!expand) {
3329 if (!s->locked)
				/* False alarm, nothing to do */
3331 return;
3332 sh->reconstruct_state = reconstruct_state_drain_run;
3333 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3334 } else
3335 sh->reconstruct_state = reconstruct_state_run;
3336
3337 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3338
3339 if (s->locked + conf->max_degraded == disks)
3340 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3341 atomic_inc(&conf->pending_full_writes);
3342 } else {
3343 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3344 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3345 BUG_ON(level == 6 &&
3346 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3347 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3348
3349 for (i = disks; i--; ) {
3350 struct r5dev *dev = &sh->dev[i];
3351 if (i == pd_idx || i == qd_idx)
3352 continue;
3353
3354 if (dev->towrite &&
3355 (test_bit(R5_UPTODATE, &dev->flags) ||
3356 test_bit(R5_Wantcompute, &dev->flags))) {
3357 set_bit(R5_Wantdrain, &dev->flags);
3358 set_bit(R5_LOCKED, &dev->flags);
3359 clear_bit(R5_UPTODATE, &dev->flags);
3360 s->locked++;
3361 } else if (test_bit(R5_InJournal, &dev->flags)) {
3362 set_bit(R5_LOCKED, &dev->flags);
3363 s->locked++;
3364 }
3365 }
3366 if (!s->locked)
			/* False alarm - nothing to do */
3368 return;
3369 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3370 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3371 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3372 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3373 }
3374
	/*
	 * keep the parity disk(s) locked while asynchronous operations
	 * are in flight
	 */
3378 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3379 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3380 s->locked++;
3381
3382 if (level == 6) {
3383 int qd_idx = sh->qd_idx;
3384 struct r5dev *dev = &sh->dev[qd_idx];
3385
3386 set_bit(R5_LOCKED, &dev->flags);
3387 clear_bit(R5_UPTODATE, &dev->flags);
3388 s->locked++;
3389 }
3390
3391 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3392 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3393 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3394 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3395 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3396
3397 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3398 __func__, (unsigned long long)sh->sector,
3399 s->locked, s->ops_request);
3400}
3401
/*
 * Each stripe/dev can have one or more bios attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be in order.
 */
3407static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3408 int forwrite, int previous)
3409{
3410 struct bio **bip;
3411 struct r5conf *conf = sh->raid_conf;
3412 int firstwrite=0;
3413
3414 pr_debug("adding bi b#%llu to stripe s#%llu\n",
3415 (unsigned long long)bi->bi_iter.bi_sector,
3416 (unsigned long long)sh->sector);
3417
3418 spin_lock_irq(&sh->stripe_lock);
3419 sh->dev[dd_idx].write_hint = bi->bi_write_hint;
	/* Don't allow new IO added to stripes in batch list */
3421 if (sh->batch_head)
3422 goto overlap;
3423 if (forwrite) {
3424 bip = &sh->dev[dd_idx].towrite;
3425 if (*bip == NULL)
3426 firstwrite = 1;
3427 } else
3428 bip = &sh->dev[dd_idx].toread;
3429 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3430 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3431 goto overlap;
3432 bip = & (*bip)->bi_next;
3433 }
3434 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3435 goto overlap;
3436
3437 if (forwrite && raid5_has_ppl(conf)) {
		/*
		 * With PPL only writes to consecutive data chunks within a
		 * stripe are allowed because for a single stripe_head we can
		 * only have one PPL entry at a time, which describes one data
		 * range. Not really an overlap, but wait_for_overlap can be
		 * used to handle this.
		 */
3445 sector_t sector;
3446 sector_t first = 0;
3447 sector_t last = 0;
3448 int count = 0;
3449 int i;
3450
3451 for (i = 0; i < sh->disks; i++) {
3452 if (i != sh->pd_idx &&
3453 (i == dd_idx || sh->dev[i].towrite)) {
3454 sector = sh->dev[i].sector;
3455 if (count == 0 || sector < first)
3456 first = sector;
3457 if (sector > last)
3458 last = sector;
3459 count++;
3460 }
3461 }
3462
3463 if (first + conf->chunk_sectors * (count - 1) != last)
3464 goto overlap;
3465 }
3466
3467 if (!forwrite || previous)
3468 clear_bit(STRIPE_BATCH_READY, &sh->state);
3469
3470 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3471 if (*bip)
3472 bi->bi_next = *bip;
3473 *bip = bi;
3474 bio_inc_remaining(bi);
3475 md_write_inc(conf->mddev, bi);
3476
3477 if (forwrite) {
		/* check if the page is fully covered by the chained writes */
3479 sector_t sector = sh->dev[dd_idx].sector;
3480 for (bi=sh->dev[dd_idx].towrite;
3481 sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3482 bi && bi->bi_iter.bi_sector <= sector;
3483 bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
3484 if (bio_end_sector(bi) >= sector)
3485 sector = bio_end_sector(bi);
3486 }
3487 if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3488 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3489 sh->overwrite_disks++;
3490 }
3491
3492 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3493 (unsigned long long)(*bip)->bi_iter.bi_sector,
3494 (unsigned long long)sh->sector, dd_idx);
3495
3496 if (conf->mddev->bitmap && firstwrite) {
		/*
		 * Cannot hold spinlock over bitmap_startwrite,
		 * but must ensure this isn't added to a batch until
		 * we have added to the bitmap and set bm_seq.
		 * So set STRIPE_BITMAP_PENDING to prevent
		 * batching.
		 * If multiple add_stripe_bio() calls race here they
		 * must all set STRIPE_BITMAP_PENDING.  So only the first one
		 * to complete "bitmap_startwrite" gets to set
		 * STRIPE_BIT_DELAY.  This is important as once a stripe
		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
		 * any more.
		 */
3509 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3510 spin_unlock_irq(&sh->stripe_lock);
3511 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3512 RAID5_STRIPE_SECTORS(conf), 0);
3513 spin_lock_irq(&sh->stripe_lock);
3514 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3515 if (!sh->batch_head) {
3516 sh->bm_seq = conf->seq_flush+1;
3517 set_bit(STRIPE_BIT_DELAY, &sh->state);
3518 }
3519 }
3520 spin_unlock_irq(&sh->stripe_lock);
3521
3522 if (stripe_can_batch(sh))
3523 stripe_add_to_batch_list(conf, sh);
3524 return 1;
3525
3526 overlap:
3527 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3528 spin_unlock_irq(&sh->stripe_lock);
3529 return 0;
3530}
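
/*
 * Overlap-check sketch for the walk above (illustrative): the
 * toread/towrite chains stay sorted by bi_sector, so for a new bio
 * covering sectors [s, e) the loop stops at the first chained bio
 * starting at or after s; the insert is refused (R5_Overlap set,
 * return 0) if the preceding bio ends beyond s or the following one
 * starts before e.
 */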
3531
3532static void end_reshape(struct r5conf *conf);
3533
3534static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3535 struct stripe_head *sh)
3536{
3537 int sectors_per_chunk =
3538 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3539 int dd_idx;
3540 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3541 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3542
3543 raid5_compute_sector(conf,
3544 stripe * (disks - conf->max_degraded)
3545 *sectors_per_chunk + chunk_offset,
3546 previous,
3547 &dd_idx, sh);
3548}
3549
3550static void
3551handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3552 struct stripe_head_state *s, int disks)
3553{
3554 int i;
3555 BUG_ON(sh->batch_head);
3556 for (i = disks; i--; ) {
3557 struct bio *bi;
3558 int bitmap_end = 0;
3559
3560 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3561 struct md_rdev *rdev;
3562 rcu_read_lock();
3563 rdev = rcu_dereference(conf->disks[i].rdev);
3564 if (rdev && test_bit(In_sync, &rdev->flags) &&
3565 !test_bit(Faulty, &rdev->flags))
3566 atomic_inc(&rdev->nr_pending);
3567 else
3568 rdev = NULL;
3569 rcu_read_unlock();
3570 if (rdev) {
3571 if (!rdev_set_badblocks(
3572 rdev,
3573 sh->sector,
3574 RAID5_STRIPE_SECTORS(conf), 0))
3575 md_error(conf->mddev, rdev);
3576 rdev_dec_pending(rdev, conf->mddev);
3577 }
3578 }
3579 spin_lock_irq(&sh->stripe_lock);
		/* fail all writes first */
3581 bi = sh->dev[i].towrite;
3582 sh->dev[i].towrite = NULL;
3583 sh->overwrite_disks = 0;
3584 spin_unlock_irq(&sh->stripe_lock);
3585 if (bi)
3586 bitmap_end = 1;
3587
3588 log_stripe_write_finished(sh);
3589
3590 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3591 wake_up(&conf->wait_for_overlap);
3592
3593 while (bi && bi->bi_iter.bi_sector <
3594 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3595 struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
3596
3597 md_write_end(conf->mddev);
3598 bio_io_error(bi);
3599 bi = nextbi;
3600 }
3601 if (bitmap_end)
3602 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3603 RAID5_STRIPE_SECTORS(conf), 0, 0);
3604 bitmap_end = 0;
		/* and fail all 'written' */
3606 bi = sh->dev[i].written;
3607 sh->dev[i].written = NULL;
3608 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3609 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3610 sh->dev[i].page = sh->dev[i].orig_page;
3611 }
3612
		if (bi)
			bitmap_end = 1;
3614 while (bi && bi->bi_iter.bi_sector <
3615 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3616 struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
3617
3618 md_write_end(conf->mddev);
3619 bio_io_error(bi);
3620 bi = bi2;
3621 }
3622
		/*
		 * fail any reads if this device is non-operational and
		 * the data has not reached the cache yet.
		 */
3626 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3627 s->failed > conf->max_degraded &&
3628 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3629 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3630 spin_lock_irq(&sh->stripe_lock);
3631 bi = sh->dev[i].toread;
3632 sh->dev[i].toread = NULL;
3633 spin_unlock_irq(&sh->stripe_lock);
3634 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3635 wake_up(&conf->wait_for_overlap);
3636 if (bi)
3637 s->to_read--;
3638 while (bi && bi->bi_iter.bi_sector <
3639 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3640 struct bio *nextbi =
3641 r5_next_bio(conf, bi, sh->dev[i].sector);
3642
3643 bio_io_error(bi);
3644 bi = nextbi;
3645 }
3646 }
3647 if (bitmap_end)
3648 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3649 RAID5_STRIPE_SECTORS(conf), 0, 0);
3650
		/*
		 * If we were in the middle of a write the parity block might
		 * still be locked - so just clear all R5_LOCKED flags
		 */
3653 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3654 }
3655 s->to_write = 0;
3656 s->written = 0;
3657
3658 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3659 if (atomic_dec_and_test(&conf->pending_full_writes))
3660 md_wakeup_thread(conf->mddev->thread);
3661}
3662
3663static void
3664handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3665 struct stripe_head_state *s)
3666{
3667 int abort = 0;
3668 int i;
3669
3670 BUG_ON(sh->batch_head);
3671 clear_bit(STRIPE_SYNCING, &sh->state);
3672 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3673 wake_up(&conf->wait_for_overlap);
3674 s->syncing = 0;
3675 s->replacing = 0;
3676
	/*
	 * There is nothing more to do for sync/check/repair.
	 * Don't even need to abort as that is handled elsewhere
	 * if needed, and not always wanted e.g. if there is a known
	 * bad block here.
	 * For recover/replace we need to record a bad block on all
	 * non-sync devices, or abort the recovery
	 */
3683 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
		/*
		 * During recovery devices cannot be removed, so
		 * locking and refcounting of rdevs is not needed
		 */
3687 rcu_read_lock();
3688 for (i = 0; i < conf->raid_disks; i++) {
3689 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3690 if (rdev
3691 && !test_bit(Faulty, &rdev->flags)
3692 && !test_bit(In_sync, &rdev->flags)
3693 && !rdev_set_badblocks(rdev, sh->sector,
3694 RAID5_STRIPE_SECTORS(conf), 0))
3695 abort = 1;
3696 rdev = rcu_dereference(conf->disks[i].replacement);
3697 if (rdev
3698 && !test_bit(Faulty, &rdev->flags)
3699 && !test_bit(In_sync, &rdev->flags)
3700 && !rdev_set_badblocks(rdev, sh->sector,
3701 RAID5_STRIPE_SECTORS(conf), 0))
3702 abort = 1;
3703 }
3704 rcu_read_unlock();
3705 if (abort)
3706 conf->recovery_disabled =
3707 conf->mddev->recovery_disabled;
3708 }
3709 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
3710}
3711
3712static int want_replace(struct stripe_head *sh, int disk_idx)
3713{
3714 struct md_rdev *rdev;
3715 int rv = 0;
3716
3717 rcu_read_lock();
3718 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3719 if (rdev
3720 && !test_bit(Faulty, &rdev->flags)
3721 && !test_bit(In_sync, &rdev->flags)
3722 && (rdev->recovery_offset <= sh->sector
3723 || rdev->mddev->recovery_cp <= sh->sector))
3724 rv = 1;
3725 rcu_read_unlock();
3726 return rv;
3727}
3728
3729static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3730 int disk_idx, int disks)
3731{
3732 struct r5dev *dev = &sh->dev[disk_idx];
3733 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3734 &sh->dev[s->failed_num[1]] };
3735 int i;
3736 bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3737
3738
3739 if (test_bit(R5_LOCKED, &dev->flags) ||
3740 test_bit(R5_UPTODATE, &dev->flags))
		/*
		 * No point reading this block if we have it already;
		 * no point computing it if we already have it and
		 * are not expecting it to be repaired.
		 */
3744 return 0;
3745
3746 if (dev->toread ||
3747 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
		/* We need this block to directly satisfy a request */
3749 return 1;
3750
3751 if (s->syncing || s->expanding ||
3752 (s->replacing && want_replace(sh, disk_idx)))
		/*
		 * When syncing, or expanding we read everything.
		 * When replacing, we need the replaced block.
		 */
3756 return 1;
3757
3758 if ((s->failed >= 1 && fdev[0]->toread) ||
3759 (s->failed >= 2 && fdev[1]->toread))
		/*
		 * If we want to read from a failed device, then
		 * we need to actually read every other device.
		 */
3763 return 1;
3764
	/*
	 * Sometimes neither read-modify-write nor reconstruct-write
	 * cycles can work.  In those cases we read every block we
	 * can.  Then the parity-update is certain to have enough to
	 * work with.
	 * This can only be a problem when we need to write something,
	 * and some device has failed.  If either of those tests
	 * fail we need look no further.
	 */
3773 if (!s->failed || !s->to_write)
3774 return 0;
3775
3776 if (test_bit(R5_Insync, &dev->flags) &&
3777 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		/*
		 * Pre-reads are not permitted until after a short delay
		 * to gather multiple requests.  However if this device
		 * is not Insync, the block could only be computed and
		 * there is no need to delay that.
		 */
3783 return 0;
3784
3785 for (i = 0; i < s->failed && i < 2; i++) {
3786 if (fdev[i]->towrite &&
3787 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3788 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
			/*
			 * If we have a partial write to a failed
			 * device, then we will need to reconstruct
			 * the content of that device, so all other
			 * devices must be read.
			 */
3794 return 1;
3795
3796 if (s->failed >= 2 &&
3797 (fdev[i]->towrite ||
3798 s->failed_num[i] == sh->pd_idx ||
3799 s->failed_num[i] == sh->qd_idx) &&
3800 !test_bit(R5_UPTODATE, &fdev[i]->flags))
			/*
			 * In max degraded raid6, if the failed disk is P, Q,
			 * or we want to read a failed disk, we need to do
			 * reconstruct-write.
			 */
3805 force_rcw = true;
3806 }
3807
	/*
	 * If we are forced to do a reconstruct-write, because parity
	 * cannot be trusted and we are currently recovering it, there
	 * is extra need to be careful.
	 * If one of the devices that we would need to read, because
	 * it is not being overwritten (and maybe not written at all)
	 * is missing/faulty, then we need to read everything we can.
	 */
3815 if (!force_rcw &&
3816 sh->sector < sh->raid_conf->mddev->recovery_cp)
		/* reconstruct-write isn't being forced */
3818 return 0;
3819 for (i = 0; i < s->failed && i < 2; i++) {
3820 if (s->failed_num[i] != sh->pd_idx &&
3821 s->failed_num[i] != sh->qd_idx &&
3822 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3823 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3824 return 1;
3825 }
3826
3827 return 0;
3828}
3829
/*
 * fetch_block - checks the given member device to see if its data needs
 * to be read or computed to satisfy a request.
 *
 * Returns 1 when no more member devices need to be checked, otherwise
 * returns 0 to tell the loop in handle_stripe_fill to continue.
 */
3836static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3837 int disk_idx, int disks)
3838{
3839 struct r5dev *dev = &sh->dev[disk_idx];
3840
	/* is the data in this block needed, and can we get it? */
3842 if (need_this_block(sh, s, disk_idx, disks)) {
		/*
		 * we would like to get this block, possibly by computing it,
		 * otherwise read it if the backing disk is insync
		 */
3846 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3847 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3848 BUG_ON(sh->batch_head);
3849
		/*
		 * If every other block is already up to date we can compute
		 * this one instead of reading it: for RAID6 that covers the
		 * case where the missing block is P itself, and otherwise we
		 * only compute blocks that live on failed devices.
		 */
3859 if ((s->uptodate == disks - 1) &&
3860 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3861 (s->failed && (disk_idx == s->failed_num[0] ||
3862 disk_idx == s->failed_num[1])))) {
			/*
			 * have disk failed, and we're requested to fetch it;
			 * do compute it
			 */
3866 pr_debug("Computing stripe %llu block %d\n",
3867 (unsigned long long)sh->sector, disk_idx);
3868 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3869 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3870 set_bit(R5_Wantcompute, &dev->flags);
3871 sh->ops.target = disk_idx;
3872 sh->ops.target2 = -1;
3873 s->req_compute = 1;
3874
			/*
			 * Careful: from this point on 'uptodate' is in the eye
			 * of raid_run_ops which services 'compute' operations
			 * before writes. R5_Wantcompute flags a block that will
			 * be R5_UPTODATE by the time it is needed for a
			 * subsequent operation.
			 */
3880 s->uptodate++;
3881 return 1;
3882 } else if (s->uptodate == disks-2 && s->failed >= 2) {
			/*
			 * Computing 2-failure is *very* expensive; only
			 * do it if failed >= 2
			 */
3886 int other;
3887 for (other = disks; other--; ) {
3888 if (other == disk_idx)
3889 continue;
3890 if (!test_bit(R5_UPTODATE,
3891 &sh->dev[other].flags))
3892 break;
3893 }
3894 BUG_ON(other < 0);
3895 pr_debug("Computing stripe %llu blocks %d,%d\n",
3896 (unsigned long long)sh->sector,
3897 disk_idx, other);
3898 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3899 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3900 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3901 set_bit(R5_Wantcompute, &sh->dev[other].flags);
3902 sh->ops.target = disk_idx;
3903 sh->ops.target2 = other;
3904 s->uptodate += 2;
3905 s->req_compute = 1;
3906 return 1;
3907 } else if (test_bit(R5_Insync, &dev->flags)) {
3908 set_bit(R5_LOCKED, &dev->flags);
3909 set_bit(R5_Wantread, &dev->flags);
3910 s->locked++;
3911 pr_debug("Reading block %d (sync=%d)\n",
3912 disk_idx, s->syncing);
3913 }
3914 }
3915
3916 return 0;
3917}
3918
/*
 * handle_stripe_fill - read or compute data to satisfy pending requests.
 */
3922static void handle_stripe_fill(struct stripe_head *sh,
3923 struct stripe_head_state *s,
3924 int disks)
3925{
3926 int i;
3927
	/*
	 * look for blocks to read/compute, skip this if a compute
	 * is already in flight, or if the stripe contents are in the
	 * midst of changing due to a write
	 */
3932 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3933 !sh->reconstruct_state) {
		/*
		 * For a degraded stripe with data in journal, do not handle
		 * read requests yet; instead, flush the stripe to raid
		 * disks first. This avoids handling complex rmw of write
		 * back cache (prexor with orig_page, and then xor with
		 * page) in the read path.
		 */
3942 if (s->injournal && s->failed) {
3943 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3944 r5c_make_stripe_write_out(sh);
3945 goto out;
3946 }
3947
3948 for (i = disks; i--; )
3949 if (fetch_block(sh, s, i, disks))
3950 break;
3951 }
3952out:
3953 set_bit(STRIPE_HANDLE, &sh->state);
3954}
3955
3956static void break_stripe_batch_list(struct stripe_head *head_sh,
3957 unsigned long handle_flags);
3958
/*
 * handle_stripe_clean_event
 * any written block on an uptodate or failed drive can be returned.
 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
 * never LOCKED, so we don't need to test 'failed' directly.
 */
3963static void handle_stripe_clean_event(struct r5conf *conf,
3964 struct stripe_head *sh, int disks)
3965{
3966 int i;
3967 struct r5dev *dev;
3968 int discard_pending = 0;
3969 struct stripe_head *head_sh = sh;
3970 bool do_endio = false;
3971
3972 for (i = disks; i--; )
3973 if (sh->dev[i].written) {
3974 dev = &sh->dev[i];
3975 if (!test_bit(R5_LOCKED, &dev->flags) &&
3976 (test_bit(R5_UPTODATE, &dev->flags) ||
3977 test_bit(R5_Discard, &dev->flags) ||
3978 test_bit(R5_SkipCopy, &dev->flags))) {
				/* We can return any write requests */
3980 struct bio *wbi, *wbi2;
3981 pr_debug("Return write for disc %d\n", i);
3982 if (test_and_clear_bit(R5_Discard, &dev->flags))
3983 clear_bit(R5_UPTODATE, &dev->flags);
3984 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3985 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3986 }
3987 do_endio = true;
3988
3989returnbi:
3990 dev->page = dev->orig_page;
3991 wbi = dev->written;
3992 dev->written = NULL;
3993 while (wbi && wbi->bi_iter.bi_sector <
3994 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
3995 wbi2 = r5_next_bio(conf, wbi, dev->sector);
3996 md_write_end(conf->mddev);
3997 bio_endio(wbi);
3998 wbi = wbi2;
3999 }
4000 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
4001 RAID5_STRIPE_SECTORS(conf),
4002 !test_bit(STRIPE_DEGRADED, &sh->state),
4003 0);
4004 if (head_sh->batch_head) {
4005 sh = list_first_entry(&sh->batch_list,
4006 struct stripe_head,
4007 batch_list);
4008 if (sh != head_sh) {
4009 dev = &sh->dev[i];
4010 goto returnbi;
4011 }
4012 }
4013 sh = head_sh;
4014 dev = &sh->dev[i];
4015 } else if (test_bit(R5_Discard, &dev->flags))
4016 discard_pending = 1;
4017 }
4018
4019 log_stripe_write_finished(sh);
4020
4021 if (!discard_pending &&
4022 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4023 int hash;
4024 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
4025 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4026 if (sh->qd_idx >= 0) {
4027 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
4028 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
4029 }
4030
4031 clear_bit(STRIPE_DISCARD, &sh->state);
		/*
		 * SCSI discard will change some bio fields and the stripe has
		 * no updated data, so remove it from the hash list and the
		 * stripe will be reinitialized.
		 */
4037unhash:
4038 hash = sh->hash_lock_index;
4039 spin_lock_irq(conf->hash_locks + hash);
4040 remove_hash(sh);
4041 spin_unlock_irq(conf->hash_locks + hash);
4042 if (head_sh->batch_head) {
4043 sh = list_first_entry(&sh->batch_list,
4044 struct stripe_head, batch_list);
4045 if (sh != head_sh)
4046 goto unhash;
4047 }
4048 sh = head_sh;
4049
4050 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4051 set_bit(STRIPE_HANDLE, &sh->state);
4052
4053 }
4054
4055 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
4056 if (atomic_dec_and_test(&conf->pending_full_writes))
4057 md_wakeup_thread(conf->mddev->thread);
4058
4059 if (head_sh->batch_head && do_endio)
4060 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4061}
4062
/*
 * For RMW in write back cache, we need an extra page in prexor to store
 * the old data. This page is stored in dev->orig_page.
 *
 * This function checks whether we have data for prexor. The exact logic is:
 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
 */
4071static inline bool uptodate_for_rmw(struct r5dev *dev)
4072{
4073 return (test_bit(R5_UPTODATE, &dev->flags)) &&
4074 (!test_bit(R5_InJournal, &dev->flags) ||
4075 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4076}
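
/*
 * Truth table for the helper above (derived from the expression, shown
 * for illustration):
 *   R5_UPTODATE  R5_InJournal  R5_OrigPageUPTDODATE  ->  usable for rmw
 *        0            any             any                    no
 *        1             0              any                    yes
 *        1             1               0                     no
 *        1             1               1                     yes
 */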
4077
4078static int handle_stripe_dirtying(struct r5conf *conf,
4079 struct stripe_head *sh,
4080 struct stripe_head_state *s,
4081 int disks)
4082{
4083 int rmw = 0, rcw = 0, i;
4084 sector_t recovery_cp = conf->mddev->recovery_cp;
4085
	/*
	 * Check whether resync is now happening or should start.
	 * If yes, then the array is dirty (after unclean shutdown or
	 * initial creation), so parity in some stripes might be inconsistent.
	 * In this case, we need to always do reconstruct-write, to ensure
	 * that in case of drive failure or read-error correction, we
	 * generate correct data from the parity.
	 */
4093 if (conf->rmw_level == PARITY_DISABLE_RMW ||
4094 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
4095 s->failed == 0)) {
		/*
		 * Calculate the real rcw later - for now make it
		 * look like rcw is cheaper
		 */
4099 rcw = 1; rmw = 2;
4100 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
4101 conf->rmw_level, (unsigned long long)recovery_cp,
4102 (unsigned long long)sh->sector);
4103 } else for (i = disks; i--; ) {
		/* would I have to read this buffer for read_modify_write */
4105 struct r5dev *dev = &sh->dev[i];
4106 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4107 i == sh->pd_idx || i == sh->qd_idx ||
4108 test_bit(R5_InJournal, &dev->flags)) &&
4109 !test_bit(R5_LOCKED, &dev->flags) &&
4110 !(uptodate_for_rmw(dev) ||
4111 test_bit(R5_Wantcompute, &dev->flags))) {
4112 if (test_bit(R5_Insync, &dev->flags))
4113 rmw++;
4114 else
4115 rmw += 2*disks;
4116 }
		/* Would I have to read this buffer for reconstruct_write */
4118 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4119 i != sh->pd_idx && i != sh->qd_idx &&
4120 !test_bit(R5_LOCKED, &dev->flags) &&
4121 !(test_bit(R5_UPTODATE, &dev->flags) ||
4122 test_bit(R5_Wantcompute, &dev->flags))) {
4123 if (test_bit(R5_Insync, &dev->flags))
4124 rcw++;
4125 else
4126 rcw += 2*disks;
4127 }
4128 }
4129
4130 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4131 (unsigned long long)sh->sector, sh->state, rmw, rcw);
4132 set_bit(STRIPE_HANDLE, &sh->state);
4133 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
		/* prefer read-modify-write, but need to get some data */
4135 if (conf->mddev->queue)
4136 blk_add_trace_msg(conf->mddev->queue,
4137 "raid5 rmw %llu %d",
4138 (unsigned long long)sh->sector, rmw);
4139 for (i = disks; i--; ) {
4140 struct r5dev *dev = &sh->dev[i];
4141 if (test_bit(R5_InJournal, &dev->flags) &&
4142 dev->page == dev->orig_page &&
4143 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
				/* alloc page for prexor */
4145 struct page *p = alloc_page(GFP_NOIO);
4146
4147 if (p) {
4148 dev->orig_page = p;
4149 continue;
4150 }
				/*
				 * alloc_page() failed, try to use
				 * disk_info->extra_page
				 */
4156 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
4157 &conf->cache_state)) {
4158 r5c_use_extra_page(sh);
4159 break;
4160 }
4161
				/* extra_page in use, add to delayed_list */
4163 set_bit(STRIPE_DELAYED, &sh->state);
4164 s->waiting_extra_page = 1;
4165 return -EAGAIN;
4166 }
4167 }
4168
4169 for (i = disks; i--; ) {
4170 struct r5dev *dev = &sh->dev[i];
4171 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4172 i == sh->pd_idx || i == sh->qd_idx ||
4173 test_bit(R5_InJournal, &dev->flags)) &&
4174 !test_bit(R5_LOCKED, &dev->flags) &&
4175 !(uptodate_for_rmw(dev) ||
4176 test_bit(R5_Wantcompute, &dev->flags)) &&
4177 test_bit(R5_Insync, &dev->flags)) {
4178 if (test_bit(STRIPE_PREREAD_ACTIVE,
4179 &sh->state)) {
4180 pr_debug("Read_old block %d for r-m-w\n",
4181 i);
4182 set_bit(R5_LOCKED, &dev->flags);
4183 set_bit(R5_Wantread, &dev->flags);
4184 s->locked++;
4185 } else
4186 set_bit(STRIPE_DELAYED, &sh->state);
4187 }
4188 }
4189 }
4190 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
		/* want reconstruct write, but need to get some data */
		int qread = 0;
4193 rcw = 0;
4194 for (i = disks; i--; ) {
4195 struct r5dev *dev = &sh->dev[i];
4196 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4197 i != sh->pd_idx && i != sh->qd_idx &&
4198 !test_bit(R5_LOCKED, &dev->flags) &&
4199 !(test_bit(R5_UPTODATE, &dev->flags) ||
4200 test_bit(R5_Wantcompute, &dev->flags))) {
4201 rcw++;
4202 if (test_bit(R5_Insync, &dev->flags) &&
4203 test_bit(STRIPE_PREREAD_ACTIVE,
4204 &sh->state)) {
4205 pr_debug("Read_old block "
4206 "%d for Reconstruct\n", i);
4207 set_bit(R5_LOCKED, &dev->flags);
4208 set_bit(R5_Wantread, &dev->flags);
4209 s->locked++;
4210 qread++;
4211 } else
4212 set_bit(STRIPE_DELAYED, &sh->state);
4213 }
4214 }
4215 if (rcw && conf->mddev->queue)
4216 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4217 (unsigned long long)sh->sector,
4218 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4219 }
4220
4221 if (rcw > disks && rmw > disks &&
4222 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4223 set_bit(STRIPE_DELAYED, &sh->state);
4224
	/*
	 * now if nothing is locked, and if we have enough data,
	 * we can start a write request
	 */
	/*
	 * since handle_stripe can be called at any time we need to handle the
	 * case where a compute block operation has been submitted and then a
	 * subsequent call wants to start a write request.  raid_run_ops only
	 * handles the case where compute block and reconstruct are requested
	 * simultaneously.  If both operations are already in flight we need
	 * to wait for them to complete.
	 */
4235 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4236 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4237 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4238 schedule_reconstruction(sh, s, rcw == 0, 0);
4239 return 0;
4240}
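
/*
 * Cost intuition for the rmw/rcw counters above (hypothetical 8-disk
 * RAID5, one dirty data block): rmw must pre-read the old data block
 * and parity, so rmw == 2, while rcw must pre-read the six untouched
 * data blocks, so rcw == 6 and rmw wins; with most of the stripe dirty
 * the balance tips the other way.  Devices that are not In_sync are
 * charged 2*disks, steering the choice towards a feasible plan.
 */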
4241
4242static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4243 struct stripe_head_state *s, int disks)
4244{
4245 struct r5dev *dev = NULL;
4246
4247 BUG_ON(sh->batch_head);
4248 set_bit(STRIPE_HANDLE, &sh->state);
4249
4250 switch (sh->check_state) {
4251 case check_state_idle:
		/* start a new check operation if there are no failures */
4253 if (s->failed == 0) {
4254 BUG_ON(s->uptodate != disks);
4255 sh->check_state = check_state_run;
4256 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4257 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4258 s->uptodate--;
4259 break;
4260 }
4261 dev = &sh->dev[s->failed_num[0]];
4262 fallthrough;
4263 case check_state_compute_result:
4264 sh->check_state = check_state_idle;
4265 if (!dev)
4266 dev = &sh->dev[sh->pd_idx];
4267
		/* check that a write has not made the stripe insync */
4269 if (test_bit(STRIPE_INSYNC, &sh->state))
4270 break;
4271
		/* either failed parity check, or recovery is happening */
4273 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4274 BUG_ON(s->uptodate != disks);
4275
4276 set_bit(R5_LOCKED, &dev->flags);
4277 s->locked++;
4278 set_bit(R5_Wantwrite, &dev->flags);
4279
4280 clear_bit(STRIPE_DEGRADED, &sh->state);
4281 set_bit(STRIPE_INSYNC, &sh->state);
4282 break;
4283 case check_state_run:
4284 break;
4285 case check_state_check_result:
4286 sh->check_state = check_state_idle;
4287
		/*
		 * if a failure occurred during the check operation, leave
		 * STRIPE_INSYNC not set and let the stripe be handled again
		 */
4291 if (s->failed)
4292 break;
4293
		/*
		 * handle a successful check operation, if parity is correct
		 * we are done.  Otherwise update the mismatch count and repair
		 * parity if !MD_RECOVERY_CHECK
		 */
4298 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
			/*
			 * parity is correct (on disc,
			 * not in buffer any more)
			 */
4302 set_bit(STRIPE_INSYNC, &sh->state);
4303 else {
4304 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4305 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
				/* don't try to repair!! */
4307 set_bit(STRIPE_INSYNC, &sh->state);
4308 pr_warn_ratelimited("%s: mismatch sector in range "
4309 "%llu-%llu\n", mdname(conf->mddev),
4310 (unsigned long long) sh->sector,
4311 (unsigned long long) sh->sector +
4312 RAID5_STRIPE_SECTORS(conf));
4313 } else {
4314 sh->check_state = check_state_compute_run;
4315 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4316 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4317 set_bit(R5_Wantcompute,
4318 &sh->dev[sh->pd_idx].flags);
4319 sh->ops.target = sh->pd_idx;
4320 sh->ops.target2 = -1;
4321 s->uptodate++;
4322 }
4323 }
4324 break;
4325 case check_state_compute_run:
4326 break;
4327 default:
4328 pr_err("%s: unknown check_state: %d sector: %llu\n",
4329 __func__, sh->check_state,
4330 (unsigned long long) sh->sector);
4331 BUG();
4332 }
4333}
4334
4335static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4336 struct stripe_head_state *s,
4337 int disks)
4338{
4339 int pd_idx = sh->pd_idx;
4340 int qd_idx = sh->qd_idx;
4341 struct r5dev *dev;
4342
4343 BUG_ON(sh->batch_head);
4344 set_bit(STRIPE_HANDLE, &sh->state);
4345
4346 BUG_ON(s->failed > 2);
4347
	/*
	 * Want to check and possibly repair P and Q.
	 * However there could be one 'failed' device, in which
	 * case we can only check one of them, possibly using the
	 * other to generate missing data
	 */
4354 switch (sh->check_state) {
4355 case check_state_idle:
		/* start a new check operation if there are < 2 failures */
4357 if (s->failed == s->q_failed) {
			/*
			 * The only possible failed device holds Q, so it
			 * makes sense to check P (If anything else were failed,
			 * we would have used P to recreate it).
			 */
4362 sh->check_state = check_state_run;
4363 }
4364 if (!s->q_failed && s->failed < 2) {
			/*
			 * Q is not failed, and we didn't use it to generate
			 * anything, so it makes sense to check it
			 */
4368 if (sh->check_state == check_state_run)
4369 sh->check_state = check_state_run_pq;
4370 else
4371 sh->check_state = check_state_run_q;
4372 }
4373
		/* discard potentially stale zero_sum_result */
4376
4377 if (sh->check_state == check_state_run) {
			/* the xor-check destroys the contents of P */
4379 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4380 s->uptodate--;
4381 }
4382 if (sh->check_state >= check_state_run &&
4383 sh->check_state <= check_state_run_pq) {
			/*
			 * the syndrome check preserves P and Q, so
			 * no need to mark them !uptodate here
			 */
4387 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4388 break;
4389 }
4390
4391
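/* we have 2-disk failure */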
4392 BUG_ON(s->failed != 2);
4393 fallthrough;
4394 case check_state_compute_result:
4395 sh->check_state = check_state_idle;
4396
4397
4398 if (test_bit(STRIPE_INSYNC, &sh->state))
4399 break;
4400
4401
4402
4403
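/* now write out any block on a failed drive,
 * or P or Q if they were recomputed
 */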
4404 dev = NULL;
4405 if (s->failed == 2) {
4406 dev = &sh->dev[s->failed_num[1]];
4407 s->locked++;
4408 set_bit(R5_LOCKED, &dev->flags);
4409 set_bit(R5_Wantwrite, &dev->flags);
4410 }
4411 if (s->failed >= 1) {
4412 dev = &sh->dev[s->failed_num[0]];
4413 s->locked++;
4414 set_bit(R5_LOCKED, &dev->flags);
4415 set_bit(R5_Wantwrite, &dev->flags);
4416 }
4417 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4418 dev = &sh->dev[pd_idx];
4419 s->locked++;
4420 set_bit(R5_LOCKED, &dev->flags);
4421 set_bit(R5_Wantwrite, &dev->flags);
4422 }
4423 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4424 dev = &sh->dev[qd_idx];
4425 s->locked++;
4426 set_bit(R5_LOCKED, &dev->flags);
4427 set_bit(R5_Wantwrite, &dev->flags);
4428 }
4429 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4430 "%s: disk%td not up to date\n",
4431 mdname(conf->mddev),
4432 dev - (struct r5dev *) &sh->dev)) {
4433 clear_bit(R5_LOCKED, &dev->flags);
4434 clear_bit(R5_Wantwrite, &dev->flags);
4435 s->locked--;
4436 }
4437 clear_bit(STRIPE_DEGRADED, &sh->state);
4438
4439 set_bit(STRIPE_INSYNC, &sh->state);
4440 break;
4441 case check_state_run:
4442 case check_state_run_q:
4443 case check_state_run_pq:
4444 break;
4445 case check_state_check_result:
4446 sh->check_state = check_state_idle;
4447
4448
4449
4450
4451
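/* handle a successful check operation: if both parities are
 * correct we are done.  Otherwise update the mismatch count and
 * repair parity if !MD_RECOVERY_CHECK
 */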
4452 if (sh->ops.zero_sum_result == 0) {
4453
4454 if (!s->failed)
4455 set_bit(STRIPE_INSYNC, &sh->state);
4456 else {
4457
4458
4459
4460
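/* in contrast to the raid5 case we can validate
 * parity, but still have a failure to write back
 */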
4461 sh->check_state = check_state_compute_result;
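/* Returning at this point means that we may go
 * off and bring p and/or q uptodate again, so
 * zero_sum_result is checked again in compute_result
 * to verify whether p or q need writeback
 */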
4462
4463
4464
4465
4466
4467 }
4468 } else {
4469 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4470 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4471
4472 set_bit(STRIPE_INSYNC, &sh->state);
4473 pr_warn_ratelimited("%s: mismatch sector in range "
4474 "%llu-%llu\n", mdname(conf->mddev),
4475 (unsigned long long) sh->sector,
4476 (unsigned long long) sh->sector +
4477 RAID5_STRIPE_SECTORS(conf));
4478 } else {
4479 int *target = &sh->ops.target;
4480
4481 sh->ops.target = -1;
4482 sh->ops.target2 = -1;
4483 sh->check_state = check_state_compute_run;
4484 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4485 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4486 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4487 set_bit(R5_Wantcompute,
4488 &sh->dev[pd_idx].flags);
4489 *target = pd_idx;
4490 target = &sh->ops.target2;
4491 s->uptodate++;
4492 }
4493 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4494 set_bit(R5_Wantcompute,
4495 &sh->dev[qd_idx].flags);
4496 *target = qd_idx;
4497 s->uptodate++;
4498 }
4499 }
4500 }
4501 break;
4502 case check_state_compute_run:
4503 break;
4504 default:
4505 pr_err("%s: unknown check_state: %d sector: %llu\n",
4506 __func__, sh->check_state,
4507 (unsigned long long) sh->sector);
4508 BUG();
4509 }
4510}
4511
4512static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4513{
4514 int i;
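/* We have read all the blocks in this stripe and now we need to
 * copy some of them into a target stripe for expand.
 */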
4515
4516
4517
4518
4519 struct dma_async_tx_descriptor *tx = NULL;
4520 BUG_ON(sh->batch_head);
4521 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4522 for (i = 0; i < sh->disks; i++)
4523 if (i != sh->pd_idx && i != sh->qd_idx) {
4524 int dd_idx, j;
4525 struct stripe_head *sh2;
4526 struct async_submit_ctl submit;
4527
4528 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4529 sector_t s = raid5_compute_sector(conf, bn, 0,
4530 &dd_idx, NULL);
4531 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4532 if (sh2 == NULL)
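/* so far only the early blocks of this stripe
 * have been requested.  When later blocks
 * get requested, we will try again
 */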
4533
4534
4535
4536
4537 continue;
4538 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4539 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4540
4541 raid5_release_stripe(sh2);
4542 continue;
4543 }
4544
4545
4546 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4547 tx = async_memcpy(sh2->dev[dd_idx].page,
4548 sh->dev[i].page, sh2->dev[dd_idx].offset,
4549 sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4550 &submit);
4551
4552 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4553 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4554 for (j = 0; j < conf->raid_disks; j++)
4555 if (j != sh2->pd_idx &&
4556 j != sh2->qd_idx &&
4557 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4558 break;
4559 if (j == conf->raid_disks) {
4560 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4561 set_bit(STRIPE_HANDLE, &sh2->state);
4562 }
4563 raid5_release_stripe(sh2);
4564
4565 }
4566
4567 async_tx_quiesce(&tx);
4568}
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
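/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
 * state of various bits to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 */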
4584static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4585{
4586 struct r5conf *conf = sh->raid_conf;
4587 int disks = sh->disks;
4588 struct r5dev *dev;
4589 int i;
4590 int do_recovery = 0;
4591
4592 memset(s, 0, sizeof(*s));
4593
4594 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4595 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4596 s->failed_num[0] = -1;
4597 s->failed_num[1] = -1;
4598 s->log_failed = r5l_log_disk_error(conf);
4599
4600
4601 rcu_read_lock();
4602 for (i = disks; i--; ) {
4603 struct md_rdev *rdev;
4604 sector_t first_bad;
4605 int bad_sectors;
4606 int is_bad = 0;
4607
4608 dev = &sh->dev[i];
4609
4610 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4611 i, dev->flags,
4612 dev->toread, dev->towrite, dev->written);
4613
4614
4615
4616
4617
4618 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4619 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4620 set_bit(R5_Wantfill, &dev->flags);
4621
4622
4623 if (test_bit(R5_LOCKED, &dev->flags))
4624 s->locked++;
4625 if (test_bit(R5_UPTODATE, &dev->flags))
4626 s->uptodate++;
4627 if (test_bit(R5_Wantcompute, &dev->flags)) {
4628 s->compute++;
4629 BUG_ON(s->compute > 2);
4630 }
4631
4632 if (test_bit(R5_Wantfill, &dev->flags))
4633 s->to_fill++;
4634 else if (dev->toread)
4635 s->to_read++;
4636 if (dev->towrite) {
4637 s->to_write++;
4638 if (!test_bit(R5_OVERWRITE, &dev->flags))
4639 s->non_overwrite++;
4640 }
4641 if (dev->written)
4642 s->written++;
4643
4644
4645
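/* Prefer to use the replacement for reads, but only
 * if it has been recovered far enough and has no bad blocks.
 */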
4646 rdev = rcu_dereference(conf->disks[i].replacement);
4647 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4648 rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4649 !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4650 &first_bad, &bad_sectors))
4651 set_bit(R5_ReadRepl, &dev->flags);
4652 else {
4653 if (rdev && !test_bit(Faulty, &rdev->flags))
4654 set_bit(R5_NeedReplace, &dev->flags);
4655 else
4656 clear_bit(R5_NeedReplace, &dev->flags);
4657 rdev = rcu_dereference(conf->disks[i].rdev);
4658 clear_bit(R5_ReadRepl, &dev->flags);
4659 }
4660 if (rdev && test_bit(Faulty, &rdev->flags))
4661 rdev = NULL;
4662 if (rdev) {
4663 is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4664 &first_bad, &bad_sectors);
4665 if (s->blocked_rdev == NULL
4666 && (test_bit(Blocked, &rdev->flags)
4667 || is_bad < 0)) {
4668 if (is_bad < 0)
4669 set_bit(BlockedBadBlocks,
4670 &rdev->flags);
4671 s->blocked_rdev = rdev;
4672 atomic_inc(&rdev->nr_pending);
4673 }
4674 }
4675 clear_bit(R5_Insync, &dev->flags);
4676 if (!rdev)
4677 ;
4678 else if (is_bad) {
4679
4680 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4681 test_bit(R5_UPTODATE, &dev->flags)) {
4682
4683
4684
4685 set_bit(R5_Insync, &dev->flags);
4686 set_bit(R5_ReadError, &dev->flags);
4687 }
4688 } else if (test_bit(In_sync, &rdev->flags))
4689 set_bit(R5_Insync, &dev->flags);
4690 else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4691
4692 set_bit(R5_Insync, &dev->flags);
4693 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4694 test_bit(R5_Expanded, &dev->flags))
4695
4696
4697
4698
4699 set_bit(R5_Insync, &dev->flags);
4700
4701 if (test_bit(R5_WriteError, &dev->flags)) {
4702
4703
4704 struct md_rdev *rdev2 = rcu_dereference(
4705 conf->disks[i].rdev);
4706 if (rdev2 == rdev)
4707 clear_bit(R5_Insync, &dev->flags);
4708 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4709 s->handle_bad_blocks = 1;
4710 atomic_inc(&rdev2->nr_pending);
4711 } else
4712 clear_bit(R5_WriteError, &dev->flags);
4713 }
4714 if (test_bit(R5_MadeGood, &dev->flags)) {
4715
4716
4717 struct md_rdev *rdev2 = rcu_dereference(
4718 conf->disks[i].rdev);
4719 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4720 s->handle_bad_blocks = 1;
4721 atomic_inc(&rdev2->nr_pending);
4722 } else
4723 clear_bit(R5_MadeGood, &dev->flags);
4724 }
4725 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4726 struct md_rdev *rdev2 = rcu_dereference(
4727 conf->disks[i].replacement);
4728 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4729 s->handle_bad_blocks = 1;
4730 atomic_inc(&rdev2->nr_pending);
4731 } else
4732 clear_bit(R5_MadeGoodRepl, &dev->flags);
4733 }
4734 if (!test_bit(R5_Insync, &dev->flags)) {
4735
4736 clear_bit(R5_ReadError, &dev->flags);
4737 clear_bit(R5_ReWrite, &dev->flags);
4738 }
4739 if (test_bit(R5_ReadError, &dev->flags))
4740 clear_bit(R5_Insync, &dev->flags);
4741 if (!test_bit(R5_Insync, &dev->flags)) {
4742 if (s->failed < 2)
4743 s->failed_num[s->failed] = i;
4744 s->failed++;
4745 if (rdev && !test_bit(Faulty, &rdev->flags))
4746 do_recovery = 1;
4747 else if (!rdev) {
4748 rdev = rcu_dereference(
4749 conf->disks[i].replacement);
4750 if (rdev && !test_bit(Faulty, &rdev->flags))
4751 do_recovery = 1;
4752 }
4753 }
4754
4755 if (test_bit(R5_InJournal, &dev->flags))
4756 s->injournal++;
4757 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4758 s->just_cached++;
4759 }
4760 if (test_bit(STRIPE_SYNCING, &sh->state)) {
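/* If there is a failed device being replaced, we must be recovering.
 * else if we are after recovery_cp, we must be syncing
 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
 * else we can only be replacing.
 * sync and recovery both need to read all devices, and so
 * use the same flag.
 */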
4761
4762
4763
4764
4765
4766
4767
4768
4769 if (do_recovery ||
4770 sh->sector >= conf->mddev->recovery_cp ||
4771 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4772 s->syncing = 1;
4773 else
4774 s->replacing = 1;
4775 }
4776 rcu_read_unlock();
4777}
4778
4779
4780
4781
4782
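/*
 * Return '1' if this is a member of a batch, or '0' if it is a lone
 * stripe or a batch head which can now be handled.
 */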
4783static int clear_batch_ready(struct stripe_head *sh)
4784{
4785 struct stripe_head *tmp;
4786 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4787 return (sh->batch_head && sh->batch_head != sh);
4788 spin_lock(&sh->stripe_lock);
4789 if (!sh->batch_head) {
4790 spin_unlock(&sh->stripe_lock);
4791 return 0;
4792 }
4793
4794
4795
4796
4797
4798 if (sh->batch_head != sh) {
4799 spin_unlock(&sh->stripe_lock);
4800 return 1;
4801 }
4802 spin_lock(&sh->batch_lock);
4803 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4804 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4805 spin_unlock(&sh->batch_lock);
4806 spin_unlock(&sh->stripe_lock);
4807
4808
4809
4810
4811
4812 return 0;
4813}
4814
4815static void break_stripe_batch_list(struct stripe_head *head_sh,
4816 unsigned long handle_flags)
4817{
4818 struct stripe_head *sh, *next;
4819 int i;
4820 int do_wakeup = 0;
4821
4822 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4823
4824 list_del_init(&sh->batch_list);
4825
4826 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4827 (1 << STRIPE_SYNCING) |
4828 (1 << STRIPE_REPLACED) |
4829 (1 << STRIPE_DELAYED) |
4830 (1 << STRIPE_BIT_DELAY) |
4831 (1 << STRIPE_FULL_WRITE) |
4832 (1 << STRIPE_BIOFILL_RUN) |
4833 (1 << STRIPE_COMPUTE_RUN) |
4834 (1 << STRIPE_DISCARD) |
4835 (1 << STRIPE_BATCH_READY) |
4836 (1 << STRIPE_BATCH_ERR) |
4837 (1 << STRIPE_BITMAP_PENDING)),
4838 "stripe state: %lx\n", sh->state);
4839 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4840 (1 << STRIPE_REPLACED)),
4841 "head stripe state: %lx\n", head_sh->state);
4842
4843 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4844 (1 << STRIPE_PREREAD_ACTIVE) |
4845 (1 << STRIPE_DEGRADED) |
4846 (1 << STRIPE_ON_UNPLUG_LIST)),
4847 head_sh->state & (1 << STRIPE_INSYNC));
4848
4849 sh->check_state = head_sh->check_state;
4850 sh->reconstruct_state = head_sh->reconstruct_state;
4851 spin_lock_irq(&sh->stripe_lock);
4852 sh->batch_head = NULL;
4853 spin_unlock_irq(&sh->stripe_lock);
4854 for (i = 0; i < sh->disks; i++) {
4855 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4856 do_wakeup = 1;
4857 sh->dev[i].flags = head_sh->dev[i].flags &
4858 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4859 }
4860 if (handle_flags == 0 ||
4861 sh->state & handle_flags)
4862 set_bit(STRIPE_HANDLE, &sh->state);
4863 raid5_release_stripe(sh);
4864 }
4865 spin_lock_irq(&head_sh->stripe_lock);
4866 head_sh->batch_head = NULL;
4867 spin_unlock_irq(&head_sh->stripe_lock);
4868 for (i = 0; i < head_sh->disks; i++)
4869 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4870 do_wakeup = 1;
4871 if (head_sh->state & handle_flags)
4872 set_bit(STRIPE_HANDLE, &head_sh->state);
4873
4874 if (do_wakeup)
4875 wake_up(&head_sh->raid_conf->wait_for_overlap);
4876}
4877
4878static void handle_stripe(struct stripe_head *sh)
4879{
4880 struct stripe_head_state s;
4881 struct r5conf *conf = sh->raid_conf;
4882 int i;
4883 int prexor;
4884 int disks = sh->disks;
4885 struct r5dev *pdev, *qdev;
4886
4887 clear_bit(STRIPE_HANDLE, &sh->state);
4888
4889
4890
4891
4892
4893
4894
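/* Only the stripe at the head of a batch list, or a lone stripe,
 * may be handled here; batch members are processed when the batch
 * list is broken up.
 */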
4895 if (clear_batch_ready(sh))
4896 return;
4897
4898 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
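/* already being handled, ensure it gets handled
 * again when the current action finishes
 */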
4899
4900
4901 set_bit(STRIPE_HANDLE, &sh->state);
4902 return;
4903 }
4904
4905 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4906 break_stripe_batch_list(sh, 0);
4907
4908 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4909 spin_lock(&sh->stripe_lock);
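/* Cannot process 'sync' concurrently with 'discard'.
 * Flush data in r5cache before 'sync'.
 */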
4910
4911
4912
4913
4914 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4915 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4916 !test_bit(STRIPE_DISCARD, &sh->state) &&
4917 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4918 set_bit(STRIPE_SYNCING, &sh->state);
4919 clear_bit(STRIPE_INSYNC, &sh->state);
4920 clear_bit(STRIPE_REPLACED, &sh->state);
4921 }
4922 spin_unlock(&sh->stripe_lock);
4923 }
4924 clear_bit(STRIPE_DELAYED, &sh->state);
4925
4926 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4927 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
4928 (unsigned long long)sh->sector, sh->state,
4929 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4930 sh->check_state, sh->reconstruct_state);
4931
4932 analyse_stripe(sh, &s);
4933
4934 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4935 goto finish;
4936
4937 if (s.handle_bad_blocks ||
4938 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4939 set_bit(STRIPE_HANDLE, &sh->state);
4940 goto finish;
4941 }
4942
4943 if (unlikely(s.blocked_rdev)) {
4944 if (s.syncing || s.expanding || s.expanded ||
4945 s.replacing || s.to_write || s.written) {
4946 set_bit(STRIPE_HANDLE, &sh->state);
4947 goto finish;
4948 }
4949
4950 rdev_dec_pending(s.blocked_rdev, conf->mddev);
4951 s.blocked_rdev = NULL;
4952 }
4953
4954 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4955 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4956 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4957 }
4958
4959 pr_debug("locked=%d uptodate=%d to_read=%d"
4960 " to_write=%d failed=%d failed_num=%d,%d\n",
4961 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4962 s.failed_num[0], s.failed_num[1]);
4963
4964
4965
4966
4967
4968
4969
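/* check if the array has lost more than max_degraded devices and,
 * if so, some requests might need to be failed.
 */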
4970 if (s.failed > conf->max_degraded ||
4971 (s.log_failed && s.injournal == 0)) {
4972 sh->check_state = 0;
4973 sh->reconstruct_state = 0;
4974 break_stripe_batch_list(sh, 0);
4975 if (s.to_read+s.to_write+s.written)
4976 handle_failed_stripe(conf, sh, &s, disks);
4977 if (s.syncing + s.replacing)
4978 handle_failed_sync(conf, sh, &s);
4979 }
4980
4981
4982
4983
4984 prexor = 0;
4985 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4986 prexor = 1;
4987 if (sh->reconstruct_state == reconstruct_state_drain_result ||
4988 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4989 sh->reconstruct_state = reconstruct_state_idle;
4990
4991
4992
4993
4994 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4995 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4996 BUG_ON(sh->qd_idx >= 0 &&
4997 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4998 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4999 for (i = disks; i--; ) {
5000 struct r5dev *dev = &sh->dev[i];
5001 if (test_bit(R5_LOCKED, &dev->flags) &&
5002 (i == sh->pd_idx || i == sh->qd_idx ||
5003 dev->written || test_bit(R5_InJournal,
5004 &dev->flags))) {
5005 pr_debug("Writing block %d\n", i);
5006 set_bit(R5_Wantwrite, &dev->flags);
5007 if (prexor)
5008 continue;
5009 if (s.failed > 1)
5010 continue;
5011 if (!test_bit(R5_Insync, &dev->flags) ||
5012 ((i == sh->pd_idx || i == sh->qd_idx) &&
5013 s.failed == 0))
5014 set_bit(STRIPE_INSYNC, &sh->state);
5015 }
5016 }
5017 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5018 s.dec_preread_active = 1;
5019 }
5020
5021
5022
5023
5024
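/* might be able to return some write requests if the parity blocks
 * are safe, or on a failed drive
 */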
5025 pdev = &sh->dev[sh->pd_idx];
5026 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
5027 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
5028 qdev = &sh->dev[sh->qd_idx];
5029 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
5030 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
5031 || conf->level < 6;
5032
5033 if (s.written &&
5034 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
5035 && !test_bit(R5_LOCKED, &pdev->flags)
5036 && (test_bit(R5_UPTODATE, &pdev->flags) ||
5037 test_bit(R5_Discard, &pdev->flags))))) &&
5038 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
5039 && !test_bit(R5_LOCKED, &qdev->flags)
5040 && (test_bit(R5_UPTODATE, &qdev->flags) ||
5041 test_bit(R5_Discard, &qdev->flags))))))
5042 handle_stripe_clean_event(conf, sh, disks);
5043
5044 if (s.just_cached)
5045 r5c_handle_cached_data_endio(conf, sh, disks);
5046 log_stripe_write_finished(sh);
5047
5048
5049
5050
5051
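/* Now we might consider reading some blocks, either to check/generate
 * parity, or to satisfy requests,
 * or to load a block that is being partially written.
 */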
5052 if (s.to_read || s.non_overwrite
5053 || (s.to_write && s.failed)
5054 || (s.syncing && (s.uptodate + s.compute < disks))
5055 || s.replacing
5056 || s.expanding)
5057 handle_stripe_fill(sh, &s, disks);
5058
5059
5060
5061
5062
5063
5064 r5c_finish_stripe_write_out(conf, sh, &s);
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
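/* Now to consider new write requests, cache write back and what else,
 * if anything, should be read.  We do not handle new writes when:
 * 1/ A 'write' operation (copy+xor) is already in flight.
 * 2/ A 'check' operation is in flight, as it may clobber the parity
 *    block.
 * 3/ A r5c cache log write is in flight.
 */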
5075 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5076 if (!r5c_is_writeback(conf->log)) {
5077 if (s.to_write)
5078 handle_stripe_dirtying(conf, sh, &s, disks);
5079 } else {
5080 int ret = 0;
5081
5082
5083 if (s.to_write)
5084 ret = r5c_try_caching_write(conf, sh, &s,
5085 disks);
5086
5087
5088
5089
5090
5091
5092
5093 if (ret == -EAGAIN ||
5094
5095 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5096 s.injournal > 0)) {
5097 ret = handle_stripe_dirtying(conf, sh, &s,
5098 disks);
5099 if (ret == -EAGAIN)
5100 goto finish;
5101 }
5102 }
5103 }
5104
5105
5106
5107
5108
5109
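/* maybe we need to check and possibly fix the parity for this stripe.
 * Any reads will already have been scheduled, so we just see if enough
 * data is available.  The parity check is held off while parity
 * dependent operations are in flight.
 */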
5110 if (sh->check_state ||
5111 (s.syncing && s.locked == 0 &&
5112 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5113 !test_bit(STRIPE_INSYNC, &sh->state))) {
5114 if (conf->level == 6)
5115 handle_parity_checks6(conf, sh, &s, disks);
5116 else
5117 handle_parity_checks5(conf, sh, &s, disks);
5118 }
5119
5120 if ((s.replacing || s.syncing) && s.locked == 0
5121 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5122 && !test_bit(STRIPE_REPLACED, &sh->state)) {
5123
5124 for (i = 0; i < conf->raid_disks; i++)
5125 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5126 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5127 set_bit(R5_WantReplace, &sh->dev[i].flags);
5128 set_bit(R5_LOCKED, &sh->dev[i].flags);
5129 s.locked++;
5130 }
5131 if (s.replacing)
5132 set_bit(STRIPE_INSYNC, &sh->state);
5133 set_bit(STRIPE_REPLACED, &sh->state);
5134 }
5135 if ((s.syncing || s.replacing) && s.locked == 0 &&
5136 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5137 test_bit(STRIPE_INSYNC, &sh->state)) {
5138 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5139 clear_bit(STRIPE_SYNCING, &sh->state);
5140 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
5141 wake_up(&conf->wait_for_overlap);
5142 }
5143
5144
5145
5146
5147 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5148 for (i = 0; i < s.failed; i++) {
5149 struct r5dev *dev = &sh->dev[s.failed_num[i]];
5150 if (test_bit(R5_ReadError, &dev->flags)
5151 && !test_bit(R5_LOCKED, &dev->flags)
5152 && test_bit(R5_UPTODATE, &dev->flags)
5153 ) {
5154 if (!test_bit(R5_ReWrite, &dev->flags)) {
5155 set_bit(R5_Wantwrite, &dev->flags);
5156 set_bit(R5_ReWrite, &dev->flags);
5157 } else
5158
5159 set_bit(R5_Wantread, &dev->flags);
5160 set_bit(R5_LOCKED, &dev->flags);
5161 s.locked++;
5162 }
5163 }
5164
5165
5166 if (sh->reconstruct_state == reconstruct_state_result) {
5167 struct stripe_head *sh_src
5168 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
5169 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5170
5171
5172
5173 set_bit(STRIPE_DELAYED, &sh->state);
5174 set_bit(STRIPE_HANDLE, &sh->state);
5175 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
5176 &sh_src->state))
5177 atomic_inc(&conf->preread_active_stripes);
5178 raid5_release_stripe(sh_src);
5179 goto finish;
5180 }
5181 if (sh_src)
5182 raid5_release_stripe(sh_src);
5183
5184 sh->reconstruct_state = reconstruct_state_idle;
5185 clear_bit(STRIPE_EXPANDING, &sh->state);
5186 for (i = conf->raid_disks; i--; ) {
5187 set_bit(R5_Wantwrite, &sh->dev[i].flags);
5188 set_bit(R5_LOCKED, &sh->dev[i].flags);
5189 s.locked++;
5190 }
5191 }
5192
5193 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5194 !sh->reconstruct_state) {
5195
5196 sh->disks = conf->raid_disks;
5197 stripe_set_idx(sh->sector, conf, 0, sh);
5198 schedule_reconstruction(sh, &s, 1, 1);
5199 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
5200 clear_bit(STRIPE_EXPAND_READY, &sh->state);
5201 atomic_dec(&conf->reshape_stripes);
5202 wake_up(&conf->wait_for_overlap);
5203 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5204 }
5205
5206 if (s.expanding && s.locked == 0 &&
5207 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5208 handle_stripe_expansion(conf, sh);
5209
5210finish:
5211
5212 if (unlikely(s.blocked_rdev)) {
5213 if (conf->mddev->external)
5214 md_wait_for_blocked_rdev(s.blocked_rdev,
5215 conf->mddev);
5216 else
5217
5218
5219
5220
5221 rdev_dec_pending(s.blocked_rdev,
5222 conf->mddev);
5223 }
5224
5225 if (s.handle_bad_blocks)
5226 for (i = disks; i--; ) {
5227 struct md_rdev *rdev;
5228 struct r5dev *dev = &sh->dev[i];
5229 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5230
5231 rdev = conf->disks[i].rdev;
5232 if (!rdev_set_badblocks(rdev, sh->sector,
5233 RAID5_STRIPE_SECTORS(conf), 0))
5234 md_error(conf->mddev, rdev);
5235 rdev_dec_pending(rdev, conf->mddev);
5236 }
5237 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5238 rdev = conf->disks[i].rdev;
5239 rdev_clear_badblocks(rdev, sh->sector,
5240 RAID5_STRIPE_SECTORS(conf), 0);
5241 rdev_dec_pending(rdev, conf->mddev);
5242 }
5243 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5244 rdev = conf->disks[i].replacement;
5245 if (!rdev)
5246
5247 rdev = conf->disks[i].rdev;
5248 rdev_clear_badblocks(rdev, sh->sector,
5249 RAID5_STRIPE_SECTORS(conf), 0);
5250 rdev_dec_pending(rdev, conf->mddev);
5251 }
5252 }
5253
5254 if (s.ops_request)
5255 raid_run_ops(sh, s.ops_request);
5256
5257 ops_run_io(sh, &s);
5258
5259 if (s.dec_preread_active) {
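/* We delay this until after ops_run_io so that if make_request
 * is waiting on a flush, it won't continue until the writes
 * have actually been submitted.
 */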
5260
5261
5262
5263
5264 atomic_dec(&conf->preread_active_stripes);
5265 if (atomic_read(&conf->preread_active_stripes) <
5266 IO_THRESHOLD)
5267 md_wakeup_thread(conf->mddev->thread);
5268 }
5269
5270 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5271}
5272
5273static void raid5_activate_delayed(struct r5conf *conf)
5274{
5275 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5276 while (!list_empty(&conf->delayed_list)) {
5277 struct list_head *l = conf->delayed_list.next;
5278 struct stripe_head *sh;
5279 sh = list_entry(l, struct stripe_head, lru);
5280 list_del_init(l);
5281 clear_bit(STRIPE_DELAYED, &sh->state);
5282 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5283 atomic_inc(&conf->preread_active_stripes);
5284 list_add_tail(&sh->lru, &conf->hold_list);
5285 raid5_wakeup_stripe_thread(sh);
5286 }
5287 }
5288}
5289
5290static void activate_bit_delay(struct r5conf *conf,
5291 struct list_head *temp_inactive_list)
5292{
5293
5294 struct list_head head;
5295 list_add(&head, &conf->bitmap_list);
5296 list_del_init(&conf->bitmap_list);
5297 while (!list_empty(&head)) {
5298 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5299 int hash;
5300 list_del_init(&sh->lru);
5301 atomic_inc(&sh->count);
5302 hash = sh->hash_lock_index;
5303 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5304 }
5305}
5306
5307static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5308{
5309 struct r5conf *conf = mddev->private;
5310 sector_t sector = bio->bi_iter.bi_sector;
5311 unsigned int chunk_sectors;
5312 unsigned int bio_sectors = bio_sectors(bio);
5313
5314 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5315 return chunk_sectors >=
5316 ((sector & (chunk_sectors - 1)) + bio_sectors);
5317}
5318
5319
5320
5321
5322
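/*
 * add bio to the retry LIFO (O(1), may run in interrupt context),
 * later sampled by raid5d()
 */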
5323 static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
5324{
5325 unsigned long flags;
5326
5327 spin_lock_irqsave(&conf->device_lock, flags);
5328
5329 bi->bi_next = conf->retry_read_aligned_list;
5330 conf->retry_read_aligned_list = bi;
5331
5332 spin_unlock_irqrestore(&conf->device_lock, flags);
5333 md_wakeup_thread(conf->mddev->thread);
5334}
5335
5336static struct bio *remove_bio_from_retry(struct r5conf *conf,
5337 unsigned int *offset)
5338{
5339 struct bio *bi;
5340
5341 bi = conf->retry_read_aligned;
5342 if (bi) {
5343 *offset = conf->retry_read_offset;
5344 conf->retry_read_aligned = NULL;
5345 return bi;
5346 }
5347 bi = conf->retry_read_aligned_list;
5348 if (bi) {
5349 conf->retry_read_aligned_list = bi->bi_next;
5350 bi->bi_next = NULL;
5351 *offset = 0;
5352 }
5353
5354 return bi;
5355}
5356
5357
5358
5359
5360
5361
5362
5363static void raid5_align_endio(struct bio *bi)
5364{
5365 struct md_io_acct *md_io_acct = bi->bi_private;
5366 struct bio *raid_bi = md_io_acct->orig_bio;
5367 struct mddev *mddev;
5368 struct r5conf *conf;
5369 struct md_rdev *rdev;
5370 blk_status_t error = bi->bi_status;
5371 unsigned long start_time = md_io_acct->start_time;
5372
5373 bio_put(bi);
5374
5375 rdev = (void*)raid_bi->bi_next;
5376 raid_bi->bi_next = NULL;
5377 mddev = rdev->mddev;
5378 conf = mddev->private;
5379
5380 rdev_dec_pending(rdev, conf->mddev);
5381
5382 if (!error) {
5383 if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
5384 bio_end_io_acct(raid_bi, start_time);
5385 bio_endio(raid_bi);
5386 if (atomic_dec_and_test(&conf->active_aligned_reads))
5387 wake_up(&conf->wait_for_quiescent);
5388 return;
5389 }
5390
5391 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5392
5393 add_bio_to_retry(raid_bi, conf);
5394}
5395
5396static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5397{
5398 struct r5conf *conf = mddev->private;
5399 struct bio *align_bio;
5400 struct md_rdev *rdev;
5401 sector_t sector, end_sector, first_bad;
5402 int bad_sectors, dd_idx;
5403 struct md_io_acct *md_io_acct;
5404 bool did_inc;
5405
5406 if (!in_chunk_boundary(mddev, raid_bio)) {
5407 pr_debug("%s: non aligned\n", __func__);
5408 return 0;
5409 }
5410
5411 sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0,
5412 &dd_idx, NULL);
5413 end_sector = bio_end_sector(raid_bio);
5414
5415 rcu_read_lock();
5416 if (r5c_big_stripe_cached(conf, sector))
5417 goto out_rcu_unlock;
5418
5419 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5420 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5421 rdev->recovery_offset < end_sector) {
5422 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5423 if (!rdev)
5424 goto out_rcu_unlock;
5425 if (test_bit(Faulty, &rdev->flags) ||
5426 !(test_bit(In_sync, &rdev->flags) ||
5427 rdev->recovery_offset >= end_sector))
5428 goto out_rcu_unlock;
5429 }
5430
5431 atomic_inc(&rdev->nr_pending);
5432 rcu_read_unlock();
5433
5434 if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
5435 &bad_sectors)) {
5436 bio_put(raid_bio);
5437 rdev_dec_pending(rdev, mddev);
5438 return 0;
5439 }
5440
5441 align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set);
5442 md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
5443 raid_bio->bi_next = (void *)rdev;
5444 if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
5445 md_io_acct->start_time = bio_start_io_acct(raid_bio);
5446 md_io_acct->orig_bio = raid_bio;
5447
5448 bio_set_dev(align_bio, rdev->bdev);
5449 align_bio->bi_end_io = raid5_align_endio;
5450 align_bio->bi_private = md_io_acct;
5451 align_bio->bi_iter.bi_sector = sector;
5452
5453
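/* No reshape active, so we can trust rdev->data_offset */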
5454 align_bio->bi_iter.bi_sector += rdev->data_offset;
5455
5456 did_inc = false;
5457 if (conf->quiesce == 0) {
5458 atomic_inc(&conf->active_aligned_reads);
5459 did_inc = true;
5460 }
5461
5462 if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
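/* quiesce is in progress, so we need to undo io activation and wait
 * for it to finish
 */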
5463
5464
5465
5466 if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
5467 wake_up(&conf->wait_for_quiescent);
5468 spin_lock_irq(&conf->device_lock);
5469 wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
5470 conf->device_lock);
5471 atomic_inc(&conf->active_aligned_reads);
5472 spin_unlock_irq(&conf->device_lock);
5473 }
5474
5475 if (mddev->gendisk)
5476 trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
5477 raid_bio->bi_iter.bi_sector);
5478 submit_bio_noacct(align_bio);
5479 return 1;
5480
5481out_rcu_unlock:
5482 rcu_read_unlock();
5483 return 0;
5484}
5485
5486static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5487{
5488 struct bio *split;
5489 sector_t sector = raid_bio->bi_iter.bi_sector;
5490 unsigned chunk_sects = mddev->chunk_sectors;
5491 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5492
5493 if (sectors < bio_sectors(raid_bio)) {
5494 struct r5conf *conf = mddev->private;
5495 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5496 bio_chain(split, raid_bio);
5497 submit_bio_noacct(raid_bio);
5498 raid_bio = split;
5499 }
5500
5501 if (!raid5_read_one_chunk(mddev, raid_bio))
5502 return raid_bio;
5503
5504 return NULL;
5505}
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
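/* __get_priority_stripe - get the next stripe to process
 *
 * Full stripe writes are allowed to pass preread active stripes up until
 * the bypass_threshold is exceeded. In general the bypass_count
 * increments when the handle_list is handled before the hold_list; however,
 * it will not be incremented when STRIPE_IO_STARTED is sampled set, signifying
 * a stripe with in-flight i/o. The bypass_count will be reset when the
 * hold_list is emptied.
 */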
5517static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5518{
5519 struct stripe_head *sh, *tmp;
5520 struct list_head *handle_list = NULL;
5521 struct r5worker_group *wg;
5522 bool second_try = !r5c_is_writeback(conf->log) &&
5523 !r5l_log_disk_error(conf);
5524 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5525 r5l_log_disk_error(conf);
5526
5527again:
5528 wg = NULL;
5529 sh = NULL;
5530 if (conf->worker_cnt_per_group == 0) {
5531 handle_list = try_loprio ? &conf->loprio_list :
5532 &conf->handle_list;
5533 } else if (group != ANY_GROUP) {
5534 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5535 &conf->worker_groups[group].handle_list;
5536 wg = &conf->worker_groups[group];
5537 } else {
5538 int i;
5539 for (i = 0; i < conf->group_cnt; i++) {
5540 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5541 &conf->worker_groups[i].handle_list;
5542 wg = &conf->worker_groups[i];
5543 if (!list_empty(handle_list))
5544 break;
5545 }
5546 }
5547
5548 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5549 __func__,
5550 list_empty(handle_list) ? "empty" : "busy",
5551 list_empty(&conf->hold_list) ? "empty" : "busy",
5552 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5553
5554 if (!list_empty(handle_list)) {
5555 sh = list_entry(handle_list->next, typeof(*sh), lru);
5556
5557 if (list_empty(&conf->hold_list))
5558 conf->bypass_count = 0;
5559 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5560 if (conf->hold_list.next == conf->last_hold)
5561 conf->bypass_count++;
5562 else {
5563 conf->last_hold = conf->hold_list.next;
5564 conf->bypass_count -= conf->bypass_threshold;
5565 if (conf->bypass_count < 0)
5566 conf->bypass_count = 0;
5567 }
5568 }
5569 } else if (!list_empty(&conf->hold_list) &&
5570 ((conf->bypass_threshold &&
5571 conf->bypass_count > conf->bypass_threshold) ||
5572 atomic_read(&conf->pending_full_writes) == 0)) {
5573
5574 list_for_each_entry(tmp, &conf->hold_list, lru) {
5575 if (conf->worker_cnt_per_group == 0 ||
5576 group == ANY_GROUP ||
5577 !cpu_online(tmp->cpu) ||
5578 cpu_to_group(tmp->cpu) == group) {
5579 sh = tmp;
5580 break;
5581 }
5582 }
5583
5584 if (sh) {
5585 conf->bypass_count -= conf->bypass_threshold;
5586 if (conf->bypass_count < 0)
5587 conf->bypass_count = 0;
5588 }
5589 wg = NULL;
5590 }
5591
5592 if (!sh) {
5593 if (second_try)
5594 return NULL;
5595 second_try = true;
5596 try_loprio = !try_loprio;
5597 goto again;
5598 }
5599
5600 if (wg) {
5601 wg->stripes_cnt--;
5602 sh->group = NULL;
5603 }
5604 list_del_init(&sh->lru);
5605 BUG_ON(atomic_inc_return(&sh->count) != 1);
5606 return sh;
5607}
5608
5609struct raid5_plug_cb {
5610 struct blk_plug_cb cb;
5611 struct list_head list;
5612 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5613};
5614
5615static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5616{
5617 struct raid5_plug_cb *cb = container_of(
5618 blk_cb, struct raid5_plug_cb, cb);
5619 struct stripe_head *sh;
5620 struct mddev *mddev = cb->cb.data;
5621 struct r5conf *conf = mddev->private;
5622 int cnt = 0;
5623 int hash;
5624
5625 if (cb->list.next && !list_empty(&cb->list)) {
5626 spin_lock_irq(&conf->device_lock);
5627 while (!list_empty(&cb->list)) {
5628 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5629 list_del_init(&sh->lru);
5630
5631
5632
5633
5634
5635 smp_mb__before_atomic();
5636 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5637
5638
5639
5640
5641 hash = sh->hash_lock_index;
5642 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5643 cnt++;
5644 }
5645 spin_unlock_irq(&conf->device_lock);
5646 }
5647 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5648 NR_STRIPE_HASH_LOCKS);
5649 if (mddev->queue)
5650 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5651 kfree(cb);
5652}
5653
5654static void release_stripe_plug(struct mddev *mddev,
5655 struct stripe_head *sh)
5656{
5657 struct blk_plug_cb *blk_cb = blk_check_plugged(
5658 raid5_unplug, mddev,
5659 sizeof(struct raid5_plug_cb));
5660 struct raid5_plug_cb *cb;
5661
5662 if (!blk_cb) {
5663 raid5_release_stripe(sh);
5664 return;
5665 }
5666
5667 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5668
5669 if (cb->list.next == NULL) {
5670 int i;
5671 INIT_LIST_HEAD(&cb->list);
5672 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5673 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5674 }
5675
5676 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5677 list_add_tail(&sh->lru, &cb->list);
5678 else
5679 raid5_release_stripe(sh);
5680}
5681
5682static void make_discard_request(struct mddev *mddev, struct bio *bi)
5683{
5684 struct r5conf *conf = mddev->private;
5685 sector_t logical_sector, last_sector;
5686 struct stripe_head *sh;
5687 int stripe_sectors;
5688
5689 if (mddev->reshape_position != MaxSector)
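/* skip discard while a reshape is in progress */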
5690
5691 return;
5692
5693 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5694 last_sector = bio_end_sector(bi);
5695
5696 bi->bi_next = NULL;
5697
5698 stripe_sectors = conf->chunk_sectors *
5699 (conf->raid_disks - conf->max_degraded);
5700 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5701 stripe_sectors);
5702 sector_div(last_sector, stripe_sectors);
5703
5704 logical_sector *= conf->chunk_sectors;
5705 last_sector *= conf->chunk_sectors;
5706
5707 for (; logical_sector < last_sector;
5708 logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5709 DEFINE_WAIT(w);
5710 int d;
5711 again:
5712 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5713 prepare_to_wait(&conf->wait_for_overlap, &w,
5714 TASK_UNINTERRUPTIBLE);
5715 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5716 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5717 raid5_release_stripe(sh);
5718 schedule();
5719 goto again;
5720 }
5721 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5722 spin_lock_irq(&sh->stripe_lock);
5723 for (d = 0; d < conf->raid_disks; d++) {
5724 if (d == sh->pd_idx || d == sh->qd_idx)
5725 continue;
5726 if (sh->dev[d].towrite || sh->dev[d].toread) {
5727 set_bit(R5_Overlap, &sh->dev[d].flags);
5728 spin_unlock_irq(&sh->stripe_lock);
5729 raid5_release_stripe(sh);
5730 schedule();
5731 goto again;
5732 }
5733 }
5734 set_bit(STRIPE_DISCARD, &sh->state);
5735 finish_wait(&conf->wait_for_overlap, &w);
5736 sh->overwrite_disks = 0;
5737 for (d = 0; d < conf->raid_disks; d++) {
5738 if (d == sh->pd_idx || d == sh->qd_idx)
5739 continue;
5740 sh->dev[d].towrite = bi;
5741 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5742 bio_inc_remaining(bi);
5743 md_write_inc(mddev, bi);
5744 sh->overwrite_disks++;
5745 }
5746 spin_unlock_irq(&sh->stripe_lock);
5747 if (conf->mddev->bitmap) {
5748 for (d = 0;
5749 d < conf->raid_disks - conf->max_degraded;
5750 d++)
5751 md_bitmap_startwrite(mddev->bitmap,
5752 sh->sector,
5753 RAID5_STRIPE_SECTORS(conf),
5754 0);
5755 sh->bm_seq = conf->seq_flush + 1;
5756 set_bit(STRIPE_BIT_DELAY, &sh->state);
5757 }
5758
5759 set_bit(STRIPE_HANDLE, &sh->state);
5760 clear_bit(STRIPE_DELAYED, &sh->state);
5761 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5762 atomic_inc(&conf->preread_active_stripes);
5763 release_stripe_plug(mddev, sh);
5764 }
5765
5766 bio_endio(bi);
5767}
5768
5769static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
5770{
5771 struct r5conf *conf = mddev->private;
5772 int dd_idx;
5773 sector_t new_sector;
5774 sector_t logical_sector, last_sector;
5775 struct stripe_head *sh;
5776 const int rw = bio_data_dir(bi);
5777 DEFINE_WAIT(w);
5778 bool do_prepare;
5779 bool do_flush = false;
5780
5781 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5782 int ret = log_handle_flush_request(conf, bi);
5783
5784 if (ret == 0)
5785 return true;
5786 if (ret == -ENODEV) {
5787 if (md_flush_request(mddev, bi))
5788 return true;
5789 }
5790
5791
5792
5793
5794
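/* the log could not fully handle the flush by itself, so carry
 * REQ_PREFLUSH into the first stripe below
 */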
5795 do_flush = bi->bi_opf & REQ_PREFLUSH;
5796 }
5797
5798 if (!md_write_start(mddev, bi))
5799 return false;
5800
5801
5802
5803
5804
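/* If the array is degraded, better not do chunk aligned read because
 * later we might have to read it again in order to reconstruct
 * data on failed drives.
 */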
5805 if (rw == READ && mddev->degraded == 0 &&
5806 mddev->reshape_position == MaxSector) {
5807 bi = chunk_aligned_read(mddev, bi);
5808 if (!bi)
5809 return true;
5810 }
5811
5812 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5813 make_discard_request(mddev, bi);
5814 md_write_end(mddev);
5815 return true;
5816 }
5817
5818 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5819 last_sector = bio_end_sector(bi);
5820 bi->bi_next = NULL;
5821
5822 md_account_bio(mddev, &bi);
5823 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5824 for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5825 int previous;
5826 int seq;
5827
5828 do_prepare = false;
5829 retry:
5830 seq = read_seqcount_begin(&conf->gen_lock);
5831 previous = 0;
5832 if (do_prepare)
5833 prepare_to_wait(&conf->wait_for_overlap, &w,
5834 TASK_UNINTERRUPTIBLE);
5835 if (unlikely(conf->reshape_progress != MaxSector)) {
5836
5837
5838
5839
5840
5841
5842
5843
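/* spinlock is needed as reshape_progress may be
 * 64bit on a 32bit platform, and so it might be
 * possible to see a half-updated value.
 * Of course reshape_progress could change after
 * the lock is dropped, so once we get a reference
 * to the stripe that we think it is, we will have
 * to check again.
 */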
5844 spin_lock_irq(&conf->device_lock);
5845 if (mddev->reshape_backwards
5846 ? logical_sector < conf->reshape_progress
5847 : logical_sector >= conf->reshape_progress) {
5848 previous = 1;
5849 } else {
5850 if (mddev->reshape_backwards
5851 ? logical_sector < conf->reshape_safe
5852 : logical_sector >= conf->reshape_safe) {
5853 spin_unlock_irq(&conf->device_lock);
5854 schedule();
5855 do_prepare = true;
5856 goto retry;
5857 }
5858 }
5859 spin_unlock_irq(&conf->device_lock);
5860 }
5861
5862 new_sector = raid5_compute_sector(conf, logical_sector,
5863 previous,
5864 &dd_idx, NULL);
5865 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5866 (unsigned long long)new_sector,
5867 (unsigned long long)logical_sector);
5868
5869 sh = raid5_get_active_stripe(conf, new_sector, previous,
5870 (bi->bi_opf & REQ_RAHEAD), 0);
5871 if (sh) {
5872 if (unlikely(previous)) {
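/* expansion might have moved on while waiting for a
 * stripe, so we must do the range check again.
 */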
5873
5874
5875
5876
5877
5878
5879
5880
5881 int must_retry = 0;
5882 spin_lock_irq(&conf->device_lock);
5883 if (mddev->reshape_backwards
5884 ? logical_sector >= conf->reshape_progress
5885 : logical_sector < conf->reshape_progress)
5886
5887 must_retry = 1;
5888 spin_unlock_irq(&conf->device_lock);
5889 if (must_retry) {
5890 raid5_release_stripe(sh);
5891 schedule();
5892 do_prepare = true;
5893 goto retry;
5894 }
5895 }
5896 if (read_seqcount_retry(&conf->gen_lock, seq)) {
5897
5898
5899
5900 raid5_release_stripe(sh);
5901 goto retry;
5902 }
5903
5904 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5905 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
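/* Stripe is busy expanding or the bio could not be
 * added due to overlap.  Flush everything and wait
 * a while.
 */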
5906
5907
5908
5909
5910 md_wakeup_thread(mddev->thread);
5911 raid5_release_stripe(sh);
5912 schedule();
5913 do_prepare = true;
5914 goto retry;
5915 }
5916 if (do_flush) {
5917 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
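/* we only need the flush on one stripe */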
5918
5919 do_flush = false;
5920 }
5921
5922 set_bit(STRIPE_HANDLE, &sh->state);
5923 clear_bit(STRIPE_DELAYED, &sh->state);
5924 if ((!sh->batch_head || sh == sh->batch_head) &&
5925 (bi->bi_opf & REQ_SYNC) &&
5926 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5927 atomic_inc(&conf->preread_active_stripes);
5928 release_stripe_plug(mddev, sh);
5929 } else {
5930
5931 bi->bi_status = BLK_STS_IOERR;
5932 break;
5933 }
5934 }
5935 finish_wait(&conf->wait_for_overlap, &w);
5936
5937 if (rw == WRITE)
5938 md_write_end(mddev);
5939 bio_endio(bi);
5940 return true;
5941}
5942
5943static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5944
5945static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5946{
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956 struct r5conf *conf = mddev->private;
5957 struct stripe_head *sh;
5958 struct md_rdev *rdev;
5959 sector_t first_sector, last_sector;
5960 int raid_disks = conf->previous_raid_disks;
5961 int data_disks = raid_disks - conf->max_degraded;
5962 int new_data_disks = conf->raid_disks - conf->max_degraded;
5963 int i;
5964 int dd_idx;
5965 sector_t writepos, readpos, safepos;
5966 sector_t stripe_addr;
5967 int reshape_sectors;
5968 struct list_head stripes;
5969 sector_t retn;
5970
5971 if (sector_nr == 0) {
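/* If restarting in the middle, skip the initial sectors */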
5972
5973 if (mddev->reshape_backwards &&
5974 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5975 sector_nr = raid5_size(mddev, 0, 0)
5976 - conf->reshape_progress;
5977 } else if (mddev->reshape_backwards &&
5978 conf->reshape_progress == MaxSector) {
5979
5980 sector_nr = MaxSector;
5981 } else if (!mddev->reshape_backwards &&
5982 conf->reshape_progress > 0)
5983 sector_nr = conf->reshape_progress;
5984 sector_div(sector_nr, new_data_disks);
5985 if (sector_nr) {
5986 mddev->curr_resync_completed = sector_nr;
5987 sysfs_notify_dirent_safe(mddev->sysfs_completed);
5988 *skipped = 1;
5989 retn = sector_nr;
5990 goto finish;
5991 }
5992 }
5993
5994
5995
5996
5997
5998
5999 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
6000
6001
6002
6003
6004
6005
6006
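/* We update the metadata at least every 10 seconds, or when
 * the data about to be copied would over-write the source of
 * the data at the front of the range.
 */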
6007 writepos = conf->reshape_progress;
6008 sector_div(writepos, new_data_disks);
6009 readpos = conf->reshape_progress;
6010 sector_div(readpos, data_disks);
6011 safepos = conf->reshape_safe;
6012 sector_div(safepos, data_disks);
6013 if (mddev->reshape_backwards) {
6014 BUG_ON(writepos < reshape_sectors);
6015 writepos -= reshape_sectors;
6016 readpos += reshape_sectors;
6017 safepos += reshape_sectors;
6018 } else {
6019 writepos += reshape_sectors;
6020
6021
6022
6023
6024 readpos -= min_t(sector_t, reshape_sectors, readpos);
6025 safepos -= min_t(sector_t, reshape_sectors, safepos);
6026 }
6027
6028
6029
6030
6031 if (mddev->reshape_backwards) {
6032 BUG_ON(conf->reshape_progress == 0);
6033 stripe_addr = writepos;
6034 BUG_ON((mddev->dev_sectors &
6035 ~((sector_t)reshape_sectors - 1))
6036 - reshape_sectors - stripe_addr
6037 != sector_nr);
6038 } else {
6039 BUG_ON(writepos != sector_nr + reshape_sectors);
6040 stripe_addr = sector_nr;
6041 }
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063 if (conf->min_offset_diff < 0) {
6064 safepos += -conf->min_offset_diff;
6065 readpos += -conf->min_offset_diff;
6066 } else
6067 writepos += conf->min_offset_diff;
6068
6069 if ((mddev->reshape_backwards
6070 ? (safepos > writepos && readpos < writepos)
6071 : (safepos < writepos && readpos > writepos)) ||
6072 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
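/* Cannot proceed until we've updated the superblock... */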
6073
6074 wait_event(conf->wait_for_overlap,
6075 atomic_read(&conf->reshape_stripes)==0
6076 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6077 if (atomic_read(&conf->reshape_stripes) != 0)
6078 return 0;
6079 mddev->reshape_position = conf->reshape_progress;
6080 mddev->curr_resync_completed = sector_nr;
6081 if (!mddev->reshape_backwards)
6082
6083 rdev_for_each(rdev, mddev)
6084 if (rdev->raid_disk >= 0 &&
6085 !test_bit(Journal, &rdev->flags) &&
6086 !test_bit(In_sync, &rdev->flags) &&
6087 rdev->recovery_offset < sector_nr)
6088 rdev->recovery_offset = sector_nr;
6089
6090 conf->reshape_checkpoint = jiffies;
6091 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6092 md_wakeup_thread(mddev->thread);
6093 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
6094 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6095 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6096 return 0;
6097 spin_lock_irq(&conf->device_lock);
6098 conf->reshape_safe = mddev->reshape_position;
6099 spin_unlock_irq(&conf->device_lock);
6100 wake_up(&conf->wait_for_overlap);
6101 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6102 }
6103
6104 INIT_LIST_HEAD(&stripes);
6105 for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6106 int j;
6107 int skipped_disk = 0;
6108 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
6109 set_bit(STRIPE_EXPANDING, &sh->state);
6110 atomic_inc(&conf->reshape_stripes);
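/* If any of this stripe is beyond the end of the old
 * array, then we need to zero those blocks
 */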
6111
6112
6113
6114 for (j = sh->disks; j--; ) {
6115 sector_t s;
6116 if (j == sh->pd_idx)
6117 continue;
6118 if (conf->level == 6 &&
6119 j == sh->qd_idx)
6120 continue;
6121 s = raid5_compute_blocknr(sh, j, 0);
6122 if (s < raid5_size(mddev, 0, 0)) {
6123 skipped_disk = 1;
6124 continue;
6125 }
6126 memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
6127 set_bit(R5_Expanded, &sh->dev[j].flags);
6128 set_bit(R5_UPTODATE, &sh->dev[j].flags);
6129 }
6130 if (!skipped_disk) {
6131 set_bit(STRIPE_EXPAND_READY, &sh->state);
6132 set_bit(STRIPE_HANDLE, &sh->state);
6133 }
6134 list_add(&sh->lru, &stripes);
6135 }
6136 spin_lock_irq(&conf->device_lock);
6137 if (mddev->reshape_backwards)
6138 conf->reshape_progress -= reshape_sectors * new_data_disks;
6139 else
6140 conf->reshape_progress += reshape_sectors * new_data_disks;
6141 spin_unlock_irq(&conf->device_lock);
6142
6143
6144
6145
6146
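/* Ok, those stripes are ready. We can start scheduling
 * reads on the source stripes.
 * The source stripes are determined by mapping the first and last
 * block on the destination stripes.
 */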
6147 first_sector =
6148 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
6149 1, &dd_idx, NULL);
6150 last_sector =
6151 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
6152 * new_data_disks - 1),
6153 1, &dd_idx, NULL);
6154 if (last_sector >= mddev->dev_sectors)
6155 last_sector = mddev->dev_sectors - 1;
6156 while (first_sector <= last_sector) {
6157 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
6158 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
6159 set_bit(STRIPE_HANDLE, &sh->state);
6160 raid5_release_stripe(sh);
6161 first_sector += RAID5_STRIPE_SECTORS(conf);
6162 }
6163
6164
6165
6166 while (!list_empty(&stripes)) {
6167 sh = list_entry(stripes.next, struct stripe_head, lru);
6168 list_del_init(&sh->lru);
6169 raid5_release_stripe(sh);
6170 }
6171
6172
6173
6174 sector_nr += reshape_sectors;
6175 retn = reshape_sectors;
6176finish:
6177 if (mddev->curr_resync_completed > mddev->resync_max ||
6178 (sector_nr - mddev->curr_resync_completed) * 2
6179 >= mddev->resync_max - mddev->curr_resync_completed) {
6180
6181 wait_event(conf->wait_for_overlap,
6182 atomic_read(&conf->reshape_stripes) == 0
6183 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6184 if (atomic_read(&conf->reshape_stripes) != 0)
6185 goto ret;
6186 mddev->reshape_position = conf->reshape_progress;
6187 mddev->curr_resync_completed = sector_nr;
6188 if (!mddev->reshape_backwards)
6189
6190 rdev_for_each(rdev, mddev)
6191 if (rdev->raid_disk >= 0 &&
6192 !test_bit(Journal, &rdev->flags) &&
6193 !test_bit(In_sync, &rdev->flags) &&
6194 rdev->recovery_offset < sector_nr)
6195 rdev->recovery_offset = sector_nr;
6196 conf->reshape_checkpoint = jiffies;
6197 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6198 md_wakeup_thread(mddev->thread);
6199 wait_event(mddev->sb_wait,
6200 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6201 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6202 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6203 goto ret;
6204 spin_lock_irq(&conf->device_lock);
6205 conf->reshape_safe = mddev->reshape_position;
6206 spin_unlock_irq(&conf->device_lock);
6207 wake_up(&conf->wait_for_overlap);
6208 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6209 }
6210ret:
6211 return retn;
6212}
6213
6214static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6215 int *skipped)
6216{
6217 struct r5conf *conf = mddev->private;
6218 struct stripe_head *sh;
6219 sector_t max_sector = mddev->dev_sectors;
6220 sector_t sync_blocks;
6221 int still_degraded = 0;
6222 int i;
6223
6224 if (sector_nr >= max_sector) {
6225
6226
6227 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6228 end_reshape(conf);
6229 return 0;
6230 }
6231
6232 if (mddev->curr_resync < max_sector)
6233 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6234 &sync_blocks, 1);
6235 else
6236 conf->fullsync = 0;
6237 md_bitmap_close_sync(mddev->bitmap);
6238
6239 return 0;
6240 }
6241
6242
6243 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6244
6245 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6246 return reshape_request(mddev, sector_nr, skipped);
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
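/* if there are too many failed drives and we are trying
 * to resync, then assert that we are finished, because there is
 * nothing we can do.
 */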
6258 if (mddev->degraded >= conf->max_degraded &&
6259 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6260 sector_t rv = mddev->dev_sectors - sector_nr;
6261 *skipped = 1;
6262 return rv;
6263 }
6264 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6265 !conf->fullsync &&
6266 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6267 sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6268
6269 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6270 *skipped = 1;
6271
6272 return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6273 }
6274
6275 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6276
6277 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6278 if (sh == NULL) {
6279 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6280
6281
6282
6283 schedule_timeout_uninterruptible(1);
6284 }
6285
6286
6287
6288
6289 rcu_read_lock();
6290 for (i = 0; i < conf->raid_disks; i++) {
6291 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
6292
6293 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6294 still_degraded = 1;
6295 }
6296 rcu_read_unlock();
6297
6298 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6299
6300 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6301 set_bit(STRIPE_HANDLE, &sh->state);
6302
6303 raid5_release_stripe(sh);
6304
6305 return RAID5_STRIPE_SECTORS(conf);
6306}
6307
6308static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6309 unsigned int offset)
6310{
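/* We may not be able to submit a whole bio at once as there
 * may not be enough stripe_heads available.
 * So we do one stripe head at a time and record in
 * conf->retry_read_offset how many have been done.
 * We *know* that this entire raid_bio is in one chunk, so
 * it will be only one 'dd_idx' and only need one call to
 * raid5_compute_sector.
 */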
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321 struct stripe_head *sh;
6322 int dd_idx;
6323 sector_t sector, logical_sector, last_sector;
6324 int scnt = 0;
6325 int handled = 0;
6326
6327 logical_sector = raid_bio->bi_iter.bi_sector &
6328 ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6329 sector = raid5_compute_sector(conf, logical_sector,
6330 0, &dd_idx, NULL);
6331 last_sector = bio_end_sector(raid_bio);
6332
6333 for (; logical_sector < last_sector;
6334 logical_sector += RAID5_STRIPE_SECTORS(conf),
6335 sector += RAID5_STRIPE_SECTORS(conf),
6336 scnt++) {
6337
6338 if (scnt < offset)
6339
6340 continue;
6341
6342 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6343
6344 if (!sh) {
6345
6346 conf->retry_read_aligned = raid_bio;
6347 conf->retry_read_offset = scnt;
6348 return handled;
6349 }
6350
6351 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6352 raid5_release_stripe(sh);
6353 conf->retry_read_aligned = raid_bio;
6354 conf->retry_read_offset = scnt;
6355 return handled;
6356 }
6357
6358 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6359 handle_stripe(sh);
6360 raid5_release_stripe(sh);
6361 handled++;
6362 }
6363
6364 bio_endio(raid_bio);
6365
6366 if (atomic_dec_and_test(&conf->active_aligned_reads))
6367 wake_up(&conf->wait_for_quiescent);
6368 return handled;
6369}
6370
6371static int handle_active_stripes(struct r5conf *conf, int group,
6372 struct r5worker *worker,
6373 struct list_head *temp_inactive_list)
6374 __releases(&conf->device_lock)
6375 __acquires(&conf->device_lock)
6376{
6377 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6378 int i, batch_size = 0, hash;
6379 bool release_inactive = false;
6380
6381 while (batch_size < MAX_STRIPE_BATCH &&
6382 (sh = __get_priority_stripe(conf, group)) != NULL)
6383 batch[batch_size++] = sh;
6384
6385 if (batch_size == 0) {
6386 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6387 if (!list_empty(temp_inactive_list + i))
6388 break;
6389 if (i == NR_STRIPE_HASH_LOCKS) {
6390 spin_unlock_irq(&conf->device_lock);
6391 log_flush_stripe_to_raid(conf);
6392 spin_lock_irq(&conf->device_lock);
6393 return batch_size;
6394 }
6395 release_inactive = true;
6396 }
6397 spin_unlock_irq(&conf->device_lock);
6398
6399 release_inactive_stripe_list(conf, temp_inactive_list,
6400 NR_STRIPE_HASH_LOCKS);
6401
6402 r5l_flush_stripe_to_raid(conf->log);
6403 if (release_inactive) {
6404 spin_lock_irq(&conf->device_lock);
6405 return 0;
6406 }
6407
6408 for (i = 0; i < batch_size; i++)
6409 handle_stripe(batch[i]);
6410 log_write_stripe_run(conf);
6411
6412 cond_resched();
6413
6414 spin_lock_irq(&conf->device_lock);
6415 for (i = 0; i < batch_size; i++) {
6416 hash = batch[i]->hash_lock_index;
6417 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6418 }
6419 return batch_size;
6420}
6421
6422static void raid5_do_work(struct work_struct *work)
6423{
6424 struct r5worker *worker = container_of(work, struct r5worker, work);
6425 struct r5worker_group *group = worker->group;
6426 struct r5conf *conf = group->conf;
6427 struct mddev *mddev = conf->mddev;
6428 int group_id = group - conf->worker_groups;
6429 int handled;
6430 struct blk_plug plug;
6431
6432 pr_debug("+++ raid5worker active\n");
6433
6434 blk_start_plug(&plug);
6435 handled = 0;
6436 spin_lock_irq(&conf->device_lock);
6437 while (1) {
6438 int batch_size, released;
6439
6440 released = release_stripe_list(conf, worker->temp_inactive_list);
6441
6442 batch_size = handle_active_stripes(conf, group_id, worker,
6443 worker->temp_inactive_list);
6444 worker->working = false;
6445 if (!batch_size && !released)
6446 break;
6447 handled += batch_size;
6448 wait_event_lock_irq(mddev->sb_wait,
6449 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6450 conf->device_lock);
6451 }
6452 pr_debug("%d stripes handled\n", handled);
6453
6454 spin_unlock_irq(&conf->device_lock);
6455
6456 flush_deferred_bios(conf);
6457
6458 r5l_flush_stripe_to_raid(conf->log);
6459
6460 async_tx_issue_pending_all();
6461 blk_finish_plug(&plug);
6462
6463 pr_debug("--- raid5worker inactive\n");
6464}
6465
6466
6467
6468
6469
6470
6471
6472
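/*
 * This is our raid5 kernel thread.
 *
 * We scan the hash table for stripes which can be handled now.
 * During the scan, completed stripes are saved for us by the interrupt
 * handler, so that they will not have to wait for our next wakeup.
 */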
6473static void raid5d(struct md_thread *thread)
6474{
6475 struct mddev *mddev = thread->mddev;
6476 struct r5conf *conf = mddev->private;
6477 int handled;
6478 struct blk_plug plug;
6479
6480 pr_debug("+++ raid5d active\n");
6481
6482 md_check_recovery(mddev);
6483
6484 blk_start_plug(&plug);
6485 handled = 0;
6486 spin_lock_irq(&conf->device_lock);
6487 while (1) {
6488 struct bio *bio;
6489 int batch_size, released;
6490 unsigned int offset;
6491
6492 released = release_stripe_list(conf, conf->temp_inactive_list);
6493 if (released)
6494 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6495
6496 if (!list_empty(&conf->bitmap_list)) {
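/* Now is a good time to flush some bitmap updates */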
6498
6499 conf->seq_flush++;
6500 spin_unlock_irq(&conf->device_lock);
6501 md_bitmap_unplug(mddev->bitmap);
6502 spin_lock_irq(&conf->device_lock);
6503 conf->seq_write = conf->seq_flush;
6504 activate_bit_delay(conf, conf->temp_inactive_list);
6505 }
6506 raid5_activate_delayed(conf);
6507
6508 while ((bio = remove_bio_from_retry(conf, &offset))) {
6509 int ok;
6510 spin_unlock_irq(&conf->device_lock);
6511 ok = retry_aligned_read(conf, bio, offset);
6512 spin_lock_irq(&conf->device_lock);
6513 if (!ok)
6514 break;
6515 handled++;
6516 }
6517
6518 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6519 conf->temp_inactive_list);
6520 if (!batch_size && !released)
6521 break;
6522 handled += batch_size;
6523
6524 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6525 spin_unlock_irq(&conf->device_lock);
6526 md_check_recovery(mddev);
6527 spin_lock_irq(&conf->device_lock);
6528 }
6529 }
6530 pr_debug("%d stripes handled\n", handled);
6531
6532 spin_unlock_irq(&conf->device_lock);
6533 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6534 mutex_trylock(&conf->cache_size_mutex)) {
		grow_one_stripe(conf, __GFP_NOWARN);
		/* Set flag even if allocation failed.  This helps
		 * slow down allocation requests when mem is short
		 */
6539 set_bit(R5_DID_ALLOC, &conf->cache_state);
6540 mutex_unlock(&conf->cache_size_mutex);
6541 }
6542
6543 flush_deferred_bios(conf);
6544
6545 r5l_flush_stripe_to_raid(conf->log);
6546
6547 async_tx_issue_pending_all();
6548 blk_finish_plug(&plug);
6549
6550 pr_debug("--- raid5d inactive\n");
6551}
6552
6553static ssize_t
6554raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6555{
6556 struct r5conf *conf;
6557 int ret = 0;
6558 spin_lock(&mddev->lock);
6559 conf = mddev->private;
6560 if (conf)
6561 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6562 spin_unlock(&mddev->lock);
6563 return ret;
6564}
6565
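/*
 * Resize the stripe cache to hold @size stripe_heads: drop stripes while
 * the cache is above the target, then grow it until the target is
 * reached or allocation fails.  The sysfs store below calls this under
 * the mddev lock.
 */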
6566int
6567raid5_set_cache_size(struct mddev *mddev, int size)
6568{
6569 int result = 0;
6570 struct r5conf *conf = mddev->private;
6571
6572 if (size <= 16 || size > 32768)
6573 return -EINVAL;
6574
6575 conf->min_nr_stripes = size;
6576 mutex_lock(&conf->cache_size_mutex);
6577 while (size < conf->max_nr_stripes &&
6578 drop_one_stripe(conf))
6579 ;
6580 mutex_unlock(&conf->cache_size_mutex);
6581
6582 md_allow_write(mddev);
6583
6584 mutex_lock(&conf->cache_size_mutex);
6585 while (size > conf->max_nr_stripes)
6586 if (!grow_one_stripe(conf, GFP_KERNEL)) {
6587 conf->min_nr_stripes = conf->max_nr_stripes;
6588 result = -ENOMEM;
6589 break;
6590 }
6591 mutex_unlock(&conf->cache_size_mutex);
6592
6593 return result;
6594}
6595EXPORT_SYMBOL(raid5_set_cache_size);
6596
6597static ssize_t
6598raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6599{
6600 struct r5conf *conf;
6601 unsigned long new;
6602 int err;
6603
6604 if (len >= PAGE_SIZE)
6605 return -EINVAL;
6606 if (kstrtoul(page, 10, &new))
6607 return -EINVAL;
6608 err = mddev_lock(mddev);
6609 if (err)
6610 return err;
6611 conf = mddev->private;
6612 if (!conf)
6613 err = -ENODEV;
6614 else
6615 err = raid5_set_cache_size(mddev, new);
6616 mddev_unlock(mddev);
6617
6618 return err ?: len;
6619}
6620
6621static struct md_sysfs_entry
6622raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6623 raid5_show_stripe_cache_size,
6624 raid5_store_stripe_cache_size);
6625
6626static ssize_t
6627raid5_show_rmw_level(struct mddev *mddev, char *page)
6628{
6629 struct r5conf *conf = mddev->private;
6630 if (conf)
6631 return sprintf(page, "%d\n", conf->rmw_level);
6632 else
6633 return 0;
6634}
6635
6636static ssize_t
6637raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6638{
6639 struct r5conf *conf = mddev->private;
6640 unsigned long new;
6641
6642 if (!conf)
6643 return -ENODEV;
6644
6645 if (len >= PAGE_SIZE)
6646 return -EINVAL;
6647
6648 if (kstrtoul(page, 10, &new))
6649 return -EINVAL;
6650
6651 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6652 return -EINVAL;
6653
6654 if (new != PARITY_DISABLE_RMW &&
6655 new != PARITY_ENABLE_RMW &&
6656 new != PARITY_PREFER_RMW)
6657 return -EINVAL;
6658
6659 conf->rmw_level = new;
6660 return len;
6661}
6662
6663static struct md_sysfs_entry
6664raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6665 raid5_show_rmw_level,
6666 raid5_store_rmw_level);
6667
6668static ssize_t
6669raid5_show_stripe_size(struct mddev *mddev, char *page)
6670{
6671 struct r5conf *conf;
6672 int ret = 0;
6673
6674 spin_lock(&mddev->lock);
6675 conf = mddev->private;
6676 if (conf)
6677 ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6678 spin_unlock(&mddev->lock);
6679 return ret;
6680}
6681
6682#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6683static ssize_t
6684raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
6685{
6686 struct r5conf *conf;
6687 unsigned long new;
6688 int err;
6689 int size;
6690
6691 if (len >= PAGE_SIZE)
6692 return -EINVAL;
6693 if (kstrtoul(page, 10, &new))
		return -EINVAL;

	/*
	 * The value must not be bigger than PAGE_SIZE. It has to be
	 * a multiple of DEFAULT_STRIPE_SIZE and a power of two.
	 */
6701 if (new % DEFAULT_STRIPE_SIZE != 0 ||
6702 new > PAGE_SIZE || new == 0 ||
6703 new != roundup_pow_of_two(new))
6704 return -EINVAL;
6705
6706 err = mddev_lock(mddev);
6707 if (err)
6708 return err;
6709
6710 conf = mddev->private;
6711 if (!conf) {
6712 err = -ENODEV;
6713 goto out_unlock;
6714 }
6715
6716 if (new == conf->stripe_size)
6717 goto out_unlock;
6718
6719 pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6720 conf->stripe_size, new);
6721
6722 if (mddev->sync_thread ||
6723 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6724 mddev->reshape_position != MaxSector ||
6725 mddev->sysfs_active) {
6726 err = -EBUSY;
6727 goto out_unlock;
6728 }
6729
6730 mddev_suspend(mddev);
6731 mutex_lock(&conf->cache_size_mutex);
6732 size = conf->max_nr_stripes;
6733
6734 shrink_stripes(conf);
6735
6736 conf->stripe_size = new;
6737 conf->stripe_shift = ilog2(new) - 9;
6738 conf->stripe_sectors = new >> 9;
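	/* re-allocate the stripe cache with the new stripe size */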
6739 if (grow_stripes(conf, size)) {
6740 pr_warn("md/raid:%s: couldn't allocate buffers\n",
6741 mdname(mddev));
6742 err = -ENOMEM;
6743 }
6744 mutex_unlock(&conf->cache_size_mutex);
6745 mddev_resume(mddev);
6746
6747out_unlock:
6748 mddev_unlock(mddev);
6749 return err ?: len;
6750}
6751
6752static struct md_sysfs_entry
6753raid5_stripe_size = __ATTR(stripe_size, 0644,
6754 raid5_show_stripe_size,
6755 raid5_store_stripe_size);
6756#else
6757static struct md_sysfs_entry
6758raid5_stripe_size = __ATTR(stripe_size, 0444,
6759 raid5_show_stripe_size,
6760 NULL);
6761#endif
6762
6763static ssize_t
6764raid5_show_preread_threshold(struct mddev *mddev, char *page)
6765{
6766 struct r5conf *conf;
6767 int ret = 0;
6768 spin_lock(&mddev->lock);
6769 conf = mddev->private;
6770 if (conf)
6771 ret = sprintf(page, "%d\n", conf->bypass_threshold);
6772 spin_unlock(&mddev->lock);
6773 return ret;
6774}
6775
6776static ssize_t
6777raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6778{
6779 struct r5conf *conf;
6780 unsigned long new;
6781 int err;
6782
6783 if (len >= PAGE_SIZE)
6784 return -EINVAL;
6785 if (kstrtoul(page, 10, &new))
6786 return -EINVAL;
6787
6788 err = mddev_lock(mddev);
6789 if (err)
6790 return err;
6791 conf = mddev->private;
6792 if (!conf)
6793 err = -ENODEV;
6794 else if (new > conf->min_nr_stripes)
6795 err = -EINVAL;
6796 else
6797 conf->bypass_threshold = new;
6798 mddev_unlock(mddev);
6799 return err ?: len;
6800}
6801
6802static struct md_sysfs_entry
6803raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6804 S_IRUGO | S_IWUSR,
6805 raid5_show_preread_threshold,
6806 raid5_store_preread_threshold);
6807
6808static ssize_t
6809raid5_show_skip_copy(struct mddev *mddev, char *page)
6810{
6811 struct r5conf *conf;
6812 int ret = 0;
6813 spin_lock(&mddev->lock);
6814 conf = mddev->private;
6815 if (conf)
6816 ret = sprintf(page, "%d\n", conf->skip_copy);
6817 spin_unlock(&mddev->lock);
6818 return ret;
6819}
6820
6821static ssize_t
6822raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6823{
6824 struct r5conf *conf;
6825 unsigned long new;
6826 int err;
6827
6828 if (len >= PAGE_SIZE)
6829 return -EINVAL;
6830 if (kstrtoul(page, 10, &new))
6831 return -EINVAL;
6832 new = !!new;
6833
6834 err = mddev_lock(mddev);
6835 if (err)
6836 return err;
6837 conf = mddev->private;
6838 if (!conf)
6839 err = -ENODEV;
6840 else if (new != conf->skip_copy) {
6841 struct request_queue *q = mddev->queue;
6842
6843 mddev_suspend(mddev);
6844 conf->skip_copy = new;
6845 if (new)
6846 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
6847 else
6848 blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
6849 mddev_resume(mddev);
6850 }
6851 mddev_unlock(mddev);
6852 return err ?: len;
6853}
6854
6855static struct md_sysfs_entry
6856raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6857 raid5_show_skip_copy,
6858 raid5_store_skip_copy);
6859
6860static ssize_t
6861stripe_cache_active_show(struct mddev *mddev, char *page)
6862{
6863 struct r5conf *conf = mddev->private;
6864 if (conf)
6865 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6866 else
6867 return 0;
6868}
6869
6870static struct md_sysfs_entry
6871raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6872
6873static ssize_t
6874raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6875{
6876 struct r5conf *conf;
6877 int ret = 0;
6878 spin_lock(&mddev->lock);
6879 conf = mddev->private;
6880 if (conf)
6881 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6882 spin_unlock(&mddev->lock);
6883 return ret;
6884}
6885
6886static int alloc_thread_groups(struct r5conf *conf, int cnt,
6887 int *group_cnt,
6888 struct r5worker_group **worker_groups);
6889static ssize_t
6890raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6891{
6892 struct r5conf *conf;
6893 unsigned int new;
6894 int err;
6895 struct r5worker_group *new_groups, *old_groups;
6896 int group_cnt;
6897
6898 if (len >= PAGE_SIZE)
6899 return -EINVAL;
6900 if (kstrtouint(page, 10, &new))
		return -EINVAL;
	/* 8192 should be big enough */
6903 if (new > 8192)
6904 return -EINVAL;
6905
6906 err = mddev_lock(mddev);
6907 if (err)
6908 return err;
6909 conf = mddev->private;
6910 if (!conf)
6911 err = -ENODEV;
6912 else if (new != conf->worker_cnt_per_group) {
6913 mddev_suspend(mddev);
6914
6915 old_groups = conf->worker_groups;
6916 if (old_groups)
6917 flush_workqueue(raid5_wq);
6918
6919 err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
6920 if (!err) {
6921 spin_lock_irq(&conf->device_lock);
6922 conf->group_cnt = group_cnt;
6923 conf->worker_cnt_per_group = new;
6924 conf->worker_groups = new_groups;
6925 spin_unlock_irq(&conf->device_lock);
6926
6927 if (old_groups)
6928 kfree(old_groups[0].workers);
6929 kfree(old_groups);
6930 }
6931 mddev_resume(mddev);
6932 }
6933 mddev_unlock(mddev);
6934
6935 return err ?: len;
6936}
6937
6938static struct md_sysfs_entry
6939raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6940 raid5_show_group_thread_cnt,
6941 raid5_store_group_thread_cnt);
6942
6943static struct attribute *raid5_attrs[] = {
6944 &raid5_stripecache_size.attr,
6945 &raid5_stripecache_active.attr,
6946 &raid5_preread_bypass_threshold.attr,
6947 &raid5_group_thread_cnt.attr,
6948 &raid5_skip_copy.attr,
6949 &raid5_rmw_level.attr,
6950 &raid5_stripe_size.attr,
6951 &r5c_journal_mode.attr,
6952 &ppl_write_hint.attr,
6953 NULL,
6954};
6955static const struct attribute_group raid5_attrs_group = {
6956 .name = NULL,
6957 .attrs = raid5_attrs,
6958};
6959
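/*
 * Allocate one worker group per NUMA node, each with @cnt workers;
 * cnt == 0 disables the worker groups and leaves all stripe handling
 * to the raid5d thread.
 */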
6960static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
6961 struct r5worker_group **worker_groups)
6962{
6963 int i, j, k;
6964 ssize_t size;
6965 struct r5worker *workers;
6966
6967 if (cnt == 0) {
6968 *group_cnt = 0;
6969 *worker_groups = NULL;
6970 return 0;
6971 }
6972 *group_cnt = num_possible_nodes();
6973 size = sizeof(struct r5worker) * cnt;
6974 workers = kcalloc(size, *group_cnt, GFP_NOIO);
6975 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
6976 GFP_NOIO);
6977 if (!*worker_groups || !workers) {
6978 kfree(workers);
6979 kfree(*worker_groups);
6980 return -ENOMEM;
6981 }
6982
6983 for (i = 0; i < *group_cnt; i++) {
6984 struct r5worker_group *group;
6985
6986 group = &(*worker_groups)[i];
6987 INIT_LIST_HEAD(&group->handle_list);
6988 INIT_LIST_HEAD(&group->loprio_list);
6989 group->conf = conf;
6990 group->workers = workers + i * cnt;
6991
6992 for (j = 0; j < cnt; j++) {
6993 struct r5worker *worker = group->workers + j;
6994 worker->group = group;
6995 INIT_WORK(&worker->work, raid5_do_work);
6996
6997 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6998 INIT_LIST_HEAD(worker->temp_inactive_list + k);
6999 }
7000 }
7001
7002 return 0;
7003}
7004
7005static void free_thread_groups(struct r5conf *conf)
7006{
7007 if (conf->worker_groups)
7008 kfree(conf->worker_groups[0].workers);
7009 kfree(conf->worker_groups);
7010 conf->worker_groups = NULL;
7011}
7012
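/*
 * Return the array capacity in sectors.  Zero for @sectors or
 * @raid_disks means "use the current value"; the result is rounded
 * down to a whole number of chunks in both old and new geometry.
 */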
7013static sector_t
7014raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
7015{
7016 struct r5conf *conf = mddev->private;
7017
7018 if (!sectors)
7019 sectors = mddev->dev_sectors;
	if (!raid_disks)
		/* size is defined by the smallest of previous and new size */
7022 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
7023
7024 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7025 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
7026 return sectors * (raid_disks - conf->max_degraded);
7027}
7028
7029static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7030{
7031 safe_put_page(percpu->spare_page);
7032 percpu->spare_page = NULL;
7033 kvfree(percpu->scribble);
7034 percpu->scribble = NULL;
7035}
7036
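/*
 * Per-cpu scratch space: a spare page used for RAID6 parity recovery,
 * plus a scribble buffer sized for the largest geometry the array can
 * take on during a reshape.
 */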
7037static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7038{
7039 if (conf->level == 6 && !percpu->spare_page) {
7040 percpu->spare_page = alloc_page(GFP_KERNEL);
7041 if (!percpu->spare_page)
7042 return -ENOMEM;
7043 }
7044
7045 if (scribble_alloc(percpu,
7046 max(conf->raid_disks,
7047 conf->previous_raid_disks),
7048 max(conf->chunk_sectors,
7049 conf->prev_chunk_sectors)
7050 / RAID5_STRIPE_SECTORS(conf))) {
7051 free_scratch_buffer(conf, percpu);
7052 return -ENOMEM;
7053 }
7054
7055 return 0;
7056}
7057
7058static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7059{
7060 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7061
7062 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7063 return 0;
7064}
7065
7066static void raid5_free_percpu(struct r5conf *conf)
7067{
7068 if (!conf->percpu)
7069 return;
7070
7071 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7072 free_percpu(conf->percpu);
7073}
7074
7075static void free_conf(struct r5conf *conf)
7076{
7077 int i;
7078
7079 log_exit(conf);
7080
7081 unregister_shrinker(&conf->shrinker);
7082 free_thread_groups(conf);
7083 shrink_stripes(conf);
7084 raid5_free_percpu(conf);
7085 for (i = 0; i < conf->pool_size; i++)
7086 if (conf->disks[i].extra_page)
7087 put_page(conf->disks[i].extra_page);
7088 kfree(conf->disks);
7089 bioset_exit(&conf->bio_split);
7090 kfree(conf->stripe_hashtbl);
7091 kfree(conf->pending_data);
7092 kfree(conf);
7093}
7094
7095static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7096{
7097 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7098 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7099
7100 if (alloc_scratch_buffer(conf, percpu)) {
7101 pr_warn("%s: failed memory allocation for cpu%u\n",
7102 __func__, cpu);
7103 return -ENOMEM;
7104 }
7105 return 0;
7106}
7107
7108static int raid5_alloc_percpu(struct r5conf *conf)
7109{
7110 int err = 0;
7111
7112 conf->percpu = alloc_percpu(struct raid5_percpu);
7113 if (!conf->percpu)
7114 return -ENOMEM;
7115
7116 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7117 if (!err) {
7118 conf->scribble_disks = max(conf->raid_disks,
7119 conf->previous_raid_disks);
7120 conf->scribble_sectors = max(conf->chunk_sectors,
7121 conf->prev_chunk_sectors);
7122 }
7123 return err;
7124}
7125
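/*
 * Shrinker callbacks: under memory pressure the stripe cache may be
 * trimmed, but never below min_nr_stripes.
 */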
7126static unsigned long raid5_cache_scan(struct shrinker *shrink,
7127 struct shrink_control *sc)
7128{
7129 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7130 unsigned long ret = SHRINK_STOP;
7131
7132 if (mutex_trylock(&conf->cache_size_mutex)) {
		ret = 0;
7134 while (ret < sc->nr_to_scan &&
7135 conf->max_nr_stripes > conf->min_nr_stripes) {
7136 if (drop_one_stripe(conf) == 0) {
7137 ret = SHRINK_STOP;
7138 break;
7139 }
7140 ret++;
7141 }
7142 mutex_unlock(&conf->cache_size_mutex);
7143 }
7144 return ret;
7145}
7146
7147static unsigned long raid5_cache_count(struct shrinker *shrink,
7148 struct shrink_control *sc)
7149{
7150 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7151
	if (conf->max_nr_stripes < conf->min_nr_stripes)
		/* unlikely, but not impossible */
7154 return 0;
7155 return conf->max_nr_stripes - conf->min_nr_stripes;
7156}
7157
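/*
 * Build an r5conf for @mddev: validate level, layout and chunk size,
 * then set up the locks, lists, worker groups, per-cpu buffers, the
 * initial stripe cache, the shrinker and the raid5d thread.
 */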
7158static struct r5conf *setup_conf(struct mddev *mddev)
7159{
7160 struct r5conf *conf;
7161 int raid_disk, memory, max_disks;
7162 struct md_rdev *rdev;
7163 struct disk_info *disk;
7164 char pers_name[6];
7165 int i;
7166 int group_cnt;
7167 struct r5worker_group *new_group;
7168 int ret;
7169
7170 if (mddev->new_level != 5
7171 && mddev->new_level != 4
7172 && mddev->new_level != 6) {
7173 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7174 mdname(mddev), mddev->new_level);
7175 return ERR_PTR(-EIO);
7176 }
7177 if ((mddev->new_level == 5
7178 && !algorithm_valid_raid5(mddev->new_layout)) ||
7179 (mddev->new_level == 6
7180 && !algorithm_valid_raid6(mddev->new_layout))) {
7181 pr_warn("md/raid:%s: layout %d not supported\n",
7182 mdname(mddev), mddev->new_layout);
7183 return ERR_PTR(-EIO);
7184 }
7185 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
7186 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7187 mdname(mddev), mddev->raid_disks);
7188 return ERR_PTR(-EINVAL);
7189 }
7190
7191 if (!mddev->new_chunk_sectors ||
7192 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
7193 !is_power_of_2(mddev->new_chunk_sectors)) {
7194 pr_warn("md/raid:%s: invalid chunk size %d\n",
7195 mdname(mddev), mddev->new_chunk_sectors << 9);
7196 return ERR_PTR(-EINVAL);
7197 }
7198
7199 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
7200 if (conf == NULL)
7201 goto abort;
7202
7203#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7204 conf->stripe_size = DEFAULT_STRIPE_SIZE;
7205 conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7206 conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7207#endif
7208 INIT_LIST_HEAD(&conf->free_list);
7209 INIT_LIST_HEAD(&conf->pending_list);
7210 conf->pending_data = kcalloc(PENDING_IO_MAX,
7211 sizeof(struct r5pending_data),
7212 GFP_KERNEL);
7213 if (!conf->pending_data)
7214 goto abort;
7215 for (i = 0; i < PENDING_IO_MAX; i++)
7216 list_add(&conf->pending_data[i].sibling, &conf->free_list);
7217
7218 if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
7219 conf->group_cnt = group_cnt;
7220 conf->worker_cnt_per_group = 0;
7221 conf->worker_groups = new_group;
7222 } else
7223 goto abort;
7224 spin_lock_init(&conf->device_lock);
7225 seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7226 mutex_init(&conf->cache_size_mutex);
7227 init_waitqueue_head(&conf->wait_for_quiescent);
7228 init_waitqueue_head(&conf->wait_for_stripe);
7229 init_waitqueue_head(&conf->wait_for_overlap);
7230 INIT_LIST_HEAD(&conf->handle_list);
7231 INIT_LIST_HEAD(&conf->loprio_list);
7232 INIT_LIST_HEAD(&conf->hold_list);
7233 INIT_LIST_HEAD(&conf->delayed_list);
7234 INIT_LIST_HEAD(&conf->bitmap_list);
7235 init_llist_head(&conf->released_stripes);
7236 atomic_set(&conf->active_stripes, 0);
7237 atomic_set(&conf->preread_active_stripes, 0);
7238 atomic_set(&conf->active_aligned_reads, 0);
7239 spin_lock_init(&conf->pending_bios_lock);
7240 conf->batch_bio_dispatch = true;
7241 rdev_for_each(rdev, mddev) {
7242 if (test_bit(Journal, &rdev->flags))
7243 continue;
7244 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
7245 conf->batch_bio_dispatch = false;
7246 break;
7247 }
7248 }
7249
7250 conf->bypass_threshold = BYPASS_THRESHOLD;
7251 conf->recovery_disabled = mddev->recovery_disabled - 1;
7252
7253 conf->raid_disks = mddev->raid_disks;
7254 if (mddev->reshape_position == MaxSector)
7255 conf->previous_raid_disks = mddev->raid_disks;
7256 else
7257 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7258 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7259
7260 conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
7261 GFP_KERNEL);
7262
7263 if (!conf->disks)
7264 goto abort;
7265
7266 for (i = 0; i < max_disks; i++) {
7267 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7268 if (!conf->disks[i].extra_page)
7269 goto abort;
7270 }
7271
7272 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
7273 if (ret)
7274 goto abort;
7275 conf->mddev = mddev;
7276
7277 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
		goto abort;

	/* We init hash_locks[0] separately so that it can be used
	 * as the reference lock in the spin_lock_nest_lock() call
	 * in lock_all_device_hash_locks_irq in order to convince
	 * lockdep that we know what we are doing.
	 */
7285 spin_lock_init(conf->hash_locks);
7286 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
7287 spin_lock_init(conf->hash_locks + i);
7288
7289 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7290 INIT_LIST_HEAD(conf->inactive_list + i);
7291
7292 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7293 INIT_LIST_HEAD(conf->temp_inactive_list + i);
7294
7295 atomic_set(&conf->r5c_cached_full_stripes, 0);
7296 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7297 atomic_set(&conf->r5c_cached_partial_stripes, 0);
7298 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7299 atomic_set(&conf->r5c_flushing_full_stripes, 0);
7300 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7301
7302 conf->level = mddev->new_level;
7303 conf->chunk_sectors = mddev->new_chunk_sectors;
7304 if (raid5_alloc_percpu(conf) != 0)
7305 goto abort;
7306
7307 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7308
7309 rdev_for_each(rdev, mddev) {
7310 raid_disk = rdev->raid_disk;
7311 if (raid_disk >= max_disks
7312 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7313 continue;
7314 disk = conf->disks + raid_disk;
7315
7316 if (test_bit(Replacement, &rdev->flags)) {
7317 if (disk->replacement)
7318 goto abort;
7319 disk->replacement = rdev;
7320 } else {
7321 if (disk->rdev)
7322 goto abort;
7323 disk->rdev = rdev;
7324 }
7325
7326 if (test_bit(In_sync, &rdev->flags)) {
7327 char b[BDEVNAME_SIZE];
7328 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
7329 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
		} else if (rdev->saved_raid_disk != raid_disk)
			/* Cannot rely on bitmap to complete recovery */
7332 conf->fullsync = 1;
7333 }
7334
7335 conf->level = mddev->new_level;
7336 if (conf->level == 6) {
7337 conf->max_degraded = 2;
7338 if (raid6_call.xor_syndrome)
7339 conf->rmw_level = PARITY_ENABLE_RMW;
7340 else
7341 conf->rmw_level = PARITY_DISABLE_RMW;
7342 } else {
7343 conf->max_degraded = 1;
7344 conf->rmw_level = PARITY_ENABLE_RMW;
7345 }
7346 conf->algorithm = mddev->new_layout;
7347 conf->reshape_progress = mddev->reshape_position;
7348 if (conf->reshape_progress != MaxSector) {
7349 conf->prev_chunk_sectors = mddev->chunk_sectors;
7350 conf->prev_algo = mddev->layout;
7351 } else {
7352 conf->prev_chunk_sectors = conf->chunk_sectors;
7353 conf->prev_algo = conf->algorithm;
7354 }
7355
7356 conf->min_nr_stripes = NR_STRIPES;
7357 if (mddev->reshape_position != MaxSector) {
7358 int stripes = max_t(int,
7359 ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7360 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
7361 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7362 if (conf->min_nr_stripes != NR_STRIPES)
7363 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7364 mdname(mddev), conf->min_nr_stripes);
7365 }
7366 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7367 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7368 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7369 if (grow_stripes(conf, conf->min_nr_stripes)) {
7370 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7371 mdname(mddev), memory);
7372 goto abort;
7373 } else
		pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);

	/*
	 * Losing a stripe head costs more than the time to refill it,
	 * it reduces the queue depth and so can hurt throughput.
	 * So set it rather large, scaled by number of devices.
	 */
7380 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7381 conf->shrinker.scan_objects = raid5_cache_scan;
7382 conf->shrinker.count_objects = raid5_cache_count;
7383 conf->shrinker.batch = 128;
7384 conf->shrinker.flags = 0;
7385 if (register_shrinker(&conf->shrinker)) {
7386 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7387 mdname(mddev));
7388 goto abort;
7389 }
7390
7391 sprintf(pers_name, "raid%d", mddev->new_level);
7392 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7393 if (!conf->thread) {
7394 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7395 mdname(mddev));
7396 goto abort;
7397 }
7398
7399 return conf;
7400
7401 abort:
7402 if (conf) {
7403 free_conf(conf);
7404 return ERR_PTR(-EIO);
7405 } else
7406 return ERR_PTR(-ENOMEM);
7407}
7408
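/*
 * Return 1 if slot @raid_disk holds only parity in layout @algo, in
 * which case a device that has not recovered past this point cannot
 * hold stale user data.
 */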
7409static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7410{
7411 switch (algo) {
7412 case ALGORITHM_PARITY_0:
7413 if (raid_disk < max_degraded)
7414 return 1;
7415 break;
7416 case ALGORITHM_PARITY_N:
7417 if (raid_disk >= raid_disks - max_degraded)
7418 return 1;
7419 break;
7420 case ALGORITHM_PARITY_0_6:
7421 if (raid_disk == 0 ||
7422 raid_disk == raid_disks - 1)
7423 return 1;
7424 break;
7425 case ALGORITHM_LEFT_ASYMMETRIC_6:
7426 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7427 case ALGORITHM_LEFT_SYMMETRIC_6:
7428 case ALGORITHM_RIGHT_SYMMETRIC_6:
7429 if (raid_disk == raid_disks - 1)
7430 return 1;
7431 }
7432 return 0;
7433}
7434
7435static void raid5_set_io_opt(struct r5conf *conf)
7436{
7437 blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7438 (conf->raid_disks - conf->max_degraded));
7439}
7440
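/*
 * Assemble and start the array: sanity-check any in-progress reshape,
 * obtain the r5conf, count working and dirty-parity devices, and set
 * up the request queue limits, including discard support.
 */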
7441static int raid5_run(struct mddev *mddev)
7442{
7443 struct r5conf *conf;
7444 int working_disks = 0;
7445 int dirty_parity_disks = 0;
7446 struct md_rdev *rdev;
7447 struct md_rdev *journal_dev = NULL;
7448 sector_t reshape_offset = 0;
7449 int i;
7450 long long min_offset_diff = 0;
7451 int first = 1;
7452
7453 if (mddev_init_writes_pending(mddev) < 0)
7454 return -ENOMEM;
7455
7456 if (mddev->recovery_cp != MaxSector)
7457 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7458 mdname(mddev));
7459
7460 rdev_for_each(rdev, mddev) {
7461 long long diff;
7462
7463 if (test_bit(Journal, &rdev->flags)) {
7464 journal_dev = rdev;
7465 continue;
7466 }
7467 if (rdev->raid_disk < 0)
7468 continue;
7469 diff = (rdev->new_data_offset - rdev->data_offset);
7470 if (first) {
7471 min_offset_diff = diff;
7472 first = 0;
7473 } else if (mddev->reshape_backwards &&
7474 diff < min_offset_diff)
7475 min_offset_diff = diff;
7476 else if (!mddev->reshape_backwards &&
7477 diff > min_offset_diff)
7478 min_offset_diff = diff;
7479 }
7480
7481 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7482 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7483 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7484 mdname(mddev));
7485 return -EINVAL;
7486 }
7487
	if (mddev->reshape_position != MaxSector) {
		/* Check that we can continue the reshape.
		 * Difficulties arise if the stripe we would write to
		 * next is at or after the stripe we would read from next.
		 * For a reshape that changes the number of devices, this
		 * is only possible for a very short time, and mdadm makes
		 * sure that time appears to have passed before assembling
		 * the array.  So we fail if that time hasn't passed.
		 * For a reshape that keeps the number of devices the same,
		 * mdadm must be monitoring the reshape and keeping the
		 * critical areas read-only and backed up.  It will start
		 * the array in read-only mode, so we check for that.
		 */
7501 sector_t here_new, here_old;
7502 int old_disks;
7503 int max_degraded = (mddev->level == 6 ? 2 : 1);
7504 int chunk_sectors;
7505 int new_data_disks;
7506
7507 if (journal_dev) {
7508 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7509 mdname(mddev));
7510 return -EINVAL;
7511 }
7512
7513 if (mddev->new_level != mddev->level) {
7514 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7515 mdname(mddev));
7516 return -EINVAL;
7517 }
		old_disks = mddev->raid_disks - mddev->delta_disks;
		/* reshape_position must be on a new-stripe boundary, and one
		 * further up in new geometry must map after here in old
		 * geometry.
		 * If the chunk sizes are different, then as we perform reshape
		 * in units of the largest of the two, reshape_position needs
		 * to be on a chunk boundary of both the old and new
		 * chunk sizes.
		 */
7526 here_new = mddev->reshape_position;
7527 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7528 new_data_disks = mddev->raid_disks - max_degraded;
7529 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7530 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7531 mdname(mddev));
7532 return -EINVAL;
7533 }
		reshape_offset = here_new * chunk_sectors;
		/* here_new is the stripe we will write to */
7536 here_old = mddev->reshape_position;
		sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
		/* here_old is the first stripe that we might need to read
		 * from */
		if (mddev->delta_disks == 0) {
			/* We cannot be sure it is safe to start an in-place
			 * reshape.  It is only safe if user-space is monitoring
			 * and taking constant backups.
			 * mdadm always starts a situation like this in
			 * readonly mode so it can take control before
			 * allowing any writes.  So just check for that.
			 */
7548 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7549 abs(min_offset_diff) >= mddev->new_chunk_sectors)
				/* not really in-place - so OK */;
7551 else if (mddev->ro == 0) {
7552 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7553 mdname(mddev));
7554 return -EINVAL;
7555 }
7556 } else if (mddev->reshape_backwards
7557 ? (here_new * chunk_sectors + min_offset_diff <=
7558 here_old * chunk_sectors)
7559 : (here_new * chunk_sectors >=
			   here_old * chunk_sectors + (-min_offset_diff))) {
			/* Reading from the same stripe as writing to - bad */
7562 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7563 mdname(mddev));
7564 return -EINVAL;
7565 }
7566 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7567
7568 } else {
7569 BUG_ON(mddev->level != mddev->new_level);
7570 BUG_ON(mddev->layout != mddev->new_layout);
7571 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7572 BUG_ON(mddev->delta_disks != 0);
7573 }
7574
7575 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7576 test_bit(MD_HAS_PPL, &mddev->flags)) {
7577 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7578 mdname(mddev));
7579 clear_bit(MD_HAS_PPL, &mddev->flags);
7580 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7581 }
7582
7583 if (mddev->private == NULL)
7584 conf = setup_conf(mddev);
7585 else
7586 conf = mddev->private;
7587
7588 if (IS_ERR(conf))
7589 return PTR_ERR(conf);
7590
7591 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7592 if (!journal_dev) {
7593 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7594 mdname(mddev));
7595 mddev->ro = 1;
7596 set_disk_ro(mddev->gendisk, 1);
7597 } else if (mddev->recovery_cp == MaxSector)
7598 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7599 }
7600
7601 conf->min_offset_diff = min_offset_diff;
7602 mddev->thread = conf->thread;
7603 conf->thread = NULL;
7604 mddev->private = conf;
7605
7606 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7607 i++) {
7608 rdev = conf->disks[i].rdev;
		if (!rdev && conf->disks[i].replacement) {
			/* The replacement is all we have yet */
7611 rdev = conf->disks[i].replacement;
7612 conf->disks[i].replacement = NULL;
7613 clear_bit(Replacement, &rdev->flags);
7614 conf->disks[i].rdev = rdev;
7615 }
7616 if (!rdev)
7617 continue;
7618 if (conf->disks[i].replacement &&
		    conf->reshape_progress != MaxSector) {
			/* replacements and reshape simply do not mix. */
7621 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7622 goto abort;
7623 }
7624 if (test_bit(In_sync, &rdev->flags)) {
7625 working_disks++;
7626 continue;
		}
		/* This disc is not fully in-sync.  However if it
		 * just stored parity (beyond the recovery_offset),
		 * then we don't need to be concerned about the
		 * array being dirty.
		 * When reshape goes 'backwards', we never have
		 * partially completed devices, so we only need
		 * to worry about reshape going forwards.
		 */
		/* Hack because v0.91 doesn't store recovery_offset properly. */
7637 if (mddev->major_version == 0 &&
7638 mddev->minor_version > 90)
7639 rdev->recovery_offset = reshape_offset;
7640
		if (rdev->recovery_offset < reshape_offset) {
			/* We need to check old and new layout */
7643 if (!only_parity(rdev->raid_disk,
7644 conf->algorithm,
7645 conf->raid_disks,
7646 conf->max_degraded))
7647 continue;
7648 }
7649 if (!only_parity(rdev->raid_disk,
7650 conf->prev_algo,
7651 conf->previous_raid_disks,
7652 conf->max_degraded))
7653 continue;
7654 dirty_parity_disks++;
	}

	/*
	 * 0 for a fully functional array, 1 or 2 for a degraded array.
	 */
7660 mddev->degraded = raid5_calc_degraded(conf);
7661
7662 if (has_failed(conf)) {
7663 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7664 mdname(mddev), mddev->degraded, conf->raid_disks);
7665 goto abort;
	}

	/* device size must be a multiple of chunk size */
7669 mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
7670 mddev->resync_max_sectors = mddev->dev_sectors;
7671
7672 if (mddev->degraded > dirty_parity_disks &&
7673 mddev->recovery_cp != MaxSector) {
7674 if (test_bit(MD_HAS_PPL, &mddev->flags))
7675 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7676 mdname(mddev));
7677 else if (mddev->ok_start_degraded)
7678 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7679 mdname(mddev));
7680 else {
7681 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7682 mdname(mddev));
7683 goto abort;
7684 }
7685 }
7686
7687 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7688 mdname(mddev), conf->level,
7689 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7690 mddev->new_layout);
7691
7692 print_raid5_conf(conf);
7693
7694 if (conf->reshape_progress != MaxSector) {
7695 conf->reshape_safe = conf->reshape_progress;
7696 atomic_set(&conf->reshape_stripes, 0);
7697 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7698 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7699 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7700 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7701 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7702 "reshape");
7703 if (!mddev->sync_thread)
7704 goto abort;
	}

	/* Ok, everything is just fine now */
7708 if (mddev->to_remove == &raid5_attrs_group)
7709 mddev->to_remove = NULL;
7710 else if (mddev->kobj.sd &&
7711 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7712 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7713 mdname(mddev));
7714 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7715
7716 if (mddev->queue) {
		int chunk_size;
		/* read-ahead size must cover two whole stripes, which
		 * is 2 * (datadisks) * chunksize, where 'datadisks' is
		 * the number of non-parity devices.
		 */
7722 int data_disks = conf->previous_raid_disks - conf->max_degraded;
7723 int stripe = data_disks *
7724 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7725
7726 chunk_size = mddev->chunk_sectors << 9;
7727 blk_queue_io_min(mddev->queue, chunk_size);
7728 raid5_set_io_opt(conf);
		mddev->queue->limits.raid_partial_stripes_expensive = 1;
		/*
		 * We can only discard a whole stripe. It doesn't make sense to
		 * discard data disk but write parity disk
		 */
		stripe = stripe * PAGE_SIZE;
		/* Round up to power of 2, as discard granularity is
		 * a power of 2 */
7737 while ((stripe-1) & stripe)
7738 stripe = (stripe | (stripe-1)) + 1;
7739 mddev->queue->limits.discard_alignment = stripe;
7740 mddev->queue->limits.discard_granularity = stripe;
7741
7742 blk_queue_max_write_same_sectors(mddev->queue, 0);
7743 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7744
7745 rdev_for_each(rdev, mddev) {
7746 disk_stack_limits(mddev->gendisk, rdev->bdev,
7747 rdev->data_offset << 9);
7748 disk_stack_limits(mddev->gendisk, rdev->bdev,
7749 rdev->new_data_offset << 9);
		}

		/*
		 * zeroing is required, otherwise data
		 * could be lost. Consider a scenario: discard a stripe
		 * (the stripe could be inconsistent if
		 * discard_zeroes_data is 0); write one disk of the
		 * stripe (the stripe could be inconsistent again
		 * depending on which disks are used to calculate
		 * parity); the disk is broken; the stripe data of this
		 * disk is lost.
		 *
		 * We only allow DISCARD if the sysadmin has confirmed that
		 * only safe devices are in use by setting the module
		 * parameter devices_handle_discard_safely.
		 */
7767 if (devices_handle_discard_safely &&
7768 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7769 mddev->queue->limits.discard_granularity >= stripe)
7770 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
7771 mddev->queue);
7772 else
7773 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
7774 mddev->queue);
7775
7776 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7777 }
7778
7779 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7780 goto abort;
7781
7782 return 0;
7783abort:
7784 md_unregister_thread(&mddev->thread);
7785 print_raid5_conf(conf);
7786 free_conf(conf);
7787 mddev->private = NULL;
7788 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7789 return -EIO;
7790}
7791
7792static void raid5_free(struct mddev *mddev, void *priv)
7793{
7794 struct r5conf *conf = priv;
7795
7796 free_conf(conf);
7797 mddev->to_remove = &raid5_attrs_group;
7798}
7799
7800static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7801{
7802 struct r5conf *conf = mddev->private;
7803 int i;
7804
7805 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7806 conf->chunk_sectors / 2, mddev->layout);
	seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7808 rcu_read_lock();
7809 for (i = 0; i < conf->raid_disks; i++) {
7810 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7812 }
7813 rcu_read_unlock();
	seq_printf(seq, "]");
7815}
7816
7817static void print_raid5_conf (struct r5conf *conf)
7818{
7819 int i;
7820 struct disk_info *tmp;
7821
7822 pr_debug("RAID conf printout:\n");
7823 if (!conf) {
7824 pr_debug("(conf==NULL)\n");
7825 return;
7826 }
7827 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7828 conf->raid_disks,
7829 conf->raid_disks - conf->mddev->degraded);
7830
7831 for (i = 0; i < conf->raid_disks; i++) {
7832 char b[BDEVNAME_SIZE];
7833 tmp = conf->disks + i;
7834 if (tmp->rdev)
7835 pr_debug(" disk %d, o:%d, dev:%s\n",
7836 i, !test_bit(Faulty, &tmp->rdev->flags),
7837 bdevname(tmp->rdev->bdev, b));
7838 }
7839}
7840
7841static int raid5_spare_active(struct mddev *mddev)
7842{
7843 int i;
7844 struct r5conf *conf = mddev->private;
7845 struct disk_info *tmp;
7846 int count = 0;
7847 unsigned long flags;
7848
7849 for (i = 0; i < conf->raid_disks; i++) {
7850 tmp = conf->disks + i;
7851 if (tmp->replacement
7852 && tmp->replacement->recovery_offset == MaxSector
7853 && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active. */
7856 if (!tmp->rdev
7857 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7858 count++;
			if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
7864 set_bit(Faulty, &tmp->rdev->flags);
7865 sysfs_notify_dirent_safe(
7866 tmp->rdev->sysfs_state);
7867 }
7868 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7869 } else if (tmp->rdev
7870 && tmp->rdev->recovery_offset == MaxSector
7871 && !test_bit(Faulty, &tmp->rdev->flags)
7872 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7873 count++;
7874 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7875 }
7876 }
7877 spin_lock_irqsave(&conf->device_lock, flags);
7878 mddev->degraded = raid5_calc_degraded(conf);
7879 spin_unlock_irqrestore(&conf->device_lock, flags);
7880 print_raid5_conf(conf);
7881 return count;
7882}
7883
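/*
 * Hot-remove @rdev.  A journal device can only be removed once no
 * stripes still reference the log; a data device must be failed or
 * fully replaced and have no pending I/O.
 */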
7884static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7885{
7886 struct r5conf *conf = mddev->private;
7887 int err = 0;
7888 int number = rdev->raid_disk;
7889 struct md_rdev **rdevp;
7890 struct disk_info *p = conf->disks + number;
7891
7892 print_raid5_conf(conf);
	if (test_bit(Journal, &rdev->flags) && conf->log) {
		/*
		 * We can't wait for pending writes here, as this is called
		 * from raid5d and waiting would deadlock.
		 * There is no locking against new writes here either; what
		 * does removal mean for a disk that is still being written to?
		 */
7900 if (atomic_read(&conf->active_stripes) ||
7901 atomic_read(&conf->r5c_cached_full_stripes) ||
7902 atomic_read(&conf->r5c_cached_partial_stripes)) {
7903 return -EBUSY;
7904 }
7905 log_exit(conf);
7906 return 0;
7907 }
7908 if (rdev == p->rdev)
7909 rdevp = &p->rdev;
7910 else if (rdev == p->replacement)
7911 rdevp = &p->replacement;
7912 else
7913 return 0;
7914
7915 if (number >= conf->raid_disks &&
7916 conf->reshape_progress == MaxSector)
7917 clear_bit(In_sync, &rdev->flags);
7918
7919 if (test_bit(In_sync, &rdev->flags) ||
7920 atomic_read(&rdev->nr_pending)) {
7921 err = -EBUSY;
7922 goto abort;
	}

	/* Only remove non-faulty devices if recovery
	 * isn't possible. */
7927 if (!test_bit(Faulty, &rdev->flags) &&
7928 mddev->recovery_disabled != conf->recovery_disabled &&
7929 !has_failed(conf) &&
7930 (!p->replacement || p->replacement == rdev) &&
7931 number < conf->raid_disks) {
7932 err = -EBUSY;
7933 goto abort;
7934 }
7935 *rdevp = NULL;
7936 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7937 synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
7940 err = -EBUSY;
7941 *rdevp = rdev;
7942 }
7943 }
7944 if (!err) {
7945 err = log_modify(conf, rdev, false);
7946 if (err)
7947 goto abort;
7948 }
	if (p->replacement) {
		/* We must have just cleared 'rdev' */
7951 p->rdev = p->replacement;
7952 clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither - if they are careful
			   */
7956 p->replacement = NULL;
7957
7958 if (!err)
7959 err = log_modify(conf, p->rdev, true);
7960 }
7961
7962 clear_bit(WantReplacement, &rdev->flags);
7963abort:
7964
7965 print_raid5_conf(conf);
7966 return err;
7967}
7968
7969static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7970{
7971 struct r5conf *conf = mddev->private;
7972 int ret, err = -EEXIST;
7973 int disk;
7974 struct disk_info *p;
7975 int first = 0;
7976 int last = conf->raid_disks - 1;
7977
7978 if (test_bit(Journal, &rdev->flags)) {
7979 if (conf->log)
7980 return -EBUSY;
7981
		rdev->raid_disk = 0;
		/*
		 * The array is in readonly mode if journal is missing, so no
		 * write requests running. We should be safe
		 */
7987 ret = log_init(conf, rdev, false);
7988 if (ret)
7989 return ret;
7990
7991 ret = r5l_start(conf->log);
7992 if (ret)
7993 return ret;
7994
7995 return 0;
7996 }
7997 if (mddev->recovery_disabled == conf->recovery_disabled)
7998 return -EBUSY;
7999
	if (rdev->saved_raid_disk < 0 && has_failed(conf))
		/* no point adding a device */
8002 return -EINVAL;
8003
8004 if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/*
	 * find the disk ... but prefer rdev->saved_raid_disk
	 * if possible.
	 */
8011 if (rdev->saved_raid_disk >= 0 &&
8012 rdev->saved_raid_disk >= first &&
8013 conf->disks[rdev->saved_raid_disk].rdev == NULL)
8014 first = rdev->saved_raid_disk;
8015
8016 for (disk = first; disk <= last; disk++) {
8017 p = conf->disks + disk;
8018 if (p->rdev == NULL) {
8019 clear_bit(In_sync, &rdev->flags);
8020 rdev->raid_disk = disk;
8021 if (rdev->saved_raid_disk != disk)
8022 conf->fullsync = 1;
8023 rcu_assign_pointer(p->rdev, rdev);
8024
8025 err = log_modify(conf, rdev, true);
8026
8027 goto out;
8028 }
8029 }
8030 for (disk = first; disk <= last; disk++) {
8031 p = conf->disks + disk;
8032 if (test_bit(WantReplacement, &p->rdev->flags) &&
8033 p->replacement == NULL) {
8034 clear_bit(In_sync, &rdev->flags);
8035 set_bit(Replacement, &rdev->flags);
8036 rdev->raid_disk = disk;
8037 err = 0;
8038 conf->fullsync = 1;
8039 rcu_assign_pointer(p->replacement, rdev);
8040 break;
8041 }
8042 }
8043out:
8044 print_raid5_conf(conf);
8045 return err;
8046}
8047
8048static int raid5_resize(struct mddev *mddev, sector_t sectors)
{
	/* no resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
8057 sector_t newsize;
8058 struct r5conf *conf = mddev->private;
8059
8060 if (raid5_has_log(conf) || raid5_has_ppl(conf))
8061 return -EINVAL;
8062 sectors &= ~((sector_t)conf->chunk_sectors - 1);
8063 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
8064 if (mddev->external_size &&
8065 mddev->array_sectors > newsize)
8066 return -EINVAL;
8067 if (mddev->bitmap) {
8068 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
8069 if (ret)
8070 return ret;
8071 }
8072 md_set_array_sectors(mddev, newsize);
8073 if (sectors > mddev->dev_sectors &&
8074 mddev->recovery_cp > mddev->dev_sectors) {
8075 mddev->recovery_cp = mddev->dev_sectors;
8076 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8077 }
8078 mddev->dev_sectors = sectors;
8079 mddev->resync_max_sectors = sectors;
8080 return 0;
8081}
8082
8083static int check_stripe_cache(struct mddev *mddev)
{
	/* Can only proceed if there are plenty of stripe_heads.
	 * We need a minimum of one full stripe, and for sensible progress
	 * it is best to have about 4 times that.
	 * If we require 4 times, then the default 256 is not enough, so
	 * increase the minimum cache size with the chunk size, if that
	 * is larger.
	 */
8093 struct r5conf *conf = mddev->private;
8094 if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8095 > conf->min_nr_stripes ||
8096 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8097 > conf->min_nr_stripes) {
8098 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
8099 mdname(mddev),
8100 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
8101 / RAID5_STRIPE_SIZE(conf))*4);
8102 return 0;
8103 }
8104 return 1;
8105}
8106
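/*
 * Validate a requested reshape, and pre-allocate the larger scratch
 * buffers and stripe pool the new geometry will need.
 */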
8107static int check_reshape(struct mddev *mddev)
8108{
8109 struct r5conf *conf = mddev->private;
8110
8111 if (raid5_has_log(conf) || raid5_has_ppl(conf))
8112 return -EINVAL;
8113 if (mddev->delta_disks == 0 &&
8114 mddev->new_layout == mddev->layout &&
8115 mddev->new_chunk_sectors == mddev->chunk_sectors)
8116 return 0;
8117 if (has_failed(conf))
8118 return -EINVAL;
	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum
		 */
8125 int min = 2;
8126 if (mddev->level == 6)
8127 min = 4;
8128 if (mddev->raid_disks + mddev->delta_disks < min)
8129 return -EINVAL;
8130 }
8131
8132 if (!check_stripe_cache(mddev))
8133 return -ENOSPC;
8134
8135 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
8136 mddev->delta_disks > 0)
8137 if (resize_chunks(conf,
8138 conf->previous_raid_disks
8139 + max(0, mddev->delta_disks),
8140 max(mddev->new_chunk_sectors,
8141 mddev->chunk_sectors)
8142 ) < 0)
8143 return -ENOMEM;
8144
8145 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8146 return 0;
8147 return resize_stripes(conf, (conf->previous_raid_disks
8148 + mddev->delta_disks));
8149}
8150
8151static int raid5_start_reshape(struct mddev *mddev)
8152{
8153 struct r5conf *conf = mddev->private;
8154 struct md_rdev *rdev;
8155 int spares = 0;
8156 unsigned long flags;
8157
8158 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8159 return -EBUSY;
8160
8161 if (!check_stripe_cache(mddev))
8162 return -ENOSPC;
8163
8164 if (has_failed(conf))
8165 return -EINVAL;
8166
8167 rdev_for_each(rdev, mddev) {
8168 if (!test_bit(In_sync, &rdev->flags)
8169 && !test_bit(Faulty, &rdev->flags))
8170 spares++;
8171 }
8172
	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size
		 */
		return -EINVAL;

	/* Refuse to reduce size of the array.  Any reductions in
	 * array size must be through explicit setting of array_size
	 * attribute.
	 */
8183 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
8184 < mddev->array_sectors) {
8185 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8186 mdname(mddev));
8187 return -EINVAL;
8188 }
8189
8190 atomic_set(&conf->reshape_stripes, 0);
8191 spin_lock_irq(&conf->device_lock);
8192 write_seqcount_begin(&conf->gen_lock);
8193 conf->previous_raid_disks = conf->raid_disks;
8194 conf->raid_disks += mddev->delta_disks;
8195 conf->prev_chunk_sectors = conf->chunk_sectors;
8196 conf->chunk_sectors = mddev->new_chunk_sectors;
8197 conf->prev_algo = conf->algorithm;
8198 conf->algorithm = mddev->new_layout;
	conf->generation++;
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier needed.
	 */
8203 smp_mb();
8204 if (mddev->reshape_backwards)
8205 conf->reshape_progress = raid5_size(mddev, 0, 0);
8206 else
8207 conf->reshape_progress = 0;
8208 conf->reshape_safe = conf->reshape_progress;
8209 write_seqcount_end(&conf->gen_lock);
	spin_unlock_irq(&conf->device_lock);

	/* Now make sure any requests that proceeded on the assumption
	 * the reshape wasn't running - like Discard or Read - have
	 * completed.
	 */
8216 mddev_suspend(mddev);
	mddev_resume(mddev);

	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "end" of the resync of each device,
	 * so when reducing devices the resync must instead run backwards.
	 */
8226 if (mddev->delta_disks >= 0) {
8227 rdev_for_each(rdev, mddev)
8228 if (rdev->raid_disk < 0 &&
8229 !test_bit(Faulty, &rdev->flags)) {
8230 if (raid5_add_disk(mddev, rdev) == 0) {
8231 if (rdev->raid_disk
8232 >= conf->previous_raid_disks)
8233 set_bit(In_sync, &rdev->flags);
8234 else
					rdev->recovery_offset = 0;

				/* Failure here is OK */
8238 sysfs_link_rdev(mddev, rdev);
8239 }
8240 } else if (rdev->raid_disk >= conf->previous_raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
8243 set_bit(In_sync, &rdev->flags);
			}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
		 */
8250 spin_lock_irqsave(&conf->device_lock, flags);
8251 mddev->degraded = raid5_calc_degraded(conf);
8252 spin_unlock_irqrestore(&conf->device_lock, flags);
8253 }
8254 mddev->raid_disks = conf->raid_disks;
8255 mddev->reshape_position = conf->reshape_progress;
8256 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8257
8258 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8259 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8260 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8261 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8262 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8263 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
8264 "reshape");
8265 if (!mddev->sync_thread) {
8266 mddev->recovery = 0;
8267 spin_lock_irq(&conf->device_lock);
8268 write_seqcount_begin(&conf->gen_lock);
8269 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8270 mddev->new_chunk_sectors =
8271 conf->chunk_sectors = conf->prev_chunk_sectors;
8272 mddev->new_layout = conf->algorithm = conf->prev_algo;
8273 rdev_for_each(rdev, mddev)
8274 rdev->new_data_offset = rdev->data_offset;
8275 smp_wmb();
		conf->generation--;
8277 conf->reshape_progress = MaxSector;
8278 mddev->reshape_position = MaxSector;
8279 write_seqcount_end(&conf->gen_lock);
8280 spin_unlock_irq(&conf->device_lock);
8281 return -EAGAIN;
8282 }
8283 conf->reshape_checkpoint = jiffies;
8284 md_wakeup_thread(mddev->sync_thread);
8285 md_new_event(mddev);
8286 return 0;
}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
8292static void end_reshape(struct r5conf *conf)
8293{
8294
8295 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8296 struct md_rdev *rdev;
8297
8298 spin_lock_irq(&conf->device_lock);
8299 conf->previous_raid_disks = conf->raid_disks;
8300 md_finish_reshape(conf->mddev);
8301 smp_wmb();
8302 conf->reshape_progress = MaxSector;
8303 conf->mddev->reshape_position = MaxSector;
8304 rdev_for_each(rdev, conf->mddev)
8305 if (rdev->raid_disk >= 0 &&
8306 !test_bit(Journal, &rdev->flags) &&
8307 !test_bit(In_sync, &rdev->flags))
8308 rdev->recovery_offset = MaxSector;
8309 spin_unlock_irq(&conf->device_lock);
8310 wake_up(&conf->wait_for_overlap);
8311
8312 if (conf->mddev->queue)
8313 raid5_set_io_opt(conf);
8314 }
}

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
8320static void raid5_finish_reshape(struct mddev *mddev)
8321{
8322 struct r5conf *conf = mddev->private;
8323
8324 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8325
8326 if (mddev->delta_disks <= 0) {
8327 int d;
8328 spin_lock_irq(&conf->device_lock);
8329 mddev->degraded = raid5_calc_degraded(conf);
8330 spin_unlock_irq(&conf->device_lock);
8331 for (d = conf->raid_disks ;
8332 d < conf->raid_disks - mddev->delta_disks;
8333 d++) {
8334 struct md_rdev *rdev = conf->disks[d].rdev;
8335 if (rdev)
8336 clear_bit(In_sync, &rdev->flags);
8337 rdev = conf->disks[d].replacement;
8338 if (rdev)
8339 clear_bit(In_sync, &rdev->flags);
8340 }
8341 }
8342 mddev->layout = conf->algorithm;
8343 mddev->chunk_sectors = conf->chunk_sectors;
8344 mddev->reshape_position = MaxSector;
8345 mddev->delta_disks = 0;
8346 mddev->reshape_backwards = 0;
8347 }
8348}
8349
8350static void raid5_quiesce(struct mddev *mddev, int quiesce)
8351{
8352 struct r5conf *conf = mddev->private;
8353
	if (quiesce) {
		/* stop all writes */
		lock_all_device_hash_locks_irq(conf);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		r5c_flush_cache(conf, INT_MAX);
		/* need a memory barrier to make sure read_one_chunk() sees
		 * quiesce started and reverts to slow (locked) path.
		 */
8364 smp_store_release(&conf->quiesce, 2);
8365 wait_event_cmd(conf->wait_for_quiescent,
8366 atomic_read(&conf->active_stripes) == 0 &&
8367 atomic_read(&conf->active_aligned_reads) == 0,
8368 unlock_all_device_hash_locks_irq(conf),
8369 lock_all_device_hash_locks_irq(conf));
8370 conf->quiesce = 1;
		unlock_all_device_hash_locks_irq(conf);
		/* allow reshape to continue */
8373 wake_up(&conf->wait_for_overlap);
	} else {
		/* re-enable writes */
8376 lock_all_device_hash_locks_irq(conf);
8377 conf->quiesce = 0;
8378 wake_up(&conf->wait_for_quiescent);
8379 wake_up(&conf->wait_for_overlap);
8380 unlock_all_device_hash_locks_irq(conf);
8381 }
8382 log_quiesce(conf, quiesce);
8383}
8384
8385static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8386{
8387 struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
8391 if (raid0_conf->nr_strip_zones > 1) {
8392 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8393 mdname(mddev));
8394 return ERR_PTR(-EINVAL);
8395 }
8396
8397 sectors = raid0_conf->strip_zone[0].zone_end;
8398 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8399 mddev->dev_sectors = sectors;
8400 mddev->new_level = level;
8401 mddev->new_layout = ALGORITHM_PARITY_N;
8402 mddev->new_chunk_sectors = mddev->chunk_sectors;
8403 mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will be not marked as dirty */
8406 mddev->recovery_cp = MaxSector;
8407
8408 return setup_conf(mddev);
8409}
8410
8411static void *raid5_takeover_raid1(struct mddev *mddev)
8412{
8413 int chunksect;
8414 void *ret;
8415
8416 if (mddev->raid_disks != 2 ||
8417 mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
8425 while (chunksect && (mddev->array_sectors & (chunksect-1)))
8426 chunksect >>= 1;
8427
	if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
		/* array size does not allow a suitable chunk size */
8430 return ERR_PTR(-EINVAL);
8431
8432 mddev->new_level = 5;
8433 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8434 mddev->new_chunk_sectors = chunksect;
8435
8436 ret = setup_conf(mddev);
8437 if (!IS_ERR(ret))
8438 mddev_clear_unsupported_flags(mddev,
8439 UNSUPPORTED_MDDEV_FLAGS);
8440 return ret;
8441}
8442
8443static void *raid5_takeover_raid6(struct mddev *mddev)
8444{
8445 int new_layout;
8446
8447 switch (mddev->layout) {
8448 case ALGORITHM_LEFT_ASYMMETRIC_6:
8449 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8450 break;
8451 case ALGORITHM_RIGHT_ASYMMETRIC_6:
8452 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8453 break;
8454 case ALGORITHM_LEFT_SYMMETRIC_6:
8455 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8456 break;
8457 case ALGORITHM_RIGHT_SYMMETRIC_6:
8458 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8459 break;
8460 case ALGORITHM_PARITY_0_6:
8461 new_layout = ALGORITHM_PARITY_0;
8462 break;
8463 case ALGORITHM_PARITY_N:
8464 new_layout = ALGORITHM_PARITY_N;
8465 break;
8466 default:
8467 return ERR_PTR(-EINVAL);
8468 }
8469 mddev->new_level = 5;
8470 mddev->new_layout = new_layout;
8471 mddev->delta_disks = -1;
8472 mddev->raid_disks -= 1;
8473 return setup_conf(mddev);
8474}
8475
8476static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation
	 * to be used by a reshape pass.
	 */
8483 struct r5conf *conf = mddev->private;
8484 int new_chunk = mddev->new_chunk_sectors;
8485
8486 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8487 return -EINVAL;
8488 if (new_chunk > 0) {
8489 if (!is_power_of_2(new_chunk))
8490 return -EINVAL;
8491 if (new_chunk < (PAGE_SIZE>>9))
8492 return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
8495 return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
8502 if (mddev->new_layout >= 0) {
8503 conf->algorithm = mddev->new_layout;
8504 mddev->layout = mddev->new_layout;
8505 }
8506 if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
8508 mddev->chunk_sectors = new_chunk;
8509 }
8510 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8511 md_wakeup_thread(mddev->thread);
8512 }
8513 return check_reshape(mddev);
8514}
8515
8516static int raid6_check_reshape(struct mddev *mddev)
8517{
8518 int new_chunk = mddev->new_chunk_sectors;
8519
8520 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8521 return -EINVAL;
8522 if (new_chunk > 0) {
8523 if (!is_power_of_2(new_chunk))
8524 return -EINVAL;
8525 if (new_chunk < (PAGE_SIZE >> 9))
8526 return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
8529 return -EINVAL;
	}

	/* They look valid */
8533 return check_reshape(mddev);
8534}
8535
8536static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
8544 if (mddev->level == 0)
8545 return raid45_takeover_raid0(mddev, 5);
8546 if (mddev->level == 1)
8547 return raid5_takeover_raid1(mddev);
8548 if (mddev->level == 4) {
8549 mddev->new_layout = ALGORITHM_PARITY_N;
8550 mddev->new_level = 5;
8551 return setup_conf(mddev);
8552 }
8553 if (mddev->level == 6)
8554 return raid5_takeover_raid6(mddev);
8555
8556 return ERR_PTR(-EINVAL);
8557}
8558
8559static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
8565 if (mddev->level == 0)
8566 return raid45_takeover_raid0(mddev, 4);
8567 if (mddev->level == 5 &&
8568 mddev->layout == ALGORITHM_PARITY_N) {
8569 mddev->new_layout = 0;
8570 mddev->new_level = 4;
8571 return setup_conf(mddev);
8572 }
8573 return ERR_PTR(-EINVAL);
8574}
8575
8576static struct md_personality raid5_personality;
8577
8578static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
8584 int new_layout;
8585
8586 if (mddev->pers != &raid5_personality)
8587 return ERR_PTR(-EINVAL);
8588 if (mddev->degraded > 1)
8589 return ERR_PTR(-EINVAL);
8590 if (mddev->raid_disks > 253)
8591 return ERR_PTR(-EINVAL);
8592 if (mddev->raid_disks < 3)
8593 return ERR_PTR(-EINVAL);
8594
8595 switch (mddev->layout) {
8596 case ALGORITHM_LEFT_ASYMMETRIC:
8597 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8598 break;
8599 case ALGORITHM_RIGHT_ASYMMETRIC:
8600 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8601 break;
8602 case ALGORITHM_LEFT_SYMMETRIC:
8603 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8604 break;
8605 case ALGORITHM_RIGHT_SYMMETRIC:
8606 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8607 break;
8608 case ALGORITHM_PARITY_0:
8609 new_layout = ALGORITHM_PARITY_0_6;
8610 break;
8611 case ALGORITHM_PARITY_N:
8612 new_layout = ALGORITHM_PARITY_N;
8613 break;
8614 default:
8615 return ERR_PTR(-EINVAL);
8616 }
8617 mddev->new_level = 6;
8618 mddev->new_layout = new_layout;
8619 mddev->delta_disks = 1;
8620 mddev->raid_disks += 1;
8621 return setup_conf(mddev);
8622}
8623
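/*
 * Switch the consistency policy between "ppl" and "resync" at runtime,
 * setting up or tearing down the PPL log and resizing the stripe pool
 * to match.
 */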
8624static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8625{
8626 struct r5conf *conf;
8627 int err;
8628
8629 err = mddev_lock(mddev);
8630 if (err)
8631 return err;
8632 conf = mddev->private;
8633 if (!conf) {
8634 mddev_unlock(mddev);
8635 return -ENODEV;
8636 }
8637
	if (strncmp(buf, "ppl", 3) == 0) {
		/* ppl only works with RAID 5 */
8640 if (!raid5_has_ppl(conf) && conf->level == 5) {
8641 err = log_init(conf, NULL, true);
8642 if (!err) {
8643 err = resize_stripes(conf, conf->pool_size);
8644 if (err)
8645 log_exit(conf);
8646 }
8647 } else
8648 err = -EINVAL;
8649 } else if (strncmp(buf, "resync", 6) == 0) {
8650 if (raid5_has_ppl(conf)) {
8651 mddev_suspend(mddev);
8652 log_exit(conf);
8653 mddev_resume(mddev);
8654 err = resize_stripes(conf, conf->pool_size);
8655 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8656 r5l_log_disk_error(conf)) {
8657 bool journal_dev_exists = false;
8658 struct md_rdev *rdev;
8659
8660 rdev_for_each(rdev, mddev)
8661 if (test_bit(Journal, &rdev->flags)) {
8662 journal_dev_exists = true;
8663 break;
8664 }
8665
8666 if (!journal_dev_exists) {
8667 mddev_suspend(mddev);
8668 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8669 mddev_resume(mddev);
8670 } else
8671 err = -EBUSY;
8672 } else
8673 err = -EINVAL;
8674 } else {
8675 err = -EINVAL;
8676 }
8677
8678 if (!err)
8679 md_update_sb(mddev, 1);
8680
8681 mddev_unlock(mddev);
8682
8683 return err;
8684}
8685
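/* Kick the raid5 journal, if one is configured, before the array goes live */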
static int raid5_start(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	return r5l_start(conf->log);
}

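/*
 * The raid4/5/6 personalities share nearly all of their methods; apart
 * from name and level they differ only in the check_reshape variant and
 * the takeover handler.
 */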
static struct md_personality raid6_personality =
{
	.name = "raid6",
	.level = 6,
	.owner = THIS_MODULE,
	.make_request = raid5_make_request,
	.run = raid5_run,
	.start = raid5_start,
	.free = raid5_free,
	.status = raid5_status,
	.error_handler = raid5_error,
	.hot_add_disk = raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active = raid5_spare_active,
	.sync_request = raid5_sync_request,
	.resize = raid5_resize,
	.size = raid5_size,
	.check_reshape = raid6_check_reshape,
	.start_reshape = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce = raid5_quiesce,
	.takeover = raid6_takeover,
	.change_consistency_policy = raid5_change_consistency_policy,
};

static struct md_personality raid5_personality =
{
	.name = "raid5",
	.level = 5,
	.owner = THIS_MODULE,
	.make_request = raid5_make_request,
	.run = raid5_run,
	.start = raid5_start,
	.free = raid5_free,
	.status = raid5_status,
	.error_handler = raid5_error,
	.hot_add_disk = raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active = raid5_spare_active,
	.sync_request = raid5_sync_request,
	.resize = raid5_resize,
	.size = raid5_size,
	.check_reshape = raid5_check_reshape,
	.start_reshape = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce = raid5_quiesce,
	.takeover = raid5_takeover,
	.change_consistency_policy = raid5_change_consistency_policy,
};

static struct md_personality raid4_personality =
{
	.name = "raid4",
	.level = 4,
	.owner = THIS_MODULE,
	.make_request = raid5_make_request,
	.run = raid5_run,
	.start = raid5_start,
	.free = raid5_free,
	.status = raid5_status,
	.error_handler = raid5_error,
	.hot_add_disk = raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active = raid5_spare_active,
	.sync_request = raid5_sync_request,
	.resize = raid5_resize,
	.size = raid5_size,
	.check_reshape = raid5_check_reshape,
	.start_reshape = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce = raid5_quiesce,
	.takeover = raid4_takeover,
	.change_consistency_policy = raid5_change_consistency_policy,
};

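/*
 * Module init: create the shared raid5 workqueue, register the CPU
 * hotplug callbacks that set up and tear down raid456's per-CPU state,
 * then register all three personalities.  The workqueue is destroyed
 * again if the hotplug registration fails.
 */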
static int __init raid5_init(void)
{
	int ret;

	raid5_wq = alloc_workqueue("raid5wq",
		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
	if (!raid5_wq)
		return -ENOMEM;

	ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
				      "md/raid5:prepare",
				      raid456_cpu_up_prepare,
				      raid456_cpu_dead);
	if (ret) {
		destroy_workqueue(raid5_wq);
		return ret;
	}
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

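/* Module teardown: undo raid5_init() in reverse order */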
static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
	cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
	destroy_workqueue(raid5_wq);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");
