/*
 * raid5.c : Multiple Devices driver for Linux
 *
 * RAID-4/5/6 management functions.
 */
46#include <linux/blkdev.h>
47#include <linux/kthread.h>
48#include <linux/raid/pq.h>
49#include <linux/async_tx.h>
50#include <linux/module.h>
51#include <linux/async.h>
52#include <linux/seq_file.h>
53#include <linux/cpu.h>
54#include <linux/slab.h>
55#include <linux/ratelimit.h>
56#include <linux/nodemask.h>
57#include <linux/flex_array.h>
58#include <linux/sched/signal.h>
59
60#include <trace/events/block.h>
61#include <linux/list_sort.h>
62
63#include "md.h"
64#include "raid5.h"
65#include "raid0.h"
66#include "bitmap.h"
67#include "raid5-log.h"
68
69#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
70
71#define cpu_to_group(cpu) cpu_to_node(cpu)
72#define ANY_GROUP NUMA_NO_NODE
73
74static bool devices_handle_discard_safely = false;
75module_param(devices_handle_discard_safely, bool, 0644);
76MODULE_PARM_DESC(devices_handle_discard_safely,
77 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
78static struct workqueue_struct *raid5_wq;
79
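/* Hash a stripe's sector into a bucket of conf->stripe_hashtbl. */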
80static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
81{
82 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
83 return &conf->stripe_hashtbl[hash];
84}
85
86static inline int stripe_hash_locks_hash(sector_t sect)
87{
88 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
89}
90
91static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
92{
93 spin_lock_irq(conf->hash_locks + hash);
94 spin_lock(&conf->device_lock);
95}
96
97static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
98{
99 spin_unlock(&conf->device_lock);
100 spin_unlock_irq(conf->hash_locks + hash);
101}
102
103static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
104{
105 int i;
106 spin_lock_irq(conf->hash_locks);
107 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
108 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
109 spin_lock(&conf->device_lock);
110}
111
112static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
113{
114 int i;
115 spin_unlock(&conf->device_lock);
116 for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
117 spin_unlock(conf->hash_locks + i);
118 spin_unlock_irq(conf->hash_locks);
119}
120
/* Find the index of the first data disk in a RAID-6 stripe */
122static inline int raid6_d0(struct stripe_head *sh)
123{
124 if (sh->ddf_layout)
 /* ddf layouts always start at the first device */
126 return 0;
 /* md layouts: data starts just after the Q block */
128 if (sh->qd_idx == sh->disks - 1)
129 return 0;
130 else
131 return sh->qd_idx + 1;
132}
133static inline int raid6_next_disk(int disk, int raid_disks)
134{
135 disk++;
136 return (disk < raid_disks) ? disk : 0;
137}
/*
 * When walking the devices of a RAID-6 stripe, starting at raid6_d0(),
 * each device index must be mapped to a 'slot': data devices occupy
 * slots 0 .. syndrome_disks-1, the P device is slot syndrome_disks and
 * the Q device is slot syndrome_disks+1.
 */
144static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
145 int *count, int syndrome_disks)
146{
147 int slot = *count;
148
149 if (sh->ddf_layout)
150 (*count)++;
151 if (idx == sh->pd_idx)
152 return syndrome_disks;
153 if (idx == sh->qd_idx)
154 return syndrome_disks + 1;
155 if (!sh->ddf_layout)
156 (*count)++;
157 return slot;
158}
159
160static void print_raid5_conf (struct r5conf *conf);
161
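/*
 * True if any asynchronous stripe operation (check, reconstruct,
 * biofill or compute) is currently running on this stripe.
 */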
162static int stripe_operations_active(struct stripe_head *sh)
163{
164 return sh->check_state || sh->reconstruct_state ||
165 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
166 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
167}
168
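/*
 * Stripes whose data is cached in the r5c journal (fully or partially)
 * and that are no longer in caching mode are handled at low priority.
 */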
169static bool stripe_is_lowprio(struct stripe_head *sh)
170{
171 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
172 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
173 !test_bit(STRIPE_R5C_CACHING, &sh->state);
174}
175
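/*
 * Put the stripe on the handle/loprio list of the worker group for its
 * CPU and kick one or more group workers to process it.
 */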
176static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
177{
178 struct r5conf *conf = sh->raid_conf;
179 struct r5worker_group *group;
180 int thread_cnt;
181 int i, cpu = sh->cpu;
182
183 if (!cpu_online(cpu)) {
184 cpu = cpumask_any(cpu_online_mask);
185 sh->cpu = cpu;
186 }
187
188 if (list_empty(&sh->lru)) {
189 struct r5worker_group *group;
190 group = conf->worker_groups + cpu_to_group(cpu);
191 if (stripe_is_lowprio(sh))
192 list_add_tail(&sh->lru, &group->loprio_list);
193 else
194 list_add_tail(&sh->lru, &group->handle_list);
195 group->stripes_cnt++;
196 sh->group = group;
197 }
198
199 if (conf->worker_cnt_per_group == 0) {
200 md_wakeup_thread(conf->mddev->thread);
201 return;
202 }
203
204 group = conf->worker_groups + cpu_to_group(sh->cpu);
205
206 group->workers[0].working = true;
207
208 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
209
210 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
211
212 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
213 if (group->workers[i].working == false) {
214 group->workers[i].working = true;
215 queue_work_on(sh->cpu, raid5_wq,
216 &group->workers[i].work);
217 thread_cnt--;
218 }
219 }
220}
221
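/*
 * Drop the final reference to a stripe; called with conf->device_lock
 * held.  Decides which list the stripe goes back onto (delayed, bitmap,
 * handle, r5c cached or inactive).
 */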
222static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
223 struct list_head *temp_inactive_list)
224{
225 int i;
226 int injournal = 0;
227
228 BUG_ON(!list_empty(&sh->lru));
229 BUG_ON(atomic_read(&conf->active_stripes)==0);
230
231 if (r5c_is_writeback(conf->log))
232 for (i = sh->disks; i--; )
233 if (test_bit(R5_InJournal, &sh->dev[i].flags))
234 injournal++;
235
 /*
  * In the following cases the stripe cannot be released to the r5c
  * cached lists, so make it write out and set STRIPE_HANDLE:
  *  1. a sync/resync has been requested for the stripe;
  *  2. the array is quiescing while the cache is in write-back mode
  *     and the stripe still has data in the journal.
  */
242 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
243 (conf->quiesce && r5c_is_writeback(conf->log) &&
244 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
245 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
246 r5c_make_stripe_write_out(sh);
247 set_bit(STRIPE_HANDLE, &sh->state);
248 }
249
250 if (test_bit(STRIPE_HANDLE, &sh->state)) {
251 if (test_bit(STRIPE_DELAYED, &sh->state) &&
252 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
253 list_add_tail(&sh->lru, &conf->delayed_list);
254 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
255 sh->bm_seq - conf->seq_write > 0)
256 list_add_tail(&sh->lru, &conf->bitmap_list);
257 else {
258 clear_bit(STRIPE_DELAYED, &sh->state);
259 clear_bit(STRIPE_BIT_DELAY, &sh->state);
260 if (conf->worker_cnt_per_group == 0) {
261 if (stripe_is_lowprio(sh))
262 list_add_tail(&sh->lru,
263 &conf->loprio_list);
264 else
265 list_add_tail(&sh->lru,
266 &conf->handle_list);
267 } else {
268 raid5_wakeup_stripe_thread(sh);
269 return;
270 }
271 }
272 md_wakeup_thread(conf->mddev->thread);
273 } else {
274 BUG_ON(stripe_operations_active(sh));
275 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
276 if (atomic_dec_return(&conf->preread_active_stripes)
277 < IO_THRESHOLD)
278 md_wakeup_thread(conf->mddev->thread);
279 atomic_dec(&conf->active_stripes);
280 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
281 if (!r5c_is_writeback(conf->log))
282 list_add_tail(&sh->lru, temp_inactive_list);
283 else {
284 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
285 if (injournal == 0)
286 list_add_tail(&sh->lru, temp_inactive_list);
287 else if (injournal == conf->raid_disks - conf->max_degraded) {
 /* all data blocks of the stripe are in the journal: a full stripe */
289 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
290 atomic_inc(&conf->r5c_cached_full_stripes);
291 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
292 atomic_dec(&conf->r5c_cached_partial_stripes);
293 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
294 r5c_check_cached_full_stripe(conf);
295 } else
 /*
  * some but not all blocks are in the journal: keep the stripe
  * on the r5c partial-stripe list
  */
301 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
302 }
303 }
304 }
305}
306
307static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
308 struct list_head *temp_inactive_list)
309{
310 if (atomic_dec_and_test(&sh->count))
311 do_release_stripe(conf, sh, temp_inactive_list);
312}
313
/*
 * If @hash is NR_STRIPE_HASH_LOCKS the caller passed one temporary list
 * per hash bucket; otherwise temp_inactive_list is a single list for
 * the given @hash.
 */
321static void release_inactive_stripe_list(struct r5conf *conf,
322 struct list_head *temp_inactive_list,
323 int hash)
324{
325 int size;
326 bool do_wakeup = false;
327 unsigned long flags;
328
329 if (hash == NR_STRIPE_HASH_LOCKS) {
330 size = NR_STRIPE_HASH_LOCKS;
331 hash = NR_STRIPE_HASH_LOCKS - 1;
332 } else
333 size = 1;
334 while (size) {
335 struct list_head *list = &temp_inactive_list[size - 1];
336
 /*
  * No lock is held here yet, so check the list carefully and only
  * splice it after taking the hash lock and rechecking.
  */
341 if (!list_empty_careful(list)) {
342 spin_lock_irqsave(conf->hash_locks + hash, flags);
343 if (list_empty(conf->inactive_list + hash) &&
344 !list_empty(list))
345 atomic_dec(&conf->empty_inactive_list_nr);
346 list_splice_tail_init(list, conf->inactive_list + hash);
347 do_wakeup = true;
348 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
349 }
350 size--;
351 hash--;
352 }
353
354 if (do_wakeup) {
355 wake_up(&conf->wait_for_stripe);
356 if (atomic_read(&conf->active_stripes) == 0)
357 wake_up(&conf->wait_for_quiescent);
358 if (conf->retry_read_aligned)
359 md_wakeup_thread(conf->mddev->thread);
360 }
361}
362
/* must be called with conf->device_lock held */
364static int release_stripe_list(struct r5conf *conf,
365 struct list_head *temp_inactive_list)
366{
367 struct stripe_head *sh, *t;
368 int count = 0;
369 struct llist_node *head;
370
371 head = llist_del_all(&conf->released_stripes);
372 head = llist_reverse_order(head);
373 llist_for_each_entry_safe(sh, t, head, release_list) {
374 int hash;
375
 /* sh could be re-added to the released list once STRIPE_ON_RELEASE_LIST is cleared */
377 smp_mb();
378 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
379
 /*
  * It does not matter if the bit gets set again right here: in that
  * case the stripe count is > 1, so the release below is not the
  * final one.  The same holds for STRIPE_ON_BATCH_LIST.
  */
384 hash = sh->hash_lock_index;
385 __release_stripe(conf, sh, &temp_inactive_list[hash]);
386 count++;
387 }
388
389 return count;
390}
391
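/*
 * Release a reference to a stripe.  The common case just queues the
 * stripe on conf->released_stripes for raid5d to process; the slow path
 * does the full release under device_lock.
 */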
392void raid5_release_stripe(struct stripe_head *sh)
393{
394 struct r5conf *conf = sh->raid_conf;
395 unsigned long flags;
396 struct list_head list;
397 int hash;
398 bool wakeup;
399
 /* Fast path: just drop the reference unless it is the last one. */
402 if (atomic_add_unless(&sh->count, -1, 1))
403 return;
404
405 if (unlikely(!conf->mddev->thread) ||
406 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
407 goto slow_path;
408 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
409 if (wakeup)
410 md_wakeup_thread(conf->mddev->thread);
411 return;
412slow_path:
413 local_irq_save(flags);
 /* we are ok here whether STRIPE_ON_RELEASE_LIST is set or not */
415 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
416 INIT_LIST_HEAD(&list);
417 hash = sh->hash_lock_index;
418 do_release_stripe(conf, sh, &list);
419 spin_unlock(&conf->device_lock);
420 release_inactive_stripe_list(conf, &list, hash);
421 }
422 local_irq_restore(flags);
423}
424
425static inline void remove_hash(struct stripe_head *sh)
426{
427 pr_debug("remove_hash(), stripe %llu\n",
428 (unsigned long long)sh->sector);
429
430 hlist_del_init(&sh->hash);
431}
432
433static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
434{
435 struct hlist_head *hp = stripe_hash(conf, sh->sector);
436
437 pr_debug("insert_hash(), stripe %llu\n",
438 (unsigned long long)sh->sector);
439
440 hlist_add_head(&sh->hash, hp);
441}
442
/* Find an idle stripe, make sure it is unhashed, and return it. */
444static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
445{
446 struct stripe_head *sh = NULL;
447 struct list_head *first;
448
449 if (list_empty(conf->inactive_list + hash))
450 goto out;
451 first = (conf->inactive_list + hash)->next;
452 sh = list_entry(first, struct stripe_head, lru);
453 list_del_init(first);
454 remove_hash(sh);
455 atomic_inc(&conf->active_stripes);
456 BUG_ON(hash != sh->hash_lock_index);
457 if (list_empty(conf->inactive_list + hash))
458 atomic_inc(&conf->empty_inactive_list_nr);
459out:
460 return sh;
461}
462
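/* Free the pages attached to each device of the stripe. */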
463static void shrink_buffers(struct stripe_head *sh)
464{
465 struct page *p;
466 int i;
467 int num = sh->raid_conf->pool_size;
468
469 for (i = 0; i < num ; i++) {
470 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
471 p = sh->dev[i].page;
472 if (!p)
473 continue;
474 sh->dev[i].page = NULL;
475 put_page(p);
476 }
477}
478
479static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
480{
481 int i;
482 int num = sh->raid_conf->pool_size;
483
484 for (i = 0; i < num; i++) {
485 struct page *page;
486
487 if (!(page = alloc_page(gfp))) {
488 return 1;
489 }
490 sh->dev[i].page = page;
491 sh->dev[i].orig_page = page;
492 }
493
494 return 0;
495}
496
497static void raid5_build_block(struct stripe_head *sh, int i, int previous);
498static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
499 struct stripe_head *sh);
500
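/* (Re)initialise a stripe_head for @sector and insert it into the hash table. */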
501static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
502{
503 struct r5conf *conf = sh->raid_conf;
504 int i, seq;
505
506 BUG_ON(atomic_read(&sh->count) != 0);
507 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
508 BUG_ON(stripe_operations_active(sh));
509 BUG_ON(sh->batch_head);
510
511 pr_debug("init_stripe called, stripe %llu\n",
512 (unsigned long long)sector);
513retry:
514 seq = read_seqcount_begin(&conf->gen_lock);
515 sh->generation = conf->generation - previous;
516 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
517 sh->sector = sector;
518 stripe_set_idx(sector, conf, previous, sh);
519 sh->state = 0;
520
521 for (i = sh->disks; i--; ) {
522 struct r5dev *dev = &sh->dev[i];
523
524 if (dev->toread || dev->read || dev->towrite || dev->written ||
525 test_bit(R5_LOCKED, &dev->flags)) {
526 pr_err("sector=%llx i=%d %p %p %p %p %d\n",
527 (unsigned long long)sh->sector, i, dev->toread,
528 dev->read, dev->towrite, dev->written,
529 test_bit(R5_LOCKED, &dev->flags));
530 WARN_ON(1);
531 }
532 dev->flags = 0;
533 raid5_build_block(sh, i, previous);
534 }
535 if (read_seqcount_retry(&conf->gen_lock, seq))
536 goto retry;
537 sh->overwrite_disks = 0;
538 insert_hash(conf, sh);
539 sh->cpu = smp_processor_id();
540 set_bit(STRIPE_BATCH_READY, &sh->state);
541}
542
543static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
544 short generation)
545{
546 struct stripe_head *sh;
547
548 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
549 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
550 if (sh->sector == sector && sh->generation == generation)
551 return sh;
552 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
553 return NULL;
554}
555
/*
 * Work out how many devices are effectively missing.  This is needed
 * when deciding whether to start the array, remove a non-faulty device,
 * add a spare, or allow a reshape.  The calculation is simple when no
 * reshape is in progress, but during a reshape both the old ('previous')
 * and the new geometry must be checked, because a failed device may only
 * affect one of the two sections; the larger of the two counts wins.
 */
569int raid5_calc_degraded(struct r5conf *conf)
570{
571 int degraded, degraded2;
572 int i;
573
574 rcu_read_lock();
575 degraded = 0;
576 for (i = 0; i < conf->previous_raid_disks; i++) {
577 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
578 if (rdev && test_bit(Faulty, &rdev->flags))
579 rdev = rcu_dereference(conf->disks[i].replacement);
580 if (!rdev || test_bit(Faulty, &rdev->flags))
581 degraded++;
582 else if (test_bit(In_sync, &rdev->flags))
583 ;
584 else
 /*
  * Not in_sync and not faulty.  If the reshape increases the number
  * of devices, this slot is being rebuilt by the reshape, so this
  * 'previous' section is not in_sync.  If the number of devices is
  * shrinking, the device can only still be present because a reshape
  * is being reverted, so treat it as in_sync.
  */
594 if (conf->raid_disks >= conf->previous_raid_disks)
595 degraded++;
596 }
597 rcu_read_unlock();
598 if (conf->raid_disks == conf->previous_raid_disks)
599 return degraded;
600 rcu_read_lock();
601 degraded2 = 0;
602 for (i = 0; i < conf->raid_disks; i++) {
603 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
604 if (rdev && test_bit(Faulty, &rdev->flags))
605 rdev = rcu_dereference(conf->disks[i].replacement);
606 if (!rdev || test_bit(Faulty, &rdev->flags))
607 degraded2++;
608 else if (test_bit(In_sync, &rdev->flags))
609 ;
610 else
 /*
  * Not in_sync and not faulty.  If the reshape increases the number
  * of devices this section has already been recovered, otherwise it
  * almost certainly has not.
  */
616 if (conf->raid_disks <= conf->previous_raid_disks)
617 degraded2++;
618 }
619 rcu_read_unlock();
620 if (degraded2 > degraded)
621 return degraded2;
622 return degraded;
623}
624
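/* Return true if more devices have failed than the array can tolerate. */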
625static int has_failed(struct r5conf *conf)
626{
627 int degraded;
628
629 if (conf->mddev->reshape_position == MaxSector)
630 return conf->mddev->degraded > conf->max_degraded;
631
632 degraded = raid5_calc_degraded(conf);
633 if (degraded > conf->max_degraded)
634 return 1;
635 return 0;
636}
637
638struct stripe_head *
639raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
640 int previous, int noblock, int noquiesce)
641{
642 struct stripe_head *sh;
643 int hash = stripe_hash_locks_hash(sector);
644 int inc_empty_inactive_list_flag;
645
646 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
647
648 spin_lock_irq(conf->hash_locks + hash);
649
650 do {
651 wait_event_lock_irq(conf->wait_for_quiescent,
652 conf->quiesce == 0 || noquiesce,
653 *(conf->hash_locks + hash));
654 sh = __find_stripe(conf, sector, conf->generation - previous);
655 if (!sh) {
656 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
657 sh = get_free_stripe(conf, hash);
658 if (!sh && !test_bit(R5_DID_ALLOC,
659 &conf->cache_state))
660 set_bit(R5_ALLOC_MORE,
661 &conf->cache_state);
662 }
663 if (noblock && sh == NULL)
664 break;
665
666 r5c_check_stripe_cache_usage(conf);
667 if (!sh) {
668 set_bit(R5_INACTIVE_BLOCKED,
669 &conf->cache_state);
670 r5l_wake_reclaim(conf->log, 0);
671 wait_event_lock_irq(
672 conf->wait_for_stripe,
673 !list_empty(conf->inactive_list + hash) &&
674 (atomic_read(&conf->active_stripes)
675 < (conf->max_nr_stripes * 3 / 4)
676 || !test_bit(R5_INACTIVE_BLOCKED,
677 &conf->cache_state)),
678 *(conf->hash_locks + hash));
679 clear_bit(R5_INACTIVE_BLOCKED,
680 &conf->cache_state);
681 } else {
682 init_stripe(sh, sector, previous);
683 atomic_inc(&sh->count);
684 }
685 } else if (!atomic_inc_not_zero(&sh->count)) {
686 spin_lock(&conf->device_lock);
687 if (!atomic_read(&sh->count)) {
688 if (!test_bit(STRIPE_HANDLE, &sh->state))
689 atomic_inc(&conf->active_stripes);
690 BUG_ON(list_empty(&sh->lru) &&
691 !test_bit(STRIPE_EXPANDING, &sh->state));
692 inc_empty_inactive_list_flag = 0;
693 if (!list_empty(conf->inactive_list + hash))
694 inc_empty_inactive_list_flag = 1;
695 list_del_init(&sh->lru);
696 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
697 atomic_inc(&conf->empty_inactive_list_nr);
698 if (sh->group) {
699 sh->group->stripes_cnt--;
700 sh->group = NULL;
701 }
702 }
703 atomic_inc(&sh->count);
704 spin_unlock(&conf->device_lock);
705 }
706 } while (sh == NULL);
707
708 spin_unlock_irq(conf->hash_locks + hash);
709 return sh;
710}
711
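/* True when every data block of the stripe is being overwritten, so no read-modify-write is needed. */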
712static bool is_full_stripe_write(struct stripe_head *sh)
713{
714 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
715 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
716}
717
718static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
719{
720 if (sh1 > sh2) {
721 spin_lock_irq(&sh2->stripe_lock);
722 spin_lock_nested(&sh1->stripe_lock, 1);
723 } else {
724 spin_lock_irq(&sh1->stripe_lock);
725 spin_lock_nested(&sh2->stripe_lock, 1);
726 }
727}
728
729static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
730{
731 spin_unlock(&sh1->stripe_lock);
732 spin_unlock_irq(&sh2->stripe_lock);
733}
734
735
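/*
 * Only a freshly initialised, full-stripe normal write may join a batch;
 * arrays with a journal or PPL never batch stripes.
 */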
736static bool stripe_can_batch(struct stripe_head *sh)
737{
738 struct r5conf *conf = sh->raid_conf;
739
740 if (conf->log || raid5_has_ppl(conf))
741 return false;
742 return test_bit(STRIPE_BATCH_READY, &sh->state) &&
743 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
744 is_full_stripe_write(sh);
745}
746
/* Only search backwards, for the immediately preceding stripe in the same chunk. */
748static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
749{
750 struct stripe_head *head;
751 sector_t head_sector, tmp_sec;
752 int hash;
753 int dd_idx;
754 int inc_empty_inactive_list_flag;
755
 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
757 tmp_sec = sh->sector;
758 if (!sector_div(tmp_sec, conf->chunk_sectors))
759 return;
760 head_sector = sh->sector - STRIPE_SECTORS;
761
762 hash = stripe_hash_locks_hash(head_sector);
763 spin_lock_irq(conf->hash_locks + hash);
764 head = __find_stripe(conf, head_sector, conf->generation);
765 if (head && !atomic_inc_not_zero(&head->count)) {
766 spin_lock(&conf->device_lock);
767 if (!atomic_read(&head->count)) {
768 if (!test_bit(STRIPE_HANDLE, &head->state))
769 atomic_inc(&conf->active_stripes);
770 BUG_ON(list_empty(&head->lru) &&
771 !test_bit(STRIPE_EXPANDING, &head->state));
772 inc_empty_inactive_list_flag = 0;
773 if (!list_empty(conf->inactive_list + hash))
774 inc_empty_inactive_list_flag = 1;
775 list_del_init(&head->lru);
776 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
777 atomic_inc(&conf->empty_inactive_list_nr);
778 if (head->group) {
779 head->group->stripes_cnt--;
780 head->group = NULL;
781 }
782 }
783 atomic_inc(&head->count);
784 spin_unlock(&conf->device_lock);
785 }
786 spin_unlock_irq(conf->hash_locks + hash);
787
788 if (!head)
789 return;
790 if (!stripe_can_batch(head))
791 goto out;
792
793 lock_two_stripes(head, sh);
794
795 if (!stripe_can_batch(head) || !stripe_can_batch(sh))
796 goto unlock_out;
797
798 if (sh->batch_head)
799 goto unlock_out;
800
801 dd_idx = 0;
802 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
803 dd_idx++;
804 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
805 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
806 goto unlock_out;
807
808 if (head->batch_head) {
809 spin_lock(&head->batch_head->batch_lock);
 /* bail out if the batch has already started being handled */
811 if (!stripe_can_batch(head)) {
812 spin_unlock(&head->batch_head->batch_lock);
813 goto unlock_out;
814 }
815
 /*
  * batch_head must be assigned while holding batch_lock, otherwise
  * clear_batch_ready() on the batch head could clear BATCH_READY on
  * this stripe just as it is being added to the batch.
  */
820 list_add(&sh->batch_list, &head->batch_list);
821 spin_unlock(&head->batch_head->batch_lock);
822
823 sh->batch_head = head->batch_head;
824 } else {
825 head->batch_head = head;
826 sh->batch_head = head->batch_head;
827 spin_lock(&head->batch_lock);
828 list_add_tail(&sh->batch_list, &head->batch_list);
829 spin_unlock(&head->batch_lock);
830 }
831
832 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
833 if (atomic_dec_return(&conf->preread_active_stripes)
834 < IO_THRESHOLD)
835 md_wakeup_thread(conf->mddev->thread);
836
837 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
838 int seq = sh->bm_seq;
839 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
840 sh->batch_head->bm_seq > seq)
841 seq = sh->batch_head->bm_seq;
842 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
843 sh->batch_head->bm_seq = seq;
844 }
845
846 atomic_inc(&sh->count);
847unlock_out:
848 unlock_two_stripes(head, sh);
849out:
850 raid5_release_stripe(head);
851}
852
/*
 * Determine whether 'data_offset' or 'new_data_offset' should be used
 * for this stripe_head's I/O.
 */
856static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
857{
858 sector_t progress = conf->reshape_progress;
859
 /*
  * A memory barrier is needed so that we see the ->generation and
  * ->new_data_offset values that were set before reshape_progress
  * was last updated.
  */
863 smp_rmb();
864 if (progress == MaxSector)
865 return 0;
866 if (sh->generation == conf->generation - 1)
867 return 0;
868
 /*
  * A reshape is in progress and this stripe is in the current (new)
  * geometry, so the new data offset applies.
  */
871 return 1;
872}
873
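/* Submit every bio on the list via generic_make_request(). */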
874static void dispatch_bio_list(struct bio_list *tmp)
875{
876 struct bio *bio;
877
878 while ((bio = bio_list_pop(tmp)))
879 generic_make_request(bio);
880}
881
882static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
883{
884 const struct r5pending_data *da = list_entry(a,
885 struct r5pending_data, sibling);
886 const struct r5pending_data *db = list_entry(b,
887 struct r5pending_data, sibling);
888 if (da->sector > db->sector)
889 return 1;
890 if (da->sector < db->sector)
891 return -1;
892 return 0;
893}
894
895static void dispatch_defer_bios(struct r5conf *conf, int target,
896 struct bio_list *list)
897{
898 struct r5pending_data *data;
899 struct list_head *first, *next = NULL;
900 int cnt = 0;
901
902 if (conf->pending_data_cnt == 0)
903 return;
904
905 list_sort(NULL, &conf->pending_list, cmp_stripe);
906
907 first = conf->pending_list.next;
908
 /* temporarily move the list head so dispatching resumes at next_pending_data */
910 if (conf->next_pending_data)
911 list_move_tail(&conf->pending_list,
912 &conf->next_pending_data->sibling);
913
914 while (!list_empty(&conf->pending_list)) {
915 data = list_first_entry(&conf->pending_list,
916 struct r5pending_data, sibling);
917 if (&data->sibling == first)
918 first = data->sibling.next;
919 next = data->sibling.next;
920
921 bio_list_merge(list, &data->bios);
922 list_move(&data->sibling, &conf->free_list);
923 cnt++;
924 if (cnt >= target)
925 break;
926 }
927 conf->pending_data_cnt -= cnt;
928 BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
929
930 if (next != &conf->pending_list)
931 conf->next_pending_data = list_entry(next,
932 struct r5pending_data, sibling);
933 else
934 conf->next_pending_data = NULL;
935
936 if (first != &conf->pending_list)
937 list_move_tail(&conf->pending_list, first);
938}
939
940static void flush_deferred_bios(struct r5conf *conf)
941{
942 struct bio_list tmp = BIO_EMPTY_LIST;
943
944 if (conf->pending_data_cnt == 0)
945 return;
946
947 spin_lock(&conf->pending_bios_lock);
948 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
949 BUG_ON(conf->pending_data_cnt != 0);
950 spin_unlock(&conf->pending_bios_lock);
951
952 dispatch_bio_list(&tmp);
953}
954
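/*
 * Park the bios for one stripe on the per-conf pending list; once
 * PENDING_IO_MAX stripes are queued, dispatch a sorted batch of them.
 */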
955static void defer_issue_bios(struct r5conf *conf, sector_t sector,
956 struct bio_list *bios)
957{
958 struct bio_list tmp = BIO_EMPTY_LIST;
959 struct r5pending_data *ent;
960
961 spin_lock(&conf->pending_bios_lock);
962 ent = list_first_entry(&conf->free_list, struct r5pending_data,
963 sibling);
964 list_move_tail(&ent->sibling, &conf->pending_list);
965 ent->sector = sector;
966 bio_list_init(&ent->bios);
967 bio_list_merge(&ent->bios, bios);
968 conf->pending_data_cnt++;
969 if (conf->pending_data_cnt >= PENDING_IO_MAX)
970 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
971
972 spin_unlock(&conf->pending_bios_lock);
973
974 dispatch_bio_list(&tmp);
975}
976
977static void
978raid5_end_read_request(struct bio *bi);
979static void
980raid5_end_write_request(struct bio *bi);
981
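/*
 * Issue the per-device I/O that the stripe state machine scheduled
 * (reads, writes and replacement writes), handling bad blocks and,
 * when enabled, deferred batched dispatch.
 */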
982static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
983{
984 struct r5conf *conf = sh->raid_conf;
985 int i, disks = sh->disks;
986 struct stripe_head *head_sh = sh;
987 struct bio_list pending_bios = BIO_EMPTY_LIST;
988 bool should_defer;
989
990 might_sleep();
991
992 if (log_stripe(sh, s) == 0)
993 return;
994
995 should_defer = conf->batch_bio_dispatch && conf->group_cnt;
996
997 for (i = disks; i--; ) {
998 int op, op_flags = 0;
999 int replace_only = 0;
1000 struct bio *bi, *rbi;
1001 struct md_rdev *rdev, *rrdev = NULL;
1002
1003 sh = head_sh;
1004 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1005 op = REQ_OP_WRITE;
1006 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1007 op_flags = REQ_FUA;
1008 if (test_bit(R5_Discard, &sh->dev[i].flags))
1009 op = REQ_OP_DISCARD;
1010 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1011 op = REQ_OP_READ;
1012 else if (test_and_clear_bit(R5_WantReplace,
1013 &sh->dev[i].flags)) {
1014 op = REQ_OP_WRITE;
1015 replace_only = 1;
1016 } else
1017 continue;
1018 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1019 op_flags |= REQ_SYNC;
1020
1021again:
1022 bi = &sh->dev[i].req;
1023 rbi = &sh->dev[i].rreq;
1024
1025 rcu_read_lock();
1026 rrdev = rcu_dereference(conf->disks[i].replacement);
1027 smp_mb();
1028 rdev = rcu_dereference(conf->disks[i].rdev);
1029 if (!rdev) {
1030 rdev = rrdev;
1031 rrdev = NULL;
1032 }
1033 if (op_is_write(op)) {
1034 if (replace_only)
1035 rdev = NULL;
1036 if (rdev == rrdev)
 /* We raced and saw duplicates */
1038 rrdev = NULL;
1039 } else {
1040 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1041 rdev = rrdev;
1042 rrdev = NULL;
1043 }
1044
1045 if (rdev && test_bit(Faulty, &rdev->flags))
1046 rdev = NULL;
1047 if (rdev)
1048 atomic_inc(&rdev->nr_pending);
1049 if (rrdev && test_bit(Faulty, &rrdev->flags))
1050 rrdev = NULL;
1051 if (rrdev)
1052 atomic_inc(&rrdev->nr_pending);
1053 rcu_read_unlock();
1054
 /*
  * Bad blocks have already been checked for reads; now check for
  * writes.  Write errors are never accepted on the replacement, so
  * rrdev does not need checking.
  */
1059 while (op_is_write(op) && rdev &&
1060 test_bit(WriteErrorSeen, &rdev->flags)) {
1061 sector_t first_bad;
1062 int bad_sectors;
1063 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
1064 &first_bad, &bad_sectors);
1065 if (!bad)
1066 break;
1067
1068 if (bad < 0) {
1069 set_bit(BlockedBadBlocks, &rdev->flags);
1070 if (!conf->mddev->external &&
1071 conf->mddev->sb_flags) {
 /*
  * It is very unlikely, but the bad block log might still need
  * to be written out, so give md_check_recovery() a chance.
  */
1076 md_check_recovery(conf->mddev);
1077 }
 /*
  * md_wait_for_blocked_rdev() will dec nr_pending, so the
  * reference must be taken first.
  */
1083 atomic_inc(&rdev->nr_pending);
1084 md_wait_for_blocked_rdev(rdev, conf->mddev);
1085 } else {
 /* Acknowledged bad block - just skip the write */
1087 rdev_dec_pending(rdev, conf->mddev);
1088 rdev = NULL;
1089 }
1090 }
1091
1092 if (rdev) {
1093 if (s->syncing || s->expanding || s->expanded
1094 || s->replacing)
1095 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1096
1097 set_bit(STRIPE_IO_STARTED, &sh->state);
1098
1099 bi->bi_bdev = rdev->bdev;
1100 bio_set_op_attrs(bi, op, op_flags);
1101 bi->bi_end_io = op_is_write(op)
1102 ? raid5_end_write_request
1103 : raid5_end_read_request;
1104 bi->bi_private = sh;
1105
1106 pr_debug("%s: for %llu schedule op %d on disc %d\n",
1107 __func__, (unsigned long long)sh->sector,
1108 bi->bi_opf, i);
1109 atomic_inc(&sh->count);
1110 if (sh != head_sh)
1111 atomic_inc(&head_sh->count);
1112 if (use_new_offset(conf, sh))
1113 bi->bi_iter.bi_sector = (sh->sector
1114 + rdev->new_data_offset);
1115 else
1116 bi->bi_iter.bi_sector = (sh->sector
1117 + rdev->data_offset);
1118 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1119 bi->bi_opf |= REQ_NOMERGE;
1120
1121 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1122 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1123
1124 if (!op_is_write(op) &&
1125 test_bit(R5_InJournal, &sh->dev[i].flags))
 /*
  * a read for a page that is in the journal must be preparing
  * for prexor in rmw, so read the data into orig_page
  */
1131 sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1132 else
1133 sh->dev[i].vec.bv_page = sh->dev[i].page;
1134 bi->bi_vcnt = 1;
1135 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1136 bi->bi_io_vec[0].bv_offset = 0;
1137 bi->bi_iter.bi_size = STRIPE_SIZE;
1138
 /*
  * For a discard request set bi_vcnt to 0: the SCSI layer replaces
  * the payload and a non-zero bi_vcnt would confuse it.
  */
1142 if (op == REQ_OP_DISCARD)
1143 bi->bi_vcnt = 0;
1144 if (rrdev)
1145 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1146
1147 if (conf->mddev->gendisk)
1148 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
1149 bi, disk_devt(conf->mddev->gendisk),
1150 sh->dev[i].sector);
1151 if (should_defer && op_is_write(op))
1152 bio_list_add(&pending_bios, bi);
1153 else
1154 generic_make_request(bi);
1155 }
1156 if (rrdev) {
1157 if (s->syncing || s->expanding || s->expanded
1158 || s->replacing)
1159 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1160
1161 set_bit(STRIPE_IO_STARTED, &sh->state);
1162
1163 rbi->bi_bdev = rrdev->bdev;
1164 bio_set_op_attrs(rbi, op, op_flags);
1165 BUG_ON(!op_is_write(op));
1166 rbi->bi_end_io = raid5_end_write_request;
1167 rbi->bi_private = sh;
1168
1169 pr_debug("%s: for %llu schedule op %d on "
1170 "replacement disc %d\n",
1171 __func__, (unsigned long long)sh->sector,
1172 rbi->bi_opf, i);
1173 atomic_inc(&sh->count);
1174 if (sh != head_sh)
1175 atomic_inc(&head_sh->count);
1176 if (use_new_offset(conf, sh))
1177 rbi->bi_iter.bi_sector = (sh->sector
1178 + rrdev->new_data_offset);
1179 else
1180 rbi->bi_iter.bi_sector = (sh->sector
1181 + rrdev->data_offset);
1182 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1183 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1184 sh->dev[i].rvec.bv_page = sh->dev[i].page;
1185 rbi->bi_vcnt = 1;
1186 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1187 rbi->bi_io_vec[0].bv_offset = 0;
1188 rbi->bi_iter.bi_size = STRIPE_SIZE;
1189
 /*
  * For a discard request set bi_vcnt to 0: the SCSI layer replaces
  * the payload and a non-zero bi_vcnt would confuse it.
  */
1193 if (op == REQ_OP_DISCARD)
1194 rbi->bi_vcnt = 0;
1195 if (conf->mddev->gendisk)
1196 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
1197 rbi, disk_devt(conf->mddev->gendisk),
1198 sh->dev[i].sector);
1199 if (should_defer && op_is_write(op))
1200 bio_list_add(&pending_bios, rbi);
1201 else
1202 generic_make_request(rbi);
1203 }
1204 if (!rdev && !rrdev) {
1205 if (op_is_write(op))
1206 set_bit(STRIPE_DEGRADED, &sh->state);
1207 pr_debug("skip op %d on disc %d for sector %llu\n",
1208 bi->bi_opf, i, (unsigned long long)sh->sector);
1209 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1210 set_bit(STRIPE_HANDLE, &sh->state);
1211 }
1212
1213 if (!head_sh->batch_head)
1214 continue;
1215 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1216 batch_list);
1217 if (sh != head_sh)
1218 goto again;
1219 }
1220
1221 if (should_defer && !bio_list_empty(&pending_bios))
1222 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1223}
1224
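/*
 * Copy data between a bio and a stripe page with the async_tx API.  When
 * skip_copy is enabled and the bio covers a whole stripe page, the bio
 * page is used directly instead of being copied.
 */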
1225static struct dma_async_tx_descriptor *
1226async_copy_data(int frombio, struct bio *bio, struct page **page,
1227 sector_t sector, struct dma_async_tx_descriptor *tx,
1228 struct stripe_head *sh, int no_skipcopy)
1229{
1230 struct bio_vec bvl;
1231 struct bvec_iter iter;
1232 struct page *bio_page;
1233 int page_offset;
1234 struct async_submit_ctl submit;
1235 enum async_tx_flags flags = 0;
1236
1237 if (bio->bi_iter.bi_sector >= sector)
1238 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1239 else
1240 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1241
1242 if (frombio)
1243 flags |= ASYNC_TX_FENCE;
1244 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1245
1246 bio_for_each_segment(bvl, bio, iter) {
1247 int len = bvl.bv_len;
1248 int clen;
1249 int b_offset = 0;
1250
1251 if (page_offset < 0) {
1252 b_offset = -page_offset;
1253 page_offset += b_offset;
1254 len -= b_offset;
1255 }
1256
1257 if (len > 0 && page_offset + len > STRIPE_SIZE)
1258 clen = STRIPE_SIZE - page_offset;
1259 else
1260 clen = len;
1261
1262 if (clen > 0) {
1263 b_offset += bvl.bv_offset;
1264 bio_page = bvl.bv_page;
1265 if (frombio) {
1266 if (sh->raid_conf->skip_copy &&
1267 b_offset == 0 && page_offset == 0 &&
1268 clen == STRIPE_SIZE &&
1269 !no_skipcopy)
1270 *page = bio_page;
1271 else
1272 tx = async_memcpy(*page, bio_page, page_offset,
1273 b_offset, clen, &submit);
1274 } else
1275 tx = async_memcpy(bio_page, *page, b_offset,
1276 page_offset, clen, &submit);
1277 }
1278
1279 submit.depend_tx = tx;
1280
1281 if (clen < len)
1282 break;
1283 page_offset += len;
1284 }
1285
1286 return tx;
1287}
1288
1289static void ops_complete_biofill(void *stripe_head_ref)
1290{
1291 struct stripe_head *sh = stripe_head_ref;
1292 int i;
1293
1294 pr_debug("%s: stripe %llu\n", __func__,
1295 (unsigned long long)sh->sector);
1296
 /* clear completed biofills */
1298 for (i = sh->disks; i--; ) {
1299 struct r5dev *dev = &sh->dev[i];
1300
 /*
  * acknowledge completion of a biofill operation and complete any
  * read requests that were filled; new R5_Wantfill requests are
  * held off until STRIPE_BIOFILL_RUN is cleared
  */
1306 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1307 struct bio *rbi, *rbi2;
1308
1309 BUG_ON(!dev->read);
1310 rbi = dev->read;
1311 dev->read = NULL;
1312 while (rbi && rbi->bi_iter.bi_sector <
1313 dev->sector + STRIPE_SECTORS) {
1314 rbi2 = r5_next_bio(rbi, dev->sector);
1315 bio_endio(rbi);
1316 rbi = rbi2;
1317 }
1318 }
1319 }
1320 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1321
1322 set_bit(STRIPE_HANDLE, &sh->state);
1323 raid5_release_stripe(sh);
1324}
1325
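/*
 * Fill pending read requests by copying data from the stripe cache pages
 * into the waiting bios.
 */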
1326static void ops_run_biofill(struct stripe_head *sh)
1327{
1328 struct dma_async_tx_descriptor *tx = NULL;
1329 struct async_submit_ctl submit;
1330 int i;
1331
1332 BUG_ON(sh->batch_head);
1333 pr_debug("%s: stripe %llu\n", __func__,
1334 (unsigned long long)sh->sector);
1335
1336 for (i = sh->disks; i--; ) {
1337 struct r5dev *dev = &sh->dev[i];
1338 if (test_bit(R5_Wantfill, &dev->flags)) {
1339 struct bio *rbi;
1340 spin_lock_irq(&sh->stripe_lock);
1341 dev->read = rbi = dev->toread;
1342 dev->toread = NULL;
1343 spin_unlock_irq(&sh->stripe_lock);
1344 while (rbi && rbi->bi_iter.bi_sector <
1345 dev->sector + STRIPE_SECTORS) {
1346 tx = async_copy_data(0, rbi, &dev->page,
1347 dev->sector, tx, sh, 0);
1348 rbi = r5_next_bio(rbi, dev->sector);
1349 }
1350 }
1351 }
1352
1353 atomic_inc(&sh->count);
1354 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1355 async_trigger_callback(&submit);
1356}
1357
1358static void mark_target_uptodate(struct stripe_head *sh, int target)
1359{
1360 struct r5dev *tgt;
1361
1362 if (target < 0)
1363 return;
1364
1365 tgt = &sh->dev[target];
1366 set_bit(R5_UPTODATE, &tgt->flags);
1367 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1368 clear_bit(R5_Wantcompute, &tgt->flags);
1369}
1370
1371static void ops_complete_compute(void *stripe_head_ref)
1372{
1373 struct stripe_head *sh = stripe_head_ref;
1374
1375 pr_debug("%s: stripe %llu\n", __func__,
1376 (unsigned long long)sh->sector);
1377
 /* mark the computed target(s) as uptodate */
1379 mark_target_uptodate(sh, sh->ops.target);
1380 mark_target_uptodate(sh, sh->ops.target2);
1381
1382 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1383 if (sh->check_state == check_state_compute_run)
1384 sh->check_state = check_state_compute_result;
1385 set_bit(STRIPE_HANDLE, &sh->state);
1386 raid5_release_stripe(sh);
1387}
1388
/* return a pointer to the addr_conv_t region of scribble buffer i, which
 * sits just after the disks+2 page pointers
 */
1390static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1391 struct raid5_percpu *percpu, int i)
1392{
1393 void *addr;
1394
1395 addr = flex_array_get(percpu->scribble, i);
1396 return addr + sizeof(struct page *) * (sh->disks + 2);
1397}
1398
/* return the (struct page *) array at the start of scribble buffer i */
1400static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1401{
1402 void *addr;
1403
1404 addr = flex_array_get(percpu->scribble, i);
1405 return addr;
1406}
1407
1408static struct dma_async_tx_descriptor *
1409ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1410{
1411 int disks = sh->disks;
1412 struct page **xor_srcs = to_addr_page(percpu, 0);
1413 int target = sh->ops.target;
1414 struct r5dev *tgt = &sh->dev[target];
1415 struct page *xor_dest = tgt->page;
1416 int count = 0;
1417 struct dma_async_tx_descriptor *tx;
1418 struct async_submit_ctl submit;
1419 int i;
1420
1421 BUG_ON(sh->batch_head);
1422
1423 pr_debug("%s: stripe %llu block: %d\n",
1424 __func__, (unsigned long long)sh->sector, target);
1425 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1426
1427 for (i = disks; i--; )
1428 if (i != target)
1429 xor_srcs[count++] = sh->dev[i].page;
1430
1431 atomic_inc(&sh->count);
1432
1433 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1434 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1435 if (unlikely(count == 1))
1436 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1437 else
1438 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1439
1440 return tx;
1441}
1442
/*
 * set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs: (struct page *) array of size sh->disks
 * @sh: stripe_head to parse
 * @srctype: which blocks to include (all, want-drain, or written)
 *
 * Populates srcs in the proper layout order for the stripe and returns the
 * 'count' of sources for async_gen_syndrome(): the P destination is placed
 * in srcs[count] and the Q destination in srcs[count+1].
 */
1452static int set_syndrome_sources(struct page **srcs,
1453 struct stripe_head *sh,
1454 int srctype)
1455{
1456 int disks = sh->disks;
1457 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1458 int d0_idx = raid6_d0(sh);
1459 int count;
1460 int i;
1461
1462 for (i = 0; i < disks; i++)
1463 srcs[i] = NULL;
1464
1465 count = 0;
1466 i = d0_idx;
1467 do {
1468 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1469 struct r5dev *dev = &sh->dev[i];
1470
1471 if (i == sh->qd_idx || i == sh->pd_idx ||
1472 (srctype == SYNDROME_SRC_ALL) ||
1473 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1474 (test_bit(R5_Wantdrain, &dev->flags) ||
1475 test_bit(R5_InJournal, &dev->flags))) ||
1476 (srctype == SYNDROME_SRC_WRITTEN &&
1477 (dev->written ||
1478 test_bit(R5_InJournal, &dev->flags)))) {
1479 if (test_bit(R5_InJournal, &dev->flags))
1480 srcs[slot] = sh->dev[i].orig_page;
1481 else
1482 srcs[slot] = sh->dev[i].page;
1483 }
1484 i = raid6_next_disk(i, disks);
1485 } while (i != d0_idx);
1486
1487 return syndrome_disks;
1488}
1489
1490static struct dma_async_tx_descriptor *
1491ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1492{
1493 int disks = sh->disks;
1494 struct page **blocks = to_addr_page(percpu, 0);
1495 int target;
1496 int qd_idx = sh->qd_idx;
1497 struct dma_async_tx_descriptor *tx;
1498 struct async_submit_ctl submit;
1499 struct r5dev *tgt;
1500 struct page *dest;
1501 int i;
1502 int count;
1503
1504 BUG_ON(sh->batch_head);
1505 if (sh->ops.target < 0)
1506 target = sh->ops.target2;
1507 else if (sh->ops.target2 < 0)
1508 target = sh->ops.target;
1509 else
 /* we should only have one valid target here */
1511 BUG();
1512 BUG_ON(target < 0);
1513 pr_debug("%s: stripe %llu block: %d\n",
1514 __func__, (unsigned long long)sh->sector, target);
1515
1516 tgt = &sh->dev[target];
1517 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1518 dest = tgt->page;
1519
1520 atomic_inc(&sh->count);
1521
1522 if (target == qd_idx) {
1523 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1524 blocks[count] = NULL;
1525 BUG_ON(blocks[count+1] != dest);
1526 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1527 ops_complete_compute, sh,
1528 to_addr_conv(sh, percpu, 0));
1529 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1530 } else {
 /* Compute a data block (or P) by xor-ing all the other blocks */
1532 count = 0;
1533 for (i = disks; i-- ; ) {
1534 if (i == target || i == qd_idx)
1535 continue;
1536 blocks[count++] = sh->dev[i].page;
1537 }
1538
1539 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1540 NULL, ops_complete_compute, sh,
1541 to_addr_conv(sh, percpu, 0));
1542 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1543 }
1544
1545 return tx;
1546}
1547
1548static struct dma_async_tx_descriptor *
1549ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1550{
1551 int i, count, disks = sh->disks;
1552 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1553 int d0_idx = raid6_d0(sh);
1554 int faila = -1, failb = -1;
1555 int target = sh->ops.target;
1556 int target2 = sh->ops.target2;
1557 struct r5dev *tgt = &sh->dev[target];
1558 struct r5dev *tgt2 = &sh->dev[target2];
1559 struct dma_async_tx_descriptor *tx;
1560 struct page **blocks = to_addr_page(percpu, 0);
1561 struct async_submit_ctl submit;
1562
1563 BUG_ON(sh->batch_head);
1564 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1565 __func__, (unsigned long long)sh->sector, target, target2);
1566 BUG_ON(target < 0 || target2 < 0);
1567 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1568 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1569
 /*
  * set_syndrome_sources() is open-coded here so that the slot numbers
  * of 'faila' and 'failb' can be recorded on the way.
  */
1573 for (i = 0; i < disks ; i++)
1574 blocks[i] = NULL;
1575 count = 0;
1576 i = d0_idx;
1577 do {
1578 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1579
1580 blocks[slot] = sh->dev[i].page;
1581
1582 if (i == target)
1583 faila = slot;
1584 if (i == target2)
1585 failb = slot;
1586 i = raid6_next_disk(i, disks);
1587 } while (i != d0_idx);
1588
1589 BUG_ON(faila == failb);
1590 if (failb < faila)
1591 swap(faila, failb);
1592 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1593 __func__, (unsigned long long)sh->sector, faila, failb);
1594
1595 atomic_inc(&sh->count);
1596
1597 if (failb == syndrome_disks+1) {
 /* Q is one of the missing disks */
1599 if (faila == syndrome_disks) {
 /* P and Q are both missing: just recompute the whole syndrome */
1601 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1602 ops_complete_compute, sh,
1603 to_addr_conv(sh, percpu, 0));
1604 return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1605 STRIPE_SIZE, &submit);
1606 } else {
1607 struct page *dest;
1608 int data_target;
1609 int qd_idx = sh->qd_idx;
1610
 /* Missing D+Q: recompute D from the remaining data and P, then recompute Q */
1612 if (target == qd_idx)
1613 data_target = target2;
1614 else
1615 data_target = target;
1616
1617 count = 0;
1618 for (i = disks; i-- ; ) {
1619 if (i == data_target || i == qd_idx)
1620 continue;
1621 blocks[count++] = sh->dev[i].page;
1622 }
1623 dest = sh->dev[data_target].page;
1624 init_async_submit(&submit,
1625 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1626 NULL, NULL, NULL,
1627 to_addr_conv(sh, percpu, 0));
1628 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1629 &submit);
1630
1631 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1632 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1633 ops_complete_compute, sh,
1634 to_addr_conv(sh, percpu, 0));
1635 return async_gen_syndrome(blocks, 0, count+2,
1636 STRIPE_SIZE, &submit);
1637 }
1638 } else {
1639 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1640 ops_complete_compute, sh,
1641 to_addr_conv(sh, percpu, 0));
1642 if (failb == syndrome_disks) {
 /* We're missing D+P. */
1644 return async_raid6_datap_recov(syndrome_disks+2,
1645 STRIPE_SIZE, faila,
1646 blocks, &submit);
1647 } else {
 /* We're missing D+D. */
1649 return async_raid6_2data_recov(syndrome_disks+2,
1650 STRIPE_SIZE, faila, failb,
1651 blocks, &submit);
1652 }
1653 }
1654}
1655
1656static void ops_complete_prexor(void *stripe_head_ref)
1657{
1658 struct stripe_head *sh = stripe_head_ref;
1659
1660 pr_debug("%s: stripe %llu\n", __func__,
1661 (unsigned long long)sh->sector);
1662
1663 if (r5c_is_writeback(sh->raid_conf->log))
 /*
  * raid5-cache write-back mode uses orig_page during prexor;
  * after prexor the extra page can be released
  */
1668 r5c_release_extra_page(sh);
1669}
1670
1671static struct dma_async_tx_descriptor *
1672ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1673 struct dma_async_tx_descriptor *tx)
1674{
1675 int disks = sh->disks;
1676 struct page **xor_srcs = to_addr_page(percpu, 0);
1677 int count = 0, pd_idx = sh->pd_idx, i;
1678 struct async_submit_ctl submit;
1679
 /* the existing parity is both source and destination, so the old data is xor'd out */
1681 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1682
1683 BUG_ON(sh->batch_head);
1684 pr_debug("%s: stripe %llu\n", __func__,
1685 (unsigned long long)sh->sector);
1686
1687 for (i = disks; i--; ) {
1688 struct r5dev *dev = &sh->dev[i];
1689
1690 if (test_bit(R5_InJournal, &dev->flags))
1691 xor_srcs[count++] = dev->orig_page;
1692 else if (test_bit(R5_Wantdrain, &dev->flags))
1693 xor_srcs[count++] = dev->page;
1694 }
1695
1696 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1697 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1698 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1699
1700 return tx;
1701}
1702
1703static struct dma_async_tx_descriptor *
1704ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1705 struct dma_async_tx_descriptor *tx)
1706{
1707 struct page **blocks = to_addr_page(percpu, 0);
1708 int count;
1709 struct async_submit_ctl submit;
1710
1711 pr_debug("%s: stripe %llu\n", __func__,
1712 (unsigned long long)sh->sector);
1713
1714 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1715
1716 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1717 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1718 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1719
1720 return tx;
1721}
1722
1723static struct dma_async_tx_descriptor *
1724ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1725{
1726 struct r5conf *conf = sh->raid_conf;
1727 int disks = sh->disks;
1728 int i;
1729 struct stripe_head *head_sh = sh;
1730
1731 pr_debug("%s: stripe %llu\n", __func__,
1732 (unsigned long long)sh->sector);
1733
1734 for (i = disks; i--; ) {
1735 struct r5dev *dev;
1736 struct bio *chosen;
1737
1738 sh = head_sh;
1739 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1740 struct bio *wbi;
1741
1742again:
1743 dev = &sh->dev[i];
 /*
  * clear R5_InJournal so that a page being rewritten is not
  * skipped by r5l_log_stripe()
  */
1748 clear_bit(R5_InJournal, &dev->flags);
1749 spin_lock_irq(&sh->stripe_lock);
1750 chosen = dev->towrite;
1751 dev->towrite = NULL;
1752 sh->overwrite_disks = 0;
1753 BUG_ON(dev->written);
1754 wbi = dev->written = chosen;
1755 spin_unlock_irq(&sh->stripe_lock);
1756 WARN_ON(dev->page != dev->orig_page);
1757
1758 while (wbi && wbi->bi_iter.bi_sector <
1759 dev->sector + STRIPE_SECTORS) {
1760 if (wbi->bi_opf & REQ_FUA)
1761 set_bit(R5_WantFUA, &dev->flags);
1762 if (wbi->bi_opf & REQ_SYNC)
1763 set_bit(R5_SyncIO, &dev->flags);
1764 if (bio_op(wbi) == REQ_OP_DISCARD)
1765 set_bit(R5_Discard, &dev->flags);
1766 else {
1767 tx = async_copy_data(1, wbi, &dev->page,
1768 dev->sector, tx, sh,
1769 r5c_is_writeback(conf->log));
1770 if (dev->page != dev->orig_page &&
1771 !r5c_is_writeback(conf->log)) {
1772 set_bit(R5_SkipCopy, &dev->flags);
1773 clear_bit(R5_UPTODATE, &dev->flags);
1774 clear_bit(R5_OVERWRITE, &dev->flags);
1775 }
1776 }
1777 wbi = r5_next_bio(wbi, dev->sector);
1778 }
1779
1780 if (head_sh->batch_head) {
1781 sh = list_first_entry(&sh->batch_list,
1782 struct stripe_head,
1783 batch_list);
1784 if (sh == head_sh)
1785 continue;
1786 goto again;
1787 }
1788 }
1789 }
1790
1791 return tx;
1792}
1793
1794static void ops_complete_reconstruct(void *stripe_head_ref)
1795{
1796 struct stripe_head *sh = stripe_head_ref;
1797 int disks = sh->disks;
1798 int pd_idx = sh->pd_idx;
1799 int qd_idx = sh->qd_idx;
1800 int i;
1801 bool fua = false, sync = false, discard = false;
1802
1803 pr_debug("%s: stripe %llu\n", __func__,
1804 (unsigned long long)sh->sector);
1805
1806 for (i = disks; i--; ) {
1807 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1808 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1809 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1810 }
1811
1812 for (i = disks; i--; ) {
1813 struct r5dev *dev = &sh->dev[i];
1814
1815 if (dev->written || i == pd_idx || i == qd_idx) {
1816 if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
1817 set_bit(R5_UPTODATE, &dev->flags);
1818 if (fua)
1819 set_bit(R5_WantFUA, &dev->flags);
1820 if (sync)
1821 set_bit(R5_SyncIO, &dev->flags);
1822 }
1823 }
1824
1825 if (sh->reconstruct_state == reconstruct_state_drain_run)
1826 sh->reconstruct_state = reconstruct_state_drain_result;
1827 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1828 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1829 else {
1830 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1831 sh->reconstruct_state = reconstruct_state_result;
1832 }
1833
1834 set_bit(STRIPE_HANDLE, &sh->state);
1835 raid5_release_stripe(sh);
1836}
1837
1838static void
1839ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1840 struct dma_async_tx_descriptor *tx)
1841{
1842 int disks = sh->disks;
1843 struct page **xor_srcs;
1844 struct async_submit_ctl submit;
1845 int count, pd_idx = sh->pd_idx, i;
1846 struct page *xor_dest;
1847 int prexor = 0;
1848 unsigned long flags;
1849 int j = 0;
1850 struct stripe_head *head_sh = sh;
1851 int last_stripe;
1852
1853 pr_debug("%s: stripe %llu\n", __func__,
1854 (unsigned long long)sh->sector);
1855
1856 for (i = 0; i < sh->disks; i++) {
1857 if (pd_idx == i)
1858 continue;
1859 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1860 break;
1861 }
1862 if (i >= sh->disks) {
1863 atomic_inc(&sh->count);
1864 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1865 ops_complete_reconstruct(sh);
1866 return;
1867 }
1868again:
1869 count = 0;
1870 xor_srcs = to_addr_page(percpu, j);
1871
 /*
  * If prexor ran, the old data was already subtracted from the parity
  * page, so only the written (or journalled) blocks are xor'd in.
  */
1874 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1875 prexor = 1;
1876 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1877 for (i = disks; i--; ) {
1878 struct r5dev *dev = &sh->dev[i];
1879 if (head_sh->dev[i].written ||
1880 test_bit(R5_InJournal, &head_sh->dev[i].flags))
1881 xor_srcs[count++] = dev->page;
1882 }
1883 } else {
1884 xor_dest = sh->dev[pd_idx].page;
1885 for (i = disks; i--; ) {
1886 struct r5dev *dev = &sh->dev[i];
1887 if (i != pd_idx)
1888 xor_srcs[count++] = dev->page;
1889 }
1890 }
1891
 /*
  * 1/ if we prexor'd then the destination is reused as a source and
  *    must not be zeroed (ASYNC_TX_XOR_DROP_DST);
  * 2/ if we did not prexor then parity is recomputed from scratch and
  *    the destination must be zeroed first (ASYNC_TX_XOR_ZERO_DST).
  */
1897 last_stripe = !head_sh->batch_head ||
1898 list_first_entry(&sh->batch_list,
1899 struct stripe_head, batch_list) == head_sh;
1900 if (last_stripe) {
1901 flags = ASYNC_TX_ACK |
1902 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1903
1904 atomic_inc(&head_sh->count);
1905 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
1906 to_addr_conv(sh, percpu, j));
1907 } else {
1908 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
1909 init_async_submit(&submit, flags, tx, NULL, NULL,
1910 to_addr_conv(sh, percpu, j));
1911 }
1912
1913 if (unlikely(count == 1))
1914 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1915 else
1916 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1917 if (!last_stripe) {
1918 j++;
1919 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1920 batch_list);
1921 goto again;
1922 }
1923}
1924
1925static void
1926ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1927 struct dma_async_tx_descriptor *tx)
1928{
1929 struct async_submit_ctl submit;
1930 struct page **blocks;
1931 int count, i, j = 0;
1932 struct stripe_head *head_sh = sh;
1933 int last_stripe;
1934 int synflags;
1935 unsigned long txflags;
1936
1937 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1938
1939 for (i = 0; i < sh->disks; i++) {
1940 if (sh->pd_idx == i || sh->qd_idx == i)
1941 continue;
1942 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1943 break;
1944 }
1945 if (i >= sh->disks) {
1946 atomic_inc(&sh->count);
1947 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1948 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1949 ops_complete_reconstruct(sh);
1950 return;
1951 }
1952
1953again:
1954 blocks = to_addr_page(percpu, j);
1955
1956 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1957 synflags = SYNDROME_SRC_WRITTEN;
1958 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
1959 } else {
1960 synflags = SYNDROME_SRC_ALL;
1961 txflags = ASYNC_TX_ACK;
1962 }
1963
1964 count = set_syndrome_sources(blocks, sh, synflags);
1965 last_stripe = !head_sh->batch_head ||
1966 list_first_entry(&sh->batch_list,
1967 struct stripe_head, batch_list) == head_sh;
1968
1969 if (last_stripe) {
1970 atomic_inc(&head_sh->count);
1971 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
1972 head_sh, to_addr_conv(sh, percpu, j));
1973 } else
1974 init_async_submit(&submit, 0, tx, NULL, NULL,
1975 to_addr_conv(sh, percpu, j));
1976 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1977 if (!last_stripe) {
1978 j++;
1979 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1980 batch_list);
1981 goto again;
1982 }
1983}
1984
1985static void ops_complete_check(void *stripe_head_ref)
1986{
1987 struct stripe_head *sh = stripe_head_ref;
1988
1989 pr_debug("%s: stripe %llu\n", __func__,
1990 (unsigned long long)sh->sector);
1991
1992 sh->check_state = check_state_check_result;
1993 set_bit(STRIPE_HANDLE, &sh->state);
1994 raid5_release_stripe(sh);
1995}
1996
1997static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1998{
1999 int disks = sh->disks;
2000 int pd_idx = sh->pd_idx;
2001 int qd_idx = sh->qd_idx;
2002 struct page *xor_dest;
2003 struct page **xor_srcs = to_addr_page(percpu, 0);
2004 struct dma_async_tx_descriptor *tx;
2005 struct async_submit_ctl submit;
2006 int count;
2007 int i;
2008
2009 pr_debug("%s: stripe %llu\n", __func__,
2010 (unsigned long long)sh->sector);
2011
2012 BUG_ON(sh->batch_head);
2013 count = 0;
2014 xor_dest = sh->dev[pd_idx].page;
2015 xor_srcs[count++] = xor_dest;
2016 for (i = disks; i--; ) {
2017 if (i == pd_idx || i == qd_idx)
2018 continue;
2019 xor_srcs[count++] = sh->dev[i].page;
2020 }
2021
2022 init_async_submit(&submit, 0, NULL, NULL, NULL,
2023 to_addr_conv(sh, percpu, 0));
2024 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
2025 &sh->ops.zero_sum_result, &submit);
2026
2027 atomic_inc(&sh->count);
2028 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2029 tx = async_trigger_callback(&submit);
2030}
2031
2032static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2033{
2034 struct page **srcs = to_addr_page(percpu, 0);
2035 struct async_submit_ctl submit;
2036 int count;
2037
2038 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2039 (unsigned long long)sh->sector, checkp);
2040
2041 BUG_ON(sh->batch_head);
2042 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
2043 if (!checkp)
2044 srcs[count] = NULL;
2045
2046 atomic_inc(&sh->count);
2047 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2048 sh, to_addr_conv(sh, percpu, 0));
2049 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
2050 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
2051}
2052
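/*
 * Run the raid operations requested for this stripe (biofill, compute,
 * prexor, biodrain, reconstruct, check) using this CPU's percpu scribble
 * buffers.
 */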
2053static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2054{
2055 int overlap_clear = 0, i, disks = sh->disks;
2056 struct dma_async_tx_descriptor *tx = NULL;
2057 struct r5conf *conf = sh->raid_conf;
2058 int level = conf->level;
2059 struct raid5_percpu *percpu;
2060 unsigned long cpu;
2061
2062 cpu = get_cpu();
2063 percpu = per_cpu_ptr(conf->percpu, cpu);
2064 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2065 ops_run_biofill(sh);
2066 overlap_clear++;
2067 }
2068
2069 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2070 if (level < 6)
2071 tx = ops_run_compute5(sh, percpu);
2072 else {
2073 if (sh->ops.target2 < 0 || sh->ops.target < 0)
2074 tx = ops_run_compute6_1(sh, percpu);
2075 else
2076 tx = ops_run_compute6_2(sh, percpu);
2077 }
2078
2079 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2080 async_tx_ack(tx);
2081 }
2082
2083 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2084 if (level < 6)
2085 tx = ops_run_prexor5(sh, percpu, tx);
2086 else
2087 tx = ops_run_prexor6(sh, percpu, tx);
2088 }
2089
2090 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2091 tx = ops_run_partial_parity(sh, percpu, tx);
2092
2093 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2094 tx = ops_run_biodrain(sh, tx);
2095 overlap_clear++;
2096 }
2097
2098 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2099 if (level < 6)
2100 ops_run_reconstruct5(sh, percpu, tx);
2101 else
2102 ops_run_reconstruct6(sh, percpu, tx);
2103 }
2104
2105 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2106 if (sh->check_state == check_state_run)
2107 ops_run_check_p(sh, percpu);
2108 else if (sh->check_state == check_state_run_q)
2109 ops_run_check_pq(sh, percpu, 0);
2110 else if (sh->check_state == check_state_run_pq)
2111 ops_run_check_pq(sh, percpu, 1);
2112 else
2113 BUG();
2114 }
2115
2116 if (overlap_clear && !sh->batch_head)
2117 for (i = disks; i--; ) {
2118 struct r5dev *dev = &sh->dev[i];
2119 if (test_and_clear_bit(R5_Overlap, &dev->flags))
2120 wake_up(&sh->raid_conf->wait_for_overlap);
2121 }
2122 put_cpu();
2123}
2124
2125static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2126{
2127 if (sh->ppl_page)
2128 __free_page(sh->ppl_page);
2129 kmem_cache_free(sc, sh);
2130}
2131
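/*
 * alloc_stripe - allocate and initialise a stripe_head: locks, lists,
 * per-device request bios and, when PPL is enabled, the partial parity page.
 */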
2132static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2133 int disks, struct r5conf *conf)
2134{
2135 struct stripe_head *sh;
2136 int i;
2137
2138 sh = kmem_cache_zalloc(sc, gfp);
2139 if (sh) {
2140 spin_lock_init(&sh->stripe_lock);
2141 spin_lock_init(&sh->batch_lock);
2142 INIT_LIST_HEAD(&sh->batch_list);
2143 INIT_LIST_HEAD(&sh->lru);
2144 INIT_LIST_HEAD(&sh->r5c);
2145 INIT_LIST_HEAD(&sh->log_list);
2146 atomic_set(&sh->count, 1);
2147 sh->raid_conf = conf;
2148 sh->log_start = MaxSector;
2149 for (i = 0; i < disks; i++) {
2150 struct r5dev *dev = &sh->dev[i];
2151
2152 bio_init(&dev->req, &dev->vec, 1);
2153 bio_init(&dev->rreq, &dev->rvec, 1);
2154 }
2155
2156 if (raid5_has_ppl(conf)) {
2157 sh->ppl_page = alloc_page(gfp);
2158 if (!sh->ppl_page) {
2159 free_stripe(sc, sh);
2160 sh = NULL;
2161 }
2162 }
2163 }
2164 return sh;
2165}
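
/*
 * grow_one_stripe - add one stripe_head (with its buffers) to the stripe
 * cache and account for it in conf->max_nr_stripes.
 */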
2166static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2167{
2168 struct stripe_head *sh;
2169
2170 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2171 if (!sh)
2172 return 0;
2173
2174 if (grow_buffers(sh, gfp)) {
2175 shrink_buffers(sh);
2176 free_stripe(conf->slab_cache, sh);
2177 return 0;
2178 }
2179 sh->hash_lock_index =
2180 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2181
2182 atomic_inc(&conf->active_stripes);
2183
2184 raid5_release_stripe(sh);
2185 conf->max_nr_stripes++;
2186 return 1;
2187}
2188
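/*
 * grow_stripes - create the stripe_head slab cache for this array and
 * populate it with 'num' stripes.
 */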
2189static int grow_stripes(struct r5conf *conf, int num)
2190{
2191 struct kmem_cache *sc;
2192 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2193
2194 if (conf->mddev->gendisk)
2195 sprintf(conf->cache_name[0],
2196 "raid%d-%s", conf->level, mdname(conf->mddev));
2197 else
2198 sprintf(conf->cache_name[0],
2199 "raid%d-%p", conf->level, conf->mddev);
2200 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
2201
2202 conf->active_name = 0;
2203 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2204 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2205 0, 0, NULL);
2206 if (!sc)
2207 return 1;
2208 conf->slab_cache = sc;
2209 conf->pool_size = devs;
2210 while (num--)
2211 if (!grow_one_stripe(conf, GFP_KERNEL))
2212 return 1;
2213
2214 return 0;
2215}

/*
 * Per-cpu "scribble" space used by the asynchronous xor/syndrome paths.
 * Each element must hold a struct page pointer and an addr_conv_t for
 * every device in the array, plus two extra slots for the P/Q
 * destination buffers used in the ddf/raid6 case.
 */
2230static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2231{
2232 struct flex_array *ret;
2233 size_t len;
2234
2235 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2236 ret = flex_array_alloc(len, cnt, flags);
2237 if (!ret)
2238 return NULL;
2239
2240 if (flex_array_prealloc(ret, 0, cnt, flags)) {
2241 flex_array_free(ret);
2242 return NULL;
2243 }
2244 return ret;
2245}
2246
2247static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2248{
2249 unsigned long cpu;
2250 int err = 0;
2251
	/*
	 * Never shrink.  And mddev_suspend() could deadlock if this is
	 * called from raid5d.  In that case scribble_disks and
	 * scribble_sectors should already cover new_disks and new_sectors,
	 * so we return before suspending the array.
	 */
2257 if (conf->scribble_disks >= new_disks &&
2258 conf->scribble_sectors >= new_sectors)
2259 return 0;
2260 mddev_suspend(conf->mddev);
2261 get_online_cpus();
2262 for_each_present_cpu(cpu) {
2263 struct raid5_percpu *percpu;
2264 struct flex_array *scribble;
2265
2266 percpu = per_cpu_ptr(conf->percpu, cpu);
2267 scribble = scribble_alloc(new_disks,
2268 new_sectors / STRIPE_SECTORS,
2269 GFP_NOIO);
2270
2271 if (scribble) {
2272 flex_array_free(percpu->scribble);
2273 percpu->scribble = scribble;
2274 } else {
2275 err = -ENOMEM;
2276 break;
2277 }
2278 }
2279 put_online_cpus();
2280 mddev_resume(conf->mddev);
2281 if (!err) {
2282 conf->scribble_disks = new_disks;
2283 conf->scribble_sectors = new_sectors;
2284 }
2285 return err;
2286}
2287
2288static int resize_stripes(struct r5conf *conf, int newsize)
2289{
	/*
	 * Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink conf->disks back
	 *    to the old size, as it almost certainly will fail to be properly
	 *    set up by the next call to this function.
	 */
2313 struct stripe_head *osh, *nsh;
2314 LIST_HEAD(newstripes);
2315 struct disk_info *ndisks;
2316 int err = 0;
2317 struct kmem_cache *sc;
2318 int i;
2319 int hash, cnt;
2320
2321 md_allow_write(conf->mddev);
2322
2323
2324 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2325 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2326 0, 0, NULL);
2327 if (!sc)
2328 return -ENOMEM;
2329
2330
2331 mutex_lock(&conf->cache_size_mutex);
2332
2333 for (i = conf->max_nr_stripes; i; i--) {
2334 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2335 if (!nsh)
2336 break;
2337
2338 list_add(&nsh->lru, &newstripes);
2339 }
2340 if (i) {
2341
2342 while (!list_empty(&newstripes)) {
2343 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2344 list_del(&nsh->lru);
2345 free_stripe(sc, nsh);
2346 }
2347 kmem_cache_destroy(sc);
2348 mutex_unlock(&conf->cache_size_mutex);
2349 return -ENOMEM;
2350 }
2351
2352
2353
2354
2355 hash = 0;
2356 cnt = 0;
2357 list_for_each_entry(nsh, &newstripes, lru) {
2358 lock_device_hash_lock(conf, hash);
2359 wait_event_cmd(conf->wait_for_stripe,
2360 !list_empty(conf->inactive_list + hash),
2361 unlock_device_hash_lock(conf, hash),
2362 lock_device_hash_lock(conf, hash));
2363 osh = get_free_stripe(conf, hash);
2364 unlock_device_hash_lock(conf, hash);
2365
2366 for(i=0; i<conf->pool_size; i++) {
2367 nsh->dev[i].page = osh->dev[i].page;
2368 nsh->dev[i].orig_page = osh->dev[i].page;
2369 }
2370 nsh->hash_lock_index = hash;
2371 free_stripe(conf->slab_cache, osh);
2372 cnt++;
2373 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2374 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2375 hash++;
2376 cnt = 0;
2377 }
2378 }
2379 kmem_cache_destroy(conf->slab_cache);
2380
2381
2382
2383
2384
2385
2386 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
2387 if (ndisks) {
2388 for (i = 0; i < conf->pool_size; i++)
2389 ndisks[i] = conf->disks[i];
2390
2391 for (i = conf->pool_size; i < newsize; i++) {
2392 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2393 if (!ndisks[i].extra_page)
2394 err = -ENOMEM;
2395 }
2396
2397 if (err) {
2398 for (i = conf->pool_size; i < newsize; i++)
2399 if (ndisks[i].extra_page)
2400 put_page(ndisks[i].extra_page);
2401 kfree(ndisks);
2402 } else {
2403 kfree(conf->disks);
2404 conf->disks = ndisks;
2405 }
2406 } else
2407 err = -ENOMEM;
2408
2409 mutex_unlock(&conf->cache_size_mutex);
2410
2411 conf->slab_cache = sc;
2412 conf->active_name = 1-conf->active_name;
2413
2414
2415 while(!list_empty(&newstripes)) {
2416 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2417 list_del_init(&nsh->lru);
2418
2419 for (i=conf->raid_disks; i < newsize; i++)
2420 if (nsh->dev[i].page == NULL) {
2421 struct page *p = alloc_page(GFP_NOIO);
2422 nsh->dev[i].page = p;
2423 nsh->dev[i].orig_page = p;
2424 if (!p)
2425 err = -ENOMEM;
2426 }
2427 raid5_release_stripe(nsh);
2428 }
2429
2430
2431 if (!err)
2432 conf->pool_size = newsize;
2433 return err;
2434}
2435
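/*
 * drop_one_stripe - remove one inactive stripe_head from the cache and free
 * its buffers; returns 0 if no inactive stripe was available.
 */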
2436static int drop_one_stripe(struct r5conf *conf)
2437{
2438 struct stripe_head *sh;
2439 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2440
2441 spin_lock_irq(conf->hash_locks + hash);
2442 sh = get_free_stripe(conf, hash);
2443 spin_unlock_irq(conf->hash_locks + hash);
2444 if (!sh)
2445 return 0;
2446 BUG_ON(atomic_read(&sh->count));
2447 shrink_buffers(sh);
2448 free_stripe(conf->slab_cache, sh);
2449 atomic_dec(&conf->active_stripes);
2450 conf->max_nr_stripes--;
2451 return 1;
2452}
2453
2454static void shrink_stripes(struct r5conf *conf)
2455{
2456 while (conf->max_nr_stripes &&
2457 drop_one_stripe(conf))
2458 ;
2459
2460 kmem_cache_destroy(conf->slab_cache);
2461 conf->slab_cache = NULL;
2462}
2463
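/*
 * raid5_end_read_request - completion handler for reads issued by the stripe
 * cache: mark the block uptodate, account corrected read errors, or schedule
 * a retry / fail the device on persistent errors.
 */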
2464static void raid5_end_read_request(struct bio * bi)
2465{
2466 struct stripe_head *sh = bi->bi_private;
2467 struct r5conf *conf = sh->raid_conf;
2468 int disks = sh->disks, i;
2469 char b[BDEVNAME_SIZE];
2470 struct md_rdev *rdev = NULL;
2471 sector_t s;
2472
2473 for (i=0 ; i<disks; i++)
2474 if (bi == &sh->dev[i].req)
2475 break;
2476
2477 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2478 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2479 bi->bi_error);
2480 if (i == disks) {
2481 bio_reset(bi);
2482 BUG();
2483 return;
2484 }
2485 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
		/*
		 * If replacement finished while this request was outstanding,
		 * 'replacement' might be NULL already.
		 * In that case it moved down to 'rdev'.
		 * rdev is not removed until all requests are finished.
		 */
2491 rdev = conf->disks[i].replacement;
2492 if (!rdev)
2493 rdev = conf->disks[i].rdev;
2494
2495 if (use_new_offset(conf, sh))
2496 s = sh->sector + rdev->new_data_offset;
2497 else
2498 s = sh->sector + rdev->data_offset;
2499 if (!bi->bi_error) {
2500 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2501 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			/*
			 * Note that this cannot happen on a replacement
			 * device.  We just fail those on any error.
			 */
2506 pr_info_ratelimited(
2507 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2508 mdname(conf->mddev), STRIPE_SECTORS,
2509 (unsigned long long)s,
2510 bdevname(rdev->bdev, b));
2511 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2512 clear_bit(R5_ReadError, &sh->dev[i].flags);
2513 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2514 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2515 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2516
2517 if (test_bit(R5_InJournal, &sh->dev[i].flags))
			/*
			 * end read for a page in journal, this must be
			 * preparing for prexor in rmw; the original page
			 * content is now known to be up to date.
			 */
2522 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2523
2524 if (atomic_read(&rdev->read_errors))
2525 atomic_set(&rdev->read_errors, 0);
2526 } else {
2527 const char *bdn = bdevname(rdev->bdev, b);
2528 int retry = 0;
2529 int set_bad = 0;
2530
2531 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2532 atomic_inc(&rdev->read_errors);
2533 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2534 pr_warn_ratelimited(
2535 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2536 mdname(conf->mddev),
2537 (unsigned long long)s,
2538 bdn);
2539 else if (conf->mddev->degraded >= conf->max_degraded) {
2540 set_bad = 1;
2541 pr_warn_ratelimited(
2542 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2543 mdname(conf->mddev),
2544 (unsigned long long)s,
2545 bdn);
2546 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2547
2548 set_bad = 1;
2549 pr_warn_ratelimited(
2550 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2551 mdname(conf->mddev),
2552 (unsigned long long)s,
2553 bdn);
2554 } else if (atomic_read(&rdev->read_errors)
2555 > conf->max_nr_stripes)
2556 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2557 mdname(conf->mddev), bdn);
2558 else
2559 retry = 1;
2560 if (set_bad && test_bit(In_sync, &rdev->flags)
2561 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2562 retry = 1;
2563 if (retry)
2564 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2565 set_bit(R5_ReadError, &sh->dev[i].flags);
2566 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2567 } else
2568 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2569 else {
2570 clear_bit(R5_ReadError, &sh->dev[i].flags);
2571 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2572 if (!(set_bad
2573 && test_bit(In_sync, &rdev->flags)
2574 && rdev_set_badblocks(
2575 rdev, sh->sector, STRIPE_SECTORS, 0)))
2576 md_error(conf->mddev, rdev);
2577 }
2578 }
2579 rdev_dec_pending(rdev, conf->mddev);
2580 bio_reset(bi);
2581 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2582 set_bit(STRIPE_HANDLE, &sh->state);
2583 raid5_release_stripe(sh);
2584}
2585
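/*
 * raid5_end_write_request - completion handler for writes to either the main
 * device or its replacement: record write errors and "made good" bad-block
 * events, then hand the stripe back to the state machine.
 */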
2586static void raid5_end_write_request(struct bio *bi)
2587{
2588 struct stripe_head *sh = bi->bi_private;
2589 struct r5conf *conf = sh->raid_conf;
2590 int disks = sh->disks, i;
2591 struct md_rdev *uninitialized_var(rdev);
2592 sector_t first_bad;
2593 int bad_sectors;
2594 int replacement = 0;
2595
2596 for (i = 0 ; i < disks; i++) {
2597 if (bi == &sh->dev[i].req) {
2598 rdev = conf->disks[i].rdev;
2599 break;
2600 }
2601 if (bi == &sh->dev[i].rreq) {
2602 rdev = conf->disks[i].replacement;
2603 if (rdev)
2604 replacement = 1;
2605 else
2606
2607
2608
2609
2610 rdev = conf->disks[i].rdev;
2611 break;
2612 }
2613 }
2614 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2615 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2616 bi->bi_error);
2617 if (i == disks) {
2618 bio_reset(bi);
2619 BUG();
2620 return;
2621 }
2622
2623 if (replacement) {
2624 if (bi->bi_error)
2625 md_error(conf->mddev, rdev);
2626 else if (is_badblock(rdev, sh->sector,
2627 STRIPE_SECTORS,
2628 &first_bad, &bad_sectors))
2629 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2630 } else {
2631 if (bi->bi_error) {
2632 set_bit(STRIPE_DEGRADED, &sh->state);
2633 set_bit(WriteErrorSeen, &rdev->flags);
2634 set_bit(R5_WriteError, &sh->dev[i].flags);
2635 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2636 set_bit(MD_RECOVERY_NEEDED,
2637 &rdev->mddev->recovery);
2638 } else if (is_badblock(rdev, sh->sector,
2639 STRIPE_SECTORS,
2640 &first_bad, &bad_sectors)) {
2641 set_bit(R5_MadeGood, &sh->dev[i].flags);
2642 if (test_bit(R5_ReadError, &sh->dev[i].flags))
				/*
				 * That was a successful write so make
				 * sure it looks like we already did
				 * a re-write.
				 */
2647 set_bit(R5_ReWrite, &sh->dev[i].flags);
2648 }
2649 }
2650 rdev_dec_pending(rdev, conf->mddev);
2651
2652 if (sh->batch_head && bi->bi_error && !replacement)
2653 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2654
2655 bio_reset(bi);
2656 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2657 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2658 set_bit(STRIPE_HANDLE, &sh->state);
2659 raid5_release_stripe(sh);
2660
2661 if (sh->batch_head && sh != sh->batch_head)
2662 raid5_release_stripe(sh->batch_head);
2663}
2664
2665static void raid5_build_block(struct stripe_head *sh, int i, int previous)
2666{
2667 struct r5dev *dev = &sh->dev[i];
2668
2669 dev->flags = 0;
2670 dev->sector = raid5_compute_blocknr(sh, i, previous);
2671}
2672
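/*
 * raid5_error - handle a reported device failure: mark the rdev Faulty and
 * Blocked, update the degraded count and superblock change flags, and notify
 * the raid5-cache code of the failure.
 */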
2673static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2674{
2675 char b[BDEVNAME_SIZE];
2676 struct r5conf *conf = mddev->private;
2677 unsigned long flags;
2678 pr_debug("raid456: error called\n");
2679
2680 spin_lock_irqsave(&conf->device_lock, flags);
2681 clear_bit(In_sync, &rdev->flags);
2682 mddev->degraded = raid5_calc_degraded(conf);
2683 spin_unlock_irqrestore(&conf->device_lock, flags);
2684 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2685
2686 set_bit(Blocked, &rdev->flags);
2687 set_bit(Faulty, &rdev->flags);
2688 set_mask_bits(&mddev->sb_flags, 0,
2689 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2690 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2691 "md/raid:%s: Operation continuing on %d devices.\n",
2692 mdname(mddev),
2693 bdevname(rdev->bdev, b),
2694 mdname(mddev),
2695 conf->raid_disks - mddev->degraded);
2696 r5c_update_on_rdev_error(mddev, rdev);
2697}
2698
/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
2703sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2704 int previous, int *dd_idx,
2705 struct stripe_head *sh)
2706{
2707 sector_t stripe, stripe2;
2708 sector_t chunk_number;
2709 unsigned int chunk_offset;
2710 int pd_idx, qd_idx;
2711 int ddf_layout = 0;
2712 sector_t new_sector;
2713 int algorithm = previous ? conf->prev_algo
2714 : conf->algorithm;
2715 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2716 : conf->chunk_sectors;
2717 int raid_disks = previous ? conf->previous_raid_disks
2718 : conf->raid_disks;
2719 int data_disks = raid_disks - conf->max_degraded;
2720
	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
2726 chunk_offset = sector_div(r_sector, sectors_per_chunk);
2727 chunk_number = r_sector;
2728
2729
2730
2731
2732 stripe = chunk_number;
2733 *dd_idx = sector_div(stripe, data_disks);
2734 stripe2 = stripe;
2735
	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
2738 pd_idx = qd_idx = -1;
2739 switch(conf->level) {
2740 case 4:
2741 pd_idx = data_disks;
2742 break;
2743 case 5:
2744 switch (algorithm) {
2745 case ALGORITHM_LEFT_ASYMMETRIC:
2746 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2747 if (*dd_idx >= pd_idx)
2748 (*dd_idx)++;
2749 break;
2750 case ALGORITHM_RIGHT_ASYMMETRIC:
2751 pd_idx = sector_div(stripe2, raid_disks);
2752 if (*dd_idx >= pd_idx)
2753 (*dd_idx)++;
2754 break;
2755 case ALGORITHM_LEFT_SYMMETRIC:
2756 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2757 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2758 break;
2759 case ALGORITHM_RIGHT_SYMMETRIC:
2760 pd_idx = sector_div(stripe2, raid_disks);
2761 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2762 break;
2763 case ALGORITHM_PARITY_0:
2764 pd_idx = 0;
2765 (*dd_idx)++;
2766 break;
2767 case ALGORITHM_PARITY_N:
2768 pd_idx = data_disks;
2769 break;
2770 default:
2771 BUG();
2772 }
2773 break;
2774 case 6:
2775
2776 switch (algorithm) {
2777 case ALGORITHM_LEFT_ASYMMETRIC:
2778 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2779 qd_idx = pd_idx + 1;
2780 if (pd_idx == raid_disks-1) {
2781 (*dd_idx)++;
2782 qd_idx = 0;
2783 } else if (*dd_idx >= pd_idx)
2784 (*dd_idx) += 2;
2785 break;
2786 case ALGORITHM_RIGHT_ASYMMETRIC:
2787 pd_idx = sector_div(stripe2, raid_disks);
2788 qd_idx = pd_idx + 1;
2789 if (pd_idx == raid_disks-1) {
2790 (*dd_idx)++;
2791 qd_idx = 0;
2792 } else if (*dd_idx >= pd_idx)
2793 (*dd_idx) += 2;
2794 break;
2795 case ALGORITHM_LEFT_SYMMETRIC:
2796 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2797 qd_idx = (pd_idx + 1) % raid_disks;
2798 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2799 break;
2800 case ALGORITHM_RIGHT_SYMMETRIC:
2801 pd_idx = sector_div(stripe2, raid_disks);
2802 qd_idx = (pd_idx + 1) % raid_disks;
2803 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2804 break;
2805
2806 case ALGORITHM_PARITY_0:
2807 pd_idx = 0;
2808 qd_idx = 1;
2809 (*dd_idx) += 2;
2810 break;
2811 case ALGORITHM_PARITY_N:
2812 pd_idx = data_disks;
2813 qd_idx = data_disks + 1;
2814 break;
2815
2816 case ALGORITHM_ROTATING_ZERO_RESTART:
2817
2818
2819
2820 pd_idx = sector_div(stripe2, raid_disks);
2821 qd_idx = pd_idx + 1;
2822 if (pd_idx == raid_disks-1) {
2823 (*dd_idx)++;
2824 qd_idx = 0;
2825 } else if (*dd_idx >= pd_idx)
2826 (*dd_idx) += 2;
2827 ddf_layout = 1;
2828 break;
2829
2830 case ALGORITHM_ROTATING_N_RESTART:
2831
2832
2833
2834
2835 stripe2 += 1;
2836 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2837 qd_idx = pd_idx + 1;
2838 if (pd_idx == raid_disks-1) {
2839 (*dd_idx)++;
2840 qd_idx = 0;
2841 } else if (*dd_idx >= pd_idx)
2842 (*dd_idx) += 2;
2843 ddf_layout = 1;
2844 break;
2845
2846 case ALGORITHM_ROTATING_N_CONTINUE:
2847
2848 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2849 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2850 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2851 ddf_layout = 1;
2852 break;
2853
2854 case ALGORITHM_LEFT_ASYMMETRIC_6:
2855
2856 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2857 if (*dd_idx >= pd_idx)
2858 (*dd_idx)++;
2859 qd_idx = raid_disks - 1;
2860 break;
2861
2862 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2863 pd_idx = sector_div(stripe2, raid_disks-1);
2864 if (*dd_idx >= pd_idx)
2865 (*dd_idx)++;
2866 qd_idx = raid_disks - 1;
2867 break;
2868
2869 case ALGORITHM_LEFT_SYMMETRIC_6:
2870 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2871 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2872 qd_idx = raid_disks - 1;
2873 break;
2874
2875 case ALGORITHM_RIGHT_SYMMETRIC_6:
2876 pd_idx = sector_div(stripe2, raid_disks-1);
2877 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2878 qd_idx = raid_disks - 1;
2879 break;
2880
2881 case ALGORITHM_PARITY_0_6:
2882 pd_idx = 0;
2883 (*dd_idx)++;
2884 qd_idx = raid_disks - 1;
2885 break;
2886
2887 default:
2888 BUG();
2889 }
2890 break;
2891 }
2892
2893 if (sh) {
2894 sh->pd_idx = pd_idx;
2895 sh->qd_idx = qd_idx;
2896 sh->ddf_layout = ddf_layout;
2897 }
2898
2899
2900
2901 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2902 return new_sector;
2903}
2904
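/*
 * raid5_compute_blocknr - the inverse of raid5_compute_sector: given a stripe
 * and a disk index, return the corresponding array (logical) sector, or 0 if
 * the index refers to a parity block or the mapping check fails.
 */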
2905sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2906{
2907 struct r5conf *conf = sh->raid_conf;
2908 int raid_disks = sh->disks;
2909 int data_disks = raid_disks - conf->max_degraded;
2910 sector_t new_sector = sh->sector, check;
2911 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2912 : conf->chunk_sectors;
2913 int algorithm = previous ? conf->prev_algo
2914 : conf->algorithm;
2915 sector_t stripe;
2916 int chunk_offset;
2917 sector_t chunk_number;
2918 int dummy1, dd_idx = i;
2919 sector_t r_sector;
2920 struct stripe_head sh2;
2921
2922 chunk_offset = sector_div(new_sector, sectors_per_chunk);
2923 stripe = new_sector;
2924
2925 if (i == sh->pd_idx)
2926 return 0;
2927 switch(conf->level) {
2928 case 4: break;
2929 case 5:
2930 switch (algorithm) {
2931 case ALGORITHM_LEFT_ASYMMETRIC:
2932 case ALGORITHM_RIGHT_ASYMMETRIC:
2933 if (i > sh->pd_idx)
2934 i--;
2935 break;
2936 case ALGORITHM_LEFT_SYMMETRIC:
2937 case ALGORITHM_RIGHT_SYMMETRIC:
2938 if (i < sh->pd_idx)
2939 i += raid_disks;
2940 i -= (sh->pd_idx + 1);
2941 break;
2942 case ALGORITHM_PARITY_0:
2943 i -= 1;
2944 break;
2945 case ALGORITHM_PARITY_N:
2946 break;
2947 default:
2948 BUG();
2949 }
2950 break;
2951 case 6:
2952 if (i == sh->qd_idx)
2953 return 0;
2954 switch (algorithm) {
2955 case ALGORITHM_LEFT_ASYMMETRIC:
2956 case ALGORITHM_RIGHT_ASYMMETRIC:
2957 case ALGORITHM_ROTATING_ZERO_RESTART:
2958 case ALGORITHM_ROTATING_N_RESTART:
2959 if (sh->pd_idx == raid_disks-1)
2960 i--;
2961 else if (i > sh->pd_idx)
2962 i -= 2;
2963 break;
2964 case ALGORITHM_LEFT_SYMMETRIC:
2965 case ALGORITHM_RIGHT_SYMMETRIC:
2966 if (sh->pd_idx == raid_disks-1)
2967 i--;
2968 else {
2969
2970 if (i < sh->pd_idx)
2971 i += raid_disks;
2972 i -= (sh->pd_idx + 2);
2973 }
2974 break;
2975 case ALGORITHM_PARITY_0:
2976 i -= 2;
2977 break;
2978 case ALGORITHM_PARITY_N:
2979 break;
2980 case ALGORITHM_ROTATING_N_CONTINUE:
2981
2982 if (sh->pd_idx == 0)
2983 i--;
2984 else {
2985
2986 if (i < sh->pd_idx)
2987 i += raid_disks;
2988 i -= (sh->pd_idx + 1);
2989 }
2990 break;
2991 case ALGORITHM_LEFT_ASYMMETRIC_6:
2992 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2993 if (i > sh->pd_idx)
2994 i--;
2995 break;
2996 case ALGORITHM_LEFT_SYMMETRIC_6:
2997 case ALGORITHM_RIGHT_SYMMETRIC_6:
2998 if (i < sh->pd_idx)
2999 i += data_disks + 1;
3000 i -= (sh->pd_idx + 1);
3001 break;
3002 case ALGORITHM_PARITY_0_6:
3003 i -= 1;
3004 break;
3005 default:
3006 BUG();
3007 }
3008 break;
3009 }
3010
3011 chunk_number = stripe * data_disks + i;
3012 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3013
3014 check = raid5_compute_sector(conf, r_sector,
3015 previous, &dummy1, &sh2);
3016 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3017 || sh2.qd_idx != sh->qd_idx) {
3018 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3019 mdname(conf->mddev));
3020 return 0;
3021 }
3022 return r_sector;
3023}
3024
/*
 * There are cases where we want handle_stripe_dirtying() and
 * schedule_reconstruction() to delay towrite to some dev of a stripe.
 *
 * This function checks whether we want to delay the towrite. Specifically,
 * we delay the towrite when:
 *
 *   1. degraded stripe has a non-overwrite to the missing dev, AND this
 *      stripe has data in journal (for other devices).
 *
 *      In this case, when reading data for the non-overwrite dev, it is
 *      necessary to handle complex rmw of write back cache (prexor with
 *      orig_page, and xor with page). To keep the read path simple, we
 *      would like to flush data in journal to RAID disks first, so complex
 *      rmw is handled in the write path (handle_stripe_dirtying).
 *
 *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
 *
 *      It is important to be able to flush all stripes in raid5-cache.
 *      Therefore we need to reserve some space on the journal device for
 *      these flushes.  When journal space is critical we avoid adding new
 *      dirty pages to cached stripes and delay the towrite.
 *
 *   3. during journal failure
 *
 *      In journal failure, we try to flush all cached data to raid disks
 *      based on data in stripe cache. The array is read-only to upper
 *      layers, so we would skip all pending writes.
 */
3063static inline bool delay_towrite(struct r5conf *conf,
3064 struct r5dev *dev,
3065 struct stripe_head_state *s)
3066{
3067
3068 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3069 !test_bit(R5_Insync, &dev->flags) && s->injournal)
3070 return true;
3071
3072 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3073 s->injournal > 0)
3074 return true;
3075
3076 if (s->log_failed && s->injournal)
3077 return true;
3078 return false;
3079}
3080
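/*
 * schedule_reconstruction - lock the blocks that will take part in a
 * reconstruct-write (rcw) or read-modify-write (prexor) cycle and set the
 * ops_request bits so raid_run_ops() performs the parity update.
 */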
3081static void
3082schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3083 int rcw, int expand)
3084{
3085 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3086 struct r5conf *conf = sh->raid_conf;
3087 int level = conf->level;
3088
3089 if (rcw) {
		/*
		 * In some cases, handle_stripe_dirtying initially decided to
		 * run rmw and allocated extra pages for prexor.  However, rcw
		 * turned out to be cheaper.  We need to free the extra pages
		 * here, because we won't be able to do that in
		 * ops_complete_prexor().
		 */
3096 r5c_release_extra_page(sh);
3097
3098 for (i = disks; i--; ) {
3099 struct r5dev *dev = &sh->dev[i];
3100
3101 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3102 set_bit(R5_LOCKED, &dev->flags);
3103 set_bit(R5_Wantdrain, &dev->flags);
3104 if (!expand)
3105 clear_bit(R5_UPTODATE, &dev->flags);
3106 s->locked++;
3107 } else if (test_bit(R5_InJournal, &dev->flags)) {
3108 set_bit(R5_LOCKED, &dev->flags);
3109 s->locked++;
3110 }
3111 }
3112
3113
3114
3115
3116 if (!expand) {
3117 if (!s->locked)
3118
3119 return;
3120 sh->reconstruct_state = reconstruct_state_drain_run;
3121 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3122 } else
3123 sh->reconstruct_state = reconstruct_state_run;
3124
3125 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3126
3127 if (s->locked + conf->max_degraded == disks)
3128 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3129 atomic_inc(&conf->pending_full_writes);
3130 } else {
3131 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3132 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3133 BUG_ON(level == 6 &&
3134 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3135 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3136
3137 for (i = disks; i--; ) {
3138 struct r5dev *dev = &sh->dev[i];
3139 if (i == pd_idx || i == qd_idx)
3140 continue;
3141
3142 if (dev->towrite &&
3143 (test_bit(R5_UPTODATE, &dev->flags) ||
3144 test_bit(R5_Wantcompute, &dev->flags))) {
3145 set_bit(R5_Wantdrain, &dev->flags);
3146 set_bit(R5_LOCKED, &dev->flags);
3147 clear_bit(R5_UPTODATE, &dev->flags);
3148 s->locked++;
3149 } else if (test_bit(R5_InJournal, &dev->flags)) {
3150 set_bit(R5_LOCKED, &dev->flags);
3151 s->locked++;
3152 }
3153 }
3154 if (!s->locked)
3155
3156 return;
3157 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3158 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3159 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3160 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3161 }
3162
	/*
	 * keep the parity disk(s) locked while asynchronous operations
	 * are in flight
	 */
3166 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3167 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3168 s->locked++;
3169
3170 if (level == 6) {
3171 int qd_idx = sh->qd_idx;
3172 struct r5dev *dev = &sh->dev[qd_idx];
3173
3174 set_bit(R5_LOCKED, &dev->flags);
3175 clear_bit(R5_UPTODATE, &dev->flags);
3176 s->locked++;
3177 }
3178
3179 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3180 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3181 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3182 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3183 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3184
3185 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3186 __func__, (unsigned long long)sh->sector,
3187 s->locked, s->ops_request);
3188}
3189
/*
 * Each stripe/dev can have one or more bios attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be in order.
 */
3195static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3196 int forwrite, int previous)
3197{
3198 struct bio **bip;
3199 struct r5conf *conf = sh->raid_conf;
3200 int firstwrite=0;
3201
3202 pr_debug("adding bi b#%llu to stripe s#%llu\n",
3203 (unsigned long long)bi->bi_iter.bi_sector,
3204 (unsigned long long)sh->sector);
3205
3206 spin_lock_irq(&sh->stripe_lock);
3207
3208 if (sh->batch_head)
3209 goto overlap;
3210 if (forwrite) {
3211 bip = &sh->dev[dd_idx].towrite;
3212 if (*bip == NULL)
3213 firstwrite = 1;
3214 } else
3215 bip = &sh->dev[dd_idx].toread;
3216 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3217 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3218 goto overlap;
3219 bip = & (*bip)->bi_next;
3220 }
3221 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3222 goto overlap;
3223
3224 if (forwrite && raid5_has_ppl(conf)) {
		/*
		 * With PPL only writes to consecutive data chunks within a
		 * stripe are allowed, because for a single stripe_head we can
		 * only have one PPL entry at a time, which describes one data
		 * range.  Not really an overlap, but wait_for_overlap can be
		 * used to handle this.
		 */
3232 sector_t sector;
3233 sector_t first = 0;
3234 sector_t last = 0;
3235 int count = 0;
3236 int i;
3237
3238 for (i = 0; i < sh->disks; i++) {
3239 if (i != sh->pd_idx &&
3240 (i == dd_idx || sh->dev[i].towrite)) {
3241 sector = sh->dev[i].sector;
3242 if (count == 0 || sector < first)
3243 first = sector;
3244 if (sector > last)
3245 last = sector;
3246 count++;
3247 }
3248 }
3249
3250 if (first + conf->chunk_sectors * (count - 1) != last)
3251 goto overlap;
3252 }
3253
3254 if (!forwrite || previous)
3255 clear_bit(STRIPE_BATCH_READY, &sh->state);
3256
3257 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3258 if (*bip)
3259 bi->bi_next = *bip;
3260 *bip = bi;
3261 bio_inc_remaining(bi);
3262 md_write_inc(conf->mddev, bi);
3263
3264 if (forwrite) {
3265
3266 sector_t sector = sh->dev[dd_idx].sector;
3267 for (bi=sh->dev[dd_idx].towrite;
3268 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3269 bi && bi->bi_iter.bi_sector <= sector;
3270 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3271 if (bio_end_sector(bi) >= sector)
3272 sector = bio_end_sector(bi);
3273 }
3274 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3275 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3276 sh->overwrite_disks++;
3277 }
3278
3279 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3280 (unsigned long long)(*bip)->bi_iter.bi_sector,
3281 (unsigned long long)sh->sector, dd_idx);
3282
3283 if (conf->mddev->bitmap && firstwrite) {
		/*
		 * Cannot hold the stripe spinlock over bitmap_startwrite,
		 * as a sync_request could be added at the same time.
		 * STRIPE_BITMAP_PENDING keeps the stripe from being batched
		 * or handled while the lock is dropped; it is cleared again
		 * once the bitmap update has been started.
		 */
3296 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3297 spin_unlock_irq(&sh->stripe_lock);
3298 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3299 STRIPE_SECTORS, 0);
3300 spin_lock_irq(&sh->stripe_lock);
3301 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3302 if (!sh->batch_head) {
3303 sh->bm_seq = conf->seq_flush+1;
3304 set_bit(STRIPE_BIT_DELAY, &sh->state);
3305 }
3306 }
3307 spin_unlock_irq(&sh->stripe_lock);
3308
3309 if (stripe_can_batch(sh))
3310 stripe_add_to_batch_list(conf, sh);
3311 return 1;
3312
3313 overlap:
3314 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3315 spin_unlock_irq(&sh->stripe_lock);
3316 return 0;
3317}
3318
3319static void end_reshape(struct r5conf *conf);
3320
3321static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3322 struct stripe_head *sh)
3323{
3324 int sectors_per_chunk =
3325 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3326 int dd_idx;
3327 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3328 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3329
3330 raid5_compute_sector(conf,
3331 stripe * (disks - conf->max_degraded)
3332 *sectors_per_chunk + chunk_offset,
3333 previous,
3334 &dd_idx, sh);
3335}
3336
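/*
 * handle_failed_stripe - the stripe has more failed devices than the array
 * can tolerate: record bad blocks where possible and complete the pending
 * reads and writes that can no longer be satisfied with an error.
 */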
3337static void
3338handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3339 struct stripe_head_state *s, int disks)
3340{
3341 int i;
3342 BUG_ON(sh->batch_head);
3343 for (i = disks; i--; ) {
3344 struct bio *bi;
3345 int bitmap_end = 0;
3346
3347 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3348 struct md_rdev *rdev;
3349 rcu_read_lock();
3350 rdev = rcu_dereference(conf->disks[i].rdev);
3351 if (rdev && test_bit(In_sync, &rdev->flags) &&
3352 !test_bit(Faulty, &rdev->flags))
3353 atomic_inc(&rdev->nr_pending);
3354 else
3355 rdev = NULL;
3356 rcu_read_unlock();
3357 if (rdev) {
3358 if (!rdev_set_badblocks(
3359 rdev,
3360 sh->sector,
3361 STRIPE_SECTORS, 0))
3362 md_error(conf->mddev, rdev);
3363 rdev_dec_pending(rdev, conf->mddev);
3364 }
3365 }
3366 spin_lock_irq(&sh->stripe_lock);
3367
3368 bi = sh->dev[i].towrite;
3369 sh->dev[i].towrite = NULL;
3370 sh->overwrite_disks = 0;
3371 spin_unlock_irq(&sh->stripe_lock);
3372 if (bi)
3373 bitmap_end = 1;
3374
3375 log_stripe_write_finished(sh);
3376
3377 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3378 wake_up(&conf->wait_for_overlap);
3379
3380 while (bi && bi->bi_iter.bi_sector <
3381 sh->dev[i].sector + STRIPE_SECTORS) {
3382 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3383
3384 bi->bi_error = -EIO;
3385 md_write_end(conf->mddev);
3386 bio_endio(bi);
3387 bi = nextbi;
3388 }
3389 if (bitmap_end)
3390 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3391 STRIPE_SECTORS, 0, 0);
3392 bitmap_end = 0;
3393
3394 bi = sh->dev[i].written;
3395 sh->dev[i].written = NULL;
3396 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3397 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3398 sh->dev[i].page = sh->dev[i].orig_page;
3399 }
3400
3401 if (bi) bitmap_end = 1;
3402 while (bi && bi->bi_iter.bi_sector <
3403 sh->dev[i].sector + STRIPE_SECTORS) {
3404 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3405
3406 bi->bi_error = -EIO;
3407 md_write_end(conf->mddev);
3408 bio_endio(bi);
3409 bi = bi2;
3410 }
3411
3412
3413
3414
3415 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3416 s->failed > conf->max_degraded &&
3417 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3418 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3419 spin_lock_irq(&sh->stripe_lock);
3420 bi = sh->dev[i].toread;
3421 sh->dev[i].toread = NULL;
3422 spin_unlock_irq(&sh->stripe_lock);
3423 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3424 wake_up(&conf->wait_for_overlap);
3425 if (bi)
3426 s->to_read--;
3427 while (bi && bi->bi_iter.bi_sector <
3428 sh->dev[i].sector + STRIPE_SECTORS) {
3429 struct bio *nextbi =
3430 r5_next_bio(bi, sh->dev[i].sector);
3431
3432 bi->bi_error = -EIO;
3433 bio_endio(bi);
3434 bi = nextbi;
3435 }
3436 }
3437 if (bitmap_end)
3438 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3439 STRIPE_SECTORS, 0, 0);
3440
3441
3442
3443 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3444 }
3445 s->to_write = 0;
3446 s->written = 0;
3447
3448 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3449 if (atomic_dec_and_test(&conf->pending_full_writes))
3450 md_wakeup_thread(conf->mddev->thread);
3451}
3452
3453static void
3454handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3455 struct stripe_head_state *s)
3456{
3457 int abort = 0;
3458 int i;
3459
3460 BUG_ON(sh->batch_head);
3461 clear_bit(STRIPE_SYNCING, &sh->state);
3462 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3463 wake_up(&conf->wait_for_overlap);
3464 s->syncing = 0;
3465 s->replacing = 0;
3466
	/*
	 * There is nothing more to do for sync/check/repair.
	 * Don't even need to abort as that is handled elsewhere
	 * if needed, and not always wanted e.g. if there is a known
	 * bad block here.
	 * For recover/replace we need to record a bad block on all
	 * non-sync devices, or abort the recovery
	 */
3473 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3474
3475
3476
3477 rcu_read_lock();
3478 for (i = 0; i < conf->raid_disks; i++) {
3479 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3480 if (rdev
3481 && !test_bit(Faulty, &rdev->flags)
3482 && !test_bit(In_sync, &rdev->flags)
3483 && !rdev_set_badblocks(rdev, sh->sector,
3484 STRIPE_SECTORS, 0))
3485 abort = 1;
3486 rdev = rcu_dereference(conf->disks[i].replacement);
3487 if (rdev
3488 && !test_bit(Faulty, &rdev->flags)
3489 && !test_bit(In_sync, &rdev->flags)
3490 && !rdev_set_badblocks(rdev, sh->sector,
3491 STRIPE_SECTORS, 0))
3492 abort = 1;
3493 }
3494 rcu_read_unlock();
3495 if (abort)
3496 conf->recovery_disabled =
3497 conf->mddev->recovery_disabled;
3498 }
3499 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3500}
3501
3502static int want_replace(struct stripe_head *sh, int disk_idx)
3503{
3504 struct md_rdev *rdev;
3505 int rv = 0;
3506
3507 rcu_read_lock();
3508 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3509 if (rdev
3510 && !test_bit(Faulty, &rdev->flags)
3511 && !test_bit(In_sync, &rdev->flags)
3512 && (rdev->recovery_offset <= sh->sector
3513 || rdev->mddev->recovery_cp <= sh->sector))
3514 rv = 1;
3515 rcu_read_unlock();
3516 return rv;
3517}
3518
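/*
 * need_this_block - decide whether the data in this member device has to be
 * read or computed to make progress on the stripe (to satisfy a request, to
 * sync/expand/replace, or to enable a parity update while degraded).
 */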
3519static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3520 int disk_idx, int disks)
3521{
3522 struct r5dev *dev = &sh->dev[disk_idx];
3523 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3524 &sh->dev[s->failed_num[1]] };
3525 int i;
3526
3527
3528 if (test_bit(R5_LOCKED, &dev->flags) ||
3529 test_bit(R5_UPTODATE, &dev->flags))
3530
3531
3532
3533 return 0;
3534
3535 if (dev->toread ||
3536 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3537
3538 return 1;
3539
3540 if (s->syncing || s->expanding ||
3541 (s->replacing && want_replace(sh, disk_idx)))
3542
3543
3544
3545 return 1;
3546
3547 if ((s->failed >= 1 && fdev[0]->toread) ||
3548 (s->failed >= 2 && fdev[1]->toread))
3549
3550
3551
3552 return 1;
3553
	/*
	 * Sometimes neither read-modify-write nor reconstruct-write
	 * cycles can work.  In those cases we read every block we
	 * can.  Then the parity-update is certain to have enough to
	 * work with.
	 * This can only be a problem when we need to write something,
	 * and some device has failed.  If either of those tests
	 * fail we need look no further.
	 */
3562 if (!s->failed || !s->to_write)
3563 return 0;
3564
3565 if (test_bit(R5_Insync, &dev->flags) &&
3566 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3567
3568
3569
3570
3571
3572 return 0;
3573
3574 for (i = 0; i < s->failed && i < 2; i++) {
3575 if (fdev[i]->towrite &&
3576 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3577 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3578
3579
3580
3581
3582
3583 return 1;
3584 }
3585
	/*
	 * If we are forced to do a reconstruct-write, either because
	 * the current RAID6 implementation only supports that, or
	 * because parity cannot be trusted and we are currently
	 * recovering it, there is extra need to be careful.
	 * If one of the devices that we would need to read, because
	 * it is not being overwritten (and maybe not written at all)
	 * is missing/faulty, then we need to read everything we can.
	 */
3594 if (sh->raid_conf->level != 6 &&
3595 sh->sector < sh->raid_conf->mddev->recovery_cp)
3596
3597 return 0;
3598 for (i = 0; i < s->failed && i < 2; i++) {
3599 if (s->failed_num[i] != sh->pd_idx &&
3600 s->failed_num[i] != sh->qd_idx &&
3601 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3602 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3603 return 1;
3604 }
3605
3606 return 0;
3607}
3608
/*
 * fetch_block - checks the given member device to see if its data needs
 * to be read or computed to satisfy a request.
 *
 * Returns 1 when no more member devices need to be checked, otherwise returns
 * 0 to tell the loop in handle_stripe_fill to continue
 */
3615static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3616 int disk_idx, int disks)
3617{
3618 struct r5dev *dev = &sh->dev[disk_idx];
3619
3620
3621 if (need_this_block(sh, s, disk_idx, disks)) {
3622
3623
3624
3625 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3626 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3627 BUG_ON(sh->batch_head);
3628
		/*
		 * In the raid6 case if the only non-uptodate disk is P
		 * then we already trusted P to compute the other failed
		 * drives. It is safe to compute rather than re-read P.
		 * In other cases we only compute blocks from failed
		 * devices, otherwise check/repair might fail to detect
		 * a real inconsistency.
		 */
3638 if ((s->uptodate == disks - 1) &&
3639 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3640 (s->failed && (disk_idx == s->failed_num[0] ||
3641 disk_idx == s->failed_num[1])))) {
3642
3643
3644
3645 pr_debug("Computing stripe %llu block %d\n",
3646 (unsigned long long)sh->sector, disk_idx);
3647 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3648 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3649 set_bit(R5_Wantcompute, &dev->flags);
3650 sh->ops.target = disk_idx;
3651 sh->ops.target2 = -1;
3652 s->req_compute = 1;
			/*
			 * Careful: from this point on 'uptodate' is in the eye
			 * of raid_run_ops which services 'compute' operations
			 * before writes. R5_Wantcompute flags a block that will
			 * be R5_UPTODATE by the time it is needed for a
			 * subsequent operation.
			 */
3659 s->uptodate++;
3660 return 1;
3661 } else if (s->uptodate == disks-2 && s->failed >= 2) {
3662
3663
3664
3665 int other;
3666 for (other = disks; other--; ) {
3667 if (other == disk_idx)
3668 continue;
3669 if (!test_bit(R5_UPTODATE,
3670 &sh->dev[other].flags))
3671 break;
3672 }
3673 BUG_ON(other < 0);
3674 pr_debug("Computing stripe %llu blocks %d,%d\n",
3675 (unsigned long long)sh->sector,
3676 disk_idx, other);
3677 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3678 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3679 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3680 set_bit(R5_Wantcompute, &sh->dev[other].flags);
3681 sh->ops.target = disk_idx;
3682 sh->ops.target2 = other;
3683 s->uptodate += 2;
3684 s->req_compute = 1;
3685 return 1;
3686 } else if (test_bit(R5_Insync, &dev->flags)) {
3687 set_bit(R5_LOCKED, &dev->flags);
3688 set_bit(R5_Wantread, &dev->flags);
3689 s->locked++;
3690 pr_debug("Reading block %d (sync=%d)\n",
3691 disk_idx, s->syncing);
3692 }
3693 }
3694
3695 return 0;
3696}
3697
3698
3699
3700
3701static void handle_stripe_fill(struct stripe_head *sh,
3702 struct stripe_head_state *s,
3703 int disks)
3704{
3705 int i;
3706
	/*
	 * look for blocks to read/compute, skip this if a compute
	 * is already in flight, or if the stripe contents are in the
	 * midst of changing due to a write
	 */
3711 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3712 !sh->reconstruct_state) {
		/*
		 * For a degraded stripe with data in journal, do not handle
		 * read requests yet; instead, flush the stripe to raid
		 * disks first.  This avoids handling complex rmw of the
		 * write back cache (prexor with orig_page, and then xor
		 * with page) in the read path.
		 */
3721 if (s->injournal && s->failed) {
3722 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3723 r5c_make_stripe_write_out(sh);
3724 goto out;
3725 }
3726
3727 for (i = disks; i--; )
3728 if (fetch_block(sh, s, i, disks))
3729 break;
3730 }
3731out:
3732 set_bit(STRIPE_HANDLE, &sh->state);
3733}
3734
3735static void break_stripe_batch_list(struct stripe_head *head_sh,
3736 unsigned long handle_flags);
3737
/*
 * handle_stripe_clean_event
 * any written block on an uptodate or failed drive can be returned.
 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
 * never LOCKED, so we don't need to test 'failed' directly.
 */
3742static void handle_stripe_clean_event(struct r5conf *conf,
3743 struct stripe_head *sh, int disks)
3744{
3745 int i;
3746 struct r5dev *dev;
3747 int discard_pending = 0;
3748 struct stripe_head *head_sh = sh;
3749 bool do_endio = false;
3750
3751 for (i = disks; i--; )
3752 if (sh->dev[i].written) {
3753 dev = &sh->dev[i];
3754 if (!test_bit(R5_LOCKED, &dev->flags) &&
3755 (test_bit(R5_UPTODATE, &dev->flags) ||
3756 test_bit(R5_Discard, &dev->flags) ||
3757 test_bit(R5_SkipCopy, &dev->flags))) {
3758
3759 struct bio *wbi, *wbi2;
3760 pr_debug("Return write for disc %d\n", i);
3761 if (test_and_clear_bit(R5_Discard, &dev->flags))
3762 clear_bit(R5_UPTODATE, &dev->flags);
3763 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3764 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3765 }
3766 do_endio = true;
3767
3768returnbi:
3769 dev->page = dev->orig_page;
3770 wbi = dev->written;
3771 dev->written = NULL;
3772 while (wbi && wbi->bi_iter.bi_sector <
3773 dev->sector + STRIPE_SECTORS) {
3774 wbi2 = r5_next_bio(wbi, dev->sector);
3775 md_write_end(conf->mddev);
3776 bio_endio(wbi);
3777 wbi = wbi2;
3778 }
3779 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3780 STRIPE_SECTORS,
3781 !test_bit(STRIPE_DEGRADED, &sh->state),
3782 0);
3783 if (head_sh->batch_head) {
3784 sh = list_first_entry(&sh->batch_list,
3785 struct stripe_head,
3786 batch_list);
3787 if (sh != head_sh) {
3788 dev = &sh->dev[i];
3789 goto returnbi;
3790 }
3791 }
3792 sh = head_sh;
3793 dev = &sh->dev[i];
3794 } else if (test_bit(R5_Discard, &dev->flags))
3795 discard_pending = 1;
3796 }
3797
3798 log_stripe_write_finished(sh);
3799
3800 if (!discard_pending &&
3801 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
3802 int hash;
3803 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
3804 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3805 if (sh->qd_idx >= 0) {
3806 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
3807 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
3808 }
3809
3810 clear_bit(STRIPE_DISCARD, &sh->state);
3811
3812
3813
3814
3815
3816unhash:
3817 hash = sh->hash_lock_index;
3818 spin_lock_irq(conf->hash_locks + hash);
3819 remove_hash(sh);
3820 spin_unlock_irq(conf->hash_locks + hash);
3821 if (head_sh->batch_head) {
3822 sh = list_first_entry(&sh->batch_list,
3823 struct stripe_head, batch_list);
3824 if (sh != head_sh)
3825 goto unhash;
3826 }
3827 sh = head_sh;
3828
3829 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3830 set_bit(STRIPE_HANDLE, &sh->state);
3831
3832 }
3833
3834 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3835 if (atomic_dec_and_test(&conf->pending_full_writes))
3836 md_wakeup_thread(conf->mddev->thread);
3837
3838 if (head_sh->batch_head && do_endio)
3839 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
3840}
3841
/*
 * For RMW in write back cache, we need an extra page in prexor to store the
 * old data.  This page is stored in dev->orig_page.
 *
 * This function checks whether we have data for prexor.  The exact logic
 * is:
 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
 */
3850static inline bool uptodate_for_rmw(struct r5dev *dev)
3851{
3852 return (test_bit(R5_UPTODATE, &dev->flags)) &&
3853 (!test_bit(R5_InJournal, &dev->flags) ||
3854 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
3855}
3856
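/*
 * handle_stripe_dirtying - decide between read-modify-write and
 * reconstruct-write for the pending writes on this stripe, issue any
 * pre-reads that are needed, and schedule the reconstruction once enough
 * data is available.
 */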
3857static int handle_stripe_dirtying(struct r5conf *conf,
3858 struct stripe_head *sh,
3859 struct stripe_head_state *s,
3860 int disks)
3861{
3862 int rmw = 0, rcw = 0, i;
3863 sector_t recovery_cp = conf->mddev->recovery_cp;
3864
	/*
	 * Check whether resync is now happening or should start.
	 * If yes, then the array is dirty (after unclean shutdown or
	 * initial creation), so parity in some stripes might be inconsistent.
	 * In this case, we need to always do reconstruct-write, to ensure
	 * that in case of drive failure or read-error correction, we
	 * generate correct data from the parity.
	 */
3872 if (conf->rmw_level == PARITY_DISABLE_RMW ||
3873 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3874 s->failed == 0)) {
3875
3876
3877
3878 rcw = 1; rmw = 2;
3879 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3880 conf->rmw_level, (unsigned long long)recovery_cp,
3881 (unsigned long long)sh->sector);
3882 } else for (i = disks; i--; ) {
3883
3884 struct r5dev *dev = &sh->dev[i];
3885 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3886 i == sh->pd_idx || i == sh->qd_idx ||
3887 test_bit(R5_InJournal, &dev->flags)) &&
3888 !test_bit(R5_LOCKED, &dev->flags) &&
3889 !(uptodate_for_rmw(dev) ||
3890 test_bit(R5_Wantcompute, &dev->flags))) {
3891 if (test_bit(R5_Insync, &dev->flags))
3892 rmw++;
3893 else
3894 rmw += 2*disks;
3895 }
3896
3897 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3898 i != sh->pd_idx && i != sh->qd_idx &&
3899 !test_bit(R5_LOCKED, &dev->flags) &&
3900 !(test_bit(R5_UPTODATE, &dev->flags) ||
3901 test_bit(R5_Wantcompute, &dev->flags))) {
3902 if (test_bit(R5_Insync, &dev->flags))
3903 rcw++;
3904 else
3905 rcw += 2*disks;
3906 }
3907 }
3908
3909 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
3910 (unsigned long long)sh->sector, sh->state, rmw, rcw);
3911 set_bit(STRIPE_HANDLE, &sh->state);
3912 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
3913
3914 if (conf->mddev->queue)
3915 blk_add_trace_msg(conf->mddev->queue,
3916 "raid5 rmw %llu %d",
3917 (unsigned long long)sh->sector, rmw);
3918 for (i = disks; i--; ) {
3919 struct r5dev *dev = &sh->dev[i];
3920 if (test_bit(R5_InJournal, &dev->flags) &&
3921 dev->page == dev->orig_page &&
3922 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
3923
3924 struct page *p = alloc_page(GFP_NOIO);
3925
3926 if (p) {
3927 dev->orig_page = p;
3928 continue;
3929 }
3930
3931
3932
3933
3934
3935 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
3936 &conf->cache_state)) {
3937 r5c_use_extra_page(sh);
3938 break;
3939 }
3940
3941
3942 set_bit(STRIPE_DELAYED, &sh->state);
3943 s->waiting_extra_page = 1;
3944 return -EAGAIN;
3945 }
3946 }
3947
3948 for (i = disks; i--; ) {
3949 struct r5dev *dev = &sh->dev[i];
3950 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3951 i == sh->pd_idx || i == sh->qd_idx ||
3952 test_bit(R5_InJournal, &dev->flags)) &&
3953 !test_bit(R5_LOCKED, &dev->flags) &&
3954 !(uptodate_for_rmw(dev) ||
3955 test_bit(R5_Wantcompute, &dev->flags)) &&
3956 test_bit(R5_Insync, &dev->flags)) {
3957 if (test_bit(STRIPE_PREREAD_ACTIVE,
3958 &sh->state)) {
3959 pr_debug("Read_old block %d for r-m-w\n",
3960 i);
3961 set_bit(R5_LOCKED, &dev->flags);
3962 set_bit(R5_Wantread, &dev->flags);
3963 s->locked++;
3964 } else {
3965 set_bit(STRIPE_DELAYED, &sh->state);
3966 set_bit(STRIPE_HANDLE, &sh->state);
3967 }
3968 }
3969 }
3970 }
3971 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
3972
3973 int qread =0;
3974 rcw = 0;
3975 for (i = disks; i--; ) {
3976 struct r5dev *dev = &sh->dev[i];
3977 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3978 i != sh->pd_idx && i != sh->qd_idx &&
3979 !test_bit(R5_LOCKED, &dev->flags) &&
3980 !(test_bit(R5_UPTODATE, &dev->flags) ||
3981 test_bit(R5_Wantcompute, &dev->flags))) {
3982 rcw++;
3983 if (test_bit(R5_Insync, &dev->flags) &&
3984 test_bit(STRIPE_PREREAD_ACTIVE,
3985 &sh->state)) {
3986 pr_debug("Read_old block "
3987 "%d for Reconstruct\n", i);
3988 set_bit(R5_LOCKED, &dev->flags);
3989 set_bit(R5_Wantread, &dev->flags);
3990 s->locked++;
3991 qread++;
3992 } else {
3993 set_bit(STRIPE_DELAYED, &sh->state);
3994 set_bit(STRIPE_HANDLE, &sh->state);
3995 }
3996 }
3997 }
3998 if (rcw && conf->mddev->queue)
3999 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4000 (unsigned long long)sh->sector,
4001 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4002 }
4003
4004 if (rcw > disks && rmw > disks &&
4005 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4006 set_bit(STRIPE_DELAYED, &sh->state);
4007
	/*
	 * now if nothing is locked, and if we have enough data,
	 * we can start a write request
	 */
	/*
	 * since handle_stripe can be called at any time we need to handle the
	 * case where a compute block operation has been submitted and then a
	 * subsequent call wants to start a write request.  raid_run_ops only
	 * handles the case where compute block and reconstruct are requested
	 * simultaneously.  If this is not the case then new writes need to be
	 * held off until the compute completes.
	 */
4018 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4019 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4020 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4021 schedule_reconstruction(sh, s, rcw == 0, 0);
4022 return 0;
4023}
4024
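/*
 * handle_parity_checks5 - resync/repair state machine for RAID4/5: verify
 * the parity block and rewrite it (or the failed block) when a mismatch is
 * found and repair is allowed.
 */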
4025static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4026 struct stripe_head_state *s, int disks)
4027{
4028 struct r5dev *dev = NULL;
4029
4030 BUG_ON(sh->batch_head);
4031 set_bit(STRIPE_HANDLE, &sh->state);
4032
4033 switch (sh->check_state) {
4034 case check_state_idle:
4035
4036 if (s->failed == 0) {
4037 BUG_ON(s->uptodate != disks);
4038 sh->check_state = check_state_run;
4039 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4040 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4041 s->uptodate--;
4042 break;
4043 }
4044 dev = &sh->dev[s->failed_num[0]];
4045
4046 case check_state_compute_result:
4047 sh->check_state = check_state_idle;
4048 if (!dev)
4049 dev = &sh->dev[sh->pd_idx];
4050
4051
4052 if (test_bit(STRIPE_INSYNC, &sh->state))
4053 break;
4054
4055
4056 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4057 BUG_ON(s->uptodate != disks);
4058
4059 set_bit(R5_LOCKED, &dev->flags);
4060 s->locked++;
4061 set_bit(R5_Wantwrite, &dev->flags);
4062
4063 clear_bit(STRIPE_DEGRADED, &sh->state);
4064 set_bit(STRIPE_INSYNC, &sh->state);
4065 break;
4066 case check_state_run:
4067 break;
4068 case check_state_check_result:
4069 sh->check_state = check_state_idle;
4070
4071
4072
4073
4074 if (s->failed)
4075 break;
4076
		/*
		 * handle a successful check operation, if parity is correct
		 * we are done.  Otherwise update the mismatch count and repair
		 * parity if !MD_RECOVERY_CHECK
		 */
4081 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4082
4083
4084
4085 set_bit(STRIPE_INSYNC, &sh->state);
4086 else {
4087 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4088 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4089
4090 set_bit(STRIPE_INSYNC, &sh->state);
4091 pr_warn_ratelimited("%s: mismatch sector in range "
4092 "%llu-%llu\n", mdname(conf->mddev),
4093 (unsigned long long) sh->sector,
4094 (unsigned long long) sh->sector +
4095 STRIPE_SECTORS);
4096 } else {
4097 sh->check_state = check_state_compute_run;
4098 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4099 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4100 set_bit(R5_Wantcompute,
4101 &sh->dev[sh->pd_idx].flags);
4102 sh->ops.target = sh->pd_idx;
4103 sh->ops.target2 = -1;
4104 s->uptodate++;
4105 }
4106 }
4107 break;
4108 case check_state_compute_run:
4109 break;
4110 default:
4111 pr_err("%s: unknown check_state: %d sector: %llu\n",
4112 __func__, sh->check_state,
4113 (unsigned long long) sh->sector);
4114 BUG();
4115 }
4116}
4117
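/*
 * handle_parity_checks6 - resync/repair state machine for RAID6: verify P
 * and/or Q (depending on which devices have failed) and write back any
 * blocks that had to be recomputed.
 */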
4118static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4119 struct stripe_head_state *s,
4120 int disks)
4121{
4122 int pd_idx = sh->pd_idx;
4123 int qd_idx = sh->qd_idx;
4124 struct r5dev *dev;
4125
4126 BUG_ON(sh->batch_head);
4127 set_bit(STRIPE_HANDLE, &sh->state);
4128
4129 BUG_ON(s->failed > 2);
4130
	/*
	 * Want to check and possibly repair P and Q.
	 * However there could be one 'failed' device, in which
	 * case we can only check one of them, possibly using the
	 * other to generate missing data
	 */
4137 switch (sh->check_state) {
4138 case check_state_idle:
4139
4140 if (s->failed == s->q_failed) {
			/*
			 * The only possible failed device holds Q, so it
			 * makes sense to check P (If anything else were failed,
			 * we would have used P to recreate it).
			 */
4145 sh->check_state = check_state_run;
4146 }
4147 if (!s->q_failed && s->failed < 2) {
4148
4149
4150
4151 if (sh->check_state == check_state_run)
4152 sh->check_state = check_state_run_pq;
4153 else
4154 sh->check_state = check_state_run_q;
4155 }
4156
4157
4158 sh->ops.zero_sum_result = 0;
4159
4160 if (sh->check_state == check_state_run) {
4161
4162 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4163 s->uptodate--;
4164 }
4165 if (sh->check_state >= check_state_run &&
4166 sh->check_state <= check_state_run_pq) {
4167
4168
4169
4170 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4171 break;
4172 }
4173
4174
4175 BUG_ON(s->failed != 2);
4176
4177 case check_state_compute_result:
4178 sh->check_state = check_state_idle;
4179
4180
4181 if (test_bit(STRIPE_INSYNC, &sh->state))
4182 break;
4183
		/*
		 * now write out any block on a failed drive,
		 * or P or Q if they were recomputed
		 */
4187 BUG_ON(s->uptodate < disks - 1);
4188 if (s->failed == 2) {
4189 dev = &sh->dev[s->failed_num[1]];
4190 s->locked++;
4191 set_bit(R5_LOCKED, &dev->flags);
4192 set_bit(R5_Wantwrite, &dev->flags);
4193 }
4194 if (s->failed >= 1) {
4195 dev = &sh->dev[s->failed_num[0]];
4196 s->locked++;
4197 set_bit(R5_LOCKED, &dev->flags);
4198 set_bit(R5_Wantwrite, &dev->flags);
4199 }
4200 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4201 dev = &sh->dev[pd_idx];
4202 s->locked++;
4203 set_bit(R5_LOCKED, &dev->flags);
4204 set_bit(R5_Wantwrite, &dev->flags);
4205 }
4206 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4207 dev = &sh->dev[qd_idx];
4208 s->locked++;
4209 set_bit(R5_LOCKED, &dev->flags);
4210 set_bit(R5_Wantwrite, &dev->flags);
4211 }
4212 clear_bit(STRIPE_DEGRADED, &sh->state);
4213
4214 set_bit(STRIPE_INSYNC, &sh->state);
4215 break;
4216 case check_state_run:
4217 case check_state_run_q:
4218 case check_state_run_pq:
4219 break;
4220 case check_state_check_result:
4221 sh->check_state = check_state_idle;
4222
4223
4224
4225
4226
4227 if (sh->ops.zero_sum_result == 0) {
4228
4229 if (!s->failed)
4230 set_bit(STRIPE_INSYNC, &sh->state);
4231 else {
				/*
				 * in contrast to the raid5 case we can validate
				 * parity, but still have a failure to write
				 * back
				 */
4236 sh->check_state = check_state_compute_result;
4237
4238
4239
4240
4241
4242 }
4243 } else {
4244 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4245 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4246
4247 set_bit(STRIPE_INSYNC, &sh->state);
4248 pr_warn_ratelimited("%s: mismatch sector in range "
4249 "%llu-%llu\n", mdname(conf->mddev),
4250 (unsigned long long) sh->sector,
4251 (unsigned long long) sh->sector +
4252 STRIPE_SECTORS);
4253 } else {
4254 int *target = &sh->ops.target;
4255
4256 sh->ops.target = -1;
4257 sh->ops.target2 = -1;
4258 sh->check_state = check_state_compute_run;
4259 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4260 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4261 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4262 set_bit(R5_Wantcompute,
4263 &sh->dev[pd_idx].flags);
4264 *target = pd_idx;
4265 target = &sh->ops.target2;
4266 s->uptodate++;
4267 }
4268 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4269 set_bit(R5_Wantcompute,
4270 &sh->dev[qd_idx].flags);
4271 *target = qd_idx;
4272 s->uptodate++;
4273 }
4274 }
4275 }
4276 break;
4277 case check_state_compute_run:
4278 break;
4279 default:
4280 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4281 __func__, sh->check_state,
4282 (unsigned long long) sh->sector);
4283 BUG();
4284 }
4285}
4286
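/*
 * handle_stripe_expansion - copy data blocks from this (source) stripe into
 * the target stripes of an ongoing reshape, marking each target block
 * R5_Expanded and scheduling the target stripe once it is fully populated.
 */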
4287static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4288{
4289 int i;
4290
	/*
	 * We have read all the blocks in this stripe and now we need to
	 * copy some of them into a target stripe for expand.
	 */
4294 struct dma_async_tx_descriptor *tx = NULL;
4295 BUG_ON(sh->batch_head);
4296 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4297 for (i = 0; i < sh->disks; i++)
4298 if (i != sh->pd_idx && i != sh->qd_idx) {
4299 int dd_idx, j;
4300 struct stripe_head *sh2;
4301 struct async_submit_ctl submit;
4302
4303 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4304 sector_t s = raid5_compute_sector(conf, bn, 0,
4305 &dd_idx, NULL);
4306 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4307 if (sh2 == NULL)
4308
4309
4310
4311
4312 continue;
4313 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4314 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4315
4316 raid5_release_stripe(sh2);
4317 continue;
4318 }
4319
4320
4321 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4322 tx = async_memcpy(sh2->dev[dd_idx].page,
4323 sh->dev[i].page, 0, 0, STRIPE_SIZE,
4324 &submit);
4325
4326 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4327 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4328 for (j = 0; j < conf->raid_disks; j++)
4329 if (j != sh2->pd_idx &&
4330 j != sh2->qd_idx &&
4331 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4332 break;
4333 if (j == conf->raid_disks) {
4334 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4335 set_bit(STRIPE_HANDLE, &sh2->state);
4336 }
4337 raid5_release_stripe(sh2);
4338
4339 }
4340
4341 async_tx_quiesce(&tx);
4342}
4343
/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
 * state of various bits to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 */
4358static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4359{
4360 struct r5conf *conf = sh->raid_conf;
4361 int disks = sh->disks;
4362 struct r5dev *dev;
4363 int i;
4364 int do_recovery = 0;
4365
4366 memset(s, 0, sizeof(*s));
4367
4368 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4369 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4370 s->failed_num[0] = -1;
4371 s->failed_num[1] = -1;
4372 s->log_failed = r5l_log_disk_error(conf);
4373
4374
4375 rcu_read_lock();
4376 for (i=disks; i--; ) {
4377 struct md_rdev *rdev;
4378 sector_t first_bad;
4379 int bad_sectors;
4380 int is_bad = 0;
4381
4382 dev = &sh->dev[i];
4383
4384 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4385 i, dev->flags,
4386 dev->toread, dev->towrite, dev->written);
4387
4388
4389
4390
4391
4392 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4393 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4394 set_bit(R5_Wantfill, &dev->flags);
4395
4396
4397 if (test_bit(R5_LOCKED, &dev->flags))
4398 s->locked++;
4399 if (test_bit(R5_UPTODATE, &dev->flags))
4400 s->uptodate++;
4401 if (test_bit(R5_Wantcompute, &dev->flags)) {
4402 s->compute++;
4403 BUG_ON(s->compute > 2);
4404 }
4405
4406 if (test_bit(R5_Wantfill, &dev->flags))
4407 s->to_fill++;
4408 else if (dev->toread)
4409 s->to_read++;
4410 if (dev->towrite) {
4411 s->to_write++;
4412 if (!test_bit(R5_OVERWRITE, &dev->flags))
4413 s->non_overwrite++;
4414 }
4415 if (dev->written)
4416 s->written++;
4417
4418
4419
4420 rdev = rcu_dereference(conf->disks[i].replacement);
4421 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4422 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4423 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4424 &first_bad, &bad_sectors))
4425 set_bit(R5_ReadRepl, &dev->flags);
4426 else {
4427 if (rdev && !test_bit(Faulty, &rdev->flags))
4428 set_bit(R5_NeedReplace, &dev->flags);
4429 else
4430 clear_bit(R5_NeedReplace, &dev->flags);
4431 rdev = rcu_dereference(conf->disks[i].rdev);
4432 clear_bit(R5_ReadRepl, &dev->flags);
4433 }
4434 if (rdev && test_bit(Faulty, &rdev->flags))
4435 rdev = NULL;
4436 if (rdev) {
4437 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4438 &first_bad, &bad_sectors);
4439 if (s->blocked_rdev == NULL
4440 && (test_bit(Blocked, &rdev->flags)
4441 || is_bad < 0)) {
4442 if (is_bad < 0)
4443 set_bit(BlockedBadBlocks,
4444 &rdev->flags);
4445 s->blocked_rdev = rdev;
4446 atomic_inc(&rdev->nr_pending);
4447 }
4448 }
4449 clear_bit(R5_Insync, &dev->flags);
4450 if (!rdev)
4451 ;
4452 else if (is_bad) {
4453
4454 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4455 test_bit(R5_UPTODATE, &dev->flags)) {
				/* treat as in-sync, but with a read error
				 * which we can now try to correct
				 */
4459 set_bit(R5_Insync, &dev->flags);
4460 set_bit(R5_ReadError, &dev->flags);
4461 }
4462 } else if (test_bit(In_sync, &rdev->flags))
4463 set_bit(R5_Insync, &dev->flags);
4464 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
			/* in sync if before recovery_offset */
4466 set_bit(R5_Insync, &dev->flags);
4467 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4468 test_bit(R5_Expanded, &dev->flags))
			/* We have reshaped into here, so treat it as
			 * in-sync; recovery_offset will shortly be
			 * updated to make that official.
			 */
4473 set_bit(R5_Insync, &dev->flags);
4474
4475 if (test_bit(R5_WriteError, &dev->flags)) {
4476
4477
4478 struct md_rdev *rdev2 = rcu_dereference(
4479 conf->disks[i].rdev);
4480 if (rdev2 == rdev)
4481 clear_bit(R5_Insync, &dev->flags);
4482 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4483 s->handle_bad_blocks = 1;
4484 atomic_inc(&rdev2->nr_pending);
4485 } else
4486 clear_bit(R5_WriteError, &dev->flags);
4487 }
4488 if (test_bit(R5_MadeGood, &dev->flags)) {
4489
4490
4491 struct md_rdev *rdev2 = rcu_dereference(
4492 conf->disks[i].rdev);
4493 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4494 s->handle_bad_blocks = 1;
4495 atomic_inc(&rdev2->nr_pending);
4496 } else
4497 clear_bit(R5_MadeGood, &dev->flags);
4498 }
4499 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4500 struct md_rdev *rdev2 = rcu_dereference(
4501 conf->disks[i].replacement);
4502 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4503 s->handle_bad_blocks = 1;
4504 atomic_inc(&rdev2->nr_pending);
4505 } else
4506 clear_bit(R5_MadeGoodRepl, &dev->flags);
4507 }
4508 if (!test_bit(R5_Insync, &dev->flags)) {
4509
4510 clear_bit(R5_ReadError, &dev->flags);
4511 clear_bit(R5_ReWrite, &dev->flags);
4512 }
4513 if (test_bit(R5_ReadError, &dev->flags))
4514 clear_bit(R5_Insync, &dev->flags);
4515 if (!test_bit(R5_Insync, &dev->flags)) {
4516 if (s->failed < 2)
4517 s->failed_num[s->failed] = i;
4518 s->failed++;
4519 if (rdev && !test_bit(Faulty, &rdev->flags))
4520 do_recovery = 1;
4521 }
4522
4523 if (test_bit(R5_InJournal, &dev->flags))
4524 s->injournal++;
4525 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4526 s->just_cached++;
4527 }
4528 if (test_bit(STRIPE_SYNCING, &sh->state)) {
		/* If there is a failed device being replaced, we must be
		 * recovering; else if we are after recovery_cp, we must be
		 * syncing; else if MD_RECOVERY_REQUESTED is set, we are also
		 * syncing; otherwise we can only be replacing.
		 *
		 * sync and recovery both need to read all devices, and so
		 * use the same flag.
		 */
4537 if (do_recovery ||
4538 sh->sector >= conf->mddev->recovery_cp ||
4539 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4540 s->syncing = 1;
4541 else
4542 s->replacing = 1;
4543 }
4544 rcu_read_unlock();
4545}
4546
4547static int clear_batch_ready(struct stripe_head *sh)
4548{
	/* Return '1' if this stripe is a member of a batch (and should be
	 * skipped by the caller), or '0' if it is a lone stripe or a batch
	 * head that can now be handled.
	 */
4553 struct stripe_head *tmp;
4554 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4555 return (sh->batch_head && sh->batch_head != sh);
4556 spin_lock(&sh->stripe_lock);
4557 if (!sh->batch_head) {
4558 spin_unlock(&sh->stripe_lock);
4559 return 0;
4560 }

	/*
	 * This stripe could have been added to a batch list before we
	 * checked BATCH_READY; in that case skip it here.
	 */
4566 if (sh->batch_head != sh) {
4567 spin_unlock(&sh->stripe_lock);
4568 return 1;
4569 }
4570 spin_lock(&sh->batch_lock);
4571 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4572 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4573 spin_unlock(&sh->batch_lock);
4574 spin_unlock(&sh->stripe_lock);
4575
	/*
	 * BATCH_READY is now cleared, so no new stripes can be added to the
	 * batch and batch_list can be walked without holding the lock.
	 */
4580 return 0;
4581}
4582
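/*
 * break_stripe_batch_list - detach every stripe from head_sh's batch list,
 * propagate the head's completion state to each member, and queue the
 * members (and, depending on handle_flags, the head) for normal handling
 * again.
 */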
4583static void break_stripe_batch_list(struct stripe_head *head_sh,
4584 unsigned long handle_flags)
4585{
4586 struct stripe_head *sh, *next;
4587 int i;
4588 int do_wakeup = 0;
4589
4590 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4591
4592 list_del_init(&sh->batch_list);
4593
4594 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4595 (1 << STRIPE_SYNCING) |
4596 (1 << STRIPE_REPLACED) |
4597 (1 << STRIPE_DELAYED) |
4598 (1 << STRIPE_BIT_DELAY) |
4599 (1 << STRIPE_FULL_WRITE) |
4600 (1 << STRIPE_BIOFILL_RUN) |
4601 (1 << STRIPE_COMPUTE_RUN) |
4602 (1 << STRIPE_OPS_REQ_PENDING) |
4603 (1 << STRIPE_DISCARD) |
4604 (1 << STRIPE_BATCH_READY) |
4605 (1 << STRIPE_BATCH_ERR) |
4606 (1 << STRIPE_BITMAP_PENDING)),
4607 "stripe state: %lx\n", sh->state);
4608 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4609 (1 << STRIPE_REPLACED)),
4610 "head stripe state: %lx\n", head_sh->state);
4611
4612 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4613 (1 << STRIPE_PREREAD_ACTIVE) |
4614 (1 << STRIPE_DEGRADED)),
4615 head_sh->state & (1 << STRIPE_INSYNC));
4616
4617 sh->check_state = head_sh->check_state;
4618 sh->reconstruct_state = head_sh->reconstruct_state;
4619 for (i = 0; i < sh->disks; i++) {
4620 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4621 do_wakeup = 1;
4622 sh->dev[i].flags = head_sh->dev[i].flags &
4623 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4624 }
4625 spin_lock_irq(&sh->stripe_lock);
4626 sh->batch_head = NULL;
4627 spin_unlock_irq(&sh->stripe_lock);
4628 if (handle_flags == 0 ||
4629 sh->state & handle_flags)
4630 set_bit(STRIPE_HANDLE, &sh->state);
4631 raid5_release_stripe(sh);
4632 }
4633 spin_lock_irq(&head_sh->stripe_lock);
4634 head_sh->batch_head = NULL;
4635 spin_unlock_irq(&head_sh->stripe_lock);
4636 for (i = 0; i < head_sh->disks; i++)
4637 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4638 do_wakeup = 1;
4639 if (head_sh->state & handle_flags)
4640 set_bit(STRIPE_HANDLE, &head_sh->state);
4641
4642 if (do_wakeup)
4643 wake_up(&head_sh->raid_conf->wait_for_overlap);
4644}
4645
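/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
 * state of various bits to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some uncached data
 *    schedule a write of some uncached data
 *    schedule a failed drive to be marked insync
 *    schedule a new drive to be built
 */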
4646static void handle_stripe(struct stripe_head *sh)
4647{
4648 struct stripe_head_state s;
4649 struct r5conf *conf = sh->raid_conf;
4650 int i;
4651 int prexor;
4652 int disks = sh->disks;
4653 struct r5dev *pdev, *qdev;
4654
4655 clear_bit(STRIPE_HANDLE, &sh->state);
4656 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
		/* already being handled, ensure it gets handled
		 * again when the current action finishes */
4659 set_bit(STRIPE_HANDLE, &sh->state);
4660 return;
4661 }
4662
	if (clear_batch_ready(sh)) {
4664 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4665 return;
4666 }
4667
4668 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4669 break_stripe_batch_list(sh, 0);
4670
4671 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4672 spin_lock(&sh->stripe_lock);
		/*
		 * Cannot process 'sync' concurrently with 'discard'.
		 * Flush data in r5cache before 'sync'.
		 */
4677 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4678 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4679 !test_bit(STRIPE_DISCARD, &sh->state) &&
4680 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4681 set_bit(STRIPE_SYNCING, &sh->state);
4682 clear_bit(STRIPE_INSYNC, &sh->state);
4683 clear_bit(STRIPE_REPLACED, &sh->state);
4684 }
4685 spin_unlock(&sh->stripe_lock);
4686 }
4687 clear_bit(STRIPE_DELAYED, &sh->state);
4688
	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
		 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
4691 (unsigned long long)sh->sector, sh->state,
4692 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4693 sh->check_state, sh->reconstruct_state);
4694
4695 analyse_stripe(sh, &s);
4696
4697 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4698 goto finish;
4699
4700 if (s.handle_bad_blocks ||
4701 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4702 set_bit(STRIPE_HANDLE, &sh->state);
4703 goto finish;
4704 }
4705
4706 if (unlikely(s.blocked_rdev)) {
4707 if (s.syncing || s.expanding || s.expanded ||
4708 s.replacing || s.to_write || s.written) {
4709 set_bit(STRIPE_HANDLE, &sh->state);
4710 goto finish;
4711 }
4712
4713 rdev_dec_pending(s.blocked_rdev, conf->mddev);
4714 s.blocked_rdev = NULL;
4715 }
4716
4717 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4718 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4719 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4720 }
4721
4722 pr_debug("locked=%d uptodate=%d to_read=%d"
4723 " to_write=%d failed=%d failed_num=%d,%d\n",
4724 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4725 s.failed_num[0], s.failed_num[1]);
4726
	/*
	 * Check if the array has lost more than max_degraded devices and,
	 * if so, some requests might need to be failed.
	 *
	 * When the journal device has failed (log_failed), we only process
	 * the stripe if there is data that needs writing to the raid disks.
	 */
4733 if (s.failed > conf->max_degraded ||
4734 (s.log_failed && s.injournal == 0)) {
4735 sh->check_state = 0;
4736 sh->reconstruct_state = 0;
4737 break_stripe_batch_list(sh, 0);
4738 if (s.to_read+s.to_write+s.written)
4739 handle_failed_stripe(conf, sh, &s, disks);
4740 if (s.syncing + s.replacing)
4741 handle_failed_sync(conf, sh, &s);
4742 }
4743
4744
4745
4746
4747 prexor = 0;
4748 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4749 prexor = 1;
4750 if (sh->reconstruct_state == reconstruct_state_drain_result ||
4751 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4752 sh->reconstruct_state = reconstruct_state_idle;
4753
4754
4755
4756
4757 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4758 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4759 BUG_ON(sh->qd_idx >= 0 &&
4760 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4761 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4762 for (i = disks; i--; ) {
4763 struct r5dev *dev = &sh->dev[i];
4764 if (test_bit(R5_LOCKED, &dev->flags) &&
4765 (i == sh->pd_idx || i == sh->qd_idx ||
4766 dev->written || test_bit(R5_InJournal,
4767 &dev->flags))) {
4768 pr_debug("Writing block %d\n", i);
4769 set_bit(R5_Wantwrite, &dev->flags);
4770 if (prexor)
4771 continue;
4772 if (s.failed > 1)
4773 continue;
4774 if (!test_bit(R5_Insync, &dev->flags) ||
4775 ((i == sh->pd_idx || i == sh->qd_idx) &&
4776 s.failed == 0))
4777 set_bit(STRIPE_INSYNC, &sh->state);
4778 }
4779 }
4780 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4781 s.dec_preread_active = 1;
4782 }
4783
	/*
	 * might be able to return some write requests if the parity blocks
	 * are safe, or on a failed drive
	 */
4788 pdev = &sh->dev[sh->pd_idx];
4789 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
4790 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
4791 qdev = &sh->dev[sh->qd_idx];
4792 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
4793 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
4794 || conf->level < 6;
4795
4796 if (s.written &&
4797 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
4798 && !test_bit(R5_LOCKED, &pdev->flags)
4799 && (test_bit(R5_UPTODATE, &pdev->flags) ||
4800 test_bit(R5_Discard, &pdev->flags))))) &&
4801 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4802 && !test_bit(R5_LOCKED, &qdev->flags)
4803 && (test_bit(R5_UPTODATE, &qdev->flags) ||
4804 test_bit(R5_Discard, &qdev->flags))))))
4805 handle_stripe_clean_event(conf, sh, disks);
4806
4807 if (s.just_cached)
4808 r5c_handle_cached_data_endio(conf, sh, disks);
4809 log_stripe_write_finished(sh);
4810
	/* Now we might consider reading some blocks, either to check/generate
	 * parity, or to satisfy requests,
	 * or to load a block that is being partially written.
	 */
4815 if (s.to_read || s.non_overwrite
4816 || (conf->level == 6 && s.to_write && s.failed)
4817 || (s.syncing && (s.uptodate + s.compute < disks))
4818 || s.replacing
4819 || s.expanding)
4820 handle_stripe_fill(sh, &s, disks);
4821
4822
4823
4824
4825
4826
4827 r5c_finish_stripe_write_out(conf, sh, &s);
4828
	/*
	 * Now to consider new write requests, cache write back and what else,
	 * if anything, should be read.  We do not handle new writes when:
	 * 1/ A 'write' operation (copy+xor) is already in flight.
	 * 2/ A 'check' operation is in flight, as it may clobber the parity
	 *    block.
	 * 3/ A r5c cache log write is in flight.
	 */

4838 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4839 if (!r5c_is_writeback(conf->log)) {
4840 if (s.to_write)
4841 handle_stripe_dirtying(conf, sh, &s, disks);
4842 } else {
4843 int ret = 0;
4844
4845
4846 if (s.to_write)
4847 ret = r5c_try_caching_write(conf, sh, &s,
4848 disks);
4849
			/*
			 * Fall back to handle_stripe_dirtying() if the
			 * caching phase failed (ret == -EAGAIN) or the
			 * stripe is under reclaim (!caching && injournal).
			 */
4856 if (ret == -EAGAIN ||
			    /* stripe under reclaim: !caching && injournal */
4858 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4859 s.injournal > 0)) {
4860 ret = handle_stripe_dirtying(conf, sh, &s,
4861 disks);
4862 if (ret == -EAGAIN)
4863 goto finish;
4864 }
4865 }
4866 }
4867
	/* maybe we need to check and possibly fix the parity for this stripe.
	 * Any reads will already have been scheduled, so we just see if enough
	 * data is available.  The parity check is held off while parity
	 * dependent operations are in flight.
	 */
4873 if (sh->check_state ||
4874 (s.syncing && s.locked == 0 &&
4875 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4876 !test_bit(STRIPE_INSYNC, &sh->state))) {
4877 if (conf->level == 6)
4878 handle_parity_checks6(conf, sh, &s, disks);
4879 else
4880 handle_parity_checks5(conf, sh, &s, disks);
4881 }
4882
4883 if ((s.replacing || s.syncing) && s.locked == 0
4884 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
4885 && !test_bit(STRIPE_REPLACED, &sh->state)) {
4886
4887 for (i = 0; i < conf->raid_disks; i++)
4888 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
4889 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
4890 set_bit(R5_WantReplace, &sh->dev[i].flags);
4891 set_bit(R5_LOCKED, &sh->dev[i].flags);
4892 s.locked++;
4893 }
4894 if (s.replacing)
4895 set_bit(STRIPE_INSYNC, &sh->state);
4896 set_bit(STRIPE_REPLACED, &sh->state);
4897 }
4898 if ((s.syncing || s.replacing) && s.locked == 0 &&
4899 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4900 test_bit(STRIPE_INSYNC, &sh->state)) {
4901 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4902 clear_bit(STRIPE_SYNCING, &sh->state);
4903 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
4904 wake_up(&conf->wait_for_overlap);
4905 }
4906
	/* If the failed drives are just a ReadError, then we might need
	 * to progress the repair/check process.
	 */
4910 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
4911 for (i = 0; i < s.failed; i++) {
4912 struct r5dev *dev = &sh->dev[s.failed_num[i]];
4913 if (test_bit(R5_ReadError, &dev->flags)
4914 && !test_bit(R5_LOCKED, &dev->flags)
4915 && test_bit(R5_UPTODATE, &dev->flags)
4916 ) {
4917 if (!test_bit(R5_ReWrite, &dev->flags)) {
4918 set_bit(R5_Wantwrite, &dev->flags);
4919 set_bit(R5_ReWrite, &dev->flags);
4920 set_bit(R5_LOCKED, &dev->flags);
4921 s.locked++;
4922 } else {
4923
4924 set_bit(R5_Wantread, &dev->flags);
4925 set_bit(R5_LOCKED, &dev->flags);
4926 s.locked++;
4927 }
4928 }
4929 }
4930
4931
4932 if (sh->reconstruct_state == reconstruct_state_result) {
4933 struct stripe_head *sh_src
4934 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
4935 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
4936
4937
4938
4939 set_bit(STRIPE_DELAYED, &sh->state);
4940 set_bit(STRIPE_HANDLE, &sh->state);
4941 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
4942 &sh_src->state))
4943 atomic_inc(&conf->preread_active_stripes);
4944 raid5_release_stripe(sh_src);
4945 goto finish;
4946 }
4947 if (sh_src)
4948 raid5_release_stripe(sh_src);
4949
4950 sh->reconstruct_state = reconstruct_state_idle;
4951 clear_bit(STRIPE_EXPANDING, &sh->state);
4952 for (i = conf->raid_disks; i--; ) {
4953 set_bit(R5_Wantwrite, &sh->dev[i].flags);
4954 set_bit(R5_LOCKED, &sh->dev[i].flags);
4955 s.locked++;
4956 }
4957 }
4958
4959 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
4960 !sh->reconstruct_state) {
4961
4962 sh->disks = conf->raid_disks;
4963 stripe_set_idx(sh->sector, conf, 0, sh);
4964 schedule_reconstruction(sh, &s, 1, 1);
4965 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
4966 clear_bit(STRIPE_EXPAND_READY, &sh->state);
4967 atomic_dec(&conf->reshape_stripes);
4968 wake_up(&conf->wait_for_overlap);
4969 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4970 }
4971
4972 if (s.expanding && s.locked == 0 &&
4973 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
4974 handle_stripe_expansion(conf, sh);
4975
4976finish:
4977
4978 if (unlikely(s.blocked_rdev)) {
4979 if (conf->mddev->external)
4980 md_wait_for_blocked_rdev(s.blocked_rdev,
4981 conf->mddev);
4982 else
			/* Internal metadata will immediately
			 * be written by raid5d, so we don't
			 * need to wait here.
			 */
4987 rdev_dec_pending(s.blocked_rdev,
4988 conf->mddev);
4989 }
4990
4991 if (s.handle_bad_blocks)
4992 for (i = disks; i--; ) {
4993 struct md_rdev *rdev;
4994 struct r5dev *dev = &sh->dev[i];
4995 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
4996
4997 rdev = conf->disks[i].rdev;
4998 if (!rdev_set_badblocks(rdev, sh->sector,
4999 STRIPE_SECTORS, 0))
5000 md_error(conf->mddev, rdev);
5001 rdev_dec_pending(rdev, conf->mddev);
5002 }
5003 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5004 rdev = conf->disks[i].rdev;
5005 rdev_clear_badblocks(rdev, sh->sector,
5006 STRIPE_SECTORS, 0);
5007 rdev_dec_pending(rdev, conf->mddev);
5008 }
5009 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5010 rdev = conf->disks[i].replacement;
5011 if (!rdev)
5012
5013 rdev = conf->disks[i].rdev;
5014 rdev_clear_badblocks(rdev, sh->sector,
5015 STRIPE_SECTORS, 0);
5016 rdev_dec_pending(rdev, conf->mddev);
5017 }
5018 }
5019
5020 if (s.ops_request)
5021 raid_run_ops(sh, s.ops_request);
5022
5023 ops_run_io(sh, &s);
5024
5025 if (s.dec_preread_active) {
		/* We delay this until after ops_run_io so that if make_request
		 * is waiting on a flush, it won't continue until the writes
		 * have actually been submitted.
		 */
5030 atomic_dec(&conf->preread_active_stripes);
5031 if (atomic_read(&conf->preread_active_stripes) <
5032 IO_THRESHOLD)
5033 md_wakeup_thread(conf->mddev->thread);
5034 }
5035
5036 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5037}
5038
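/*
 * Move stripes from the delayed list to the hold list once preread
 * activity has dropped below IO_THRESHOLD, marking each one
 * STRIPE_PREREAD_ACTIVE.  Called with conf->device_lock held.
 */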
5039static void raid5_activate_delayed(struct r5conf *conf)
5040{
5041 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5042 while (!list_empty(&conf->delayed_list)) {
5043 struct list_head *l = conf->delayed_list.next;
5044 struct stripe_head *sh;
5045 sh = list_entry(l, struct stripe_head, lru);
5046 list_del_init(l);
5047 clear_bit(STRIPE_DELAYED, &sh->state);
5048 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5049 atomic_inc(&conf->preread_active_stripes);
5050 list_add_tail(&sh->lru, &conf->hold_list);
5051 raid5_wakeup_stripe_thread(sh);
5052 }
5053 }
5054}
5055
5056static void activate_bit_delay(struct r5conf *conf,
5057 struct list_head *temp_inactive_list)
5058{
5059
5060 struct list_head head;
5061 list_add(&head, &conf->bitmap_list);
5062 list_del_init(&conf->bitmap_list);
5063 while (!list_empty(&head)) {
5064 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5065 int hash;
5066 list_del_init(&sh->lru);
5067 atomic_inc(&sh->count);
5068 hash = sh->hash_lock_index;
5069 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5070 }
5071}
5072
5073static int raid5_congested(struct mddev *mddev, int bits)
5074{
5075 struct r5conf *conf = mddev->private;
5076
5077
5078
5079
5080
5081 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
5082 return 1;
5083
5084
5085 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
5086 return 1;
5087 if (conf->quiesce)
5088 return 1;
5089 if (atomic_read(&conf->empty_inactive_list_nr))
5090 return 1;
5091
5092 return 0;
5093}
5094
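/*
 * Return true if this bio lies entirely within a single chunk (using the
 * smaller of the current and previous chunk sizes), so that it can be
 * serviced by a single underlying device.
 */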
5095static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5096{
5097 struct r5conf *conf = mddev->private;
5098 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
5099 unsigned int chunk_sectors;
5100 unsigned int bio_sectors = bio_sectors(bio);
5101
5102 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5103 return chunk_sectors >=
5104 ((sector & (chunk_sectors - 1)) + bio_sectors);
5105}
5106
/*
 * Add a bio to the retry LIFO (O(1), may be called from interrupt
 * context); it will be picked up later by raid5d.
 */
static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
5112{
5113 unsigned long flags;
5114
5115 spin_lock_irqsave(&conf->device_lock, flags);
5116
5117 bi->bi_next = conf->retry_read_aligned_list;
5118 conf->retry_read_aligned_list = bi;
5119
5120 spin_unlock_irqrestore(&conf->device_lock, flags);
5121 md_wakeup_thread(conf->mddev->thread);
5122}
5123
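/*
 * Pop the next aligned read to retry, preferring a partially processed
 * bio (retry_read_aligned); *offset returns the stripe count at which to
 * resume.
 */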
5124static struct bio *remove_bio_from_retry(struct r5conf *conf,
5125 unsigned int *offset)
5126{
5127 struct bio *bi;
5128
5129 bi = conf->retry_read_aligned;
5130 if (bi) {
5131 *offset = conf->retry_read_offset;
5132 conf->retry_read_aligned = NULL;
5133 return bi;
5134 }
5135 bi = conf->retry_read_aligned_list;
	if (bi) {
5137 conf->retry_read_aligned_list = bi->bi_next;
5138 bi->bi_next = NULL;
5139 *offset = 0;
5140 }
5141
5142 return bi;
5143}
5144
/*
 * raid5_align_endio is invoked when a chunk-aligned read completes.
 * If the read succeeded it ends the original bio directly; otherwise it
 * queues the original bio for a retry through the stripe cache.
 */
5151static void raid5_align_endio(struct bio *bi)
5152{
5153 struct bio* raid_bi = bi->bi_private;
5154 struct mddev *mddev;
5155 struct r5conf *conf;
5156 struct md_rdev *rdev;
5157 int error = bi->bi_error;
5158
5159 bio_put(bi);
5160
5161 rdev = (void*)raid_bi->bi_next;
5162 raid_bi->bi_next = NULL;
5163 mddev = rdev->mddev;
5164 conf = mddev->private;
5165
5166 rdev_dec_pending(rdev, conf->mddev);
5167
5168 if (!error) {
5169 bio_endio(raid_bi);
5170 if (atomic_dec_and_test(&conf->active_aligned_reads))
5171 wake_up(&conf->wait_for_quiescent);
5172 return;
5173 }
5174
5175 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5176
5177 add_bio_to_retry(raid_bi, conf);
5178}
5179
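/*
 * Try to service a read that fits within one chunk by cloning the bio and
 * sending it directly to the appropriate device, bypassing the stripe
 * cache.  Returns 1 if the clone was submitted, 0 if the caller must fall
 * back to the normal stripe path.
 */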
5180static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5181{
5182 struct r5conf *conf = mddev->private;
5183 int dd_idx;
5184 struct bio* align_bi;
5185 struct md_rdev *rdev;
5186 sector_t end_sector;
5187
5188 if (!in_chunk_boundary(mddev, raid_bio)) {
5189 pr_debug("%s: non aligned\n", __func__);
5190 return 0;
5191 }
5192
5193
5194
5195 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
5196 if (!align_bi)
5197 return 0;
5198
5199
5200
5201
5202 align_bi->bi_end_io = raid5_align_endio;
5203 align_bi->bi_private = raid_bio;
5204
5205
5206
5207 align_bi->bi_iter.bi_sector =
5208 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
5209 0, &dd_idx, NULL);
5210
5211 end_sector = bio_end_sector(align_bi);
5212 rcu_read_lock();
5213 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5214 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5215 rdev->recovery_offset < end_sector) {
5216 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5217 if (rdev &&
5218 (test_bit(Faulty, &rdev->flags) ||
5219 !(test_bit(In_sync, &rdev->flags) ||
5220 rdev->recovery_offset >= end_sector)))
5221 rdev = NULL;
5222 }
5223
5224 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
5225 rcu_read_unlock();
5226 bio_put(align_bi);
5227 return 0;
5228 }
5229
5230 if (rdev) {
5231 sector_t first_bad;
5232 int bad_sectors;
5233
5234 atomic_inc(&rdev->nr_pending);
5235 rcu_read_unlock();
5236 raid_bio->bi_next = (void*)rdev;
5237 align_bi->bi_bdev = rdev->bdev;
5238 bio_clear_flag(align_bi, BIO_SEG_VALID);
5239
5240 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
5241 bio_sectors(align_bi),
5242 &first_bad, &bad_sectors)) {
5243 bio_put(align_bi);
5244 rdev_dec_pending(rdev, mddev);
5245 return 0;
5246 }
5247
5248
5249 align_bi->bi_iter.bi_sector += rdev->data_offset;
5250
5251 spin_lock_irq(&conf->device_lock);
5252 wait_event_lock_irq(conf->wait_for_quiescent,
5253 conf->quiesce == 0,
5254 conf->device_lock);
5255 atomic_inc(&conf->active_aligned_reads);
5256 spin_unlock_irq(&conf->device_lock);
5257
5258 if (mddev->gendisk)
5259 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
5260 align_bi, disk_devt(mddev->gendisk),
5261 raid_bio->bi_iter.bi_sector);
5262 generic_make_request(align_bi);
5263 return 1;
5264 } else {
5265 rcu_read_unlock();
5266 bio_put(align_bi);
5267 return 0;
5268 }
5269}
5270
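/*
 * Split the bio at the next chunk boundary if necessary and attempt the
 * chunk-aligned fast path on the first piece.  Returns the bio that still
 * needs normal processing, or NULL if everything was dispatched.
 */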
5271static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5272{
5273 struct bio *split;
5274 sector_t sector = raid_bio->bi_iter.bi_sector;
5275 unsigned chunk_sects = mddev->chunk_sectors;
5276 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5277
5278 if (sectors < bio_sectors(raid_bio)) {
5279 struct r5conf *conf = mddev->private;
5280 split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
5281 bio_chain(split, raid_bio);
5282 generic_make_request(raid_bio);
5283 raid_bio = split;
5284 }
5285
5286 if (!raid5_read_one_chunk(mddev, raid_bio))
5287 return raid_bio;
5288
5289 return NULL;
5290}
5291
/*
 * __get_priority_stripe - get the next stripe to process
 *
 * Stripes on the handle/loprio lists are allowed to bypass the
 * preread-active stripes on the hold_list until bypass_count exceeds
 * bypass_threshold.  bypass_count is incremented each time a stripe from
 * the handle_list is chosen ahead of an unchanged hold_list (unless that
 * stripe already has I/O started), is decreased by bypass_threshold
 * whenever the hold_list is serviced, and is cleared when the hold_list
 * is empty.
 */
5302static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5303{
5304 struct stripe_head *sh, *tmp;
5305 struct list_head *handle_list = NULL;
5306 struct r5worker_group *wg;
5307 bool second_try = !r5c_is_writeback(conf->log) &&
5308 !r5l_log_disk_error(conf);
5309 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5310 r5l_log_disk_error(conf);
5311
5312again:
5313 wg = NULL;
5314 sh = NULL;
5315 if (conf->worker_cnt_per_group == 0) {
5316 handle_list = try_loprio ? &conf->loprio_list :
5317 &conf->handle_list;
5318 } else if (group != ANY_GROUP) {
5319 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5320 &conf->worker_groups[group].handle_list;
5321 wg = &conf->worker_groups[group];
5322 } else {
5323 int i;
5324 for (i = 0; i < conf->group_cnt; i++) {
5325 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5326 &conf->worker_groups[i].handle_list;
5327 wg = &conf->worker_groups[i];
5328 if (!list_empty(handle_list))
5329 break;
5330 }
5331 }
5332
5333 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5334 __func__,
5335 list_empty(handle_list) ? "empty" : "busy",
5336 list_empty(&conf->hold_list) ? "empty" : "busy",
5337 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5338
5339 if (!list_empty(handle_list)) {
5340 sh = list_entry(handle_list->next, typeof(*sh), lru);
5341
5342 if (list_empty(&conf->hold_list))
5343 conf->bypass_count = 0;
5344 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5345 if (conf->hold_list.next == conf->last_hold)
5346 conf->bypass_count++;
5347 else {
5348 conf->last_hold = conf->hold_list.next;
5349 conf->bypass_count -= conf->bypass_threshold;
5350 if (conf->bypass_count < 0)
5351 conf->bypass_count = 0;
5352 }
5353 }
5354 } else if (!list_empty(&conf->hold_list) &&
5355 ((conf->bypass_threshold &&
5356 conf->bypass_count > conf->bypass_threshold) ||
5357 atomic_read(&conf->pending_full_writes) == 0)) {
5358
5359 list_for_each_entry(tmp, &conf->hold_list, lru) {
5360 if (conf->worker_cnt_per_group == 0 ||
5361 group == ANY_GROUP ||
5362 !cpu_online(tmp->cpu) ||
5363 cpu_to_group(tmp->cpu) == group) {
5364 sh = tmp;
5365 break;
5366 }
5367 }
5368
5369 if (sh) {
5370 conf->bypass_count -= conf->bypass_threshold;
5371 if (conf->bypass_count < 0)
5372 conf->bypass_count = 0;
5373 }
5374 wg = NULL;
5375 }
5376
5377 if (!sh) {
5378 if (second_try)
5379 return NULL;
5380 second_try = true;
5381 try_loprio = !try_loprio;
5382 goto again;
5383 }
5384
5385 if (wg) {
5386 wg->stripes_cnt--;
5387 sh->group = NULL;
5388 }
5389 list_del_init(&sh->lru);
5390 BUG_ON(atomic_inc_return(&sh->count) != 1);
5391 return sh;
5392}
5393
5394struct raid5_plug_cb {
5395 struct blk_plug_cb cb;
5396 struct list_head list;
5397 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5398};
5399
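/*
 * blk-plug callback: move the stripes collected on this plug's list back
 * onto the appropriate handling lists (under device_lock) and then release
 * any stripes that became inactive.
 */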
5400static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5401{
5402 struct raid5_plug_cb *cb = container_of(
5403 blk_cb, struct raid5_plug_cb, cb);
5404 struct stripe_head *sh;
5405 struct mddev *mddev = cb->cb.data;
5406 struct r5conf *conf = mddev->private;
5407 int cnt = 0;
5408 int hash;
5409
5410 if (cb->list.next && !list_empty(&cb->list)) {
5411 spin_lock_irq(&conf->device_lock);
5412 while (!list_empty(&cb->list)) {
5413 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5414 list_del_init(&sh->lru);
5415
			/* make sure release_stripe_plug() cannot see
			 * STRIPE_ON_UNPLUG_LIST clear while the stripe
			 * is still on our list
			 */
5420 smp_mb__before_atomic();
5421 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5422
5423
5424
5425
5426 hash = sh->hash_lock_index;
5427 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5428 cnt++;
5429 }
5430 spin_unlock_irq(&conf->device_lock);
5431 }
5432 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5433 NR_STRIPE_HASH_LOCKS);
5434 if (mddev->queue)
5435 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5436 kfree(cb);
5437}
5438
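/*
 * Queue the stripe on the current task's blk-plug callback list when a
 * plug is active, so several stripes can be released in one batch;
 * otherwise release it immediately.
 */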
5439static void release_stripe_plug(struct mddev *mddev,
5440 struct stripe_head *sh)
5441{
5442 struct blk_plug_cb *blk_cb = blk_check_plugged(
5443 raid5_unplug, mddev,
5444 sizeof(struct raid5_plug_cb));
5445 struct raid5_plug_cb *cb;
5446
5447 if (!blk_cb) {
5448 raid5_release_stripe(sh);
5449 return;
5450 }
5451
5452 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5453
5454 if (cb->list.next == NULL) {
5455 int i;
5456 INIT_LIST_HEAD(&cb->list);
5457 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5458 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5459 }
5460
5461 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5462 list_add_tail(&sh->lru, &cb->list);
5463 else
5464 raid5_release_stripe(sh);
5465}
5466
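/*
 * Handle a DISCARD bio: claim each whole stripe covered by the request,
 * attach the bio to every data device of the stripe, and queue the stripe
 * for handling.  Discards are ignored while a reshape is in progress.
 */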
5467static void make_discard_request(struct mddev *mddev, struct bio *bi)
5468{
5469 struct r5conf *conf = mddev->private;
5470 sector_t logical_sector, last_sector;
5471 struct stripe_head *sh;
5472 int stripe_sectors;
5473
5474 if (mddev->reshape_position != MaxSector)
		/* skip discard while a reshape is in progress */
5476 return;
5477
5478 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5479 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5480
5481 bi->bi_next = NULL;
5482 md_write_start(mddev, bi);
5483
5484 stripe_sectors = conf->chunk_sectors *
5485 (conf->raid_disks - conf->max_degraded);
5486 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5487 stripe_sectors);
5488 sector_div(last_sector, stripe_sectors);
5489
5490 logical_sector *= conf->chunk_sectors;
5491 last_sector *= conf->chunk_sectors;
5492
5493 for (; logical_sector < last_sector;
5494 logical_sector += STRIPE_SECTORS) {
5495 DEFINE_WAIT(w);
5496 int d;
5497 again:
5498 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5499 prepare_to_wait(&conf->wait_for_overlap, &w,
5500 TASK_UNINTERRUPTIBLE);
5501 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5502 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5503 raid5_release_stripe(sh);
5504 schedule();
5505 goto again;
5506 }
5507 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5508 spin_lock_irq(&sh->stripe_lock);
5509 for (d = 0; d < conf->raid_disks; d++) {
5510 if (d == sh->pd_idx || d == sh->qd_idx)
5511 continue;
5512 if (sh->dev[d].towrite || sh->dev[d].toread) {
5513 set_bit(R5_Overlap, &sh->dev[d].flags);
5514 spin_unlock_irq(&sh->stripe_lock);
5515 raid5_release_stripe(sh);
5516 schedule();
5517 goto again;
5518 }
5519 }
5520 set_bit(STRIPE_DISCARD, &sh->state);
5521 finish_wait(&conf->wait_for_overlap, &w);
5522 sh->overwrite_disks = 0;
5523 for (d = 0; d < conf->raid_disks; d++) {
5524 if (d == sh->pd_idx || d == sh->qd_idx)
5525 continue;
5526 sh->dev[d].towrite = bi;
5527 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5528 bio_inc_remaining(bi);
5529 md_write_inc(mddev, bi);
5530 sh->overwrite_disks++;
5531 }
5532 spin_unlock_irq(&sh->stripe_lock);
5533 if (conf->mddev->bitmap) {
5534 for (d = 0;
5535 d < conf->raid_disks - conf->max_degraded;
5536 d++)
5537 bitmap_startwrite(mddev->bitmap,
5538 sh->sector,
5539 STRIPE_SECTORS,
5540 0);
5541 sh->bm_seq = conf->seq_flush + 1;
5542 set_bit(STRIPE_BIT_DELAY, &sh->state);
5543 }
5544
5545 set_bit(STRIPE_HANDLE, &sh->state);
5546 clear_bit(STRIPE_DELAYED, &sh->state);
5547 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5548 atomic_inc(&conf->preread_active_stripes);
5549 release_stripe_plug(mddev, sh);
5550 }
5551
5552 md_write_end(mddev);
5553 bio_endio(bi);
5554}
5555
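/*
 * Main entry point for reads and writes: flushes are handed to the journal
 * first, reads are tried on the chunk-aligned fast path, and everything
 * else is broken into STRIPE_SECTORS-sized pieces that are attached to the
 * corresponding stripe_heads.
 */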
5556static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5557{
5558 struct r5conf *conf = mddev->private;
5559 int dd_idx;
5560 sector_t new_sector;
5561 sector_t logical_sector, last_sector;
5562 struct stripe_head *sh;
5563 const int rw = bio_data_dir(bi);
5564 DEFINE_WAIT(w);
5565 bool do_prepare;
5566 bool do_flush = false;
5567
5568 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5569 int ret = r5l_handle_flush_request(conf->log, bi);
5570
5571 if (ret == 0)
5572 return;
5573 if (ret == -ENODEV) {
5574 md_flush_request(mddev, bi);
5575 return;
5576 }
		/* ret == -EAGAIN, fall back to the normal path.
		 * If r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
		 * we still need to flush the journal device below.
		 */
5582 do_flush = bi->bi_opf & REQ_PREFLUSH;
5583 }
5584
	/*
	 * If the array is degraded, don't attempt a chunk-aligned read:
	 * we might have to read the block again later in order to
	 * reconstruct data on failed drives.
	 */
5590 if (rw == READ && mddev->degraded == 0 &&
5591 mddev->reshape_position == MaxSector) {
5592 bi = chunk_aligned_read(mddev, bi);
5593 if (!bi)
5594 return;
5595 }
5596
5597 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5598 make_discard_request(mddev, bi);
5599 return;
5600 }
5601
5602 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5603 last_sector = bio_end_sector(bi);
5604 bi->bi_next = NULL;
5605 md_write_start(mddev, bi);
5606
5607 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5608 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5609 int previous;
5610 int seq;
5611
5612 do_prepare = false;
5613 retry:
5614 seq = read_seqcount_begin(&conf->gen_lock);
5615 previous = 0;
5616 if (do_prepare)
5617 prepare_to_wait(&conf->wait_for_overlap, &w,
5618 TASK_UNINTERRUPTIBLE);
5619 if (unlikely(conf->reshape_progress != MaxSector)) {
			/* spinlock is needed as reshape_progress may be
			 * 64bit on a 32bit platform, and so it might be
			 * possible to see a half-updated value.
			 * Of course reshape_progress could change after
			 * the lock is dropped, so once we get a reference
			 * to the stripe that we think it is, we will have
			 * to check again.
			 */
5628 spin_lock_irq(&conf->device_lock);
5629 if (mddev->reshape_backwards
5630 ? logical_sector < conf->reshape_progress
5631 : logical_sector >= conf->reshape_progress) {
5632 previous = 1;
5633 } else {
5634 if (mddev->reshape_backwards
5635 ? logical_sector < conf->reshape_safe
5636 : logical_sector >= conf->reshape_safe) {
5637 spin_unlock_irq(&conf->device_lock);
5638 schedule();
5639 do_prepare = true;
5640 goto retry;
5641 }
5642 }
5643 spin_unlock_irq(&conf->device_lock);
5644 }
5645
5646 new_sector = raid5_compute_sector(conf, logical_sector,
5647 previous,
5648 &dd_idx, NULL);
5649 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5650 (unsigned long long)new_sector,
5651 (unsigned long long)logical_sector);
5652
5653 sh = raid5_get_active_stripe(conf, new_sector, previous,
5654 (bi->bi_opf & REQ_RAHEAD), 0);
5655 if (sh) {
5656 if (unlikely(previous)) {
				/* expansion might have moved on while
				 * waiting for a stripe, so we must do the
				 * range check again.  Expansion could still
				 * move past after this test, but as we are
				 * holding a reference to 'sh', if that
				 * happens STRIPE_EXPANDING will be set and
				 * the expansion won't proceed until we
				 * finish with the stripe.
				 */
5665 int must_retry = 0;
5666 spin_lock_irq(&conf->device_lock);
5667 if (mddev->reshape_backwards
5668 ? logical_sector >= conf->reshape_progress
5669 : logical_sector < conf->reshape_progress)
				/* mismatch, need to try again */
5671 must_retry = 1;
5672 spin_unlock_irq(&conf->device_lock);
5673 if (must_retry) {
5674 raid5_release_stripe(sh);
5675 schedule();
5676 do_prepare = true;
5677 goto retry;
5678 }
5679 }
5680 if (read_seqcount_retry(&conf->gen_lock, seq)) {
5681
5682
5683
5684 raid5_release_stripe(sh);
5685 goto retry;
5686 }
5687
5688 if (rw == WRITE &&
5689 logical_sector >= mddev->suspend_lo &&
5690 logical_sector < mddev->suspend_hi) {
5691 raid5_release_stripe(sh);
				/* As the suspend_* range is controlled by
				 * userspace, we want an interruptible
				 * wait.
				 */
5696 flush_signals(current);
5697 prepare_to_wait(&conf->wait_for_overlap,
5698 &w, TASK_INTERRUPTIBLE);
5699 if (logical_sector >= mddev->suspend_lo &&
5700 logical_sector < mddev->suspend_hi) {
5701 schedule();
5702 do_prepare = true;
5703 }
5704 goto retry;
5705 }
5706
5707 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5708 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
				/* Stripe is busy expanding or the add
				 * failed due to overlap.  Flush everything
				 * and wait a while.
				 */
5713 md_wakeup_thread(mddev->thread);
5714 raid5_release_stripe(sh);
5715 schedule();
5716 do_prepare = true;
5717 goto retry;
5718 }
5719 if (do_flush) {
5720 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5721
5722 do_flush = false;
5723 }
5724
5725 set_bit(STRIPE_HANDLE, &sh->state);
5726 clear_bit(STRIPE_DELAYED, &sh->state);
5727 if ((!sh->batch_head || sh == sh->batch_head) &&
5728 (bi->bi_opf & REQ_SYNC) &&
5729 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5730 atomic_inc(&conf->preread_active_stripes);
5731 release_stripe_plug(mddev, sh);
5732 } else {
			/* cannot get stripe for read-ahead, just give up */
5734 bi->bi_error = -EIO;
5735 break;
5736 }
5737 }
5738 finish_wait(&conf->wait_for_overlap, &w);
5739
5740 if (rw == WRITE)
5741 md_write_end(mddev);
5742 bio_endio(bi);
5743}
5744
5745static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5746
5747static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5748{
	/* reshaping is quite different to recovery/resync so it is
	 * handled quite separately ... here.
	 *
	 * On each call to sync_request, we gather one chunk worth of
	 * destination stripes and flag them as expanding.
	 * Then we find all the source stripes and request reads.
	 * As the reads complete, handle_stripe will copy the data
	 * into the destination stripe and release that stripe.
	 */
5758 struct r5conf *conf = mddev->private;
5759 struct stripe_head *sh;
5760 sector_t first_sector, last_sector;
5761 int raid_disks = conf->previous_raid_disks;
5762 int data_disks = raid_disks - conf->max_degraded;
5763 int new_data_disks = conf->raid_disks - conf->max_degraded;
5764 int i;
5765 int dd_idx;
5766 sector_t writepos, readpos, safepos;
5767 sector_t stripe_addr;
5768 int reshape_sectors;
5769 struct list_head stripes;
5770 sector_t retn;
5771
5772 if (sector_nr == 0) {
5773
5774 if (mddev->reshape_backwards &&
5775 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5776 sector_nr = raid5_size(mddev, 0, 0)
5777 - conf->reshape_progress;
5778 } else if (mddev->reshape_backwards &&
5779 conf->reshape_progress == MaxSector) {
5780
5781 sector_nr = MaxSector;
5782 } else if (!mddev->reshape_backwards &&
5783 conf->reshape_progress > 0)
5784 sector_nr = conf->reshape_progress;
5785 sector_div(sector_nr, new_data_disks);
5786 if (sector_nr) {
5787 mddev->curr_resync_completed = sector_nr;
5788 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5789 *skipped = 1;
5790 retn = sector_nr;
5791 goto finish;
5792 }
5793 }
5794
	/* We need to process a full chunk at a time.
	 * If old and new chunk sizes differ, we need to process the
	 * largest of these.
	 */
5800 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5801
	/* We update the metadata at least every 10 seconds, or when
	 * the data about to be copied would over-write the source of
	 * the data at the front of the range, i.e. one new_stripe
	 * along from reshape_progress new_maps to where reshape_safe
	 * old_maps to.
	 */
5808 writepos = conf->reshape_progress;
5809 sector_div(writepos, new_data_disks);
5810 readpos = conf->reshape_progress;
5811 sector_div(readpos, data_disks);
5812 safepos = conf->reshape_safe;
5813 sector_div(safepos, data_disks);
5814 if (mddev->reshape_backwards) {
5815 BUG_ON(writepos < reshape_sectors);
5816 writepos -= reshape_sectors;
5817 readpos += reshape_sectors;
5818 safepos += reshape_sectors;
5819 } else {
5820 writepos += reshape_sectors;
5821
5822
5823
5824
5825 readpos -= min_t(sector_t, reshape_sectors, readpos);
5826 safepos -= min_t(sector_t, reshape_sectors, safepos);
5827 }
5828
5829
5830
5831
5832 if (mddev->reshape_backwards) {
5833 BUG_ON(conf->reshape_progress == 0);
5834 stripe_addr = writepos;
5835 BUG_ON((mddev->dev_sectors &
5836 ~((sector_t)reshape_sectors - 1))
5837 - reshape_sectors - stripe_addr
5838 != sector_nr);
5839 } else {
5840 BUG_ON(writepos != sector_nr + reshape_sectors);
5841 stripe_addr = sector_nr;
5842 }
5843
	/* 'writepos' is the most advanced device address we might write.
	 * 'readpos' is the least advanced device address we might read.
	 * 'safepos' is the least address recorded in the metadata as having
	 *     been reshaped.
	 * If there is a min_offset_diff, these are adjusted either by
	 * increasing the safepos/readpos if diff is negative, or
	 * decreasing it if positive.  In that case, readpos and safepos will
	 * differ from writepos by 'min_offset_diff' as well.
	 *
	 * Before writing into an area that the old layout might still need
	 * to read from, the metadata must record how far the reshape has
	 * progressed, so that after a crash we never trust data that could
	 * have been overwritten.  The check below forces a metadata update
	 * (and waits for it) in that case, and also at least every 10
	 * seconds.
	 */
5864 if (conf->min_offset_diff < 0) {
5865 safepos += -conf->min_offset_diff;
5866 readpos += -conf->min_offset_diff;
5867 } else
5868 writepos += conf->min_offset_diff;
5869
5870 if ((mddev->reshape_backwards
5871 ? (safepos > writepos && readpos < writepos)
5872 : (safepos < writepos && readpos > writepos)) ||
5873 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
5874
5875 wait_event(conf->wait_for_overlap,
5876 atomic_read(&conf->reshape_stripes)==0
5877 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5878 if (atomic_read(&conf->reshape_stripes) != 0)
5879 return 0;
5880 mddev->reshape_position = conf->reshape_progress;
5881 mddev->curr_resync_completed = sector_nr;
5882 conf->reshape_checkpoint = jiffies;
5883 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5884 md_wakeup_thread(mddev->thread);
5885 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
5886 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5887 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5888 return 0;
5889 spin_lock_irq(&conf->device_lock);
5890 conf->reshape_safe = mddev->reshape_position;
5891 spin_unlock_irq(&conf->device_lock);
5892 wake_up(&conf->wait_for_overlap);
5893 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5894 }
5895
5896 INIT_LIST_HEAD(&stripes);
5897 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
5898 int j;
5899 int skipped_disk = 0;
5900 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
5901 set_bit(STRIPE_EXPANDING, &sh->state);
5902 atomic_inc(&conf->reshape_stripes);
5903
5904
5905
5906 for (j=sh->disks; j--;) {
5907 sector_t s;
5908 if (j == sh->pd_idx)
5909 continue;
5910 if (conf->level == 6 &&
5911 j == sh->qd_idx)
5912 continue;
5913 s = raid5_compute_blocknr(sh, j, 0);
5914 if (s < raid5_size(mddev, 0, 0)) {
5915 skipped_disk = 1;
5916 continue;
5917 }
5918 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
5919 set_bit(R5_Expanded, &sh->dev[j].flags);
5920 set_bit(R5_UPTODATE, &sh->dev[j].flags);
5921 }
5922 if (!skipped_disk) {
5923 set_bit(STRIPE_EXPAND_READY, &sh->state);
5924 set_bit(STRIPE_HANDLE, &sh->state);
5925 }
5926 list_add(&sh->lru, &stripes);
5927 }
5928 spin_lock_irq(&conf->device_lock);
5929 if (mddev->reshape_backwards)
5930 conf->reshape_progress -= reshape_sectors * new_data_disks;
5931 else
5932 conf->reshape_progress += reshape_sectors * new_data_disks;
5933 spin_unlock_irq(&conf->device_lock);
5934
5935
5936
5937
5938
5939 first_sector =
5940 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
5941 1, &dd_idx, NULL);
5942 last_sector =
5943 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
5944 * new_data_disks - 1),
5945 1, &dd_idx, NULL);
5946 if (last_sector >= mddev->dev_sectors)
5947 last_sector = mddev->dev_sectors - 1;
5948 while (first_sector <= last_sector) {
5949 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
5950 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
5951 set_bit(STRIPE_HANDLE, &sh->state);
5952 raid5_release_stripe(sh);
5953 first_sector += STRIPE_SECTORS;
5954 }
5955
5956
5957
5958 while (!list_empty(&stripes)) {
5959 sh = list_entry(stripes.next, struct stripe_head, lru);
5960 list_del_init(&sh->lru);
5961 raid5_release_stripe(sh);
5962 }
5963
5964
5965
5966 sector_nr += reshape_sectors;
5967 retn = reshape_sectors;
5968finish:
5969 if (mddev->curr_resync_completed > mddev->resync_max ||
5970 (sector_nr - mddev->curr_resync_completed) * 2
5971 >= mddev->resync_max - mddev->curr_resync_completed) {
5972
5973 wait_event(conf->wait_for_overlap,
5974 atomic_read(&conf->reshape_stripes) == 0
5975 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5976 if (atomic_read(&conf->reshape_stripes) != 0)
5977 goto ret;
5978 mddev->reshape_position = conf->reshape_progress;
5979 mddev->curr_resync_completed = sector_nr;
5980 conf->reshape_checkpoint = jiffies;
5981 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5982 md_wakeup_thread(mddev->thread);
5983 wait_event(mddev->sb_wait,
5984 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
5985 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5986 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5987 goto ret;
5988 spin_lock_irq(&conf->device_lock);
5989 conf->reshape_safe = mddev->reshape_position;
5990 spin_unlock_irq(&conf->device_lock);
5991 wake_up(&conf->wait_for_overlap);
5992 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5993 }
5994ret:
5995 return retn;
5996}
5997
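/*
 * resync/recovery/reshape entry point: advances by one stripe (or
 * delegates to reshape_request() during a reshape) and returns the number
 * of sectors processed, using the bitmap to skip regions that are known
 * to be in sync.
 */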
5998static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
5999 int *skipped)
6000{
6001 struct r5conf *conf = mddev->private;
6002 struct stripe_head *sh;
6003 sector_t max_sector = mddev->dev_sectors;
6004 sector_t sync_blocks;
6005 int still_degraded = 0;
6006 int i;
6007
6008 if (sector_nr >= max_sector) {
6009
6010
6011 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6012 end_reshape(conf);
6013 return 0;
6014 }
6015
6016 if (mddev->curr_resync < max_sector)
6017 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6018 &sync_blocks, 1);
6019 else
6020 conf->fullsync = 0;
6021 bitmap_close_sync(mddev->bitmap);
6022
6023 return 0;
6024 }
6025
6026
6027 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6028
6029 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6030 return reshape_request(mddev, sector_nr, skipped);
6031
	/* No need to check resync_max as we never do more than one
	 * stripe, and as resync_max will always be on a chunk boundary,
	 * if the check in md_do_sync didn't fire, there is no chance
	 * of overstepping resync_max here.
	 */

	/* If there are too many failed drives and we are trying
	 * to resync, then assert that we are finished, because there is
	 * nothing we can do.
	 */
6042 if (mddev->degraded >= conf->max_degraded &&
6043 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6044 sector_t rv = mddev->dev_sectors - sector_nr;
6045 *skipped = 1;
6046 return rv;
6047 }
6048 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6049 !conf->fullsync &&
6050 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6051 sync_blocks >= STRIPE_SECTORS) {
6052
6053 sync_blocks /= STRIPE_SECTORS;
6054 *skipped = 1;
6055 return sync_blocks * STRIPE_SECTORS;
6056 }
6057
6058 bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6059
6060 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6061 if (sh == NULL) {
6062 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
		/* make sure we don't swamp the stripe cache if someone else
		 * is trying to get access
		 */
6066 schedule_timeout_uninterruptible(1);
6067 }
6068
	/* Need to check if the array will still be degraded after
	 * recovery/resync.  We don't need to check the 'failed' flag as,
	 * when that gets set, recovery aborts.
	 */
6072 rcu_read_lock();
6073 for (i = 0; i < conf->raid_disks; i++) {
6074 struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
6075
6076 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6077 still_degraded = 1;
6078 }
6079 rcu_read_unlock();
6080
6081 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6082
6083 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6084 set_bit(STRIPE_HANDLE, &sh->state);
6085
6086 raid5_release_stripe(sh);
6087
6088 return STRIPE_SECTORS;
6089}
6090
6091static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6092 unsigned int offset)
6093{
	/* We may not be able to submit a whole bio at once as there
	 * may not be enough stripe_heads available.
	 * We cannot pre-allocate enough stripe_heads as we may need
	 * more than exist in the cache (if we allow ever large chunks).
	 * So we do one stripe head at a time and record in
	 * conf->retry_read_offset how far we got, so that a later retry
	 * can resume from that stripe.
	 */
6104 struct stripe_head *sh;
6105 int dd_idx;
6106 sector_t sector, logical_sector, last_sector;
6107 int scnt = 0;
6108 int handled = 0;
6109
6110 logical_sector = raid_bio->bi_iter.bi_sector &
6111 ~((sector_t)STRIPE_SECTORS-1);
6112 sector = raid5_compute_sector(conf, logical_sector,
6113 0, &dd_idx, NULL);
6114 last_sector = bio_end_sector(raid_bio);
6115
6116 for (; logical_sector < last_sector;
6117 logical_sector += STRIPE_SECTORS,
6118 sector += STRIPE_SECTORS,
6119 scnt++) {
6120
6121 if (scnt < offset)
			/* already done this stripe */
6123 continue;
6124
6125 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6126
6127 if (!sh) {
6128
6129 conf->retry_read_aligned = raid_bio;
6130 conf->retry_read_offset = scnt;
6131 return handled;
6132 }
6133
6134 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6135 raid5_release_stripe(sh);
6136 conf->retry_read_aligned = raid_bio;
6137 conf->retry_read_offset = scnt;
6138 return handled;
6139 }
6140
6141 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6142 handle_stripe(sh);
6143 raid5_release_stripe(sh);
6144 handled++;
6145 }
6146
6147 bio_endio(raid_bio);
6148
6149 if (atomic_dec_and_test(&conf->active_aligned_reads))
6150 wake_up(&conf->wait_for_quiescent);
6151 return handled;
6152}
6153
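/*
 * Pull up to MAX_STRIPE_BATCH stripes off the priority lists and handle
 * them.  Called with device_lock held; the lock is dropped while the
 * stripes are being handled and re-taken before returning the number of
 * stripes processed.
 */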
6154static int handle_active_stripes(struct r5conf *conf, int group,
6155 struct r5worker *worker,
6156 struct list_head *temp_inactive_list)
6157{
6158 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6159 int i, batch_size = 0, hash;
6160 bool release_inactive = false;
6161
6162 while (batch_size < MAX_STRIPE_BATCH &&
6163 (sh = __get_priority_stripe(conf, group)) != NULL)
6164 batch[batch_size++] = sh;
6165
6166 if (batch_size == 0) {
6167 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6168 if (!list_empty(temp_inactive_list + i))
6169 break;
6170 if (i == NR_STRIPE_HASH_LOCKS) {
6171 spin_unlock_irq(&conf->device_lock);
6172 r5l_flush_stripe_to_raid(conf->log);
6173 spin_lock_irq(&conf->device_lock);
6174 return batch_size;
6175 }
6176 release_inactive = true;
6177 }
6178 spin_unlock_irq(&conf->device_lock);
6179
6180 release_inactive_stripe_list(conf, temp_inactive_list,
6181 NR_STRIPE_HASH_LOCKS);
6182
6183 r5l_flush_stripe_to_raid(conf->log);
6184 if (release_inactive) {
6185 spin_lock_irq(&conf->device_lock);
6186 return 0;
6187 }
6188
6189 for (i = 0; i < batch_size; i++)
6190 handle_stripe(batch[i]);
6191 log_write_stripe_run(conf);
6192
6193 cond_resched();
6194
6195 spin_lock_irq(&conf->device_lock);
6196 for (i = 0; i < batch_size; i++) {
6197 hash = batch[i]->hash_lock_index;
6198 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6199 }
6200 return batch_size;
6201}
6202
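/*
 * Work function for the per-group worker threads: repeatedly release
 * queued stripes and handle batches for this group until no work is left.
 */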
6203static void raid5_do_work(struct work_struct *work)
6204{
6205 struct r5worker *worker = container_of(work, struct r5worker, work);
6206 struct r5worker_group *group = worker->group;
6207 struct r5conf *conf = group->conf;
6208 struct mddev *mddev = conf->mddev;
6209 int group_id = group - conf->worker_groups;
6210 int handled;
6211 struct blk_plug plug;
6212
6213 pr_debug("+++ raid5worker active\n");
6214
6215 blk_start_plug(&plug);
6216 handled = 0;
6217 spin_lock_irq(&conf->device_lock);
6218 while (1) {
6219 int batch_size, released;
6220
6221 released = release_stripe_list(conf, worker->temp_inactive_list);
6222
6223 batch_size = handle_active_stripes(conf, group_id, worker,
6224 worker->temp_inactive_list);
6225 worker->working = false;
6226 if (!batch_size && !released)
6227 break;
6228 handled += batch_size;
6229 wait_event_lock_irq(mddev->sb_wait,
6230 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6231 conf->device_lock);
6232 }
6233 pr_debug("%d stripes handled\n", handled);
6234
6235 spin_unlock_irq(&conf->device_lock);
6236 blk_finish_plug(&plug);
6237
6238 pr_debug("--- raid5worker inactive\n");
6239}
6240
/*
 * This is our raid5 kernel thread.
 *
 * It picks up stripes that are ready to be handled, processes them in
 * batches, retries deferred aligned reads, and flushes bitmap and journal
 * state as needed.
 */
6248static void raid5d(struct md_thread *thread)
6249{
6250 struct mddev *mddev = thread->mddev;
6251 struct r5conf *conf = mddev->private;
6252 int handled;
6253 struct blk_plug plug;
6254
6255 pr_debug("+++ raid5d active\n");
6256
6257 md_check_recovery(mddev);
6258
6259 blk_start_plug(&plug);
6260 handled = 0;
6261 spin_lock_irq(&conf->device_lock);
6262 while (1) {
6263 struct bio *bio;
6264 int batch_size, released;
6265 unsigned int offset;
6266
6267 released = release_stripe_list(conf, conf->temp_inactive_list);
6268 if (released)
6269 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6270
		if (!list_empty(&conf->bitmap_list)) {
			/* Now is a good time to flush some bitmap updates */
6274 conf->seq_flush++;
6275 spin_unlock_irq(&conf->device_lock);
6276 bitmap_unplug(mddev->bitmap);
6277 spin_lock_irq(&conf->device_lock);
6278 conf->seq_write = conf->seq_flush;
6279 activate_bit_delay(conf, conf->temp_inactive_list);
6280 }
6281 raid5_activate_delayed(conf);
6282
6283 while ((bio = remove_bio_from_retry(conf, &offset))) {
6284 int ok;
6285 spin_unlock_irq(&conf->device_lock);
6286 ok = retry_aligned_read(conf, bio, offset);
6287 spin_lock_irq(&conf->device_lock);
6288 if (!ok)
6289 break;
6290 handled++;
6291 }
6292
6293 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6294 conf->temp_inactive_list);
6295 if (!batch_size && !released)
6296 break;
6297 handled += batch_size;
6298
6299 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6300 spin_unlock_irq(&conf->device_lock);
6301 md_check_recovery(mddev);
6302 spin_lock_irq(&conf->device_lock);
6303 }
6304 }
6305 pr_debug("%d stripes handled\n", handled);
6306
6307 spin_unlock_irq(&conf->device_lock);
6308 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6309 mutex_trylock(&conf->cache_size_mutex)) {
6310 grow_one_stripe(conf, __GFP_NOWARN);
		/* Set the flag even if the allocation failed.  This helps
		 * slow down allocation requests when memory is short.
		 */
6314 set_bit(R5_DID_ALLOC, &conf->cache_state);
6315 mutex_unlock(&conf->cache_size_mutex);
6316 }
6317
6318 flush_deferred_bios(conf);
6319
6320 r5l_flush_stripe_to_raid(conf->log);
6321
6322 async_tx_issue_pending_all();
6323 blk_finish_plug(&plug);
6324
6325 pr_debug("--- raid5d inactive\n");
6326}
6327
6328static ssize_t
6329raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6330{
6331 struct r5conf *conf;
6332 int ret = 0;
6333 spin_lock(&mddev->lock);
6334 conf = mddev->private;
6335 if (conf)
6336 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6337 spin_unlock(&mddev->lock);
6338 return ret;
6339}
6340
6341int
6342raid5_set_cache_size(struct mddev *mddev, int size)
6343{
6344 struct r5conf *conf = mddev->private;
6345
6346 if (size <= 16 || size > 32768)
6347 return -EINVAL;
6348
6349 conf->min_nr_stripes = size;
6350 mutex_lock(&conf->cache_size_mutex);
6351 while (size < conf->max_nr_stripes &&
6352 drop_one_stripe(conf))
6353 ;
6354 mutex_unlock(&conf->cache_size_mutex);
6355
6356 md_allow_write(mddev);
6357
6358 mutex_lock(&conf->cache_size_mutex);
6359 while (size > conf->max_nr_stripes)
6360 if (!grow_one_stripe(conf, GFP_KERNEL))
6361 break;
6362 mutex_unlock(&conf->cache_size_mutex);
6363
6364 return 0;
6365}
6366EXPORT_SYMBOL(raid5_set_cache_size);
6367
6368static ssize_t
6369raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6370{
6371 struct r5conf *conf;
6372 unsigned long new;
6373 int err;
6374
6375 if (len >= PAGE_SIZE)
6376 return -EINVAL;
6377 if (kstrtoul(page, 10, &new))
6378 return -EINVAL;
6379 err = mddev_lock(mddev);
6380 if (err)
6381 return err;
6382 conf = mddev->private;
6383 if (!conf)
6384 err = -ENODEV;
6385 else
6386 err = raid5_set_cache_size(mddev, new);
6387 mddev_unlock(mddev);
6388
6389 return err ?: len;
6390}
6391
6392static struct md_sysfs_entry
6393raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6394 raid5_show_stripe_cache_size,
6395 raid5_store_stripe_cache_size);
6396
6397static ssize_t
6398raid5_show_rmw_level(struct mddev *mddev, char *page)
6399{
6400 struct r5conf *conf = mddev->private;
6401 if (conf)
6402 return sprintf(page, "%d\n", conf->rmw_level);
6403 else
6404 return 0;
6405}
6406
6407static ssize_t
6408raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6409{
6410 struct r5conf *conf = mddev->private;
6411 unsigned long new;
6412
6413 if (!conf)
6414 return -ENODEV;
6415
6416 if (len >= PAGE_SIZE)
6417 return -EINVAL;
6418
6419 if (kstrtoul(page, 10, &new))
6420 return -EINVAL;
6421
6422 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6423 return -EINVAL;
6424
6425 if (new != PARITY_DISABLE_RMW &&
6426 new != PARITY_ENABLE_RMW &&
6427 new != PARITY_PREFER_RMW)
6428 return -EINVAL;
6429
6430 conf->rmw_level = new;
6431 return len;
6432}
6433
6434static struct md_sysfs_entry
6435raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6436 raid5_show_rmw_level,
6437 raid5_store_rmw_level);
6438
6439
6440static ssize_t
6441raid5_show_preread_threshold(struct mddev *mddev, char *page)
6442{
6443 struct r5conf *conf;
6444 int ret = 0;
6445 spin_lock(&mddev->lock);
6446 conf = mddev->private;
6447 if (conf)
6448 ret = sprintf(page, "%d\n", conf->bypass_threshold);
6449 spin_unlock(&mddev->lock);
6450 return ret;
6451}
6452
6453static ssize_t
6454raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6455{
6456 struct r5conf *conf;
6457 unsigned long new;
6458 int err;
6459
6460 if (len >= PAGE_SIZE)
6461 return -EINVAL;
6462 if (kstrtoul(page, 10, &new))
6463 return -EINVAL;
6464
6465 err = mddev_lock(mddev);
6466 if (err)
6467 return err;
6468 conf = mddev->private;
6469 if (!conf)
6470 err = -ENODEV;
6471 else if (new > conf->min_nr_stripes)
6472 err = -EINVAL;
6473 else
6474 conf->bypass_threshold = new;
6475 mddev_unlock(mddev);
6476 return err ?: len;
6477}
6478
6479static struct md_sysfs_entry
6480raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6481 S_IRUGO | S_IWUSR,
6482 raid5_show_preread_threshold,
6483 raid5_store_preread_threshold);
6484
6485static ssize_t
6486raid5_show_skip_copy(struct mddev *mddev, char *page)
6487{
6488 struct r5conf *conf;
6489 int ret = 0;
6490 spin_lock(&mddev->lock);
6491 conf = mddev->private;
6492 if (conf)
6493 ret = sprintf(page, "%d\n", conf->skip_copy);
6494 spin_unlock(&mddev->lock);
6495 return ret;
6496}
6497
6498static ssize_t
6499raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6500{
6501 struct r5conf *conf;
6502 unsigned long new;
6503 int err;
6504
6505 if (len >= PAGE_SIZE)
6506 return -EINVAL;
6507 if (kstrtoul(page, 10, &new))
6508 return -EINVAL;
6509 new = !!new;
6510
6511 err = mddev_lock(mddev);
6512 if (err)
6513 return err;
6514 conf = mddev->private;
6515 if (!conf)
6516 err = -ENODEV;
6517 else if (new != conf->skip_copy) {
6518 mddev_suspend(mddev);
6519 conf->skip_copy = new;
6520 if (new)
6521 mddev->queue->backing_dev_info->capabilities |=
6522 BDI_CAP_STABLE_WRITES;
6523 else
6524 mddev->queue->backing_dev_info->capabilities &=
6525 ~BDI_CAP_STABLE_WRITES;
6526 mddev_resume(mddev);
6527 }
6528 mddev_unlock(mddev);
6529 return err ?: len;
6530}
6531
6532static struct md_sysfs_entry
6533raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6534 raid5_show_skip_copy,
6535 raid5_store_skip_copy);
6536
6537static ssize_t
6538stripe_cache_active_show(struct mddev *mddev, char *page)
6539{
6540 struct r5conf *conf = mddev->private;
6541 if (conf)
6542 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6543 else
6544 return 0;
6545}
6546
6547static struct md_sysfs_entry
6548raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6549
6550static ssize_t
6551raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6552{
6553 struct r5conf *conf;
6554 int ret = 0;
6555 spin_lock(&mddev->lock);
6556 conf = mddev->private;
6557 if (conf)
6558 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6559 spin_unlock(&mddev->lock);
6560 return ret;
6561}
6562
6563static int alloc_thread_groups(struct r5conf *conf, int cnt,
6564 int *group_cnt,
6565 int *worker_cnt_per_group,
6566 struct r5worker_group **worker_groups);
6567static ssize_t
6568raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6569{
6570 struct r5conf *conf;
6571 unsigned long new;
6572 int err;
6573 struct r5worker_group *new_groups, *old_groups;
6574 int group_cnt, worker_cnt_per_group;
6575
6576 if (len >= PAGE_SIZE)
6577 return -EINVAL;
6578 if (kstrtoul(page, 10, &new))
6579 return -EINVAL;
6580
6581 err = mddev_lock(mddev);
6582 if (err)
6583 return err;
6584 conf = mddev->private;
6585 if (!conf)
6586 err = -ENODEV;
6587 else if (new != conf->worker_cnt_per_group) {
6588 mddev_suspend(mddev);
6589
6590 old_groups = conf->worker_groups;
6591 if (old_groups)
6592 flush_workqueue(raid5_wq);
6593
6594 err = alloc_thread_groups(conf, new,
6595 &group_cnt, &worker_cnt_per_group,
6596 &new_groups);
6597 if (!err) {
6598 spin_lock_irq(&conf->device_lock);
6599 conf->group_cnt = group_cnt;
6600 conf->worker_cnt_per_group = worker_cnt_per_group;
6601 conf->worker_groups = new_groups;
6602 spin_unlock_irq(&conf->device_lock);
6603
6604 if (old_groups)
6605 kfree(old_groups[0].workers);
6606 kfree(old_groups);
6607 }
6608 mddev_resume(mddev);
6609 }
6610 mddev_unlock(mddev);
6611
6612 return err ?: len;
6613}
6614
6615static struct md_sysfs_entry
6616raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6617 raid5_show_group_thread_cnt,
6618 raid5_store_group_thread_cnt);
6619
6620static struct attribute *raid5_attrs[] = {
6621 &raid5_stripecache_size.attr,
6622 &raid5_stripecache_active.attr,
6623 &raid5_preread_bypass_threshold.attr,
6624 &raid5_group_thread_cnt.attr,
6625 &raid5_skip_copy.attr,
6626 &raid5_rmw_level.attr,
6627 &r5c_journal_mode.attr,
6628 NULL,
6629};
6630static struct attribute_group raid5_attrs_group = {
6631 .name = NULL,
6632 .attrs = raid5_attrs,
6633};
6634
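/*
 * Allocate 'cnt' workers per group, with one group per possible NUMA
 * node.  A count of zero disables the worker groups and leaves all
 * stripe handling to the raid5d thread.
 */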
6635static int alloc_thread_groups(struct r5conf *conf, int cnt,
6636 int *group_cnt,
6637 int *worker_cnt_per_group,
6638 struct r5worker_group **worker_groups)
6639{
6640 int i, j, k;
6641 ssize_t size;
6642 struct r5worker *workers;
6643
6644 *worker_cnt_per_group = cnt;
6645 if (cnt == 0) {
6646 *group_cnt = 0;
6647 *worker_groups = NULL;
6648 return 0;
6649 }
6650 *group_cnt = num_possible_nodes();
6651 size = sizeof(struct r5worker) * cnt;
6652 workers = kzalloc(size * *group_cnt, GFP_NOIO);
6653 *worker_groups = kzalloc(sizeof(struct r5worker_group) *
6654 *group_cnt, GFP_NOIO);
6655 if (!*worker_groups || !workers) {
6656 kfree(workers);
6657 kfree(*worker_groups);
6658 return -ENOMEM;
6659 }
6660
6661 for (i = 0; i < *group_cnt; i++) {
6662 struct r5worker_group *group;
6663
6664 group = &(*worker_groups)[i];
6665 INIT_LIST_HEAD(&group->handle_list);
6666 INIT_LIST_HEAD(&group->loprio_list);
6667 group->conf = conf;
6668 group->workers = workers + i * cnt;
6669
6670 for (j = 0; j < cnt; j++) {
6671 struct r5worker *worker = group->workers + j;
6672 worker->group = group;
6673 INIT_WORK(&worker->work, raid5_do_work);
6674
6675 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6676 INIT_LIST_HEAD(worker->temp_inactive_list + k);
6677 }
6678 }
6679
6680 return 0;
6681}
6682
6683static void free_thread_groups(struct r5conf *conf)
6684{
6685 if (conf->worker_groups)
6686 kfree(conf->worker_groups[0].workers);
6687 kfree(conf->worker_groups);
6688 conf->worker_groups = NULL;
6689}
6690
6691static sector_t
6692raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
6693{
6694 struct r5conf *conf = mddev->private;
6695
6696 if (!sectors)
6697 sectors = mddev->dev_sectors;
6698 if (!raid_disks)
		/* size is defined by the smallest of previous and new size */
6700 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
6701
6702 sectors &= ~((sector_t)conf->chunk_sectors - 1);
6703 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
6704 return sectors * (raid_disks - conf->max_degraded);
6705}
6706
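/*
 * Per-cpu scratch buffers: a spare page used for RAID6 parity
 * calculations and a flex_array ('scribble') used by the async
 * XOR/PQ routines.
 */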
6707static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6708{
6709 safe_put_page(percpu->spare_page);
6710 if (percpu->scribble)
6711 flex_array_free(percpu->scribble);
6712 percpu->spare_page = NULL;
6713 percpu->scribble = NULL;
6714}
6715
6716static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6717{
6718 if (conf->level == 6 && !percpu->spare_page)
6719 percpu->spare_page = alloc_page(GFP_KERNEL);
6720 if (!percpu->scribble)
6721 percpu->scribble = scribble_alloc(max(conf->raid_disks,
6722 conf->previous_raid_disks),
6723 max(conf->chunk_sectors,
6724 conf->prev_chunk_sectors)
6725 / STRIPE_SECTORS,
6726 GFP_KERNEL);
6727
6728 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
6729 free_scratch_buffer(conf, percpu);
6730 return -ENOMEM;
6731 }
6732
6733 return 0;
6734}
6735
6736static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
6737{
6738 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6739
6740 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6741 return 0;
6742}
6743
6744static void raid5_free_percpu(struct r5conf *conf)
6745{
6746 if (!conf->percpu)
6747 return;
6748
6749 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6750 free_percpu(conf->percpu);
6751}
6752
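/* Release everything allocated by setup_conf(). */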
6753static void free_conf(struct r5conf *conf)
6754{
6755 int i;
6756
6757 log_exit(conf);
6758
6759 if (conf->shrinker.nr_deferred)
6760 unregister_shrinker(&conf->shrinker);
6761
6762 free_thread_groups(conf);
6763 shrink_stripes(conf);
6764 raid5_free_percpu(conf);
6765 for (i = 0; i < conf->pool_size; i++)
6766 if (conf->disks[i].extra_page)
6767 put_page(conf->disks[i].extra_page);
6768 kfree(conf->disks);
6769 if (conf->bio_split)
6770 bioset_free(conf->bio_split);
6771 kfree(conf->stripe_hashtbl);
6772 kfree(conf->pending_data);
6773 kfree(conf);
6774}
6775
6776static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
6777{
6778 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6779 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6780
6781 if (alloc_scratch_buffer(conf, percpu)) {
6782 pr_warn("%s: failed memory allocation for cpu%u\n",
6783 __func__, cpu);
6784 return -ENOMEM;
6785 }
6786 return 0;
6787}
6788
6789static int raid5_alloc_percpu(struct r5conf *conf)
6790{
6791 int err = 0;
6792
6793 conf->percpu = alloc_percpu(struct raid5_percpu);
6794 if (!conf->percpu)
6795 return -ENOMEM;
6796
6797 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6798 if (!err) {
6799 conf->scribble_disks = max(conf->raid_disks,
6800 conf->previous_raid_disks);
6801 conf->scribble_sectors = max(conf->chunk_sectors,
6802 conf->prev_chunk_sectors);
6803 }
6804 return err;
6805}
6806
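/*
 * Shrinker callbacks: under memory pressure, drop cached stripes until
 * the cache is back down to min_nr_stripes.  SHRINK_STOP is returned
 * when nothing can be freed or the cache-size mutex is contended.
 */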
6807static unsigned long raid5_cache_scan(struct shrinker *shrink,
6808 struct shrink_control *sc)
6809{
6810 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6811 unsigned long ret = SHRINK_STOP;
6812
6813 if (mutex_trylock(&conf->cache_size_mutex)) {
		ret = 0;
6815 while (ret < sc->nr_to_scan &&
6816 conf->max_nr_stripes > conf->min_nr_stripes) {
6817 if (drop_one_stripe(conf) == 0) {
6818 ret = SHRINK_STOP;
6819 break;
6820 }
6821 ret++;
6822 }
6823 mutex_unlock(&conf->cache_size_mutex);
6824 }
6825 return ret;
6826}
6827
6828static unsigned long raid5_cache_count(struct shrinker *shrink,
6829 struct shrink_control *sc)
6830{
6831 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6832
6833 if (conf->max_nr_stripes < conf->min_nr_stripes)
		/* unlikely, but not impossible */
6835 return 0;
6836 return conf->max_nr_stripes - conf->min_nr_stripes;
6837}
6838
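/*
 * Build the r5conf for this array: validate level, layout and chunk
 * size, then set up locks, lists, worker groups, per-cpu buffers, the
 * stripe cache with its shrinker, and the raid5d thread.
 */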
6839static struct r5conf *setup_conf(struct mddev *mddev)
6840{
6841 struct r5conf *conf;
6842 int raid_disk, memory, max_disks;
6843 struct md_rdev *rdev;
6844 struct disk_info *disk;
6845 char pers_name[6];
6846 int i;
6847 int group_cnt, worker_cnt_per_group;
6848 struct r5worker_group *new_group;
6849
6850 if (mddev->new_level != 5
6851 && mddev->new_level != 4
6852 && mddev->new_level != 6) {
6853 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6854 mdname(mddev), mddev->new_level);
6855 return ERR_PTR(-EIO);
6856 }
6857 if ((mddev->new_level == 5
6858 && !algorithm_valid_raid5(mddev->new_layout)) ||
6859 (mddev->new_level == 6
6860 && !algorithm_valid_raid6(mddev->new_layout))) {
6861 pr_warn("md/raid:%s: layout %d not supported\n",
6862 mdname(mddev), mddev->new_layout);
6863 return ERR_PTR(-EIO);
6864 }
6865 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6866 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6867 mdname(mddev), mddev->raid_disks);
6868 return ERR_PTR(-EINVAL);
6869 }
6870
6871 if (!mddev->new_chunk_sectors ||
6872 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6873 !is_power_of_2(mddev->new_chunk_sectors)) {
6874 pr_warn("md/raid:%s: invalid chunk size %d\n",
6875 mdname(mddev), mddev->new_chunk_sectors << 9);
6876 return ERR_PTR(-EINVAL);
6877 }
6878
6879 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
6880 if (conf == NULL)
6881 goto abort;
6882 INIT_LIST_HEAD(&conf->free_list);
6883 INIT_LIST_HEAD(&conf->pending_list);
6884 conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
6885 PENDING_IO_MAX, GFP_KERNEL);
6886 if (!conf->pending_data)
6887 goto abort;
6888 for (i = 0; i < PENDING_IO_MAX; i++)
6889 list_add(&conf->pending_data[i].sibling, &conf->free_list);
6890
6891 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6892 &new_group)) {
6893 conf->group_cnt = group_cnt;
6894 conf->worker_cnt_per_group = worker_cnt_per_group;
6895 conf->worker_groups = new_group;
6896 } else
6897 goto abort;
6898 spin_lock_init(&conf->device_lock);
6899 seqcount_init(&conf->gen_lock);
6900 mutex_init(&conf->cache_size_mutex);
6901 init_waitqueue_head(&conf->wait_for_quiescent);
6902 init_waitqueue_head(&conf->wait_for_stripe);
6903 init_waitqueue_head(&conf->wait_for_overlap);
6904 INIT_LIST_HEAD(&conf->handle_list);
6905 INIT_LIST_HEAD(&conf->loprio_list);
6906 INIT_LIST_HEAD(&conf->hold_list);
6907 INIT_LIST_HEAD(&conf->delayed_list);
6908 INIT_LIST_HEAD(&conf->bitmap_list);
6909 init_llist_head(&conf->released_stripes);
6910 atomic_set(&conf->active_stripes, 0);
6911 atomic_set(&conf->preread_active_stripes, 0);
6912 atomic_set(&conf->active_aligned_reads, 0);
6913 spin_lock_init(&conf->pending_bios_lock);
6914 conf->batch_bio_dispatch = true;
6915 rdev_for_each(rdev, mddev) {
6916 if (test_bit(Journal, &rdev->flags))
6917 continue;
6918 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
6919 conf->batch_bio_dispatch = false;
6920 break;
6921 }
6922 }
6923
6924 conf->bypass_threshold = BYPASS_THRESHOLD;
6925 conf->recovery_disabled = mddev->recovery_disabled - 1;
6926
6927 conf->raid_disks = mddev->raid_disks;
6928 if (mddev->reshape_position == MaxSector)
6929 conf->previous_raid_disks = mddev->raid_disks;
6930 else
6931 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
6932 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
6933
6934 conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
6935 GFP_KERNEL);
6936
6937 if (!conf->disks)
6938 goto abort;
6939
6940 for (i = 0; i < max_disks; i++) {
6941 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
6942 if (!conf->disks[i].extra_page)
6943 goto abort;
6944 }
6945
6946 conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
6947 if (!conf->bio_split)
6948 goto abort;
6949 conf->mddev = mddev;
6950
6951 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
6952 goto abort;
6953
	/* We init hash_locks[0] separately so that it can be used
	 * as the reference lock in the spin_lock_nest_lock() call
	 * in lock_all_device_hash_locks_irq, in order to convince
	 * lockdep that we know what we are doing.
	 */
6959 spin_lock_init(conf->hash_locks);
6960 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
6961 spin_lock_init(conf->hash_locks + i);
6962
6963 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6964 INIT_LIST_HEAD(conf->inactive_list + i);
6965
6966 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6967 INIT_LIST_HEAD(conf->temp_inactive_list + i);
6968
6969 atomic_set(&conf->r5c_cached_full_stripes, 0);
6970 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
6971 atomic_set(&conf->r5c_cached_partial_stripes, 0);
6972 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
6973 atomic_set(&conf->r5c_flushing_full_stripes, 0);
6974 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
6975
6976 conf->level = mddev->new_level;
6977 conf->chunk_sectors = mddev->new_chunk_sectors;
6978 if (raid5_alloc_percpu(conf) != 0)
6979 goto abort;
6980
6981 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
6982
6983 rdev_for_each(rdev, mddev) {
6984 raid_disk = rdev->raid_disk;
6985 if (raid_disk >= max_disks
6986 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
6987 continue;
6988 disk = conf->disks + raid_disk;
6989
6990 if (test_bit(Replacement, &rdev->flags)) {
6991 if (disk->replacement)
6992 goto abort;
6993 disk->replacement = rdev;
6994 } else {
6995 if (disk->rdev)
6996 goto abort;
6997 disk->rdev = rdev;
6998 }
6999
7000 if (test_bit(In_sync, &rdev->flags)) {
7001 char b[BDEVNAME_SIZE];
7002 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
7003 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
7004 } else if (rdev->saved_raid_disk != raid_disk)
			/* Cannot rely on bitmap to complete recovery */
7006 conf->fullsync = 1;
7007 }
7008
7009 conf->level = mddev->new_level;
7010 if (conf->level == 6) {
7011 conf->max_degraded = 2;
7012 if (raid6_call.xor_syndrome)
7013 conf->rmw_level = PARITY_ENABLE_RMW;
7014 else
7015 conf->rmw_level = PARITY_DISABLE_RMW;
7016 } else {
7017 conf->max_degraded = 1;
7018 conf->rmw_level = PARITY_ENABLE_RMW;
7019 }
7020 conf->algorithm = mddev->new_layout;
7021 conf->reshape_progress = mddev->reshape_position;
7022 if (conf->reshape_progress != MaxSector) {
7023 conf->prev_chunk_sectors = mddev->chunk_sectors;
7024 conf->prev_algo = mddev->layout;
7025 } else {
7026 conf->prev_chunk_sectors = conf->chunk_sectors;
7027 conf->prev_algo = conf->algorithm;
7028 }
7029
7030 conf->min_nr_stripes = NR_STRIPES;
7031 if (mddev->reshape_position != MaxSector) {
7032 int stripes = max_t(int,
7033 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
7034 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
7035 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7036 if (conf->min_nr_stripes != NR_STRIPES)
7037 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7038 mdname(mddev), conf->min_nr_stripes);
7039 }
7040 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7041 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7042 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7043 if (grow_stripes(conf, conf->min_nr_stripes)) {
7044 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7045 mdname(mddev), memory);
7046 goto abort;
7047 } else
7048 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7049
	/*
	 * Register a shrinker so the stripe cache can give memory back
	 * under pressure.  Losing a stripe head costs more than the time
	 * to refill it, so scale ->seeks with the number of devices.
	 */
7054 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7055 conf->shrinker.scan_objects = raid5_cache_scan;
7056 conf->shrinker.count_objects = raid5_cache_count;
7057 conf->shrinker.batch = 128;
7058 conf->shrinker.flags = 0;
7059 if (register_shrinker(&conf->shrinker)) {
7060 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7061 mdname(mddev));
7062 goto abort;
7063 }
7064
7065 sprintf(pers_name, "raid%d", mddev->new_level);
7066 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7067 if (!conf->thread) {
7068 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7069 mdname(mddev));
7070 goto abort;
7071 }
7072
7073 return conf;
7074
7075 abort:
7076 if (conf) {
7077 free_conf(conf);
7078 return ERR_PTR(-EIO);
7079 } else
7080 return ERR_PTR(-ENOMEM);
7081}
7082
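/*
 * Return 1 if the device at 'raid_disk' holds only parity blocks in the
 * given layout, so an unrecovered region on it cannot contain user data.
 */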
7083static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7084{
7085 switch (algo) {
7086 case ALGORITHM_PARITY_0:
7087 if (raid_disk < max_degraded)
7088 return 1;
7089 break;
7090 case ALGORITHM_PARITY_N:
7091 if (raid_disk >= raid_disks - max_degraded)
7092 return 1;
7093 break;
7094 case ALGORITHM_PARITY_0_6:
7095 if (raid_disk == 0 ||
7096 raid_disk == raid_disks - 1)
7097 return 1;
7098 break;
7099 case ALGORITHM_LEFT_ASYMMETRIC_6:
7100 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7101 case ALGORITHM_LEFT_SYMMETRIC_6:
7102 case ALGORITHM_RIGHT_SYMMETRIC_6:
7103 if (raid_disk == raid_disks - 1)
7104 return 1;
7105 }
7106 return 0;
7107}
7108
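/*
 * ->run(): bring the array up.  Validate any in-progress reshape, build
 * the configuration, count working and dirty-parity devices, refuse to
 * start an unsafe dirty degraded array, and set the queue limits.
 */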
7109static int raid5_run(struct mddev *mddev)
7110{
7111 struct r5conf *conf;
7112 int working_disks = 0;
7113 int dirty_parity_disks = 0;
7114 struct md_rdev *rdev;
7115 struct md_rdev *journal_dev = NULL;
7116 sector_t reshape_offset = 0;
7117 int i;
7118 long long min_offset_diff = 0;
7119 int first = 1;
7120
7121 if (mddev_init_writes_pending(mddev) < 0)
7122 return -ENOMEM;
7123
7124 if (mddev->recovery_cp != MaxSector)
7125 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7126 mdname(mddev));
7127
7128 rdev_for_each(rdev, mddev) {
7129 long long diff;
7130
7131 if (test_bit(Journal, &rdev->flags)) {
7132 journal_dev = rdev;
7133 continue;
7134 }
7135 if (rdev->raid_disk < 0)
7136 continue;
7137 diff = (rdev->new_data_offset - rdev->data_offset);
7138 if (first) {
7139 min_offset_diff = diff;
7140 first = 0;
7141 } else if (mddev->reshape_backwards &&
7142 diff < min_offset_diff)
7143 min_offset_diff = diff;
7144 else if (!mddev->reshape_backwards &&
7145 diff > min_offset_diff)
7146 min_offset_diff = diff;
7147 }
7148
7149 if (mddev->reshape_position != MaxSector) {
		/* Check that we can continue the reshape.
		 * Difficulties arise if the stripe we would write to
		 * next is at or after the stripe we would read from next.
		 * For a reshape that changes the number of devices, this
		 * is only possible for a very short time, and mdadm makes
		 * sure that time appears to have passed before assembling
		 * the array.  So we fail if that time hasn't passed.
		 * For a reshape that keeps the number of devices the same,
		 * mdadm must be monitoring the reshape and keeping the
		 * critical areas read-only and backed up, so we don't need
		 * to check for that possibility here.
		 */
7162 sector_t here_new, here_old;
7163 int old_disks;
7164 int max_degraded = (mddev->level == 6 ? 2 : 1);
7165 int chunk_sectors;
7166 int new_data_disks;
7167
7168 if (journal_dev) {
7169 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7170 mdname(mddev));
7171 return -EINVAL;
7172 }
7173
7174 if (mddev->new_level != mddev->level) {
7175 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7176 mdname(mddev));
7177 return -EINVAL;
7178 }
7179 old_disks = mddev->raid_disks - mddev->delta_disks;
		/* reshape_position must be on a new-stripe boundary, and one
		 * further up in new geometry must map after here in old
		 * geometry.
		 * If the chunk sizes differ, the reshape is performed in
		 * units of the larger of the two, so round to that.
		 */
7187 here_new = mddev->reshape_position;
7188 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7189 new_data_disks = mddev->raid_disks - max_degraded;
7190 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7191 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7192 mdname(mddev));
7193 return -EINVAL;
7194 }
7195 reshape_offset = here_new * chunk_sectors;
7196
7197 here_old = mddev->reshape_position;
7198 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
		/* here_old is the first stripe that we might need to read
		 * from */
7201 if (mddev->delta_disks == 0) {
			/* We cannot be sure it is safe to start an in-place
			 * reshape.  It is only safe if user-space is
			 * monitoring and taking constant backups.
			 * mdadm always starts a situation like this in
			 * read-only mode so it can take control before
			 * allowing any writes.  So just check for that.
			 */
7209 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7210 abs(min_offset_diff) >= mddev->new_chunk_sectors)
7211 ;
7212 else if (mddev->ro == 0) {
7213 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7214 mdname(mddev));
7215 return -EINVAL;
7216 }
7217 } else if (mddev->reshape_backwards
7218 ? (here_new * chunk_sectors + min_offset_diff <=
7219 here_old * chunk_sectors)
7220 : (here_new * chunk_sectors >=
7221 here_old * chunk_sectors + (-min_offset_diff))) {
			/* Reading from the same stripe as writing to - bad */
7223 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7224 mdname(mddev));
7225 return -EINVAL;
7226 }
7227 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7228
7229 } else {
7230 BUG_ON(mddev->level != mddev->new_level);
7231 BUG_ON(mddev->layout != mddev->new_layout);
7232 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7233 BUG_ON(mddev->delta_disks != 0);
7234 }
7235
7236 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7237 test_bit(MD_HAS_PPL, &mddev->flags)) {
7238 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7239 mdname(mddev));
7240 clear_bit(MD_HAS_PPL, &mddev->flags);
7241 }
7242
7243 if (mddev->private == NULL)
7244 conf = setup_conf(mddev);
7245 else
7246 conf = mddev->private;
7247
7248 if (IS_ERR(conf))
7249 return PTR_ERR(conf);
7250
7251 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7252 if (!journal_dev) {
7253 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7254 mdname(mddev));
7255 mddev->ro = 1;
7256 set_disk_ro(mddev->gendisk, 1);
7257 } else if (mddev->recovery_cp == MaxSector)
7258 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7259 }
7260
7261 conf->min_offset_diff = min_offset_diff;
7262 mddev->thread = conf->thread;
7263 conf->thread = NULL;
7264 mddev->private = conf;
7265
7266 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7267 i++) {
7268 rdev = conf->disks[i].rdev;
7269 if (!rdev && conf->disks[i].replacement) {
			/* The original device is missing, so promote the
			 * replacement to take its place. */
7271 rdev = conf->disks[i].replacement;
7272 conf->disks[i].replacement = NULL;
7273 clear_bit(Replacement, &rdev->flags);
7274 conf->disks[i].rdev = rdev;
7275 }
7276 if (!rdev)
7277 continue;
7278 if (conf->disks[i].replacement &&
7279 conf->reshape_progress != MaxSector) {
			/* replacements and reshape simply do not mix. */
7281 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7282 goto abort;
7283 }
7284 if (test_bit(In_sync, &rdev->flags)) {
7285 working_disks++;
7286 continue;
7287 }

		/* This device is not fully in-sync.  It still counts as a
		 * "dirty parity" device if the not-yet-recovered region
		 * contains nothing but parity in both the old and the new
		 * geometry; otherwise it simply counts as degraded.
		 * 0.90 metadata (minor_version > 90 marks a reshape) cannot
		 * record a recovery_offset, so assume recovery restarted
		 * from the reshape position.
		 */
7297 if (mddev->major_version == 0 &&
7298 mddev->minor_version > 90)
7299 rdev->recovery_offset = reshape_offset;
7300
7301 if (rdev->recovery_offset < reshape_offset) {
			/* We need to check both the old and new layout */
7303 if (!only_parity(rdev->raid_disk,
7304 conf->algorithm,
7305 conf->raid_disks,
7306 conf->max_degraded))
7307 continue;
7308 }
7309 if (!only_parity(rdev->raid_disk,
7310 conf->prev_algo,
7311 conf->previous_raid_disks,
7312 conf->max_degraded))
7313 continue;
7314 dirty_parity_disks++;
7315 }

	/*
	 * 0 for a fully functional array, 1 or 2 for a degraded array.
	 */
7320 mddev->degraded = raid5_calc_degraded(conf);
7321
7322 if (has_failed(conf)) {
7323 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7324 mdname(mddev), mddev->degraded, conf->raid_disks);
7325 goto abort;
7326 }

	/* device size must be a multiple of chunk size */
7329 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
7330 mddev->resync_max_sectors = mddev->dev_sectors;
7331
7332 if (mddev->degraded > dirty_parity_disks &&
7333 mddev->recovery_cp != MaxSector) {
7334 if (test_bit(MD_HAS_PPL, &mddev->flags))
7335 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7336 mdname(mddev));
7337 else if (mddev->ok_start_degraded)
7338 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7339 mdname(mddev));
7340 else {
7341 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7342 mdname(mddev));
7343 goto abort;
7344 }
7345 }
7346
7347 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7348 mdname(mddev), conf->level,
7349 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7350 mddev->new_layout);
7351
7352 print_raid5_conf(conf);
7353
7354 if (conf->reshape_progress != MaxSector) {
7355 conf->reshape_safe = conf->reshape_progress;
7356 atomic_set(&conf->reshape_stripes, 0);
7357 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7358 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7359 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7360 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7361 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7362 "reshape");
7363 }

	/* Set up sysfs attributes now that the array is up. */
7366 if (mddev->to_remove == &raid5_attrs_group)
7367 mddev->to_remove = NULL;
7368 else if (mddev->kobj.sd &&
7369 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7370 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7371 mdname(mddev));
7372 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7373
7374 if (mddev->queue) {
7375 int chunk_size;

		/* read-ahead size must cover two whole stripes, which
		 * is 2 * (number of data disks) * chunksize
		 */
7380 int data_disks = conf->previous_raid_disks - conf->max_degraded;
7381 int stripe = data_disks *
7382 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7383 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7384 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
7385
7386 chunk_size = mddev->chunk_sectors << 9;
7387 blk_queue_io_min(mddev->queue, chunk_size);
7388 blk_queue_io_opt(mddev->queue, chunk_size *
7389 (conf->raid_disks - conf->max_degraded));
7390 mddev->queue->limits.raid_partial_stripes_expensive = 1;

		/*
		 * We can only discard a whole stripe. It doesn't make sense
		 * to discard the data disks but leave the parity disk intact.
		 */
7395 stripe = stripe * PAGE_SIZE;
		/* Round the discard granularity up to a power of two */
7398 while ((stripe-1) & stripe)
7399 stripe = (stripe | (stripe-1)) + 1;
7400 mddev->queue->limits.discard_alignment = stripe;
7401 mddev->queue->limits.discard_granularity = stripe;
7402
7403 blk_queue_max_write_same_sectors(mddev->queue, 0);
7404 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7405
7406 rdev_for_each(rdev, mddev) {
7407 disk_stack_limits(mddev->gendisk, rdev->bdev,
7408 rdev->data_offset << 9);
7409 disk_stack_limits(mddev->gendisk, rdev->bdev,
7410 rdev->new_data_offset << 9);
7411 }

		/*
		 * Only enable DISCARD if the underlying devices reliably
		 * return zeroes for discarded regions.  Otherwise a discard
		 * followed by a partial-stripe write can leave the parity
		 * inconsistent, and data on a subsequently failed disk could
		 * not be reconstructed.  The block layer cannot guarantee
		 * this, so we rely on the devices_handle_discard_safely
		 * module parameter as the administrator's confirmation.
		 */
7428 if (devices_handle_discard_safely &&
7429 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7430 mddev->queue->limits.discard_granularity >= stripe)
7431 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
7432 mddev->queue);
7433 else
7434 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
7435 mddev->queue);
7436
7437 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7438 }
7439
7440 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7441 goto abort;
7442
7443 return 0;
7444abort:
7445 md_unregister_thread(&mddev->thread);
7446 print_raid5_conf(conf);
7447 free_conf(conf);
7448 mddev->private = NULL;
7449 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7450 return -EIO;
7451}
7452
7453static void raid5_free(struct mddev *mddev, void *priv)
7454{
7455 struct r5conf *conf = priv;
7456
7457 free_conf(conf);
7458 mddev->to_remove = &raid5_attrs_group;
7459}
7460
7461static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7462{
7463 struct r5conf *conf = mddev->private;
7464 int i;
7465
7466 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7467 conf->chunk_sectors / 2, mddev->layout);
7468 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7469 rcu_read_lock();
7470 for (i = 0; i < conf->raid_disks; i++) {
7471 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
7472 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7473 }
7474 rcu_read_unlock();
7475 seq_printf (seq, "]");
7476}
7477
7478static void print_raid5_conf (struct r5conf *conf)
7479{
7480 int i;
7481 struct disk_info *tmp;
7482
7483 pr_debug("RAID conf printout:\n");
7484 if (!conf) {
7485 pr_debug("(conf==NULL)\n");
7486 return;
7487 }
7488 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7489 conf->raid_disks,
7490 conf->raid_disks - conf->mddev->degraded);
7491
7492 for (i = 0; i < conf->raid_disks; i++) {
7493 char b[BDEVNAME_SIZE];
7494 tmp = conf->disks + i;
7495 if (tmp->rdev)
7496 pr_debug(" disk %d, o:%d, dev:%s\n",
7497 i, !test_bit(Faulty, &tmp->rdev->flags),
7498 bdevname(tmp->rdev->bdev, b));
7499 }
7500}
7501
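/*
 * Count devices that have just finished recovery and mark them In_sync.
 * A fully recovered replacement causes the original device to be marked
 * Faulty so that it will be removed and never re-added.
 */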
7502static int raid5_spare_active(struct mddev *mddev)
7503{
7504 int i;
7505 struct r5conf *conf = mddev->private;
7506 struct disk_info *tmp;
7507 int count = 0;
7508 unsigned long flags;
7509
7510 for (i = 0; i < conf->raid_disks; i++) {
7511 tmp = conf->disks + i;
7512 if (tmp->replacement
7513 && tmp->replacement->recovery_offset == MaxSector
7514 && !test_bit(Faulty, &tmp->replacement->flags)
7515 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active. */
7517 if (!tmp->rdev
7518 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7519 count++;
7520 if (tmp->rdev) {
				/* Replaced device is not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
7525 set_bit(Faulty, &tmp->rdev->flags);
7526 sysfs_notify_dirent_safe(
7527 tmp->rdev->sysfs_state);
7528 }
7529 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7530 } else if (tmp->rdev
7531 && tmp->rdev->recovery_offset == MaxSector
7532 && !test_bit(Faulty, &tmp->rdev->flags)
7533 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7534 count++;
7535 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7536 }
7537 }
7538 spin_lock_irqsave(&conf->device_lock, flags);
7539 mddev->degraded = raid5_calc_degraded(conf);
7540 spin_unlock_irqrestore(&conf->device_lock, flags);
7541 print_raid5_conf(conf);
7542 return count;
7543}
7544
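/*
 * Hot-remove a device.  A journal device can only be removed once no
 * stripes are cached or active; a data device must have no pending I/O,
 * and a non-faulty device is only released when recovery is not possible.
 */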
7545static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7546{
7547 struct r5conf *conf = mddev->private;
7548 int err = 0;
7549 int number = rdev->raid_disk;
7550 struct md_rdev **rdevp;
7551 struct disk_info *p = conf->disks + number;
7552
7553 print_raid5_conf(conf);
7554 if (test_bit(Journal, &rdev->flags) && conf->log) {
		/*
		 * We can't wait for pending writes here, as this is called
		 * from raid5d and waiting would deadlock.  There is no
		 * locking against new writes either, so just refuse the
		 * removal while any stripes are still cached or active.
		 */
7561 if (atomic_read(&conf->active_stripes) ||
7562 atomic_read(&conf->r5c_cached_full_stripes) ||
7563 atomic_read(&conf->r5c_cached_partial_stripes)) {
7564 return -EBUSY;
7565 }
7566 log_exit(conf);
7567 return 0;
7568 }
7569 if (rdev == p->rdev)
7570 rdevp = &p->rdev;
7571 else if (rdev == p->replacement)
7572 rdevp = &p->replacement;
7573 else
7574 return 0;
7575
7576 if (number >= conf->raid_disks &&
7577 conf->reshape_progress == MaxSector)
7578 clear_bit(In_sync, &rdev->flags);
7579
7580 if (test_bit(In_sync, &rdev->flags) ||
7581 atomic_read(&rdev->nr_pending)) {
7582 err = -EBUSY;
7583 goto abort;
7584 }

	/* Only remove non-faulty devices if recovery
	 * is not possible.
	 */
7588 if (!test_bit(Faulty, &rdev->flags) &&
7589 mddev->recovery_disabled != conf->recovery_disabled &&
7590 !has_failed(conf) &&
7591 (!p->replacement || p->replacement == rdev) &&
7592 number < conf->raid_disks) {
7593 err = -EBUSY;
7594 goto abort;
7595 }
7596 *rdevp = NULL;
7597 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7598 synchronize_rcu();
7599 if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
7601 err = -EBUSY;
7602 *rdevp = rdev;
7603 }
7604 }
7605 if (!err) {
7606 err = log_modify(conf, rdev, false);
7607 if (err)
7608 goto abort;
7609 }
7610 if (p->replacement) {
		/* We must have just cleared 'rdev' */
7612 p->rdev = p->replacement;
7613 clear_bit(Replacement, &p->replacement->flags);
7614 smp_mb();
		/* Make sure other CPUs see rdev and replacement as identical
		 * before replacement is cleared, so they never observe both
		 * as NULL. */
7617 p->replacement = NULL;
7618
7619 if (!err)
7620 err = log_modify(conf, p->rdev, true);
7621 }
7622
7623 clear_bit(WantReplacement, &rdev->flags);
7624abort:
7625
7626 print_raid5_conf(conf);
7627 return err;
7628}
7629
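/*
 * Hot-add a device, either into an empty slot (preferring its previous
 * slot, rdev->saved_raid_disk) or as a replacement for a device that
 * has requested one.
 */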
7630static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7631{
7632 struct r5conf *conf = mddev->private;
7633 int err = -EEXIST;
7634 int disk;
7635 struct disk_info *p;
7636 int first = 0;
7637 int last = conf->raid_disks - 1;
7638
7639 if (test_bit(Journal, &rdev->flags)) {
7640 if (conf->log)
7641 return -EBUSY;
7642
7643 rdev->raid_disk = 0;
		/*
		 * The array is in read-only mode if the journal is missing,
		 * so no write requests are running.  We should be safe.
		 */
7648 log_init(conf, rdev, false);
7649 return 0;
7650 }
7651 if (mddev->recovery_disabled == conf->recovery_disabled)
7652 return -EBUSY;
7653
7654 if (rdev->saved_raid_disk < 0 && has_failed(conf))
		/* no point adding a device */
7656 return -EINVAL;
7657
7658 if (rdev->raid_disk >= 0)
7659 first = last = rdev->raid_disk;

	/*
	 * Find an empty slot, but prefer rdev->saved_raid_disk
	 * if possible.
	 */
7665 if (rdev->saved_raid_disk >= 0 &&
7666 rdev->saved_raid_disk >= first &&
7667 conf->disks[rdev->saved_raid_disk].rdev == NULL)
7668 first = rdev->saved_raid_disk;
7669
7670 for (disk = first; disk <= last; disk++) {
7671 p = conf->disks + disk;
7672 if (p->rdev == NULL) {
7673 clear_bit(In_sync, &rdev->flags);
7674 rdev->raid_disk = disk;
7675 if (rdev->saved_raid_disk != disk)
7676 conf->fullsync = 1;
7677 rcu_assign_pointer(p->rdev, rdev);
7678
7679 err = log_modify(conf, rdev, true);
7680
7681 goto out;
7682 }
7683 }
7684 for (disk = first; disk <= last; disk++) {
7685 p = conf->disks + disk;
7686 if (test_bit(WantReplacement, &p->rdev->flags) &&
7687 p->replacement == NULL) {
7688 clear_bit(In_sync, &rdev->flags);
7689 set_bit(Replacement, &rdev->flags);
7690 rdev->raid_disk = disk;
7691 err = 0;
7692 conf->fullsync = 1;
7693 rcu_assign_pointer(p->replacement, rdev);
7694 break;
7695 }
7696 }
7697out:
7698 print_raid5_conf(conf);
7699 return err;
7700}
7701
7702static int raid5_resize(struct mddev *mddev, sector_t sectors)
7703{
	/* No resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
7711 sector_t newsize;
7712 struct r5conf *conf = mddev->private;
7713
7714 if (conf->log || raid5_has_ppl(conf))
7715 return -EINVAL;
7716 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7717 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
7718 if (mddev->external_size &&
7719 mddev->array_sectors > newsize)
7720 return -EINVAL;
7721 if (mddev->bitmap) {
7722 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
7723 if (ret)
7724 return ret;
7725 }
7726 md_set_array_sectors(mddev, newsize);
7727 if (sectors > mddev->dev_sectors &&
7728 mddev->recovery_cp > mddev->dev_sectors) {
7729 mddev->recovery_cp = mddev->dev_sectors;
7730 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7731 }
7732 mddev->dev_sectors = sectors;
7733 mddev->resync_max_sectors = sectors;
7734 return 0;
7735}
7736
7737static int check_stripe_cache(struct mddev *mddev)
7738{
	/* Can only proceed if there are plenty of stripe_heads.
	 * We need a minimum of one full stripe, and for sensible progress
	 * it is best to have about 4 times that.
	 * If there are not enough, warn the user and refuse to reshape.
	 */
7747 struct r5conf *conf = mddev->private;
7748 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
7749 > conf->min_nr_stripes ||
7750 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
7751 > conf->min_nr_stripes) {
7752 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
7753 mdname(mddev),
7754 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7755 / STRIPE_SIZE)*4);
7756 return 0;
7757 }
7758 return 1;
7759}
7760
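/*
 * Validate a requested reshape and pre-allocate the larger scribble
 * buffers and stripe heads it will need.
 */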
7761static int check_reshape(struct mddev *mddev)
7762{
7763 struct r5conf *conf = mddev->private;
7764
7765 if (conf->log || raid5_has_ppl(conf))
7766 return -EINVAL;
7767 if (mddev->delta_disks == 0 &&
7768 mddev->new_layout == mddev->layout &&
7769 mddev->new_chunk_sectors == mddev->chunk_sectors)
7770 return 0;
7771 if (has_failed(conf))
7772 return -EINVAL;
7773 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum.
		 */
7779 int min = 2;
7780 if (mddev->level == 6)
7781 min = 4;
7782 if (mddev->raid_disks + mddev->delta_disks < min)
7783 return -EINVAL;
7784 }
7785
7786 if (!check_stripe_cache(mddev))
7787 return -ENOSPC;
7788
7789 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
7790 mddev->delta_disks > 0)
7791 if (resize_chunks(conf,
7792 conf->previous_raid_disks
7793 + max(0, mddev->delta_disks),
7794 max(mddev->new_chunk_sectors,
7795 mddev->chunk_sectors)
7796 ) < 0)
7797 return -ENOMEM;
7798
7799 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
7800 return 0;
7801 return resize_stripes(conf, (conf->previous_raid_disks
7802 + mddev->delta_disks));
7803}
7804
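/*
 * Begin a reshape: switch the geometry under device_lock, add any
 * available spares, and start the "reshape" sync thread.
 */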
7805static int raid5_start_reshape(struct mddev *mddev)
7806{
7807 struct r5conf *conf = mddev->private;
7808 struct md_rdev *rdev;
7809 int spares = 0;
7810 unsigned long flags;
7811
7812 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7813 return -EBUSY;
7814
7815 if (!check_stripe_cache(mddev))
7816 return -ENOSPC;
7817
7818 if (has_failed(conf))
7819 return -EINVAL;
7820
7821 rdev_for_each(rdev, mddev) {
7822 if (!test_bit(In_sync, &rdev->flags)
7823 && !test_bit(Faulty, &rdev->flags))
7824 spares++;
7825 }
7826
7827 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size.
		 */
7831 return -EINVAL;

	/* Refuse to reduce the size of the array.  Any reduction in
	 * array size must be done through explicit setting of the
	 * array_size attribute, not through a reshape.
	 */
7837 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7838 < mddev->array_sectors) {
7839 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
7840 mdname(mddev));
7841 return -EINVAL;
7842 }
7843
7844 atomic_set(&conf->reshape_stripes, 0);
7845 spin_lock_irq(&conf->device_lock);
7846 write_seqcount_begin(&conf->gen_lock);
7847 conf->previous_raid_disks = conf->raid_disks;
7848 conf->raid_disks += mddev->delta_disks;
7849 conf->prev_chunk_sectors = conf->chunk_sectors;
7850 conf->chunk_sectors = mddev->new_chunk_sectors;
7851 conf->prev_algo = conf->algorithm;
7852 conf->algorithm = mddev->new_layout;
7853 conf->generation++;

	/* Code that selects data_offset needs to see the generation
	 * update if reshape_progress has been set - so a memory
	 * barrier is needed.
	 */
7857 smp_mb();
7858 if (mddev->reshape_backwards)
7859 conf->reshape_progress = raid5_size(mddev, 0, 0);
7860 else
7861 conf->reshape_progress = 0;
7862 conf->reshape_safe = conf->reshape_progress;
7863 write_seqcount_end(&conf->gen_lock);
7864 spin_unlock_irq(&conf->device_lock);
7865

	/* Now make sure any requests that proceeded on the assumption
	 * the reshape wasn't running - like Discard or Read - have
	 * completed.
	 */
7870 mddev_suspend(mddev);
7871 mddev_resume(mddev);

	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "end of reshape" state of such
	 * devices.
	 */
7880 if (mddev->delta_disks >= 0) {
7881 rdev_for_each(rdev, mddev)
7882 if (rdev->raid_disk < 0 &&
7883 !test_bit(Faulty, &rdev->flags)) {
7884 if (raid5_add_disk(mddev, rdev) == 0) {
7885 if (rdev->raid_disk
7886 >= conf->previous_raid_disks)
7887 set_bit(In_sync, &rdev->flags);
7888 else
7889 rdev->recovery_offset = 0;
7890
7891 if (sysfs_link_rdev(mddev, rdev))
						/* Failure here is OK */;
7893 }
7894 } else if (rdev->raid_disk >= conf->previous_raid_disks
7895 && !test_bit(Faulty, &rdev->flags)) {
				/* Device already sits in one of the new
				 * slots, so treat it as in-sync. */
7897 set_bit(In_sync, &rdev->flags);
7898 }

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * old and new device counts.
		 */
7904 spin_lock_irqsave(&conf->device_lock, flags);
7905 mddev->degraded = raid5_calc_degraded(conf);
7906 spin_unlock_irqrestore(&conf->device_lock, flags);
7907 }
7908 mddev->raid_disks = conf->raid_disks;
7909 mddev->reshape_position = conf->reshape_progress;
7910 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7911
7912 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7913 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7914 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7915 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7916 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7917 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7918 "reshape");
7919 if (!mddev->sync_thread) {
7920 mddev->recovery = 0;
7921 spin_lock_irq(&conf->device_lock);
7922 write_seqcount_begin(&conf->gen_lock);
7923 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
7924 mddev->new_chunk_sectors =
7925 conf->chunk_sectors = conf->prev_chunk_sectors;
7926 mddev->new_layout = conf->algorithm = conf->prev_algo;
7927 rdev_for_each(rdev, mddev)
7928 rdev->new_data_offset = rdev->data_offset;
7929 smp_wmb();
			conf->generation--;
7931 conf->reshape_progress = MaxSector;
7932 mddev->reshape_position = MaxSector;
7933 write_seqcount_end(&conf->gen_lock);
7934 spin_unlock_irq(&conf->device_lock);
7935 return -EAGAIN;
7936 }
7937 conf->reshape_checkpoint = jiffies;
7938 md_wakeup_thread(mddev->sync_thread);
7939 md_new_event(mddev);
7940 return 0;
7941}
7942

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'.
 */
7946static void end_reshape(struct r5conf *conf)
7947{
7948
7949 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
7950 struct md_rdev *rdev;
7951
7952 spin_lock_irq(&conf->device_lock);
7953 conf->previous_raid_disks = conf->raid_disks;
7954 rdev_for_each(rdev, conf->mddev)
7955 rdev->data_offset = rdev->new_data_offset;
7956 smp_wmb();
7957 conf->reshape_progress = MaxSector;
7958 conf->mddev->reshape_position = MaxSector;
7959 spin_unlock_irq(&conf->device_lock);
7960 wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which
		 * is 2 * (number of data disks) * chunksize
		 */
7965 if (conf->mddev->queue) {
7966 int data_disks = conf->raid_disks - conf->max_degraded;
7967 int stripe = data_disks * ((conf->chunk_sectors << 9)
7968 / PAGE_SIZE);
7969 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7970 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
7971 }
7972 }
7973}

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
7978static void raid5_finish_reshape(struct mddev *mddev)
7979{
7980 struct r5conf *conf = mddev->private;
7981
7982 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7983
7984 if (mddev->delta_disks > 0) {
7985 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7986 if (mddev->queue) {
7987 set_capacity(mddev->gendisk, mddev->array_sectors);
7988 revalidate_disk(mddev->gendisk);
7989 }
7990 } else {
7991 int d;
7992 spin_lock_irq(&conf->device_lock);
7993 mddev->degraded = raid5_calc_degraded(conf);
7994 spin_unlock_irq(&conf->device_lock);
7995 for (d = conf->raid_disks ;
7996 d < conf->raid_disks - mddev->delta_disks;
7997 d++) {
7998 struct md_rdev *rdev = conf->disks[d].rdev;
7999 if (rdev)
8000 clear_bit(In_sync, &rdev->flags);
8001 rdev = conf->disks[d].replacement;
8002 if (rdev)
8003 clear_bit(In_sync, &rdev->flags);
8004 }
8005 }
8006 mddev->layout = conf->algorithm;
8007 mddev->chunk_sectors = conf->chunk_sectors;
8008 mddev->reshape_position = MaxSector;
8009 mddev->delta_disks = 0;
8010 mddev->reshape_backwards = 0;
8011 }
8012}
8013
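/*
 * Quiesce handling: state 1 stops all writes and waits for active
 * stripes and aligned reads to drain, state 0 resumes normal operation,
 * and state 2 just wakes anything waiting on wait_for_overlap.
 */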
8014static void raid5_quiesce(struct mddev *mddev, int state)
8015{
8016 struct r5conf *conf = mddev->private;
8017
8018 switch(state) {
8019 case 2:
8020 wake_up(&conf->wait_for_overlap);
8021 break;
8022
8023 case 1:
8024 lock_all_device_hash_locks_irq(conf);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
8028 r5c_flush_cache(conf, INT_MAX);
8029 conf->quiesce = 2;
8030 wait_event_cmd(conf->wait_for_quiescent,
8031 atomic_read(&conf->active_stripes) == 0 &&
8032 atomic_read(&conf->active_aligned_reads) == 0,
8033 unlock_all_device_hash_locks_irq(conf),
8034 lock_all_device_hash_locks_irq(conf));
8035 conf->quiesce = 1;
8036 unlock_all_device_hash_locks_irq(conf);
		/* allow reshape to continue */
8038 wake_up(&conf->wait_for_overlap);
8039 break;
8040
8041 case 0:
8042 lock_all_device_hash_locks_irq(conf);
8043 conf->quiesce = 0;
8044 wake_up(&conf->wait_for_quiescent);
8045 wake_up(&conf->wait_for_overlap);
8046 unlock_all_device_hash_locks_irq(conf);
8047 break;
8048 }
8049 r5l_quiesce(conf->log, state);
8050}
8051
8052static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8053{
8054 struct r0conf *raid0_conf = mddev->private;
8055 sector_t sectors;

	/* for raid0 takeover only one zone is supported */
8058 if (raid0_conf->nr_strip_zones > 1) {
8059 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8060 mdname(mddev));
8061 return ERR_PTR(-EINVAL);
8062 }
8063
8064 sectors = raid0_conf->strip_zone[0].zone_end;
8065 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8066 mddev->dev_sectors = sectors;
8067 mddev->new_level = level;
8068 mddev->new_layout = ALGORITHM_PARITY_N;
8069 mddev->new_chunk_sectors = mddev->chunk_sectors;
8070 mddev->raid_disks += 1;
8071 mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
8073 mddev->recovery_cp = MaxSector;
8074
8075 return setup_conf(mddev);
8076}
8077
8078static void *raid5_takeover_raid1(struct mddev *mddev)
8079{
8080 int chunksect;
8081 void *ret;
8082
8083 if (mddev->raid_disks != 2 ||
8084 mddev->degraded > 1)
8085 return ERR_PTR(-EINVAL);

	/* raid1 has no chunk size: start from a 64K default (in sectors) */
8089 chunksect = 64*2;
	/* halve until the chunk size evenly divides the array size */
8092 while (chunksect && (mddev->array_sectors & (chunksect-1)))
8093 chunksect >>= 1;
8094
8095 if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
8097 return ERR_PTR(-EINVAL);
8098
8099 mddev->new_level = 5;
8100 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8101 mddev->new_chunk_sectors = chunksect;
8102
8103 ret = setup_conf(mddev);
8104 if (!IS_ERR(ret))
8105 mddev_clear_unsupported_flags(mddev,
8106 UNSUPPORTED_MDDEV_FLAGS);
8107 return ret;
8108}
8109
8110static void *raid5_takeover_raid6(struct mddev *mddev)
8111{
8112 int new_layout;
8113
8114 switch (mddev->layout) {
8115 case ALGORITHM_LEFT_ASYMMETRIC_6:
8116 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8117 break;
8118 case ALGORITHM_RIGHT_ASYMMETRIC_6:
8119 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8120 break;
8121 case ALGORITHM_LEFT_SYMMETRIC_6:
8122 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8123 break;
8124 case ALGORITHM_RIGHT_SYMMETRIC_6:
8125 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8126 break;
8127 case ALGORITHM_PARITY_0_6:
8128 new_layout = ALGORITHM_PARITY_0;
8129 break;
8130 case ALGORITHM_PARITY_N:
8131 new_layout = ALGORITHM_PARITY_N;
8132 break;
8133 default:
8134 return ERR_PTR(-EINVAL);
8135 }
8136 mddev->new_level = 5;
8137 mddev->new_layout = new_layout;
8138 mddev->delta_disks = -1;
8139 mddev->raid_disks -= 1;
8140 return setup_conf(mddev);
8141}
8142
8143static int raid5_check_reshape(struct mddev *mddev)
8144{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation -
	 * to be used by a later reshape pass.
	 */
8150 struct r5conf *conf = mddev->private;
8151 int new_chunk = mddev->new_chunk_sectors;
8152
8153 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8154 return -EINVAL;
8155 if (new_chunk > 0) {
8156 if (!is_power_of_2(new_chunk))
8157 return -EINVAL;
8158 if (new_chunk < (PAGE_SIZE>>9))
8159 return -EINVAL;
8160 if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of the array size */
8162 return -EINVAL;
8163 }

	/* They look valid */

8167 if (mddev->raid_disks == 2) {
		/* can make the change immediately */
8169 if (mddev->new_layout >= 0) {
8170 conf->algorithm = mddev->new_layout;
8171 mddev->layout = mddev->new_layout;
8172 }
8173 if (new_chunk > 0) {
8174 conf->chunk_sectors = new_chunk ;
8175 mddev->chunk_sectors = new_chunk;
8176 }
8177 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8178 md_wakeup_thread(mddev->thread);
8179 }
8180 return check_reshape(mddev);
8181}
8182
8183static int raid6_check_reshape(struct mddev *mddev)
8184{
8185 int new_chunk = mddev->new_chunk_sectors;
8186
8187 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8188 return -EINVAL;
8189 if (new_chunk > 0) {
8190 if (!is_power_of_2(new_chunk))
8191 return -EINVAL;
8192 if (new_chunk < (PAGE_SIZE >> 9))
8193 return -EINVAL;
8194 if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of the array size */
8196 return -EINVAL;
8197 }

	/* They look valid */
8200 return check_reshape(mddev);
8201}
8202
8203static void *raid5_takeover(struct mddev *mddev)
8204{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
8211 if (mddev->level == 0)
8212 return raid45_takeover_raid0(mddev, 5);
8213 if (mddev->level == 1)
8214 return raid5_takeover_raid1(mddev);
8215 if (mddev->level == 4) {
8216 mddev->new_layout = ALGORITHM_PARITY_N;
8217 mddev->new_level = 5;
8218 return setup_conf(mddev);
8219 }
8220 if (mddev->level == 6)
8221 return raid5_takeover_raid6(mddev);
8222
8223 return ERR_PTR(-EINVAL);
8224}
8225
8226static void *raid4_takeover(struct mddev *mddev)
8227{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
8232 if (mddev->level == 0)
8233 return raid45_takeover_raid0(mddev, 4);
8234 if (mddev->level == 5 &&
8235 mddev->layout == ALGORITHM_PARITY_N) {
8236 mddev->new_layout = 0;
8237 mddev->new_level = 4;
8238 return setup_conf(mddev);
8239 }
8240 return ERR_PTR(-EINVAL);
8241}
8242
8243static struct md_personality raid5_personality;
8244
8245static void *raid6_takeover(struct mddev *mddev)
8246{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
8251 int new_layout;
8252
8253 if (mddev->pers != &raid5_personality)
8254 return ERR_PTR(-EINVAL);
8255 if (mddev->degraded > 1)
8256 return ERR_PTR(-EINVAL);
8257 if (mddev->raid_disks > 253)
8258 return ERR_PTR(-EINVAL);
8259 if (mddev->raid_disks < 3)
8260 return ERR_PTR(-EINVAL);
8261
8262 switch (mddev->layout) {
8263 case ALGORITHM_LEFT_ASYMMETRIC:
8264 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8265 break;
8266 case ALGORITHM_RIGHT_ASYMMETRIC:
8267 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8268 break;
8269 case ALGORITHM_LEFT_SYMMETRIC:
8270 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8271 break;
8272 case ALGORITHM_RIGHT_SYMMETRIC:
8273 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8274 break;
8275 case ALGORITHM_PARITY_0:
8276 new_layout = ALGORITHM_PARITY_0_6;
8277 break;
8278 case ALGORITHM_PARITY_N:
8279 new_layout = ALGORITHM_PARITY_N;
8280 break;
8281 default:
8282 return ERR_PTR(-EINVAL);
8283 }
8284 mddev->new_level = 6;
8285 mddev->new_layout = new_layout;
8286 mddev->delta_disks = 1;
8287 mddev->raid_disks += 1;
8288 return setup_conf(mddev);
8289}
8290
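/*
 * Switch the consistency policy via sysfs: "ppl" enables the partial
 * parity log on a RAID5 array, "resync" disables PPL or drops a failed
 * journal so that a full resync is used instead.
 */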
8291static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8292{
8293 struct r5conf *conf;
8294 int err;
8295
8296 err = mddev_lock(mddev);
8297 if (err)
8298 return err;
8299 conf = mddev->private;
8300 if (!conf) {
8301 mddev_unlock(mddev);
8302 return -ENODEV;
8303 }
8304
8305 if (strncmp(buf, "ppl", 3) == 0) {
		/* ppl only works with RAID 5 */
8307 if (!raid5_has_ppl(conf) && conf->level == 5) {
8308 err = log_init(conf, NULL, true);
8309 if (!err) {
8310 err = resize_stripes(conf, conf->pool_size);
8311 if (err)
8312 log_exit(conf);
8313 }
8314 } else
8315 err = -EINVAL;
8316 } else if (strncmp(buf, "resync", 6) == 0) {
8317 if (raid5_has_ppl(conf)) {
8318 mddev_suspend(mddev);
8319 log_exit(conf);
8320 mddev_resume(mddev);
8321 err = resize_stripes(conf, conf->pool_size);
8322 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8323 r5l_log_disk_error(conf)) {
8324 bool journal_dev_exists = false;
8325 struct md_rdev *rdev;
8326
8327 rdev_for_each(rdev, mddev)
8328 if (test_bit(Journal, &rdev->flags)) {
8329 journal_dev_exists = true;
8330 break;
8331 }
8332
8333 if (!journal_dev_exists) {
8334 mddev_suspend(mddev);
8335 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8336 mddev_resume(mddev);
8337 } else
8338 err = -EBUSY;
8339 } else
8340 err = -EINVAL;
8341 } else {
8342 err = -EINVAL;
8343 }
8344
8345 if (!err)
8346 md_update_sb(mddev, 1);
8347
8348 mddev_unlock(mddev);
8349
8350 return err;
8351}
8352
8353static struct md_personality raid6_personality =
8354{
8355 .name = "raid6",
8356 .level = 6,
8357 .owner = THIS_MODULE,
8358 .make_request = raid5_make_request,
8359 .run = raid5_run,
8360 .free = raid5_free,
8361 .status = raid5_status,
8362 .error_handler = raid5_error,
8363 .hot_add_disk = raid5_add_disk,
8364 .hot_remove_disk= raid5_remove_disk,
8365 .spare_active = raid5_spare_active,
8366 .sync_request = raid5_sync_request,
8367 .resize = raid5_resize,
8368 .size = raid5_size,
8369 .check_reshape = raid6_check_reshape,
8370 .start_reshape = raid5_start_reshape,
8371 .finish_reshape = raid5_finish_reshape,
8372 .quiesce = raid5_quiesce,
8373 .takeover = raid6_takeover,
8374 .congested = raid5_congested,
8375 .change_consistency_policy = raid5_change_consistency_policy,
8376};
8377static struct md_personality raid5_personality =
8378{
8379 .name = "raid5",
8380 .level = 5,
8381 .owner = THIS_MODULE,
8382 .make_request = raid5_make_request,
8383 .run = raid5_run,
8384 .free = raid5_free,
8385 .status = raid5_status,
8386 .error_handler = raid5_error,
8387 .hot_add_disk = raid5_add_disk,
8388 .hot_remove_disk= raid5_remove_disk,
8389 .spare_active = raid5_spare_active,
8390 .sync_request = raid5_sync_request,
8391 .resize = raid5_resize,
8392 .size = raid5_size,
8393 .check_reshape = raid5_check_reshape,
8394 .start_reshape = raid5_start_reshape,
8395 .finish_reshape = raid5_finish_reshape,
8396 .quiesce = raid5_quiesce,
8397 .takeover = raid5_takeover,
8398 .congested = raid5_congested,
8399 .change_consistency_policy = raid5_change_consistency_policy,
8400};
8401
8402static struct md_personality raid4_personality =
8403{
8404 .name = "raid4",
8405 .level = 4,
8406 .owner = THIS_MODULE,
8407 .make_request = raid5_make_request,
8408 .run = raid5_run,
8409 .free = raid5_free,
8410 .status = raid5_status,
8411 .error_handler = raid5_error,
8412 .hot_add_disk = raid5_add_disk,
8413 .hot_remove_disk= raid5_remove_disk,
8414 .spare_active = raid5_spare_active,
8415 .sync_request = raid5_sync_request,
8416 .resize = raid5_resize,
8417 .size = raid5_size,
8418 .check_reshape = raid5_check_reshape,
8419 .start_reshape = raid5_start_reshape,
8420 .finish_reshape = raid5_finish_reshape,
8421 .quiesce = raid5_quiesce,
8422 .takeover = raid4_takeover,
8423 .congested = raid5_congested,
8424 .change_consistency_policy = raid5_change_consistency_policy,
8425};
8426
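/*
 * Module init: create the unbound raid5 workqueue, register the CPU
 * hotplug callbacks for the per-cpu buffers, then register the
 * raid4/5/6 personalities.
 */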
8427static int __init raid5_init(void)
8428{
8429 int ret;
8430
8431 raid5_wq = alloc_workqueue("raid5wq",
8432 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
8433 if (!raid5_wq)
8434 return -ENOMEM;
8435
8436 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
8437 "md/raid5:prepare",
8438 raid456_cpu_up_prepare,
8439 raid456_cpu_dead);
8440 if (ret) {
8441 destroy_workqueue(raid5_wq);
8442 return ret;
8443 }
8444 register_md_personality(&raid6_personality);
8445 register_md_personality(&raid5_personality);
8446 register_md_personality(&raid4_personality);
8447 return 0;
8448}
8449
8450static void raid5_exit(void)
8451{
8452 unregister_md_personality(&raid6_personality);
8453 unregister_md_personality(&raid5_personality);
8454 unregister_md_personality(&raid4_personality);
8455 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
8456 destroy_workqueue(raid5_wq);
8457}
8458
8459module_init(raid5_init);
8460module_exit(raid5_exit);
8461MODULE_LICENSE("GPL");
8462MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
8463MODULE_ALIAS("md-personality-4");
8464MODULE_ALIAS("md-raid5");
8465MODULE_ALIAS("md-raid4");
8466MODULE_ALIAS("md-level-5");
8467MODULE_ALIAS("md-level-4");
8468MODULE_ALIAS("md-personality-8");
8469MODULE_ALIAS("md-raid6");
8470MODULE_ALIAS("md-level-6");

/* Former module names, kept as aliases */
8473MODULE_ALIAS("raid5");
8474MODULE_ALIAS("raid6");
8475