1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46#include <linux/blkdev.h>
47#include <linux/kthread.h>
48#include <linux/raid/pq.h>
49#include <linux/async_tx.h>
50#include <linux/module.h>
51#include <linux/async.h>
52#include <linux/seq_file.h>
53#include <linux/cpu.h>
54#include <linux/slab.h>
55#include <linux/ratelimit.h>
56#include <linux/nodemask.h>
57#include <linux/flex_array.h>
58
59#include <trace/events/block.h>
60#include <linux/list_sort.h>
61
62#include "md.h"
63#include "raid5.h"
64#include "raid0.h"
65#include "md-bitmap.h"
66#include "raid5-log.h"
67
68#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
69
70#define cpu_to_group(cpu) cpu_to_node(cpu)
71#define ANY_GROUP NUMA_NO_NODE
72
73static bool devices_handle_discard_safely = false;
74module_param(devices_handle_discard_safely, bool, 0644);
75MODULE_PARM_DESC(devices_handle_discard_safely,
76 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
77static struct workqueue_struct *raid5_wq;
78
79static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
80{
81 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
82 return &conf->stripe_hashtbl[hash];
83}
84
85static inline int stripe_hash_locks_hash(sector_t sect)
86{
87 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
88}
89
90static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
91{
92 spin_lock_irq(conf->hash_locks + hash);
93 spin_lock(&conf->device_lock);
94}
95
96static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
97{
98 spin_unlock(&conf->device_lock);
99 spin_unlock_irq(conf->hash_locks + hash);
100}
101
102static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
103{
104 int i;
105 spin_lock_irq(conf->hash_locks);
106 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
107 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
108 spin_lock(&conf->device_lock);
109}
110
111static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
112{
113 int i;
114 spin_unlock(&conf->device_lock);
115 for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
116 spin_unlock(conf->hash_locks + i);
117 spin_unlock_irq(conf->hash_locks);
118}
119
120
121static inline int raid6_d0(struct stripe_head *sh)
122{
123 if (sh->ddf_layout)
124
125 return 0;
126
127 if (sh->qd_idx == sh->disks - 1)
128 return 0;
129 else
130 return sh->qd_idx + 1;
131}
132static inline int raid6_next_disk(int disk, int raid_disks)
133{
134 disk++;
135 return (disk < raid_disks) ? disk : 0;
136}
137
138
139
140
141
142
143static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
144 int *count, int syndrome_disks)
145{
146 int slot = *count;
147
148 if (sh->ddf_layout)
149 (*count)++;
150 if (idx == sh->pd_idx)
151 return syndrome_disks;
152 if (idx == sh->qd_idx)
153 return syndrome_disks + 1;
154 if (!sh->ddf_layout)
155 (*count)++;
156 return slot;
157}
158
159static void print_raid5_conf (struct r5conf *conf);
160
161static int stripe_operations_active(struct stripe_head *sh)
162{
163 return sh->check_state || sh->reconstruct_state ||
164 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
165 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
166}
167
168static bool stripe_is_lowprio(struct stripe_head *sh)
169{
170 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
171 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
172 !test_bit(STRIPE_R5C_CACHING, &sh->state);
173}
174
175static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
176{
177 struct r5conf *conf = sh->raid_conf;
178 struct r5worker_group *group;
179 int thread_cnt;
180 int i, cpu = sh->cpu;
181
182 if (!cpu_online(cpu)) {
183 cpu = cpumask_any(cpu_online_mask);
184 sh->cpu = cpu;
185 }
186
187 if (list_empty(&sh->lru)) {
188 struct r5worker_group *group;
189 group = conf->worker_groups + cpu_to_group(cpu);
190 if (stripe_is_lowprio(sh))
191 list_add_tail(&sh->lru, &group->loprio_list);
192 else
193 list_add_tail(&sh->lru, &group->handle_list);
194 group->stripes_cnt++;
195 sh->group = group;
196 }
197
198 if (conf->worker_cnt_per_group == 0) {
199 md_wakeup_thread(conf->mddev->thread);
200 return;
201 }
202
203 group = conf->worker_groups + cpu_to_group(sh->cpu);
204
205 group->workers[0].working = true;
206
207 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
208
209 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
210
211 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
212 if (group->workers[i].working == false) {
213 group->workers[i].working = true;
214 queue_work_on(sh->cpu, raid5_wq,
215 &group->workers[i].work);
216 thread_cnt--;
217 }
218 }
219}
220
221static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
222 struct list_head *temp_inactive_list)
223{
224 int i;
225 int injournal = 0;
226
227 BUG_ON(!list_empty(&sh->lru));
228 BUG_ON(atomic_read(&conf->active_stripes)==0);
229
230 if (r5c_is_writeback(conf->log))
231 for (i = sh->disks; i--; )
232 if (test_bit(R5_InJournal, &sh->dev[i].flags))
233 injournal++;
234
235
236
237
238
239
240
241 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
242 (conf->quiesce && r5c_is_writeback(conf->log) &&
243 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
244 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
245 r5c_make_stripe_write_out(sh);
246 set_bit(STRIPE_HANDLE, &sh->state);
247 }
248
249 if (test_bit(STRIPE_HANDLE, &sh->state)) {
250 if (test_bit(STRIPE_DELAYED, &sh->state) &&
251 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
252 list_add_tail(&sh->lru, &conf->delayed_list);
253 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
254 sh->bm_seq - conf->seq_write > 0)
255 list_add_tail(&sh->lru, &conf->bitmap_list);
256 else {
257 clear_bit(STRIPE_DELAYED, &sh->state);
258 clear_bit(STRIPE_BIT_DELAY, &sh->state);
259 if (conf->worker_cnt_per_group == 0) {
260 if (stripe_is_lowprio(sh))
261 list_add_tail(&sh->lru,
262 &conf->loprio_list);
263 else
264 list_add_tail(&sh->lru,
265 &conf->handle_list);
266 } else {
267 raid5_wakeup_stripe_thread(sh);
268 return;
269 }
270 }
271 md_wakeup_thread(conf->mddev->thread);
272 } else {
273 BUG_ON(stripe_operations_active(sh));
274 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
275 if (atomic_dec_return(&conf->preread_active_stripes)
276 < IO_THRESHOLD)
277 md_wakeup_thread(conf->mddev->thread);
278 atomic_dec(&conf->active_stripes);
279 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
280 if (!r5c_is_writeback(conf->log))
281 list_add_tail(&sh->lru, temp_inactive_list);
282 else {
283 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
284 if (injournal == 0)
285 list_add_tail(&sh->lru, temp_inactive_list);
286 else if (injournal == conf->raid_disks - conf->max_degraded) {
287
288 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
289 atomic_inc(&conf->r5c_cached_full_stripes);
290 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
291 atomic_dec(&conf->r5c_cached_partial_stripes);
292 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
293 r5c_check_cached_full_stripe(conf);
294 } else
295
296
297
298
299
300 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
301 }
302 }
303 }
304}
305
306static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
307 struct list_head *temp_inactive_list)
308{
309 if (atomic_dec_and_test(&sh->count))
310 do_release_stripe(conf, sh, temp_inactive_list);
311}
312
313
314
315
316
317
318
319
320static void release_inactive_stripe_list(struct r5conf *conf,
321 struct list_head *temp_inactive_list,
322 int hash)
323{
324 int size;
325 bool do_wakeup = false;
326 unsigned long flags;
327
328 if (hash == NR_STRIPE_HASH_LOCKS) {
329 size = NR_STRIPE_HASH_LOCKS;
330 hash = NR_STRIPE_HASH_LOCKS - 1;
331 } else
332 size = 1;
333 while (size) {
334 struct list_head *list = &temp_inactive_list[size - 1];
335
336
337
338
339
340 if (!list_empty_careful(list)) {
341 spin_lock_irqsave(conf->hash_locks + hash, flags);
342 if (list_empty(conf->inactive_list + hash) &&
343 !list_empty(list))
344 atomic_dec(&conf->empty_inactive_list_nr);
345 list_splice_tail_init(list, conf->inactive_list + hash);
346 do_wakeup = true;
347 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
348 }
349 size--;
350 hash--;
351 }
352
353 if (do_wakeup) {
354 wake_up(&conf->wait_for_stripe);
355 if (atomic_read(&conf->active_stripes) == 0)
356 wake_up(&conf->wait_for_quiescent);
357 if (conf->retry_read_aligned)
358 md_wakeup_thread(conf->mddev->thread);
359 }
360}
361
362
363static int release_stripe_list(struct r5conf *conf,
364 struct list_head *temp_inactive_list)
365{
366 struct stripe_head *sh, *t;
367 int count = 0;
368 struct llist_node *head;
369
370 head = llist_del_all(&conf->released_stripes);
371 head = llist_reverse_order(head);
372 llist_for_each_entry_safe(sh, t, head, release_list) {
373 int hash;
374
375
376 smp_mb();
377 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
378
379
380
381
382
383 hash = sh->hash_lock_index;
384 __release_stripe(conf, sh, &temp_inactive_list[hash]);
385 count++;
386 }
387
388 return count;
389}
390
391void raid5_release_stripe(struct stripe_head *sh)
392{
393 struct r5conf *conf = sh->raid_conf;
394 unsigned long flags;
395 struct list_head list;
396 int hash;
397 bool wakeup;
398
399
400
401 if (atomic_add_unless(&sh->count, -1, 1))
402 return;
403
404 if (unlikely(!conf->mddev->thread) ||
405 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
406 goto slow_path;
407 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
408 if (wakeup)
409 md_wakeup_thread(conf->mddev->thread);
410 return;
411slow_path:
412
413 if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
414 INIT_LIST_HEAD(&list);
415 hash = sh->hash_lock_index;
416 do_release_stripe(conf, sh, &list);
417 spin_unlock_irqrestore(&conf->device_lock, flags);
418 release_inactive_stripe_list(conf, &list, hash);
419 }
420}
421
422static inline void remove_hash(struct stripe_head *sh)
423{
424 pr_debug("remove_hash(), stripe %llu\n",
425 (unsigned long long)sh->sector);
426
427 hlist_del_init(&sh->hash);
428}
429
430static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
431{
432 struct hlist_head *hp = stripe_hash(conf, sh->sector);
433
434 pr_debug("insert_hash(), stripe %llu\n",
435 (unsigned long long)sh->sector);
436
437 hlist_add_head(&sh->hash, hp);
438}
439
440
441static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
442{
443 struct stripe_head *sh = NULL;
444 struct list_head *first;
445
446 if (list_empty(conf->inactive_list + hash))
447 goto out;
448 first = (conf->inactive_list + hash)->next;
449 sh = list_entry(first, struct stripe_head, lru);
450 list_del_init(first);
451 remove_hash(sh);
452 atomic_inc(&conf->active_stripes);
453 BUG_ON(hash != sh->hash_lock_index);
454 if (list_empty(conf->inactive_list + hash))
455 atomic_inc(&conf->empty_inactive_list_nr);
456out:
457 return sh;
458}
459
460static void shrink_buffers(struct stripe_head *sh)
461{
462 struct page *p;
463 int i;
464 int num = sh->raid_conf->pool_size;
465
466 for (i = 0; i < num ; i++) {
467 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
468 p = sh->dev[i].page;
469 if (!p)
470 continue;
471 sh->dev[i].page = NULL;
472 put_page(p);
473 }
474}
475
476static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
477{
478 int i;
479 int num = sh->raid_conf->pool_size;
480
481 for (i = 0; i < num; i++) {
482 struct page *page;
483
484 if (!(page = alloc_page(gfp))) {
485 return 1;
486 }
487 sh->dev[i].page = page;
488 sh->dev[i].orig_page = page;
489 }
490
491 return 0;
492}
493
494static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
495 struct stripe_head *sh);
496
497static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
498{
499 struct r5conf *conf = sh->raid_conf;
500 int i, seq;
501
502 BUG_ON(atomic_read(&sh->count) != 0);
503 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
504 BUG_ON(stripe_operations_active(sh));
505 BUG_ON(sh->batch_head);
506
507 pr_debug("init_stripe called, stripe %llu\n",
508 (unsigned long long)sector);
509retry:
510 seq = read_seqcount_begin(&conf->gen_lock);
511 sh->generation = conf->generation - previous;
512 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
513 sh->sector = sector;
514 stripe_set_idx(sector, conf, previous, sh);
515 sh->state = 0;
516
517 for (i = sh->disks; i--; ) {
518 struct r5dev *dev = &sh->dev[i];
519
520 if (dev->toread || dev->read || dev->towrite || dev->written ||
521 test_bit(R5_LOCKED, &dev->flags)) {
522 pr_err("sector=%llx i=%d %p %p %p %p %d\n",
523 (unsigned long long)sh->sector, i, dev->toread,
524 dev->read, dev->towrite, dev->written,
525 test_bit(R5_LOCKED, &dev->flags));
526 WARN_ON(1);
527 }
528 dev->flags = 0;
529 dev->sector = raid5_compute_blocknr(sh, i, previous);
530 }
531 if (read_seqcount_retry(&conf->gen_lock, seq))
532 goto retry;
533 sh->overwrite_disks = 0;
534 insert_hash(conf, sh);
535 sh->cpu = smp_processor_id();
536 set_bit(STRIPE_BATCH_READY, &sh->state);
537}
538
539static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
540 short generation)
541{
542 struct stripe_head *sh;
543
544 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
545 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
546 if (sh->sector == sector && sh->generation == generation)
547 return sh;
548 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
549 return NULL;
550}
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565int raid5_calc_degraded(struct r5conf *conf)
566{
567 int degraded, degraded2;
568 int i;
569
570 rcu_read_lock();
571 degraded = 0;
572 for (i = 0; i < conf->previous_raid_disks; i++) {
573 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
574 if (rdev && test_bit(Faulty, &rdev->flags))
575 rdev = rcu_dereference(conf->disks[i].replacement);
576 if (!rdev || test_bit(Faulty, &rdev->flags))
577 degraded++;
578 else if (test_bit(In_sync, &rdev->flags))
579 ;
580 else
581
582
583
584
585
586
587
588
589
590 if (conf->raid_disks >= conf->previous_raid_disks)
591 degraded++;
592 }
593 rcu_read_unlock();
594 if (conf->raid_disks == conf->previous_raid_disks)
595 return degraded;
596 rcu_read_lock();
597 degraded2 = 0;
598 for (i = 0; i < conf->raid_disks; i++) {
599 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
600 if (rdev && test_bit(Faulty, &rdev->flags))
601 rdev = rcu_dereference(conf->disks[i].replacement);
602 if (!rdev || test_bit(Faulty, &rdev->flags))
603 degraded2++;
604 else if (test_bit(In_sync, &rdev->flags))
605 ;
606 else
607
608
609
610
611
612 if (conf->raid_disks <= conf->previous_raid_disks)
613 degraded2++;
614 }
615 rcu_read_unlock();
616 if (degraded2 > degraded)
617 return degraded2;
618 return degraded;
619}
620
621static int has_failed(struct r5conf *conf)
622{
623 int degraded;
624
625 if (conf->mddev->reshape_position == MaxSector)
626 return conf->mddev->degraded > conf->max_degraded;
627
628 degraded = raid5_calc_degraded(conf);
629 if (degraded > conf->max_degraded)
630 return 1;
631 return 0;
632}
633
634struct stripe_head *
635raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
636 int previous, int noblock, int noquiesce)
637{
638 struct stripe_head *sh;
639 int hash = stripe_hash_locks_hash(sector);
640 int inc_empty_inactive_list_flag;
641
642 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
643
644 spin_lock_irq(conf->hash_locks + hash);
645
646 do {
647 wait_event_lock_irq(conf->wait_for_quiescent,
648 conf->quiesce == 0 || noquiesce,
649 *(conf->hash_locks + hash));
650 sh = __find_stripe(conf, sector, conf->generation - previous);
651 if (!sh) {
652 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
653 sh = get_free_stripe(conf, hash);
654 if (!sh && !test_bit(R5_DID_ALLOC,
655 &conf->cache_state))
656 set_bit(R5_ALLOC_MORE,
657 &conf->cache_state);
658 }
659 if (noblock && sh == NULL)
660 break;
661
662 r5c_check_stripe_cache_usage(conf);
663 if (!sh) {
664 set_bit(R5_INACTIVE_BLOCKED,
665 &conf->cache_state);
666 r5l_wake_reclaim(conf->log, 0);
667 wait_event_lock_irq(
668 conf->wait_for_stripe,
669 !list_empty(conf->inactive_list + hash) &&
670 (atomic_read(&conf->active_stripes)
671 < (conf->max_nr_stripes * 3 / 4)
672 || !test_bit(R5_INACTIVE_BLOCKED,
673 &conf->cache_state)),
674 *(conf->hash_locks + hash));
675 clear_bit(R5_INACTIVE_BLOCKED,
676 &conf->cache_state);
677 } else {
678 init_stripe(sh, sector, previous);
679 atomic_inc(&sh->count);
680 }
681 } else if (!atomic_inc_not_zero(&sh->count)) {
682 spin_lock(&conf->device_lock);
683 if (!atomic_read(&sh->count)) {
684 if (!test_bit(STRIPE_HANDLE, &sh->state))
685 atomic_inc(&conf->active_stripes);
686 BUG_ON(list_empty(&sh->lru) &&
687 !test_bit(STRIPE_EXPANDING, &sh->state));
688 inc_empty_inactive_list_flag = 0;
689 if (!list_empty(conf->inactive_list + hash))
690 inc_empty_inactive_list_flag = 1;
691 list_del_init(&sh->lru);
692 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
693 atomic_inc(&conf->empty_inactive_list_nr);
694 if (sh->group) {
695 sh->group->stripes_cnt--;
696 sh->group = NULL;
697 }
698 }
699 atomic_inc(&sh->count);
700 spin_unlock(&conf->device_lock);
701 }
702 } while (sh == NULL);
703
704 spin_unlock_irq(conf->hash_locks + hash);
705 return sh;
706}
707
708static bool is_full_stripe_write(struct stripe_head *sh)
709{
710 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
711 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
712}
713
714static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
715{
716 if (sh1 > sh2) {
717 spin_lock_irq(&sh2->stripe_lock);
718 spin_lock_nested(&sh1->stripe_lock, 1);
719 } else {
720 spin_lock_irq(&sh1->stripe_lock);
721 spin_lock_nested(&sh2->stripe_lock, 1);
722 }
723}
724
725static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
726{
727 spin_unlock(&sh1->stripe_lock);
728 spin_unlock_irq(&sh2->stripe_lock);
729}
730
731
732static bool stripe_can_batch(struct stripe_head *sh)
733{
734 struct r5conf *conf = sh->raid_conf;
735
736 if (raid5_has_log(conf) || raid5_has_ppl(conf))
737 return false;
738 return test_bit(STRIPE_BATCH_READY, &sh->state) &&
739 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
740 is_full_stripe_write(sh);
741}
742
743
744static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
745{
746 struct stripe_head *head;
747 sector_t head_sector, tmp_sec;
748 int hash;
749 int dd_idx;
750 int inc_empty_inactive_list_flag;
751
752
753 tmp_sec = sh->sector;
754 if (!sector_div(tmp_sec, conf->chunk_sectors))
755 return;
756 head_sector = sh->sector - STRIPE_SECTORS;
757
758 hash = stripe_hash_locks_hash(head_sector);
759 spin_lock_irq(conf->hash_locks + hash);
760 head = __find_stripe(conf, head_sector, conf->generation);
761 if (head && !atomic_inc_not_zero(&head->count)) {
762 spin_lock(&conf->device_lock);
763 if (!atomic_read(&head->count)) {
764 if (!test_bit(STRIPE_HANDLE, &head->state))
765 atomic_inc(&conf->active_stripes);
766 BUG_ON(list_empty(&head->lru) &&
767 !test_bit(STRIPE_EXPANDING, &head->state));
768 inc_empty_inactive_list_flag = 0;
769 if (!list_empty(conf->inactive_list + hash))
770 inc_empty_inactive_list_flag = 1;
771 list_del_init(&head->lru);
772 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
773 atomic_inc(&conf->empty_inactive_list_nr);
774 if (head->group) {
775 head->group->stripes_cnt--;
776 head->group = NULL;
777 }
778 }
779 atomic_inc(&head->count);
780 spin_unlock(&conf->device_lock);
781 }
782 spin_unlock_irq(conf->hash_locks + hash);
783
784 if (!head)
785 return;
786 if (!stripe_can_batch(head))
787 goto out;
788
789 lock_two_stripes(head, sh);
790
791 if (!stripe_can_batch(head) || !stripe_can_batch(sh))
792 goto unlock_out;
793
794 if (sh->batch_head)
795 goto unlock_out;
796
797 dd_idx = 0;
798 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
799 dd_idx++;
800 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
801 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
802 goto unlock_out;
803
804 if (head->batch_head) {
805 spin_lock(&head->batch_head->batch_lock);
806
807 if (!stripe_can_batch(head)) {
808 spin_unlock(&head->batch_head->batch_lock);
809 goto unlock_out;
810 }
811
812
813
814
815
816
817
818 sh->batch_head = head->batch_head;
819
820
821
822
823
824 list_add(&sh->batch_list, &head->batch_list);
825 spin_unlock(&head->batch_head->batch_lock);
826 } else {
827 head->batch_head = head;
828 sh->batch_head = head->batch_head;
829 spin_lock(&head->batch_lock);
830 list_add_tail(&sh->batch_list, &head->batch_list);
831 spin_unlock(&head->batch_lock);
832 }
833
834 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
835 if (atomic_dec_return(&conf->preread_active_stripes)
836 < IO_THRESHOLD)
837 md_wakeup_thread(conf->mddev->thread);
838
839 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
840 int seq = sh->bm_seq;
841 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
842 sh->batch_head->bm_seq > seq)
843 seq = sh->batch_head->bm_seq;
844 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
845 sh->batch_head->bm_seq = seq;
846 }
847
848 atomic_inc(&sh->count);
849unlock_out:
850 unlock_two_stripes(head, sh);
851out:
852 raid5_release_stripe(head);
853}
854
855
856
857
858static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
859{
860 sector_t progress = conf->reshape_progress;
861
862
863
864
865 smp_rmb();
866 if (progress == MaxSector)
867 return 0;
868 if (sh->generation == conf->generation - 1)
869 return 0;
870
871
872
873 return 1;
874}
875
876static void dispatch_bio_list(struct bio_list *tmp)
877{
878 struct bio *bio;
879
880 while ((bio = bio_list_pop(tmp)))
881 generic_make_request(bio);
882}
883
884static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
885{
886 const struct r5pending_data *da = list_entry(a,
887 struct r5pending_data, sibling);
888 const struct r5pending_data *db = list_entry(b,
889 struct r5pending_data, sibling);
890 if (da->sector > db->sector)
891 return 1;
892 if (da->sector < db->sector)
893 return -1;
894 return 0;
895}
896
897static void dispatch_defer_bios(struct r5conf *conf, int target,
898 struct bio_list *list)
899{
900 struct r5pending_data *data;
901 struct list_head *first, *next = NULL;
902 int cnt = 0;
903
904 if (conf->pending_data_cnt == 0)
905 return;
906
907 list_sort(NULL, &conf->pending_list, cmp_stripe);
908
909 first = conf->pending_list.next;
910
911
912 if (conf->next_pending_data)
913 list_move_tail(&conf->pending_list,
914 &conf->next_pending_data->sibling);
915
916 while (!list_empty(&conf->pending_list)) {
917 data = list_first_entry(&conf->pending_list,
918 struct r5pending_data, sibling);
919 if (&data->sibling == first)
920 first = data->sibling.next;
921 next = data->sibling.next;
922
923 bio_list_merge(list, &data->bios);
924 list_move(&data->sibling, &conf->free_list);
925 cnt++;
926 if (cnt >= target)
927 break;
928 }
929 conf->pending_data_cnt -= cnt;
930 BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
931
932 if (next != &conf->pending_list)
933 conf->next_pending_data = list_entry(next,
934 struct r5pending_data, sibling);
935 else
936 conf->next_pending_data = NULL;
937
938 if (first != &conf->pending_list)
939 list_move_tail(&conf->pending_list, first);
940}
941
942static void flush_deferred_bios(struct r5conf *conf)
943{
944 struct bio_list tmp = BIO_EMPTY_LIST;
945
946 if (conf->pending_data_cnt == 0)
947 return;
948
949 spin_lock(&conf->pending_bios_lock);
950 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
951 BUG_ON(conf->pending_data_cnt != 0);
952 spin_unlock(&conf->pending_bios_lock);
953
954 dispatch_bio_list(&tmp);
955}
956
957static void defer_issue_bios(struct r5conf *conf, sector_t sector,
958 struct bio_list *bios)
959{
960 struct bio_list tmp = BIO_EMPTY_LIST;
961 struct r5pending_data *ent;
962
963 spin_lock(&conf->pending_bios_lock);
964 ent = list_first_entry(&conf->free_list, struct r5pending_data,
965 sibling);
966 list_move_tail(&ent->sibling, &conf->pending_list);
967 ent->sector = sector;
968 bio_list_init(&ent->bios);
969 bio_list_merge(&ent->bios, bios);
970 conf->pending_data_cnt++;
971 if (conf->pending_data_cnt >= PENDING_IO_MAX)
972 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
973
974 spin_unlock(&conf->pending_bios_lock);
975
976 dispatch_bio_list(&tmp);
977}
978
979static void
980raid5_end_read_request(struct bio *bi);
981static void
982raid5_end_write_request(struct bio *bi);
983
984static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
985{
986 struct r5conf *conf = sh->raid_conf;
987 int i, disks = sh->disks;
988 struct stripe_head *head_sh = sh;
989 struct bio_list pending_bios = BIO_EMPTY_LIST;
990 bool should_defer;
991
992 might_sleep();
993
994 if (log_stripe(sh, s) == 0)
995 return;
996
997 should_defer = conf->batch_bio_dispatch && conf->group_cnt;
998
999 for (i = disks; i--; ) {
1000 int op, op_flags = 0;
1001 int replace_only = 0;
1002 struct bio *bi, *rbi;
1003 struct md_rdev *rdev, *rrdev = NULL;
1004
1005 sh = head_sh;
1006 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1007 op = REQ_OP_WRITE;
1008 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1009 op_flags = REQ_FUA;
1010 if (test_bit(R5_Discard, &sh->dev[i].flags))
1011 op = REQ_OP_DISCARD;
1012 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1013 op = REQ_OP_READ;
1014 else if (test_and_clear_bit(R5_WantReplace,
1015 &sh->dev[i].flags)) {
1016 op = REQ_OP_WRITE;
1017 replace_only = 1;
1018 } else
1019 continue;
1020 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1021 op_flags |= REQ_SYNC;
1022
1023again:
1024 bi = &sh->dev[i].req;
1025 rbi = &sh->dev[i].rreq;
1026
1027 rcu_read_lock();
1028 rrdev = rcu_dereference(conf->disks[i].replacement);
1029 smp_mb();
1030 rdev = rcu_dereference(conf->disks[i].rdev);
1031 if (!rdev) {
1032 rdev = rrdev;
1033 rrdev = NULL;
1034 }
1035 if (op_is_write(op)) {
1036 if (replace_only)
1037 rdev = NULL;
1038 if (rdev == rrdev)
1039
1040 rrdev = NULL;
1041 } else {
1042 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1043 rdev = rrdev;
1044 rrdev = NULL;
1045 }
1046
1047 if (rdev && test_bit(Faulty, &rdev->flags))
1048 rdev = NULL;
1049 if (rdev)
1050 atomic_inc(&rdev->nr_pending);
1051 if (rrdev && test_bit(Faulty, &rrdev->flags))
1052 rrdev = NULL;
1053 if (rrdev)
1054 atomic_inc(&rrdev->nr_pending);
1055 rcu_read_unlock();
1056
1057
1058
1059
1060
1061 while (op_is_write(op) && rdev &&
1062 test_bit(WriteErrorSeen, &rdev->flags)) {
1063 sector_t first_bad;
1064 int bad_sectors;
1065 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
1066 &first_bad, &bad_sectors);
1067 if (!bad)
1068 break;
1069
1070 if (bad < 0) {
1071 set_bit(BlockedBadBlocks, &rdev->flags);
1072 if (!conf->mddev->external &&
1073 conf->mddev->sb_flags) {
1074
1075
1076
1077
1078 md_check_recovery(conf->mddev);
1079 }
1080
1081
1082
1083
1084
1085 atomic_inc(&rdev->nr_pending);
1086 md_wait_for_blocked_rdev(rdev, conf->mddev);
1087 } else {
1088
1089 rdev_dec_pending(rdev, conf->mddev);
1090 rdev = NULL;
1091 }
1092 }
1093
1094 if (rdev) {
1095 if (s->syncing || s->expanding || s->expanded
1096 || s->replacing)
1097 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1098
1099 set_bit(STRIPE_IO_STARTED, &sh->state);
1100
1101 bio_set_dev(bi, rdev->bdev);
1102 bio_set_op_attrs(bi, op, op_flags);
1103 bi->bi_end_io = op_is_write(op)
1104 ? raid5_end_write_request
1105 : raid5_end_read_request;
1106 bi->bi_private = sh;
1107
1108 pr_debug("%s: for %llu schedule op %d on disc %d\n",
1109 __func__, (unsigned long long)sh->sector,
1110 bi->bi_opf, i);
1111 atomic_inc(&sh->count);
1112 if (sh != head_sh)
1113 atomic_inc(&head_sh->count);
1114 if (use_new_offset(conf, sh))
1115 bi->bi_iter.bi_sector = (sh->sector
1116 + rdev->new_data_offset);
1117 else
1118 bi->bi_iter.bi_sector = (sh->sector
1119 + rdev->data_offset);
1120 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1121 bi->bi_opf |= REQ_NOMERGE;
1122
1123 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1124 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1125
1126 if (!op_is_write(op) &&
1127 test_bit(R5_InJournal, &sh->dev[i].flags))
1128
1129
1130
1131
1132
1133 sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1134 else
1135 sh->dev[i].vec.bv_page = sh->dev[i].page;
1136 bi->bi_vcnt = 1;
1137 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1138 bi->bi_io_vec[0].bv_offset = 0;
1139 bi->bi_iter.bi_size = STRIPE_SIZE;
1140 bi->bi_write_hint = sh->dev[i].write_hint;
1141 if (!rrdev)
1142 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1143
1144
1145
1146
1147 if (op == REQ_OP_DISCARD)
1148 bi->bi_vcnt = 0;
1149 if (rrdev)
1150 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1151
1152 if (conf->mddev->gendisk)
1153 trace_block_bio_remap(bi->bi_disk->queue,
1154 bi, disk_devt(conf->mddev->gendisk),
1155 sh->dev[i].sector);
1156 if (should_defer && op_is_write(op))
1157 bio_list_add(&pending_bios, bi);
1158 else
1159 generic_make_request(bi);
1160 }
1161 if (rrdev) {
1162 if (s->syncing || s->expanding || s->expanded
1163 || s->replacing)
1164 md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1165
1166 set_bit(STRIPE_IO_STARTED, &sh->state);
1167
1168 bio_set_dev(rbi, rrdev->bdev);
1169 bio_set_op_attrs(rbi, op, op_flags);
1170 BUG_ON(!op_is_write(op));
1171 rbi->bi_end_io = raid5_end_write_request;
1172 rbi->bi_private = sh;
1173
1174 pr_debug("%s: for %llu schedule op %d on "
1175 "replacement disc %d\n",
1176 __func__, (unsigned long long)sh->sector,
1177 rbi->bi_opf, i);
1178 atomic_inc(&sh->count);
1179 if (sh != head_sh)
1180 atomic_inc(&head_sh->count);
1181 if (use_new_offset(conf, sh))
1182 rbi->bi_iter.bi_sector = (sh->sector
1183 + rrdev->new_data_offset);
1184 else
1185 rbi->bi_iter.bi_sector = (sh->sector
1186 + rrdev->data_offset);
1187 if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1188 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1189 sh->dev[i].rvec.bv_page = sh->dev[i].page;
1190 rbi->bi_vcnt = 1;
1191 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1192 rbi->bi_io_vec[0].bv_offset = 0;
1193 rbi->bi_iter.bi_size = STRIPE_SIZE;
1194 rbi->bi_write_hint = sh->dev[i].write_hint;
1195 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1196
1197
1198
1199
1200 if (op == REQ_OP_DISCARD)
1201 rbi->bi_vcnt = 0;
1202 if (conf->mddev->gendisk)
1203 trace_block_bio_remap(rbi->bi_disk->queue,
1204 rbi, disk_devt(conf->mddev->gendisk),
1205 sh->dev[i].sector);
1206 if (should_defer && op_is_write(op))
1207 bio_list_add(&pending_bios, rbi);
1208 else
1209 generic_make_request(rbi);
1210 }
1211 if (!rdev && !rrdev) {
1212 if (op_is_write(op))
1213 set_bit(STRIPE_DEGRADED, &sh->state);
1214 pr_debug("skip op %d on disc %d for sector %llu\n",
1215 bi->bi_opf, i, (unsigned long long)sh->sector);
1216 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1217 set_bit(STRIPE_HANDLE, &sh->state);
1218 }
1219
1220 if (!head_sh->batch_head)
1221 continue;
1222 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1223 batch_list);
1224 if (sh != head_sh)
1225 goto again;
1226 }
1227
1228 if (should_defer && !bio_list_empty(&pending_bios))
1229 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1230}
1231
1232static struct dma_async_tx_descriptor *
1233async_copy_data(int frombio, struct bio *bio, struct page **page,
1234 sector_t sector, struct dma_async_tx_descriptor *tx,
1235 struct stripe_head *sh, int no_skipcopy)
1236{
1237 struct bio_vec bvl;
1238 struct bvec_iter iter;
1239 struct page *bio_page;
1240 int page_offset;
1241 struct async_submit_ctl submit;
1242 enum async_tx_flags flags = 0;
1243
1244 if (bio->bi_iter.bi_sector >= sector)
1245 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1246 else
1247 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1248
1249 if (frombio)
1250 flags |= ASYNC_TX_FENCE;
1251 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1252
1253 bio_for_each_segment(bvl, bio, iter) {
1254 int len = bvl.bv_len;
1255 int clen;
1256 int b_offset = 0;
1257
1258 if (page_offset < 0) {
1259 b_offset = -page_offset;
1260 page_offset += b_offset;
1261 len -= b_offset;
1262 }
1263
1264 if (len > 0 && page_offset + len > STRIPE_SIZE)
1265 clen = STRIPE_SIZE - page_offset;
1266 else
1267 clen = len;
1268
1269 if (clen > 0) {
1270 b_offset += bvl.bv_offset;
1271 bio_page = bvl.bv_page;
1272 if (frombio) {
1273 if (sh->raid_conf->skip_copy &&
1274 b_offset == 0 && page_offset == 0 &&
1275 clen == STRIPE_SIZE &&
1276 !no_skipcopy)
1277 *page = bio_page;
1278 else
1279 tx = async_memcpy(*page, bio_page, page_offset,
1280 b_offset, clen, &submit);
1281 } else
1282 tx = async_memcpy(bio_page, *page, b_offset,
1283 page_offset, clen, &submit);
1284 }
1285
1286 submit.depend_tx = tx;
1287
1288 if (clen < len)
1289 break;
1290 page_offset += len;
1291 }
1292
1293 return tx;
1294}
1295
1296static void ops_complete_biofill(void *stripe_head_ref)
1297{
1298 struct stripe_head *sh = stripe_head_ref;
1299 int i;
1300
1301 pr_debug("%s: stripe %llu\n", __func__,
1302 (unsigned long long)sh->sector);
1303
1304
1305 for (i = sh->disks; i--; ) {
1306 struct r5dev *dev = &sh->dev[i];
1307
1308
1309
1310
1311
1312
1313 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1314 struct bio *rbi, *rbi2;
1315
1316 BUG_ON(!dev->read);
1317 rbi = dev->read;
1318 dev->read = NULL;
1319 while (rbi && rbi->bi_iter.bi_sector <
1320 dev->sector + STRIPE_SECTORS) {
1321 rbi2 = r5_next_bio(rbi, dev->sector);
1322 bio_endio(rbi);
1323 rbi = rbi2;
1324 }
1325 }
1326 }
1327 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1328
1329 set_bit(STRIPE_HANDLE, &sh->state);
1330 raid5_release_stripe(sh);
1331}
1332
1333static void ops_run_biofill(struct stripe_head *sh)
1334{
1335 struct dma_async_tx_descriptor *tx = NULL;
1336 struct async_submit_ctl submit;
1337 int i;
1338
1339 BUG_ON(sh->batch_head);
1340 pr_debug("%s: stripe %llu\n", __func__,
1341 (unsigned long long)sh->sector);
1342
1343 for (i = sh->disks; i--; ) {
1344 struct r5dev *dev = &sh->dev[i];
1345 if (test_bit(R5_Wantfill, &dev->flags)) {
1346 struct bio *rbi;
1347 spin_lock_irq(&sh->stripe_lock);
1348 dev->read = rbi = dev->toread;
1349 dev->toread = NULL;
1350 spin_unlock_irq(&sh->stripe_lock);
1351 while (rbi && rbi->bi_iter.bi_sector <
1352 dev->sector + STRIPE_SECTORS) {
1353 tx = async_copy_data(0, rbi, &dev->page,
1354 dev->sector, tx, sh, 0);
1355 rbi = r5_next_bio(rbi, dev->sector);
1356 }
1357 }
1358 }
1359
1360 atomic_inc(&sh->count);
1361 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1362 async_trigger_callback(&submit);
1363}
1364
1365static void mark_target_uptodate(struct stripe_head *sh, int target)
1366{
1367 struct r5dev *tgt;
1368
1369 if (target < 0)
1370 return;
1371
1372 tgt = &sh->dev[target];
1373 set_bit(R5_UPTODATE, &tgt->flags);
1374 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1375 clear_bit(R5_Wantcompute, &tgt->flags);
1376}
1377
1378static void ops_complete_compute(void *stripe_head_ref)
1379{
1380 struct stripe_head *sh = stripe_head_ref;
1381
1382 pr_debug("%s: stripe %llu\n", __func__,
1383 (unsigned long long)sh->sector);
1384
1385
1386 mark_target_uptodate(sh, sh->ops.target);
1387 mark_target_uptodate(sh, sh->ops.target2);
1388
1389 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1390 if (sh->check_state == check_state_compute_run)
1391 sh->check_state = check_state_compute_result;
1392 set_bit(STRIPE_HANDLE, &sh->state);
1393 raid5_release_stripe(sh);
1394}
1395
1396
1397static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1398 struct raid5_percpu *percpu, int i)
1399{
1400 void *addr;
1401
1402 addr = flex_array_get(percpu->scribble, i);
1403 return addr + sizeof(struct page *) * (sh->disks + 2);
1404}
1405
1406
1407static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1408{
1409 void *addr;
1410
1411 addr = flex_array_get(percpu->scribble, i);
1412 return addr;
1413}
1414
1415static struct dma_async_tx_descriptor *
1416ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1417{
1418 int disks = sh->disks;
1419 struct page **xor_srcs = to_addr_page(percpu, 0);
1420 int target = sh->ops.target;
1421 struct r5dev *tgt = &sh->dev[target];
1422 struct page *xor_dest = tgt->page;
1423 int count = 0;
1424 struct dma_async_tx_descriptor *tx;
1425 struct async_submit_ctl submit;
1426 int i;
1427
1428 BUG_ON(sh->batch_head);
1429
1430 pr_debug("%s: stripe %llu block: %d\n",
1431 __func__, (unsigned long long)sh->sector, target);
1432 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1433
1434 for (i = disks; i--; )
1435 if (i != target)
1436 xor_srcs[count++] = sh->dev[i].page;
1437
1438 atomic_inc(&sh->count);
1439
1440 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1441 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1442 if (unlikely(count == 1))
1443 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1444 else
1445 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1446
1447 return tx;
1448}
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459static int set_syndrome_sources(struct page **srcs,
1460 struct stripe_head *sh,
1461 int srctype)
1462{
1463 int disks = sh->disks;
1464 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1465 int d0_idx = raid6_d0(sh);
1466 int count;
1467 int i;
1468
1469 for (i = 0; i < disks; i++)
1470 srcs[i] = NULL;
1471
1472 count = 0;
1473 i = d0_idx;
1474 do {
1475 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1476 struct r5dev *dev = &sh->dev[i];
1477
1478 if (i == sh->qd_idx || i == sh->pd_idx ||
1479 (srctype == SYNDROME_SRC_ALL) ||
1480 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1481 (test_bit(R5_Wantdrain, &dev->flags) ||
1482 test_bit(R5_InJournal, &dev->flags))) ||
1483 (srctype == SYNDROME_SRC_WRITTEN &&
1484 (dev->written ||
1485 test_bit(R5_InJournal, &dev->flags)))) {
1486 if (test_bit(R5_InJournal, &dev->flags))
1487 srcs[slot] = sh->dev[i].orig_page;
1488 else
1489 srcs[slot] = sh->dev[i].page;
1490 }
1491 i = raid6_next_disk(i, disks);
1492 } while (i != d0_idx);
1493
1494 return syndrome_disks;
1495}
1496
1497static struct dma_async_tx_descriptor *
1498ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1499{
1500 int disks = sh->disks;
1501 struct page **blocks = to_addr_page(percpu, 0);
1502 int target;
1503 int qd_idx = sh->qd_idx;
1504 struct dma_async_tx_descriptor *tx;
1505 struct async_submit_ctl submit;
1506 struct r5dev *tgt;
1507 struct page *dest;
1508 int i;
1509 int count;
1510
1511 BUG_ON(sh->batch_head);
1512 if (sh->ops.target < 0)
1513 target = sh->ops.target2;
1514 else if (sh->ops.target2 < 0)
1515 target = sh->ops.target;
1516 else
1517
1518 BUG();
1519 BUG_ON(target < 0);
1520 pr_debug("%s: stripe %llu block: %d\n",
1521 __func__, (unsigned long long)sh->sector, target);
1522
1523 tgt = &sh->dev[target];
1524 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1525 dest = tgt->page;
1526
1527 atomic_inc(&sh->count);
1528
1529 if (target == qd_idx) {
1530 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1531 blocks[count] = NULL;
1532 BUG_ON(blocks[count+1] != dest);
1533 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1534 ops_complete_compute, sh,
1535 to_addr_conv(sh, percpu, 0));
1536 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1537 } else {
1538
1539 count = 0;
1540 for (i = disks; i-- ; ) {
1541 if (i == target || i == qd_idx)
1542 continue;
1543 blocks[count++] = sh->dev[i].page;
1544 }
1545
1546 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1547 NULL, ops_complete_compute, sh,
1548 to_addr_conv(sh, percpu, 0));
1549 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1550 }
1551
1552 return tx;
1553}
1554
1555static struct dma_async_tx_descriptor *
1556ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1557{
1558 int i, count, disks = sh->disks;
1559 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1560 int d0_idx = raid6_d0(sh);
1561 int faila = -1, failb = -1;
1562 int target = sh->ops.target;
1563 int target2 = sh->ops.target2;
1564 struct r5dev *tgt = &sh->dev[target];
1565 struct r5dev *tgt2 = &sh->dev[target2];
1566 struct dma_async_tx_descriptor *tx;
1567 struct page **blocks = to_addr_page(percpu, 0);
1568 struct async_submit_ctl submit;
1569
1570 BUG_ON(sh->batch_head);
1571 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1572 __func__, (unsigned long long)sh->sector, target, target2);
1573 BUG_ON(target < 0 || target2 < 0);
1574 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1575 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1576
1577
1578
1579
1580 for (i = 0; i < disks ; i++)
1581 blocks[i] = NULL;
1582 count = 0;
1583 i = d0_idx;
1584 do {
1585 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1586
1587 blocks[slot] = sh->dev[i].page;
1588
1589 if (i == target)
1590 faila = slot;
1591 if (i == target2)
1592 failb = slot;
1593 i = raid6_next_disk(i, disks);
1594 } while (i != d0_idx);
1595
1596 BUG_ON(faila == failb);
1597 if (failb < faila)
1598 swap(faila, failb);
1599 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1600 __func__, (unsigned long long)sh->sector, faila, failb);
1601
1602 atomic_inc(&sh->count);
1603
1604 if (failb == syndrome_disks+1) {
1605
1606 if (faila == syndrome_disks) {
1607
1608 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1609 ops_complete_compute, sh,
1610 to_addr_conv(sh, percpu, 0));
1611 return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1612 STRIPE_SIZE, &submit);
1613 } else {
1614 struct page *dest;
1615 int data_target;
1616 int qd_idx = sh->qd_idx;
1617
1618
1619 if (target == qd_idx)
1620 data_target = target2;
1621 else
1622 data_target = target;
1623
1624 count = 0;
1625 for (i = disks; i-- ; ) {
1626 if (i == data_target || i == qd_idx)
1627 continue;
1628 blocks[count++] = sh->dev[i].page;
1629 }
1630 dest = sh->dev[data_target].page;
1631 init_async_submit(&submit,
1632 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1633 NULL, NULL, NULL,
1634 to_addr_conv(sh, percpu, 0));
1635 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1636 &submit);
1637
1638 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1639 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1640 ops_complete_compute, sh,
1641 to_addr_conv(sh, percpu, 0));
1642 return async_gen_syndrome(blocks, 0, count+2,
1643 STRIPE_SIZE, &submit);
1644 }
1645 } else {
1646 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1647 ops_complete_compute, sh,
1648 to_addr_conv(sh, percpu, 0));
1649 if (failb == syndrome_disks) {
1650
1651 return async_raid6_datap_recov(syndrome_disks+2,
1652 STRIPE_SIZE, faila,
1653 blocks, &submit);
1654 } else {
1655
1656 return async_raid6_2data_recov(syndrome_disks+2,
1657 STRIPE_SIZE, faila, failb,
1658 blocks, &submit);
1659 }
1660 }
1661}
1662
1663static void ops_complete_prexor(void *stripe_head_ref)
1664{
1665 struct stripe_head *sh = stripe_head_ref;
1666
1667 pr_debug("%s: stripe %llu\n", __func__,
1668 (unsigned long long)sh->sector);
1669
1670 if (r5c_is_writeback(sh->raid_conf->log))
1671
1672
1673
1674
1675 r5c_release_extra_page(sh);
1676}
1677
1678static struct dma_async_tx_descriptor *
1679ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1680 struct dma_async_tx_descriptor *tx)
1681{
1682 int disks = sh->disks;
1683 struct page **xor_srcs = to_addr_page(percpu, 0);
1684 int count = 0, pd_idx = sh->pd_idx, i;
1685 struct async_submit_ctl submit;
1686
1687
1688 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1689
1690 BUG_ON(sh->batch_head);
1691 pr_debug("%s: stripe %llu\n", __func__,
1692 (unsigned long long)sh->sector);
1693
1694 for (i = disks; i--; ) {
1695 struct r5dev *dev = &sh->dev[i];
1696
1697 if (test_bit(R5_InJournal, &dev->flags))
1698 xor_srcs[count++] = dev->orig_page;
1699 else if (test_bit(R5_Wantdrain, &dev->flags))
1700 xor_srcs[count++] = dev->page;
1701 }
1702
1703 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1704 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1705 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1706
1707 return tx;
1708}
1709
1710static struct dma_async_tx_descriptor *
1711ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1712 struct dma_async_tx_descriptor *tx)
1713{
1714 struct page **blocks = to_addr_page(percpu, 0);
1715 int count;
1716 struct async_submit_ctl submit;
1717
1718 pr_debug("%s: stripe %llu\n", __func__,
1719 (unsigned long long)sh->sector);
1720
1721 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1722
1723 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1724 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1725 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1726
1727 return tx;
1728}
1729
1730static struct dma_async_tx_descriptor *
1731ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1732{
1733 struct r5conf *conf = sh->raid_conf;
1734 int disks = sh->disks;
1735 int i;
1736 struct stripe_head *head_sh = sh;
1737
1738 pr_debug("%s: stripe %llu\n", __func__,
1739 (unsigned long long)sh->sector);
1740
1741 for (i = disks; i--; ) {
1742 struct r5dev *dev;
1743 struct bio *chosen;
1744
1745 sh = head_sh;
1746 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1747 struct bio *wbi;
1748
1749again:
1750 dev = &sh->dev[i];
1751
1752
1753
1754
1755 clear_bit(R5_InJournal, &dev->flags);
1756 spin_lock_irq(&sh->stripe_lock);
1757 chosen = dev->towrite;
1758 dev->towrite = NULL;
1759 sh->overwrite_disks = 0;
1760 BUG_ON(dev->written);
1761 wbi = dev->written = chosen;
1762 spin_unlock_irq(&sh->stripe_lock);
1763 WARN_ON(dev->page != dev->orig_page);
1764
1765 while (wbi && wbi->bi_iter.bi_sector <
1766 dev->sector + STRIPE_SECTORS) {
1767 if (wbi->bi_opf & REQ_FUA)
1768 set_bit(R5_WantFUA, &dev->flags);
1769 if (wbi->bi_opf & REQ_SYNC)
1770 set_bit(R5_SyncIO, &dev->flags);
1771 if (bio_op(wbi) == REQ_OP_DISCARD)
1772 set_bit(R5_Discard, &dev->flags);
1773 else {
1774 tx = async_copy_data(1, wbi, &dev->page,
1775 dev->sector, tx, sh,
1776 r5c_is_writeback(conf->log));
1777 if (dev->page != dev->orig_page &&
1778 !r5c_is_writeback(conf->log)) {
1779 set_bit(R5_SkipCopy, &dev->flags);
1780 clear_bit(R5_UPTODATE, &dev->flags);
1781 clear_bit(R5_OVERWRITE, &dev->flags);
1782 }
1783 }
1784 wbi = r5_next_bio(wbi, dev->sector);
1785 }
1786
1787 if (head_sh->batch_head) {
1788 sh = list_first_entry(&sh->batch_list,
1789 struct stripe_head,
1790 batch_list);
1791 if (sh == head_sh)
1792 continue;
1793 goto again;
1794 }
1795 }
1796 }
1797
1798 return tx;
1799}
1800
1801static void ops_complete_reconstruct(void *stripe_head_ref)
1802{
1803 struct stripe_head *sh = stripe_head_ref;
1804 int disks = sh->disks;
1805 int pd_idx = sh->pd_idx;
1806 int qd_idx = sh->qd_idx;
1807 int i;
1808 bool fua = false, sync = false, discard = false;
1809
1810 pr_debug("%s: stripe %llu\n", __func__,
1811 (unsigned long long)sh->sector);
1812
1813 for (i = disks; i--; ) {
1814 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1815 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1816 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1817 }
1818
1819 for (i = disks; i--; ) {
1820 struct r5dev *dev = &sh->dev[i];
1821
1822 if (dev->written || i == pd_idx || i == qd_idx) {
1823 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
1824 set_bit(R5_UPTODATE, &dev->flags);
1825 if (test_bit(STRIPE_EXPAND_READY, &sh->state))
1826 set_bit(R5_Expanded, &dev->flags);
1827 }
1828 if (fua)
1829 set_bit(R5_WantFUA, &dev->flags);
1830 if (sync)
1831 set_bit(R5_SyncIO, &dev->flags);
1832 }
1833 }
1834
1835 if (sh->reconstruct_state == reconstruct_state_drain_run)
1836 sh->reconstruct_state = reconstruct_state_drain_result;
1837 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1838 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1839 else {
1840 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1841 sh->reconstruct_state = reconstruct_state_result;
1842 }
1843
1844 set_bit(STRIPE_HANDLE, &sh->state);
1845 raid5_release_stripe(sh);
1846}
1847
1848static void
1849ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1850 struct dma_async_tx_descriptor *tx)
1851{
1852 int disks = sh->disks;
1853 struct page **xor_srcs;
1854 struct async_submit_ctl submit;
1855 int count, pd_idx = sh->pd_idx, i;
1856 struct page *xor_dest;
1857 int prexor = 0;
1858 unsigned long flags;
1859 int j = 0;
1860 struct stripe_head *head_sh = sh;
1861 int last_stripe;
1862
1863 pr_debug("%s: stripe %llu\n", __func__,
1864 (unsigned long long)sh->sector);
1865
1866 for (i = 0; i < sh->disks; i++) {
1867 if (pd_idx == i)
1868 continue;
1869 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1870 break;
1871 }
1872 if (i >= sh->disks) {
1873 atomic_inc(&sh->count);
1874 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1875 ops_complete_reconstruct(sh);
1876 return;
1877 }
1878again:
1879 count = 0;
1880 xor_srcs = to_addr_page(percpu, j);
1881
1882
1883
1884 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1885 prexor = 1;
1886 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1887 for (i = disks; i--; ) {
1888 struct r5dev *dev = &sh->dev[i];
1889 if (head_sh->dev[i].written ||
1890 test_bit(R5_InJournal, &head_sh->dev[i].flags))
1891 xor_srcs[count++] = dev->page;
1892 }
1893 } else {
1894 xor_dest = sh->dev[pd_idx].page;
1895 for (i = disks; i--; ) {
1896 struct r5dev *dev = &sh->dev[i];
1897 if (i != pd_idx)
1898 xor_srcs[count++] = dev->page;
1899 }
1900 }
1901
1902
1903
1904
1905
1906
1907 last_stripe = !head_sh->batch_head ||
1908 list_first_entry(&sh->batch_list,
1909 struct stripe_head, batch_list) == head_sh;
1910 if (last_stripe) {
1911 flags = ASYNC_TX_ACK |
1912 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1913
1914 atomic_inc(&head_sh->count);
1915 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
1916 to_addr_conv(sh, percpu, j));
1917 } else {
1918 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
1919 init_async_submit(&submit, flags, tx, NULL, NULL,
1920 to_addr_conv(sh, percpu, j));
1921 }
1922
1923 if (unlikely(count == 1))
1924 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1925 else
1926 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1927 if (!last_stripe) {
1928 j++;
1929 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1930 batch_list);
1931 goto again;
1932 }
1933}
1934
1935static void
1936ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1937 struct dma_async_tx_descriptor *tx)
1938{
1939 struct async_submit_ctl submit;
1940 struct page **blocks;
1941 int count, i, j = 0;
1942 struct stripe_head *head_sh = sh;
1943 int last_stripe;
1944 int synflags;
1945 unsigned long txflags;
1946
1947 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1948
1949 for (i = 0; i < sh->disks; i++) {
1950 if (sh->pd_idx == i || sh->qd_idx == i)
1951 continue;
1952 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1953 break;
1954 }
1955 if (i >= sh->disks) {
1956 atomic_inc(&sh->count);
1957 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1958 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1959 ops_complete_reconstruct(sh);
1960 return;
1961 }
1962
1963again:
1964 blocks = to_addr_page(percpu, j);
1965
1966 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1967 synflags = SYNDROME_SRC_WRITTEN;
1968 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
1969 } else {
1970 synflags = SYNDROME_SRC_ALL;
1971 txflags = ASYNC_TX_ACK;
1972 }
1973
1974 count = set_syndrome_sources(blocks, sh, synflags);
1975 last_stripe = !head_sh->batch_head ||
1976 list_first_entry(&sh->batch_list,
1977 struct stripe_head, batch_list) == head_sh;
1978
1979 if (last_stripe) {
1980 atomic_inc(&head_sh->count);
1981 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
1982 head_sh, to_addr_conv(sh, percpu, j));
1983 } else
1984 init_async_submit(&submit, 0, tx, NULL, NULL,
1985 to_addr_conv(sh, percpu, j));
1986 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1987 if (!last_stripe) {
1988 j++;
1989 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1990 batch_list);
1991 goto again;
1992 }
1993}
1994
1995static void ops_complete_check(void *stripe_head_ref)
1996{
1997 struct stripe_head *sh = stripe_head_ref;
1998
1999 pr_debug("%s: stripe %llu\n", __func__,
2000 (unsigned long long)sh->sector);
2001
2002 sh->check_state = check_state_check_result;
2003 set_bit(STRIPE_HANDLE, &sh->state);
2004 raid5_release_stripe(sh);
2005}
2006
2007static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
2008{
2009 int disks = sh->disks;
2010 int pd_idx = sh->pd_idx;
2011 int qd_idx = sh->qd_idx;
2012 struct page *xor_dest;
2013 struct page **xor_srcs = to_addr_page(percpu, 0);
2014 struct dma_async_tx_descriptor *tx;
2015 struct async_submit_ctl submit;
2016 int count;
2017 int i;
2018
2019 pr_debug("%s: stripe %llu\n", __func__,
2020 (unsigned long long)sh->sector);
2021
2022 BUG_ON(sh->batch_head);
2023 count = 0;
2024 xor_dest = sh->dev[pd_idx].page;
2025 xor_srcs[count++] = xor_dest;
2026 for (i = disks; i--; ) {
2027 if (i == pd_idx || i == qd_idx)
2028 continue;
2029 xor_srcs[count++] = sh->dev[i].page;
2030 }
2031
2032 init_async_submit(&submit, 0, NULL, NULL, NULL,
2033 to_addr_conv(sh, percpu, 0));
2034 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
2035 &sh->ops.zero_sum_result, &submit);
2036
2037 atomic_inc(&sh->count);
2038 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2039 tx = async_trigger_callback(&submit);
2040}
2041
2042static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2043{
2044 struct page **srcs = to_addr_page(percpu, 0);
2045 struct async_submit_ctl submit;
2046 int count;
2047
2048 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2049 (unsigned long long)sh->sector, checkp);
2050
2051 BUG_ON(sh->batch_head);
2052 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
2053 if (!checkp)
2054 srcs[count] = NULL;
2055
2056 atomic_inc(&sh->count);
2057 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2058 sh, to_addr_conv(sh, percpu, 0));
2059 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
2060 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
2061}
2062
2063static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2064{
2065 int overlap_clear = 0, i, disks = sh->disks;
2066 struct dma_async_tx_descriptor *tx = NULL;
2067 struct r5conf *conf = sh->raid_conf;
2068 int level = conf->level;
2069 struct raid5_percpu *percpu;
2070 unsigned long cpu;
2071
2072 cpu = get_cpu();
2073 percpu = per_cpu_ptr(conf->percpu, cpu);
2074 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2075 ops_run_biofill(sh);
2076 overlap_clear++;
2077 }
2078
2079 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2080 if (level < 6)
2081 tx = ops_run_compute5(sh, percpu);
2082 else {
2083 if (sh->ops.target2 < 0 || sh->ops.target < 0)
2084 tx = ops_run_compute6_1(sh, percpu);
2085 else
2086 tx = ops_run_compute6_2(sh, percpu);
2087 }
2088
2089 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2090 async_tx_ack(tx);
2091 }
2092
2093 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2094 if (level < 6)
2095 tx = ops_run_prexor5(sh, percpu, tx);
2096 else
2097 tx = ops_run_prexor6(sh, percpu, tx);
2098 }
2099
2100 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2101 tx = ops_run_partial_parity(sh, percpu, tx);
2102
2103 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2104 tx = ops_run_biodrain(sh, tx);
2105 overlap_clear++;
2106 }
2107
2108 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2109 if (level < 6)
2110 ops_run_reconstruct5(sh, percpu, tx);
2111 else
2112 ops_run_reconstruct6(sh, percpu, tx);
2113 }
2114
2115 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2116 if (sh->check_state == check_state_run)
2117 ops_run_check_p(sh, percpu);
2118 else if (sh->check_state == check_state_run_q)
2119 ops_run_check_pq(sh, percpu, 0);
2120 else if (sh->check_state == check_state_run_pq)
2121 ops_run_check_pq(sh, percpu, 1);
2122 else
2123 BUG();
2124 }
2125
2126 if (overlap_clear && !sh->batch_head)
2127 for (i = disks; i--; ) {
2128 struct r5dev *dev = &sh->dev[i];
2129 if (test_and_clear_bit(R5_Overlap, &dev->flags))
2130 wake_up(&sh->raid_conf->wait_for_overlap);
2131 }
2132 put_cpu();
2133}
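/*
 * Illustrative note (not part of the original source): callers request work
 * by setting STRIPE_OP_* bits in stripe_head_state.ops_request before
 * raid_run_ops() is invoked.  For example, schedule_reconstruction() below
 * requests a read-modify-write as
 *
 *	set_bit(STRIPE_OP_PREXOR,      &s->ops_request);
 *	set_bit(STRIPE_OP_BIODRAIN,    &s->ops_request);
 *	set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
 *
 * while a reconstruct-write omits STRIPE_OP_PREXOR.  The fixed ordering in
 * raid_run_ops() (biofill, compute, prexor, biodrain, reconstruct, check)
 * is what allows the async_tx descriptors to be chained through 'tx'.
 */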
2134
2135static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2136{
2137 if (sh->ppl_page)
2138 __free_page(sh->ppl_page);
2139 kmem_cache_free(sc, sh);
2140}
2141
2142static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2143 int disks, struct r5conf *conf)
2144{
2145 struct stripe_head *sh;
2146 int i;
2147
2148 sh = kmem_cache_zalloc(sc, gfp);
2149 if (sh) {
2150 spin_lock_init(&sh->stripe_lock);
2151 spin_lock_init(&sh->batch_lock);
2152 INIT_LIST_HEAD(&sh->batch_list);
2153 INIT_LIST_HEAD(&sh->lru);
2154 INIT_LIST_HEAD(&sh->r5c);
2155 INIT_LIST_HEAD(&sh->log_list);
2156 atomic_set(&sh->count, 1);
2157 sh->raid_conf = conf;
2158 sh->log_start = MaxSector;
2159 for (i = 0; i < disks; i++) {
2160 struct r5dev *dev = &sh->dev[i];
2161
2162 bio_init(&dev->req, &dev->vec, 1);
2163 bio_init(&dev->rreq, &dev->rvec, 1);
2164 }
2165
2166 if (raid5_has_ppl(conf)) {
2167 sh->ppl_page = alloc_page(gfp);
2168 if (!sh->ppl_page) {
2169 free_stripe(sc, sh);
2170 sh = NULL;
2171 }
2172 }
2173 }
2174 return sh;
2175}
2176static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2177{
2178 struct stripe_head *sh;
2179
2180 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2181 if (!sh)
2182 return 0;
2183
2184 if (grow_buffers(sh, gfp)) {
2185 shrink_buffers(sh);
2186 free_stripe(conf->slab_cache, sh);
2187 return 0;
2188 }
2189 sh->hash_lock_index =
2190 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2191
2192 atomic_inc(&conf->active_stripes);
2193
2194 raid5_release_stripe(sh);
2195 conf->max_nr_stripes++;
2196 return 1;
2197}
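/*
 * Illustrative example (not part of the original source, assuming the
 * NR_STRIPE_HASH_LOCKS == 8 definition from raid5.h): because
 * max_nr_stripes is sampled before it is incremented, successive calls to
 * grow_one_stripe() assign hash_lock_index 0, 1, ... 7, 0, 1, ... while
 * drop_one_stripe() below walks the same sequence backwards using
 * (max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK, keeping the inactive
 * lists balanced across the hash locks as the cache grows and shrinks.
 */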
2198
2199static int grow_stripes(struct r5conf *conf, int num)
2200{
2201 struct kmem_cache *sc;
2202 size_t namelen = sizeof(conf->cache_name[0]);
2203 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2204
2205 if (conf->mddev->gendisk)
2206 snprintf(conf->cache_name[0], namelen,
2207 "raid%d-%s", conf->level, mdname(conf->mddev));
2208 else
2209 snprintf(conf->cache_name[0], namelen,
2210 "raid%d-%p", conf->level, conf->mddev);
2211 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2212
2213 conf->active_name = 0;
2214 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2215 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2216 0, 0, NULL);
2217 if (!sc)
2218 return 1;
2219 conf->slab_cache = sc;
2220 conf->pool_size = devs;
2221 while (num--)
2222 if (!grow_one_stripe(conf, GFP_KERNEL))
2223 return 1;
2224
2225 return 0;
2226}
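/*
 * Sizing sketch (illustrative, not part of the original source): struct
 * stripe_head ends in a one-element dev[] array, so each slab object
 * created above must be
 *
 *	sizeof(struct stripe_head) + (devs - 1) * sizeof(struct r5dev)
 *
 * bytes.  'devs' is the larger of raid_disks and previous_raid_disks so
 * that a single cache can hold stripes for either geometry during a
 * reshape.
 */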
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2242{
2243 struct flex_array *ret;
2244 size_t len;
2245
2246 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2247 ret = flex_array_alloc(len, cnt, flags);
2248 if (!ret)
2249 return NULL;
2250
2251 if (flex_array_prealloc(ret, 0, cnt, flags)) {
2252 flex_array_free(ret);
2253 return NULL;
2254 }
2255 return ret;
2256}
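/*
 * Sizing sketch (illustrative, assuming 64-bit pointers and an 8-byte
 * addr_conv_t): for num == 8 devices each flex_array element above is
 *
 *	(8 + 2) * sizeof(struct page *) + (8 + 2) * sizeof(addr_conv_t)
 *		= 80 + 80 = 160 bytes
 *
 * and resize_chunks() below asks for cnt = new_sectors / STRIPE_SECTORS
 * elements, i.e. one element per page of a chunk.  The "+2" reserves room
 * for the P and Q destination pages used on the RAID6 paths.
 */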
2257
2258static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2259{
2260 unsigned long cpu;
2261 int err = 0;
2262
/*
 * Never shrink.  And mddev_suspend() could deadlock if this is called
 * from raid5d; in that case scribble_disks and scribble_sectors
 * should already equal new_disks and new_sectors.
 */
2268 if (conf->scribble_disks >= new_disks &&
2269 conf->scribble_sectors >= new_sectors)
2270 return 0;
2271 mddev_suspend(conf->mddev);
2272 get_online_cpus();
2273 for_each_present_cpu(cpu) {
2274 struct raid5_percpu *percpu;
2275 struct flex_array *scribble;
2276
2277 percpu = per_cpu_ptr(conf->percpu, cpu);
2278 scribble = scribble_alloc(new_disks,
2279 new_sectors / STRIPE_SECTORS,
2280 GFP_NOIO);
2281
2282 if (scribble) {
2283 flex_array_free(percpu->scribble);
2284 percpu->scribble = scribble;
2285 } else {
2286 err = -ENOMEM;
2287 break;
2288 }
2289 }
2290 put_online_cpus();
2291 mddev_resume(conf->mddev);
2292 if (!err) {
2293 conf->scribble_disks = new_disks;
2294 conf->scribble_sectors = new_sectors;
2295 }
2296 return err;
2297}
2298
2299static int resize_stripes(struct r5conf *conf, int newsize)
2300{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freeing the old kmem_cache.
	 * 3/ once step 2 is over, we cannot wait for any incomplete IO
	 *    forever, so on failure we simply return an error status - no
	 *    need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step 2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
2324 struct stripe_head *osh, *nsh;
2325 LIST_HEAD(newstripes);
2326 struct disk_info *ndisks;
2327 int err = 0;
2328 struct kmem_cache *sc;
2329 int i;
2330 int hash, cnt;
2331
2332 md_allow_write(conf->mddev);
2333
	/* Step 1 */
2335 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2336 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2337 0, 0, NULL);
2338 if (!sc)
2339 return -ENOMEM;
2340
	/* Need to ensure auto-resizing doesn't interfere */
2342 mutex_lock(&conf->cache_size_mutex);
2343
2344 for (i = conf->max_nr_stripes; i; i--) {
2345 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2346 if (!nsh)
2347 break;
2348
2349 list_add(&nsh->lru, &newstripes);
2350 }
2351 if (i) {
		/* didn't get enough, give up */
2353 while (!list_empty(&newstripes)) {
2354 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2355 list_del(&nsh->lru);
2356 free_stripe(sc, nsh);
2357 }
2358 kmem_cache_destroy(sc);
2359 mutex_unlock(&conf->cache_size_mutex);
2360 return -ENOMEM;
2361 }
2362
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
2366 hash = 0;
2367 cnt = 0;
2368 list_for_each_entry(nsh, &newstripes, lru) {
2369 lock_device_hash_lock(conf, hash);
2370 wait_event_cmd(conf->wait_for_stripe,
2371 !list_empty(conf->inactive_list + hash),
2372 unlock_device_hash_lock(conf, hash),
2373 lock_device_hash_lock(conf, hash));
2374 osh = get_free_stripe(conf, hash);
2375 unlock_device_hash_lock(conf, hash);
2376
2377 for(i=0; i<conf->pool_size; i++) {
2378 nsh->dev[i].page = osh->dev[i].page;
2379 nsh->dev[i].orig_page = osh->dev[i].page;
2380 }
2381 nsh->hash_lock_index = hash;
2382 free_stripe(conf->slab_cache, osh);
2383 cnt++;
2384 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2385 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2386 hash++;
2387 cnt = 0;
2388 }
2389 }
2390 kmem_cache_destroy(conf->slab_cache);
2391
	/* Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks and the scribble region.
	 */
2397 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2398 if (ndisks) {
2399 for (i = 0; i < conf->pool_size; i++)
2400 ndisks[i] = conf->disks[i];
2401
2402 for (i = conf->pool_size; i < newsize; i++) {
2403 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2404 if (!ndisks[i].extra_page)
2405 err = -ENOMEM;
2406 }
2407
2408 if (err) {
2409 for (i = conf->pool_size; i < newsize; i++)
2410 if (ndisks[i].extra_page)
2411 put_page(ndisks[i].extra_page);
2412 kfree(ndisks);
2413 } else {
2414 kfree(conf->disks);
2415 conf->disks = ndisks;
2416 }
2417 } else
2418 err = -ENOMEM;
2419
2420 mutex_unlock(&conf->cache_size_mutex);
2421
2422 conf->slab_cache = sc;
2423 conf->active_name = 1-conf->active_name;
2424
	/* Step 4, return new stripes to service */
2426 while(!list_empty(&newstripes)) {
2427 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2428 list_del_init(&nsh->lru);
2429
2430 for (i=conf->raid_disks; i < newsize; i++)
2431 if (nsh->dev[i].page == NULL) {
2432 struct page *p = alloc_page(GFP_NOIO);
2433 nsh->dev[i].page = p;
2434 nsh->dev[i].orig_page = p;
2435 if (!p)
2436 err = -ENOMEM;
2437 }
2438 raid5_release_stripe(nsh);
2439 }
	/* critical section passed, GFP_NOIO no longer needed */
2441
2442 if (!err)
2443 conf->pool_size = newsize;
2444 return err;
2445}
2446
2447static int drop_one_stripe(struct r5conf *conf)
2448{
2449 struct stripe_head *sh;
2450 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2451
2452 spin_lock_irq(conf->hash_locks + hash);
2453 sh = get_free_stripe(conf, hash);
2454 spin_unlock_irq(conf->hash_locks + hash);
2455 if (!sh)
2456 return 0;
2457 BUG_ON(atomic_read(&sh->count));
2458 shrink_buffers(sh);
2459 free_stripe(conf->slab_cache, sh);
2460 atomic_dec(&conf->active_stripes);
2461 conf->max_nr_stripes--;
2462 return 1;
2463}
2464
2465static void shrink_stripes(struct r5conf *conf)
2466{
2467 while (conf->max_nr_stripes &&
2468 drop_one_stripe(conf))
2469 ;
2470
2471 kmem_cache_destroy(conf->slab_cache);
2472 conf->slab_cache = NULL;
2473}
2474
2475static void raid5_end_read_request(struct bio * bi)
2476{
2477 struct stripe_head *sh = bi->bi_private;
2478 struct r5conf *conf = sh->raid_conf;
2479 int disks = sh->disks, i;
2480 char b[BDEVNAME_SIZE];
2481 struct md_rdev *rdev = NULL;
2482 sector_t s;
2483
2484 for (i=0 ; i<disks; i++)
2485 if (bi == &sh->dev[i].req)
2486 break;
2487
2488 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2489 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2490 bi->bi_status);
2491 if (i == disks) {
2492 bio_reset(bi);
2493 BUG();
2494 return;
2495 }
2496 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
		/* If replacement finished while this request was outstanding,
		 * 'replacement' might already be NULL.  In that case it has
		 * moved down to 'rdev', and rdev is not removed until all
		 * requests against it have finished.
		 */
2502 rdev = conf->disks[i].replacement;
2503 if (!rdev)
2504 rdev = conf->disks[i].rdev;
2505
2506 if (use_new_offset(conf, sh))
2507 s = sh->sector + rdev->new_data_offset;
2508 else
2509 s = sh->sector + rdev->data_offset;
2510 if (!bi->bi_status) {
2511 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2512 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			/* Note that this cannot happen on a
			 * replacement device: read errors there are
			 * never retried, they just fail the device.
			 */
2517 pr_info_ratelimited(
2518 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2519 mdname(conf->mddev), STRIPE_SECTORS,
2520 (unsigned long long)s,
2521 bdevname(rdev->bdev, b));
2522 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2523 clear_bit(R5_ReadError, &sh->dev[i].flags);
2524 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2525 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2526 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2527
2528 if (test_bit(R5_InJournal, &sh->dev[i].flags))
			/*
			 * end of a read for a page that is still in the
			 * journal: this must be preparing for a prexor
			 * in read-modify-write.
			 */
2533 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2534
2535 if (atomic_read(&rdev->read_errors))
2536 atomic_set(&rdev->read_errors, 0);
2537 } else {
2538 const char *bdn = bdevname(rdev->bdev, b);
2539 int retry = 0;
2540 int set_bad = 0;
2541
2542 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2543 atomic_inc(&rdev->read_errors);
2544 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2545 pr_warn_ratelimited(
2546 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2547 mdname(conf->mddev),
2548 (unsigned long long)s,
2549 bdn);
2550 else if (conf->mddev->degraded >= conf->max_degraded) {
2551 set_bad = 1;
2552 pr_warn_ratelimited(
2553 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2554 mdname(conf->mddev),
2555 (unsigned long long)s,
2556 bdn);
2557 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
			/* the earlier re-write of a failed read did not fix it */
2559 set_bad = 1;
2560 pr_warn_ratelimited(
2561 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2562 mdname(conf->mddev),
2563 (unsigned long long)s,
2564 bdn);
2565 } else if (atomic_read(&rdev->read_errors)
2566 > conf->max_nr_stripes)
2567 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2568 mdname(conf->mddev), bdn);
2569 else
2570 retry = 1;
2571 if (set_bad && test_bit(In_sync, &rdev->flags)
2572 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2573 retry = 1;
2574 if (retry)
2575 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2576 set_bit(R5_ReadError, &sh->dev[i].flags);
2577 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2578 } else
2579 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2580 else {
2581 clear_bit(R5_ReadError, &sh->dev[i].flags);
2582 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2583 if (!(set_bad
2584 && test_bit(In_sync, &rdev->flags)
2585 && rdev_set_badblocks(
2586 rdev, sh->sector, STRIPE_SECTORS, 0)))
2587 md_error(conf->mddev, rdev);
2588 }
2589 }
2590 rdev_dec_pending(rdev, conf->mddev);
2591 bio_reset(bi);
2592 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2593 set_bit(STRIPE_HANDLE, &sh->state);
2594 raid5_release_stripe(sh);
2595}
2596
2597static void raid5_end_write_request(struct bio *bi)
2598{
2599 struct stripe_head *sh = bi->bi_private;
2600 struct r5conf *conf = sh->raid_conf;
2601 int disks = sh->disks, i;
2602 struct md_rdev *uninitialized_var(rdev);
2603 sector_t first_bad;
2604 int bad_sectors;
2605 int replacement = 0;
2606
2607 for (i = 0 ; i < disks; i++) {
2608 if (bi == &sh->dev[i].req) {
2609 rdev = conf->disks[i].rdev;
2610 break;
2611 }
2612 if (bi == &sh->dev[i].rreq) {
2613 rdev = conf->disks[i].replacement;
2614 if (rdev)
2615 replacement = 1;
2616 else
			/* rdev was removed and 'replacement' replaced
			 * it.  rdev is not removed until all requests
			 * against it are finished.
			 */
2621 rdev = conf->disks[i].rdev;
2622 break;
2623 }
2624 }
2625 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2626 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2627 bi->bi_status);
2628 if (i == disks) {
2629 bio_reset(bi);
2630 BUG();
2631 return;
2632 }
2633
2634 if (replacement) {
2635 if (bi->bi_status)
2636 md_error(conf->mddev, rdev);
2637 else if (is_badblock(rdev, sh->sector,
2638 STRIPE_SECTORS,
2639 &first_bad, &bad_sectors))
2640 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2641 } else {
2642 if (bi->bi_status) {
2643 set_bit(STRIPE_DEGRADED, &sh->state);
2644 set_bit(WriteErrorSeen, &rdev->flags);
2645 set_bit(R5_WriteError, &sh->dev[i].flags);
2646 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2647 set_bit(MD_RECOVERY_NEEDED,
2648 &rdev->mddev->recovery);
2649 } else if (is_badblock(rdev, sh->sector,
2650 STRIPE_SECTORS,
2651 &first_bad, &bad_sectors)) {
2652 set_bit(R5_MadeGood, &sh->dev[i].flags);
2653 if (test_bit(R5_ReadError, &sh->dev[i].flags))
				/* That was a successful write so make
				 * sure it looks like we already did
				 * a re-write of the failed read.
				 */
2658 set_bit(R5_ReWrite, &sh->dev[i].flags);
2659 }
2660 }
2661 rdev_dec_pending(rdev, conf->mddev);
2662
2663 if (sh->batch_head && bi->bi_status && !replacement)
2664 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2665
2666 bio_reset(bi);
2667 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2668 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2669 set_bit(STRIPE_HANDLE, &sh->state);
2670 raid5_release_stripe(sh);
2671
2672 if (sh->batch_head && sh != sh->batch_head)
2673 raid5_release_stripe(sh->batch_head);
2674}
2675
2676static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2677{
2678 char b[BDEVNAME_SIZE];
2679 struct r5conf *conf = mddev->private;
2680 unsigned long flags;
2681 pr_debug("raid456: error called\n");
2682
2683 spin_lock_irqsave(&conf->device_lock, flags);
2684
2685 if (test_bit(In_sync, &rdev->flags) &&
2686 mddev->degraded == conf->max_degraded) {
		/*
		 * Don't allow the array to reach a failed state:
		 * refuse to fail the last In_sync device and
		 * disable recovery instead.
		 */
2691 conf->recovery_disabled = mddev->recovery_disabled;
2692 spin_unlock_irqrestore(&conf->device_lock, flags);
2693 return;
2694 }
2695
2696 set_bit(Faulty, &rdev->flags);
2697 clear_bit(In_sync, &rdev->flags);
2698 mddev->degraded = raid5_calc_degraded(conf);
2699 spin_unlock_irqrestore(&conf->device_lock, flags);
2700 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2701
2702 set_bit(Blocked, &rdev->flags);
2703 set_mask_bits(&mddev->sb_flags, 0,
2704 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2705 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2706 "md/raid:%s: Operation continuing on %d devices.\n",
2707 mdname(mddev),
2708 bdevname(rdev->bdev, b),
2709 mdname(mddev),
2710 conf->raid_disks - mddev->degraded);
2711 r5c_update_on_rdev_error(mddev, rdev);
2712}
2713
/*
 * Input: a 'big' sector number.
 * Output: the index of the data disk holding it, the parity (and Q) disk
 * indices, and the sector offset within that disk.
 */
2718sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2719 int previous, int *dd_idx,
2720 struct stripe_head *sh)
2721{
2722 sector_t stripe, stripe2;
2723 sector_t chunk_number;
2724 unsigned int chunk_offset;
2725 int pd_idx, qd_idx;
2726 int ddf_layout = 0;
2727 sector_t new_sector;
2728 int algorithm = previous ? conf->prev_algo
2729 : conf->algorithm;
2730 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2731 : conf->chunk_sectors;
2732 int raid_disks = previous ? conf->previous_raid_disks
2733 : conf->raid_disks;
2734 int data_disks = raid_disks - conf->max_degraded;
2735
	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk.
	 */
2741 chunk_offset = sector_div(r_sector, sectors_per_chunk);
2742 chunk_number = r_sector;
2743
	/*
	 * Compute the stripe number and the data-disk index within it.
	 */
2747 stripe = chunk_number;
2748 *dd_idx = sector_div(stripe, data_disks);
2749 stripe2 = stripe;
	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
2753 pd_idx = qd_idx = -1;
2754 switch(conf->level) {
2755 case 4:
2756 pd_idx = data_disks;
2757 break;
2758 case 5:
2759 switch (algorithm) {
2760 case ALGORITHM_LEFT_ASYMMETRIC:
2761 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2762 if (*dd_idx >= pd_idx)
2763 (*dd_idx)++;
2764 break;
2765 case ALGORITHM_RIGHT_ASYMMETRIC:
2766 pd_idx = sector_div(stripe2, raid_disks);
2767 if (*dd_idx >= pd_idx)
2768 (*dd_idx)++;
2769 break;
2770 case ALGORITHM_LEFT_SYMMETRIC:
2771 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2772 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2773 break;
2774 case ALGORITHM_RIGHT_SYMMETRIC:
2775 pd_idx = sector_div(stripe2, raid_disks);
2776 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2777 break;
2778 case ALGORITHM_PARITY_0:
2779 pd_idx = 0;
2780 (*dd_idx)++;
2781 break;
2782 case ALGORITHM_PARITY_N:
2783 pd_idx = data_disks;
2784 break;
2785 default:
2786 BUG();
2787 }
2788 break;
2789 case 6:
2790
2791 switch (algorithm) {
2792 case ALGORITHM_LEFT_ASYMMETRIC:
2793 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2794 qd_idx = pd_idx + 1;
2795 if (pd_idx == raid_disks-1) {
2796 (*dd_idx)++;
2797 qd_idx = 0;
2798 } else if (*dd_idx >= pd_idx)
2799 (*dd_idx) += 2;
2800 break;
2801 case ALGORITHM_RIGHT_ASYMMETRIC:
2802 pd_idx = sector_div(stripe2, raid_disks);
2803 qd_idx = pd_idx + 1;
2804 if (pd_idx == raid_disks-1) {
2805 (*dd_idx)++;
2806 qd_idx = 0;
2807 } else if (*dd_idx >= pd_idx)
2808 (*dd_idx) += 2;
2809 break;
2810 case ALGORITHM_LEFT_SYMMETRIC:
2811 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2812 qd_idx = (pd_idx + 1) % raid_disks;
2813 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2814 break;
2815 case ALGORITHM_RIGHT_SYMMETRIC:
2816 pd_idx = sector_div(stripe2, raid_disks);
2817 qd_idx = (pd_idx + 1) % raid_disks;
2818 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2819 break;
2820
2821 case ALGORITHM_PARITY_0:
2822 pd_idx = 0;
2823 qd_idx = 1;
2824 (*dd_idx) += 2;
2825 break;
2826 case ALGORITHM_PARITY_N:
2827 pd_idx = data_disks;
2828 qd_idx = data_disks + 1;
2829 break;
2830
2831 case ALGORITHM_ROTATING_ZERO_RESTART:
			/* Same layout as RIGHT_ASYMMETRIC, but the set of
			 * blocks used for computing Q is different.
			 */
2835 pd_idx = sector_div(stripe2, raid_disks);
2836 qd_idx = pd_idx + 1;
2837 if (pd_idx == raid_disks-1) {
2838 (*dd_idx)++;
2839 qd_idx = 0;
2840 } else if (*dd_idx >= pd_idx)
2841 (*dd_idx) += 2;
2842 ddf_layout = 1;
2843 break;
2844
2845 case ALGORITHM_ROTATING_N_RESTART:
			/* Same as LEFT_ASYMMETRIC, but the first stripe is
			 * D D D P Q  rather than
			 * Q D D D P
			 */
2850 stripe2 += 1;
2851 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2852 qd_idx = pd_idx + 1;
2853 if (pd_idx == raid_disks-1) {
2854 (*dd_idx)++;
2855 qd_idx = 0;
2856 } else if (*dd_idx >= pd_idx)
2857 (*dd_idx) += 2;
2858 ddf_layout = 1;
2859 break;
2860
2861 case ALGORITHM_ROTATING_N_CONTINUE:
			/* Same as left_symmetric but Q is before P */
2863 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2864 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2865 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2866 ddf_layout = 1;
2867 break;
2868
2869 case ALGORITHM_LEFT_ASYMMETRIC_6:
			/* RAID5 left_asymmetric, with Q always on the last device */
2871 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2872 if (*dd_idx >= pd_idx)
2873 (*dd_idx)++;
2874 qd_idx = raid_disks - 1;
2875 break;
2876
2877 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2878 pd_idx = sector_div(stripe2, raid_disks-1);
2879 if (*dd_idx >= pd_idx)
2880 (*dd_idx)++;
2881 qd_idx = raid_disks - 1;
2882 break;
2883
2884 case ALGORITHM_LEFT_SYMMETRIC_6:
2885 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2886 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2887 qd_idx = raid_disks - 1;
2888 break;
2889
2890 case ALGORITHM_RIGHT_SYMMETRIC_6:
2891 pd_idx = sector_div(stripe2, raid_disks-1);
2892 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2893 qd_idx = raid_disks - 1;
2894 break;
2895
2896 case ALGORITHM_PARITY_0_6:
2897 pd_idx = 0;
2898 (*dd_idx)++;
2899 qd_idx = raid_disks - 1;
2900 break;
2901
2902 default:
2903 BUG();
2904 }
2905 break;
2906 }
2907
2908 if (sh) {
2909 sh->pd_idx = pd_idx;
2910 sh->qd_idx = qd_idx;
2911 sh->ddf_layout = ddf_layout;
2912 }
2913
2914
2915
2916 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2917 return new_sector;
2918}
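/*
 * Worked example (illustrative, not part of the original source): a RAID5
 * array with raid_disks = 5, chunk_sectors = 128 and the default
 * ALGORITHM_LEFT_SYMMETRIC layout maps logical sector r_sector = 1000 as
 *
 *	chunk_offset = 1000 % 128 = 104      chunk_number = 1000 / 128 = 7
 *	stripe  = 7 / 4 = 1                  *dd_idx = 7 % 4 = 3
 *	pd_idx  = 4 - (1 % 5) = 3
 *	*dd_idx = (3 + 1 + 3) % 5 = 2
 *	new_sector = 1 * 128 + 104 = 232
 *
 * so the data block lives on member disk 2 at device sector 232 (plus the
 * rdev data_offset), with the parity block of that stripe on disk 3.
 */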
2919
2920sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2921{
2922 struct r5conf *conf = sh->raid_conf;
2923 int raid_disks = sh->disks;
2924 int data_disks = raid_disks - conf->max_degraded;
2925 sector_t new_sector = sh->sector, check;
2926 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2927 : conf->chunk_sectors;
2928 int algorithm = previous ? conf->prev_algo
2929 : conf->algorithm;
2930 sector_t stripe;
2931 int chunk_offset;
2932 sector_t chunk_number;
2933 int dummy1, dd_idx = i;
2934 sector_t r_sector;
2935 struct stripe_head sh2;
2936
2937 chunk_offset = sector_div(new_sector, sectors_per_chunk);
2938 stripe = new_sector;
2939
2940 if (i == sh->pd_idx)
2941 return 0;
2942 switch(conf->level) {
2943 case 4: break;
2944 case 5:
2945 switch (algorithm) {
2946 case ALGORITHM_LEFT_ASYMMETRIC:
2947 case ALGORITHM_RIGHT_ASYMMETRIC:
2948 if (i > sh->pd_idx)
2949 i--;
2950 break;
2951 case ALGORITHM_LEFT_SYMMETRIC:
2952 case ALGORITHM_RIGHT_SYMMETRIC:
2953 if (i < sh->pd_idx)
2954 i += raid_disks;
2955 i -= (sh->pd_idx + 1);
2956 break;
2957 case ALGORITHM_PARITY_0:
2958 i -= 1;
2959 break;
2960 case ALGORITHM_PARITY_N:
2961 break;
2962 default:
2963 BUG();
2964 }
2965 break;
2966 case 6:
2967 if (i == sh->qd_idx)
2968 return 0;
2969 switch (algorithm) {
2970 case ALGORITHM_LEFT_ASYMMETRIC:
2971 case ALGORITHM_RIGHT_ASYMMETRIC:
2972 case ALGORITHM_ROTATING_ZERO_RESTART:
2973 case ALGORITHM_ROTATING_N_RESTART:
2974 if (sh->pd_idx == raid_disks-1)
2975 i--;
2976 else if (i > sh->pd_idx)
2977 i -= 2;
2978 break;
2979 case ALGORITHM_LEFT_SYMMETRIC:
2980 case ALGORITHM_RIGHT_SYMMETRIC:
2981 if (sh->pd_idx == raid_disks-1)
2982 i--;
2983 else {
			/* D D P Q D */
2985 if (i < sh->pd_idx)
2986 i += raid_disks;
2987 i -= (sh->pd_idx + 2);
2988 }
2989 break;
2990 case ALGORITHM_PARITY_0:
2991 i -= 2;
2992 break;
2993 case ALGORITHM_PARITY_N:
2994 break;
2995 case ALGORITHM_ROTATING_N_CONTINUE:
			/* Like left_symmetric, but P is before Q */
2997 if (sh->pd_idx == 0)
2998 i--;
2999 else {
			/* D D Q P D */
3001 if (i < sh->pd_idx)
3002 i += raid_disks;
3003 i -= (sh->pd_idx + 1);
3004 }
3005 break;
3006 case ALGORITHM_LEFT_ASYMMETRIC_6:
3007 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3008 if (i > sh->pd_idx)
3009 i--;
3010 break;
3011 case ALGORITHM_LEFT_SYMMETRIC_6:
3012 case ALGORITHM_RIGHT_SYMMETRIC_6:
3013 if (i < sh->pd_idx)
3014 i += data_disks + 1;
3015 i -= (sh->pd_idx + 1);
3016 break;
3017 case ALGORITHM_PARITY_0_6:
3018 i -= 1;
3019 break;
3020 default:
3021 BUG();
3022 }
3023 break;
3024 }
3025
3026 chunk_number = stripe * data_disks + i;
3027 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3028
3029 check = raid5_compute_sector(conf, r_sector,
3030 previous, &dummy1, &sh2);
3031 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3032 || sh2.qd_idx != sh->qd_idx) {
3033 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3034 mdname(conf->mddev));
3035 return 0;
3036 }
3037 return r_sector;
3038}
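/*
 * Continuing the illustrative example above, raid5_compute_blocknr() is the
 * inverse mapping.  For the same 5-disk left-symmetric array, sh->sector =
 * 232 with pd_idx = 3 and i = 2 gives
 *
 *	chunk_offset = 232 % 128 = 104       stripe = 232 / 128 = 1
 *	i < pd_idx, so i = 2 + 5 - (3 + 1) = 3
 *	chunk_number = 1 * 4 + 3 = 7
 *	r_sector = 7 * 128 + 104 = 1000
 *
 * which is the logical sector we started from; the trailing call to
 * raid5_compute_sector() is a sanity check of exactly this round trip.
 */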
3039
/*
 * There are cases where handle_stripe_dirtying() and
 * schedule_reconstruction() want to delay the towrite for some device of a
 * stripe.  Specifically, the towrite is delayed when:
 *
 *  1. a degraded stripe has a non-overwrite to the missing device AND the
 *     stripe has data in the journal for other devices.  Reading data for
 *     the non-overwrite device would need a complex read-modify-write
 *     against the write-back cache, so the journal data is flushed to the
 *     RAID disks first and the rmw is handled on the write path instead.
 *
 *  2. journal space is critical (R5C_LOG_CRITICAL is set) and the stripe
 *     already has data in the journal.  Excluding the pending writes from
 *     the flush keeps the journal reservation per stripe small, so the
 *     cache can always make progress.
 *
 *  3. the journal has failed and the stripe has data in the journal.  The
 *     array is read-only to upper layers in that case, so pending writes
 *     are skipped while the cached data is flushed out.
 */
3078static inline bool delay_towrite(struct r5conf *conf,
3079 struct r5dev *dev,
3080 struct stripe_head_state *s)
3081{
	/* case 1 above */
3083 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3084 !test_bit(R5_Insync, &dev->flags) && s->injournal)
3085 return true;
	/* case 2 above */
3087 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3088 s->injournal > 0)
3089 return true;
	/* case 3 above */
3091 if (s->log_failed && s->injournal)
3092 return true;
3093 return false;
3094}
3095
3096static void
3097schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3098 int rcw, int expand)
3099{
3100 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3101 struct r5conf *conf = sh->raid_conf;
3102 int level = conf->level;
3103
3104 if (rcw) {
		/*
		 * In some cases handle_stripe_dirtying() initially decided
		 * to run a read-modify-write and allocated extra pages for
		 * prexor, but rcw turned out to be cheaper after all.  Free
		 * the extra pages now, because there is no chance to do so
		 * in ops_complete_prexor().
		 */
3111 r5c_release_extra_page(sh);
3112
3113 for (i = disks; i--; ) {
3114 struct r5dev *dev = &sh->dev[i];
3115
3116 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3117 set_bit(R5_LOCKED, &dev->flags);
3118 set_bit(R5_Wantdrain, &dev->flags);
3119 if (!expand)
3120 clear_bit(R5_UPTODATE, &dev->flags);
3121 s->locked++;
3122 } else if (test_bit(R5_InJournal, &dev->flags)) {
3123 set_bit(R5_LOCKED, &dev->flags);
3124 s->locked++;
3125 }
3126 }
3127
		/* if we are not expanding this is a proper write request,
		 * and there will be bios with new data to be drained into
		 * the stripe cache
		 */
3131 if (!expand) {
3132 if (!s->locked)
				/* False alarm, nothing to do */
3134 return;
3135 sh->reconstruct_state = reconstruct_state_drain_run;
3136 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3137 } else
3138 sh->reconstruct_state = reconstruct_state_run;
3139
3140 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3141
3142 if (s->locked + conf->max_degraded == disks)
3143 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3144 atomic_inc(&conf->pending_full_writes);
3145 } else {
3146 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3147 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3148 BUG_ON(level == 6 &&
3149 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3150 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3151
3152 for (i = disks; i--; ) {
3153 struct r5dev *dev = &sh->dev[i];
3154 if (i == pd_idx || i == qd_idx)
3155 continue;
3156
3157 if (dev->towrite &&
3158 (test_bit(R5_UPTODATE, &dev->flags) ||
3159 test_bit(R5_Wantcompute, &dev->flags))) {
3160 set_bit(R5_Wantdrain, &dev->flags);
3161 set_bit(R5_LOCKED, &dev->flags);
3162 clear_bit(R5_UPTODATE, &dev->flags);
3163 s->locked++;
3164 } else if (test_bit(R5_InJournal, &dev->flags)) {
3165 set_bit(R5_LOCKED, &dev->flags);
3166 s->locked++;
3167 }
3168 }
3169 if (!s->locked)
			/* False alarm - nothing to do */
3171 return;
3172 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3173 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3174 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3175 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3176 }
3177
	/* keep the parity disk(s) locked while asynchronous operations
	 * are in flight
	 */
3181 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3182 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3183 s->locked++;
3184
3185 if (level == 6) {
3186 int qd_idx = sh->qd_idx;
3187 struct r5dev *dev = &sh->dev[qd_idx];
3188
3189 set_bit(R5_LOCKED, &dev->flags);
3190 clear_bit(R5_UPTODATE, &dev->flags);
3191 s->locked++;
3192 }
3193
3194 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3195 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3196 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3197 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3198 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3199
3200 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3201 __func__, (unsigned long long)sh->sector,
3202 s->locked, s->ops_request);
3203}
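/*
 * Illustrative note (not part of the original source): at the point of the
 * STRIPE_FULL_WRITE test above, s->locked counts only the data blocks being
 * drained; the parity block(s) are locked just afterwards.  For a
 * full-stripe write on a 6-device RAID5 that is 5 data blocks, so
 * s->locked + conf->max_degraded == disks, STRIPE_FULL_WRITE is set and
 * conf->pending_full_writes is bumped.
 */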
3204
/*
 * Each stripe/dev can have one or more bios attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be kept in sector order.
 */
3210static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3211 int forwrite, int previous)
3212{
3213 struct bio **bip;
3214 struct r5conf *conf = sh->raid_conf;
3215 int firstwrite=0;
3216
3217 pr_debug("adding bi b#%llu to stripe s#%llu\n",
3218 (unsigned long long)bi->bi_iter.bi_sector,
3219 (unsigned long long)sh->sector);
3220
3221 spin_lock_irq(&sh->stripe_lock);
3222 sh->dev[dd_idx].write_hint = bi->bi_write_hint;
3223
3224 if (sh->batch_head)
3225 goto overlap;
3226 if (forwrite) {
3227 bip = &sh->dev[dd_idx].towrite;
3228 if (*bip == NULL)
3229 firstwrite = 1;
3230 } else
3231 bip = &sh->dev[dd_idx].toread;
3232 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3233 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3234 goto overlap;
3235 bip = & (*bip)->bi_next;
3236 }
3237 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3238 goto overlap;
3239
3240 if (forwrite && raid5_has_ppl(conf)) {
		/*
		 * With PPL only writes to consecutive data chunks within a
		 * stripe are allowed, because for a single stripe_head we
		 * can only have one PPL entry at a time and it describes a
		 * single data range.  This is not really an overlap, but
		 * wait_for_overlap can be reused to make the caller retry.
		 */
3248 sector_t sector;
3249 sector_t first = 0;
3250 sector_t last = 0;
3251 int count = 0;
3252 int i;
3253
3254 for (i = 0; i < sh->disks; i++) {
3255 if (i != sh->pd_idx &&
3256 (i == dd_idx || sh->dev[i].towrite)) {
3257 sector = sh->dev[i].sector;
3258 if (count == 0 || sector < first)
3259 first = sector;
3260 if (sector > last)
3261 last = sector;
3262 count++;
3263 }
3264 }
3265
3266 if (first + conf->chunk_sectors * (count - 1) != last)
3267 goto overlap;
3268 }
3269
3270 if (!forwrite || previous)
3271 clear_bit(STRIPE_BATCH_READY, &sh->state);
3272
3273 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3274 if (*bip)
3275 bi->bi_next = *bip;
3276 *bip = bi;
3277 bio_inc_remaining(bi);
3278 md_write_inc(conf->mddev, bi);
3279
3280 if (forwrite) {
		/* check whether the chained writes now cover the whole page */
3282 sector_t sector = sh->dev[dd_idx].sector;
3283 for (bi=sh->dev[dd_idx].towrite;
3284 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3285 bi && bi->bi_iter.bi_sector <= sector;
3286 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3287 if (bio_end_sector(bi) >= sector)
3288 sector = bio_end_sector(bi);
3289 }
3290 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3291 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3292 sh->overwrite_disks++;
3293 }
3294
3295 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3296 (unsigned long long)(*bip)->bi_iter.bi_sector,
3297 (unsigned long long)sh->sector, dd_idx);
3298
3299 if (conf->mddev->bitmap && firstwrite) {
		/* Cannot hold the stripe spinlock over md_bitmap_startwrite,
		 * but we must ensure this stripe isn't added to a batch
		 * until the bitmap has been updated and bm_seq set, so flag
		 * it with STRIPE_BITMAP_PENDING to prevent batching.
		 *
		 * If several add_stripe_bio() calls race here they will all
		 * set STRIPE_BITMAP_PENDING, but only the first one to
		 * finish md_bitmap_startwrite gets to set STRIPE_BIT_DELAY
		 * below.
		 */
3312 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3313 spin_unlock_irq(&sh->stripe_lock);
3314 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3315 STRIPE_SECTORS, 0);
3316 spin_lock_irq(&sh->stripe_lock);
3317 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3318 if (!sh->batch_head) {
3319 sh->bm_seq = conf->seq_flush+1;
3320 set_bit(STRIPE_BIT_DELAY, &sh->state);
3321 }
3322 }
3323 spin_unlock_irq(&sh->stripe_lock);
3324
3325 if (stripe_can_batch(sh))
3326 stripe_add_to_batch_list(conf, sh);
3327 return 1;
3328
3329 overlap:
3330 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3331 spin_unlock_irq(&sh->stripe_lock);
3332 return 0;
3333}
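/*
 * Illustrative example (not part of the original source), with
 * STRIPE_SECTORS == 8: dev[dd_idx] covers device sectors [S, S+8).  Two
 * write bios for [S, S+4) and [S+4, S+8) are linked in sector order on the
 * ->towrite chain and together set R5_OVERWRITE, which is what
 * handle_stripe_dirtying() counts when choosing between rmw and rcw.  A
 * bio for [S+2, S+6) arriving while [S, S+4) is already queued fails the
 * sorted-insert test above, so R5_Overlap is set and 0 is returned; the
 * caller then waits on conf->wait_for_overlap before retrying.
 */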
3334
3335static void end_reshape(struct r5conf *conf);
3336
3337static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3338 struct stripe_head *sh)
3339{
3340 int sectors_per_chunk =
3341 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3342 int dd_idx;
3343 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3344 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3345
3346 raid5_compute_sector(conf,
3347 stripe * (disks - conf->max_degraded)
3348 *sectors_per_chunk + chunk_offset,
3349 previous,
3350 &dd_idx, sh);
3351}
3352
3353static void
3354handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3355 struct stripe_head_state *s, int disks)
3356{
3357 int i;
3358 BUG_ON(sh->batch_head);
3359 for (i = disks; i--; ) {
3360 struct bio *bi;
3361 int bitmap_end = 0;
3362
3363 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3364 struct md_rdev *rdev;
3365 rcu_read_lock();
3366 rdev = rcu_dereference(conf->disks[i].rdev);
3367 if (rdev && test_bit(In_sync, &rdev->flags) &&
3368 !test_bit(Faulty, &rdev->flags))
3369 atomic_inc(&rdev->nr_pending);
3370 else
3371 rdev = NULL;
3372 rcu_read_unlock();
3373 if (rdev) {
3374 if (!rdev_set_badblocks(
3375 rdev,
3376 sh->sector,
3377 STRIPE_SECTORS, 0))
3378 md_error(conf->mddev, rdev);
3379 rdev_dec_pending(rdev, conf->mddev);
3380 }
3381 }
3382 spin_lock_irq(&sh->stripe_lock);
		/* fail all writes first */
3384 bi = sh->dev[i].towrite;
3385 sh->dev[i].towrite = NULL;
3386 sh->overwrite_disks = 0;
3387 spin_unlock_irq(&sh->stripe_lock);
3388 if (bi)
3389 bitmap_end = 1;
3390
3391 log_stripe_write_finished(sh);
3392
3393 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3394 wake_up(&conf->wait_for_overlap);
3395
3396 while (bi && bi->bi_iter.bi_sector <
3397 sh->dev[i].sector + STRIPE_SECTORS) {
3398 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3399
3400 md_write_end(conf->mddev);
3401 bio_io_error(bi);
3402 bi = nextbi;
3403 }
3404 if (bitmap_end)
3405 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3406 STRIPE_SECTORS, 0, 0);
3407 bitmap_end = 0;
3408
3409 bi = sh->dev[i].written;
3410 sh->dev[i].written = NULL;
3411 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3412 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3413 sh->dev[i].page = sh->dev[i].orig_page;
3414 }
3415
3416 if (bi) bitmap_end = 1;
3417 while (bi && bi->bi_iter.bi_sector <
3418 sh->dev[i].sector + STRIPE_SECTORS) {
3419 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3420
3421 md_write_end(conf->mddev);
3422 bio_io_error(bi);
3423 bi = bi2;
3424 }
3425
		/* fail any reads if this device is non-operational and
		 * the data has not reached the cache yet
		 */
3429 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3430 s->failed > conf->max_degraded &&
3431 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3432 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3433 spin_lock_irq(&sh->stripe_lock);
3434 bi = sh->dev[i].toread;
3435 sh->dev[i].toread = NULL;
3436 spin_unlock_irq(&sh->stripe_lock);
3437 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3438 wake_up(&conf->wait_for_overlap);
3439 if (bi)
3440 s->to_read--;
3441 while (bi && bi->bi_iter.bi_sector <
3442 sh->dev[i].sector + STRIPE_SECTORS) {
3443 struct bio *nextbi =
3444 r5_next_bio(bi, sh->dev[i].sector);
3445
3446 bio_io_error(bi);
3447 bi = nextbi;
3448 }
3449 }
3450 if (bitmap_end)
3451 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3452 STRIPE_SECTORS, 0, 0);
		/* If we were in the middle of a write the parity block might
		 * still be locked - so just clear all R5_LOCKED flags
		 */
3456 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3457 }
3458 s->to_write = 0;
3459 s->written = 0;
3460
3461 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3462 if (atomic_dec_and_test(&conf->pending_full_writes))
3463 md_wakeup_thread(conf->mddev->thread);
3464}
3465
3466static void
3467handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3468 struct stripe_head_state *s)
3469{
3470 int abort = 0;
3471 int i;
3472
3473 BUG_ON(sh->batch_head);
3474 clear_bit(STRIPE_SYNCING, &sh->state);
3475 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3476 wake_up(&conf->wait_for_overlap);
3477 s->syncing = 0;
3478 s->replacing = 0;
3479
	/* There is nothing more to do for sync/check/repair.
	 * Don't even need to abort as that is handled elsewhere
	 * if needed, and not always wanted e.g. if there is a known
	 * bad block here.
	 * For recover/replace we need to record a bad block on all
	 * non-sync devices, or abort the recovery.
	 */
3486 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
		/* During recovery devices cannot be removed, so
		 * locking and refcounting of rdevs is not needed
		 */
3490 rcu_read_lock();
3491 for (i = 0; i < conf->raid_disks; i++) {
3492 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3493 if (rdev
3494 && !test_bit(Faulty, &rdev->flags)
3495 && !test_bit(In_sync, &rdev->flags)
3496 && !rdev_set_badblocks(rdev, sh->sector,
3497 STRIPE_SECTORS, 0))
3498 abort = 1;
3499 rdev = rcu_dereference(conf->disks[i].replacement);
3500 if (rdev
3501 && !test_bit(Faulty, &rdev->flags)
3502 && !test_bit(In_sync, &rdev->flags)
3503 && !rdev_set_badblocks(rdev, sh->sector,
3504 STRIPE_SECTORS, 0))
3505 abort = 1;
3506 }
3507 rcu_read_unlock();
3508 if (abort)
3509 conf->recovery_disabled =
3510 conf->mddev->recovery_disabled;
3511 }
3512 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3513}
3514
3515static int want_replace(struct stripe_head *sh, int disk_idx)
3516{
3517 struct md_rdev *rdev;
3518 int rv = 0;
3519
3520 rcu_read_lock();
3521 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3522 if (rdev
3523 && !test_bit(Faulty, &rdev->flags)
3524 && !test_bit(In_sync, &rdev->flags)
3525 && (rdev->recovery_offset <= sh->sector
3526 || rdev->mddev->recovery_cp <= sh->sector))
3527 rv = 1;
3528 rcu_read_unlock();
3529 return rv;
3530}
3531
3532static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3533 int disk_idx, int disks)
3534{
3535 struct r5dev *dev = &sh->dev[disk_idx];
3536 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3537 &sh->dev[s->failed_num[1]] };
3538 int i;
3539
3540
3541 if (test_bit(R5_LOCKED, &dev->flags) ||
3542 test_bit(R5_UPTODATE, &dev->flags))
		/* The block is already up to date, or is being read or
		 * computed; there is no point fetching it again.
		 */
3546 return 0;
3547
3548 if (dev->toread ||
3549 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3550
3551 return 1;
3552
3553 if (s->syncing || s->expanding ||
3554 (s->replacing && want_replace(sh, disk_idx)))
		/* When syncing or expanding we read everything.
		 * When replacing, we need the replaced block.
		 */
3558 return 1;
3559
3560 if ((s->failed >= 1 && fdev[0]->toread) ||
3561 (s->failed >= 2 && fdev[1]->toread))
		/* If we want to read from a failed device, then we need to
		 * actually read every other device so the missing data can
		 * be computed.
		 */
3565 return 1;
3566
	/* Sometimes neither read-modify-write nor reconstruct-write
	 * cycles can work.  In those cases we read every block we can,
	 * and then the parity update is certain to have enough to work
	 * with.
	 * This can only be a problem when we need to write something and
	 * some device has failed.  If either of those conditions is
	 * absent we need look no further.
	 */
3575 if (!s->failed || !s->to_write)
3576 return 0;
3577
3578 if (test_bit(R5_Insync, &dev->flags) &&
3579 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		/* Pre-reads are not permitted until after a short delay
		 * so that multiple requests can be gathered.  However, if
		 * this device is not Insync the block could only be
		 * computed anyway, so there is no reason to delay that.
		 */
3585 return 0;
3586
3587 for (i = 0; i < s->failed && i < 2; i++) {
3588 if (fdev[i]->towrite &&
3589 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3590 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
			/* If we have a partial write to a failed device,
			 * then we will need to reconstruct the content of
			 * that device, so all other devices must be read.
			 */
3596 return 1;
3597 }
3598
	/* If we are forced to do a reconstruct-write, either because the
	 * current RAID6 implementation only supports that, or because
	 * parity cannot be trusted and we are currently recovering it,
	 * there is extra need to be careful.
	 * If one of the devices that we would need to read, because it is
	 * not being overwritten (and maybe not written at all), is
	 * missing or faulty, then we need to read all the other devices.
	 */
3607 if (sh->raid_conf->level != 6 &&
3608 sh->sector < sh->raid_conf->mddev->recovery_cp)
		/* reconstruct-write isn't being forced */
3610 return 0;
3611 for (i = 0; i < s->failed && i < 2; i++) {
3612 if (s->failed_num[i] != sh->pd_idx &&
3613 s->failed_num[i] != sh->qd_idx &&
3614 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3615 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3616 return 1;
3617 }
3618
3619 return 0;
3620}
3621
/* fetch_block - checks the given member device to see if its data needs
 * to be read or computed to satisfy a request.
 *
 * Returns 1 when no more member devices need to be checked, otherwise
 * returns 0 to tell the loop in handle_stripe_fill to continue.
 */
3628static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3629 int disk_idx, int disks)
3630{
3631 struct r5dev *dev = &sh->dev[disk_idx];
3632
	/* is the data in this block needed, and can we get it? */
3634 if (need_this_block(sh, s, disk_idx, disks)) {
		/* we would like to get this block, possibly by computing it,
		 * otherwise read it if the backing disk is insync
		 */
3638 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3639 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3640 BUG_ON(sh->batch_head);
3641
		/*
		 * In the raid6 case if the only non-uptodate disk is P
		 * then we already trusted P to compute the other failed
		 * drives.  It is safe to compute rather than re-read P.
		 * In other cases we only compute blocks from failed
		 * devices, otherwise check/repair might fail to detect
		 * a real inconsistency.
		 */
3651 if ((s->uptodate == disks - 1) &&
3652 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3653 (s->failed && (disk_idx == s->failed_num[0] ||
3654 disk_idx == s->failed_num[1])))) {
			/* a disk has failed and we have been asked to
			 * fetch its block, so compute it instead
			 */
3658 pr_debug("Computing stripe %llu block %d\n",
3659 (unsigned long long)sh->sector, disk_idx);
3660 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3661 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3662 set_bit(R5_Wantcompute, &dev->flags);
3663 sh->ops.target = disk_idx;
3664 sh->ops.target2 = -1;
3665 s->req_compute = 1;
3666
			/* Careful: from this point on 'uptodate' is in the
			 * eye of raid_run_ops which services 'compute'
			 * operations before writes.  R5_Wantcompute flags a
			 * block that will be R5_UPTODATE by the time it is
			 * needed for a subsequent operation.
			 */
3672 s->uptodate++;
3673 return 1;
3674 } else if (s->uptodate == disks-2 && s->failed >= 2) {
			/* Computing a 2-disk failure is *very* expensive;
			 * only do it if we really have two failures.
			 */
3678 int other;
3679 for (other = disks; other--; ) {
3680 if (other == disk_idx)
3681 continue;
3682 if (!test_bit(R5_UPTODATE,
3683 &sh->dev[other].flags))
3684 break;
3685 }
3686 BUG_ON(other < 0);
3687 pr_debug("Computing stripe %llu blocks %d,%d\n",
3688 (unsigned long long)sh->sector,
3689 disk_idx, other);
3690 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3691 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3692 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3693 set_bit(R5_Wantcompute, &sh->dev[other].flags);
3694 sh->ops.target = disk_idx;
3695 sh->ops.target2 = other;
3696 s->uptodate += 2;
3697 s->req_compute = 1;
3698 return 1;
3699 } else if (test_bit(R5_Insync, &dev->flags)) {
3700 set_bit(R5_LOCKED, &dev->flags);
3701 set_bit(R5_Wantread, &dev->flags);
3702 s->locked++;
3703 pr_debug("Reading block %d (sync=%d)\n",
3704 disk_idx, s->syncing);
3705 }
3706 }
3707
3708 return 0;
3709}
3710
/*
 * handle_stripe_fill - read or compute data to satisfy pending requests.
 */
3714static void handle_stripe_fill(struct stripe_head *sh,
3715 struct stripe_head_state *s,
3716 int disks)
3717{
3718 int i;
3719
	/* look for blocks to read/compute, skip this if a compute
	 * is already in flight, or if the stripe contents are in the
	 * midst of changing due to a write
	 */
3724 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3725 !sh->reconstruct_state) {
		/*
		 * For a degraded stripe with data in the journal, do not
		 * handle read requests yet; instead, flush the stripe to
		 * the raid disks first.  This avoids handling a complex
		 * read-modify-write of the write-back cache (prexor with
		 * orig_page, and then xor with page) in the read path.
		 */
3734 if (s->injournal && s->failed) {
3735 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3736 r5c_make_stripe_write_out(sh);
3737 goto out;
3738 }
3739
3740 for (i = disks; i--; )
3741 if (fetch_block(sh, s, i, disks))
3742 break;
3743 }
3744out:
3745 set_bit(STRIPE_HANDLE, &sh->state);
3746}
3747
3748static void break_stripe_batch_list(struct stripe_head *head_sh,
3749 unsigned long handle_flags);
3750
/*
 * handle_stripe_clean_event - any written block on an uptodate or failed
 * drive can be returned.  Note that if we 'wrote' to a failed drive, it
 * will be UPTODATE but never LOCKED, so we don't need to test 'failed'
 * directly.
 */
3755static void handle_stripe_clean_event(struct r5conf *conf,
3756 struct stripe_head *sh, int disks)
3757{
3758 int i;
3759 struct r5dev *dev;
3760 int discard_pending = 0;
3761 struct stripe_head *head_sh = sh;
3762 bool do_endio = false;
3763
3764 for (i = disks; i--; )
3765 if (sh->dev[i].written) {
3766 dev = &sh->dev[i];
3767 if (!test_bit(R5_LOCKED, &dev->flags) &&
3768 (test_bit(R5_UPTODATE, &dev->flags) ||
3769 test_bit(R5_Discard, &dev->flags) ||
3770 test_bit(R5_SkipCopy, &dev->flags))) {
				/* We can return any write requests */
3772 struct bio *wbi, *wbi2;
3773 pr_debug("Return write for disc %d\n", i);
3774 if (test_and_clear_bit(R5_Discard, &dev->flags))
3775 clear_bit(R5_UPTODATE, &dev->flags);
3776 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3777 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3778 }
3779 do_endio = true;
3780
3781returnbi:
3782 dev->page = dev->orig_page;
3783 wbi = dev->written;
3784 dev->written = NULL;
3785 while (wbi && wbi->bi_iter.bi_sector <
3786 dev->sector + STRIPE_SECTORS) {
3787 wbi2 = r5_next_bio(wbi, dev->sector);
3788 md_write_end(conf->mddev);
3789 bio_endio(wbi);
3790 wbi = wbi2;
3791 }
3792 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3793 STRIPE_SECTORS,
3794 !test_bit(STRIPE_DEGRADED, &sh->state),
3795 0);
3796 if (head_sh->batch_head) {
3797 sh = list_first_entry(&sh->batch_list,
3798 struct stripe_head,
3799 batch_list);
3800 if (sh != head_sh) {
3801 dev = &sh->dev[i];
3802 goto returnbi;
3803 }
3804 }
3805 sh = head_sh;
3806 dev = &sh->dev[i];
3807 } else if (test_bit(R5_Discard, &dev->flags))
3808 discard_pending = 1;
3809 }
3810
3811 log_stripe_write_finished(sh);
3812
3813 if (!discard_pending &&
3814 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
3815 int hash;
3816 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
3817 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3818 if (sh->qd_idx >= 0) {
3819 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
3820 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
3821 }
3822
3823 clear_bit(STRIPE_DISCARD, &sh->state);
		/*
		 * SCSI discard will change some bio fields and the stripe
		 * has no updated data, so remove it from the hash list and
		 * the stripe will be reinitialized.
		 */
3829unhash:
3830 hash = sh->hash_lock_index;
3831 spin_lock_irq(conf->hash_locks + hash);
3832 remove_hash(sh);
3833 spin_unlock_irq(conf->hash_locks + hash);
3834 if (head_sh->batch_head) {
3835 sh = list_first_entry(&sh->batch_list,
3836 struct stripe_head, batch_list);
3837 if (sh != head_sh)
3838 goto unhash;
3839 }
3840 sh = head_sh;
3841
3842 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3843 set_bit(STRIPE_HANDLE, &sh->state);
3844
3845 }
3846
3847 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3848 if (atomic_dec_and_test(&conf->pending_full_writes))
3849 md_wakeup_thread(conf->mddev->thread);
3850
3851 if (head_sh->batch_head && do_endio)
3852 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
3853}
3854
/*
 * For RMW in write back cache, we need an extra page in prexor to store
 * the old data.  This page is stored in dev->orig_page.
 *
 * This function checks whether we have data for prexor.  The exact logic
 * is:
 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
 */
3863static inline bool uptodate_for_rmw(struct r5dev *dev)
3864{
3865 return (test_bit(R5_UPTODATE, &dev->flags)) &&
3866 (!test_bit(R5_InJournal, &dev->flags) ||
3867 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
3868}
3869
3870static int handle_stripe_dirtying(struct r5conf *conf,
3871 struct stripe_head *sh,
3872 struct stripe_head_state *s,
3873 int disks)
3874{
3875 int rmw = 0, rcw = 0, i;
3876 sector_t recovery_cp = conf->mddev->recovery_cp;
3877
	/* Check whether resync is now happening or should start.
	 * If yes, then the array is dirty (after unclean shutdown or
	 * initial creation), so parity in some stripes might be
	 * inconsistent.  In this case, we need to always do
	 * reconstruct-write, to ensure that in case of drive failure or
	 * read-error correction, we generate correct data from the parity.
	 */
3885 if (conf->rmw_level == PARITY_DISABLE_RMW ||
3886 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3887 s->failed == 0)) {
		/* Calculate the real rcw later - for now make it
		 * look like rcw is cheaper
		 */
3891 rcw = 1; rmw = 2;
3892 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3893 conf->rmw_level, (unsigned long long)recovery_cp,
3894 (unsigned long long)sh->sector);
3895 } else for (i = disks; i--; ) {
		/* would I have to read this buffer for read_modify_write */
3897 struct r5dev *dev = &sh->dev[i];
3898 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3899 i == sh->pd_idx || i == sh->qd_idx ||
3900 test_bit(R5_InJournal, &dev->flags)) &&
3901 !test_bit(R5_LOCKED, &dev->flags) &&
3902 !(uptodate_for_rmw(dev) ||
3903 test_bit(R5_Wantcompute, &dev->flags))) {
3904 if (test_bit(R5_Insync, &dev->flags))
3905 rmw++;
3906 else
3907 rmw += 2*disks;
3908 }
		/* Would I have to read this buffer for reconstruct_write */
3910 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3911 i != sh->pd_idx && i != sh->qd_idx &&
3912 !test_bit(R5_LOCKED, &dev->flags) &&
3913 !(test_bit(R5_UPTODATE, &dev->flags) ||
3914 test_bit(R5_Wantcompute, &dev->flags))) {
3915 if (test_bit(R5_Insync, &dev->flags))
3916 rcw++;
3917 else
3918 rcw += 2*disks;
3919 }
3920 }
3921
3922 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
3923 (unsigned long long)sh->sector, sh->state, rmw, rcw);
3924 set_bit(STRIPE_HANDLE, &sh->state);
3925 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
		/* prefer read-modify-write, but we need to get some data first */
3927 if (conf->mddev->queue)
3928 blk_add_trace_msg(conf->mddev->queue,
3929 "raid5 rmw %llu %d",
3930 (unsigned long long)sh->sector, rmw);
3931 for (i = disks; i--; ) {
3932 struct r5dev *dev = &sh->dev[i];
3933 if (test_bit(R5_InJournal, &dev->flags) &&
3934 dev->page == dev->orig_page &&
3935 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
				/* allocate an extra page for prexor */
3937 struct page *p = alloc_page(GFP_NOIO);
3938
3939 if (p) {
3940 dev->orig_page = p;
3941 continue;
3942 }
3943
				/*
				 * alloc_page() failed; fall back to the
				 * preallocated disk_info->extra_page, which
				 * only one stripe may use at a time.
				 */
3948 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
3949 &conf->cache_state)) {
3950 r5c_use_extra_page(sh);
3951 break;
3952 }
3953
3954
3955 set_bit(STRIPE_DELAYED, &sh->state);
3956 s->waiting_extra_page = 1;
3957 return -EAGAIN;
3958 }
3959 }
3960
3961 for (i = disks; i--; ) {
3962 struct r5dev *dev = &sh->dev[i];
3963 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3964 i == sh->pd_idx || i == sh->qd_idx ||
3965 test_bit(R5_InJournal, &dev->flags)) &&
3966 !test_bit(R5_LOCKED, &dev->flags) &&
3967 !(uptodate_for_rmw(dev) ||
3968 test_bit(R5_Wantcompute, &dev->flags)) &&
3969 test_bit(R5_Insync, &dev->flags)) {
3970 if (test_bit(STRIPE_PREREAD_ACTIVE,
3971 &sh->state)) {
3972 pr_debug("Read_old block %d for r-m-w\n",
3973 i);
3974 set_bit(R5_LOCKED, &dev->flags);
3975 set_bit(R5_Wantread, &dev->flags);
3976 s->locked++;
3977 } else {
3978 set_bit(STRIPE_DELAYED, &sh->state);
3979 set_bit(STRIPE_HANDLE, &sh->state);
3980 }
3981 }
3982 }
3983 }
3984 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
3985
3986 int qread =0;
3987 rcw = 0;
3988 for (i = disks; i--; ) {
3989 struct r5dev *dev = &sh->dev[i];
3990 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3991 i != sh->pd_idx && i != sh->qd_idx &&
3992 !test_bit(R5_LOCKED, &dev->flags) &&
3993 !(test_bit(R5_UPTODATE, &dev->flags) ||
3994 test_bit(R5_Wantcompute, &dev->flags))) {
3995 rcw++;
3996 if (test_bit(R5_Insync, &dev->flags) &&
3997 test_bit(STRIPE_PREREAD_ACTIVE,
3998 &sh->state)) {
3999 pr_debug("Read_old block "
4000 "%d for Reconstruct\n", i);
4001 set_bit(R5_LOCKED, &dev->flags);
4002 set_bit(R5_Wantread, &dev->flags);
4003 s->locked++;
4004 qread++;
4005 } else {
4006 set_bit(STRIPE_DELAYED, &sh->state);
4007 set_bit(STRIPE_HANDLE, &sh->state);
4008 }
4009 }
4010 }
4011 if (rcw && conf->mddev->queue)
4012 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4013 (unsigned long long)sh->sector,
4014 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4015 }
4016
4017 if (rcw > disks && rmw > disks &&
4018 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4019 set_bit(STRIPE_DELAYED, &sh->state);
4020
	/* now if nothing is locked, and if we have enough data,
	 * we can start a write request
	 */
	/* since handle_stripe can be called at any time we need to handle
	 * the case where a compute block operation has been submitted and
	 * then a subsequent call wants to start a write request.
	 * raid_run_ops only handles the case where compute block and
	 * reconstruct are requested simultaneously.  If this is not the
	 * case then new writes need to be held off until the compute
	 * completes.
	 */
4031 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4032 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4033 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4034 schedule_reconstruction(sh, s, rcw == 0, 0);
4035 return 0;
4036}
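/*
 * Worked example (illustrative, not part of the original source): on a
 * 6-device RAID5 (5 data + parity) where exactly one data block of the
 * stripe is fully overwritten and every device is in sync but not yet
 * up to date, the loops above compute
 *
 *	rmw = 2   (the block being written + the parity block)
 *	rcw = 4   (the four data blocks not being written)
 *
 * so read-modify-write is chosen and only two old blocks are pre-read.
 * With four of the five data blocks overwritten the counts become
 * rmw = 5, rcw = 1 and reconstruct-write wins instead.  A tie goes to rmw
 * only when conf->rmw_level is PARITY_PREFER_RMW.
 */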
4037
4038static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4039 struct stripe_head_state *s, int disks)
4040{
4041 struct r5dev *dev = NULL;
4042
4043 BUG_ON(sh->batch_head);
4044 set_bit(STRIPE_HANDLE, &sh->state);
4045
4046 switch (sh->check_state) {
4047 case check_state_idle:
4048
4049 if (s->failed == 0) {
4050 BUG_ON(s->uptodate != disks);
4051 sh->check_state = check_state_run;
4052 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4053 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4054 s->uptodate--;
4055 break;
4056 }
4057 dev = &sh->dev[s->failed_num[0]];
		/* fall through */
4059 case check_state_compute_result:
4060 sh->check_state = check_state_idle;
4061 if (!dev)
4062 dev = &sh->dev[sh->pd_idx];
4063
4064
4065 if (test_bit(STRIPE_INSYNC, &sh->state))
4066 break;
4067
4068
4069 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4070 BUG_ON(s->uptodate != disks);
4071
4072 set_bit(R5_LOCKED, &dev->flags);
4073 s->locked++;
4074 set_bit(R5_Wantwrite, &dev->flags);
4075
4076 clear_bit(STRIPE_DEGRADED, &sh->state);
4077 set_bit(STRIPE_INSYNC, &sh->state);
4078 break;
4079 case check_state_run:
4080 break;
4081 case check_state_check_result:
4082 sh->check_state = check_state_idle;
4083
4084
4085
4086
4087 if (s->failed)
4088 break;
4089
4090
4091
4092
4093
4094 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4095
4096
4097
4098 set_bit(STRIPE_INSYNC, &sh->state);
4099 else {
4100 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4101 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4102
4103 set_bit(STRIPE_INSYNC, &sh->state);
4104 pr_warn_ratelimited("%s: mismatch sector in range "
4105 "%llu-%llu\n", mdname(conf->mddev),
4106 (unsigned long long) sh->sector,
4107 (unsigned long long) sh->sector +
4108 STRIPE_SECTORS);
4109 } else {
4110 sh->check_state = check_state_compute_run;
4111 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4112 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4113 set_bit(R5_Wantcompute,
4114 &sh->dev[sh->pd_idx].flags);
4115 sh->ops.target = sh->pd_idx;
4116 sh->ops.target2 = -1;
4117 s->uptodate++;
4118 }
4119 }
4120 break;
4121 case check_state_compute_run:
4122 break;
4123 default:
4124 pr_err("%s: unknown check_state: %d sector: %llu\n",
4125 __func__, sh->check_state,
4126 (unsigned long long) sh->sector);
4127 BUG();
4128 }
4129}
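/*
 * Illustrative note (not part of the original source): the
 * MD_RECOVERY_CHECK branch above is what distinguishes the "check" and
 * "repair" sync actions: both add STRIPE_SECTORS to resync_mismatches
 * (mismatch_cnt in sysfs) when the XOR of the stripe is non-zero, but only
 * when MD_RECOVERY_CHECK is clear does the code continue into
 * check_state_compute_run to recompute and rewrite the parity block.
 */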
4130
4131static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4132 struct stripe_head_state *s,
4133 int disks)
4134{
4135 int pd_idx = sh->pd_idx;
4136 int qd_idx = sh->qd_idx;
4137 struct r5dev *dev;
4138
4139 BUG_ON(sh->batch_head);
4140 set_bit(STRIPE_HANDLE, &sh->state);
4141
4142 BUG_ON(s->failed > 2);
4143
	/* Want to check and possibly repair P and Q.
	 * However there could be one 'failed' device, in which
	 * case we can only check one of them, possibly using the
	 * other to generate missing data.
	 */
4150 switch (sh->check_state) {
4151 case check_state_idle:
4152
4153 if (s->failed == s->q_failed) {
4154
4155
4156
4157
4158 sh->check_state = check_state_run;
4159 }
4160 if (!s->q_failed && s->failed < 2) {
4161
4162
4163
4164 if (sh->check_state == check_state_run)
4165 sh->check_state = check_state_run_pq;
4166 else
4167 sh->check_state = check_state_run_q;
4168 }
4169
4170
4171 sh->ops.zero_sum_result = 0;
4172
4173 if (sh->check_state == check_state_run) {
4174
4175 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4176 s->uptodate--;
4177 }
4178 if (sh->check_state >= check_state_run &&
4179 sh->check_state <= check_state_run_pq) {
4180
4181
4182
4183 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4184 break;
4185 }
4186
4187
4188 BUG_ON(s->failed != 2);
		/* fall through */
4190 case check_state_compute_result:
4191 sh->check_state = check_state_idle;
4192
4193
4194 if (test_bit(STRIPE_INSYNC, &sh->state))
4195 break;
4196
4197
4198
4199
4200 BUG_ON(s->uptodate < disks - 1);
4201 if (s->failed == 2) {
4202 dev = &sh->dev[s->failed_num[1]];
4203 s->locked++;
4204 set_bit(R5_LOCKED, &dev->flags);
4205 set_bit(R5_Wantwrite, &dev->flags);
4206 }
4207 if (s->failed >= 1) {
4208 dev = &sh->dev[s->failed_num[0]];
4209 s->locked++;
4210 set_bit(R5_LOCKED, &dev->flags);
4211 set_bit(R5_Wantwrite, &dev->flags);
4212 }
4213 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4214 dev = &sh->dev[pd_idx];
4215 s->locked++;
4216 set_bit(R5_LOCKED, &dev->flags);
4217 set_bit(R5_Wantwrite, &dev->flags);
4218 }
4219 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4220 dev = &sh->dev[qd_idx];
4221 s->locked++;
4222 set_bit(R5_LOCKED, &dev->flags);
4223 set_bit(R5_Wantwrite, &dev->flags);
4224 }
4225 clear_bit(STRIPE_DEGRADED, &sh->state);
4226
4227 set_bit(STRIPE_INSYNC, &sh->state);
4228 break;
4229 case check_state_run:
4230 case check_state_run_q:
4231 case check_state_run_pq:
4232 break;
4233 case check_state_check_result:
4234 sh->check_state = check_state_idle;
4235
4236
4237
4238
4239
4240 if (sh->ops.zero_sum_result == 0) {
4241
4242 if (!s->failed)
4243 set_bit(STRIPE_INSYNC, &sh->state);
4244 else {
4245
4246
4247
4248
4249 sh->check_state = check_state_compute_result;
4250
4251
4252
4253
4254
4255 }
4256 } else {
4257 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4258 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4259
4260 set_bit(STRIPE_INSYNC, &sh->state);
4261 pr_warn_ratelimited("%s: mismatch sector in range "
4262 "%llu-%llu\n", mdname(conf->mddev),
4263 (unsigned long long) sh->sector,
4264 (unsigned long long) sh->sector +
4265 STRIPE_SECTORS);
4266 } else {
4267 int *target = &sh->ops.target;
4268
4269 sh->ops.target = -1;
4270 sh->ops.target2 = -1;
4271 sh->check_state = check_state_compute_run;
4272 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4273 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4274 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4275 set_bit(R5_Wantcompute,
4276 &sh->dev[pd_idx].flags);
4277 *target = pd_idx;
4278 target = &sh->ops.target2;
4279 s->uptodate++;
4280 }
4281 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4282 set_bit(R5_Wantcompute,
4283 &sh->dev[qd_idx].flags);
4284 *target = qd_idx;
4285 s->uptodate++;
4286 }
4287 }
4288 }
4289 break;
4290 case check_state_compute_run:
4291 break;
4292 default:
4293 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4294 __func__, sh->check_state,
4295 (unsigned long long) sh->sector);
4296 BUG();
4297 }
4298}
4299
4300static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4301{
4302 int i;
4303
	/* We have read all the blocks in this stripe and now we need to
	 * copy some of them into a target stripe for expand.
	 */
4307 struct dma_async_tx_descriptor *tx = NULL;
4308 BUG_ON(sh->batch_head);
4309 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4310 for (i = 0; i < sh->disks; i++)
4311 if (i != sh->pd_idx && i != sh->qd_idx) {
4312 int dd_idx, j;
4313 struct stripe_head *sh2;
4314 struct async_submit_ctl submit;
4315
4316 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4317 sector_t s = raid5_compute_sector(conf, bn, 0,
4318 &dd_idx, NULL);
4319 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4320 if (sh2 == NULL)
			/* so far only the early blocks of this stripe
			 * have been requested.  When later blocks
			 * get requested, we will try again
			 */
4325 continue;
4326 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4327 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4328
4329 raid5_release_stripe(sh2);
4330 continue;
4331 }
4332
4333
4334 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4335 tx = async_memcpy(sh2->dev[dd_idx].page,
4336 sh->dev[i].page, 0, 0, STRIPE_SIZE,
4337 &submit);
4338
4339 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4340 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4341 for (j = 0; j < conf->raid_disks; j++)
4342 if (j != sh2->pd_idx &&
4343 j != sh2->qd_idx &&
4344 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4345 break;
4346 if (j == conf->raid_disks) {
4347 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4348 set_bit(STRIPE_HANDLE, &sh2->state);
4349 }
4350 raid5_release_stripe(sh2);
4351
4352 }
4353
4354 async_tx_quiesce(&tx);
4355}
4356
/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
 * state of various bits to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 */
4371static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4372{
4373 struct r5conf *conf = sh->raid_conf;
4374 int disks = sh->disks;
4375 struct r5dev *dev;
4376 int i;
4377 int do_recovery = 0;
4378
4379 memset(s, 0, sizeof(*s));
4380
4381 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4382 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4383 s->failed_num[0] = -1;
4384 s->failed_num[1] = -1;
4385 s->log_failed = r5l_log_disk_error(conf);
4386
	/* Now to look around and see what can be done */
4388 rcu_read_lock();
4389 for (i=disks; i--; ) {
4390 struct md_rdev *rdev;
4391 sector_t first_bad;
4392 int bad_sectors;
4393 int is_bad = 0;
4394
4395 dev = &sh->dev[i];
4396
4397 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4398 i, dev->flags,
4399 dev->toread, dev->towrite, dev->written);
4400
		/* maybe we can reply to a read
		 *
		 * new wantfill requests are only permitted while
		 * ops_complete_biofill is guaranteed to be inactive
		 */
4405 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4406 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4407 set_bit(R5_Wantfill, &dev->flags);
4408
		/* now count some things */
4410 if (test_bit(R5_LOCKED, &dev->flags))
4411 s->locked++;
4412 if (test_bit(R5_UPTODATE, &dev->flags))
4413 s->uptodate++;
4414 if (test_bit(R5_Wantcompute, &dev->flags)) {
4415 s->compute++;
4416 BUG_ON(s->compute > 2);
4417 }
4418
4419 if (test_bit(R5_Wantfill, &dev->flags))
4420 s->to_fill++;
4421 else if (dev->toread)
4422 s->to_read++;
4423 if (dev->towrite) {
4424 s->to_write++;
4425 if (!test_bit(R5_OVERWRITE, &dev->flags))
4426 s->non_overwrite++;
4427 }
4428 if (dev->written)
4429 s->written++;
		/* Prefer to use the replacement for reads, but only
		 * if it is recovered enough and has no bad blocks.
		 */
4433 rdev = rcu_dereference(conf->disks[i].replacement);
4434 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4435 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4436 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4437 &first_bad, &bad_sectors))
4438 set_bit(R5_ReadRepl, &dev->flags);
4439 else {
4440 if (rdev && !test_bit(Faulty, &rdev->flags))
4441 set_bit(R5_NeedReplace, &dev->flags);
4442 else
4443 clear_bit(R5_NeedReplace, &dev->flags);
4444 rdev = rcu_dereference(conf->disks[i].rdev);
4445 clear_bit(R5_ReadRepl, &dev->flags);
4446 }
4447 if (rdev && test_bit(Faulty, &rdev->flags))
4448 rdev = NULL;
4449 if (rdev) {
4450 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4451 &first_bad, &bad_sectors);
4452 if (s->blocked_rdev == NULL
4453 && (test_bit(Blocked, &rdev->flags)
4454 || is_bad < 0)) {
4455 if (is_bad < 0)
4456 set_bit(BlockedBadBlocks,
4457 &rdev->flags);
4458 s->blocked_rdev = rdev;
4459 atomic_inc(&rdev->nr_pending);
4460 }
4461 }
4462 clear_bit(R5_Insync, &dev->flags);
4463 if (!rdev)
4464 ;
4465 else if (is_bad) {
4466
4467 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4468 test_bit(R5_UPTODATE, &dev->flags)) {
4469				/* treat as in-sync, but with a read error
4470				 * which we can now handle
4471				 */
4472 set_bit(R5_Insync, &dev->flags);
4473 set_bit(R5_ReadError, &dev->flags);
4474 }
4475 } else if (test_bit(In_sync, &rdev->flags))
4476 set_bit(R5_Insync, &dev->flags);
4477 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4478			/* in sync if before recovery_offset */
4479 set_bit(R5_Insync, &dev->flags);
4480 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4481 test_bit(R5_Expanded, &dev->flags))
4482			/* If we've reshaped into here, we assume it is Insync;
4483			 * we will shortly update recovery_offset to make
4484			 * it official.
4485			 */
4486 set_bit(R5_Insync, &dev->flags);
4487
4488 if (test_bit(R5_WriteError, &dev->flags)) {
4489			/* This flag does not apply to the '.replacement'
4490			 * device, only to '.rdev', so check that here. */
4491 struct md_rdev *rdev2 = rcu_dereference(
4492 conf->disks[i].rdev);
4493 if (rdev2 == rdev)
4494 clear_bit(R5_Insync, &dev->flags);
4495 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4496 s->handle_bad_blocks = 1;
4497 atomic_inc(&rdev2->nr_pending);
4498 } else
4499 clear_bit(R5_WriteError, &dev->flags);
4500 }
4501 if (test_bit(R5_MadeGood, &dev->flags)) {
4502
4503
4504 struct md_rdev *rdev2 = rcu_dereference(
4505 conf->disks[i].rdev);
4506 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4507 s->handle_bad_blocks = 1;
4508 atomic_inc(&rdev2->nr_pending);
4509 } else
4510 clear_bit(R5_MadeGood, &dev->flags);
4511 }
4512 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4513 struct md_rdev *rdev2 = rcu_dereference(
4514 conf->disks[i].replacement);
4515 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4516 s->handle_bad_blocks = 1;
4517 atomic_inc(&rdev2->nr_pending);
4518 } else
4519 clear_bit(R5_MadeGoodRepl, &dev->flags);
4520 }
4521 if (!test_bit(R5_Insync, &dev->flags)) {
4522			/* The ReadError flag will just be confusing now */
4523 clear_bit(R5_ReadError, &dev->flags);
4524 clear_bit(R5_ReWrite, &dev->flags);
4525 }
4526 if (test_bit(R5_ReadError, &dev->flags))
4527 clear_bit(R5_Insync, &dev->flags);
4528 if (!test_bit(R5_Insync, &dev->flags)) {
4529 if (s->failed < 2)
4530 s->failed_num[s->failed] = i;
4531 s->failed++;
4532 if (rdev && !test_bit(Faulty, &rdev->flags))
4533 do_recovery = 1;
4534 else if (!rdev) {
4535 rdev = rcu_dereference(
4536 conf->disks[i].replacement);
4537 if (rdev && !test_bit(Faulty, &rdev->flags))
4538 do_recovery = 1;
4539 }
4540 }
4541
4542 if (test_bit(R5_InJournal, &dev->flags))
4543 s->injournal++;
4544 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4545 s->just_cached++;
4546 }
4547 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4548		/* If there is a failed device being replaced,
4549		 *     we must be recovering.
4550		 * else if we are after recovery_cp, we must be syncing
4551		 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4552		 * else we can only be replacing
4553		 * sync and recovery both need to read all devices, and so
4554		 * use the same flag.
4555		 */
4556 if (do_recovery ||
4557 sh->sector >= conf->mddev->recovery_cp ||
4558 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4559 s->syncing = 1;
4560 else
4561 s->replacing = 1;
4562 }
4563 rcu_read_unlock();
4564}
4565
4566static int clear_batch_ready(struct stripe_head *sh)
4567{
4568	/* Return '1' if this stripe is a member of a batch, or '0' if it
4569	 * is a lone stripe or a batch head which can now be handled.
4570	 */
4571
4572 struct stripe_head *tmp;
4573 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4574 return (sh->batch_head && sh->batch_head != sh);
4575 spin_lock(&sh->stripe_lock);
4576 if (!sh->batch_head) {
4577 spin_unlock(&sh->stripe_lock);
4578 return 0;
4579 }
4580
4581	/* This stripe could have been added to a batch list after the
4582	 * STRIPE_BATCH_READY check above; in that case it is handled
4583	 * as part of the batch rather than on its own.
4584	 */
4585 if (sh->batch_head != sh) {
4586 spin_unlock(&sh->stripe_lock);
4587 return 1;
4588 }
4589 spin_lock(&sh->batch_lock);
4590 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4591 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4592 spin_unlock(&sh->batch_lock);
4593 spin_unlock(&sh->stripe_lock);
4594
4595
4596	/* STRIPE_BATCH_READY is now cleared on every member, so no new
4597	 * stripes can be added and batch_list can be walked without a lock.
4598	 */
4599 return 0;
4600}
4601
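/*
 * Detach every stripe from this batch, copy the relevant state from the
 * batch head into each member, and schedule the members for individual
 * handling again.  'handle_flags' selects which stripes get STRIPE_HANDLE
 * set (0 means all of them).
 */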
4602static void break_stripe_batch_list(struct stripe_head *head_sh,
4603 unsigned long handle_flags)
4604{
4605 struct stripe_head *sh, *next;
4606 int i;
4607 int do_wakeup = 0;
4608
4609 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4610
4611 list_del_init(&sh->batch_list);
4612
4613 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4614 (1 << STRIPE_SYNCING) |
4615 (1 << STRIPE_REPLACED) |
4616 (1 << STRIPE_DELAYED) |
4617 (1 << STRIPE_BIT_DELAY) |
4618 (1 << STRIPE_FULL_WRITE) |
4619 (1 << STRIPE_BIOFILL_RUN) |
4620 (1 << STRIPE_COMPUTE_RUN) |
4621 (1 << STRIPE_OPS_REQ_PENDING) |
4622 (1 << STRIPE_DISCARD) |
4623 (1 << STRIPE_BATCH_READY) |
4624 (1 << STRIPE_BATCH_ERR) |
4625 (1 << STRIPE_BITMAP_PENDING)),
4626 "stripe state: %lx\n", sh->state);
4627 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4628 (1 << STRIPE_REPLACED)),
4629 "head stripe state: %lx\n", head_sh->state);
4630
4631 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4632 (1 << STRIPE_PREREAD_ACTIVE) |
4633 (1 << STRIPE_DEGRADED) |
4634 (1 << STRIPE_ON_UNPLUG_LIST)),
4635 head_sh->state & (1 << STRIPE_INSYNC));
4636
4637 sh->check_state = head_sh->check_state;
4638 sh->reconstruct_state = head_sh->reconstruct_state;
4639 spin_lock_irq(&sh->stripe_lock);
4640 sh->batch_head = NULL;
4641 spin_unlock_irq(&sh->stripe_lock);
4642 for (i = 0; i < sh->disks; i++) {
4643 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4644 do_wakeup = 1;
4645 sh->dev[i].flags = head_sh->dev[i].flags &
4646 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4647 }
4648 if (handle_flags == 0 ||
4649 sh->state & handle_flags)
4650 set_bit(STRIPE_HANDLE, &sh->state);
4651 raid5_release_stripe(sh);
4652 }
4653 spin_lock_irq(&head_sh->stripe_lock);
4654 head_sh->batch_head = NULL;
4655 spin_unlock_irq(&head_sh->stripe_lock);
4656 for (i = 0; i < head_sh->disks; i++)
4657 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4658 do_wakeup = 1;
4659 if (head_sh->state & handle_flags)
4660 set_bit(STRIPE_HANDLE, &head_sh->state);
4661
4662 if (do_wakeup)
4663 wake_up(&head_sh->raid_conf->wait_for_overlap);
4664}
4665
4666static void handle_stripe(struct stripe_head *sh)
4667{
4668 struct stripe_head_state s;
4669 struct r5conf *conf = sh->raid_conf;
4670 int i;
4671 int prexor;
4672 int disks = sh->disks;
4673 struct r5dev *pdev, *qdev;
4674
4675 clear_bit(STRIPE_HANDLE, &sh->state);
4676 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4677		/* already being handled, ensure it gets handled
4678		 * again when the current action finishes */
4679 set_bit(STRIPE_HANDLE, &sh->state);
4680 return;
4681 }
4682
4683	if (clear_batch_ready(sh)) {
4684 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4685 return;
4686 }
4687
4688 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4689 break_stripe_batch_list(sh, 0);
4690
4691 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4692 spin_lock(&sh->stripe_lock);
4693		/*
4694		 * Cannot process 'sync' concurrently with 'discard'.
4695		 * Flush data in r5cache before 'sync'.
4696		 */
4697 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4698 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4699 !test_bit(STRIPE_DISCARD, &sh->state) &&
4700 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4701 set_bit(STRIPE_SYNCING, &sh->state);
4702 clear_bit(STRIPE_INSYNC, &sh->state);
4703 clear_bit(STRIPE_REPLACED, &sh->state);
4704 }
4705 spin_unlock(&sh->stripe_lock);
4706 }
4707 clear_bit(STRIPE_DELAYED, &sh->state);
4708
4709	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4710		 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
4711 (unsigned long long)sh->sector, sh->state,
4712 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4713 sh->check_state, sh->reconstruct_state);
4714
4715 analyse_stripe(sh, &s);
4716
4717 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4718 goto finish;
4719
4720 if (s.handle_bad_blocks ||
4721 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4722 set_bit(STRIPE_HANDLE, &sh->state);
4723 goto finish;
4724 }
4725
4726 if (unlikely(s.blocked_rdev)) {
4727 if (s.syncing || s.expanding || s.expanded ||
4728 s.replacing || s.to_write || s.written) {
4729 set_bit(STRIPE_HANDLE, &sh->state);
4730 goto finish;
4731 }
4732
4733 rdev_dec_pending(s.blocked_rdev, conf->mddev);
4734 s.blocked_rdev = NULL;
4735 }
4736
4737 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4738 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4739 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4740 }
4741
4742 pr_debug("locked=%d uptodate=%d to_read=%d"
4743 " to_write=%d failed=%d failed_num=%d,%d\n",
4744 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4745 s.failed_num[0], s.failed_num[1]);
4746
4747	/*
4748	 * Check if the array has lost more than max_degraded devices and,
4749	 * if so, some requests may need to be failed.  When the journal
4750	 * device has failed (log_failed), only process the stripe if
4751	 * there is data that needs writing to the raid disks.
4752	 */
4753 if (s.failed > conf->max_degraded ||
4754 (s.log_failed && s.injournal == 0)) {
4755 sh->check_state = 0;
4756 sh->reconstruct_state = 0;
4757 break_stripe_batch_list(sh, 0);
4758 if (s.to_read+s.to_write+s.written)
4759 handle_failed_stripe(conf, sh, &s, disks);
4760 if (s.syncing + s.replacing)
4761 handle_failed_sync(conf, sh, &s);
4762 }
4763
4764	/* Now check to see if any write operations have recently
4765	 * completed.
4766	 */
4767 prexor = 0;
4768 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4769 prexor = 1;
4770 if (sh->reconstruct_state == reconstruct_state_drain_result ||
4771 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4772 sh->reconstruct_state = reconstruct_state_idle;
4773		/* All the 'written' buffers and the parity block are ready
4774		 * to be written back to disk.
4775		 */
4776
4777 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4778 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4779 BUG_ON(sh->qd_idx >= 0 &&
4780 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4781 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4782 for (i = disks; i--; ) {
4783 struct r5dev *dev = &sh->dev[i];
4784 if (test_bit(R5_LOCKED, &dev->flags) &&
4785 (i == sh->pd_idx || i == sh->qd_idx ||
4786 dev->written || test_bit(R5_InJournal,
4787 &dev->flags))) {
4788 pr_debug("Writing block %d\n", i);
4789 set_bit(R5_Wantwrite, &dev->flags);
4790 if (prexor)
4791 continue;
4792 if (s.failed > 1)
4793 continue;
4794 if (!test_bit(R5_Insync, &dev->flags) ||
4795 ((i == sh->pd_idx || i == sh->qd_idx) &&
4796 s.failed == 0))
4797 set_bit(STRIPE_INSYNC, &sh->state);
4798 }
4799 }
4800 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4801 s.dec_preread_active = 1;
4802 }
4803
4804	/*
4805	 * We might be able to return some write requests if the parity
4806	 * blocks are safe, or on a failed drive.
4807	 */
4808 pdev = &sh->dev[sh->pd_idx];
4809 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
4810 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
4811 qdev = &sh->dev[sh->qd_idx];
4812 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
4813 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
4814 || conf->level < 6;
4815
4816 if (s.written &&
4817 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
4818 && !test_bit(R5_LOCKED, &pdev->flags)
4819 && (test_bit(R5_UPTODATE, &pdev->flags) ||
4820 test_bit(R5_Discard, &pdev->flags))))) &&
4821 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4822 && !test_bit(R5_LOCKED, &qdev->flags)
4823 && (test_bit(R5_UPTODATE, &qdev->flags) ||
4824 test_bit(R5_Discard, &qdev->flags))))))
4825 handle_stripe_clean_event(conf, sh, disks);
4826
4827 if (s.just_cached)
4828 r5c_handle_cached_data_endio(conf, sh, disks);
4829 log_stripe_write_finished(sh);
4830
4831	/* Now we might consider reading some blocks, either to check/repair
4832	 * data, to satisfy pending read requests, or to allow a
4833	 * reconstruct-write, replacement or expansion to proceed.
4834	 */
4835 if (s.to_read || s.non_overwrite
4836 || (conf->level == 6 && s.to_write && s.failed)
4837 || (s.syncing && (s.uptodate + s.compute < disks))
4838 || s.replacing
4839 || s.expanding)
4840 handle_stripe_fill(sh, &s, disks);
4841
4842
4843	/* When the stripe finishes a full journal write cycle (write to the
4844	 * journal and then to the raid disks), this cleans it up so that it
4845	 * is ready for the next operation.
4846	 */
4847 r5c_finish_stripe_write_out(conf, sh, &s);
4848
4849	/*
4850	 * Now to consider new write requests, cache write back and what else,
4851	 * if anything, should be read.  We do not handle new writes when:
4852	 * 1/ A 'write' operation (copy+xor) is already in flight.
4853	 * 2/ A 'check' operation is in flight, as it may clobber the parity
4854	 *    block.
4855	 * 3/ A r5c cache log write is in flight.
4856	 */
4857
4858 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4859 if (!r5c_is_writeback(conf->log)) {
4860 if (s.to_write)
4861 handle_stripe_dirtying(conf, sh, &s, disks);
4862 } else {
4863 int ret = 0;
4864
4865
4866 if (s.to_write)
4867 ret = r5c_try_caching_write(conf, sh, &s,
4868 disks);
4869
4870
4871			/*
4872			 * If the caching phase failed (ret == -EAGAIN), or the
4873			 * stripe is under reclaim (!caching && injournal > 0),
4874			 * fall back to handle_stripe_dirtying().
4875			 */
4876 if (ret == -EAGAIN ||
4877
4878 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4879 s.injournal > 0)) {
4880 ret = handle_stripe_dirtying(conf, sh, &s,
4881 disks);
4882 if (ret == -EAGAIN)
4883 goto finish;
4884 }
4885 }
4886 }
4887
4888	/* Maybe we need to check and possibly fix the parity for this stripe.
4889	 * Any reads will already have been scheduled, so we just see if enough
4890	 * data is available.  The parity check is held off while parity
4891	 * dependent operations are in flight.
4892	 */
4893 if (sh->check_state ||
4894 (s.syncing && s.locked == 0 &&
4895 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4896 !test_bit(STRIPE_INSYNC, &sh->state))) {
4897 if (conf->level == 6)
4898 handle_parity_checks6(conf, sh, &s, disks);
4899 else
4900 handle_parity_checks5(conf, sh, &s, disks);
4901 }
4902
4903 if ((s.replacing || s.syncing) && s.locked == 0
4904 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
4905 && !test_bit(STRIPE_REPLACED, &sh->state)) {
4906		/* Write out to replacement devices where possible */
4907 for (i = 0; i < conf->raid_disks; i++)
4908 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
4909 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
4910 set_bit(R5_WantReplace, &sh->dev[i].flags);
4911 set_bit(R5_LOCKED, &sh->dev[i].flags);
4912 s.locked++;
4913 }
4914 if (s.replacing)
4915 set_bit(STRIPE_INSYNC, &sh->state);
4916 set_bit(STRIPE_REPLACED, &sh->state);
4917 }
4918 if ((s.syncing || s.replacing) && s.locked == 0 &&
4919 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4920 test_bit(STRIPE_INSYNC, &sh->state)) {
4921 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4922 clear_bit(STRIPE_SYNCING, &sh->state);
4923 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
4924 wake_up(&conf->wait_for_overlap);
4925 }
4926
4927	/* If the failed drives are just a ReadError, then we might need
4928	 * to progress the repair/check process.
4929	 */
4930 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
4931 for (i = 0; i < s.failed; i++) {
4932 struct r5dev *dev = &sh->dev[s.failed_num[i]];
4933 if (test_bit(R5_ReadError, &dev->flags)
4934 && !test_bit(R5_LOCKED, &dev->flags)
4935 && test_bit(R5_UPTODATE, &dev->flags)
4936 ) {
4937 if (!test_bit(R5_ReWrite, &dev->flags)) {
4938 set_bit(R5_Wantwrite, &dev->flags);
4939 set_bit(R5_ReWrite, &dev->flags);
4940 set_bit(R5_LOCKED, &dev->flags);
4941 s.locked++;
4942 } else {
4943
4944 set_bit(R5_Wantread, &dev->flags);
4945 set_bit(R5_LOCKED, &dev->flags);
4946 s.locked++;
4947 }
4948 }
4949 }
4950
4951	/* Finish reconstruct operations initiated by the expansion process */
4952 if (sh->reconstruct_state == reconstruct_state_result) {
4953 struct stripe_head *sh_src
4954 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
4955 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
4956			/* sh cannot be written until sh_src has been read,
4957			 * so arrange for sh to be delayed a little.
4958			 */
4959 set_bit(STRIPE_DELAYED, &sh->state);
4960 set_bit(STRIPE_HANDLE, &sh->state);
4961 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
4962 &sh_src->state))
4963 atomic_inc(&conf->preread_active_stripes);
4964 raid5_release_stripe(sh_src);
4965 goto finish;
4966 }
4967 if (sh_src)
4968 raid5_release_stripe(sh_src);
4969
4970 sh->reconstruct_state = reconstruct_state_idle;
4971 clear_bit(STRIPE_EXPANDING, &sh->state);
4972 for (i = conf->raid_disks; i--; ) {
4973 set_bit(R5_Wantwrite, &sh->dev[i].flags);
4974 set_bit(R5_LOCKED, &sh->dev[i].flags);
4975 s.locked++;
4976 }
4977 }
4978
4979 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
4980 !sh->reconstruct_state) {
4981		/* Need to write out all blocks after computing parity */
4982 sh->disks = conf->raid_disks;
4983 stripe_set_idx(sh->sector, conf, 0, sh);
4984 schedule_reconstruction(sh, &s, 1, 1);
4985 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
4986 clear_bit(STRIPE_EXPAND_READY, &sh->state);
4987 atomic_dec(&conf->reshape_stripes);
4988 wake_up(&conf->wait_for_overlap);
4989 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4990 }
4991
4992 if (s.expanding && s.locked == 0 &&
4993 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
4994 handle_stripe_expansion(conf, sh);
4995
4996finish:
4997
4998 if (unlikely(s.blocked_rdev)) {
4999 if (conf->mddev->external)
5000 md_wait_for_blocked_rdev(s.blocked_rdev,
5001 conf->mddev);
5002 else
5003			/* Internal metadata will immediately be written by
5004			 * raid5d, so we don't need to wait for the blocked
5005			 * rdev here.
5006			 */
5007 rdev_dec_pending(s.blocked_rdev,
5008 conf->mddev);
5009 }
5010
5011 if (s.handle_bad_blocks)
5012 for (i = disks; i--; ) {
5013 struct md_rdev *rdev;
5014 struct r5dev *dev = &sh->dev[i];
5015 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5016
5017 rdev = conf->disks[i].rdev;
5018 if (!rdev_set_badblocks(rdev, sh->sector,
5019 STRIPE_SECTORS, 0))
5020 md_error(conf->mddev, rdev);
5021 rdev_dec_pending(rdev, conf->mddev);
5022 }
5023 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5024 rdev = conf->disks[i].rdev;
5025 rdev_clear_badblocks(rdev, sh->sector,
5026 STRIPE_SECTORS, 0);
5027 rdev_dec_pending(rdev, conf->mddev);
5028 }
5029 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5030 rdev = conf->disks[i].replacement;
5031 if (!rdev)
5032
5033 rdev = conf->disks[i].rdev;
5034 rdev_clear_badblocks(rdev, sh->sector,
5035 STRIPE_SECTORS, 0);
5036 rdev_dec_pending(rdev, conf->mddev);
5037 }
5038 }
5039
5040 if (s.ops_request)
5041 raid_run_ops(sh, s.ops_request);
5042
5043 ops_run_io(sh, &s);
5044
5045 if (s.dec_preread_active) {
5046		/* We delay this until after ops_run_io so that if make_request
5047		 * is waiting on a flush, it won't continue until the writes
5048		 * have actually been submitted.
5049		 */
5050 atomic_dec(&conf->preread_active_stripes);
5051 if (atomic_read(&conf->preread_active_stripes) <
5052 IO_THRESHOLD)
5053 md_wakeup_thread(conf->mddev->thread);
5054 }
5055
5056 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5057}
5058
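/*
 * Move stripes from the delayed list onto the hold list once the number
 * of preread-active stripes has dropped below IO_THRESHOLD, marking each
 * one STRIPE_PREREAD_ACTIVE.  Called under conf->device_lock.
 */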
5059static void raid5_activate_delayed(struct r5conf *conf)
5060{
5061 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5062 while (!list_empty(&conf->delayed_list)) {
5063 struct list_head *l = conf->delayed_list.next;
5064 struct stripe_head *sh;
5065 sh = list_entry(l, struct stripe_head, lru);
5066 list_del_init(l);
5067 clear_bit(STRIPE_DELAYED, &sh->state);
5068 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5069 atomic_inc(&conf->preread_active_stripes);
5070 list_add_tail(&sh->lru, &conf->hold_list);
5071 raid5_wakeup_stripe_thread(sh);
5072 }
5073 }
5074}
5075
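/*
 * Move all stripes that were waiting for a bitmap flush (conf->bitmap_list)
 * back through __release_stripe() so they get re-queued for handling.
 * Also called under conf->device_lock.
 */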
5076static void activate_bit_delay(struct r5conf *conf,
5077 struct list_head *temp_inactive_list)
5078{
5079
5080 struct list_head head;
5081 list_add(&head, &conf->bitmap_list);
5082 list_del_init(&conf->bitmap_list);
5083 while (!list_empty(&head)) {
5084 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5085 int hash;
5086 list_del_init(&sh->lru);
5087 atomic_inc(&sh->count);
5088 hash = sh->hash_lock_index;
5089 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5090 }
5091}
5092
5093static int raid5_congested(struct mddev *mddev, int bits)
5094{
5095 struct r5conf *conf = mddev->private;
5096
5097	/* No difference between reads and writes.  Just check
5098	 * how busy the stripe_cache is.
5099	 */
5100
5101 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
5102 return 1;
5103
5104
5105 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
5106 return 1;
5107 if (conf->quiesce)
5108 return 1;
5109 if (atomic_read(&conf->empty_inactive_list_nr))
5110 return 1;
5111
5112 return 0;
5113}
5114
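/*
 * Return true if the bio lies entirely within one chunk, using the smaller
 * of the old and new chunk size so the test is safe during a reshape.
 */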
5115static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5116{
5117 struct r5conf *conf = mddev->private;
5118 sector_t sector = bio->bi_iter.bi_sector;
5119 unsigned int chunk_sectors;
5120 unsigned int bio_sectors = bio_sectors(bio);
5121
5122 WARN_ON_ONCE(bio->bi_partno);
5123
5124 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5125 return chunk_sectors >=
5126 ((sector & (chunk_sectors - 1)) + bio_sectors);
5127}
5128
5129/*
5130 * Add a bio to the retry LIFO (constant time; may run in interrupt
5131 * context); raid5d picks it up again later via remove_bio_from_retry().
5132 */
5133static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
5134{
5135 unsigned long flags;
5136
5137 spin_lock_irqsave(&conf->device_lock, flags);
5138
5139 bi->bi_next = conf->retry_read_aligned_list;
5140 conf->retry_read_aligned_list = bi;
5141
5142 spin_unlock_irqrestore(&conf->device_lock, flags);
5143 md_wakeup_thread(conf->mddev->thread);
5144}
5145
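/*
 * Take the next aligned-read bio off the retry list, preferring a bio whose
 * retry was already in progress; *offset returns how many STRIPE_SECTORS
 * units of it have been handled so far.
 */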
5146static struct bio *remove_bio_from_retry(struct r5conf *conf,
5147 unsigned int *offset)
5148{
5149 struct bio *bi;
5150
5151 bi = conf->retry_read_aligned;
5152 if (bi) {
5153 *offset = conf->retry_read_offset;
5154 conf->retry_read_aligned = NULL;
5155 return bi;
5156 }
5157 bi = conf->retry_read_aligned_list;
5158	if (bi) {
5159 conf->retry_read_aligned_list = bi->bi_next;
5160 bi->bi_next = NULL;
5161 *offset = 0;
5162 }
5163
5164 return bi;
5165}
5166
5167
5168/*
5169 * Completion handler for the cloned bio used by the chunk-aligned read
5170 * path: on success the original bio is completed directly; on error it is
5171 * queued for retry through the normal stripe-cache path (add_bio_to_retry).
5172 */
5173static void raid5_align_endio(struct bio *bi)
5174{
5175 struct bio* raid_bi = bi->bi_private;
5176 struct mddev *mddev;
5177 struct r5conf *conf;
5178 struct md_rdev *rdev;
5179 blk_status_t error = bi->bi_status;
5180
5181 bio_put(bi);
5182
5183 rdev = (void*)raid_bi->bi_next;
5184 raid_bi->bi_next = NULL;
5185 mddev = rdev->mddev;
5186 conf = mddev->private;
5187
5188 rdev_dec_pending(rdev, conf->mddev);
5189
5190 if (!error) {
5191 bio_endio(raid_bi);
5192 if (atomic_dec_and_test(&conf->active_aligned_reads))
5193 wake_up(&conf->wait_for_quiescent);
5194 return;
5195 }
5196
5197 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5198
5199 add_bio_to_retry(raid_bi, conf);
5200}
5201
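/*
 * Try to service a chunk-aligned read directly from one member device,
 * bypassing the stripe cache.  Returns 1 if the cloned bio was issued,
 * 0 if the caller must fall back to the normal stripe path.
 */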
5202static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5203{
5204 struct r5conf *conf = mddev->private;
5205 int dd_idx;
5206 struct bio* align_bi;
5207 struct md_rdev *rdev;
5208 sector_t end_sector;
5209
5210 if (!in_chunk_boundary(mddev, raid_bio)) {
5211 pr_debug("%s: non aligned\n", __func__);
5212 return 0;
5213 }
5214
5215
5216
5217 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
5218 if (!align_bi)
5219 return 0;
5220
5221
5222
5223
5224 align_bi->bi_end_io = raid5_align_endio;
5225 align_bi->bi_private = raid_bio;
5226
5227
5228
5229 align_bi->bi_iter.bi_sector =
5230 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
5231 0, &dd_idx, NULL);
5232
5233 end_sector = bio_end_sector(align_bi);
5234 rcu_read_lock();
5235 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5236 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5237 rdev->recovery_offset < end_sector) {
5238 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5239 if (rdev &&
5240 (test_bit(Faulty, &rdev->flags) ||
5241 !(test_bit(In_sync, &rdev->flags) ||
5242 rdev->recovery_offset >= end_sector)))
5243 rdev = NULL;
5244 }
5245
5246 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
5247 rcu_read_unlock();
5248 bio_put(align_bi);
5249 return 0;
5250 }
5251
5252 if (rdev) {
5253 sector_t first_bad;
5254 int bad_sectors;
5255
5256 atomic_inc(&rdev->nr_pending);
5257 rcu_read_unlock();
5258 raid_bio->bi_next = (void*)rdev;
5259 bio_set_dev(align_bi, rdev->bdev);
5260 bio_clear_flag(align_bi, BIO_SEG_VALID);
5261
5262 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
5263 bio_sectors(align_bi),
5264 &first_bad, &bad_sectors)) {
5265 bio_put(align_bi);
5266 rdev_dec_pending(rdev, mddev);
5267 return 0;
5268 }
5269
5270
5271 align_bi->bi_iter.bi_sector += rdev->data_offset;
5272
5273 spin_lock_irq(&conf->device_lock);
5274 wait_event_lock_irq(conf->wait_for_quiescent,
5275 conf->quiesce == 0,
5276 conf->device_lock);
5277 atomic_inc(&conf->active_aligned_reads);
5278 spin_unlock_irq(&conf->device_lock);
5279
5280 if (mddev->gendisk)
5281 trace_block_bio_remap(align_bi->bi_disk->queue,
5282 align_bi, disk_devt(mddev->gendisk),
5283 raid_bio->bi_iter.bi_sector);
5284 generic_make_request(align_bi);
5285 return 1;
5286 } else {
5287 rcu_read_unlock();
5288 bio_put(align_bi);
5289 return 0;
5290 }
5291}
5292
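/*
 * Split a read bio at the next chunk boundary, resubmit the tail, and try
 * the aligned fast path on the head.  Returns the bio that still has to go
 * through the stripe cache, or NULL if the read was dispatched directly.
 */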
5293static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5294{
5295 struct bio *split;
5296 sector_t sector = raid_bio->bi_iter.bi_sector;
5297 unsigned chunk_sects = mddev->chunk_sectors;
5298 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5299
5300 if (sectors < bio_sectors(raid_bio)) {
5301 struct r5conf *conf = mddev->private;
5302 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5303 bio_chain(split, raid_bio);
5304 generic_make_request(raid_bio);
5305 raid_bio = split;
5306 }
5307
5308 if (!raid5_read_one_chunk(mddev, raid_bio))
5309 return raid_bio;
5310
5311 return NULL;
5312}
5313
5314/* __get_priority_stripe - get the next stripe to process
5315 *
5316 * Full stripe writes are allowed to pass preread active stripes up until
5317 * the bypass_threshold is exceeded.  In general the bypass_count
5318 * increments when the handle_list is handled before the hold_list; however,
5319 * it will not be incremented when STRIPE_IO_STARTED is sampled set, which
5320 * signifies a stripe with in-flight i/o.  The bypass_count will be reset
5321 * when the head of the hold_list has changed, i.e. the head was promoted
5322 * to the handle_list.
5323 */
5324static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5325{
5326 struct stripe_head *sh, *tmp;
5327 struct list_head *handle_list = NULL;
5328 struct r5worker_group *wg;
5329 bool second_try = !r5c_is_writeback(conf->log) &&
5330 !r5l_log_disk_error(conf);
5331 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5332 r5l_log_disk_error(conf);
5333
5334again:
5335 wg = NULL;
5336 sh = NULL;
5337 if (conf->worker_cnt_per_group == 0) {
5338 handle_list = try_loprio ? &conf->loprio_list :
5339 &conf->handle_list;
5340 } else if (group != ANY_GROUP) {
5341 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5342 &conf->worker_groups[group].handle_list;
5343 wg = &conf->worker_groups[group];
5344 } else {
5345 int i;
5346 for (i = 0; i < conf->group_cnt; i++) {
5347 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5348 &conf->worker_groups[i].handle_list;
5349 wg = &conf->worker_groups[i];
5350 if (!list_empty(handle_list))
5351 break;
5352 }
5353 }
5354
5355 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5356 __func__,
5357 list_empty(handle_list) ? "empty" : "busy",
5358 list_empty(&conf->hold_list) ? "empty" : "busy",
5359 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5360
5361 if (!list_empty(handle_list)) {
5362 sh = list_entry(handle_list->next, typeof(*sh), lru);
5363
5364 if (list_empty(&conf->hold_list))
5365 conf->bypass_count = 0;
5366 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5367 if (conf->hold_list.next == conf->last_hold)
5368 conf->bypass_count++;
5369 else {
5370 conf->last_hold = conf->hold_list.next;
5371 conf->bypass_count -= conf->bypass_threshold;
5372 if (conf->bypass_count < 0)
5373 conf->bypass_count = 0;
5374 }
5375 }
5376 } else if (!list_empty(&conf->hold_list) &&
5377 ((conf->bypass_threshold &&
5378 conf->bypass_count > conf->bypass_threshold) ||
5379 atomic_read(&conf->pending_full_writes) == 0)) {
5380
5381 list_for_each_entry(tmp, &conf->hold_list, lru) {
5382 if (conf->worker_cnt_per_group == 0 ||
5383 group == ANY_GROUP ||
5384 !cpu_online(tmp->cpu) ||
5385 cpu_to_group(tmp->cpu) == group) {
5386 sh = tmp;
5387 break;
5388 }
5389 }
5390
5391 if (sh) {
5392 conf->bypass_count -= conf->bypass_threshold;
5393 if (conf->bypass_count < 0)
5394 conf->bypass_count = 0;
5395 }
5396 wg = NULL;
5397 }
5398
5399 if (!sh) {
5400 if (second_try)
5401 return NULL;
5402 second_try = true;
5403 try_loprio = !try_loprio;
5404 goto again;
5405 }
5406
5407 if (wg) {
5408 wg->stripes_cnt--;
5409 sh->group = NULL;
5410 }
5411 list_del_init(&sh->lru);
5412 BUG_ON(atomic_inc_return(&sh->count) != 1);
5413 return sh;
5414}
5415
5416struct raid5_plug_cb {
5417 struct blk_plug_cb cb;
5418 struct list_head list;
5419 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5420};
5421
5422static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5423{
5424 struct raid5_plug_cb *cb = container_of(
5425 blk_cb, struct raid5_plug_cb, cb);
5426 struct stripe_head *sh;
5427 struct mddev *mddev = cb->cb.data;
5428 struct r5conf *conf = mddev->private;
5429 int cnt = 0;
5430 int hash;
5431
5432 if (cb->list.next && !list_empty(&cb->list)) {
5433 spin_lock_irq(&conf->device_lock);
5434 while (!list_empty(&cb->list)) {
5435 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5436 list_del_init(&sh->lru);
5437
5438			/* Avoid a race where release_stripe_plug() sees
5439			 * STRIPE_ON_UNPLUG_LIST clear but the stripe
5440			 * is still on our list.
5441			 */
5442 smp_mb__before_atomic();
5443 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5444
5445			/* STRIPE_ON_RELEASE_LIST could be set here.  In that
5446			 * case, the count is always > 1 here.
5447			 */
5448 hash = sh->hash_lock_index;
5449 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5450 cnt++;
5451 }
5452 spin_unlock_irq(&conf->device_lock);
5453 }
5454 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5455 NR_STRIPE_HASH_LOCKS);
5456 if (mddev->queue)
5457 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5458 kfree(cb);
5459}
5460
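/*
 * Queue a stripe on the per-task plug list instead of releasing it
 * immediately, so that a batch of stripes can be released together when
 * the plug is flushed (raid5_unplug).
 */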
5461static void release_stripe_plug(struct mddev *mddev,
5462 struct stripe_head *sh)
5463{
5464 struct blk_plug_cb *blk_cb = blk_check_plugged(
5465 raid5_unplug, mddev,
5466 sizeof(struct raid5_plug_cb));
5467 struct raid5_plug_cb *cb;
5468
5469 if (!blk_cb) {
5470 raid5_release_stripe(sh);
5471 return;
5472 }
5473
5474 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5475
5476 if (cb->list.next == NULL) {
5477 int i;
5478 INIT_LIST_HEAD(&cb->list);
5479 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5480 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5481 }
5482
5483 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5484 list_add_tail(&sh->lru, &cb->list);
5485 else
5486 raid5_release_stripe(sh);
5487}
5488
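/*
 * Handle a DISCARD bio: trim the range inwards to whole data stripes, then
 * attach the bio as an overwrite of every data block of each covered stripe
 * and mark the stripe STRIPE_DISCARD so it is processed as a discard.
 */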
5489static void make_discard_request(struct mddev *mddev, struct bio *bi)
5490{
5491 struct r5conf *conf = mddev->private;
5492 sector_t logical_sector, last_sector;
5493 struct stripe_head *sh;
5494 int stripe_sectors;
5495
5496 if (mddev->reshape_position != MaxSector)
5497		/* Skip discard while a reshape is happening */
5498 return;
5499
5500 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5501 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5502
5503 bi->bi_next = NULL;
5504
5505 stripe_sectors = conf->chunk_sectors *
5506 (conf->raid_disks - conf->max_degraded);
5507 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5508 stripe_sectors);
5509 sector_div(last_sector, stripe_sectors);
5510
5511 logical_sector *= conf->chunk_sectors;
5512 last_sector *= conf->chunk_sectors;
5513
5514 for (; logical_sector < last_sector;
5515 logical_sector += STRIPE_SECTORS) {
5516 DEFINE_WAIT(w);
5517 int d;
5518 again:
5519 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5520 prepare_to_wait(&conf->wait_for_overlap, &w,
5521 TASK_UNINTERRUPTIBLE);
5522 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5523 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5524 raid5_release_stripe(sh);
5525 schedule();
5526 goto again;
5527 }
5528 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5529 spin_lock_irq(&sh->stripe_lock);
5530 for (d = 0; d < conf->raid_disks; d++) {
5531 if (d == sh->pd_idx || d == sh->qd_idx)
5532 continue;
5533 if (sh->dev[d].towrite || sh->dev[d].toread) {
5534 set_bit(R5_Overlap, &sh->dev[d].flags);
5535 spin_unlock_irq(&sh->stripe_lock);
5536 raid5_release_stripe(sh);
5537 schedule();
5538 goto again;
5539 }
5540 }
5541 set_bit(STRIPE_DISCARD, &sh->state);
5542 finish_wait(&conf->wait_for_overlap, &w);
5543 sh->overwrite_disks = 0;
5544 for (d = 0; d < conf->raid_disks; d++) {
5545 if (d == sh->pd_idx || d == sh->qd_idx)
5546 continue;
5547 sh->dev[d].towrite = bi;
5548 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5549 bio_inc_remaining(bi);
5550 md_write_inc(mddev, bi);
5551 sh->overwrite_disks++;
5552 }
5553 spin_unlock_irq(&sh->stripe_lock);
5554 if (conf->mddev->bitmap) {
5555 for (d = 0;
5556 d < conf->raid_disks - conf->max_degraded;
5557 d++)
5558 md_bitmap_startwrite(mddev->bitmap,
5559 sh->sector,
5560 STRIPE_SECTORS,
5561 0);
5562 sh->bm_seq = conf->seq_flush + 1;
5563 set_bit(STRIPE_BIT_DELAY, &sh->state);
5564 }
5565
5566 set_bit(STRIPE_HANDLE, &sh->state);
5567 clear_bit(STRIPE_DELAYED, &sh->state);
5568 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5569 atomic_inc(&conf->preread_active_stripes);
5570 release_stripe_plug(mddev, sh);
5571 }
5572
5573 bio_endio(bi);
5574}
5575
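/*
 * Main entry point for read/write bios.  Flushes and chunk-aligned reads
 * are special-cased first; everything else is walked in STRIPE_SECTORS
 * units, attached to the matching stripe_head (retrying around reshape
 * windows and overlapping requests) and queued for handling.
 */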
5576static bool raid5_make_request(struct mddev *mddev, struct bio *bi)
5577{
5578 struct r5conf *conf = mddev->private;
5579 int dd_idx;
5580 sector_t new_sector;
5581 sector_t logical_sector, last_sector;
5582 struct stripe_head *sh;
5583 const int rw = bio_data_dir(bi);
5584 DEFINE_WAIT(w);
5585 bool do_prepare;
5586 bool do_flush = false;
5587
5588 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5589 int ret = log_handle_flush_request(conf, bi);
5590
5591 if (ret == 0)
5592 return true;
5593 if (ret == -ENODEV) {
5594 md_flush_request(mddev, bi);
5595 return true;
5596 }
5597
5598		/* ret == -EAGAIN, fall back to the normal path.  If
5599		 * log_handle_flush_request() did not clear REQ_PREFLUSH,
5600		 * the journal device still needs to be flushed below.
5601		 */
5602 do_flush = bi->bi_opf & REQ_PREFLUSH;
5603 }
5604
5605 if (!md_write_start(mddev, bi))
5606 return false;
5607
5608	/* If the array is degraded, better not do chunk aligned reads
5609	 * because later we might have to read the data again in order
5610	 * to reconstruct blocks on the failed drives.
5611	 */
5612 if (rw == READ && mddev->degraded == 0 &&
5613 mddev->reshape_position == MaxSector) {
5614 bi = chunk_aligned_read(mddev, bi);
5615 if (!bi)
5616 return true;
5617 }
5618
5619 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5620 make_discard_request(mddev, bi);
5621 md_write_end(mddev);
5622 return true;
5623 }
5624
5625 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5626 last_sector = bio_end_sector(bi);
5627 bi->bi_next = NULL;
5628
5629 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5630	for (; logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5631 int previous;
5632 int seq;
5633
5634 do_prepare = false;
5635 retry:
5636 seq = read_seqcount_begin(&conf->gen_lock);
5637 previous = 0;
5638 if (do_prepare)
5639 prepare_to_wait(&conf->wait_for_overlap, &w,
5640 TASK_UNINTERRUPTIBLE);
5641 if (unlikely(conf->reshape_progress != MaxSector)) {
5642			/* spinlock is needed as reshape_progress may be
5643			 * 64bit on a 32bit platform, and so it might be
5644			 * possible to see a half-updated value.
5645			 * Of course reshape_progress could change after
5646			 * the lock is dropped, so once we get a reference
5647			 * to the stripe that we think it is, we will have
5648			 * to check again.
5649			 */
5650 spin_lock_irq(&conf->device_lock);
5651 if (mddev->reshape_backwards
5652 ? logical_sector < conf->reshape_progress
5653 : logical_sector >= conf->reshape_progress) {
5654 previous = 1;
5655 } else {
5656 if (mddev->reshape_backwards
5657 ? logical_sector < conf->reshape_safe
5658 : logical_sector >= conf->reshape_safe) {
5659 spin_unlock_irq(&conf->device_lock);
5660 schedule();
5661 do_prepare = true;
5662 goto retry;
5663 }
5664 }
5665 spin_unlock_irq(&conf->device_lock);
5666 }
5667
5668 new_sector = raid5_compute_sector(conf, logical_sector,
5669 previous,
5670 &dd_idx, NULL);
5671 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5672 (unsigned long long)new_sector,
5673 (unsigned long long)logical_sector);
5674
5675 sh = raid5_get_active_stripe(conf, new_sector, previous,
5676 (bi->bi_opf & REQ_RAHEAD), 0);
5677 if (sh) {
5678 if (unlikely(previous)) {
5679				/* expansion might have moved on while waiting for a
5680				 * stripe, so we must do the range check again.
5681				 * Expansion could still move past after this
5682				 * test, but as we are holding a reference to
5683				 * 'sh', we know that if that happens,
5684				 * STRIPE_EXPANDING will get set and the expansion
5685				 * won't proceed until we finish with the stripe.
5686				 */
5687 int must_retry = 0;
5688 spin_lock_irq(&conf->device_lock);
5689 if (mddev->reshape_backwards
5690 ? logical_sector >= conf->reshape_progress
5691 : logical_sector < conf->reshape_progress)
5692
5693 must_retry = 1;
5694 spin_unlock_irq(&conf->device_lock);
5695 if (must_retry) {
5696 raid5_release_stripe(sh);
5697 schedule();
5698 do_prepare = true;
5699 goto retry;
5700 }
5701 }
5702 if (read_seqcount_retry(&conf->gen_lock, seq)) {
5703
5704
5705
5706 raid5_release_stripe(sh);
5707 goto retry;
5708 }
5709
5710 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5711 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5712				/* Stripe is busy expanding or the add failed
5713				 * due to an overlapping request; flush
5714				 * everything and wait a while.
5715				 */
5716 md_wakeup_thread(mddev->thread);
5717 raid5_release_stripe(sh);
5718 schedule();
5719 do_prepare = true;
5720 goto retry;
5721 }
5722 if (do_flush) {
5723 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5724
5725 do_flush = false;
5726 }
5727
5728 set_bit(STRIPE_HANDLE, &sh->state);
5729 clear_bit(STRIPE_DELAYED, &sh->state);
5730 if ((!sh->batch_head || sh == sh->batch_head) &&
5731 (bi->bi_opf & REQ_SYNC) &&
5732 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5733 atomic_inc(&conf->preread_active_stripes);
5734 release_stripe_plug(mddev, sh);
5735 } else {
5736
5737 bi->bi_status = BLK_STS_IOERR;
5738 break;
5739 }
5740 }
5741 finish_wait(&conf->wait_for_overlap, &w);
5742
5743 if (rw == WRITE)
5744 md_write_end(mddev);
5745 bio_endio(bi);
5746 return true;
5747}
5748
5749static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5750
5751static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5752{
5753	/* Reshaping is quite different from recovery/resync: each pass
5754	 * activates a window of stripes at 'stripe_addr' in the new layout,
5755	 * marks them STRIPE_EXPANDING, then schedules reads of the
5756	 * corresponding stripes in the old layout (STRIPE_EXPAND_SOURCE).
5757	 * conf->reshape_progress tracks how far the new layout has been
5758	 * written, and conf->reshape_safe how far it is known to be safe
5759	 * against a crash; a metadata checkpoint is written whenever the
5760	 * window would overtake the safe position.
5761	 */
5762 struct r5conf *conf = mddev->private;
5763 struct stripe_head *sh;
5764 struct md_rdev *rdev;
5765 sector_t first_sector, last_sector;
5766 int raid_disks = conf->previous_raid_disks;
5767 int data_disks = raid_disks - conf->max_degraded;
5768 int new_data_disks = conf->raid_disks - conf->max_degraded;
5769 int i;
5770 int dd_idx;
5771 sector_t writepos, readpos, safepos;
5772 sector_t stripe_addr;
5773 int reshape_sectors;
5774 struct list_head stripes;
5775 sector_t retn;
5776
5777 if (sector_nr == 0) {
5778
5779 if (mddev->reshape_backwards &&
5780 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5781 sector_nr = raid5_size(mddev, 0, 0)
5782 - conf->reshape_progress;
5783 } else if (mddev->reshape_backwards &&
5784 conf->reshape_progress == MaxSector) {
5785
5786 sector_nr = MaxSector;
5787 } else if (!mddev->reshape_backwards &&
5788 conf->reshape_progress > 0)
5789 sector_nr = conf->reshape_progress;
5790 sector_div(sector_nr, new_data_disks);
5791 if (sector_nr) {
5792 mddev->curr_resync_completed = sector_nr;
5793 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5794 *skipped = 1;
5795 retn = sector_nr;
5796 goto finish;
5797 }
5798 }
5799
5800
5801
5802
5803
5804
5805 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5806
5807
5808
5809
5810
5811
5812
5813 writepos = conf->reshape_progress;
5814 sector_div(writepos, new_data_disks);
5815 readpos = conf->reshape_progress;
5816 sector_div(readpos, data_disks);
5817 safepos = conf->reshape_safe;
5818 sector_div(safepos, data_disks);
5819 if (mddev->reshape_backwards) {
5820 BUG_ON(writepos < reshape_sectors);
5821 writepos -= reshape_sectors;
5822 readpos += reshape_sectors;
5823 safepos += reshape_sectors;
5824 } else {
5825 writepos += reshape_sectors;
5826
5827
5828
5829
5830 readpos -= min_t(sector_t, reshape_sectors, readpos);
5831 safepos -= min_t(sector_t, reshape_sectors, safepos);
5832 }
5833
5834
5835
5836
5837 if (mddev->reshape_backwards) {
5838 BUG_ON(conf->reshape_progress == 0);
5839 stripe_addr = writepos;
5840 BUG_ON((mddev->dev_sectors &
5841 ~((sector_t)reshape_sectors - 1))
5842 - reshape_sectors - stripe_addr
5843 != sector_nr);
5844 } else {
5845 BUG_ON(writepos != sector_nr + reshape_sectors);
5846 stripe_addr = sector_nr;
5847 }
5848
5849	/* 'writepos' is the most advanced device address we might write.
5850	 * 'readpos' is the least advanced device address we might read.
5851	 * 'safepos' is the least address recorded in the metadata as having
5852	 * been reshaped.
5853	 * If there is a min_offset_diff, these are adjusted either by
5854	 * increasing the safepos/readpos if diff is negative, or
5855	 * increasing writepos if diff is positive.
5856	 * If 'readpos' is then behind 'writepos', there is no way that we can
5857	 * ensure safety in the face of a crash - that must be done by
5858	 * userspace making a backup of the data.  So in that case there is
5859	 * no particular rush, but generally it makes sense to flush safepos
5860	 * updates.
5861	 * In any case, the metadata is updated at least every 10 seconds.
5862	 */
5863
5864
5865
5866
5867
5868
5869 if (conf->min_offset_diff < 0) {
5870 safepos += -conf->min_offset_diff;
5871 readpos += -conf->min_offset_diff;
5872 } else
5873 writepos += conf->min_offset_diff;
5874
5875 if ((mddev->reshape_backwards
5876 ? (safepos > writepos && readpos < writepos)
5877 : (safepos < writepos && readpos > writepos)) ||
5878 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
5879
5880 wait_event(conf->wait_for_overlap,
5881 atomic_read(&conf->reshape_stripes)==0
5882 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5883 if (atomic_read(&conf->reshape_stripes) != 0)
5884 return 0;
5885 mddev->reshape_position = conf->reshape_progress;
5886 mddev->curr_resync_completed = sector_nr;
5887 if (!mddev->reshape_backwards)
5888
5889 rdev_for_each(rdev, mddev)
5890 if (rdev->raid_disk >= 0 &&
5891 !test_bit(Journal, &rdev->flags) &&
5892 !test_bit(In_sync, &rdev->flags) &&
5893 rdev->recovery_offset < sector_nr)
5894 rdev->recovery_offset = sector_nr;
5895
5896 conf->reshape_checkpoint = jiffies;
5897 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5898 md_wakeup_thread(mddev->thread);
5899 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
5900 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5901 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5902 return 0;
5903 spin_lock_irq(&conf->device_lock);
5904 conf->reshape_safe = mddev->reshape_position;
5905 spin_unlock_irq(&conf->device_lock);
5906 wake_up(&conf->wait_for_overlap);
5907 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5908 }
5909
5910 INIT_LIST_HEAD(&stripes);
5911 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
5912 int j;
5913 int skipped_disk = 0;
5914 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
5915 set_bit(STRIPE_EXPANDING, &sh->state);
5916 atomic_inc(&conf->reshape_stripes);
5917		/* If any of this stripe is beyond the end of the old
5918		 * array, then we need to zero those blocks.
5919		 */
5920 for (j=sh->disks; j--;) {
5921 sector_t s;
5922 if (j == sh->pd_idx)
5923 continue;
5924 if (conf->level == 6 &&
5925 j == sh->qd_idx)
5926 continue;
5927 s = raid5_compute_blocknr(sh, j, 0);
5928 if (s < raid5_size(mddev, 0, 0)) {
5929 skipped_disk = 1;
5930 continue;
5931 }
5932 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
5933 set_bit(R5_Expanded, &sh->dev[j].flags);
5934 set_bit(R5_UPTODATE, &sh->dev[j].flags);
5935 }
5936 if (!skipped_disk) {
5937 set_bit(STRIPE_EXPAND_READY, &sh->state);
5938 set_bit(STRIPE_HANDLE, &sh->state);
5939 }
5940 list_add(&sh->lru, &stripes);
5941 }
5942 spin_lock_irq(&conf->device_lock);
5943 if (mddev->reshape_backwards)
5944 conf->reshape_progress -= reshape_sectors * new_data_disks;
5945 else
5946 conf->reshape_progress += reshape_sectors * new_data_disks;
5947 spin_unlock_irq(&conf->device_lock);
5948
5949	/* Ok, those stripes are ready.  We can now schedule reads on the
5950	 * source stripes.  The source stripes are determined by mapping
5951	 * the first and last block on the destination stripes.
5952	 */
5953 first_sector =
5954 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
5955 1, &dd_idx, NULL);
5956 last_sector =
5957 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
5958 * new_data_disks - 1),
5959 1, &dd_idx, NULL);
5960 if (last_sector >= mddev->dev_sectors)
5961 last_sector = mddev->dev_sectors - 1;
5962 while (first_sector <= last_sector) {
5963 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
5964 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
5965 set_bit(STRIPE_HANDLE, &sh->state);
5966 raid5_release_stripe(sh);
5967 first_sector += STRIPE_SECTORS;
5968 }
5969
5970
5971
5972 while (!list_empty(&stripes)) {
5973 sh = list_entry(stripes.next, struct stripe_head, lru);
5974 list_del_init(&sh->lru);
5975 raid5_release_stripe(sh);
5976 }
5977
5978
5979
5980 sector_nr += reshape_sectors;
5981 retn = reshape_sectors;
5982finish:
5983 if (mddev->curr_resync_completed > mddev->resync_max ||
5984 (sector_nr - mddev->curr_resync_completed) * 2
5985 >= mddev->resync_max - mddev->curr_resync_completed) {
5986
5987 wait_event(conf->wait_for_overlap,
5988 atomic_read(&conf->reshape_stripes) == 0
5989 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5990 if (atomic_read(&conf->reshape_stripes) != 0)
5991 goto ret;
5992 mddev->reshape_position = conf->reshape_progress;
5993 mddev->curr_resync_completed = sector_nr;
5994 if (!mddev->reshape_backwards)
5995
5996 rdev_for_each(rdev, mddev)
5997 if (rdev->raid_disk >= 0 &&
5998 !test_bit(Journal, &rdev->flags) &&
5999 !test_bit(In_sync, &rdev->flags) &&
6000 rdev->recovery_offset < sector_nr)
6001 rdev->recovery_offset = sector_nr;
6002 conf->reshape_checkpoint = jiffies;
6003 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6004 md_wakeup_thread(mddev->thread);
6005 wait_event(mddev->sb_wait,
6006 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6007 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6008 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6009 goto ret;
6010 spin_lock_irq(&conf->device_lock);
6011 conf->reshape_safe = mddev->reshape_position;
6012 spin_unlock_irq(&conf->device_lock);
6013 wake_up(&conf->wait_for_overlap);
6014 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6015 }
6016ret:
6017 return retn;
6018}
6019
6020static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6021 int *skipped)
6022{
6023 struct r5conf *conf = mddev->private;
6024 struct stripe_head *sh;
6025 sector_t max_sector = mddev->dev_sectors;
6026 sector_t sync_blocks;
6027 int still_degraded = 0;
6028 int i;
6029
6030 if (sector_nr >= max_sector) {
6031		/* just being told to finish up .. nothing much to do */
6032
6033 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6034 end_reshape(conf);
6035 return 0;
6036 }
6037
6038 if (mddev->curr_resync < max_sector)
6039 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6040 &sync_blocks, 1);
6041 else
6042 conf->fullsync = 0;
6043 md_bitmap_close_sync(mddev->bitmap);
6044
6045 return 0;
6046 }
6047
6048
6049 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6050
6051 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6052 return reshape_request(mddev, sector_nr, skipped);
6053
6054	/* No need to check resync_max as we never do more than one
6055	 * stripe, and as resync_max will always be on a chunk boundary,
6056	 * if the check in md_do_sync didn't fire, there is no chance
6057	 * of overstepping resync_max here.
6058	 */
6059
6060	/* If there are too many failed drives and we are trying
6061	 * to resync, then assert that we are finished, because there is
6062	 * nothing we can do.
6063	 */
6064 if (mddev->degraded >= conf->max_degraded &&
6065 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6066 sector_t rv = mddev->dev_sectors - sector_nr;
6067 *skipped = 1;
6068 return rv;
6069 }
6070 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6071 !conf->fullsync &&
6072 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6073 sync_blocks >= STRIPE_SECTORS) {
6074
6075 sync_blocks /= STRIPE_SECTORS;
6076 *skipped = 1;
6077 return sync_blocks * STRIPE_SECTORS;
6078 }
6079
6080 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6081
6082 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6083 if (sh == NULL) {
6084 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6085		/* make sure we don't swamp the stripe cache if someone else
6086		 * is trying to get access
6087		 */
6088 schedule_timeout_uninterruptible(1);
6089 }
6090
6091	/* Need to check if the array will still be degraded after recovery;
6092	 * with more than one failed drive we may be rebuilding one drive
6093	 * while leaving another faulty drive in the array. */
6094 rcu_read_lock();
6095 for (i = 0; i < conf->raid_disks; i++) {
6096 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
6097
6098 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6099 still_degraded = 1;
6100 }
6101 rcu_read_unlock();
6102
6103 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6104
6105 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6106 set_bit(STRIPE_HANDLE, &sh->state);
6107
6108 raid5_release_stripe(sh);
6109
6110 return STRIPE_SECTORS;
6111}
6112
6113static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6114 unsigned int offset)
6115{
6116	/* We may not be able to submit a whole bio at once as there
6117	 * may not be enough stripe_heads available.
6118	 * We cannot pre-allocate enough stripe_heads as we may need
6119	 * more than exist in the cache (if we allow ever larger chunks).
6120	 * So we do one stripe head at a time and record in
6121	 * conf->retry_read_offset how many have been done.
6122	 *
6123	 * We *know* that this entire raid_bio is in one chunk, so it maps
6124	 * to a single 'dd_idx' and needs one raid5_compute_sector() call.
6125	 */
6126 struct stripe_head *sh;
6127 int dd_idx;
6128 sector_t sector, logical_sector, last_sector;
6129 int scnt = 0;
6130 int handled = 0;
6131
6132 logical_sector = raid_bio->bi_iter.bi_sector &
6133 ~((sector_t)STRIPE_SECTORS-1);
6134 sector = raid5_compute_sector(conf, logical_sector,
6135 0, &dd_idx, NULL);
6136 last_sector = bio_end_sector(raid_bio);
6137
6138 for (; logical_sector < last_sector;
6139 logical_sector += STRIPE_SECTORS,
6140 sector += STRIPE_SECTORS,
6141 scnt++) {
6142
6143 if (scnt < offset)
6144
6145 continue;
6146
6147 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6148
6149 if (!sh) {
6150
6151 conf->retry_read_aligned = raid_bio;
6152 conf->retry_read_offset = scnt;
6153 return handled;
6154 }
6155
6156 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6157 raid5_release_stripe(sh);
6158 conf->retry_read_aligned = raid_bio;
6159 conf->retry_read_offset = scnt;
6160 return handled;
6161 }
6162
6163 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6164 handle_stripe(sh);
6165 raid5_release_stripe(sh);
6166 handled++;
6167 }
6168
6169 bio_endio(raid_bio);
6170
6171 if (atomic_dec_and_test(&conf->active_aligned_reads))
6172 wake_up(&conf->wait_for_quiescent);
6173 return handled;
6174}
6175
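/*
 * Pull up to MAX_STRIPE_BATCH stripes off the priority lists and handle
 * them.  Called with conf->device_lock held; the lock is dropped around the
 * actual stripe handling and re-taken before returning.  Returns the number
 * of stripes processed.
 */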
6176static int handle_active_stripes(struct r5conf *conf, int group,
6177 struct r5worker *worker,
6178 struct list_head *temp_inactive_list)
6179{
6180 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6181 int i, batch_size = 0, hash;
6182 bool release_inactive = false;
6183
6184 while (batch_size < MAX_STRIPE_BATCH &&
6185 (sh = __get_priority_stripe(conf, group)) != NULL)
6186 batch[batch_size++] = sh;
6187
6188 if (batch_size == 0) {
6189 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6190 if (!list_empty(temp_inactive_list + i))
6191 break;
6192 if (i == NR_STRIPE_HASH_LOCKS) {
6193 spin_unlock_irq(&conf->device_lock);
6194 log_flush_stripe_to_raid(conf);
6195 spin_lock_irq(&conf->device_lock);
6196 return batch_size;
6197 }
6198 release_inactive = true;
6199 }
6200 spin_unlock_irq(&conf->device_lock);
6201
6202 release_inactive_stripe_list(conf, temp_inactive_list,
6203 NR_STRIPE_HASH_LOCKS);
6204
6205 r5l_flush_stripe_to_raid(conf->log);
6206 if (release_inactive) {
6207 spin_lock_irq(&conf->device_lock);
6208 return 0;
6209 }
6210
6211 for (i = 0; i < batch_size; i++)
6212 handle_stripe(batch[i]);
6213 log_write_stripe_run(conf);
6214
6215 cond_resched();
6216
6217 spin_lock_irq(&conf->device_lock);
6218 for (i = 0; i < batch_size; i++) {
6219 hash = batch[i]->hash_lock_index;
6220 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6221 }
6222 return batch_size;
6223}
6224
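/*
 * Work function for the per-group worker threads: repeatedly release queued
 * stripes and handle active ones for this group until no more work remains.
 */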
6225static void raid5_do_work(struct work_struct *work)
6226{
6227 struct r5worker *worker = container_of(work, struct r5worker, work);
6228 struct r5worker_group *group = worker->group;
6229 struct r5conf *conf = group->conf;
6230 struct mddev *mddev = conf->mddev;
6231 int group_id = group - conf->worker_groups;
6232 int handled;
6233 struct blk_plug plug;
6234
6235 pr_debug("+++ raid5worker active\n");
6236
6237 blk_start_plug(&plug);
6238 handled = 0;
6239 spin_lock_irq(&conf->device_lock);
6240 while (1) {
6241 int batch_size, released;
6242
6243 released = release_stripe_list(conf, worker->temp_inactive_list);
6244
6245 batch_size = handle_active_stripes(conf, group_id, worker,
6246 worker->temp_inactive_list);
6247 worker->working = false;
6248 if (!batch_size && !released)
6249 break;
6250 handled += batch_size;
6251 wait_event_lock_irq(mddev->sb_wait,
6252 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6253 conf->device_lock);
6254 }
6255 pr_debug("%d stripes handled\n", handled);
6256
6257 spin_unlock_irq(&conf->device_lock);
6258
6259 flush_deferred_bios(conf);
6260
6261 r5l_flush_stripe_to_raid(conf->log);
6262
6263 async_tx_issue_pending_all();
6264 blk_finish_plug(&plug);
6265
6266 pr_debug("--- raid5worker inactive\n");
6267}
6268
6269/*
6270 * This is our raid5 kernel thread.
6271 *
6272 * We scan the hash table for stripes to be handled in this thread.
6273 * During the scan, completed stripes are saved for us by the interrupt
6274 * handler, so that they will not have to wait for our next wakeup.
6275 */
6276static void raid5d(struct md_thread *thread)
6277{
6278 struct mddev *mddev = thread->mddev;
6279 struct r5conf *conf = mddev->private;
6280 int handled;
6281 struct blk_plug plug;
6282
6283 pr_debug("+++ raid5d active\n");
6284
6285 md_check_recovery(mddev);
6286
6287 blk_start_plug(&plug);
6288 handled = 0;
6289 spin_lock_irq(&conf->device_lock);
6290 while (1) {
6291 struct bio *bio;
6292 int batch_size, released;
6293 unsigned int offset;
6294
6295 released = release_stripe_list(conf, conf->temp_inactive_list);
6296 if (released)
6297 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6298
6299		if (!list_empty(&conf->bitmap_list)) {
6300			/* Now is a good time to flush some bitmap updates */
6302 conf->seq_flush++;
6303 spin_unlock_irq(&conf->device_lock);
6304 md_bitmap_unplug(mddev->bitmap);
6305 spin_lock_irq(&conf->device_lock);
6306 conf->seq_write = conf->seq_flush;
6307 activate_bit_delay(conf, conf->temp_inactive_list);
6308 }
6309 raid5_activate_delayed(conf);
6310
6311 while ((bio = remove_bio_from_retry(conf, &offset))) {
6312 int ok;
6313 spin_unlock_irq(&conf->device_lock);
6314 ok = retry_aligned_read(conf, bio, offset);
6315 spin_lock_irq(&conf->device_lock);
6316 if (!ok)
6317 break;
6318 handled++;
6319 }
6320
6321 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6322 conf->temp_inactive_list);
6323 if (!batch_size && !released)
6324 break;
6325 handled += batch_size;
6326
6327 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6328 spin_unlock_irq(&conf->device_lock);
6329 md_check_recovery(mddev);
6330 spin_lock_irq(&conf->device_lock);
6331 }
6332 }
6333 pr_debug("%d stripes handled\n", handled);
6334
6335 spin_unlock_irq(&conf->device_lock);
6336 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6337 mutex_trylock(&conf->cache_size_mutex)) {
6338 grow_one_stripe(conf, __GFP_NOWARN);
6339		/* Set the flag even if the allocation failed: this slows
6340		 * down further allocation requests when memory is short.
6341		 */
6342 set_bit(R5_DID_ALLOC, &conf->cache_state);
6343 mutex_unlock(&conf->cache_size_mutex);
6344 }
6345
6346 flush_deferred_bios(conf);
6347
6348 r5l_flush_stripe_to_raid(conf->log);
6349
6350 async_tx_issue_pending_all();
6351 blk_finish_plug(&plug);
6352
6353 pr_debug("--- raid5d inactive\n");
6354}
6355
6356static ssize_t
6357raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6358{
6359 struct r5conf *conf;
6360 int ret = 0;
6361 spin_lock(&mddev->lock);
6362 conf = mddev->private;
6363 if (conf)
6364 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6365 spin_unlock(&mddev->lock);
6366 return ret;
6367}
6368
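/*
 * Resize the stripe cache: remember the new minimum, drop stripes while the
 * cache is larger than requested, then grow it back up to the new size.
 * Valid sizes are 17..32768 stripes.
 */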
6369int
6370raid5_set_cache_size(struct mddev *mddev, int size)
6371{
6372 struct r5conf *conf = mddev->private;
6373
6374 if (size <= 16 || size > 32768)
6375 return -EINVAL;
6376
6377 conf->min_nr_stripes = size;
6378 mutex_lock(&conf->cache_size_mutex);
6379 while (size < conf->max_nr_stripes &&
6380 drop_one_stripe(conf))
6381 ;
6382 mutex_unlock(&conf->cache_size_mutex);
6383
6384 md_allow_write(mddev);
6385
6386 mutex_lock(&conf->cache_size_mutex);
6387 while (size > conf->max_nr_stripes)
6388 if (!grow_one_stripe(conf, GFP_KERNEL))
6389 break;
6390 mutex_unlock(&conf->cache_size_mutex);
6391
6392 return 0;
6393}
6394EXPORT_SYMBOL(raid5_set_cache_size);
6395
6396static ssize_t
6397raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6398{
6399 struct r5conf *conf;
6400 unsigned long new;
6401 int err;
6402
6403 if (len >= PAGE_SIZE)
6404 return -EINVAL;
6405 if (kstrtoul(page, 10, &new))
6406 return -EINVAL;
6407 err = mddev_lock(mddev);
6408 if (err)
6409 return err;
6410 conf = mddev->private;
6411 if (!conf)
6412 err = -ENODEV;
6413 else
6414 err = raid5_set_cache_size(mddev, new);
6415 mddev_unlock(mddev);
6416
6417 return err ?: len;
6418}
6419
6420static struct md_sysfs_entry
6421raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6422 raid5_show_stripe_cache_size,
6423 raid5_store_stripe_cache_size);
6424
6425static ssize_t
6426raid5_show_rmw_level(struct mddev *mddev, char *page)
6427{
6428 struct r5conf *conf = mddev->private;
6429 if (conf)
6430 return sprintf(page, "%d\n", conf->rmw_level);
6431 else
6432 return 0;
6433}
6434
6435static ssize_t
6436raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6437{
6438 struct r5conf *conf = mddev->private;
6439 unsigned long new;
6440
6441 if (!conf)
6442 return -ENODEV;
6443
6444 if (len >= PAGE_SIZE)
6445 return -EINVAL;
6446
6447 if (kstrtoul(page, 10, &new))
6448 return -EINVAL;
6449
6450 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6451 return -EINVAL;
6452
6453 if (new != PARITY_DISABLE_RMW &&
6454 new != PARITY_ENABLE_RMW &&
6455 new != PARITY_PREFER_RMW)
6456 return -EINVAL;
6457
6458 conf->rmw_level = new;
6459 return len;
6460}
6461
6462static struct md_sysfs_entry
6463raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6464 raid5_show_rmw_level,
6465 raid5_store_rmw_level);
6466
6467
6468static ssize_t
6469raid5_show_preread_threshold(struct mddev *mddev, char *page)
6470{
6471 struct r5conf *conf;
6472 int ret = 0;
6473 spin_lock(&mddev->lock);
6474 conf = mddev->private;
6475 if (conf)
6476 ret = sprintf(page, "%d\n", conf->bypass_threshold);
6477 spin_unlock(&mddev->lock);
6478 return ret;
6479}
6480
6481static ssize_t
6482raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6483{
6484 struct r5conf *conf;
6485 unsigned long new;
6486 int err;
6487
6488 if (len >= PAGE_SIZE)
6489 return -EINVAL;
6490 if (kstrtoul(page, 10, &new))
6491 return -EINVAL;
6492
6493 err = mddev_lock(mddev);
6494 if (err)
6495 return err;
6496 conf = mddev->private;
6497 if (!conf)
6498 err = -ENODEV;
6499 else if (new > conf->min_nr_stripes)
6500 err = -EINVAL;
6501 else
6502 conf->bypass_threshold = new;
6503 mddev_unlock(mddev);
6504 return err ?: len;
6505}
6506
6507static struct md_sysfs_entry
6508raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6509 S_IRUGO | S_IWUSR,
6510 raid5_show_preread_threshold,
6511 raid5_store_preread_threshold);
6512
6513static ssize_t
6514raid5_show_skip_copy(struct mddev *mddev, char *page)
6515{
6516 struct r5conf *conf;
6517 int ret = 0;
6518 spin_lock(&mddev->lock);
6519 conf = mddev->private;
6520 if (conf)
6521 ret = sprintf(page, "%d\n", conf->skip_copy);
6522 spin_unlock(&mddev->lock);
6523 return ret;
6524}
6525
6526static ssize_t
6527raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6528{
6529 struct r5conf *conf;
6530 unsigned long new;
6531 int err;
6532
6533 if (len >= PAGE_SIZE)
6534 return -EINVAL;
6535 if (kstrtoul(page, 10, &new))
6536 return -EINVAL;
6537 new = !!new;
6538
6539 err = mddev_lock(mddev);
6540 if (err)
6541 return err;
6542 conf = mddev->private;
6543 if (!conf)
6544 err = -ENODEV;
6545 else if (new != conf->skip_copy) {
6546 mddev_suspend(mddev);
6547 conf->skip_copy = new;
6548 if (new)
6549 mddev->queue->backing_dev_info->capabilities |=
6550 BDI_CAP_STABLE_WRITES;
6551 else
6552 mddev->queue->backing_dev_info->capabilities &=
6553 ~BDI_CAP_STABLE_WRITES;
6554 mddev_resume(mddev);
6555 }
6556 mddev_unlock(mddev);
6557 return err ?: len;
6558}
6559
6560static struct md_sysfs_entry
6561raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6562 raid5_show_skip_copy,
6563 raid5_store_skip_copy);
6564
6565static ssize_t
6566stripe_cache_active_show(struct mddev *mddev, char *page)
6567{
6568 struct r5conf *conf = mddev->private;
6569 if (conf)
6570 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6571 else
6572 return 0;
6573}
6574
6575static struct md_sysfs_entry
6576raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6577
6578static ssize_t
6579raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6580{
6581 struct r5conf *conf;
6582 int ret = 0;
6583 spin_lock(&mddev->lock);
6584 conf = mddev->private;
6585 if (conf)
6586 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6587 spin_unlock(&mddev->lock);
6588 return ret;
6589}
6590
6591static int alloc_thread_groups(struct r5conf *conf, int cnt,
6592 int *group_cnt,
6593 int *worker_cnt_per_group,
6594 struct r5worker_group **worker_groups);
6595static ssize_t
6596raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6597{
6598 struct r5conf *conf;
6599 unsigned int new;
6600 int err;
6601 struct r5worker_group *new_groups, *old_groups;
6602 int group_cnt, worker_cnt_per_group;
6603
6604 if (len >= PAGE_SIZE)
6605 return -EINVAL;
6606 if (kstrtouint(page, 10, &new))
6607 return -EINVAL;
6608
6609 if (new > 8192)
6610 return -EINVAL;
6611
6612 err = mddev_lock(mddev);
6613 if (err)
6614 return err;
6615 conf = mddev->private;
6616 if (!conf)
6617 err = -ENODEV;
6618 else if (new != conf->worker_cnt_per_group) {
6619 mddev_suspend(mddev);
6620
6621 old_groups = conf->worker_groups;
6622 if (old_groups)
6623 flush_workqueue(raid5_wq);
6624
6625 err = alloc_thread_groups(conf, new,
6626 &group_cnt, &worker_cnt_per_group,
6627 &new_groups);
6628 if (!err) {
6629 spin_lock_irq(&conf->device_lock);
6630 conf->group_cnt = group_cnt;
6631 conf->worker_cnt_per_group = worker_cnt_per_group;
6632 conf->worker_groups = new_groups;
6633 spin_unlock_irq(&conf->device_lock);
6634
6635 if (old_groups)
6636 kfree(old_groups[0].workers);
6637 kfree(old_groups);
6638 }
6639 mddev_resume(mddev);
6640 }
6641 mddev_unlock(mddev);
6642
6643 return err ?: len;
6644}
6645
6646static struct md_sysfs_entry
6647raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6648 raid5_show_group_thread_cnt,
6649 raid5_store_group_thread_cnt);
6650
6651static struct attribute *raid5_attrs[] = {
6652 &raid5_stripecache_size.attr,
6653 &raid5_stripecache_active.attr,
6654 &raid5_preread_bypass_threshold.attr,
6655 &raid5_group_thread_cnt.attr,
6656 &raid5_skip_copy.attr,
6657 &raid5_rmw_level.attr,
6658 &r5c_journal_mode.attr,
6659 NULL,
6660};
6661static struct attribute_group raid5_attrs_group = {
6662 .name = NULL,
6663 .attrs = raid5_attrs,
6664};
6665
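/*
 * Allocate one r5worker_group per NUMA node, each with 'cnt' workers.
 * cnt == 0 disables the group threads entirely and raid5d handles all
 * stripes itself.
 */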
6666static int alloc_thread_groups(struct r5conf *conf, int cnt,
6667 int *group_cnt,
6668 int *worker_cnt_per_group,
6669 struct r5worker_group **worker_groups)
6670{
6671 int i, j, k;
6672 ssize_t size;
6673 struct r5worker *workers;
6674
6675 *worker_cnt_per_group = cnt;
6676 if (cnt == 0) {
6677 *group_cnt = 0;
6678 *worker_groups = NULL;
6679 return 0;
6680 }
6681 *group_cnt = num_possible_nodes();
6682 size = sizeof(struct r5worker) * cnt;
6683 workers = kcalloc(size, *group_cnt, GFP_NOIO);
6684 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
6685 GFP_NOIO);
6686 if (!*worker_groups || !workers) {
6687 kfree(workers);
6688 kfree(*worker_groups);
6689 return -ENOMEM;
6690 }
6691
6692 for (i = 0; i < *group_cnt; i++) {
6693 struct r5worker_group *group;
6694
6695 group = &(*worker_groups)[i];
6696 INIT_LIST_HEAD(&group->handle_list);
6697 INIT_LIST_HEAD(&group->loprio_list);
6698 group->conf = conf;
6699 group->workers = workers + i * cnt;
6700
6701 for (j = 0; j < cnt; j++) {
6702 struct r5worker *worker = group->workers + j;
6703 worker->group = group;
6704 INIT_WORK(&worker->work, raid5_do_work);
6705
6706 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6707 INIT_LIST_HEAD(worker->temp_inactive_list + k);
6708 }
6709 }
6710
6711 return 0;
6712}
6713
6714static void free_thread_groups(struct r5conf *conf)
6715{
6716 if (conf->worker_groups)
6717 kfree(conf->worker_groups[0].workers);
6718 kfree(conf->worker_groups);
6719 conf->worker_groups = NULL;
6720}
6721
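/*
 * Array capacity in sectors: the per-device size rounded down to a whole
 * number of chunks in both the old and new geometry, times the number of
 * data (non-parity) disks.
 */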
6722static sector_t
6723raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
6724{
6725 struct r5conf *conf = mddev->private;
6726
6727 if (!sectors)
6728 sectors = mddev->dev_sectors;
6729 if (!raid_disks)
6730 /* size is defined by the smallest of previous and new size */
6731 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
6732
6733 sectors &= ~((sector_t)conf->chunk_sectors - 1);
6734 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
6735 return sectors * (raid_disks - conf->max_degraded);
6736}
6737
6738static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6739{
6740 safe_put_page(percpu->spare_page);
6741 if (percpu->scribble)
6742 flex_array_free(percpu->scribble);
6743 percpu->spare_page = NULL;
6744 percpu->scribble = NULL;
6745}
6746
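/*
 * Per-cpu scratch space: a spare page used for RAID6 recovery and a
 * scribble flex_array used by the async XOR/syndrome computations.
 */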
6747static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6748{
6749 if (conf->level == 6 && !percpu->spare_page)
6750 percpu->spare_page = alloc_page(GFP_KERNEL);
6751 if (!percpu->scribble)
6752 percpu->scribble = scribble_alloc(max(conf->raid_disks,
6753 conf->previous_raid_disks),
6754 max(conf->chunk_sectors,
6755 conf->prev_chunk_sectors)
6756 / STRIPE_SECTORS,
6757 GFP_KERNEL);
6758
6759 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
6760 free_scratch_buffer(conf, percpu);
6761 return -ENOMEM;
6762 }
6763
6764 return 0;
6765}
6766
6767static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
6768{
6769 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6770
6771 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6772 return 0;
6773}
6774
6775static void raid5_free_percpu(struct r5conf *conf)
6776{
6777 if (!conf->percpu)
6778 return;
6779
6780 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6781 free_percpu(conf->percpu);
6782}
6783
6784static void free_conf(struct r5conf *conf)
6785{
6786 int i;
6787
6788 log_exit(conf);
6789
6790 unregister_shrinker(&conf->shrinker);
6791 free_thread_groups(conf);
6792 shrink_stripes(conf);
6793 raid5_free_percpu(conf);
6794 for (i = 0; i < conf->pool_size; i++)
6795 if (conf->disks[i].extra_page)
6796 put_page(conf->disks[i].extra_page);
6797 kfree(conf->disks);
6798 bioset_exit(&conf->bio_split);
6799 kfree(conf->stripe_hashtbl);
6800 kfree(conf->pending_data);
6801 kfree(conf);
6802}
6803
6804static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
6805{
6806 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6807 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6808
6809 if (alloc_scratch_buffer(conf, percpu)) {
6810 pr_warn("%s: failed memory allocation for cpu%u\n",
6811 __func__, cpu);
6812 return -ENOMEM;
6813 }
6814 return 0;
6815}
6816
6817static int raid5_alloc_percpu(struct r5conf *conf)
6818{
6819 int err = 0;
6820
6821 conf->percpu = alloc_percpu(struct raid5_percpu);
6822 if (!conf->percpu)
6823 return -ENOMEM;
6824
6825 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6826 if (!err) {
6827 conf->scribble_disks = max(conf->raid_disks,
6828 conf->previous_raid_disks);
6829 conf->scribble_sectors = max(conf->chunk_sectors,
6830 conf->prev_chunk_sectors);
6831 }
6832 return err;
6833}
6834
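/*
 * Stripe-cache shrinker: under memory pressure, drop stripe_heads until
 * the cache is back down to min_nr_stripes.
 */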
6835static unsigned long raid5_cache_scan(struct shrinker *shrink,
6836 struct shrink_control *sc)
6837{
6838 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6839 unsigned long ret = SHRINK_STOP;
6840
6841 if (mutex_trylock(&conf->cache_size_mutex)) {
6842 ret = 0;
6843 while (ret < sc->nr_to_scan &&
6844 conf->max_nr_stripes > conf->min_nr_stripes) {
6845 if (drop_one_stripe(conf) == 0) {
6846 ret = SHRINK_STOP;
6847 break;
6848 }
6849 ret++;
6850 }
6851 mutex_unlock(&conf->cache_size_mutex);
6852 }
6853 return ret;
6854}
6855
6856static unsigned long raid5_cache_count(struct shrinker *shrink,
6857 struct shrink_control *sc)
6858{
6859 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6860
6861 if (conf->max_nr_stripes < conf->min_nr_stripes)
6862 /* unlikely, but not impossible */
6863 return 0;
6864 return conf->max_nr_stripes - conf->min_nr_stripes;
6865}
6866
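/*
 * Allocate and initialise the r5conf for this array: validate level,
 * layout and chunk size, then set up the stripe cache, per-cpu buffers,
 * worker groups, shrinker and the raid5d thread.
 */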
6867static struct r5conf *setup_conf(struct mddev *mddev)
6868{
6869 struct r5conf *conf;
6870 int raid_disk, memory, max_disks;
6871 struct md_rdev *rdev;
6872 struct disk_info *disk;
6873 char pers_name[6];
6874 int i;
6875 int group_cnt, worker_cnt_per_group;
6876 struct r5worker_group *new_group;
6877 int ret;
6878
6879 if (mddev->new_level != 5
6880 && mddev->new_level != 4
6881 && mddev->new_level != 6) {
6882 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6883 mdname(mddev), mddev->new_level);
6884 return ERR_PTR(-EIO);
6885 }
6886 if ((mddev->new_level == 5
6887 && !algorithm_valid_raid5(mddev->new_layout)) ||
6888 (mddev->new_level == 6
6889 && !algorithm_valid_raid6(mddev->new_layout))) {
6890 pr_warn("md/raid:%s: layout %d not supported\n",
6891 mdname(mddev), mddev->new_layout);
6892 return ERR_PTR(-EIO);
6893 }
6894 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6895 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6896 mdname(mddev), mddev->raid_disks);
6897 return ERR_PTR(-EINVAL);
6898 }
6899
6900 if (!mddev->new_chunk_sectors ||
6901 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6902 !is_power_of_2(mddev->new_chunk_sectors)) {
6903 pr_warn("md/raid:%s: invalid chunk size %d\n",
6904 mdname(mddev), mddev->new_chunk_sectors << 9);
6905 return ERR_PTR(-EINVAL);
6906 }
6907
6908 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
6909 if (conf == NULL)
6910 goto abort;
6911 INIT_LIST_HEAD(&conf->free_list);
6912 INIT_LIST_HEAD(&conf->pending_list);
6913 conf->pending_data = kcalloc(PENDING_IO_MAX,
6914 sizeof(struct r5pending_data),
6915 GFP_KERNEL);
6916 if (!conf->pending_data)
6917 goto abort;
6918 for (i = 0; i < PENDING_IO_MAX; i++)
6919 list_add(&conf->pending_data[i].sibling, &conf->free_list);
6920
6921 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6922 &new_group)) {
6923 conf->group_cnt = group_cnt;
6924 conf->worker_cnt_per_group = worker_cnt_per_group;
6925 conf->worker_groups = new_group;
6926 } else
6927 goto abort;
6928 spin_lock_init(&conf->device_lock);
6929 seqcount_init(&conf->gen_lock);
6930 mutex_init(&conf->cache_size_mutex);
6931 init_waitqueue_head(&conf->wait_for_quiescent);
6932 init_waitqueue_head(&conf->wait_for_stripe);
6933 init_waitqueue_head(&conf->wait_for_overlap);
6934 INIT_LIST_HEAD(&conf->handle_list);
6935 INIT_LIST_HEAD(&conf->loprio_list);
6936 INIT_LIST_HEAD(&conf->hold_list);
6937 INIT_LIST_HEAD(&conf->delayed_list);
6938 INIT_LIST_HEAD(&conf->bitmap_list);
6939 init_llist_head(&conf->released_stripes);
6940 atomic_set(&conf->active_stripes, 0);
6941 atomic_set(&conf->preread_active_stripes, 0);
6942 atomic_set(&conf->active_aligned_reads, 0);
6943 spin_lock_init(&conf->pending_bios_lock);
6944 conf->batch_bio_dispatch = true;
6945 rdev_for_each(rdev, mddev) {
6946 if (test_bit(Journal, &rdev->flags))
6947 continue;
6948 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
6949 conf->batch_bio_dispatch = false;
6950 break;
6951 }
6952 }
6953
6954 conf->bypass_threshold = BYPASS_THRESHOLD;
6955 conf->recovery_disabled = mddev->recovery_disabled - 1;
6956
6957 conf->raid_disks = mddev->raid_disks;
6958 if (mddev->reshape_position == MaxSector)
6959 conf->previous_raid_disks = mddev->raid_disks;
6960 else
6961 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
6962 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
6963
6964 conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
6965 GFP_KERNEL);
6966
6967 if (!conf->disks)
6968 goto abort;
6969
6970 for (i = 0; i < max_disks; i++) {
6971 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
6972 if (!conf->disks[i].extra_page)
6973 goto abort;
6974 }
6975
6976 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
6977 if (ret)
6978 goto abort;
6979 conf->mddev = mddev;
6980
6981 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
6982 goto abort;
6983
6984 /* We init hash_locks[0] separately so that it can be used
6985  * as the reference lock in the spin_lock_nest_lock() call in
6986  * lock_all_device_hash_locks_irq(), in order to convince
6987  * lockdep that we know what we are doing.
6988  */
6989 spin_lock_init(conf->hash_locks);
6990 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
6991 spin_lock_init(conf->hash_locks + i);
6992
6993 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6994 INIT_LIST_HEAD(conf->inactive_list + i);
6995
6996 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6997 INIT_LIST_HEAD(conf->temp_inactive_list + i);
6998
6999 atomic_set(&conf->r5c_cached_full_stripes, 0);
7000 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7001 atomic_set(&conf->r5c_cached_partial_stripes, 0);
7002 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7003 atomic_set(&conf->r5c_flushing_full_stripes, 0);
7004 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7005
7006 conf->level = mddev->new_level;
7007 conf->chunk_sectors = mddev->new_chunk_sectors;
7008 if (raid5_alloc_percpu(conf) != 0)
7009 goto abort;
7010
7011 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7012
7013 rdev_for_each(rdev, mddev) {
7014 raid_disk = rdev->raid_disk;
7015 if (raid_disk >= max_disks
7016 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7017 continue;
7018 disk = conf->disks + raid_disk;
7019
7020 if (test_bit(Replacement, &rdev->flags)) {
7021 if (disk->replacement)
7022 goto abort;
7023 disk->replacement = rdev;
7024 } else {
7025 if (disk->rdev)
7026 goto abort;
7027 disk->rdev = rdev;
7028 }
7029
7030 if (test_bit(In_sync, &rdev->flags)) {
7031 char b[BDEVNAME_SIZE];
7032 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
7033 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
7034 } else if (rdev->saved_raid_disk != raid_disk)
7035 /* Cannot rely on bitmap to complete recovery */
7036 conf->fullsync = 1;
7037 }
7038
7039 conf->level = mddev->new_level;
7040 if (conf->level == 6) {
7041 conf->max_degraded = 2;
7042 if (raid6_call.xor_syndrome)
7043 conf->rmw_level = PARITY_ENABLE_RMW;
7044 else
7045 conf->rmw_level = PARITY_DISABLE_RMW;
7046 } else {
7047 conf->max_degraded = 1;
7048 conf->rmw_level = PARITY_ENABLE_RMW;
7049 }
7050 conf->algorithm = mddev->new_layout;
7051 conf->reshape_progress = mddev->reshape_position;
7052 if (conf->reshape_progress != MaxSector) {
7053 conf->prev_chunk_sectors = mddev->chunk_sectors;
7054 conf->prev_algo = mddev->layout;
7055 } else {
7056 conf->prev_chunk_sectors = conf->chunk_sectors;
7057 conf->prev_algo = conf->algorithm;
7058 }
7059
7060 conf->min_nr_stripes = NR_STRIPES;
7061 if (mddev->reshape_position != MaxSector) {
7062 int stripes = max_t(int,
7063 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
7064 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
7065 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7066 if (conf->min_nr_stripes != NR_STRIPES)
7067 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7068 mdname(mddev), conf->min_nr_stripes);
7069 }
7070 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7071 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7072 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7073 if (grow_stripes(conf, conf->min_nr_stripes)) {
7074 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7075 mdname(mddev), memory);
7076 goto abort;
7077 } else
7078 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7079
7080 /* Losing a stripe head is expensive to recover from, so bias the
7081  * shrinker (seeks) against reclaiming them too eagerly; it trims the
7082  * cache back towards min_nr_stripes under memory pressure.
7083  */
7084 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7085 conf->shrinker.scan_objects = raid5_cache_scan;
7086 conf->shrinker.count_objects = raid5_cache_count;
7087 conf->shrinker.batch = 128;
7088 conf->shrinker.flags = 0;
7089 if (register_shrinker(&conf->shrinker)) {
7090 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7091 mdname(mddev));
7092 goto abort;
7093 }
7094
7095 sprintf(pers_name, "raid%d", mddev->new_level);
7096 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7097 if (!conf->thread) {
7098 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7099 mdname(mddev));
7100 goto abort;
7101 }
7102
7103 return conf;
7104
7105 abort:
7106 if (conf) {
7107 free_conf(conf);
7108 return ERR_PTR(-EIO);
7109 } else
7110 return ERR_PTR(-ENOMEM);
7111}
7112
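/*
 * Return 1 if, in the given layout, the device at 'raid_disk' holds only
 * parity blocks, so its contents never need to be recovered.
 */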
7113static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7114{
7115 switch (algo) {
7116 case ALGORITHM_PARITY_0:
7117 if (raid_disk < max_degraded)
7118 return 1;
7119 break;
7120 case ALGORITHM_PARITY_N:
7121 if (raid_disk >= raid_disks - max_degraded)
7122 return 1;
7123 break;
7124 case ALGORITHM_PARITY_0_6:
7125 if (raid_disk == 0 ||
7126 raid_disk == raid_disks - 1)
7127 return 1;
7128 break;
7129 case ALGORITHM_LEFT_ASYMMETRIC_6:
7130 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7131 case ALGORITHM_LEFT_SYMMETRIC_6:
7132 case ALGORITHM_RIGHT_SYMMETRIC_6:
7133 if (raid_disk == raid_disks - 1)
7134 return 1;
7135 }
7136 return 0;
7137}
7138
7139static int raid5_run(struct mddev *mddev)
7140{
7141 struct r5conf *conf;
7142 int working_disks = 0;
7143 int dirty_parity_disks = 0;
7144 struct md_rdev *rdev;
7145 struct md_rdev *journal_dev = NULL;
7146 sector_t reshape_offset = 0;
7147 int i;
7148 long long min_offset_diff = 0;
7149 int first = 1;
7150
7151 if (mddev_init_writes_pending(mddev) < 0)
7152 return -ENOMEM;
7153
7154 if (mddev->recovery_cp != MaxSector)
7155 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7156 mdname(mddev));
7157
7158 rdev_for_each(rdev, mddev) {
7159 long long diff;
7160
7161 if (test_bit(Journal, &rdev->flags)) {
7162 journal_dev = rdev;
7163 continue;
7164 }
7165 if (rdev->raid_disk < 0)
7166 continue;
7167 diff = (rdev->new_data_offset - rdev->data_offset);
7168 if (first) {
7169 min_offset_diff = diff;
7170 first = 0;
7171 } else if (mddev->reshape_backwards &&
7172 diff < min_offset_diff)
7173 min_offset_diff = diff;
7174 else if (!mddev->reshape_backwards &&
7175 diff > min_offset_diff)
7176 min_offset_diff = diff;
7177 }
7178
7179 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7180 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7181 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7182 mdname(mddev));
7183 return -EINVAL;
7184 }
7185
7186 if (mddev->reshape_position != MaxSector) {
7187 /* Check that we can continue the reshape.
7188  * Difficulties arise if the stripe we would write to
7189  * next is at or after the stripe we would read from next.
7190  * For a reshape that changes the number of devices, this
7191  * is only possible for a very short time, and mdadm makes
7192  * sure that time appears to have passed before assembling
7193  * the array.  So we fail if that time hasn't passed.
7194  * For a reshape that keeps the number of devices the same,
7195  * mdadm must be monitoring the reshape, keeping the
7196  * critical areas read-only and backed up.  It will start
7197  * the array in read-only mode, so we check for that.
7198  */
7199 sector_t here_new, here_old;
7200 int old_disks;
7201 int max_degraded = (mddev->level == 6 ? 2 : 1);
7202 int chunk_sectors;
7203 int new_data_disks;
7204
7205 if (journal_dev) {
7206 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7207 mdname(mddev));
7208 return -EINVAL;
7209 }
7210
7211 if (mddev->new_level != mddev->level) {
7212 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7213 mdname(mddev));
7214 return -EINVAL;
7215 }
7216 old_disks = mddev->raid_disks - mddev->delta_disks;
7217 /* reshape_position must be on a new-stripe boundary, and one
7218  * further up in new geometry must map after here in old
7219  * geometry.
7220  * If the chunk sizes differ, then as reshape is performed in
7221  * units of the larger of the two, reshape_position needs to be
7222  * a multiple of the larger chunk size times the new data disks.
7223  */
7224 here_new = mddev->reshape_position;
7225 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7226 new_data_disks = mddev->raid_disks - max_degraded;
7227 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7228 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7229 mdname(mddev));
7230 return -EINVAL;
7231 }
7232 reshape_offset = here_new * chunk_sectors;
7233 /* here_new is the stripe we will write to */
7234 here_old = mddev->reshape_position;
7235 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7236 /* here_old is the first stripe that we might need to read
7237  * from */
7238 if (mddev->delta_disks == 0) {
7239 /* We cannot be sure it is safe to start an in-place
7240  * reshape.  It is only safe if user-space is monitoring
7241  * and taking constant backups.
7242  * mdadm always starts a situation like this in
7243  * readonly mode so it can take control before
7244  * allowing any writes.  So just check for that.
7245  */
7246 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7247 abs(min_offset_diff) >= mddev->new_chunk_sectors)
7248 ;
7249 else if (mddev->ro == 0) {
7250 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7251 mdname(mddev));
7252 return -EINVAL;
7253 }
7254 } else if (mddev->reshape_backwards
7255 ? (here_new * chunk_sectors + min_offset_diff <=
7256 here_old * chunk_sectors)
7257 : (here_new * chunk_sectors >=
7258 here_old * chunk_sectors + (-min_offset_diff))) {
7259 /* Reading from the same stripe as writing to - bad */
7260 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7261 mdname(mddev));
7262 return -EINVAL;
7263 }
7264 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7265
7266 } else {
7267 BUG_ON(mddev->level != mddev->new_level);
7268 BUG_ON(mddev->layout != mddev->new_layout);
7269 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7270 BUG_ON(mddev->delta_disks != 0);
7271 }
7272
7273 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7274 test_bit(MD_HAS_PPL, &mddev->flags)) {
7275 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7276 mdname(mddev));
7277 clear_bit(MD_HAS_PPL, &mddev->flags);
7278 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7279 }
7280
7281 if (mddev->private == NULL)
7282 conf = setup_conf(mddev);
7283 else
7284 conf = mddev->private;
7285
7286 if (IS_ERR(conf))
7287 return PTR_ERR(conf);
7288
7289 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7290 if (!journal_dev) {
7291 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7292 mdname(mddev));
7293 mddev->ro = 1;
7294 set_disk_ro(mddev->gendisk, 1);
7295 } else if (mddev->recovery_cp == MaxSector)
7296 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7297 }
7298
7299 conf->min_offset_diff = min_offset_diff;
7300 mddev->thread = conf->thread;
7301 conf->thread = NULL;
7302 mddev->private = conf;
7303
7304 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7305 i++) {
7306 rdev = conf->disks[i].rdev;
7307 if (!rdev && conf->disks[i].replacement) {
7308 /* The replacement is all we have yet */
7309 rdev = conf->disks[i].replacement;
7310 conf->disks[i].replacement = NULL;
7311 clear_bit(Replacement, &rdev->flags);
7312 conf->disks[i].rdev = rdev;
7313 }
7314 if (!rdev)
7315 continue;
7316 if (conf->disks[i].replacement &&
7317 conf->reshape_progress != MaxSector) {
7318 /* replacement and reshape cannot happen at the same time */
7319 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7320 goto abort;
7321 }
7322 if (test_bit(In_sync, &rdev->flags)) {
7323 working_disks++;
7324 continue;
7325 }
7326 /* This disc is not fully in-sync.  However if it
7327  * just stored parity (beyond the recovery_offset),
7328  * then we don't need to be concerned about the
7329  * array being dirty.
7330  * When reshape goes 'backwards', we never have
7331  * partially completed devices, so we only need
7332  * to worry about reshape going forwards.
7333  */
7334 /* Hack: v0.90 metadata does not record recovery_offset properly */
7335 if (mddev->major_version == 0 &&
7336 mddev->minor_version > 90)
7337 rdev->recovery_offset = reshape_offset;
7338
7339 if (rdev->recovery_offset < reshape_offset) {
7340 /* We need to check old and new layout */
7341 if (!only_parity(rdev->raid_disk,
7342 conf->algorithm,
7343 conf->raid_disks,
7344 conf->max_degraded))
7345 continue;
7346 }
7347 if (!only_parity(rdev->raid_disk,
7348 conf->prev_algo,
7349 conf->previous_raid_disks,
7350 conf->max_degraded))
7351 continue;
7352 dirty_parity_disks++;
7353 }
7354
7355 /*
7356  * 0 for a fully functional array, 1 or 2 for a degraded array.
7357  */
7358 mddev->degraded = raid5_calc_degraded(conf);
7359
7360 if (has_failed(conf)) {
7361 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7362 mdname(mddev), mddev->degraded, conf->raid_disks);
7363 goto abort;
7364 }
7365
7366 /* device size must be a multiple of chunk size */
7367 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
7368 mddev->resync_max_sectors = mddev->dev_sectors;
7369
7370 if (mddev->degraded > dirty_parity_disks &&
7371 mddev->recovery_cp != MaxSector) {
7372 if (test_bit(MD_HAS_PPL, &mddev->flags))
7373 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7374 mdname(mddev));
7375 else if (mddev->ok_start_degraded)
7376 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7377 mdname(mddev));
7378 else {
7379 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7380 mdname(mddev));
7381 goto abort;
7382 }
7383 }
7384
7385 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7386 mdname(mddev), conf->level,
7387 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7388 mddev->new_layout);
7389
7390 print_raid5_conf(conf);
7391
7392 if (conf->reshape_progress != MaxSector) {
7393 conf->reshape_safe = conf->reshape_progress;
7394 atomic_set(&conf->reshape_stripes, 0);
7395 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7396 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7397 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7398 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7399 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7400 "reshape");
7401 }
7402
7403
7404 if (mddev->to_remove == &raid5_attrs_group)
7405 mddev->to_remove = NULL;
7406 else if (mddev->kobj.sd &&
7407 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7408 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7409 mdname(mddev));
7410 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7411
7412 if (mddev->queue) {
7413 int chunk_size;
7414 /*
7415  * read-ahead size must cover two whole stripes, which is
7416  * 2 * (number of data disks) * chunksize.
7417  */
7418 int data_disks = conf->previous_raid_disks - conf->max_degraded;
7419 int stripe = data_disks *
7420 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7421 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7422 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
7423
7424 chunk_size = mddev->chunk_sectors << 9;
7425 blk_queue_io_min(mddev->queue, chunk_size);
7426 blk_queue_io_opt(mddev->queue, chunk_size *
7427 (conf->raid_disks - conf->max_degraded));
7428 mddev->queue->limits.raid_partial_stripes_expensive = 1;
7429 /*
7430  * We can only discard whole stripes; it makes no sense to
7431  * discard the data disks but keep writing the parity disk.
7432  */
7433 stripe = stripe * PAGE_SIZE;
7434 /* Round up to a power of 2, as the discard handling
7435  * currently assumes that */
7436 while ((stripe-1) & stripe)
7437 stripe = (stripe | (stripe-1)) + 1;
7438 mddev->queue->limits.discard_alignment = stripe;
7439 mddev->queue->limits.discard_granularity = stripe;
7440
7441 blk_queue_max_write_same_sectors(mddev->queue, 0);
7442 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7443
7444 rdev_for_each(rdev, mddev) {
7445 disk_stack_limits(mddev->gendisk, rdev->bdev,
7446 rdev->data_offset << 9);
7447 disk_stack_limits(mddev->gendisk, rdev->bdev,
7448 rdev->new_data_offset << 9);
7449 }
7450 /*
7451  * Discard is only safe if every device reliably returns zeroes for
7452  * discarded regions; otherwise a discarded stripe can become
7453  * inconsistent with its parity, and data may be corrupted by a
7454  * later partial rewrite or rebuild.  Since discard_zeroes_data can
7455  * no longer be trusted, only enable discard when the administrator
7456  * has set the devices_handle_discard_safely module parameter and
7457  * the queue limits allow whole-stripe discards.
7458  */
7466 if (devices_handle_discard_safely &&
7467 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7468 mddev->queue->limits.discard_granularity >= stripe)
7469 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
7470 mddev->queue);
7471 else
7472 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
7473 mddev->queue);
7474
7475 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7476 }
7477
7478 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7479 goto abort;
7480
7481 return 0;
7482abort:
7483 md_unregister_thread(&mddev->thread);
7484 print_raid5_conf(conf);
7485 free_conf(conf);
7486 mddev->private = NULL;
7487 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7488 return -EIO;
7489}
7490
7491static void raid5_free(struct mddev *mddev, void *priv)
7492{
7493 struct r5conf *conf = priv;
7494
7495 free_conf(conf);
7496 mddev->to_remove = &raid5_attrs_group;
7497}
7498
7499static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7500{
7501 struct r5conf *conf = mddev->private;
7502 int i;
7503
7504 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7505 conf->chunk_sectors / 2, mddev->layout);
7506 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7507 rcu_read_lock();
7508 for (i = 0; i < conf->raid_disks; i++) {
7509 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
7510 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7511 }
7512 rcu_read_unlock();
7513 seq_printf (seq, "]");
7514}
7515
7516static void print_raid5_conf (struct r5conf *conf)
7517{
7518 int i;
7519 struct disk_info *tmp;
7520
7521 pr_debug("RAID conf printout:\n");
7522 if (!conf) {
7523 pr_debug("(conf==NULL)\n");
7524 return;
7525 }
7526 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7527 conf->raid_disks,
7528 conf->raid_disks - conf->mddev->degraded);
7529
7530 for (i = 0; i < conf->raid_disks; i++) {
7531 char b[BDEVNAME_SIZE];
7532 tmp = conf->disks + i;
7533 if (tmp->rdev)
7534 pr_debug(" disk %d, o:%d, dev:%s\n",
7535 i, !test_bit(Faulty, &tmp->rdev->flags),
7536 bdevname(tmp->rdev->bdev, b));
7537 }
7538}
7539
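/*
 * Mark devices that have completed recovery as In_sync, promoting a
 * finished replacement over the device it replaces, and return how many
 * devices became active.
 */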
7540static int raid5_spare_active(struct mddev *mddev)
7541{
7542 int i;
7543 struct r5conf *conf = mddev->private;
7544 struct disk_info *tmp;
7545 int count = 0;
7546 unsigned long flags;
7547
7548 for (i = 0; i < conf->raid_disks; i++) {
7549 tmp = conf->disks + i;
7550 if (tmp->replacement
7551 && tmp->replacement->recovery_offset == MaxSector
7552 && !test_bit(Faulty, &tmp->replacement->flags)
7553 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
7554 /* Replacement has just become active. */
7555 if (!tmp->rdev
7556 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7557 count++;
7558 if (tmp->rdev) {
7559 /* Replaced device is not technically faulty,
7560  * but we need to be sure it gets removed
7561  * and never re-added.
7562  */
7563 set_bit(Faulty, &tmp->rdev->flags);
7564 sysfs_notify_dirent_safe(
7565 tmp->rdev->sysfs_state);
7566 }
7567 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7568 } else if (tmp->rdev
7569 && tmp->rdev->recovery_offset == MaxSector
7570 && !test_bit(Faulty, &tmp->rdev->flags)
7571 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7572 count++;
7573 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7574 }
7575 }
7576 spin_lock_irqsave(&conf->device_lock, flags);
7577 mddev->degraded = raid5_calc_degraded(conf);
7578 spin_unlock_irqrestore(&conf->device_lock, flags);
7579 print_raid5_conf(conf);
7580 return count;
7581}
7582
7583static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7584{
7585 struct r5conf *conf = mddev->private;
7586 int err = 0;
7587 int number = rdev->raid_disk;
7588 struct md_rdev **rdevp;
7589 struct disk_info *p = conf->disks + number;
7590
7591 print_raid5_conf(conf);
7592 if (test_bit(Journal, &rdev->flags) && conf->log) {
7593 /*
7594  * We can't wait for pending writes here, as raid5d might be
7595  * tied up with the log.  Only allow the journal to be removed
7596  * once nothing still references it: no active stripes and an
7597  * empty write-back cache.
7598  */
7599 if (atomic_read(&conf->active_stripes) ||
7600 atomic_read(&conf->r5c_cached_full_stripes) ||
7601 atomic_read(&conf->r5c_cached_partial_stripes)) {
7602 return -EBUSY;
7603 }
7604 log_exit(conf);
7605 return 0;
7606 }
7607 if (rdev == p->rdev)
7608 rdevp = &p->rdev;
7609 else if (rdev == p->replacement)
7610 rdevp = &p->replacement;
7611 else
7612 return 0;
7613
7614 if (number >= conf->raid_disks &&
7615 conf->reshape_progress == MaxSector)
7616 clear_bit(In_sync, &rdev->flags);
7617
7618 if (test_bit(In_sync, &rdev->flags) ||
7619 atomic_read(&rdev->nr_pending)) {
7620 err = -EBUSY;
7621 goto abort;
7622 }
7623
7624 /* Only remove non-faulty devices if recovery
7625  * is not possible. */
7626 if (!test_bit(Faulty, &rdev->flags) &&
7627 mddev->recovery_disabled != conf->recovery_disabled &&
7628 !has_failed(conf) &&
7629 (!p->replacement || p->replacement == rdev) &&
7630 number < conf->raid_disks) {
7631 err = -EBUSY;
7632 goto abort;
7633 }
7634 *rdevp = NULL;
7635 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7636 synchronize_rcu();
7637 if (atomic_read(&rdev->nr_pending)) {
7638 /* lost the race, try again later */
7639 err = -EBUSY;
7640 *rdevp = rdev;
7641 }
7642 }
7643 if (!err) {
7644 err = log_modify(conf, rdev, false);
7645 if (err)
7646 goto abort;
7647 }
7648 if (p->replacement) {
7649 /* We must have just cleared 'rdev' */
7650 p->rdev = p->replacement;
7651 clear_bit(Replacement, &p->replacement->flags);
7652 smp_mb();
7653 /* Make sure other CPUs see both rdev and replacement as
7654  * identical, and never see neither - if they are careful. */
7655 p->replacement = NULL;
7656
7657 if (!err)
7658 err = log_modify(conf, p->rdev, true);
7659 }
7660
7661 clear_bit(WantReplacement, &rdev->flags);
7662abort:
7663
7664 print_raid5_conf(conf);
7665 return err;
7666}
7667
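/*
 * Hot-add a device: re-attach a journal, fill an empty slot (preferring
 * the device's previous slot), or register it as a replacement.
 */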
7668static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7669{
7670 struct r5conf *conf = mddev->private;
7671 int err = -EEXIST;
7672 int disk;
7673 struct disk_info *p;
7674 int first = 0;
7675 int last = conf->raid_disks - 1;
7676
7677 if (test_bit(Journal, &rdev->flags)) {
7678 if (conf->log)
7679 return -EBUSY;
7680
7681 rdev->raid_disk = 0;
7682 /* The array is in read-only mode if the journal is missing,
7683  * so no write requests are running and it is safe to
7684  * re-enable the log with the new journal device.
7685  */
7686 log_init(conf, rdev, false);
7687 return 0;
7688 }
7689 if (mddev->recovery_disabled == conf->recovery_disabled)
7690 return -EBUSY;
7691
7692 if (rdev->saved_raid_disk < 0 && has_failed(conf))
7693 /* no point adding a device */
7694 return -EINVAL;
7695
7696 if (rdev->raid_disk >= 0)
7697 first = last = rdev->raid_disk;
7698
7699 /*
7700  * Find a free slot, but prefer rdev->saved_raid_disk if it is
7701  * still available, so a returning device goes back where it was.
7702  */
7703 if (rdev->saved_raid_disk >= 0 &&
7704 rdev->saved_raid_disk >= first &&
7705 conf->disks[rdev->saved_raid_disk].rdev == NULL)
7706 first = rdev->saved_raid_disk;
7707
7708 for (disk = first; disk <= last; disk++) {
7709 p = conf->disks + disk;
7710 if (p->rdev == NULL) {
7711 clear_bit(In_sync, &rdev->flags);
7712 rdev->raid_disk = disk;
7713 if (rdev->saved_raid_disk != disk)
7714 conf->fullsync = 1;
7715 rcu_assign_pointer(p->rdev, rdev);
7716
7717 err = log_modify(conf, rdev, true);
7718
7719 goto out;
7720 }
7721 }
7722 for (disk = first; disk <= last; disk++) {
7723 p = conf->disks + disk;
7724 if (test_bit(WantReplacement, &p->rdev->flags) &&
7725 p->replacement == NULL) {
7726 clear_bit(In_sync, &rdev->flags);
7727 set_bit(Replacement, &rdev->flags);
7728 rdev->raid_disk = disk;
7729 err = 0;
7730 conf->fullsync = 1;
7731 rcu_assign_pointer(p->replacement, rdev);
7732 break;
7733 }
7734 }
7735out:
7736 print_raid5_conf(conf);
7737 return err;
7738}
7739
7740static int raid5_resize(struct mddev *mddev, sector_t sectors)
7741{
7742 /* no resync is happening, and there is enough space
7743  * on all devices, so we can resize.
7744  * We need to make sure resync covers any new space.
7745  * If the array is shrinking we should possibly wait until
7746  * any io in the removed space completes, but it hardly seems
7747  * worth it.
7748  */
7749 sector_t newsize;
7750 struct r5conf *conf = mddev->private;
7751
7752 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7753 return -EINVAL;
7754 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7755 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
7756 if (mddev->external_size &&
7757 mddev->array_sectors > newsize)
7758 return -EINVAL;
7759 if (mddev->bitmap) {
7760 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
7761 if (ret)
7762 return ret;
7763 }
7764 md_set_array_sectors(mddev, newsize);
7765 if (sectors > mddev->dev_sectors &&
7766 mddev->recovery_cp > mddev->dev_sectors) {
7767 mddev->recovery_cp = mddev->dev_sectors;
7768 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7769 }
7770 mddev->dev_sectors = sectors;
7771 mddev->resync_max_sectors = sectors;
7772 return 0;
7773}
7774
7775static int check_stripe_cache(struct mddev *mddev)
7776{
7777 /* Can only proceed if there are plenty of stripe_heads.
7778  * We need a minimum of one full stripe, and for sensible progress
7779  * it is best to have about 4 times that.
7780  * If we require 4 times, then the default 256 is not big enough for
7781  * larger chunk sizes, so the stripe cache may need to be enlarged
7782  * (via stripe_cache_size) before the reshape can proceed.
7783  */
7785 struct r5conf *conf = mddev->private;
7786 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
7787 > conf->min_nr_stripes ||
7788 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
7789 > conf->min_nr_stripes) {
7790 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
7791 mdname(mddev),
7792 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7793 / STRIPE_SIZE)*4);
7794 return 0;
7795 }
7796 return 1;
7797}
7798
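/*
 * Validate a requested reshape and, if needed, enlarge the scribble
 * buffers and the stripe pool before it is allowed to start.
 */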
7799static int check_reshape(struct mddev *mddev)
7800{
7801 struct r5conf *conf = mddev->private;
7802
7803 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7804 return -EINVAL;
7805 if (mddev->delta_disks == 0 &&
7806 mddev->new_layout == mddev->layout &&
7807 mddev->new_chunk_sectors == mddev->chunk_sectors)
7808 return 0;
7809 if (has_failed(conf))
7810 return -EINVAL;
7811 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
7812 /* We might be able to shrink, but the devices must
7813  * be made bigger first.
7814  * For raid6, 4 is the minimum size.
7815  * Otherwise 2 is the minimum.
7816  */
7817 int min = 2;
7818 if (mddev->level == 6)
7819 min = 4;
7820 if (mddev->raid_disks + mddev->delta_disks < min)
7821 return -EINVAL;
7822 }
7823
7824 if (!check_stripe_cache(mddev))
7825 return -ENOSPC;
7826
7827 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
7828 mddev->delta_disks > 0)
7829 if (resize_chunks(conf,
7830 conf->previous_raid_disks
7831 + max(0, mddev->delta_disks),
7832 max(mddev->new_chunk_sectors,
7833 mddev->chunk_sectors)
7834 ) < 0)
7835 return -ENOMEM;
7836
7837 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
7838 return 0;
7839 return resize_stripes(conf, (conf->previous_raid_disks
7840 + mddev->delta_disks));
7841}
7842
7843static int raid5_start_reshape(struct mddev *mddev)
7844{
7845 struct r5conf *conf = mddev->private;
7846 struct md_rdev *rdev;
7847 int spares = 0;
7848 unsigned long flags;
7849
7850 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7851 return -EBUSY;
7852
7853 if (!check_stripe_cache(mddev))
7854 return -ENOSPC;
7855
7856 if (has_failed(conf))
7857 return -EINVAL;
7858
7859 rdev_for_each(rdev, mddev) {
7860 if (!test_bit(In_sync, &rdev->flags)
7861 && !test_bit(Faulty, &rdev->flags))
7862 spares++;
7863 }
7864
7865 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
7866 /* Not enough devices even to make a degraded array
7867  * of that size.
7868  */
7869 return -EINVAL;
7870
7871 /* Refuse to reduce the size of the array.  Any reduction in
7872  * array size must be done through an explicit setting of the
7873  * array_size attribute.
7874  */
7875 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7876 < mddev->array_sectors) {
7877 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
7878 mdname(mddev));
7879 return -EINVAL;
7880 }
7881
7882 atomic_set(&conf->reshape_stripes, 0);
7883 spin_lock_irq(&conf->device_lock);
7884 write_seqcount_begin(&conf->gen_lock);
7885 conf->previous_raid_disks = conf->raid_disks;
7886 conf->raid_disks += mddev->delta_disks;
7887 conf->prev_chunk_sectors = conf->chunk_sectors;
7888 conf->chunk_sectors = mddev->new_chunk_sectors;
7889 conf->prev_algo = conf->algorithm;
7890 conf->algorithm = mddev->new_layout;
7891 conf->generation++;
7892
7893 /* Code that selects data_offset needs to see the generation update
7894  * if reshape_progress has been set - so a memory barrier is needed. */
7895 smp_mb();
7896 if (mddev->reshape_backwards)
7897 conf->reshape_progress = raid5_size(mddev, 0, 0);
7898 else
7899 conf->reshape_progress = 0;
7900 conf->reshape_safe = conf->reshape_progress;
7901 write_seqcount_end(&conf->gen_lock);
7902 spin_unlock_irq(&conf->device_lock);
7903
7904 /* Quiesce the array so that requests in flight complete and
7905  * subsequent requests see the updated geometry before any spares
7906  * are added and the reshape starts.
7907  */
7908 mddev_suspend(mddev);
7909 mddev_resume(mddev);
7910
7911 /* Add some new drives, as many as will fit.
7912  * We know there are enough to make the newly sized array work.
7913  * Don't add devices if we are reducing the number of
7914  * devices in the array.  This is because it is not possible
7915  * to correctly record the "partially reconstructed" state of
7916  * such devices during the reshape and confusion could result.
7917  */
7918 if (mddev->delta_disks >= 0) {
7919 rdev_for_each(rdev, mddev)
7920 if (rdev->raid_disk < 0 &&
7921 !test_bit(Faulty, &rdev->flags)) {
7922 if (raid5_add_disk(mddev, rdev) == 0) {
7923 if (rdev->raid_disk
7924 >= conf->previous_raid_disks)
7925 set_bit(In_sync, &rdev->flags);
7926 else
7927 rdev->recovery_offset = 0;
7928
7929 if (sysfs_link_rdev(mddev, rdev))
7930 /* Failure here is OK */;
7931 }
7932 } else if (rdev->raid_disk >= conf->previous_raid_disks
7933 && !test_bit(Faulty, &rdev->flags)) {
7934 /* This is a spare that was manually added */
7935 set_bit(In_sync, &rdev->flags);
7936 }
7937
7938 /* When a reshape changes the number of devices,
7939  * ->degraded is measured against the larger of the
7940  * pre and post numbers of devices.
7941  */
7942 spin_lock_irqsave(&conf->device_lock, flags);
7943 mddev->degraded = raid5_calc_degraded(conf);
7944 spin_unlock_irqrestore(&conf->device_lock, flags);
7945 }
7946 mddev->raid_disks = conf->raid_disks;
7947 mddev->reshape_position = conf->reshape_progress;
7948 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7949
7950 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7951 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7952 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7953 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7954 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7955 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7956 "reshape");
7957 if (!mddev->sync_thread) {
7958 mddev->recovery = 0;
7959 spin_lock_irq(&conf->device_lock);
7960 write_seqcount_begin(&conf->gen_lock);
7961 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
7962 mddev->new_chunk_sectors =
7963 conf->chunk_sectors = conf->prev_chunk_sectors;
7964 mddev->new_layout = conf->algorithm = conf->prev_algo;
7965 rdev_for_each(rdev, mddev)
7966 rdev->new_data_offset = rdev->data_offset;
7967 smp_wmb();
7968 conf->generation--;
7969 conf->reshape_progress = MaxSector;
7970 mddev->reshape_position = MaxSector;
7971 write_seqcount_end(&conf->gen_lock);
7972 spin_unlock_irq(&conf->device_lock);
7973 return -EAGAIN;
7974 }
7975 conf->reshape_checkpoint = jiffies;
7976 md_wakeup_thread(mddev->sync_thread);
7977 md_new_event(mddev);
7978 return 0;
7979}
7980
7981/* This is called from the reshape thread and should make any
7982 * changes needed in 'conf'.
7983 */
7984static void end_reshape(struct r5conf *conf)
7985{
7986
7987 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
7988 struct md_rdev *rdev;
7989
7990 spin_lock_irq(&conf->device_lock);
7991 conf->previous_raid_disks = conf->raid_disks;
7992 md_finish_reshape(conf->mddev);
7993 smp_wmb();
7994 conf->reshape_progress = MaxSector;
7995 conf->mddev->reshape_position = MaxSector;
7996 rdev_for_each(rdev, conf->mddev)
7997 if (rdev->raid_disk >= 0 &&
7998 !test_bit(Journal, &rdev->flags) &&
7999 !test_bit(In_sync, &rdev->flags))
8000 rdev->recovery_offset = MaxSector;
8001 spin_unlock_irq(&conf->device_lock);
8002 wake_up(&conf->wait_for_overlap);
8003 /* read-ahead size must cover two whole stripes, which
8004  * is 2 * (new number of data disks) * chunksize.
8005  */
8007 if (conf->mddev->queue) {
8008 int data_disks = conf->raid_disks - conf->max_degraded;
8009 int stripe = data_disks * ((conf->chunk_sectors << 9)
8010 / PAGE_SIZE);
8011 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
8012 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
8013 }
8014 }
8015}
8016
8017/* This is called from the raid5d thread with mddev_lock held.
8018 * It makes config changes to the device.
8019 */
8020static void raid5_finish_reshape(struct mddev *mddev)
8021{
8022 struct r5conf *conf = mddev->private;
8023
8024 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8025
8026 if (mddev->delta_disks <= 0) {
8027 int d;
8028 spin_lock_irq(&conf->device_lock);
8029 mddev->degraded = raid5_calc_degraded(conf);
8030 spin_unlock_irq(&conf->device_lock);
8031 for (d = conf->raid_disks ;
8032 d < conf->raid_disks - mddev->delta_disks;
8033 d++) {
8034 struct md_rdev *rdev = conf->disks[d].rdev;
8035 if (rdev)
8036 clear_bit(In_sync, &rdev->flags);
8037 rdev = conf->disks[d].replacement;
8038 if (rdev)
8039 clear_bit(In_sync, &rdev->flags);
8040 }
8041 }
8042 mddev->layout = conf->algorithm;
8043 mddev->chunk_sectors = conf->chunk_sectors;
8044 mddev->reshape_position = MaxSector;
8045 mddev->delta_disks = 0;
8046 mddev->reshape_backwards = 0;
8047 }
8048}
8049
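/*
 * quiesce != 0: flush the write-back cache, block new activity and wait
 * for active stripes and aligned reads to drain.
 * quiesce == 0: resume normal operation.
 */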
8050static void raid5_quiesce(struct mddev *mddev, int quiesce)
8051{
8052 struct r5conf *conf = mddev->private;
8053
8054 if (quiesce) {
8055 /* stop all writes */
8056 lock_all_device_hash_locks_irq(conf);
8057 /* quiesce == 2 tells resync/reshape to pause so that all
8058  * active stripes can drain.
8059  */
8060 r5c_flush_cache(conf, INT_MAX);
8061 conf->quiesce = 2;
8062 wait_event_cmd(conf->wait_for_quiescent,
8063 atomic_read(&conf->active_stripes) == 0 &&
8064 atomic_read(&conf->active_aligned_reads) == 0,
8065 unlock_all_device_hash_locks_irq(conf),
8066 lock_all_device_hash_locks_irq(conf));
8067 conf->quiesce = 1;
8068 unlock_all_device_hash_locks_irq(conf);
8069 /* allow reshape to continue */
8070 wake_up(&conf->wait_for_overlap);
8071 } else {
8072 /* re-enable writes */
8073 lock_all_device_hash_locks_irq(conf);
8074 conf->quiesce = 0;
8075 wake_up(&conf->wait_for_quiescent);
8076 wake_up(&conf->wait_for_overlap);
8077 unlock_all_device_hash_locks_irq(conf);
8078 }
8079 log_quiesce(conf, quiesce);
8080}
8081
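/*
 * Take over a single-zone raid0 array as a degraded raid4/5 array by
 * adding one (initially missing) parity device.
 */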
8082static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8083{
8084 struct r0conf *raid0_conf = mddev->private;
8085 sector_t sectors;
8086
8087 /* for raid0 takeover only one zone is supported */
8088 if (raid0_conf->nr_strip_zones > 1) {
8089 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8090 mdname(mddev));
8091 return ERR_PTR(-EINVAL);
8092 }
8093
8094 sectors = raid0_conf->strip_zone[0].zone_end;
8095 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8096 mddev->dev_sectors = sectors;
8097 mddev->new_level = level;
8098 mddev->new_layout = ALGORITHM_PARITY_N;
8099 mddev->new_chunk_sectors = mddev->chunk_sectors;
8100 mddev->raid_disks += 1;
8101 mddev->delta_disks = 1;
8102 /* make sure the array will not be marked as dirty */
8103 mddev->recovery_cp = MaxSector;
8104
8105 return setup_conf(mddev);
8106}
8107
8108static void *raid5_takeover_raid1(struct mddev *mddev)
8109{
8110 int chunksect;
8111 void *ret;
8112
8113 if (mddev->raid_disks != 2 ||
8114 mddev->degraded > 1)
8115 return ERR_PTR(-EINVAL);
8116
8117
8118
8119 chunksect = 64*2;
8120
8121 /* The array must be an exact multiple of chunksize */
8122 while (chunksect && (mddev->array_sectors & (chunksect-1)))
8123 chunksect >>= 1;
8124
8125 if ((chunksect<<9) < STRIPE_SIZE)
8126 /* array size does not allow a suitable chunk size */
8127 return ERR_PTR(-EINVAL);
8128
8129 mddev->new_level = 5;
8130 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8131 mddev->new_chunk_sectors = chunksect;
8132
8133 ret = setup_conf(mddev);
8134 if (!IS_ERR(ret))
8135 mddev_clear_unsupported_flags(mddev,
8136 UNSUPPORTED_MDDEV_FLAGS);
8137 return ret;
8138}
8139
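/*
 * Take over a raid6 array as raid5 by dropping the Q device and mapping
 * each *_6 layout to its raid5 equivalent.
 */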
8140static void *raid5_takeover_raid6(struct mddev *mddev)
8141{
8142 int new_layout;
8143
8144 switch (mddev->layout) {
8145 case ALGORITHM_LEFT_ASYMMETRIC_6:
8146 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8147 break;
8148 case ALGORITHM_RIGHT_ASYMMETRIC_6:
8149 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8150 break;
8151 case ALGORITHM_LEFT_SYMMETRIC_6:
8152 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8153 break;
8154 case ALGORITHM_RIGHT_SYMMETRIC_6:
8155 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8156 break;
8157 case ALGORITHM_PARITY_0_6:
8158 new_layout = ALGORITHM_PARITY_0;
8159 break;
8160 case ALGORITHM_PARITY_N:
8161 new_layout = ALGORITHM_PARITY_N;
8162 break;
8163 default:
8164 return ERR_PTR(-EINVAL);
8165 }
8166 mddev->new_level = 5;
8167 mddev->new_layout = new_layout;
8168 mddev->delta_disks = -1;
8169 mddev->raid_disks -= 1;
8170 return setup_conf(mddev);
8171}
8172
8173static int raid5_check_reshape(struct mddev *mddev)
8174{
8175 /* For a 2-drive array, the layout and chunk size can be changed
8176  * immediately as no restriping is needed.
8177  * For larger arrays we record the new value - after validation
8178  * to be used by a reshape pass.
8179  */
8180 struct r5conf *conf = mddev->private;
8181 int new_chunk = mddev->new_chunk_sectors;
8182
8183 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8184 return -EINVAL;
8185 if (new_chunk > 0) {
8186 if (!is_power_of_2(new_chunk))
8187 return -EINVAL;
8188 if (new_chunk < (PAGE_SIZE>>9))
8189 return -EINVAL;
8190 if (mddev->array_sectors & (new_chunk-1))
8191 /* not a factor of the array size */
8192 return -EINVAL;
8193 }
8194
8195 /* They look valid */
8196
8197 if (mddev->raid_disks == 2) {
8198 /* can make the change immediately */
8199 if (mddev->new_layout >= 0) {
8200 conf->algorithm = mddev->new_layout;
8201 mddev->layout = mddev->new_layout;
8202 }
8203 if (new_chunk > 0) {
8204 conf->chunk_sectors = new_chunk;
8205 mddev->chunk_sectors = new_chunk;
8206 }
8207 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8208 md_wakeup_thread(mddev->thread);
8209 }
8210 return check_reshape(mddev);
8211}
8212
8213static int raid6_check_reshape(struct mddev *mddev)
8214{
8215 int new_chunk = mddev->new_chunk_sectors;
8216
8217 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8218 return -EINVAL;
8219 if (new_chunk > 0) {
8220 if (!is_power_of_2(new_chunk))
8221 return -EINVAL;
8222 if (new_chunk < (PAGE_SIZE >> 9))
8223 return -EINVAL;
8224 if (mddev->array_sectors & (new_chunk-1))
8225 /* not a factor of the array size */
8226 return -EINVAL;
8227 }
8228
8229 /* They look valid */
8230 return check_reshape(mddev);
8231}
8232
8233static void *raid5_takeover(struct mddev *mddev)
8234{
8235 /* raid5 can take over:
8236  *  raid0 - if there is only one strip zone - make it a raid4 layout
8237  *  raid1 - if there are two drives.  We need to know the chunk size
8238  *  raid4 - trivial - just use a raid4 layout.
8239  *  raid6 - providing it is a *_6 layout
8240  */
8241 if (mddev->level == 0)
8242 return raid45_takeover_raid0(mddev, 5);
8243 if (mddev->level == 1)
8244 return raid5_takeover_raid1(mddev);
8245 if (mddev->level == 4) {
8246 mddev->new_layout = ALGORITHM_PARITY_N;
8247 mddev->new_level = 5;
8248 return setup_conf(mddev);
8249 }
8250 if (mddev->level == 6)
8251 return raid5_takeover_raid6(mddev);
8252
8253 return ERR_PTR(-EINVAL);
8254}
8255
8256static void *raid4_takeover(struct mddev *mddev)
8257{
8258 /* raid4 can take over:
8259  *  raid0 - if there is only one strip zone
8260  *  raid5 - if layout is right (ALGORITHM_PARITY_N)
8261  */
8262 if (mddev->level == 0)
8263 return raid45_takeover_raid0(mddev, 4);
8264 if (mddev->level == 5 &&
8265 mddev->layout == ALGORITHM_PARITY_N) {
8266 mddev->new_layout = 0;
8267 mddev->new_level = 4;
8268 return setup_conf(mddev);
8269 }
8270 return ERR_PTR(-EINVAL);
8271}
8272
8273static struct md_personality raid5_personality;
8274
8275static void *raid6_takeover(struct mddev *mddev)
8276{
8277 /* Currently can only take over a raid5.  We map the
8278  * personality to an equivalent raid6 personality
8279  * with the Q block at the end.
8280  */
8281 int new_layout;
8282
8283 if (mddev->pers != &raid5_personality)
8284 return ERR_PTR(-EINVAL);
8285 if (mddev->degraded > 1)
8286 return ERR_PTR(-EINVAL);
8287 if (mddev->raid_disks > 253)
8288 return ERR_PTR(-EINVAL);
8289 if (mddev->raid_disks < 3)
8290 return ERR_PTR(-EINVAL);
8291
8292 switch (mddev->layout) {
8293 case ALGORITHM_LEFT_ASYMMETRIC:
8294 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8295 break;
8296 case ALGORITHM_RIGHT_ASYMMETRIC:
8297 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8298 break;
8299 case ALGORITHM_LEFT_SYMMETRIC:
8300 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8301 break;
8302 case ALGORITHM_RIGHT_SYMMETRIC:
8303 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8304 break;
8305 case ALGORITHM_PARITY_0:
8306 new_layout = ALGORITHM_PARITY_0_6;
8307 break;
8308 case ALGORITHM_PARITY_N:
8309 new_layout = ALGORITHM_PARITY_N;
8310 break;
8311 default:
8312 return ERR_PTR(-EINVAL);
8313 }
8314 mddev->new_level = 6;
8315 mddev->new_layout = new_layout;
8316 mddev->delta_disks = 1;
8317 mddev->raid_disks += 1;
8318 return setup_conf(mddev);
8319}
8320
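/*
 * Handle writes to the consistency_policy attribute: "ppl" enables the
 * partial parity log on raid5, "resync" tears down PPL or drops a failed
 * journal and reverts to plain resync.
 */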
8321static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8322{
8323 struct r5conf *conf;
8324 int err;
8325
8326 err = mddev_lock(mddev);
8327 if (err)
8328 return err;
8329 conf = mddev->private;
8330 if (!conf) {
8331 mddev_unlock(mddev);
8332 return -ENODEV;
8333 }
8334
8335 if (strncmp(buf, "ppl", 3) == 0) {
8336 /* ppl only works with RAID 5 */
8337 if (!raid5_has_ppl(conf) && conf->level == 5) {
8338 err = log_init(conf, NULL, true);
8339 if (!err) {
8340 err = resize_stripes(conf, conf->pool_size);
8341 if (err)
8342 log_exit(conf);
8343 }
8344 } else
8345 err = -EINVAL;
8346 } else if (strncmp(buf, "resync", 6) == 0) {
8347 if (raid5_has_ppl(conf)) {
8348 mddev_suspend(mddev);
8349 log_exit(conf);
8350 mddev_resume(mddev);
8351 err = resize_stripes(conf, conf->pool_size);
8352 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8353 r5l_log_disk_error(conf)) {
8354 bool journal_dev_exists = false;
8355 struct md_rdev *rdev;
8356
8357 rdev_for_each(rdev, mddev)
8358 if (test_bit(Journal, &rdev->flags)) {
8359 journal_dev_exists = true;
8360 break;
8361 }
8362
8363 if (!journal_dev_exists) {
8364 mddev_suspend(mddev);
8365 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8366 mddev_resume(mddev);
8367 } else
8368 err = -EBUSY;
8369 } else
8370 err = -EINVAL;
8371 } else {
8372 err = -EINVAL;
8373 }
8374
8375 if (!err)
8376 md_update_sb(mddev, 1);
8377
8378 mddev_unlock(mddev);
8379
8380 return err;
8381}
8382
8383static int raid5_start(struct mddev *mddev)
8384{
8385 struct r5conf *conf = mddev->private;
8386
8387 return r5l_start(conf->log);
8388}
8389
8390static struct md_personality raid6_personality =
8391{
8392 .name = "raid6",
8393 .level = 6,
8394 .owner = THIS_MODULE,
8395 .make_request = raid5_make_request,
8396 .run = raid5_run,
8397 .start = raid5_start,
8398 .free = raid5_free,
8399 .status = raid5_status,
8400 .error_handler = raid5_error,
8401 .hot_add_disk = raid5_add_disk,
8402 .hot_remove_disk= raid5_remove_disk,
8403 .spare_active = raid5_spare_active,
8404 .sync_request = raid5_sync_request,
8405 .resize = raid5_resize,
8406 .size = raid5_size,
8407 .check_reshape = raid6_check_reshape,
8408 .start_reshape = raid5_start_reshape,
8409 .finish_reshape = raid5_finish_reshape,
8410 .quiesce = raid5_quiesce,
8411 .takeover = raid6_takeover,
8412 .congested = raid5_congested,
8413 .change_consistency_policy = raid5_change_consistency_policy,
8414};
8415static struct md_personality raid5_personality =
8416{
8417 .name = "raid5",
8418 .level = 5,
8419 .owner = THIS_MODULE,
8420 .make_request = raid5_make_request,
8421 .run = raid5_run,
8422 .start = raid5_start,
8423 .free = raid5_free,
8424 .status = raid5_status,
8425 .error_handler = raid5_error,
8426 .hot_add_disk = raid5_add_disk,
8427 .hot_remove_disk= raid5_remove_disk,
8428 .spare_active = raid5_spare_active,
8429 .sync_request = raid5_sync_request,
8430 .resize = raid5_resize,
8431 .size = raid5_size,
8432 .check_reshape = raid5_check_reshape,
8433 .start_reshape = raid5_start_reshape,
8434 .finish_reshape = raid5_finish_reshape,
8435 .quiesce = raid5_quiesce,
8436 .takeover = raid5_takeover,
8437 .congested = raid5_congested,
8438 .change_consistency_policy = raid5_change_consistency_policy,
8439};
8440
8441static struct md_personality raid4_personality =
8442{
8443 .name = "raid4",
8444 .level = 4,
8445 .owner = THIS_MODULE,
8446 .make_request = raid5_make_request,
8447 .run = raid5_run,
8448 .start = raid5_start,
8449 .free = raid5_free,
8450 .status = raid5_status,
8451 .error_handler = raid5_error,
8452 .hot_add_disk = raid5_add_disk,
8453 .hot_remove_disk= raid5_remove_disk,
8454 .spare_active = raid5_spare_active,
8455 .sync_request = raid5_sync_request,
8456 .resize = raid5_resize,
8457 .size = raid5_size,
8458 .check_reshape = raid5_check_reshape,
8459 .start_reshape = raid5_start_reshape,
8460 .finish_reshape = raid5_finish_reshape,
8461 .quiesce = raid5_quiesce,
8462 .takeover = raid4_takeover,
8463 .congested = raid5_congested,
8464 .change_consistency_policy = raid5_change_consistency_policy,
8465};
8466
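/*
 * Module init: create the unbound raid5 workqueue, register the CPU
 * hotplug callbacks for the per-cpu buffers, then register the
 * raid4/5/6 personalities.
 */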
8467static int __init raid5_init(void)
8468{
8469 int ret;
8470
8471 raid5_wq = alloc_workqueue("raid5wq",
8472 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
8473 if (!raid5_wq)
8474 return -ENOMEM;
8475
8476 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
8477 "md/raid5:prepare",
8478 raid456_cpu_up_prepare,
8479 raid456_cpu_dead);
8480 if (ret) {
8481 destroy_workqueue(raid5_wq);
8482 return ret;
8483 }
8484 register_md_personality(&raid6_personality);
8485 register_md_personality(&raid5_personality);
8486 register_md_personality(&raid4_personality);
8487 return 0;
8488}
8489
8490static void raid5_exit(void)
8491{
8492 unregister_md_personality(&raid6_personality);
8493 unregister_md_personality(&raid5_personality);
8494 unregister_md_personality(&raid4_personality);
8495 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
8496 destroy_workqueue(raid5_wq);
8497}
8498
8499module_init(raid5_init);
8500module_exit(raid5_exit);
8501MODULE_LICENSE("GPL");
8502MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
8503MODULE_ALIAS("md-personality-4");
8504MODULE_ALIAS("md-raid5");
8505MODULE_ALIAS("md-raid4");
8506MODULE_ALIAS("md-level-5");
8507MODULE_ALIAS("md-level-4");
8508MODULE_ALIAS("md-personality-8");
8509MODULE_ALIAS("md-raid6");
8510MODULE_ALIAS("md-level-6");
8511
8512/* raid5 and raid6 were once separate modules, hence the extra aliases: */
8513MODULE_ALIAS("raid5");
8514MODULE_ALIAS("raid6");
8515