/*
 * raid5.c : Multiple Devices driver for Linux
 *
 * RAID-4/5/6 management functions.
 */
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>

#include <trace/events/block.h>
#include <linux/list_sort.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "md-bitmap.h"
#include "raid5-log.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_lock_irq(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
		spin_unlock(conf->hash_locks + i);
	spin_unlock_irq(conf->hash_locks);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/*
 * Map a device index to its slot in the syndrome calculation: data
 * slots come first, then P at 'syndrome_disks' and Q at
 * 'syndrome_disks + 1'.  *count tracks the next free data slot and
 * is advanced according to the layout in use.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static bool stripe_is_lowprio(struct stripe_head *sh)
{
	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		if (stripe_is_lowprio(sh))
			list_add_tail(&sh->lru, &group->loprio_list);
		else
			list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	int i;
	int injournal = 0;	/* number of dev pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * In the following cases the stripe cannot be released to the
	 * cached lists, so make it write out instead:
	 *  - a sync has been requested for the stripe, or
	 *  - the array is being quiesced while the stripe still has
	 *    data in the write-back cache.
	 */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
	    (conf->quiesce && r5c_is_writeback(conf->log) &&
	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				if (stripe_is_lowprio(sh))
					list_add_tail(&sh->lru,
							&conf->loprio_list);
				else
					list_add_tail(&sh->lru,
							&conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}

/*
 * Splice the temp_inactive_list(s) back onto conf->inactive_list and
 * wake up any waiters.  If @hash is NR_STRIPE_HASH_LOCKS then
 * temp_inactive_list is an array with one list per hash lock;
 * otherwise it is a single list for the given hash.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	bool do_wakeup = false;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, so check the list
		 * carefully before taking the hash lock.
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup = true;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	if (do_wakeup) {
		wake_up(&conf->wait_for_stripe);
		if (atomic_read(&conf->active_stripes) == 0)
			wake_up(&conf->wait_for_quiescent);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh, *t;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	llist_for_each_entry_safe(sh, t, head, release_list) {
		int hash;

		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry the bit is set here, because if the bit is set
		 * again, the count is always > 1. This is true for
		 * STRIPE_ON_UNPLUG_LIST bit too.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
		count++;
	}

	return count;
}

void raid5_release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	/* Avoid release_list until the last reference.
	 */
	if (atomic_add_unless(&sh->count, -1, 1))
		return;

	if (unlikely(!conf->mddev->thread) ||
		test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);
		spin_unlock_irqrestore(&conf->device_lock, flags);
		release_inactive_stripe_list(conf, &list, hash);
	}
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}

/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(conf->inactive_list + hash))
		goto out;
	first = (conf->inactive_list + hash)->next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
	BUG_ON(hash != sh->hash_lock_index);
	if (list_empty(conf->inactive_list + hash))
		atomic_inc(&conf->empty_inactive_list_nr);
out:
	return sh;
}

static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num ; i++) {
		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(gfp))) {
			return 1;
		}
		sh->dev[i].page = page;
		sh->dev[i].orig_page = page;
	}

	return 0;
}

static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			   struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i, seq;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));
	BUG_ON(sh->batch_head);

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sector);
retry:
	seq = read_seqcount_begin(&conf->gen_lock);
	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			pr_err("sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		dev->sector = raid5_compute_blocknr(sh, i, previous);
	}
	if (read_seqcount_retry(&conf->gen_lock, seq))
		goto retry;
	sh->overwrite_disks = 0;
	insert_hash(conf, sh);
	sh->cpu = smp_processor_id();
	set_bit(STRIPE_BATCH_READY, &sh->state);
}

static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
int raid5_calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = raid5_calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
			int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;
	int hash = stripe_hash_locks_hash(sector);
	int inc_empty_inactive_list_flag;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(conf->hash_locks + hash);

	do {
		wait_event_lock_irq(conf->wait_for_quiescent,
				    conf->quiesce == 0 || noquiesce,
				    *(conf->hash_locks + hash));
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
				sh = get_free_stripe(conf, hash);
				if (!sh && !test_bit(R5_DID_ALLOC,
						     &conf->cache_state))
					set_bit(R5_ALLOC_MORE,
						&conf->cache_state);
			}
			if (noblock && sh == NULL)
				break;

			r5c_check_stripe_cache_usage(conf);
			if (!sh) {
				set_bit(R5_INACTIVE_BLOCKED,
					&conf->cache_state);
				r5l_wake_reclaim(conf->log, 0);
				wait_event_lock_irq(
					conf->wait_for_stripe,
					!list_empty(conf->inactive_list + hash) &&
					(atomic_read(&conf->active_stripes)
					 < (conf->max_nr_stripes * 3 / 4)
					 || !test_bit(R5_INACTIVE_BLOCKED,
						      &conf->cache_state)),
					*(conf->hash_locks + hash));
				clear_bit(R5_INACTIVE_BLOCKED,
					  &conf->cache_state);
			} else {
				init_stripe(sh, sector, previous);
				atomic_inc(&sh->count);
			}
		} else if (!atomic_inc_not_zero(&sh->count)) {
			spin_lock(&conf->device_lock);
			if (!atomic_read(&sh->count)) {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				BUG_ON(list_empty(&sh->lru) &&
				       !test_bit(STRIPE_EXPANDING, &sh->state));
				inc_empty_inactive_list_flag = 0;
				if (!list_empty(conf->inactive_list + hash))
					inc_empty_inactive_list_flag = 1;
				list_del_init(&sh->lru);
				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
					atomic_inc(&conf->empty_inactive_list_nr);
				if (sh->group) {
					sh->group->stripes_cnt--;
					sh->group = NULL;
				}
			}
			atomic_inc(&sh->count);
			spin_unlock(&conf->device_lock);
		}
	} while (sh == NULL);

	spin_unlock_irq(conf->hash_locks + hash);
	return sh;
}

static bool is_full_stripe_write(struct stripe_head *sh)
{
	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
}

static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
	if (sh1 > sh2) {
		spin_lock_irq(&sh2->stripe_lock);
		spin_lock_nested(&sh1->stripe_lock, 1);
	} else {
		spin_lock_irq(&sh1->stripe_lock);
		spin_lock_nested(&sh2->stripe_lock, 1);
	}
}

static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
	spin_unlock(&sh1->stripe_lock);
	spin_unlock_irq(&sh2->stripe_lock);
}

/* Only freshly new full stripe normal write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;

	if (raid5_has_log(conf) || raid5_has_ppl(conf))
		return false;
	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
		is_full_stripe_write(sh);
}

/* we only do back search */
static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
{
	struct stripe_head *head;
	sector_t head_sector, tmp_sec;
	int hash;
	int dd_idx;
	int inc_empty_inactive_list_flag;

	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
	tmp_sec = sh->sector;
	if (!sector_div(tmp_sec, conf->chunk_sectors))
		return;
	head_sector = sh->sector - STRIPE_SECTORS;

	hash = stripe_hash_locks_hash(head_sector);
	spin_lock_irq(conf->hash_locks + hash);
	head = __find_stripe(conf, head_sector, conf->generation);
	if (head && !atomic_inc_not_zero(&head->count)) {
		spin_lock(&conf->device_lock);
		if (!atomic_read(&head->count)) {
			if (!test_bit(STRIPE_HANDLE, &head->state))
				atomic_inc(&conf->active_stripes);
			BUG_ON(list_empty(&head->lru) &&
			       !test_bit(STRIPE_EXPANDING, &head->state));
			inc_empty_inactive_list_flag = 0;
			if (!list_empty(conf->inactive_list + hash))
				inc_empty_inactive_list_flag = 1;
			list_del_init(&head->lru);
			if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
				atomic_inc(&conf->empty_inactive_list_nr);
			if (head->group) {
				head->group->stripes_cnt--;
				head->group = NULL;
			}
		}
		atomic_inc(&head->count);
		spin_unlock(&conf->device_lock);
	}
	spin_unlock_irq(conf->hash_locks + hash);

	if (!head)
		return;
	if (!stripe_can_batch(head))
		goto out;

	lock_two_stripes(head, sh);
	/* clear_batch_ready clears the flag */
	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
		goto unlock_out;

	if (sh->batch_head)
		goto unlock_out;

	dd_idx = 0;
	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
		dd_idx++;
	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
	    bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
		goto unlock_out;

	if (head->batch_head) {
		spin_lock(&head->batch_head->batch_lock);
		/* This batch list is already running */
		if (!stripe_can_batch(head)) {
			spin_unlock(&head->batch_head->batch_lock);
			goto unlock_out;
		}
		/*
		 * We must assign batch_head of this stripe within the
		 * batch_lock, otherwise clear_batch_ready of batch head
		 * stripe could clear BATCH_READY bit of this stripe if
		 * this stripe is added to a batch list before we check
		 * BATCH_READY bit of its batch head.
		 */
		sh->batch_head = head->batch_head;

		/*
		 * at this point, head's BATCH_READY could be cleared, but we
		 * can still add the stripe to batch list
		 */
		list_add(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_head->batch_lock);
	} else {
		head->batch_head = head;
		sh->batch_head = head->batch_head;
		spin_lock(&head->batch_lock);
		list_add_tail(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_lock);
	}

	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		if (atomic_dec_return(&conf->preread_active_stripes)
		    < IO_THRESHOLD)
			md_wakeup_thread(conf->mddev->thread);

	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
		int seq = sh->bm_seq;
		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
		    sh->batch_head->bm_seq > seq)
			seq = sh->batch_head->bm_seq;
		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
		sh->batch_head->bm_seq = seq;
	}

	atomic_inc(&sh->count);
unlock_out:
	unlock_two_stripes(head, sh);
out:
	raid5_release_stripe(head);
}

/*
 * Determine whether data_offset or new_data_offset should be used
 * for this stripe_head while a reshape is in progress.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* stripe was created with the current (post-reshape) generation,
	 * so the new data offset applies.
	 */
	return 1;
}

static void dispatch_bio_list(struct bio_list *tmp)
{
	struct bio *bio;

	while ((bio = bio_list_pop(tmp)))
		generic_make_request(bio);
}

static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
{
	const struct r5pending_data *da = list_entry(a,
				struct r5pending_data, sibling);
	const struct r5pending_data *db = list_entry(b,
				struct r5pending_data, sibling);
	if (da->sector > db->sector)
		return 1;
	if (da->sector < db->sector)
		return -1;
	return 0;
}

static void dispatch_defer_bios(struct r5conf *conf, int target,
				struct bio_list *list)
{
	struct r5pending_data *data;
	struct list_head *first, *next = NULL;
	int cnt = 0;

	if (conf->pending_data_cnt == 0)
		return;

	list_sort(NULL, &conf->pending_list, cmp_stripe);

	first = conf->pending_list.next;

	/* temporarily move the head */
	if (conf->next_pending_data)
		list_move_tail(&conf->pending_list,
				&conf->next_pending_data->sibling);

	while (!list_empty(&conf->pending_list)) {
		data = list_first_entry(&conf->pending_list,
			struct r5pending_data, sibling);
		if (&data->sibling == first)
			first = data->sibling.next;
		next = data->sibling.next;

		bio_list_merge(list, &data->bios);
		list_move(&data->sibling, &conf->free_list);
		cnt++;
		if (cnt >= target)
			break;
	}
	conf->pending_data_cnt -= cnt;
	BUG_ON(conf->pending_data_cnt < 0 || cnt < target);

	if (next != &conf->pending_list)
		conf->next_pending_data = list_entry(next,
				struct r5pending_data, sibling);
	else
		conf->next_pending_data = NULL;
	/* list isn't empty */
	if (first != &conf->pending_list)
		list_move_tail(&conf->pending_list, first);
}

static void flush_deferred_bios(struct r5conf *conf)
{
	struct bio_list tmp = BIO_EMPTY_LIST;

	if (conf->pending_data_cnt == 0)
		return;

	spin_lock(&conf->pending_bios_lock);
	dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
	BUG_ON(conf->pending_data_cnt != 0);
	spin_unlock(&conf->pending_bios_lock);

	dispatch_bio_list(&tmp);
}

static void defer_issue_bios(struct r5conf *conf, sector_t sector,
				struct bio_list *bios)
{
	struct bio_list tmp = BIO_EMPTY_LIST;
	struct r5pending_data *ent;

	spin_lock(&conf->pending_bios_lock);
	ent = list_first_entry(&conf->free_list, struct r5pending_data,
							sibling);
	list_move_tail(&ent->sibling, &conf->pending_list);
	ent->sector = sector;
	bio_list_init(&ent->bios);
	bio_list_merge(&ent->bios, bios);
	conf->pending_data_cnt++;
	if (conf->pending_data_cnt >= PENDING_IO_MAX)
		dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);

	spin_unlock(&conf->pending_bios_lock);

	dispatch_bio_list(&tmp);
}

static void
raid5_end_read_request(struct bio *bi);
static void
raid5_end_write_request(struct bio *bi);

static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;
	struct stripe_head *head_sh = sh;
	struct bio_list pending_bios = BIO_EMPTY_LIST;
	bool should_defer;

	might_sleep();

	if (log_stripe(sh, s) == 0)
		return;

	should_defer = conf->batch_bio_dispatch && conf->group_cnt;

	for (i = disks; i--; ) {
		int op, op_flags = 0;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;

		sh = head_sh;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			op = REQ_OP_WRITE;
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				op_flags = REQ_FUA;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				op = REQ_OP_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			op = REQ_OP_READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			op = REQ_OP_WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			op_flags |= REQ_SYNC;

again:
		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev is valid */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (op_is_write(op)) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while (op_is_write(op) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->sb_flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance*/
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_set_dev(bi, rdev->bdev);
			bio_set_op_attrs(bi, op, op_flags);
			bi->bi_end_io = op_is_write(op)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
				bi->bi_opf |= REQ_NOMERGE;

			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));

			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * issuing read for a page in journal, this
				 * must be preparing for prexor in rmw; read
				 * the data into orig_page
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_iter.bi_size = STRIPE_SIZE;
			bi->bi_write_hint = sh->dev[i].write_hint;
			if (!rrdev)
				sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (op == REQ_OP_DISCARD)
				bi->bi_vcnt = 0;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);

			if (conf->mddev->gendisk)
				trace_block_bio_remap(bi->bi_disk->queue,
						      bi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			if (should_defer && op_is_write(op))
				bio_list_add(&pending_bios, bi);
			else
				generic_make_request(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_set_dev(rbi, rrdev->bdev);
			bio_set_op_attrs(rbi, op, op_flags);
			BUG_ON(!op_is_write(op));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
				rbi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->data_offset);
			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
			sh->dev[i].rvec.bv_page = sh->dev[i].page;
			rbi->bi_vcnt = 1;
			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_iter.bi_size = STRIPE_SIZE;
			rbi->bi_write_hint = sh->dev[i].write_hint;
			sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (op == REQ_OP_DISCARD)
				rbi->bi_vcnt = 0;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(rbi->bi_disk->queue,
						      rbi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			if (should_defer && op_is_write(op))
				bio_list_add(&pending_bios, rbi);
			else
				generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
			if (op_is_write(op))
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %d on disc %d for sector %llu\n",
				bi->bi_opf, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}

		if (!head_sh->batch_head)
			continue;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		if (sh != head_sh)
			goto again;
	}

	if (should_defer && !bio_list_empty(&pending_bios))
		defer_issue_bios(conf, head_sh->sector, &pending_bios);
}

static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page **page,
	sector_t sector, struct dma_async_tx_descriptor *tx,
	struct stripe_head *sh, int no_skipcopy)
{
	struct bio_vec bvl;
	struct bvec_iter iter;
	struct page *bio_page;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_iter.bi_sector >= sector)
		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, iter) {
		int len = bvl.bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl.bv_offset;
			bio_page = bvl.bv_page;
			if (frombio) {
				if (sh->raid_conf->skip_copy &&
				    b_offset == 0 && page_offset == 0 &&
				    clen == STRIPE_SIZE &&
				    !no_skipcopy)
					*page = bio_page;
				else
					tx = async_memcpy(*page, bio_page, page_offset,
							  b_offset, clen, &submit);
			} else
				tx = async_memcpy(bio_page, *page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset += len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_iter.bi_sector <
				dev->sector + STRIPE_SECTORS) {
				rbi2 = r5_next_bio(rbi, dev->sector);
				bio_endio(rbi);
				rbi = rbi2;
			}
		}
	}
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;
	int i;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&sh->stripe_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			while (rbi && rbi->bi_iter.bi_sector <
				dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, &dev->page,
						     dev->sector, tx, sh, 0);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu, int i)
{
	void *addr;

	addr = flex_array_get(percpu->scribble, i);
	return addr + sizeof(struct page *) * (sh->disks + 2);
}

/* return a pointer to the page list region of the scribble buffer */
static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
{
	void *addr;

	addr = flex_array_get(percpu->scribble, i);
	return addr;
}

static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	BUG_ON(sh->batch_head);

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 * @srctype - SYNDROME_SRC_* selector for which sources to include
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs,
				struct stripe_head *sh,
				int srctype)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
		struct r5dev *dev = &sh->dev[i];

		if (i == sh->qd_idx || i == sh->pd_idx ||
		    (srctype == SYNDROME_SRC_ALL) ||
		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
		     (test_bit(R5_Wantdrain, &dev->flags) ||
		      test_bit(R5_InJournal, &dev->flags))) ||
		    (srctype == SYNDROME_SRC_WRITTEN &&
		     (dev->written ||
		      test_bit(R5_InJournal, &dev->flags)))) {
			if (test_bit(R5_InJournal, &dev->flags))
				srcs[slot] = sh->dev[i].orig_page;
			else
				srcs[slot] = sh->dev[i].page;
		}
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = to_addr_page(percpu, 0);
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	BUG_ON(sh->batch_head);
	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
	}

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = to_addr_page(percpu, 0);
	struct async_submit_ctl submit;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu, 0));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu, 0));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu, 0));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}

static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	if (r5c_is_writeback(sh->raid_conf->log))
		/*
		 * raid5-cache write back uses orig_page during prexor.
		 * After prexor, it is time to free orig_page
		 */
		r5c_release_extra_page(sh);
}

static struct dma_async_tx_descriptor *
ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_InJournal, &dev->flags))
			xor_srcs[count++] = dev->orig_page;
		else if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	struct page **blocks = to_addr_page(percpu, 0);
	int count;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks;
	int i;
	struct stripe_head *head_sh = sh;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev;
		struct bio *chosen;

		sh = head_sh;
		if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
			struct bio *wbi;

again:
			dev = &sh->dev[i];
			/*
			 * clear R5_InJournal, so when rewriting a page in
			 * journal, it is not skipped by r5l_log_stripe()
			 */
			clear_bit(R5_InJournal, &dev->flags);
			spin_lock_irq(&sh->stripe_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			sh->overwrite_disks = 0;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->stripe_lock);
			WARN_ON(dev->page != dev->orig_page);

			while (wbi && wbi->bi_iter.bi_sector <
				dev->sector + STRIPE_SECTORS) {
				if (wbi->bi_opf & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				if (wbi->bi_opf & REQ_SYNC)
					set_bit(R5_SyncIO, &dev->flags);
				if (bio_op(wbi) == REQ_OP_DISCARD)
					set_bit(R5_Discard, &dev->flags);
				else {
					tx = async_copy_data(1, wbi, &dev->page,
							     dev->sector, tx, sh,
							     r5c_is_writeback(conf->log));
					if (dev->page != dev->orig_page &&
					    !r5c_is_writeback(conf->log)) {
						set_bit(R5_SkipCopy, &dev->flags);
						clear_bit(R5_UPTODATE, &dev->flags);
						clear_bit(R5_OVERWRITE, &dev->flags);
					}
				}
				wbi = r5_next_bio(wbi, dev->sector);
			}

			if (head_sh->batch_head) {
				sh = list_first_entry(&sh->batch_list,
						      struct stripe_head,
						      batch_list);
				if (sh == head_sh)
					continue;
				goto again;
			}
		}
	}

	return tx;
}

static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false, sync = false, discard = false;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
	}

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
				set_bit(R5_UPTODATE, &dev->flags);
				if (test_bit(STRIPE_EXPAND_READY, &sh->state))
					set_bit(R5_Expanded, &dev->flags);
			}
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
			if (sync)
				set_bit(R5_SyncIO, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs;
	struct async_submit_ctl submit;
	int count, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	int prexor = 0;
	unsigned long flags;
	int j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (pd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}
again:
	count = 0;
	xor_srcs = to_addr_page(percpu, j);
	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (head_sh->dev[i].written ||
			    test_bit(R5_InJournal, &head_sh->dev[i].flags))
				xor_srcs[count++] = dev->page;
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;
	if (last_stripe) {
		flags = ASYNC_TX_ACK |
			(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

		atomic_inc(&head_sh->count);
		init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
				  to_addr_conv(sh, percpu, j));
	} else {
		flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
		init_async_submit(&submit, flags, tx, NULL, NULL,
				  to_addr_conv(sh, percpu, j));
	}

	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
	if (!last_stripe) {
		j++;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		goto again;
	}
}

static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks;
	int count, i, j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;
	int synflags;
	unsigned long txflags;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (sh->pd_idx == i || sh->qd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}

again:
	blocks = to_addr_page(percpu, j);

	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		synflags = SYNDROME_SRC_WRITTEN;
		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
	} else {
		synflags = SYNDROME_SRC_ALL;
		txflags = ASYNC_TX_ACK;
	}

	count = set_syndrome_sources(blocks, sh, synflags);
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;

	if (last_stripe) {
		atomic_inc(&head_sh->count);
		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
				  head_sh, to_addr_conv(sh, percpu, j));
	} else
		init_async_submit(&submit, 0, tx, NULL, NULL,
				  to_addr_conv(sh, percpu, j));
	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	if (!last_stripe) {
		j++;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		goto again;
	}
}

static void ops_complete_check(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	sh->check_state = check_state_check_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	struct page *xor_dest;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int count;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	BUG_ON(sh->batch_head);
	count = 0;
	xor_dest = sh->dev[pd_idx].page;
	xor_srcs[count++] = xor_dest;
	for (i = disks; i--; ) {
		if (i == pd_idx || i == qd_idx)
			continue;
		xor_srcs[count++] = sh->dev[i].page;
	}

	init_async_submit(&submit, 0, NULL, NULL, NULL,
			  to_addr_conv(sh, percpu, 0));
	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, &submit);

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
	tx = async_trigger_callback(&submit);
}

static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
	struct page **srcs = to_addr_page(percpu, 0);
	struct async_submit_ctl submit;
	int count;

	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
		(unsigned long long)sh->sector, checkp);

	BUG_ON(sh->batch_head);
	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
	if (!checkp)
		srcs[count] = NULL;

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
			  sh, to_addr_conv(sh, percpu, 0));
	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
}

static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	int overlap_clear = 0, i, disks = sh->disks;
	struct dma_async_tx_descriptor *tx = NULL;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;
	struct raid5_percpu *percpu;
	unsigned long cpu;

	cpu = get_cpu();
	percpu = per_cpu_ptr(conf->percpu, cpu);
	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
		ops_run_biofill(sh);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
		if (level < 6)
			tx = ops_run_compute5(sh, percpu);
		else {
			if (sh->ops.target2 < 0 || sh->ops.target < 0)
				tx = ops_run_compute6_1(sh, percpu);
			else
				tx = ops_run_compute6_2(sh, percpu);
		}
		/* terminate the chain if reconstruct is not set to be run */
		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
			async_tx_ack(tx);
	}

	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
		if (level < 6)
			tx = ops_run_prexor5(sh, percpu, tx);
		else
			tx = ops_run_prexor6(sh, percpu, tx);
	}

	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
		tx = ops_run_partial_parity(sh, percpu, tx);

	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
		tx = ops_run_biodrain(sh, tx);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
		if (level < 6)
			ops_run_reconstruct5(sh, percpu, tx);
		else
			ops_run_reconstruct6(sh, percpu, tx);
	}

	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
		if (sh->check_state == check_state_run)
			ops_run_check_p(sh, percpu);
		else if (sh->check_state == check_state_run_q)
			ops_run_check_pq(sh, percpu, 0);
		else if (sh->check_state == check_state_run_pq)
			ops_run_check_pq(sh, percpu, 1);
		else
			BUG();
	}

	if (overlap_clear && !sh->batch_head)
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (test_and_clear_bit(R5_Overlap, &dev->flags))
				wake_up(&sh->raid_conf->wait_for_overlap);
		}
	put_cpu();
}
2134
2135static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2136{
2137 if (sh->ppl_page)
2138 __free_page(sh->ppl_page);
2139 kmem_cache_free(sc, sh);
2140}
2141
2142static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2143 int disks, struct r5conf *conf)
2144{
2145 struct stripe_head *sh;
2146 int i;
2147
2148 sh = kmem_cache_zalloc(sc, gfp);
2149 if (sh) {
2150 spin_lock_init(&sh->stripe_lock);
2151 spin_lock_init(&sh->batch_lock);
2152 INIT_LIST_HEAD(&sh->batch_list);
2153 INIT_LIST_HEAD(&sh->lru);
2154 INIT_LIST_HEAD(&sh->r5c);
2155 INIT_LIST_HEAD(&sh->log_list);
2156 atomic_set(&sh->count, 1);
2157 sh->raid_conf = conf;
2158 sh->log_start = MaxSector;
2159 for (i = 0; i < disks; i++) {
2160 struct r5dev *dev = &sh->dev[i];
2161
2162 bio_init(&dev->req, &dev->vec, 1);
2163 bio_init(&dev->rreq, &dev->rvec, 1);
2164 }
2165
2166 if (raid5_has_ppl(conf)) {
2167 sh->ppl_page = alloc_page(gfp);
2168 if (!sh->ppl_page) {
2169 free_stripe(sc, sh);
2170 sh = NULL;
2171 }
2172 }
2173 }
2174 return sh;
2175}
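
/*
 * grow_one_stripe - allocate one stripe_head plus its data pages and add it
 * to the inactive lists; returns 1 on success, 0 on allocation failure.
 */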
2176static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2177{
2178 struct stripe_head *sh;
2179
2180 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2181 if (!sh)
2182 return 0;
2183
2184 if (grow_buffers(sh, gfp)) {
2185 shrink_buffers(sh);
2186 free_stripe(conf->slab_cache, sh);
2187 return 0;
2188 }
2189 sh->hash_lock_index =
2190 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2191
2192 atomic_inc(&conf->active_stripes);
2193
2194 raid5_release_stripe(sh);
2195 conf->max_nr_stripes++;
2196 return 1;
2197}
2198
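/*
 * grow_stripes - create the stripe_head slab cache for this array and
 * populate it with 'num' stripes; returns 0 on success, 1 on failure.
 */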
2199static int grow_stripes(struct r5conf *conf, int num)
2200{
2201 struct kmem_cache *sc;
2202 size_t namelen = sizeof(conf->cache_name[0]);
2203 int devs = max(conf->raid_disks, conf->previous_raid_disks);
2204
2205 if (conf->mddev->gendisk)
2206 snprintf(conf->cache_name[0], namelen,
2207 "raid%d-%s", conf->level, mdname(conf->mddev));
2208 else
2209 snprintf(conf->cache_name[0], namelen,
2210 "raid%d-%p", conf->level, conf->mddev);
2211 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2212
2213 conf->active_name = 0;
2214 sc = kmem_cache_create(conf->cache_name[conf->active_name],
2215 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2216 0, 0, NULL);
2217 if (!sc)
2218 return 1;
2219 conf->slab_cache = sc;
2220 conf->pool_size = devs;
2221 while (num--)
2222 if (!grow_one_stripe(conf, GFP_KERNEL))
2223 return 1;
2224
2225 return 0;
2226}
2227
2228
2229/*
2230 * scribble_alloc - allocate percpu scribble space
2231 * @num: total number of disks in the array
2232 * @cnt: number of stripe units each scribble region must cover
2233 *
2234 * Each element must be large enough to hold:
2235 * 1/ a struct page pointer for each device in the array (+2 for the
2236 *    P and Q destinations in the RAID6/ddf case), and
2237 * 2/ room to convert each entry in (1) to its corresponding dma
2238 *    (dma_map_page()) or page (page_address()) address.
2239 */
2240
2241static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2242{
2243 struct flex_array *ret;
2244 size_t len;
2245
2246 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2247 ret = flex_array_alloc(len, cnt, flags);
2248 if (!ret)
2249 return NULL;
2250
2251 if (flex_array_prealloc(ret, 0, cnt, flags)) {
2252 flex_array_free(ret);
2253 return NULL;
2254 }
2255 return ret;
2256}
2257
2258static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2259{
2260 unsigned long cpu;
2261 int err = 0;
2262
2263 /*
2264 * Never shrink. Also, mddev_suspend() could deadlock if this is
2265 * called from raid5d; in that case scribble_disks and
2266 * scribble_sectors should already cover the new geometry.
2267 */
2268 if (conf->scribble_disks >= new_disks &&
2269 conf->scribble_sectors >= new_sectors)
2270 return 0;
2271 mddev_suspend(conf->mddev);
2272 get_online_cpus();
2273 for_each_present_cpu(cpu) {
2274 struct raid5_percpu *percpu;
2275 struct flex_array *scribble;
2276
2277 percpu = per_cpu_ptr(conf->percpu, cpu);
2278 scribble = scribble_alloc(new_disks,
2279 new_sectors / STRIPE_SECTORS,
2280 GFP_NOIO);
2281
2282 if (scribble) {
2283 flex_array_free(percpu->scribble);
2284 percpu->scribble = scribble;
2285 } else {
2286 err = -ENOMEM;
2287 break;
2288 }
2289 }
2290 put_online_cpus();
2291 mddev_resume(conf->mddev);
2292 if (!err) {
2293 conf->scribble_disks = new_disks;
2294 conf->scribble_sectors = new_sectors;
2295 }
2296 return err;
2297}
2298
2299static int resize_stripes(struct r5conf *conf, int newsize)
2300{
2301
2302
2303
2304
2305
2306
2307
2308
2309 /* Make all the stripes able to hold 'newsize' devices.
2310 * New slots in each stripe get 'page' set to a new page.
2311 *
2312 * This happens in stages:
2313 * 1/ create a new kmem_cache and allocate the required number of
2314 *    stripe_heads.
2315 * 2/ gather all the old stripe_heads and transfer the pages across
2316 *    to the new stripe_heads, freeing the old stripe_heads as we go.
2317 * 3/ allocate a new disk_info array sized for 'newsize' devices and
2318 *    extra pages for the new slots.
2319 * 4/ give each new stripe_head any additional pages it needs and
2320 *    release it back to the inactive lists.
2321 * If any allocation fails, -ENOMEM is returned and the old
2322 * configuration remains usable.
2323 */
2324 struct stripe_head *osh, *nsh;
2325 LIST_HEAD(newstripes);
2326 struct disk_info *ndisks;
2327 int err = 0;
2328 struct kmem_cache *sc;
2329 int i;
2330 int hash, cnt;
2331
2332 md_allow_write(conf->mddev);
2333
2334
2335 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2336 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2337 0, 0, NULL);
2338 if (!sc)
2339 return -ENOMEM;
2340
2341
2342 mutex_lock(&conf->cache_size_mutex);
2343
2344 for (i = conf->max_nr_stripes; i; i--) {
2345 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2346 if (!nsh)
2347 break;
2348
2349 list_add(&nsh->lru, &newstripes);
2350 }
2351 if (i) {
2352
2353 while (!list_empty(&newstripes)) {
2354 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2355 list_del(&nsh->lru);
2356 free_stripe(sc, nsh);
2357 }
2358 kmem_cache_destroy(sc);
2359 mutex_unlock(&conf->cache_size_mutex);
2360 return -ENOMEM;
2361 }
2362
2363
2364
2365
2366 hash = 0;
2367 cnt = 0;
2368 list_for_each_entry(nsh, &newstripes, lru) {
2369 lock_device_hash_lock(conf, hash);
2370 wait_event_cmd(conf->wait_for_stripe,
2371 !list_empty(conf->inactive_list + hash),
2372 unlock_device_hash_lock(conf, hash),
2373 lock_device_hash_lock(conf, hash));
2374 osh = get_free_stripe(conf, hash);
2375 unlock_device_hash_lock(conf, hash);
2376
2377 for(i=0; i<conf->pool_size; i++) {
2378 nsh->dev[i].page = osh->dev[i].page;
2379 nsh->dev[i].orig_page = osh->dev[i].page;
2380 }
2381 nsh->hash_lock_index = hash;
2382 free_stripe(conf->slab_cache, osh);
2383 cnt++;
2384 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2385 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2386 hash++;
2387 cnt = 0;
2388 }
2389 }
2390 kmem_cache_destroy(conf->slab_cache);
2391
2392
2393
2394
2395
2396
2397 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2398 if (ndisks) {
2399 for (i = 0; i < conf->pool_size; i++)
2400 ndisks[i] = conf->disks[i];
2401
2402 for (i = conf->pool_size; i < newsize; i++) {
2403 ndisks[i].extra_page = alloc_page(GFP_NOIO);
2404 if (!ndisks[i].extra_page)
2405 err = -ENOMEM;
2406 }
2407
2408 if (err) {
2409 for (i = conf->pool_size; i < newsize; i++)
2410 if (ndisks[i].extra_page)
2411 put_page(ndisks[i].extra_page);
2412 kfree(ndisks);
2413 } else {
2414 kfree(conf->disks);
2415 conf->disks = ndisks;
2416 }
2417 } else
2418 err = -ENOMEM;
2419
2420 mutex_unlock(&conf->cache_size_mutex);
2421
2422 conf->slab_cache = sc;
2423 conf->active_name = 1-conf->active_name;
2424
2425
2426 while(!list_empty(&newstripes)) {
2427 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2428 list_del_init(&nsh->lru);
2429
2430 for (i=conf->raid_disks; i < newsize; i++)
2431 if (nsh->dev[i].page == NULL) {
2432 struct page *p = alloc_page(GFP_NOIO);
2433 nsh->dev[i].page = p;
2434 nsh->dev[i].orig_page = p;
2435 if (!p)
2436 err = -ENOMEM;
2437 }
2438 raid5_release_stripe(nsh);
2439 }
2440
2441
2442 if (!err)
2443 conf->pool_size = newsize;
2444 return err;
2445}
2446
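/*
 * drop_one_stripe - remove one inactive stripe_head and free its buffers;
 * returns 1 if a stripe was freed, 0 if no inactive stripe was available.
 */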
2447static int drop_one_stripe(struct r5conf *conf)
2448{
2449 struct stripe_head *sh;
2450 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2451
2452 spin_lock_irq(conf->hash_locks + hash);
2453 sh = get_free_stripe(conf, hash);
2454 spin_unlock_irq(conf->hash_locks + hash);
2455 if (!sh)
2456 return 0;
2457 BUG_ON(atomic_read(&sh->count));
2458 shrink_buffers(sh);
2459 free_stripe(conf->slab_cache, sh);
2460 atomic_dec(&conf->active_stripes);
2461 conf->max_nr_stripes--;
2462 return 1;
2463}
2464
2465static void shrink_stripes(struct r5conf *conf)
2466{
2467 while (conf->max_nr_stripes &&
2468 drop_one_stripe(conf))
2469 ;
2470
2471 kmem_cache_destroy(conf->slab_cache);
2472 conf->slab_cache = NULL;
2473}
2474
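/*
 * raid5_end_read_request - completion handler for a per-device stripe read.
 * On success the block is marked up to date and earlier read errors are
 * cleared; on failure the read is either retried, recorded as a bad block,
 * or escalated via md_error().
 */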
2475static void raid5_end_read_request(struct bio * bi)
2476{
2477 struct stripe_head *sh = bi->bi_private;
2478 struct r5conf *conf = sh->raid_conf;
2479 int disks = sh->disks, i;
2480 char b[BDEVNAME_SIZE];
2481 struct md_rdev *rdev = NULL;
2482 sector_t s;
2483
2484 for (i=0 ; i<disks; i++)
2485 if (bi == &sh->dev[i].req)
2486 break;
2487
2488 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2489 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2490 bi->bi_status);
2491 if (i == disks) {
2492 bio_reset(bi);
2493 BUG();
2494 return;
2495 }
2496 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2497 /*
2498 * If the read came from the replacement device but that device
2499 * is no longer present, 'replacement' may be NULL; fall back to
2500 * the main rdev below.
2501 */
2502 rdev = conf->disks[i].replacement;
2503 if (!rdev)
2504 rdev = conf->disks[i].rdev;
2505
2506 if (use_new_offset(conf, sh))
2507 s = sh->sector + rdev->new_data_offset;
2508 else
2509 s = sh->sector + rdev->data_offset;
2510 if (!bi->bi_status) {
2511 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2512 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2513 /*
2514 * The read succeeded after an earlier failure on this device,
2515 * so the error can be reported as corrected.
2516 */
2517 pr_info_ratelimited(
2518 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2519 mdname(conf->mddev), STRIPE_SECTORS,
2520 (unsigned long long)s,
2521 bdevname(rdev->bdev, b));
2522 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2523 clear_bit(R5_ReadError, &sh->dev[i].flags);
2524 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2525 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2526 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2527
2528 if (test_bit(R5_InJournal, &sh->dev[i].flags))
2529
2530 /* end of a read for a page that is in the journal: this must
2531 * be preparing for prexor in rmw, so the original page is up
2532 * to date as well */
2533 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2534
2535 if (atomic_read(&rdev->read_errors))
2536 atomic_set(&rdev->read_errors, 0);
2537 } else {
2538 const char *bdn = bdevname(rdev->bdev, b);
2539 int retry = 0;
2540 int set_bad = 0;
2541
2542 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2543 atomic_inc(&rdev->read_errors);
2544 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2545 pr_warn_ratelimited(
2546 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2547 mdname(conf->mddev),
2548 (unsigned long long)s,
2549 bdn);
2550 else if (conf->mddev->degraded >= conf->max_degraded) {
2551 set_bad = 1;
2552 pr_warn_ratelimited(
2553 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2554 mdname(conf->mddev),
2555 (unsigned long long)s,
2556 bdn);
2557 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2558
2559 set_bad = 1;
2560 pr_warn_ratelimited(
2561 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2562 mdname(conf->mddev),
2563 (unsigned long long)s,
2564 bdn);
2565 } else if (atomic_read(&rdev->read_errors)
2566 > conf->max_nr_stripes)
2567 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2568 mdname(conf->mddev), bdn);
2569 else
2570 retry = 1;
2571 if (set_bad && test_bit(In_sync, &rdev->flags)
2572 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2573 retry = 1;
2574 if (retry)
2575 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2576 set_bit(R5_ReadError, &sh->dev[i].flags);
2577 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2578 } else
2579 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2580 else {
2581 clear_bit(R5_ReadError, &sh->dev[i].flags);
2582 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2583 if (!(set_bad
2584 && test_bit(In_sync, &rdev->flags)
2585 && rdev_set_badblocks(
2586 rdev, sh->sector, STRIPE_SECTORS, 0)))
2587 md_error(conf->mddev, rdev);
2588 }
2589 }
2590 rdev_dec_pending(rdev, conf->mddev);
2591 bio_reset(bi);
2592 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2593 set_bit(STRIPE_HANDLE, &sh->state);
2594 raid5_release_stripe(sh);
2595}
2596
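/*
 * raid5_end_write_request - completion handler for a per-device stripe write
 * (to either the main rdev or its replacement). Write errors are flagged for
 * recovery; writes that covered known bad blocks are marked so the bad-block
 * list can be updated later.
 */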
2597static void raid5_end_write_request(struct bio *bi)
2598{
2599 struct stripe_head *sh = bi->bi_private;
2600 struct r5conf *conf = sh->raid_conf;
2601 int disks = sh->disks, i;
2602 struct md_rdev *uninitialized_var(rdev);
2603 sector_t first_bad;
2604 int bad_sectors;
2605 int replacement = 0;
2606
2607 for (i = 0 ; i < disks; i++) {
2608 if (bi == &sh->dev[i].req) {
2609 rdev = conf->disks[i].rdev;
2610 break;
2611 }
2612 if (bi == &sh->dev[i].rreq) {
2613 rdev = conf->disks[i].replacement;
2614 if (rdev)
2615 replacement = 1;
2616 else
2617 /*
2618 * The replacement device is no longer present, so
2619 * account this completion against the main rdev.
2620 */
2621 rdev = conf->disks[i].rdev;
2622 break;
2623 }
2624 }
2625 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2626 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2627 bi->bi_status);
2628 if (i == disks) {
2629 bio_reset(bi);
2630 BUG();
2631 return;
2632 }
2633
2634 if (replacement) {
2635 if (bi->bi_status)
2636 md_error(conf->mddev, rdev);
2637 else if (is_badblock(rdev, sh->sector,
2638 STRIPE_SECTORS,
2639 &first_bad, &bad_sectors))
2640 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2641 } else {
2642 if (bi->bi_status) {
2643 set_bit(STRIPE_DEGRADED, &sh->state);
2644 set_bit(WriteErrorSeen, &rdev->flags);
2645 set_bit(R5_WriteError, &sh->dev[i].flags);
2646 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2647 set_bit(MD_RECOVERY_NEEDED,
2648 &rdev->mddev->recovery);
2649 } else if (is_badblock(rdev, sh->sector,
2650 STRIPE_SECTORS,
2651 &first_bad, &bad_sectors)) {
2652 set_bit(R5_MadeGood, &sh->dev[i].flags);
2653 if (test_bit(R5_ReadError, &sh->dev[i].flags))
2654
2655
2656
2657
2658 set_bit(R5_ReWrite, &sh->dev[i].flags);
2659 }
2660 }
2661 rdev_dec_pending(rdev, conf->mddev);
2662
2663 if (sh->batch_head && bi->bi_status && !replacement)
2664 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2665
2666 bio_reset(bi);
2667 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2668 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2669 set_bit(STRIPE_HANDLE, &sh->state);
2670 raid5_release_stripe(sh);
2671
2672 if (sh->batch_head && sh != sh->batch_head)
2673 raid5_release_stripe(sh->batch_head);
2674}
2675
2676static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2677{
2678 char b[BDEVNAME_SIZE];
2679 struct r5conf *conf = mddev->private;
2680 unsigned long flags;
2681 pr_debug("raid456: error called\n");
2682
2683 spin_lock_irqsave(&conf->device_lock, flags);
2684 set_bit(Faulty, &rdev->flags);
2685 clear_bit(In_sync, &rdev->flags);
2686 mddev->degraded = raid5_calc_degraded(conf);
2687 spin_unlock_irqrestore(&conf->device_lock, flags);
2688 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2689
2690 set_bit(Blocked, &rdev->flags);
2691 set_mask_bits(&mddev->sb_flags, 0,
2692 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2693 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2694 "md/raid:%s: Operation continuing on %d devices.\n",
2695 mdname(mddev),
2696 bdevname(rdev->bdev, b),
2697 mdname(mddev),
2698 conf->raid_disks - mddev->degraded);
2699 r5c_update_on_rdev_error(mddev, rdev);
2700}
2701
2702/*
2703 * Input: a 'big' sector number.
2704 * Output: index of the data and parity disks, and the sector # in them.
2705 */
2706sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2707 int previous, int *dd_idx,
2708 struct stripe_head *sh)
2709{
2710 sector_t stripe, stripe2;
2711 sector_t chunk_number;
2712 unsigned int chunk_offset;
2713 int pd_idx, qd_idx;
2714 int ddf_layout = 0;
2715 sector_t new_sector;
2716 int algorithm = previous ? conf->prev_algo
2717 : conf->algorithm;
2718 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2719 : conf->chunk_sectors;
2720 int raid_disks = previous ? conf->previous_raid_disks
2721 : conf->raid_disks;
2722 int data_disks = raid_disks - conf->max_degraded;
2723
2724 /* First compute the information on this sector */
2725
2726 /*
2727 * Compute the chunk number and the sector offset inside the chunk
2728 */
2729 chunk_offset = sector_div(r_sector, sectors_per_chunk);
2730 chunk_number = r_sector;
2731
2732 /*
2733 * Compute the stripe number
2734 */
2735 stripe = chunk_number;
2736 *dd_idx = sector_div(stripe, data_disks);
2737 stripe2 = stripe;
2738
2739 /* Select the parity disk based on the user selected algorithm. */
2740
2741 pd_idx = qd_idx = -1;
2742 switch(conf->level) {
2743 case 4:
2744 pd_idx = data_disks;
2745 break;
2746 case 5:
2747 switch (algorithm) {
2748 case ALGORITHM_LEFT_ASYMMETRIC:
2749 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2750 if (*dd_idx >= pd_idx)
2751 (*dd_idx)++;
2752 break;
2753 case ALGORITHM_RIGHT_ASYMMETRIC:
2754 pd_idx = sector_div(stripe2, raid_disks);
2755 if (*dd_idx >= pd_idx)
2756 (*dd_idx)++;
2757 break;
2758 case ALGORITHM_LEFT_SYMMETRIC:
2759 pd_idx = data_disks - sector_div(stripe2, raid_disks);
2760 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2761 break;
2762 case ALGORITHM_RIGHT_SYMMETRIC:
2763 pd_idx = sector_div(stripe2, raid_disks);
2764 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2765 break;
2766 case ALGORITHM_PARITY_0:
2767 pd_idx = 0;
2768 (*dd_idx)++;
2769 break;
2770 case ALGORITHM_PARITY_N:
2771 pd_idx = data_disks;
2772 break;
2773 default:
2774 BUG();
2775 }
2776 break;
2777 case 6:
2778
2779 switch (algorithm) {
2780 case ALGORITHM_LEFT_ASYMMETRIC:
2781 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2782 qd_idx = pd_idx + 1;
2783 if (pd_idx == raid_disks-1) {
2784 (*dd_idx)++;
2785 qd_idx = 0;
2786 } else if (*dd_idx >= pd_idx)
2787 (*dd_idx) += 2;
2788 break;
2789 case ALGORITHM_RIGHT_ASYMMETRIC:
2790 pd_idx = sector_div(stripe2, raid_disks);
2791 qd_idx = pd_idx + 1;
2792 if (pd_idx == raid_disks-1) {
2793 (*dd_idx)++;
2794 qd_idx = 0;
2795 } else if (*dd_idx >= pd_idx)
2796 (*dd_idx) += 2;
2797 break;
2798 case ALGORITHM_LEFT_SYMMETRIC:
2799 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2800 qd_idx = (pd_idx + 1) % raid_disks;
2801 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2802 break;
2803 case ALGORITHM_RIGHT_SYMMETRIC:
2804 pd_idx = sector_div(stripe2, raid_disks);
2805 qd_idx = (pd_idx + 1) % raid_disks;
2806 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2807 break;
2808
2809 case ALGORITHM_PARITY_0:
2810 pd_idx = 0;
2811 qd_idx = 1;
2812 (*dd_idx) += 2;
2813 break;
2814 case ALGORITHM_PARITY_N:
2815 pd_idx = data_disks;
2816 qd_idx = data_disks + 1;
2817 break;
2818
2819 case ALGORITHM_ROTATING_ZERO_RESTART:
2820 /* Same layout as RIGHT_ASYMMETRIC, but the set of
2821 * blocks used for computing Q is different.
2822 */
2823 pd_idx = sector_div(stripe2, raid_disks);
2824 qd_idx = pd_idx + 1;
2825 if (pd_idx == raid_disks-1) {
2826 (*dd_idx)++;
2827 qd_idx = 0;
2828 } else if (*dd_idx >= pd_idx)
2829 (*dd_idx) += 2;
2830 ddf_layout = 1;
2831 break;
2832
2833 case ALGORITHM_ROTATING_N_RESTART:
2834 /* Same as LEFT_ASYMMETRIC, but the first stripe is
2835 * D D D P Q rather than
2836 * Q D D D P
2837 */
2838 stripe2 += 1;
2839 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2840 qd_idx = pd_idx + 1;
2841 if (pd_idx == raid_disks-1) {
2842 (*dd_idx)++;
2843 qd_idx = 0;
2844 } else if (*dd_idx >= pd_idx)
2845 (*dd_idx) += 2;
2846 ddf_layout = 1;
2847 break;
2848
2849 case ALGORITHM_ROTATING_N_CONTINUE:
2850
2851 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2852 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2853 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2854 ddf_layout = 1;
2855 break;
2856
2857 case ALGORITHM_LEFT_ASYMMETRIC_6:
2858
2859 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2860 if (*dd_idx >= pd_idx)
2861 (*dd_idx)++;
2862 qd_idx = raid_disks - 1;
2863 break;
2864
2865 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2866 pd_idx = sector_div(stripe2, raid_disks-1);
2867 if (*dd_idx >= pd_idx)
2868 (*dd_idx)++;
2869 qd_idx = raid_disks - 1;
2870 break;
2871
2872 case ALGORITHM_LEFT_SYMMETRIC_6:
2873 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2874 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2875 qd_idx = raid_disks - 1;
2876 break;
2877
2878 case ALGORITHM_RIGHT_SYMMETRIC_6:
2879 pd_idx = sector_div(stripe2, raid_disks-1);
2880 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2881 qd_idx = raid_disks - 1;
2882 break;
2883
2884 case ALGORITHM_PARITY_0_6:
2885 pd_idx = 0;
2886 (*dd_idx)++;
2887 qd_idx = raid_disks - 1;
2888 break;
2889
2890 default:
2891 BUG();
2892 }
2893 break;
2894 }
2895
2896 if (sh) {
2897 sh->pd_idx = pd_idx;
2898 sh->qd_idx = qd_idx;
2899 sh->ddf_layout = ddf_layout;
2900 }
2901
2902 /* Finally, compute the new sector number */
2903
2904 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2905 return new_sector;
2906}
2907
2908sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2909{
2910 struct r5conf *conf = sh->raid_conf;
2911 int raid_disks = sh->disks;
2912 int data_disks = raid_disks - conf->max_degraded;
2913 sector_t new_sector = sh->sector, check;
2914 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2915 : conf->chunk_sectors;
2916 int algorithm = previous ? conf->prev_algo
2917 : conf->algorithm;
2918 sector_t stripe;
2919 int chunk_offset;
2920 sector_t chunk_number;
2921 int dummy1, dd_idx = i;
2922 sector_t r_sector;
2923 struct stripe_head sh2;
2924
2925 chunk_offset = sector_div(new_sector, sectors_per_chunk);
2926 stripe = new_sector;
2927
2928 if (i == sh->pd_idx)
2929 return 0;
2930 switch(conf->level) {
2931 case 4: break;
2932 case 5:
2933 switch (algorithm) {
2934 case ALGORITHM_LEFT_ASYMMETRIC:
2935 case ALGORITHM_RIGHT_ASYMMETRIC:
2936 if (i > sh->pd_idx)
2937 i--;
2938 break;
2939 case ALGORITHM_LEFT_SYMMETRIC:
2940 case ALGORITHM_RIGHT_SYMMETRIC:
2941 if (i < sh->pd_idx)
2942 i += raid_disks;
2943 i -= (sh->pd_idx + 1);
2944 break;
2945 case ALGORITHM_PARITY_0:
2946 i -= 1;
2947 break;
2948 case ALGORITHM_PARITY_N:
2949 break;
2950 default:
2951 BUG();
2952 }
2953 break;
2954 case 6:
2955 if (i == sh->qd_idx)
2956 return 0;
2957 switch (algorithm) {
2958 case ALGORITHM_LEFT_ASYMMETRIC:
2959 case ALGORITHM_RIGHT_ASYMMETRIC:
2960 case ALGORITHM_ROTATING_ZERO_RESTART:
2961 case ALGORITHM_ROTATING_N_RESTART:
2962 if (sh->pd_idx == raid_disks-1)
2963 i--;
2964 else if (i > sh->pd_idx)
2965 i -= 2;
2966 break;
2967 case ALGORITHM_LEFT_SYMMETRIC:
2968 case ALGORITHM_RIGHT_SYMMETRIC:
2969 if (sh->pd_idx == raid_disks-1)
2970 i--;
2971 else {
2972
2973 if (i < sh->pd_idx)
2974 i += raid_disks;
2975 i -= (sh->pd_idx + 2);
2976 }
2977 break;
2978 case ALGORITHM_PARITY_0:
2979 i -= 2;
2980 break;
2981 case ALGORITHM_PARITY_N:
2982 break;
2983 case ALGORITHM_ROTATING_N_CONTINUE:
2984
2985 if (sh->pd_idx == 0)
2986 i--;
2987 else {
2988
2989 if (i < sh->pd_idx)
2990 i += raid_disks;
2991 i -= (sh->pd_idx + 1);
2992 }
2993 break;
2994 case ALGORITHM_LEFT_ASYMMETRIC_6:
2995 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2996 if (i > sh->pd_idx)
2997 i--;
2998 break;
2999 case ALGORITHM_LEFT_SYMMETRIC_6:
3000 case ALGORITHM_RIGHT_SYMMETRIC_6:
3001 if (i < sh->pd_idx)
3002 i += data_disks + 1;
3003 i -= (sh->pd_idx + 1);
3004 break;
3005 case ALGORITHM_PARITY_0_6:
3006 i -= 1;
3007 break;
3008 default:
3009 BUG();
3010 }
3011 break;
3012 }
3013
3014 chunk_number = stripe * data_disks + i;
3015 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3016
3017 check = raid5_compute_sector(conf, r_sector,
3018 previous, &dummy1, &sh2);
3019 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3020 || sh2.qd_idx != sh->qd_idx) {
3021 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3022 mdname(conf->mddev));
3023 return 0;
3024 }
3025 return r_sector;
3026}
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051/*
3052 * There are cases where we want handle_stripe_dirtying() and
3053 * schedule_reconstruction() to delay the towrite to some device of a
3054 * stripe. Specifically, the towrite is delayed when:
3055 *
3056 *  1. the write is a non-overwrite to a device that is not in sync,
3057 *     while the stripe still has data in the write-back journal
3058 *     (parity calculation would first require reading that device);
3059 *
3060 *  2. journal space is critical (R5C_LOG_CRITICAL is set) and the
3061 *     stripe has data in the journal; or
3062 *
3063 *  3. the journal device has failed and the stripe has data in the
3064 *     journal.
3065 */
3066static inline bool delay_towrite(struct r5conf *conf,
3067 struct r5dev *dev,
3068 struct stripe_head_state *s)
3069{
3070
3071 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3072 !test_bit(R5_Insync, &dev->flags) && s->injournal)
3073 return true;
3074
3075 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3076 s->injournal > 0)
3077 return true;
3078
3079 if (s->log_failed && s->injournal)
3080 return true;
3081 return false;
3082}
3083
3084static void
3085schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3086 int rcw, int expand)
3087{
3088 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3089 struct r5conf *conf = sh->raid_conf;
3090 int level = conf->level;
3091
3092 if (rcw) {
3093 /*
3094 * In some cases handle_stripe_dirtying initially decided to run
3095 * rmw and allocated extra pages for prexor. If we switch to rcw
3096 * here, release those extra pages now, because we won't be able
3097 * to do it in ops_complete_prexor().
3098 */
3099 r5c_release_extra_page(sh);
3100
3101 for (i = disks; i--; ) {
3102 struct r5dev *dev = &sh->dev[i];
3103
3104 if (dev->towrite && !delay_towrite(conf, dev, s)) {
3105 set_bit(R5_LOCKED, &dev->flags);
3106 set_bit(R5_Wantdrain, &dev->flags);
3107 if (!expand)
3108 clear_bit(R5_UPTODATE, &dev->flags);
3109 s->locked++;
3110 } else if (test_bit(R5_InJournal, &dev->flags)) {
3111 set_bit(R5_LOCKED, &dev->flags);
3112 s->locked++;
3113 }
3114 }
3115
3116 /* if we are not expanding this is a proper write request, and
3117 * there will be bios with new data to be drained into the
3118 * stripe cache */
3119 if (!expand) {
3120 if (!s->locked)
3121
3122 return;
3123 sh->reconstruct_state = reconstruct_state_drain_run;
3124 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3125 } else
3126 sh->reconstruct_state = reconstruct_state_run;
3127
3128 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3129
3130 if (s->locked + conf->max_degraded == disks)
3131 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3132 atomic_inc(&conf->pending_full_writes);
3133 } else {
3134 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3135 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3136 BUG_ON(level == 6 &&
3137 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3138 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3139
3140 for (i = disks; i--; ) {
3141 struct r5dev *dev = &sh->dev[i];
3142 if (i == pd_idx || i == qd_idx)
3143 continue;
3144
3145 if (dev->towrite &&
3146 (test_bit(R5_UPTODATE, &dev->flags) ||
3147 test_bit(R5_Wantcompute, &dev->flags))) {
3148 set_bit(R5_Wantdrain, &dev->flags);
3149 set_bit(R5_LOCKED, &dev->flags);
3150 clear_bit(R5_UPTODATE, &dev->flags);
3151 s->locked++;
3152 } else if (test_bit(R5_InJournal, &dev->flags)) {
3153 set_bit(R5_LOCKED, &dev->flags);
3154 s->locked++;
3155 }
3156 }
3157 if (!s->locked)
3158
3159 return;
3160 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3161 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3162 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3163 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3164 }
3165
3166 /* keep the parity disk(s) locked while asynchronous operations
3167 * are in flight
3168 */
3169 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3170 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3171 s->locked++;
3172
3173 if (level == 6) {
3174 int qd_idx = sh->qd_idx;
3175 struct r5dev *dev = &sh->dev[qd_idx];
3176
3177 set_bit(R5_LOCKED, &dev->flags);
3178 clear_bit(R5_UPTODATE, &dev->flags);
3179 s->locked++;
3180 }
3181
3182 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3183 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3184 !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3185 test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3186 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3187
3188 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3189 __func__, (unsigned long long)sh->sector,
3190 s->locked, s->ops_request);
3191}
3192
3193/*
3194 * Each stripe/dev can have one or more bios attached.
3195 * toread/towrite point to the first in a chain.
3196 * The bi_next chain must be in order.
3197 */
3198static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3199 int forwrite, int previous)
3200{
3201 struct bio **bip;
3202 struct r5conf *conf = sh->raid_conf;
3203 int firstwrite=0;
3204
3205 pr_debug("adding bi b#%llu to stripe s#%llu\n",
3206 (unsigned long long)bi->bi_iter.bi_sector,
3207 (unsigned long long)sh->sector);
3208
3209 spin_lock_irq(&sh->stripe_lock);
3210 sh->dev[dd_idx].write_hint = bi->bi_write_hint;
3211
3212 if (sh->batch_head)
3213 goto overlap;
3214 if (forwrite) {
3215 bip = &sh->dev[dd_idx].towrite;
3216 if (*bip == NULL)
3217 firstwrite = 1;
3218 } else
3219 bip = &sh->dev[dd_idx].toread;
3220 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3221 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3222 goto overlap;
3223 bip = & (*bip)->bi_next;
3224 }
3225 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3226 goto overlap;
3227
3228 if (forwrite && raid5_has_ppl(conf)) {
3229 /*
3230 * With PPL only writes to consecutive data chunks within a
3231 * stripe are allowed, because for a single stripe_head we can
3232 * only have one PPL entry at a time, and it describes one data
3233 * range. Not really an overlap, but wait_for_overlap can be
3234 * used to handle this.
3235 */
3236 sector_t sector;
3237 sector_t first = 0;
3238 sector_t last = 0;
3239 int count = 0;
3240 int i;
3241
3242 for (i = 0; i < sh->disks; i++) {
3243 if (i != sh->pd_idx &&
3244 (i == dd_idx || sh->dev[i].towrite)) {
3245 sector = sh->dev[i].sector;
3246 if (count == 0 || sector < first)
3247 first = sector;
3248 if (sector > last)
3249 last = sector;
3250 count++;
3251 }
3252 }
3253
3254 if (first + conf->chunk_sectors * (count - 1) != last)
3255 goto overlap;
3256 }
3257
3258 if (!forwrite || previous)
3259 clear_bit(STRIPE_BATCH_READY, &sh->state);
3260
3261 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3262 if (*bip)
3263 bi->bi_next = *bip;
3264 *bip = bi;
3265 bio_inc_remaining(bi);
3266 md_write_inc(conf->mddev, bi);
3267
3268 if (forwrite) {
3269
3270 sector_t sector = sh->dev[dd_idx].sector;
3271 for (bi=sh->dev[dd_idx].towrite;
3272 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3273 bi && bi->bi_iter.bi_sector <= sector;
3274 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3275 if (bio_end_sector(bi) >= sector)
3276 sector = bio_end_sector(bi);
3277 }
3278 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3279 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3280 sh->overwrite_disks++;
3281 }
3282
3283 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3284 (unsigned long long)(*bip)->bi_iter.bi_sector,
3285 (unsigned long long)sh->sector, dd_idx);
3286
3287 if (conf->mddev->bitmap && firstwrite) {
3288
3289 /* Cannot hold the spinlock over md_bitmap_startwrite, but must
3290 * ensure this stripe isn't added to a batch until we have added
3291 * to the bitmap and set bm_seq, so set STRIPE_BITMAP_PENDING to
3292 * prevent batching.
3293 * If multiple add_stripe_bio() calls race here they must all set
3294 * STRIPE_BITMAP_PENDING, so only the first one to complete the
3295 * bitmap_startwrite gets to set STRIPE_BIT_DELAY. This matters
3296 * because once a stripe is added to a batch, STRIPE_BIT_DELAY
3297 * cannot be changed any more.
3298 */
3299
3300 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3301 spin_unlock_irq(&sh->stripe_lock);
3302 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3303 STRIPE_SECTORS, 0);
3304 spin_lock_irq(&sh->stripe_lock);
3305 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3306 if (!sh->batch_head) {
3307 sh->bm_seq = conf->seq_flush+1;
3308 set_bit(STRIPE_BIT_DELAY, &sh->state);
3309 }
3310 }
3311 spin_unlock_irq(&sh->stripe_lock);
3312
3313 if (stripe_can_batch(sh))
3314 stripe_add_to_batch_list(conf, sh);
3315 return 1;
3316
3317 overlap:
3318 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3319 spin_unlock_irq(&sh->stripe_lock);
3320 return 0;
3321}
3322
3323static void end_reshape(struct r5conf *conf);
3324
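/*
 * stripe_set_idx - initialize pd_idx/qd_idx (and ddf_layout) in *sh for the
 * given stripe number by mapping it back through raid5_compute_sector().
 */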
3325static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3326 struct stripe_head *sh)
3327{
3328 int sectors_per_chunk =
3329 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3330 int dd_idx;
3331 int chunk_offset = sector_div(stripe, sectors_per_chunk);
3332 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3333
3334 raid5_compute_sector(conf,
3335 stripe * (disks - conf->max_degraded)
3336 *sectors_per_chunk + chunk_offset,
3337 previous,
3338 &dd_idx, sh);
3339}
3340
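/*
 * handle_failed_stripe - the stripe has more failed devices than can be
 * tolerated: record bad blocks or fail the rdev for read errors, then error
 * out all pending writes (and, where the data is unreachable, pending reads)
 * and clear any stray R5_LOCKED bits.
 */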
3341static void
3342handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3343 struct stripe_head_state *s, int disks)
3344{
3345 int i;
3346 BUG_ON(sh->batch_head);
3347 for (i = disks; i--; ) {
3348 struct bio *bi;
3349 int bitmap_end = 0;
3350
3351 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3352 struct md_rdev *rdev;
3353 rcu_read_lock();
3354 rdev = rcu_dereference(conf->disks[i].rdev);
3355 if (rdev && test_bit(In_sync, &rdev->flags) &&
3356 !test_bit(Faulty, &rdev->flags))
3357 atomic_inc(&rdev->nr_pending);
3358 else
3359 rdev = NULL;
3360 rcu_read_unlock();
3361 if (rdev) {
3362 if (!rdev_set_badblocks(
3363 rdev,
3364 sh->sector,
3365 STRIPE_SECTORS, 0))
3366 md_error(conf->mddev, rdev);
3367 rdev_dec_pending(rdev, conf->mddev);
3368 }
3369 }
3370 spin_lock_irq(&sh->stripe_lock);
3371
3372 bi = sh->dev[i].towrite;
3373 sh->dev[i].towrite = NULL;
3374 sh->overwrite_disks = 0;
3375 spin_unlock_irq(&sh->stripe_lock);
3376 if (bi)
3377 bitmap_end = 1;
3378
3379 log_stripe_write_finished(sh);
3380
3381 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3382 wake_up(&conf->wait_for_overlap);
3383
3384 while (bi && bi->bi_iter.bi_sector <
3385 sh->dev[i].sector + STRIPE_SECTORS) {
3386 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3387
3388 md_write_end(conf->mddev);
3389 bio_io_error(bi);
3390 bi = nextbi;
3391 }
3392 if (bitmap_end)
3393 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3394 STRIPE_SECTORS, 0, 0);
3395 bitmap_end = 0;
3396
3397 bi = sh->dev[i].written;
3398 sh->dev[i].written = NULL;
3399 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3400 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3401 sh->dev[i].page = sh->dev[i].orig_page;
3402 }
3403
3404 if (bi) bitmap_end = 1;
3405 while (bi && bi->bi_iter.bi_sector <
3406 sh->dev[i].sector + STRIPE_SECTORS) {
3407 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3408
3409 md_write_end(conf->mddev);
3410 bio_io_error(bi);
3411 bi = bi2;
3412 }
3413
3414
3415
3416
3417 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3418 s->failed > conf->max_degraded &&
3419 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3420 test_bit(R5_ReadError, &sh->dev[i].flags))) {
3421 spin_lock_irq(&sh->stripe_lock);
3422 bi = sh->dev[i].toread;
3423 sh->dev[i].toread = NULL;
3424 spin_unlock_irq(&sh->stripe_lock);
3425 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3426 wake_up(&conf->wait_for_overlap);
3427 if (bi)
3428 s->to_read--;
3429 while (bi && bi->bi_iter.bi_sector <
3430 sh->dev[i].sector + STRIPE_SECTORS) {
3431 struct bio *nextbi =
3432 r5_next_bio(bi, sh->dev[i].sector);
3433
3434 bio_io_error(bi);
3435 bi = nextbi;
3436 }
3437 }
3438 if (bitmap_end)
3439 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3440 STRIPE_SECTORS, 0, 0);
3441
3442
3443
3444 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3445 }
3446 s->to_write = 0;
3447 s->written = 0;
3448
3449 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3450 if (atomic_dec_and_test(&conf->pending_full_writes))
3451 md_wakeup_thread(conf->mddev->thread);
3452}
3453
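/*
 * handle_failed_sync - a sync/recovery request for this stripe cannot be
 * completed. For recovery, record bad blocks on all non-in-sync devices or,
 * failing that, disable further recovery; then report the sectors as done
 * to md.
 */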
3454static void
3455handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3456 struct stripe_head_state *s)
3457{
3458 int abort = 0;
3459 int i;
3460
3461 BUG_ON(sh->batch_head);
3462 clear_bit(STRIPE_SYNCING, &sh->state);
3463 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3464 wake_up(&conf->wait_for_overlap);
3465 s->syncing = 0;
3466 s->replacing = 0;
3467
3468 /* There is nothing more to do for sync/check/repair; we don't
3469 * even need to abort, as that is handled elsewhere if needed
3470 * (and is not always wanted, e.g. on an error during repair).
3471 * For recover/replace we need to record a bad block on all
3472 * non-sync devices, or abort the recovery.
3473 */
3474 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3475 /* During recovery devices cannot be removed, so
3476 * locking and refcounting of rdevs is not needed
3477 */
3478 rcu_read_lock();
3479 for (i = 0; i < conf->raid_disks; i++) {
3480 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3481 if (rdev
3482 && !test_bit(Faulty, &rdev->flags)
3483 && !test_bit(In_sync, &rdev->flags)
3484 && !rdev_set_badblocks(rdev, sh->sector,
3485 STRIPE_SECTORS, 0))
3486 abort = 1;
3487 rdev = rcu_dereference(conf->disks[i].replacement);
3488 if (rdev
3489 && !test_bit(Faulty, &rdev->flags)
3490 && !test_bit(In_sync, &rdev->flags)
3491 && !rdev_set_badblocks(rdev, sh->sector,
3492 STRIPE_SECTORS, 0))
3493 abort = 1;
3494 }
3495 rcu_read_unlock();
3496 if (abort)
3497 conf->recovery_disabled =
3498 conf->mddev->recovery_disabled;
3499 }
3500 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3501}
3502
3503static int want_replace(struct stripe_head *sh, int disk_idx)
3504{
3505 struct md_rdev *rdev;
3506 int rv = 0;
3507
3508 rcu_read_lock();
3509 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3510 if (rdev
3511 && !test_bit(Faulty, &rdev->flags)
3512 && !test_bit(In_sync, &rdev->flags)
3513 && (rdev->recovery_offset <= sh->sector
3514 || rdev->mddev->recovery_cp <= sh->sector))
3515 rv = 1;
3516 rcu_read_unlock();
3517 return rv;
3518}
3519
3520static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3521 int disk_idx, int disks)
3522{
3523 struct r5dev *dev = &sh->dev[disk_idx];
3524 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3525 &sh->dev[s->failed_num[1]] };
3526 int i;
3527
3528
3529 if (test_bit(R5_LOCKED, &dev->flags) ||
3530 test_bit(R5_UPTODATE, &dev->flags))
3531
3532
3533
3534 return 0;
3535
3536 if (dev->toread ||
3537 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3538
3539 return 1;
3540
3541 if (s->syncing || s->expanding ||
3542 (s->replacing && want_replace(sh, disk_idx)))
3543
3544
3545
3546 return 1;
3547
3548 if ((s->failed >= 1 && fdev[0]->toread) ||
3549 (s->failed >= 2 && fdev[1]->toread))
3550
3551
3552
3553 return 1;
3554
3555 /* Sometimes neither read-modify-write nor reconstruct-write
3556 * cycles can work. In those cases we read every block we
3557 * can. Then the parity-update is certain to have enough to
3558 * work with.
3559 * This can only be a problem when we need to write something,
3560 * and some device has failed. If either of those tests
3561 * fail we need look no further.
3562 */
3563 if (!s->failed || !s->to_write)
3564 return 0;
3565
3566 if (test_bit(R5_Insync, &dev->flags) &&
3567 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3568
3569
3570
3571
3572
3573 return 0;
3574
3575 for (i = 0; i < s->failed && i < 2; i++) {
3576 if (fdev[i]->towrite &&
3577 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3578 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3579
3580
3581
3582
3583
3584 return 1;
3585 }
3586
3587 /* If we are forced to do a reconstruct-write, either because
3588 * the current RAID6 implementation only supports that, or
3589 * because parity cannot be trusted and we are currently
3590 * recovering it, there is extra need to be careful.
3591 * If one of the devices that we would need to read, because
3592 * it is not being overwritten (and maybe not written at all)
3593 * is missing/faulty, then we need to read everything we can.
3594 */
3595 if (sh->raid_conf->level != 6 &&
3596 sh->sector < sh->raid_conf->mddev->recovery_cp)
3597
3598 return 0;
3599 for (i = 0; i < s->failed && i < 2; i++) {
3600 if (s->failed_num[i] != sh->pd_idx &&
3601 s->failed_num[i] != sh->qd_idx &&
3602 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3603 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3604 return 1;
3605 }
3606
3607 return 0;
3608}
3609
3610/*
3611 * fetch_block - checks the given member device to see if its data needs
3612 * to be read or computed to satisfy a request.
3613 * Returns 1 when no more member devices need to be checked, otherwise
3614 * returns 0 to tell the loop in handle_stripe_fill to continue.
3615 */
3616static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3617 int disk_idx, int disks)
3618{
3619 struct r5dev *dev = &sh->dev[disk_idx];
3620
3621
3622 if (need_this_block(sh, s, disk_idx, disks)) {
3623
3624
3625
3626 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3627 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3628 BUG_ON(sh->batch_head);
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639 if ((s->uptodate == disks - 1) &&
3640 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3641 (s->failed && (disk_idx == s->failed_num[0] ||
3642 disk_idx == s->failed_num[1])))) {
3643
3644
3645
3646 pr_debug("Computing stripe %llu block %d\n",
3647 (unsigned long long)sh->sector, disk_idx);
3648 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3649 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3650 set_bit(R5_Wantcompute, &dev->flags);
3651 sh->ops.target = disk_idx;
3652 sh->ops.target2 = -1;
3653 s->req_compute = 1;
3654
3655
3656
3657
3658
3659
3660 s->uptodate++;
3661 return 1;
3662 } else if (s->uptodate == disks-2 && s->failed >= 2) {
3663
3664
3665
3666 int other;
3667 for (other = disks; other--; ) {
3668 if (other == disk_idx)
3669 continue;
3670 if (!test_bit(R5_UPTODATE,
3671 &sh->dev[other].flags))
3672 break;
3673 }
3674 BUG_ON(other < 0);
3675 pr_debug("Computing stripe %llu blocks %d,%d\n",
3676 (unsigned long long)sh->sector,
3677 disk_idx, other);
3678 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3679 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3680 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3681 set_bit(R5_Wantcompute, &sh->dev[other].flags);
3682 sh->ops.target = disk_idx;
3683 sh->ops.target2 = other;
3684 s->uptodate += 2;
3685 s->req_compute = 1;
3686 return 1;
3687 } else if (test_bit(R5_Insync, &dev->flags)) {
3688 set_bit(R5_LOCKED, &dev->flags);
3689 set_bit(R5_Wantread, &dev->flags);
3690 s->locked++;
3691 pr_debug("Reading block %d (sync=%d)\n",
3692 disk_idx, s->syncing);
3693 }
3694 }
3695
3696 return 0;
3697}
3698
3699/*
3700 * handle_stripe_fill - read or compute data to satisfy pending requests.
3701 */
3702static void handle_stripe_fill(struct stripe_head *sh,
3703 struct stripe_head_state *s,
3704 int disks)
3705{
3706 int i;
3707
3708
3709
3710
3711
3712 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3713 !sh->reconstruct_state) {
3714 /*
3715 * For a degraded stripe with data in the journal, do not handle
3716 * read requests yet; instead, flush the stripe to the raid disks
3717 * first. This avoids handling the complex rmw of a write-back
3718 * cached stripe (prexor with orig_page and then xor with page)
3719 * in the read path.
3720 */
3721
3722 if (s->injournal && s->failed) {
3723 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3724 r5c_make_stripe_write_out(sh);
3725 goto out;
3726 }
3727
3728 for (i = disks; i--; )
3729 if (fetch_block(sh, s, i, disks))
3730 break;
3731 }
3732out:
3733 set_bit(STRIPE_HANDLE, &sh->state);
3734}
3735
3736static void break_stripe_batch_list(struct stripe_head *head_sh,
3737 unsigned long handle_flags);
3738
3739/* handle_stripe_clean_event: any written block on an uptodate or failed
3740 * drive can be returned. Note that if we 'wrote' to a failed drive, it
3741 * will be UPTODATE but never LOCKED, so we don't need to test 'failed'.
3742 */
3743static void handle_stripe_clean_event(struct r5conf *conf,
3744 struct stripe_head *sh, int disks)
3745{
3746 int i;
3747 struct r5dev *dev;
3748 int discard_pending = 0;
3749 struct stripe_head *head_sh = sh;
3750 bool do_endio = false;
3751
3752 for (i = disks; i--; )
3753 if (sh->dev[i].written) {
3754 dev = &sh->dev[i];
3755 if (!test_bit(R5_LOCKED, &dev->flags) &&
3756 (test_bit(R5_UPTODATE, &dev->flags) ||
3757 test_bit(R5_Discard, &dev->flags) ||
3758 test_bit(R5_SkipCopy, &dev->flags))) {
3759
3760 struct bio *wbi, *wbi2;
3761 pr_debug("Return write for disc %d\n", i);
3762 if (test_and_clear_bit(R5_Discard, &dev->flags))
3763 clear_bit(R5_UPTODATE, &dev->flags);
3764 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3765 WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3766 }
3767 do_endio = true;
3768
3769returnbi:
3770 dev->page = dev->orig_page;
3771 wbi = dev->written;
3772 dev->written = NULL;
3773 while (wbi && wbi->bi_iter.bi_sector <
3774 dev->sector + STRIPE_SECTORS) {
3775 wbi2 = r5_next_bio(wbi, dev->sector);
3776 md_write_end(conf->mddev);
3777 bio_endio(wbi);
3778 wbi = wbi2;
3779 }
3780 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3781 STRIPE_SECTORS,
3782 !test_bit(STRIPE_DEGRADED, &sh->state),
3783 0);
3784 if (head_sh->batch_head) {
3785 sh = list_first_entry(&sh->batch_list,
3786 struct stripe_head,
3787 batch_list);
3788 if (sh != head_sh) {
3789 dev = &sh->dev[i];
3790 goto returnbi;
3791 }
3792 }
3793 sh = head_sh;
3794 dev = &sh->dev[i];
3795 } else if (test_bit(R5_Discard, &dev->flags))
3796 discard_pending = 1;
3797 }
3798
3799 log_stripe_write_finished(sh);
3800
3801 if (!discard_pending &&
3802 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
3803 int hash;
3804 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
3805 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3806 if (sh->qd_idx >= 0) {
3807 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
3808 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
3809 }
3810
3811 clear_bit(STRIPE_DISCARD, &sh->state);
3812
3813
3814
3815
3816
3817unhash:
3818 hash = sh->hash_lock_index;
3819 spin_lock_irq(conf->hash_locks + hash);
3820 remove_hash(sh);
3821 spin_unlock_irq(conf->hash_locks + hash);
3822 if (head_sh->batch_head) {
3823 sh = list_first_entry(&sh->batch_list,
3824 struct stripe_head, batch_list);
3825 if (sh != head_sh)
3826 goto unhash;
3827 }
3828 sh = head_sh;
3829
3830 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3831 set_bit(STRIPE_HANDLE, &sh->state);
3832
3833 }
3834
3835 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3836 if (atomic_dec_and_test(&conf->pending_full_writes))
3837 md_wakeup_thread(conf->mddev->thread);
3838
3839 if (head_sh->batch_head && do_endio)
3840 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
3841}
3842
3843/*
3844 * For RMW in write back cache, we need an extra page in prexor to store
3845 * the old data. This page is stored in dev->orig_page.
3846 *
3847 * This function checks whether we have data for prexor. The exact logic
3848 * is:
3849 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
3850 */
3851static inline bool uptodate_for_rmw(struct r5dev *dev)
3852{
3853 return (test_bit(R5_UPTODATE, &dev->flags)) &&
3854 (!test_bit(R5_InJournal, &dev->flags) ||
3855 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
3856}
3857
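/*
 * handle_stripe_dirtying - decide between a read-modify-write and a
 * reconstruct-write for the pending writes on this stripe, schedule the
 * pre-reads the cheaper choice requires, and kick off the reconstruction
 * once everything needed is up to date.
 */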
3858static int handle_stripe_dirtying(struct r5conf *conf,
3859 struct stripe_head *sh,
3860 struct stripe_head_state *s,
3861 int disks)
3862{
3863 int rmw = 0, rcw = 0, i;
3864 sector_t recovery_cp = conf->mddev->recovery_cp;
3865
3866 /*
3867 * If the array requires reconstruct-write (rmw_level ==
3868 * PARITY_DISABLE_RMW, as RAID6 does here), or resync is running
3869 * or still needed for this stripe, parity on disk may be
3870 * inconsistent (e.g. after an unclean shutdown), so force
3871 * reconstruct-write to regenerate correct parity.
3872 */
3873 if (conf->rmw_level == PARITY_DISABLE_RMW ||
3874 (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3875 s->failed == 0)) {
3876
3877
3878
3879 rcw = 1; rmw = 2;
3880 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3881 conf->rmw_level, (unsigned long long)recovery_cp,
3882 (unsigned long long)sh->sector);
3883 } else for (i = disks; i--; ) {
3884
3885 struct r5dev *dev = &sh->dev[i];
3886 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3887 i == sh->pd_idx || i == sh->qd_idx ||
3888 test_bit(R5_InJournal, &dev->flags)) &&
3889 !test_bit(R5_LOCKED, &dev->flags) &&
3890 !(uptodate_for_rmw(dev) ||
3891 test_bit(R5_Wantcompute, &dev->flags))) {
3892 if (test_bit(R5_Insync, &dev->flags))
3893 rmw++;
3894 else
3895 rmw += 2*disks;
3896 }
3897
3898 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3899 i != sh->pd_idx && i != sh->qd_idx &&
3900 !test_bit(R5_LOCKED, &dev->flags) &&
3901 !(test_bit(R5_UPTODATE, &dev->flags) ||
3902 test_bit(R5_Wantcompute, &dev->flags))) {
3903 if (test_bit(R5_Insync, &dev->flags))
3904 rcw++;
3905 else
3906 rcw += 2*disks;
3907 }
3908 }
3909
3910 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
3911 (unsigned long long)sh->sector, sh->state, rmw, rcw);
3912 set_bit(STRIPE_HANDLE, &sh->state);
3913 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
3914
3915 if (conf->mddev->queue)
3916 blk_add_trace_msg(conf->mddev->queue,
3917 "raid5 rmw %llu %d",
3918 (unsigned long long)sh->sector, rmw);
3919 for (i = disks; i--; ) {
3920 struct r5dev *dev = &sh->dev[i];
3921 if (test_bit(R5_InJournal, &dev->flags) &&
3922 dev->page == dev->orig_page &&
3923 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
3924
3925 struct page *p = alloc_page(GFP_NOIO);
3926
3927 if (p) {
3928 dev->orig_page = p;
3929 continue;
3930 }
3931
3932
3933
3934
3935
3936 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
3937 &conf->cache_state)) {
3938 r5c_use_extra_page(sh);
3939 break;
3940 }
3941
3942
3943 set_bit(STRIPE_DELAYED, &sh->state);
3944 s->waiting_extra_page = 1;
3945 return -EAGAIN;
3946 }
3947 }
3948
3949 for (i = disks; i--; ) {
3950 struct r5dev *dev = &sh->dev[i];
3951 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
3952 i == sh->pd_idx || i == sh->qd_idx ||
3953 test_bit(R5_InJournal, &dev->flags)) &&
3954 !test_bit(R5_LOCKED, &dev->flags) &&
3955 !(uptodate_for_rmw(dev) ||
3956 test_bit(R5_Wantcompute, &dev->flags)) &&
3957 test_bit(R5_Insync, &dev->flags)) {
3958 if (test_bit(STRIPE_PREREAD_ACTIVE,
3959 &sh->state)) {
3960 pr_debug("Read_old block %d for r-m-w\n",
3961 i);
3962 set_bit(R5_LOCKED, &dev->flags);
3963 set_bit(R5_Wantread, &dev->flags);
3964 s->locked++;
3965 } else {
3966 set_bit(STRIPE_DELAYED, &sh->state);
3967 set_bit(STRIPE_HANDLE, &sh->state);
3968 }
3969 }
3970 }
3971 }
3972 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
3973
3974 int qread =0;
3975 rcw = 0;
3976 for (i = disks; i--; ) {
3977 struct r5dev *dev = &sh->dev[i];
3978 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3979 i != sh->pd_idx && i != sh->qd_idx &&
3980 !test_bit(R5_LOCKED, &dev->flags) &&
3981 !(test_bit(R5_UPTODATE, &dev->flags) ||
3982 test_bit(R5_Wantcompute, &dev->flags))) {
3983 rcw++;
3984 if (test_bit(R5_Insync, &dev->flags) &&
3985 test_bit(STRIPE_PREREAD_ACTIVE,
3986 &sh->state)) {
3987 pr_debug("Read_old block "
3988 "%d for Reconstruct\n", i);
3989 set_bit(R5_LOCKED, &dev->flags);
3990 set_bit(R5_Wantread, &dev->flags);
3991 s->locked++;
3992 qread++;
3993 } else {
3994 set_bit(STRIPE_DELAYED, &sh->state);
3995 set_bit(STRIPE_HANDLE, &sh->state);
3996 }
3997 }
3998 }
3999 if (rcw && conf->mddev->queue)
4000 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4001 (unsigned long long)sh->sector,
4002 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4003 }
4004
4005 if (rcw > disks && rmw > disks &&
4006 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4007 set_bit(STRIPE_DELAYED, &sh->state);
4008
4009 /* now if nothing is locked, and if we have enough data,
4010 * we can start a write request
4011 */
4012 /* since handle_stripe can be called at any time we need to handle the
4013 * case where a compute block operation has been submitted and then a
4014 * subsequent call wants to start a write request. raid_run_ops only
4015 * handles the case where compute block and reconstruct are requested
4016 * simultaneously. If this is not the case then new writes need to be
4017 * held off until the compute completes.
4018 */
4019 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4020 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4021 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4022 schedule_reconstruction(sh, s, rcw == 0, 0);
4023 return 0;
4024}
4025
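/*
 * handle_parity_checks5 - state machine for RAID4/5 'check'/'repair': run
 * the parity check, account any mismatch, and either recompute the parity
 * block or write back the repaired block as needed.
 */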
4026static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4027 struct stripe_head_state *s, int disks)
4028{
4029 struct r5dev *dev = NULL;
4030
4031 BUG_ON(sh->batch_head);
4032 set_bit(STRIPE_HANDLE, &sh->state);
4033
4034 switch (sh->check_state) {
4035 case check_state_idle:
4036
4037 if (s->failed == 0) {
4038 BUG_ON(s->uptodate != disks);
4039 sh->check_state = check_state_run;
4040 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4041 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4042 s->uptodate--;
4043 break;
4044 }
4045 dev = &sh->dev[s->failed_num[0]];
4046 /* fall through */
4047 case check_state_compute_result:
4048 sh->check_state = check_state_idle;
4049 if (!dev)
4050 dev = &sh->dev[sh->pd_idx];
4051
4052
4053 if (test_bit(STRIPE_INSYNC, &sh->state))
4054 break;
4055
4056
4057 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4058 BUG_ON(s->uptodate != disks);
4059
4060 set_bit(R5_LOCKED, &dev->flags);
4061 s->locked++;
4062 set_bit(R5_Wantwrite, &dev->flags);
4063
4064 clear_bit(STRIPE_DEGRADED, &sh->state);
4065 set_bit(STRIPE_INSYNC, &sh->state);
4066 break;
4067 case check_state_run:
4068 break;
4069 case check_state_check_result:
4070 sh->check_state = check_state_idle;
4071
4072
4073
4074
4075 if (s->failed)
4076 break;
4077
4078
4079
4080
4081
4082 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4083
4084
4085
4086 set_bit(STRIPE_INSYNC, &sh->state);
4087 else {
4088 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4089 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4090
4091 set_bit(STRIPE_INSYNC, &sh->state);
4092 pr_warn_ratelimited("%s: mismatch sector in range "
4093 "%llu-%llu\n", mdname(conf->mddev),
4094 (unsigned long long) sh->sector,
4095 (unsigned long long) sh->sector +
4096 STRIPE_SECTORS);
4097 } else {
4098 sh->check_state = check_state_compute_run;
4099 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4100 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4101 set_bit(R5_Wantcompute,
4102 &sh->dev[sh->pd_idx].flags);
4103 sh->ops.target = sh->pd_idx;
4104 sh->ops.target2 = -1;
4105 s->uptodate++;
4106 }
4107 }
4108 break;
4109 case check_state_compute_run:
4110 break;
4111 default:
4112 pr_err("%s: unknown check_state: %d sector: %llu\n",
4113 __func__, sh->check_state,
4114 (unsigned long long) sh->sector);
4115 BUG();
4116 }
4117}
4118
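/*
 * handle_parity_checks6 - as handle_parity_checks5, but for RAID6 where P
 * and/or Q may need to be checked and repaired, possibly while one or two
 * devices have failed.
 */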
4119static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4120 struct stripe_head_state *s,
4121 int disks)
4122{
4123 int pd_idx = sh->pd_idx;
4124 int qd_idx = sh->qd_idx;
4125 struct r5dev *dev;
4126
4127 BUG_ON(sh->batch_head);
4128 set_bit(STRIPE_HANDLE, &sh->state);
4129
4130 BUG_ON(s->failed > 2);
4131
4132
4133 /* Want to check and possibly repair P and Q.
4134 * However there could be one 'failed' device, in which
4135 * case we can only check one of them, possibly using the
4136 * other to generate missing data
4137 */
4138 switch (sh->check_state) {
4139 case check_state_idle:
4140
4141 if (s->failed == s->q_failed) {
4142
4143
4144
4145
4146 sh->check_state = check_state_run;
4147 }
4148 if (!s->q_failed && s->failed < 2) {
4149
4150
4151
4152 if (sh->check_state == check_state_run)
4153 sh->check_state = check_state_run_pq;
4154 else
4155 sh->check_state = check_state_run_q;
4156 }
4157
4158
4159 sh->ops.zero_sum_result = 0;
4160
4161 if (sh->check_state == check_state_run) {
4162
4163 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4164 s->uptodate--;
4165 }
4166 if (sh->check_state >= check_state_run &&
4167 sh->check_state <= check_state_run_pq) {
4168
4169
4170
4171 set_bit(STRIPE_OP_CHECK, &s->ops_request);
4172 break;
4173 }
4174
4175 /* we have 2-disk failure */
4176 BUG_ON(s->failed != 2);
4177 /* fall through */
4178 case check_state_compute_result:
4179 sh->check_state = check_state_idle;
4180
4181
4182 if (test_bit(STRIPE_INSYNC, &sh->state))
4183 break;
4184
4185
4186
4187
4188 BUG_ON(s->uptodate < disks - 1);
4189 if (s->failed == 2) {
4190 dev = &sh->dev[s->failed_num[1]];
4191 s->locked++;
4192 set_bit(R5_LOCKED, &dev->flags);
4193 set_bit(R5_Wantwrite, &dev->flags);
4194 }
4195 if (s->failed >= 1) {
4196 dev = &sh->dev[s->failed_num[0]];
4197 s->locked++;
4198 set_bit(R5_LOCKED, &dev->flags);
4199 set_bit(R5_Wantwrite, &dev->flags);
4200 }
4201 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4202 dev = &sh->dev[pd_idx];
4203 s->locked++;
4204 set_bit(R5_LOCKED, &dev->flags);
4205 set_bit(R5_Wantwrite, &dev->flags);
4206 }
4207 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4208 dev = &sh->dev[qd_idx];
4209 s->locked++;
4210 set_bit(R5_LOCKED, &dev->flags);
4211 set_bit(R5_Wantwrite, &dev->flags);
4212 }
4213 clear_bit(STRIPE_DEGRADED, &sh->state);
4214
4215 set_bit(STRIPE_INSYNC, &sh->state);
4216 break;
4217 case check_state_run:
4218 case check_state_run_q:
4219 case check_state_run_pq:
4220 break;
4221 case check_state_check_result:
4222 sh->check_state = check_state_idle;
4223
4224 /* handle a successful check operation, if parity is correct
4225 * we are done. Otherwise update the mismatch count and repair
4226 * parity if !MD_RECOVERY_CHECK
4227 */
4228 if (sh->ops.zero_sum_result == 0) {
4229
4230 if (!s->failed)
4231 set_bit(STRIPE_INSYNC, &sh->state);
4232 else {
4233
4234
4235
4236
4237 sh->check_state = check_state_compute_result;
4238
4239
4240
4241
4242
4243 }
4244 } else {
4245 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4246 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4247
4248 set_bit(STRIPE_INSYNC, &sh->state);
4249 pr_warn_ratelimited("%s: mismatch sector in range "
4250 "%llu-%llu\n", mdname(conf->mddev),
4251 (unsigned long long) sh->sector,
4252 (unsigned long long) sh->sector +
4253 STRIPE_SECTORS);
4254 } else {
4255 int *target = &sh->ops.target;
4256
4257 sh->ops.target = -1;
4258 sh->ops.target2 = -1;
4259 sh->check_state = check_state_compute_run;
4260 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4261 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4262 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4263 set_bit(R5_Wantcompute,
4264 &sh->dev[pd_idx].flags);
4265 *target = pd_idx;
4266 target = &sh->ops.target2;
4267 s->uptodate++;
4268 }
4269 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4270 set_bit(R5_Wantcompute,
4271 &sh->dev[qd_idx].flags);
4272 *target = qd_idx;
4273 s->uptodate++;
4274 }
4275 }
4276 }
4277 break;
4278 case check_state_compute_run:
4279 break;
4280 default:
4281 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4282 __func__, sh->check_state,
4283 (unsigned long long) sh->sector);
4284 BUG();
4285 }
4286}
4287
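/*
 * handle_stripe_expansion - copy blocks from this fully-read stripe into the
 * destination stripes of an in-progress reshape, marking destination stripes
 * ready once all their blocks have been expanded.
 */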
4288static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4289{
4290 int i;
4291
4292 /* We have read all the blocks in this stripe and now we need to
4293 * copy some of them into a target stripe for expand.
4294 */
4295 struct dma_async_tx_descriptor *tx = NULL;
4296 BUG_ON(sh->batch_head);
4297 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4298 for (i = 0; i < sh->disks; i++)
4299 if (i != sh->pd_idx && i != sh->qd_idx) {
4300 int dd_idx, j;
4301 struct stripe_head *sh2;
4302 struct async_submit_ctl submit;
4303
4304 sector_t bn = raid5_compute_blocknr(sh, i, 1);
4305 sector_t s = raid5_compute_sector(conf, bn, 0,
4306 &dd_idx, NULL);
4307 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4308 if (sh2 == NULL)
4309
4310
4311
4312
4313 continue;
4314 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4315 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4316
4317 raid5_release_stripe(sh2);
4318 continue;
4319 }
4320
4321
4322 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4323 tx = async_memcpy(sh2->dev[dd_idx].page,
4324 sh->dev[i].page, 0, 0, STRIPE_SIZE,
4325 &submit);
4326
4327 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4328 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4329 for (j = 0; j < conf->raid_disks; j++)
4330 if (j != sh2->pd_idx &&
4331 j != sh2->qd_idx &&
4332 !test_bit(R5_Expanded, &sh2->dev[j].flags))
4333 break;
4334 if (j == conf->raid_disks) {
4335 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4336 set_bit(STRIPE_HANDLE, &sh2->state);
4337 }
4338 raid5_release_stripe(sh2);
4339
4340 }
4341
4342 async_tx_quiesce(&tx);
4343}
4344
4345/*
4346 * handle_stripe - do things to a stripe.
4347 *
4348 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
4349 * state of various bits to see what needs to be done.
4350 * Possible results:
4351 *    return some read requests which now have data
4352 *    return some write requests which are safely on storage
4353 *    schedule a read on some buffers
4354 *    schedule a write of some buffers
4355 *    return confirmation of parity correctness
4356 *
4357 * analyse_stripe() fills in the stripe_head_state summary used above.
4358 */
4359static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4360{
4361 struct r5conf *conf = sh->raid_conf;
4362 int disks = sh->disks;
4363 struct r5dev *dev;
4364 int i;
4365 int do_recovery = 0;
4366
4367 memset(s, 0, sizeof(*s));
4368
4369 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4370 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4371 s->failed_num[0] = -1;
4372 s->failed_num[1] = -1;
4373 s->log_failed = r5l_log_disk_error(conf);
4374
4375
4376 rcu_read_lock();
4377 for (i=disks; i--; ) {
4378 struct md_rdev *rdev;
4379 sector_t first_bad;
4380 int bad_sectors;
4381 int is_bad = 0;
4382
4383 dev = &sh->dev[i];
4384
4385 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4386 i, dev->flags,
4387 dev->toread, dev->towrite, dev->written);
4388
4389
4390
4391
4392
4393 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4394 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4395 set_bit(R5_Wantfill, &dev->flags);
4396
4397
4398 if (test_bit(R5_LOCKED, &dev->flags))
4399 s->locked++;
4400 if (test_bit(R5_UPTODATE, &dev->flags))
4401 s->uptodate++;
4402 if (test_bit(R5_Wantcompute, &dev->flags)) {
4403 s->compute++;
4404 BUG_ON(s->compute > 2);
4405 }
4406
4407 if (test_bit(R5_Wantfill, &dev->flags))
4408 s->to_fill++;
4409 else if (dev->toread)
4410 s->to_read++;
4411 if (dev->towrite) {
4412 s->to_write++;
4413 if (!test_bit(R5_OVERWRITE, &dev->flags))
4414 s->non_overwrite++;
4415 }
4416 if (dev->written)
4417 s->written++;
4418
4419
4420
4421 rdev = rcu_dereference(conf->disks[i].replacement);
4422 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4423 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4424 !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4425 &first_bad, &bad_sectors))
4426 set_bit(R5_ReadRepl, &dev->flags);
4427 else {
4428 if (rdev && !test_bit(Faulty, &rdev->flags))
4429 set_bit(R5_NeedReplace, &dev->flags);
4430 else
4431 clear_bit(R5_NeedReplace, &dev->flags);
4432 rdev = rcu_dereference(conf->disks[i].rdev);
4433 clear_bit(R5_ReadRepl, &dev->flags);
4434 }
4435 if (rdev && test_bit(Faulty, &rdev->flags))
4436 rdev = NULL;
4437 if (rdev) {
4438 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4439 &first_bad, &bad_sectors);
4440 if (s->blocked_rdev == NULL
4441 && (test_bit(Blocked, &rdev->flags)
4442 || is_bad < 0)) {
4443 if (is_bad < 0)
4444 set_bit(BlockedBadBlocks,
4445 &rdev->flags);
4446 s->blocked_rdev = rdev;
4447 atomic_inc(&rdev->nr_pending);
4448 }
4449 }
4450 clear_bit(R5_Insync, &dev->flags);
4451 if (!rdev)
4452 ;
4453 else if (is_bad) {
4454 /* also not in-sync */
4455 if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4456 test_bit(R5_UPTODATE, &dev->flags)) {
4457 /* treat as in-sync, but with a read error
4458 * which we can now try to correct
4459 */
4460 set_bit(R5_Insync, &dev->flags);
4461 set_bit(R5_ReadError, &dev->flags);
4462 }
4463 } else if (test_bit(In_sync, &rdev->flags))
4464 set_bit(R5_Insync, &dev->flags);
4465 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4466 /* in sync if before recovery_offset */
4467 set_bit(R5_Insync, &dev->flags);
4468 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4469 test_bit(R5_Expanded, &dev->flags))
4470 /* If we've reshaped into here, we assume it is Insync.
4471 * We will shortly update recovery_offset to make
4472 * it official.
4473 */
4474 set_bit(R5_Insync, &dev->flags);
4475
4476 if (test_bit(R5_WriteError, &dev->flags)) {
4477 /* This flag does not apply to '.replacement',
4478 * only to .rdev, so make sure to check that. */
4479 struct md_rdev *rdev2 = rcu_dereference(
4480 conf->disks[i].rdev);
4481 if (rdev2 == rdev)
4482 clear_bit(R5_Insync, &dev->flags);
4483 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4484 s->handle_bad_blocks = 1;
4485 atomic_inc(&rdev2->nr_pending);
4486 } else
4487 clear_bit(R5_WriteError, &dev->flags);
4488 }
4489 if (test_bit(R5_MadeGood, &dev->flags)) {
4490 /* This flag does not apply to '.replacement',
4491 * only to .rdev, so make sure to check that. */
4492 struct md_rdev *rdev2 = rcu_dereference(
4493 conf->disks[i].rdev);
4494 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4495 s->handle_bad_blocks = 1;
4496 atomic_inc(&rdev2->nr_pending);
4497 } else
4498 clear_bit(R5_MadeGood, &dev->flags);
4499 }
4500 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4501 struct md_rdev *rdev2 = rcu_dereference(
4502 conf->disks[i].replacement);
4503 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4504 s->handle_bad_blocks = 1;
4505 atomic_inc(&rdev2->nr_pending);
4506 } else
4507 clear_bit(R5_MadeGoodRepl, &dev->flags);
4508 }
4509 if (!test_bit(R5_Insync, &dev->flags)) {
4510 /* The ReadError flag will just be confusing now */
4511 clear_bit(R5_ReadError, &dev->flags);
4512 clear_bit(R5_ReWrite, &dev->flags);
4513 }
4514 if (test_bit(R5_ReadError, &dev->flags))
4515 clear_bit(R5_Insync, &dev->flags);
4516 if (!test_bit(R5_Insync, &dev->flags)) {
4517 if (s->failed < 2)
4518 s->failed_num[s->failed] = i;
4519 s->failed++;
4520 if (rdev && !test_bit(Faulty, &rdev->flags))
4521 do_recovery = 1;
4522 else if (!rdev) {
4523 rdev = rcu_dereference(
4524 conf->disks[i].replacement);
4525 if (rdev && !test_bit(Faulty, &rdev->flags))
4526 do_recovery = 1;
4527 }
4528 }
4529
4530 if (test_bit(R5_InJournal, &dev->flags))
4531 s->injournal++;
4532 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4533 s->just_cached++;
4534 }
4535 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4536 /* If there is a failed device being replaced,
4537 *     we must be recovering.
4538 * else if we are after recovery_cp, we must be syncing
4539 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4540 * else we can only be replacing
4541 * sync and recovery both need to read all devices, and so
4542 * use the same flag.
4543 */
4544 if (do_recovery ||
4545 sh->sector >= conf->mddev->recovery_cp ||
4546 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4547 s->syncing = 1;
4548 else
4549 s->replacing = 1;
4550 }
4551 rcu_read_unlock();
4552}
4553
4554static int clear_batch_ready(struct stripe_head *sh)
4555{
4556 /* Return '1' if this is a member of batch, or
4557 * '0' if it is a lone stripe or a head which can now be
4558 * handled.
4559 */
4560 struct stripe_head *tmp;
4561 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4562 return (sh->batch_head && sh->batch_head != sh);
4563 spin_lock(&sh->stripe_lock);
4564 if (!sh->batch_head) {
4565 spin_unlock(&sh->stripe_lock);
4566 return 0;
4567 }
4568
4569 /*
4570 * this stripe could be added to a batch list before we check
4571 * BATCH_READY, so skip it
4572 */
4573 if (sh->batch_head != sh) {
4574 spin_unlock(&sh->stripe_lock);
4575 return 1;
4576 }
4577 spin_lock(&sh->batch_lock);
4578 list_for_each_entry(tmp, &sh->batch_list, batch_list)
4579 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4580 spin_unlock(&sh->batch_lock);
4581 spin_unlock(&sh->stripe_lock);
4582
4583 /*
4584 * BATCH_READY is cleared, so no new stripes can be added.
4585 * batch_list can be accessed without a lock.
4586 */
4587 return 0;
4588}
4589
4590static void break_stripe_batch_list(struct stripe_head *head_sh,
4591 unsigned long handle_flags)
4592{
4593 struct stripe_head *sh, *next;
4594 int i;
4595 int do_wakeup = 0;
4596
4597 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4598
4599 list_del_init(&sh->batch_list);
4600
4601 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4602 (1 << STRIPE_SYNCING) |
4603 (1 << STRIPE_REPLACED) |
4604 (1 << STRIPE_DELAYED) |
4605 (1 << STRIPE_BIT_DELAY) |
4606 (1 << STRIPE_FULL_WRITE) |
4607 (1 << STRIPE_BIOFILL_RUN) |
4608 (1 << STRIPE_COMPUTE_RUN) |
4609 (1 << STRIPE_OPS_REQ_PENDING) |
4610 (1 << STRIPE_DISCARD) |
4611 (1 << STRIPE_BATCH_READY) |
4612 (1 << STRIPE_BATCH_ERR) |
4613 (1 << STRIPE_BITMAP_PENDING)),
4614 "stripe state: %lx\n", sh->state);
4615 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4616 (1 << STRIPE_REPLACED)),
4617 "head stripe state: %lx\n", head_sh->state);
4618
4619 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4620 (1 << STRIPE_PREREAD_ACTIVE) |
4621 (1 << STRIPE_DEGRADED) |
4622 (1 << STRIPE_ON_UNPLUG_LIST)),
4623 head_sh->state & (1 << STRIPE_INSYNC));
4624
4625 sh->check_state = head_sh->check_state;
4626 sh->reconstruct_state = head_sh->reconstruct_state;
4627 spin_lock_irq(&sh->stripe_lock);
4628 sh->batch_head = NULL;
4629 spin_unlock_irq(&sh->stripe_lock);
4630 for (i = 0; i < sh->disks; i++) {
4631 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4632 do_wakeup = 1;
4633 sh->dev[i].flags = head_sh->dev[i].flags &
4634 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4635 }
4636 if (handle_flags == 0 ||
4637 sh->state & handle_flags)
4638 set_bit(STRIPE_HANDLE, &sh->state);
4639 raid5_release_stripe(sh);
4640 }
4641 spin_lock_irq(&head_sh->stripe_lock);
4642 head_sh->batch_head = NULL;
4643 spin_unlock_irq(&head_sh->stripe_lock);
4644 for (i = 0; i < head_sh->disks; i++)
4645 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4646 do_wakeup = 1;
4647 if (head_sh->state & handle_flags)
4648 set_bit(STRIPE_HANDLE, &head_sh->state);
4649
4650 if (do_wakeup)
4651 wake_up(&head_sh->raid_conf->wait_for_overlap);
4652}
4653
4654static void handle_stripe(struct stripe_head *sh)
4655{
4656 struct stripe_head_state s;
4657 struct r5conf *conf = sh->raid_conf;
4658 int i;
4659 int prexor;
4660 int disks = sh->disks;
4661 struct r5dev *pdev, *qdev;
4662
4663 clear_bit(STRIPE_HANDLE, &sh->state);
4664 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4665 /* already being handled, ensure it gets handled
4666 * again when current action finishes */
4667 set_bit(STRIPE_HANDLE, &sh->state);
4668 return;
4669 }
4670
4671 if (clear_batch_ready(sh)) {
4672 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4673 return;
4674 }
4675
4676 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4677 break_stripe_batch_list(sh, 0);
4678
4679 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4680 spin_lock(&sh->stripe_lock);
4681 /*
4682 * Cannot process 'sync' concurrently with 'discard'.
4683 * Flush data in r5cache before 'sync'.
4684 */
4685 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4686 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4687 !test_bit(STRIPE_DISCARD, &sh->state) &&
4688 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4689 set_bit(STRIPE_SYNCING, &sh->state);
4690 clear_bit(STRIPE_INSYNC, &sh->state);
4691 clear_bit(STRIPE_REPLACED, &sh->state);
4692 }
4693 spin_unlock(&sh->stripe_lock);
4694 }
4695 clear_bit(STRIPE_DELAYED, &sh->state);
4696
4697 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4698 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
4699 (unsigned long long)sh->sector, sh->state,
4700 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4701 sh->check_state, sh->reconstruct_state);
4702
4703 analyse_stripe(sh, &s);
4704
4705 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4706 goto finish;
4707
4708 if (s.handle_bad_blocks ||
4709 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4710 set_bit(STRIPE_HANDLE, &sh->state);
4711 goto finish;
4712 }
4713
4714 if (unlikely(s.blocked_rdev)) {
4715 if (s.syncing || s.expanding || s.expanded ||
4716 s.replacing || s.to_write || s.written) {
4717 set_bit(STRIPE_HANDLE, &sh->state);
4718 goto finish;
4719 }
4720 /* There is nothing for the blocked_rdev to block */
4721 rdev_dec_pending(s.blocked_rdev, conf->mddev);
4722 s.blocked_rdev = NULL;
4723 }
4724
4725 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4726 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4727 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4728 }
4729
4730 pr_debug("locked=%d uptodate=%d to_read=%d"
4731 " to_write=%d failed=%d failed_num=%d,%d\n",
4732 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4733 s.failed_num[0], s.failed_num[1]);
4734 /*
4735 * check if the array has lost more than max_degraded devices and,
4736 * if so, some requests might need to be failed.
4737 *
4738 * When the journal device has failed (log_failed), we will only process
4739 * the stripe if there is data that needs to be written to the raid disks.
4740 */
4741 if (s.failed > conf->max_degraded ||
4742 (s.log_failed && s.injournal == 0)) {
4743 sh->check_state = 0;
4744 sh->reconstruct_state = 0;
4745 break_stripe_batch_list(sh, 0);
4746 if (s.to_read+s.to_write+s.written)
4747 handle_failed_stripe(conf, sh, &s, disks);
4748 if (s.syncing + s.replacing)
4749 handle_failed_sync(conf, sh, &s);
4750 }
4751
4752 /* Now we check to see if any write operations have recently
4753 * completed
4754 */
4755 prexor = 0;
4756 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4757 prexor = 1;
4758 if (sh->reconstruct_state == reconstruct_state_drain_result ||
4759 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4760 sh->reconstruct_state = reconstruct_state_idle;
4761
4762 /* All the 'written' buffers and the parity block are ready to
4763 * be written back to disk
4764 */
4765 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4766 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4767 BUG_ON(sh->qd_idx >= 0 &&
4768 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4769 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4770 for (i = disks; i--; ) {
4771 struct r5dev *dev = &sh->dev[i];
4772 if (test_bit(R5_LOCKED, &dev->flags) &&
4773 (i == sh->pd_idx || i == sh->qd_idx ||
4774 dev->written || test_bit(R5_InJournal,
4775 &dev->flags))) {
4776 pr_debug("Writing block %d\n", i);
4777 set_bit(R5_Wantwrite, &dev->flags);
4778 if (prexor)
4779 continue;
4780 if (s.failed > 1)
4781 continue;
4782 if (!test_bit(R5_Insync, &dev->flags) ||
4783 ((i == sh->pd_idx || i == sh->qd_idx) &&
4784 s.failed == 0))
4785 set_bit(STRIPE_INSYNC, &sh->state);
4786 }
4787 }
4788 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4789 s.dec_preread_active = 1;
4790 }
4791
4792 /*
4793 * might be able to return some write requests if the parity blocks
4794 * are safe, or on a failed drive
4795 */
4796 pdev = &sh->dev[sh->pd_idx];
4797 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
4798 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
4799 qdev = &sh->dev[sh->qd_idx];
4800 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
4801 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
4802 || conf->level < 6;
4803
4804 if (s.written &&
4805 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
4806 && !test_bit(R5_LOCKED, &pdev->flags)
4807 && (test_bit(R5_UPTODATE, &pdev->flags) ||
4808 test_bit(R5_Discard, &pdev->flags))))) &&
4809 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4810 && !test_bit(R5_LOCKED, &qdev->flags)
4811 && (test_bit(R5_UPTODATE, &qdev->flags) ||
4812 test_bit(R5_Discard, &qdev->flags))))))
4813 handle_stripe_clean_event(conf, sh, disks);
4814
4815 if (s.just_cached)
4816 r5c_handle_cached_data_endio(conf, sh, disks);
4817 log_stripe_write_finished(sh);
4818
4819 /* Now we might consider reading some blocks, either to check/generate
4820 * parity, or to satisfy requests,
4821 * or to load a block that is being partially written.
4822 */
4823 if (s.to_read || s.non_overwrite
4824 || (conf->level == 6 && s.to_write && s.failed)
4825 || (s.syncing && (s.uptodate + s.compute < disks))
4826 || s.replacing
4827 || s.expanding)
4828 handle_stripe_fill(sh, &s, disks);
4829
4830 /*
4831 * When the stripe finishes a full journal write cycle (write to journal
4832 * and raid disk), this is the clean-up procedure so it is ready for
4833 * the next operation.
4834 */
4835 r5c_finish_stripe_write_out(conf, sh, &s);
4836
4837 /*
4838 * Now to consider new write requests, cache write back and what else,
4839 * if anything should be read.  We do not handle new writes when:
4840 * 1/ A 'write' operation (copy+xor) is already in flight.
4841 * 2/ A 'check' operation is in flight, as it may clobber the parity
4842 *    block.
4843 * 3/ A journal/cache log write for this stripe (sh->log_io) is still
4844 *    in flight.
4845 */
4846 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4847 if (!r5c_is_writeback(conf->log)) {
4848 if (s.to_write)
4849 handle_stripe_dirtying(conf, sh, &s, disks);
4850 } else {
4851 int ret = 0;
4852
4853 /* First, try to handle writes in the caching phase */
4854 if (s.to_write)
4855 ret = r5c_try_caching_write(conf, sh, &s,
4856 disks);
4857 /*
4858 * If caching phase failed: ret == -EAGAIN
4859 *    OR
4860 * stripe under reclaim: !caching && injournal
4861 *
4862 * fall back to handle_stripe_dirtying()
4863 */
4864 if (ret == -EAGAIN ||
4865 /* stripe under reclaim: !caching && injournal */
4866 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4867 s.injournal > 0)) {
4868 ret = handle_stripe_dirtying(conf, sh, &s,
4869 disks);
4870 if (ret == -EAGAIN)
4871 goto finish;
4872 }
4873 }
4874 }
4875
4876 /* maybe we need to check and possibly fix the parity for this stripe.
4877 * Any reads will already have been scheduled, so we just see if enough
4878 * data is available.  The parity check is held off while parity
4879 * dependent operations are in flight.
4880 */
4881 if (sh->check_state ||
4882 (s.syncing && s.locked == 0 &&
4883 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4884 !test_bit(STRIPE_INSYNC, &sh->state))) {
4885 if (conf->level == 6)
4886 handle_parity_checks6(conf, sh, &s, disks);
4887 else
4888 handle_parity_checks5(conf, sh, &s, disks);
4889 }
4890
4891 if ((s.replacing || s.syncing) && s.locked == 0
4892 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
4893 && !test_bit(STRIPE_REPLACED, &sh->state)) {
4894 /* Write out to replacement devices where possible */
4895 for (i = 0; i < conf->raid_disks; i++)
4896 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
4897 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
4898 set_bit(R5_WantReplace, &sh->dev[i].flags);
4899 set_bit(R5_LOCKED, &sh->dev[i].flags);
4900 s.locked++;
4901 }
4902 if (s.replacing)
4903 set_bit(STRIPE_INSYNC, &sh->state);
4904 set_bit(STRIPE_REPLACED, &sh->state);
4905 }
4906 if ((s.syncing || s.replacing) && s.locked == 0 &&
4907 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4908 test_bit(STRIPE_INSYNC, &sh->state)) {
4909 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4910 clear_bit(STRIPE_SYNCING, &sh->state);
4911 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
4912 wake_up(&conf->wait_for_overlap);
4913 }
4914
4915 /* If the failed drives are just a ReadError, then we might need
4916 * to progress the repair/check process
4917 */
4918 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
4919 for (i = 0; i < s.failed; i++) {
4920 struct r5dev *dev = &sh->dev[s.failed_num[i]];
4921 if (test_bit(R5_ReadError, &dev->flags)
4922 && !test_bit(R5_LOCKED, &dev->flags)
4923 && test_bit(R5_UPTODATE, &dev->flags)
4924 ) {
4925 if (!test_bit(R5_ReWrite, &dev->flags)) {
4926 set_bit(R5_Wantwrite, &dev->flags);
4927 set_bit(R5_ReWrite, &dev->flags);
4928 set_bit(R5_LOCKED, &dev->flags);
4929 s.locked++;
4930 } else {
4931 /* let's read it back */
4932 set_bit(R5_Wantread, &dev->flags);
4933 set_bit(R5_LOCKED, &dev->flags);
4934 s.locked++;
4935 }
4936 }
4937 }
4938
4939 /* Finish reconstruct operations initiated by the expansion process */
4940 if (sh->reconstruct_state == reconstruct_state_result) {
4941 struct stripe_head *sh_src
4942 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
4943 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
4944 /* sh cannot be written until sh_src has been read,
4945 * so arrange for sh to be delayed a little
4946 */
4947 set_bit(STRIPE_DELAYED, &sh->state);
4948 set_bit(STRIPE_HANDLE, &sh->state);
4949 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
4950 &sh_src->state))
4951 atomic_inc(&conf->preread_active_stripes);
4952 raid5_release_stripe(sh_src);
4953 goto finish;
4954 }
4955 if (sh_src)
4956 raid5_release_stripe(sh_src);
4957
4958 sh->reconstruct_state = reconstruct_state_idle;
4959 clear_bit(STRIPE_EXPANDING, &sh->state);
4960 for (i = conf->raid_disks; i--; ) {
4961 set_bit(R5_Wantwrite, &sh->dev[i].flags);
4962 set_bit(R5_LOCKED, &sh->dev[i].flags);
4963 s.locked++;
4964 }
4965 }
4966
4967 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
4968 !sh->reconstruct_state) {
4969 /* Need to write out all blocks after computing parity */
4970 sh->disks = conf->raid_disks;
4971 stripe_set_idx(sh->sector, conf, 0, sh);
4972 schedule_reconstruction(sh, &s, 1, 1);
4973 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
4974 clear_bit(STRIPE_EXPAND_READY, &sh->state);
4975 atomic_dec(&conf->reshape_stripes);
4976 wake_up(&conf->wait_for_overlap);
4977 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4978 }
4979
4980 if (s.expanding && s.locked == 0 &&
4981 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
4982 handle_stripe_expansion(conf, sh);
4983
4984finish:
4985 /* wait for this device to become unblocked */
4986 if (unlikely(s.blocked_rdev)) {
4987 if (conf->mddev->external)
4988 md_wait_for_blocked_rdev(s.blocked_rdev,
4989 conf->mddev);
4990 else
4991 /* Internal metadata will immediately
4992 * be written by raid5d, so we don't
4993 * need to wait here.
4994 */
4995 rdev_dec_pending(s.blocked_rdev,
4996 conf->mddev);
4997 }
4998
4999 if (s.handle_bad_blocks)
5000 for (i = disks; i--; ) {
5001 struct md_rdev *rdev;
5002 struct r5dev *dev = &sh->dev[i];
5003 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5004 /* We own a safe reference to the rdev */
5005 rdev = conf->disks[i].rdev;
5006 if (!rdev_set_badblocks(rdev, sh->sector,
5007 STRIPE_SECTORS, 0))
5008 md_error(conf->mddev, rdev);
5009 rdev_dec_pending(rdev, conf->mddev);
5010 }
5011 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5012 rdev = conf->disks[i].rdev;
5013 rdev_clear_badblocks(rdev, sh->sector,
5014 STRIPE_SECTORS, 0);
5015 rdev_dec_pending(rdev, conf->mddev);
5016 }
5017 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5018 rdev = conf->disks[i].replacement;
5019 if (!rdev)
5020 /* rdev has been moved down */
5021 rdev = conf->disks[i].rdev;
5022 rdev_clear_badblocks(rdev, sh->sector,
5023 STRIPE_SECTORS, 0);
5024 rdev_dec_pending(rdev, conf->mddev);
5025 }
5026 }
5027
5028 if (s.ops_request)
5029 raid_run_ops(sh, s.ops_request);
5030
5031 ops_run_io(sh, &s);
5032
5033 if (s.dec_preread_active) {
5034 /* We delay this until after ops_run_io so that if make_request
5035 * is waiting on a flush, it won't continue until the writes
5036 * have actually been submitted.
5037 */
5038 atomic_dec(&conf->preread_active_stripes);
5039 if (atomic_read(&conf->preread_active_stripes) <
5040 IO_THRESHOLD)
5041 md_wakeup_thread(conf->mddev->thread);
5042 }
5043
5044 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5045}
5046
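/*
 * Move stripes that were marked STRIPE_DELAYED back onto the hold_list once
 * preread activity has dropped below IO_THRESHOLD, flagging each one
 * STRIPE_PREREAD_ACTIVE and waking a stripe worker thread for it.
 * Called with conf->device_lock held.
 */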
5047static void raid5_activate_delayed(struct r5conf *conf)
5048{
5049 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5050 while (!list_empty(&conf->delayed_list)) {
5051 struct list_head *l = conf->delayed_list.next;
5052 struct stripe_head *sh;
5053 sh = list_entry(l, struct stripe_head, lru);
5054 list_del_init(l);
5055 clear_bit(STRIPE_DELAYED, &sh->state);
5056 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5057 atomic_inc(&conf->preread_active_stripes);
5058 list_add_tail(&sh->lru, &conf->hold_list);
5059 raid5_wakeup_stripe_thread(sh);
5060 }
5061 }
5062}
5063
5064static void activate_bit_delay(struct r5conf *conf,
5065 struct list_head *temp_inactive_list)
5066{
5067 /* device_lock is held */
5068 struct list_head head;
5069 list_add(&head, &conf->bitmap_list);
5070 list_del_init(&conf->bitmap_list);
5071 while (!list_empty(&head)) {
5072 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5073 int hash;
5074 list_del_init(&sh->lru);
5075 atomic_inc(&sh->count);
5076 hash = sh->hash_lock_index;
5077 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5078 }
5079}
5080
5081static int raid5_congested(struct mddev *mddev, int bits)
5082{
5083 struct r5conf *conf = mddev->private;
5084
5085 /* No difference between reads and writes.  Just check
5086 * how busy the stripe_cache is
5087 */
5088
5089 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
5090 return 1;
5091
5092 /* Also checks whether there is pressure on r5cache log space */
5093 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
5094 return 1;
5095 if (conf->quiesce)
5096 return 1;
5097 if (atomic_read(&conf->empty_inactive_list_nr))
5098 return 1;
5099
5100 return 0;
5101}
5102
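/*
 * Return true when the whole bio lies inside a single chunk, using the
 * smaller of the current and previous chunk size so the test stays safe
 * while a reshape is in progress.
 */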
5103static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5104{
5105 struct r5conf *conf = mddev->private;
5106 sector_t sector = bio->bi_iter.bi_sector;
5107 unsigned int chunk_sectors;
5108 unsigned int bio_sectors = bio_sectors(bio);
5109
5110 WARN_ON_ONCE(bio->bi_partno);
5111
5112 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5113 return chunk_sectors >=
5114 ((sector & (chunk_sectors - 1)) + bio_sectors);
5115}
5116
5117/*
5118 * add bio to the retry LIFO (in O(1) ... we are in interrupt context),
5119 * later sampled by raid5d.
5120 */
5121static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
5122{
5123 unsigned long flags;
5124
5125 spin_lock_irqsave(&conf->device_lock, flags);
5126
5127 bi->bi_next = conf->retry_read_aligned_list;
5128 conf->retry_read_aligned_list = bi;
5129
5130 spin_unlock_irqrestore(&conf->device_lock, flags);
5131 md_wakeup_thread(conf->mddev->thread);
5132}
5133
5134static struct bio *remove_bio_from_retry(struct r5conf *conf,
5135 unsigned int *offset)
5136{
5137 struct bio *bi;
5138
5139 bi = conf->retry_read_aligned;
5140 if (bi) {
5141 *offset = conf->retry_read_offset;
5142 conf->retry_read_aligned = NULL;
5143 return bi;
5144 }
5145 bi = conf->retry_read_aligned_list;
5146 if (bi) {
5147 conf->retry_read_aligned_list = bi->bi_next;
5148 bi->bi_next = NULL;
5149 *offset = 0;
5150 }
5151
5152 return bi;
5153}
5154
5155/*
5156 * Completion handler for a chunk-aligned read that was sent directly to a
5157 * single member device, bypassing the stripe cache.  On success the
5158 * original bio is completed immediately; on error the original bio is
5159 * queued for a retry through the normal stripe-cache path.
5160 */
5161static void raid5_align_endio(struct bio *bi)
5162{
5163 struct bio* raid_bi = bi->bi_private;
5164 struct mddev *mddev;
5165 struct r5conf *conf;
5166 struct md_rdev *rdev;
5167 blk_status_t error = bi->bi_status;
5168
5169 bio_put(bi);
5170
5171 rdev = (void*)raid_bi->bi_next;
5172 raid_bi->bi_next = NULL;
5173 mddev = rdev->mddev;
5174 conf = mddev->private;
5175
5176 rdev_dec_pending(rdev, conf->mddev);
5177
5178 if (!error) {
5179 bio_endio(raid_bi);
5180 if (atomic_dec_and_test(&conf->active_aligned_reads))
5181 wake_up(&conf->wait_for_quiescent);
5182 return;
5183 }
5184
5185 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5186
5187 add_bio_to_retry(raid_bi, conf);
5188}
5189
5190static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5191{
5192 struct r5conf *conf = mddev->private;
5193 int dd_idx;
5194 struct bio* align_bi;
5195 struct md_rdev *rdev;
5196 sector_t end_sector;
5197
5198 if (!in_chunk_boundary(mddev, raid_bio)) {
5199 pr_debug("%s: non aligned\n", __func__);
5200 return 0;
5201 }
5202 /*
5203 * use bio_clone_fast to make a copy of the bio
5204 */
5205 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
5206 if (!align_bi)
5207 return 0;
5208
5209 /*
5210 * set bi_end_io to a new function, and set bi_private to the original bio
5211 */
5212 align_bi->bi_end_io = raid5_align_endio;
5213 align_bi->bi_private = raid_bio;
5214 /*
5215 * compute the position on the member device
5216 */
5217 align_bi->bi_iter.bi_sector =
5218 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
5219 0, &dd_idx, NULL);
5220
5221 end_sector = bio_end_sector(align_bi);
5222 rcu_read_lock();
5223 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5224 if (!rdev || test_bit(Faulty, &rdev->flags) ||
5225 rdev->recovery_offset < end_sector) {
5226 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5227 if (rdev &&
5228 (test_bit(Faulty, &rdev->flags) ||
5229 !(test_bit(In_sync, &rdev->flags) ||
5230 rdev->recovery_offset >= end_sector)))
5231 rdev = NULL;
5232 }
5233
5234 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
5235 rcu_read_unlock();
5236 bio_put(align_bi);
5237 return 0;
5238 }
5239
5240 if (rdev) {
5241 sector_t first_bad;
5242 int bad_sectors;
5243
5244 atomic_inc(&rdev->nr_pending);
5245 rcu_read_unlock();
5246 raid_bio->bi_next = (void*)rdev;
5247 bio_set_dev(align_bi, rdev->bdev);
5248 bio_clear_flag(align_bi, BIO_SEG_VALID);
5249
5250 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
5251 bio_sectors(align_bi),
5252 &first_bad, &bad_sectors)) {
5253 bio_put(align_bi);
5254 rdev_dec_pending(rdev, mddev);
5255 return 0;
5256 }
5257
5258 /* No reshape active, so we can trust rdev->data_offset */
5259 align_bi->bi_iter.bi_sector += rdev->data_offset;
5260
5261 spin_lock_irq(&conf->device_lock);
5262 wait_event_lock_irq(conf->wait_for_quiescent,
5263 conf->quiesce == 0,
5264 conf->device_lock);
5265 atomic_inc(&conf->active_aligned_reads);
5266 spin_unlock_irq(&conf->device_lock);
5267
5268 if (mddev->gendisk)
5269 trace_block_bio_remap(align_bi->bi_disk->queue,
5270 align_bi, disk_devt(mddev->gendisk),
5271 raid_bio->bi_iter.bi_sector);
5272 generic_make_request(align_bi);
5273 return 1;
5274 } else {
5275 rcu_read_unlock();
5276 bio_put(align_bi);
5277 return 0;
5278 }
5279}
5280
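/*
 * Split a read bio at the first chunk boundary, submit the tail through the
 * normal request path, and try to service the chunk-aligned head with a
 * single device read that bypasses the stripe cache.  Returns the head bio
 * if it still needs to be handled by the caller, or NULL if it was submitted
 * as an aligned read.
 */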
5281static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5282{
5283 struct bio *split;
5284 sector_t sector = raid_bio->bi_iter.bi_sector;
5285 unsigned chunk_sects = mddev->chunk_sectors;
5286 unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5287
5288 if (sectors < bio_sectors(raid_bio)) {
5289 struct r5conf *conf = mddev->private;
5290 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5291 bio_chain(split, raid_bio);
5292 generic_make_request(raid_bio);
5293 raid_bio = split;
5294 }
5295
5296 if (!raid5_read_one_chunk(mddev, raid_bio))
5297 return raid_bio;
5298
5299 return NULL;
5300}
5301
5302/* __get_priority_stripe - get the next stripe to process
5303 *
5304 * Stripes on the handle_list (or loprio_list) are preferred.  Full stripe
5305 * writes waiting on the hold_list may only be bypassed until bypass_count
5306 * exceeds bypass_threshold; the count grows while the same hold_list entry
5307 * keeps being passed over, is reset whenever the hold_list is empty, and is
5308 * reduced by bypass_threshold once a hold_list stripe is finally chosen.
5309 * When there are no pending full writes, or the threshold has been
5310 * exceeded, a stripe is taken from the hold_list instead.
5311 */
5312static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5313{
5314 struct stripe_head *sh, *tmp;
5315 struct list_head *handle_list = NULL;
5316 struct r5worker_group *wg;
5317 bool second_try = !r5c_is_writeback(conf->log) &&
5318 !r5l_log_disk_error(conf);
5319 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5320 r5l_log_disk_error(conf);
5321
5322again:
5323 wg = NULL;
5324 sh = NULL;
5325 if (conf->worker_cnt_per_group == 0) {
5326 handle_list = try_loprio ? &conf->loprio_list :
5327 &conf->handle_list;
5328 } else if (group != ANY_GROUP) {
5329 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5330 &conf->worker_groups[group].handle_list;
5331 wg = &conf->worker_groups[group];
5332 } else {
5333 int i;
5334 for (i = 0; i < conf->group_cnt; i++) {
5335 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5336 &conf->worker_groups[i].handle_list;
5337 wg = &conf->worker_groups[i];
5338 if (!list_empty(handle_list))
5339 break;
5340 }
5341 }
5342
5343 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5344 __func__,
5345 list_empty(handle_list) ? "empty" : "busy",
5346 list_empty(&conf->hold_list) ? "empty" : "busy",
5347 atomic_read(&conf->pending_full_writes), conf->bypass_count);
5348
5349 if (!list_empty(handle_list)) {
5350 sh = list_entry(handle_list->next, typeof(*sh), lru);
5351
5352 if (list_empty(&conf->hold_list))
5353 conf->bypass_count = 0;
5354 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5355 if (conf->hold_list.next == conf->last_hold)
5356 conf->bypass_count++;
5357 else {
5358 conf->last_hold = conf->hold_list.next;
5359 conf->bypass_count -= conf->bypass_threshold;
5360 if (conf->bypass_count < 0)
5361 conf->bypass_count = 0;
5362 }
5363 }
5364 } else if (!list_empty(&conf->hold_list) &&
5365 ((conf->bypass_threshold &&
5366 conf->bypass_count > conf->bypass_threshold) ||
5367 atomic_read(&conf->pending_full_writes) == 0)) {
5368
5369 list_for_each_entry(tmp, &conf->hold_list, lru) {
5370 if (conf->worker_cnt_per_group == 0 ||
5371 group == ANY_GROUP ||
5372 !cpu_online(tmp->cpu) ||
5373 cpu_to_group(tmp->cpu) == group) {
5374 sh = tmp;
5375 break;
5376 }
5377 }
5378
5379 if (sh) {
5380 conf->bypass_count -= conf->bypass_threshold;
5381 if (conf->bypass_count < 0)
5382 conf->bypass_count = 0;
5383 }
5384 wg = NULL;
5385 }
5386
5387 if (!sh) {
5388 if (second_try)
5389 return NULL;
5390 second_try = true;
5391 try_loprio = !try_loprio;
5392 goto again;
5393 }
5394
5395 if (wg) {
5396 wg->stripes_cnt--;
5397 sh->group = NULL;
5398 }
5399 list_del_init(&sh->lru);
5400 BUG_ON(atomic_inc_return(&sh->count) != 1);
5401 return sh;
5402}
5403
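/*
 * Per-plug context: stripes released while a blk_plug is active are collected
 * on cb->list and only pushed through __release_stripe() when the plug is
 * flushed in raid5_unplug(), so their state updates can be batched.
 */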
5404struct raid5_plug_cb {
5405 struct blk_plug_cb cb;
5406 struct list_head list;
5407 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5408};
5409
5410static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5411{
5412 struct raid5_plug_cb *cb = container_of(
5413 blk_cb, struct raid5_plug_cb, cb);
5414 struct stripe_head *sh;
5415 struct mddev *mddev = cb->cb.data;
5416 struct r5conf *conf = mddev->private;
5417 int cnt = 0;
5418 int hash;
5419
5420 if (cb->list.next && !list_empty(&cb->list)) {
5421 spin_lock_irq(&conf->device_lock);
5422 while (!list_empty(&cb->list)) {
5423 sh = list_first_entry(&cb->list, struct stripe_head, lru);
5424 list_del_init(&sh->lru);
5425 /*
5426 * avoid a race where release_stripe_plug() sees
5427 * STRIPE_ON_UNPLUG_LIST clear but the stripe
5428 * is still in our list
5429 */
5430 smp_mb__before_atomic();
5431 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5432 /*
5433 * STRIPE_ON_RELEASE_LIST could be set here. In that
5434 * case, the count is always > 1 here
5435 */
5436 hash = sh->hash_lock_index;
5437 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5438 cnt++;
5439 }
5440 spin_unlock_irq(&conf->device_lock);
5441 }
5442 release_inactive_stripe_list(conf, cb->temp_inactive_list,
5443 NR_STRIPE_HASH_LOCKS);
5444 if (mddev->queue)
5445 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5446 kfree(cb);
5447}
5448
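/*
 * Release a stripe through the current blk_plug callback when one can be
 * attached, so that several releases are batched until unplug; otherwise
 * fall back to an immediate raid5_release_stripe().
 */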
5449static void release_stripe_plug(struct mddev *mddev,
5450 struct stripe_head *sh)
5451{
5452 struct blk_plug_cb *blk_cb = blk_check_plugged(
5453 raid5_unplug, mddev,
5454 sizeof(struct raid5_plug_cb));
5455 struct raid5_plug_cb *cb;
5456
5457 if (!blk_cb) {
5458 raid5_release_stripe(sh);
5459 return;
5460 }
5461
5462 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5463
5464 if (cb->list.next == NULL) {
5465 int i;
5466 INIT_LIST_HEAD(&cb->list);
5467 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5468 INIT_LIST_HEAD(cb->temp_inactive_list + i);
5469 }
5470
5471 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5472 list_add_tail(&sh->lru, &cb->list);
5473 else
5474 raid5_release_stripe(sh);
5475}
5476
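/*
 * Handle a REQ_OP_DISCARD bio: round the range in to whole data stripes,
 * attach the bio as an overwrite of every data block of each stripe and let
 * handle_stripe() turn that into per-device discards.  Partial stripes at
 * either end of the range are skipped, as is the whole request during a
 * reshape.
 */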
5477static void make_discard_request(struct mddev *mddev, struct bio *bi)
5478{
5479 struct r5conf *conf = mddev->private;
5480 sector_t logical_sector, last_sector;
5481 struct stripe_head *sh;
5482 int stripe_sectors;
5483
5484 if (mddev->reshape_position != MaxSector)
5485 /* skip the discard while a reshape is happening */
5486 return;
5487
5488 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5489 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5490
5491 bi->bi_next = NULL;
5492
5493 stripe_sectors = conf->chunk_sectors *
5494 (conf->raid_disks - conf->max_degraded);
5495 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5496 stripe_sectors);
5497 sector_div(last_sector, stripe_sectors);
5498
5499 logical_sector *= conf->chunk_sectors;
5500 last_sector *= conf->chunk_sectors;
5501
5502 for (; logical_sector < last_sector;
5503 logical_sector += STRIPE_SECTORS) {
5504 DEFINE_WAIT(w);
5505 int d;
5506 again:
5507 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5508 prepare_to_wait(&conf->wait_for_overlap, &w,
5509 TASK_UNINTERRUPTIBLE);
5510 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5511 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5512 raid5_release_stripe(sh);
5513 schedule();
5514 goto again;
5515 }
5516 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5517 spin_lock_irq(&sh->stripe_lock);
5518 for (d = 0; d < conf->raid_disks; d++) {
5519 if (d == sh->pd_idx || d == sh->qd_idx)
5520 continue;
5521 if (sh->dev[d].towrite || sh->dev[d].toread) {
5522 set_bit(R5_Overlap, &sh->dev[d].flags);
5523 spin_unlock_irq(&sh->stripe_lock);
5524 raid5_release_stripe(sh);
5525 schedule();
5526 goto again;
5527 }
5528 }
5529 set_bit(STRIPE_DISCARD, &sh->state);
5530 finish_wait(&conf->wait_for_overlap, &w);
5531 sh->overwrite_disks = 0;
5532 for (d = 0; d < conf->raid_disks; d++) {
5533 if (d == sh->pd_idx || d == sh->qd_idx)
5534 continue;
5535 sh->dev[d].towrite = bi;
5536 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5537 bio_inc_remaining(bi);
5538 md_write_inc(mddev, bi);
5539 sh->overwrite_disks++;
5540 }
5541 spin_unlock_irq(&sh->stripe_lock);
5542 if (conf->mddev->bitmap) {
5543 for (d = 0;
5544 d < conf->raid_disks - conf->max_degraded;
5545 d++)
5546 md_bitmap_startwrite(mddev->bitmap,
5547 sh->sector,
5548 STRIPE_SECTORS,
5549 0);
5550 sh->bm_seq = conf->seq_flush + 1;
5551 set_bit(STRIPE_BIT_DELAY, &sh->state);
5552 }
5553
5554 set_bit(STRIPE_HANDLE, &sh->state);
5555 clear_bit(STRIPE_DELAYED, &sh->state);
5556 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5557 atomic_inc(&conf->preread_active_stripes);
5558 release_stripe_plug(mddev, sh);
5559 }
5560
5561 bio_endio(bi);
5562}
5563
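/*
 * Main entry point for incoming bios.  Reads on a non-degraded array are
 * first offered to the chunk-aligned fast path; everything else is broken
 * into STRIPE_SECTORS-sized pieces, attached to the matching stripe_heads
 * and left for raid5d/worker threads to process via handle_stripe().
 */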
5564static bool raid5_make_request(struct mddev *mddev, struct bio *bi)
5565{
5566 struct r5conf *conf = mddev->private;
5567 int dd_idx;
5568 sector_t new_sector;
5569 sector_t logical_sector, last_sector;
5570 struct stripe_head *sh;
5571 const int rw = bio_data_dir(bi);
5572 DEFINE_WAIT(w);
5573 bool do_prepare;
5574 bool do_flush = false;
5575
5576 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5577 int ret = log_handle_flush_request(conf, bi);
5578
5579 if (ret == 0)
5580 return true;
5581 if (ret == -ENODEV) {
5582 md_flush_request(mddev, bi);
5583 return true;
5584 }
5585 /* ret == -EAGAIN, fallback */
5586 /*
5587 * if log_handle_flush_request() didn't clear REQ_PREFLUSH,
5588 * we need to flush the journal device
5589 */
5590 do_flush = bi->bi_opf & REQ_PREFLUSH;
5591 }
5592
5593 if (!md_write_start(mddev, bi))
5594 return false;
5595 /*
5596 * If the array is degraded, better not do a chunk aligned read because
5597 * later we might have to read it again in order to reconstruct
5598 * data on failed drives.
5599 */
5600 if (rw == READ && mddev->degraded == 0 &&
5601 mddev->reshape_position == MaxSector) {
5602 bi = chunk_aligned_read(mddev, bi);
5603 if (!bi)
5604 return true;
5605 }
5606
5607 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5608 make_discard_request(mddev, bi);
5609 md_write_end(mddev);
5610 return true;
5611 }
5612
5613 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5614 last_sector = bio_end_sector(bi);
5615 bi->bi_next = NULL;
5616
5617 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5618 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5619 int previous;
5620 int seq;
5621
5622 do_prepare = false;
5623 retry:
5624 seq = read_seqcount_begin(&conf->gen_lock);
5625 previous = 0;
5626 if (do_prepare)
5627 prepare_to_wait(&conf->wait_for_overlap, &w,
5628 TASK_UNINTERRUPTIBLE);
5629 if (unlikely(conf->reshape_progress != MaxSector)) {
5630 /* spinlock is needed as reshape_progress may be
5631 * 64bit on a 32bit platform, and so it might be
5632 * possible to see a half-updated value.
5633 * Of course reshape_progress could change after
5634 * the lock is dropped, so once we get a reference
5635 * to the stripe that we think it is, we will have
5636 * to check again.
5637 */
5638 spin_lock_irq(&conf->device_lock);
5639 if (mddev->reshape_backwards
5640 ? logical_sector < conf->reshape_progress
5641 : logical_sector >= conf->reshape_progress) {
5642 previous = 1;
5643 } else {
5644 if (mddev->reshape_backwards
5645 ? logical_sector < conf->reshape_safe
5646 : logical_sector >= conf->reshape_safe) {
5647 spin_unlock_irq(&conf->device_lock);
5648 schedule();
5649 do_prepare = true;
5650 goto retry;
5651 }
5652 }
5653 spin_unlock_irq(&conf->device_lock);
5654 }
5655
5656 new_sector = raid5_compute_sector(conf, logical_sector,
5657 previous,
5658 &dd_idx, NULL);
5659 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5660 (unsigned long long)new_sector,
5661 (unsigned long long)logical_sector);
5662
5663 sh = raid5_get_active_stripe(conf, new_sector, previous,
5664 (bi->bi_opf & REQ_RAHEAD), 0);
5665 if (sh) {
5666 if (unlikely(previous)) {
5667 /* expansion might have moved on while waiting for a
5668 * stripe, so we must do the range check again.
5669 * Expansion could still move past after this
5670 * test, but as we are holding a reference to
5671 * 'sh', we know that if that happens,
5672 * STRIPE_EXPANDING will get set and the expansion
5673 * won't proceed until we finish with the stripe.
5674 */
5675 int must_retry = 0;
5676 spin_lock_irq(&conf->device_lock);
5677 if (mddev->reshape_backwards
5678 ? logical_sector >= conf->reshape_progress
5679 : logical_sector < conf->reshape_progress)
5680 /* mismatch, need to try again */
5681 must_retry = 1;
5682 spin_unlock_irq(&conf->device_lock);
5683 if (must_retry) {
5684 raid5_release_stripe(sh);
5685 schedule();
5686 do_prepare = true;
5687 goto retry;
5688 }
5689 }
5690 if (read_seqcount_retry(&conf->gen_lock, seq)) {
5691 /* Might have got the wrong stripe_head
5692 * by accident
5693 */
5694 raid5_release_stripe(sh);
5695 goto retry;
5696 }
5697
5698 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5699 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5700 /* Stripe is busy expanding or
5701 * the add failed due to overlap.  Flush everything
5702 * and wait a while.
5703 */
5704 md_wakeup_thread(mddev->thread);
5705 raid5_release_stripe(sh);
5706 schedule();
5707 do_prepare = true;
5708 goto retry;
5709 }
5710 if (do_flush) {
5711 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5712 /* we only need a flush for one stripe */
5713 do_flush = false;
5714 }
5715
5716 set_bit(STRIPE_HANDLE, &sh->state);
5717 clear_bit(STRIPE_DELAYED, &sh->state);
5718 if ((!sh->batch_head || sh == sh->batch_head) &&
5719 (bi->bi_opf & REQ_SYNC) &&
5720 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5721 atomic_inc(&conf->preread_active_stripes);
5722 release_stripe_plug(mddev, sh);
5723 } else {
5724 /* cannot get stripe for read-ahead, just give up */
5725 bi->bi_status = BLK_STS_IOERR;
5726 break;
5727 }
5728 }
5729 finish_wait(&conf->wait_for_overlap, &w);
5730
5731 if (rw == WRITE)
5732 md_write_end(mddev);
5733 bio_endio(bi);
5734 return true;
5735}
5736
5737static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5738
5739static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5740{
5741 /* reshaping is quite different to recovery/resync so it is
5742 * handled quite separately ... here.
5743 *
5744 * On each call to sync_request, we gather one chunk worth of
5745 * destination stripes and flag them as expanding.
5746 * Then we find all the source stripes and request reads.
5747 * As the reads complete, handle_stripe will copy the data
5748 * into the destination stripe and release that stripe.
5749 */
5750 struct r5conf *conf = mddev->private;
5751 struct stripe_head *sh;
5752 struct md_rdev *rdev;
5753 sector_t first_sector, last_sector;
5754 int raid_disks = conf->previous_raid_disks;
5755 int data_disks = raid_disks - conf->max_degraded;
5756 int new_data_disks = conf->raid_disks - conf->max_degraded;
5757 int i;
5758 int dd_idx;
5759 sector_t writepos, readpos, safepos;
5760 sector_t stripe_addr;
5761 int reshape_sectors;
5762 struct list_head stripes;
5763 sector_t retn;
5764
5765 if (sector_nr == 0) {
5766 /* If restarting in the middle, skip the initial sectors */
5767 if (mddev->reshape_backwards &&
5768 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5769 sector_nr = raid5_size(mddev, 0, 0)
5770 - conf->reshape_progress;
5771 } else if (mddev->reshape_backwards &&
5772 conf->reshape_progress == MaxSector) {
5773 /* shouldn't happen, but just in case, finish up. */
5774 sector_nr = MaxSector;
5775 } else if (!mddev->reshape_backwards &&
5776 conf->reshape_progress > 0)
5777 sector_nr = conf->reshape_progress;
5778 sector_div(sector_nr, new_data_disks);
5779 if (sector_nr) {
5780 mddev->curr_resync_completed = sector_nr;
5781 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5782 *skipped = 1;
5783 retn = sector_nr;
5784 goto finish;
5785 }
5786 }
5787
5788 /* We need to process a full chunk at a time.
5789 * If the old and new chunk sizes differ, we need to process the
5790 * largest of these.
5791 */
5792
5793 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5794
5795 /* We update the metadata at least every 10 seconds, or when
5796 * the data about to be copied would over-write the source of
5797 * the data at the front of the range.  i.e. one new_stripe
5798 * along from reshape_progress new_maps to after where
5799 * reshape_safe old_maps to.
5800 */
5801 writepos = conf->reshape_progress;
5802 sector_div(writepos, new_data_disks);
5803 readpos = conf->reshape_progress;
5804 sector_div(readpos, data_disks);
5805 safepos = conf->reshape_safe;
5806 sector_div(safepos, data_disks);
5807 if (mddev->reshape_backwards) {
5808 BUG_ON(writepos < reshape_sectors);
5809 writepos -= reshape_sectors;
5810 readpos += reshape_sectors;
5811 safepos += reshape_sectors;
5812 } else {
5813 writepos += reshape_sectors;
5814 /* readpos and safepos are worst-case calculations.
5815 * A negative number is overly pessimistic, and causes
5816 * obvious problems for unsigned storage.  So clip to 0.
5817 */
5818 readpos -= min_t(sector_t, reshape_sectors, readpos);
5819 safepos -= min_t(sector_t, reshape_sectors, safepos);
5820 }
5821
5822 /* Having calculated the 'writepos' possibly use it
5823 * to set 'stripe_addr' which is where we will write to.
5824 */
5825 if (mddev->reshape_backwards) {
5826 BUG_ON(conf->reshape_progress == 0);
5827 stripe_addr = writepos;
5828 BUG_ON((mddev->dev_sectors &
5829 ~((sector_t)reshape_sectors - 1))
5830 - reshape_sectors - stripe_addr
5831 != sector_nr);
5832 } else {
5833 BUG_ON(writepos != sector_nr + reshape_sectors);
5834 stripe_addr = sector_nr;
5835 }
5836 /* 'writepos' is the most advanced device address we might write.
5837 * 'readpos' is the least advanced device address we might read.
5838 * 'safepos' is the least address recorded in the metadata as having
5839 *     been reshaped.
5840 * If there is a min_offset_diff, these are adjusted either by
5841 * increasing the safepos/readpos if diff is negative, or
5842 * increasing writepos if diff is positive.
5843 * If 'readpos' is then behind 'writepos', there is no way that we can
5844 * ensure safety in the face of a crash - that must be done by userspace
5845 * making a backup of the data.  So in that case there is no particular
5846 * rush to update metadata.
5847 * Otherwise if 'safepos' is behind 'writepos', then we really need to
5848 * update the metadata to advance 'safepos' to match 'writepos' before
5849 * writing any more, so update metadata if needed.
5850 * We also update the metadata if more than 10 seconds have passed since
5851 * the last checkpoint: wait for in-flight reshape stripes to drain,
5852 * record the new reshape_position in the superblock, and only then let
5853 * the reshape continue.
5854 */
5855
5856
5857 if (conf->min_offset_diff < 0) {
5858 safepos += -conf->min_offset_diff;
5859 readpos += -conf->min_offset_diff;
5860 } else
5861 writepos += conf->min_offset_diff;
5862
5863 if ((mddev->reshape_backwards
5864 ? (safepos > writepos && readpos < writepos)
5865 : (safepos < writepos && readpos > writepos)) ||
5866 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
5867 /* Cannot proceed until we've updated the superblock... */
5868 wait_event(conf->wait_for_overlap,
5869 atomic_read(&conf->reshape_stripes)==0
5870 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5871 if (atomic_read(&conf->reshape_stripes) != 0)
5872 return 0;
5873 mddev->reshape_position = conf->reshape_progress;
5874 mddev->curr_resync_completed = sector_nr;
5875 if (!mddev->reshape_backwards)
5876 /* Can update recovery_offset */
5877 rdev_for_each(rdev, mddev)
5878 if (rdev->raid_disk >= 0 &&
5879 !test_bit(Journal, &rdev->flags) &&
5880 !test_bit(In_sync, &rdev->flags) &&
5881 rdev->recovery_offset < sector_nr)
5882 rdev->recovery_offset = sector_nr;
5883
5884 conf->reshape_checkpoint = jiffies;
5885 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5886 md_wakeup_thread(mddev->thread);
5887 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
5888 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5889 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5890 return 0;
5891 spin_lock_irq(&conf->device_lock);
5892 conf->reshape_safe = mddev->reshape_position;
5893 spin_unlock_irq(&conf->device_lock);
5894 wake_up(&conf->wait_for_overlap);
5895 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5896 }
5897
5898 INIT_LIST_HEAD(&stripes);
5899 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
5900 int j;
5901 int skipped_disk = 0;
5902 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
5903 set_bit(STRIPE_EXPANDING, &sh->state);
5904 atomic_inc(&conf->reshape_stripes);
5905 /* If any of this stripe is beyond the end of the old
5906 * array, then we need to zero those blocks
5907 */
5908 for (j=sh->disks; j--;) {
5909 sector_t s;
5910 if (j == sh->pd_idx)
5911 continue;
5912 if (conf->level == 6 &&
5913 j == sh->qd_idx)
5914 continue;
5915 s = raid5_compute_blocknr(sh, j, 0);
5916 if (s < raid5_size(mddev, 0, 0)) {
5917 skipped_disk = 1;
5918 continue;
5919 }
5920 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
5921 set_bit(R5_Expanded, &sh->dev[j].flags);
5922 set_bit(R5_UPTODATE, &sh->dev[j].flags);
5923 }
5924 if (!skipped_disk) {
5925 set_bit(STRIPE_EXPAND_READY, &sh->state);
5926 set_bit(STRIPE_HANDLE, &sh->state);
5927 }
5928 list_add(&sh->lru, &stripes);
5929 }
5930 spin_lock_irq(&conf->device_lock);
5931 if (mddev->reshape_backwards)
5932 conf->reshape_progress -= reshape_sectors * new_data_disks;
5933 else
5934 conf->reshape_progress += reshape_sectors * new_data_disks;
5935 spin_unlock_irq(&conf->device_lock);
5936 /* Ok, those stripes are ready. We can start scheduling
5937 * reads on the source stripes.
5938 * The source stripes are determined by mapping the first and last
5939 * block on the destination stripes.
5940 */
5941 first_sector =
5942 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
5943 1, &dd_idx, NULL);
5944 last_sector =
5945 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
5946 * new_data_disks - 1),
5947 1, &dd_idx, NULL);
5948 if (last_sector >= mddev->dev_sectors)
5949 last_sector = mddev->dev_sectors - 1;
5950 while (first_sector <= last_sector) {
5951 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
5952 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
5953 set_bit(STRIPE_HANDLE, &sh->state);
5954 raid5_release_stripe(sh);
5955 first_sector += STRIPE_SECTORS;
5956 }
5957 /* Now that the sources are clearly marked, we can release
5958 * the destination stripes
5959 */
5960 while (!list_empty(&stripes)) {
5961 sh = list_entry(stripes.next, struct stripe_head, lru);
5962 list_del_init(&sh->lru);
5963 raid5_release_stripe(sh);
5964 }
5965 /* If this takes us to the resync_max point where we have to pause,
5966 * then we need to write out the superblock.
5967 */
5968 sector_nr += reshape_sectors;
5969 retn = reshape_sectors;
5970finish:
5971 if (mddev->curr_resync_completed > mddev->resync_max ||
5972 (sector_nr - mddev->curr_resync_completed) * 2
5973 >= mddev->resync_max - mddev->curr_resync_completed) {
5974 /* Cannot proceed until we've updated the superblock... */
5975 wait_event(conf->wait_for_overlap,
5976 atomic_read(&conf->reshape_stripes) == 0
5977 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5978 if (atomic_read(&conf->reshape_stripes) != 0)
5979 goto ret;
5980 mddev->reshape_position = conf->reshape_progress;
5981 mddev->curr_resync_completed = sector_nr;
5982 if (!mddev->reshape_backwards)
5983 /* Can update recovery_offset */
5984 rdev_for_each(rdev, mddev)
5985 if (rdev->raid_disk >= 0 &&
5986 !test_bit(Journal, &rdev->flags) &&
5987 !test_bit(In_sync, &rdev->flags) &&
5988 rdev->recovery_offset < sector_nr)
5989 rdev->recovery_offset = sector_nr;
5990 conf->reshape_checkpoint = jiffies;
5991 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5992 md_wakeup_thread(mddev->thread);
5993 wait_event(mddev->sb_wait,
5994 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
5995 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5996 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5997 goto ret;
5998 spin_lock_irq(&conf->device_lock);
5999 conf->reshape_safe = mddev->reshape_position;
6000 spin_unlock_irq(&conf->device_lock);
6001 wake_up(&conf->wait_for_overlap);
6002 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6003 }
6004ret:
6005 return retn;
6006}
6007
6008static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6009 int *skipped)
6010{
6011 struct r5conf *conf = mddev->private;
6012 struct stripe_head *sh;
6013 sector_t max_sector = mddev->dev_sectors;
6014 sector_t sync_blocks;
6015 int still_degraded = 0;
6016 int i;
6017
6018 if (sector_nr >= max_sector) {
6019 /* just being told to finish up .. nothing much to do */
6020
6021 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6022 end_reshape(conf);
6023 return 0;
6024 }
6025
6026 if (mddev->curr_resync < max_sector)
6027 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6028 &sync_blocks, 1);
6029 else
6030 conf->fullsync = 0;
6031 md_bitmap_close_sync(mddev->bitmap);
6032
6033 return 0;
6034 }
6035
6036 /* Allow raid5_quiesce to complete */
6037 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6038
6039 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6040 return reshape_request(mddev, sector_nr, skipped);
6041
6042 /* No need to check resync_max as we never do more than one
6043 * stripe, and as resync_max will always be on a chunk boundary,
6044 * if the check in md_do_sync didn't fire, there is no chance
6045 * of overstepping resync_max here.
6046 */
6047
6048 /* if there are too many failed drives and we are trying
6049 * to resync, then assert that we are finished, because there is
6050 * nothing we can do.
6051 */
6052 if (mddev->degraded >= conf->max_degraded &&
6053 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6054 sector_t rv = mddev->dev_sectors - sector_nr;
6055 *skipped = 1;
6056 return rv;
6057 }
6058 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6059 !conf->fullsync &&
6060 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6061 sync_blocks >= STRIPE_SECTORS) {
6062 /* we can skip this block, and probably more */
6063 sync_blocks /= STRIPE_SECTORS;
6064 *skipped = 1;
6065 return sync_blocks * STRIPE_SECTORS;
6066 }
6067
6068 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6069
6070 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
6071 if (sh == NULL) {
6072 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
6073 /* make sure we don't swamp the stripe cache if someone else
6074 * is trying to get access
6075 */
6076 schedule_timeout_uninterruptible(1);
6077 }
6078 /* Need to check if the array will still be degraded after recovery/resync.
6079 * Note that in case of > 1 drive failures it's possible we're rebuilding
6080 * one drive while leaving another faulty drive in the array.
6081 */
6082 rcu_read_lock();
6083 for (i = 0; i < conf->raid_disks; i++) {
6084 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
6085
6086 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6087 still_degraded = 1;
6088 }
6089 rcu_read_unlock();
6090
6091 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6092
6093 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6094 set_bit(STRIPE_HANDLE, &sh->state);
6095
6096 raid5_release_stripe(sh);
6097
6098 return STRIPE_SECTORS;
6099}
6100
6101static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6102 unsigned int offset)
6103{
6104 /* We may not be able to submit a whole bio at once as there
6105 * may not be enough stripe_heads available.
6106 * We cannot pre-allocate enough stripe_heads as we may need
6107 * more than exist in the cache (if we allow ever larger chunks).
6108 * So we do one stripe head at a time and record the progress in
6109 * conf->retry_read_offset so a retry can resume where it left off.
6110 *
6111 * We *know* that this entire raid_bio is in one chunk, so
6112 * it will be only one 'dd_idx' and only need one call to
6113 * raid5_compute_sector. */
6114 struct stripe_head *sh;
6115 int dd_idx;
6116 sector_t sector, logical_sector, last_sector;
6117 int scnt = 0;
6118 int handled = 0;
6119
6120 logical_sector = raid_bio->bi_iter.bi_sector &
6121 ~((sector_t)STRIPE_SECTORS-1);
6122 sector = raid5_compute_sector(conf, logical_sector,
6123 0, &dd_idx, NULL);
6124 last_sector = bio_end_sector(raid_bio);
6125
6126 for (; logical_sector < last_sector;
6127 logical_sector += STRIPE_SECTORS,
6128 sector += STRIPE_SECTORS,
6129 scnt++) {
6130
6131 if (scnt < offset)
6132 /* already done this stripe */
6133 continue;
6134
6135 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
6136
6137 if (!sh) {
6138 /* failed to get a stripe - must wait */
6139 conf->retry_read_aligned = raid_bio;
6140 conf->retry_read_offset = scnt;
6141 return handled;
6142 }
6143
6144 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6145 raid5_release_stripe(sh);
6146 conf->retry_read_aligned = raid_bio;
6147 conf->retry_read_offset = scnt;
6148 return handled;
6149 }
6150
6151 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6152 handle_stripe(sh);
6153 raid5_release_stripe(sh);
6154 handled++;
6155 }
6156
6157 bio_endio(raid_bio);
6158
6159 if (atomic_dec_and_test(&conf->active_aligned_reads))
6160 wake_up(&conf->wait_for_quiescent);
6161 return handled;
6162}
6163
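/*
 * Pull up to MAX_STRIPE_BATCH stripes off the priority lists and handle them.
 * Called with conf->device_lock held; the lock is dropped while the batch is
 * being handled and re-taken before returning the number of stripes
 * processed.
 */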
6164static int handle_active_stripes(struct r5conf *conf, int group,
6165 struct r5worker *worker,
6166 struct list_head *temp_inactive_list)
6167{
6168 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6169 int i, batch_size = 0, hash;
6170 bool release_inactive = false;
6171
6172 while (batch_size < MAX_STRIPE_BATCH &&
6173 (sh = __get_priority_stripe(conf, group)) != NULL)
6174 batch[batch_size++] = sh;
6175
6176 if (batch_size == 0) {
6177 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6178 if (!list_empty(temp_inactive_list + i))
6179 break;
6180 if (i == NR_STRIPE_HASH_LOCKS) {
6181 spin_unlock_irq(&conf->device_lock);
6182 log_flush_stripe_to_raid(conf);
6183 spin_lock_irq(&conf->device_lock);
6184 return batch_size;
6185 }
6186 release_inactive = true;
6187 }
6188 spin_unlock_irq(&conf->device_lock);
6189
6190 release_inactive_stripe_list(conf, temp_inactive_list,
6191 NR_STRIPE_HASH_LOCKS);
6192
6193 r5l_flush_stripe_to_raid(conf->log);
6194 if (release_inactive) {
6195 spin_lock_irq(&conf->device_lock);
6196 return 0;
6197 }
6198
6199 for (i = 0; i < batch_size; i++)
6200 handle_stripe(batch[i]);
6201 log_write_stripe_run(conf);
6202
6203 cond_resched();
6204
6205 spin_lock_irq(&conf->device_lock);
6206 for (i = 0; i < batch_size; i++) {
6207 hash = batch[i]->hash_lock_index;
6208 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6209 }
6210 return batch_size;
6211}
6212
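/*
 * Work function of the per-group stripe worker threads: repeatedly release
 * stripes queued to this worker and handle batches of active stripes for its
 * group until neither produces any more work.
 */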
6213static void raid5_do_work(struct work_struct *work)
6214{
6215 struct r5worker *worker = container_of(work, struct r5worker, work);
6216 struct r5worker_group *group = worker->group;
6217 struct r5conf *conf = group->conf;
6218 struct mddev *mddev = conf->mddev;
6219 int group_id = group - conf->worker_groups;
6220 int handled;
6221 struct blk_plug plug;
6222
6223 pr_debug("+++ raid5worker active\n");
6224
6225 blk_start_plug(&plug);
6226 handled = 0;
6227 spin_lock_irq(&conf->device_lock);
6228 while (1) {
6229 int batch_size, released;
6230
6231 released = release_stripe_list(conf, worker->temp_inactive_list);
6232
6233 batch_size = handle_active_stripes(conf, group_id, worker,
6234 worker->temp_inactive_list);
6235 worker->working = false;
6236 if (!batch_size && !released)
6237 break;
6238 handled += batch_size;
6239 wait_event_lock_irq(mddev->sb_wait,
6240 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6241 conf->device_lock);
6242 }
6243 pr_debug("%d stripes handled\n", handled);
6244
6245 spin_unlock_irq(&conf->device_lock);
6246
6247 flush_deferred_bios(conf);
6248
6249 r5l_flush_stripe_to_raid(conf->log);
6250
6251 async_tx_issue_pending_all();
6252 blk_finish_plug(&plug);
6253
6254 pr_debug("--- raid5worker inactive\n");
6255}
6256
6257/*
6258 * This is our raid5 kernel thread.
6259 *
6260 * We scan the hash table for stripes which can be handled now.
6261 * During the scan, completed stripes are saved for us by the interrupt
6262 * handler, so that they will not have to wait for our next wakeup.
6263 */
6264static void raid5d(struct md_thread *thread)
6265{
6266 struct mddev *mddev = thread->mddev;
6267 struct r5conf *conf = mddev->private;
6268 int handled;
6269 struct blk_plug plug;
6270
6271 pr_debug("+++ raid5d active\n");
6272
6273 md_check_recovery(mddev);
6274
6275 blk_start_plug(&plug);
6276 handled = 0;
6277 spin_lock_irq(&conf->device_lock);
6278 while (1) {
6279 struct bio *bio;
6280 int batch_size, released;
6281 unsigned int offset;
6282
6283 released = release_stripe_list(conf, conf->temp_inactive_list);
6284 if (released)
6285 clear_bit(R5_DID_ALLOC, &conf->cache_state);
6286
6287 if (!list_empty(&conf->bitmap_list)) {
6289 /* Now is a good time to flush some bitmap updates */
6290 conf->seq_flush++;
6291 spin_unlock_irq(&conf->device_lock);
6292 md_bitmap_unplug(mddev->bitmap);
6293 spin_lock_irq(&conf->device_lock);
6294 conf->seq_write = conf->seq_flush;
6295 activate_bit_delay(conf, conf->temp_inactive_list);
6296 }
6297 raid5_activate_delayed(conf);
6298
6299 while ((bio = remove_bio_from_retry(conf, &offset))) {
6300 int ok;
6301 spin_unlock_irq(&conf->device_lock);
6302 ok = retry_aligned_read(conf, bio, offset);
6303 spin_lock_irq(&conf->device_lock);
6304 if (!ok)
6305 break;
6306 handled++;
6307 }
6308
6309 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6310 conf->temp_inactive_list);
6311 if (!batch_size && !released)
6312 break;
6313 handled += batch_size;
6314
6315 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6316 spin_unlock_irq(&conf->device_lock);
6317 md_check_recovery(mddev);
6318 spin_lock_irq(&conf->device_lock);
6319 }
6320 }
6321 pr_debug("%d stripes handled\n", handled);
6322
6323 spin_unlock_irq(&conf->device_lock);
6324 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6325 mutex_trylock(&conf->cache_size_mutex)) {
6326 grow_one_stripe(conf, __GFP_NOWARN);
6327 /* Set the flag even if allocation failed.  This helps
6328 * slow down allocation requests when memory is short.
6329 */
6330 set_bit(R5_DID_ALLOC, &conf->cache_state);
6331 mutex_unlock(&conf->cache_size_mutex);
6332 }
6333
6334 flush_deferred_bios(conf);
6335
6336 r5l_flush_stripe_to_raid(conf->log);
6337
6338 async_tx_issue_pending_all();
6339 blk_finish_plug(&plug);
6340
6341 pr_debug("--- raid5d inactive\n");
6342}
6343
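/*
 * sysfs interface for md/stripe_cache_size: report the current minimum
 * number of cached stripes and resize the cache through
 * raid5_set_cache_size(), which accepts values from 17 to 32768.
 */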
6344static ssize_t
6345raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6346{
6347 struct r5conf *conf;
6348 int ret = 0;
6349 spin_lock(&mddev->lock);
6350 conf = mddev->private;
6351 if (conf)
6352 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6353 spin_unlock(&mddev->lock);
6354 return ret;
6355}
6356
6357int
6358raid5_set_cache_size(struct mddev *mddev, int size)
6359{
6360 struct r5conf *conf = mddev->private;
6361
6362 if (size <= 16 || size > 32768)
6363 return -EINVAL;
6364
6365 conf->min_nr_stripes = size;
6366 mutex_lock(&conf->cache_size_mutex);
6367 while (size < conf->max_nr_stripes &&
6368 drop_one_stripe(conf))
6369 ;
6370 mutex_unlock(&conf->cache_size_mutex);
6371
6372 md_allow_write(mddev);
6373
6374 mutex_lock(&conf->cache_size_mutex);
6375 while (size > conf->max_nr_stripes)
6376 if (!grow_one_stripe(conf, GFP_KERNEL))
6377 break;
6378 mutex_unlock(&conf->cache_size_mutex);
6379
6380 return 0;
6381}
6382EXPORT_SYMBOL(raid5_set_cache_size);
6383
6384static ssize_t
6385raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6386{
6387 struct r5conf *conf;
6388 unsigned long new;
6389 int err;
6390
6391 if (len >= PAGE_SIZE)
6392 return -EINVAL;
6393 if (kstrtoul(page, 10, &new))
6394 return -EINVAL;
6395 err = mddev_lock(mddev);
6396 if (err)
6397 return err;
6398 conf = mddev->private;
6399 if (!conf)
6400 err = -ENODEV;
6401 else
6402 err = raid5_set_cache_size(mddev, new);
6403 mddev_unlock(mddev);
6404
6405 return err ?: len;
6406}
6407
6408static struct md_sysfs_entry
6409raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6410 raid5_show_stripe_cache_size,
6411 raid5_store_stripe_cache_size);
6412
6413static ssize_t
6414raid5_show_rmw_level(struct mddev *mddev, char *page)
6415{
6416 struct r5conf *conf = mddev->private;
6417 if (conf)
6418 return sprintf(page, "%d\n", conf->rmw_level);
6419 else
6420 return 0;
6421}
6422
6423static ssize_t
6424raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
6425{
6426 struct r5conf *conf = mddev->private;
6427 unsigned long new;
6428
6429 if (!conf)
6430 return -ENODEV;
6431
6432 if (len >= PAGE_SIZE)
6433 return -EINVAL;
6434
6435 if (kstrtoul(page, 10, &new))
6436 return -EINVAL;
6437
6438 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6439 return -EINVAL;
6440
6441 if (new != PARITY_DISABLE_RMW &&
6442 new != PARITY_ENABLE_RMW &&
6443 new != PARITY_PREFER_RMW)
6444 return -EINVAL;
6445
6446 conf->rmw_level = new;
6447 return len;
6448}
6449
6450static struct md_sysfs_entry
6451raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6452 raid5_show_rmw_level,
6453 raid5_store_rmw_level);
6454
6455
6456static ssize_t
6457raid5_show_preread_threshold(struct mddev *mddev, char *page)
6458{
6459 struct r5conf *conf;
6460 int ret = 0;
6461 spin_lock(&mddev->lock);
6462 conf = mddev->private;
6463 if (conf)
6464 ret = sprintf(page, "%d\n", conf->bypass_threshold);
6465 spin_unlock(&mddev->lock);
6466 return ret;
6467}
6468
6469static ssize_t
6470raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6471{
6472 struct r5conf *conf;
6473 unsigned long new;
6474 int err;
6475
6476 if (len >= PAGE_SIZE)
6477 return -EINVAL;
6478 if (kstrtoul(page, 10, &new))
6479 return -EINVAL;
6480
6481 err = mddev_lock(mddev);
6482 if (err)
6483 return err;
6484 conf = mddev->private;
6485 if (!conf)
6486 err = -ENODEV;
6487 else if (new > conf->min_nr_stripes)
6488 err = -EINVAL;
6489 else
6490 conf->bypass_threshold = new;
6491 mddev_unlock(mddev);
6492 return err ?: len;
6493}
6494
6495static struct md_sysfs_entry
6496raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6497 S_IRUGO | S_IWUSR,
6498 raid5_show_preread_threshold,
6499 raid5_store_preread_threshold);
6500
6501static ssize_t
6502raid5_show_skip_copy(struct mddev *mddev, char *page)
6503{
6504 struct r5conf *conf;
6505 int ret = 0;
6506 spin_lock(&mddev->lock);
6507 conf = mddev->private;
6508 if (conf)
6509 ret = sprintf(page, "%d\n", conf->skip_copy);
6510 spin_unlock(&mddev->lock);
6511 return ret;
6512}
6513
6514static ssize_t
6515raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6516{
6517 struct r5conf *conf;
6518 unsigned long new;
6519 int err;
6520
6521 if (len >= PAGE_SIZE)
6522 return -EINVAL;
6523 if (kstrtoul(page, 10, &new))
6524 return -EINVAL;
6525 new = !!new;
6526
6527 err = mddev_lock(mddev);
6528 if (err)
6529 return err;
6530 conf = mddev->private;
6531 if (!conf)
6532 err = -ENODEV;
6533 else if (new != conf->skip_copy) {
6534 mddev_suspend(mddev);
6535 conf->skip_copy = new;
6536 if (new)
6537 mddev->queue->backing_dev_info->capabilities |=
6538 BDI_CAP_STABLE_WRITES;
6539 else
6540 mddev->queue->backing_dev_info->capabilities &=
6541 ~BDI_CAP_STABLE_WRITES;
6542 mddev_resume(mddev);
6543 }
6544 mddev_unlock(mddev);
6545 return err ?: len;
6546}
6547
6548static struct md_sysfs_entry
6549raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6550 raid5_show_skip_copy,
6551 raid5_store_skip_copy);
6552
6553static ssize_t
6554stripe_cache_active_show(struct mddev *mddev, char *page)
6555{
6556 struct r5conf *conf = mddev->private;
6557 if (conf)
6558 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6559 else
6560 return 0;
6561}
6562
6563static struct md_sysfs_entry
6564raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6565
6566static ssize_t
6567raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6568{
6569 struct r5conf *conf;
6570 int ret = 0;
6571 spin_lock(&mddev->lock);
6572 conf = mddev->private;
6573 if (conf)
6574 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6575 spin_unlock(&mddev->lock);
6576 return ret;
6577}
6578
6579static int alloc_thread_groups(struct r5conf *conf, int cnt,
6580 int *group_cnt,
6581 int *worker_cnt_per_group,
6582 struct r5worker_group **worker_groups);
6583static ssize_t
6584raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6585{
6586 struct r5conf *conf;
6587 unsigned int new;
6588 int err;
6589 struct r5worker_group *new_groups, *old_groups;
6590 int group_cnt, worker_cnt_per_group;
6591
6592 if (len >= PAGE_SIZE)
6593 return -EINVAL;
6594 if (kstrtouint(page, 10, &new))
6595 return -EINVAL;
6596
6597 if (new > 8192)
6598 return -EINVAL;
6599
6600 err = mddev_lock(mddev);
6601 if (err)
6602 return err;
6603 conf = mddev->private;
6604 if (!conf)
6605 err = -ENODEV;
6606 else if (new != conf->worker_cnt_per_group) {
6607 mddev_suspend(mddev);
6608
6609 old_groups = conf->worker_groups;
6610 if (old_groups)
6611 flush_workqueue(raid5_wq);
6612
6613 err = alloc_thread_groups(conf, new,
6614 &group_cnt, &worker_cnt_per_group,
6615 &new_groups);
6616 if (!err) {
6617 spin_lock_irq(&conf->device_lock);
6618 conf->group_cnt = group_cnt;
6619 conf->worker_cnt_per_group = worker_cnt_per_group;
6620 conf->worker_groups = new_groups;
6621 spin_unlock_irq(&conf->device_lock);
6622
6623 if (old_groups)
6624 kfree(old_groups[0].workers);
6625 kfree(old_groups);
6626 }
6627 mddev_resume(mddev);
6628 }
6629 mddev_unlock(mddev);
6630
6631 return err ?: len;
6632}
6633
6634static struct md_sysfs_entry
6635raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6636 raid5_show_group_thread_cnt,
6637 raid5_store_group_thread_cnt);
6638
6639static struct attribute *raid5_attrs[] = {
6640 &raid5_stripecache_size.attr,
6641 &raid5_stripecache_active.attr,
6642 &raid5_preread_bypass_threshold.attr,
6643 &raid5_group_thread_cnt.attr,
6644 &raid5_skip_copy.attr,
6645 &raid5_rmw_level.attr,
6646 &r5c_journal_mode.attr,
6647 NULL,
6648};
6649static struct attribute_group raid5_attrs_group = {
6650 .name = NULL,
6651 .attrs = raid5_attrs,
6652};
6653
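/*
 * Allocate one worker group per NUMA node, each with 'cnt' workers.
 * All workers live in one flat array; group i owns the slice starting
 * at workers[i * cnt].  A cnt of 0 disables the group threads entirely.
 */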
6654static int alloc_thread_groups(struct r5conf *conf, int cnt,
6655 int *group_cnt,
6656 int *worker_cnt_per_group,
6657 struct r5worker_group **worker_groups)
6658{
6659 int i, j, k;
6660 ssize_t size;
6661 struct r5worker *workers;
6662
6663 *worker_cnt_per_group = cnt;
6664 if (cnt == 0) {
6665 *group_cnt = 0;
6666 *worker_groups = NULL;
6667 return 0;
6668 }
6669 *group_cnt = num_possible_nodes();
6670 size = sizeof(struct r5worker) * cnt;
6671 workers = kcalloc(size, *group_cnt, GFP_NOIO);
6672 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
6673 GFP_NOIO);
6674 if (!*worker_groups || !workers) {
6675 kfree(workers);
6676 kfree(*worker_groups);
6677 return -ENOMEM;
6678 }
6679
6680 for (i = 0; i < *group_cnt; i++) {
6681 struct r5worker_group *group;
6682
6683 group = &(*worker_groups)[i];
6684 INIT_LIST_HEAD(&group->handle_list);
6685 INIT_LIST_HEAD(&group->loprio_list);
6686 group->conf = conf;
6687 group->workers = workers + i * cnt;
6688
6689 for (j = 0; j < cnt; j++) {
6690 struct r5worker *worker = group->workers + j;
6691 worker->group = group;
6692 INIT_WORK(&worker->work, raid5_do_work);
6693
6694 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6695 INIT_LIST_HEAD(worker->temp_inactive_list + k);
6696 }
6697 }
6698
6699 return 0;
6700}
6701
6702static void free_thread_groups(struct r5conf *conf)
6703{
6704 if (conf->worker_groups)
6705 kfree(conf->worker_groups[0].workers);
6706 kfree(conf->worker_groups);
6707 conf->worker_groups = NULL;
6708}
6709
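/*
 * Report the array capacity for the given member size and disk count.
 * Zero arguments mean "use the current values"; while a reshape is
 * pending the smaller of the old and new disk counts is used, and the
 * size is rounded down to a multiple of both chunk sizes.
 */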
6710static sector_t
6711raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
6712{
6713 struct r5conf *conf = mddev->private;
6714
6715 if (!sectors)
6716 sectors = mddev->dev_sectors;
6717 if (!raid_disks)
6718 /* size is defined by the smallest of previous and new size */
6719 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
6720
6721 sectors &= ~((sector_t)conf->chunk_sectors - 1);
6722 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
6723 return sectors * (raid_disks - conf->max_degraded);
6724}
6725
6726static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6727{
6728 safe_put_page(percpu->spare_page);
6729 if (percpu->scribble)
6730 flex_array_free(percpu->scribble);
6731 percpu->spare_page = NULL;
6732 percpu->scribble = NULL;
6733}
6734
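/*
 * Per-CPU scratch space: a spare page used for RAID6 parity recovery
 * and a scribble area used by the async XOR/PQ routines, sized for the
 * larger of the old and new geometries so it remains valid across a
 * reshape.
 */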
6735static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6736{
6737 if (conf->level == 6 && !percpu->spare_page)
6738 percpu->spare_page = alloc_page(GFP_KERNEL);
6739 if (!percpu->scribble)
6740 percpu->scribble = scribble_alloc(max(conf->raid_disks,
6741 conf->previous_raid_disks),
6742 max(conf->chunk_sectors,
6743 conf->prev_chunk_sectors)
6744 / STRIPE_SECTORS,
6745 GFP_KERNEL);
6746
6747 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
6748 free_scratch_buffer(conf, percpu);
6749 return -ENOMEM;
6750 }
6751
6752 return 0;
6753}
6754
6755static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
6756{
6757 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6758
6759 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6760 return 0;
6761}
6762
6763static void raid5_free_percpu(struct r5conf *conf)
6764{
6765 if (!conf->percpu)
6766 return;
6767
6768 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6769 free_percpu(conf->percpu);
6770}
6771
6772static void free_conf(struct r5conf *conf)
6773{
6774 int i;
6775
6776 log_exit(conf);
6777
6778 unregister_shrinker(&conf->shrinker);
6779 free_thread_groups(conf);
6780 shrink_stripes(conf);
6781 raid5_free_percpu(conf);
6782 for (i = 0; i < conf->pool_size; i++)
6783 if (conf->disks[i].extra_page)
6784 put_page(conf->disks[i].extra_page);
6785 kfree(conf->disks);
6786 bioset_exit(&conf->bio_split);
6787 kfree(conf->stripe_hashtbl);
6788 kfree(conf->pending_data);
6789 kfree(conf);
6790}
6791
6792static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
6793{
6794 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6795 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6796
6797 if (alloc_scratch_buffer(conf, percpu)) {
6798 pr_warn("%s: failed memory allocation for cpu%u\n",
6799 __func__, cpu);
6800 return -ENOMEM;
6801 }
6802 return 0;
6803}
6804
6805static int raid5_alloc_percpu(struct r5conf *conf)
6806{
6807 int err = 0;
6808
6809 conf->percpu = alloc_percpu(struct raid5_percpu);
6810 if (!conf->percpu)
6811 return -ENOMEM;
6812
6813 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6814 if (!err) {
6815 conf->scribble_disks = max(conf->raid_disks,
6816 conf->previous_raid_disks);
6817 conf->scribble_sectors = max(conf->chunk_sectors,
6818 conf->prev_chunk_sectors);
6819 }
6820 return err;
6821}
6822
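/*
 * Shrinker callbacks: raid5_cache_scan() frees stripe_heads down to
 * conf->min_nr_stripes, returning SHRINK_STOP if the cache mutex is
 * contended or no stripe could be dropped; raid5_cache_count() reports
 * how many stripes could still be freed.
 */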
6823static unsigned long raid5_cache_scan(struct shrinker *shrink,
6824 struct shrink_control *sc)
6825{
6826 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6827 unsigned long ret = SHRINK_STOP;
6828
6829 if (mutex_trylock(&conf->cache_size_mutex)) {
6830 ret = 0;
6831 while (ret < sc->nr_to_scan &&
6832 conf->max_nr_stripes > conf->min_nr_stripes) {
6833 if (drop_one_stripe(conf) == 0) {
6834 ret = SHRINK_STOP;
6835 break;
6836 }
6837 ret++;
6838 }
6839 mutex_unlock(&conf->cache_size_mutex);
6840 }
6841 return ret;
6842}
6843
6844static unsigned long raid5_cache_count(struct shrinker *shrink,
6845 struct shrink_control *sc)
6846{
6847 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6848
6849 if (conf->max_nr_stripes < conf->min_nr_stripes)
6850 /* unlikely, but not impossible */
6851 return 0;
6852 return conf->max_nr_stripes - conf->min_nr_stripes;
6853}
6854
6855static struct r5conf *setup_conf(struct mddev *mddev)
6856{
6857 struct r5conf *conf;
6858 int raid_disk, memory, max_disks;
6859 struct md_rdev *rdev;
6860 struct disk_info *disk;
6861 char pers_name[6];
6862 int i;
6863 int group_cnt, worker_cnt_per_group;
6864 struct r5worker_group *new_group;
6865 int ret;
6866
6867 if (mddev->new_level != 5
6868 && mddev->new_level != 4
6869 && mddev->new_level != 6) {
6870 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6871 mdname(mddev), mddev->new_level);
6872 return ERR_PTR(-EIO);
6873 }
6874 if ((mddev->new_level == 5
6875 && !algorithm_valid_raid5(mddev->new_layout)) ||
6876 (mddev->new_level == 6
6877 && !algorithm_valid_raid6(mddev->new_layout))) {
6878 pr_warn("md/raid:%s: layout %d not supported\n",
6879 mdname(mddev), mddev->new_layout);
6880 return ERR_PTR(-EIO);
6881 }
6882 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6883 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6884 mdname(mddev), mddev->raid_disks);
6885 return ERR_PTR(-EINVAL);
6886 }
6887
6888 if (!mddev->new_chunk_sectors ||
6889 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6890 !is_power_of_2(mddev->new_chunk_sectors)) {
6891 pr_warn("md/raid:%s: invalid chunk size %d\n",
6892 mdname(mddev), mddev->new_chunk_sectors << 9);
6893 return ERR_PTR(-EINVAL);
6894 }
6895
6896 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
6897 if (conf == NULL)
6898 goto abort;
6899 INIT_LIST_HEAD(&conf->free_list);
6900 INIT_LIST_HEAD(&conf->pending_list);
6901 conf->pending_data = kcalloc(PENDING_IO_MAX,
6902 sizeof(struct r5pending_data),
6903 GFP_KERNEL);
6904 if (!conf->pending_data)
6905 goto abort;
6906 for (i = 0; i < PENDING_IO_MAX; i++)
6907 list_add(&conf->pending_data[i].sibling, &conf->free_list);
6908
6909 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6910 &new_group)) {
6911 conf->group_cnt = group_cnt;
6912 conf->worker_cnt_per_group = worker_cnt_per_group;
6913 conf->worker_groups = new_group;
6914 } else
6915 goto abort;
6916 spin_lock_init(&conf->device_lock);
6917 seqcount_init(&conf->gen_lock);
6918 mutex_init(&conf->cache_size_mutex);
6919 init_waitqueue_head(&conf->wait_for_quiescent);
6920 init_waitqueue_head(&conf->wait_for_stripe);
6921 init_waitqueue_head(&conf->wait_for_overlap);
6922 INIT_LIST_HEAD(&conf->handle_list);
6923 INIT_LIST_HEAD(&conf->loprio_list);
6924 INIT_LIST_HEAD(&conf->hold_list);
6925 INIT_LIST_HEAD(&conf->delayed_list);
6926 INIT_LIST_HEAD(&conf->bitmap_list);
6927 init_llist_head(&conf->released_stripes);
6928 atomic_set(&conf->active_stripes, 0);
6929 atomic_set(&conf->preread_active_stripes, 0);
6930 atomic_set(&conf->active_aligned_reads, 0);
6931 spin_lock_init(&conf->pending_bios_lock);
6932 conf->batch_bio_dispatch = true;
6933 rdev_for_each(rdev, mddev) {
6934 if (test_bit(Journal, &rdev->flags))
6935 continue;
6936 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
6937 conf->batch_bio_dispatch = false;
6938 break;
6939 }
6940 }
6941
6942 conf->bypass_threshold = BYPASS_THRESHOLD;
6943 conf->recovery_disabled = mddev->recovery_disabled - 1;
6944
6945 conf->raid_disks = mddev->raid_disks;
6946 if (mddev->reshape_position == MaxSector)
6947 conf->previous_raid_disks = mddev->raid_disks;
6948 else
6949 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
6950 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
6951
6952 conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
6953 GFP_KERNEL);
6954
6955 if (!conf->disks)
6956 goto abort;
6957
6958 for (i = 0; i < max_disks; i++) {
6959 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
6960 if (!conf->disks[i].extra_page)
6961 goto abort;
6962 }
6963
6964 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
6965 if (ret)
6966 goto abort;
6967 conf->mddev = mddev;
6968
6969 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
6970 goto abort;
6971
6972 /* Init hash_locks[0] separately so that it can be used as the
6973 * reference lock in the spin_lock_nest_lock() call in
6974 * lock_all_device_hash_locks_irq(), which convinces lockdep that
6975 * all of the nested hash locks can really be taken together.
6976 */
6977 spin_lock_init(conf->hash_locks);
6978 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
6979 spin_lock_init(conf->hash_locks + i);
6980
6981 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6982 INIT_LIST_HEAD(conf->inactive_list + i);
6983
6984 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6985 INIT_LIST_HEAD(conf->temp_inactive_list + i);
6986
6987 atomic_set(&conf->r5c_cached_full_stripes, 0);
6988 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
6989 atomic_set(&conf->r5c_cached_partial_stripes, 0);
6990 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
6991 atomic_set(&conf->r5c_flushing_full_stripes, 0);
6992 atomic_set(&conf->r5c_flushing_partial_stripes, 0);
6993
6994 conf->level = mddev->new_level;
6995 conf->chunk_sectors = mddev->new_chunk_sectors;
6996 if (raid5_alloc_percpu(conf) != 0)
6997 goto abort;
6998
6999 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7000
7001 rdev_for_each(rdev, mddev) {
7002 raid_disk = rdev->raid_disk;
7003 if (raid_disk >= max_disks
7004 || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7005 continue;
7006 disk = conf->disks + raid_disk;
7007
7008 if (test_bit(Replacement, &rdev->flags)) {
7009 if (disk->replacement)
7010 goto abort;
7011 disk->replacement = rdev;
7012 } else {
7013 if (disk->rdev)
7014 goto abort;
7015 disk->rdev = rdev;
7016 }
7017
7018 if (test_bit(In_sync, &rdev->flags)) {
7019 char b[BDEVNAME_SIZE];
7020 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
7021 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
7022 } else if (rdev->saved_raid_disk != raid_disk)
7023 /* Cannot rely on bitmap to complete recovery */
7024 conf->fullsync = 1;
7025 }
7026
7027 conf->level = mddev->new_level;
7028 if (conf->level == 6) {
7029 conf->max_degraded = 2;
7030 if (raid6_call.xor_syndrome)
7031 conf->rmw_level = PARITY_ENABLE_RMW;
7032 else
7033 conf->rmw_level = PARITY_DISABLE_RMW;
7034 } else {
7035 conf->max_degraded = 1;
7036 conf->rmw_level = PARITY_ENABLE_RMW;
7037 }
7038 conf->algorithm = mddev->new_layout;
7039 conf->reshape_progress = mddev->reshape_position;
7040 if (conf->reshape_progress != MaxSector) {
7041 conf->prev_chunk_sectors = mddev->chunk_sectors;
7042 conf->prev_algo = mddev->layout;
7043 } else {
7044 conf->prev_chunk_sectors = conf->chunk_sectors;
7045 conf->prev_algo = conf->algorithm;
7046 }
7047
7048 conf->min_nr_stripes = NR_STRIPES;
7049 if (mddev->reshape_position != MaxSector) {
7050 int stripes = max_t(int,
7051 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
7052 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
7053 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7054 if (conf->min_nr_stripes != NR_STRIPES)
7055 pr_info("md/raid:%s: force stripe size %d for reshape\n",
7056 mdname(mddev), conf->min_nr_stripes);
7057 }
7058 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7059 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7060 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7061 if (grow_stripes(conf, conf->min_nr_stripes)) {
7062 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7063 mdname(mddev), memory);
7064 goto abort;
7065 } else
7066 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7067
7068 /* Losing a stripe head is more costly than a typical cache object
7069 * (it must be re-read from every member device), so weight the
7070 * shrinker's seek cost by the number of devices in the array.
7071 */
7072 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7073 conf->shrinker.scan_objects = raid5_cache_scan;
7074 conf->shrinker.count_objects = raid5_cache_count;
7075 conf->shrinker.batch = 128;
7076 conf->shrinker.flags = 0;
7077 if (register_shrinker(&conf->shrinker)) {
7078 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7079 mdname(mddev));
7080 goto abort;
7081 }
7082
7083 sprintf(pers_name, "raid%d", mddev->new_level);
7084 conf->thread = md_register_thread(raid5d, mddev, pers_name);
7085 if (!conf->thread) {
7086 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7087 mdname(mddev));
7088 goto abort;
7089 }
7090
7091 return conf;
7092
7093 abort:
7094 if (conf) {
7095 free_conf(conf);
7096 return ERR_PTR(-EIO);
7097 } else
7098 return ERR_PTR(-ENOMEM);
7099}
7100
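/*
 * Return 1 if the given slot holds nothing but parity in this layout
 * (only possible for layouts that keep parity on fixed devices).
 * raid5_run() uses this to treat an out-of-date parity-only device as
 * "dirty parity" rather than as missing data.
 */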
7101static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7102{
7103 switch (algo) {
7104 case ALGORITHM_PARITY_0:
7105 if (raid_disk < max_degraded)
7106 return 1;
7107 break;
7108 case ALGORITHM_PARITY_N:
7109 if (raid_disk >= raid_disks - max_degraded)
7110 return 1;
7111 break;
7112 case ALGORITHM_PARITY_0_6:
7113 if (raid_disk == 0 ||
7114 raid_disk == raid_disks - 1)
7115 return 1;
7116 break;
7117 case ALGORITHM_LEFT_ASYMMETRIC_6:
7118 case ALGORITHM_RIGHT_ASYMMETRIC_6:
7119 case ALGORITHM_LEFT_SYMMETRIC_6:
7120 case ALGORITHM_RIGHT_SYMMETRIC_6:
7121 if (raid_disk == raid_disks - 1)
7122 return 1;
7123 }
7124 return 0;
7125}
7126
7127static int raid5_run(struct mddev *mddev)
7128{
7129 struct r5conf *conf;
7130 int working_disks = 0;
7131 int dirty_parity_disks = 0;
7132 struct md_rdev *rdev;
7133 struct md_rdev *journal_dev = NULL;
7134 sector_t reshape_offset = 0;
7135 int i;
7136 long long min_offset_diff = 0;
7137 int first = 1;
7138
7139 if (mddev_init_writes_pending(mddev) < 0)
7140 return -ENOMEM;
7141
7142 if (mddev->recovery_cp != MaxSector)
7143 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7144 mdname(mddev));
7145
7146 rdev_for_each(rdev, mddev) {
7147 long long diff;
7148
7149 if (test_bit(Journal, &rdev->flags)) {
7150 journal_dev = rdev;
7151 continue;
7152 }
7153 if (rdev->raid_disk < 0)
7154 continue;
7155 diff = (rdev->new_data_offset - rdev->data_offset);
7156 if (first) {
7157 min_offset_diff = diff;
7158 first = 0;
7159 } else if (mddev->reshape_backwards &&
7160 diff < min_offset_diff)
7161 min_offset_diff = diff;
7162 else if (!mddev->reshape_backwards &&
7163 diff > min_offset_diff)
7164 min_offset_diff = diff;
7165 }
7166
7167 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7168 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7169 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7170 mdname(mddev));
7171 return -EINVAL;
7172 }
7173
7174 if (mddev->reshape_position != MaxSector) {
7175 /* Check that we can continue the reshape.
7176 *
7177 * Difficulties arise if the stripe we would write to
7178 * next is at or after the stripe we would read from next.
7179 * For a reshape that changes the number of devices, this
7180 * is only possible for a very short time, and mdadm makes
7181 * sure that time appears to have passed before assembling
7182 * the array.  So we fail if that time hasn't passed.
7183 *
7184 * For a reshape that keeps the number of devices the same
7185 * there is no difficulty.
7186 */
7187 sector_t here_new, here_old;
7188 int old_disks;
7189 int max_degraded = (mddev->level == 6 ? 2 : 1);
7190 int chunk_sectors;
7191 int new_data_disks;
7192
7193 if (journal_dev) {
7194 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7195 mdname(mddev));
7196 return -EINVAL;
7197 }
7198
7199 if (mddev->new_level != mddev->level) {
7200 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7201 mdname(mddev));
7202 return -EINVAL;
7203 }
7204 old_disks = mddev->raid_disks - mddev->delta_disks;
7205 /* reshape_position must be on a new-stripe boundary, and one
7206 * further up in new geometry must map after here in old
7207 * geometry.
7208 * If the chunk sizes differ, then, as reshape works in units of
7209 * the larger of the two, reshape_position needs to sit on the
7210 * larger boundary - hence max(chunk_sectors, new_chunk_sectors).
7211 */
7212 here_new = mddev->reshape_position;
7213 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7214 new_data_disks = mddev->raid_disks - max_degraded;
7215 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7216 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7217 mdname(mddev));
7218 return -EINVAL;
7219 }
7220 reshape_offset = here_new * chunk_sectors;
7221 /* here_new is the stripe we will write to */
7222 here_old = mddev->reshape_position;
7223 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7224 /* here_old is the first stripe that we might need to read
7225 * from */
7226 if (mddev->delta_disks == 0) {
7227 /* We cannot be sure it is safe to start an in-place
7228 * reshape.  It is only safe if user-space is monitoring
7229 * and taking constant backups.
7230 * mdadm always starts a situation like this in
7231 * readonly mode so it can take control before
7232 * allowing any writes.  So just check for that.
7233 */
7234 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7235 abs(min_offset_diff) >= mddev->new_chunk_sectors)
7236 ;
7237 else if (mddev->ro == 0) {
7238 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7239 mdname(mddev));
7240 return -EINVAL;
7241 }
7242 } else if (mddev->reshape_backwards
7243 ? (here_new * chunk_sectors + min_offset_diff <=
7244 here_old * chunk_sectors)
7245 : (here_new * chunk_sectors >=
7246 here_old * chunk_sectors + (-min_offset_diff))) {
7247 /* Reading from the same stripe as writing to - bad */
7248 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7249 mdname(mddev));
7250 return -EINVAL;
7251 }
7252 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7253
7254 } else {
7255 BUG_ON(mddev->level != mddev->new_level);
7256 BUG_ON(mddev->layout != mddev->new_layout);
7257 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7258 BUG_ON(mddev->delta_disks != 0);
7259 }
7260
7261 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7262 test_bit(MD_HAS_PPL, &mddev->flags)) {
7263 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7264 mdname(mddev));
7265 clear_bit(MD_HAS_PPL, &mddev->flags);
7266 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7267 }
7268
7269 if (mddev->private == NULL)
7270 conf = setup_conf(mddev);
7271 else
7272 conf = mddev->private;
7273
7274 if (IS_ERR(conf))
7275 return PTR_ERR(conf);
7276
7277 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7278 if (!journal_dev) {
7279 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7280 mdname(mddev));
7281 mddev->ro = 1;
7282 set_disk_ro(mddev->gendisk, 1);
7283 } else if (mddev->recovery_cp == MaxSector)
7284 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7285 }
7286
7287 conf->min_offset_diff = min_offset_diff;
7288 mddev->thread = conf->thread;
7289 conf->thread = NULL;
7290 mddev->private = conf;
7291
7292 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7293 i++) {
7294 rdev = conf->disks[i].rdev;
7295 if (!rdev && conf->disks[i].replacement) {
7296 /* The replacement is all we have yet */
7297 rdev = conf->disks[i].replacement;
7298 conf->disks[i].replacement = NULL;
7299 clear_bit(Replacement, &rdev->flags);
7300 conf->disks[i].rdev = rdev;
7301 }
7302 if (!rdev)
7303 continue;
7304 if (conf->disks[i].replacement &&
7305 conf->reshape_progress != MaxSector) {
7306 /* replacements and reshape simply do not mix */
7307 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7308 goto abort;
7309 }
7310 if (test_bit(In_sync, &rdev->flags)) {
7311 working_disks++;
7312 continue;
7313 }
7314
7315 /* This disc is not fully in-sync.  However if it just stored
7316 * parity (beyond the recovery_offset), then we don't need to be
7317 * concerned about the array being dirty.
7318 * When reshape goes 'backwards', we never have partially
7319 * completed devices, so we only need to worry about reshape
7320 * going forwards.
7321 */
7322 /* Hack because v0.91 doesn't store recovery_offset properly. */
7323 if (mddev->major_version == 0 &&
7324 mddev->minor_version > 90)
7325 rdev->recovery_offset = reshape_offset;
7326
7327 if (rdev->recovery_offset < reshape_offset) {
7328 /* We need to check both the old and the new layout */
7329 if (!only_parity(rdev->raid_disk,
7330 conf->algorithm,
7331 conf->raid_disks,
7332 conf->max_degraded))
7333 continue;
7334 }
7335 if (!only_parity(rdev->raid_disk,
7336 conf->prev_algo,
7337 conf->previous_raid_disks,
7338 conf->max_degraded))
7339 continue;
7340 dirty_parity_disks++;
7341 }
7342
7343 /*
7344 * 0 for a fully functional array, 1 or 2 for a degraded array.
7345 */
7346 mddev->degraded = raid5_calc_degraded(conf);
7347
7348 if (has_failed(conf)) {
7349 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7350 mdname(mddev), mddev->degraded, conf->raid_disks);
7351 goto abort;
7352 }
7353
7354 /* device size must be a multiple of chunk size */
7355 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
7356 mddev->resync_max_sectors = mddev->dev_sectors;
7357
7358 if (mddev->degraded > dirty_parity_disks &&
7359 mddev->recovery_cp != MaxSector) {
7360 if (test_bit(MD_HAS_PPL, &mddev->flags))
7361 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7362 mdname(mddev));
7363 else if (mddev->ok_start_degraded)
7364 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7365 mdname(mddev));
7366 else {
7367 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7368 mdname(mddev));
7369 goto abort;
7370 }
7371 }
7372
7373 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7374 mdname(mddev), conf->level,
7375 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7376 mddev->new_layout);
7377
7378 print_raid5_conf(conf);
7379
7380 if (conf->reshape_progress != MaxSector) {
7381 conf->reshape_safe = conf->reshape_progress;
7382 atomic_set(&conf->reshape_stripes, 0);
7383 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7384 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7385 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7386 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7387 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7388 "reshape");
7389 }
7390
7391 /* Ok, everything is just fine now */
7392 if (mddev->to_remove == &raid5_attrs_group)
7393 mddev->to_remove = NULL;
7394 else if (mddev->kobj.sd &&
7395 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7396 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7397 mdname(mddev));
7398 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7399
7400 if (mddev->queue) {
7401 int chunk_size;
7402 /*
7403 * read-ahead size must cover two whole stripes, which
7404 * is 2 * (datadisks) * chunksize.
7405 */
7406 int data_disks = conf->previous_raid_disks - conf->max_degraded;
7407 int stripe = data_disks *
7408 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7409 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7410 mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
7411
7412 chunk_size = mddev->chunk_sectors << 9;
7413 blk_queue_io_min(mddev->queue, chunk_size);
7414 blk_queue_io_opt(mddev->queue, chunk_size *
7415 (conf->raid_disks - conf->max_degraded));
7416 mddev->queue->limits.raid_partial_stripes_expensive = 1;
7417
7418 /* We can only discard a whole stripe; it makes no sense to
7419 * discard the data blocks but then still have to write parity.
7420 */
7421 stripe = stripe * PAGE_SIZE;
7422 /* Round the stripe size up to a power of two, as discard
7423 * alignment and granularity are expected to be powers of two. */
7424 while ((stripe-1) & stripe)
7425 stripe = (stripe | (stripe-1)) + 1;
7426 mddev->queue->limits.discard_alignment = stripe;
7427 mddev->queue->limits.discard_granularity = stripe;
7428
7429 blk_queue_max_write_same_sectors(mddev->queue, 0);
7430 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
7431
7432 rdev_for_each(rdev, mddev) {
7433 disk_stack_limits(mddev->gendisk, rdev->bdev,
7434 rdev->data_offset << 9);
7435 disk_stack_limits(mddev->gendisk, rdev->bdev,
7436 rdev->new_data_offset << 9);
7437 }
7438
7439 /*
7440 * Discarded regions must reliably read back as zeroes, otherwise
7441 * data can be lost.  Consider: a stripe is discarded (leaving it
7442 * inconsistent if discarded blocks do not return zeroes), then one
7443 * disk of that stripe is rewritten (parity is recalculated from
7444 * whatever the other disks now return), and then a disk fails -
7445 * the data on the failed disk can no longer be reconstructed.
7446 *
7447 * So DISCARD is only enabled when the administrator has asserted,
7448 * via the devices_handle_discard_safely module parameter, that all
7449 * member devices return zeroes for discarded regions, and when the
7450 * stacked queue limits are large enough to cover a whole stripe.
7451 */
7454 if (devices_handle_discard_safely &&
7455 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7456 mddev->queue->limits.discard_granularity >= stripe)
7457 blk_queue_flag_set(QUEUE_FLAG_DISCARD,
7458 mddev->queue);
7459 else
7460 blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
7461 mddev->queue);
7462
7463 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7464 }
7465
7466 if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
7467 goto abort;
7468
7469 return 0;
7470abort:
7471 md_unregister_thread(&mddev->thread);
7472 print_raid5_conf(conf);
7473 free_conf(conf);
7474 mddev->private = NULL;
7475 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7476 return -EIO;
7477}
7478
7479static void raid5_free(struct mddev *mddev, void *priv)
7480{
7481 struct r5conf *conf = priv;
7482
7483 free_conf(conf);
7484 mddev->to_remove = &raid5_attrs_group;
7485}
7486
7487static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7488{
7489 struct r5conf *conf = mddev->private;
7490 int i;
7491
7492 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7493 conf->chunk_sectors / 2, mddev->layout);
7494 seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7495 rcu_read_lock();
7496 for (i = 0; i < conf->raid_disks; i++) {
7497 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
7498 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7499 }
7500 rcu_read_unlock();
7501 seq_printf(seq, "]");
7502}
7503
7504static void print_raid5_conf(struct r5conf *conf)
7505{
7506 int i;
7507 struct disk_info *tmp;
7508
7509 pr_debug("RAID conf printout:\n");
7510 if (!conf) {
7511 pr_debug("(conf==NULL)\n");
7512 return;
7513 }
7514 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7515 conf->raid_disks,
7516 conf->raid_disks - conf->mddev->degraded);
7517
7518 for (i = 0; i < conf->raid_disks; i++) {
7519 char b[BDEVNAME_SIZE];
7520 tmp = conf->disks + i;
7521 if (tmp->rdev)
7522 pr_debug(" disk %d, o:%d, dev:%s\n",
7523 i, !test_bit(Faulty, &tmp->rdev->flags),
7524 bdevname(tmp->rdev->bdev, b));
7525 }
7526}
7527
7528static int raid5_spare_active(struct mddev *mddev)
7529{
7530 int i;
7531 struct r5conf *conf = mddev->private;
7532 struct disk_info *tmp;
7533 int count = 0;
7534 unsigned long flags;
7535
7536 for (i = 0; i < conf->raid_disks; i++) {
7537 tmp = conf->disks + i;
7538 if (tmp->replacement
7539 && tmp->replacement->recovery_offset == MaxSector
7540 && !test_bit(Faulty, &tmp->replacement->flags)
7541 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
7542 /* Replacement has just become active. */
7543 if (!tmp->rdev
7544 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7545 count++;
7546 if (tmp->rdev) {
7547 /* Replaced device not technically faulty,
7548 * but we need to be sure it gets removed
7549 * and never re-added.
7550 */
7551 set_bit(Faulty, &tmp->rdev->flags);
7552 sysfs_notify_dirent_safe(
7553 tmp->rdev->sysfs_state);
7554 }
7555 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7556 } else if (tmp->rdev
7557 && tmp->rdev->recovery_offset == MaxSector
7558 && !test_bit(Faulty, &tmp->rdev->flags)
7559 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7560 count++;
7561 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7562 }
7563 }
7564 spin_lock_irqsave(&conf->device_lock, flags);
7565 mddev->degraded = raid5_calc_degraded(conf);
7566 spin_unlock_irqrestore(&conf->device_lock, flags);
7567 print_raid5_conf(conf);
7568 return count;
7569}
7570
7571static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7572{
7573 struct r5conf *conf = mddev->private;
7574 int err = 0;
7575 int number = rdev->raid_disk;
7576 struct md_rdev **rdevp;
7577 struct disk_info *p = conf->disks + number;
7578
7579 print_raid5_conf(conf);
7580 if (test_bit(Journal, &rdev->flags) && conf->log) {
7581 /*
7582 * We can't wait for pending writes here, as this is called from
7583 * raid5d and waiting would deadlock.  There is also no locking
7584 * against new writes, so removing the journal is refused below
7585 * while any stripes are still active or cached.
7586 */
7587 if (atomic_read(&conf->active_stripes) ||
7588 atomic_read(&conf->r5c_cached_full_stripes) ||
7589 atomic_read(&conf->r5c_cached_partial_stripes)) {
7590 return -EBUSY;
7591 }
7592 log_exit(conf);
7593 return 0;
7594 }
7595 if (rdev == p->rdev)
7596 rdevp = &p->rdev;
7597 else if (rdev == p->replacement)
7598 rdevp = &p->replacement;
7599 else
7600 return 0;
7601
7602 if (number >= conf->raid_disks &&
7603 conf->reshape_progress == MaxSector)
7604 clear_bit(In_sync, &rdev->flags);
7605
7606 if (test_bit(In_sync, &rdev->flags) ||
7607 atomic_read(&rdev->nr_pending)) {
7608 err = -EBUSY;
7609 goto abort;
7610 }
7611 /* Only remove non-faulty devices if recovery
7612 * isn't possible.
7613 */
7614 if (!test_bit(Faulty, &rdev->flags) &&
7615 mddev->recovery_disabled != conf->recovery_disabled &&
7616 !has_failed(conf) &&
7617 (!p->replacement || p->replacement == rdev) &&
7618 number < conf->raid_disks) {
7619 err = -EBUSY;
7620 goto abort;
7621 }
7622 *rdevp = NULL;
7623 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7624 synchronize_rcu();
7625 if (atomic_read(&rdev->nr_pending)) {
7626 /* lost the race, try later */
7627 err = -EBUSY;
7628 *rdevp = rdev;
7629 }
7630 }
7631 if (!err) {
7632 err = log_modify(conf, rdev, false);
7633 if (err)
7634 goto abort;
7635 }
7636 if (p->replacement) {
7637 /* We must have just cleared 'rdev' */
7638 p->rdev = p->replacement;
7639 clear_bit(Replacement, &p->replacement->flags);
7640 smp_mb(); /* Make sure other CPUs may see both as identical
7641 * but will never see neither - if they are careful.
7642 */
7643 p->replacement = NULL;
7644
7645 if (!err)
7646 err = log_modify(conf, p->rdev, true);
7647 }
7648
7649 clear_bit(WantReplacement, &rdev->flags);
7650abort:
7651
7652 print_raid5_conf(conf);
7653 return err;
7654}
7655
7656static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7657{
7658 struct r5conf *conf = mddev->private;
7659 int err = -EEXIST;
7660 int disk;
7661 struct disk_info *p;
7662 int first = 0;
7663 int last = conf->raid_disks - 1;
7664
7665 if (test_bit(Journal, &rdev->flags)) {
7666 if (conf->log)
7667 return -EBUSY;
7668
7669 rdev->raid_disk = 0;
7670 /*
7671 * The array runs read-only while its journal is missing, so no
7672 * write requests can be in flight; re-enabling the log is safe.
7673 */
7674 log_init(conf, rdev, false);
7675 return 0;
7676 }
7677 if (mddev->recovery_disabled == conf->recovery_disabled)
7678 return -EBUSY;
7679
7680 if (rdev->saved_raid_disk < 0 && has_failed(conf))
7681 /* no point adding a device */
7682 return -EINVAL;
7683
7684 if (rdev->raid_disk >= 0)
7685 first = last = rdev->raid_disk;
7686
7687 /*
7688 * Find the slot to use, but prefer rdev->saved_raid_disk
7689 * if it is still free.
7690 */
7691 if (rdev->saved_raid_disk >= 0 &&
7692 rdev->saved_raid_disk >= first &&
7693 conf->disks[rdev->saved_raid_disk].rdev == NULL)
7694 first = rdev->saved_raid_disk;
7695
7696 for (disk = first; disk <= last; disk++) {
7697 p = conf->disks + disk;
7698 if (p->rdev == NULL) {
7699 clear_bit(In_sync, &rdev->flags);
7700 rdev->raid_disk = disk;
7701 if (rdev->saved_raid_disk != disk)
7702 conf->fullsync = 1;
7703 rcu_assign_pointer(p->rdev, rdev);
7704
7705 err = log_modify(conf, rdev, true);
7706
7707 goto out;
7708 }
7709 }
7710 for (disk = first; disk <= last; disk++) {
7711 p = conf->disks + disk;
7712 if (test_bit(WantReplacement, &p->rdev->flags) &&
7713 p->replacement == NULL) {
7714 clear_bit(In_sync, &rdev->flags);
7715 set_bit(Replacement, &rdev->flags);
7716 rdev->raid_disk = disk;
7717 err = 0;
7718 conf->fullsync = 1;
7719 rcu_assign_pointer(p->replacement, rdev);
7720 break;
7721 }
7722 }
7723out:
7724 print_raid5_conf(conf);
7725 return err;
7726}
7727
7728static int raid5_resize(struct mddev *mddev, sector_t sectors)
7729{
7730
7731
7732
7733
7734
7735
7736
7737 sector_t newsize;
7738 struct r5conf *conf = mddev->private;
7739
7740 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7741 return -EINVAL;
7742 sectors &= ~((sector_t)conf->chunk_sectors - 1);
7743 newsize = raid5_size(mddev, sectors, mddev->raid_disks);
7744 if (mddev->external_size &&
7745 mddev->array_sectors > newsize)
7746 return -EINVAL;
7747 if (mddev->bitmap) {
7748 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
7749 if (ret)
7750 return ret;
7751 }
7752 md_set_array_sectors(mddev, newsize);
7753 if (sectors > mddev->dev_sectors &&
7754 mddev->recovery_cp > mddev->dev_sectors) {
7755 mddev->recovery_cp = mddev->dev_sectors;
7756 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7757 }
7758 mddev->dev_sectors = sectors;
7759 mddev->resync_max_sectors = sectors;
7760 return 0;
7761}
7762
7763static int check_stripe_cache(struct mddev *mddev)
7764{
7765
7766
7767
7768
7769
7770
7771
7772
7773 struct r5conf *conf = mddev->private;
7774 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
7775 > conf->min_nr_stripes ||
7776 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
7777 > conf->min_nr_stripes) {
7778 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
7779 mdname(mddev),
7780 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7781 / STRIPE_SIZE)*4);
7782 return 0;
7783 }
7784 return 1;
7785}
7786
7787static int check_reshape(struct mddev *mddev)
7788{
7789 struct r5conf *conf = mddev->private;
7790
7791 if (raid5_has_log(conf) || raid5_has_ppl(conf))
7792 return -EINVAL;
7793 if (mddev->delta_disks == 0 &&
7794 mddev->new_layout == mddev->layout &&
7795 mddev->new_chunk_sectors == mddev->chunk_sectors)
7796 return 0;
7797 if (has_failed(conf))
7798 return -EINVAL;
7799 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
7800 /* We might be able to shrink, but the devices must
7801 * be made bigger first.
7802 * For raid6, 4 is the minimum size.
7803 * Otherwise 2 is the minimum.
7804 */
7805 int min = 2;
7806 if (mddev->level == 6)
7807 min = 4;
7808 if (mddev->raid_disks + mddev->delta_disks < min)
7809 return -EINVAL;
7810 }
7811
7812 if (!check_stripe_cache(mddev))
7813 return -ENOSPC;
7814
7815 if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
7816 mddev->delta_disks > 0)
7817 if (resize_chunks(conf,
7818 conf->previous_raid_disks
7819 + max(0, mddev->delta_disks),
7820 max(mddev->new_chunk_sectors,
7821 mddev->chunk_sectors)
7822 ) < 0)
7823 return -ENOMEM;
7824
7825 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
7826 return 0;
7827 return resize_stripes(conf, (conf->previous_raid_disks
7828 + mddev->delta_disks));
7829}
7830
7831static int raid5_start_reshape(struct mddev *mddev)
7832{
7833 struct r5conf *conf = mddev->private;
7834 struct md_rdev *rdev;
7835 int spares = 0;
7836 unsigned long flags;
7837
7838 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7839 return -EBUSY;
7840
7841 if (!check_stripe_cache(mddev))
7842 return -ENOSPC;
7843
7844 if (has_failed(conf))
7845 return -EINVAL;
7846
7847 rdev_for_each(rdev, mddev) {
7848 if (!test_bit(In_sync, &rdev->flags)
7849 && !test_bit(Faulty, &rdev->flags))
7850 spares++;
7851 }
7852
7853 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
7854 /* Not enough devices even to make a degraded array
7855 * of that size.
7856 */
7857 return -EINVAL;
7858
7859 /* Refuse to reduce the size of the array.  Any reduction in
7860 * array size must be done through explicit setting of the
7861 * array_size attribute.
7862 */
7863 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7864 < mddev->array_sectors) {
7865 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
7866 mdname(mddev));
7867 return -EINVAL;
7868 }
7869
7870 atomic_set(&conf->reshape_stripes, 0);
7871 spin_lock_irq(&conf->device_lock);
7872 write_seqcount_begin(&conf->gen_lock);
7873 conf->previous_raid_disks = conf->raid_disks;
7874 conf->raid_disks += mddev->delta_disks;
7875 conf->prev_chunk_sectors = conf->chunk_sectors;
7876 conf->chunk_sectors = mddev->new_chunk_sectors;
7877 conf->prev_algo = conf->algorithm;
7878 conf->algorithm = mddev->new_layout;
7879 conf->generation++;
7880 /* Code that selects data_offset needs to see the generation update
7881 * if reshape_progress has been set - so a memory barrier is needed.
7882 */
7883 smp_mb();
7884 if (mddev->reshape_backwards)
7885 conf->reshape_progress = raid5_size(mddev, 0, 0);
7886 else
7887 conf->reshape_progress = 0;
7888 conf->reshape_safe = conf->reshape_progress;
7889 write_seqcount_end(&conf->gen_lock);
7890 spin_unlock_irq(&conf->device_lock);
7891
7892 /* Now make sure any requests that proceeded on the assumption
7893 * the reshape wasn't running - like Discard or Read - have
7894 * completed.
7895 */
7896 mddev_suspend(mddev);
7897 mddev_resume(mddev);
7898
7899 /* Add some new drives, as many as will fit.
7900 * We know there are enough to make the newly sized array work.
7901 * Don't add devices if we are reducing the number of
7902 * devices in the array.  This is because it is not possible
7903 * to correctly record the "partially reconstructed" state of
7904 * such devices during the reshape and confusion could result.
7905 */
7906 if (mddev->delta_disks >= 0) {
7907 rdev_for_each(rdev, mddev)
7908 if (rdev->raid_disk < 0 &&
7909 !test_bit(Faulty, &rdev->flags)) {
7910 if (raid5_add_disk(mddev, rdev) == 0) {
7911 if (rdev->raid_disk
7912 >= conf->previous_raid_disks)
7913 set_bit(In_sync, &rdev->flags);
7914 else
7915 rdev->recovery_offset = 0;
7916
7917 if (sysfs_link_rdev(mddev, rdev))
7918 /* Failure here is OK */;
7919 }
7920 } else if (rdev->raid_disk >= conf->previous_raid_disks
7921 && !test_bit(Faulty, &rdev->flags)) {
7922 /* This is a spare that was manually added */
7923 set_bit(In_sync, &rdev->flags);
7924 }
7925
7926 /* When a reshape changes the number of devices,
7927 * ->degraded is measured against the larger of the
7928 * pre and post numbers of devices.
7929 */
7930 spin_lock_irqsave(&conf->device_lock, flags);
7931 mddev->degraded = raid5_calc_degraded(conf);
7932 spin_unlock_irqrestore(&conf->device_lock, flags);
7933 }
7934 mddev->raid_disks = conf->raid_disks;
7935 mddev->reshape_position = conf->reshape_progress;
7936 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7937
7938 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7939 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7940 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7941 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7942 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7943 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7944 "reshape");
7945 if (!mddev->sync_thread) {
7946 mddev->recovery = 0;
7947 spin_lock_irq(&conf->device_lock);
7948 write_seqcount_begin(&conf->gen_lock);
7949 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
7950 mddev->new_chunk_sectors =
7951 conf->chunk_sectors = conf->prev_chunk_sectors;
7952 mddev->new_layout = conf->algorithm = conf->prev_algo;
7953 rdev_for_each(rdev, mddev)
7954 rdev->new_data_offset = rdev->data_offset;
7955 smp_wmb();
7956 conf->generation--;
7957 conf->reshape_progress = MaxSector;
7958 mddev->reshape_position = MaxSector;
7959 write_seqcount_end(&conf->gen_lock);
7960 spin_unlock_irq(&conf->device_lock);
7961 return -EAGAIN;
7962 }
7963 conf->reshape_checkpoint = jiffies;
7964 md_wakeup_thread(mddev->sync_thread);
7965 md_new_event(mddev);
7966 return 0;
7967}
7968
7969/* This is called from the reshape thread and should make any
7970 * changes needed in 'conf'.
7971 */
7972static void end_reshape(struct r5conf *conf)
7973{
7974
7975 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
7976 struct md_rdev *rdev;
7977
7978 spin_lock_irq(&conf->device_lock);
7979 conf->previous_raid_disks = conf->raid_disks;
7980 md_finish_reshape(conf->mddev);
7981 smp_wmb();
7982 conf->reshape_progress = MaxSector;
7983 conf->mddev->reshape_position = MaxSector;
7984 rdev_for_each(rdev, conf->mddev)
7985 if (rdev->raid_disk >= 0 &&
7986 !test_bit(Journal, &rdev->flags) &&
7987 !test_bit(In_sync, &rdev->flags))
7988 rdev->recovery_offset = MaxSector;
7989 spin_unlock_irq(&conf->device_lock);
7990 wake_up(&conf->wait_for_overlap);
7991
7992 /* read-ahead size must cover two whole stripes, which is
7993 * 2 * (datadisks) * chunksize.
7994 */
7995 if (conf->mddev->queue) {
7996 int data_disks = conf->raid_disks - conf->max_degraded;
7997 int stripe = data_disks * ((conf->chunk_sectors << 9)
7998 / PAGE_SIZE);
7999 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
8000 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
8001 }
8002 }
8003}
8004
8005/* This is called from the raid5d thread with mddev_lock held.
8006 * It makes config changes to the device.
8007 */
8008static void raid5_finish_reshape(struct mddev *mddev)
8009{
8010 struct r5conf *conf = mddev->private;
8011
8012 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8013
8014 if (mddev->delta_disks <= 0) {
8015 int d;
8016 spin_lock_irq(&conf->device_lock);
8017 mddev->degraded = raid5_calc_degraded(conf);
8018 spin_unlock_irq(&conf->device_lock);
8019 for (d = conf->raid_disks ;
8020 d < conf->raid_disks - mddev->delta_disks;
8021 d++) {
8022 struct md_rdev *rdev = conf->disks[d].rdev;
8023 if (rdev)
8024 clear_bit(In_sync, &rdev->flags);
8025 rdev = conf->disks[d].replacement;
8026 if (rdev)
8027 clear_bit(In_sync, &rdev->flags);
8028 }
8029 }
8030 mddev->layout = conf->algorithm;
8031 mddev->chunk_sectors = conf->chunk_sectors;
8032 mddev->reshape_position = MaxSector;
8033 mddev->delta_disks = 0;
8034 mddev->reshape_backwards = 0;
8035 }
8036}
8037
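/*
 * quiesce != 0: flush the r5c write-back cache and wait until no
 * stripes or aligned reads are active, leaving conf->quiesce at 1 so
 * new requests block.  quiesce == 0: clear conf->quiesce and wake all
 * waiters.  The journal/PPL code is notified through log_quiesce().
 */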
8038static void raid5_quiesce(struct mddev *mddev, int quiesce)
8039{
8040 struct r5conf *conf = mddev->private;
8041
8042 if (quiesce) {
8043 /* stop all writes */
8044 lock_all_device_hash_locks_irq(conf);
8045 /* '2' tells resync/reshape to pause so that all
8046 * active stripes can drain.
8047 */
8048 r5c_flush_cache(conf, INT_MAX);
8049 conf->quiesce = 2;
8050 wait_event_cmd(conf->wait_for_quiescent,
8051 atomic_read(&conf->active_stripes) == 0 &&
8052 atomic_read(&conf->active_aligned_reads) == 0,
8053 unlock_all_device_hash_locks_irq(conf),
8054 lock_all_device_hash_locks_irq(conf));
8055 conf->quiesce = 1;
8056 unlock_all_device_hash_locks_irq(conf);
8057 /* allow reshape to continue */
8058 wake_up(&conf->wait_for_overlap);
8059 } else {
8060 /* re-enable writes */
8061 lock_all_device_hash_locks_irq(conf);
8062 conf->quiesce = 0;
8063 wake_up(&conf->wait_for_quiescent);
8064 wake_up(&conf->wait_for_overlap);
8065 unlock_all_device_hash_locks_irq(conf);
8066 }
8067 log_quiesce(conf, quiesce);
8068}
8069
8070static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8071{
8072 struct r0conf *raid0_conf = mddev->private;
8073 sector_t sectors;
8074
8075 /* for raid0 takeover only one zone is supported */
8076 if (raid0_conf->nr_strip_zones > 1) {
8077 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8078 mdname(mddev));
8079 return ERR_PTR(-EINVAL);
8080 }
8081
8082 sectors = raid0_conf->strip_zone[0].zone_end;
8083 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8084 mddev->dev_sectors = sectors;
8085 mddev->new_level = level;
8086 mddev->new_layout = ALGORITHM_PARITY_N;
8087 mddev->new_chunk_sectors = mddev->chunk_sectors;
8088 mddev->raid_disks += 1;
8089 mddev->delta_disks = 1;
8090 /* make sure it will not be marked as dirty */
8091 mddev->recovery_cp = MaxSector;
8092
8093 return setup_conf(mddev);
8094}
8095
8096static void *raid5_takeover_raid1(struct mddev *mddev)
8097{
8098 int chunksect;
8099 void *ret;
8100
8101 if (mddev->raid_disks != 2 ||
8102 mddev->degraded > 1)
8103 return ERR_PTR(-EINVAL);
8104
8105 /* Should check if there are write-behind devices? */
8106
8107 chunksect = 64*2; /* 64K by default */
8108
8109 /* The array must be an exact multiple of chunksize */
8110 while (chunksect && (mddev->array_sectors & (chunksect-1)))
8111 chunksect >>= 1;
8112
8113 if ((chunksect<<9) < STRIPE_SIZE)
8114 /* array size does not allow a suitable chunk size */
8115 return ERR_PTR(-EINVAL);
8116
8117 mddev->new_level = 5;
8118 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8119 mddev->new_chunk_sectors = chunksect;
8120
8121 ret = setup_conf(mddev);
8122 if (!IS_ERR(ret))
8123 mddev_clear_unsupported_flags(mddev,
8124 UNSUPPORTED_MDDEV_FLAGS);
8125 return ret;
8126}
8127
8128static void *raid5_takeover_raid6(struct mddev *mddev)
8129{
8130 int new_layout;
8131
8132 switch (mddev->layout) {
8133 case ALGORITHM_LEFT_ASYMMETRIC_6:
8134 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8135 break;
8136 case ALGORITHM_RIGHT_ASYMMETRIC_6:
8137 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8138 break;
8139 case ALGORITHM_LEFT_SYMMETRIC_6:
8140 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8141 break;
8142 case ALGORITHM_RIGHT_SYMMETRIC_6:
8143 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8144 break;
8145 case ALGORITHM_PARITY_0_6:
8146 new_layout = ALGORITHM_PARITY_0;
8147 break;
8148 case ALGORITHM_PARITY_N:
8149 new_layout = ALGORITHM_PARITY_N;
8150 break;
8151 default:
8152 return ERR_PTR(-EINVAL);
8153 }
8154 mddev->new_level = 5;
8155 mddev->new_layout = new_layout;
8156 mddev->delta_disks = -1;
8157 mddev->raid_disks -= 1;
8158 return setup_conf(mddev);
8159}
8160
8161static int raid5_check_reshape(struct mddev *mddev)
8162{
8163 /* For a 2-drive array, the layout and chunk size can be changed
8164 * immediately as no restriping is needed.
8165 * For larger arrays we record the new value - after validation
8166 * to be used by a reshape pass.
8167 */
8168 struct r5conf *conf = mddev->private;
8169 int new_chunk = mddev->new_chunk_sectors;
8170
8171 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8172 return -EINVAL;
8173 if (new_chunk > 0) {
8174 if (!is_power_of_2(new_chunk))
8175 return -EINVAL;
8176 if (new_chunk < (PAGE_SIZE>>9))
8177 return -EINVAL;
8178 if (mddev->array_sectors & (new_chunk-1))
8179 /* not a factor of the array size */
8180 return -EINVAL;
8181 }
8182
8183 /* They look valid */
8184
8185 if (mddev->raid_disks == 2) {
8186 /* can make the change immediately */
8187 if (mddev->new_layout >= 0) {
8188 conf->algorithm = mddev->new_layout;
8189 mddev->layout = mddev->new_layout;
8190 }
8191 if (new_chunk > 0) {
8192 conf->chunk_sectors = new_chunk;
8193 mddev->chunk_sectors = new_chunk;
8194 }
8195 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8196 md_wakeup_thread(mddev->thread);
8197 }
8198 return check_reshape(mddev);
8199}
8200
8201static int raid6_check_reshape(struct mddev *mddev)
8202{
8203 int new_chunk = mddev->new_chunk_sectors;
8204
8205 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8206 return -EINVAL;
8207 if (new_chunk > 0) {
8208 if (!is_power_of_2(new_chunk))
8209 return -EINVAL;
8210 if (new_chunk < (PAGE_SIZE >> 9))
8211 return -EINVAL;
8212 if (mddev->array_sectors & (new_chunk-1))
8213 /* not a factor of the array size */
8214 return -EINVAL;
8215 }
8216
8217 /* They look valid */
8218 return check_reshape(mddev);
8219}
8220
8221static void *raid5_takeover(struct mddev *mddev)
8222{
8223 /* raid5 can take over:
8224 *  raid0 - if there is only one strip zone - make it a raid4 layout
8225 *  raid1 - if there are two drives.  We need to know the chunk size
8226 *  raid4 - trivial - just use a raid4 layout.
8227 *  raid6 - Providing it is a *_6 layout
8228 */
8229 if (mddev->level == 0)
8230 return raid45_takeover_raid0(mddev, 5);
8231 if (mddev->level == 1)
8232 return raid5_takeover_raid1(mddev);
8233 if (mddev->level == 4) {
8234 mddev->new_layout = ALGORITHM_PARITY_N;
8235 mddev->new_level = 5;
8236 return setup_conf(mddev);
8237 }
8238 if (mddev->level == 6)
8239 return raid5_takeover_raid6(mddev);
8240
8241 return ERR_PTR(-EINVAL);
8242}
8243
8244static void *raid4_takeover(struct mddev *mddev)
8245{
8246 /* raid4 can take over:
8247 *  raid0 - if there is only one strip zone
8248 *  raid5 - if the layout is right (ALGORITHM_PARITY_N)
8249 */
8250 if (mddev->level == 0)
8251 return raid45_takeover_raid0(mddev, 4);
8252 if (mddev->level == 5 &&
8253 mddev->layout == ALGORITHM_PARITY_N) {
8254 mddev->new_layout = 0;
8255 mddev->new_level = 4;
8256 return setup_conf(mddev);
8257 }
8258 return ERR_PTR(-EINVAL);
8259}
8260
8261static struct md_personality raid5_personality;
8262
8263static void *raid6_takeover(struct mddev *mddev)
8264{
8265 /* Currently can only take over a raid5.  We map the
8266 * personality to an equivalent raid6 personality
8267 * with the Q block at the end.
8268 */
8269 int new_layout;
8270
8271 if (mddev->pers != &raid5_personality)
8272 return ERR_PTR(-EINVAL);
8273 if (mddev->degraded > 1)
8274 return ERR_PTR(-EINVAL);
8275 if (mddev->raid_disks > 253)
8276 return ERR_PTR(-EINVAL);
8277 if (mddev->raid_disks < 3)
8278 return ERR_PTR(-EINVAL);
8279
8280 switch (mddev->layout) {
8281 case ALGORITHM_LEFT_ASYMMETRIC:
8282 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8283 break;
8284 case ALGORITHM_RIGHT_ASYMMETRIC:
8285 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8286 break;
8287 case ALGORITHM_LEFT_SYMMETRIC:
8288 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8289 break;
8290 case ALGORITHM_RIGHT_SYMMETRIC:
8291 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8292 break;
8293 case ALGORITHM_PARITY_0:
8294 new_layout = ALGORITHM_PARITY_0_6;
8295 break;
8296 case ALGORITHM_PARITY_N:
8297 new_layout = ALGORITHM_PARITY_N;
8298 break;
8299 default:
8300 return ERR_PTR(-EINVAL);
8301 }
8302 mddev->new_level = 6;
8303 mddev->new_layout = new_layout;
8304 mddev->delta_disks = 1;
8305 mddev->raid_disks += 1;
8306 return setup_conf(mddev);
8307}
8308
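/*
 * Switch the consistency policy at run time: "ppl" enables the partial
 * parity log on a journal-less RAID5, "resync" drops PPL or a failed
 * journal so the array falls back to resync-based recovery.  The stripe
 * pool is resized because PPL needs extra per-stripe pages.
 */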
8309static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8310{
8311 struct r5conf *conf;
8312 int err;
8313
8314 err = mddev_lock(mddev);
8315 if (err)
8316 return err;
8317 conf = mddev->private;
8318 if (!conf) {
8319 mddev_unlock(mddev);
8320 return -ENODEV;
8321 }
8322
8323 if (strncmp(buf, "ppl", 3) == 0) {
8324 /* ppl only works with RAID 5 */
8325 if (!raid5_has_ppl(conf) && conf->level == 5) {
8326 err = log_init(conf, NULL, true);
8327 if (!err) {
8328 err = resize_stripes(conf, conf->pool_size);
8329 if (err)
8330 log_exit(conf);
8331 }
8332 } else
8333 err = -EINVAL;
8334 } else if (strncmp(buf, "resync", 6) == 0) {
8335 if (raid5_has_ppl(conf)) {
8336 mddev_suspend(mddev);
8337 log_exit(conf);
8338 mddev_resume(mddev);
8339 err = resize_stripes(conf, conf->pool_size);
8340 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8341 r5l_log_disk_error(conf)) {
8342 bool journal_dev_exists = false;
8343 struct md_rdev *rdev;
8344
8345 rdev_for_each(rdev, mddev)
8346 if (test_bit(Journal, &rdev->flags)) {
8347 journal_dev_exists = true;
8348 break;
8349 }
8350
8351 if (!journal_dev_exists) {
8352 mddev_suspend(mddev);
8353 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8354 mddev_resume(mddev);
8355 } else
8356 err = -EBUSY;
8357 } else
8358 err = -EINVAL;
8359 } else {
8360 err = -EINVAL;
8361 }
8362
8363 if (!err)
8364 md_update_sb(mddev, 1);
8365
8366 mddev_unlock(mddev);
8367
8368 return err;
8369}
8370
8371static int raid5_start(struct mddev *mddev)
8372{
8373 struct r5conf *conf = mddev->private;
8374
8375 return r5l_start(conf->log);
8376}
8377
8378static struct md_personality raid6_personality =
8379{
8380 .name = "raid6",
8381 .level = 6,
8382 .owner = THIS_MODULE,
8383 .make_request = raid5_make_request,
8384 .run = raid5_run,
8385 .start = raid5_start,
8386 .free = raid5_free,
8387 .status = raid5_status,
8388 .error_handler = raid5_error,
8389 .hot_add_disk = raid5_add_disk,
8390 .hot_remove_disk= raid5_remove_disk,
8391 .spare_active = raid5_spare_active,
8392 .sync_request = raid5_sync_request,
8393 .resize = raid5_resize,
8394 .size = raid5_size,
8395 .check_reshape = raid6_check_reshape,
8396 .start_reshape = raid5_start_reshape,
8397 .finish_reshape = raid5_finish_reshape,
8398 .quiesce = raid5_quiesce,
8399 .takeover = raid6_takeover,
8400 .congested = raid5_congested,
8401 .change_consistency_policy = raid5_change_consistency_policy,
8402};
8403static struct md_personality raid5_personality =
8404{
8405 .name = "raid5",
8406 .level = 5,
8407 .owner = THIS_MODULE,
8408 .make_request = raid5_make_request,
8409 .run = raid5_run,
8410 .start = raid5_start,
8411 .free = raid5_free,
8412 .status = raid5_status,
8413 .error_handler = raid5_error,
8414 .hot_add_disk = raid5_add_disk,
8415 .hot_remove_disk= raid5_remove_disk,
8416 .spare_active = raid5_spare_active,
8417 .sync_request = raid5_sync_request,
8418 .resize = raid5_resize,
8419 .size = raid5_size,
8420 .check_reshape = raid5_check_reshape,
8421 .start_reshape = raid5_start_reshape,
8422 .finish_reshape = raid5_finish_reshape,
8423 .quiesce = raid5_quiesce,
8424 .takeover = raid5_takeover,
8425 .congested = raid5_congested,
8426 .change_consistency_policy = raid5_change_consistency_policy,
8427};
8428
8429static struct md_personality raid4_personality =
8430{
8431 .name = "raid4",
8432 .level = 4,
8433 .owner = THIS_MODULE,
8434 .make_request = raid5_make_request,
8435 .run = raid5_run,
8436 .start = raid5_start,
8437 .free = raid5_free,
8438 .status = raid5_status,
8439 .error_handler = raid5_error,
8440 .hot_add_disk = raid5_add_disk,
8441 .hot_remove_disk= raid5_remove_disk,
8442 .spare_active = raid5_spare_active,
8443 .sync_request = raid5_sync_request,
8444 .resize = raid5_resize,
8445 .size = raid5_size,
8446 .check_reshape = raid5_check_reshape,
8447 .start_reshape = raid5_start_reshape,
8448 .finish_reshape = raid5_finish_reshape,
8449 .quiesce = raid5_quiesce,
8450 .takeover = raid4_takeover,
8451 .congested = raid5_congested,
8452 .change_consistency_policy = raid5_change_consistency_policy,
8453};
8454
8455static int __init raid5_init(void)
8456{
8457 int ret;
8458
8459 raid5_wq = alloc_workqueue("raid5wq",
8460 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
8461 if (!raid5_wq)
8462 return -ENOMEM;
8463
8464 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
8465 "md/raid5:prepare",
8466 raid456_cpu_up_prepare,
8467 raid456_cpu_dead);
8468 if (ret) {
8469 destroy_workqueue(raid5_wq);
8470 return ret;
8471 }
8472 register_md_personality(&raid6_personality);
8473 register_md_personality(&raid5_personality);
8474 register_md_personality(&raid4_personality);
8475 return 0;
8476}
8477
8478static void raid5_exit(void)
8479{
8480 unregister_md_personality(&raid6_personality);
8481 unregister_md_personality(&raid5_personality);
8482 unregister_md_personality(&raid4_personality);
8483 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
8484 destroy_workqueue(raid5_wq);
8485}
8486
8487module_init(raid5_init);
8488module_exit(raid5_exit);
8489MODULE_LICENSE("GPL");
8490MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
8491MODULE_ALIAS("md-personality-4");
8492MODULE_ALIAS("md-raid5");
8493MODULE_ALIAS("md-raid4");
8494MODULE_ALIAS("md-level-5");
8495MODULE_ALIAS("md-level-4");
8496MODULE_ALIAS("md-personality-8");
8497MODULE_ALIAS("md-raid6");
8498MODULE_ALIAS("md-level-6");
8499
8500/* Historical aliases from when raid5 and raid6 were separate modules: */
8501MODULE_ALIAS("raid5");
8502MODULE_ALIAS("raid6");
8503